// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use bitflags::bitflags;
use extended_pstate::ExtendedPstateState;
use fuchsia_zircon::{
self as zx, sys::zx_thread_state_general_regs_t, AsHandleRef, Signals, Task as _,
};
use once_cell::sync::OnceCell;
use starnix_lock::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};
use starnix_sync::{EventWaitGuard, WakeReason};
use std::{
cmp,
convert::TryFrom,
ffi::CString,
fmt,
sync::{
atomic::{AtomicU8, Ordering},
Arc,
},
};
use crate::{
arch::{
registers::RegisterState,
task::{decode_page_fault_exception_report, get_signal_for_general_exception},
},
auth::*,
execution::*,
fs::{
FdFlags, FdNumber, FdTable, FileHandle, FsContext, FsStr, FsString, LookupContext,
NamespaceNode, SymlinkMode, SymlinkTarget,
},
loader::*,
logging::*,
mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, MemoryManager},
signals::{send_signal, types::*, SignalInfo},
syscalls::{decls::Syscall, SyscallResult},
task::*,
types::*,
};
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable reference to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
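///
/// A minimal sketch of the calling convention (the syscall and its body are illustrative,
/// not a real Starnix syscall):
///
/// ```ignore
/// fn sys_example(current_task: &mut CurrentTask, fd: FdNumber) -> Result<(), Errno> {
///     // Access checks and task-local state go through `current_task`.
///     let _file = current_task.files.get(fd)?;
///     Ok(())
/// }
/// ```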
pub struct CurrentTask {
/// The underlying task object.
pub task: OwnedRefByRef<Task>,
/// A copy of the registers associated with the Zircon thread. Up-to-date values can be read
/// from the thread with `read_state_general_regs()`. To write these values back to the
/// thread, call `write_state_general_regs(self.registers.into())` on it.
pub registers: RegisterState,
/// Copy of the current extended processor state including floating point and vector registers.
pub extended_pstate: ExtendedPstateState,
/// A custom function to resume a syscall that has been interrupted by SIGSTOP.
/// To use, call set_syscall_restart_func and return ERESTART_RESTARTBLOCK. sys_restart_syscall
/// will eventually call it.
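///
/// A hedged sketch of the protocol (the `do_wait` helper is hypothetical):
///
/// ```ignore
/// current_task.set_syscall_restart_func(move |current_task: &mut CurrentTask| {
///     do_wait(current_task, deadline)
/// });
/// return error!(ERESTART_RESTARTBLOCK);
/// ```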
pub syscall_restart_func: Option<Box<SyscallRestartFunc>>,
}
type SyscallRestartFunc =
dyn FnOnce(&mut CurrentTask) -> Result<SyscallResult, Errno> + Send + Sync;
impl ReleasableByRef for CurrentTask {
type Context<'a> = ();
fn release(&self, _: ()) {
self.notify_robust_list();
let _ignored = self.clear_child_tid_if_needed();
self.task.release(self);
}
}
impl std::ops::Deref for CurrentTask {
type Target = Task;
fn deref(&self) -> &Self::Target {
&self.task
}
}
impl MemoryAccessor for CurrentTask {
fn read_memory_to_slice(&self, addr: UserAddress, bytes: &mut [u8]) -> Result<(), Errno> {
self.mm.read_memory_to_slice(addr, bytes)
}
fn read_memory_partial_to_slice(
&self,
addr: UserAddress,
bytes: &mut [u8],
) -> Result<usize, Errno> {
self.mm.read_memory_partial_to_slice(addr, bytes)
}
fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
self.mm.write_memory(addr, bytes)
}
fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
self.mm.write_memory_partial(addr, bytes)
}
fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
self.mm.zero(addr, length)
}
}
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ExitStatus {
Exit(u8),
Kill(SignalInfo),
CoreDump(SignalInfo),
Stop(SignalInfo),
Continue(SignalInfo),
}
impl ExitStatus {
/// Converts the given exit status to a status code suitable for returning from wait syscalls.
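///
/// For example, `ExitStatus::Exit(1)` encodes as `0x0100` (status in the second byte),
/// matching the wait(2) encoding:
///
/// ```ignore
/// assert_eq!(ExitStatus::Exit(1).wait_status(), 0x0100);
/// ```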
pub fn wait_status(&self) -> i32 {
let maybe_ptrace = |siginfo: &SignalInfo| {
if ((siginfo.code >> 8) as u32) == PTRACE_EVENT_STOP {
(PTRACE_EVENT_STOP << 16) as i32
} else {
0
}
};
match self {
ExitStatus::Exit(status) => (*status as i32) << 8,
ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32,
ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80,
ExitStatus::Continue(siginfo) => {
if maybe_ptrace(siginfo) != 0 {
(siginfo.signal.number() as i32) | maybe_ptrace(siginfo)
} else {
0xffff
}
}
ExitStatus::Stop(siginfo) => {
(0x7f + ((siginfo.signal.number() as i32) << 8)) | maybe_ptrace(siginfo)
}
}
}
pub fn signal_info_code(&self) -> u32 {
match self {
ExitStatus::Exit(_) => CLD_EXITED,
ExitStatus::Kill(_) => CLD_KILLED,
ExitStatus::CoreDump(_) => CLD_DUMPED,
ExitStatus::Stop(_) => CLD_STOPPED,
ExitStatus::Continue(_) => CLD_CONTINUED,
}
}
pub fn signal_info_status(&self) -> i32 {
match self {
ExitStatus::Exit(status) => *status as i32,
ExitStatus::Kill(siginfo)
| ExitStatus::CoreDump(siginfo)
| ExitStatus::Continue(siginfo)
| ExitStatus::Stop(siginfo) => siginfo.signal.number() as i32,
}
}
}
pub struct AtomicStopState {
inner: AtomicU8,
}
impl AtomicStopState {
pub fn new(state: StopState) -> Self {
Self { inner: AtomicU8::new(state as u8) }
}
pub fn load(&self, ordering: Ordering) -> StopState {
let v = self.inner.load(ordering);
// SAFETY: we only ever store to the atomic a value originating
// from a valid `StopState`.
unsafe { std::mem::transmute(v) }
}
pub fn store(&self, state: StopState, ordering: Ordering) {
self.inner.store(state as u8, ordering)
}
}
/// This enum describes the state that a task or thread group can be in when being stopped.
/// The names are taken from ptrace(2).
#[derive(Clone, Copy, PartialEq)]
#[repr(u8)]
pub enum StopState {
/// In this state, the process has been told to wake up, but has not yet been woken.
/// Individual threads may still be stopped.
Waking,
/// In this state, at least one thread is awake.
Awake,
/// Same as the above, but you are not allowed to make further transitions. Used
/// to kill the task / group. These names are not in ptrace(2).
ForceWaking,
ForceAwake,
/// In this state, the process has been told to stop via a signal, but has not yet stopped.
GroupStopping,
/// In this state, at least one thread of the process has stopped.
GroupStopped,
/// In this state, the task has received a signal, and it is being traced, so it will
/// stop at the next opportunity.
SignalDeliveryStopping,
/// Same as the last one, but has stopped.
SignalDeliveryStopped,
// TODO: Other states.
}
impl StopState {
/// This means a stop is either in progress or we've stopped.
pub fn is_stopping_or_stopped(&self) -> bool {
self.is_stopped() || self.is_stopping()
}
/// This means a stop is in progress. Refers to any stop state ending in "ing".
pub fn is_stopping(&self) -> bool {
*self == StopState::GroupStopping || *self == StopState::SignalDeliveryStopping
}
/// This means task is stopped.
pub fn is_stopped(&self) -> bool {
*self == StopState::GroupStopped || *self == StopState::SignalDeliveryStopped
}
/// Returns the "ed" version of this StopState, if it is "ing".
pub fn finalize(&self) -> Result<StopState, ()> {
match *self {
StopState::GroupStopping => Ok(StopState::GroupStopped),
StopState::SignalDeliveryStopping => Ok(StopState::SignalDeliveryStopped),
StopState::Waking => Ok(StopState::Awake),
StopState::ForceWaking => Ok(StopState::ForceAwake),
_ => Err(()),
}
}
pub fn is_downgrade(&self, new_state: &StopState) -> bool {
match *self {
StopState::GroupStopped => *new_state == StopState::GroupStopping,
StopState::SignalDeliveryStopped => *new_state == StopState::SignalDeliveryStopping,
StopState::Awake => *new_state == StopState::Waking,
_ => false,
}
}
pub fn is_waking_or_awake(&self) -> bool {
*self == StopState::Waking
|| *self == StopState::Awake
|| *self == StopState::ForceWaking
|| *self == StopState::ForceAwake
}
/// Indicates whether the transition to the stopped / awake state is unfinished. This
/// function is typically used to determine when it is time to notify waiters.
pub fn is_in_progress(&self) -> bool {
*self == StopState::Waking
|| *self == StopState::ForceWaking
|| *self == StopState::GroupStopping
|| *self == StopState::SignalDeliveryStopping
}
pub fn ptrace_only(&self) -> bool {
!self.is_waking_or_awake()
&& *self != StopState::GroupStopped
&& *self != StopState::GroupStopping
}
pub fn is_illegal_transition(&self, new_state: StopState) -> bool {
*self == StopState::ForceAwake
|| (*self == StopState::ForceWaking && new_state != StopState::ForceAwake)
|| new_state == *self
|| self.is_downgrade(&new_state)
}
}
bitflags! {
pub struct TaskFlags: u8 {
const EXITED = 0x1;
const SIGNALS_AVAILABLE = 0x2;
const TEMPORARY_SIGNAL_MASK = 0x4;
/// Whether the executor should dump the stack of this task when it exits.
/// Currently used to implement ExitStatus::CoreDump.
const DUMP_ON_EXIT = 0x8;
}
}
pub struct AtomicTaskFlags {
flags: AtomicU8,
}
impl AtomicTaskFlags {
fn new(flags: TaskFlags) -> Self {
Self { flags: AtomicU8::new(flags.bits()) }
}
fn load(&self, ordering: Ordering) -> TaskFlags {
let flags = self.flags.load(ordering);
// SAFETY: We only ever store values from a `TaskFlags`.
unsafe { TaskFlags::from_bits_unchecked(flags) }
}
fn swap(&self, flags: TaskFlags, ordering: Ordering) -> TaskFlags {
let flags = self.flags.swap(flags.bits(), ordering);
// SAFETY: We only ever store values from a `TaskFlags`.
unsafe { TaskFlags::from_bits_unchecked(flags) }
}
}
pub struct TaskMutableState {
// See https://man7.org/linux/man-pages/man2/set_tid_address.2.html
pub clear_child_tid: UserRef<pid_t>,
/// Signal handler related state. This is grouped together for when atomicity is needed during
/// signal sending and delivery.
pub signals: SignalState,
/// The exit status that this task exited with.
exit_status: Option<ExitStatus>,
/// Desired scheduler policy for the task.
pub scheduler_policy: SchedulerPolicy,
/// The UTS namespace assigned to this thread.
///
/// This field is kept in the mutable state because the UTS namespace of a thread
/// can be forked using `clone()` or `unshare()` syscalls.
///
/// We use UtsNamespaceHandle because the UTS properties can be modified
/// by any other thread that shares this namespace.
pub uts_ns: UtsNamespaceHandle,
/// Bit that determines whether a newly started program can have privileges its parent does
/// not have. See Documentation/prctl/no_new_privs.txt in the Linux kernel for details.
/// Note that Starnix does not currently implement the relevant privileges (e.g.,
/// setuid/setgid binaries). So, you can set this, but it does nothing other than get
/// propagated to children.
///
/// The documentation indicates that this can only ever be set to
/// true, and it cannot be reverted to false. Accessor methods
/// for this field ensure this property.
no_new_privs: bool,
/// Userspace hint about how to adjust the OOM score for this process.
pub oom_score_adj: i32,
/// List of currently installed seccomp_filters
pub seccomp_filters: SeccompFilterContainer,
/// A pointer to the head of the robust futex list of this thread in
/// userspace. See get_robust_list(2)
pub robust_list_head: UserRef<robust_list_head>,
/// The timer slack used to group timer expirations for the calling thread.
///
/// Timers may expire up to `timerslack_ns` late, but never early.
///
/// If this value is 0, the task's default timerslack is used.
timerslack_ns: u64,
/// The default value for `timerslack_ns`. This value cannot change during the lifetime of a
/// task.
///
/// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant
/// across tasks.
default_timerslack_ns: u64,
/// Information that a tracer needs to communicate with this process, if it
/// is being traced.
pub ptrace: Option<PtraceState>,
}
impl TaskMutableState {
pub fn no_new_privs(&self) -> bool {
self.no_new_privs
}
/// Sets the value of no_new_privs to true. It is an error to set
/// it to anything else.
pub fn enable_no_new_privs(&mut self) {
self.no_new_privs = true;
}
pub fn get_timerslack_ns(&self) -> u64 {
self.timerslack_ns
}
/// Sets the current timerslack of the task to `ns`.
///
/// If `ns` is zero, the current timerslack gets reset to the task's default timerslack.
pub fn set_timerslack_ns(&mut self, ns: u64) {
if ns == 0 {
self.timerslack_ns = self.default_timerslack_ns;
} else {
self.timerslack_ns = ns;
}
}
pub fn set_ptrace(&mut self, tracer: Option<pid_t>) -> Result<(), Errno> {
if let Some(tracer) = tracer {
if self.ptrace.is_some() {
return Err(errno!(EPERM));
}
self.ptrace = Some(PtraceState::new(tracer));
} else {
self.ptrace = None;
}
Ok(())
}
pub fn is_ptraced(&self) -> bool {
self.ptrace.is_some()
}
pub fn ptrace_on_signal_consume(&mut self) -> bool {
self.ptrace.as_mut().map_or(false, |ptrace: &mut PtraceState| {
if ptrace.stop_status == PtraceStatus::Continuing {
ptrace.stop_status = PtraceStatus::Default;
false
} else {
true
}
})
}
pub fn notify_ptracers(&mut self) {
if let Some(ptrace) = &self.ptrace {
ptrace.tracer_waiters.notify_all();
}
}
pub fn wait_on_ptracer(&self, waiter: &Waiter) {
if let Some(ptrace) = &self.ptrace {
ptrace.tracee_waiters.wait_async(&waiter);
}
}
pub fn notify_ptracees(&mut self) {
if let Some(ptrace) = &self.ptrace {
ptrace.tracee_waiters.notify_all();
}
}
}
pub enum ExceptionResult {
/// The exception was handled and no further action is required.
Handled,
/// The exception generated a signal that should be delivered.
Signal(SignalInfo),
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskStateCode {
/// Task is being executed.
Running,
/// Task is waiting for an event.
Sleeping,
/// Task has exited.
Zombie,
}
impl TaskStateCode {
pub fn code_char(&self) -> char {
match self {
TaskStateCode::Running => 'R',
TaskStateCode::Sleeping => 'S',
TaskStateCode::Zombie => 'Z',
}
}
pub fn name(&self) -> &'static str {
match self {
TaskStateCode::Running => "running",
TaskStateCode::Sleeping => "sleeping",
TaskStateCode::Zombie => "zombie",
}
}
}
/// The information of the task that needs to be available to the `ThreadGroup` while computing
/// which process a wait can target. This data must be shared with the `ThreadGroup` so that it
/// remains available while the task is being dropped, when it can no longer be reached through
/// a weak pointer.
#[derive(Clone, Debug)]
pub struct TaskPersistentInfoState {
/// Immutable information about the task
tid: pid_t,
pid: pid_t,
/// The command of this task.
command: CString,
/// The security credentials for this task.
creds: Credentials,
/// The signal this task generates on exit.
exit_signal: Option<Signal>,
}
impl TaskPersistentInfoState {
fn new(
tid: pid_t,
pid: pid_t,
command: CString,
creds: Credentials,
exit_signal: Option<Signal>,
) -> TaskPersistentInfo {
Arc::new(Mutex::new(Self { tid, pid, command, creds, exit_signal }))
}
pub fn tid(&self) -> pid_t {
self.tid
}
pub fn pid(&self) -> pid_t {
self.pid
}
pub fn command(&self) -> &CString {
&self.command
}
pub fn creds(&self) -> &Credentials {
&self.creds
}
pub fn exit_signal(&self) -> &Option<Signal> {
&self.exit_signal
}
}
pub type TaskPersistentInfo = Arc<Mutex<TaskPersistentInfoState>>;
/// A unit of execution.
///
/// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks,
/// which have an associated Zircon thread. The Zircon thread switches between restricted mode,
/// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix
/// code.
///
/// Tasks track the resources used by userspace by referencing various objects, such as an
/// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these
/// objects. In principle, which objects are shared between which tasks can be largely arbitrary,
/// but there are common patterns of sharing. For example, tasks created with `pthread_create`
/// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by
/// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often
/// called "processes" by userspace programmers. However, inside the kernel, there is no clear
/// definition of a "thread" or a "process".
///
/// During boot, the kernel creates the first task, often called `init`. The vast majority of other
/// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the
/// kernel will create new tasks from whole cloth, either with a corresponding userspace component
/// or to represent some background work inside the kernel.
///
/// See also `CurrentTask`, which represents the task corresponding to the thread that is currently
/// executing.
pub struct Task {
/// A unique identifier for this task.
///
/// This value can be read in userspace using `gettid(2)`. In general, this value
/// is different from the value returned by `getpid(2)`, which returns the `id` of the leader
/// of the `thread_group`.
pub id: pid_t,
/// The thread group to which this task belongs.
///
/// The group of tasks in a thread group roughly corresponds to the userspace notion of a
/// process.
pub thread_group: Arc<ThreadGroup>,
/// A handle to the underlying Zircon thread object.
///
/// Some tasks lack an underlying Zircon thread. These tasks are used internally by the
/// Starnix kernel to track background work, typically on a `kthread`.
pub thread: RwLock<Option<zx::Thread>>,
/// The file descriptor table for this task.
///
/// This table can be shared by many tasks.
pub files: FdTable,
/// The memory manager for this task.
pub mm: Arc<MemoryManager>,
/// The file system for this task.
///
/// The only case when this is not set is for the initial task while the FsContext is being built.
fs: OnceCell<Arc<FsContext>>,
/// The namespace for abstract AF_UNIX sockets for this task.
pub abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
/// The namespace for AF_VSOCK for this task.
pub abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
/// The stop state of the task, distinct from the stop state of the thread group.
///
/// Must only be set when the `mutable_state` write lock is held.
stop_state: AtomicStopState,
/// The flags for the task.
///
/// Must only be set when the `mutable_state` write lock is held.
flags: AtomicTaskFlags,
/// The mutable state of the Task.
mutable_state: RwLock<TaskMutableState>,
/// The information of the task that needs to be available to the `ThreadGroup` while computing
/// which process a wait can target.
/// Contains the command line, the task credentials and the exit signal.
/// See `TaskPersistentInfo` for more information.
pub persistent_info: TaskPersistentInfo,
/// For vfork and clone() with CLONE_VFORK, this is set when the task exits or calls execve().
/// It allows the calling task to block until the fork has been completed. Only populated
/// when created with the CLONE_VFORK flag.
vfork_event: Option<Arc<zx::Event>>,
/// Variable that can tell you whether there are currently seccomp
/// filters without holding a lock
pub seccomp_filter_state: SeccompState,
}
/// The decoded cross-platform parts we care about for page fault exception reports.
pub struct PageFaultExceptionReport {
pub faulting_address: u64,
pub not_present: bool, // Set when the page fault was due to a not-present page.
pub is_write: bool, // Set when the triggering memory operation was a write.
}
impl Task {
pub fn flags(&self) -> TaskFlags {
self.flags.load(Ordering::Relaxed)
}
fn check_mutable_state_lock_held(&self, guard: &mut TaskMutableState) {
// We don't actually use the guard but we require it to enforce that the
// caller holds the task's mutable state lock (identified by mutable
// access to the task's mutable state).
let _ = guard;
// Ideally we would assert `!self.mutable_state.is_locked_exclusive()`
// but this tries to take the lock underneath which triggers a lock
// dependency check, resulting in a panic.
}
pub fn update_flags(&self, guard: &mut TaskMutableState, clear: TaskFlags, set: TaskFlags) {
self.check_mutable_state_lock_held(guard);
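// `clear` and `set` must be disjoint: XOR equals OR exactly when no bit appears
// in both masks, which is what the debug assertion below checks.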
debug_assert_eq!(clear ^ set, clear | set);
let observed = self.flags();
let swapped = self.flags.swap((observed | set) & !clear, Ordering::Relaxed);
debug_assert_eq!(swapped, observed);
}
pub fn set_flags(&self, guard: &mut TaskMutableState, flag: TaskFlags, v: bool) {
let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) };
self.update_flags(guard, clear, set);
}
pub fn set_exit_status(&self, guard: &mut TaskMutableState, status: ExitStatus) {
self.set_flags(guard, TaskFlags::EXITED, true);
guard.exit_status = Some(status);
}
pub fn set_exit_status_if_not_already(&self, guard: &mut TaskMutableState, status: ExitStatus) {
self.set_flags(guard, TaskFlags::EXITED, true);
guard.exit_status.get_or_insert(status);
}
pub fn exit_status(&self) -> Option<ExitStatus> {
self.is_exitted().then(|| self.read().exit_status.clone()).flatten()
}
pub fn is_exitted(&self) -> bool {
self.flags().contains(TaskFlags::EXITED)
}
pub fn load_stopped(&self) -> StopState {
self.stop_state.load(Ordering::Relaxed)
}
fn store_stopped(&self, state: StopState, guard: &mut TaskMutableState) {
self.check_mutable_state_lock_held(guard);
self.stop_state.store(state, Ordering::Relaxed)
}
pub fn set_stopped(
&self,
guard: &mut TaskMutableState,
stopped: StopState,
siginfo: Option<SignalInfo>,
) {
if stopped.ptrace_only() && guard.ptrace.is_none() {
return;
}
if self.load_stopped().is_illegal_transition(stopped) {
return;
}
// TODO(https://g-issues.fuchsia.dev/issues/306438676): When the task can be
// stopped inside user code, it will need to be either restarted or
// stopped here.
self.store_stopped(stopped, guard);
if let Some(ref mut ptrace) = &mut guard.ptrace {
ptrace.set_last_signal(siginfo);
}
if stopped == StopState::Waking || stopped == StopState::ForceWaking {
guard.notify_ptracees();
}
if !stopped.is_in_progress() {
guard.notify_ptracers();
}
}
/// Upgrade a Reference to a Task, returning an ESRCH errno if the reference cannot be borrowed.
pub fn from_weak(weak: &WeakRef<Task>) -> Result<TempRef<'_, Task>, Errno> {
weak.upgrade().ok_or_else(|| errno!(ESRCH))
}
/// Internal function for creating a Task object. Useful when you need to specify the value of
/// every field. create_process and create_thread are more likely to be what you want.
///
/// Any fields that should be initialized fresh for every task, even if the task was created
/// with fork, are initialized to their defaults inside this function. All other fields are
/// passed as parameters.
#[allow(clippy::let_and_return)]
fn new(
id: pid_t,
command: CString,
thread_group: Arc<ThreadGroup>,
thread: Option<zx::Thread>,
files: FdTable,
mm: Arc<MemoryManager>,
// The only case where fs should be None is when building the initial task, which is then
// used to build the initial FsContext.
fs: Option<Arc<FsContext>>,
creds: Credentials,
abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
exit_signal: Option<Signal>,
signal_mask: SigSet,
vfork_event: Option<Arc<zx::Event>>,
scheduler_policy: SchedulerPolicy,
uts_ns: UtsNamespaceHandle,
no_new_privs: bool,
seccomp_filter_state: SeccompState,
seccomp_filters: SeccompFilterContainer,
robust_list_head: UserRef<robust_list_head>,
timerslack_ns: u64,
) -> Self {
let fs = {
let result = OnceCell::new();
if let Some(fs) = fs {
result.get_or_init(|| fs);
}
result
};
let pid = thread_group.leader;
let task = Task {
id,
thread_group,
thread: RwLock::new(thread),
files,
mm,
fs,
abstract_socket_namespace,
abstract_vsock_namespace,
vfork_event,
stop_state: AtomicStopState::new(StopState::Awake),
flags: AtomicTaskFlags::new(TaskFlags::empty()),
mutable_state: RwLock::new(TaskMutableState {
clear_child_tid: UserRef::default(),
signals: SignalState::with_mask(signal_mask),
exit_status: None,
scheduler_policy,
uts_ns,
no_new_privs,
oom_score_adj: Default::default(),
seccomp_filters,
robust_list_head,
timerslack_ns,
// The default timerslack is set to the current timerslack of the creating thread.
default_timerslack_ns: timerslack_ns,
ptrace: None,
}),
persistent_info: TaskPersistentInfoState::new(id, pid, command, creds, exit_signal),
seccomp_filter_state,
};
#[cfg(any(test, debug_assertions))]
{
let _l1 = task.read();
let _l2 = task.persistent_info.lock();
}
task
}
/// Access mutable state with a read lock.
pub fn read(&self) -> RwLockReadGuard<'_, TaskMutableState> {
self.mutable_state.read()
}
/// Access mutable state with a write lock.
pub fn write(&self) -> RwLockWriteGuard<'_, TaskMutableState> {
self.mutable_state.write()
}
pub fn add_file(&self, file: FileHandle, flags: FdFlags) -> Result<FdNumber, Errno> {
self.files.add_with_flags(self, file, flags)
}
pub fn creds(&self) -> Credentials {
self.persistent_info.lock().creds.clone()
}
pub fn exit_signal(&self) -> Option<Signal> {
self.persistent_info.lock().exit_signal
}
pub fn set_creds(&self, creds: Credentials) {
self.persistent_info.lock().creds = creds;
}
pub fn fs(&self) -> &Arc<FsContext> {
self.fs.get().unwrap()
}
pub fn set_fs(&self, fs: Arc<FsContext>) {
self.fs.set(fs).map_err(|_| "Cannot set fs multiple times").unwrap();
}
// See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
pub fn check_ptrace_access_mode(
&self,
mode: PtraceAccessMode,
target: &Task,
) -> Result<(), Errno> {
// (1) If the calling thread and the target thread are in the same
// thread group, access is always allowed.
if self.thread_group.leader == target.thread_group.leader {
return Ok(());
}
// (2) If the access mode specifies PTRACE_MODE_FSCREDS, then, for
// the check in the next step, employ the caller's filesystem
// UID and GID. (As noted in credentials(7), the filesystem
// UID and GID almost always have the same values as the
// corresponding effective IDs.)
//
// Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
// so use the caller's real UID and GID for the checks in the
// next step. (Most APIs that check the caller's UID and GID
// use the effective IDs. For historical reasons, the
// PTRACE_MODE_REALCREDS check uses the real IDs instead.)
let creds = self.creds();
let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
let fscred = creds.as_fscred();
(fscred.uid, fscred.gid)
} else if mode.contains(PTRACE_MODE_REALCREDS) {
(creds.uid, creds.gid)
} else {
unreachable!();
};
// (3) Deny access if neither of the following is true:
//
// - The real, effective, and saved-set user IDs of the target
// match the caller's user ID, and the real, effective, and
// saved-set group IDs of the target match the caller's
// group ID.
//
// - The caller has the CAP_SYS_PTRACE capability in the user
// namespace of the target.
let target_creds = target.creds();
if !creds.has_capability(CAP_SYS_PTRACE)
&& !(target_creds.uid == uid
&& target_creds.euid == uid
&& target_creds.saved_uid == uid
&& target_creds.gid == gid
&& target_creds.egid == gid
&& target_creds.saved_gid == gid)
{
return error!(EPERM);
}
// (4) Deny access if the target process "dumpable" attribute has a
// value other than 1 (SUID_DUMP_USER; see the discussion of
// PR_SET_DUMPABLE in prctl(2)), and the caller does not have
// the CAP_SYS_PTRACE capability in the user namespace of the
// target process.
let dumpable = *target.mm.dumpable.lock();
if dumpable != DumpPolicy::User && !creds.has_capability(CAP_SYS_PTRACE) {
return error!(EPERM);
}
// TODO: Implement the LSM security_ptrace_access_check() interface.
//
// (5) The kernel LSM security_ptrace_access_check() interface is
// invoked to see if ptrace access is permitted.
// (6) If access has not been denied by any of the preceding steps,
// then access is allowed.
Ok(())
}
pub fn create_init_child_process(
kernel: &Arc<Kernel>,
binary_path: &CString,
) -> Result<CurrentTask, Errno> {
let weak_init = kernel.pids.read().get_task(1);
let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?;
let task = Self::create_process_without_parent(
kernel,
binary_path.clone(),
Some(init_task.fs().fork()),
)?;
{
let mut init_writer = init_task.thread_group.write();
let mut new_process_writer = task.thread_group.write();
new_process_writer.parent = Some(init_task.thread_group.clone());
init_writer.children.insert(task.id, Arc::downgrade(&task.thread_group));
}
// A child process created via fork(2) inherits its parent's
// resource limits. Resource limits are preserved across execve(2).
let limits = init_task.thread_group.limits.lock().clone();
*task.thread_group.limits.lock() = limits;
Ok(task)
}
/// Create a task that is the leader of a new thread group.
///
/// This function creates an underlying Zircon process to host the new
/// task.
///
/// `fs` should only be None for the init task, and set_fs should be called as soon as the
/// FsContext is built.
pub fn create_process_without_parent(
kernel: &Arc<Kernel>,
initial_name: CString,
fs: Option<Arc<FsContext>>,
) -> Result<CurrentTask, Errno> {
let initial_name_bytes = initial_name.as_bytes().to_owned();
Self::create_task(kernel, initial_name, fs, |pid, process_group| {
create_zircon_process(
kernel,
None,
pid,
process_group,
SignalActions::default(),
&initial_name_bytes,
)
})
}
/// Create a task that runs inside the kernel.
///
/// There is no underlying Zircon process to host the task.
pub fn create_kernel_task(
kernel: &Arc<Kernel>,
initial_name: CString,
fs: Arc<FsContext>,
) -> Result<CurrentTask, Errno> {
Self::create_task(kernel, initial_name, Some(fs), |pid, process_group| {
let process = zx::Process::from(zx::Handle::invalid());
let memory_manager = Arc::new(MemoryManager::new_empty());
let thread_group = ThreadGroup::new(
kernel.clone(),
process,
None,
pid,
process_group,
SignalActions::default(),
);
Ok(TaskInfo { thread: None, thread_group, memory_manager })
})
}
fn create_task<F>(
kernel: &Arc<Kernel>,
initial_name: CString,
root_fs: Option<Arc<FsContext>>,
task_info_factory: F,
) -> Result<CurrentTask, Errno>
where
F: FnOnce(i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
{
let mut pids = kernel.pids.write();
let pid = pids.allocate_pid();
let process_group = ProcessGroup::new(pid, None);
pids.add_process_group(&process_group);
let TaskInfo { thread, thread_group, memory_manager } =
task_info_factory(pid, process_group.clone())?;
process_group.insert(&thread_group);
// > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
// > nanoseconds (50 microseconds). The timer slack value is inherited by a child created
// > via fork(2), and is preserved across execve(2).
// https://man7.org/linux/man-pages/man2/prctl.2.html
let default_timerslack = 50_000;
let current_task = CurrentTask::new(Self::new(
pid,
initial_name,
thread_group,
thread,
FdTable::default(),
memory_manager,
root_fs,
Credentials::root(),
Arc::clone(&kernel.default_abstract_socket_namespace),
Arc::clone(&kernel.default_abstract_vsock_namespace),
None,
Default::default(),
None,
Default::default(),
kernel.root_uts_ns.clone(),
false,
SeccompState::default(),
SeccompFilterContainer::default(),
UserAddress::NULL.into(),
default_timerslack,
));
release_on_error!(current_task, (), {
let temp_task = current_task.temp_task();
current_task.thread_group.add(&temp_task)?;
pids.add_task(&temp_task);
pids.add_thread_group(&current_task.thread_group);
Ok(())
});
Ok(current_task)
}
/// Create a kernel task in the same ThreadGroup as the given `system_task`.
///
/// There is no underlying Zircon thread to host the task.
pub fn create_kernel_thread(
system_task: &CurrentTask,
initial_name: CString,
) -> Result<CurrentTask, Errno> {
let mut pids = system_task.kernel().pids.write();
let pid = pids.allocate_pid();
let scheduler_policy;
let uts_ns;
let default_timerslack_ns;
{
let state = system_task.read();
scheduler_policy = state.scheduler_policy;
uts_ns = state.uts_ns.clone();
default_timerslack_ns = state.default_timerslack_ns;
}
let current_task = CurrentTask::new(Self::new(
pid,
initial_name,
Arc::clone(&system_task.thread_group),
None,
FdTable::default(),
Arc::clone(&system_task.mm),
Some(Arc::clone(system_task.fs())),
system_task.creds(),
Arc::clone(&system_task.abstract_socket_namespace),
Arc::clone(&system_task.abstract_vsock_namespace),
None,
Default::default(),
None,
scheduler_policy,
uts_ns,
false,
SeccompState::default(),
SeccompFilterContainer::default(),
UserAddress::NULL.into(),
default_timerslack_ns,
));
release_on_error!(current_task, (), {
let temp_task = current_task.temp_task();
current_task.thread_group.add(&temp_task)?;
pids.add_task(&temp_task);
Ok(())
});
Ok(current_task)
}
/// Clone this task.
///
/// Creates a new task object that shares some state with this task
/// according to the given flags.
///
/// Used by the clone() syscall to create both processes and threads.
///
/// The exit signal is broken out from the flags parameter like clone3() rather than being
/// bitwise-ORed like clone().
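///
/// A hedged sketch of a pthread_create-style clone (the flag set is illustrative):
///
/// ```ignore
/// let child = current_task.clone_task(
///     (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD) as u64,
///     None,               // thread-like children deliver no exit signal
///     UserRef::default(), // user_parent_tid, unused without CLONE_PARENT_SETTID
///     UserRef::default(), // user_child_tid, unused without CLONE_CHILD_*TID
/// )?;
/// ```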
pub fn clone_task(
&self,
flags: u64,
child_exit_signal: Option<Signal>,
user_parent_tid: UserRef<pid_t>,
user_child_tid: UserRef<pid_t>,
) -> Result<CurrentTask, Errno> {
// TODO: Implement more flags.
const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
| CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
| CLONE_THREAD
| CLONE_SYSVSEM
| CLONE_SETTLS
| CLONE_PARENT_SETTID
| CLONE_CHILD_CLEARTID
| CLONE_CHILD_SETTID
| CLONE_VFORK) as u64;
// A mask with all valid flags set, because we want to return a different error code for an
// invalid flag vs an unimplemented flag. Shifting the largest valid flag up by one and
// subtracting 1 yields a mask with that flag and all lower flags set.
const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;
// CLONE_SETTLS is implemented by sys_clone.
let clone_thread = flags & (CLONE_THREAD as u64) != 0;
let clone_vm = flags & (CLONE_VM as u64) != 0;
let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
let new_uts = flags & (CLONE_NEWUTS as u64) != 0;
if clone_sighand && !clone_vm {
return error!(EINVAL);
}
if clone_thread && !clone_sighand {
return error!(EINVAL);
}
if flags & !VALID_FLAGS != 0 {
return error!(EINVAL);
}
if clone_vm && !clone_thread {
// TODO(fxbug.dev/114813) Implement CLONE_VM for child processes (not just child
// threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
// used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
// always OK.
//
// CLONE_VM is primarily an optimization to avoid making a copy-on-write version of a
// process' VM that will be immediately replaced with a call to exec(). The main users
// (libc and language runtimes) don't actually rely on the memory being shared between
// the two processes. And the vfork() man page explicitly allows vfork() to be
// implemented as fork() which is what we do here.
if !clone_vfork {
log_warn!("CLONE_VM set without CLONE_THREAD. Ignoring CLONE_VM (doing a fork).");
}
} else if clone_thread && !clone_vm {
not_implemented!("CLONE_THREAD without CLONE_VM is not implemented");
return error!(ENOSYS);
}
if flags & !IMPLEMENTED_FLAGS != 0 {
not_implemented!("clone does not implement flags: 0x{:x}", flags & !IMPLEMENTED_FLAGS);
return error!(ENOSYS);
}
let fs = if flags & (CLONE_FS as u64) != 0 { self.fs().clone() } else { self.fs().fork() };
let files =
if flags & (CLONE_FILES as u64) != 0 { self.files.clone() } else { self.files.fork() };
let kernel = &self.thread_group.kernel;
let mut pids = kernel.pids.write();
let pid;
let command;
let creds;
let scheduler_policy;
let uts_ns;
let no_new_privs;
let seccomp_filters;
let robust_list_head = UserAddress::NULL.into();
let child_signal_mask;
let timerslack_ns;
let TaskInfo { thread, thread_group, memory_manager } = {
// Make sure to drop these locks ASAP to avoid inversion
let thread_group_state = self.thread_group.write();
let state = self.read();
no_new_privs = state.no_new_privs;
seccomp_filters = state.seccomp_filters.clone();
child_signal_mask = state.signals.mask();
pid = pids.allocate_pid();
command = self.command();
creds = self.creds();
// TODO(https://fxbug.dev/297961833) implement SCHED_RESET_ON_FORK
scheduler_policy = state.scheduler_policy;
timerslack_ns = state.timerslack_ns;
uts_ns = if new_uts {
if !self.creds().has_capability(CAP_SYS_ADMIN) {
return error!(EPERM);
}
// Fork the UTS namespace of the existing task.
let new_uts_ns = state.uts_ns.read().clone();
Arc::new(RwLock::new(new_uts_ns))
} else {
// Inherit the UTS of the existing task.
state.uts_ns.clone()
};
if clone_thread {
let thread_group = self.thread_group.clone();
let memory_manager = self.mm.clone();
TaskInfo { thread: None, thread_group, memory_manager }
} else {
// Drop the lock on this task before entering `create_zircon_process`, because it will
// take a lock on the new thread group, and locks on thread groups have a higher
// priority than locks on the task in the thread group.
std::mem::drop(state);
let signal_actions = if clone_sighand {
self.thread_group.signal_actions.clone()
} else {
self.thread_group.signal_actions.fork()
};
let process_group = thread_group_state.process_group.clone();
create_zircon_process(
kernel,
Some(thread_group_state),
pid,
process_group,
signal_actions,
command.as_bytes(),
)?
}
};
// Only create the vfork event when the caller requested CLONE_VFORK.
let vfork_event = if flags & (CLONE_VFORK as u64) != 0 {
Some(Arc::new(zx::Event::create()))
} else {
None
};
let child = CurrentTask::new(Self::new(
pid,
command,
thread_group,
thread,
files,
memory_manager,
Some(fs),
creds,
self.abstract_socket_namespace.clone(),
self.abstract_vsock_namespace.clone(),
child_exit_signal,
child_signal_mask,
vfork_event,
scheduler_policy,
uts_ns,
no_new_privs,
SeccompState::from(&self.seccomp_filter_state),
seccomp_filters,
robust_list_head,
timerslack_ns,
));
release_on_error!(child, (), {
let child_task = TempRef::from(&child.task);
// Drop the pids lock as soon as possible after creating the child. Destroying the child
// and removing it from the pids table itself requires the pids lock, so if an early exit
// takes place we have a self deadlock.
pids.add_task(&child_task);
if !clone_thread {
pids.add_thread_group(&child.thread_group);
}
std::mem::drop(pids);
// Child lock must be taken before this lock. Drop the lock on the task, take a writable
// lock on the child and take the current state back.
#[cfg(any(test, debug_assertions))]
{
// Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
// will trigger the tracing-mutex at the right call site.
if !clone_thread {
let _l1 = self.thread_group.read();
let _l2 = child.thread_group.read();
}
}
if clone_thread {
self.thread_group.add(&child_task)?;
} else {
child.thread_group.add(&child_task)?;
let mut child_state = child.write();
let state = self.read();
child_state.signals.alt_stack = state.signals.alt_stack;
child_state.signals.set_mask(state.signals.mask());
self.mm.snapshot_to(&child.mm)?;
}
if flags & (CLONE_PARENT_SETTID as u64) != 0 {
self.mm.write_object(user_parent_tid, &child.id)?;
}
if flags & (CLONE_CHILD_CLEARTID as u64) != 0 {
child.write().clear_child_tid = user_child_tid;
}
if flags & (CLONE_CHILD_SETTID as u64) != 0 {
child.mm.write_object(user_child_tid, &child.id)?;
}
Ok(())
});
Ok(child)
}
/// Signals the vfork event, if any, to unblock waiters.
pub fn signal_vfork(&self) {
if let Some(event) = &self.vfork_event {
if let Err(status) = event.signal_handle(Signals::NONE, Signals::USER_0) {
log_warn!("Failed to set vfork signal {status}");
}
};
}
/// Blocks the caller until the task has exited or executed execve(). This is used to implement
/// vfork() and clone(... CLONE_VFORK, ...). The task must have been created with CLONE_VFORK.
pub fn wait_for_execve(&self, task_to_wait: WeakRef<Task>) -> Result<(), Errno> {
let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone());
if let Some(event) = event {
event
.wait_handle(zx::Signals::USER_0, zx::Time::INFINITE)
.map_err(|status| from_status_like_fdio!(status))?;
}
Ok(())
}
/// `flags` contains only the flags as in clone3(), and does not use the low 8 bits for the
/// exit signal as in clone().
#[cfg(test)]
pub fn clone_task_for_test(
&self,
flags: u64,
exit_signal: Option<Signal>,
) -> crate::testing::AutoReleasableTask {
let result = self
.clone_task(flags, exit_signal, UserRef::default(), UserRef::default())
.expect("failed to create task in test");
// Take the lock on thread group and task in the correct order to ensure any wrong ordering
// will trigger the tracing-mutex at the right call site.
{
let _l1 = result.thread_group.read();
let _l2 = result.read();
}
result.into()
}
/// If needed, clear the child tid for this task.
///
/// Userspace can ask us to clear the child tid and issue a futex wake at
/// the child tid address when we tear down a task. For example, bionic
/// uses this mechanism to implement pthread_join. The thread that calls
/// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake
/// them up here to let them know the thread is done.
fn clear_child_tid_if_needed(&self) -> Result<(), Errno> {
let mut state = self.write();
let user_tid = state.clear_child_tid;
if !user_tid.is_null() {
let zero: pid_t = 0;
self.mm.write_object(user_tid, &zero)?;
self.thread_group.kernel.shared_futexes.wake(
self,
user_tid.addr(),
usize::MAX,
FUTEX_BITSET_MATCH_ANY,
)?;
state.clear_child_tid = UserRef::default();
}
Ok(())
}
pub fn get_task(&self, pid: pid_t) -> WeakRef<Task> {
self.thread_group.kernel.pids.read().get_task(pid)
}
pub fn get_pid(&self) -> pid_t {
self.thread_group.leader
}
pub fn get_tid(&self) -> pid_t {
self.id
}
pub fn read_argv(&self) -> Result<Vec<FsString>, Errno> {
let (argv_start, argv_end) = {
let mm_state = self.mm.state.read();
(mm_state.argv_start, mm_state.argv_end)
};
self.mm.read_nul_delimited_c_string_list(argv_start, argv_end - argv_start)
}
pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> {
self.thread
.read()
.as_ref()
.ok_or_else(|| errno!(EINVAL))?
.get_runtime_info()
.map_err(|status| from_status_like_fdio!(status))
}
pub fn as_ucred(&self) -> ucred {
let creds = self.creds();
ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
}
pub fn as_fscred(&self) -> FsCred {
self.creds().as_fscred()
}
pub fn can_signal(&self, target: &Task, unchecked_signal: &UncheckedSignal) -> bool {
// If both the tasks share a thread group the signal can be sent. This is not documented
// in kill(2) because kill does not support task-level granularity in signal sending.
if self.thread_group == target.thread_group {
return true;
}
let self_creds = self.creds();
if self_creds.has_capability(CAP_KILL) {
return true;
}
if self_creds.has_same_uid(&target.creds()) {
return true;
}
// TODO(lindkvist): This check should also verify that the sessions are the same.
if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
return true;
}
false
}
/// Interrupts the current task.
///
/// This will interrupt any blocking syscalls if the task is blocked on one.
/// The signal_state of the task must not be locked.
pub fn interrupt(&self) {
self.read().signals.run_state.wake();
if let Some(thread) = self.thread.read().as_ref() {
crate::execution::interrupt_thread(thread);
}
}
pub fn command(&self) -> CString {
self.persistent_info.lock().command.clone()
}
pub fn set_command_name(&self, name: CString) {
// Set the name on the Linux thread.
if let Some(thread) = self.thread.read().as_ref() {
set_zx_name(thread, name.as_bytes());
}
// If this is the thread group leader, use this name for the process too.
if self.get_pid() == self.get_tid() {
set_zx_name(&self.thread_group.process, name.as_bytes());
}
// Truncate to 16 bytes, including null byte.
let bytes = name.to_bytes();
self.persistent_info.lock().command = if bytes.len() > 15 {
// SAFETY: Substring of a CString will contain no null bytes.
CString::new(&bytes[..15]).unwrap()
} else {
name
};
}
pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> {
self.seccomp_filter_state.set(&state)
}
pub fn state_code(&self) -> TaskStateCode {
let status = self.read();
if status.exit_status.is_some() {
TaskStateCode::Zombie
} else if status.signals.run_state.is_blocked() {
TaskStateCode::Sleeping
} else {
TaskStateCode::Running
}
}
pub fn time_stats(&self) -> TaskTimeStats {
let info = match &*self.thread.read() {
Some(thread) => zx::Task::get_runtime_info(thread).expect("Failed to get thread stats"),
None => return TaskTimeStats::default(),
};
TaskTimeStats {
user_time: zx::Duration::from_nanos(info.cpu_time),
// TODO(fxbug.dev/127682): How can we calculate system time?
system_time: zx::Duration::default(),
}
}
/// Sets the stop state (per set_stopped), and also notifies all listeners,
/// including the parent process if appropriate.
pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
self.set_stopped(&mut *self.write(), stopped, siginfo);
if !stopped.is_in_progress() {
let parent = self.thread_group.read().parent.clone();
if let Some(parent) = parent {
parent.write().child_status_waiters.notify_all();
}
}
}
/// If the task is stopping, set it as stopped. Returns whether the caller
/// should stop.
pub fn finalize_stop_state(&self) -> bool {
// Stopping because the thread group is stopping.
// Try to flip to GroupStopped - will fail if we shouldn't.
if self.thread_group.set_stopped(StopState::GroupStopped, None, true)
== StopState::GroupStopped
{
// stopping because the thread group has stopped
self.set_stopped(&mut *self.write(), StopState::GroupStopped, None);
return true;
}
// Stopping because the task is stopping
let stopped = self.load_stopped();
if stopped.is_stopping_or_stopped() {
if let Ok(stopped) = stopped.finalize() {
self.set_stopped_and_notify(stopped, None);
}
return true;
}
false
}
/// If waking, promotes from waking to awake. If not waking, makes the waiter wait
/// asynchronously until woken. Returns true if woken.
pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
let group_state = self.thread_group.read();
let task_state = self.write();
// If we've woken up, return.
let task_stop_state = self.load_stopped();
let group_stop_state = self.thread_group.load_stopped();
if (task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
|| task_stop_state.is_waking_or_awake()
{
let new_state = if task_stop_state.is_waking_or_awake() {
task_stop_state.finalize()
} else {
group_stop_state.finalize()
};
if let Ok(new_state) = new_state {
drop(group_state);
drop(task_state);
self.thread_group.set_stopped(new_state, None, false);
self.set_stopped(&mut *self.write(), new_state, None);
return true;
}
}
if group_stop_state.is_stopped() || task_stop_state.is_stopped() {
group_state.stopped_waiters.wait_async(&waiter);
task_state.wait_on_ptracer(&waiter);
}
false
}
}
impl ReleasableByRef for Task {
type Context<'a> = &'a CurrentTask;
fn release(&self, current_task: &CurrentTask) {
// Disconnect from tracer, if one is present.
let ptracer_pid = self.read().ptrace.as_ref().map(|ptrace| ptrace.pid);
if let Some(ptracer_pid) = ptracer_pid {
if let Some(ProcessEntryRef::Process(tg)) =
self.thread_group.kernel.pids.read().get_process(ptracer_pid)
{
let pid = self.get_pid();
tg.ptracees.lock().remove(&pid);
}
let _ = self.write().set_ptrace(None);
}
self.thread_group.remove(self);
// Release the fd table.
self.files.release(current_task);
self.signal_vfork();
}
}
impl CurrentTask {
fn new(task: Task) -> CurrentTask {
CurrentTask {
task: OwnedRefByRef::new(task),
registers: RegisterState::default(),
extended_pstate: ExtendedPstateState::default(),
syscall_restart_func: None,
}
}
pub fn kernel(&self) -> &Arc<Kernel> {
&self.thread_group.kernel
}
pub fn weak_task(&self) -> WeakRef<Task> {
WeakRef::from(&self.task)
}
pub fn temp_task(&self) -> TempRef<'_, Task> {
TempRef::from(&self.task)
}
pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
&mut self,
f: impl FnOnce(&mut CurrentTask) -> Result<R, Errno> + Send + Sync + 'static,
) {
self.syscall_restart_func = Some(Box::new(|current_task| Ok(f(current_task)?.into())));
}
/// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
///
/// Signals are dequeued prior to the original signal mask being restored. This is done by the
/// signal machinery in the syscall dispatch loop.
///
/// The returned result is the result returned from the wait function.
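///
/// A minimal sketch of a sigsuspend-style wait (the waiter is illustrative):
///
/// ```ignore
/// current_task.wait_with_temporary_mask(temporary_mask, |current_task| {
///     waiter.wait(current_task)
/// })?;
/// ```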
pub fn wait_with_temporary_mask<F, T>(
&mut self,
signal_mask: SigSet,
wait_function: F,
) -> Result<T, Errno>
where
F: FnOnce(&CurrentTask) -> Result<T, Errno>,
{
{
let mut state = self.write();
self.set_flags(&mut *state, TaskFlags::TEMPORARY_SIGNAL_MASK, true);
state.signals.set_temporary_mask(signal_mask);
}
wait_function(self)
}
/// Set the RunState for the current task to the given value and then call the given callback.
///
/// When the callback is done, the run_state is restored to `RunState::Running`.
///
/// This function is typically used just before blocking the current task on some operation.
/// The given `run_state` registers the mechanism for interrupting the blocking operation with
/// the task and the given `callback` actually blocks the task.
///
/// This function can only be called in the `RunState::Running` state and cannot set the
/// run state to `RunState::Running`. For this reason, this function cannot be reentered.
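///
/// A hedged sketch (the event and the `block_on_event` helper are illustrative):
///
/// ```ignore
/// current_task.run_in_state(RunState::Event(event.clone()), || {
///     // Block here; `Task::interrupt` wakes the event, and the wait
///     // surfaces as EINTR through the normal error path.
///     block_on_event(&event)
/// })?;
/// ```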
pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
where
F: FnOnce() -> Result<T, Errno>,
{
assert_ne!(run_state, RunState::Running);
{
let mut state = self.write();
assert!(!state.signals.run_state.is_blocked());
if state.signals.is_any_pending() {
return error!(EINTR);
}
state.signals.run_state = run_state.clone();
}
let result = callback();
{
let mut state = self.write();
assert_eq!(
state.signals.run_state, run_state,
"SignalState run state changed while waiting!"
);
state.signals.run_state = RunState::Running;
};
result
}
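/// Blocks the current task on `guard`'s event until `deadline`.
///
/// Returns `EINTR` if the wait was interrupted and `ETIMEDOUT` if the deadline expired
/// before the event was signaled.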
pub fn block_until(&self, guard: EventWaitGuard<'_>, deadline: zx::Time) -> Result<(), Errno> {
self.run_in_state(RunState::Event(guard.event().clone()), move || {
guard.block_until(deadline).map_err(|e| match e {
WakeReason::Interrupted => errno!(EINTR),
WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
})
})
}
/// Determines the namespace node indicated by `dir_fd`.
///
/// Returns the namespace node and the path to use relative to that node.
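///
/// A minimal sketch (paths are byte strings throughout this file):
///
/// ```ignore
/// // Absolute path: resolved against the root; `dir_fd` is ignored.
/// let (dir, remaining) = current_task.resolve_dir_fd(FdNumber::AT_FDCWD, b"/tmp/file")?;
/// ```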
pub fn resolve_dir_fd<'a>(
&self,
dir_fd: FdNumber,
mut path: &'a FsStr,
) -> Result<(NamespaceNode, &'a FsStr), Errno> {
let dir = if !path.is_empty() && path[0] == b'/' {
path = &path[1..];
self.fs().root()
} else if dir_fd == FdNumber::AT_FDCWD {
self.fs().cwd()
} else {
let file = self.files.get(dir_fd)?;
file.name.clone()
};
if !path.is_empty() {
if !dir.entry.node.is_dir() {
return error!(ENOTDIR);
}
dir.check_access(self, Access::EXEC)?;
}
Ok((dir, path))
}
/// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
///
/// Returns a FileHandle but does not install the FileHandle in the FdTable
/// for this task.
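///
/// A hedged sketch (assumes the usual `OpenFlags::RDONLY` constant):
///
/// ```ignore
/// let file = current_task.open_file(b"/dev/null", OpenFlags::RDONLY)?;
/// ```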
pub fn open_file(&self, path: &FsStr, flags: OpenFlags) -> Result<FileHandle, Errno> {
if flags.contains(OpenFlags::CREAT) {
// In order to support OpenFlags::CREAT we would need to take a
// FileMode argument.
return error!(EINVAL);
}
self.open_file_at(FdNumber::AT_FDCWD, path, flags, FileMode::default())
}
/// Resolves a path for open.
///
/// If the final path component points to a symlink, the symlink is followed (as long as
/// the symlink traversal limit has not been reached).
///
/// If the final path component (after following any symlinks, if enabled) does not exist,
/// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
/// final path component.
///
/// This returns the resolved node, and a boolean indicating whether the node has been created.
fn resolve_open_path(
&self,
context: &mut LookupContext,
dir: NamespaceNode,
path: &FsStr,
mode: FileMode,
flags: OpenFlags,
) -> Result<(NamespaceNode, bool), Errno> {
context.update_for_path(path);
let mut parent_content = context.with(SymlinkMode::Follow);
let (parent, basename) = self.lookup_parent(&mut parent_content, dir, path)?;
context.remaining_follows = parent_content.remaining_follows;
let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
// Lookup the child, without following a symlink or expecting it to be a directory.
let mut child_context = context.with(SymlinkMode::NoFollow);
child_context.must_be_directory = false;
match parent.lookup_child(self, &mut child_context, basename) {
Ok(name) => {
if name.entry.node.is_lnk() {
if context.symlink_mode == SymlinkMode::NoFollow
|| context.remaining_follows == 0
{
if must_create {
// Since `must_create` is set, and a node was found, this returns EEXIST
// instead of ELOOP.
return error!(EEXIST);
}
// A symlink was found, but too many symlink traversals have been
// attempted.
return error!(ELOOP);
}
context.remaining_follows -= 1;
match name.readlink(self)? {
SymlinkTarget::Path(path) => {
let dir = if path[0] == b'/' { self.fs().root() } else { parent };
self.resolve_open_path(context, dir, &path, mode, flags)
}
SymlinkTarget::Node(node) => Ok((node, false)),
}
} else {
if must_create {
return error!(EEXIST);
}
Ok((name, false))
}
}
Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
if context.must_be_directory {
return error!(EISDIR);
}
Ok((
parent.open_create_node(
self,
basename,
mode.with_type(FileMode::IFREG),
DeviceType::NONE,
flags,
)?,
true,
))
}
Err(e) => Err(e),
}
}
/// The primary entry point for opening files relative to a task.
///
/// Absolute paths are resolved relative to the root of the FsContext for
/// this task. Relative paths are resolved relative to dir_fd. To resolve
/// relative to the current working directory, pass FdNumber::AT_FDCWD for
/// dir_fd.
///
/// Returns a FileHandle but does not install the FileHandle in the FdTable
/// for this task.
pub fn open_file_at(
&self,
dir_fd: FdNumber,
path: &FsStr,
flags: OpenFlags,
mode: FileMode,
) -> Result<FileHandle, Errno> {
if path.is_empty() {
return error!(ENOENT);
}
let (dir, path) = self.resolve_dir_fd(dir_fd, path)?;
self.open_namespace_node_at(dir, path, flags, mode)
}
pub fn open_namespace_node_at(
&self,
dir: NamespaceNode,
path: &FsStr,
flags: OpenFlags,
mode: FileMode,
) -> Result<FileHandle, Errno> {
// 64-bit kernels force the O_LARGEFILE flag to be on.
let mut flags = flags | OpenFlags::LARGEFILE;
if flags.contains(OpenFlags::PATH) {
// When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
// O_DIRECTORY, and O_NOFOLLOW are ignored.
const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
OpenFlags::PATH.bits()
| OpenFlags::CLOEXEC.bits()
| OpenFlags::DIRECTORY.bits()
| OpenFlags::NOFOLLOW.bits(),
);
flags &= ALLOWED_FLAGS;
}
if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
return error!(EINVAL);
}
let nofollow = flags.contains(OpenFlags::NOFOLLOW);
let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
let symlink_mode =
if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };
let mut context = LookupContext::new(symlink_mode);
context.must_be_directory = flags.contains(OpenFlags::DIRECTORY);
let (name, created) = self.resolve_open_path(&mut context, dir, path, mode, flags)?;
let name = if flags.contains(OpenFlags::TMPFILE) {
name.create_tmpfile(self, mode.with_type(FileMode::IFREG), flags)?
} else {
let mode = name.entry.node.info().mode;
// These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
// file we are opening. With `O_TMPFILE`, that file is the regular file we just
// created rather than the node we found by resolving the path.
//
// For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
// because `must_be_directory` refers to the node we found by resolving the path.
// If that node was not a directory, then `create_tmpfile` will produce an error.
//
// Similarly, we never need to call `truncate` because the `O_TMPFILE` file is newly
// created and therefore already empty.
if nofollow && mode.is_lnk() {
return error!(ELOOP);
}
if mode.is_dir() {
if flags.can_write()
|| flags.contains(OpenFlags::CREAT)
|| flags.contains(OpenFlags::TRUNC)
{
return error!(EISDIR);
}
if flags.contains(OpenFlags::DIRECT) {
return error!(EINVAL);
}
} else if context.must_be_directory {
return error!(ENOTDIR);
}
if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
// You might think we should check file.can_write() at this
// point, which is what the docs suggest, but apparently we
// are supposed to truncate the file if this task can write
// to the underlying node, even if we are opening the file
// as read-only. See OpenTest.CanTruncateReadOnly.
name.truncate(self, 0)?;
}
name
};
        // If the node has been created, the open operation should not verify access rights:
// From <https://man7.org/linux/man-pages/man2/open.2.html>
//
// > Note that mode applies only to future accesses of the newly created file; the
// > open() call that creates a read-only file may well return a read/write file
// > descriptor.
name.open(self, flags, !created)
}
/// A wrapper for FsContext::lookup_parent_at that resolves the given
/// dir_fd to a NamespaceNode.
///
    /// Absolute paths are resolved relative to the root of the FsContext for
    /// this task. Relative paths are resolved relative to dir_fd. To resolve
/// relative to the current working directory, pass FdNumber::AT_FDCWD for
/// dir_fd.
pub fn lookup_parent_at<'a>(
&self,
context: &mut LookupContext,
dir_fd: FdNumber,
path: &'a FsStr,
) -> Result<(NamespaceNode, &'a FsStr), Errno> {
let (dir, path) = self.resolve_dir_fd(dir_fd, path)?;
self.lookup_parent(context, dir, path)
}
/// Lookup the parent of a namespace node.
///
/// Consider using Task::open_file_at or Task::lookup_parent_at rather than
/// calling this function directly.
///
/// This function resolves all but the last component of the given path.
/// The function returns the parent directory of the last component as well
/// as the last component.
///
/// If path is empty, this function returns dir and an empty path.
/// Similarly, if path ends with "." or "..", these components will be
/// returned along with the parent.
///
/// The returned parent might not be a directory.
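    ///
    /// For example, looking up `b"foo/bar/baz"` resolves `foo/bar` and returns
    /// the node for `bar` together with the trailing component `b"baz"`.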
pub fn lookup_parent<'a>(
&self,
context: &mut LookupContext,
dir: NamespaceNode,
path: &'a FsStr,
) -> Result<(NamespaceNode, &'a FsStr), Errno> {
context.update_for_path(path);
let mut current_node = dir;
let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty());
let mut current_path_component = it.next().unwrap_or(b"");
for next_path_component in it {
current_node = current_node.lookup_child(self, context, current_path_component)?;
current_path_component = next_path_component;
}
Ok((current_node, current_path_component))
}
/// Lookup a namespace node.
///
/// Consider using Task::open_file_at or Task::lookup_parent_at rather than
/// calling this function directly.
///
    /// This function resolves all components of the given path, including the
    /// last component.
pub fn lookup_path(
&self,
context: &mut LookupContext,
dir: NamespaceNode,
path: &FsStr,
) -> Result<NamespaceNode, Errno> {
let (parent, basename) = self.lookup_parent(context, dir, path)?;
parent.lookup_child(self, context, basename)
}
/// Lookup a namespace node starting at the root directory.
///
/// Resolves symlinks.
pub fn lookup_path_from_root(&self, path: &FsStr) -> Result<NamespaceNode, Errno> {
let mut context = LookupContext::default();
self.lookup_path(&mut context, self.fs().root(), path)
}
pub fn exec(
&mut self,
executable: FileHandle,
path: CString,
argv: Vec<CString>,
environ: Vec<CString>,
) -> Result<(), Errno> {
// Executable must be a regular file
if !executable.name.entry.node.is_reg() {
return error!(EACCES);
}
// File node must have EXEC mode permissions.
// Note that the ability to execute a file is unrelated to the flags
// used in the `open` call.
executable.name.check_access(self, Access::EXEC)?;
let resolved_elf = resolve_executable(self, executable, path.clone(), argv, environ)?;
// TODO(https://fxbug.dev/132623): Starnix doesn't yet support running exec on a
// multi-thread process.
if self.thread_group.read().tasks_count() > 1 {
not_implemented!("exec on multithread process is not supported");
return error!(EINVAL);
}
if let Err(err) = self.finish_exec(path, resolved_elf) {
// TODO(tbodt): Replace this panic with a log and force a SIGSEGV.
log_warn!("unrecoverable error in exec: {err:?}");
send_signal(
self,
SignalInfo { code: SI_KERNEL as i32, force: true, ..SignalInfo::default(SIGSEGV) },
);
return Err(err);
}
self.signal_vfork();
Ok(())
}
/// After the memory is unmapped, any failure in exec is unrecoverable and results in the
/// process crashing. This function is for that second half; any error returned from this
/// function will be considered unrecoverable.
fn finish_exec(&mut self, path: CString, resolved_elf: ResolvedElf) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after the process image is replaced. See get_robust_list(2).
self.notify_robust_list();
self.mm
.exec(resolved_elf.file.name.clone())
.map_err(|status| from_status_like_fdio!(status))?;
let start_info = load_executable(self, resolved_elf, &path)?;
let regs: zx_thread_state_general_regs_t = start_info.into();
self.registers = regs.into();
{
let mut state = self.write();
let mut persistent_info = self.persistent_info.lock();
state.signals.alt_stack = None;
state.robust_list_head = UserAddress::NULL.into();
// TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
// capabilities accordingly.
persistent_info.creds.exec();
}
self.extended_pstate.reset();
self.thread_group.signal_actions.reset_for_exec();
// TODO: The termination signal is reset to SIGCHLD.
// TODO(https://fxbug.dev/132623): All threads other than the calling thread are destroyed.
// TODO: The file descriptor table is unshared, undoing the effect of
// the CLONE_FILES flag of clone(2).
//
// To make this work, we can put the files in an RwLock and then cache
// a reference to the files on the CurrentTask. That will let
// functions that have CurrentTask access the FdTable without
// needing to grab the read-lock.
//
// For now, we do not implement that behavior.
self.files.exec();
// TODO: POSIX timers are not preserved.
self.thread_group.write().did_exec = true;
        // Get the basename of the path, which will be used as the name displayed
        // with `prctl(PR_GET_NAME)` and `/proc/self/stat`.
let basename = if let Some(idx) = memchr::memrchr(b'/', path.to_bytes()) {
// SAFETY: Substring of a CString will contain no null bytes.
CString::new(&path.to_bytes()[idx + 1..]).unwrap()
} else {
path
};
self.set_command_name(basename);
crate::logging::set_current_task_info(self);
Ok(())
}
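    /// Installs a seccomp filter for this task, as for
    /// `seccomp(SECCOMP_SET_MODE_FILTER, flags, args)`: `bpf_filter` is the
    /// user address of a `sock_fprog` describing the classic-BPF program, and
    /// `flags` accepts the `SECCOMP_FILTER_FLAG_*` bits handled below.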
pub fn add_seccomp_filter(
&mut self,
bpf_filter: UserAddress,
flags: u32,
) -> Result<SyscallResult, Errno> {
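        // `sock_fprog` is the classic-BPF program header: an instruction count
        // plus a user pointer to the `sock_filter` array.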
let fprog: sock_fprog = self.mm.read_object(UserRef::new(bpf_filter))?;
if u32::from(fprog.len) > BPF_MAXINSNS || fprog.len == 0 {
return Err(errno!(EINVAL));
}
let code: Vec<sock_filter> =
self.read_objects_to_vec(fprog.filter.into(), fprog.len as usize)?;
let new_filter = Arc::new(SeccompFilter::from_cbpf(
&code,
self.thread_group.next_seccomp_filter_id.fetch_add(1, Ordering::SeqCst),
flags & SECCOMP_FILTER_FLAG_LOG != 0,
)?);
let mut maybe_fd: Option<FdNumber> = None;
if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
let mut task_state = self.mutable_state.write();
maybe_fd = Some(task_state.seccomp_filters.create_listener(self)?);
}
        // We take the process lock here because no thread in the process may
        // change while a TSYNC is in progress; any changes must therefore be
        // made while holding the process lock.
let state = self.thread_group.write();
if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes the filters for all threads in the current
            // process to the current thread's filters.
            // We collect the filters for the current task upfront to avoid
            // acquiring the task's lock repeatedly below.
let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();
// For TSYNC to work, all of the other thread filters in this process have to
// be a prefix of this thread's filters, and none of them can be in
// strict mode.
let tasks = state.tasks().collect::<Vec<_>>();
for task in &tasks {
if task.id == self.id {
continue;
}
let other_task_state = task.mutable_state.read();
// Target threads cannot be in SECCOMP_MODE_STRICT
if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
return Self::seccomp_tsync_error(task.id, flags);
}
// Target threads' filters must be a subsequence of this thread's
if !other_task_state.seccomp_filters.can_sync_to(&filters) {
return Self::seccomp_tsync_error(task.id, flags);
}
}
// Now that we're sure we're allowed to do so, add the filter to all threads.
filters.add_filter(new_filter, fprog.len)?;
for task in &tasks {
let mut other_task_state = task.mutable_state.write();
other_task_state.enable_no_new_privs();
other_task_state.seccomp_filters = filters.clone();
task.set_seccomp_state(SeccompStateValue::UserDefined)?;
}
} else {
let mut task_state = self.mutable_state.write();
task_state.seccomp_filters.add_filter(new_filter, fprog.len)?;
self.set_seccomp_state(SeccompStateValue::UserDefined)?;
}
if let Some(fd) = maybe_fd {
Ok(fd.into())
} else {
Ok(().into())
}
}
pub fn run_seccomp_filters(
&mut self,
syscall: &Syscall,
) -> Option<Result<SyscallResult, Errno>> {
profile_duration!("RunSeccompFilters");
// Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
// from user-defined seccomp filters.
if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
return SeccompState::do_strict(self, syscall);
}
// Run user-defined seccomp filters
let result = self.mutable_state.read().seccomp_filters.run_all(self, syscall);
SeccompState::do_user_defined(result, self, syscall)
}
fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
        // By default, TSYNC indicates failure by returning the id of the first
        // thread that could not sync, rather than by returning -1 and setting
        // errno. However, if TSYNC_ESRCH is set, it returns ESRCH instead. This
        // prevents a conflict with the fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
        // makes seccomp return an fd on success.
if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 {
Err(errno!(ESRCH))
} else {
Ok(id.into())
}
}
    // Notify all futexes in the robust list. The robust list is in user space,
    // so we walk it carefully, returning quietly whenever we fail to read an
    // entry.
// TODO(fxbug.dev/128610): This only sets the FUTEX_OWNER_DIED bit; it does
// not wake up a waiter.
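    // Each entry is a user-space `robust_list` node in a circular,
    // singly-linked list headed by `robust_list_head`; the futex word for an
    // entry lives at `entry_address + futex_offset`.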
pub fn notify_robust_list(&self) {
let task_state = self.write();
let robust_list_addr = task_state.robust_list_head.addr();
if robust_list_addr == UserAddress::NULL {
// No one has called set_robust_list.
return;
}
        let Ok(head) = self.mm.read_object(task_state.robust_list_head) else {
            return;
        };
let offset = head.futex_offset;
let mut entries_count = 0;
let mut curr_ptr = head.list.next;
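        // Bound the walk with ROBUST_LIST_LIMIT so that a corrupted or
        // malicious circular list cannot make us loop forever.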
while curr_ptr.addr != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
            let Ok(curr) = self.mm.read_object(curr_ptr.into()) else {
                return;
            };
            let Some(futex_base) = curr_ptr.addr.addr.checked_add_signed(offset) else {
                return;
            };
let futex_ref = UserRef::<u32>::new(UserAddress::from(futex_base));
// TODO(b/299096230): Futex modification should be atomic.
            let Ok(futex) = self.mm.read_object(futex_ref) else {
                return;
            };
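            // Only mark futexes owned by this thread: the owner's tid is stored
            // in the low bits of the futex word.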
if (futex & FUTEX_TID_MASK) as i32 == self.id {
let owner_died = FUTEX_OWNER_DIED | futex;
if self.write_object(futex_ref, &owner_died).is_err() {
return;
}
}
curr_ptr = curr.next;
entries_count += 1;
}
}
    /// Returns this thread's `SeccompNotifier`, if one has been installed.
pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
self.mutable_state.write().seccomp_filters.notifier.clone()
}
pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
self.mutable_state.write().seccomp_filters.notifier = notifier;
}
/// Processes a Zircon exception associated with this task.
///
    /// If the exception is fully handled, returns `ExceptionResult::Handled`.
    /// If the exception produces a signal, returns `ExceptionResult::Signal`
    /// with the corresponding `SignalInfo`.
pub fn process_exception(&self, report: &zx::sys::zx_exception_report_t) -> ExceptionResult {
match report.header.type_ {
zx::sys::ZX_EXCP_GENERAL => match get_signal_for_general_exception(&report.context) {
Some(sig) => ExceptionResult::Signal(SignalInfo::default(sig)),
None => {
log_warn!("Unrecognized general exception: {:?}", report);
ExceptionResult::Signal(SignalInfo::default(SIGILL))
}
},
zx::sys::ZX_EXCP_FATAL_PAGE_FAULT => {
                // A page fault may be resolved by extending a growsdown mapping to
                // cover the faulting address. Ask the memory manager whether it can
                // extend a mapping to cover the faulting address; if it finds a
                // mapping that exists or that can be extended to cover this address,
                // mark the exception as handled so that the instruction can be
                // retried. Otherwise let the regular handling proceed.
                // We should only attempt growth on a not-present fault, and we should
                // only extend if the access type matches the protection on the
                // GROWSDOWN mapping.
let decoded = decode_page_fault_exception_report(report);
if decoded.not_present {
match self.mm.extend_growsdown_mapping_to_address(
UserAddress::from(decoded.faulting_address),
decoded.is_write,
) {
Ok(true) => {
return ExceptionResult::Handled;
}
Err(e) => {
log_warn!("Error handling page fault: {e}")
}
_ => {}
}
}
// For this exception type, the synth_code field in the exception report's context is the
// error generated by the page fault handler. For us this is used to distinguish between a
// segmentation violation and a bus error. Unfortunately this detail is not documented in
// Zircon's public documentation and is only described in the architecture-specific exception
// definitions such as:
// zircon/kernel/arch/x86/include/arch/x86.h
// zircon/kernel/arch/arm64/include/arch/arm64.h
let signo = match report.context.synth_code as zx::sys::zx_status_t {
zx::sys::ZX_ERR_OUT_OF_RANGE => SIGBUS,
_ => SIGSEGV,
};
ExceptionResult::Signal(SignalInfo::new(
signo,
SI_KERNEL,
SignalDetail::SigFault { addr: decoded.faulting_address },
))
}
zx::sys::ZX_EXCP_UNDEFINED_INSTRUCTION => {
ExceptionResult::Signal(SignalInfo::default(SIGILL))
}
zx::sys::ZX_EXCP_UNALIGNED_ACCESS => {
ExceptionResult::Signal(SignalInfo::default(SIGBUS))
}
zx::sys::ZX_EXCP_SW_BREAKPOINT => ExceptionResult::Signal(SignalInfo::default(SIGTRAP)),
_ => {
log_error!("Unknown exception {:?}", report);
ExceptionResult::Signal(SignalInfo::default(SIGSEGV))
}
}
}
}
impl MemoryAccessor for Task {
fn read_memory_to_slice(&self, addr: UserAddress, bytes: &mut [u8]) -> Result<(), Errno> {
self.mm.read_memory_to_slice(addr, bytes)
}
fn read_memory_partial_to_slice(
&self,
addr: UserAddress,
bytes: &mut [u8],
) -> Result<usize, Errno> {
self.mm.read_memory_partial_to_slice(addr, bytes)
}
fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
self.mm.write_memory(addr, bytes)
}
fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
self.mm.write_memory_partial(addr, bytes)
}
fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
self.mm.zero(addr, length)
}
}
impl fmt::Debug for Task {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"{}:{}[{}]",
self.thread_group.leader,
self.id,
self.persistent_info.lock().command.to_string_lossy()
)
}
}
impl fmt::Debug for CurrentTask {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.task.fmt(f)
}
}
impl cmp::PartialEq for Task {
fn eq(&self, other: &Self) -> bool {
let ptr: *const Task = self;
let other_ptr: *const Task = other;
ptr == other_ptr
}
}
impl cmp::Eq for Task {}
impl From<&Task> for FsCred {
fn from(t: &Task) -> FsCred {
t.creds().into()
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::testing::*;
#[::fuchsia::test]
async fn test_tid_allocation() {
let (kernel, current_task) = create_kernel_and_task();
assert_eq!(current_task.get_tid(), 1);
let another_current = create_task(&kernel, "another-task");
// tid 2 gets assigned to kthreadd.
assert_eq!(another_current.get_tid(), 3);
let pids = kernel.pids.read();
assert_eq!(pids.get_task(1).upgrade().unwrap().get_tid(), 1);
assert_eq!(pids.get_task(2).upgrade().unwrap().get_tid(), 2);
assert_eq!(pids.get_task(3).upgrade().unwrap().get_tid(), 3);
assert!(pids.get_task(4).upgrade().is_none());
}
#[::fuchsia::test]
async fn test_clone_pid_and_parent_pid() {
let (_kernel, current_task) = create_kernel_and_task();
let thread = current_task
.clone_task_for_test((CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64, Some(SIGCHLD));
assert_eq!(current_task.get_pid(), thread.get_pid());
assert_ne!(current_task.get_tid(), thread.get_tid());
assert_eq!(current_task.thread_group.leader, thread.thread_group.leader);
let child_task = current_task.clone_task_for_test(0, Some(SIGCHLD));
assert_ne!(current_task.get_pid(), child_task.get_pid());
assert_ne!(current_task.get_tid(), child_task.get_tid());
assert_eq!(current_task.get_pid(), child_task.thread_group.read().get_ppid());
}
#[::fuchsia::test]
async fn test_root_capabilities() {
let (_kernel, current_task) = create_kernel_and_task();
assert!(current_task.creds().has_capability(CAP_SYS_ADMIN));
current_task.set_creds(Credentials::with_ids(1, 1));
assert!(!current_task.creds().has_capability(CAP_SYS_ADMIN));
}
#[::fuchsia::test]
async fn test_clone_rlimit() {
let (_kernel, current_task) = create_kernel_and_task();
let prev_fsize = current_task.thread_group.get_rlimit(Resource::FSIZE);
assert_ne!(prev_fsize, 10);
current_task
.thread_group
.limits
.lock()
.set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 });
let current_fsize = current_task.thread_group.get_rlimit(Resource::FSIZE);
assert_eq!(current_fsize, 10);
let child_task = current_task.clone_task_for_test(0, Some(SIGCHLD));
let child_fsize = child_task.thread_group.get_rlimit(Resource::FSIZE);
assert_eq!(child_fsize, 10)
}
}