| // Copyright 2021 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| use crate::mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor}; |
| use crate::mutable_state::{state_accessor, state_implementation}; |
| use crate::ptrace::{ |
| AtomicStopState, PtraceEvent, PtraceEventData, PtraceState, PtraceStatus, StopState, |
| }; |
| use crate::security; |
| use crate::signals::{KernelSignal, RunState, SignalDetail, SignalInfo, SignalState}; |
| use crate::task::memory_attribution::MemoryAttributionLifecycleEvent; |
| use crate::task::tracing::KoidPair; |
| use crate::task::{ |
| AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, EventHandler, Kernel, |
| NormalPriority, PidTable, ProcessEntryRef, ProcessExitInfo, RealtimePriority, SchedulerState, |
| SchedulingPolicy, SeccompFilterContainer, SeccompState, SeccompStateValue, ThreadGroup, |
| ThreadGroupKey, ThreadState, UtsNamespaceHandle, WaitCanceler, Waiter, ZombieProcess, |
| }; |
| use crate::vfs::{FdTable, FsContext, FsNodeHandle, FsString}; |
| use bitflags::bitflags; |
| use fuchsia_rcu::rcu_arc::RcuArc; |
| use fuchsia_rcu::rcu_option_arc::RcuOptionArc; |
| use fuchsia_rcu::rcu_ptr::RcuReadGuard; |
| use macro_rules_attribute::apply; |
| use starnix_logging::{log_warn, set_zx_name}; |
| use starnix_registers::{HeapRegs, RegisterStorageEnum}; |
| use starnix_sync::{ |
| LockBefore, Locked, Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, TaskRelease, |
| TerminalLock, |
| }; |
| use starnix_task_command::TaskCommand; |
| use starnix_types::arch::ArchWidth; |
| use starnix_types::ownership::{OwnedRef, Releasable, ReleaseGuard, TempRef, WeakRef}; |
| use starnix_types::stats::TaskTimeStats; |
| use starnix_uapi::auth::{Credentials, FsCred}; |
| use starnix_uapi::errors::Errno; |
| use starnix_uapi::signals::{SIGCHLD, SigSet, Signal, sigaltstack_contains_pointer}; |
| use starnix_uapi::user_address::{ |
| ArchSpecific, MappingMultiArchUserRef, UserAddress, UserCString, UserRef, |
| }; |
| use starnix_uapi::{ |
| CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, CLD_TRAPPED, |
| FUTEX_BITSET_MATCH_ANY, errno, error, from_status_like_fdio, pid_t, sigaction_t, sigaltstack, |
| tid_t, uapi, |
| }; |
| use std::collections::VecDeque; |
| use std::mem::MaybeUninit; |
| use std::ops::Deref; |
| use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; |
| use std::sync::{Arc, Weak}; |
| use std::{cmp, fmt}; |
| use zx::{Signals, Task as _}; |
| |
| #[derive(Clone, Debug, Eq, PartialEq)] |
| pub enum ExitStatus { |
| Exit(u8), |
| Kill(SignalInfo), |
| CoreDump(SignalInfo), |
| // The second field for Stop and Continue contains the type of ptrace stop |
| // event that made it stop / continue, if applicable (PTRACE_EVENT_STOP, |
| // PTRACE_EVENT_FORK, etc) |
| Stop(SignalInfo, PtraceEvent), |
| Continue(SignalInfo, PtraceEvent), |
| } |
| impl ExitStatus { |
| /// Converts the given exit status to a status code suitable for returning from wait syscalls. |
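| /// |
| /// A minimal illustrative sketch (not a doctest) of how this encoding lines up with the |
| /// classic `WIFEXITED` / `WEXITSTATUS` macros: |
| /// |
| /// ```ignore |
| /// let status = ExitStatus::Exit(42).wait_status(); |
| /// assert_eq!(status & 0x7f, 0); // WIFEXITED: the low 7 bits are zero |
| /// assert_eq!((status >> 8) & 0xff, 42); // WEXITSTATUS: the exit code lives in bits 8..16 |
| /// ``` |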
| pub fn wait_status(&self) -> i32 { |
| match self { |
| ExitStatus::Exit(status) => (*status as i32) << 8, |
| ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32, |
| ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80, |
| ExitStatus::Continue(siginfo, trace_event) => { |
| let trace_event_val = *trace_event as u32; |
| if trace_event_val != 0 { |
| (siginfo.signal.number() as i32) | (trace_event_val << 16) as i32 |
| } else { |
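| // 0xffff is the conventional "continued" wait status recognized by WIFCONTINUED. |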
| 0xffff |
| } |
| } |
| ExitStatus::Stop(siginfo, trace_event) => { |
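| // 0x7f in the low byte marks a stopped process (WIFSTOPPED); the stop signal |
| // goes in bits 8..16 and any ptrace event in the bits above. |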
| let trace_event_val = *trace_event as u32; |
| (0x7f + ((siginfo.signal.number() as i32) << 8)) | (trace_event_val << 16) as i32 |
| } |
| } |
| } |
| |
| pub fn signal_info_code(&self) -> i32 { |
| match self { |
| ExitStatus::Exit(_) => CLD_EXITED as i32, |
| ExitStatus::Kill(_) => CLD_KILLED as i32, |
| ExitStatus::CoreDump(_) => CLD_DUMPED as i32, |
| ExitStatus::Stop(_, _) => CLD_STOPPED as i32, |
| ExitStatus::Continue(_, _) => CLD_CONTINUED as i32, |
| } |
| } |
| |
| pub fn signal_info_status(&self) -> i32 { |
| match self { |
| ExitStatus::Exit(status) => *status as i32, |
| ExitStatus::Kill(siginfo) |
| | ExitStatus::CoreDump(siginfo) |
| | ExitStatus::Continue(siginfo, _) |
| | ExitStatus::Stop(siginfo, _) => siginfo.signal.number() as i32, |
| } |
| } |
| } |
| |
| bitflags! { |
| #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] |
| pub struct TaskFlags: u8 { |
| const EXITED = 0x1; |
| const SIGNALS_AVAILABLE = 0x2; |
| const TEMPORARY_SIGNAL_MASK = 0x4; |
| /// Whether the executor should dump the stack of this task when it exits. |
| /// Currently used to implement ExitStatus::CoreDump. |
| const DUMP_ON_EXIT = 0x8; |
| const KERNEL_SIGNALS_AVAILABLE = 0x10; |
| } |
| } |
| |
| pub struct AtomicTaskFlags { |
| flags: AtomicU8, |
| } |
| |
| impl AtomicTaskFlags { |
| fn new(flags: TaskFlags) -> Self { |
| Self { flags: AtomicU8::new(flags.bits()) } |
| } |
| |
| fn load(&self, ordering: Ordering) -> TaskFlags { |
| let flags = self.flags.load(ordering); |
| // We only ever store values from a `TaskFlags`. |
| TaskFlags::from_bits_retain(flags) |
| } |
| |
| fn swap(&self, flags: TaskFlags, ordering: Ordering) -> TaskFlags { |
| let flags = self.flags.swap(flags.bits(), ordering); |
| // We only ever store values from a `TaskFlags`. |
| TaskFlags::from_bits_retain(flags) |
| } |
| } |
| |
| /// This contains thread state that tracers can inspect and modify. It is |
| /// captured when a thread stops, and optionally copied back (if dirty) when a |
| /// thread starts again. An alternative implementation would involve the |
| /// tracers acting on thread state directly; however, that would require sharing |
| /// CurrentTask structures across multiple threads, which goes against the |
| /// intent of the design of CurrentTask. |
| pub struct CapturedThreadState { |
| /// The thread state of the traced task. This is copied out when the thread |
| /// stops. |
| pub thread_state: ThreadState<HeapRegs>, |
| |
| /// Indicates that the last ptrace operation changed the thread state, so it |
| /// should be written back to the original thread. |
| pub dirty: bool, |
| } |
| |
| impl ArchSpecific for CapturedThreadState { |
| fn is_arch32(&self) -> bool { |
| self.thread_state.is_arch32() |
| } |
| } |
| |
| #[derive(Debug)] |
| pub struct RobustList { |
| pub next: RobustListPtr, |
| } |
| |
| pub type RobustListPtr = |
| MappingMultiArchUserRef<RobustList, uapi::robust_list, uapi::arch32::robust_list>; |
| |
| impl From<uapi::robust_list> for RobustList { |
| fn from(robust_list: uapi::robust_list) -> Self { |
| Self { next: RobustListPtr::from(robust_list.next) } |
| } |
| } |
| |
| #[cfg(target_arch = "aarch64")] |
| impl From<uapi::arch32::robust_list> for RobustList { |
| fn from(robust_list: uapi::arch32::robust_list) -> Self { |
| Self { next: RobustListPtr::from(robust_list.next) } |
| } |
| } |
| |
| #[derive(Debug)] |
| pub struct RobustListHead { |
| pub list: RobustList, |
| pub futex_offset: isize, |
| } |
| |
| pub type RobustListHeadPtr = |
| MappingMultiArchUserRef<RobustListHead, uapi::robust_list_head, uapi::arch32::robust_list_head>; |
| |
| impl From<uapi::robust_list_head> for RobustListHead { |
| fn from(robust_list_head: uapi::robust_list_head) -> Self { |
| Self { |
| list: robust_list_head.list.into(), |
| futex_offset: robust_list_head.futex_offset as isize, |
| } |
| } |
| } |
| |
| #[cfg(target_arch = "aarch64")] |
| impl From<uapi::arch32::robust_list_head> for RobustListHead { |
| fn from(robust_list_head: uapi::arch32::robust_list_head) -> Self { |
| Self { |
| list: robust_list_head.list.into(), |
| futex_offset: robust_list_head.futex_offset as isize, |
| } |
| } |
| } |
| |
| pub struct TaskMutableState { |
| // See https://man7.org/linux/man-pages/man2/set_tid_address.2.html |
| pub clear_child_tid: UserRef<tid_t>, |
| |
| /// Signal handler related state. This is grouped together for when atomicity is needed during |
| /// signal sending and delivery. |
| signals: SignalState, |
| |
| /// Internal signals that have a higher priority than a regular signal. |
| /// |
| /// Storing in a separate queue outside of `SignalState` ensures the internal signals will |
| /// never be ignored or masked when dequeuing. Their higher priority ensures that no user |
| /// signal can jump ahead of them, e.g. via ptrace, which delays signal delivery. |
| /// |
| /// This design is not about observable consequence, but about convenient implementation. |
| kernel_signals: VecDeque<KernelSignal>, |
| |
| /// The exit status that this task exited with. |
| exit_status: Option<ExitStatus>, |
| |
| /// Desired scheduler state for the task. |
| pub scheduler_state: SchedulerState, |
| |
| /// The UTS namespace assigned to this thread. |
| /// |
| /// This field is kept in the mutable state because the UTS namespace of a thread |
| /// can be forked using `clone()` or `unshare()` syscalls. |
| /// |
| /// We use UtsNamespaceHandle because the UTS properties can be modified |
| /// by any other thread that shares this namespace. |
| pub uts_ns: UtsNamespaceHandle, |
| |
| /// Bit that determines whether a newly started program can have privileges its parent does |
| /// not have. See Documentation/prctl/no_new_privs.txt in the Linux kernel for details. |
| /// Note that Starnix does not currently implement the relevant privileges (e.g., |
| /// setuid/setgid binaries). So, you can set this, but it does nothing other than get |
| /// propagated to children. |
| /// |
| /// The documentation indicates that this can only ever be set to |
| /// true, and it cannot be reverted to false. Accessor methods |
| /// for this field ensure this property. |
| no_new_privs: bool, |
| |
| /// Userspace hint about how to adjust the OOM score for this process. |
| pub oom_score_adj: i32, |
| |
| /// List of currently installed seccomp_filters |
| pub seccomp_filters: SeccompFilterContainer, |
| |
| /// A pointer to the head of the robust futex list of this thread in |
| /// userspace. See get_robust_list(2) |
| pub robust_list_head: RobustListHeadPtr, |
| |
| /// The timer slack used to group timer expirations for the calling thread. |
| /// |
| /// Timers may expire up to `timerslack_ns` late, but never early. |
| /// |
| /// If this value is 0, the task's default timerslack is used. |
| pub timerslack_ns: u64, |
| |
| /// The default value for `timerslack_ns`. This value cannot change during the lifetime of a |
| /// task. |
| /// |
| /// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant |
| /// across tasks. |
| pub default_timerslack_ns: u64, |
| |
| /// Information that a tracer needs to communicate with this process, if it |
| /// is being traced. |
| pub ptrace: Option<Box<PtraceState>>, |
| |
| /// Information that a tracer needs to inspect this process. |
| pub captured_thread_state: Option<Box<CapturedThreadState>>, |
| } |
| |
| impl TaskMutableState { |
| pub fn no_new_privs(&self) -> bool { |
| self.no_new_privs |
| } |
| |
| /// Sets the value of no_new_privs to true. It is an error to set |
| /// it to anything else. |
| pub fn enable_no_new_privs(&mut self) { |
| self.no_new_privs = true; |
| } |
| |
| pub fn get_timerslack<T: zx::Timeline>(&self) -> zx::Duration<T> { |
| zx::Duration::from_nanos(self.timerslack_ns as i64) |
| } |
| |
| /// Sets the current timerslack of the task to `ns`. |
| /// |
| /// If `ns` is zero, the current timerslack gets reset to the task's default timerslack. |
| pub fn set_timerslack_ns(&mut self, ns: u64) { |
| if ns == 0 { |
| self.timerslack_ns = self.default_timerslack_ns; |
| } else { |
| self.timerslack_ns = ns; |
| } |
| } |
| |
| pub fn is_ptraced(&self) -> bool { |
| self.ptrace.is_some() |
| } |
| |
| pub fn is_ptrace_listening(&self) -> bool { |
| self.ptrace.as_ref().is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Listening) |
| } |
| |
| pub fn ptrace_on_signal_consume(&mut self) -> bool { |
| self.ptrace.as_mut().is_some_and(|ptrace: &mut Box<PtraceState>| { |
| if ptrace.stop_status.is_continuing() { |
| ptrace.stop_status = PtraceStatus::Default; |
| false |
| } else { |
| true |
| } |
| }) |
| } |
| |
| pub fn notify_ptracers(&mut self) { |
| if let Some(ptrace) = &self.ptrace { |
| ptrace.tracer_waiters().notify_all(); |
| } |
| } |
| |
| pub fn wait_on_ptracer(&self, waiter: &Waiter) { |
| if let Some(ptrace) = &self.ptrace { |
| ptrace.tracee_waiters.wait_async(waiter); |
| } |
| } |
| |
| pub fn notify_ptracees(&mut self) { |
| if let Some(ptrace) = &self.ptrace { |
| ptrace.tracee_waiters.notify_all(); |
| } |
| } |
| |
| pub fn take_captured_state(&mut self) -> Option<Box<CapturedThreadState>> { |
| self.captured_thread_state.take() |
| } |
| |
| pub fn copy_state_from(&mut self, current_task: &CurrentTask) { |
| self.captured_thread_state = Some(Box::new(CapturedThreadState { |
| thread_state: current_task.thread_state.extended_snapshot::<HeapRegs>(), |
| dirty: false, |
| })); |
| } |
| |
| /// Returns the task's currently active signal mask. |
| pub fn signal_mask(&self) -> SigSet { |
| self.signals.mask() |
| } |
| |
| /// Returns true if `signal` is currently blocked by this task's signal mask. |
| pub fn is_signal_masked(&self, signal: Signal) -> bool { |
| self.signals.mask().has_signal(signal) |
| } |
| |
| /// Returns true if `signal` is blocked by the saved signal mask. |
| /// |
| /// Note that the current signal mask may still not be blocking the signal. |
| pub fn is_signal_masked_by_saved_mask(&self, signal: Signal) -> bool { |
| self.signals.saved_mask().is_some_and(|mask| mask.has_signal(signal)) |
| } |
| |
| /// Removes the currently active temporary signal mask and restores the |
| /// previously active signal mask. |
| pub fn restore_signal_mask(&mut self) { |
| self.signals.restore_mask(); |
| } |
| |
| /// Returns true if the task's current `RunState` is blocked. |
| pub fn is_blocked(&self) -> bool { |
| self.signals.run_state.is_blocked() |
| } |
| |
| /// Sets the task's `RunState` to `run_state`. |
| pub fn set_run_state(&mut self, run_state: RunState) { |
| self.signals.run_state = run_state; |
| } |
| |
| pub fn run_state(&self) -> RunState { |
| self.signals.run_state.clone() |
| } |
| |
| pub fn on_signal_stack(&self, stack_pointer_register: u64) -> bool { |
| self.signals |
| .alt_stack |
| .map(|signal_stack| sigaltstack_contains_pointer(&signal_stack, stack_pointer_register)) |
| .unwrap_or(false) |
| } |
| |
| pub fn set_sigaltstack(&mut self, stack: Option<sigaltstack>) { |
| self.signals.alt_stack = stack; |
| } |
| |
| pub fn sigaltstack(&self) -> Option<sigaltstack> { |
| self.signals.alt_stack |
| } |
| |
| pub fn wait_on_signal(&mut self, waiter: &Waiter) { |
| self.signals.signal_wait.wait_async(waiter); |
| } |
| |
| pub fn signals_mut(&mut self) -> &mut SignalState { |
| &mut self.signals |
| } |
| |
| pub fn wait_on_signal_fd_events( |
| &self, |
| waiter: &Waiter, |
| mask: SigSet, |
| handler: EventHandler, |
| ) -> WaitCanceler { |
| self.signals.signal_wait.wait_async_signal_mask(waiter, mask, handler) |
| } |
| |
| pub fn notify_signal_waiters(&self, signal: &Signal) { |
| self.signals.signal_wait.notify_signal(signal); |
| } |
| |
| /// Thaws the task if it has been frozen. |
| pub fn thaw(&mut self) { |
| if let RunState::Frozen(waiter) = self.run_state() { |
| waiter.notify(); |
| } |
| } |
| |
| pub fn is_frozen(&self) -> bool { |
| matches!(self.run_state(), RunState::Frozen(_)) |
| } |
| |
| #[cfg(test)] |
| pub fn kernel_signals_for_test(&self) -> &VecDeque<KernelSignal> { |
| &self.kernel_signals |
| } |
| } |
| |
| #[apply(state_implementation!)] |
| impl TaskMutableState<Base = Task> { |
| pub fn set_stopped( |
| &mut self, |
| stopped: StopState, |
| siginfo: Option<SignalInfo>, |
| current_task: Option<&CurrentTask>, |
| event: Option<PtraceEventData>, |
| ) { |
| if stopped.ptrace_only() && self.ptrace.is_none() { |
| return; |
| } |
| |
| if self.base.load_stopped().is_illegal_transition(stopped) { |
| return; |
| } |
| |
| // TODO(https://g-issues.fuchsia.dev/issues/306438676): When the task can be |
| // stopped inside user code, the task will need to be either restarted or |
| // stopped here. |
| self.store_stopped(stopped); |
| if stopped.is_stopped() { |
| if let Some(ref current_task) = current_task { |
| self.copy_state_from(current_task); |
| } |
| } |
| if let Some(ptrace) = &mut self.ptrace { |
| ptrace.set_last_signal(siginfo); |
| ptrace.set_last_event(event); |
| } |
| if stopped == StopState::Waking || stopped == StopState::ForceWaking { |
| self.notify_ptracees(); |
| } |
| if !stopped.is_in_progress() { |
| self.notify_ptracers(); |
| } |
| } |
| |
| /// Enqueues a signal at the back of the task's signal queue. |
| pub fn enqueue_signal(&mut self, signal: SignalInfo) { |
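| // Keep SIGNALS_AVAILABLE in sync so `should_check_for_pending_signals` can be |
| // answered without taking the task's state lock. |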
| self.signals.enqueue(signal); |
| self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending()); |
| } |
| |
| /// Enqueues the signal, allowing the signal to skip straight to the front of the task's queue. |
| /// |
| /// `enqueue_signal` is the more common API to use. |
| /// |
| /// Note that this will not guarantee that the signal is dequeued before any process-directed |
| /// signals. |
| pub fn enqueue_signal_front(&mut self, signal: SignalInfo) { |
| // Insert at the front of the task-directed queue (assumes a front-insertion |
| // variant on `SignalState`); plain `enqueue` would make this identical to |
| // `enqueue_signal`, contradicting the doc comment above. |
| self.signals.enqueue_front(signal); |
| self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending()); |
| } |
| |
| /// Sets the current signal mask of the task. |
| pub fn set_signal_mask(&mut self, mask: SigSet) { |
| self.signals.set_mask(mask); |
| self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending()); |
| } |
| |
| /// Sets a temporary signal mask for the task. |
| /// |
| /// This mask should be removed by a matching call to `restore_signal_mask`. |
| pub fn set_temporary_signal_mask(&mut self, mask: SigSet) { |
| self.signals.set_temporary_mask(mask); |
| self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending()); |
| } |
| |
| /// Returns the number of pending signals for this task, without considering the signal mask. |
| pub fn pending_signal_count(&self) -> usize { |
| self.signals.num_queued() + self.base.thread_group().num_signals_queued() |
| } |
| |
| /// Returns `true` if `signal` is pending for this task, without considering the signal mask. |
| pub fn has_signal_pending(&self, signal: Signal) -> bool { |
| self.signals.has_queued(signal) || self.base.thread_group().has_signal_queued(signal) |
| } |
| |
| /// Prepares a SignalInfo to be sent to the tracer, if any. |
| pub fn prepare_signal_info( |
| &mut self, |
| stopped: StopState, |
| ) -> Option<(Weak<ThreadGroup>, SignalInfo)> { |
| if !stopped.is_stopped() { |
| return None; |
| } |
| |
| if let Some(ptrace) = &self.ptrace { |
| if let Some(last_signal) = ptrace.get_last_signal_ref() { |
| let signal_info = SignalInfo::new( |
| SIGCHLD, |
| CLD_TRAPPED as i32, |
| SignalDetail::SIGCHLD { |
| pid: self.base.tid, |
| uid: self.base.real_creds().uid, |
| status: last_signal.signal.number() as i32, |
| }, |
| ); |
| |
| return Some((ptrace.core_state.thread_group.clone(), signal_info)); |
| } |
| } |
| |
| None |
| } |
| |
| pub fn set_ptrace(&mut self, tracer: Option<Box<PtraceState>>) -> Result<(), Errno> { |
| if tracer.is_some() && self.ptrace.is_some() { |
| return error!(EPERM); |
| } |
| |
| if tracer.is_none() { |
| // Handle the case where this is called while the thread group is being released. |
| if let Ok(tg_stop_state) = self.base.thread_group().load_stopped().as_in_progress() { |
| self.set_stopped(tg_stop_state, None, None, None); |
| } |
| } |
| self.ptrace = tracer; |
| Ok(()) |
| } |
| |
| pub fn can_accept_ptrace_commands(&mut self) -> bool { |
| !self.base.load_stopped().is_waking_or_awake() |
| && self.is_ptraced() |
| && !self.is_ptrace_listening() |
| } |
| |
| fn store_stopped(&mut self, state: StopState) { |
| // We don't actually use the guard but we require it to enforce that the |
| // caller holds the thread group's mutable state lock (identified by |
| // mutable access to the thread group's mutable state). |
| |
| self.base.stop_state.store(state, Ordering::Relaxed) |
| } |
| |
| pub fn update_flags(&mut self, clear: TaskFlags, set: TaskFlags) { |
| // We don't actually use the guard but we require it to enforce that the |
| // caller holds the task's mutable state lock (identified by mutable |
| // access to the task's mutable state). |
| |
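| // `clear ^ set == clear | set` holds exactly when `clear` and `set` are disjoint, |
| // i.e. no flag is both cleared and set in the same call. |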
| debug_assert_eq!(clear ^ set, clear | set); |
| let observed = self.base.flags(); |
| let swapped = self.base.flags.swap((observed | set) & !clear, Ordering::Relaxed); |
| debug_assert_eq!(swapped, observed); |
| } |
| |
| pub fn set_flags(&mut self, flag: TaskFlags, v: bool) { |
| let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) }; |
| |
| self.update_flags(clear, set); |
| } |
| |
| pub fn set_exit_status(&mut self, status: ExitStatus) { |
| self.set_flags(TaskFlags::EXITED, true); |
| self.exit_status = Some(status); |
| } |
| |
| pub fn set_exit_status_if_not_already(&mut self, status: ExitStatus) { |
| self.set_flags(TaskFlags::EXITED, true); |
| self.exit_status.get_or_insert(status); |
| } |
| |
| /// The set of pending signals for the task, including the signals pending for the thread |
| /// group. |
| pub fn pending_signals(&self) -> SigSet { |
| self.signals.pending() | self.base.thread_group().get_pending_signals() |
| } |
| |
| /// The set of pending signals for the task specifically, not including the signals pending |
| /// for the thread group. |
| pub fn task_specific_pending_signals(&self) -> SigSet { |
| self.signals.pending() |
| } |
| |
| /// Returns true if any currently pending signal is allowed by `mask`. |
| pub fn is_any_signal_allowed_by_mask(&self, mask: SigSet) -> bool { |
| self.signals.is_any_allowed_by_mask(mask) |
| || self.base.thread_group().is_any_signal_allowed_by_mask(mask) |
| } |
| |
| /// Returns whether or not a signal is pending for this task, taking the current |
| /// signal mask into account. |
| pub fn is_any_signal_pending(&self) -> bool { |
| let mask = self.signal_mask(); |
| self.signals.is_any_pending() |
| || self.base.thread_group().is_any_signal_allowed_by_mask(mask) |
| } |
| |
| /// Returns the next pending signal that passes `predicate`. |
| fn take_next_signal_where<F>(&mut self, predicate: F) -> Option<SignalInfo> |
| where |
| F: Fn(&SignalInfo) -> bool, |
| { |
| if let Some(signal) = self.base.thread_group().take_next_signal_where(&predicate) { |
| Some(signal) |
| } else { |
| let s = self.signals.take_next_where(&predicate); |
| self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending()); |
| s |
| } |
| } |
| |
| /// Removes and returns the next pending `signal` for this task. |
| /// |
| /// Returns `None` if `siginfo` is a blocked signal, or no such signal is pending. |
| pub fn take_specific_signal(&mut self, siginfo: SignalInfo) -> Option<SignalInfo> { |
| let signal_mask = self.signal_mask(); |
| if signal_mask.has_signal(siginfo.signal) { |
| return None; |
| } |
| |
| let predicate = |s: &SignalInfo| s.signal == siginfo.signal; |
| self.take_next_signal_where(predicate) |
| } |
| |
| /// Removes and returns a pending signal that is unblocked by the current signal mask. |
| /// |
| /// Returns `None` if there are no unblocked signals pending. |
| pub fn take_any_signal(&mut self) -> Option<SignalInfo> { |
| self.take_signal_with_mask(self.signal_mask()) |
| } |
| |
| /// Removes and returns a pending signal that is unblocked by `signal_mask`. |
| /// |
| /// Returns `None` if there are no signals pending that are unblocked by `signal_mask`. |
| pub fn take_signal_with_mask(&mut self, signal_mask: SigSet) -> Option<SignalInfo> { |
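| // A forced signal (`s.force`) is deliverable even when blocked by the mask. |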
| let predicate = |s: &SignalInfo| !signal_mask.has_signal(s.signal) || s.force; |
| self.take_next_signal_where(predicate) |
| } |
| |
| /// Enqueues an internal signal at the back of the task's kernel signal queue. |
| pub fn enqueue_kernel_signal(&mut self, signal: KernelSignal) { |
| self.kernel_signals.push_back(signal); |
| self.set_flags(TaskFlags::KERNEL_SIGNALS_AVAILABLE, true); |
| } |
| |
| /// Removes and returns a pending internal signal. |
| /// |
| /// Returns `None` if there are no signals pending. |
| pub fn take_kernel_signal(&mut self) -> Option<KernelSignal> { |
| let signal = self.kernel_signals.pop_front(); |
| if self.kernel_signals.is_empty() { |
| self.set_flags(TaskFlags::KERNEL_SIGNALS_AVAILABLE, false); |
| } |
| signal |
| } |
| |
| #[cfg(test)] |
| pub fn queued_signal_count(&self, signal: Signal) -> usize { |
| self.signals.queued_count(signal) |
| + self.base.thread_group().pending_signals.lock().queued_count(signal) |
| } |
| } |
| |
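| /// The run state of a task, in the same vocabulary as the state codes reported in |
| /// `/proc/<pid>/stat` ('R', 'S', 't', 'Z'). |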
| #[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| pub enum TaskStateCode { |
| // Task is being executed. |
| Running, |
| |
| // Task is waiting for an event. |
| Sleeping, |
| |
| // Tracing stop |
| TracingStop, |
| |
| // Task has exited. |
| Zombie, |
| } |
| |
| impl TaskStateCode { |
| pub fn code_char(&self) -> char { |
| match self { |
| TaskStateCode::Running => 'R', |
| TaskStateCode::Sleeping => 'S', |
| TaskStateCode::TracingStop => 't', |
| TaskStateCode::Zombie => 'Z', |
| } |
| } |
| |
| pub fn name(&self) -> &'static str { |
| match self { |
| TaskStateCode::Running => "running", |
| TaskStateCode::Sleeping => "sleeping", |
| TaskStateCode::TracingStop => "tracing stop", |
| TaskStateCode::Zombie => "zombie", |
| } |
| } |
| } |
| |
| /// The information of the task that needs to be available to the `ThreadGroup` while computing |
| /// which process a wait can target. It is necessary to share this data with the `ThreadGroup` so |
| /// that it remains available while the task is being dropped, when the task can no longer be |
| /// reached through a weak pointer. |
| #[derive(Debug)] |
| pub struct TaskPersistentInfoState { |
| /// Immutable information about the task |
| tid: tid_t, |
| thread_group_key: ThreadGroupKey, |
| |
| /// The command of this task. |
| command: Mutex<TaskCommand>, |
| |
| /// The security credentials for this task. These are only set when the task is the CurrentTask, |
| /// or on task creation. |
| creds: RcuArc<Credentials>, |
| |
| /// A lock for the security credentials. Writers must take the lock; readers that need to |
| /// ensure the credentials do not change may also take the lock. |
| creds_lock: RwLock<()>, |
| } |
| |
| /// Guard for reading locked credentials. |
| pub struct CredentialsReadGuard<'a> { |
| _lock: RwLockReadGuard<'a, ()>, |
| creds: RcuReadGuard<Credentials>, |
| } |
| |
| impl<'a> Deref for CredentialsReadGuard<'a> { |
| type Target = Credentials; |
| |
| fn deref(&self) -> &Self::Target { |
| self.creds.deref() |
| } |
| } |
| |
| /// Guard for writing credentials. No `CredentialsReadGuard` to the same task can concurrently |
| /// exist. |
| pub struct CredentialsWriteGuard<'a> { |
| _lock: RwLockWriteGuard<'a, ()>, |
| creds: &'a RcuArc<Credentials>, |
| } |
| |
| impl<'a> CredentialsWriteGuard<'a> { |
| pub fn update(&mut self, creds: Arc<Credentials>) { |
| self.creds.update(creds); |
| } |
| } |
| |
| impl TaskPersistentInfoState { |
| fn new( |
| tid: tid_t, |
| thread_group_key: ThreadGroupKey, |
| command: TaskCommand, |
| creds: Arc<Credentials>, |
| ) -> TaskPersistentInfo { |
| Arc::new(Self { |
| tid, |
| thread_group_key, |
| command: Mutex::new(command), |
| creds: RcuArc::new(creds), |
| creds_lock: RwLock::new(()), |
| }) |
| } |
| |
| pub fn tid(&self) -> tid_t { |
| self.tid |
| } |
| |
| pub fn pid(&self) -> pid_t { |
| self.thread_group_key.pid() |
| } |
| |
| pub fn command_guard(&self) -> MutexGuard<'_, TaskCommand> { |
| self.command.lock() |
| } |
| |
| /// Snapshots the credentials, returning a short-lived RCU-guarded reference. |
| pub fn real_creds(&self) -> RcuReadGuard<Credentials> { |
| self.creds.read() |
| } |
| |
| /// Snapshots the credentials, returning a new reference. Use this if you need to stash the |
| /// credentials somewhere. |
| pub fn clone_creds(&self) -> Arc<Credentials> { |
| self.creds.to_arc() |
| } |
| |
| /// Returns a read lock on the credentials. This is appropriate if you need to guarantee that |
| /// the Task's credentials will not change during a security-sensitive operation. |
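| /// |
| /// A minimal usage sketch (illustrative; `check_permission` is a hypothetical helper): |
| /// |
| /// ```ignore |
| /// let creds = task.persistent_info.lock_creds(); |
| /// // The guard derefs to `Credentials` and keeps writers out while it is held. |
| /// check_permission(&*creds)?; |
| /// ``` |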
| pub fn lock_creds(&self) -> CredentialsReadGuard<'_> { |
| let lock = self.creds_lock.read(); |
| CredentialsReadGuard { _lock: lock, creds: self.creds.read() } |
| } |
| |
| /// Locks the credentials for writing. |
| /// SAFETY: Only use from CurrentTask, and keep the subjective credentials stored in CurrentTask |
| /// in sync. |
| pub(in crate::task) unsafe fn write_creds(&self) -> CredentialsWriteGuard<'_> { |
| let lock = self.creds_lock.write(); |
| CredentialsWriteGuard { _lock: lock, creds: &self.creds } |
| } |
| } |
| |
| pub type TaskPersistentInfo = Arc<TaskPersistentInfoState>; |
| |
| /// A unit of execution. |
| /// |
| /// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks, |
| /// which have an associated Zircon thread. The Zircon thread switches between restricted mode, |
| /// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix |
| /// code. |
| /// |
| /// Tasks track the resources used by userspace by referencing various objects, such as an |
| /// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these |
| /// objects. In principle, which objects are shared between which tasks can be largely arbitrary, |
| /// but there are common patterns of sharing. For example, tasks created with `pthread_create` |
| /// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by |
| /// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often |
| /// called "processes" by userspace programmers. However, inside the kernel, there is no clear |
| /// definition of a "thread" or a "process". |
| /// |
| /// During boot, the kernel creates the first task, often called `init`. The vast majority of other |
| /// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the |
| /// kernel will create new tasks from whole cloth, either with a corresponding userspace component |
| /// or to represent some background work inside the kernel. |
| /// |
| /// See also `CurrentTask`, which represents the task corresponding to the thread that is currently |
| /// executing. |
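| /// |
| /// As a rough sketch of the sharing patterns above: a userspace "thread" is typically a task |
| /// cloned with flags like `CLONE_THREAD | CLONE_VM | CLONE_SIGHAND` (as exercised by the tests |
| /// at the bottom of this file), while a userspace "process" shares none of these resources. |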
| pub struct Task { |
| /// Weak reference to the `OwnedRef` of this `Task`. This makes it possible to retrieve a |
| /// `TempRef` from a raw `Task`. |
| pub weak_self: WeakRef<Self>, |
| |
| /// A unique identifier for this task. |
| /// |
| /// This value can be read in userspace using `gettid(2)`. In general, this value |
| /// is different from the value returned by `getpid(2)`, which returns the `id` of the leader |
| /// of the `thread_group`. |
| pub tid: tid_t, |
| |
| /// The process key of this task. |
| pub thread_group_key: ThreadGroupKey, |
| |
| /// The kernel to which this task belongs. |
| pub kernel: Arc<Kernel>, |
| |
| /// The thread group to which this task belongs. |
| /// |
| /// The group of tasks in a thread group roughly corresponds to the userspace notion of a |
| /// process. |
| pub thread_group: Arc<ThreadGroup>, |
| |
| /// A handle to the underlying Zircon thread object. |
| /// |
| /// Some tasks lack an underlying Zircon thread. These tasks are used internally by the |
| /// Starnix kernel to track background work, typically on a `kthread`. |
| pub thread: RwLock<Option<Arc<zx::Thread>>>, |
| |
| /// The file descriptor table for this task. |
| /// |
| /// This table can be shared by many tasks. |
| pub files: FdTable, |
| |
| /// The memory manager for this task. This is `None` only for system tasks. |
| pub mm: RcuOptionArc<MemoryManager>, |
| |
| /// The file system for this task. |
| fs: RcuOptionArc<FsContext>, |
| |
| /// The namespace for abstract AF_UNIX sockets for this task. |
| pub abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>, |
| |
| /// The namespace for AF_VSOCK for this task. |
| pub abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>, |
| |
| /// The stop state of the task, distinct from the stop state of the thread group. |
| /// |
| /// Must only be set when the `mutable_state` write lock is held. |
| stop_state: AtomicStopState, |
| |
| /// The flags for the task. |
| /// |
| /// Must only be set when the `mutable_state` write lock is held. |
| flags: AtomicTaskFlags, |
| |
| /// The mutable state of the Task. |
| mutable_state: RwLock<TaskMutableState>, |
| |
| /// The information of the task that needs to be available to the `ThreadGroup` while computing |
| /// which process a wait can target. |
| /// Contains the task identifiers, the command, and the task credentials. |
| /// See `TaskPersistentInfo` for more information. |
| pub persistent_info: TaskPersistentInfo, |
| |
| /// For vfork and clone() with CLONE_VFORK, this is set when the task exits or calls execve(). |
| /// It allows the calling task to block until the fork has been completed. Only populated |
| /// when created with the CLONE_VFORK flag. |
| vfork_event: Option<Arc<zx::Event>>, |
| |
| /// Tells you whether there are currently seccomp filters installed, |
| /// without holding a lock. |
| pub seccomp_filter_state: SeccompState, |
| |
| /// Tells you whether syscall entry / exit is being traced, without holding a lock. |
| pub trace_syscalls: AtomicBool, |
| |
| // The pid directory, so it doesn't have to be generated and thrown away on every access. |
| // See https://fxbug.dev/291962828 for details. |
| pub proc_pid_directory_cache: Mutex<Option<FsNodeHandle>>, |
| |
| /// The Linux Security Modules state for this task. This should be the last member of |
| /// this struct. |
| pub security_state: security::TaskState, |
| } |
| |
| /// The decoded cross-platform parts we care about for page fault exception reports. |
| #[derive(Debug)] |
| pub struct PageFaultExceptionReport { |
| pub faulting_address: u64, |
| pub not_present: bool, // Set when the page fault was due to a not-present page. |
| pub is_write: bool, // Set when the triggering memory operation was a write. |
| pub is_execute: bool, // Set when the triggering memory operation was an execute. |
| } |
| |
| impl Task { |
| pub fn kernel(&self) -> &Arc<Kernel> { |
| &self.kernel |
| } |
| |
| pub fn thread_group(&self) -> &Arc<ThreadGroup> { |
| &self.thread_group |
| } |
| |
| pub fn has_same_address_space(&self, other: Option<&Arc<MemoryManager>>) -> bool { |
| match (self.mm(), other) { |
| (Ok(this), Some(other)) => Arc::ptr_eq(&this, other), |
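| // A task without an address space (e.g. a kthread) only matches `None`. |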
| (Err(_), None) => true, |
| _ => false, |
| } |
| } |
| |
| pub fn flags(&self) -> TaskFlags { |
| self.flags.load(Ordering::Relaxed) |
| } |
| |
| /// When the task exits, if there is a notification that needs to propagate |
| /// to a ptracer, make sure it will propagate. |
| pub fn set_ptrace_zombie(&self, pids: &mut crate::task::PidTable) { |
| let pgid = self.thread_group().read().process_group.leader; |
| let exit_signal = self.thread_group().read().exit_signal.clone(); |
| let mut state = self.write(); |
| state.set_stopped(StopState::ForceAwake, None, None, None); |
| if let Some(ptrace) = &mut state.ptrace { |
| // Add a zombie that the ptracer will notice. |
| ptrace.last_signal_waitable = true; |
| let tracer_pid = ptrace.get_pid(); |
| let tracer_tg = pids.get_thread_group(tracer_pid); |
| if let Some(tracer_tg) = tracer_tg { |
| drop(state); |
| let mut tracer_state = tracer_tg.write(); |
| |
| let exit_status = self.exit_status().unwrap_or_else(|| { |
| starnix_logging::log_error!("Exiting without an exit code."); |
| ExitStatus::Exit(u8::MAX) |
| }); |
| let uid = self.real_creds().uid; |
| let exit_info = ProcessExitInfo { status: exit_status, exit_signal }; |
| let zombie = ZombieProcess { |
| thread_group_key: self.thread_group_key.clone(), |
| pgid, |
| uid, |
| exit_info, |
| // ptrace doesn't need this. |
| time_stats: TaskTimeStats::default(), |
| is_canonical: false, |
| }; |
| |
| tracer_state.zombie_ptracees.add(pids, self.tid, zombie); |
| }; |
| } |
| } |
| |
| /// Disconnects this task from the tracer, if the tracer is still running. |
| pub fn ptrace_disconnect(&mut self, pids: &PidTable) { |
| let mut state = self.write(); |
| let ptracer_pid = state.ptrace.as_ref().map(|ptrace| ptrace.get_pid()); |
| if let Some(ptracer_pid) = ptracer_pid { |
| let _ = state.set_ptrace(None); |
| if let Some(ProcessEntryRef::Process(tg)) = pids.get_process(ptracer_pid) { |
| let tid = self.get_tid(); |
| drop(state); |
| tg.ptracees.lock().remove(&tid); |
| } |
| } |
| } |
| |
| pub fn exit_status(&self) -> Option<ExitStatus> { |
| self.is_exitted().then(|| self.read().exit_status.clone()).flatten() |
| } |
| |
| pub fn is_exitted(&self) -> bool { |
| self.flags().contains(TaskFlags::EXITED) |
| } |
| |
| pub fn load_stopped(&self) -> StopState { |
| self.stop_state.load(Ordering::Relaxed) |
| } |
| |
| /// Upgrade a reference to a Task, returning an ESRCH errno if the reference cannot be borrowed. |
| pub fn from_weak(weak: &WeakRef<Task>) -> Result<TempRef<'_, Task>, Errno> { |
| weak.upgrade().ok_or_else(|| errno!(ESRCH)) |
| } |
| |
| /// Internal function for creating a Task object. Useful when you need to specify the value of |
| /// every field. create_process and create_thread are more likely to be what you want. |
| /// |
| /// Any fields that should be initialized fresh for every task, even if the task was created |
| /// with fork, are initialized to their defaults inside this function. All other fields are |
| /// passed as parameters. |
| #[allow(clippy::let_and_return)] |
| pub fn new( |
| tid: tid_t, |
| command: TaskCommand, |
| thread_group: Arc<ThreadGroup>, |
| thread: Option<zx::Thread>, |
| files: FdTable, |
| mm: Option<Arc<MemoryManager>>, |
| // The only case where fs should be None is when building the initial task that is then |
| // used to build the initial FsContext. |
| fs: Arc<FsContext>, |
| creds: Arc<Credentials>, |
| abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>, |
| abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>, |
| signal_mask: SigSet, |
| kernel_signals: VecDeque<KernelSignal>, |
| vfork_event: Option<Arc<zx::Event>>, |
| scheduler_state: SchedulerState, |
| uts_ns: UtsNamespaceHandle, |
| no_new_privs: bool, |
| seccomp_filter_state: SeccompState, |
| seccomp_filters: SeccompFilterContainer, |
| robust_list_head: RobustListHeadPtr, |
| timerslack_ns: u64, |
| security_state: security::TaskState, |
| ) -> OwnedRef<Self> { |
| let thread_group_key = ThreadGroupKey::from(&thread_group); |
| OwnedRef::new_cyclic(|weak_self| { |
| let task = Task { |
| weak_self, |
| tid, |
| thread_group_key: thread_group_key.clone(), |
| kernel: Arc::clone(&thread_group.kernel), |
| thread_group, |
| thread: RwLock::new(thread.map(Arc::new)), |
| files, |
| mm: RcuOptionArc::new(mm), |
| fs: RcuOptionArc::new(Some(fs)), |
| abstract_socket_namespace, |
| abstract_vsock_namespace, |
| vfork_event, |
| stop_state: AtomicStopState::new(StopState::Awake), |
| flags: AtomicTaskFlags::new(TaskFlags::empty()), |
| mutable_state: RwLock::new(TaskMutableState { |
| clear_child_tid: UserRef::default(), |
| signals: SignalState::with_mask(signal_mask), |
| kernel_signals, |
| exit_status: None, |
| scheduler_state, |
| uts_ns, |
| no_new_privs, |
| oom_score_adj: Default::default(), |
| seccomp_filters, |
| robust_list_head, |
| timerslack_ns, |
| // The default timerslack is set to the current timerslack of the creating thread. |
| default_timerslack_ns: timerslack_ns, |
| ptrace: None, |
| captured_thread_state: None, |
| }), |
| persistent_info: TaskPersistentInfoState::new( |
| tid, |
| thread_group_key, |
| command, |
| creds, |
| ), |
| seccomp_filter_state, |
| trace_syscalls: AtomicBool::new(false), |
| proc_pid_directory_cache: Mutex::new(None), |
| security_state, |
| }; |
| |
| #[cfg(any(test, debug_assertions))] |
| { |
| // Note that `Kernel::pids` is already locked by the caller of `Task::new()`. |
| let _l1 = task.read(); |
| let _l2 = task.persistent_info.lock_creds(); |
| let _l3 = task.persistent_info.command_guard(); |
| } |
| task |
| }) |
| } |
| |
| state_accessor!(Task, mutable_state); |
| |
| /// Returns the real credentials of the task as a short-lived RCU-guarded reference. These |
| /// credentials are used to check permissions for actions performed on the task. If the task |
| /// itself is performing an action, use `CurrentTask::current_creds` instead. This does not |
| /// lock the credentials. |
| pub fn real_creds(&self) -> RcuReadGuard<Credentials> { |
| self.persistent_info.real_creds() |
| } |
| |
| /// Returns a new long-lived reference to the real credentials of the task. These credentials |
| /// are used to check permissions for actions performed on the task. If the task itself is |
| /// performing an action, use `CurrentTask::current_creds` instead. This does not lock the |
| /// credentials. |
| pub fn clone_creds(&self) -> Arc<Credentials> { |
| self.persistent_info.clone_creds() |
| } |
| |
| pub fn ptracer_task(&self) -> WeakRef<Task> { |
| let ptracer = { |
| let state = self.read(); |
| state.ptrace.as_ref().map(|p| p.core_state.pid) |
| }; |
| |
| let Some(ptracer) = ptracer else { |
| return WeakRef::default(); |
| }; |
| |
| self.get_task(ptracer) |
| } |
| |
| pub fn fs(&self) -> Arc<FsContext> { |
| self.fs.to_option_arc().expect("fs must be set") |
| } |
| |
| pub fn has_shared_fs(&self) -> bool { |
| let maybe_fs = self.fs.to_option_arc(); |
| // This check is incorrect because someone else could be holding a temporary Arc to the |
| // FsContext and therefore increasing the strong count. |
| maybe_fs.is_some_and(|fs| Arc::strong_count(&fs) > 2usize) |
| } |
| |
| #[track_caller] |
| pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> { |
| self.mm.to_option_arc().ok_or_else(|| errno!(EINVAL)) |
| } |
| |
| pub fn unshare_fs(&self) { |
| let fs = self.fs().fork(); |
| self.fs.update(Some(fs)); |
| } |
| |
| /// Modify the given elements of the scheduler state with new values and update the |
| /// task's thread's role. |
| pub(crate) fn set_scheduler_policy_priority_and_reset_on_fork( |
| &self, |
| policy: SchedulingPolicy, |
| priority: RealtimePriority, |
| reset_on_fork: bool, |
| ) -> Result<(), Errno> { |
| self.update_scheduler_state_then_role(|scheduler_state| { |
| scheduler_state.policy = policy; |
| scheduler_state.realtime_priority = priority; |
| scheduler_state.reset_on_fork = reset_on_fork; |
| }) |
| } |
| |
| /// Modify the scheduler state's priority and update the task's thread's role. |
| pub(crate) fn set_scheduler_priority(&self, priority: RealtimePriority) -> Result<(), Errno> { |
| self.update_scheduler_state_then_role(|scheduler_state| { |
| scheduler_state.realtime_priority = priority |
| }) |
| } |
| |
| /// Modify the scheduler state's nice and update the task's thread's role. |
| pub(crate) fn set_scheduler_nice(&self, nice: NormalPriority) -> Result<(), Errno> { |
| self.update_scheduler_state_then_role(|scheduler_state| { |
| scheduler_state.normal_priority = nice |
| }) |
| } |
| |
| /// Overwrite the existing scheduler state with a new one and update the task's thread's role. |
| pub fn set_scheduler_state(&self, scheduler_state: SchedulerState) -> Result<(), Errno> { |
| self.update_scheduler_state_then_role(|task_scheduler_state| { |
| *task_scheduler_state = scheduler_state |
| }) |
| } |
| |
| /// Update the task's thread's role based on its current scheduler state without making any |
| /// changes to the state. |
| /// |
| /// This should be called on tasks that have newly created threads, e.g. after cloning. |
| pub fn sync_scheduler_state_to_role(&self) -> Result<(), Errno> { |
| self.update_scheduler_state_then_role(|_| {}) |
| } |
| |
| fn update_scheduler_state_then_role( |
| &self, |
| updater: impl FnOnce(&mut SchedulerState), |
| ) -> Result<(), Errno> { |
| let new_scheduler_state = { |
| // Hold the task state lock as briefly as possible, it's not needed to update the role. |
| let mut state = self.write(); |
| updater(&mut state.scheduler_state); |
| state.scheduler_state |
| }; |
| self.thread_group().kernel.scheduler.set_thread_role(self, new_scheduler_state)?; |
| Ok(()) |
| } |
| |
| /// Signals the vfork event, if any, to unblock waiters. |
| pub fn signal_vfork(&self) { |
| if let Some(event) = &self.vfork_event { |
| if let Err(status) = event.signal(Signals::NONE, Signals::USER_0) { |
| log_warn!("Failed to set vfork signal {status}"); |
| } |
| }; |
| } |
| |
| /// Blocks the caller until the task has exited or executed execve(). This is used to implement |
| /// vfork() and clone(... CLONE_VFORK, ...). The task must have been created with CLONE_VFORK. |
| pub fn wait_for_execve(&self, task_to_wait: WeakRef<Task>) -> Result<(), Errno> { |
| let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone()); |
| if let Some(event) = event { |
| event |
| .wait_one(zx::Signals::USER_0, zx::MonotonicInstant::INFINITE) |
| .map_err(|status| from_status_like_fdio!(status))?; |
| } |
| Ok(()) |
| } |
| |
| /// If needed, clear the child tid for this task. |
| /// |
| /// Userspace can ask us to clear the child tid and issue a futex wake at |
| /// the child tid address when we tear down a task. For example, bionic |
| /// uses this mechanism to implement pthread_join. The thread that calls |
| /// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake |
| /// them up here to let them know the thread is done. |
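| /// |
| /// A rough sketch of the userspace protocol this supports (illustrative pseudocode): |
| /// |
| /// ```ignore |
| /// // Child thread: registers its tid address at startup. |
| /// set_tid_address(&thread_tid); |
| /// // Joiner: sleeps until the kernel stores 0 to `thread_tid` and wakes the futex. |
| /// while thread_tid != 0 { futex(&thread_tid, FUTEX_WAIT, thread_tid, ...); } |
| /// ``` |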
| pub fn clear_child_tid_if_needed<L>(&self, locked: &mut Locked<L>) -> Result<(), Errno> |
| where |
| L: LockBefore<TerminalLock>, |
| { |
| let mut state = self.write(); |
| let user_tid = state.clear_child_tid; |
| if !user_tid.is_null() { |
| let zero: tid_t = 0; |
| self.write_object(user_tid, &zero)?; |
| self.kernel().shared_futexes.wake( |
| locked, |
| self, |
| user_tid.addr(), |
| usize::MAX, |
| FUTEX_BITSET_MATCH_ANY, |
| )?; |
| state.clear_child_tid = UserRef::default(); |
| } |
| Ok(()) |
| } |
| |
| pub fn get_task(&self, tid: tid_t) -> WeakRef<Task> { |
| self.kernel().pids.read().get_task(tid) |
| } |
| |
| pub fn get_pid(&self) -> pid_t { |
| self.thread_group_key.pid() |
| } |
| |
| pub fn get_tid(&self) -> tid_t { |
| self.tid |
| } |
| |
| pub fn is_leader(&self) -> bool { |
| self.get_pid() == self.get_tid() |
| } |
| |
| pub fn read_argv(&self, max_len: usize) -> Result<Vec<FsString>, Errno> { |
| // argv is empty for kthreads |
| let Ok(mm) = self.mm() else { |
| return Ok(vec![]); |
| }; |
| let (argv_start, argv_end) = { |
| let mm_state = mm.state.read(); |
| (mm_state.argv_start, mm_state.argv_end) |
| }; |
| |
| let len_to_read = std::cmp::min(argv_end - argv_start, max_len); |
| self.read_nul_delimited_c_string_list(argv_start, len_to_read) |
| } |
| |
| pub fn read_argv0(&self) -> Result<FsString, Errno> { |
| // argv is empty for kthreads |
| let Ok(mm) = self.mm() else { |
| return Ok(FsString::default()); |
| }; |
| let argv_start = { |
| let mm_state = mm.state.read(); |
| mm_state.argv_start |
| }; |
| // Assuming a 64-bit arch width is fine for a type that's just u8's on all arches. |
| let argv_start = UserCString::new(&ArchWidth::Arch64, argv_start); |
| self.read_path(argv_start) |
| } |
| |
| pub fn read_env(&self, max_len: usize) -> Result<Vec<FsString>, Errno> { |
| // environment is empty for kthreads |
| let Ok(mm) = self.mm() else { return Ok(vec![]) }; |
| let (env_start, env_end) = { |
| let mm_state = mm.state.read(); |
| (mm_state.environ_start, mm_state.environ_end) |
| }; |
| |
| let len_to_read = std::cmp::min(env_end - env_start, max_len); |
| self.read_nul_delimited_c_string_list(env_start, len_to_read) |
| } |
| |
| pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> { |
| self.thread |
| .read() |
| .as_ref() |
| .ok_or_else(|| errno!(EINVAL))? |
| .get_runtime_info() |
| .map_err(|status| from_status_like_fdio!(status)) |
| } |
| |
| pub fn real_fscred(&self) -> FsCred { |
| self.real_creds().as_fscred() |
| } |
| |
| /// Interrupts the current task. |
| /// |
| /// This will interrupt any blocking syscalls if the task is blocked on one. |
| /// The signal_state of the task must not be locked. |
| pub fn interrupt(&self) { |
| self.read().signals.run_state.wake(); |
| if let Some(thread) = self.thread.read().as_ref() { |
| #[allow( |
| clippy::undocumented_unsafe_blocks, |
| reason = "Force documented unsafe blocks in Starnix" |
| )] |
| let status = unsafe { zx::sys::zx_restricted_kick(thread.raw_handle(), 0) }; |
| if status != zx::sys::ZX_OK { |
| // zx_restricted_kick() could return ZX_ERR_BAD_STATE if the target thread is already in the |
| // DYING or DEAD states. That's fine since it means that the task is in the process of |
| // tearing down, so allow it. |
| assert_eq!(status, zx::sys::ZX_ERR_BAD_STATE); |
| } |
| } |
| } |
| |
| pub fn command(&self) -> TaskCommand { |
| self.persistent_info.command.lock().clone() |
| } |
| |
| pub fn set_command_name(&self, mut new_name: TaskCommand) { |
| // If we're going to update the process name, see if we can get a longer one than normally |
| // provided in the Linux uapi. Only choose the argv0-based name if it's a superset of the |
| // uapi-provided name to avoid clobbering the name provided by the user. |
| if let Ok(argv0) = self.read_argv0() { |
| let argv0 = TaskCommand::from_path_bytes(&argv0); |
| if let Some(embedded_name) = argv0.try_embed(&new_name) { |
| new_name = embedded_name; |
| } |
| } |
| |
| // Acquire this before modifying Zircon state to ensure consistency under concurrent access. |
| // Ideally this would also guard the logic above that reads argv[0], but we can't due to lock |
| // cycles with SELinux checks. |
| let mut command_guard = self.persistent_info.command_guard(); |
| |
| // Set the name on the Linux thread. |
| if let Some(thread) = self.thread.read().as_ref() { |
| set_zx_name(&**thread, new_name.as_bytes()); |
| } |
| |
| // If this is the thread group leader, use this name for the process too. |
| if self.is_leader() { |
| set_zx_name(&self.thread_group().process, new_name.as_bytes()); |
| let _ = zx::Thread::raise_user_exception( |
| zx::RaiseExceptionOptions::TARGET_JOB_DEBUGGER, |
| zx::sys::ZX_EXCP_USER_CODE_PROCESS_NAME_CHANGED, |
| 0, |
| ); |
| } |
| |
| // Avoid a lock cycle by dropping the guard before notifying memory attribution of the |
| // change. |
| *command_guard = new_name; |
| drop(command_guard); |
| |
| if self.is_leader() { |
| if let Some(notifier) = &self.thread_group().read().notifier { |
| let _ = notifier.send(MemoryAttributionLifecycleEvent::name_change(self.tid)); |
| } |
| } |
| } |
| |
| pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> { |
| self.seccomp_filter_state.set(&state) |
| } |
| |
| pub fn state_code(&self) -> TaskStateCode { |
| let status = self.read(); |
| if status.exit_status.is_some() { |
| TaskStateCode::Zombie |
| } else if status.signals.run_state.is_blocked() { |
| let stop_state = self.load_stopped(); |
| if stop_state.ptrace_only() && stop_state.is_stopped() { |
| TaskStateCode::TracingStop |
| } else { |
| TaskStateCode::Sleeping |
| } |
| } else { |
| TaskStateCode::Running |
| } |
| } |
| |
| pub fn time_stats(&self) -> TaskTimeStats { |
| use zx::Task; |
| let info = match &*self.thread.read() { |
| Some(thread) => thread.get_runtime_info().expect("Failed to get thread stats"), |
| None => return TaskTimeStats::default(), |
| }; |
| |
| TaskTimeStats { |
| user_time: zx::MonotonicDuration::from_nanos(info.cpu_time), |
| // TODO(https://fxbug.dev/42078242): How can we calculate system time? |
| system_time: zx::MonotonicDuration::default(), |
| } |
| } |
| |
| pub fn get_signal_action(&self, signal: Signal) -> sigaction_t { |
| self.thread_group().signal_actions.get(signal) |
| } |
| |
| pub fn should_check_for_pending_signals(&self) -> bool { |
| self.flags().intersects( |
| TaskFlags::KERNEL_SIGNALS_AVAILABLE |
| | TaskFlags::SIGNALS_AVAILABLE |
| | TaskFlags::TEMPORARY_SIGNAL_MASK, |
| ) || self.thread_group.has_pending_signals.load(Ordering::Relaxed) |
| } |
| |
| pub fn record_pid_koid_mapping(&self) { |
| let Some(ref mapping_table) = *self.kernel().pid_to_koid_mapping.read() else { return }; |
| |
| let pkoid = self.thread_group().get_process_koid().ok(); |
| let tkoid = self.thread.read().as_ref().and_then(|t| t.koid().ok()); |
| mapping_table.write().insert(self.tid, KoidPair { process: pkoid, thread: tkoid }); |
| } |
| } |
| |
| impl Releasable for Task { |
| type Context<'a> = ( |
| ThreadState<RegisterStorageEnum>, |
| &'a mut Locked<TaskRelease>, |
| RwLockWriteGuard<'a, PidTable>, |
| ); |
| |
| fn release<'a>(mut self, context: Self::Context<'a>) { |
| let (thread_state, locked, pids) = context; |
| |
| *self.proc_pid_directory_cache.get_mut() = None; |
| self.ptrace_disconnect(&pids); |
| |
| std::mem::drop(pids); |
| |
| self.files.release(); |
| |
| self.signal_vfork(); |
| |
| // Drop fields that can end up owning a FsNode to ensure no FsNodes are owned by this task. |
| self.fs.update(None); |
| self.mm.update(None); |
| |
| // Rebuild a temporary CurrentTask to run the release actions that require a CurrentTask. |
| let current_task = CurrentTask::new(OwnedRef::new(self), thread_state.into()); |
| |
| // Apply any delayed releasers left. |
| current_task.trigger_delayed_releaser(locked); |
| |
| // Drop the task now that it has been released. This requires taking it out of the OwnedRef |
| // and then out of the resulting ReleaseGuard. |
| let CurrentTask { mut task, .. } = current_task; |
| let task = OwnedRef::take(&mut task).expect("task should not have been re-owned"); |
| let _task: Self = ReleaseGuard::take(task); |
| } |
| } |
| |
| impl MemoryAccessor for Task { |
| fn read_memory<'a>( |
| &self, |
| addr: UserAddress, |
| bytes: &'a mut [MaybeUninit<u8>], |
| ) -> Result<&'a mut [u8], Errno> { |
| // Using a `Task` to read memory generally indicates that the memory |
| // is being read from a task different than the `CurrentTask`. When |
| // this `Task` is not current, its address space is not mapped |
| // so we need to go through the VMO. |
| self.mm()?.syscall_read_memory(addr, bytes) |
| } |
| |
| fn read_memory_partial_until_null_byte<'a>( |
| &self, |
| addr: UserAddress, |
| bytes: &'a mut [MaybeUninit<u8>], |
| ) -> Result<&'a mut [u8], Errno> { |
| // Using a `Task` to read memory generally indicates that the memory |
| // is being read from a task different than the `CurrentTask`. When |
| // this `Task` is not current, its address space is not mapped |
| // so we need to go through the VMO. |
| self.mm()?.syscall_read_memory_partial_until_null_byte(addr, bytes) |
| } |
| |
| fn read_memory_partial<'a>( |
| &self, |
| addr: UserAddress, |
| bytes: &'a mut [MaybeUninit<u8>], |
| ) -> Result<&'a mut [u8], Errno> { |
| // Using a `Task` to read memory generally indicates that the memory |
| // is being read from a task different than the `CurrentTask`. When |
| // this `Task` is not current, its address space is not mapped |
| // so we need to go through the VMO. |
| self.mm()?.syscall_read_memory_partial(addr, bytes) |
| } |
| |
| fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> { |
| // Using a `Task` to write memory generally indicates that the memory |
| // is being written to a task different than the `CurrentTask`. When |
| // this `Task` is not current, its address space is not mapped |
| // so we need to go through the VMO. |
| self.mm()?.syscall_write_memory(addr, bytes) |
| } |
| |
| fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> { |
| // Using a `Task` to write memory generally indicates that the memory |
| // is being written to a task different than the `CurrentTask`. When |
| // this `Task` is not current, its address space is not mapped |
| // so we need to go through the VMO. |
| self.mm()?.syscall_write_memory_partial(addr, bytes) |
| } |
| |
| fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> { |
| // Using a `Task` to zero memory generally indicates that the memory |
| // is being zeroed from a task different than the `CurrentTask`. When |
| // this `Task` is not current, its address space is not mapped |
| // so we need to go through the VMO. |
| self.mm()?.syscall_zero(addr, length) |
| } |
| } |
| |
| impl TaskMemoryAccessor for Task { |
| fn maximum_valid_address(&self) -> Option<UserAddress> { |
| self.mm().map(|mm| mm.maximum_valid_user_address).ok() |
| } |
| } |
| |
| impl fmt::Debug for Task { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!( |
| f, |
| "{}:{}[{}]", |
| self.thread_group().leader, |
| self.tid, |
| self.persistent_info.command.lock() |
| ) |
| } |
| } |
| |
| impl cmp::PartialEq for Task { |
| fn eq(&self, other: &Self) -> bool { |
| let ptr: *const Task = self; |
| let other_ptr: *const Task = other; |
| ptr == other_ptr |
| } |
| } |
| |
| impl cmp::Eq for Task {} |
| |
| #[cfg(test)] |
| mod test { |
| use super::*; |
| use crate::testing::*; |
| use starnix_uapi::auth::{CAP_SYS_ADMIN, Capabilities}; |
| use starnix_uapi::resource_limits::Resource; |
| use starnix_uapi::signals::SIGCHLD; |
| use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM, rlimit}; |
| |
| #[::fuchsia::test] |
| async fn test_tid_allocation() { |
| spawn_kernel_and_run(async |locked, current_task| { |
| let kernel = current_task.kernel(); |
| assert_eq!(current_task.get_tid(), 1); |
| let another_current = create_task(locked, &kernel, "another-task"); |
| let another_tid = another_current.get_tid(); |
| assert!(another_tid >= 2); |
| |
| let pids = kernel.pids.read(); |
| assert_eq!(pids.get_task(1).upgrade().unwrap().get_tid(), 1); |
| assert_eq!(pids.get_task(another_tid).upgrade().unwrap().get_tid(), another_tid); |
| }) |
| .await; |
| } |
| |
| #[::fuchsia::test] |
| async fn test_clone_pid_and_parent_pid() { |
| spawn_kernel_and_run(async |locked, current_task| { |
| let thread = current_task.clone_task_for_test( |
| locked, |
| (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64, |
| Some(SIGCHLD), |
| ); |
| assert_eq!(current_task.get_pid(), thread.get_pid()); |
| assert_ne!(current_task.get_tid(), thread.get_tid()); |
| assert_eq!(current_task.thread_group().leader, thread.thread_group().leader); |
| |
| let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD)); |
| assert_ne!(current_task.get_pid(), child_task.get_pid()); |
| assert_ne!(current_task.get_tid(), child_task.get_tid()); |
| assert_eq!(current_task.get_pid(), child_task.thread_group().read().get_ppid()); |
| }) |
| .await; |
| } |
| |
| #[::fuchsia::test] |
| async fn test_root_capabilities() { |
| spawn_kernel_and_run(async |_, current_task| { |
| assert!(security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN)); |
| assert_eq!(current_task.real_creds().cap_inheritable, Capabilities::empty()); |
| |
| current_task.set_creds(Credentials::with_ids(1, 1)); |
| assert!(!security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN)); |
| }) |
| .await; |
| } |
| |
| #[::fuchsia::test] |
| async fn test_clone_rlimit() { |
| spawn_kernel_and_run(async |locked, current_task| { |
| let prev_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE); |
| assert_ne!(prev_fsize, 10); |
| current_task |
| .thread_group() |
| .limits |
| .lock(locked) |
| .set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 }); |
| let current_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE); |
| assert_eq!(current_fsize, 10); |
| |
| let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD)); |
| let child_fsize = child_task.thread_group().get_rlimit(locked, Resource::FSIZE); |
| assert_eq!(child_fsize, 10) |
| }) |
| .await; |
| } |
| } |