// src/starnix/kernel/task/current_task.rs - fuchsia - Git at Google

 // Copyright 2023 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 use crate::{
     arch::{
         registers::RegisterState,
         task::{decode_page_fault_exception_report, get_signal_for_general_exception},
     },
     execution::{create_zircon_process, TaskInfo},
     fs::proc::pid_directory::TaskDirectory,
     loader::{load_executable, resolve_executable, ResolvedElf},
     mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor},
     selinux::hooks::current_task_hooks as selinux_hooks,
     signals::{send_signal_first, send_standard_signal, RunState, SignalActions, SignalInfo},
     task::{
         ExitStatus, Kernel, PidTable, ProcessGroup, PtraceCoreState, PtraceEvent, PtraceEventData,
         PtraceOptions, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle, SeccompState,
         SeccompStateValue, StopState, Task, TaskFlags, ThreadGroup, Waiter,
     },
     vfs::{
         FdNumber, FdTable, FileHandle, FsContext, FsStr, LookupContext, NamespaceNode, ResolveBase,
         SymlinkMode, SymlinkTarget, MAX_SYMLINK_FOLLOWS,
     },
 };
 use extended_pstate::ExtendedPstateState;
 use fuchsia_inspect_contrib::profile_duration;
 use fuchsia_zircon::{
     sys::zx_thread_state_general_regs_t,
     {self as zx},
 };
 use starnix_logging::{log_error, log_warn, set_zx_name, track_file_not_found, track_stub};
 use starnix_sync::{
     DeviceOpen, EventWaitGuard, FileOpsCore, LockBefore, Locked, MmDumpable, RwLock,
     RwLockWriteGuard, TaskRelease, WakeReason,
 };
 use starnix_syscalls::{decls::Syscall, SyscallResult};
 use starnix_uapi::{
     auth::{Credentials, CAP_SYS_ADMIN},
     clone_args,
     device_type::DeviceType,
     errno, error,
     errors::Errno,
     file_mode::{Access, FileMode},
     from_status_like_fdio,
     open_flags::OpenFlags,
     ownership::{release_on_error, OwnedRef, Releasable, TempRef, WeakRef},
     pid_t,
     resource_limits::Resource,
     rlimit,
     signals::{SigSet, Signal, SIGBUS, SIGCHLD, SIGILL, SIGSEGV, SIGTRAP},
     sock_filter, sock_fprog,
     user_address::{UserAddress, UserRef},
     vfs::ResolveFlags,
     BPF_MAXINSNS, CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS,
     CLONE_INTO_CGROUP, CLONE_NEWUTS, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS,
     CLONE_SIGHAND, CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED,
     FUTEX_TID_MASK, ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
     SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, SI_KERNEL,
 };
 use std::{ffi::CString, fmt, marker::PhantomData, mem::MaybeUninit, sync::Arc};

 /// Pairs a `Task` with the `ThreadState` it is going to use.
 ///
 /// A `TaskBuilder` can be converted into a `CurrentTask` through `From`, which moves both
 /// fields into the new `CurrentTask`.
 pub struct TaskBuilder {
     /// The underlying task object.
     pub task: OwnedRef<Task>,

     /// The thread state that accompanies `task`.
     pub thread_state: ThreadState,
 }

 impl TaskBuilder {
     pub fn new(task: Task) -> Self {
         Self { task: OwnedRef::new(task), thread_state: Default::default() }
     }

     #[inline(always)]
     pub fn release<L>(self, locked: &mut Locked<'_, L>)
     where
         L: LockBefore<TaskRelease>,
     {
         let mut locked = locked.cast_locked::<TaskRelease>();
         Releasable::release(self, &mut locked);
     }
 }

 impl From<TaskBuilder> for CurrentTask {
     fn from(builder: TaskBuilder) -> Self {
         Self::new(builder.task, builder.thread_state)
     }
 }

 impl Releasable for TaskBuilder {
     type Context<'a> = &'a mut Locked<'a, TaskRelease>;

     /// Releases the owned task, passing along the thread state and the lock context.
     fn release<'a>(self, locked: &'a mut Locked<'a, TaskRelease>) {
         let Self { task, thread_state } = self;
         task.release((thread_state, locked));
     }
 }

 impl std::ops::Deref for TaskBuilder {
     type Target = Task;

     /// Gives shared access to the wrapped `Task`.
     fn deref(&self) -> &Task {
         let task: &Task = &self.task;
         task
     }
 }

 /// The task object associated with the currently executing thread.
 ///
 /// We often pass the `CurrentTask` as the first argument to functions if those functions need to
 /// know contextual information about the thread on which they are running. For example, we often
 /// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
 /// perform the requested operation.
 ///
 /// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
 /// such as the register state for that thread. Syscalls are given a mutable reference to the
 /// `CurrentTask`, which lets them manipulate this state.
 ///
 /// See also `Task` for more information about tasks.
 pub struct CurrentTask {
     /// The underlying task object.
     pub task: OwnedRef<Task>,

     /// The state that belongs to the thread running this task (registers, extended processor
     /// state and the pending syscall restart function).
     pub thread_state: ThreadState,

     /// Makes `CurrentTask` neither `Sync` nor `Send`.
     _local_marker: PhantomData<*mut u8>,
 }

 /// The thread-related information of a `CurrentTask`. The information should never be used
 /// outside of the thread owning the `CurrentTask`.
 #[derive(Default)]
 pub struct ThreadState {
     /// A copy of the registers associated with the Zircon thread. Up-to-date values can be read
     /// from `self.handle.read_state_general_regs()`. To write these values back to the thread, call
     /// `self.handle.write_state_general_regs(self.thread_state.registers.into())`.
     pub registers: RegisterState,

     /// Copy of the current extended processor state including floating point and vector registers.
     pub extended_pstate: ExtendedPstateState,

     /// A custom function to resume a syscall that has been interrupted by SIGSTOP.
     /// To use it, call `set_syscall_restart_func` and return ERESTART_RESTARTBLOCK;
     /// `sys_restart_syscall` will eventually call it.
     pub syscall_restart_func: Option<Box<SyscallRestartFunc>>,
 }

 impl ThreadState {
     /// Returns a new `ThreadState` with the same `registers` as this one.
     fn snapshot(&self) -> Self {
         Self {
             registers: self.registers,
             extended_pstate: Default::default(),
             syscall_restart_func: None,
         }
     }

     pub fn extended_snapshot(&self) -> Self {
         Self {
             registers: self.registers.clone(),
             extended_pstate: self.extended_pstate.clone(),
             syscall_restart_func: None,
         }
     }

     pub fn replace_registers(&mut self, other: &ThreadState) {
         self.registers = other.registers;
         self.extended_pstate = other.extended_pstate;
     }

     pub fn get_user_register(&mut self, offset: usize) -> Result<usize, Errno> {
         let mut result: usize = 0;
         self.registers.apply_user_register(offset, &mut |register| result = *register as usize)?;
         Ok(result)
     }

     pub fn set_user_register(&mut self, offset: usize, value: usize) -> Result<(), Errno> {
         self.registers.apply_user_register(offset, &mut |register| *register = value as u64)
     }
 }

 /// The closure type that is boxed in `ThreadState::syscall_restart_func`.
 ///
 /// The closure can be called at most once (`FnOnce`); it receives the `CurrentTask` and yields
 /// either a `SyscallResult` or an `Errno`.
 type SyscallRestartFunc =
     dyn FnOnce(&mut CurrentTask) -> Result<SyscallResult, Errno> + Send + Sync;

 impl Releasable for CurrentTask {
     type Context<'a> = &'a mut Locked<'a, TaskRelease>;

     /// Tears down the current task.
     ///
     /// The steps run in a fixed order: robust-list notification, child-tid clearing, removal
     /// from the thread group, and finally the release of the underlying `Task`.
     fn release<'a>(self, locked: &'a mut Locked<'a, TaskRelease>) {
         self.notify_robust_list();
         // The result of clearing the child tid is deliberately discarded.
         let _ignored = self.clear_child_tid_if_needed();

         // We remove from the thread group here because the WeakRef in the pid
         // table to this task must be valid until this task is removed from the
         // thread group, but self.task.release() below invalidates it.
         self.thread_group.remove(locked, &self);

         // The thread state is handed over to the task release together with the lock context.
         let context = (self.thread_state, locked);
         self.task.release(context);
     }
 }

 impl std::ops::Deref for CurrentTask {
     type Target = Task;

     /// Gives shared access to the underlying `Task`.
     fn deref(&self) -> &Task {
         let task: &Task = &self.task;
         task
     }
 }

 impl fmt::Debug for CurrentTask {
     /// Formats the `CurrentTask` by delegating to the formatting of the underlying task.
     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
         self.task.fmt(formatter)
     }
 }

 impl CurrentTask {
     pub fn new(task: OwnedRef<Task>, thread_state: ThreadState) -> Self {
         Self { task, thread_state, _local_marker: Default::default() }
     }

     /// Applies the kernel's delayed releaser with this task as its argument.
     pub fn trigger_delayed_releaser(&self) {
         let kernel = self.kernel();
         kernel.delayed_releaser.apply(self);
     }

     pub fn weak_task(&self) -> WeakRef<Task> {
         WeakRef::from(&self.task)
     }

     pub fn temp_task(&self) -> TempRef<'_, Task> {
         TempRef::from(&self.task)
     }

     /// Replaces the credentials of this task with `creds`.
     ///
     /// Once the credentials are stored, the cached `/proc/pid` directory gets the chance to
     /// update its ownership based on the task's current credentials.
     pub fn set_creds(&self, creds: Credentials) {
         {
             let task = self.temp_task();
             *task.persistent_info.lock().creds_mut() = creds;
         }
         // The /proc/pid directory's ownership is updated when the task's euid
         // or egid changes. See proc(5).
         let mut pid_directory = self.proc_pid_directory_cache.lock();
         TaskDirectory::maybe_force_chown(self, &mut pid_directory, &self.creds());
     }

     #[inline(always)]
     pub fn release<L>(self, locked: &mut Locked<'_, L>)
     where
         L: LockBefore<TaskRelease>,
     {
         let mut locked = locked.cast_locked::<TaskRelease>();
         Releasable::release(self, &mut locked);
     }

     pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
         &mut self,
         f: impl FnOnce(&mut CurrentTask) -> Result<R, Errno> + Send + Sync + 'static,
     ) {
         self.thread_state.syscall_restart_func =
             Some(Box::new(|current_task| Ok(f(current_task)?.into())));
     }

     /// Installs `signal_mask` as the task's temporary signal mask, then runs `wait_function`.
     ///
     /// The original mask is not restored here: signals are dequeued before the original signal
     /// mask is restored, which is done by the signal machinery in the syscall dispatch loop.
     ///
     /// Whatever `wait_function` returns is returned to the caller.
     pub fn wait_with_temporary_mask<F, T>(
         &mut self,
         signal_mask: SigSet,
         wait_function: F,
     ) -> Result<T, Errno>
     where
         F: FnOnce(&CurrentTask) -> Result<T, Errno>,
     {
         let mut task_state = self.write();
         task_state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
         task_state.signals.set_temporary_mask(signal_mask);
         // The task state lock must not be held while waiting.
         drop(task_state);

         wait_function(self)
     }

     /// If waking, promotes from waking to awake.  If not waking, make waiter async
     /// wait until woken.  Returns true if woken.
     ///
     /// The thread group state is read-locked before the task state is write-locked, and both
     /// guards are explicitly dropped before the thread group's stop state is updated on the
     /// wake path. On the wait path, `waiter` is registered with the thread group's stopped
     /// waiters and/or with the ptracer, and `false` is returned.
     pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
         let group_state = self.thread_group.read();
         let mut task_state = self.write();

         // Wake up if
         //   a) we should wake up, meaning:
         //      i) we're in group stop, and the thread group has exited group stop, or
         //      ii) we're waking up,
         //   b) and ptrace isn't stopping us from waking up, but
         //   c) always wake up if we got a SIGKILL.
         let task_stop_state = self.load_stopped();
         let group_stop_state = self.thread_group.load_stopped();
         if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
             || task_stop_state.is_waking_or_awake())
             && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
         {
             // Prefer finalizing the task's own stop state; fall back to the group's stop state
             // when only the group is waking.
             let new_state = if task_stop_state.is_waking_or_awake() {
                 task_stop_state.finalize()
             } else {
                 group_stop_state.finalize()
             };
             // If finalizing fails, fall through to the wait path below.
             if let Ok(new_state) = new_state {
                 task_state.set_stopped(new_state, None, Some(self), None);
                 drop(group_state);
                 drop(task_state);
                 // It is possible for the stop state to be changed by another
                 // thread between when it is checked above and the following
                 // invocation, but set_stopped does sufficient checking while
                 // holding the lock to make sure that such a change won't result
                 // in corrupted state.
                 self.thread_group.set_stopped(new_state, None, false);
                 return true;
             }
         }

         // We will wait.
         // Note that the group's stop state is loaded again here rather than reusing
         // `group_stop_state`.
         if self.thread_group.load_stopped().is_stopped() || task_stop_state.is_stopped() {
             // If we've stopped or PTRACE_LISTEN has been sent, wait for a
             // signal or instructions from the tracer.
             group_state.stopped_waiters.wait_async(&waiter);
             task_state.wait_on_ptracer(&waiter);
         } else if task_state.can_accept_ptrace_commands() {
             // If we're stopped because a tracer has seen the stop and not taken
             // further action, wait for further instructions from the tracer.
             task_state.wait_on_ptracer(&waiter);
         } else if task_state.is_ptrace_listening() {
             // A PTRACE_LISTEN is a state where we can get signals and notify a
             // ptracer, but otherwise remain blocked.
             if let Some(ref mut ptrace) = &mut task_state.ptrace {
                 // Record a SIGTRAP / `PtraceEvent::Stop` pair before notifying the ptracers.
                 ptrace.set_last_signal(Some(SignalInfo::default(SIGTRAP)));
                 ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
             }
             task_state.wait_on_ptracer(&waiter);
             task_state.notify_ptracers();
         }
         false
     }

     /// Runs `callback` while the task's run state is set to `run_state`.
     ///
     /// Once the callback has finished, the run state goes back to `RunState::Running`.
     ///
     /// This is typically used right before blocking the current task: `run_state` registers
     /// the mechanism that can interrupt the blocking operation, and `callback` performs the
     /// actual blocking.
     ///
     /// Returns `EINTR` without invoking `callback` when a signal is already pending and the task
     /// is not in PTRACE_LISTEN.
     ///
     /// This function can only be called in the `RunState::Running` state and cannot set the
     /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
     pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
     where
         F: FnOnce() -> Result<T, Errno>,
     {
         assert_ne!(run_state, RunState::Running);

         // Publish the new run state, unless a pending signal already interrupts us.
         let mut task_state = self.write();
         assert!(!task_state.signals.run_state.is_blocked());
         // A note on PTRACE_LISTEN - the thread cannot be scheduled
         // regardless of pending signals.
         let interrupted =
             task_state.signals.is_any_pending() && !task_state.is_ptrace_listening();
         if interrupted {
             return error!(EINTR);
         }
         task_state.signals.run_state = run_state.clone();
         drop(task_state);

         let outcome = callback();

         // Back to running; nobody else may have replaced the run state in the meantime.
         let mut task_state = self.write();
         assert_eq!(
             task_state.signals.run_state, run_state,
             "SignalState run state changed while waiting!"
         );
         task_state.signals.run_state = RunState::Running;
         drop(task_state);

         outcome
     }

     /// Blocks on `guard` until it is woken or `deadline` passes.
     ///
     /// The task runs in `RunState::Event` for the duration of the wait. An interrupted wait is
     /// reported as `EINTR`, an expired deadline as `ETIMEDOUT`.
     pub fn block_until(&self, guard: EventWaitGuard<'_>, deadline: zx::Time) -> Result<(), Errno> {
         let event = guard.event().clone();
         self.run_in_state(RunState::Event(event), move || {
             guard.block_until(deadline).map_err(|reason| match reason {
                 WakeReason::Interrupted => errno!(EINTR),
                 WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
             })
         })
     }

     /// Determines the namespace node that `dir_fd` and `path` start from.
     ///
     /// Returns that node together with the path to resolve relative to it (a leading `/` is
     /// stripped from absolute paths).
     ///
     /// Errors:
     /// * `EXDEV` if `path` is absolute and `flags` contains `ResolveFlags::BENEATH`.
     /// * `ENOTDIR` if the remaining path is not empty and the starting node is not a directory.
     /// * Any error from looking up `dir_fd` or from the `Access::EXEC` check on the directory.
     pub fn resolve_dir_fd<'a>(
         &self,
         dir_fd: FdNumber,
         mut path: &'a FsStr,
         flags: ResolveFlags,
     ) -> Result<(NamespaceNode, &'a FsStr), Errno> {
         let path_is_absolute = path.starts_with(b"/");
         if path_is_absolute {
             if flags.contains(ResolveFlags::BENEATH) {
                 return error!(EXDEV);
             }
             path = &path[1..];
         }

         let start_at_root = path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT);
         let dir = if start_at_root {
             self.fs().root()
         } else if dir_fd == FdNumber::AT_FDCWD {
             self.fs().cwd()
         } else {
             // O_PATH allowed for:
             //
             //   Passing the file descriptor as the dirfd argument of
             //   openat() and the other "*at()" system calls.  This
             //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
             //   using AT_SYMLINK_FOLLOW) even if the file is not a
             //   directory.
             //
             // See https://man7.org/linux/man-pages/man2/open.2.html
             self.files.get_allowing_opath(dir_fd)?.name.clone()
         };

         // With nothing left to resolve, the starting node is returned without further checks.
         if path.is_empty() {
             return Ok((dir, path.into()));
         }
         if !dir.entry.node.is_dir() {
             return error!(ENOTDIR);
         }
         dir.check_access(self, Access::EXEC)?;
         Ok((dir, path.into()))
     }

     /// Opens `path` relative to `FdNumber::AT_FDCWD`.
     ///
     /// The resulting `FileHandle` is returned to the caller; it is not installed in the
     /// `FdTable` of this task.
     ///
     /// Returns `EINVAL` if `flags` contains `OpenFlags::CREAT`.
     pub fn open_file<L>(
         &self,
         locked: &mut Locked<'_, L>,
         path: &FsStr,
         flags: OpenFlags,
     ) -> Result<FileHandle, Errno>
     where
         L: LockBefore<FileOpsCore>,
         L: LockBefore<DeviceOpen>,
     {
         // In order to support OpenFlags::CREAT we would need to take a
         // FileMode argument.
         let wants_create = flags.contains(OpenFlags::CREAT);
         if wants_create {
             return error!(EINVAL);
         }

         let dir_fd = FdNumber::AT_FDCWD;
         let mode = FileMode::default();
         let resolve_flags = ResolveFlags::empty();
         self.open_file_at(locked, dir_fd, path, flags, mode, resolve_flags)
     }

     /// Resolves a path for open.
     ///
     /// If the final path component points to a symlink, the symlink is followed (as long as
     /// the symlink traversal limit has not been reached).
     ///
     /// If the final path component (after following any symlinks, if enabled) does not exist,
     /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
     /// final path component.
     ///
     /// This returns the resolved node, and a boolean indicating whether the node has been created.
     ///
     /// Errors:
     /// * `EEXIST` if `flags` contains both `CREAT` and `EXCL` and the final component exists.
     /// * `ELOOP` if the final component is a symlink that may not be followed (see the comments
     ///   in the body), or if a magic link is hit while `RESOLVE_NO_MAGICLINKS` is set.
     /// * `EISDIR` if the node would have to be created but a directory was requested.
     /// * Any error produced while looking up the path, reading a symlink or creating the node.
     fn resolve_open_path<L>(
         &self,
         locked: &mut Locked<'_, L>,
         context: &mut LookupContext,
         dir: &NamespaceNode,
         path: &FsStr,
         mode: FileMode,
         flags: OpenFlags,
     ) -> Result<(NamespaceNode, bool), Errno>
     where
         L: LockBefore<FileOpsCore>,
     {
         context.update_for_path(path);
         // Intermediate components are always resolved with symlinks followed; the follows that
         // were consumed while doing so are charged to the caller's context.
         let mut parent_context = context.with(SymlinkMode::Follow);
         let (parent, basename) = self.lookup_parent(&mut parent_context, dir, path)?;
         context.remaining_follows = parent_context.remaining_follows;

         let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

         // Lookup the child, without following a symlink or expecting it to be a directory.
         let mut child_context = context.with(SymlinkMode::NoFollow);
         child_context.must_be_directory = false;

         match parent.lookup_child(self, &mut child_context, basename) {
             Ok(name) => {
                 if name.entry.node.is_lnk() {
                     if flags.contains(OpenFlags::PATH)
                         && context.symlink_mode == SymlinkMode::NoFollow
                     {
                         // When O_PATH is specified in flags, if pathname is a symbolic link
                         // and the O_NOFOLLOW flag is also specified, then the call returns
                         // a file descriptor referring to the symbolic link.
                         // See https://man7.org/linux/man-pages/man2/openat.2.html
                         //
                         // If the trailing component (i.e., basename) of
                         // pathname is a symbolic link, how.resolve contains
                         // RESOLVE_NO_SYMLINKS, and how.flags contains both
                         // O_PATH and O_NOFOLLOW, then an O_PATH file
                         // descriptor referencing the symbolic link will be
                         // returned.
                         // See https://man7.org/linux/man-pages/man2/openat2.2.html
                         return Ok((name, false));
                     }

                     if (!flags.contains(OpenFlags::PATH)
                         && context.symlink_mode == SymlinkMode::NoFollow)
                         || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
                         || context.remaining_follows == 0
                     {
                         if must_create {
                             // Since `must_create` is set, and a node was found, this returns EEXIST
                             // instead of ELOOP.
                             return error!(EEXIST);
                         }
                         // A symlink was found, but one of the following is true:
                         // * flags specified O_NOFOLLOW but not O_PATH.
                         // * how.resolve contains RESOLVE_NO_SYMLINKS
                         // * too many symlink traversals have been attempted
                         return error!(ELOOP);
                     }

                     context.remaining_follows -= 1;
                     match name.readlink(self)? {
                         SymlinkTarget::Path(path) => {
                             // `starts_with` rather than `path[0]`: indexing would panic on a
                             // symlink whose target is empty. An empty target is resolved
                             // against `parent`, like any other relative target.
                             let dir =
                                 if path.starts_with(b"/") { self.fs().root() } else { parent };
                             self.resolve_open_path(
                                 locked,
                                 context,
                                 &dir,
                                 path.as_ref(),
                                 mode,
                                 flags,
                             )
                         }
                         SymlinkTarget::Node(node) => {
                             if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) {
                                 error!(ELOOP)
                             } else {
                                 Ok((node, false))
                             }
                         }
                     }
                 } else {
                     if must_create {
                         return error!(EEXIST);
                     }
                     Ok((name, false))
                 }
             }
             Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
                 if context.must_be_directory {
                     return error!(EISDIR);
                 }
                 // The final component does not exist and creation was requested: create a
                 // regular file and report that it was created.
                 Ok((
                     parent.open_create_node(
                         locked,
                         self,
                         basename,
                         mode.with_type(FileMode::IFREG),
                         DeviceType::NONE,
                         flags,
                     )?,
                     true,
                 ))
             }
             Err(e) => Err(e),
         }
     }

     /// The primary entry point for opening files relative to a task.
     ///
     /// Absolute paths are resolved relative to the root of the FsContext for
     /// this task. Relative paths are resolved relative to dir_fd. To resolve
     /// relative to the current working directory, pass FdNumber::AT_FDCWD for
     /// dir_fd.
     ///
     /// An empty `path` is rejected with `ENOENT`.
     ///
     /// Returns a FileHandle but does not install the FileHandle in the FdTable
     /// for this task.
     pub fn open_file_at<L>(
         &self,
         locked: &mut Locked<'_, L>,
         dir_fd: FdNumber,
         path: &FsStr,
         flags: OpenFlags,
         mode: FileMode,
         resolve_flags: ResolveFlags,
     ) -> Result<FileHandle, Errno>
     where
         L: LockBefore<FileOpsCore>,
         L: LockBefore<DeviceOpen>,
     {
         if path.is_empty() {
             return error!(ENOENT);
         }

         let (start_dir, relative_path) = self.resolve_dir_fd(dir_fd, path, resolve_flags)?;
         self.open_namespace_node_at(locked, start_dir, relative_path, flags, mode, resolve_flags)
     }

     /// Opens `path` relative to the namespace node `dir`.
     ///
     /// `flags` is normalized first (`O_LARGEFILE` is always added; with `O_PATH` only a small
     /// set of flags is kept), the path is resolved with `resolve_open_path`, and the resulting
     /// node is validated against `flags` before it is opened.
     ///
     /// Errors produced directly by this function:
     /// * `EINVAL` if `O_TMPFILE` is requested without write access, if both `RESOLVE_BENEATH`
     ///   and `RESOLVE_IN_ROOT` are set, or if `O_DIRECT` is used on a directory.
     /// * `ELOOP` if the resolved node is a symlink, `O_NOFOLLOW` is set and `O_PATH` is not.
     /// * `EISDIR` if the resolved node is a directory and write access, `O_CREAT` or `O_TRUNC`
     ///   was requested.
     /// * `ENOTDIR` if a directory was required but the resolved node is not one.
     ///
     /// Returns a FileHandle but does not install the FileHandle in the FdTable for this task.
     pub fn open_namespace_node_at<L>(
         &self,
         locked: &mut Locked<'_, L>,
         dir: NamespaceNode,
         path: &FsStr,
         flags: OpenFlags,
         mode: FileMode,
         mut resolve_flags: ResolveFlags,
     ) -> Result<FileHandle, Errno>
     where
         L: LockBefore<FileOpsCore>,
         L: LockBefore<DeviceOpen>,
     {
         // 64-bit kernels force the O_LARGEFILE flag to be on.
         let mut flags = flags | OpenFlags::LARGEFILE;
         let opath = flags.contains(OpenFlags::PATH);
         if opath {
             // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
             // O_DIRECTORY, and O_NOFOLLOW are ignored.
             const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                 OpenFlags::PATH.bits()
                     | OpenFlags::CLOEXEC.bits()
                     | OpenFlags::DIRECTORY.bits()
                     | OpenFlags::NOFOLLOW.bits(),
             );
             flags &= ALLOWED_FLAGS;
         }

         if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
             return error!(EINVAL);
         }

         let nofollow = flags.contains(OpenFlags::NOFOLLOW);
         let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

         // An exclusive create never follows a trailing symlink.
         let symlink_mode =
             if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

         let resolve_base = match (
             resolve_flags.contains(ResolveFlags::BENEATH),
             resolve_flags.contains(ResolveFlags::IN_ROOT),
         ) {
             (false, false) => ResolveBase::None,
             (true, false) => ResolveBase::Beneath(dir.clone()),
             (false, true) => ResolveBase::InRoot(dir.clone()),
             (true, true) => return error!(EINVAL),
         };

         // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
         // Linux behavior. Strictly speaking it is not really required, but it's hard to
         // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
         if resolve_base != ResolveBase::None {
             resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
         }

         let mut context = LookupContext {
             symlink_mode,
             remaining_follows: MAX_SYMLINK_FOLLOWS,
             must_be_directory: flags.contains(OpenFlags::DIRECTORY),
             resolve_flags,
             resolve_base,
         };
         let (name, created) =
             match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                 Ok((n, c)) => (n, c),
                 Err(e) => {
                     // NOTE(review): this records the path for every resolution error, not only
                     // for ENOENT, and appends `path` to the directory path without inserting a
                     // separator - confirm that both are intended.
                     let mut abs_path = dir.path(&self.task);
                     abs_path.extend(&**path);
                     track_file_not_found(abs_path);
                     return Err(e);
                 }
             };

         let name = if flags.contains(OpenFlags::TMPFILE) {
             name.create_tmpfile(self, mode.with_type(FileMode::IFREG), flags)?
         } else {
             // From here on `mode` is the mode of the resolved node, not the `mode` argument.
             let mode = name.entry.node.info().mode;

             // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
             // file we are opening. With `O_TMPFILE`, that file is the regular file we just
             // created rather than the node we found by resolving the path.
             //
             // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
             // because `must_be_directory` refers to the node we found by resolving the path.
             // If that node was not a directory, then `create_tmpfile` will produce an error.
             //
             // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
             // and therefore already an empty file.

             if !opath && nofollow && mode.is_lnk() {
                 return error!(ELOOP);
             }

             if mode.is_dir() {
                 if flags.can_write()
                     || flags.contains(OpenFlags::CREAT)
                     || flags.contains(OpenFlags::TRUNC)
                 {
                     return error!(EISDIR);
                 }
                 if flags.contains(OpenFlags::DIRECT) {
                     return error!(EINVAL);
                 }
             } else if context.must_be_directory {
                 return error!(ENOTDIR);
             }

             if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                 // You might think we should check file.can_write() at this
                 // point, which is what the docs suggest, but apparently we
                 // are supposed to truncate the file if this task can write
                 // to the underlying node, even if we are opening the file
                 // as read-only. See OpenTest.CanTruncateReadOnly.
                 name.truncate(locked, self, 0)?;
             }

             name
         };

         // If the node has been created, the open operation should not verify access rights:
         // From <https://man7.org/linux/man-pages/man2/open.2.html>
         //
         // > Note that mode applies only to future accesses of the newly created file; the
         // > open() call that creates a read-only file may well return a  read/write  file
         // > descriptor.

         name.open(locked, self, flags, !created)
     }

     /// A wrapper around `lookup_parent` that first resolves the given
     /// dir_fd to a NamespaceNode.
     ///
     /// Absolute paths are resolved relative to the root of the FsContext for
     /// this task. Relative paths are resolved relative to dir_fd. To resolve
     /// relative to the current working directory, pass FdNumber::AT_FDCWD for
     /// dir_fd.
     pub fn lookup_parent_at<'a>(
         &self,
         context: &mut LookupContext,
         dir_fd: FdNumber,
         path: &'a FsStr,
     ) -> Result<(NamespaceNode, &'a FsStr), Errno> {
         let no_resolve_flags = ResolveFlags::empty();
         let (start_dir, relative_path) = self.resolve_dir_fd(dir_fd, path, no_resolve_flags)?;
         self.lookup_parent(context, &start_dir, relative_path)
     }

     /// Looks up the parent of a namespace node.
     ///
     /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
     /// calling this function directly.
     ///
     /// Every component of `path` except the last one is resolved, starting at `dir`.
     /// The result is the node reached that way (the parent of the last component)
     /// paired with the last component itself.
     ///
     /// If path is empty, this function returns dir and an empty path.
     /// Similarly, if path ends with "." or "..", these components will be
     /// returned along with the parent.
     ///
     /// The returned parent might not be a directory.
     pub fn lookup_parent<'a>(
         &self,
         context: &mut LookupContext,
         dir: &NamespaceNode,
         path: &'a FsStr,
     ) -> Result<(NamespaceNode, &'a FsStr), Errno> {
         context.update_for_path(path);

         // Empty components (repeated or trailing slashes) are skipped.
         let mut components = path
             .split(|c| *c == b'/')
             .filter(|p| !p.is_empty())
             .map(<&FsStr>::from)
             .peekable();

         let mut node = dir.clone();
         let mut basename = <&FsStr>::default();
         while let Some(component) = components.next() {
             if components.peek().is_none() {
                 // The last component is handed back to the caller unresolved.
                 basename = component;
                 break;
             }
             node = node.lookup_child(self, context, component)?;
         }
         Ok((node, basename))
     }

     /// Looks up a namespace node.
     ///
     /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
     /// calling this function directly.
     ///
     /// Resolves every component of `path` starting at `dir`: the parent is found with
     /// `lookup_parent`, then the last component is looked up in that parent.
     pub fn lookup_path(
         &self,
         context: &mut LookupContext,
         dir: NamespaceNode,
         path: &FsStr,
     ) -> Result<NamespaceNode, Errno> {
         self.lookup_parent(context, &dir, path)
             .and_then(|(parent, basename)| parent.lookup_child(self, context, basename))
     }

     /// Resolve `path` to a namespace node, starting from the root of this task's
     /// filesystem context.
     ///
     /// The lookup runs with a default `LookupContext`. Resolves symlinks.
     pub fn lookup_path_from_root(&self, path: &FsStr) -> Result<NamespaceNode, Errno> {
         let root = self.fs().root();
         self.lookup_path(&mut LookupContext::default(), root, path)
     }

     /// Replace the program image of this task with `executable`.
     ///
     /// `path` is passed on to executable resolution and loading, and its basename ends up as
     /// the task's command name; `argv` and `environ` are forwarded to `resolve_executable`.
     ///
     /// Everything up to and including the multi-thread check fails cleanly by returning an
     /// error. Once `finish_exec` has been entered a failure is unrecoverable: a forced SIGSEGV
     /// is sent to the task before the error is returned.
     ///
     /// # Errors
     ///
     /// * `EACCES` if `executable` is not a regular file.
     /// * `EINVAL` if the thread group has more than one task (not supported yet).
     /// * Any error from the EXEC access check, the SELinux hook, `resolve_executable` or
     ///   `finish_exec`.
     pub fn exec<L>(
         &mut self,
         locked: &mut Locked<'_, L>,
         executable: FileHandle,
         path: CString,
         argv: Vec<CString>,
         environ: Vec<CString>,
     ) -> Result<(), Errno>
     where
         L: LockBefore<FileOpsCore>,
         L: LockBefore<DeviceOpen>,
     {
         // Only regular files can be executed.
         if !executable.name.entry.node.is_reg() {
             return error!(EACCES);
         }

         // The node itself must grant EXEC; the flags passed to `open` play no part in this.
         executable.name.check_access(self, Access::EXEC)?;

         let selinux_state = selinux_hooks::check_exec_access(self, executable.node())?;
         let resolved = resolve_executable(
             locked,
             self,
             executable,
             path.clone(),
             argv,
             environ,
             selinux_state,
         )?;

         let task_count = self.thread_group.read().tasks_count();
         if task_count > 1 {
             track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
             return error!(EINVAL);
         }

         match self.finish_exec(path, resolved) {
             Ok(()) => {
                 self.ptrace_event(PtraceOptions::TRACEEXEC, self.task.id as u64);
                 self.signal_vfork();
                 Ok(())
             }
             Err(err) => {
                 log_warn!("unrecoverable error in exec: {err:?}");

                 let fatal = SignalInfo {
                     code: SI_KERNEL as i32,
                     force: true,
                     ..SignalInfo::default(SIGSEGV)
                 };
                 send_standard_signal(self, fatal);
                 Err(err)
             }
         }
     }

     /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
     /// process crashing. This function is for that second half; any error returned from this
     /// function will be considered unrecoverable.
     ///
     /// `path` is passed to `load_executable`, and its final component (everything after the
     /// last '/') becomes the task's command name. `resolved_elf` is the executable to load; it
     /// also carries the SELinux state to apply.
     fn finish_exec(&mut self, path: CString, resolved_elf: ResolvedElf) -> Result<(), Errno> {
         // Now that the exec will definitely finish (or crash), notify owners of
         // locked futexes for the current process, which will be impossible to
         // update after process image is replaced.  See get_robust_list(2).
         self.notify_robust_list();

         // Hand the memory manager the name node of the new executable. A failure is reported
         // as `status` and mapped to an errno by `from_status_like_fdio!`.
         self.mm()
             .exec(resolved_elf.file.name.clone())
             .map_err(|status| from_status_like_fdio!(status))?;

         // Update the SELinux state, if enabled.
         selinux_hooks::update_state_on_exec(self, &resolved_elf.selinux_state);

         // Load the executable and install the resulting start info as this thread's general
         // register state.
         let start_info = load_executable(self, resolved_elf, &path)?;
         let regs: zx_thread_state_general_regs_t = start_info.into();
         self.thread_state.registers = regs.into();

         {
             // Scoped so that both the task state lock and the persistent info lock are
             // released before the rest of the function runs.
             let mut state = self.write();
             let mut persistent_info = self.persistent_info.lock();
             // The alternate signal stack and the robust list head are cleared.
             state.signals.alt_stack = None;
             state.robust_list_head = UserAddress::NULL.into();

             // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
             // capabilities accordingly.
             persistent_info.creds_mut().exec();
         }
         self.thread_state.extended_pstate.reset();

         self.thread_group.signal_actions.reset_for_exec();

         // TODO(http://b/320436714): when adding SELinux support for the file subsystem, implement
         // hook to clean up state after exec.

         // TODO: The termination signal is reset to SIGCHLD.

         // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

         // TODO: The file descriptor table is unshared, undoing the effect of
         //       the CLONE_FILES flag of clone(2).
         //
         // To make this work, we can put the files in an RwLock and then cache
         // a reference to the files on the CurrentTask. That will let
         // functions that have CurrentTask access the FdTable without
         // needing to grab the read-lock.
         //
         // For now, we do not implement that behavior.
         //
         // NOTE(review): `FdTable::exec` presumably drops close-on-exec descriptors; confirm
         // against its definition.
         self.files.exec();

         // TODO: POSIX timers are not preserved.

         // Record on the thread group that an exec has happened.
         self.thread_group.write().did_exec = true;

         // Get the basename of the path, which will be used as the name displayed with
         // `prctl(PR_GET_NAME)` and `/proc/self/stat`
         let basename = if let Some(idx) = memchr::memrchr(b'/', path.to_bytes()) {
             // The unwrap cannot fail: a substring of a CString contains no NUL bytes.
             CString::new(&path.to_bytes()[idx + 1..]).unwrap()
         } else {
             // No '/' in the path: the whole path is the name.
             path
         };
         // The same name is applied to the current Zircon thread and to the task.
         set_zx_name(&fuchsia_runtime::thread_self(), basename.as_bytes());
         self.set_command_name(basename);

         Ok(())
     }

     /// Installs a new seccomp filter, read from user memory, on this task.
     ///
     /// `bpf_filter` is the user address of a `sock_fprog` describing the cBPF program, and
     /// `flags` is a bitmask of `SECCOMP_FILTER_FLAG_*` values. The flags examined here are:
     ///
     /// * `SECCOMP_FILTER_FLAG_LOG`: forwarded to `SeccompFilter::from_cbpf`.
     /// * `SECCOMP_FILTER_FLAG_NEW_LISTENER`: a listener is created and its file descriptor
     ///   becomes the successful return value.
     /// * `SECCOMP_FILTER_FLAG_TSYNC`: the resulting filter set is applied to every task in the
     ///   thread group instead of only to this one.
     /// * `SECCOMP_FILTER_FLAG_TSYNC_ESRCH`: changes how a TSYNC failure is reported, see
     ///   `seccomp_tsync_error`.
     ///
     /// # Errors
     ///
     /// Returns `EINVAL` when the program is empty or longer than `BPF_MAXINSNS`, and
     /// propagates errors from reading user memory, building the filter, creating the
     /// listener, adding the filter, or updating the seccomp state.
     ///
     /// A failed TSYNC is not necessarily an `Err`: without `SECCOMP_FILTER_FLAG_TSYNC_ESRCH`
     /// it is an `Ok` carrying the id of the first task that could not be synchronized.
     pub fn add_seccomp_filter(
         &mut self,
         bpf_filter: UserAddress,
         flags: u32,
     ) -> Result<SyscallResult, Errno> {
         let fprog: sock_fprog = self.read_object(UserRef::new(bpf_filter))?;

         // The program must contain between 1 and BPF_MAXINSNS instructions.
         if u32::from(fprog.len) > BPF_MAXINSNS || fprog.len == 0 {
             return Err(errno!(EINVAL));
         }

         // Copy the `fprog.len` instructions out of user memory.
         let code: Vec<sock_filter> =
             self.read_objects_to_vec(fprog.filter.into(), fprog.len as usize)?;

         // `next_seccomp_filter_id.add(1)` supplies the id for this filter (presumably a
         // fetch-and-increment on a per-thread-group counter; confirm).
         let new_filter = Arc::new(SeccompFilter::from_cbpf(
             &code,
             self.thread_group.next_seccomp_filter_id.add(1),
             flags & SECCOMP_FILTER_FLAG_LOG != 0,
         )?);

         let mut maybe_fd: Option<FdNumber> = None;

         // NOTE(review): the listener is created before the TSYNC checks below. If those
         // checks fail, the function returns without undoing this step; confirm that is
         // intended.
         if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
             let mut task_state = self.task.write();
             maybe_fd = Some(task_state.seccomp_filters.create_listener(self)?);
         }

         // We take the process lock here because we can't change any of the threads
         // while doing a tsync.  So, you hold the process lock while making any changes.
         let state = self.thread_group.write();

         if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
             // TSYNC synchronizes all filters for all threads in the current process to
             // the current thread's

             // We collect the filters for the current task upfront to save us acquiring
             // the task's lock a lot of times below.
             let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

             // For TSYNC to work, all of the other thread filters in this process have to
             // be a prefix of this thread's filters, and none of them can be in
             // strict mode.
             let tasks = state.tasks().collect::<Vec<_>>();
             for task in &tasks {
                 // The current task trivially agrees with itself.
                 if task.id == self.id {
                     continue;
                 }
                 let other_task_state = task.read();

                 // Target threads cannot be in SECCOMP_MODE_STRICT
                 if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                     return Self::seccomp_tsync_error(task.id, flags);
                 }

                 // Target threads' filters must be a subsequence of this thread's
                 if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                     return Self::seccomp_tsync_error(task.id, flags);
                 }
             }

             // Now that we're sure we're allowed to do so, add the filter to all threads.
             filters.add_filter(new_filter, fprog.len)?;

             // Unlike the validation loop above, this loop does not skip the current task:
             // every task in the thread group receives the same filter set.
             for task in &tasks {
                 let mut other_task_state = task.write();

                 other_task_state.enable_no_new_privs();
                 other_task_state.seccomp_filters = filters.clone();
                 task.set_seccomp_state(SeccompStateValue::UserDefined)?;
             }
         } else {
             // Without TSYNC only the current task is touched.
             let mut task_state = self.task.write();

             task_state.seccomp_filters.add_filter(new_filter, fprog.len)?;
             self.set_seccomp_state(SeccompStateValue::UserDefined)?;
         }

         // With a new listener the result is its fd; otherwise the unit result.
         if let Some(fd) = maybe_fd {
             Ok(fd.into())
         } else {
             Ok(().into())
         }
     }

     /// Evaluate this task's seccomp policy for `syscall`.
     ///
     /// A task in strict mode is handled by `SeccompState::do_strict`, which has slightly
     /// different semantics from user-defined filters. Otherwise all of the task's
     /// user-defined filters are run and `SeccompState::do_user_defined` turns their result
     /// into the return value.
     pub fn run_seccomp_filters(
         &mut self,
         syscall: &Syscall,
     ) -> Option<Result<SyscallResult, Errno>> {
         profile_duration!("RunSeccompFilters");

         let strict_mode = self.seccomp_filter_state.get() == SeccompStateValue::Strict;
         if strict_mode {
             SeccompState::do_strict(self, syscall)
         } else {
             // The task state lock is only held while the filters themselves run.
             let verdict = self.task.read().seccomp_filters.run_all(self, syscall);
             SeccompState::do_user_defined(verdict, self, syscall)
         }
     }

     /// Build the return value for a TSYNC that failed because of task `id`.
     ///
     /// By default the failure is reported as a successful result carrying the id of the
     /// first thread that could not be synchronized, rather than as an errno. When
     /// `SECCOMP_FILTER_FLAG_TSYNC_ESRCH` is set in `flags` the result is `ESRCH` instead,
     /// which avoids a clash with `SECCOMP_FILTER_FLAG_NEW_LISTENER` making seccomp return
     /// an fd.
     fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
         let wants_esrch = flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0;
         if !wants_esrch {
             return Ok(id.into());
         }
         Err(errno!(ESRCH))
     }

     /// Flag every futex on this task's robust list that the task still owns.
     ///
     /// The list lives in user memory, so it is walked defensively: any failed read,
     /// address overflow or failed store quietly ends the walk, and at most
     /// `ROBUST_LIST_LIMIT` entries are visited.
     // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
     // not wake up a waiter.
     pub fn notify_robust_list(&self) {
         // This guard stays alive until the function returns.
         let task_state = self.write();
         let list_head_addr = task_state.robust_list_head.addr();
         if list_head_addr == UserAddress::NULL {
             // No one has called set_robust_list.
             return;
         }
         let Ok(head) = self.read_object(task_state.robust_list_head) else {
             return;
         };
         let futex_offset = head.futex_offset;

         let mut visited = 0;
         let mut entry_ptr = head.list.next;
         // The walk ends when an entry points back at the list head.
         while entry_ptr.addr != list_head_addr.into() && visited < ROBUST_LIST_LIMIT {
             let Ok(entry) = self.read_object(entry_ptr.into()) else {
                 return;
             };

             // The futex word sits `futex_offset` bytes away from the list entry.
             let Some(futex_base) = entry_ptr.addr.addr.checked_add_signed(futex_offset) else {
                 return;
             };
             let futex_addr = UserAddress::from(futex_base);
             // TODO - What if this isn't 4 byte aligned?

             let Ok(futex) = self.mm().atomic_load_u32_relaxed(futex_addr) else {
                 return;
             };

             // Only futexes whose owner TID is this task get marked.
             let owned_by_this_task = (futex & FUTEX_TID_MASK) as i32 == self.id;
             if owned_by_this_task {
                 let marked = futex | FUTEX_OWNER_DIED;
                 if self.mm().atomic_store_u32_relaxed(futex_addr, marked).is_err() {
                     return;
                 }
             }

             entry_ptr = entry.next;
             visited += 1;
         }
     }

     /// Returns a clone of the handle to this thread's SeccompNotifier, if one is set.
     pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
         let task_state = self.task.write();
         task_state.seccomp_filters.notifier.clone()
     }

     /// Replaces this thread's SeccompNotifier handle; passing `None` clears it.
     pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
         let mut task_state = self.task.write();
         task_state.seccomp_filters.notifier = notifier;
     }

     /// Processes a Zircon exception associated with this task.
     pub fn process_exception(&self, report: &zx::sys::zx_exception_report_t) -> ExceptionResult {
         match report.header.type_ {
             zx::sys::ZX_EXCP_GENERAL => match get_signal_for_general_exception(&report.context) {
                 Some(sig) => ExceptionResult::Signal(SignalInfo::default(sig)),
                 None => {
                     log_warn!("Unrecognized general exception: {:?}", report);
                     ExceptionResult::Signal(SignalInfo::default(SIGILL))
                 }
             },
             zx::sys::ZX_EXCP_FATAL_PAGE_FAULT => self.mm().handle_page_fault(
                 decode_page_fault_exception_report(report),
                 zx::Status::from_raw(report.context.synth_code as zx::zx_status_t),
             ),
             zx::sys::ZX_EXCP_UNDEFINED_INSTRUCTION => {
                 ExceptionResult::Signal(SignalInfo::default(SIGILL))
             }
             zx::sys::ZX_EXCP_UNALIGNED_ACCESS => {
                 ExceptionResult::Signal(SignalInfo::default(SIGBUS))
             }
             zx::sys::ZX_EXCP_SW_BREAKPOINT => ExceptionResult::Signal(SignalInfo::default(SIGTRAP)),
             unknown => {
                 track_stub!(TODO("https://fxbug.dev/322874381"), "zircon exception", unknown);
                 log_error!("Unknown exception {:?}", report);
                 ExceptionResult::Signal(SignalInfo::default(SIGSEGV))
             }
         }
     }

     /// Create a process whose parent is the `init` process.
     ///
     /// The new task is the leader of a new thread group, and an underlying Zircon process is
     /// created to host it.
     ///
     /// Userspace normally creates processes itself, and those are descendants of `init`.
     /// When the kernel has to create a real userspace process on its own, this is the
     /// preferred entry point: parenting the process to `init` makes `init` responsible for
     /// waiting on it when it dies, and thereby for cleaning up its zombie.
     ///
     /// If a kernel task is enough and no userspace process is required, consider
     /// `create_system_task` instead, or better still the `kthreads` threadpool.
     ///
     /// # Errors
     ///
     /// Returns `EINVAL` when the task with pid 1 can no longer be upgraded from the pid
     /// table, and propagates any error from creating the task.
     pub fn create_init_child_process<L>(
         locked: &mut Locked<'_, L>,
         kernel: &Arc<Kernel>,
         initial_name: &CString,
     ) -> Result<TaskBuilder, Errno>
     where
         L: LockBefore<TaskRelease>,
     {
         // `init` is the task with pid 1.
         let init_weak = kernel.pids.read().get_task(1);
         let init = init_weak.upgrade().ok_or_else(|| errno!(EINVAL))?;

         let process_name = initial_name.as_bytes().to_vec();
         let child = Self::create_task(
             locked,
             kernel,
             initial_name.clone(),
             init.fs().fork(),
             |locked, pid, process_group| {
                 let signal_actions = SignalActions::default();
                 create_zircon_process(
                     locked,
                     kernel,
                     None,
                     pid,
                     process_group,
                     signal_actions,
                     &process_name,
                 )
             },
         )?;

         {
             // Link parent and child in both directions. The locks are taken in a fixed
             // order: init's thread group first, then the new one.
             let mut init_group = init.thread_group.write();
             let mut child_group = child.thread_group.write();
             init_group.children.insert(child.id, Arc::downgrade(&child.thread_group));
             child_group.parent = Some(Arc::clone(&init.thread_group));
         }

         // A child process created via fork(2) inherits its parent's
         // resource limits.  Resource limits are preserved across execve(2).
         let inherited_limits = init.thread_group.limits.lock().clone();
         *child.thread_group.limits.lock() = inherited_limits;

         Ok(child)
     }

     /// Creates the initial process for a kernel.
     ///
     /// The new task is the leader of a new thread group and is hosted by a freshly created
     /// Zircon process. It is created with root credentials and with the given `rlimits`.
     ///
     /// The init process is special because it is the root of the parent/child relationship
     /// between tasks: when a task dies, init is ultimately responsible for waiting on it and
     /// removing it from the zombie list.
     ///
     /// The kernel can create tasks whose ultimate parent is not init, but userspace cannot
     /// create such tasks directly.
     ///
     /// Call this only while booting a kernel instance. Once the kernel is up, use
     /// `create_init_child_process` or `create_system_task` instead.
     ///
     /// The process created here should always have pid 1. The `pid` is nevertheless an
     /// explicit argument, to make clear that choosing it is the caller's responsibility.
     pub fn create_init_process<L>(
         locked: &mut Locked<'_, L>,
         kernel: &Arc<Kernel>,
         pid: pid_t,
         initial_name: CString,
         fs: Arc<FsContext>,
         rlimits: &[(Resource, u64)],
     ) -> Result<TaskBuilder, Errno>
     where
         L: LockBefore<TaskRelease>,
     {
         // `initial_name` is moved into the task below, so the process name is copied first.
         let process_name = initial_name.as_bytes().to_vec();
         Self::create_task_with_pid(
             locked,
             kernel,
             kernel.pids.write(),
             pid,
             initial_name,
             fs,
             |locked, new_pid, process_group| {
                 let signal_actions = SignalActions::default();
                 create_zircon_process(
                     locked,
                     kernel,
                     None,
                     new_pid,
                     process_group,
                     signal_actions,
                     &process_name,
                 )
             },
             Credentials::root(),
             rlimits,
         )
     }

     /// Create a task that runs inside the kernel.
     ///
     /// No Zircon process hosts the task. Its work is carried out by a thread of the original
     /// Starnix process, possibly one that belongs to a thread pool.
     ///
     /// This is the preferred way to obtain a context for background work inside the kernel.
     /// Even so, consider `kthreads` before calling this directly: it provides both a system
     /// task and a threadpool on which the task can do work.
     pub fn create_system_task<L>(
         locked: &mut Locked<'_, L>,
         kernel: &Arc<Kernel>,
         fs: Arc<FsContext>,
     ) -> Result<CurrentTask, Errno>
     where
         L: LockBefore<TaskRelease>,
     {
         let name = CString::new("[kthreadd]").unwrap();
         let system_task: CurrentTask =
             Self::create_task(locked, kernel, name, fs, |locked, pid, process_group| {
                 // With no Zircon process behind the task, the thread group is given an
                 // invalid process handle and the memory manager starts out empty.
                 let memory_manager = Arc::new(MemoryManager::new_empty());
                 let thread_group = ThreadGroup::new(
                     locked,
                     kernel.clone(),
                     zx::Process::from(zx::Handle::invalid()),
                     None,
                     pid,
                     process_group,
                     SignalActions::default(),
                 );
                 Ok(TaskInfo { thread: None, thread_group, memory_manager })
             })?
             .into();
         Ok(system_task)
     }

     /// Allocates a fresh pid and builds a task for it with root credentials.
     ///
     /// Thin wrapper around `create_task_with_pid`: the pid comes from the kernel's pid
     /// table, the credentials are `Credentials::root()` and no resource limits are passed.
     /// The write guard on the pid table that is taken here to allocate the pid is handed
     /// over to `create_task_with_pid`.
     fn create_task<F, L>(
         locked: &mut Locked<'_, L>,
         kernel: &Arc<Kernel>,
         initial_name: CString,
         root_fs: Arc<FsContext>,
         task_info_factory: F,
     ) -> Result<TaskBuilder, Errno>
     where
         F: FnOnce(&mut Locked<'_, L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
         L: LockBefore<TaskRelease>,
     {
         let mut pid_table = kernel.pids.write();
         let new_pid = pid_table.allocate_pid();
         let creds = Credentials::root();
         Self::create_task_with_pid(
             locked,
             kernel,
             pid_table,
             new_pid,
             initial_name,
             root_fs,
             task_info_factory,
             creds,
             &[],
         )
     }

     /// Builds a task with the given `pid` inside a brand-new process group.
     ///
     /// * `pids`: write guard on the pid table, supplied by the caller and kept until this
     ///   function returns. The new process group, task and thread group are all registered
     ///   through it.
     /// * `task_info_factory`: called once with the new pid and process group; it supplies the
     ///   thread, thread group and memory manager for the task.
     /// * `creds`: credentials for the new task.
     /// * `rlimits`: `(resource, limit)` pairs; each limit is installed on the thread group as
     ///   both the soft (`rlim_cur`) and hard (`rlim_max`) value.
     ///
     /// The task gets a default `FdTable`, the kernel's default abstract socket namespaces and
     /// the kernel's root UTS namespace.
     ///
     /// # Errors
     ///
     /// Propagates errors from `task_info_factory` and from adding the task to its thread
     /// group.
     fn create_task_with_pid<F, L>(
         locked: &mut Locked<'_, L>,
         kernel: &Arc<Kernel>,
         mut pids: RwLockWriteGuard<'_, PidTable>,
         pid: pid_t,
         initial_name: CString,
         root_fs: Arc<FsContext>,
         task_info_factory: F,
         creds: Credentials,
         rlimits: &[(Resource, u64)],
     ) -> Result<TaskBuilder, Errno>
     where
         F: FnOnce(&mut Locked<'_, L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
         L: LockBefore<TaskRelease>,
     {
         // Debug builds check that no live task is already registered under `pid`.
         debug_assert!(pids.get_task(pid).upgrade().is_none());

         // The process group is created with the same id as the task.
         let process_group = ProcessGroup::new(pid, None);
         pids.add_process_group(&process_group);

         // NOTE(review): if the factory fails, the function returns with the process group
         // still registered in `pids`; confirm whether that needs cleaning up.
         let TaskInfo { thread, thread_group, memory_manager } =
             task_info_factory(locked, pid, process_group.clone())?;

         process_group.insert(locked, &thread_group);

         // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
         // > nanoseconds (50 microseconds).  The timer slack value is inherited by a child created
         // > via fork(2), and is preserved across execve(2).
         // https://man7.org/linux/man-pages/man2/prctl.2.html
         let default_timerslack = 50_000;
         let builder = TaskBuilder {
             task: OwnedRef::new(Task::new(
                 pid,
                 initial_name,
                 thread_group,
                 thread,
                 FdTable::default(),
                 memory_manager,
                 root_fs,
                 creds,
                 Arc::clone(&kernel.default_abstract_socket_namespace),
                 Arc::clone(&kernel.default_abstract_vsock_namespace),
                 // The remaining positional arguments line up with the `Task::new` call in
                 // `clone_task`: exit signal, signal mask, vfork event, scheduler policy,
                 // UTS namespace, no_new_privs, seccomp state, seccomp filters, robust list
                 // head and timer slack.
                 Some(SIGCHLD),
                 Default::default(),
                 None,
                 Default::default(),
                 kernel.root_uts_ns.clone(),
                 false,
                 SeccompState::default(),
                 SeccompFilterContainer::default(),
                 UserAddress::NULL.into(),
                 default_timerslack,
             )),
             thread_state: Default::default(),
         };
         // NOTE(review): `release_on_error!` presumably releases `builder` when the block
         // yields an error; confirm against the macro definition.
         release_on_error!(builder, locked, {
             let temp_task = TempRef::from(&builder.task);
             builder.thread_group.add(&temp_task)?;
             // Each limit is applied as both the soft and the hard value.
             for (resource, limit) in rlimits {
                 builder
                     .thread_group
                     .limits
                     .lock()
                     .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
             }

             // Register the task and its thread group in the pid table.
             pids.add_task(&temp_task);
             pids.add_thread_group(&builder.thread_group);
             Ok(())
         });
         Ok(builder)
     }

     /// Create a kernel task in the same ThreadGroup as the given `system_task`.
     ///
     /// There is no underlying Zircon thread to host the task.
     ///
     /// The new task shares `system_task`'s thread group, memory manager, filesystem context
     /// and abstract socket namespaces, and copies its credentials, scheduler policy, UTS
     /// namespace and default timer slack. It gets its own freshly allocated pid and a default
     /// `FdTable`.
     ///
     /// # Errors
     ///
     /// Propagates the error from adding the new task to the thread group.
     pub fn create_kernel_thread<L>(
         locked: &mut Locked<'_, L>,
         system_task: &Task,
         initial_name: CString,
     ) -> Result<CurrentTask, Errno>
     where
         L: LockBefore<TaskRelease>,
     {
         // The pid table stays write-locked from allocation until the task is registered.
         let mut pids = system_task.kernel().pids.write();
         let pid = pids.allocate_pid();

         let scheduler_policy;
         let uts_ns;
         let default_timerslack_ns;
         {
             // Snapshot what is needed from the system task's state inside a scope, so that
             // the read lock is released before the new task is built.
             let state = system_task.read();
             scheduler_policy = state.scheduler_policy;
             uts_ns = state.uts_ns.clone();
             default_timerslack_ns = state.default_timerslack_ns;
         }

         let current_task: CurrentTask = TaskBuilder::new(Task::new(
             pid,
             initial_name,
             Arc::clone(&system_task.thread_group),
             // No thread object is supplied for this task.
             None,
             FdTable::default(),
             Arc::clone(system_task.mm()),
             Arc::clone(system_task.fs()),
             system_task.creds(),
             Arc::clone(&system_task.abstract_socket_namespace),
             Arc::clone(&system_task.abstract_vsock_namespace),
             // The remaining positional arguments line up with the `Task::new` call in
             // `clone_task`: exit signal (none), signal mask, vfork event (none), scheduler
             // policy, UTS namespace, no_new_privs, seccomp state, seccomp filters, robust
             // list head and timer slack. The timer slack passed here is the system task's
             // `default_timerslack_ns`.
             None,
             Default::default(),
             None,
             scheduler_policy,
             uts_ns,
             false,
             SeccompState::default(),
             SeccompFilterContainer::default(),
             UserAddress::NULL.into(),
             default_timerslack_ns,
         ))
         .into();
         // NOTE(review): `release_on_error!` presumably releases `current_task` when the block
         // yields an error; confirm against the macro definition.
         release_on_error!(current_task, locked, {
             let temp_task = current_task.temp_task();
             current_task.thread_group.add(&temp_task)?;
             // Only the task is registered; the thread group already exists.
             pids.add_task(&temp_task);
             Ok(())
         });
         Ok(current_task)
     }

     /// Clone this task.
     ///
     /// Creates a new task object that shares some state with this task
     /// according to the given flags.
     ///
     /// Used by the clone() syscall to create both processes and threads.
     ///
     /// The exit signal is broken out from the flags parameter like clone3() rather than being
     /// bitwise-ORed like clone().
     pub fn clone_task<L>(
         &self,
         locked: &mut Locked<'_, L>,
         flags: u64,
         child_exit_signal: Option<Signal>,
         user_parent_tid: UserRef<pid_t>,
         user_child_tid: UserRef<pid_t>,
     ) -> Result<TaskBuilder, Errno>
     where
         L: LockBefore<MmDumpable>,
         L: LockBefore<TaskRelease>,
     {
         const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
             | CLONE_FS
             | CLONE_FILES
             | CLONE_SIGHAND
             | CLONE_THREAD
             | CLONE_SYSVSEM
             | CLONE_SETTLS
             | CLONE_PARENT_SETTID
             | CLONE_CHILD_CLEARTID
             | CLONE_CHILD_SETTID
             | CLONE_VFORK
             | CLONE_PTRACE) as u64;
         // A mask with all valid flags set, because we want to return a different error code for an
         // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
         // mask with all flags below it set. Shift up by one to make sure the largest flag is also
         // set.
         const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

         // CLONE_SETTLS is implemented by sys_clone.

         let clone_files = flags & (CLONE_FILES as u64) != 0;
         let clone_fs = flags & (CLONE_FS as u64) != 0;
         let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
         let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
         let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
         let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
         let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
         let clone_thread = flags & (CLONE_THREAD as u64) != 0;
         let clone_vm = flags & (CLONE_VM as u64) != 0;
         let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
         let clone_vfork = flags & (CLONE_VFORK as u64) != 0;

         let new_uts = flags & (CLONE_NEWUTS as u64) != 0;

         if clone_ptrace {
             track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
         }

         if clone_sysvsem {
             track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
         }

         if clone_sighand && !clone_vm {
             return error!(EINVAL);
         }
         if clone_thread && !clone_sighand {
             return error!(EINVAL);
         }
         if flags & !VALID_FLAGS != 0 {
             return error!(EINVAL);
         }

         if clone_vm && !clone_thread {
             // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
             // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
             // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
             // always OK.
             //
             // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
             // process' VM that will be immediately replaced with a call to exec(). The main users
             // (libc and language runtimes) don't actually rely on the memory being shared between
             // the two processes. And the vfork() man page explicitly allows vfork() to be
             // implemented as fork() which is what we do here.
             if !clone_vfork {
                 track_stub!(
                     TODO("https://fxbug.dev/322875227"),
                     "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                 );
             }
         } else if clone_thread && !clone_vm {
             track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
             return error!(ENOSYS);
         }

         if flags & !IMPLEMENTED_FLAGS != 0 {
             track_stub!(
                 TODO("https://fxbug.dev/322875130"),
                 "clone unknown flags",
                 flags & !IMPLEMENTED_FLAGS
             );
             return error!(ENOSYS);
         }

         let fs = if clone_fs { self.fs().clone() } else { self.fs().fork() };
         let files = if clone_files { self.files.clone() } else { self.files.fork() };

         let kernel = self.kernel();
         let mut pids = kernel.pids.write();

         let pid;
         let command;
         let creds;
         let scheduler_policy;
         let uts_ns;
         let no_new_privs;
         let seccomp_filters;
         let robust_list_head = UserAddress::NULL.into();
         let child_signal_mask;
         let timerslack_ns;

         let TaskInfo { thread, thread_group, memory_manager } = {
             // Make sure to drop these locks ASAP to avoid inversion
             let thread_group_state = self.thread_group.write();
             let state = self.read();

             no_new_privs = state.no_new_privs();
             seccomp_filters = state.seccomp_filters.clone();
             child_signal_mask = state.signals.mask();

             pid = pids.allocate_pid();
             command = self.command();
             creds = self.creds();
             scheduler_policy = state.scheduler_policy.fork();
             timerslack_ns = state.timerslack_ns;

             uts_ns = if new_uts {
                 if !self.creds().has_capability(CAP_SYS_ADMIN) {
                     return error!(EPERM);
                 }

                 // Fork the UTS namespace of the existing task.
                 let new_uts_ns = state.uts_ns.read().clone();
                 Arc::new(RwLock::new(new_uts_ns))
             } else {
                 // Inherit the UTS of the existing task.
                 state.uts_ns.clone()
             };

             if clone_thread {
                 let thread_group = self.thread_group.clone();
                 let memory_manager = self.mm().clone();
                 TaskInfo { thread: None, thread_group, memory_manager }
             } else {
                 // Drop the lock on this task before entering `create_zircon_process`, because it will
                 // take a lock on the new thread group, and locks on thread groups have a higher
                 // priority than locks on the task in the thread group.
                 std::mem::drop(state);
                 let signal_actions = if clone_sighand {
                     self.thread_group.signal_actions.clone()
                 } else {
                     self.thread_group.signal_actions.fork()
                 };
                 let process_group = thread_group_state.process_group.clone();
                 create_zircon_process(
                     locked,
                     kernel,
                     Some(thread_group_state),
                     pid,
                     process_group,
                     signal_actions,
                     command.as_bytes(),
                 )?
             }
         };

         // Only create the vfork event when the caller requested CLONE_VFORK.
         let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

         let mut child = TaskBuilder::new(Task::new(
             pid,
             command,
             thread_group,
             thread,
             files,
             memory_manager,
             fs,
             creds,
             self.abstract_socket_namespace.clone(),
             self.abstract_vsock_namespace.clone(),
             child_exit_signal,
             child_signal_mask,
             vfork_event,
             scheduler_policy,
             uts_ns,
             no_new_privs,
             SeccompState::from(&self.seccomp_filter_state),
             seccomp_filters,
             robust_list_head,
             timerslack_ns,
         ));

         release_on_error!(child, locked, {
             let child_task = TempRef::from(&child.task);
             // Drop the pids lock as soon as possible after creating the child. Destroying the child
             // and removing it from the pids table itself requires the pids lock, so if an early exit
             // takes place we have a self deadlock.
             pids.add_task(&child_task);
             if !clone_thread {
                 pids.add_thread_group(&child.thread_group);
             }
             std::mem::drop(pids);

             // Child lock must be taken before this lock. Drop the lock on the task, take a writable
             // lock on the child and take the current state back.

             #[cfg(any(test, debug_assertions))]
             {
                 // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                 // will trigger the tracing-mutex at the right call site.
                 if !clone_thread {
                     let _l1 = self.thread_group.read();
                     let _l2 = child.thread_group.read();
                 }
             }

             if clone_thread {
                 self.thread_group.add(&child_task)?;
             } else {
                 child.thread_group.add(&child_task)?;
                 let mut child_state = child.write();
                 let state = self.read();
                 child_state.signals.alt_stack = state.signals.alt_stack;
                 child_state.signals.set_mask(state.signals.mask());
                 self.mm().snapshot_to(locked, child.mm())?;
             }

             if clone_parent_settid {
                 self.write_object(user_parent_tid, &child.id)?;
             }

             if clone_child_cleartid {
                 child.write().clear_child_tid = user_child_tid;
             }

             if clone_child_settid {
                 child.write_object(user_child_tid, &child.id)?;
             }
             child.thread_state = self.thread_state.snapshot();
             Ok(())
         });
         // Take the lock on thread group and task in the correct order to ensure any wrong ordering
         // will trigger the tracing-mutex at the right call site.
         #[cfg(any(test, debug_assertions))]
         {
             let _l1 = child.thread_group.read();
             let _l2 = child.read();
         }

         Ok(child)
     }

     /// Applies `stopped` to this task via `set_stopped` and then notifies the parent thread
     /// group's child-status waiters when appropriate.
     ///
     /// The task's mutable state is first synchronized through `copy_state_from` and then updated
     /// while the task's write lock is held. Once that lock has been released, and unless
     /// `stopped.is_in_progress()` is true, the `child_status_waiters` of the parent thread group
     /// (if there is one) are notified.
     pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
         {
             let mut task_state = self.write();
             task_state.copy_state_from(self);
             task_state.set_stopped(stopped, siginfo, Some(self), None);
         }

         // A stop that is still in progress is not reported to the parent.
         if stopped.is_in_progress() {
             return;
         }

         // Clone the parent handle in its own statement so that the thread group read guard is
         // released before the parent is write-locked.
         let maybe_parent = self.thread_group.read().parent.clone();
         if let Some(parent_group) = maybe_parent {
             parent_group.write().child_status_waiters.notify_all();
         }
     }

     /// Settles the stop state of this task and reports whether the caller should stop.
     ///
     /// Returns `true` when the thread group could be flipped to `GroupStopped`, or when the
     /// task's own state was stopping or stopped on entry. Returns `false` otherwise, which
     /// covers the case where the task is waking up.
     pub fn finalize_stop_state(&mut self) -> bool {
         let stop_state = self.load_stopped();
         let stopping = stop_state.is_stopping_or_stopped();

         if !stopping {
             // Waking up: a tracer may have modified the captured state, in which case the
             // registers are written back.
             let captured_state = self.write().take_captured_state();
             match captured_state {
                 Some(captured) if captured.dirty => {
                     self.thread_state.replace_registers(&captured.thread_state);
                 }
                 _ => {}
             }
         }

         // Stopping because the thread group is stopping: try to flip the group to
         // `GroupStopped`. The returned state differs when the flip should not happen.
         let group_state = self.thread_group.set_stopped(StopState::GroupStopped, None, true);
         if group_state == StopState::GroupStopped {
             let last_signal = self.thread_group.read().last_signal.clone();
             let stop_event = PtraceEventData::new_from_event(PtraceEvent::Stop, 0);
             self.write().set_stopped(
                 StopState::GroupStopped,
                 last_signal,
                 Some(self),
                 Some(stop_event),
             );
             return true;
         }

         if !stopping {
             return false;
         }

         // Stopping because the task itself is stopping: move to the finalized state when
         // there is one, and tell the caller to stop either way.
         if let Ok(finalized) = stop_state.finalize() {
             self.set_stopped_and_notify(finalized, None);
         }
         true
     }

     /// Keeps `current_task` blocked for as long as it is stopped and has not exited.
     ///
     /// Returns immediately when `finalize_stop_state` reports that the task should not stop.
     /// Otherwise the task waits on a signal-ignoring waiter until it is either unstopped or
     /// found to have exited; in the latter case both the thread group and the task are forced
     /// awake without notifying waiters.
     pub fn block_while_stopped(&mut self) {
         // Upgrade the state from stopping to stopped if needed; nothing to do when the task
         // should not be stopped.
         let should_stop = self.finalize_stop_state();
         if !should_stop {
             return;
         }

         let stop_waiter = Waiter::new_ignoring_signals();
         while !self.is_exitted() {
             if self.wake_or_wait_until_unstopped_async(&stop_waiter) {
                 return;
             }

             // The wait result is deliberately discarded: this is not running inside a syscall.
             let _: Result<(), Errno> = stop_waiter.wait(self);

             // The task may be stopping again by now; if so, move from stopping to stopped.
             self.finalize_stop_state();
         }

         // The task has exited: unstop the threads and return without notifying waiters.
         self.thread_group.set_stopped(StopState::ForceAwake, None, false);
         self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
     }

     /// For traced tasks, this will return the data neceessary for a cloned task
     /// to attach to the same tracer.
     pub fn get_ptrace_core_state_for_clone(
         &mut self,
         clone_args: &clone_args,
     ) -> (PtraceOptions, Option<PtraceCoreState>) {
         let state = self.write();
         if let Some(ref ptrace) = &state.ptrace {
             ptrace.get_core_state_for_clone(clone_args)
         } else {
             (PtraceOptions::empty(), None)
         }
     }

     /// If currently being ptraced with the given option, emit the appropriate
     /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
     /// appropriate event for execve in the absence of TRACEEXEC.
     ///
     /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
     /// enabled, SIGKILL will trigger an event.  We do not exhibit this
     /// behavior.
     pub fn ptrace_event(&mut self, trace_kind: PtraceOptions, msg: u64) {
         if !trace_kind.is_empty() {
             {
                 let mut state = self.write();
                 if let Some(ref mut ptrace) = &mut state.ptrace {
                     if !ptrace.has_option(trace_kind) {
                         // If this would be a TRACEEXEC, but TRACEEXEC is not
                         // turned on, then send a SIGTRAP.
                         if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
                             // Send a SIGTRAP so that the parent can gain control.
                             send_signal_first(self, state, SignalInfo::default(SIGTRAP));
                         }

                         return;
                     }
                     let mut siginfo = SignalInfo::default(starnix_uapi::signals::SIGTRAP);
                     siginfo.code = (((PtraceEvent::from_option(&trace_kind) as u32) << 8)
                         | linux_uapi::SIGTRAP) as i32;
                     state.set_stopped(
                         StopState::PtraceEventStopping,
                         Some(siginfo),
                         None,
                         Some(PtraceEventData::new(trace_kind, msg)),
                     );
                 } else {
                     return;
                 }
             }
             self.block_while_stopped();
         }
     }

     /// Makes the thread group of the current thread exit with `exit_status`.
     ///
     /// Before the exit, a TRACEEXIT ptrace event is raised whose message is the status'
     /// `signal_info_status()` value, so that a ptracer of this task is notified first.
     pub fn thread_group_exit(&mut self, exit_status: ExitStatus) {
         let event_msg = exit_status.signal_info_status() as u64;
         self.ptrace_event(PtraceOptions::TRACEEXIT, event_msg);
         self.thread_group.exit(exit_status, None);
     }

     /// Test helper that clones this task and wraps the result in an `AutoReleasableTask`.
     ///
     /// `flags` holds only the flags as in clone3(); unlike clone(), its low 8 bits are not
     /// used for the exit signal, which is passed separately as `exit_signal`.
     ///
     /// Panics with "failed to create task in test" if `clone_task` returns an error.
     pub fn clone_task_for_test<L>(
         &self,
         locked: &mut Locked<'_, L>,
         flags: u64,
         exit_signal: Option<Signal>,
     ) -> crate::testing::AutoReleasableTask
     where
         L: LockBefore<MmDumpable>,
         L: LockBefore<TaskRelease>,
     {
         // Both user references passed to `clone_task` are left at their default value.
         self.clone_task(locked, flags, exit_signal, UserRef::default(), UserRef::default())
             .expect("failed to create task in test")
             .into()
     }
 }

 /// `MemoryAccessor` for the current task: every operation is forwarded to the matching
 /// `unified_*` method of the task's memory manager, with the task itself passed along.
 impl MemoryAccessor for CurrentTask {
     /// Forwards to the memory manager's `unified_read_memory`.
     fn read_memory<'a>(
         &self,
         addr: UserAddress,
         bytes: &'a mut [MaybeUninit<u8>],
     ) -> Result<&'a mut [u8], Errno> {
         let mm = self.mm();
         mm.unified_read_memory(self, addr, bytes)
     }

     /// Forwards to the memory manager's `unified_read_memory_partial_until_null_byte`.
     fn read_memory_partial_until_null_byte<'a>(
         &self,
         addr: UserAddress,
         bytes: &'a mut [MaybeUninit<u8>],
     ) -> Result<&'a mut [u8], Errno> {
         let mm = self.mm();
         mm.unified_read_memory_partial_until_null_byte(self, addr, bytes)
     }

     /// Forwards to the memory manager's `unified_read_memory_partial`.
     fn read_memory_partial<'a>(
         &self,
         addr: UserAddress,
         bytes: &'a mut [MaybeUninit<u8>],
     ) -> Result<&'a mut [u8], Errno> {
         let mm = self.mm();
         mm.unified_read_memory_partial(self, addr, bytes)
     }

     /// Forwards to the memory manager's `unified_write_memory`.
     fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
         let mm = self.mm();
         mm.unified_write_memory(self, addr, bytes)
     }

     /// Forwards to the memory manager's `unified_write_memory_partial`.
     fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
         let mm = self.mm();
         mm.unified_write_memory_partial(self, addr, bytes)
     }

     /// Forwards to the memory manager's `unified_zero`.
     fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
         let mm = self.mm();
         mm.unified_zero(self, addr, length)
     }
 }

 impl TaskMemoryAccessor for CurrentTask {
     /// Returns the `maximum_valid_user_address` recorded by the task's memory manager.
     fn maximum_valid_address(&self) -> UserAddress {
         let mm = self.mm();
         mm.maximum_valid_user_address
     }
 }

 /// Outcome of handling an exception: either it was fully handled, or it produced a signal
 /// that still has to be delivered.
 pub enum ExceptionResult {
     /// The exception was handled and no further action is required.
     Handled,

     /// The exception generated a signal that should be delivered.
     Signal(SignalInfo),
 }