| // Copyright 2023 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| use crate::{ |
| arch::{ |
| registers::RegisterState, |
| task::{decode_page_fault_exception_report, get_signal_for_general_exception}, |
| }, |
| execution::{create_zircon_process, TaskInfo}, |
| fs::proc::pid_directory::TaskDirectory, |
| loader::{load_executable, resolve_executable, ResolvedElf}, |
| mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor}, |
| selinux::hooks::current_task_hooks as selinux_hooks, |
| signals::{send_signal_first, send_standard_signal, RunState, SignalActions, SignalInfo}, |
| task::{ |
| ExitStatus, Kernel, PidTable, ProcessGroup, PtraceCoreState, PtraceEvent, PtraceEventData, |
| PtraceOptions, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle, SeccompState, |
| SeccompStateValue, StopState, Task, TaskFlags, ThreadGroup, Waiter, |
| }, |
| vfs::{ |
| FdNumber, FdTable, FileHandle, FsContext, FsStr, LookupContext, NamespaceNode, ResolveBase, |
| SymlinkMode, SymlinkTarget, MAX_SYMLINK_FOLLOWS, |
| }, |
| }; |
| use extended_pstate::ExtendedPstateState; |
| use fuchsia_inspect_contrib::profile_duration; |
| use fuchsia_zircon::{ |
| sys::zx_thread_state_general_regs_t, |
| {self as zx}, |
| }; |
| use starnix_logging::{log_error, log_warn, set_zx_name, track_file_not_found, track_stub}; |
| use starnix_sync::{ |
| DeviceOpen, EventWaitGuard, FileOpsCore, LockBefore, Locked, MmDumpable, RwLock, |
| RwLockWriteGuard, TaskRelease, WakeReason, |
| }; |
| use starnix_syscalls::{decls::Syscall, SyscallResult}; |
| use starnix_uapi::{ |
| auth::{Credentials, CAP_SYS_ADMIN}, |
| clone_args, |
| device_type::DeviceType, |
| errno, error, |
| errors::Errno, |
| file_mode::{Access, FileMode}, |
| from_status_like_fdio, |
| open_flags::OpenFlags, |
| ownership::{release_on_error, OwnedRef, Releasable, TempRef, WeakRef}, |
| pid_t, |
| resource_limits::Resource, |
| rlimit, |
| signals::{SigSet, Signal, SIGBUS, SIGCHLD, SIGILL, SIGSEGV, SIGTRAP}, |
| sock_filter, sock_fprog, |
| user_address::{UserAddress, UserRef}, |
| vfs::ResolveFlags, |
| BPF_MAXINSNS, CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, |
| CLONE_INTO_CGROUP, CLONE_NEWUTS, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, |
| CLONE_SIGHAND, CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, |
| FUTEX_TID_MASK, ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER, |
| SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, SI_KERNEL, |
| }; |
| use std::{ffi::CString, fmt, marker::PhantomData, mem::MaybeUninit, sync::Arc}; |
| |
| pub struct TaskBuilder { |
| /// The underlying task object. |
| pub task: OwnedRef<Task>, |
| |
| pub thread_state: ThreadState, |
| } |
| |
| impl TaskBuilder { |
| pub fn new(task: Task) -> Self { |
| Self { task: OwnedRef::new(task), thread_state: Default::default() } |
| } |
| |
| #[inline(always)] |
| pub fn release<L>(self, locked: &mut Locked<'_, L>) |
| where |
| L: LockBefore<TaskRelease>, |
| { |
| let mut locked = locked.cast_locked::<TaskRelease>(); |
| Releasable::release(self, &mut locked); |
| } |
| } |
| |
| impl From<TaskBuilder> for CurrentTask { |
| fn from(builder: TaskBuilder) -> Self { |
| Self::new(builder.task, builder.thread_state) |
| } |
| } |
| |
| impl Releasable for TaskBuilder { |
| type Context<'a> = &'a mut Locked<'a, TaskRelease>; |
| |
| fn release<'a>(self, locked: &'a mut Locked<'a, TaskRelease>) { |
| let context = (self.thread_state, locked); |
| self.task.release(context); |
| } |
| } |
| |
| impl std::ops::Deref for TaskBuilder { |
| type Target = Task; |
| fn deref(&self) -> &Self::Target { |
| &self.task |
| } |
| } |
| |
| /// The task object associated with the currently executing thread. |
| /// |
| /// We often pass the `CurrentTask` as the first argument to functions if those functions need to |
| /// know contextual information about the thread on which they are running. For example, we often |
| /// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to |
| /// perform the requested operation. |
| /// |
| /// The `CurrentTask` also has state that can be referenced only on the currently executing thread, |
| /// such as the register state for that thread. Syscalls are given a mutable references to the |
| /// `CurrentTask`, which lets them manipulate this state. |
| /// |
| /// See also `Task` for more information about tasks. |
| pub struct CurrentTask { |
| /// The underlying task object. |
| pub task: OwnedRef<Task>, |
| |
| pub thread_state: ThreadState, |
| |
| /// Makes CurrentTask neither Sync not Send. |
| _local_marker: PhantomData<*mut u8>, |
| } |
| |
| /// The thread related information of a `CurrentTask`. The information should never be used outside |
| /// of the thread owning the `CurrentTask`. |
| #[derive(Default)] |
| pub struct ThreadState { |
| /// A copy of the registers associated with the Zircon thread. Up-to-date values can be read |
| /// from `self.handle.read_state_general_regs()`. To write these values back to the thread, call |
| /// `self.handle.write_state_general_regs(self.thread_state.registers.into())`. |
| pub registers: RegisterState, |
| |
| /// Copy of the current extended processor state including floating point and vector registers. |
| pub extended_pstate: ExtendedPstateState, |
| |
| /// A custom function to resume a syscall that has been interrupted by SIGSTOP. |
| /// To use, call set_syscall_restart_func and return ERESTART_RESTARTBLOCK. sys_restart_syscall |
| /// will eventually call it. |
| pub syscall_restart_func: Option<Box<SyscallRestartFunc>>, |
| } |
| |
| impl ThreadState { |
| /// Returns a new `ThreadState` with the same `registers` as this one. |
| fn snapshot(&self) -> Self { |
| Self { |
| registers: self.registers, |
| extended_pstate: Default::default(), |
| syscall_restart_func: None, |
| } |
| } |
| |
| pub fn extended_snapshot(&self) -> Self { |
| Self { |
| registers: self.registers.clone(), |
| extended_pstate: self.extended_pstate.clone(), |
| syscall_restart_func: None, |
| } |
| } |
| |
| pub fn replace_registers(&mut self, other: &ThreadState) { |
| self.registers = other.registers; |
| self.extended_pstate = other.extended_pstate; |
| } |
| |
| pub fn get_user_register(&mut self, offset: usize) -> Result<usize, Errno> { |
| let mut result: usize = 0; |
| self.registers.apply_user_register(offset, &mut |register| result = *register as usize)?; |
| Ok(result) |
| } |
| |
| pub fn set_user_register(&mut self, offset: usize, value: usize) -> Result<(), Errno> { |
| self.registers.apply_user_register(offset, &mut |register| *register = value as u64) |
| } |
| } |
| |
| type SyscallRestartFunc = |
| dyn FnOnce(&mut CurrentTask) -> Result<SyscallResult, Errno> + Send + Sync; |
| |
| impl Releasable for CurrentTask { |
| type Context<'a> = &'a mut Locked<'a, TaskRelease>; |
| |
| fn release<'a>(self, locked: &'a mut Locked<'a, TaskRelease>) { |
| self.notify_robust_list(); |
| let _ignored = self.clear_child_tid_if_needed(); |
| |
| // We remove from the thread group here because the WeakRef in the pid |
| // table to this task must be valid until this task is removed from the |
| // thread group, but self.task.release() below invalidates it. |
| self.thread_group.remove(locked, &self); |
| |
| let context = (self.thread_state, locked); |
| self.task.release(context); |
| } |
| } |
| |
| impl std::ops::Deref for CurrentTask { |
| type Target = Task; |
| fn deref(&self) -> &Self::Target { |
| &self.task |
| } |
| } |
| |
| impl fmt::Debug for CurrentTask { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| self.task.fmt(f) |
| } |
| } |
| |
| impl CurrentTask { |
| pub fn new(task: OwnedRef<Task>, thread_state: ThreadState) -> Self { |
| Self { task, thread_state, _local_marker: Default::default() } |
| } |
| |
| pub fn trigger_delayed_releaser(&self) { |
| self.kernel().delayed_releaser.apply(self); |
| } |
| |
| pub fn weak_task(&self) -> WeakRef<Task> { |
| WeakRef::from(&self.task) |
| } |
| |
| pub fn temp_task(&self) -> TempRef<'_, Task> { |
| TempRef::from(&self.task) |
| } |
| |
| pub fn set_creds(&self, creds: Credentials) { |
| *self.temp_task().persistent_info.lock().creds_mut() = creds; |
| // The /proc/pid direectory's ownership is updated when the task's euid |
| // or egid changes. See proc(5). |
| let mut state = self.proc_pid_directory_cache.lock(); |
| TaskDirectory::maybe_force_chown(self, &mut state, &self.creds()); |
| } |
| |
| #[inline(always)] |
| pub fn release<L>(self, locked: &mut Locked<'_, L>) |
| where |
| L: LockBefore<TaskRelease>, |
| { |
| let mut locked = locked.cast_locked::<TaskRelease>(); |
| Releasable::release(self, &mut locked); |
| } |
| |
| pub fn set_syscall_restart_func<R: Into<SyscallResult>>( |
| &mut self, |
| f: impl FnOnce(&mut CurrentTask) -> Result<R, Errno> + Send + Sync + 'static, |
| ) { |
| self.thread_state.syscall_restart_func = |
| Some(Box::new(|current_task| Ok(f(current_task)?.into()))); |
| } |
| |
| /// Sets the task's signal mask to `signal_mask` and runs `wait_function`. |
| /// |
| /// Signals are dequeued prior to the original signal mask being restored. This is done by the |
| /// signal machinery in the syscall dispatch loop. |
| /// |
| /// The returned result is the result returned from the wait function. |
| pub fn wait_with_temporary_mask<F, T>( |
| &mut self, |
| signal_mask: SigSet, |
| wait_function: F, |
| ) -> Result<T, Errno> |
| where |
| F: FnOnce(&CurrentTask) -> Result<T, Errno>, |
| { |
| { |
| let mut state = self.write(); |
| state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true); |
| state.signals.set_temporary_mask(signal_mask); |
| } |
| wait_function(self) |
| } |
| |
| /// If waking, promotes from waking to awake. If not waking, make waiter async |
| /// wait until woken. Returns true if woken. |
| pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool { |
| let group_state = self.thread_group.read(); |
| let mut task_state = self.write(); |
| |
| // Wake up if |
| // a) we should wake up, meaning: |
| // i) we're in group stop, and the thread group has exited group stop, or |
| // ii) we're waking up, |
| // b) and ptrace isn't stopping us from waking up, but |
| // c) always wake up if we got a SIGKILL. |
| let task_stop_state = self.load_stopped(); |
| let group_stop_state = self.thread_group.load_stopped(); |
| if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake()) |
| || task_stop_state.is_waking_or_awake()) |
| && (!task_state.is_ptrace_listening() || task_stop_state.is_force()) |
| { |
| let new_state = if task_stop_state.is_waking_or_awake() { |
| task_stop_state.finalize() |
| } else { |
| group_stop_state.finalize() |
| }; |
| if let Ok(new_state) = new_state { |
| task_state.set_stopped(new_state, None, Some(self), None); |
| drop(group_state); |
| drop(task_state); |
| // It is possible for the stop state to be changed by another |
| // thread between when it is checked above and the following |
| // invocation, but set_stopped does sufficient checking while |
| // holding the lock to make sure that such a change won't result |
| // in corrupted state. |
| self.thread_group.set_stopped(new_state, None, false); |
| return true; |
| } |
| } |
| |
| // We will wait. |
| if self.thread_group.load_stopped().is_stopped() || task_stop_state.is_stopped() { |
| // If we've stopped or PTRACE_LISTEN has been sent, wait for a |
| // signal or instructions from the tracer. |
| group_state.stopped_waiters.wait_async(&waiter); |
| task_state.wait_on_ptracer(&waiter); |
| } else if task_state.can_accept_ptrace_commands() { |
| // If we're stopped because a tracer has seen the stop and not taken |
| // further action, wait for further instructions from the tracer. |
| task_state.wait_on_ptracer(&waiter); |
| } else if task_state.is_ptrace_listening() { |
| // A PTRACE_LISTEN is a state where we can get signals and notify a |
| // ptracer, but otherwise remain blocked. |
| if let Some(ref mut ptrace) = &mut task_state.ptrace { |
| ptrace.set_last_signal(Some(SignalInfo::default(SIGTRAP))); |
| ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0))); |
| } |
| task_state.wait_on_ptracer(&waiter); |
| task_state.notify_ptracers(); |
| } |
| false |
| } |
| |
| /// Set the RunState for the current task to the given value and then call the given callback. |
| /// |
| /// When the callback is done, the run_state is restored to `RunState::Running`. |
| /// |
| /// This function is typically used just before blocking the current task on some operation. |
| /// The given `run_state` registers the mechanism for interrupting the blocking operation with |
| /// the task and the given `callback` actually blocks the task. |
| /// |
| /// This function can only be called in the `RunState::Running` state and cannot set the |
| /// run state to `RunState::Running`. For this reason, this function cannot be reentered. |
| pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno> |
| where |
| F: FnOnce() -> Result<T, Errno>, |
| { |
| assert_ne!(run_state, RunState::Running); |
| |
| { |
| let mut state = self.write(); |
| assert!(!state.signals.run_state.is_blocked()); |
| // A note on PTRACE_LISTEN - the thread cannot be scheduled |
| // regardless of pending signals. |
| if state.signals.is_any_pending() && !state.is_ptrace_listening() { |
| return error!(EINTR); |
| } |
| state.signals.run_state = run_state.clone(); |
| } |
| |
| let result = callback(); |
| |
| { |
| let mut state = self.write(); |
| assert_eq!( |
| state.signals.run_state, run_state, |
| "SignalState run state changed while waiting!" |
| ); |
| state.signals.run_state = RunState::Running; |
| }; |
| |
| result |
| } |
| |
| pub fn block_until(&self, guard: EventWaitGuard<'_>, deadline: zx::Time) -> Result<(), Errno> { |
| self.run_in_state(RunState::Event(guard.event().clone()), move || { |
| guard.block_until(deadline).map_err(|e| match e { |
| WakeReason::Interrupted => errno!(EINTR), |
| WakeReason::DeadlineExpired => errno!(ETIMEDOUT), |
| }) |
| }) |
| } |
| |
| /// Determine namespace node indicated by the dir_fd. |
| /// |
| /// Returns the namespace node and the path to use relative to that node. |
| pub fn resolve_dir_fd<'a>( |
| &self, |
| dir_fd: FdNumber, |
| mut path: &'a FsStr, |
| flags: ResolveFlags, |
| ) -> Result<(NamespaceNode, &'a FsStr), Errno> { |
| let path_is_absolute = path.starts_with(b"/"); |
| if path_is_absolute { |
| if flags.contains(ResolveFlags::BENEATH) { |
| return error!(EXDEV); |
| } |
| path = &path[1..]; |
| } |
| |
| let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) { |
| self.fs().root() |
| } else if dir_fd == FdNumber::AT_FDCWD { |
| self.fs().cwd() |
| } else { |
| // O_PATH allowed for: |
| // |
| // Passing the file descriptor as the dirfd argument of |
| // openat() and the other "*at()" system calls. This |
| // includes linkat(2) with AT_EMPTY_PATH (or via procfs |
| // using AT_SYMLINK_FOLLOW) even if the file is not a |
| // directory. |
| // |
| // See https://man7.org/linux/man-pages/man2/open.2.html |
| let file = self.files.get_allowing_opath(dir_fd)?; |
| file.name.clone() |
| }; |
| |
| if !path.is_empty() { |
| if !dir.entry.node.is_dir() { |
| return error!(ENOTDIR); |
| } |
| dir.check_access(self, Access::EXEC)?; |
| } |
| Ok((dir, path.into())) |
| } |
| |
| /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD. |
| /// |
| /// Returns a FileHandle but does not install the FileHandle in the FdTable |
| /// for this task. |
| pub fn open_file<L>( |
| &self, |
| locked: &mut Locked<'_, L>, |
| path: &FsStr, |
| flags: OpenFlags, |
| ) -> Result<FileHandle, Errno> |
| where |
| L: LockBefore<FileOpsCore>, |
| L: LockBefore<DeviceOpen>, |
| { |
| if flags.contains(OpenFlags::CREAT) { |
| // In order to support OpenFlags::CREAT we would need to take a |
| // FileMode argument. |
| return error!(EINVAL); |
| } |
| self.open_file_at( |
| locked, |
| FdNumber::AT_FDCWD, |
| path, |
| flags, |
| FileMode::default(), |
| ResolveFlags::empty(), |
| ) |
| } |
| |
| /// Resolves a path for open. |
| /// |
| /// If the final path component points to a symlink, the symlink is followed (as long as |
| /// the symlink traversal limit has not been reached). |
| /// |
| /// If the final path component (after following any symlinks, if enabled) does not exist, |
| /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the |
| /// final path component. |
| /// |
| /// This returns the resolved node, and a boolean indicating whether the node has been created. |
| fn resolve_open_path<L>( |
| &self, |
| locked: &mut Locked<'_, L>, |
| context: &mut LookupContext, |
| dir: &NamespaceNode, |
| path: &FsStr, |
| mode: FileMode, |
| flags: OpenFlags, |
| ) -> Result<(NamespaceNode, bool), Errno> |
| where |
| L: LockBefore<FileOpsCore>, |
| { |
| context.update_for_path(path); |
| let mut parent_content = context.with(SymlinkMode::Follow); |
| let (parent, basename) = self.lookup_parent(&mut parent_content, dir, path)?; |
| context.remaining_follows = parent_content.remaining_follows; |
| |
| let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL); |
| |
| // Lookup the child, without following a symlink or expecting it to be a directory. |
| let mut child_context = context.with(SymlinkMode::NoFollow); |
| child_context.must_be_directory = false; |
| |
| match parent.lookup_child(self, &mut child_context, basename) { |
| Ok(name) => { |
| if name.entry.node.is_lnk() { |
| if flags.contains(OpenFlags::PATH) |
| && context.symlink_mode == SymlinkMode::NoFollow |
| { |
| // When O_PATH is specified in flags, if pathname is a symbolic link |
| // and the O_NOFOLLOW flag is also specified, then the call returns |
| // a file descriptor referring to the symbolic link. |
| // See https://man7.org/linux/man-pages/man2/openat.2.html |
| // |
| // If the trailing component (i.e., basename) of |
| // pathname is a symbolic link, how.resolve contains |
| // RESOLVE_NO_SYMLINKS, and how.flags contains both |
| // O_PATH and O_NOFOLLOW, then an O_PATH file |
| // descriptor referencing the symbolic link will be |
| // returned. |
| // See https://man7.org/linux/man-pages/man2/openat2.2.html |
| return Ok((name, false)); |
| } |
| |
| if (!flags.contains(OpenFlags::PATH) |
| && context.symlink_mode == SymlinkMode::NoFollow) |
| || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS) |
| || context.remaining_follows == 0 |
| { |
| if must_create { |
| // Since `must_create` is set, and a node was found, this returns EEXIST |
| // instead of ELOOP. |
| return error!(EEXIST); |
| } |
| // A symlink was found, but one of the following is true: |
| // * flags specified O_NOFOLLOW but not O_PATH. |
| // * how.resolve contains RESOLVE_NO_SYMLINKS |
| // * too many symlink traversals have been attempted |
| return error!(ELOOP); |
| } |
| |
| context.remaining_follows -= 1; |
| match name.readlink(self)? { |
| SymlinkTarget::Path(path) => { |
| let dir = if path[0] == b'/' { self.fs().root() } else { parent }; |
| self.resolve_open_path( |
| locked, |
| context, |
| &dir, |
| path.as_ref(), |
| mode, |
| flags, |
| ) |
| } |
| SymlinkTarget::Node(node) => { |
| if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS) { |
| error!(ELOOP) |
| } else { |
| Ok((node, false)) |
| } |
| } |
| } |
| } else { |
| if must_create { |
| return error!(EEXIST); |
| } |
| Ok((name, false)) |
| } |
| } |
| Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => { |
| if context.must_be_directory { |
| return error!(EISDIR); |
| } |
| Ok(( |
| parent.open_create_node( |
| locked, |
| self, |
| basename, |
| mode.with_type(FileMode::IFREG), |
| DeviceType::NONE, |
| flags, |
| )?, |
| true, |
| )) |
| } |
| Err(e) => Err(e), |
| } |
| } |
| |
| /// The primary entry point for opening files relative to a task. |
| /// |
| /// Absolute paths are resolve relative to the root of the FsContext for |
| /// this task. Relative paths are resolve relative to dir_fd. To resolve |
| /// relative to the current working directory, pass FdNumber::AT_FDCWD for |
| /// dir_fd. |
| /// |
| /// Returns a FileHandle but does not install the FileHandle in the FdTable |
| /// for this task. |
| pub fn open_file_at<L>( |
| &self, |
| locked: &mut Locked<'_, L>, |
| dir_fd: FdNumber, |
| path: &FsStr, |
| flags: OpenFlags, |
| mode: FileMode, |
| resolve_flags: ResolveFlags, |
| ) -> Result<FileHandle, Errno> |
| where |
| L: LockBefore<FileOpsCore>, |
| L: LockBefore<DeviceOpen>, |
| { |
| if path.is_empty() { |
| return error!(ENOENT); |
| } |
| |
| let (dir, path) = self.resolve_dir_fd(dir_fd, path, resolve_flags)?; |
| self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags) |
| } |
| |
| pub fn open_namespace_node_at<L>( |
| &self, |
| locked: &mut Locked<'_, L>, |
| dir: NamespaceNode, |
| path: &FsStr, |
| flags: OpenFlags, |
| mode: FileMode, |
| mut resolve_flags: ResolveFlags, |
| ) -> Result<FileHandle, Errno> |
| where |
| L: LockBefore<FileOpsCore>, |
| L: LockBefore<DeviceOpen>, |
| { |
| // 64-bit kernels force the O_LARGEFILE flag to be on. |
| let mut flags = flags | OpenFlags::LARGEFILE; |
| let opath = flags.contains(OpenFlags::PATH); |
| if opath { |
| // When O_PATH is specified in flags, flag bits other than O_CLOEXEC, |
| // O_DIRECTORY, and O_NOFOLLOW are ignored. |
| const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate( |
| OpenFlags::PATH.bits() |
| | OpenFlags::CLOEXEC.bits() |
| | OpenFlags::DIRECTORY.bits() |
| | OpenFlags::NOFOLLOW.bits(), |
| ); |
| flags &= ALLOWED_FLAGS; |
| } |
| |
| if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() { |
| return error!(EINVAL); |
| } |
| |
| let nofollow = flags.contains(OpenFlags::NOFOLLOW); |
| let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL); |
| |
| let symlink_mode = |
| if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow }; |
| |
| let resolve_base = match ( |
| resolve_flags.contains(ResolveFlags::BENEATH), |
| resolve_flags.contains(ResolveFlags::IN_ROOT), |
| ) { |
| (false, false) => ResolveBase::None, |
| (true, false) => ResolveBase::Beneath(dir.clone()), |
| (false, true) => ResolveBase::InRoot(dir.clone()), |
| (true, true) => return error!(EINVAL), |
| }; |
| |
| // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches |
| // Linux behavior. Strictly speaking it's is not really required, but it's hard to |
| // implement `BENEATH` and `IN_ROOT` flags correctly otherwise. |
| if resolve_base != ResolveBase::None { |
| resolve_flags.insert(ResolveFlags::NO_MAGICLINKS); |
| } |
| |
| let mut context = LookupContext { |
| symlink_mode, |
| remaining_follows: MAX_SYMLINK_FOLLOWS, |
| must_be_directory: flags.contains(OpenFlags::DIRECTORY), |
| resolve_flags, |
| resolve_base, |
| }; |
| let (name, created) = |
| match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) { |
| Ok((n, c)) => (n, c), |
| Err(e) => { |
| let mut abs_path = dir.path(&self.task); |
| abs_path.extend(&**path); |
| track_file_not_found(abs_path); |
| return Err(e); |
| } |
| }; |
| |
| let name = if flags.contains(OpenFlags::TMPFILE) { |
| name.create_tmpfile(self, mode.with_type(FileMode::IFREG), flags)? |
| } else { |
| let mode = name.entry.node.info().mode; |
| |
| // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the |
| // file we are opening. With `O_TMPFILE`, that file is the regular file we just |
| // created rather than the node we found by resolving the path. |
| // |
| // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set |
| // because `must_be_directory` refers to the node we found by resolving the path. |
| // If that node was not a directory, then `create_tmpfile` will produce an error. |
| // |
| // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created |
| // and therefor already an empty file. |
| |
| if !opath && nofollow && mode.is_lnk() { |
| return error!(ELOOP); |
| } |
| |
| if mode.is_dir() { |
| if flags.can_write() |
| || flags.contains(OpenFlags::CREAT) |
| || flags.contains(OpenFlags::TRUNC) |
| { |
| return error!(EISDIR); |
| } |
| if flags.contains(OpenFlags::DIRECT) { |
| return error!(EINVAL); |
| } |
| } else if context.must_be_directory { |
| return error!(ENOTDIR); |
| } |
| |
| if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created { |
| // You might think we should check file.can_write() at this |
| // point, which is what the docs suggest, but apparently we |
| // are supposed to truncate the file if this task can write |
| // to the underlying node, even if we are opening the file |
| // as read-only. See OpenTest.CanTruncateReadOnly. |
| name.truncate(locked, self, 0)?; |
| } |
| |
| name |
| }; |
| |
| // If the node has been created, the open operation should not verify access right: |
| // From <https://man7.org/linux/man-pages/man2/open.2.html> |
| // |
| // > Note that mode applies only to future accesses of the newly created file; the |
| // > open() call that creates a read-only file may well return a read/write file |
| // > descriptor. |
| |
| name.open(locked, self, flags, !created) |
| } |
| |
| /// A wrapper for FsContext::lookup_parent_at that resolves the given |
| /// dir_fd to a NamespaceNode. |
| /// |
| /// Absolute paths are resolve relative to the root of the FsContext for |
| /// this task. Relative paths are resolve relative to dir_fd. To resolve |
| /// relative to the current working directory, pass FdNumber::AT_FDCWD for |
| /// dir_fd. |
| pub fn lookup_parent_at<'a>( |
| &self, |
| context: &mut LookupContext, |
| dir_fd: FdNumber, |
| path: &'a FsStr, |
| ) -> Result<(NamespaceNode, &'a FsStr), Errno> { |
| let (dir, path) = self.resolve_dir_fd(dir_fd, path, ResolveFlags::empty())?; |
| self.lookup_parent(context, &dir, path) |
| } |
| |
| /// Lookup the parent of a namespace node. |
| /// |
| /// Consider using Task::open_file_at or Task::lookup_parent_at rather than |
| /// calling this function directly. |
| /// |
| /// This function resolves all but the last component of the given path. |
| /// The function returns the parent directory of the last component as well |
| /// as the last component. |
| /// |
| /// If path is empty, this function returns dir and an empty path. |
| /// Similarly, if path ends with "." or "..", these components will be |
| /// returned along with the parent. |
| /// |
| /// The returned parent might not be a directory. |
| pub fn lookup_parent<'a>( |
| &self, |
| context: &mut LookupContext, |
| dir: &NamespaceNode, |
| path: &'a FsStr, |
| ) -> Result<(NamespaceNode, &'a FsStr), Errno> { |
| context.update_for_path(path); |
| |
| let mut current_node = dir.clone(); |
| let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from); |
| let mut current_path_component = it.next().unwrap_or_default(); |
| for next_path_component in it { |
| current_node = current_node.lookup_child(self, context, current_path_component)?; |
| current_path_component = next_path_component; |
| } |
| Ok((current_node, current_path_component)) |
| } |
| |
| /// Lookup a namespace node. |
| /// |
| /// Consider using Task::open_file_at or Task::lookup_parent_at rather than |
| /// calling this function directly. |
| /// |
| /// This function resolves the component of the given path. |
| pub fn lookup_path( |
| &self, |
| context: &mut LookupContext, |
| dir: NamespaceNode, |
| path: &FsStr, |
| ) -> Result<NamespaceNode, Errno> { |
| let (parent, basename) = self.lookup_parent(context, &dir, path)?; |
| parent.lookup_child(self, context, basename) |
| } |
| |
| /// Lookup a namespace node starting at the root directory. |
| /// |
| /// Resolves symlinks. |
| pub fn lookup_path_from_root(&self, path: &FsStr) -> Result<NamespaceNode, Errno> { |
| let mut context = LookupContext::default(); |
| self.lookup_path(&mut context, self.fs().root(), path) |
| } |
| |
| pub fn exec<L>( |
| &mut self, |
| locked: &mut Locked<'_, L>, |
| executable: FileHandle, |
| path: CString, |
| argv: Vec<CString>, |
| environ: Vec<CString>, |
| ) -> Result<(), Errno> |
| where |
| L: LockBefore<FileOpsCore>, |
| L: LockBefore<DeviceOpen>, |
| { |
| // Executable must be a regular file |
| if !executable.name.entry.node.is_reg() { |
| return error!(EACCES); |
| } |
| |
| // File node must have EXEC mode permissions. |
| // Note that the ability to execute a file is unrelated to the flags |
| // used in the `open` call. |
| executable.name.check_access(self, Access::EXEC)?; |
| |
| let elf_selinux_state = selinux_hooks::check_exec_access(self, executable.node())?; |
| |
| let resolved_elf = resolve_executable( |
| locked, |
| self, |
| executable, |
| path.clone(), |
| argv, |
| environ, |
| elf_selinux_state, |
| )?; |
| |
| if self.thread_group.read().tasks_count() > 1 { |
| track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process"); |
| return error!(EINVAL); |
| } |
| |
| if let Err(err) = self.finish_exec(path, resolved_elf) { |
| log_warn!("unrecoverable error in exec: {err:?}"); |
| |
| send_standard_signal( |
| self, |
| SignalInfo { code: SI_KERNEL as i32, force: true, ..SignalInfo::default(SIGSEGV) }, |
| ); |
| return Err(err); |
| } |
| |
| self.ptrace_event(PtraceOptions::TRACEEXEC, self.task.id as u64); |
| self.signal_vfork(); |
| |
| Ok(()) |
| } |
| |
| /// After the memory is unmapped, any failure in exec is unrecoverable and results in the |
| /// process crashing. This function is for that second half; any error returned from this |
| /// function will be considered unrecoverable. |
| fn finish_exec(&mut self, path: CString, resolved_elf: ResolvedElf) -> Result<(), Errno> { |
| // Now that the exec will definitely finish (or crash), notify owners of |
| // locked futexes for the current process, which will be impossible to |
| // update after process image is replaced. See get_robust_list(2). |
| self.notify_robust_list(); |
| |
| self.mm() |
| .exec(resolved_elf.file.name.clone()) |
| .map_err(|status| from_status_like_fdio!(status))?; |
| |
| // Update the SELinux state, if enabled. |
| selinux_hooks::update_state_on_exec(self, &resolved_elf.selinux_state); |
| |
| let start_info = load_executable(self, resolved_elf, &path)?; |
| let regs: zx_thread_state_general_regs_t = start_info.into(); |
| self.thread_state.registers = regs.into(); |
| |
| { |
| let mut state = self.write(); |
| let mut persistent_info = self.persistent_info.lock(); |
| state.signals.alt_stack = None; |
| state.robust_list_head = UserAddress::NULL.into(); |
| |
| // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit |
| // capabilities accordingly. |
| persistent_info.creds_mut().exec(); |
| } |
| self.thread_state.extended_pstate.reset(); |
| |
| self.thread_group.signal_actions.reset_for_exec(); |
| |
| // TODO(http://b/320436714): when adding SELinux support for the file subsystem, implement |
| // hook to clean up state after exec. |
| |
| // TODO: The termination signal is reset to SIGCHLD. |
| |
| // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed. |
| |
| // TODO: The file descriptor table is unshared, undoing the effect of |
| // the CLONE_FILES flag of clone(2). |
| // |
| // To make this work, we can put the files in an RwLock and then cache |
| // a reference to the files on the CurrentTask. That will let |
| // functions that have CurrentTask access the FdTable without |
| // needing to grab the read-lock. |
| // |
| // For now, we do not implement that behavior. |
| self.files.exec(); |
| |
| // TODO: POSIX timers are not preserved. |
| |
| self.thread_group.write().did_exec = true; |
| |
| // Get the basename of the path, which will be used as the name displayed with |
| // `prctl(PR_GET_NAME)` and `/proc/self/stat` |
| let basename = if let Some(idx) = memchr::memrchr(b'/', path.to_bytes()) { |
| // SAFETY: Substring of a CString will contain no null bytes. |
| CString::new(&path.to_bytes()[idx + 1..]).unwrap() |
| } else { |
| path |
| }; |
| set_zx_name(&fuchsia_runtime::thread_self(), basename.as_bytes()); |
| self.set_command_name(basename); |
| |
| Ok(()) |
| } |
| |
| pub fn add_seccomp_filter( |
| &mut self, |
| bpf_filter: UserAddress, |
| flags: u32, |
| ) -> Result<SyscallResult, Errno> { |
| let fprog: sock_fprog = self.read_object(UserRef::new(bpf_filter))?; |
| |
| if u32::from(fprog.len) > BPF_MAXINSNS || fprog.len == 0 { |
| return Err(errno!(EINVAL)); |
| } |
| |
| let code: Vec<sock_filter> = |
| self.read_objects_to_vec(fprog.filter.into(), fprog.len as usize)?; |
| |
| let new_filter = Arc::new(SeccompFilter::from_cbpf( |
| &code, |
| self.thread_group.next_seccomp_filter_id.add(1), |
| flags & SECCOMP_FILTER_FLAG_LOG != 0, |
| )?); |
| |
| let mut maybe_fd: Option<FdNumber> = None; |
| |
| if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 { |
| let mut task_state = self.task.write(); |
| maybe_fd = Some(task_state.seccomp_filters.create_listener(self)?); |
| } |
| |
| // We take the process lock here because we can't change any of the threads |
| // while doing a tsync. So, you hold the process lock while making any changes. |
| let state = self.thread_group.write(); |
| |
| if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 { |
| // TSYNC synchronizes all filters for all threads in the current process to |
| // the current thread's |
| |
| // We collect the filters for the current task upfront to save us acquiring |
| // the task's lock a lot of times below. |
| let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone(); |
| |
| // For TSYNC to work, all of the other thread filters in this process have to |
| // be a prefix of this thread's filters, and none of them can be in |
| // strict mode. |
| let tasks = state.tasks().collect::<Vec<_>>(); |
| for task in &tasks { |
| if task.id == self.id { |
| continue; |
| } |
| let other_task_state = task.read(); |
| |
| // Target threads cannot be in SECCOMP_MODE_STRICT |
| if task.seccomp_filter_state.get() == SeccompStateValue::Strict { |
| return Self::seccomp_tsync_error(task.id, flags); |
| } |
| |
| // Target threads' filters must be a subsequence of this thread's |
| if !other_task_state.seccomp_filters.can_sync_to(&filters) { |
| return Self::seccomp_tsync_error(task.id, flags); |
| } |
| } |
| |
| // Now that we're sure we're allowed to do so, add the filter to all threads. |
| filters.add_filter(new_filter, fprog.len)?; |
| |
| for task in &tasks { |
| let mut other_task_state = task.write(); |
| |
| other_task_state.enable_no_new_privs(); |
| other_task_state.seccomp_filters = filters.clone(); |
| task.set_seccomp_state(SeccompStateValue::UserDefined)?; |
| } |
| } else { |
| let mut task_state = self.task.write(); |
| |
| task_state.seccomp_filters.add_filter(new_filter, fprog.len)?; |
| self.set_seccomp_state(SeccompStateValue::UserDefined)?; |
| } |
| |
| if let Some(fd) = maybe_fd { |
| Ok(fd.into()) |
| } else { |
| Ok(().into()) |
| } |
| } |
| |
| pub fn run_seccomp_filters( |
| &mut self, |
| syscall: &Syscall, |
| ) -> Option<Result<SyscallResult, Errno>> { |
| profile_duration!("RunSeccompFilters"); |
| // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics |
| // from user-defined seccomp filters. |
| if self.seccomp_filter_state.get() == SeccompStateValue::Strict { |
| return SeccompState::do_strict(self, syscall); |
| } |
| |
| // Run user-defined seccomp filters |
| let result = self.task.read().seccomp_filters.run_all(self, syscall); |
| |
| SeccompState::do_user_defined(result, self, syscall) |
| } |
| |
| fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> { |
| // By default, TSYNC indicates failure state by returning the first thread |
| // id not to be able to sync, rather than by returning -1 and setting |
| // errno. However, if TSYNC_ESRCH is set, it returns ESRCH. This |
| // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER |
| // makes seccomp return an fd. |
| if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { |
| Err(errno!(ESRCH)) |
| } else { |
| Ok(id.into()) |
| } |
| } |
| |
| // Notify all futexes in robust list. The robust list is in user space, so we |
| // are very careful about walking it, and there are a lot of quiet returns if |
| // we fail to walk it. |
| // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does |
| // not wake up a waiter. |
| pub fn notify_robust_list(&self) { |
| let task_state = self.write(); |
| let robust_list_addr = task_state.robust_list_head.addr(); |
| if robust_list_addr == UserAddress::NULL { |
| // No one has called set_robust_list. |
| return; |
| } |
| let robust_list_res = self.read_object(task_state.robust_list_head); |
| |
| let head = if let Ok(head) = robust_list_res { |
| head |
| } else { |
| return; |
| }; |
| |
| let offset = head.futex_offset; |
| |
| let mut entries_count = 0; |
| let mut curr_ptr = head.list.next; |
| while curr_ptr.addr != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT { |
| let curr_ref = self.read_object(curr_ptr.into()); |
| |
| let curr = if let Ok(curr) = curr_ref { |
| curr |
| } else { |
| return; |
| }; |
| |
| let futex_base: u64; |
| if let Some(fb) = curr_ptr.addr.addr.checked_add_signed(offset) { |
| futex_base = fb; |
| } else { |
| return; |
| } |
| |
| let futex_addr = UserAddress::from(futex_base); |
| // TODO - What if this isn't 4 byte aligned? |
| |
| let futex = if let Ok(futex) = self.mm().atomic_load_u32_relaxed(futex_addr) { |
| futex |
| } else { |
| return; |
| }; |
| |
| if (futex & FUTEX_TID_MASK) as i32 == self.id { |
| let owner_died = FUTEX_OWNER_DIED | futex; |
| if self.mm().atomic_store_u32_relaxed(futex_addr, owner_died).is_err() { |
| return; |
| } |
| } |
| curr_ptr = curr.next; |
| entries_count += 1; |
| } |
| } |
| |
| /// Returns a ref to this thread's SeccompNotifier. |
| pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> { |
| self.task.write().seccomp_filters.notifier.clone() |
| } |
| |
| pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) { |
| self.task.write().seccomp_filters.notifier = notifier; |
| } |
| |
| /// Processes a Zircon exception associated with this task. |
| pub fn process_exception(&self, report: &zx::sys::zx_exception_report_t) -> ExceptionResult { |
| match report.header.type_ { |
| zx::sys::ZX_EXCP_GENERAL => match get_signal_for_general_exception(&report.context) { |
| Some(sig) => ExceptionResult::Signal(SignalInfo::default(sig)), |
| None => { |
| log_warn!("Unrecognized general exception: {:?}", report); |
| ExceptionResult::Signal(SignalInfo::default(SIGILL)) |
| } |
| }, |
| zx::sys::ZX_EXCP_FATAL_PAGE_FAULT => self.mm().handle_page_fault( |
| decode_page_fault_exception_report(report), |
| zx::Status::from_raw(report.context.synth_code as zx::zx_status_t), |
| ), |
| zx::sys::ZX_EXCP_UNDEFINED_INSTRUCTION => { |
| ExceptionResult::Signal(SignalInfo::default(SIGILL)) |
| } |
| zx::sys::ZX_EXCP_UNALIGNED_ACCESS => { |
| ExceptionResult::Signal(SignalInfo::default(SIGBUS)) |
| } |
| zx::sys::ZX_EXCP_SW_BREAKPOINT => ExceptionResult::Signal(SignalInfo::default(SIGTRAP)), |
| unknown => { |
| track_stub!(TODO("https://fxbug.dev/322874381"), "zircon exception", unknown); |
| log_error!("Unknown exception {:?}", report); |
| ExceptionResult::Signal(SignalInfo::default(SIGSEGV)) |
| } |
| } |
| } |
| |
| /// Create a process that is a child of the `init` process. |
| /// |
| /// The created process will be a task that is the leader of a new thread group. |
| /// |
| /// Most processes are created by userspace and are descendants of the `init` process. In |
| /// some situations, the kernel needs to create a process itself. This function is the |
| /// preferred way of creating an actual userspace process because making the process a child of |
| /// `init` means that `init` is responsible for waiting on the process when it dies and thereby |
| /// cleaning up its zombie. |
| /// |
| /// If you just need a kernel task, and not an entire userspace process, consider using |
| /// `create_system_task` instead. Even better, consider using the `kthreads` threadpool. |
| /// |
| /// This function creates an underlying Zircon process to host the new task. |
| pub fn create_init_child_process<L>( |
| locked: &mut Locked<'_, L>, |
| kernel: &Arc<Kernel>, |
| initial_name: &CString, |
| ) -> Result<TaskBuilder, Errno> |
| where |
| L: LockBefore<TaskRelease>, |
| { |
| let weak_init = kernel.pids.read().get_task(1); |
| let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?; |
| let initial_name_bytes = initial_name.as_bytes().to_owned(); |
| let task = Self::create_task( |
| locked, |
| kernel, |
| initial_name.clone(), |
| init_task.fs().fork(), |
| |locked, pid, process_group| { |
| create_zircon_process( |
| locked, |
| kernel, |
| None, |
| pid, |
| process_group, |
| SignalActions::default(), |
| &initial_name_bytes, |
| ) |
| }, |
| )?; |
| { |
| let mut init_writer = init_task.thread_group.write(); |
| let mut new_process_writer = task.thread_group.write(); |
| new_process_writer.parent = Some(init_task.thread_group.clone()); |
| init_writer.children.insert(task.id, Arc::downgrade(&task.thread_group)); |
| } |
| // A child process created via fork(2) inherits its parent's |
| // resource limits. Resource limits are preserved across execve(2). |
| let limits = init_task.thread_group.limits.lock().clone(); |
| *task.thread_group.limits.lock() = limits; |
| Ok(task) |
| } |
| |
| /// Creates the initial process for a kernel. |
| /// |
| /// The created process will be a task that is the leader of a new thread group. |
| /// |
| /// The init process is special because it's the root of the parent/child relationship between |
| /// tasks. If a task dies, the init process is ultimately responsible for waiting on that task |
| /// and removing it from the zombie list. |
| /// |
| /// It's possible for the kernel to create tasks whose ultimate parent isn't init, but such |
| /// tasks cannot be created by userspace directly. |
| /// |
| /// This function should only be called as part of booting a kernel instance. To create a |
| /// process after the kernel has already booted, consider `create_init_child_process` |
| /// or `create_system_task`. |
| /// |
| /// The process created by this function should always have pid 1. We require the caller to |
| /// pass the `pid` as an argument to clarify that it's the callers responsibility to determine |
| /// the pid for the process. |
| pub fn create_init_process<L>( |
| locked: &mut Locked<'_, L>, |
| kernel: &Arc<Kernel>, |
| pid: pid_t, |
| initial_name: CString, |
| fs: Arc<FsContext>, |
| rlimits: &[(Resource, u64)], |
| ) -> Result<TaskBuilder, Errno> |
| where |
| L: LockBefore<TaskRelease>, |
| { |
| let initial_name_bytes = initial_name.as_bytes().to_owned(); |
| let pids = kernel.pids.write(); |
| Self::create_task_with_pid( |
| locked, |
| kernel, |
| pids, |
| pid, |
| initial_name, |
| fs, |
| |locked, pid, process_group| { |
| create_zircon_process( |
| locked, |
| kernel, |
| None, |
| pid, |
| process_group, |
| SignalActions::default(), |
| &initial_name_bytes, |
| ) |
| }, |
| Credentials::root(), |
| rlimits, |
| ) |
| } |
| |
| /// Create a task that runs inside the kernel. |
| /// |
| /// There is no underlying Zircon process to host the task. Instead, the work done by this task |
| /// is performed by a thread in the original Starnix process, possible as part of a thread |
| /// pool. |
| /// |
| /// This function is the preferred way to create a context for doing background work inside the |
| /// kernel. |
| /// |
| /// Rather than calling this function directly, consider using `kthreads`, which provides both |
| /// a system task and a threadpool on which the task can do work. |
| pub fn create_system_task<L>( |
| locked: &mut Locked<'_, L>, |
| kernel: &Arc<Kernel>, |
| fs: Arc<FsContext>, |
| ) -> Result<CurrentTask, Errno> |
| where |
| L: LockBefore<TaskRelease>, |
| { |
| let builder = Self::create_task( |
| locked, |
| kernel, |
| CString::new("[kthreadd]").unwrap(), |
| fs, |
| |locked, pid, process_group| { |
| let process = zx::Process::from(zx::Handle::invalid()); |
| let memory_manager = Arc::new(MemoryManager::new_empty()); |
| let thread_group = ThreadGroup::new( |
| locked, |
| kernel.clone(), |
| process, |
| None, |
| pid, |
| process_group, |
| SignalActions::default(), |
| ); |
| Ok(TaskInfo { thread: None, thread_group, memory_manager }) |
| }, |
| )?; |
| Ok(builder.into()) |
| } |
| |
| fn create_task<F, L>( |
| locked: &mut Locked<'_, L>, |
| kernel: &Arc<Kernel>, |
| initial_name: CString, |
| root_fs: Arc<FsContext>, |
| task_info_factory: F, |
| ) -> Result<TaskBuilder, Errno> |
| where |
| F: FnOnce(&mut Locked<'_, L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>, |
| L: LockBefore<TaskRelease>, |
| { |
| let mut pids = kernel.pids.write(); |
| let pid = pids.allocate_pid(); |
| Self::create_task_with_pid( |
| locked, |
| kernel, |
| pids, |
| pid, |
| initial_name, |
| root_fs, |
| task_info_factory, |
| Credentials::root(), |
| &[], |
| ) |
| } |
| |
| fn create_task_with_pid<F, L>( |
| locked: &mut Locked<'_, L>, |
| kernel: &Arc<Kernel>, |
| mut pids: RwLockWriteGuard<'_, PidTable>, |
| pid: pid_t, |
| initial_name: CString, |
| root_fs: Arc<FsContext>, |
| task_info_factory: F, |
| creds: Credentials, |
| rlimits: &[(Resource, u64)], |
| ) -> Result<TaskBuilder, Errno> |
| where |
| F: FnOnce(&mut Locked<'_, L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>, |
| L: LockBefore<TaskRelease>, |
| { |
| debug_assert!(pids.get_task(pid).upgrade().is_none()); |
| |
| let process_group = ProcessGroup::new(pid, None); |
| pids.add_process_group(&process_group); |
| |
| let TaskInfo { thread, thread_group, memory_manager } = |
| task_info_factory(locked, pid, process_group.clone())?; |
| |
| process_group.insert(locked, &thread_group); |
| |
| // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000 |
| // > nanoseconds (50 microseconds). The timer slack value is inherited by a child created |
| // > via fork(2), and is preserved across execve(2). |
| // https://man7.org/linux/man-pages/man2/prctl.2.html |
| let default_timerslack = 50_000; |
| let builder = TaskBuilder { |
| task: OwnedRef::new(Task::new( |
| pid, |
| initial_name, |
| thread_group, |
| thread, |
| FdTable::default(), |
| memory_manager, |
| root_fs, |
| creds, |
| Arc::clone(&kernel.default_abstract_socket_namespace), |
| Arc::clone(&kernel.default_abstract_vsock_namespace), |
| Some(SIGCHLD), |
| Default::default(), |
| None, |
| Default::default(), |
| kernel.root_uts_ns.clone(), |
| false, |
| SeccompState::default(), |
| SeccompFilterContainer::default(), |
| UserAddress::NULL.into(), |
| default_timerslack, |
| )), |
| thread_state: Default::default(), |
| }; |
| release_on_error!(builder, locked, { |
| let temp_task = TempRef::from(&builder.task); |
| builder.thread_group.add(&temp_task)?; |
| for (resource, limit) in rlimits { |
| builder |
| .thread_group |
| .limits |
| .lock() |
| .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit }); |
| } |
| |
| pids.add_task(&temp_task); |
| pids.add_thread_group(&builder.thread_group); |
| Ok(()) |
| }); |
| Ok(builder) |
| } |
| |
| /// Create a kernel task in the same ThreadGroup as the given `system_task`. |
| /// |
| /// There is no underlying Zircon thread to host the task. |
| pub fn create_kernel_thread<L>( |
| locked: &mut Locked<'_, L>, |
| system_task: &Task, |
| initial_name: CString, |
| ) -> Result<CurrentTask, Errno> |
| where |
| L: LockBefore<TaskRelease>, |
| { |
| let mut pids = system_task.kernel().pids.write(); |
| let pid = pids.allocate_pid(); |
| |
| let scheduler_policy; |
| let uts_ns; |
| let default_timerslack_ns; |
| { |
| let state = system_task.read(); |
| scheduler_policy = state.scheduler_policy; |
| uts_ns = state.uts_ns.clone(); |
| default_timerslack_ns = state.default_timerslack_ns; |
| } |
| |
| let current_task: CurrentTask = TaskBuilder::new(Task::new( |
| pid, |
| initial_name, |
| Arc::clone(&system_task.thread_group), |
| None, |
| FdTable::default(), |
| Arc::clone(system_task.mm()), |
| Arc::clone(system_task.fs()), |
| system_task.creds(), |
| Arc::clone(&system_task.abstract_socket_namespace), |
| Arc::clone(&system_task.abstract_vsock_namespace), |
| None, |
| Default::default(), |
| None, |
| scheduler_policy, |
| uts_ns, |
| false, |
| SeccompState::default(), |
| SeccompFilterContainer::default(), |
| UserAddress::NULL.into(), |
| default_timerslack_ns, |
| )) |
| .into(); |
| release_on_error!(current_task, locked, { |
| let temp_task = current_task.temp_task(); |
| current_task.thread_group.add(&temp_task)?; |
| pids.add_task(&temp_task); |
| Ok(()) |
| }); |
| Ok(current_task) |
| } |
| |
| /// Clone this task. |
| /// |
| /// Creates a new task object that shares some state with this task |
| /// according to the given flags. |
| /// |
| /// Used by the clone() syscall to create both processes and threads. |
| /// |
| /// The exit signal is broken out from the flags parameter like clone3() rather than being |
| /// bitwise-ORed like clone(). |
| pub fn clone_task<L>( |
| &self, |
| locked: &mut Locked<'_, L>, |
| flags: u64, |
| child_exit_signal: Option<Signal>, |
| user_parent_tid: UserRef<pid_t>, |
| user_child_tid: UserRef<pid_t>, |
| ) -> Result<TaskBuilder, Errno> |
| where |
| L: LockBefore<MmDumpable>, |
| L: LockBefore<TaskRelease>, |
| { |
| const IMPLEMENTED_FLAGS: u64 = (CLONE_VM |
| | CLONE_FS |
| | CLONE_FILES |
| | CLONE_SIGHAND |
| | CLONE_THREAD |
| | CLONE_SYSVSEM |
| | CLONE_SETTLS |
| | CLONE_PARENT_SETTID |
| | CLONE_CHILD_CLEARTID |
| | CLONE_CHILD_SETTID |
| | CLONE_VFORK |
| | CLONE_PTRACE) as u64; |
| // A mask with all valid flags set, because we want to return a different error code for an |
| // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a |
| // mask with all flags below it set. Shift up by one to make sure the largest flag is also |
| // set. |
| const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1; |
| |
| // CLONE_SETTLS is implemented by sys_clone. |
| |
| let clone_files = flags & (CLONE_FILES as u64) != 0; |
| let clone_fs = flags & (CLONE_FS as u64) != 0; |
| let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0; |
| let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0; |
| let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0; |
| let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0; |
| let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0; |
| let clone_thread = flags & (CLONE_THREAD as u64) != 0; |
| let clone_vm = flags & (CLONE_VM as u64) != 0; |
| let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0; |
| let clone_vfork = flags & (CLONE_VFORK as u64) != 0; |
| |
| let new_uts = flags & (CLONE_NEWUTS as u64) != 0; |
| |
| if clone_ptrace { |
| track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE"); |
| } |
| |
| if clone_sysvsem { |
| track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM"); |
| } |
| |
| if clone_sighand && !clone_vm { |
| return error!(EINVAL); |
| } |
| if clone_thread && !clone_sighand { |
| return error!(EINVAL); |
| } |
| if flags & !VALID_FLAGS != 0 { |
| return error!(EINVAL); |
| } |
| |
| if clone_vm && !clone_thread { |
| // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child |
| // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as |
| // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost |
| // always OK. |
| // |
| // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a |
| // process' VM that will be immediately replaced with a call to exec(). The main users |
| // (libc and language runtimes) don't actually rely on the memory being shared between |
| // the two processes. And the vfork() man page explicitly allows vfork() to be |
| // implemented as fork() which is what we do here. |
| if !clone_vfork { |
| track_stub!( |
| TODO("https://fxbug.dev/322875227"), |
| "CLONE_VM without CLONE_THREAD or CLONE_VFORK" |
| ); |
| } |
| } else if clone_thread && !clone_vm { |
| track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM"); |
| return error!(ENOSYS); |
| } |
| |
| if flags & !IMPLEMENTED_FLAGS != 0 { |
| track_stub!( |
| TODO("https://fxbug.dev/322875130"), |
| "clone unknown flags", |
| flags & !IMPLEMENTED_FLAGS |
| ); |
| return error!(ENOSYS); |
| } |
| |
| let fs = if clone_fs { self.fs().clone() } else { self.fs().fork() }; |
| let files = if clone_files { self.files.clone() } else { self.files.fork() }; |
| |
| let kernel = self.kernel(); |
| let mut pids = kernel.pids.write(); |
| |
| let pid; |
| let command; |
| let creds; |
| let scheduler_policy; |
| let uts_ns; |
| let no_new_privs; |
| let seccomp_filters; |
| let robust_list_head = UserAddress::NULL.into(); |
| let child_signal_mask; |
| let timerslack_ns; |
| |
| let TaskInfo { thread, thread_group, memory_manager } = { |
| // Make sure to drop these locks ASAP to avoid inversion |
| let thread_group_state = self.thread_group.write(); |
| let state = self.read(); |
| |
| no_new_privs = state.no_new_privs(); |
| seccomp_filters = state.seccomp_filters.clone(); |
| child_signal_mask = state.signals.mask(); |
| |
| pid = pids.allocate_pid(); |
| command = self.command(); |
| creds = self.creds(); |
| scheduler_policy = state.scheduler_policy.fork(); |
| timerslack_ns = state.timerslack_ns; |
| |
| uts_ns = if new_uts { |
| if !self.creds().has_capability(CAP_SYS_ADMIN) { |
| return error!(EPERM); |
| } |
| |
| // Fork the UTS namespace of the existing task. |
| let new_uts_ns = state.uts_ns.read().clone(); |
| Arc::new(RwLock::new(new_uts_ns)) |
| } else { |
| // Inherit the UTS of the existing task. |
| state.uts_ns.clone() |
| }; |
| |
| if clone_thread { |
| let thread_group = self.thread_group.clone(); |
| let memory_manager = self.mm().clone(); |
| TaskInfo { thread: None, thread_group, memory_manager } |
| } else { |
| // Drop the lock on this task before entering `create_zircon_process`, because it will |
| // take a lock on the new thread group, and locks on thread groups have a higher |
| // priority than locks on the task in the thread group. |
| std::mem::drop(state); |
| let signal_actions = if clone_sighand { |
| self.thread_group.signal_actions.clone() |
| } else { |
| self.thread_group.signal_actions.fork() |
| }; |
| let process_group = thread_group_state.process_group.clone(); |
| create_zircon_process( |
| locked, |
| kernel, |
| Some(thread_group_state), |
| pid, |
| process_group, |
| signal_actions, |
| command.as_bytes(), |
| )? |
| } |
| }; |
| |
| // Only create the vfork event when the caller requested CLONE_VFORK. |
| let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None }; |
| |
| let mut child = TaskBuilder::new(Task::new( |
| pid, |
| command, |
| thread_group, |
| thread, |
| files, |
| memory_manager, |
| fs, |
| creds, |
| self.abstract_socket_namespace.clone(), |
| self.abstract_vsock_namespace.clone(), |
| child_exit_signal, |
| child_signal_mask, |
| vfork_event, |
| scheduler_policy, |
| uts_ns, |
| no_new_privs, |
| SeccompState::from(&self.seccomp_filter_state), |
| seccomp_filters, |
| robust_list_head, |
| timerslack_ns, |
| )); |
| |
| release_on_error!(child, locked, { |
| let child_task = TempRef::from(&child.task); |
| // Drop the pids lock as soon as possible after creating the child. Destroying the child |
| // and removing it from the pids table itself requires the pids lock, so if an early exit |
| // takes place we have a self deadlock. |
| pids.add_task(&child_task); |
| if !clone_thread { |
| pids.add_thread_group(&child.thread_group); |
| } |
| std::mem::drop(pids); |
| |
| // Child lock must be taken before this lock. Drop the lock on the task, take a writable |
| // lock on the child and take the current state back. |
| |
| #[cfg(any(test, debug_assertions))] |
| { |
| // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering |
| // will trigger the tracing-mutex at the right call site. |
| if !clone_thread { |
| let _l1 = self.thread_group.read(); |
| let _l2 = child.thread_group.read(); |
| } |
| } |
| |
| if clone_thread { |
| self.thread_group.add(&child_task)?; |
| } else { |
| child.thread_group.add(&child_task)?; |
| let mut child_state = child.write(); |
| let state = self.read(); |
| child_state.signals.alt_stack = state.signals.alt_stack; |
| child_state.signals.set_mask(state.signals.mask()); |
| self.mm().snapshot_to(locked, child.mm())?; |
| } |
| |
| if clone_parent_settid { |
| self.write_object(user_parent_tid, &child.id)?; |
| } |
| |
| if clone_child_cleartid { |
| child.write().clear_child_tid = user_child_tid; |
| } |
| |
| if clone_child_settid { |
| child.write_object(user_child_tid, &child.id)?; |
| } |
| child.thread_state = self.thread_state.snapshot(); |
| Ok(()) |
| }); |
| // Take the lock on thread group and task in the correct order to ensure any wrong ordering |
| // will trigger the tracing-mutex at the right call site. |
| #[cfg(any(test, debug_assertions))] |
| { |
| let _l1 = child.thread_group.read(); |
| let _l2 = child.read(); |
| } |
| |
| Ok(child) |
| } |
| |
| /// Sets the stop state (per set_stopped), and also notifies all listeners, |
| /// including the parent process if appropriate. |
| pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) { |
| { |
| let mut state = self.write(); |
| state.copy_state_from(self); |
| state.set_stopped(stopped, siginfo, Some(self), None); |
| } |
| |
| if !stopped.is_in_progress() { |
| let parent = self.thread_group.read().parent.clone(); |
| if let Some(parent) = parent { |
| parent.write().child_status_waiters.notify_all(); |
| } |
| } |
| } |
| |
| /// If the task is stopping, set it as stopped. return whether the caller |
| /// should stop. The task might also be waking up. |
| pub fn finalize_stop_state(&mut self) -> bool { |
| let stopped = self.load_stopped(); |
| |
| if !stopped.is_stopping_or_stopped() { |
| // If we are waking up, potentially write back state a tracer may have modified. |
| let captured_state = self.write().take_captured_state(); |
| if let Some(captured) = captured_state { |
| if captured.dirty { |
| self.thread_state.replace_registers(&captured.thread_state); |
| } |
| } |
| } |
| |
| // Stopping because the thread group is stopping. |
| // Try to flip to GroupStopped - will fail if we shouldn't. |
| if self.thread_group.set_stopped(StopState::GroupStopped, None, true) |
| == StopState::GroupStopped |
| { |
| let signal = self.thread_group.read().last_signal.clone(); |
| // stopping because the thread group has stopped |
| let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)); |
| self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event); |
| return true; |
| } |
| |
| // Stopping because the task is stopping |
| if stopped.is_stopping_or_stopped() { |
| if let Ok(stopped) = stopped.finalize() { |
| self.set_stopped_and_notify(stopped, None); |
| } |
| return true; |
| } |
| |
| false |
| } |
| |
| /// Block the execution of `current_task` as long as the task is stopped and |
| /// not terminated. |
| pub fn block_while_stopped(&mut self) { |
| // Upgrade the state from stopping to stopped if needed. Return if the task |
| // should not be stopped. |
| if !self.finalize_stop_state() { |
| return; |
| } |
| |
| let waiter = Waiter::new_ignoring_signals(); |
| loop { |
| // If we've exited, unstop the threads and return without notifying |
| // waiters. |
| if self.is_exitted() { |
| self.thread_group.set_stopped(StopState::ForceAwake, None, false); |
| self.write().set_stopped(StopState::ForceAwake, None, Some(self), None); |
| return; |
| } |
| |
| if self.wake_or_wait_until_unstopped_async(&waiter) { |
| return; |
| } |
| |
| // Do the wait. Result is not needed, as this is not in a syscall. |
| let _: Result<(), Errno> = waiter.wait(self); |
| |
| // Maybe go from stopping to stopped, if we are currently stopping |
| // again. |
| self.finalize_stop_state(); |
| } |
| } |
| |
| /// For traced tasks, this will return the data neceessary for a cloned task |
| /// to attach to the same tracer. |
| pub fn get_ptrace_core_state_for_clone( |
| &mut self, |
| clone_args: &clone_args, |
| ) -> (PtraceOptions, Option<PtraceCoreState>) { |
| let state = self.write(); |
| if let Some(ref ptrace) = &state.ptrace { |
| ptrace.get_core_state_for_clone(clone_args) |
| } else { |
| (PtraceOptions::empty(), None) |
| } |
| } |
| |
| /// If currently being ptraced with the given option, emit the appropriate |
| /// event. PTRACE_EVENTMSG will return the given message. Also emits the |
| /// appropriate event for execve in the absence of TRACEEXEC. |
| /// |
| /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is |
| /// enabled, SIGKILL will trigger an event. We do not exhibit this |
| /// behavior. |
| pub fn ptrace_event(&mut self, trace_kind: PtraceOptions, msg: u64) { |
| if !trace_kind.is_empty() { |
| { |
| let mut state = self.write(); |
| if let Some(ref mut ptrace) = &mut state.ptrace { |
| if !ptrace.has_option(trace_kind) { |
| // If this would be a TRACEEXEC, but TRACEEXEC is not |
| // turned on, then send a SIGTRAP. |
| if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() { |
| // Send a SIGTRAP so that the parent can gain control. |
| send_signal_first(self, state, SignalInfo::default(SIGTRAP)); |
| } |
| |
| return; |
| } |
| let mut siginfo = SignalInfo::default(starnix_uapi::signals::SIGTRAP); |
| siginfo.code = (((PtraceEvent::from_option(&trace_kind) as u32) << 8) |
| | linux_uapi::SIGTRAP) as i32; |
| state.set_stopped( |
| StopState::PtraceEventStopping, |
| Some(siginfo), |
| None, |
| Some(PtraceEventData::new(trace_kind, msg)), |
| ); |
| } else { |
| return; |
| } |
| } |
| self.block_while_stopped(); |
| } |
| } |
| |
| /// Causes the current thread's thread group to exit, notifying any ptracer |
| /// of this task first. |
| pub fn thread_group_exit(&mut self, exit_status: ExitStatus) { |
| self.ptrace_event(PtraceOptions::TRACEEXIT, exit_status.signal_info_status() as u64); |
| self.thread_group.exit(exit_status, None); |
| } |
| |
| /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the |
| /// exit signal as in clone(). |
| pub fn clone_task_for_test<L>( |
| &self, |
| locked: &mut Locked<'_, L>, |
| flags: u64, |
| exit_signal: Option<Signal>, |
| ) -> crate::testing::AutoReleasableTask |
| where |
| L: LockBefore<MmDumpable>, |
| L: LockBefore<TaskRelease>, |
| { |
| let result = self |
| .clone_task(locked, flags, exit_signal, UserRef::default(), UserRef::default()) |
| .expect("failed to create task in test"); |
| |
| result.into() |
| } |
| } |
| |
| impl MemoryAccessor for CurrentTask { |
| fn read_memory<'a>( |
| &self, |
| addr: UserAddress, |
| bytes: &'a mut [MaybeUninit<u8>], |
| ) -> Result<&'a mut [u8], Errno> { |
| self.mm().unified_read_memory(self, addr, bytes) |
| } |
| |
| fn read_memory_partial_until_null_byte<'a>( |
| &self, |
| addr: UserAddress, |
| bytes: &'a mut [MaybeUninit<u8>], |
| ) -> Result<&'a mut [u8], Errno> { |
| self.mm().unified_read_memory_partial_until_null_byte(self, addr, bytes) |
| } |
| |
| fn read_memory_partial<'a>( |
| &self, |
| addr: UserAddress, |
| bytes: &'a mut [MaybeUninit<u8>], |
| ) -> Result<&'a mut [u8], Errno> { |
| self.mm().unified_read_memory_partial(self, addr, bytes) |
| } |
| |
| fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> { |
| self.mm().unified_write_memory(self, addr, bytes) |
| } |
| |
| fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> { |
| self.mm().unified_write_memory_partial(self, addr, bytes) |
| } |
| |
| fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> { |
| self.mm().unified_zero(self, addr, length) |
| } |
| } |
| |
| impl TaskMemoryAccessor for CurrentTask { |
| fn maximum_valid_address(&self) -> UserAddress { |
| self.mm().maximum_valid_user_address |
| } |
| } |
| |
/// Outcome of processing a Zircon exception for a task.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}