blob: 15afb4317c8cb3ff6fa3b2b080c7879aac02f9ed [file] [log] [blame]
// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use crate::mm::MemoryManager;
use crate::security;
use crate::signals::SignalActions;
use crate::task::{
CurrentTask, Kernel, PidTable, ProcessGroup, RobustListHeadPtr, SeccompFilterContainer,
SeccompState, Task, TaskBuilder, ThreadGroup, ThreadGroupParent, ThreadGroupWriteGuard,
};
use crate::vfs::{FdTable, FsContext};
use starnix_sync::{
LockBefore, Locked, ProcessGroupState, RwLockWriteGuard, TaskRelease, Unlocked,
};
use starnix_task_command::TaskCommand;
use starnix_types::arch::ArchWidth;
use starnix_types::ownership::TempRef;
use starnix_types::release_on_error;
use starnix_uapi::auth::Credentials;
use starnix_uapi::errors::Errno;
use starnix_uapi::resource_limits::Resource;
use starnix_uapi::signals::{SIGCHLD, Signal};
use starnix_uapi::{errno, error, from_status_like_fdio, pid_t, rlimit};
use std::ffi::CString;
use std::sync::Arc;
/// Result returned when creating new Zircon threads and processes for tasks.
///
/// Values of this type are produced by the `task_info_factory` callbacks handed to
/// `create_task` / `create_task_with_pid`, which destructure it and pass each field on to
/// `Task::new`.
pub struct TaskInfo {
/// The thread that was created for the task.
///
/// `None` when no Zircon thread has been created for the task; the factories in
/// `create_zircon_process` and `create_system_task` both leave it unset.
pub thread: Option<zx::Thread>,
/// The thread group that the task should be added to.
pub thread_group: Arc<ThreadGroup>,
/// The memory manager to use for the task.
///
/// `create_zircon_process` always fills this in, while `create_system_task` leaves it `None`
/// because it does not create a Zircon process for the task.
pub memory_manager: Option<Arc<MemoryManager>>,
}
/// Creates the Zircon process, memory manager and thread group that back a new task.
///
/// The Zircon process is created through `create_shared` from the kernel's own Starnix
/// process, marked as critical to the default job, and its restricted VMAR is wrapped in a
/// new `MemoryManager`. The remaining arguments (`parent`, `pid`, `exit_signal`,
/// `process_group`, `signal_actions`) are forwarded unchanged to `ThreadGroup::new`.
///
/// The returned `TaskInfo` never carries a thread; it always carries a memory manager.
///
/// # Errors
///
/// Returns `EBUSY` if the kernel has started shutting down. Failures to create the process,
/// to mark it critical, or to build the memory manager are converted from their Zircon status
/// with `from_status_like_fdio!`.
pub fn create_zircon_process<L>(
    locked: &mut Locked<L>,
    kernel: &Arc<Kernel>,
    parent: Option<ThreadGroupWriteGuard<'_>>,
    pid: pid_t,
    exit_signal: Option<Signal>,
    process_group: Arc<ProcessGroup>,
    signal_actions: Arc<SignalActions>,
    name: TaskCommand,
) -> Result<TaskInfo, Errno>
where
    L: LockBefore<ProcessGroupState>,
{
    // Once shutdown has begun, no further processes may be created.
    if kernel.is_shutting_down() {
        return error!(EBUSY);
    }

    let starnix_process = &kernel.kthreads.starnix_process;
    let (zircon_process, restricted_vmar) =
        match create_shared(starnix_process, zx::ProcessOptions::empty(), name) {
            Ok(created) => created,
            Err(status) => return Err(from_status_like_fdio!(status)),
        };

    // If this process panics in normal mode, the whole kernel's job must be killed with it.
    let default_job = fuchsia_runtime::job_default();
    default_job
        .set_critical(zx::JobCriticalOptions::RETCODE_NONZERO, &zircon_process)
        .map_err(|status| from_status_like_fdio!(status))?;

    let address_space =
        MemoryManager::new(restricted_vmar).map_err(|status| from_status_like_fdio!(status))?;
    let memory_manager = Arc::new(address_space);

    let thread_group = ThreadGroup::new(
        locked,
        kernel.clone(),
        zircon_process,
        parent,
        pid,
        exit_signal,
        process_group,
        signal_actions,
    );

    let task_info = TaskInfo { thread: None, thread_group, memory_manager: Some(memory_manager) };
    Ok(task_info)
}
/// Creates a process that shares half its address space with `process`.
///
/// The created process will also share its handle table and futex context with `process`.
///
/// `options` is passed to the syscall as raw bits and `name` supplies the bytes used as the new
/// process' name.
///
/// Returns the created process and a handle to the created process' restricted address space.
///
/// Wraps the
/// [zx_process_create_shared](https://fuchsia.dev/fuchsia-src/reference/syscalls/process_create_shared.md)
/// syscall.
///
/// # Errors
///
/// Returns the `zx::Status` reported by the syscall when it is anything other than `OK`; no
/// handles are produced in that case.
fn create_shared(
    process: &zx::Process,
    options: zx::ProcessOptions,
    name: TaskCommand,
) -> Result<(zx::Process, zx::Vmar), zx::Status> {
    let self_raw = process.raw_handle();
    let name_bytes = name.as_bytes();
    // Out-parameters for the syscall; 0 is the invalid-handle value until it overwrites them.
    let mut process_out = 0;
    let mut restricted_vmar_out = 0;

    // SAFETY: `self_raw` is borrowed from `process`, which outlives this call. `name_bytes`
    // points at `name_bytes.len()` readable bytes owned by `name`, which is alive until this
    // function returns. Both out-pointers refer to distinct, writable locals of the handle type.
    let status = unsafe {
        zx::sys::zx_process_create_shared(
            self_raw,
            options.bits(),
            name_bytes.as_ptr(),
            name_bytes.len(),
            &mut process_out,
            &mut restricted_vmar_out,
        )
    };
    // Bail out before touching the out-parameters if the syscall did not succeed.
    zx::ok(status)?;

    // SAFETY: the syscall reported success, so `process_out` holds a newly created process
    // handle that nothing else owns; wrapping it transfers sole ownership to the new value.
    let process_handle = unsafe { zx::NullableHandle::from_raw(process_out) };
    // SAFETY: as above, `restricted_vmar_out` holds a newly created VMAR handle that has not
    // been wrapped anywhere else, so this becomes its unique owner.
    let restricted_vmar_handle = unsafe { zx::NullableHandle::from_raw(restricted_vmar_out) };

    Ok((zx::Process::from(process_handle), zx::Vmar::from(restricted_vmar_handle)))
}
/// Creates a userspace process whose parent is the `init` process (pid 1).
///
/// The new task is the leader of a brand new thread group and is hosted by a freshly created
/// Zircon process.
///
/// Userspace normally creates processes itself, and those are descendants of `init`. When the
/// kernel has to create a real userspace process on its own, this is the preferred entry
/// point: parenting the process to `init` makes `init` responsible for waiting on it when it
/// dies and thereby reaping its zombie.
///
/// If only a kernel task is needed rather than a full userspace process, prefer
/// `create_system_task`, or better still the `kthreads` threadpool.
///
/// Security label selection: an explicit `seclabel` wins; otherwise the container's
/// `default_seclabel` is used; either one is resolved against the `kernel`'s active security
/// policy and applied to the new task. With neither present the task inherits its LSM state
/// from the "init" task.
///
/// The new process also starts with a copy of `init`'s resource limits.
///
/// # Errors
///
/// Returns `EINVAL` if the `init` task can no longer be upgraded, or if no label was supplied
/// and resolving the empty context fails. Errors from label resolution and from `create_task`
/// are propagated.
pub fn create_init_child_process<L>(
    locked: &mut Locked<L>,
    kernel: &Arc<Kernel>,
    initial_name: TaskCommand,
    seclabel: Option<&CString>,
) -> Result<TaskBuilder, Errno>
where
    L: LockBefore<TaskRelease>,
{
    let weak_init = kernel.pids.read().get_task(1);
    let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?;

    let container_default = kernel.features.default_seclabel.as_ref();
    let security_context = match (seclabel, container_default) {
        (Some(explicit_label), _) => {
            security::task_for_context(&init_task, explicit_label.as_bytes().into())?
        }
        (None, Some(default_seclabel)) => {
            security::task_for_context(&init_task, default_seclabel.as_bytes().into())?
        }
        (None, None) => {
            // If SELinux is enabled then this call will fail with `EINVAL`.
            security::task_for_context(&init_task, b"".into()).map_err(|_| {
                errno!(EINVAL, "Container has SELinux enabled but no Security Context specified")
            })?
        }
    };

    let task = create_task(
        locked,
        kernel,
        initial_name.clone(),
        init_task.fs().fork(),
        |locked, pid, process_group| {
            create_zircon_process(
                locked.cast_locked::<TaskRelease>(),
                kernel,
                None,
                pid,
                Some(SIGCHLD),
                process_group,
                SignalActions::default(),
                initial_name.clone(),
            )
        },
        security_context,
    )?;

    // Link the two thread groups: `init` is locked first, then the new process.
    {
        let mut init_state = init_task.thread_group().write();
        let mut child_state = task.thread_group().write();
        child_state.parent =
            Some(ThreadGroupParent::new(Arc::downgrade(&init_task.thread_group())));
        init_state.children.insert(task.tid, Arc::downgrade(task.thread_group()));
    }

    // A child process created via fork(2) inherits its parent's
    // resource limits. Resource limits are preserved across execve(2).
    let inherited_limits =
        init_task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()).clone();
    *task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()) = inherited_limits;

    Ok(task)
}
/// Creates the very first process of a kernel instance.
///
/// The resulting task leads a new thread group and is hosted by a new Zircon process.
///
/// `init` is special because it is the root of the parent/child relationship between tasks:
/// when a task dies, `init` is ultimately responsible for waiting on it and removing it from
/// the zombie list.
///
/// The kernel may create tasks whose ultimate parent is not `init`, but userspace cannot
/// create such tasks directly.
///
/// Call this only while booting a kernel instance. Once the kernel is up, use
/// `create_init_child_process` or `create_system_task` instead.
///
/// The process created here should always have pid 1. The `pid` is nevertheless taken as an
/// argument to make it explicit that choosing the pid is the caller's responsibility.
///
/// The task runs with root credentials, and each `(resource, limit)` pair in `rlimits` is
/// applied to its thread group.
pub fn create_init_process(
    locked: &mut Locked<Unlocked>,
    kernel: &Arc<Kernel>,
    pid: pid_t,
    initial_name: TaskCommand,
    fs: Arc<FsContext>,
    rlimits: &[(Resource, u64)],
) -> Result<TaskBuilder, Errno> {
    let pid_table = kernel.pids.write();
    let root_creds = Credentials::root();
    // If SELinux is enabled then `exec()` of the "init" executable will normally be
    // configured by policy to transition to the desired init task Security Context.
    let kernel_security_state = security::task_alloc_for_kernel();

    create_task_with_pid(
        locked,
        kernel,
        pid_table,
        pid,
        initial_name.clone(),
        fs,
        |locked, pid, process_group| {
            create_zircon_process(
                locked,
                kernel,
                None,
                pid,
                Some(SIGCHLD),
                process_group,
                SignalActions::default(),
                initial_name.clone(),
            )
        },
        root_creds,
        rlimits,
        kernel_security_state,
    )
}
/// Creates a task that lives entirely inside the kernel.
///
/// No Zircon process is created to host the task. Its work is instead carried out by a thread
/// of the original Starnix process, possibly as part of a thread pool.
///
/// This is the preferred way to obtain a context for background work inside the kernel.
///
/// Prefer `kthreads` over calling this directly: it supplies both a system task and a
/// threadpool on which that task can do work.
///
/// The task is named `kthreadd`, has no memory manager, and uses the kernel's default security
/// state.
pub fn create_system_task<L>(
    locked: &mut Locked<L>,
    kernel: &Arc<Kernel>,
    fs: Arc<FsContext>,
) -> Result<CurrentTask, Errno>
where
    L: LockBefore<TaskRelease>,
{
    let kthreadd = create_task(
        locked,
        kernel,
        TaskCommand::new(b"kthreadd"),
        fs,
        |locked, pid, process_group| {
            // An invalid handle stands in for the process, since none is created.
            let placeholder_process = zx::Process::from(zx::NullableHandle::invalid());
            let thread_group = ThreadGroup::new(
                locked.cast_locked::<TaskRelease>(),
                kernel.clone(),
                placeholder_process,
                None,
                pid,
                Some(SIGCHLD),
                process_group,
                SignalActions::default(),
            );
            Ok(TaskInfo { thread: None, thread_group, memory_manager: None })
        },
        security::task_alloc_for_kernel(),
    )?;

    Ok(kthreadd.into())
}
/// Creates a task under a newly allocated pid.
///
/// Takes the write lock on the kernel's pid table, allocates a pid from it, and delegates to
/// `create_task_with_pid` with root credentials and no resource-limit overrides.
///
/// `task_info_factory` is called with the lock context, the allocated pid and the task's new
/// process group, and must return the `TaskInfo` describing the thread group (and optionally
/// the thread and memory manager) for the task.
///
/// # Errors
///
/// Propagates whatever `create_task_with_pid` returns.
pub fn create_task<F, L>(
    locked: &mut Locked<L>,
    kernel: &Kernel,
    initial_name: TaskCommand,
    root_fs: Arc<FsContext>,
    task_info_factory: F,
    security_state: security::TaskState,
) -> Result<TaskBuilder, Errno>
where
    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
    L: LockBefore<TaskRelease>,
{
    let mut pid_table = kernel.pids.write();
    let fresh_pid = pid_table.allocate_pid();

    // The pid-table guard is handed over so allocation and registration share one lock scope.
    create_task_with_pid(
        locked,
        kernel,
        pid_table,
        fresh_pid,
        initial_name,
        root_fs,
        task_info_factory,
        Credentials::root(),
        &[],
        security_state,
    )
}
/// Builds a task with a caller-chosen `pid` and registers it in the pid table.
///
/// A new `ProcessGroup` keyed by `pid` is created and added to `pids`, `task_info_factory` is
/// invoked to obtain the thread group (plus optional thread and memory manager), and the
/// resulting `Task` is added to its thread group and to `pids`.
///
/// * `pids` - write guard on the pid table; owned by this function and dropped when it returns.
/// * `pid` - id for the new task. A debug assertion checks that no live task already uses it.
/// * `task_info_factory` - called once with `locked`, `pid` and the new process group.
/// * `creds` - credentials passed to `Task::new`.
/// * `rlimits` - `(resource, limit)` pairs; each limit is installed as both `rlim_cur` and
///   `rlim_max` on the task's thread group.
/// * `security_state` - security state passed to `Task::new`.
///
/// # Errors
///
/// Propagates errors from `task_info_factory` and from adding the task to its thread group.
///
/// NOTE(review): the process group is registered in `pids` before `task_info_factory` runs, and
/// nothing in this function removes it if the factory fails. Confirm whether that registration
/// is cleaned up elsewhere.
fn create_task_with_pid<F, L>(
locked: &mut Locked<L>,
kernel: &Kernel,
mut pids: RwLockWriteGuard<'_, PidTable>,
pid: pid_t,
initial_name: TaskCommand,
root_fs: Arc<FsContext>,
task_info_factory: F,
creds: Arc<Credentials>,
rlimits: &[(Resource, u64)],
security_state: security::TaskState,
) -> Result<TaskBuilder, Errno>
where
F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
L: LockBefore<TaskRelease>,
{
// Debug-only sanity check: the requested pid must not belong to a live task.
debug_assert!(pids.get_task(pid).upgrade().is_none());
// The new task gets its own process group, keyed by the task's pid.
let process_group = ProcessGroup::new(pid, None);
pids.add_process_group(process_group.clone());
let TaskInfo { thread, thread_group, memory_manager } =
task_info_factory(locked, pid, process_group.clone())?;
process_group.insert(locked.cast_locked::<TaskRelease>(), &thread_group);
// > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
// > nanoseconds (50 microseconds). The timer slack value is inherited by a child created
// > via fork(2), and is preserved across execve(2).
// https://man7.org/linux/man-pages/man2/prctl.2.html
let default_timerslack = 50_000;
// `Task::new` takes its arguments positionally; the order below must match its signature.
let builder = TaskBuilder {
task: Task::new(
pid,
initial_name,
thread_group,
thread,
FdTable::default(),
memory_manager,
root_fs,
creds,
Arc::clone(&kernel.default_abstract_socket_namespace),
Arc::clone(&kernel.default_abstract_vsock_namespace),
Default::default(),
Default::default(),
None,
Default::default(),
kernel.root_uts_ns.clone(),
false,
SeccompState::default(),
SeccompFilterContainer::default(),
RobustListHeadPtr::null(&ArchWidth::Arch64),
default_timerslack,
security_state,
),
thread_state: Default::default(),
};
// Registration steps that can fail run under `release_on_error!`, which is given the builder
// and `locked`; presumably it releases the builder before returning the error (see the macro
// definition to confirm).
release_on_error!(builder, locked, {
let temp_task = TempRef::from(&builder.task);
builder.thread_group().add(&temp_task)?;
// Apply the caller-supplied limits, using the same value for the soft and hard limit.
for (resource, limit) in rlimits {
builder
.thread_group()
.limits
.lock(locked.cast_locked::<TaskRelease>())
.set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
}
// Only after the thread group accepted the task is it recorded in the pid table.
pids.add_task(&temp_task);
Ok(())
});
Ok(builder)
}
/// Creates a kernel task that joins the thread group of the given `system_task`.
///
/// No Zircon thread is created to host the task.
///
/// The new task is given a freshly allocated pid and reuses `system_task`'s thread group,
/// memory manager (when it has one), filesystem context, abstract socket namespaces, scheduler
/// state, UTS namespace and default timer slack, together with a clone of its credentials. It
/// starts with a default fd table and the kernel's default security state.
///
/// # Errors
///
/// Propagates the error from adding the task to the thread group.
pub fn create_kernel_thread<L>(
    locked: &mut Locked<L>,
    system_task: &Task,
    initial_name: TaskCommand,
) -> Result<CurrentTask, Errno>
where
    L: LockBefore<TaskRelease>,
{
    let mut pids = system_task.kernel().pids.write();
    let pid = pids.allocate_pid();

    // Snapshot what is needed from the system task while its state is read-locked.
    let (scheduler_state, uts_ns, default_timerslack_ns, security_state) = {
        let system_state = system_task.read();
        (
            system_state.scheduler_state,
            system_state.uts_ns.clone(),
            system_state.default_timerslack_ns,
            security::task_alloc_for_kernel(),
        )
    };

    let kernel_thread = Task::new(
        pid,
        initial_name,
        system_task.thread_group().clone(),
        None,
        FdTable::default(),
        system_task.mm().ok(),
        system_task.fs(),
        system_task.clone_creds(),
        Arc::clone(&system_task.abstract_socket_namespace),
        Arc::clone(&system_task.abstract_vsock_namespace),
        Default::default(),
        Default::default(),
        None,
        scheduler_state,
        uts_ns,
        false,
        SeccompState::default(),
        SeccompFilterContainer::default(),
        RobustListHeadPtr::null(&ArchWidth::Arch64),
        default_timerslack_ns,
        security_state,
    );
    let current_task: CurrentTask = TaskBuilder::new(kernel_thread).into();

    // Join the thread group first; the pid table only learns about the task once that worked.
    release_on_error!(current_task, locked, {
        let temp_task = current_task.temp_task();
        current_task.thread_group().add(&temp_task)?;
        pids.add_task(&temp_task);
        Ok(())
    });

    Ok(current_task)
}