// src/starnix/kernel/core/task/kernel.rs - fuchsia - Git at Google

 // Copyright 2021 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 use crate::bpf::EbpfState;
 use crate::device::remote_block_device::RemoteBlockDeviceRegistry;
 use crate::device::{DeviceMode, DeviceRegistry};
 use crate::execution::CrashReporter;
 use crate::mm::{FutexTable, MappingSummary, MlockPinFlavor, SharedFutexKey};
 use crate::power::SuspendResumeManagerHandle;
 use crate::ptrace::StopState;
 use crate::security::{self, AuditLogger};
 use crate::task::container_namespace::ContainerNamespace;
 use crate::task::limits::SystemLimits;
 use crate::task::memory_attribution::MemoryAttributionManager;
 use crate::task::net::NetstackDevices;
 use crate::task::tracing::PidToKoidMap;
 use crate::task::{
     AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, DelayedReleaser,
     IpTables, KernelCgroups, KernelStats, KernelThreads, PidTable, SchedulerManager, Syslog,
     ThreadGroup, UtsNamespace, UtsNamespaceHandle,
 };
 use crate::time::{HrTimerManager, HrTimerManagerHandle};
 use crate::vdso::vdso_loader::Vdso;
 use crate::vfs::fs_args::MountParams;
 use crate::vfs::socket::{
     GenericMessage, GenericNetlink, NetlinkAccessControl, NetlinkContextImpl,
     NetlinkToClientSender, SocketAddress, SocketTokensStore,
 };
 use crate::vfs::{CacheConfig, FileOps, FsNodeHandle, FsString, Mounts, NamespaceNode};
 use bstr::{BString, ByteSlice};
 use devicetree::types::Devicetree;
 use expando::Expando;
 use fidl::endpoints::{
     ClientEnd, ControlHandle, DiscoverableProtocolMarker, ProtocolMarker, create_endpoints,
 };
 use fidl_fuchsia_component_runner::{ComponentControllerControlHandle, ComponentStopInfo};
 use fidl_fuchsia_feedback::CrashReporterProxy;
 use fidl_fuchsia_time_external::AdjustSynchronousProxy;
 use fuchsia_inspect::ArrayProperty;
 use futures::FutureExt;
 use netlink::interfaces::InterfacesHandler;
 use netlink::{NETLINK_LOG_TAG, Netlink};
 use once_cell::sync::OnceCell;
 use starnix_lifecycle::{AtomicU32Counter, AtomicU64Counter};
 use starnix_logging::{SyscallLogFilter, log_debug, log_error, log_info, log_warn};
 use starnix_sync::{
     FileOpsCore, KernelSwapFiles, LockEqualOrBefore, Locked, Mutex, OrderedMutex, RwLock,
 };
 use starnix_types::ownership::TempRef;
 use starnix_uapi::device_type::DeviceType;
 use starnix_uapi::errors::{Errno, errno};
 use starnix_uapi::open_flags::OpenFlags;
 use starnix_uapi::{VMADDR_CID_HOST, from_status_like_fdio};
 use std::borrow::Cow;
 use std::collections::{HashMap, HashSet};
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering};
 use std::sync::{Arc, OnceLock, Weak};
 use zx::CpuFeatureFlags;
 use {
     fidl_fuchsia_io as fio, fidl_fuchsia_memory_attribution as fattribution,
     fuchsia_async as fasync,
 };

 /// Kernel features are specified in the component manifest of the starnix container
 /// or explicitly provided to the kernel constructor in tests.
 ///
 /// The struct derives `Default`, so a feature that is not explicitly set takes the default
 /// value of its field type (`false`, `None`, `0`, ...).
 #[derive(Debug, Default, Clone)]
 pub struct KernelFeatures {
     // NOTE(review): undocumented upstream; presumably selects a second-generation BPF
     // implementation -- confirm against the users of this flag.
     pub bpf_v2: bool,

     /// Whether the kernel supports the S_ISUID and S_ISGID bits.
     ///
     /// For example, these bits are used by `sudo`.
     ///
     /// Enabling this feature is potentially a security risk because they allow privilege
     /// escalation.
     pub enable_suid: bool,

     /// Whether io_uring is enabled.
     ///
     /// TODO(https://fxbug.dev/297431387): Enable by default once the feature is completed.
     pub io_uring: bool,

     /// Whether the kernel should return an error to userspace, rather than panicking, if `reboot()`
     /// is requested but cannot be enacted because the kernel lacks the relevant capabilities.
     pub error_on_failed_reboot: bool,

     /// The default seclabel that is applied to components that are run in this kernel.
     ///
     /// Components can override this by setting the `seclabel` field in their program block.
     pub default_seclabel: Option<String>,

     /// Whether the kernel is being used to run the SELinux Test Suite.
     ///
     /// TODO: https://fxbug.dev/388077431 - remove this once we no longer need workarounds for the
     /// SELinux Test Suite.
     pub selinux_test_suite: bool,

     /// The default mount options to use when mounting directories from a component's namespace.
     ///
     /// The key is the path in the component's namespace, and the value is the mount options
     /// string.
     pub default_ns_mount_options: Option<HashMap<String, String>>,

     /// The default uid that is applied to components that are run in this kernel.
     ///
     /// Components can override this by setting the `uid` field in their program block.
     pub default_uid: u32,

     /// When set, mlock() never prefaults pages.
     pub mlock_always_onfault: bool,

     /// Implementation of mlock() to use for this kernel instance.
     pub mlock_pin_flavor: MlockPinFlavor,

     /// Whether excessive crash reports should be throttled.
     pub crash_report_throttling: bool,

     /// Whether or not to serve wifi support to Android.
     pub wifi: bool,

     /// The number of bytes to cache in pages for reading zx::MapInfo from VMARs.
     pub cached_zx_map_info_bytes: u32,

     /// The size of the Dirent LRU cache.
     pub dirent_cache_size: u32,

     /// Whether to expose a stub '/dev/ion' node, as a temporary workaround for compatibility.
     // TODO(https://fxbug.dev/485370648) remove when unnecessary
     pub fake_ion: bool,
 }

 impl KernelFeatures {
     /// Returns the `MountParams` to use when mounting the specified path from a component's
     /// namespace.
     ///
     /// This mechanism is also used to specify options for mounts created via container
     /// features, by specifying a pseudo-path, e.g. "#container".
     ///
     /// When `default_ns_mount_options` has an entry for `ns_path`, that option string is parsed
     /// and the parse result (including any parse error) is returned. Otherwise the default
     /// `MountParams` are returned.
     pub fn ns_mount_options(&self, ns_path: &str) -> Result<MountParams, Errno> {
         let configured =
             self.default_ns_mount_options.as_ref().and_then(|by_path| by_path.get(ns_path));
         match configured {
             Some(options) => MountParams::parse(options.as_bytes().into()),
             None => Ok(MountParams::default()),
         }
     }
 }

 /// A single kernel command line argument, held as borrowed name and optional value.
 pub struct ArgNameAndValue<'a> {
     /// The name of the argument.
     pub name: &'a str,
     /// The value of the argument, if it has one.
     // NOTE(review): presumably `None` for flag-style arguments given without `=value` --
     // confirm against the code that parses the command line.
     pub value: Option<&'a str>,
 }

 /// The shared, mutable state for the entire Starnix kernel.
 ///
 /// The `Kernel` object holds all kernel threads, userspace tasks, and file system resources for a
 /// single instance of the Starnix kernel. In production, there is one instance of this object for
 /// the entire Starnix kernel. However, multiple instances of this object can be created in one
 /// process during unit testing.
 ///
 /// The structure of this object will likely need to evolve as we implement more namespacing and
 /// isolation mechanisms, such as `namespaces(7)` and `pid_namespaces(7)`.
 pub struct Kernel {
     /// Weak reference to self. Avoids having to pass `&Arc<Kernel>` through APIs.
     pub weak_self: Weak<Kernel>,

     /// The kernel threads running on behalf of this kernel.
     pub kthreads: KernelThreads,

     /// The features enabled for this kernel.
     pub features: KernelFeatures,

     /// The processes and threads running in this kernel, organized by pid_t.
     pub pids: RwLock<PidTable>,

     /// Used to record the pid/tid to Koid mappings. Set when collecting trace data.
     pub pid_to_koid_mapping: Arc<RwLock<Option<PidToKoidMap>>>,

     /// Subsystem-specific properties that hang off the Kernel object.
     ///
     /// Instead of adding yet another property to the Kernel object, consider storing the property
     /// in an expando if that property is only used by one part of the system, such as a module.
     pub expando: Expando,

     /// The default namespace for abstract AF_UNIX sockets in this kernel.
     ///
     /// Rather than use this default namespace, abstract socket addresses
     /// should be looked up in the AbstractSocketNamespace on each Task
     /// object because some Task objects might have a non-default namespace.
     pub default_abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

     /// The default namespace for abstract AF_VSOCK sockets in this kernel.
     pub default_abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

     /// The kernel command line. Shows up in /proc/cmdline.
     pub cmdline: BString,

     /// The devicetree handed to the kernel constructor, if any.
     pub device_tree: Option<Devicetree>,

     /// Global state held by the Linux Security Modules subsystem.
     pub security_state: security::KernelState,

     /// The registry of device drivers.
     pub device_registry: DeviceRegistry,

     /// Mapping of top-level namespace entries to an associated proxy.
     /// For example, "/svc" to the respective proxy. Only the namespace entries
     /// which were known at component startup will be available by the kernel.
     pub container_namespace: ContainerNamespace,

     /// The registry of block devices backed by a remote fuchsia.io file.
     pub remote_block_device_registry: Arc<RemoteBlockDeviceRegistry>,

     /// The iptables used for filtering network packets.
     iptables: OnceLock<IpTables>,

     /// The futexes shared across processes.
     pub shared_futexes: Arc<FutexTable<SharedFutexKey>>,

     /// The default UTS namespace for all tasks.
     ///
     /// Because each task can have its own UTS namespace, you probably want to use
     /// the UTS namespace handle of the task, which may/may not point to this one.
     pub root_uts_ns: UtsNamespaceHandle,

     /// A struct containing a VMO with a vDSO implementation, if implemented for a given
     /// architecture, and possibly an offset for a sigreturn function.
     pub vdso: Vdso,

     /// A struct containing a VMO with an arch32-vDSO implementation, if implemented for a given
     /// architecture.
     // TODO(https://fxbug.dev/380431743) This could be made less clunky -- maybe a Vec<Vdso> above or
     // something else
     pub vdso_arch32: Option<Vdso>,

     /// The table of devices installed on the netstack and their associated
     /// state local to this `Kernel`.
     pub netstack_devices: Arc<NetstackDevices>,

     /// Files that are currently available for swapping.
     /// Note: Starnix never actually swaps memory to these files. We just need to track them
     /// to pass conformance tests.
     pub swap_files: OrderedMutex<Vec<FsNodeHandle>, KernelSwapFiles>,

     /// The implementation of generic Netlink protocol families.
     generic_netlink: OnceLock<GenericNetlink<NetlinkToClientSender<GenericMessage>>>,

     /// The implementation of networking-related Netlink protocol families.
     network_netlink: OnceLock<Netlink<NetlinkContextImpl>>,

     /// Inspect instrumentation for this kernel instance.
     pub inspect_node: fuchsia_inspect::Node,

     /// The kinds of seccomp action that gets logged, stored as a bit vector.
     /// Each potential SeccompAction gets a bit in the vector, as specified by
     /// SeccompAction::logged_bit_offset.  If the bit is set, that means the
     /// action should be logged when it is taken, subject to the caveats
     /// described in seccomp(2).  The value of the bit vector is exposed to users
     /// in a text form in the file /proc/sys/kernel/seccomp/actions_logged.
     pub actions_logged: AtomicU16,

     /// The manager for suspend/resume.
     pub suspend_resume_manager: SuspendResumeManagerHandle,

     /// Unique IDs for new mounts and mount namespaces.
     pub next_mount_id: AtomicU64Counter,
     pub next_peer_group_id: AtomicU64Counter,
     pub next_namespace_id: AtomicU64Counter,

     /// Unique IDs for file objects.
     pub next_file_object_id: AtomicU64Counter,

     /// Unique cookie used to link two inotify events, usually an IN_MOVE_FROM/IN_MOVE_TO pair.
     pub next_inotify_cookie: AtomicU32Counter,

     /// Controls which processes a process is allowed to ptrace.  See Documentation/security/Yama.txt
     pub ptrace_scope: AtomicU8,

     /// The Fuchsia build version returned by `fuchsia.buildinfo.Provider`.
     pub build_version: OnceCell<String>,

     /// Shared statistics for this kernel (see `KernelStats`).
     pub stats: Arc<KernelStats>,

     /// Resource limits that are exposed, for example, via sysctl.
     pub system_limits: SystemLimits,

     /// The service to handle delayed releases. This is required for elements that need to
     /// execute some code when released and require a known context (both in terms of lock
     /// context and `CurrentTask`).
     pub delayed_releaser: DelayedReleaser,

     /// Manages task priorities.
     pub scheduler: SchedulerManager,

     /// The syslog manager.
     pub syslog: Syslog,

     /// All mounts.
     pub mounts: Mounts,

     /// The manager for creating and managing high-resolution timers.
     pub hrtimer_manager: HrTimerManagerHandle,

     /// The manager for monitoring and reporting resources used by the kernel.
     pub memory_attribution_manager: MemoryAttributionManager,

     /// Handler for crashing Linux processes.
     pub crash_reporter: CrashReporter,

     /// Whether this kernel is shutting down. When shutting down, new processes may not be spawned.
     shutting_down: AtomicBool,

     /// True to disable syslog access to unprivileged callers.  This also controls whether read
     /// access to /dev/kmsg requires privileged capabilities.
     pub restrict_dmesg: AtomicBool,

     /// Determines whether unprivileged BPF is permitted, or can be re-enabled.
     ///   0 - Unprivileged BPF is permitted.
     ///   1 - Unprivileged BPF is not permitted, and cannot be enabled.
     ///   2 - Unprivileged BPF is not permitted, but can be enabled by a privileged task.
     pub disable_unprivileged_bpf: AtomicU8,

     /// Control handle to the running container's ComponentController.
     pub container_control_handle: Mutex<Option<ComponentControllerControlHandle>>,

     /// eBPF state: loaded programs, eBPF maps, etc.
     pub ebpf_state: EbpfState,

     /// Cgroups of the kernel.
     pub cgroups: KernelCgroups,

     /// Used to communicate requests to adjust system time from within a Starnix
     /// container. Used from syscalls.
     pub time_adjustment_proxy: Option<AdjustSynchronousProxy>,

     /// Used to store tokens for sockets, particularly per-uid sharing domain sockets.
     pub socket_tokens_store: SocketTokensStore,

     /// Hardware capabilities to push onto stack when loading an ELF binary.
     pub hwcaps: HwCaps,

     /// Filters for syscall logging. Processes with names matching these filters will have syscalls
     /// logged at INFO level.
     pub syscall_log_filters: Mutex<Vec<SyscallLogFilter>>,
 }

 /// Hardware capabilities for one ELF flavor, as the pair of `AT_HWCAP` / `AT_HWCAP2` values.
 ///
 /// The derived `Default` yields zero for both values.
 #[derive(Debug, Clone, Copy, Default)]
 pub struct HwCap {
     /// The value for `AT_HWCAP`.
     pub hwcap: u32,
     /// The value for `AT_HWCAP2`.
     pub hwcap2: u32,
 }

 /// Hardware capabilities for both 32-bit and 64-bit ELF binaries.
 ///
 /// The 32-bit entry only exists when compiling for `aarch64`; on every other target this
 /// struct holds just the 64-bit capabilities.
 #[derive(Debug, Clone, Copy, Default)]
 pub struct HwCaps {
     /// For 32-bit binaries.
     #[cfg(target_arch = "aarch64")]
     pub arch32: HwCap,
     /// For 64-bit binaries.
     pub arch64: HwCap,
 }

 /// The kernel's [`InterfacesHandler`] implementation.
 ///
 /// Only a `Weak<Kernel>` is stored: this handler lives inside a [`Netlink`], which in turn
 /// lives inside the `Arc<Kernel>`. A strong `Arc<Kernel>` stored here would form a reference
 /// cycle, so the ref count could never reach 0 and the kernel would leak.
 struct InterfacesHandlerImpl(Weak<Kernel>);

 impl InterfacesHandlerImpl {
     /// Tries to obtain a strong reference to the kernel; `None` once the kernel is gone.
     fn kernel(&self) -> Option<Arc<Kernel>> {
         let Self(weak_kernel) = self;
         weak_kernel.upgrade()
     }
 }

 impl InterfacesHandler for InterfacesHandlerImpl {
     /// Registers the new link with the kernel's netstack devices. Does nothing if the kernel
     /// has already been dropped.
     fn handle_new_link(&mut self, name: &str, interface_id: NonZeroU64) {
         let Some(kernel) = self.kernel() else {
             return;
         };
         kernel.netstack_devices.add_device(&kernel, name.into(), interface_id);
     }

     /// Removes the deleted link from the kernel's netstack devices. Does nothing if the kernel
     /// has already been dropped.
     fn handle_deleted_link(&mut self, name: &str) {
         let Some(kernel) = self.kernel() else {
             return;
         };
         kernel.netstack_devices.remove_device(&kernel, name.into());
     }

     /// Marks the netstack devices as initialized and wakes every waiter on the associated wait
     /// queue. Logs an error instead if the kernel is gone or if the flag was already set.
     fn handle_idle_event(&mut self) {
         match self.kernel() {
             None => {
                 log_error!("kernel went away while netlink is initializing");
             }
             Some(kernel) => {
                 let (initialized, wq) = &kernel.netstack_devices.initialized_and_wq;
                 // `swap` returns the previous value, so `true` means a repeated report.
                 let already_initialized = initialized.swap(true, Ordering::SeqCst);
                 if already_initialized {
                     log_error!("netlink initial devices should only be reported once");
                 } else {
                     wq.notify_all();
                 }
             }
         }
     }
 }

 impl Kernel {
     /// Creates a new `Kernel` and returns it wrapped in an `Arc`.
     ///
     /// The kernel is built with `Arc::new_cyclic` so that `weak_self`, the kernel threads and the
     /// memory attribution manager can each be handed a `Weak<Kernel>` back-reference during
     /// construction. After construction this function initializes the device registry objects
     /// and records the lazy `thread_groups` and `cgroupv2` children on `inspect_node`.
     ///
     /// Every path through this body returns `Ok`.
     pub fn new(
         cmdline: BString,
         features: KernelFeatures,
         system_limits: SystemLimits,
         container_namespace: ContainerNamespace,
         scheduler: SchedulerManager,
         crash_reporter_proxy: Option<CrashReporterProxy>,
         inspect_node: fuchsia_inspect::Node,
         security_state: security::KernelState,
         time_adjustment_proxy: Option<AdjustSynchronousProxy>,
         device_tree: Option<Devicetree>,
     ) -> Result<Arc<Kernel>, zx::Status> {
         // Factories that turn a raw abstract-socket key into a `SocketAddress` for the two
         // default abstract socket namespaces. Vsock addresses always use the host CID.
         let unix_address_maker =
             Box::new(|x: FsString| -> SocketAddress { SocketAddress::Unix(x) });
         let vsock_address_maker = Box::new(|x: u32| -> SocketAddress {
             SocketAddress::Vsock { port: x, cid: VMADDR_CID_HOST }
         });

         // NOTE(review): the 8-minute duration is presumably the crash report throttling
         // window -- confirm against `CrashReporter::new`.
         let crash_reporter = CrashReporter::new(
             &inspect_node,
             crash_reporter_proxy,
             zx::Duration::from_minutes(8),
             features.crash_report_throttling,
         );
         let hrtimer_manager = HrTimerManager::new(&inspect_node);

         // Fall back to an empty flag set when the feature flags cannot be queried.
         let cpu_feature_flags =
             zx::system_get_feature_flags::<CpuFeatureFlags>().unwrap_or_else(|e| {
                 log_debug!("CPU feature flags are only supported on ARM64: {}, reporting 0", e);
                 CpuFeatureFlags::empty()
             });
         let hwcaps = HwCaps::from_cpu_feature_flags(cpu_feature_flags);

         // `kernel` is the `Weak<Kernel>` that will point at the kernel being constructed.
         let this = Arc::new_cyclic(|kernel| Kernel {
             weak_self: kernel.clone(),
             kthreads: KernelThreads::new(kernel.clone()),
             features,
             pids: Default::default(),
             pid_to_koid_mapping: Arc::new(RwLock::new(None)),
             expando: Default::default(),
             default_abstract_socket_namespace: AbstractUnixSocketNamespace::new(unix_address_maker),
             default_abstract_vsock_namespace: AbstractVsockSocketNamespace::new(
                 vsock_address_maker,
             ),
             cmdline,
             device_tree,
             security_state,
             device_registry: Default::default(),
             container_namespace,
             remote_block_device_registry: Default::default(),
             iptables: OnceLock::new(),
             shared_futexes: Arc::<FutexTable<SharedFutexKey>>::default(),
             root_uts_ns: Arc::new(RwLock::new(UtsNamespace::default())),
             vdso: Vdso::new(),
             vdso_arch32: Vdso::new_arch32(),
             netstack_devices: Arc::default(),
             swap_files: Default::default(),
             generic_netlink: OnceLock::new(),
             network_netlink: OnceLock::new(),
             inspect_node,
             actions_logged: AtomicU16::new(0),
             suspend_resume_manager: Default::default(),
             next_mount_id: AtomicU64Counter::new(1),
             next_peer_group_id: AtomicU64Counter::new(1),
             next_namespace_id: AtomicU64Counter::new(1),
             next_inotify_cookie: AtomicU32Counter::new(1),
             next_file_object_id: Default::default(),
             system_limits,
             ptrace_scope: AtomicU8::new(0), // Disable YAMA checks by default.
             restrict_dmesg: AtomicBool::new(false),
             disable_unprivileged_bpf: AtomicU8::new(0), // Enable unprivileged BPF by default.
             build_version: OnceCell::new(),
             stats: Arc::new(KernelStats::default()),
             delayed_releaser: Default::default(),
             scheduler,
             syslog: Default::default(),
             mounts: Mounts::new(),
             hrtimer_manager,
             memory_attribution_manager: MemoryAttributionManager::new(kernel.clone()),
             crash_reporter,
             shutting_down: AtomicBool::new(false),
             container_control_handle: Mutex::new(None),
             ebpf_state: Default::default(),
             cgroups: Default::default(),
             time_adjustment_proxy,
             socket_tokens_store: Default::default(),
             hwcaps,
             syscall_log_filters: Default::default(),
         });

         // Initialize the device registry before registering any devices.
         //
         // We will create sysfs recursively within this function.
         this.device_registry.objects.init(&mut this.kthreads.unlocked_for_async(), &this);

         // Make a copy of this Arc for the inspect lazy node to use but don't create an Arc cycle
         // because the inspect node that owns this reference is owned by the kernel.
         let kernel = Arc::downgrade(&this);
         this.inspect_node.record_lazy_child("thread_groups", move || {
             if let Some(kernel) = kernel.upgrade() {
                 let inspector = kernel.get_thread_groups_inspect();
                 async move { Ok(inspector) }.boxed()
             } else {
                 async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
             }
         });

         // Same weak-reference pattern as above, this time for the cgroup v2 inspect data.
         let kernel = Arc::downgrade(&this);
         this.inspect_node.record_lazy_child("cgroupv2", move || {
             if let Some(kernel) = kernel.upgrade() {
                 async move { Ok(kernel.cgroups.cgroup2.get_cgroup_inspect()) }.boxed()
             } else {
                 async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
             }
         });

         Ok(this)
     }

     /// Begins an orderly shutdown of userspace and the kernel, which eventually terminates the
     /// root kernel process.
     ///
     /// The work itself is done by `run_shutdown`, which this function schedules as a future
     /// named "run_shutdown" on the kernel threads.
     pub fn shut_down(self: &Arc<Self>) {
         // Run shutdown code on a kthread in the main process so that it can be the last process
         // alive.
         let kernel = Arc::clone(self);
         let shutdown = move || async move {
             kernel.run_shutdown().await;
         };
         self.kthreads.spawn_future(shutdown, "run_shutdown");
     }

     /// Starts shutting down the Starnix kernel and any running container. Only one thread can drive
     /// shutdown at a time. This function will return immediately if shut down is already under way.
     ///
     /// Shutdown happens in several phases:
     ///
     /// 1. Disable launching new processes
     /// 2. Shut down individual ThreadGroups until only the init and system tasks remain
     /// 3. Repeat the above for the init task
     /// 4. Clean up kernel-internal structures that can hold processes alive
     /// 5. Ensure this process is the only one running in the kernel job.
     /// 6. Unmount the kernel's mounts' FileSystems.
     /// 7. Tell CF the container component has stopped
     /// 8. Exit this process
     ///
     /// If a ThreadGroup does not shut down on its own (including after SIGKILL), that phase of
     /// shutdown will hang. To gracefully shut down any further we need the other kernel processes
     /// to do controlled exits that properly release access to shared state. If our orderly shutdown
     /// does hang, eventually CF will kill the container component which will lead to the job of
     /// this process being killed and shutdown will still complete.
     ///
     /// When this call is the one driving shutdown it does not return: the final step exits the
     /// process.
     async fn run_shutdown(&self) {
         const INIT_PID: i32 = 1;
         const SYSTEM_TASK_PID: i32 = 2;

         // Step 1: Prevent new processes from being created once they observe this update. We don't
         // want the thread driving shutdown to be racing with other threads creating new processes.
         if self
             .shutting_down
             .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
             .is_err()
         {
             log_info!("Additional thread tried to initiate shutdown while already in-progress.");
             return;
         }

         log_info!("Shutting down Starnix kernel.");

         // Step 2: Shut down thread groups in a loop until init and the system task are all that
         // remain.
         loop {
             let tgs = {
                 // Exiting thread groups need to acquire a write lock for the pid table to
                 // successfully exit so we need to acquire that lock in a reduced scope.
                 self.pids
                     .read()
                     .get_thread_groups()
                     .filter(|tg| tg.leader != SYSTEM_TASK_PID && tg.leader != INIT_PID)
                     .collect::<Vec<_>>()
             };
             if tgs.is_empty() {
                 log_info!("pid table is empty except init and system task");
                 break;
             }

             log_info!(tgs:?; "shutting down thread groups");
             let mut tasks = vec![];
             for tg in tgs {
                 // Only a weak reference is handed to the shutdown future.
                 let task = fasync::Task::local(ThreadGroup::shut_down(Arc::downgrade(&tg)));
                 tasks.push(task);
             }
             futures::future::join_all(tasks).await;
         }

         // Step 3: Terminate the init process.
         let maybe_init = {
             // Exiting thread groups need to acquire a write lock for the pid table to successfully
             // exit so we need to acquire that lock in a reduced scope.
             //
             // Look init up through the same `INIT_PID` constant that step 2 filters on, so the
             // two phases cannot drift apart.
             self.pids.read().get_thread_group(INIT_PID).map(|tg| Arc::downgrade(&tg))
         };
         if let Some(init) = maybe_init {
             log_info!("shutting down init");
             ThreadGroup::shut_down(init).await;
         } else {
             log_info!("init already terminated");
         }

         // Step 4: Clean up any structures that can keep non-Linux processes live in our job.
         log_info!("cleaning up pinned memory");
         self.expando.remove::<crate::mm::InfoCacheShadowProcess>();
         self.expando.remove::<crate::mm::MlockShadowProcess>();

         // Step 5: Make sure this is the only process running in the job. We already should have
         // cleared up all processes other than the system task at this point, but wait on any that
         // might be around for good measure.
         //
         // Use unwrap liberally since we're shutting down anyway and errors will still tear down the
         // kernel.
         let kernel_job = fuchsia_runtime::job_default();
         assert_eq!(kernel_job.children().unwrap(), &[], "starnix does not create any child jobs");
         let own_koid = fuchsia_runtime::process_self().koid().unwrap();

         log_info!("waiting for this to be the only process in the job");
         loop {
             let mut remaining_processes = kernel_job
                 .processes()
                 .unwrap()
                 .into_iter()
                 // Don't wait for ourselves to exit.
                 .filter(|pid| pid != &own_koid)
                 .peekable();
             if remaining_processes.peek().is_none() {
                 log_info!("No stray Zircon processes.");
                 break;
             }

             let mut terminated_signals = vec![];
             for pid in remaining_processes {
                 let handle = match kernel_job
                     .get_child(&pid, zx::Rights::BASIC | zx::Rights::PROPERTY | zx::Rights::DESTROY)
                 {
                     Ok(h) => h,
                     Err(e) => {
                         // Skip this process for now; the outer loop lists the job's processes
                         // again on its next iteration.
                         log_info!(pid:?, e:?; "failed to get child process from job");
                         continue;
                     }
                 };
                 log_info!(
                     pid:?,
                     name:? = handle.get_name();
                     "waiting on process terminated signal"
                 );
                 terminated_signals
                     .push(fuchsia_async::OnSignals::new(handle, zx::Signals::PROCESS_TERMINATED));
             }
             log_info!("waiting on process terminated signals");
             futures::future::join_all(terminated_signals).await;
         }

         // Step 6: Forcibly unmounts the mounts' FileSystems.
         log_info!("clearing mounts");
         self.mounts.clear();

         // Step 7: Tell CF the container stopped.
         log_info!("all non-root processes killed, notifying CF container is stopped");
         if let Some(control_handle) = self.container_control_handle.lock().take() {
             log_info!("Notifying CF that the container has stopped.");
             control_handle
                 .send_on_stop(ComponentStopInfo {
                     termination_status: Some(zx::Status::OK.into_raw()),
                     exit_code: Some(0),
                     ..ComponentStopInfo::default()
                 })
                 .unwrap();
             control_handle.shutdown_with_epitaph(zx::Status::OK);
         } else {
             log_warn!("Shutdown invoked without a container controller control handle.");
         }

         // Step 8: exiting this process.
         log_info!("All tasks killed, exiting Starnix kernel root process.");
         // Normally a Rust program exits its process by calling `std::process::exit()` which goes
         // through libc to exit the program. This runs drop impls on any thread-local variables
         // which can cause issues during Starnix shutdown when we haven't yet integrated every
         // subsystem with the shutdown flow. While those issues are indicative of underlying
         // problems, we can't solve them without finishing the implementation of graceful shutdown.
         // Instead, ask Zircon to exit our process directly, bypassing any libc atexit handlers.
         // TODO(https://fxbug.dev/295073633) return from main instead of avoiding atexit handlers
         zx::Process::exit(0);
     }

     /// Returns whether shutdown of this kernel has been initiated.
     ///
     /// Reads the `shutting_down` flag with `Acquire` ordering.
     pub fn is_shutting_down(&self) -> bool {
         let flag = &self.shutting_down;
         flag.load(Ordering::Acquire)
     }

     /// Opens the device file (driver) identified by `dev`.
     ///
     /// This forwards every argument unchanged to the kernel's device registry and returns its
     /// result.
     pub fn open_device<L>(
         &self,
         locked: &mut Locked<L>,
         current_task: &CurrentTask,
         node: &NamespaceNode,
         flags: OpenFlags,
         dev: DeviceType,
         mode: DeviceMode,
     ) -> Result<Box<dyn FileOps>, Errno>
     where
         L: LockEqualOrBefore<FileOpsCore>,
     {
         let registry = &self.device_registry;
         registry.open_device(locked, current_task, node, flags, dev, mode)
     }

     /// Returns a shared reference to the Audit Framework logger.
     ///
     /// The logger is stored in the kernel's expando and is created lazily: if the expando has
     /// no `AuditLogger` yet, one is built from this kernel.
     pub fn audit_logger(&self) -> Arc<AuditLogger> {
         let make_logger = || AuditLogger::new(self);
         self.expando.get_or_init(make_logger)
     }

     /// Returns a reference to the Generic Netlink implementation.
     ///
     /// Initialization is lazy: the first call creates the Generic Netlink server and spawns its
     /// worker future ("generic_netlink_worker") on the kernel threads. Later calls return the
     /// already-initialized instance.
     pub fn generic_netlink(&self) -> &GenericNetlink<NetlinkToClientSender<GenericMessage>> {
         self.generic_netlink.get_or_init(|| {
             // nl80211 support follows the `wifi` kernel feature.
             let wifi_enabled = self.features.wifi;
             let (server, params) = GenericNetlink::new();
             let worker = move || async move {
                 crate::vfs::socket::run_generic_netlink_worker(params, wifi_enabled).await;
                 // The worker is not expected to finish.
                 log_error!("Generic Netlink future unexpectedly exited");
             };
             self.kthreads.spawn_future(worker, "generic_netlink_worker");
             server
         })
     }

     /// Return a reference to the [`netlink::Netlink`] implementation.
     ///
     /// This function follows the lazy initialization pattern, where the first
     /// call will instantiate the Netlink implementation.
     pub fn network_netlink(self: &Arc<Self>) -> &Netlink<NetlinkContextImpl> {
         self.network_netlink.get_or_init(|| {
             let (network_netlink, worker_params) =
                 Netlink::new(InterfacesHandlerImpl(self.weak_self.clone()));

             let kernel = self.clone();
             self.kthreads.spawn_future(
                 move || async move {
                     netlink::run_netlink_worker(
                         worker_params,
                         NetlinkAccessControl::new(kernel.kthreads.system_task()),
                     )
                     .await;
                     log_error!(tag = NETLINK_LOG_TAG; "Netlink async worker unexpectedly exited");
                 },
                 "network_netlink_worker",
             );
             network_netlink
         })
     }

     /// Returns the kernel's [`IpTables`], using `IpTables::new` as the initializer passed to
     /// `get_or_init`.
     pub fn iptables(&self) -> &IpTables {
         self.iptables.get_or_init(IpTables::new)
     }

     /// Connects to the protocol `P` served at `filename` under the container's `/svc`
     /// namespace entry and returns the client end of the new connection.
     ///
     /// # Errors
     ///
     /// * `ENOENT` if the `/svc` namespace channel cannot be obtained (the cause is logged).
     /// * The status reported by `fdio::service_connect_at`, converted with
     ///   `from_status_like_fdio!`, if the connection request fails.
     #[allow(unused)]
     pub fn connect_to_named_protocol_at_container_svc<P: ProtocolMarker>(
         &self,
         filename: &str,
     ) -> Result<ClientEnd<P>, Errno> {
         let channel = self.container_namespace.get_namespace_channel("/svc").map_err(|err| {
             log_error!("Unable to get /svc namespace channel! {}", err);
             errno!(ENOENT)
         })?;

         // The server end is sent to the container; the caller keeps the client end.
         let (client_end, server_end) = create_endpoints::<P>();
         fdio::service_connect_at(channel.as_ref(), filename, server_end.into_channel())
             .map_err(|status| from_status_like_fdio!(status))?;
         Ok(client_end)
     }

     /// Connects to the discoverable protocol `P` in the container's `/svc` and returns the
     /// client end.
     ///
     /// Equivalent to calling [`Self::connect_to_named_protocol_at_container_svc`] with
     /// `P::PROTOCOL_NAME` as the filename; errors are propagated unchanged.
     pub fn connect_to_protocol_at_container_svc<P: DiscoverableProtocolMarker>(
         &self,
     ) -> Result<ClientEnd<P>, Errno> {
         let protocol_name = P::PROTOCOL_NAME;
         self.connect_to_named_protocol_at_container_svc::<P>(protocol_name)
     }

     /// Registers a syscall log filter built from `name`.
     ///
     /// If an equal filter is already registered this is a no-op. Otherwise the filter is
     /// appended and every thread group in the pid table is asked to re-sync its syscall log
     /// level.
     pub fn add_syscall_log_filter(&self, name: &str) {
         let candidate = SyscallLogFilter::new(name.to_string());

         // The filter list lock is released at the end of this block, before the pid table
         // is read.
         let newly_added = {
             let mut active = self.syscall_log_filters.lock();
             if active.contains(&candidate) {
                 false
             } else {
                 active.push(candidate);
                 true
             }
         };
         if !newly_added {
             return;
         }

         for thread_group in self.pids.read().get_thread_groups() {
             thread_group.sync_syscall_log_level();
         }
     }

     /// Removes every registered syscall log filter.
     ///
     /// If no filter is registered this is a no-op. Otherwise the list is emptied and every
     /// thread group in the pid table is asked to re-sync its syscall log level.
     pub fn clear_syscall_log_filters(&self) {
         // The filter list lock is released at the end of this block, before the pid table
         // is read.
         let removed_any = {
             let mut active = self.syscall_log_filters.lock();
             if active.is_empty() {
                 false
             } else {
                 active.clear();
                 true
             }
         };
         if !removed_any {
             return;
         }

         for thread_group in self.pids.read().get_thread_groups() {
             thread_group.sync_syscall_log_level();
         }
     }

     /// Builds a fresh inspector describing every thread group known to the pid table.
     ///
     /// The inspector's root gets one child per thread group (named after the group leader)
     /// recording `koid` (when available), `pid`, `ppid`, `stopped`, the leader's `argv` and
     /// properties, and a `tasks` child holding one node per non-leader task. A final
     /// `memory_managers` child records the mapping summary accumulated over all distinct
     /// memory managers encountered.
     fn get_thread_groups_inspect(&self) -> fuchsia_inspect::Inspector {
         let inspector = fuchsia_inspect::Inspector::default();

         let thread_groups = inspector.root();
         let mut mm_summary = MappingSummary::default();
         // Addresses of the memory managers already folded into `mm_summary`, so that a memory
         // manager reachable from several tasks is only summarized once.
         let mut mms_summarized = HashSet::new();

         // Avoid holding locks for the entire iteration.
         let all_thread_groups = {
             let pid_table = self.pids.read();
             pid_table.get_thread_groups().collect::<Vec<_>>()
         };
         for thread_group in all_thread_groups {
             // Avoid holding the state lock while summarizing.
             let (ppid, tasks) = {
                 let tg = thread_group.read();
                 (tg.get_ppid() as i64, tg.tasks().map(TempRef::into_static).collect::<Vec<_>>())
             };

             let tg_node = thread_groups.create_child(format!("{}", thread_group.leader));
             // The koid is only recorded when it can be retrieved; a failure is ignored.
             if let Ok(koid) = &thread_group.process.koid() {
                 tg_node.record_int("koid", koid.raw_koid() as i64);
             }
             tg_node.record_int("pid", thread_group.leader as i64);
             tg_node.record_int("ppid", ppid);
             tg_node.record_bool("stopped", thread_group.load_stopped() == StopState::GroupStopped);

             let tasks_node = tg_node.create_child("tasks");
             for task in tasks {
                 // Tasks whose memory manager cannot be obtained simply contribute nothing to
                 // the summary.
                 if let Ok(mm) = task.mm() {
                     if mms_summarized.insert(Arc::as_ptr(&mm) as usize) {
                         mm.summarize(&mut mm_summary);
                     }
                 }
                 // Records the per-task properties on `node`: the command, plus a `sched` child
                 // when the scheduler state differs from the default.
                 let set_properties = |node: &fuchsia_inspect::Node| {
                     node.record_string("command", task.command().to_string());

                     let scheduler_state = task.read().scheduler_state;
                     if !scheduler_state.is_default() {
                         node.record_child("sched", |node| {
                             // If the role name lookup fails, the error text is recorded in its
                             // place.
                             node.record_string(
                                 "role_name",
                                 self.scheduler
                                     .role_name(&task)
                                     .map(|n| Cow::Borrowed(n))
                                     .unwrap_or_else(|e| Cow::Owned(e.to_string())),
                             );
                             node.record_string("state", format!("{scheduler_state:?}"));
                         });
                     }
                 };
                 if task.tid == thread_group.leader {
                     // The leader's properties live directly on the thread group node, together
                     // with its argv. A failed argv read yields an empty list.
                     // NOTE(review): 256 is presumably a size limit for the read -- confirm
                     // against `read_argv`.
                     let mut argv = task.read_argv(256).unwrap_or_default();

                     // Any runtime that overwrites argv is likely to leave a lot of trailing
                     // nulls, no need to print those in inspect.
                     argv.retain(|arg| !arg.is_empty());

                     let inspect_argv = tg_node.create_string_array("argv", argv.len());
                     for (i, arg) in argv.iter().enumerate() {
                         inspect_argv.set(i, arg.to_string());
                     }
                     tg_node.record(inspect_argv);

                     set_properties(&tg_node);
                 } else {
                     // Every other task gets its own child under `tasks`, named after its tid.
                     tasks_node.record_child(task.tid.to_string(), |task_node| {
                         set_properties(task_node);
                     });
                 };
             }
             // Hand ownership of the nodes to their parents so they stay in the hierarchy.
             tg_node.record(tasks_node);
             thread_groups.record(tg_node);
         }

         thread_groups.record_child("memory_managers", |node| mm_summary.record(node));

         inspector
     }

     /// Creates a memory attribution observer for `control_handle`.
     ///
     /// The request is delegated to the kernel's memory attribution manager and its result is
     /// returned unchanged.
     pub fn new_memory_attribution_observer(
         &self,
         control_handle: fattribution::ProviderControlHandle,
     ) -> attribution_server::Observer {
         let manager = &self.memory_attribution_manager;
         manager.new_observer(control_handle)
     }

     /// Opens a directory from the container's namespace at `path`.
     ///
     /// The closest existing namespace entry for `path` is located first. An open of the
     /// remaining subdirectory path is then issued underneath that entry using `open_flags`.
     /// The returned tuple holds a synchronous proxy to the namespace entry and the remaining
     /// subdirectory path.
     ///
     /// For example, given `/path/to/foo/bar` with namespace entries for `/path/to/foo` and
     /// `/path/to`, the entry for `/path/to/foo` is selected and `/bar` is opened underneath
     /// it with `open_flags`. The result is the proxy to `/path/to/foo` together with the
     /// child path `/bar`, which the caller can expect to have been initialized.
     ///
     /// # Errors
     ///
     /// * `ENOENT` if no namespace channel can be found for `path` (the cause is logged).
     /// * `EIO` if issuing the open for the remaining subdirectory fails (the cause is logged).
     pub fn open_ns_dir(
         &self,
         path: &str,
         open_flags: fio::Flags,
     ) -> Result<(fio::DirectorySynchronousProxy, String), Errno> {
         let ns_path = PathBuf::from(path);
         let (root_channel, remaining_subdir) =
             self.container_namespace.find_closest_channel(&ns_path).map_err(|err| {
                 log_error!(
                     "Unable to find a channel for {}. Received error: {}",
                     ns_path.display(),
                     err
                 );
                 errno!(ENOENT)
             })?;

         // Only the server end is used: it is handed to `open_at` to issue the open for the
         // remaining path. The client end is discarded right away.
         let (_, server_end) = create_endpoints::<fio::DirectoryMarker>();
         fdio::open_at(&root_channel, &remaining_subdir, open_flags, server_end.into_channel())
             .map_err(|e| {
                 log_error!("Failed to intialize the subdirs: {}", e);
                 errno!(EIO)
             })?;

         Ok((fio::DirectorySynchronousProxy::new(root_channel), remaining_subdir))
     }

     /// Returns an iterator over the command line arguments, each split into a name and an
     /// optional value.
     ///
     /// The command line is tokenized with [`parse_cmdline`]. An argument containing `=` is
     /// split at the first `=` into `name` and `Some(value)`; any other argument is reported
     /// whole as `name` with `value` set to `None`.
     ///
     /// If `self.cmdline.to_str()` does not yield a string, the default (empty) string is
     /// parsed instead and the iterator produces no arguments.
     pub fn cmdline_args_iter(&self) -> impl Iterator<Item = ArgNameAndValue<'_>> {
         let cmdline = self.cmdline.to_str().unwrap_or_default();
         // Every token maps to exactly one item, so a plain `map` is sufficient.
         parse_cmdline(cmdline).map(|arg| match arg.split_once('=') {
             Some((name, value)) => ArgNameAndValue { name, value: Some(value) },
             None => ArgNameAndValue { name: arg, value: None },
         })
     }

     /// Returns the container-configured [`CacheConfig`].
     ///
     /// The capacity is taken from the `dirent_cache_size` feature setting, cast to `usize`.
     pub fn fs_cache_config(&self) -> CacheConfig {
         let capacity = self.features.dirent_cache_size as usize;
         CacheConfig { capacity }
     }
 }

 /// Splits `cmdline` into space-separated arguments, returned as slices of the input.
 ///
 /// Spaces inside double quotes do not separate arguments. A `"` toggles quoting unless the
 /// character immediately before it is a backslash. Quotes and backslashes are kept in the
 /// returned slices. Runs of spaces between arguments are skipped, and an argument that is
 /// still open at the end of the input (including one with an unterminated quote) is returned
 /// as-is.
 pub fn parse_cmdline(cmdline: &str) -> impl Iterator<Item = &str> {
     let mut tokens: Vec<&str> = Vec::new();
     // Byte offset at which the argument currently being scanned begins, if any.
     let mut token_start: Option<usize> = None;
     let mut quoted = false;
     let mut prev = ' ';

     for (offset, ch) in cmdline.char_indices() {
         match token_start {
             // Between arguments: the first non-space character opens a new one.
             None if ch != ' ' => {
                 token_start = Some(offset);
                 if ch == '"' {
                     quoted = true;
                 }
             }
             None => {}
             // An unquoted space closes the current argument.
             Some(begin) if ch == ' ' && !quoted => {
                 tokens.push(&cmdline[begin..offset]);
                 token_start = None;
             }
             // A quote that is not preceded by a backslash toggles quoting.
             Some(_) if ch == '"' && prev != '\\' => quoted = !quoted,
             Some(_) => {}
         }
         prev = ch;
     }

     // Flush the trailing argument, if the input did not end on a separator.
     tokens.extend(token_start.map(|begin| &cmdline[begin..]));
     tokens.into_iter()
 }

 impl std::fmt::Debug for Kernel {
     /// Formats the kernel as the bare type name `Kernel`; no fields are printed.
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.write_str("Kernel")
     }
 }

 // TODO(https://fxbug.dev/380427153): move arch dependent code to `kernel/core/arch/*`.
 /// Computes the `arch32` hardware capability words from the detected CPU features.
 ///
 /// `hwcap` starts from a fixed baseline (`COMPAT_ARM32_ELF_HWCAP`) and `hwcap2` starts at
 /// zero; each recognized feature flag then adds its bits to one of the two words. Feature
 /// flags without a mapping here are ignored.
 #[cfg(target_arch = "aarch64")]
 fn arm32_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
     use starnix_uapi::arch32;

     // Bits that are always reported, independent of `cpu_feature_flags`.
     const COMPAT_ARM32_ELF_HWCAP: u32 = arch32::HWCAP_HALF
         | arch32::HWCAP_THUMB
         | arch32::HWCAP_FAST_MULT
         | arch32::HWCAP_EDSP
         | arch32::HWCAP_TLS
         | arch32::HWCAP_IDIV // == IDIVA | IDIVT.
         | arch32::HWCAP_LPAE
         | arch32::HWCAP_EVTSTRM;

     let mut hwcap = COMPAT_ARM32_ELF_HWCAP;
     let mut hwcap2 = 0;
     for feature in cpu_feature_flags.iter() {
         // Each arm yields the bits to add as `(hwcap bits, hwcap2 bits)`.
         let (bits, bits2) = match feature {
             CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (arch32::HWCAP_NEON, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (0, arch32::HWCAP2_AES),
             CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (0, arch32::HWCAP2_PMULL),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (0, arch32::HWCAP2_SHA1),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (0, arch32::HWCAP2_SHA2),
             CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (0, arch32::HWCAP2_CRC32),
             CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (arch32::HWCAP_I8MM, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (arch32::HWCAP_ASIMDFHM, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (arch32::HWCAP_ASIMDDP, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_FP => {
                 (arch32::HWCAP_VFP | arch32::HWCAP_VFPv3 | arch32::HWCAP_VFPv4, 0)
             }
             _ => (0, 0),
         };
         hwcap |= bits;
         hwcap2 |= bits2;
     }
     HwCap { hwcap, hwcap2 }
 }

 /// Computes the `arch64` hardware capability words from the detected CPU features.
 ///
 /// Both words start at zero; each recognized feature flag adds its bit to either `hwcap` or
 /// `hwcap2`. Feature flags without a mapping here are ignored.
 #[cfg(target_arch = "aarch64")]
 fn arm64_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
     // See https://docs.kernel.org/arch/arm64/elf_hwcaps.html for details.
     use starnix_uapi as uapi;

     let mut hwcap = 0;
     let mut hwcap2 = 0;
     for feature in cpu_feature_flags.iter() {
         // Each arm yields the bits to add as `(hwcap bits, hwcap2 bits)`.
         let (bits, bits2) = match feature {
             CpuFeatureFlags::ARM64_FEATURE_ISA_FP => (uapi::HWCAP_FP, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (uapi::HWCAP_ASIMD, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (uapi::HWCAP_AES, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (uapi::HWCAP_PMULL, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (uapi::HWCAP_SHA1, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (uapi::HWCAP_SHA2, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (uapi::HWCAP_CRC32, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (0, uapi::HWCAP2_I8MM),
             CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (uapi::HWCAP_ASIMDFHM, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (uapi::HWCAP_ASIMDDP, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SM3 => (uapi::HWCAP_SM3, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SM4 => (uapi::HWCAP_SM4, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SHA3 => (uapi::HWCAP_SHA3, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_SHA512 => (uapi::HWCAP_SHA512, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_ATOMICS => (uapi::HWCAP_ATOMICS, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_RDM => (uapi::HWCAP_ASIMDRDM, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_TS => (uapi::HWCAP_FLAGM, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_DPB => (uapi::HWCAP_DCPOP, 0),
             CpuFeatureFlags::ARM64_FEATURE_ISA_RNDR => (0, uapi::HWCAP2_RNG),
             _ => (0, 0),
         };
         hwcap |= bits;
         hwcap2 |= bits2;
     }
     HwCap { hwcap, hwcap2 }
 }

 impl HwCaps {
     /// Builds the hardware capabilities from the detected CPU features.
     ///
     /// On aarch64 both the `arch32` and `arch64` capability sets are derived from
     /// `cpu_feature_flags`.
     #[cfg(target_arch = "aarch64")]
     pub fn from_cpu_feature_flags(cpu_feature_flags: CpuFeatureFlags) -> Self {
         let arch32 = arm32_hwcap(cpu_feature_flags);
         let arch64 = arm64_hwcap(cpu_feature_flags);
         Self { arch32, arch64 }
     }

     /// Builds the hardware capabilities from the detected CPU features.
     ///
     /// On targets other than aarch64 the feature flags are ignored and `arch64` is set to
     /// `HwCap::default()`.
     #[cfg(not(target_arch = "aarch64"))]
     pub fn from_cpu_feature_flags(_cpu_feature_flags: CpuFeatureFlags) -> Self {
         let arch64 = HwCap::default();
         Self { arch64 }
     }
 }

 #[cfg(test)]
 mod test {
     use super::parse_cmdline;

     /// Collects the parsed arguments so they can be compared against a `Vec` literal.
     fn parsed(cmdline: &str) -> Vec<&str> {
         parse_cmdline(cmdline).collect()
     }

     /// Quoted, `name=value` and escaped-quote arguments are kept intact, quotes included.
     #[test]
     fn test_parse_cmdline() {
         let cmdline =
             r#"first second=third "fourth fifth" sixth="seventh eighth" "ninth\" tenth" eleventh"#;
         let expected = vec![
             "first",
             "second=third",
             "\"fourth fifth\"",
             "sixth=\"seventh eighth\"",
             "\"ninth\\\" tenth\"",
             "eleventh",
         ];
         assert_eq!(parsed(cmdline), expected);
     }

     /// Empty and all-space inputs produce no arguments.
     #[test]
     fn test_parse_cmdline_empty_and_blank() {
         assert!(parsed("").is_empty());
         assert!(parsed("    ").is_empty());
     }

     /// Leading, trailing and repeated spaces never produce empty arguments.
     #[test]
     fn test_parse_cmdline_repeated_spaces() {
         assert_eq!(parsed("  a   b  "), vec!["a", "b"]);
     }

     /// An unterminated quote swallows the rest of the input into one argument.
     #[test]
     fn test_parse_cmdline_unterminated_quote() {
         assert_eq!(parsed(r#"a "b c"#), vec!["a", "\"b c"]);
     }

     /// A quote may open in the middle of an argument.
     #[test]
     fn test_parse_cmdline_quote_inside_argument() {
         assert_eq!(parsed(r#"a"b c"d e"#), vec!["a\"b c\"d", "e"]);
     }

     /// Multi-byte characters are sliced on character boundaries.
     #[test]
     fn test_parse_cmdline_multibyte() {
         assert_eq!(parsed("é=ü x"), vec!["é=ü", "x"]);
     }
 }