// Source blob: b306823bce775fc024ec998985fe549af977dbf2
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use bstr::BString;
use fidl::{
endpoints::{create_endpoints, ClientEnd, ProtocolMarker, Proxy},
AsHandleRef,
};
use fidl_fuchsia_io as fio;
use fuchsia_async as fasync;
use fuchsia_zircon as zx;
use futures::FutureExt;
use netlink::{interfaces::InterfacesHandler, Netlink, NETLINK_LOG_TAG};
use once_cell::sync::OnceCell;
use selinux::security_server::SecurityServer;
use starnix_lock::{declare_lock_levels, OrderedRwLock, RwLock};
use std::{
collections::BTreeMap,
sync::{
atomic::{AtomicI32, AtomicU16, AtomicU32, AtomicU64, AtomicU8},
Arc, Weak,
},
};
use crate::{
device::{
framebuffer::Framebuffer, input::InputDevice, loop_device::LoopDeviceRegistry,
BinderDriver, DeviceMode, DeviceOps, DeviceRegistry, Features,
},
diagnostics::CoreDumpList,
fs::{
devtmpfs::devtmpfs_create_device,
inotify::InotifyLimits,
kobject::{KObjectDeviceAttribute, KType, UEventAction},
socket::{
GenericMessage, GenericNetlink, NetlinkSenderReceiverProvider, NetlinkToClientSender,
SocketAddress,
},
sysfs::{BlockDeviceDirectory, DeviceDirectory, SysFsDirectory},
FileOps, FileSystemHandle, FsNode,
},
logging::log_error,
mm::{FutexTable, SharedFutexKey},
power::PowerManager,
task::{
AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, IpTables,
KernelThreads, NetstackDevices, PidTable, StopState, UtsNamespace, UtsNamespaceHandle,
},
types::{errno, from_status_like_fdio, DeviceType, Errno, OpenFlags},
vdso::vdso_loader::Vdso,
};
// Declares the `KernelIpTables` lock level, which is the level parameter of the
// `OrderedRwLock` guarding `Kernel::iptables`.
declare_lock_levels![KernelIpTables: OrderedRwLock<IpTables>];
// NOTE(review): `lock_levels` is presumably the module generated by the macro above — confirm.
use self::lock_levels::*;
/// The shared, mutable state for the entire Starnix kernel.
///
/// The `Kernel` object holds all kernel threads, userspace tasks, and file system resources for a
/// single instance of the Starnix kernel. In production, there is one instance of this object for
/// the entire Starnix kernel. However, multiple instances of this object can be created in one
/// process during unit testing.
///
/// The structure of this object will likely need to evolve as we implement more namespacing and
/// isolation mechanisms, such as `namespaces(7)` and `pid_namespaces(7)`.
pub struct Kernel {
/// The kernel threads running on behalf of this kernel.
pub kthreads: KernelThreads,
/// The processes and threads running in this kernel, organized by pid_t.
pub pids: RwLock<PidTable>,
/// The default namespace for abstract AF_UNIX sockets in this kernel.
///
/// Rather than use this default namespace, abstract socket addresses
/// should be looked up in the AbstractSocketNamespace on each Task
/// object because some Task objects might have a non-default namespace.
pub default_abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
/// The default namespace for abstract AF_VSOCK sockets in this kernel.
pub default_abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
/// The kernel command line. Shows up in /proc/cmdline.
pub cmdline: BString,
// The file system cells below all start out empty (`OnceCell::new()` in `Kernel::new`).
// NOTE(review): each is presumably filled in by the module named in its comment — confirm.
/// Owned by anon_node.rs
pub anon_fs: OnceCell<FileSystemHandle>,
/// Owned by pipe.rs
pub pipe_fs: OnceCell<FileSystemHandle>,
/// Owned by socket.rs
pub socket_fs: OnceCell<FileSystemHandle>,
/// Owned by devtmpfs.rs
pub dev_tmp_fs: OnceCell<FileSystemHandle>,
/// Owned by devpts.rs
pub dev_pts_fs: OnceCell<FileSystemHandle>,
/// Owned by procfs.rs
pub proc_fs: OnceCell<FileSystemHandle>,
/// Owned by sysfs.rs
pub sys_fs: OnceCell<FileSystemHandle>,
/// Owned by selinux/fs.rs
pub selinux_fs: OnceCell<FileSystemHandle>,
/// The SELinux security server. `Some` only if the `selinux` feature was enabled when the
/// kernel was constructed.
pub security_server: Option<SecurityServer>,
/// Owned by tracefs/fs.rs
pub trace_fs: OnceCell<FileSystemHandle>,
/// The registry of device drivers.
pub device_registry: DeviceRegistry,
/// The features enabled for the container this kernel is associated with, as specified in
/// the container's configuration file.
pub features: Features,
/// The service directory of the container.
///
/// When this is `None`, `connect_to_named_protocol_at_container_svc` fails with `ENOENT`.
container_svc: Option<fio::DirectoryProxy>,
/// The data directory of the container.
pub container_data_dir: Option<fio::DirectorySynchronousProxy>,
/// The registry of active loop devices.
///
/// See <https://man7.org/linux/man-pages/man4/loop.4.html>
pub loop_device_registry: Arc<LoopDeviceRegistry>,
/// A `Framebuffer` that can be used to display a view in the workstation UI. If the container
/// specifies the `framebuffer` feature, this framebuffer will be registered as a device.
///
/// When a component is run in that container and also specifies the `framebuffer` feature, the
/// framebuffer will be served as the view of the component.
pub framebuffer: Arc<Framebuffer>,
/// An `InputDevice` that can be opened to read input events from Fuchsia.
///
/// If the container specifies the `framebuffer` feature, this `InputDevice` will be registered
/// as a device.
///
/// When a component is run in that container, and also specifies the `framebuffer` feature,
/// Starnix will relay input events from Fuchsia to the component.
pub input_device: Arc<InputDevice>,
/// The binder drivers registered for this container, indexed by device type.
pub binders: RwLock<BTreeMap<DeviceType, Arc<BinderDriver>>>,
/// The iptables used for filtering network packets.
pub iptables: OrderedRwLock<IpTables, KernelIpTables>,
/// The futexes shared across processes.
pub shared_futexes: FutexTable<SharedFutexKey>,
/// The default UTS namespace for all tasks.
///
/// Because each task can have its own UTS namespace, you probably want to use
/// the UTS namespace handle of the task, which may or may not point to this one.
pub root_uts_ns: UtsNamespaceHandle,
/// A struct containing a VMO with a vDSO implementation, if implemented for a given
/// architecture, and possibly an offset for a sigreturn function.
pub vdso: Vdso,
/// The table of devices installed on the netstack and their associated
/// state local to this `Kernel`.
pub netstack_devices: Arc<NetstackDevices>,
/// The implementation of generic Netlink protocol families.
///
/// Created on the first call to `Kernel::generic_netlink`.
generic_netlink: OnceCell<GenericNetlink<NetlinkToClientSender<GenericMessage>>>,
/// The implementation of networking-related Netlink protocol families.
///
/// Created on the first call to `Kernel::network_netlink`.
network_netlink: OnceCell<Netlink<NetlinkSenderReceiverProvider>>,
/// Inspect instrumentation for this kernel instance.
inspect_node: fuchsia_inspect::Node,
/// Diagnostics information about crashed tasks.
pub core_dumps: CoreDumpList,
/// The kinds of seccomp actions that get logged, stored as a bit vector.
///
/// Each potential SeccompAction gets a bit in the vector, as specified by
/// SeccompAction::logged_bit_offset. If the bit is set, that means the
/// action should be logged when it is taken, subject to the caveats
/// described in seccomp(2). The value of the bit vector is exposed to users
/// in a text form in the file /proc/sys/kernel/seccomp/actions_logged.
pub actions_logged: AtomicU16,
/// The manager for power subsystems including reboot and suspend.
pub power_manager: PowerManager,
/// Unique IDs for new mounts and mount namespaces.
pub next_mount_id: AtomicU64,
/// Counter for peer group IDs; starts at 1.
// NOTE(review): presumably these are mount peer groups — confirm.
pub next_peer_group_id: AtomicU64,
/// Counter for namespace IDs; starts at 1.
// NOTE(review): presumably these are mount namespaces, per the `next_mount_id` comment — confirm.
pub next_namespace_id: AtomicU64,
/// Unique cookie used to link two inotify events, usually an IN_MOVE_FROM/IN_MOVE_TO pair.
pub next_inotify_cookie: AtomicU32,
/// The inotify limits. `Kernel::new` initializes them to `max_queued_events` = 16384,
/// `max_user_instances` = 128 and `max_user_watches` = 1048576.
pub inotify_limits: InotifyLimits,
/// Controls which processes a process is allowed to ptrace. Initialized to 0.
/// See Documentation/security/Yama.txt
pub ptrace_scope: AtomicU8,
/// The Fuchsia build version returned by `fuchsia.buildinfo.Provider`.
pub build_version: OnceCell<String>,
}
/// An implementation of [`InterfacesHandler`] that forwards link additions and removals to the
/// kernel's [`NetstackDevices`] table.
///
/// This holds a `Weak<Kernel>` because it is held within a [`Netlink`] which
/// is itself held within an `Arc<Kernel>`. Holding an `Arc<T>` within an
/// `Arc<T>` prevents the `Arc`'s ref count from ever reaching 0, causing a
/// leak.
///
/// If the weak reference can no longer be upgraded, notifications are silently ignored.
struct InterfacesHandlerImpl(Weak<Kernel>);
impl InterfacesHandlerImpl {
fn with_netstack_devices<
F: FnOnce(&Arc<NetstackDevices>, Option<&FileSystemHandle>, Option<&FileSystemHandle>),
>(
&mut self,
f: F,
) {
let Self(rc) = self;
let Some(rc) = rc.upgrade() else {
// The kernel may be getting torn-down.
return;
};
f(&rc.netstack_devices, rc.proc_fs.get(), rc.sys_fs.get())
}
}
impl InterfacesHandler for InterfacesHandlerImpl {
    /// Adds the link called `name` to the kernel's netstack device table, passing along the
    /// proc and sys file systems if they are available.
    fn handle_new_link(&mut self, name: &str) {
        self.with_netstack_devices(|devices, procfs, sysfs| {
            devices.add_dev(name, procfs, sysfs)
        })
    }

    /// Removes the link called `name` from the kernel's netstack device table.
    fn handle_deleted_link(&mut self, name: &str) {
        self.with_netstack_devices(|devices, _procfs, _sysfs| {
            devices.remove_dev(name)
        })
    }
}
impl Kernel {
/// Builds a new `Kernel` and returns it wrapped in an `Arc`.
///
/// * `cmdline` - the kernel command line.
/// * `features` - the container features; `aspect_ratio` is used for the framebuffer and
///   `selinux` decides whether a `SecurityServer` is created.
/// * `container_svc` / `container_data_dir` - optional container directories, stored as-is.
/// * `inspect_node` - the Inspect node this kernel records its diagnostics under.
///
/// All lazily-initialized cells start empty and all ID counters start at 1. A lazy
/// "thread_groups" child is registered on `inspect_node`.
///
/// # Panics
///
/// Panics if the framebuffer cannot be created.
pub fn new(
    cmdline: BString,
    features: Features,
    container_svc: Option<fio::DirectoryProxy>,
    container_data_dir: Option<fio::DirectorySynchronousProxy>,
    inspect_node: fuchsia_inspect::Node,
) -> Result<Arc<Kernel>, zx::Status> {
    // Address constructors handed to the default abstract socket namespaces.
    let make_unix_address = Box::new(|x: Vec<u8>| -> SocketAddress { SocketAddress::Unix(x) });
    let make_vsock_address = Box::new(|x: u32| -> SocketAddress { SocketAddress::Vsock(x) });

    let fb =
        Framebuffer::new(features.aspect_ratio.as_ref()).expect("Failed to create framebuffer");
    let input = InputDevice::new(fb.clone(), &inspect_node);
    let crash_reports = CoreDumpList::new(inspect_node.create_child("coredumps"));
    let selinux_server = features.selinux.then(SecurityServer::new);

    let kernel = Arc::new(Kernel {
        kthreads: KernelThreads::default(),
        pids: RwLock::new(PidTable::new()),
        default_abstract_socket_namespace: AbstractUnixSocketNamespace::new(make_unix_address),
        default_abstract_vsock_namespace: AbstractVsockSocketNamespace::new(make_vsock_address),
        cmdline,
        anon_fs: OnceCell::new(),
        pipe_fs: OnceCell::new(),
        dev_tmp_fs: OnceCell::new(),
        dev_pts_fs: OnceCell::new(),
        proc_fs: OnceCell::new(),
        socket_fs: OnceCell::new(),
        sys_fs: OnceCell::new(),
        selinux_fs: OnceCell::new(),
        security_server: selinux_server,
        trace_fs: OnceCell::new(),
        device_registry: DeviceRegistry::new(),
        features,
        container_svc,
        container_data_dir,
        loop_device_registry: Default::default(),
        framebuffer: fb,
        input_device: input,
        binders: Default::default(),
        iptables: OrderedRwLock::new(IpTables::new()),
        shared_futexes: FutexTable::<SharedFutexKey>::default(),
        root_uts_ns: Arc::new(RwLock::new(UtsNamespace::default())),
        vdso: Vdso::new(),
        netstack_devices: Arc::default(),
        generic_netlink: OnceCell::new(),
        network_netlink: OnceCell::new(),
        inspect_node,
        core_dumps: crash_reports,
        actions_logged: AtomicU16::new(0),
        power_manager: PowerManager::default(),
        next_mount_id: AtomicU64::new(1),
        next_peer_group_id: AtomicU64::new(1),
        next_namespace_id: AtomicU64::new(1),
        next_inotify_cookie: AtomicU32::new(1),
        inotify_limits: InotifyLimits {
            max_queued_events: AtomicI32::new(16384),
            max_user_instances: AtomicI32::new(128),
            max_user_watches: AtomicI32::new(1048576),
        },
        ptrace_scope: AtomicU8::new(0),
        build_version: OnceCell::new(),
    });

    // The lazy inspect node is owned by the kernel itself, so the callback only gets a weak
    // reference; a strong one would create an `Arc` cycle.
    let weak_kernel = Arc::downgrade(&kernel);
    kernel.inspect_node.record_lazy_child("thread_groups", move || match weak_kernel.upgrade() {
        Some(kernel) => {
            let inspector = kernel.get_thread_groups_inspect();
            async move { Ok(inspector) }.boxed()
        }
        None => async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed(),
    });

    Ok(kernel)
}
/// Add a device in the hierarchy tree.
///
/// A character device is added under the class carried by `dev_attr`. A block device is
/// added under the "block" class on the device registry's virtual bus, which is created on
/// demand.
///
/// Once the kobject exists, an `Add` uevent is dispatched for it and a node for the device is
/// created in devtmpfs. Failure to create the devtmpfs node is logged, not returned.
///
/// # Panics
///
/// Panics if `dev_attr` describes a character device that has no class.
pub fn add_device(self: &Arc<Self>, dev_attr: KObjectDeviceAttribute) {
    let kobj_device = match dev_attr.device.mode {
        DeviceMode::Char => dev_attr
            .class
            .expect("no class is associated with the device.")
            .get_or_create_child(
                &dev_attr.name,
                KType::Device(dev_attr.device.clone()),
                DeviceDirectory::new,
            ),
        DeviceMode::Block => {
            let block_class = self.device_registry.virtual_bus().get_or_create_child(
                b"block",
                KType::Class,
                SysFsDirectory::new,
            );
            block_class.get_or_create_child(
                &dev_attr.name,
                KType::Device(dev_attr.device.clone()),
                BlockDeviceDirectory::new,
            )
        }
    };
    self.device_registry.dispatch_uevent(UEventAction::Add, kobj_device);
    // This path handles both character and block devices, so the message must not claim the
    // device is a block device.
    if let Err(err) = devtmpfs_create_device(self, dev_attr.device.clone()) {
        log_error!("Cannot add device {:?} in devtmpfs ({:?})", dev_attr.device, err);
    }
}
/// Registers `dev_ops` for the device described by `dev_attr`, then adds the device to the
/// hierarchy tree via `add_device`.
///
/// The ops are registered as a character or block device according to the device's mode,
/// for a range of exactly one minor number. A registration failure is logged and the device
/// is still added afterwards.
pub fn add_and_register_device(
    self: &Arc<Self>,
    dev_attr: KObjectDeviceAttribute,
    dev_ops: impl DeviceOps,
) {
    let registry = &self.device_registry;
    let major = dev_attr.device.device_type.major();
    let minor = dev_attr.device.device_type.minor();

    let registration = match dev_attr.device.mode {
        DeviceMode::Char => registry.register_chrdev(major, minor, 1, dev_ops),
        DeviceMode::Block => registry.register_blkdev(major, minor, 1, dev_ops),
    };
    if let Some(err) = registration.err() {
        log_error!("Cannot register device {:?} ({:?})", dev_attr.device, err);
    }

    self.add_device(dev_attr);
}
/// Opens a device file (driver) identified by `dev`.
///
/// This is a thin wrapper: every argument is passed unchanged to the device registry's
/// `open_device`, and the registry's result (the opened `FileOps` or an `Errno`) is returned
/// as-is.
pub fn open_device(
&self,
current_task: &CurrentTask,
node: &FsNode,
flags: OpenFlags,
dev: DeviceType,
mode: DeviceMode,
) -> Result<Box<dyn FileOps>, Errno> {
self.device_registry.open_device(current_task, node, flags, dev, mode)
}
/// Returns the kernel's [`GenericNetlink`] implementation, creating it on first use.
///
/// On the first call this builds the Generic Netlink server and spawns a kernel thread that
/// drives the server's future on a local executor. Later calls return the cached instance.
pub(crate) fn generic_netlink(&self) -> &GenericNetlink<NetlinkToClientSender<GenericMessage>> {
    self.generic_netlink.get_or_init(|| {
        let (server, server_fut) = GenericNetlink::new();
        // NOTE(review): this spawns through `kthreads.spawner`, whereas `network_netlink`
        // calls `kthreads.spawn` directly — confirm the difference is intentional.
        self.kthreads.spawner.spawn(move || {
            // The server future is not expected to finish; log if it ever does.
            fasync::LocalExecutor::new().run_singlethreaded(server_fut);
            log_error!("Generic Netlink future unexpectedly exited");
        });
        server
    })
}
/// Returns the kernel's [`netlink::Netlink`] implementation, creating it on first use.
///
/// On the first call this builds the Netlink implementation with an interfaces handler that
/// holds only a weak reference to this kernel, and spawns a kernel thread that drives the
/// Netlink async worker on a local executor. Later calls return the cached instance.
pub(crate) fn network_netlink<'a>(
    self: &'a Arc<Self>,
) -> &'a Netlink<NetlinkSenderReceiverProvider> {
    self.network_netlink.get_or_init(|| {
        let interfaces_handler = InterfacesHandlerImpl(Arc::downgrade(self));
        let (netlink, async_worker) = Netlink::new(interfaces_handler);
        self.kthreads.spawn(move || {
            // The worker is not expected to finish; log if it ever does.
            fasync::LocalExecutor::new().run_singlethreaded(async_worker);
            log_error!(tag = NETLINK_LOG_TAG, "Netlink async worker unexpectedly exited");
        });
        netlink
    })
}
/// Connects to the protocol exposed to the container at `filename` and returns the client
/// end of the new channel.
///
/// # Errors
///
/// Returns `ENOENT` if this kernel has no container service directory. A failure to connect
/// is converted from the returned status with `from_status_like_fdio!`.
#[allow(unused)]
pub fn connect_to_named_protocol_at_container_svc<P: ProtocolMarker>(
    &self,
    filename: &str,
) -> Result<ClientEnd<P>, Errno> {
    let Some(svc_dir) = self.container_svc.as_ref() else {
        return Err(errno!(ENOENT));
    };
    let (client_end, server_end) = create_endpoints::<P>();
    let server_channel = server_end.into_channel();
    fdio::service_connect_at(svc_dir.as_channel().as_ref(), filename, server_channel)
        .map_err(|status| from_status_like_fdio!(status))?;
    Ok(client_end)
}
/// Returns the `mock_selinux` flag from this kernel's container features.
pub fn mock_selinux(&self) -> bool {
self.features.mock_selinux
}
/// Builds a new inspector describing every thread group in the pid table.
///
/// Each thread group gets a child node named after its leader pid, with `koid` (when
/// available), `pid`, `ppid` and `stopped` properties. The leader task's `command` and
/// non-default `sched_policy` are recorded on the thread group node itself; every other task
/// gets its own node under a `tasks` child.
fn get_thread_groups_inspect(&self) -> fuchsia_inspect::Inspector {
    let inspector = fuchsia_inspect::Inspector::default();
    let root = inspector.root();
    for thread_group in self.pids.read().get_thread_groups() {
        let leader = thread_group.leader;
        let tg_state = thread_group.read();
        let tg_node = root.create_child(leader.to_string());

        if let Ok(koid) = thread_group.process.get_koid() {
            tg_node.record_int("koid", koid.raw_koid() as i64);
        }
        tg_node.record_int("pid", leader as i64);
        tg_node.record_int("ppid", tg_state.get_ppid() as i64);
        let is_stopped = thread_group.load_stopped() == StopState::GroupStopped;
        tg_node.record_bool("stopped", is_stopped);

        let tasks_node = tg_node.create_child("tasks");
        for task in tg_state.tasks() {
            // Records the per-task properties on whichever node represents this task.
            let record_task = |node: &fuchsia_inspect::Node| {
                node.record_string("command", task.command().to_str().unwrap_or("{err}"));
                let sched_policy = task.read().scheduler_policy;
                if !sched_policy.is_default() {
                    node.record_string("sched_policy", format!("{sched_policy:?}"));
                }
            };
            if task.id != leader {
                tasks_node.record_child(task.id.to_string(), |task_node| {
                    record_task(task_node);
                });
            } else {
                // The leader shares its node with the thread group.
                record_task(&tg_node);
            }
        }

        // Hand ownership of the nodes to their parents so they outlive this loop iteration.
        tg_node.record(tasks_node);
        root.record(tg_node);
    }
    inspector
}
}
// `Kernel` formats as just its type name; none of its fields are included in the output.
impl std::fmt::Debug for Kernel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("Kernel")
    }
}