| // Copyright 2023 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| use crate::mm::MemoryAccessorExt; |
| use crate::signals::{SignalDetail, SignalInfo, SignalSource, send_standard_signal}; |
| use crate::task::{ |
| CurrentTask, EventHandler, ExitStatus, Kernel, Task, TaskFlags, WaitCanceler, WaitQueue, Waiter, |
| }; |
| use crate::vfs::buffers::{InputBuffer, OutputBuffer}; |
| use crate::vfs::{ |
| Anon, FdFlags, FdNumber, FileObject, FileObjectState, FileOps, fileops_impl_nonseekable, |
| fileops_impl_noop_sync, |
| }; |
| use bstr::ByteSlice; |
| use ebpf::{ |
| BPF_ABS, BPF_LD, BPF_ST, BpfProgramContext, CbpfConfig, EbpfProgram, MemoryId, NoMap, |
| ProgramArgument, Type, bpf_addressing_mode, bpf_class, convert_and_link_cbpf, |
| }; |
| use ebpf_api::SECCOMP_CBPF_CONFIG; |
| use linux_uapi::AUDIT_SECCOMP; |
| use starnix_lifecycle::AtomicU64Counter; |
| use starnix_logging::{log_warn, track_stub}; |
| use starnix_sync::{FileOpsCore, Locked, Mutex, Unlocked}; |
| use starnix_syscalls::decls::Syscall; |
| use starnix_syscalls::{SyscallArg, SyscallResult}; |
| use starnix_uapi::errors::Errno; |
| use starnix_uapi::open_flags::OpenFlags; |
| use starnix_uapi::signals::{SIGKILL, SIGSYS}; |
| #[cfg(target_arch = "aarch64")] |
| use starnix_uapi::user_address::ArchSpecific; |
| use starnix_uapi::user_address::{UserAddress, UserRef}; |
| use starnix_uapi::vfs::FdEvents; |
| use starnix_uapi::{ |
| __NR_exit, __NR_read, __NR_write, SECCOMP_IOCTL_NOTIF_ADDFD, SECCOMP_IOCTL_NOTIF_ID_VALID, |
| SECCOMP_IOCTL_NOTIF_RECV, SECCOMP_IOCTL_NOTIF_SEND, SECCOMP_MODE_DISABLED, SECCOMP_MODE_FILTER, |
| SECCOMP_MODE_STRICT, SECCOMP_RET_ACTION_FULL, SECCOMP_RET_DATA, |
| SECCOMP_USER_NOTIF_FLAG_CONTINUE, SYS_SECCOMP, errno, errno_from_code, error, seccomp_data, |
| seccomp_notif, seccomp_notif_resp, sock_filter, |
| }; |
| use std::collections::HashMap; |
| use std::sync::atomic::{AtomicU8, Ordering}; |
| use std::sync::{Arc, LazyLock}; |
| use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; |
| |
| #[cfg(target_arch = "aarch64")] |
| use starnix_uapi::__NR_clock_getres; |
| #[cfg(target_arch = "aarch64")] |
| use starnix_uapi::__NR_clock_gettime; |
| #[cfg(target_arch = "aarch64")] |
| use starnix_uapi::__NR_gettimeofday; |
| #[cfg(target_arch = "aarch64")] |
| use starnix_uapi::{AUDIT_ARCH_AARCH64, AUDIT_ARCH_ARM}; |
| |
| #[cfg(target_arch = "x86_64")] |
| use starnix_uapi::__NR_clock_gettime; |
| #[cfg(target_arch = "x86_64")] |
| use starnix_uapi::__NR_getcpu; |
| #[cfg(target_arch = "x86_64")] |
| use starnix_uapi::__NR_gettimeofday; |
| #[cfg(target_arch = "x86_64")] |
| use starnix_uapi::__NR_time; |
| #[cfg(target_arch = "x86_64")] |
| use starnix_uapi::AUDIT_ARCH_X86_64; |
| |
| #[cfg(target_arch = "riscv64")] |
| use starnix_uapi::AUDIT_ARCH_RISCV64; |
| |
| pub struct SeccompFilter { |
| /// The BPF program associated with this filter. |
| program: EbpfProgram<SeccompFilter>, |
| |
| /// The unique-to-this-process id of thi1s filter. SECCOMP_FILTER_FLAG_TSYNC only works if all |
| /// threads in this process have filters that are a prefix of the filters of the thread |
| /// attempting to do the TSYNC. Identical filters attached in separate seccomp calls are treated |
| /// as different from each other for this purpose, so we need a way of distinguishing them. |
| unique_id: u64, |
| |
| /// The next cookie (unique id for this syscall), as used by SECCOMP_RET_USER_NOTIF |
| cookie: AtomicU64Counter, |
| |
| // Whether to log the results of this filter |
| log: bool, |
| } |
| |
| /// The result of running a set of seccomp filters. |
| pub struct SeccompFilterResult { |
| /// The action indicated by the seccomp filter with the highest priority result. |
| action: SeccompAction, |
| |
| /// The filter that returned the highest priority result, as used by SECCOMP_RET_USER_NOTIF, |
| /// which has to have access to its cookie value |
| filter: Option<Arc<SeccompFilter>>, |
| } |
| |
| impl SeccompFilter { |
| /// Creates a SeccompFilter object from the given sock_filter. Associates the user-provided |
| /// id with it, which is intended to be unique to this process. |
| pub fn from_cbpf( |
| code: &Vec<sock_filter>, |
| maybe_unique_id: u64, |
| should_log: bool, |
| ) -> Result<Self, Errno> { |
| // If an instruction loads from / stores to an absolute address, that address has to be |
| // 32-bit aligned and inside the struct seccomp_data passed in. |
| for insn in code { |
| if (bpf_class(insn) == BPF_LD || bpf_class(insn) == BPF_ST) |
| && (bpf_addressing_mode(insn) == BPF_ABS) |
| && (insn.k & 0x3 != 0 || std::mem::size_of::<seccomp_data>() < insn.k as usize) |
| { |
| return error!(EINVAL); |
| } |
| } |
| |
| let program = convert_and_link_cbpf::<SeccompFilter>(code).map_err(|errmsg| { |
| log_warn!("{}", errmsg); |
| errno!(EINVAL) |
| })?; |
| |
| Ok(SeccompFilter { |
| program, |
| unique_id: maybe_unique_id, |
| cookie: AtomicU64Counter::new(0), |
| log: should_log, |
| }) |
| } |
| |
| pub fn run(&self, data: &seccomp_data) -> u32 { |
| self.program.run(&mut (), &SeccompData(*data)) as u32 |
| } |
| } |
| |
| // Wrapper for `seccomp_data`. Required in order to implement the `ProgramArgument` trait below. |
| #[repr(C)] |
| #[derive(Debug, Default, Clone, IntoBytes, FromBytes, KnownLayout, Immutable)] |
| pub struct SeccompData(seccomp_data); |
| |
| impl BpfProgramContext for SeccompFilter { |
| type RunContext<'a> = (); |
| type Packet<'a> = &'a SeccompData; |
| type Map = NoMap; |
| const CBPF_CONFIG: &'static CbpfConfig = &SECCOMP_CBPF_CONFIG; |
| } |
| |
| ebpf::empty_static_helper_set!(SeccompFilter); |
| |
| static SECCOMP_DATA_TYPE: LazyLock<Type> = |
| LazyLock::new(|| Type::PtrToMemory { id: MemoryId::new(), offset: 0.into(), buffer_size: 0 }); |
| |
| impl ProgramArgument for &'_ SeccompData { |
| fn get_type() -> &'static Type { |
| &*SECCOMP_DATA_TYPE |
| } |
| } |
| |
/// Ceiling on the running instruction total of all filters installed in one container; each
/// filter is charged its originally provided length plus 4.
const SECCOMP_MAX_INSNS_PER_PATH: u16 = 32_768;
| |
| /// A list of seccomp filters, intended to be associated with a specific process. |
| #[derive(Default)] |
| pub struct SeccompFilterContainer { |
| /// List of currently installed seccomp_filters; most recently added is last. |
| pub filters: Vec<Arc<SeccompFilter>>, |
| |
| // The total length of the provided seccomp filters, which cannot |
| // exceed SECCOMP_MAX_INSNS_PER_PATH - 4 * the number of filters. This is stored |
| // instead of computed because we store seccomp filters in an |
| // expanded form, and it is impossible to get the original length. |
| pub provided_instructions: u16, |
| |
| // Data needed by SECCOMP_RET_USER_NOTIF |
| pub notifier: Option<SeccompNotifierHandle>, |
| } |
| |
| impl Clone for SeccompFilterContainer { |
| fn clone(&self) -> Self { |
| if let Some(n) = &self.notifier { |
| n.lock().add_thread(); |
| } |
| SeccompFilterContainer { |
| filters: self.filters.clone(), |
| provided_instructions: self.provided_instructions, |
| notifier: self.notifier.clone(), |
| } |
| } |
| } |
| |
| impl Drop for SeccompFilterContainer { |
| fn drop(&mut self) { |
| if let Some(n) = &self.notifier { |
| // Notifier needs to send threads a HUP when there is no one left |
| // referencing it. |
| n.lock().remove_thread(); |
| } |
| } |
| } |
| |
| fn make_seccomp_data( |
| #[allow(unused_variables)] current_task: &CurrentTask, |
| syscall: &Syscall, |
| ip: u64, |
| ) -> seccomp_data { |
| #[cfg(target_arch = "x86_64")] |
| let arch_val = AUDIT_ARCH_X86_64; |
| #[cfg(target_arch = "aarch64")] |
| let arch_val = if current_task.is_arch32() { AUDIT_ARCH_ARM } else { AUDIT_ARCH_AARCH64 }; |
| #[cfg(target_arch = "riscv64")] |
| let arch_val = AUDIT_ARCH_RISCV64; |
| seccomp_data { |
| nr: syscall.decl.number as i32, |
| arch: arch_val, |
| instruction_pointer: ip, |
| args: [ |
| syscall.arg0.raw(), |
| syscall.arg1.raw(), |
| syscall.arg2.raw(), |
| syscall.arg3.raw(), |
| syscall.arg4.raw(), |
| syscall.arg5.raw(), |
| ], |
| } |
| } |
| |
| impl SeccompFilterContainer { |
| /// Ensures that this set of seccomp filters can be "synced to" the given set. |
| /// This means that our filters are a prefix of the given set of filters. |
| pub fn can_sync_to(&self, source: &SeccompFilterContainer) -> bool { |
| if source.filters.len() < self.filters.len() { |
| return false; |
| } |
| for (filter, other_filter) in self.filters.iter().zip(source.filters.iter()) { |
| if other_filter.unique_id != filter.unique_id { |
| return false; |
| } |
| } |
| true |
| } |
| |
| /// Adds the given filter to this list. The original_length parameter is the length of |
| /// the originally provided BPF (i.e., the number of sock_filter instructions), used |
| /// to ensure the total length does not exceed SECCOMP_MAX_INSNS_PER_PATH |
| pub fn add_filter( |
| &mut self, |
| filter: Arc<SeccompFilter>, |
| original_length: u16, |
| ) -> Result<(), Errno> { |
| let maybe_new_length = self.provided_instructions + original_length + 4; |
| if maybe_new_length > SECCOMP_MAX_INSNS_PER_PATH { |
| return error!(ENOMEM); |
| } |
| |
| self.provided_instructions = maybe_new_length; |
| self.filters.push(filter); |
| Ok(()) |
| } |
| |
| /// Runs all of the seccomp filters in this container, most-to-least recent. Returns the |
| /// highest priority result (which contains a reference to the filter that generated it) |
| pub fn run_all(&self, current_task: &CurrentTask, syscall: &Syscall) -> SeccompFilterResult { |
| let mut r = SeccompFilterResult { action: SeccompAction::Allow, filter: None }; |
| |
| // VDSO calls can't be caught by seccomp, so most seccomp filters forget to declare them. |
| // But our VDSO implementation is incomplete, and most of the calls forward to the actual |
| // syscalls. So seccomp should ignore them until they're implemented correctly in the VDSO. |
| #[cfg(target_arch = "x86_64")] // The set of VDSO calls is arch dependent. |
| #[allow(non_upper_case_globals)] |
| if let __NR_clock_gettime | __NR_getcpu | __NR_gettimeofday | __NR_time = |
| syscall.decl.number as u32 |
| { |
| return r; |
| } |
| #[cfg(target_arch = "aarch64")] |
| #[allow(non_upper_case_globals)] |
| if let __NR_clock_gettime | __NR_clock_getres | __NR_gettimeofday = |
| syscall.decl.number as u32 |
| { |
| return r; |
| } |
| |
| let data = make_seccomp_data( |
| current_task, |
| syscall, |
| current_task.thread_state.registers.instruction_pointer_register(), |
| ); |
| |
| // Filters are executed in reverse order of addition |
| for filter in self.filters.iter().rev() { |
| let new_result = filter.run(&data); |
| |
| let action = SeccompAction::from_u32(new_result).unwrap_or(SeccompAction::KillProcess); |
| |
| if SeccompAction::has_prio(&action, &r.action) == std::cmp::Ordering::Less { |
| r = SeccompFilterResult { action, filter: Some(filter.clone()) }; |
| } |
| } |
| r |
| } |
| |
| /// Creates a new listener for use by SECCOMP_RET_USER_NOTIF. Returns its fd. |
| pub fn create_listener( |
| locked: &mut Locked<Unlocked>, |
| current_task: &CurrentTask, |
| ) -> Result<FdNumber, Errno> { |
| // Create the `Anon` handle file before taking the write lock on the task, because |
| // `Anon::new_file()` needs to read the `current_task` SID to label the file object. |
| let the_notifier = SeccompNotifier::new(); |
| let handle = Anon::new_file( |
| locked, |
| current_task, |
| Box::new(SeccompNotifierFileObject { notifier: the_notifier.clone() }), |
| OpenFlags::RDWR, |
| "seccomp notify", |
| )?; |
| |
| // Take the write lock to check for an existing notifier, and initialize and store the new |
| // notifier otherwise. |
| let filters = &mut current_task.write().seccomp_filters; |
| if filters.notifier.is_some() { |
| return error!(EBUSY); |
| } |
| let fd = current_task.add_file(locked, handle, FdFlags::CLOEXEC)?; |
| { |
| let mut state = the_notifier.lock(); |
| state.add_thread(); |
| } |
| filters.notifier = Some(the_notifier); |
| Ok(fd) |
| } |
| } |
| |
| /// Possible values for the current status of the seccomp filters for |
| /// this process. |
| #[repr(u8)] |
| #[derive(Clone, Copy, PartialEq)] |
| pub enum SeccompStateValue { |
| None = SECCOMP_MODE_DISABLED as u8, |
| Strict = SECCOMP_MODE_STRICT as u8, |
| UserDefined = SECCOMP_MODE_FILTER as u8, |
| } |
| |
/// Per-process seccomp state that cannot live in the filter container (e.g., whether there is
/// a container at all).
#[derive(Default)]
pub struct SeccompState {
    /// The current mode, stored as the `u8` form of a `SeccompStateValue`.
    filter_state: AtomicU8,
}
| |
| impl SeccompState { |
| pub fn from(state: &SeccompState) -> SeccompState { |
| SeccompState { filter_state: AtomicU8::new(state.filter_state.load(Ordering::Acquire)) } |
| } |
| |
| fn from_u8(value: u8) -> SeccompStateValue { |
| match value { |
| v if v == SECCOMP_MODE_DISABLED as u8 => SeccompStateValue::None, |
| v if v == SECCOMP_MODE_STRICT as u8 => SeccompStateValue::Strict, |
| v if v == SECCOMP_MODE_FILTER as u8 => SeccompStateValue::UserDefined, |
| _ => unreachable!(), |
| } |
| } |
| |
| pub fn get(&self) -> SeccompStateValue { |
| Self::from_u8(self.filter_state.load(Ordering::Acquire)) |
| } |
| |
| pub fn set(&self, state: &SeccompStateValue) -> Result<(), Errno> { |
| loop { |
| let seccomp_filter_status = self.get(); |
| if seccomp_filter_status == *state { |
| return Ok(()); |
| } |
| if seccomp_filter_status != SeccompStateValue::None { |
| return error!(EINVAL); |
| } |
| |
| if self |
| .filter_state |
| .compare_exchange( |
| seccomp_filter_status as u8, |
| *state as u8, |
| Ordering::Release, |
| Ordering::Acquire, |
| ) |
| .is_ok() |
| { |
| return Ok(()); |
| } |
| } |
| } |
| |
| /// Check to see if this syscall is allowed in STRICT mode, and, if not, |
| /// send the current task a SIGKILL. |
| pub fn do_strict( |
| locked: &mut Locked<Unlocked>, |
| task: &Task, |
| syscall: &Syscall, |
| ) -> Option<Result<SyscallResult, Errno>> { |
| if syscall.decl.number as u32 != __NR_exit |
| && syscall.decl.number as u32 != __NR_read |
| && syscall.decl.number as u32 != __NR_write |
| { |
| send_standard_signal(locked, task, SignalInfo::default(SIGKILL)); |
| return Some(Err(errno_from_code!(0))); |
| } |
| None |
| } |
| |
| // This is supposed to be put in the audit log, but starnix does not yet have an |
| // audit log. Also, it does not match the Linux format. Still, the machinery |
| // is in place for when we have to support it for real. |
| fn log_action(task: &CurrentTask, syscall: &Syscall) { |
| let creds = task.current_creds(); |
| let (uid, gid) = (creds.uid, creds.gid); |
| let arch = if cfg!(target_arch = "x86_64") { |
| "x86_64" |
| } else if cfg!(target_arch = "aarch64") { |
| "aarch64" |
| } else { |
| "unknown" |
| }; |
| task.kernel().audit_logger().audit_log(AUDIT_SECCOMP as u16, || { |
| format!( |
| "uid={} gid={} pid={} comm={} syscall={} ip={} ARCH={} SYSCALL={}", |
| uid, |
| gid, |
| task.thread_group().leader, |
| task.command(), |
| syscall.decl.number, |
| task.thread_state.registers.instruction_pointer_register(), |
| arch, |
| syscall.decl.name(), |
| ) |
| }); |
| } |
| |
| /// Take the given |action| on the given |task|. The action is one of the SECCOMP_RET values |
| /// (ALLOW, LOG, KILL, KILL_PROCESS, TRAP, ERRNO, USER_NOTIF, TRACE). |task| is the thread that |
| /// invoked the syscall, and |syscall| is the syscall that was invoked. |
| /// Returns the result that the syscall will be forced to return by this |
| /// filter, or None, if the syscall should return its actual return value. |
| // NB: Allow warning below so that it is clear what we are doing on KILL_PROCESS |
| #[allow(clippy::wildcard_in_or_patterns)] |
| pub fn do_user_defined( |
| locked: &mut Locked<Unlocked>, |
| result: SeccompFilterResult, |
| current_task: &mut CurrentTask, |
| syscall: &Syscall, |
| ) -> Option<Result<SyscallResult, Errno>> { |
| let action = result.action; |
| if let Some(filter) = result.filter.as_ref() { |
| if action.is_logged(current_task.kernel(), filter.log) { |
| Self::log_action(current_task, syscall); |
| } |
| } |
| match action { |
| SeccompAction::Allow => None, |
| SeccompAction::Errno(code) => Some(Err(errno_from_code!(code as i16))), |
| SeccompAction::KillThread => { |
| let siginfo = SignalInfo::default(SIGSYS); |
| |
| let is_last_thread = current_task.thread_group().read().tasks_count() == 1; |
| let mut task_state = current_task.write(); |
| |
| if is_last_thread { |
| task_state.set_flags(TaskFlags::DUMP_ON_EXIT, true); |
| task_state.set_exit_status_if_not_already(ExitStatus::CoreDump(siginfo)); |
| } else { |
| task_state.set_exit_status_if_not_already(ExitStatus::Kill(siginfo)); |
| } |
| Some(Err(errno_from_code!(0))) |
| } |
| SeccompAction::KillProcess => { |
| current_task |
| .thread_group_exit(locked, ExitStatus::CoreDump(SignalInfo::default(SIGSYS))); |
| Some(Err(errno_from_code!(0))) |
| } |
| SeccompAction::Log => { |
| Self::log_action(current_task, syscall); |
| None |
| } |
| SeccompAction::Trace => { |
| track_stub!(TODO("https://fxbug.dev/297311898"), "ptrace seccomp support"); |
| Some(error!(ENOSYS)) |
| } |
| SeccompAction::Trap(errno) => { |
| #[cfg(target_arch = "x86_64")] |
| let arch_val = AUDIT_ARCH_X86_64; |
| #[cfg(target_arch = "aarch64")] |
| let arch_val = |
| if current_task.is_arch32() { AUDIT_ARCH_ARM } else { AUDIT_ARCH_AARCH64 }; |
| #[cfg(target_arch = "riscv64")] |
| let arch_val = AUDIT_ARCH_RISCV64; |
| |
| let siginfo = SignalInfo { |
| signal: SIGSYS, |
| errno: errno as i32, |
| code: SYS_SECCOMP as i32, |
| detail: SignalDetail::SIGSYS { |
| call_addr: current_task |
| .thread_state |
| .registers |
| .instruction_pointer_register() |
| .into(), |
| syscall: syscall.decl.number as i32, |
| arch: arch_val, |
| }, |
| force: true, |
| source: SignalSource::capture(), |
| }; |
| |
| send_standard_signal(locked, current_task, siginfo); |
| Some(Err(errno_from_code!(-(syscall.decl.number as i16)))) |
| } |
| SeccompAction::UserNotif => { |
| if let Some(notifier) = current_task.get_seccomp_notifier() { |
| let cookie = result.filter.as_ref().unwrap().cookie.next(); |
| let msg = seccomp_notif { |
| id: cookie, |
| pid: current_task.tid as u32, |
| flags: 0, |
| data: make_seccomp_data( |
| current_task, |
| syscall, |
| current_task.thread_state.registers.instruction_pointer_register(), |
| ), |
| }; |
| // First, add a pending notification, and wake up the supervisor waiting for it. |
| let waiter = Waiter::new(); |
| { |
| let mut notifier = notifier.lock(); |
| if notifier.is_closed { |
| // Someone explicitly close()d the fd with the notifier, which does not |
| // clear the thread-local notifier. Do it now. |
| drop(notifier); |
| current_task.set_seccomp_notifier(None); |
| return Some(error!(ENOSYS)); |
| } |
| notifier.create_notification(cookie, msg); |
| notifier.waiters.wait_async_value(&waiter, cookie); |
| } |
| |
| // Next, wait for a response from the supervisor |
| if let Err(e) = waiter.wait(locked, current_task) { |
| return Some(Err(e)); |
| } |
| |
| // Fetch the response. |
| let resp: Option<seccomp_notif_resp>; |
| { |
| let mut notifier = notifier.lock(); |
| resp = notifier.get_response(cookie); |
| notifier.delete_notification(cookie); |
| } |
| |
| // The response indicates what you are supposed to do with this syscall. |
| if let Some(response) = resp { |
| if response.val != 0 { |
| return Some(Ok(response.val.into())); |
| } |
| if response.error != 0 { |
| if response.error > 0 { |
| return Some(Ok(response.error.into())); |
| } else { |
| return Some(Err(errno_from_code!(-response.error as i16))); |
| } |
| } |
| if response.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE != 0 { |
| return None; |
| } |
| } |
| Some(Ok(0.into())) |
| } else { |
| Some(error!(ENOSYS)) |
| } |
| } |
| } |
| } |
| } |
| |
/// A decoded seccomp filter return value. Each variant corresponds to the SECCOMP_RET_*
/// action of the same name.
#[derive(Clone, Copy, PartialEq)]
pub enum SeccompAction {
    /// SECCOMP_RET_ALLOW.
    Allow,
    /// SECCOMP_RET_ERRNO, carrying the data bits of the return value (capped at 0xfff when
    /// decoded by `from_u32`).
    Errno(u32),
    /// SECCOMP_RET_KILL_PROCESS.
    KillProcess,
    /// SECCOMP_RET_KILL_THREAD.
    KillThread,
    /// SECCOMP_RET_LOG.
    Log,
    /// SECCOMP_RET_TRAP, carrying the data bits of the return value.
    Trap(u32),
    /// SECCOMP_RET_TRACE.
    Trace,
    /// SECCOMP_RET_USER_NOTIF.
    UserNotif,
}
| |
| impl SeccompAction { |
| pub fn is_action_available(action: u32) -> Result<SyscallResult, Errno> { |
| if SeccompAction::from_u32(action).is_none() { |
| return error!(EOPNOTSUPP); |
| } |
| Ok(0.into()) |
| } |
| |
| pub fn from_u32(action: u32) -> Option<SeccompAction> { |
| match action & !SECCOMP_RET_DATA { |
| linux_uapi::SECCOMP_RET_ALLOW => Some(Self::Allow), |
| linux_uapi::SECCOMP_RET_ERRNO => { |
| let mut action = action & SECCOMP_RET_DATA; |
| // Linux kernel compatibility: if errno exceeds 0xfff, it is capped at 0xfff. |
| action = std::cmp::min(action & 0xffff, 0xfff); |
| Some(Self::Errno(action)) |
| } |
| linux_uapi::SECCOMP_RET_KILL_PROCESS => Some(Self::KillProcess), |
| linux_uapi::SECCOMP_RET_KILL_THREAD => Some(Self::KillThread), |
| linux_uapi::SECCOMP_RET_LOG => Some(Self::Log), |
| linux_uapi::SECCOMP_RET_TRACE => Some(Self::Trace), |
| linux_uapi::SECCOMP_RET_TRAP => Some(Self::Trap(action & SECCOMP_RET_DATA)), |
| |
| linux_uapi::SECCOMP_RET_USER_NOTIF => Some(Self::UserNotif), |
| _ => None, |
| } |
| } |
| |
| pub fn to_isize(self) -> isize { |
| match self { |
| Self::Allow => linux_uapi::SECCOMP_RET_ALLOW as isize, |
| Self::Errno(x) => (linux_uapi::SECCOMP_RET_ERRNO | x) as isize, |
| Self::KillProcess => linux_uapi::SECCOMP_RET_KILL_PROCESS as isize, |
| Self::KillThread => linux_uapi::SECCOMP_RET_KILL_THREAD as isize, |
| Self::Log => linux_uapi::SECCOMP_RET_LOG as isize, |
| Self::Trace => linux_uapi::SECCOMP_RET_TRACE as isize, |
| Self::Trap(x) => (linux_uapi::SECCOMP_RET_TRAP | x) as isize, |
| Self::UserNotif => linux_uapi::SECCOMP_RET_USER_NOTIF as isize, |
| } |
| } |
| |
| pub fn canonical_name(self) -> &'static str { |
| match self { |
| Self::Allow => &"allow", |
| Self::Errno(_) => &"errno", |
| Self::KillProcess => &"kill_process", |
| Self::KillThread => &"kill_thread", |
| Self::Log => &"log", |
| Self::Trace => &"trace", |
| Self::Trap(_) => &"trap", |
| Self::UserNotif => &"user_notif", |
| } |
| } |
| |
| pub fn has_prio(a: &SeccompAction, b: &SeccompAction) -> std::cmp::Ordering { |
| let anum = a.to_isize() as i32; |
| let bnum = b.to_isize() as i32; |
| let fullnum = SECCOMP_RET_ACTION_FULL as i32; |
| let aval = anum & fullnum; |
| let bval = bnum & fullnum; |
| aval.cmp(&bval) |
| } |
| |
| /// Returns a vector of all available actions, sorted by priority. |
| pub fn all_actions() -> Vec<SeccompAction> { |
| let mut result = vec![ |
| Self::Allow, |
| Self::Errno(0), |
| Self::KillProcess, |
| Self::KillThread, |
| Self::Log, |
| Self::Trace, |
| Self::Trap(0), |
| Self::UserNotif, |
| ]; |
| |
| result.sort_by(Self::has_prio); |
| result |
| } |
| |
| /// Gets the contents of /proc/sys/kernel/seccomp/actions_avail |
| pub fn get_actions_avail_file() -> Vec<u8> { |
| let all_actions = Self::all_actions(); |
| if all_actions.len() == 0 { |
| return vec![]; |
| } |
| let mut result = String::from(all_actions[0].canonical_name()); |
| for i in 1..all_actions.len() { |
| result.push_str(" "); |
| result.push_str(all_actions[i].canonical_name()); |
| } |
| result.push('\n'); |
| result.into_bytes() |
| } |
| |
| fn logged_bit_offset(&self) -> u32 { |
| match self { |
| Self::Allow => 1, |
| Self::Errno(_) => 2, |
| Self::KillProcess => 3, |
| Self::KillThread => 4, |
| Self::Log => 5, |
| Self::Trace => 6, |
| Self::Trap(_) => 7, |
| Self::UserNotif => 8, |
| } |
| } |
| |
| fn set_logged_bit(&self, dst: &mut u16) { |
| *dst |= 1 << self.logged_bit_offset(); |
| } |
| |
| pub fn is_logged(&self, kernel: &Kernel, filter_flag: bool) -> bool { |
| if kernel.actions_logged.load(Ordering::Relaxed) & (1 << self.logged_bit_offset()) != 0 { |
| match self { |
| // Per the documentation on audit logging of seccomp actions in |
| // seccomp(2), just because it is listed as logged, that doesn't |
| // mean we actually log it. |
| |
| // If it is KILL_PROCESS or KILL_THREAD, return true |
| Self::KillProcess | Self::KillThread => true, |
| // If it is one of these and the filter flag was set, return true. |
| Self::Errno(_) | Self::Log | Self::Trap(_) | Self::UserNotif => filter_flag, |
| // Never log ALLOW |
| _ => false, |
| } |
| } else { |
| false |
| } |
| } |
| |
| pub fn set_actions_logged(kernel: &Kernel, data: &[u8]) -> Result<(), Errno> { |
| let mut new_actions_logged: u16 = 0; |
| for action_res in data.fields_with(|c| c.is_ascii_whitespace()) { |
| if let Ok(action) = action_res.to_str() { |
| match action { |
| "errno" => Self::Errno(0).set_logged_bit(&mut new_actions_logged), |
| "kill_process" => Self::KillProcess.set_logged_bit(&mut new_actions_logged), |
| "kill_thread" => Self::KillThread.set_logged_bit(&mut new_actions_logged), |
| "log" => Self::Log.set_logged_bit(&mut new_actions_logged), |
| "trace" => Self::Trace.set_logged_bit(&mut new_actions_logged), |
| "trap" => Self::Trap(0).set_logged_bit(&mut new_actions_logged), |
| "user_notif" => Self::UserNotif.set_logged_bit(&mut new_actions_logged), |
| // Not allowed to write anything other than the approved actions to that list. |
| _ => return error!(EINVAL), |
| } |
| } else { |
| return error!(EINVAL); |
| } |
| } |
| kernel.actions_logged.store(new_actions_logged, Ordering::Relaxed); |
| Ok(()) |
| } |
| |
| pub fn get_actions_logged(kernel: &Kernel) -> Vec<u8> { |
| let al = kernel.actions_logged.load(Ordering::Relaxed); |
| let mut result: String = "".to_string(); |
| for action in Self::all_actions() { |
| if (al & (1 << action.logged_bit_offset())) != 0 { |
| result.push_str(action.canonical_name()); |
| result.push(' '); |
| } |
| } |
| if !result.is_empty() { |
| // remove trailing whitespace. |
| result.pop(); |
| } |
| |
| result.into_bytes() |
| } |
| } |
| |
| /// This struct contains data that needs to be shuttled back and forth between the thread doing |
| /// a USER_NOTIF and the supervisor thread responding to it. |
| #[derive(Default)] |
| struct SeccompNotification { |
| /// notif is the notification set by the filter. When this is set, the associated fd will |
| /// be set to POLLIN. |
| notif: seccomp_notif, |
| |
| /// Consumed indicates whether a supervisor process has read this notification (and so it |
| /// can no longer be consumed by any other SECCOMP_IOCTL_NOTIF_RECV ioctl). When the notif |
| /// is consumed, the associated fd will be set to POLLOUT, indicating that it is ready to |
| /// receive a response. |
| consumed: bool, |
| |
| /// resp is the response that the supervisor sends. When this is set, an event will be sent |
| /// to SeccompNotifiers::waiters corresponding to the unique id of the notification. This |
| /// will wake up the filter that is waiting for this particular response. |
| resp: Option<seccomp_notif_resp>, |
| } |
| |
| impl SeccompNotification { |
| fn new(data: seccomp_notif) -> SeccompNotification { |
| SeccompNotification { notif: data, resp: None, consumed: false } |
| } |
| } |
| |
| /// The underlying implementation of the file descriptor that connects a process that triggers a |
| /// SECCOMP_RET_USER_NOTIF with the monitoring process. This support seccomp's ability to notify a |
| /// user-space process on specific syscall triggers. See seccomp_unotify(2) for the semantics. |
| pub struct SeccompNotifier { |
| waiters: WaitQueue, |
| |
| pending_notifications: HashMap<u64, SeccompNotification>, |
| |
| // This keeps track of the number of threads using this notifier as a filter. If that hits |
| // zero, the listeners need to receive a HUP. |
| num_active_threads: u64, |
| |
| // notifiers are referenced both by fds and in SeccompFilterContainer. If the file no longer |
| // has fds referring to it, it will be closed, and the SeccompFilterContainers should stop |
| // using it. |
| pub is_closed: bool, |
| } |
| |
| pub type SeccompNotifierHandle = Arc<Mutex<SeccompNotifier>>; |
| |
| impl SeccompNotifier { |
| pub fn new() -> SeccompNotifierHandle { |
| Arc::new(Mutex::new(SeccompNotifier { |
| waiters: WaitQueue::default(), |
| pending_notifications: HashMap::default(), |
| num_active_threads: 0, |
| is_closed: false, |
| })) |
| } |
| |
| fn add_thread(&mut self) { |
| self.num_active_threads += 1; |
| } |
| |
| fn remove_thread(&mut self) { |
| self.num_active_threads -= 1; |
| if self.num_active_threads == 0 { |
| self.waiters.notify_fd_events(FdEvents::POLLHUP); |
| } |
| } |
| |
| // Creates a pending notification for communication between the |
| // target thread and a supervisor, and notifies readers there is |
| // an opportunity to read. |
| fn create_notification(&mut self, cookie: u64, notif: seccomp_notif) { |
| self.pending_notifications.insert(cookie, SeccompNotification::new(notif)); |
| self.waiters.notify_fd_events(FdEvents::POLLIN | FdEvents::POLLRDNORM); |
| } |
| |
| // Gets a notification that needs to be handled by a supervisor, |
| // and notifies waiters that there is an opportunity to write. |
| fn consume_some_notification(&mut self) -> Option<seccomp_notif> { |
| for (_, notif) in self.pending_notifications.iter_mut() { |
| if !notif.consumed { |
| notif.consumed = true; |
| self.waiters.notify_fd_events(FdEvents::POLLOUT | FdEvents::POLLWRNORM); |
| return Some(notif.notif); |
| } |
| } |
| None |
| } |
| |
| // In case something goes wrong after we consume the notification. |
| fn unconsume(&mut self, cookie: u64) { |
| if let Some(n) = self.pending_notifications.get_mut(&cookie).as_mut() { |
| n.consumed = false; |
| } |
| } |
| |
| // Returns the appropriate notifications if someone is waiting with poll/epoll/select. |
| fn get_fd_notifications(&self) -> FdEvents { |
| let mut events = FdEvents::empty(); |
| |
| for (_, notification) in self.pending_notifications.iter() { |
| if !notification.consumed { |
| events |= FdEvents::POLLIN | FdEvents::POLLRDNORM; |
| } else if notification.resp.is_none() { |
| events |= FdEvents::POLLOUT | FdEvents::POLLWRNORM; |
| } |
| } |
| |
| if self.num_active_threads == 0 { |
| events |= FdEvents::POLLHUP; |
| } |
| events |
| } |
| |
| // Sets the value read by the target in response to this notification. Intended for use by the |
| // supervisor. Notifies the filter there is a response to this request. |
| fn set_response(&mut self, cookie: u64, resp: seccomp_notif_resp) -> Option<Errno> { |
| if let Some(entry) = self.pending_notifications.get_mut(&cookie) { |
| if entry.resp.is_some() { |
| return Some(errno!(EINPROGRESS)); |
| } |
| entry.resp = Some(resp); |
| self.waiters.notify_value(resp.id); |
| None |
| } else { |
| Some(errno!(EINVAL)) |
| } |
| } |
| |
| // Gets the value set by the supervisor for the target to read. |
| fn get_response(&self, cookie: u64) -> Option<seccomp_notif_resp> { |
| if let Some(value) = self.pending_notifications.get(&cookie) { |
| return value.resp; |
| } |
| None |
| } |
| |
| // Returns whether the cookie represents an active notification. |
| fn notification_pending(&self, cookie: u64) -> bool { |
| self.pending_notifications.contains_key(&cookie) |
| } |
| |
| // Deletes the notification, when the target is done processing it. |
| fn delete_notification(&mut self, cookie: u64) { |
| let _ = self.pending_notifications.remove(&cookie); |
| } |
| } |
| |
| struct SeccompNotifierFileObject { |
| notifier: SeccompNotifierHandle, |
| } |
| |
| impl FileOps for SeccompNotifierFileObject { |
| fileops_impl_nonseekable!(); |
| fileops_impl_noop_sync!(); |
| |
| fn close( |
| self: Box<Self>, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObjectState, |
| _current_task: &CurrentTask, |
| ) { |
| let mut state = self.notifier.lock(); |
| |
| for (cookie, notification) in state.pending_notifications.iter() { |
| if !notification.consumed { |
| state.waiters.notify_value(*cookie); |
| state.waiters.notify_fd_events(FdEvents::POLLIN | FdEvents::POLLRDNORM); |
| } else if notification.resp.is_none() { |
| state.waiters.notify_fd_events(FdEvents::POLLOUT | FdEvents::POLLWRNORM); |
| } |
| } |
| state.waiters.notify_fd_events(FdEvents::POLLHUP); |
| |
| state.pending_notifications.clear(); |
| |
| state.is_closed = true; |
| } |
| |
| fn read( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| _current_task: &CurrentTask, |
| _offset: usize, |
| _usize: &mut dyn OutputBuffer, |
| ) -> Result<usize, Errno> { |
| error!(EINVAL) |
| } |
| |
| fn write( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| _current_task: &CurrentTask, |
| _offset: usize, |
| _buffer: &mut dyn InputBuffer, |
| ) -> Result<usize, Errno> { |
| error!(EINVAL) |
| } |
| |
| fn ioctl( |
| &self, |
| locked: &mut Locked<Unlocked>, |
| _file: &FileObject, |
| current_task: &CurrentTask, |
| request: u32, |
| arg: SyscallArg, |
| ) -> Result<SyscallResult, Errno> { |
| let user_addr = UserAddress::from(arg); |
| match request { |
| SECCOMP_IOCTL_NOTIF_RECV => { |
| if let Ok(notif) = |
| current_task.read_memory_to_vec(user_addr, std::mem::size_of::<seccomp_notif>()) |
| { |
| for value in notif.iter() { |
| if *value != 0 { |
| return error!(EINVAL); |
| } |
| } |
| } |
| // A RECV reads a notification, optionally waiting for one to become available. |
| let mut notif: Option<seccomp_notif>; |
| loop { |
| // Grab a notification or wait for one to become readable. |
| let waiter = Waiter::new(); |
| { |
| let mut notifier = self.notifier.lock(); |
| notif = notifier.consume_some_notification(); |
| if notif.is_some() { |
| break; |
| } |
| notifier.waiters.wait_async_fd_events( |
| &waiter, |
| FdEvents::POLLIN | FdEvents::POLLHUP, |
| EventHandler::None, |
| ); |
| } |
| waiter.wait(locked, current_task)?; |
| } |
| if let Some(notif) = notif { |
| if let Err(e) = |
| current_task.write_object(UserRef::<seccomp_notif>::new(user_addr), ¬if) |
| { |
| self.notifier.lock().unconsume(notif.id); |
| return Err(e); |
| } |
| } |
| |
| Ok(0.into()) |
| } |
| SECCOMP_IOCTL_NOTIF_SEND => { |
| // A SEND sends a response to a previously received notification. |
| let resp: seccomp_notif_resp = current_task.read_object(UserRef::new(user_addr))?; |
| if resp.flags & !SECCOMP_USER_NOTIF_FLAG_CONTINUE != 0 { |
| return error!(EINVAL); |
| } |
| if resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE != 0 |
| && (resp.error != 0 || resp.val != 0) |
| { |
| return error!(EINVAL); |
| } |
| { |
| let mut notifier = self.notifier.lock(); |
| if let Some(err) = notifier.set_response(resp.id, resp) { |
| return Err(err); |
| } |
| } |
| Ok(0.into()) |
| } |
| SECCOMP_IOCTL_NOTIF_ID_VALID => { |
| // An ID_VALID indicates that the notification is still in progress. |
| let cookie: u64 = current_task.read_object(UserRef::new(user_addr))?; |
| { |
| let notifier = self.notifier.lock(); |
| if notifier.notification_pending(cookie) { |
| Ok(0.into()) |
| } else { |
| error!(ENOENT) |
| } |
| } |
| } |
| SECCOMP_IOCTL_NOTIF_ADDFD => error!(EINVAL), |
| _ => error!(EINVAL), |
| } |
| } |
| |
| fn wait_async( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| _current_task: &CurrentTask, |
| waiter: &Waiter, |
| events: FdEvents, |
| handler: EventHandler, |
| ) -> Option<WaitCanceler> { |
| let notifier = self.notifier.lock(); |
| Some(notifier.waiters.wait_async_fd_events(waiter, events, handler)) |
| } |
| |
| fn query_events( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| _current_task: &CurrentTask, |
| ) -> Result<FdEvents, Errno> { |
| Ok(self.notifier.lock().get_fd_notifications()) |
| } |
| } |
| |
#[cfg(test)]
mod test {
    use crate::task::SeccompAction;
    use crate::testing::spawn_kernel_and_run;

    /// Writing the full list of available actions to `actions_logged` must fail, while the
    /// same list with "allow" removed must be accepted.
    #[::fuchsia::test]
    async fn test_actions_logged_accepts_legal_string() {
        spawn_kernel_and_run(async |_, current_task| {
            // Written as a Rust test rather than a syscall test so that the global config is
            // not changed by a test.
            let kernel = current_task.kernel();
            let mut avail = SeccompAction::get_actions_avail_file();

            let rejected = SeccompAction::set_actions_logged(&kernel, &avail[..]);
            assert!(rejected.is_err(), "Should not be able to write allow to actions_logged file");

            // Cut the first occurrence of "allow" out of the action list, if there is one.
            let avail_text = std::string::String::from_utf8(avail.clone()).unwrap();
            if let Some(start) = avail_text.find("allow") {
                let end = start + "allow".len();
                avail.drain(start..end);
            }

            let write_result = SeccompAction::set_actions_logged(&kernel, &avail[..]);
            assert!(
                write_result.is_ok(),
                "Could not write legal string \"{}\" to actions_logged file: error {}",
                std::string::String::from_utf8(avail.clone()).unwrap(),
                write_result.unwrap_err()
            );
        })
        .await;
    }
}