blob: ae0d95a8a5b2dd6dfee9a2742fd14c05a70ab8ca [file] [log] [blame] [edit]
// Copyright 2022 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//! Implementation of (e)BPF.
//!
//! BPF stands for Berkeley Packet Filter and is an API introduced in BSD that allows filtering
//! network packets by running little programs in the kernel. eBPF stands for extended BFP and
//! is a Linux extension of BPF that allows hooking BPF programs into many different
//! non-networking-related contexts.
// TODO(https://github.com/rust-lang/rust/issues/39371): remove
#![allow(non_upper_case_globals)]
use starnix_lock::{declare_lock_levels, OrderedMutex};
use std::{collections::BTreeMap, ops::Bound, sync::Arc};
use zerocopy::{AsBytes, FromBytes};
use crate::{
auth::*,
fs::{
buffers::{InputBuffer, OutputBuffer},
*,
},
lock_ordering::*,
mm::{MemoryAccessor, MemoryAccessorExt},
syscalls::*,
task::Kernel,
types::{as_any::AsAny, *},
};
declare_lock_levels![BpfMapEntries: OrderedMutex<BTreeMap<Vec<u8>, Vec<u8>>>];
use self::lock_levels::*;
/// The default selinux context to use for each BPF object.
const DEFAULT_BPF_SELINUX_CONTEXT: &FsStr = b"u:object_r:fs_bpf:s0";
trait BpfObject: Send + Sync + AsAny + 'static {}
/// A reference to a BPF object that can be stored in either an FD or an entry in the /sys/fs/bpf
/// filesystem.
#[derive(Clone)]
struct BpfHandle(Arc<dyn BpfObject>);
impl FileOps for BpfHandle {
fileops_impl_nonseekable!();
fn read(
&self,
_file: &FileObject,
_current_task: &crate::task::CurrentTask,
offset: usize,
_data: &mut dyn OutputBuffer,
) -> Result<usize, Errno> {
debug_assert!(offset == 0);
error!(EINVAL) // TODO
}
fn write(
&self,
_file: &FileObject,
_current_task: &crate::task::CurrentTask,
offset: usize,
_data: &mut dyn InputBuffer,
) -> Result<usize, Errno> {
debug_assert!(offset == 0);
error!(EINVAL) // TODO
}
}
impl BpfHandle {
fn new(obj: impl BpfObject) -> Self {
Self(Arc::new(obj))
}
fn downcast<T: BpfObject>(&self) -> Option<&T> {
(*self.0).as_any().downcast_ref::<T>()
}
}
/// A BPF Type Format object, often referred to as BTF. See
/// https://www.kernel.org/doc/html/latest/bpf/btf.html
struct BpfTypeFormat {
#[allow(dead_code)]
data: Vec<u8>,
}
impl BpfObject for BpfTypeFormat {}
/// A BPF map. This is a hashtable that can be accessed both by BPF programs and userspace.
struct Map {
map_type: bpf_map_type,
key_size: u32,
value_size: u32,
max_entries: u32,
flags: u32,
// TODO(tbodt): Linux actually has 30 different implementations of a BPF map, from hashmap to
// array to bloom filter. BTreeMap is probably the correct semantics for none of them. This
// will ultimately need to be a trait object.
entries: OrderedMutex<BTreeMap<Vec<u8>, Vec<u8>>, BpfMapEntries>,
}
impl BpfObject for Map {}
/// A BPF program. Currently empty because none of the state of a program actually matters to us
/// yet.
struct Program;
impl BpfObject for Program {}
/// Read the arguments for a BPF command. The ABI works like this: If the arguments struct
/// passed is larger than the kernel knows about, the excess must be zeros. Similarly, if the
/// arguments struct is smaller than the kernel knows about, the kernel fills the excess with
/// zero.
fn read_attr<Attr: FromBytes>(
current_task: &CurrentTask,
attr_addr: UserAddress,
attr_size: u32,
) -> Result<Attr, Errno> {
let attr_size = attr_size as usize;
let sizeof_attr = std::mem::size_of::<Attr>();
// Verify that the extra is all zeros.
if attr_size > sizeof_attr {
let tail =
current_task.read_memory_to_vec(attr_addr + sizeof_attr, attr_size - sizeof_attr)?;
for byte in tail {
if byte != 0 {
return error!(E2BIG);
}
}
}
// If the struct passed is smaller than our definition of the struct, let whatever is not
// passed be zero.
let mut attr = Attr::new_zeroed();
// SAFETY: attr is FromBytes, meaning it is safe to write any bit pattern to its storage. (The
// unsafe slice construction is necessary because it's not necessarily safe to read from its
// storage directly.)
current_task.read_memory_to_slice(attr_addr, unsafe {
std::slice::from_raw_parts_mut(&mut attr as *mut Attr as *mut u8, sizeof_attr)
})?;
Ok(attr)
}
fn install_bpf_fd(current_task: &CurrentTask, obj: impl BpfObject) -> Result<SyscallResult, Errno> {
install_bpf_handle_fd(current_task, BpfHandle::new(obj))
}
fn install_bpf_handle_fd(
current_task: &CurrentTask,
handle: BpfHandle,
) -> Result<SyscallResult, Errno> {
// All BPF FDs have the CLOEXEC flag turned on by default.
let file = Anon::new_file(current_task, Box::new(handle), OpenFlags::CLOEXEC);
Ok(current_task.add_file(file, FdFlags::CLOEXEC)?.into())
}
fn get_bpf_fd(current_task: &CurrentTask, fd: u32) -> Result<BpfHandle, Errno> {
Ok(current_task
.files
.get(FdNumber::from_raw(fd as i32))?
.downcast_file::<BpfHandle>()
.ok_or_else(|| errno!(EBADF))?
.clone())
}
// TODO(b/278731253): Returns custom selinux context for everything under /sys/fs/bpf/net_shared/*.
// Otherwise, returns default selinux context for BPF objects.
fn get_selinux_context(path: &FsStr) -> FsString {
if String::from_utf8_lossy(path).contains("net_shared") {
b"u:object_r:fs_bpf_net_shared:s0".to_vec()
} else {
DEFAULT_BPF_SELINUX_CONTEXT.to_vec()
}
}
pub fn sys_bpf(
locked: &mut Locked<'_, Unlocked>,
current_task: &CurrentTask,
cmd: bpf_cmd,
attr_addr: UserAddress,
attr_size: u32,
) -> Result<SyscallResult, Errno> {
// TODO(security): Implement the actual security semantics of BPF. This is commented out
// because Android calls bpf from unprivileged processes.
// if !current_task.creds().has_capability(CAP_SYS_ADMIN) {
// return error!(EPERM);
// }
// The best available documentation on the various BPF commands is at
// https://www.kernel.org/doc/html/latest/userspace-api/ebpf/syscall.html.
// Comments on commands are copied from there.
match cmd {
// Create a map and return a file descriptor that refers to the map.
bpf_cmd_BPF_MAP_CREATE => {
let map_attr: bpf_attr__bindgen_ty_1 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_MAP_CREATE {:?}", map_attr);
let mut map = Map {
map_type: map_attr.map_type,
key_size: map_attr.key_size,
value_size: map_attr.value_size,
max_entries: map_attr.max_entries,
flags: map_attr.map_flags,
entries: Default::default(),
};
// To quote
// https://cs.android.com/android/platform/superproject/+/master:system/bpf/libbpf_android/Loader.cpp;l=670;drc=28e295395471b33e662b7116378d15f1e88f0864
// "DEVMAPs are readonly from the bpf program side's point of view, as such the kernel
// in kernel/bpf/devmap.c dev_map_init_map() will set the flag"
if map.map_type == bpf_map_type_BPF_MAP_TYPE_DEVMAP
|| map.map_type == bpf_map_type_BPF_MAP_TYPE_DEVMAP_HASH
{
map.flags |= BPF_F_RDONLY_PROG;
}
install_bpf_fd(current_task, map)
}
// Create or update an element (key/value pair) in a specified map.
bpf_cmd_BPF_MAP_UPDATE_ELEM => {
let elem_attr: bpf_attr__bindgen_ty_2 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_MAP_UPDATE_ELEM");
let map = get_bpf_fd(current_task, elem_attr.map_fd)?;
let map = map.downcast::<Map>().ok_or_else(|| errno!(EINVAL))?;
let key = current_task
.mm
.read_memory_to_vec(UserAddress::from(elem_attr.key), map.key_size as usize)?;
// SAFETY: this union object was created with FromBytes so it's safe to access any
// variant because all variants must be valid with all bit patterns.
let value_addr = unsafe { elem_attr.__bindgen_anon_1.value };
let value = current_task
.mm
.read_memory_to_vec(UserAddress::from(value_addr), map.value_size as usize)?;
map.entries.lock(locked).insert(key, value);
Ok(SUCCESS)
}
// Look up an element by key in a specified map and return the key of the next element. Can
// be used to iterate over all elements in the map.
bpf_cmd_BPF_MAP_GET_NEXT_KEY => {
let elem_attr: bpf_attr__bindgen_ty_2 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_MAP_GET_NEXT_KEY");
let map = get_bpf_fd(current_task, elem_attr.map_fd)?;
let map = map.downcast::<Map>().ok_or_else(|| errno!(EINVAL))?;
let key = if elem_attr.key != 0 {
let key = current_task
.mm
.read_memory_to_vec(UserAddress::from(elem_attr.key), map.key_size as usize)?;
Some(key)
} else {
None
};
let entries = map.entries.lock(locked);
let next_entry = match key {
Some(key) if entries.contains_key(&key) => {
entries.range((Bound::Excluded(key), Bound::Unbounded)).next()
}
_ => entries.iter().next(),
};
let (next_key, _next_value) = next_entry.ok_or_else(|| errno!(ENOENT))?;
// SAFETY: this union object was created with FromBytes so it's safe to access any
// variant (right?)
let next_key_addr = unsafe { elem_attr.__bindgen_anon_1.next_key };
current_task.write_memory(UserAddress::from(next_key_addr), next_key)?;
Ok(SUCCESS)
}
// Verify and load an eBPF program, returning a new file descriptor associated with the
// program.
bpf_cmd_BPF_PROG_LOAD => {
let _prog_attr: bpf_attr__bindgen_ty_4 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_PROG_LOAD");
// Just pretend to load the program. We certainly can't execute it.
install_bpf_fd(current_task, Program)
}
// Attach an eBPF program to a target_fd at the specified attach_type hook.
bpf_cmd_BPF_PROG_ATTACH => {
log_trace!("BPF_PROG_ATTACH");
not_implemented!("Bpf::BPF_PROG_ATTACH is stubbed");
Ok(SUCCESS)
}
// Obtain information about eBPF programs associated with the specified attach_type hook.
bpf_cmd_BPF_PROG_QUERY => {
let mut prog_attr: bpf_attr__bindgen_ty_10 =
read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_PROG_QUERY");
not_implemented!("Bpf::BPF_PROG_QUERY is stubbed");
current_task.write_memory(UserAddress::from(prog_attr.prog_ids), 1.as_bytes())?;
prog_attr.prog_cnt = std::mem::size_of::<u64>() as u32;
current_task.write_memory(attr_addr, prog_attr.as_bytes())?;
Ok(SUCCESS)
}
// Pin an eBPF program or map referred by the specified bpf_fd to the provided pathname on
// the filesystem.
bpf_cmd_BPF_OBJ_PIN => {
let pin_attr: bpf_attr__bindgen_ty_5 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_OBJ_PIN {:?}", pin_attr);
let object = get_bpf_fd(current_task, pin_attr.bpf_fd)?;
let path_addr = UserCString::new(UserAddress::from(pin_attr.pathname));
let pathname = current_task.read_c_string_to_vec(path_addr, PATH_MAX as usize)?;
let (parent, basename) = current_task.lookup_parent_at(
&mut LookupContext::default(),
FdNumber::AT_FDCWD,
&pathname,
)?;
let bpf_dir =
parent.entry.node.downcast_ops::<BpfFsDir>().ok_or_else(|| errno!(EINVAL))?;
let selinux_context = get_selinux_context(&pathname);
bpf_dir.register_pin(current_task, &parent, basename, object, &selinux_context)?;
Ok(SUCCESS)
}
// Open a file descriptor for the eBPF object pinned to the specified pathname.
bpf_cmd_BPF_OBJ_GET => {
let path_attr: bpf_attr__bindgen_ty_5 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_OBJ_GET {:?}", path_attr);
let path_addr = UserCString::new(UserAddress::from(path_attr.pathname));
let pathname = current_task.read_c_string_to_vec(path_addr, PATH_MAX as usize)?;
let node = current_task.lookup_path_from_root(&pathname)?;
// TODO(tbodt): This might be the wrong error code, write a test program to find out
let node =
node.entry.node.downcast_ops::<BpfFsObject>().ok_or_else(|| errno!(EINVAL))?;
install_bpf_handle_fd(current_task, node.handle.clone())
}
// Obtain information about the eBPF object corresponding to bpf_fd.
bpf_cmd_BPF_OBJ_GET_INFO_BY_FD => {
let mut get_info_attr: bpf_attr__bindgen_ty_9 =
read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_OBJ_GET_INFO_BY_FD {:?}", get_info_attr);
let fd = get_bpf_fd(current_task, get_info_attr.bpf_fd)?;
let mut info = if let Some(map) = fd.downcast::<Map>() {
bpf_map_info {
type_: map.map_type,
id: 0, // not used by android as far as I can tell
key_size: map.key_size,
value_size: map.value_size,
max_entries: map.max_entries,
map_flags: map.flags,
..Default::default()
}
.as_bytes()
.to_owned()
} else if let Some(_prog) = fd.downcast::<Program>() {
#[allow(unknown_lints, clippy::unnecessary_struct_initialization)]
bpf_prog_info {
// Doesn't matter yet
..Default::default()
}
.as_bytes()
.to_owned()
} else {
return error!(EINVAL);
};
// If info_len is larger than info, write out the full length of info and write the
// smaller size into info_len. If info_len is smaller, truncate info.
// TODO(tbodt): This is just a guess for the behavior. Works with BpfSyscallWrappers.h,
// but could be wrong.
info.truncate(get_info_attr.info_len as usize);
get_info_attr.info_len = info.len() as u32;
current_task.write_memory(UserAddress::from(get_info_attr.info), &info)?;
current_task.write_memory(attr_addr, get_info_attr.as_bytes())?;
Ok(SUCCESS)
}
// Verify and load BPF Type Format (BTF) metadata into the kernel, returning a new file
// descriptor associated with the metadata. BTF is described in more detail at
// https://www.kernel.org/doc/html/latest/bpf/btf.html.
bpf_cmd_BPF_BTF_LOAD => {
let btf_attr: bpf_attr__bindgen_ty_12 = read_attr(current_task, attr_addr, attr_size)?;
log_trace!("BPF_BTF_LOAD {:?}", btf_attr);
let data = current_task
.mm
.read_memory_to_vec(UserAddress::from(btf_attr.btf), btf_attr.btf_size as usize)?;
install_bpf_fd(current_task, BpfTypeFormat { data })
}
_ => {
not_implemented!("bpf command {}", cmd);
error!(EINVAL)
}
}
}
pub struct BpfFs;
impl BpfFs {
pub fn new_fs(
kernel: &Arc<Kernel>,
options: FileSystemOptions,
) -> Result<FileSystemHandle, Errno> {
let fs = FileSystem::new(kernel, CacheMode::Permanent, BpfFs, options);
let node =
FsNode::new_root_with_properties(BpfFsDir::new(DEFAULT_BPF_SELINUX_CONTEXT), |info| {
info.mode |= FileMode::ISVTX;
});
fs.set_root_node(node);
Ok(fs)
}
}
impl FileSystemOps for BpfFs {
fn statfs(&self, _fs: &FileSystem, _current_task: &CurrentTask) -> Result<statfs, Errno> {
Ok(statfs::default(BPF_FS_MAGIC))
}
fn name(&self) -> &'static FsStr {
b"bpf"
}
fn rename(
&self,
_fs: &FileSystem,
_current_task: &CurrentTask,
_old_parent: &FsNodeHandle,
_old_name: &FsStr,
_new_parent: &FsNodeHandle,
_new_name: &FsStr,
_renamed: &FsNodeHandle,
_replaced: Option<&FsNodeHandle>,
) -> Result<(), Errno> {
Ok(())
}
}
struct BpfFsDir {
xattrs: MemoryXattrStorage,
}
impl BpfFsDir {
fn new(selinux_context: &FsStr) -> Self {
let xattrs = MemoryXattrStorage::default();
xattrs
.set_xattr(b"security.selinux", selinux_context, XattrOp::Create)
.expect("Failed to set selinux context.");
Self { xattrs }
}
fn register_pin(
&self,
current_task: &CurrentTask,
node: &NamespaceNode,
name: &FsStr,
object: BpfHandle,
selinux_context: &FsStr,
) -> Result<(), Errno> {
node.entry.create_entry(current_task, &node.mount, name, |dir, _mount, _name| {
Ok(dir.fs().create_node(
BpfFsObject::new(object, &selinux_context),
FsNodeInfo::new_factory(mode!(IFREG, 0o600), FsCred::root()),
))
})?;
Ok(())
}
}
impl FsNodeOps for BpfFsDir {
fs_node_impl_xattr_delegate!(self, self.xattrs);
fn create_file_ops(
&self,
_node: &FsNode,
_current_task: &CurrentTask,
_flags: OpenFlags,
) -> Result<Box<dyn FileOps>, Errno> {
Ok(Box::new(MemoryDirectoryFile::new()))
}
fn mkdir(
&self,
node: &FsNode,
_current_task: &CurrentTask,
name: &FsStr,
mode: FileMode,
owner: FsCred,
) -> Result<FsNodeHandle, Errno> {
let selinux_context = get_selinux_context(name);
Ok(node.fs().create_node(
BpfFsDir::new(&selinux_context),
FsNodeInfo::new_factory(mode | FileMode::ISVTX, owner),
))
}
fn mknod(
&self,
_node: &FsNode,
_current_task: &CurrentTask,
_name: &FsStr,
_mode: FileMode,
_dev: DeviceType,
_owner: FsCred,
) -> Result<FsNodeHandle, Errno> {
error!(EPERM)
}
fn create_symlink(
&self,
_node: &FsNode,
_current_task: &CurrentTask,
_name: &FsStr,
_target: &FsStr,
_owner: FsCred,
) -> Result<FsNodeHandle, Errno> {
error!(EPERM)
}
fn link(
&self,
_node: &FsNode,
_current_task: &CurrentTask,
_name: &FsStr,
_child: &FsNodeHandle,
) -> Result<(), Errno> {
Ok(())
}
fn unlink(
&self,
_node: &FsNode,
_current_task: &CurrentTask,
_name: &FsStr,
_child: &FsNodeHandle,
) -> Result<(), Errno> {
Ok(())
}
}
struct BpfFsObject {
handle: BpfHandle,
xattrs: MemoryXattrStorage,
}
impl BpfFsObject {
fn new(handle: BpfHandle, selinux_context: &FsStr) -> Self {
let xattrs = MemoryXattrStorage::default();
xattrs
.set_xattr(b"security.selinux", selinux_context, XattrOp::Create)
.expect("Failed to set selinux context.");
Self { handle, xattrs }
}
}
impl FsNodeOps for BpfFsObject {
fs_node_impl_not_dir!();
fs_node_impl_xattr_delegate!(self, self.xattrs);
fn create_file_ops(
&self,
_node: &FsNode,
_current_task: &CurrentTask,
_flags: OpenFlags,
) -> Result<Box<dyn FileOps>, Errno> {
error!(EIO)
}
}