blob: 41650009787771d01791ad1db8f2f2b87e811dab [file] [log] [blame]
// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use crate::task::dynamic_thread_spawner::SpawnRequestBuilder;
use anyhow::Context;
use fuchsia_component::client::connect_to_protocol;
use futures::StreamExt;
use futures::channel::mpsc as future_mpsc;
use regex::Regex;
use std::collections::HashMap;
use std::error::Error;
use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};
use std::sync::{Arc, mpsc as sync_mpsc};
use std::time::Duration;
use zerocopy::{Immutable, IntoBytes};
use zx::AsHandleRef;
use {fidl_fuchsia_cpu_profiler as profiler, fuchsia_async};
use futures::io::{AsyncReadExt, Cursor};
use fxt::TraceRecord;
use fxt::profiler::ProfilerRecord;
use fxt::session::SessionParser;
use seq_lock::SeqLock;
use starnix_logging::{log_info, log_warn, track_stub};
use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked};
use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
use starnix_uapi::arch32::{
PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID,
PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD,
PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF,
PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL,
perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID,
perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP,
perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID,
perf_event_type_PERF_RECORD_SAMPLE,
};
use starnix_uapi::errors::Errno;
use starnix_uapi::open_flags::OpenFlags;
use starnix_uapi::user_address::UserRef;
use starnix_uapi::{
error, perf_event_attr, perf_event_header, perf_event_mmap_page__bindgen_ty_1,
perf_event_read_format_PERF_FORMAT_GROUP, perf_event_read_format_PERF_FORMAT_ID,
perf_event_read_format_PERF_FORMAT_LOST, perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED,
perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi,
};
use crate::security::{self, TargetTaskType};
use crate::task::{Kernel, LockedAndTask};
// Monotonically increasing source of unique read_format IDs handed out to
// newly opened perf event files (see PerfEventFileState::rf_id).
static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0);
// Default buffer size to read from socket (for sampling data).
const DEFAULT_CHUNK_SIZE: usize = 4096;
const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960; // 4096 * 10, page size * 10.
// perf_event_header struct size: 32 + 16 + 16 bits = 8 bytes.
const PERF_EVENT_HEADER_SIZE: u16 = 8;
// FXT magic bytes (little endian). Used to detect whether the profiler socket
// is streaming binary FXT records or plain text backtraces.
const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00];
// The immutable head of the mmap'd perf metadata page: the fields of
// perf_event_mmap_page that never change after creation. Written ahead of the
// SeqLock-protected PerfMetadataValue in get_memory().
#[repr(C)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataHeader {
    // Version number of this structure.
    version: u32,
    // Lowest version this structure is compatible with.
    compat_version: u32,
}
// The mutable tail of the mmap'd perf metadata page: the fields of
// perf_event_mmap_page that may change while the file is mapped, so they live
// under the SeqLock constructed in get_memory(). Field order mirrors the uapi
// perf_event_mmap_page layout after `compat_version`.
#[repr(C, packed)]
#[derive(Copy, Clone, IntoBytes, Immutable)]
struct PerfMetadataValue {
    index: u32,
    offset: i64,
    time_enabled: u64,
    time_running: u64,
    // Capabilities union (get_memory() initializes the `capabilities` arm).
    __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1,
    pmc_width: u16,
    time_shift: u16,
    time_mult: u32,
    time_offset: u64,
    time_zero: u64,
    size: u32,
    __reserved_1: u32,
    time_cycles: u64,
    time_mask: u64,
    // Padding reserved by the uapi layout.
    __reserved: [u8; 928usize],
    // Ring-buffer cursors: the kernel side advances data_head as it writes;
    // user space advances data_tail as it consumes.
    data_head: u64,
    data_tail: u64,
    data_offset: u64,
    data_size: u64,
    aux_head: u64,
    aux_tail: u64,
    aux_offset: u64,
    aux_size: u64,
}
// Kernel-wide perf event bookkeeping, stored on the Kernel expando
// (see get_perf_state()).
struct PerfState {
    // This table maps a group leader's file object id to its unique u64 "format ID".
    //
    // When a sample is generated for any event in a group, we use this
    // "format ID" from the group leader as the value for *both* the
    // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields.
    format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>,
}
impl Default for PerfState {
fn default() -> Self {
Self { format_id_lookup_table: Mutex::new(HashMap::new()) }
}
}
// Returns the kernel-wide PerfState, lazily creating it on first access.
fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> {
    kernel.expando.get_or_init(|| PerfState::default())
}
// Compile-time assertion that every field of perf_event_attr sits at the same
// offset on all supported architectures, so one parsing path can serve both
// native and arch32 callers.
uapi::check_arch_independent_layout! {
    perf_event_attr {
        type_, // "type" is a reserved keyword so add a trailing underscore.
        size,
        config,
        __bindgen_anon_1,
        sample_type,
        read_format,
        _bitfield_1,
        __bindgen_anon_2,
        bp_type,
        __bindgen_anon_3,
        __bindgen_anon_4,
        branch_sample_type,
        sample_regs_user,
        sample_stack_user,
        clockid,
        sample_regs_intr,
        aux_watermark,
        sample_max_stack,
        __reserved_2,
        aux_sample_size,
        __reserved_3,
        sig_data,
        config3,
    }
}
// Commands forwarded from ioctl() to the background sampler task via the
// per-file channel. Only Enable (kick off a sampling session) exists so far.
#[derive(Clone, Copy, Debug, PartialEq)]
enum IoctlOp {
    Enable,
}
// Mutable per-event state, guarded by the RwLock inside PerfEventFile.
struct PerfEventFileState {
    // The perf_event_attr the caller passed to perf_event_open().
    attr: perf_event_attr,
    rf_value: u64, // "count" for the config we passed in for the event.
    // The most recent timestamp (ns) where we changed into an enabled state
    // i.e. the most recent time we got an ENABLE ioctl().
    most_recent_enabled_time: u64,
    // Sum of all previous enablement segment durations (ns). If we are
    // currently in an enabled state, explicitly does NOT include the current
    // segment.
    total_time_running: u64,
    // Unique ID reported for PERF_FORMAT_ID in read().
    rf_id: u64,
    // ID written into samples; equals the group leader's rf_id when this event
    // was opened with a group_fd.
    sample_id: u64,
    // Placeholder for PERF_FORMAT_LOST support (not yet implemented).
    _rf_lost: u64,
    // Nonzero while the event is disabled (not counting).
    disabled: u64,
    // Copy of attr.sample_type; selects which fields each sample carries.
    sample_type: u64,
    // Handle to blob that stores all the perf data that a user may want.
    // At the moment it only stores some metadata and backtraces (bts).
    perf_data_vmo: zx::Vmo,
    // Remember to increment this offset as the number of pages increases.
    // Currently we just have a bound of 1 page_size of information.
    vmo_write_offset: u64,
    // Channel used to send IoctlOps to start/stop sampling.
    ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
}
// A hand-written constructor: PerfEventFileState cannot derive Default
// because zx::Vmo has no Default impl.
impl PerfEventFileState {
    // Builds the state for a freshly opened perf event file. All counters and
    // timing fields start at zero; sys_perf_event_open() and the
    // ioctl()/read() paths fill them in afterwards.
    fn new(
        attr: perf_event_attr,
        rf_value: u64,
        disabled: u64,
        sample_type: u64,
        perf_data_vmo: zx::Vmo,
        vmo_write_offset: u64,
        ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
    ) -> PerfEventFileState {
        PerfEventFileState {
            // Zero-initialized bookkeeping fields.
            most_recent_enabled_time: 0,
            total_time_running: 0,
            rf_id: 0,
            sample_id: 0,
            _rf_lost: 0,
            // Caller-supplied configuration and resources.
            attr,
            rf_value,
            disabled,
            sample_type,
            perf_data_vmo,
            vmo_write_offset,
            ioctl_sender,
        }
    }
}
// The FileOps-backed object behind a perf event file descriptor.
pub struct PerfEventFile {
    // Target thread requested at open time (-1 = all tasks, 0 = current task).
    _tid: tid_t,
    // Target CPU requested at open time.
    _cpu: i32,
    // All mutable per-event state.
    perf_event_file: RwLock<PerfEventFileState>,
    // The security state for this PerfEventFile.
    pub security_state: security::PerfEventState,
    // Pointer to the perf_event_mmap_page metadata's data_head.
    // TODO(https://fxbug.dev/460203776) Remove Arc after figuring out
    // "borrowed value does not live long enough" issue.
    data_head_pointer: Arc<AtomicPtr<u64>>,
}
// PerfEventFile object that implements FileOps.
// See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for
// implementation details.
// This object can be saved as a FileDescriptor.
impl FileOps for PerfEventFile {
    // Don't need to implement seek or sync for PerfEventFile.
    fileops_impl_nonseekable!();
    fileops_impl_noop_sync!();

    // Drops this file's entry from the kernel-wide group-leader table so new
    // events can no longer join its group after the fd is closed.
    fn close(
        self: Box<Self>,
        _locked: &mut Locked<FileOpsCore>,
        file: &FileObjectState,
        current_task: &CurrentTask,
    ) {
        let perf_state = get_perf_state(&current_task.kernel);
        let mut events = perf_state.format_id_lookup_table.lock();
        events.remove(&file.id);
    }

    // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html.
    // Serializes the counter value followed by any read_format-requested
    // fields (enabled/running times, id) into the caller's buffer.
    fn read(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        _offset: usize,
        data: &mut dyn OutputBuffer,
    ) -> Result<usize, Errno> {
        // Create/calculate and return the ReadFormatData object.
        // If we create it earlier we might want to change it and it's immutable once created.
        let read_format_data = {
            // Once we get the `value` or count from kernel, we can change this to a read()
            // call instead of write().
            let mut perf_event_file = self.perf_event_file.write();
            security::check_perf_event_read_access(current_task, &self)?;
            let mut total_time_running_including_curr = perf_event_file.total_time_running;
            // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call).
            if perf_event_file.disabled == 0 {
                // Calculate the value or "count" of the config we're interested in.
                // This value should reflect the value we are counting (defined in the config).
                // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock.
                // For now we just return rf_value + 1.
                track_stub!(
                    TODO("https://fxbug.dev/402938671"),
                    "[perf_event_open] implement read_format value"
                );
                perf_event_file.rf_value += 1;
                // Update time duration: add the in-progress enabled segment to
                // the completed-segment total (without committing it to state).
                let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                total_time_running_including_curr +=
                    curr_time - perf_event_file.most_recent_enabled_time;
            }
            let mut output = Vec::<u8>::new();
            // The counter value is always the first u64 of the payload.
            let value = perf_event_file.rf_value.to_ne_bytes();
            output.extend(value);
            let read_format = perf_event_file.attr.read_format;
            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_RUNNING).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 {
                // Total time (ns) event was enabled and running (currently same as TIME_ENABLED).
                output.extend(total_time_running_including_curr.to_ne_bytes());
            }
            if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 {
                // Adds a 64-bit unique value that corresponds to the event group.
                output.extend(perf_event_file.rf_id.to_ne_bytes());
            }
            output
        };
        // The regular read() call allows the case where the bytes-we-want-to-read-in won't
        // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read
        // into a buffer that is not big enough to hold the data, the error ENOSPC results."
        if data.available() < read_format_data.len() {
            return error!(ENOSPC);
        }
        track_stub!(
            TODO("https://fxbug.dev/402453955"),
            "[perf_event_open] implement remaining error handling"
        );
        data.write(&read_format_data)
    }

    // Handles perf ioctl()s: ENABLE/DISABLE toggle the counting state (ENABLE
    // also kicks off a sampling session when a sample_period is configured),
    // RESET zeroes the counter, and the remaining ops are stubbed (ENOSYS).
    fn ioctl(
        &self,
        _locked: &mut Locked<Unlocked>,
        _file: &FileObject,
        current_task: &CurrentTask,
        op: u32,
        _arg: SyscallArg,
    ) -> Result<SyscallResult, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/405463320"),
            "[perf_event_open] implement PERF_IOC_FLAG_GROUP"
        );
        security::check_perf_event_write_access(current_task, &self)?;
        let mut perf_event_file = self.perf_event_file.write();
        match op {
            PERF_EVENT_IOC_ENABLE => {
                // Only transition if currently disabled, so repeated ENABLEs
                // don't reset the running-segment start time.
                if perf_event_file.disabled != 0 {
                    perf_event_file.disabled = 0; // 0 = false.
                    perf_event_file.most_recent_enabled_time =
                        zx::MonotonicInstant::get().into_nanos() as u64;
                }
                // If we are sampling, invoke the profiler and collect a sample.
                // Currently this is an example sample collection.
                track_stub!(
                    TODO("https://fxbug.dev/398914921"),
                    "[perf_event_open] implement full sampling features"
                );
                if perf_event_file.attr.freq() == 0
                    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
                    // This is always sound regardless of the union's tag.
                    && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 }
                {
                    // Blocks until the sampling session completes.
                    ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable);
                }
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_DISABLE => {
                if perf_event_file.disabled == 0 {
                    perf_event_file.disabled = 1; // 1 = true.
                    // Update total_time_running now that the segment has ended.
                    let curr_time = zx::MonotonicInstant::get().into_nanos() as u64;
                    perf_event_file.total_time_running +=
                        curr_time - perf_event_file.most_recent_enabled_time;
                }
                track_stub!(
                    TODO("https://fxbug.dev/422502681"),
                    "[perf_event_open] implement Disable to not hardcode profiling"
                );
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_RESET => {
                // Zeroes the counter only; times are left untouched, matching
                // the man page ("does not change ... time_enabled/running").
                perf_event_file.rf_value = 0;
                return Ok(SUCCESS);
            }
            PERF_EVENT_IOC_REFRESH
            | PERF_EVENT_IOC_PERIOD
            | PERF_EVENT_IOC_SET_OUTPUT
            | PERF_EVENT_IOC_SET_FILTER
            | PERF_EVENT_IOC_ID
            | PERF_EVENT_IOC_SET_BPF
            | PERF_EVENT_IOC_PAUSE_OUTPUT
            | PERF_EVENT_IOC_MODIFY_ATTRIBUTES
            | PERF_EVENT_IOC_QUERY_BPF => {
                track_stub!(
                    TODO("https://fxbug.dev/404941053"),
                    "[perf_event_open] implement remaining ioctl() calls"
                );
                return error!(ENOSYS);
            }
            _ => error!(ENOTTY),
        }
    }

    // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times.
    // Gets called when mmap() is called.
    // Immediately before sampling, this should get called by the user (e.g. the test
    // or Perfetto). We will then write the metadata to the VMO and return the pointer to it.
    fn get_memory(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        length: Option<usize>,
        _prot: ProtectionFlags,
    ) -> Result<Arc<MemoryObject>, Errno> {
        let buffer_size: u64 = length.unwrap_or(0) as u64;
        if buffer_size == 0 {
            return error!(EINVAL);
        }
        // NOTE(review): buffer_size < page_size would underflow the data_size
        // computation below -- confirm callers always map at least one page.
        let page_size = zx::system_get_page_size() as u64;
        security::check_perf_event_read_access(current_task, &self)?;
        // TODO(https://fxbug.dev/460246292) confirm when to create metadata.
        // Create metadata structs. Currently we hardcode everything just to get
        // something E2E working.
        let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 };
        let metadata_value = PerfMetadataValue {
            index: 2,
            offset: 19337,
            time_enabled: 0,
            time_running: 0,
            __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 },
            pmc_width: 0,
            time_shift: 0,
            time_mult: 0,
            time_offset: 0,
            time_zero: 0,
            size: 0,
            __reserved_1: 0,
            time_cycles: 0,
            time_mask: 0,
            __reserved: [0; 928usize],
            // The sample data region begins one page in; the first page holds
            // this metadata.
            data_head: page_size,
            // Start reading from 0; it is the user's responsibility to increment on their end.
            data_tail: 0,
            data_offset: page_size,
            data_size: (buffer_size - page_size) as u64,
            aux_head: 0,
            aux_tail: 0,
            aux_offset: 0,
            aux_size: 0,
        };
        // Then, wrap metadata in a SeqLock so that user can be made aware of updates.
        // SeqLock is formatted thusly:
        //     header_struct    : any size, values should not change
        //     sequence_counter : u32
        //     value_struct     : any size, needs locking because each value can change
        // We split our perf_event_mmap_page accordingly. The `version` and `compat_version`
        // should not change while the params below the `lock` may change.
        // Sequence counter for `lock` param gets inserted between these via
        // the `SeqLock` implementation.
        let perf_event_file = self.perf_event_file.read();
        // VMO does not implement Copy trait. We duplicate the VMO handle
        // so that we can pass it to the SeqLock and the MemoryObject.
        let vmo_handle_copy = match perf_event_file
            .perf_data_vmo
            .as_handle_ref()
            .duplicate(zx::Rights::SAME_RIGHTS)
        {
            Ok(h) => h,
            Err(_) => return error!(EINVAL),
        };
        // SAFETY: This is ok right now because we are the only reference to this memory.
        // Once there are multiple references we should update this comment to confirm that
        // there are only atomic accesses to this memory (see seq_lock lib.rs for details).
        let mut seq_lock = match unsafe {
            SeqLock::new_from_vmo(metadata_header, metadata_value, vmo_handle_copy.into())
        } {
            Ok(s) => s,
            Err(_) => return error!(EINVAL),
        };
        // Now, the perf_data_vmo contains the full metadata page enclosed in a SeqLock.
        // Save data_head pointer so that we can write atomically to it after profiling.
        let metadata_struct = seq_lock.get_map_address() as *mut PerfMetadataValue;
        // SAFETY: This is ok as we previously set the exact format (PerfMetadataValue).
        let data_head_pointer = unsafe { std::ptr::addr_of_mut!((*metadata_struct).data_head) };
        self.data_head_pointer.store(data_head_pointer, Ordering::Release);
        // Hand back a second duplicate of the VMO as the mappable object.
        match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) {
            Ok(vmo) => {
                let memory = MemoryObject::Vmo(vmo.into());
                return Ok(Arc::new(memory));
            }
            Err(_) => {
                track_stub!(
                    TODO("https://fxbug.dev/416323134"),
                    "[perf_event_open] handle get_memory() errors"
                );
                return error!(EINVAL);
            }
        };
    }

    // Writing to a perf event fd is not supported yet.
    fn write(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        _current_task: &CurrentTask,
        _offset: usize,
        _data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        track_stub!(
            TODO("https://fxbug.dev/394960158"),
            "[perf_event_open] implement perf event functions"
        );
        error!(ENOSYS)
    }
}
// Given a PerfRecordSample struct, write it via the correct output format
// (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO.
// We don't currently support all the sample_types listed in the docs.
// Input:
//     PerfRecordSample { pid: 5, tid: 10, nr: 3, ips[nr]: [111, 222, 333] }
// Human-understandable output:
//     9 1 40 111 5 10 3 111 222 333
// Actual output (no spaces or \n in real output, just making it more readable):
//     0x0000 0x0009                 <-- starts at `offset` bytes
//     0x0001
//     0x0040
//     0x0000 0x0000 0x0000 0x006F   <-- starts at `offset` + 8 bytes
//     0x0000 0x0000 0x0000 0x0005
//     0x0000 0x0000 0x0000 0x0010
//     0x0000 0x0000 0x0000 0x0003
//     0x0000 0x0000 0x0000 0x006F
//     0x0000 0x0000 0x0000 0x00DE
//     0x0000 0x0000 0x0000 0x014D
//
// Returns the length of bytes written (header + sample; 8 + 28 = 36 above) so
// the caller can advance its write offset. Returns 0 when the sample write
// failed, so the offset is not advanced past garbage.
fn write_record_to_vmo(
    perf_record_sample: PerfRecordSample,
    perf_data_vmo: &zx::Vmo,
    _data_head_pointer: &AtomicPtr<u64>,
    sample_type: u64,
    sample_id: u64,
    sample_period: u64,
    offset: u64,
) -> u64 {
    // Write header.
    track_stub!(
        TODO("https://fxbug.dev/432501467"),
        "[perf_event_open] determines whether the record is KERNEL or USER"
    );
    // NOTE(review): perf_event_open(2) documents the header's `size` as the
    // size of the whole record (header + body); here only the header size is
    // written -- confirm consumers expect this.
    let perf_event_header = perf_event_header {
        type_: perf_event_type_PERF_RECORD_SAMPLE,
        misc: PERF_RECORD_MISC_KERNEL as u16,
        size: PERF_EVENT_HEADER_SIZE,
    };
    if let Err(e) = perf_data_vmo.write(perf_event_header.as_bytes(), offset) {
        log_warn!("Failed to write perf_event_header: {}", e);
    }
    // Assemble the sample body; each field is present iff its bit is set in
    // sample_type, in the order mandated by the man page.
    let mut sample = Vec::<u8>::new();
    // sample_id
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 {
        sample.extend(sample_id.to_ne_bytes());
    }
    // ip: the first frame of the backtrace. Write 0 for an empty backtrace so
    // the record layout still matches the requested sample_type (indexing
    // ips[0] directly would panic on an empty backtrace from the FXT path).
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 {
        let ip = perf_record_sample.ips.first().copied().unwrap_or(0);
        sample.extend(ip.to_ne_bytes());
    }
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 {
        // pid
        sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes());
        // tid
        sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes());
    }
    // id
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 {
        sample.extend(sample_id.to_ne_bytes());
    }
    // sample period
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 {
        sample.extend(sample_period.to_ne_bytes());
    }
    if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 {
        // nr: written as an explicit u64 so the on-VMO width does not depend
        // on the platform's usize width.
        sample.extend((perf_record_sample.ips.len() as u64).to_ne_bytes());
        // ips[nr] - list of ips, u64 per ip.
        for i in perf_record_sample.ips {
            sample.extend(i.to_ne_bytes());
        }
    }
    // The remaining data are not defined for now.
    match perf_data_vmo.write(&sample, offset + (std::mem::size_of::<perf_event_header>() as u64)) {
        Ok(_) => {
            let bytes_written: u64 =
                (std::mem::size_of::<perf_event_header>() + sample.len()) as u64;
            // TODO(http://fuchsia.dev/460203776) implement this better before enabling
            // any setting of data_head value.
            // Update data_head because we have now written to the VMO.
            // Ordering::Release pushes update that this (and, transitively, the sample
            // too) has updated.
            // data_head_pointer.fetch_add(bytes_written, Ordering::Release);
            // Return the total size we wrote (header + sample) so that we can
            // increment offset counter.
            bytes_written
        }
        Err(e) => {
            log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e);
            // Failed to write. Don't increment offset counter.
            0
        }
    }
}
// The parsed fields of one PERF_RECORD_SAMPLE record, extracted either from a
// text backtrace or from an FXT profiler record.
#[derive(Debug, Clone)]
struct PerfRecordSample {
    // Process ID the sample belongs to, when known.
    pid: Option<u32>,
    // Thread ID the sample belongs to, when known.
    tid: Option<u32>,
    // Instruction pointers (currently this is the address). First one is `ip` param.
    ips: Vec<u64>,
}
// Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example:
//
// 1234 pid
// 5555 tid
// {{{bt:0:0x1111:pc}}} {{{bt:frame_number:address:type}}}
// {{{bt:1:0x2222:ra}}}
// {{{bt:2:0x3333:ra}}}
//
// Results in:
// PerfRecordSample { pid: 1234, tid: 5555, nr: 3, ips: [0x1111, 0x2222, 0x3333] }
fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> {
let mut pid: Option<u32> = None;
let mut tid: Option<u32> = None;
let mut ips: Vec<u64> = Vec::new();
let mut numbers_found = 0;
track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances");
let backtrace_regex =
Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap();
for line in backtrace.lines() {
let trimmed_line = line.trim();
// Try to parse as a raw number (for PID/TID).
if numbers_found < 2 {
if let Ok(num) = trimmed_line.parse::<u32>() {
if numbers_found == 0 {
pid = Some(num);
} else {
tid = Some(num);
}
numbers_found += 1;
continue;
}
}
// Try to parse as a backtrace line.
if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) {
let address_str = parsed_bt.get(1).unwrap().as_str();
if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) {
ips.push(ip_addr);
}
}
}
if pid == None || tid == None || ips.is_empty() {
// This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}.
log_info!("No ips while getting PerfRecordSample");
None
} else {
Some(PerfRecordSample { pid: pid, tid: tid, ips: ips })
}
}
// Connects to the fuchsia.cpu.profiler Session protocol and configures a
// system-wide, frame-pointer-callgraph sampling session with the given period.
//
// Returns the configured session proxy plus the client end of the socket the
// profiler streams results into. Connection and configure failures are mapped
// to EINVAL.
async fn set_up_profiler(
    sample_period: zx::MonotonicDuration,
) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> {
    // Configuration for how we want to sample.
    let sample = profiler::Sample {
        callgraph: Some(profiler::CallgraphConfig {
            strategy: Some(profiler::CallgraphStrategy::FramePointer),
            ..Default::default()
        }),
        ..Default::default()
    };
    let sampling_config = profiler::SamplingConfig {
        period: Some(sample_period.into_nanos() as u64),
        timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)),
        sample: Some(sample),
        ..Default::default()
    };
    let tasks = vec![
        // Should return ~300 samples for 100 millis.
        profiler::Task::SystemWide(profiler::SystemWide {}),
    ];
    let targets = profiler::TargetConfig::Tasks(tasks);
    let config = profiler::Config {
        configs: Some(vec![sampling_config]),
        target: Some(targets),
        ..Default::default()
    };
    // The profiler writes into `server`; we hand `client` back to the caller.
    let (client, server) = fidl::Socket::create_stream();
    let configure = profiler::SessionConfigureRequest {
        output: Some(server),
        config: Some(config),
        ..Default::default()
    };
    // connect_to_protocol returns an owned proxy, so take it by value directly
    // (no clone needed).
    let session_proxy = match connect_to_protocol::<profiler::SessionMarker>()
        .context("Error connecting to Profiler protocol")
    {
        Ok(p) => p,
        Err(e) => return error!(EINVAL, e),
    };
    // Must configure before sampling start().
    match session_proxy.configure(configure).await {
        Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))),
        Err(e) => return error!(EINVAL, e),
    }
}
// Collects samples and puts backtrace in VMO.
// - Starts and stops sampling for a duration.
// - Reads in the buffer from the socket for that duration in chunks.
// - Parses the buffer backtraces into PERF_RECORD_SAMPLE format.
// - Writes the PERF_RECORD_SAMPLE into VMO.
async fn collect_sample(
session_proxy: profiler::SessionProxy,
mut client: fidl::AsyncSocket,
duration: Duration,
perf_data_vmo: &zx::Vmo,
data_head_pointer: &AtomicPtr<u64>,
sample_type: u64,
sample_id: u64,
sample_period: u64,
vmo_write_offset: u64,
) -> Result<(), Errno> {
let start_request = profiler::SessionStartRequest {
buffer_results: Some(true),
buffer_size_mb: Some(8 as u64),
..Default::default()
};
let _ = session_proxy.start(&start_request).await.expect("Failed to start profiling");
// Hardcode a duration so that samples can be collected. This is currently solely used to
// demonstrate that an E2E implementation of sample collection works.
track_stub!(
TODO("https://fxbug.dev/428974888"),
"[perf_event_open] don't hardcode sleep; test/user should decide sample duration"
);
let _ = fuchsia_async::Timer::new(duration).await;
let stats = session_proxy.stop().await;
let samples_collected = match stats {
Ok(stats) => stats.samples_collected.unwrap(),
Err(e) => return error!(EINVAL, e),
};
track_stub!(
TODO("https://fxbug.dev/422502681"),
"[perf_event_open] symbolize sample output and delete the below log_info"
);
log_info!("profiler samples_collected: {:?}", samples_collected);
// Peek at the first 8 bytes to determine if it's FXT or text.
let mut header = [0; 8];
let mut bytes_read = 0;
while bytes_read < 8 {
match client.read(&mut header[bytes_read..]).await {
Ok(0) => {
// Peer closed the socket. This is the normal end of the stream.
log_info!("[perf_event_open] Finished reading fxt record from socket.");
break;
}
Ok(n) => bytes_read += n,
Err(e) => {
log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
break;
}
}
}
if bytes_read > 0 {
if bytes_read == 8 && header == FXT_MAGIC_BYTES {
// FXT format.
let header_cursor = Cursor::new(header);
let reader = header_cursor.chain(client);
let (mut stream, _task) = SessionParser::new_async(reader);
while let Some(record_result) = stream.next().await {
match record_result {
Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => {
let ips: Vec<u64> = backtrace.data;
let pid = Some(backtrace.process.0 as u32);
let tid = Some(backtrace.thread.0 as u32);
let perf_record_sample = PerfRecordSample { pid, tid, ips };
write_record_to_vmo(
perf_record_sample,
perf_data_vmo,
data_head_pointer,
sample_type,
sample_id,
sample_period,
vmo_write_offset,
);
}
Ok(_) => {
// Ignore other records.
}
Err(e) => {
log_warn!("[perf_event_open] Error parsing FXT: {:?}", e);
break;
}
}
}
} else {
// Text format.
// Read chunks of sampling data from socket in this buffer temporarily. We will parse
// the data and write it into the output VMO (the one mmap points to).
let mut buffer = vec![0; DEFAULT_CHUNK_SIZE];
loop {
// Attempt to read data. This awaits until data is available, EOF, or error.
// Ignore the first 8 bytes as it's the {{{reset}}} marker.
let socket_data = client.read(&mut buffer).await;
match socket_data {
Ok(0) => {
// Peer closed the socket. This is the normal end of the stream.
log_info!("[perf_event_open] Finished reading from socket.");
break;
}
Ok(bytes_read) => {
// Receive data in format {{{...}}}.
let received_data = match std::str::from_utf8(&buffer[..bytes_read]) {
Ok(data) => data,
Err(e) => return error!(EINVAL, e),
};
// Parse data to PerfRecordSample struct.
if let Some(perf_record_sample) =
parse_perf_record_sample_format(received_data)
{
write_record_to_vmo(
perf_record_sample,
perf_data_vmo,
data_head_pointer,
sample_type,
sample_id,
sample_period,
vmo_write_offset,
);
}
}
Err(e) => {
log_warn!("[perf_event_open] Error reading from socket: {:?}", e);
break;
}
}
}
}
}
let reset_status = session_proxy.reset().await;
return match reset_status {
Ok(_) => Ok(()),
Err(e) => error!(EINVAL, e),
};
}
// Notifies the sampler task that we should start/stop sampling, then blocks
// until that sampling session is complete so that another session can start.
//
// The completion handshake only happens when the command was actually
// delivered: on a failed try_send the completion sender is consumed (it is
// part of the rejected message) and dropped, so the previous unconditional
// `recv().unwrap()` would panic on a full or disconnected channel. Failures
// are now logged and the function returns without blocking.
fn ping_receiver(
    mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>,
    command: IoctlOp,
) {
    log_info!("[perf_event_open] Received sampling command: {:?}", command);
    let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>();
    match ioctl_sender.try_send((command, profiling_complete_sender)) {
        Ok(_) => {
            // Block on / wait until profiling is complete before returning.
            // This notifies that the profiler is free to be used for another
            // session. recv() returns Err if the sampler dropped the
            // completion sender without signalling; treat that as done.
            if profiling_complete_receiver.recv().is_err() {
                log_warn!(
                    "[perf_event_open] Sampler dropped completion channel for {:?}",
                    command
                );
            }
        }
        Err(e) => {
            if e.is_full() {
                log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command);
            } else if e.is_disconnected() {
                log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command);
            } else {
                log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source());
            }
        }
    }
}
// Entry point for the perf_event_open(2) syscall: validates the request,
// creates the per-event state and its backing VMO, spawns the background
// sampler task, and returns a new file descriptor wrapping a PerfEventFile.
pub fn sys_perf_event_open(
    locked: &mut Locked<Unlocked>,
    current_task: &CurrentTask,
    attr: UserRef<perf_event_attr>,
    // Note that this is pid in Linux docs.
    tid: tid_t,
    cpu: i32,
    group_fd: FdNumber,
    _flags: u64,
) -> Result<SyscallResult, Errno> {
    // So far, the implementation only sets the read_data_format according to the "Reading results"
    // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event.
    // Other features will be added in the future (see below track_stubs).
    let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?;
    // tid == -1 && cpu == -1 is invalid per the man page.
    if tid == -1 && cpu == -1 {
        return error!(EINVAL);
    }
    let target_task_type = match tid {
        -1 => TargetTaskType::AllTasks,
        0 => TargetTaskType::CurrentTask,
        _ => {
            track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0");
            return error!(ENOSYS);
        }
    };
    security::check_perf_event_open_access(
        current_task,
        target_task_type,
        &perf_event_attrs,
        perf_event_attrs.type_.try_into()?,
    )?;
    // Channel used to send info between notifier and spawned task thread.
    // We somewhat arbitrarily picked 8 for now in case we get a bunch of ioctls that are in
    // quick succession (instead of something lower).
    let (sender, mut receiver) = future_mpsc::channel(8);
    let page_size = zx::system_get_page_size() as u64;
    let mut perf_event_file = PerfEventFileState::new(
        perf_event_attrs,
        0,
        perf_event_attrs.disabled(),
        perf_event_attrs.sample_type,
        zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(),
        page_size, // Start with this amount of offset, we can increment as we write.
        sender,
    );
    let read_format = perf_event_attrs.read_format;
    if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0
        || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0
    {
        // Only keep track of most_recent_enabled_time if we are currently in ENABLED state,
        // as otherwise this param shouldn't be used for calculating anything.
        if perf_event_file.disabled == 0 {
            perf_event_file.most_recent_enabled_time =
                zx::MonotonicInstant::get().into_nanos() as u64;
        }
        // Initialize this to 0 as we will need to return a time duration later during read().
        perf_event_file.total_time_running = 0;
    }
    // Hand out a unique read_format ID for this event.
    let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed);
    perf_event_file.rf_id = event_id;
    if group_fd.raw() == -1 {
        // No group: this event is its own leader, so samples carry its own ID.
        perf_event_file.sample_id = event_id;
    } else {
        // Joining a group: samples carry the group leader's format ID, looked
        // up from the kernel-wide table. EINVAL if group_fd is not a leader.
        let group_file = current_task.files.get(group_fd)?;
        let group_file_object_id = group_file.id;
        let perf_state = get_perf_state(&current_task.kernel);
        let events = perf_state.format_id_lookup_table.lock();
        if let Some(rf_id) = events.get(&group_file_object_id) {
            perf_event_file.sample_id = *rf_id;
        } else {
            return error!(EINVAL);
        }
    }
    if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 {
        track_stub!(
            TODO("https://fxbug.dev/402238049"),
            "[perf_event_open] implement read_format group"
        );
        return error!(ENOSYS);
    }
    if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 {
        track_stub!(
            TODO("https://fxbug.dev/402260383"),
            "[perf_event_open] implement read_format lost"
        );
    }
    // Set up notifier for handling ioctl calls to enable/disable sampling.
    let mut vmo_handle_copy =
        perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS);
    // SAFETY: sample_period is a u64 field in a union with u64 sample_freq.
    // This is always sound regardless of the union's tag.
    let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period };
    // The sample period from the PERF_COUNT_SW_CPU_CLOCK is
    // 1 nanosecond per tick. Convert this duration into zx::duration.
    let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64);
    let data_head_pointer = Arc::new(AtomicPtr::new(std::ptr::null_mut::<u64>()));
    // Pass cloned into the thread.
    let cloned_data_head_pointer = Arc::clone(&data_head_pointer);
    // Background sampler: waits on the channel for ioctl ENABLE commands, runs
    // a profiling session for each, and signals completion back to the sender.
    let closure = async move |_: LockedAndTask<'_>| {
        // This loop will wait for messages from the sender.
        while let Some((command, profiling_complete_receiver)) = receiver.next().await {
            match command {
                IoctlOp::Enable => {
                    match set_up_profiler(zx_sample_period).await {
                        Ok((session_proxy, client)) => {
                            track_stub!(
                                TODO("https://fxbug.dev/422502681"),
                                "[perf_event_open] don't hardcode profiling duration"
                            );
                            let handle = vmo_handle_copy
                                .as_mut()
                                .expect("Failed to get VMO handle")
                                .as_handle_ref()
                                .duplicate(zx::Rights::SAME_RIGHTS)
                                .unwrap();
                            let _ = collect_sample(
                                session_proxy,
                                client,
                                Duration::from_millis(100),
                                &zx::Vmo::from(handle),
                                &*cloned_data_head_pointer,
                                perf_event_file.sample_type,
                                perf_event_file.sample_id,
                                sample_period_in_ticks,
                                perf_event_file.vmo_write_offset,
                            )
                            .await;
                            // Send notification that profiler session is over.
                            let _ = profiling_complete_receiver.send(());
                        }
                        Err(e) => {
                            log_warn!("Failed to profile: {}", e);
                        }
                    };
                }
            }
        }
        ()
    };
    let req = SpawnRequestBuilder::new()
        .with_debug_name("perf-event-sampler")
        .with_async_closure(closure)
        .build();
    current_task.kernel().kthreads.spawner().spawn_from_request(req);
    let file = Box::new(PerfEventFile {
        _tid: tid,
        _cpu: cpu,
        perf_event_file: RwLock::new(perf_event_file),
        security_state: security::perf_event_alloc(current_task),
        data_head_pointer: data_head_pointer,
    });
    // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node.
    let file_handle =
        Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]");
    let file_object_id = file_handle.id;
    let file_descriptor: Result<FdNumber, Errno> =
        current_task.add_file(locked, file_handle, FdFlags::empty());
    match file_descriptor {
        Ok(fd) => {
            // Register this event as a potential group leader so later opens
            // with group_fd == fd can join its group.
            if group_fd.raw() == -1 {
                let perf_state = get_perf_state(&current_task.kernel);
                let mut events = perf_state.format_id_lookup_table.lock();
                events.insert(file_object_id, event_id);
            }
            Ok(fd.into())
        }
        Err(_) => {
            track_stub!(
                TODO("https://fxbug.dev/402453955"),
                "[perf_event_open] implement remaining error handling"
            );
            error!(EMFILE)
        }
    }
}
// Syscalls for arch32 usage
#[cfg(target_arch = "aarch64")]
mod arch32 {
    // The arch32 (32-bit compatibility) entry point aliases the native
    // implementation; perf_event_attr's layout is verified arch-independent
    // by the check_arch_independent_layout! invocation above.
    pub use super::sys_perf_event_open as sys_arch32_perf_event_open;
}
#[cfg(target_arch = "aarch64")]
pub use arch32::*;
use crate::mm::memory::MemoryObject;
use crate::mm::{MemoryAccessorExt, ProtectionFlags};
use crate::task::CurrentTask;
use crate::vfs::{
Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer,
OutputBuffer,
};
use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync};