| // Copyright 2025 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| use crate::task::dynamic_thread_spawner::SpawnRequestBuilder; |
| use anyhow::Context; |
| use fuchsia_component::client::connect_to_protocol; |
| use futures::StreamExt; |
| use futures::channel::mpsc as future_mpsc; |
| use regex::Regex; |
| use std::collections::HashMap; |
| use std::error::Error; |
| use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering}; |
| use std::sync::{Arc, mpsc as sync_mpsc}; |
| use std::time::Duration; |
| use zerocopy::{Immutable, IntoBytes}; |
| use zx::AsHandleRef; |
| use {fidl_fuchsia_cpu_profiler as profiler, fuchsia_async}; |
| |
| use futures::io::{AsyncReadExt, Cursor}; |
| use fxt::TraceRecord; |
| use fxt::profiler::ProfilerRecord; |
| use fxt::session::SessionParser; |
| use seq_lock::SeqLock; |
| use starnix_logging::{log_info, log_warn, track_stub}; |
| use starnix_sync::{FileOpsCore, Locked, Mutex, RwLock, Unlocked}; |
| use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult}; |
| use starnix_uapi::arch32::{ |
| PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE, PERF_EVENT_IOC_ID, |
| PERF_EVENT_IOC_MODIFY_ATTRIBUTES, PERF_EVENT_IOC_PAUSE_OUTPUT, PERF_EVENT_IOC_PERIOD, |
| PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_REFRESH, PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_SET_BPF, |
| PERF_EVENT_IOC_SET_FILTER, PERF_EVENT_IOC_SET_OUTPUT, PERF_RECORD_MISC_KERNEL, |
| perf_event_sample_format_PERF_SAMPLE_CALLCHAIN, perf_event_sample_format_PERF_SAMPLE_ID, |
| perf_event_sample_format_PERF_SAMPLE_IDENTIFIER, perf_event_sample_format_PERF_SAMPLE_IP, |
| perf_event_sample_format_PERF_SAMPLE_PERIOD, perf_event_sample_format_PERF_SAMPLE_TID, |
| perf_event_type_PERF_RECORD_SAMPLE, |
| }; |
| use starnix_uapi::errors::Errno; |
| use starnix_uapi::open_flags::OpenFlags; |
| use starnix_uapi::user_address::UserRef; |
| use starnix_uapi::{ |
| error, perf_event_attr, perf_event_header, perf_event_mmap_page__bindgen_ty_1, |
| perf_event_read_format_PERF_FORMAT_GROUP, perf_event_read_format_PERF_FORMAT_ID, |
| perf_event_read_format_PERF_FORMAT_LOST, perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED, |
| perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING, tid_t, uapi, |
| }; |
| |
| use crate::security::{self, TargetTaskType}; |
| use crate::task::{Kernel, LockedAndTask}; |
| |
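| // Monotonically increasing counter that hands out the unique IDs returned for |
| // PERF_FORMAT_ID reads and used as sample IDs for group leaders (see sys_perf_event_open()). |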
| static READ_FORMAT_ID_GENERATOR: AtomicU64 = AtomicU64::new(0); |
| // Default buffer size to read from socket (for sampling data). |
| const DEFAULT_CHUNK_SIZE: usize = 4096; |
| const ESTIMATED_MMAP_BUFFER_SIZE: u64 = 40960; // 4096 * 10, page size * 10. |
| // perf_event_header struct size: u32 + u16 + u16 = 32 + 16 + 16 bits = 8 bytes. |
| const PERF_EVENT_HEADER_SIZE: u16 = 8; |
| // FXT magic bytes (little endian). |
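| // This is the little-endian encoding of the FXT magic number record 0x0016547846040010, |
| // which contains the ASCII bytes 'F', 'x', 'T'. |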
| const FXT_MAGIC_BYTES: [u8; 8] = [0x10, 0x00, 0x04, 0x46, 0x78, 0x54, 0x16, 0x00]; |
| |
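| // The fields of perf_event_mmap_page that precede the `lock` sequence counter and do |
| // not change after creation. Used as the SeqLock header in get_memory(). |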
| #[repr(C)] |
| #[derive(Copy, Clone, IntoBytes, Immutable)] |
| struct PerfMetadataHeader { |
| version: u32, |
| compat_version: u32, |
| } |
| |
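| // The fields of perf_event_mmap_page that follow the `lock` sequence counter and may |
| // change while mapped. Used as the SeqLock value struct in get_memory(); the sequence |
| // counter itself is inserted between the two structs by the SeqLock implementation. |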
| #[repr(C, packed)] |
| #[derive(Copy, Clone, IntoBytes, Immutable)] |
| struct PerfMetadataValue { |
| index: u32, |
| offset: i64, |
| time_enabled: u64, |
| time_running: u64, |
| __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1, |
| pmc_width: u16, |
| time_shift: u16, |
| time_mult: u32, |
| time_offset: u64, |
| time_zero: u64, |
| size: u32, |
| __reserved_1: u32, |
| time_cycles: u64, |
| time_mask: u64, |
| __reserved: [u8; 928usize], |
| data_head: u64, |
| data_tail: u64, |
| data_offset: u64, |
| data_size: u64, |
| aux_head: u64, |
| aux_tail: u64, |
| aux_offset: u64, |
| aux_size: u64, |
| } |
| |
| struct PerfState { |
| // This table maps a group leader's file object id to its unique u64 "format ID". |
| // |
| // When a sample is generated for any event in a group, we use this |
| // "format ID" from the group leader as the value for *both* the |
| // `PERF_SAMPLE_ID` and `PERF_SAMPLE_IDENTIFIER` fields. |
| format_id_lookup_table: Mutex<HashMap<FileObjectId, u64>>, |
| } |
| |
| impl Default for PerfState { |
| fn default() -> Self { |
| Self { format_id_lookup_table: Mutex::new(HashMap::new()) } |
| } |
| } |
| |
| fn get_perf_state(kernel: &Arc<Kernel>) -> Arc<PerfState> { |
| kernel.expando.get_or_init(PerfState::default) |
| } |
| |
| uapi::check_arch_independent_layout! { |
| perf_event_attr { |
| type_, // "type" is a reserved keyword so add a trailing underscore. |
| size, |
| config, |
| __bindgen_anon_1, |
| sample_type, |
| read_format, |
| _bitfield_1, |
| __bindgen_anon_2, |
| bp_type, |
| __bindgen_anon_3, |
| __bindgen_anon_4, |
| branch_sample_type, |
| sample_regs_user, |
| sample_stack_user, |
| clockid, |
| sample_regs_intr, |
| aux_watermark, |
| sample_max_stack, |
| __reserved_2, |
| aux_sample_size, |
| __reserved_3, |
| sig_data, |
| config3, |
| } |
| } |
| |
| #[derive(Clone, Copy, Debug, PartialEq)] |
| enum IoctlOp { |
| Enable, |
| } |
| |
| struct PerfEventFileState { |
| attr: perf_event_attr, |
| rf_value: u64, // "count" for the config we passed in for the event. |
| // The most recent timestamp (ns) at which we transitioned into the enabled state, |
| // i.e. the most recent time we received an ENABLE ioctl(). |
| most_recent_enabled_time: u64, |
| // Sum of all previous enablement segment durations (ns). If we are |
| // currently in an enabled state, explicitly does NOT include the current |
| // segment. |
| total_time_running: u64, |
| rf_id: u64, |
| sample_id: u64, |
| _rf_lost: u64, |
| disabled: u64, |
| sample_type: u64, |
| // VMO that stores all the perf data that a user may want (returned to the user via mmap()). |
| // At the moment it only stores some metadata and backtraces (bts). |
| perf_data_vmo: zx::Vmo, |
| // Remember to increment this offset as the number of pages increases. |
| // Currently we just have a bound of 1 page_size of information. |
| vmo_write_offset: u64, |
| // Channel used to send IoctlOps to start/stop sampling. |
| ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>, |
| } |
| |
| // PerfEventFileState needs an explicit constructor because zx::Vmo does not |
| // implement Default, so Default cannot be derived for the struct. |
| impl PerfEventFileState { |
| fn new( |
| attr: perf_event_attr, |
| rf_value: u64, |
| disabled: u64, |
| sample_type: u64, |
| perf_data_vmo: zx::Vmo, |
| vmo_write_offset: u64, |
| ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>, |
| ) -> PerfEventFileState { |
| PerfEventFileState { |
| attr, |
| rf_value, |
| most_recent_enabled_time: 0, |
| total_time_running: 0, |
| rf_id: 0, |
| sample_id: 0, |
| _rf_lost: 0, |
| disabled, |
| sample_type, |
| perf_data_vmo, |
| vmo_write_offset, |
| ioctl_sender, |
| } |
| } |
| } |
| |
| pub struct PerfEventFile { |
| _tid: tid_t, |
| _cpu: i32, |
| perf_event_file: RwLock<PerfEventFileState>, |
| // The security state for this PerfEventFile. |
| pub security_state: security::PerfEventState, |
| // Pointer to the perf_event_mmap_page metadata's data_head. |
| // TODO(https://fxbug.dev/460203776) Remove Arc after figuring out |
| // "borrowed value does not live long enough" issue. |
| data_head_pointer: Arc<AtomicPtr<u64>>, |
| } |
| |
| // PerfEventFile object that implements FileOps. |
| // See https://man7.org/linux/man-pages/man2/perf_event_open.2.html for |
| // implementation details. |
| // This object can be saved as a FileDescriptor. |
| impl FileOps for PerfEventFile { |
| // Don't need to implement seek or sync for PerfEventFile. |
| fileops_impl_nonseekable!(); |
| fileops_impl_noop_sync!(); |
| |
| fn close( |
| self: Box<Self>, |
| _locked: &mut Locked<FileOpsCore>, |
| file: &FileObjectState, |
| current_task: &CurrentTask, |
| ) { |
| let perf_state = get_perf_state(¤t_task.kernel); |
| let mut events = perf_state.format_id_lookup_table.lock(); |
| events.remove(&file.id); |
| } |
| |
| // See "Reading results" section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html. |
| fn read( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| current_task: &CurrentTask, |
| _offset: usize, |
| data: &mut dyn OutputBuffer, |
| ) -> Result<usize, Errno> { |
| // Build and return the read_format data as a byte buffer. We assemble it in one |
| // place because the values it contains must be consistent at the time of the read. |
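| // For a non-GROUP read_format the layout written below is: |
| //   u64 value; |
| //   u64 time_enabled;  // if PERF_FORMAT_TOTAL_TIME_ENABLED |
| //   u64 time_running;  // if PERF_FORMAT_TOTAL_TIME_RUNNING |
| //   u64 id;            // if PERF_FORMAT_ID |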
| let read_format_data = { |
| // Once we get the `value` or count from kernel, we can change this to a read() |
| // call instead of write(). |
| let mut perf_event_file = self.perf_event_file.write(); |
| |
| security::check_perf_event_read_access(current_task, &self)?; |
| |
| let mut total_time_running_including_curr = perf_event_file.total_time_running; |
| |
| // Only update values if enabled (either by perf_event_attr or ioctl ENABLE call). |
| if perf_event_file.disabled == 0 { |
| // Calculate the value or "count" of the config we're interested in. |
| // This value should reflect the value we are counting (defined in the config). |
| // E.g. for PERF_COUNT_SW_CPU_CLOCK it would return the value from the CPU clock. |
| // For now we just return rf_value + 1. |
| track_stub!( |
| TODO("https://fxbug.dev/402938671"), |
| "[perf_event_open] implement read_format value" |
| ); |
| perf_event_file.rf_value += 1; |
| |
| // Update time duration. |
| let curr_time = zx::MonotonicInstant::get().into_nanos() as u64; |
| total_time_running_including_curr += |
| curr_time - perf_event_file.most_recent_enabled_time; |
| } |
| |
| let mut output = Vec::<u8>::new(); |
| let value = perf_event_file.rf_value.to_ne_bytes(); |
| output.extend(value); |
| |
| let read_format = perf_event_file.attr.read_format; |
| |
| if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 { |
| // Total time (ns) the event was enabled. Currently the same value as TIME_RUNNING. |
| output.extend(total_time_running_including_curr.to_ne_bytes()); |
| } |
| if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 { |
| // Total time (ns) the event was actually running. Currently the same value as TIME_ENABLED. |
| output.extend(total_time_running_including_curr.to_ne_bytes()); |
| } |
| if (read_format & perf_event_read_format_PERF_FORMAT_ID as u64) != 0 { |
| // Adds a 64-bit unique value that corresponds to the event group. |
| output.extend(perf_event_file.rf_id.to_ne_bytes()); |
| } |
| |
| output |
| }; |
| |
| // The regular read() call allows the case where the bytes-we-want-to-read-in won't |
| // fit in the output buffer. However, for perf_event_open's read(), "If you attempt to read |
| // into a buffer that is not big enough to hold the data, the error ENOSPC results." |
| if data.available() < read_format_data.len() { |
| return error!(ENOSPC); |
| } |
| track_stub!( |
| TODO("https://fxbug.dev/402453955"), |
| "[perf_event_open] implement remaining error handling" |
| ); |
| |
| data.write(&read_format_data) |
| } |
| |
| fn ioctl( |
| &self, |
| _locked: &mut Locked<Unlocked>, |
| _file: &FileObject, |
| current_task: &CurrentTask, |
| op: u32, |
| _arg: SyscallArg, |
| ) -> Result<SyscallResult, Errno> { |
| track_stub!( |
| TODO("https://fxbug.dev/405463320"), |
| "[perf_event_open] implement PERF_IOC_FLAG_GROUP" |
| ); |
| security::check_perf_event_write_access(current_task, &self)?; |
| let mut perf_event_file = self.perf_event_file.write(); |
| match op { |
| PERF_EVENT_IOC_ENABLE => { |
| if perf_event_file.disabled != 0 { |
| perf_event_file.disabled = 0; // 0 = false. |
| perf_event_file.most_recent_enabled_time = |
| zx::MonotonicInstant::get().into_nanos() as u64; |
| } |
| |
| // If we are sampling, invoke the profiler and collect a sample. |
| // Currently this is an example sample collection. |
| track_stub!( |
| TODO("https://fxbug.dev/398914921"), |
| "[perf_event_open] implement full sampling features" |
| ); |
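| // attr.freq() == 0 means the union field is interpreted as a sample_period rather |
| // than a sample_freq; a non-zero sample_period means the caller requested sampling. |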
| if perf_event_file.attr.freq() == 0 |
| // SAFETY: sample_period is a u64 field in a union with u64 sample_freq. |
| // This is always sound regardless of the union's tag. |
| && unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period != 0 } |
| { |
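| // Note: ping_receiver() blocks until the sampling session completes, so this |
| // ioctl does not return until the sample data has been written to the VMO. |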
| ping_receiver(perf_event_file.ioctl_sender.clone(), IoctlOp::Enable); |
| } |
| return Ok(SUCCESS); |
| } |
| PERF_EVENT_IOC_DISABLE => { |
| if perf_event_file.disabled == 0 { |
| perf_event_file.disabled = 1; // 1 = true. |
| |
| // Update total_time_running now that the segment has ended. |
| let curr_time = zx::MonotonicInstant::get().into_nanos() as u64; |
| perf_event_file.total_time_running += |
| curr_time - perf_event_file.most_recent_enabled_time; |
| } |
| track_stub!( |
| TODO("https://fxbug.dev/422502681"), |
| "[perf_event_open] implement Disable to not hardcode profiling" |
| ); |
| return Ok(SUCCESS); |
| } |
| PERF_EVENT_IOC_RESET => { |
| perf_event_file.rf_value = 0; |
| return Ok(SUCCESS); |
| } |
| PERF_EVENT_IOC_REFRESH |
| | PERF_EVENT_IOC_PERIOD |
| | PERF_EVENT_IOC_SET_OUTPUT |
| | PERF_EVENT_IOC_SET_FILTER |
| | PERF_EVENT_IOC_ID |
| | PERF_EVENT_IOC_SET_BPF |
| | PERF_EVENT_IOC_PAUSE_OUTPUT |
| | PERF_EVENT_IOC_MODIFY_ATTRIBUTES |
| | PERF_EVENT_IOC_QUERY_BPF => { |
| track_stub!( |
| TODO("https://fxbug.dev/404941053"), |
| "[perf_event_open] implement remaining ioctl() calls" |
| ); |
| return error!(ENOSYS); |
| } |
| _ => error!(ENOTTY), |
| } |
| } |
| |
| // TODO(https://fxbug.dev/460245383) match behavior when mmap() is called multiple times. |
| // Invoked when the user calls mmap() on the perf event fd, which should happen |
| // immediately before sampling (e.g. by the test or by Perfetto). We write the |
| // metadata page into the VMO and return a handle to that memory. |
| fn get_memory( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| current_task: &CurrentTask, |
| length: Option<usize>, |
| _prot: ProtectionFlags, |
| ) -> Result<Arc<MemoryObject>, Errno> { |
| let buffer_size: u64 = length.unwrap_or(0) as u64; |
| if buffer_size == 0 { |
| return error!(EINVAL); |
| } |
| let page_size = zx::system_get_page_size() as u64; |
| |
| security::check_perf_event_read_access(current_task, &self)?; |
| |
| // TODO(https://fxbug.dev/460246292) confirm when to create metadata. |
| // Create metadata structs. Currently we hardcode everything just to get |
| // something E2E working. |
| let metadata_header = PerfMetadataHeader { version: 1, compat_version: 2 }; |
| let metadata_value = PerfMetadataValue { |
| index: 2, |
| offset: 19337, |
| time_enabled: 0, |
| time_running: 0, |
| __bindgen_anon_1: perf_event_mmap_page__bindgen_ty_1 { capabilities: 30 }, |
| pmc_width: 0, |
| time_shift: 0, |
| time_mult: 0, |
| time_offset: 0, |
| time_zero: 0, |
| size: 0, |
| __reserved_1: 0, |
| time_cycles: 0, |
| time_mask: 0, |
| __reserved: [0; 928usize], |
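| // The metadata (this struct wrapped in the SeqLock) occupies the first page of the |
| // VMO; sample records are written starting at the second page, which is why |
| // vmo_write_offset is initialized to one page in sys_perf_event_open(). |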
| data_head: page_size, |
| // Start reading from 0; it is the user's responsibility to increment on their end. |
| data_tail: 0, |
| data_offset: page_size, |
| data_size: buffer_size - page_size, |
| aux_head: 0, |
| aux_tail: 0, |
| aux_offset: 0, |
| aux_size: 0, |
| }; |
| |
| // Then, wrap the metadata in a SeqLock so that the user can observe updates safely. |
| // A SeqLock is laid out as follows: |
| //   header_struct    : any size, values should not change |
| //   sequence_counter : u32 |
| //   value_struct     : any size, needs locking because each value can change |
| // We split our perf_event_mmap_page accordingly: `version` and `compat_version` |
| // must not change, while the fields below the `lock` counter may change. The |
| // sequence counter for `lock` is inserted between the two structs by the `SeqLock` |
| // implementation. |
| let perf_event_file = self.perf_event_file.read(); |
| // VMO does not implement Copy trait. We duplicate the VMO handle |
| // so that we can pass it to the SeqLock and the MemoryObject. |
| let vmo_handle_copy = match perf_event_file |
| .perf_data_vmo |
| .as_handle_ref() |
| .duplicate(zx::Rights::SAME_RIGHTS) |
| { |
| Ok(h) => h, |
| Err(_) => return error!(EINVAL), |
| }; |
| |
| // SAFETY: This is ok right now because we are the only reference to this memory. |
| // Once there are multiple references we should update this comment to confirm that |
| // there are only atomic accesses to this memory (see seq_lock lib.rs for details). |
| let mut seq_lock = match unsafe { |
| SeqLock::new_from_vmo(metadata_header, metadata_value, vmo_handle_copy.into()) |
| } { |
| Ok(s) => s, |
| Err(_) => return error!(EINVAL), |
| }; |
| |
| // Now, the perf_data_vmo contains the full metadata page enclosed in a SeqLock. |
| // Save data_head pointer so that we can write atomically to it after profiling. |
| let metadata_struct = seq_lock.get_map_address() as *mut PerfMetadataValue; |
| // SAFETY: This is ok as we previously set the exact format (PerfMetadataValue). |
| let data_head_pointer = unsafe { std::ptr::addr_of_mut!((*metadata_struct).data_head) }; |
| self.data_head_pointer.store(data_head_pointer, Ordering::Release); |
| |
| match perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS) { |
| Ok(vmo) => { |
| let memory = MemoryObject::Vmo(vmo.into()); |
| return Ok(Arc::new(memory)); |
| } |
| Err(_) => { |
| track_stub!( |
| TODO("https://fxbug.dev/416323134"), |
| "[perf_event_open] handle get_memory() errors" |
| ); |
| return error!(EINVAL); |
| } |
| }; |
| } |
| |
| fn write( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _file: &FileObject, |
| _current_task: &CurrentTask, |
| _offset: usize, |
| _data: &mut dyn InputBuffer, |
| ) -> Result<usize, Errno> { |
| track_stub!( |
| TODO("https://fxbug.dev/394960158"), |
| "[perf_event_open] implement perf event functions" |
| ); |
| error!(ENOSYS) |
| } |
| } |
| |
| // Given a PerfRecordSample struct, write it via the correct output format |
| // (per https://man7.org/linux/man-pages/man2/perf_event_open.2.html) to the VMO. |
| // We don't currently support all the sample_types listed in the docs. |
| // |
| // Example, assuming sample_type = IP + TID + CALLCHAIN: |
| // Input: |
| //   PerfRecordSample { pid: 5, tid: 10, ips: [111, 222, 333] } |
| // Output layout, starting at `offset` bytes into the VMO (values in native endian): |
| //   perf_event_header { type: 9 (u32), misc: 1 (u16), size: 8 (u16) }   8 bytes |
| //   ip     = 111 (u64)                                                  8 bytes |
| //   pid    = 5   (u32), tid = 10 (u32)                                  8 bytes |
| //   nr     = 3   (u64)                                                  8 bytes |
| //   ips[3] = 111, 222, 333 (u64 each)                                  24 bytes |
| // |
| // Returns the number of bytes written (header + sample). In the above case, |
| // 8 + 48 = 56. This value is used to increment the global write offset. |
| fn write_record_to_vmo( |
| perf_record_sample: PerfRecordSample, |
| perf_data_vmo: &zx::Vmo, |
| _data_head_pointer: &AtomicPtr<u64>, |
| sample_type: u64, |
| sample_id: u64, |
| sample_period: u64, |
| offset: u64, |
| ) -> u64 { |
| // Write header. |
| track_stub!( |
| TODO("https://fxbug.dev/432501467"), |
| "[perf_event_open] determines whether the record is KERNEL or USER" |
| ); |
| let perf_event_header = perf_event_header { |
| type_: perf_event_type_PERF_RECORD_SAMPLE, |
| misc: PERF_RECORD_MISC_KERNEL as u16, |
| size: PERF_EVENT_HEADER_SIZE, |
| }; |
| |
| match perf_data_vmo.write(perf_event_header.as_bytes(), offset) { |
| Ok(_) => (), |
| Err(e) => log_warn!("Failed to write perf_event_header: {}", e), |
| } |
| |
| // Write sample. |
| let mut sample = Vec::<u8>::new(); |
| // sample_id |
| if (sample_type & perf_event_sample_format_PERF_SAMPLE_IDENTIFIER as u64) != 0 { |
| sample.extend(sample_id.to_ne_bytes()); |
| } |
| // ip |
| if (sample_type & perf_event_sample_format_PERF_SAMPLE_IP as u64) != 0 { |
| sample.extend(perf_record_sample.ips[0].to_ne_bytes()); |
| } |
| |
| if (sample_type & perf_event_sample_format_PERF_SAMPLE_TID as u64) != 0 { |
| // pid |
| sample.extend(perf_record_sample.pid.expect("missing pid").to_ne_bytes()); |
| // tid |
| sample.extend(perf_record_sample.tid.expect("missing tid").to_ne_bytes()); |
| } |
| |
| // id |
| if (sample_type & perf_event_sample_format_PERF_SAMPLE_ID as u64) != 0 { |
| sample.extend(sample_id.to_ne_bytes()); |
| } |
| |
| // sample period |
| if (sample_type & perf_event_sample_format_PERF_SAMPLE_PERIOD as u64) != 0 { |
| sample.extend(sample_period.to_ne_bytes()); |
| } |
| |
| if (sample_type & perf_event_sample_format_PERF_SAMPLE_CALLCHAIN as u64) != 0 { |
| // nr |
| sample.extend(perf_record_sample.ips.len().to_ne_bytes()); |
| |
| // ips[nr] - list of ips, u64 per ip. |
| for i in perf_record_sample.ips { |
| sample.extend(i.to_ne_bytes()); |
| } |
| } |
| // The remaining data are not defined for now. |
| |
| match perf_data_vmo.write(&sample, offset + (std::mem::size_of::<perf_event_header>() as u64)) { |
| Ok(_) => { |
| let bytes_written: u64 = |
| (std::mem::size_of::<perf_event_header>() + sample.len()) as u64; |
| |
| // TODO(https://fxbug.dev/460203776) implement this better before enabling |
| // any setting of data_head value. |
| // Update data_head because we have now written to the VMO. |
| // Ordering::Release pushes update that this (and, transitively, the sample |
| // too) has updated. |
| // data_head_pointer.fetch_add(bytes_written, Ordering::Release); |
| |
| // Return the total size we wrote (header + sample) so that we can |
| // increment offset counter. |
| return bytes_written; |
| } |
| Err(e) => { |
| log_warn!("Failed to write PerfRecordSample to VMO due to: {}", e); |
| // Failed to write. Don't increment offset counter. |
| return 0; |
| } |
| } |
| } |
| |
| #[derive(Debug, Clone)] |
| struct PerfRecordSample { |
| pid: Option<u32>, |
| tid: Option<u32>, |
| // Instruction pointers (currently this is the address). First one is `ip` param. |
| ips: Vec<u64>, |
| } |
| |
| // Parses a backtrace (bt) to obtain the params for a PerfRecordSample. Example: |
| // |
| // 1234 pid |
| // 5555 tid |
| // {{{bt:0:0x1111:pc}}} {{{bt:frame_number:address:type}}} |
| // {{{bt:1:0x2222:ra}}} |
| // {{{bt:2:0x3333:ra}}} |
| // |
| // Results in: |
| // PerfRecordSample { pid: Some(1234), tid: Some(5555), ips: [0x1111, 0x2222, 0x3333] } |
| fn parse_perf_record_sample_format(backtrace: &str) -> Option<PerfRecordSample> { |
| let mut pid: Option<u32> = None; |
| let mut tid: Option<u32> = None; |
| let mut ips: Vec<u64> = Vec::new(); |
| let mut numbers_found = 0; |
| track_stub!(TODO("https://fxbug.dev/437171287"), "[perf_event_open] handle regex nuances"); |
| let backtrace_regex = |
| Regex::new(r"^\s*\{\{\{bt:\d+:((0x[0-9a-fA-F]+)):(?:pc|ra)\}\}\}\s*$").unwrap(); |
| |
| for line in backtrace.lines() { |
| let trimmed_line = line.trim(); |
| // Try to parse as a raw number (for PID/TID). |
| if numbers_found < 2 { |
| if let Ok(num) = trimmed_line.parse::<u32>() { |
| if numbers_found == 0 { |
| pid = Some(num); |
| } else { |
| tid = Some(num); |
| } |
| numbers_found += 1; |
| continue; |
| } |
| } |
| |
| // Try to parse as a backtrace line. |
| if let Some(parsed_bt) = backtrace_regex.captures(trimmed_line) { |
| let address_str = parsed_bt.get(1).unwrap().as_str(); |
| if let Ok(ip_addr) = u64::from_str_radix(address_str.trim_start_matches("0x"), 16) { |
| ips.push(ip_addr); |
| } |
| } |
| } |
| |
| if pid.is_none() || tid.is_none() || ips.is_empty() { |
| // This data chunk might've been an {{{mmap}}} chunk, and not a {{{bt}}}. |
| log_info!("No ips while getting PerfRecordSample"); |
| None |
| } else { |
| Some(PerfRecordSample { pid, tid, ips }) |
| } |
| } |
| |
| async fn set_up_profiler( |
| sample_period: zx::MonotonicDuration, |
| ) -> Result<(profiler::SessionProxy, fidl::AsyncSocket), Errno> { |
| // Configuration for how we want to sample. |
| let sample = profiler::Sample { |
| callgraph: Some(profiler::CallgraphConfig { |
| strategy: Some(profiler::CallgraphStrategy::FramePointer), |
| ..Default::default() |
| }), |
| ..Default::default() |
| }; |
| |
| let sampling_config = profiler::SamplingConfig { |
| period: Some(sample_period.into_nanos() as u64), |
| timebase: Some(profiler::Counter::PlatformIndependent(profiler::CounterId::Nanoseconds)), |
| sample: Some(sample), |
| ..Default::default() |
| }; |
| |
| let tasks = vec![ |
| // Should return ~300 samples for 100 millis. |
| profiler::Task::SystemWide(profiler::SystemWide {}), |
| ]; |
| let targets = profiler::TargetConfig::Tasks(tasks); |
| let config = profiler::Config { |
| configs: Some(vec![sampling_config]), |
| target: Some(targets), |
| ..Default::default() |
| }; |
| let (client, server) = fidl::Socket::create_stream(); |
| let configure = profiler::SessionConfigureRequest { |
| output: Some(server), |
| config: Some(config), |
| ..Default::default() |
| }; |
| |
| let proxy = connect_to_protocol::<profiler::SessionMarker>() |
| .context("Error connecting to Profiler protocol"); |
| let session_proxy: profiler::SessionProxy = match proxy { |
| Ok(p) => p, |
| Err(e) => return error!(EINVAL, e), |
| }; |
| |
| // Must configure before sampling start(). |
| let config_request = session_proxy.configure(configure).await; |
| match config_request { |
| Ok(_) => Ok((session_proxy, fidl::AsyncSocket::from_socket(client))), |
| Err(e) => return error!(EINVAL, e), |
| } |
| } |
| |
| // Collects samples and puts backtrace in VMO. |
| // - Starts and stops sampling for a duration. |
| // - Reads in the buffer from the socket for that duration in chunks. |
| // - Parses the buffer backtraces into PERF_RECORD_SAMPLE format. |
| // - Writes the PERF_RECORD_SAMPLE into VMO. |
| async fn collect_sample( |
| session_proxy: profiler::SessionProxy, |
| mut client: fidl::AsyncSocket, |
| duration: Duration, |
| perf_data_vmo: &zx::Vmo, |
| data_head_pointer: &AtomicPtr<u64>, |
| sample_type: u64, |
| sample_id: u64, |
| sample_period: u64, |
| vmo_write_offset: u64, |
| ) -> Result<(), Errno> { |
| let start_request = profiler::SessionStartRequest { |
| buffer_results: Some(true), |
| buffer_size_mb: Some(8u64), |
| ..Default::default() |
| }; |
| let _ = session_proxy.start(&start_request).await.expect("Failed to start profiling"); |
| |
| // Hardcode a duration so that samples can be collected. This is currently solely used to |
| // demonstrate that an E2E implementation of sample collection works. |
| track_stub!( |
| TODO("https://fxbug.dev/428974888"), |
| "[perf_event_open] don't hardcode sleep; test/user should decide sample duration" |
| ); |
| let _ = fuchsia_async::Timer::new(duration).await; |
| |
| let stats = session_proxy.stop().await; |
| let samples_collected = match stats { |
| Ok(stats) => stats.samples_collected.unwrap(), |
| Err(e) => return error!(EINVAL, e), |
| }; |
| |
| track_stub!( |
| TODO("https://fxbug.dev/422502681"), |
| "[perf_event_open] symbolize sample output and delete the below log_info" |
| ); |
| log_info!("profiler samples_collected: {:?}", samples_collected); |
| |
| // Peek at the first 8 bytes to determine if it's FXT or text. |
| let mut header = [0; 8]; |
| let mut bytes_read = 0; |
| while bytes_read < 8 { |
| match client.read(&mut header[bytes_read..]).await { |
| Ok(0) => { |
| // Peer closed the socket. This is the normal end of the stream. |
| log_info!("[perf_event_open] Finished reading fxt record from socket."); |
| break; |
| } |
| Ok(n) => bytes_read += n, |
| Err(e) => { |
| log_warn!("[perf_event_open] Error reading from socket: {:?}", e); |
| break; |
| } |
| } |
| } |
| |
| if bytes_read > 0 { |
| if bytes_read == 8 && header == FXT_MAGIC_BYTES { |
| // FXT format. |
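| // Re-attach the 8 peeked magic bytes in front of the socket so the parser |
| // sees the complete FXT stream from the beginning. |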
| let header_cursor = Cursor::new(header); |
| let reader = header_cursor.chain(client); |
| let (mut stream, _task) = SessionParser::new_async(reader); |
| while let Some(record_result) = stream.next().await { |
| match record_result { |
| Ok(TraceRecord::Profiler(ProfilerRecord::Backtrace(backtrace))) => { |
| let ips: Vec<u64> = backtrace.data; |
| let pid = Some(backtrace.process.0 as u32); |
| let tid = Some(backtrace.thread.0 as u32); |
| let perf_record_sample = PerfRecordSample { pid, tid, ips }; |
| write_record_to_vmo( |
| perf_record_sample, |
| perf_data_vmo, |
| data_head_pointer, |
| sample_type, |
| sample_id, |
| sample_period, |
| vmo_write_offset, |
| ); |
| } |
| Ok(_) => { |
| // Ignore other records. |
| } |
| Err(e) => { |
| log_warn!("[perf_event_open] Error parsing FXT: {:?}", e); |
| break; |
| } |
| } |
| } |
| } else { |
| // Text format. |
| // Read chunks of sampling data from socket in this buffer temporarily. We will parse |
| // the data and write it into the output VMO (the one mmap points to). |
| let mut buffer = vec![0; DEFAULT_CHUNK_SIZE]; |
| |
| loop { |
| // Attempt to read data. This awaits until data is available, EOF, or error. |
| // The first 8 bytes of the stream (part of the {{{reset}}} marker) were already |
| // consumed by the peek above. |
| let socket_data = client.read(&mut buffer).await; |
| |
| match socket_data { |
| Ok(0) => { |
| // Peer closed the socket. This is the normal end of the stream. |
| log_info!("[perf_event_open] Finished reading from socket."); |
| break; |
| } |
| Ok(bytes_read) => { |
| // Receive data in format {{{...}}}. |
| let received_data = match std::str::from_utf8(&buffer[..bytes_read]) { |
| Ok(data) => data, |
| Err(e) => return error!(EINVAL, e), |
| }; |
| // Parse data to PerfRecordSample struct. |
| if let Some(perf_record_sample) = |
| parse_perf_record_sample_format(received_data) |
| { |
| write_record_to_vmo( |
| perf_record_sample, |
| perf_data_vmo, |
| data_head_pointer, |
| sample_type, |
| sample_id, |
| sample_period, |
| vmo_write_offset, |
| ); |
| } |
| } |
| Err(e) => { |
| log_warn!("[perf_event_open] Error reading from socket: {:?}", e); |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| let reset_status = session_proxy.reset().await; |
| return match reset_status { |
| Ok(_) => Ok(()), |
| Err(e) => error!(EINVAL, e), |
| }; |
| } |
| |
| // Notifies the sampler thread that it should start/stop sampling, then blocks until |
| // sampling is complete. Once it is, that profiler session is no longer needed and the |
| // sampler sends back a notification so that this function can return |
| // (e.g. so that another profiler session can start). |
| fn ping_receiver( |
| mut ioctl_sender: future_mpsc::Sender<(IoctlOp, sync_mpsc::Sender<()>)>, |
| command: IoctlOp, |
| ) { |
| log_info!("[perf_event_open] Received sampling command: {:?}", command); |
| let (profiling_complete_sender, profiling_complete_receiver) = sync_mpsc::channel::<()>(); |
| match ioctl_sender.try_send((command, profiling_complete_sender)) { |
| Ok(_) => (), |
| Err(e) => { |
| if e.is_full() { |
| log_warn!("[perf_event_open] Failed to send {:?}: Channel full", command); |
| } else if e.is_disconnected() { |
| log_warn!("[perf_event_open] Failed to send {:?}: Receiver disconnected", command); |
| } else { |
| log_warn!("[perf_event_open] Failed to send {:?} due to {:?}", command, e.source()); |
| } |
| } |
| }; |
| // Block on / wait until profiling is complete before returning. |
| // This notifies that the profiler is free to be used for another session. |
| profiling_complete_receiver.recv().unwrap(); |
| } |
| |
| pub fn sys_perf_event_open( |
| locked: &mut Locked<Unlocked>, |
| current_task: &CurrentTask, |
| attr: UserRef<perf_event_attr>, |
| // Note that this is pid in Linux docs. |
| tid: tid_t, |
| cpu: i32, |
| group_fd: FdNumber, |
| _flags: u64, |
| ) -> Result<SyscallResult, Errno> { |
| // So far, the implementation only sets the read_data_format according to the "Reading results" |
| // section of https://man7.org/linux/man-pages/man2/perf_event_open.2.html for a single event. |
| // Other features will be added in the future (see below track_stubs). |
| let perf_event_attrs: perf_event_attr = current_task.read_object(attr)?; |
| |
| if tid == -1 && cpu == -1 { |
| return error!(EINVAL); |
| } |
| |
| let target_task_type = match tid { |
| -1 => TargetTaskType::AllTasks, |
| 0 => TargetTaskType::CurrentTask, |
| _ => { |
| track_stub!(TODO("https://fxbug.dev/409621963"), "[perf_event_open] implement tid > 0"); |
| return error!(ENOSYS); |
| } |
| }; |
| security::check_perf_event_open_access( |
| current_task, |
| target_task_type, |
| &perf_event_attrs, |
| perf_event_attrs.type_.try_into()?, |
| )?; |
| |
| // Channel used to send info between notifier and spawned task thread. |
| // The capacity of 8 is somewhat arbitrary; it leaves headroom for several ioctls |
| // arriving in quick succession. |
| let (sender, mut receiver) = future_mpsc::channel(8); |
| |
| let page_size = zx::system_get_page_size() as u64; |
| let mut perf_event_file = PerfEventFileState::new( |
| perf_event_attrs, |
| 0, |
| perf_event_attrs.disabled(), |
| perf_event_attrs.sample_type, |
| zx::Vmo::create(ESTIMATED_MMAP_BUFFER_SIZE).unwrap(), |
| page_size, // Start with this amount of offset, we can increment as we write. |
| sender, |
| ); |
| |
| let read_format = perf_event_attrs.read_format; |
| |
| if (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_ENABLED as u64) != 0 |
| || (read_format & perf_event_read_format_PERF_FORMAT_TOTAL_TIME_RUNNING as u64) != 0 |
| { |
| // Only keep track of most_recent_enabled_time if we are currently in ENABLED state, |
| // as otherwise this param shouldn't be used for calculating anything. |
| if perf_event_file.disabled == 0 { |
| perf_event_file.most_recent_enabled_time = |
| zx::MonotonicInstant::get().into_nanos() as u64; |
| } |
| // Initialize this to 0 as we will need to return a time duration later during read(). |
| perf_event_file.total_time_running = 0; |
| } |
| |
| let event_id = READ_FORMAT_ID_GENERATOR.fetch_add(1, Ordering::Relaxed); |
| perf_event_file.rf_id = event_id; |
| |
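| // A group_fd of -1 means this event is its own group leader and uses its own ID as |
| // the sample id. Otherwise, reuse the group leader's format ID for this event's samples. |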
| if group_fd.raw() == -1 { |
| perf_event_file.sample_id = event_id; |
| } else { |
| let group_file = current_task.files.get(group_fd)?; |
| let group_file_object_id = group_file.id; |
| let perf_state = get_perf_state(¤t_task.kernel); |
| let events = perf_state.format_id_lookup_table.lock(); |
| if let Some(rf_id) = events.get(&group_file_object_id) { |
| perf_event_file.sample_id = *rf_id; |
| } else { |
| return error!(EINVAL); |
| } |
| } |
| |
| if (read_format & perf_event_read_format_PERF_FORMAT_GROUP as u64) != 0 { |
| track_stub!( |
| TODO("https://fxbug.dev/402238049"), |
| "[perf_event_open] implement read_format group" |
| ); |
| return error!(ENOSYS); |
| } |
| if (read_format & perf_event_read_format_PERF_FORMAT_LOST as u64) != 0 { |
| track_stub!( |
| TODO("https://fxbug.dev/402260383"), |
| "[perf_event_open] implement read_format lost" |
| ); |
| } |
| |
| // Set up notifier for handling ioctl calls to enable/disable sampling. |
| let mut vmo_handle_copy = |
| perf_event_file.perf_data_vmo.as_handle_ref().duplicate(zx::Rights::SAME_RIGHTS); |
| |
| // SAFETY: sample_period is a u64 field in a union with u64 sample_freq. |
| // This is always sound regardless of the union's tag. |
| let sample_period_in_ticks = unsafe { perf_event_file.attr.__bindgen_anon_1.sample_period }; |
| // For PERF_COUNT_SW_CPU_CLOCK, one tick of the sample period corresponds to |
| // 1 nanosecond. Convert the period into a zx::MonotonicDuration. |
| let zx_sample_period = zx::MonotonicDuration::from_nanos(sample_period_in_ticks as i64); |
| |
| let data_head_pointer = Arc::new(AtomicPtr::new(std::ptr::null_mut::<u64>())); |
| // Pass a clone into the spawned sampling thread. |
| let cloned_data_head_pointer = Arc::clone(&data_head_pointer); |
| |
| let closure = async move |_: LockedAndTask<'_>| { |
| // This loop will wait for messages from the sender. |
| while let Some((command, profiling_complete_receiver)) = receiver.next().await { |
| match command { |
| IoctlOp::Enable => { |
| match set_up_profiler(zx_sample_period).await { |
| Ok((session_proxy, client)) => { |
| track_stub!( |
| TODO("https://fxbug.dev/422502681"), |
| "[perf_event_open] don't hardcode profiling duration" |
| ); |
| |
| let handle = vmo_handle_copy |
| .as_mut() |
| .expect("Failed to get VMO handle") |
| .as_handle_ref() |
| .duplicate(zx::Rights::SAME_RIGHTS) |
| .unwrap(); |
| |
| let _ = collect_sample( |
| session_proxy, |
| client, |
| Duration::from_millis(100), |
| &zx::Vmo::from(handle), |
| &*cloned_data_head_pointer, |
| perf_event_file.sample_type, |
| perf_event_file.sample_id, |
| sample_period_in_ticks, |
| perf_event_file.vmo_write_offset, |
| ) |
| .await; |
| // Send notification that profiler session is over. |
| let _ = profiling_complete_receiver.send(()); |
| } |
| Err(e) => { |
| log_warn!("Failed to profile: {}", e); |
| } |
| }; |
| } |
| } |
| } |
| () |
| }; |
| let req = SpawnRequestBuilder::new() |
| .with_debug_name("perf-event-sampler") |
| .with_async_closure(closure) |
| .build(); |
| current_task.kernel().kthreads.spawner().spawn_from_request(req); |
| |
| let file = Box::new(PerfEventFile { |
| _tid: tid, |
| _cpu: cpu, |
| perf_event_file: RwLock::new(perf_event_file), |
| security_state: security::perf_event_alloc(current_task), |
| data_head_pointer, |
| }); |
| // TODO: https://fxbug.dev/404739824 - Confirm whether to handle this as a "private" node. |
| let file_handle = |
| Anon::new_private_file(locked, current_task, file, OpenFlags::RDWR, "[perf_event]"); |
| let file_object_id = file_handle.id; |
| let file_descriptor: Result<FdNumber, Errno> = |
| current_task.add_file(locked, file_handle, FdFlags::empty()); |
| |
| match file_descriptor { |
| Ok(fd) => { |
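| // If this event is a group leader (group_fd == -1), record its format ID in the |
| // lookup table so that future group members can find it. |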
| if group_fd.raw() == -1 { |
| let perf_state = get_perf_state(¤t_task.kernel); |
| let mut events = perf_state.format_id_lookup_table.lock(); |
| events.insert(file_object_id, event_id); |
| } |
| Ok(fd.into()) |
| } |
| Err(_) => { |
| track_stub!( |
| TODO("https://fxbug.dev/402453955"), |
| "[perf_event_open] implement remaining error handling" |
| ); |
| error!(EMFILE) |
| } |
| } |
| } |
| // Syscalls for arch32 usage |
| #[cfg(target_arch = "aarch64")] |
| mod arch32 { |
| pub use super::sys_perf_event_open as sys_arch32_perf_event_open; |
| } |
| |
| #[cfg(target_arch = "aarch64")] |
| pub use arch32::*; |
| |
| use crate::mm::memory::MemoryObject; |
| use crate::mm::{MemoryAccessorExt, ProtectionFlags}; |
| use crate::task::CurrentTask; |
| use crate::vfs::{ |
| Anon, FdFlags, FdNumber, FileObject, FileObjectId, FileObjectState, FileOps, InputBuffer, |
| OutputBuffer, |
| }; |
| use crate::{fileops_impl_nonseekable, fileops_impl_noop_sync}; |