blob: 7cc9e73ac3f63e181dbb67022106fcff7fd3588d [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#pragma once
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#ifndef __cplusplus
#include <stdalign.h> // for alignof
#endif
#include <zircon/types.h>
#ifdef __Fuchsia__
#include <zircon/device/ioctl.h>
#include <zircon/device/ioctl-wrapper.h>
#endif
__BEGIN_CDECLS
// API version number (useful when doing incompatible upgrades).
// This is the value expected in |cpuperf_properties_t.api_version|.
#define CPUPERF_API_VERSION 3
// Buffer format version.
// This is the value expected in |cpuperf_buffer_header_t.version|.
#define CPUPERF_BUFFER_VERSION 0
// The maximum number of events we support simultaneously.
// This is also the length of the per-event arrays in |cpuperf_config_t|.
// Typically the h/w supports fewer than this, e.g., 7 or so.
// TODO(dje): Have the device driver multiplex the events when more are
// asked for than the h/w supports.
#define CPUPERF_MAX_EVENTS 32u
// Header for each data buffer.
typedef struct {
// Format version number (CPUPERF_BUFFER_VERSION).
uint16_t version;
// The architecture that generated the data.
// One of the CPUPERF_BUFFER_ARCH_* values below.
uint16_t arch;
#define CPUPERF_BUFFER_ARCH_UNKNOWN 0u
#define CPUPERF_BUFFER_ARCH_X86_64 1u
#define CPUPERF_BUFFER_ARCH_ARM64 2u
// Bitmask of the CPUPERF_BUFFER_FLAG_* values below.
uint32_t flags;
// The buffer filled, and records were dropped.
#define CPUPERF_BUFFER_FLAG_FULL (1u << 0)
// zx_ticks_per_second in the kernel.
// This is the conversion factor for the |time| value of TIME records.
zx_ticks_t ticks_per_second;
// Offset into the buffer of the end of the data.
// NOTE(review): presumably a byte offset measured from the start of the
// buffer, i.e., including this header - confirm against the driver.
uint64_t capture_end;
} cpuperf_buffer_header_t;
// The various types of emitted records.
// One of these values is stored in |cpuperf_record_header_t.type|, and it
// selects which of the record structs in this file describes the record.
typedef enum {
// Reserved, unused.
CPUPERF_RECORD_RESERVED = 0,
// The current time, in a |cpuperf_time_record_t|, to be applied to all
// subsequent records until the next TIME record.
CPUPERF_RECORD_TIME = 1,
// The record is a |cpuperf_tick_record_t|.
// TODO(dje): Rename, the name is confusing with TIME records.
CPUPERF_RECORD_TICK = 2,
// The record is a |cpuperf_count_record_t|.
CPUPERF_RECORD_COUNT = 3,
// The record is a |cpuperf_value_record_t|.
CPUPERF_RECORD_VALUE = 4,
// The record is a |cpuperf_pc_record_t|.
CPUPERF_RECORD_PC = 5,
// The record is a |cpuperf_last_branch_record_t|.
// Unlike the other records this one is variable-length.
CPUPERF_RECORD_LAST_BRANCH = 6,
} cpuperf_record_type_t;
// Event ids.
// Trace buffer space is expensive, so records are kept small. Having more
// than 64K different events for any one arch is unlikely, so an event id
// is 16 bits.
// To help each arch manage the plethora of different events, the event id
// is split into two parts:
//   bits 15:10 - 6 bit event group
//   bits  9:0  - 10 bit event within that group
// An event id of zero is defined to be unused. To simplify things the whole
// set of |group| == 0 is taken as reserved.
typedef uint16_t cpuperf_event_id_t;
// The largest value of each part of an event id.
// These double as the masks used to extract the parts.
#define CPUPERF_MAX_GROUP 0x3f
#define CPUPERF_MAX_EVENT 0x3ff
// The id that means "no event".
#define CPUPERF_EVENT_ID_NONE 0
// Build an event id from its two parts.
// The arguments are not range checked or masked.
#define CPUPERF_MAKE_EVENT_ID(group, event) ((event) | ((group) << 10))
// Extract the two parts of an event id.
#define CPUPERF_EVENT_ID_EVENT(id) (CPUPERF_MAX_EVENT & (id))
#define CPUPERF_EVENT_ID_GROUP(id) (CPUPERF_MAX_GROUP & ((id) >> 10))
// Possible values for the |group| field of |cpuperf_event_id_t|,
// i.e., the part extracted by CPUPERF_EVENT_ID_GROUP.
// TODO(dje): Reorganize these into something like
// {arch,model} -x- {fixed,programmable}, which these currently are,
// it's just not immediately apparent.
typedef enum {
// Every id in this group is reserved; it contains CPUPERF_EVENT_ID_NONE.
CPUPERF_GROUP_RESERVED = 0,
CPUPERF_GROUP_ARCH = 1,
CPUPERF_GROUP_FIXED = 2,
CPUPERF_GROUP_MODEL = 3,
CPUPERF_GROUP_MISC = 4,
} cpuperf_group_type_t;
// Attribute applied to every record struct in this file.
// The typical record is a tick record which is 4 + 8 bytes.
// NOTE(review): |cpuperf_tick_record_t| as defined in this file is
// header-only (4 bytes); the 4 + 8 layout is that of
// |cpuperf_time_record_t|. Confirm which one this comment means.
// Aligning records to 8-byte boundaries would waste a lot of space,
// so currently we align everything to 4-byte boundaries.
// TODO(dje): Collect data to see what this saves. Keep it?
#define CPUPERF_ALIGN_RECORD __PACKED __ALIGNED(4)
// Trace record header.
// Every record struct defined in this file begins with this header.
// Note: Avoid holes in all trace records.
typedef struct {
// One of CPUPERF_RECORD_* (see |cpuperf_record_type_t|).
uint8_t type;
// A possible usage of this field is to add some type-specific flags.
uint8_t reserved_flags;
// The event the record is for.
// If there is none then use CPUPERF_EVENT_ID_NONE.
cpuperf_event_id_t event;
} CPUPERF_ALIGN_RECORD cpuperf_record_header_t;
// Verify our alignment assumptions: 1 + 1 + 2 bytes and no padding.
static_assert(sizeof(cpuperf_record_header_t) == 4,
"record header not 4 bytes");
// Record the current time of the trace (CPUPERF_RECORD_TIME).
// If the event id is non-zero (!NONE) then it must be for a counting event
// and then this record is also a "tick" record indicating the counter has
// reached its sample rate. The counter resets to zero after this record.
typedef struct {
cpuperf_record_header_t header;
// The value is architecture and possibly platform specific.
// The |ticks_per_second| field in the buffer header provides the
// conversion factor from this value to ticks per second.
// For x86 this is the TSC value.
zx_ticks_t time;
} CPUPERF_ALIGN_RECORD cpuperf_time_record_t;
// Verify our alignment assumptions.
// We don't need to do this for every record, but doing it for this one
// verifies CPUPERF_ALIGN_RECORD is working: the 4-byte header must be
// followed directly by the 8-byte |time| with no padding in between, and
// the struct as a whole must only require 4-byte alignment.
static_assert(sizeof(cpuperf_time_record_t) == 12,
"time record not 12 bytes");
static_assert(alignof(cpuperf_time_record_t) == 4,
"time record not 4-byte aligned");
// Record that a counting event reached its sample rate
// (CPUPERF_RECORD_TICK).
// It is expected that this record follows a TIME record.
// The counter resets to zero after this record.
// This does not include the event's value in order to keep the size small:
// the value is the sample rate which is known from the configuration.
typedef struct {
// The record consists of the header only; |header.event| identifies the
// event that ticked.
cpuperf_record_header_t header;
} CPUPERF_ALIGN_RECORD cpuperf_tick_record_t;
// Record the value of a counter at a particular time
// (CPUPERF_RECORD_COUNT).
// It is expected that this record follows a TIME record.
// The counter resets to zero after this record.
// This is used when another timebase is driving the sampling, e.g., another
// counter. Otherwise the "tick" record is generally used as it takes less
// space.
typedef struct {
cpuperf_record_header_t header;
// The counter's value when the record was emitted.
uint64_t count;
} CPUPERF_ALIGN_RECORD cpuperf_count_record_t;
// Record the value of an event (CPUPERF_RECORD_VALUE).
// It is expected that this record follows a TIME record.
// This value is not a count and cannot be used to produce a "rate"
// (e.g., some value per second).
typedef struct {
cpuperf_record_header_t header;
// The event's value when the record was emitted.
uint64_t value;
} CPUPERF_ALIGN_RECORD cpuperf_value_record_t;
// Record the aspace+pc values (CPUPERF_RECORD_PC).
// If the event id is not NONE, then this record also indicates that the
// event reached its tick point, and is used instead of a tick record. This
// record is overloaded to save space in trace buffer output.
// It is expected that this record follows a TIME record.
// This is used when doing gprof-like profiling.
// The event's value is not included here as this is typically used when
// the counter is its own trigger: the value is known from the sample rate.
typedef struct {
cpuperf_record_header_t header;
// The aspace id at the time data was collected.
// The meaning of the value is architecture-specific.
// In the case of x86 this is the cr3 value.
uint64_t aspace;
// The pc value at the time data was collected.
uint64_t pc;
} CPUPERF_ALIGN_RECORD cpuperf_pc_record_t;
// Entry in a last branch record.
// This is one element of |cpuperf_last_branch_record_t.branches|; it is not
// a record by itself and so has no record header.
typedef struct {
// NOTE(review): presumably the source and destination addresses of the
// branch - confirm against the driver.
uint64_t from;
uint64_t to;
// Various bits of info about this branch. See CPUPERF_LAST_BRANCH_INFO_*.
uint64_t info;
} CPUPERF_ALIGN_RECORD cpuperf_last_branch_t;
// Utility to compute masks for fields in this file.
// The result is a 64-bit mask of |len| consecutive one bits, the lowest of
// which is bit |shift|.
// Note the argument order: the length comes first, then the shift.
// |len| must be less than 64, the shift by |len| is undefined otherwise.
#define CPUPERF_GEN_MASK64(len, shift) (((1ULL << (len)) - 1) << (shift))
// Fields in |cpuperf_last_branch_t.info|.
// Number of cycles since the last branch, or zero if unknown.
// The unit of measurement is architecture-specific.
// This field is bits 15:0, so the mask is 0xffff.
#define CPUPERF_LAST_BRANCH_INFO_CYCLES_SHIFT (0u)
#define CPUPERF_LAST_BRANCH_INFO_CYCLES_LEN (16u)
#define CPUPERF_LAST_BRANCH_INFO_CYCLES_MASK \
    CPUPERF_GEN_MASK64(CPUPERF_LAST_BRANCH_INFO_CYCLES_LEN, \
                       CPUPERF_LAST_BRANCH_INFO_CYCLES_SHIFT)
// Non-zero if branch was mispredicted.
// Whether this bit is available is architecture-specific.
// This field is bit 16, so the mask is 0x10000.
#define CPUPERF_LAST_BRANCH_INFO_MISPRED_SHIFT (16u)
#define CPUPERF_LAST_BRANCH_INFO_MISPRED_LEN (1u)
#define CPUPERF_LAST_BRANCH_INFO_MISPRED_MASK \
    CPUPERF_GEN_MASK64(CPUPERF_LAST_BRANCH_INFO_MISPRED_LEN, \
                       CPUPERF_LAST_BRANCH_INFO_MISPRED_SHIFT)
// Record a set of last branches executed (CPUPERF_RECORD_LAST_BRANCH).
// It is expected that this record follows a TIME record.
// Note that this record is variable-length: only the first |num_branches|
// entries of |branches| are emitted. Use CPUPERF_LAST_BRANCH_RECORD_SIZE
// to get the size of an emitted record.
// This is used when doing gprof-like profiling.
typedef struct {
cpuperf_record_header_t header;
// Number of entries in |branches|.
uint32_t num_branches;
// The aspace id at the time data was collected. This is not necessarily
// the aspace id of each branch. S/W will need to determine from the
// branch addresses how far back aspace is valid.
// The meaning of the value is architecture-specific.
// In the case of x86 this is the cr3 value.
uint64_t aspace;
// The set of last branches, in reverse chronological order:
// The first entry is the most recent one.
// Note that the emitted record may be smaller than this, as indicated by
// |num_branches|.
// Reverse order seems most useful.
// 32 is the max value for Skylake
#define CPUPERF_MAX_NUM_LAST_BRANCH (32u)
cpuperf_last_branch_t branches[CPUPERF_MAX_NUM_LAST_BRANCH];
} CPUPERF_ALIGN_RECORD cpuperf_last_branch_record_t;
// Return the size of valid last branch record |lbr|.
// |lbr| is a pointer to a |cpuperf_last_branch_record_t|; only its
// |num_branches| field is read.
// The size is that of the full struct less the unused trailing entries of
// |branches|.
// |num_branches| must not exceed CPUPERF_MAX_NUM_LAST_BRANCH, otherwise the
// unsigned subtraction wraps and the result is meaningless.
#define CPUPERF_LAST_BRANCH_RECORD_SIZE(lbr) \
(sizeof(cpuperf_last_branch_record_t) - \
(CPUPERF_MAX_NUM_LAST_BRANCH - (lbr)->num_branches) * sizeof((lbr)->branches[0]))
// The properties of this system.
// This is the output of IOCTL_CPUPERF_GET_PROPERTIES.
typedef struct {
// S/W API version = CPUPERF_API_VERSION.
uint16_t api_version;
// The H/W Performance Monitor version.
uint16_t pm_version;
// The number of fixed events.
uint16_t num_fixed_events;
// The number of programmable events.
uint16_t num_programmable_events;
// For fixed events that are counters, the width in bits.
// If different counters have different widths, the choice is architecture
// specific.
uint16_t fixed_counter_width;
// For programmable events that are counters, the width in bits.
// If different counters have different widths, the choice is architecture
// specific.
uint16_t programmable_counter_width;
// Various flags: a bitmask of CPUPERF_PROPERTY_FLAG_* values.
uint32_t flags;
// NOTE(review): presumably set when last branch records
// (CPUPERF_CONFIG_FLAG_LAST_BRANCH) are supported - confirm.
#define CPUPERF_PROPERTY_FLAG_HAS_LAST_BRANCH (1u << 0)
} cpuperf_properties_t;
// The type of the |rate| field of cpuperf_config_t.
typedef uint32_t cpuperf_rate_t;
// Passed to STAGE_CONFIG to select the data to be collected.
// The three arrays are parallel: entry |i| of |rate| and |flags| applies
// to |events[i]|.
// Events must be consecutively allocated from the front with no holes.
// A value of CPUPERF_EVENT_ID_NONE in |events| marks the end.
typedef struct {
// Events to collect data for.
// The values are architecture specific ids: cpuperf_<arch>_event_id_t
// Each event may appear at most once.
// |events[0]| is special: It is used as the timebase when any other
// event has CPUPERF_CONFIG_FLAG_TIMEBASE0 set.
cpuperf_event_id_t events[CPUPERF_MAX_EVENTS];
// Sampling rate for each event in |events|.
// If zero then do simple counting (collect a tally of the count and
// report at the end). Otherwise (non-zero) then when the event gets
// this many hits data is collected (e.g., pc, time).
// The value can be non-zero only for counting based events.
// This value is ignored if CPUPERF_CONFIG_FLAG_TIMEBASE0 is set.
// Setting CPUPERF_CONFIG_FLAG_TIMEBASE0 in |flags[0]| is redundant but ok.
cpuperf_rate_t rate[CPUPERF_MAX_EVENTS];
// Flags for each event in |events|: a bitmask of CPUPERF_CONFIG_FLAG_*.
// TODO(dje): hypervisor, host/guest os/user
uint32_t flags[CPUPERF_MAX_EVENTS];
// Valid bits in |flags|: the union of the five flags defined below.
#define CPUPERF_CONFIG_FLAG_MASK 0x1f
// Collect os data.
#define CPUPERF_CONFIG_FLAG_OS (1u << 0)
// Collect userspace data.
#define CPUPERF_CONFIG_FLAG_USER (1u << 1)
// Collect aspace+pc values.
#define CPUPERF_CONFIG_FLAG_PC (1u << 2)
// If set then use |events[0]| as the timebase: data for this event is
// collected when data for |events[0]| is collected, and the record emitted
// for this event is either a CPUPERF_RECORD_COUNT or CPUPERF_RECORD_VALUE
// record (depending on what the event is).
// It is an error to have this bit set for an event and have rate[0] be zero.
#define CPUPERF_CONFIG_FLAG_TIMEBASE0 (1u << 3)
// Collect the available set of last branches.
// Branch data is emitted as CPUPERF_RECORD_LAST_BRANCH records.
// This is only available when the underlying system supports it.
// TODO(dje): Provide knob to specify how many branches.
#define CPUPERF_CONFIG_FLAG_LAST_BRANCH (1u << 4)
} cpuperf_config_t;
///////////////////////////////////////////////////////////////////////////////
#ifdef __Fuchsia__
// ioctls
// All of them are in the IOCTL_FAMILY_CPUPERF family; each is given a
// distinct function number, starting at zero.

// Fetch the cpu trace properties of the system.
// Function number: 0
// Output: cpuperf_properties_t
// Declares the wrapper |ioctl_cpuperf_get_properties|.
#define IOCTL_CPUPERF_GET_PROPERTIES \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 0)
IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_properties,
IOCTL_CPUPERF_GET_PROPERTIES,
cpuperf_properties_t);
// The allocation configuration for a data collection run.
// This is generally the first call to allocate resources for a trace,
// "trace" is used generically here: == "data collection run".
// Used as the input of IOCTL_CPUPERF_ALLOC_TRACE and the output of
// IOCTL_CPUPERF_GET_ALLOC.
typedef struct {
// The number of trace buffers.
// must be #cpus for now
uint32_t num_buffers;
// The size of each buffer.
// each cpu gets same buffer size
// NOTE(review): presumably in bytes - confirm against the driver.
uint32_t buffer_size;
} ioctl_cpuperf_alloc_t;
// Create a trace, allocating the needed trace buffers and other resources.
// "other resources" is basically a catch-all for other things that will
// be needed. This does not include reserving the events, that is done later
// by STAGE_CONFIG.
// Function number: 1
// Input: ioctl_cpuperf_alloc_t
// Declares the wrapper |ioctl_cpuperf_alloc_trace|.
#define IOCTL_CPUPERF_ALLOC_TRACE \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 1)
IOCTL_WRAPPER_IN(ioctl_cpuperf_alloc_trace, IOCTL_CPUPERF_ALLOC_TRACE,
ioctl_cpuperf_alloc_t);
// Free all trace buffers and any other resources allocated for the trace.
// This is also done when the fd is closed (as well as stopping the trace).
// Function number: 2
// Takes no input and produces no output.
// Declares the wrapper |ioctl_cpuperf_free_trace|.
#define IOCTL_CPUPERF_FREE_TRACE \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 2)
IOCTL_WRAPPER(ioctl_cpuperf_free_trace, IOCTL_CPUPERF_FREE_TRACE);
// Return trace allocation config.
// Function number: 3
// Output: ioctl_cpuperf_alloc_t
// Declares the wrapper |ioctl_cpuperf_get_alloc|.
#define IOCTL_CPUPERF_GET_ALLOC \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 3)
IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_alloc, IOCTL_CPUPERF_GET_ALLOC,
ioctl_cpuperf_alloc_t);
// Stage performance monitor specification for a cpu.
// Must be called with data collection off and after ALLOC.
// Note: This doesn't actually configure the h/w, this just stages
// the values for subsequent use by START.
// Function number: 4
// Input: cpuperf_config_t
// Declares the wrapper |ioctl_cpuperf_stage_config|.
#define IOCTL_CPUPERF_STAGE_CONFIG \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 4)
IOCTL_WRAPPER_IN(ioctl_cpuperf_stage_config, IOCTL_CPUPERF_STAGE_CONFIG,
cpuperf_config_t);
// Fetch performance monitor specification for a cpu.
// Must be called with data collection off and after STAGE_CONFIG.
// Function number: 5
// Output: cpuperf_config_t
// Declares the wrapper |ioctl_cpuperf_get_config|.
#define IOCTL_CPUPERF_GET_CONFIG \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 5)
IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_config, IOCTL_CPUPERF_GET_CONFIG,
cpuperf_config_t);
// The input of IOCTL_CPUPERF_GET_BUFFER_HANDLE.
typedef struct {
// Which trace buffer to return the handle of:
// 0, 1, 2, ..., |num_buffers|-1.
uint32_t descriptor;
} ioctl_cpuperf_buffer_handle_req_t;
// Return a handle of a trace buffer.
// Function number: 6
// This is the only ioctl here of kind IOCTL_KIND_GET_HANDLE, the rest are
// IOCTL_KIND_DEFAULT.
// Input: trace buffer descriptor (0, 1, 2, ..., |num_buffers|-1)
// Output: handle of the vmo of the buffer
// Declares the wrapper |ioctl_cpuperf_get_buffer_handle|.
#define IOCTL_CPUPERF_GET_BUFFER_HANDLE \
IOCTL(IOCTL_KIND_GET_HANDLE, IOCTL_FAMILY_CPUPERF, 6)
IOCTL_WRAPPER_INOUT(ioctl_cpuperf_get_buffer_handle,
IOCTL_CPUPERF_GET_BUFFER_HANDLE,
ioctl_cpuperf_buffer_handle_req_t, zx_handle_t);
// Turn on data collection.
// Must be called after ALLOC+STAGE_CONFIG and with data collection off.
// Function number: 7
// Takes no input and produces no output.
// Declares the wrapper |ioctl_cpuperf_start|.
#define IOCTL_CPUPERF_START \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 7)
IOCTL_WRAPPER(ioctl_cpuperf_start, IOCTL_CPUPERF_START);
// Turn off data collection.
// May be called any time after ALLOC has been called and before FREE.
// May be called multiple times.
// Function number: 8
// Takes no input and produces no output.
// Declares the wrapper |ioctl_cpuperf_stop|.
#define IOCTL_CPUPERF_STOP \
IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 8)
IOCTL_WRAPPER(ioctl_cpuperf_stop, IOCTL_CPUPERF_STOP);
#endif // __Fuchsia__
__END_CDECLS