// system/ulib/zircon-internal/include/lib/zircon-internal/device/cpu-trace/cpu-perf.h - zircon/ - Git at Google

 // Copyright 2017 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #pragma once

 #include <assert.h>
 #include <stddef.h>
 #include <stdint.h>
 #ifndef __cplusplus
 #include <stdalign.h>  // for alignof
 #endif

 #include <zircon/types.h>

 #ifdef __Fuchsia__
 #include <zircon/device/ioctl.h>
 #include <zircon/device/ioctl-wrapper.h>
 #endif

 __BEGIN_CDECLS

 // API version number (useful when doing incompatible upgrades)
 #define CPUPERF_API_VERSION 3

 // Buffer format version
 #define CPUPERF_BUFFER_VERSION 0

 // The maximum number of events we support simultaneously.
 // Typically the h/w supports less than this, e.g., 7 or so.
 // TODO(dje): Have the device driver multiplex the events when more is
 // asked for than the h/w supports.
 #define CPUPERF_MAX_EVENTS 32u

 // Header for each data buffer.
 // It records the buffer format version, the architecture that produced the
 // data, status flags, the tick rate, and where the captured data ends.
 typedef struct {
     // Format version number (CPUPERF_BUFFER_VERSION).
     uint16_t version;

     // The architecture that generated the data.
     // One of the CPUPERF_BUFFER_ARCH_* values below.
     uint16_t arch;
 #define CPUPERF_BUFFER_ARCH_UNKNOWN 0u
 #define CPUPERF_BUFFER_ARCH_X86_64  1u
 #define CPUPERF_BUFFER_ARCH_ARM64   2u

     // Bitwise OR of CPUPERF_BUFFER_FLAG_* values.
     uint32_t flags;
 // The buffer filled, and records were dropped.
 #define CPUPERF_BUFFER_FLAG_FULL (1u << 0)

     // zx_ticks_per_second in the kernel
     zx_ticks_t ticks_per_second;

     // Offset into the buffer of the end of the data.
     uint64_t capture_end;
 } cpuperf_buffer_header_t;

 // The various types of emitted records.
 // The value is stored in the |type| field of |cpuperf_record_header_t| and
 // tells the reader which record struct follows the header.
 typedef enum {
   // Reserved, unused.
   CPUPERF_RECORD_RESERVED = 0,
   // The current time, in a |cpuperf_time_record_t|, to be applied to all
   // subsequent records until the next TIME record.
   CPUPERF_RECORD_TIME = 1,
   // The record is a |cpuperf_tick_record_t|.
   // TODO(dje): Rename, the name is confusing with TIME records.
   CPUPERF_RECORD_TICK = 2,
   // The record is a |cpuperf_count_record_t|.
   CPUPERF_RECORD_COUNT = 3,
   // The record is a |cpuperf_value_record_t|.
   CPUPERF_RECORD_VALUE = 4,
   // The record is a |cpuperf_pc_record_t|.
   CPUPERF_RECORD_PC = 5,
   // The record is a |cpuperf_last_branch_record_t|.
   CPUPERF_RECORD_LAST_BRANCH = 6,
 } cpuperf_record_type_t;

 // Trace buffer space is expensive, so records are kept small.
 // It is unlikely that any one arch has more than 64K different events,
 // so an event id is 16 bits wide.
 // To help each arch manage the plethora of different events, the event id
 // is split into two parts: a 6 bit event group in the upper bits, and a
 // 10 bit event number within that group in the lower bits.
 // An event id of zero is defined to be unused. To simplify things the
 // whole set of |group| == 0 is taken as reserved.
 typedef uint16_t cpuperf_event_id_t;

 // Largest value representable in the group / event part of an id.
 #define CPUPERF_MAX_GROUP 0x3f
 #define CPUPERF_MAX_EVENT 0x3ff

 // The "no event" id.
 #define CPUPERF_EVENT_ID_NONE 0

 // Compose an id from its two parts. The arguments are not range checked.
 #define CPUPERF_MAKE_EVENT_ID(group, event) (((group) << 10) | (event))

 // Extract the group / event part of an id.
 #define CPUPERF_EVENT_ID_GROUP(id) (((id) >> 10) & CPUPERF_MAX_GROUP)
 #define CPUPERF_EVENT_ID_EVENT(id) ((id) & CPUPERF_MAX_EVENT)

 // Possible values for the |group| field of |cpuperf_event_id_t|.
 // Values must fit in the 6 bit group part of an id (CPUPERF_MAX_GROUP).
 // TODO(dje): Reorganize these into something like
 // {arch,model} -x- {fixed,programmable}, which these currently are,
 // it's just not immediately apparent.
 typedef enum {
     // Group zero is reserved: event ids in it are treated as unused.
     CPUPERF_GROUP_RESERVED = 0,
     CPUPERF_GROUP_ARCH = 1,
     CPUPERF_GROUP_FIXED = 2,
     CPUPERF_GROUP_MODEL = 3,
     CPUPERF_GROUP_MISC = 4,
 } cpuperf_group_type_t;

 // A typical record is a 4-byte header plus an 8-byte payload
 // (e.g., |cpuperf_time_record_t| is 4 + 8 = 12 bytes).
 // Aligning records to 8-byte boundaries would waste a lot of space,
 // so currently we align everything to 4-byte boundaries.
 // This attribute list is applied to every record struct in this file.
 // TODO(dje): Collect data to see what this saves. Keep it?
 #define CPUPERF_ALIGN_RECORD __PACKED __ALIGNED(4)

 // Trace record header.
 // Every record struct in this file begins with this header.
 // Note: Avoid holes in all trace records.
 typedef struct {
     // One of CPUPERF_RECORD_*.
     uint8_t type;

     // A possible usage of this field is to add some type-specific flags.
     uint8_t reserved_flags;

     // The event the record is for.
     // If there is none then use CPUPERF_EVENT_ID_NONE.
     cpuperf_event_id_t event;
 } CPUPERF_ALIGN_RECORD cpuperf_record_header_t;

 // Verify our alignment assumptions.
 // 1 (type) + 1 (reserved_flags) + 2 (event) bytes, with no padding.
 static_assert(sizeof(cpuperf_record_header_t) == 4,
               "record header not 4 bytes");

 // Record the current time of the trace.
 // The time applies to all subsequent records until the next TIME record.
 // If the event id is non-zero (!NONE) then it must be for a counting event
 // and then this record is also a "tick" record indicating the counter has
 // reached its sample rate. The counter resets to zero after this record.
 typedef struct {
     cpuperf_record_header_t header;
     // The value is architecture and possibly platform specific.
     // The |ticks_per_second| field in the buffer header provides the
     // conversion factor from this value to ticks per second.
     // For x86 this is the TSC value.
     zx_ticks_t time;
 } CPUPERF_ALIGN_RECORD cpuperf_time_record_t;

 // Verify our alignment assumptions.
 // We don't need to do this for every record, but doing it for this one
 // verifies CPUPERF_ALIGN_RECORD is working.
 // Without packing, the 8-byte |time| field would force padding after the
 // 4-byte header and the size would not be 12.
 static_assert(sizeof(cpuperf_time_record_t) == 12,
               "time record not 12 bytes");
 static_assert(alignof(cpuperf_time_record_t) == 4,
               "time record not 4-byte aligned");

 // Record that a counting event reached its sample rate.
 // It is expected that this record follows a TIME record.
 // The counter resets to zero after this record.
 // This does not include the event's value in order to keep the size small:
 // the value is the sample rate which is known from the configuration.
 // The record consists of the header only; the event is identified by
 // |header.event|.
 typedef struct {
     cpuperf_record_header_t header;
 } CPUPERF_ALIGN_RECORD cpuperf_tick_record_t;

 // Record the value of a counter at a particular time.
 // It is expected that this record follows a TIME record.
 // The counter resets to zero after this record.
 // This is used when another timebase is driving the sampling, e.g., another
 // counter. Otherwise the "tick" record is generally used as it takes less
 // space.
 typedef struct {
     cpuperf_record_header_t header;
     // The counter's value when the record was emitted.
     uint64_t count;
 } CPUPERF_ALIGN_RECORD cpuperf_count_record_t;

 // Record the value of an event.
 // It is expected that this record follows a TIME record.
 // This value is not a count and cannot be used to produce a "rate"
 // (e.g., some value per second).
 typedef struct {
     cpuperf_record_header_t header;
     // The event's value when the record was emitted.
     uint64_t value;
 } CPUPERF_ALIGN_RECORD cpuperf_value_record_t;

 // Record the aspace+pc values.
 // If the event id is not NONE, then this record also indicates that the
 // event reached its tick point, and is used instead of a tick record. This
 // record is overloaded to save space in trace buffer output.
 // It is expected that this record follows a TIME record.
 // This is used when doing gprof-like profiling.
 // The event's value is not included here as this is typically used when
 // the counter is its own trigger: the value is known from the sample rate.
 typedef struct {
     cpuperf_record_header_t header;
     // The aspace id at the time data was collected.
     // The meaning of the value is architecture-specific.
     // In the case of x86 this is the cr3 value.
     uint64_t aspace;
     // The pc value being recorded.
     uint64_t pc;
 } CPUPERF_ALIGN_RECORD cpuperf_pc_record_t;

 // Entry in a last branch record.
 // This is the element type of |cpuperf_last_branch_record_t.branches|.
 typedef struct  {
     // Branch source and target addresses (presumed from the field names;
     // confirm against the arch-specific producer).
     uint64_t from;
     uint64_t to;
     // Various bits of info about this branch. See CPUPERF_LAST_BRANCH_INFO_*.
     uint64_t info;
 } CPUPERF_ALIGN_RECORD cpuperf_last_branch_t;

 // Utility to compute masks for fields in this file.
 // Produces a 64-bit mask of |len| consecutive one bits whose least
 // significant bit is at bit position |shift|.
 // Note the argument order: length first, then shift.
 // |len| must be less than 64 (shifting by the full width is undefined).
 #define CPUPERF_GEN_MASK64(len, shift) (((1ULL << (len)) - 1) << (shift))

 // Fields in |cpuperf_last_branch_t.info|.

 // Number of cycles since the last branch, or zero if unknown.
 // The unit of measurement is architecture-specific.
 // Occupies bits 0-15 (mask 0xffff).
 #define CPUPERF_LAST_BRANCH_INFO_CYCLES_SHIFT (0u)
 #define CPUPERF_LAST_BRANCH_INFO_CYCLES_LEN   (16u)
 #define CPUPERF_LAST_BRANCH_INFO_CYCLES_MASK  \
     CPUPERF_GEN_MASK64(CPUPERF_LAST_BRANCH_INFO_CYCLES_LEN, \
                        CPUPERF_LAST_BRANCH_INFO_CYCLES_SHIFT)

 // Non-zero if branch was mispredicted.
 // Whether this bit is available is architecture-specific.
 // Occupies bit 16 (mask 0x10000).
 #define CPUPERF_LAST_BRANCH_INFO_MISPRED_SHIFT (16u)
 #define CPUPERF_LAST_BRANCH_INFO_MISPRED_LEN   (1u)
 #define CPUPERF_LAST_BRANCH_INFO_MISPRED_MASK  \
     CPUPERF_GEN_MASK64(CPUPERF_LAST_BRANCH_INFO_MISPRED_LEN, \
                        CPUPERF_LAST_BRANCH_INFO_MISPRED_SHIFT)

 // Record a set of last branches executed.
 // It is expected that this record follows a TIME record.
 // Note that this record is variable-length: only the first |num_branches|
 // entries of |branches| are emitted. Use CPUPERF_LAST_BRANCH_RECORD_SIZE
 // to compute the size of a particular record.
 // This is used when doing gprof-like profiling.
 typedef struct {
     cpuperf_record_header_t header;
     // Number of entries in |branches|.
     uint32_t num_branches;
     // The aspace id at the time data was collected. This is not necessarily
     // the aspace id of each branch. S/W will need to determine from the
     // branch addresses how far back aspace is valid.
     // The meaning of the value is architecture-specific.
     // In the case of x86 this is the cr3 value.
     uint64_t aspace;
     // The set of last branches, in reverse chronological order:
     // The first entry is the most recent one.
     // Note that the emitted record may be smaller than this, as indicated by
     // |num_branches|.
     // Reverse order seems most useful.
 // 32 is the max value for Skylake
 #define CPUPERF_MAX_NUM_LAST_BRANCH (32u)
     cpuperf_last_branch_t branches[CPUPERF_MAX_NUM_LAST_BRANCH];
 } CPUPERF_ALIGN_RECORD cpuperf_last_branch_record_t;

 // Return the size of valid last branch record |lbr|.
 // |lbr| is a pointer to a |cpuperf_last_branch_record_t|.
 // The result is the full struct size minus the unused tail entries of
 // |branches|. |lbr->num_branches| must not exceed
 // CPUPERF_MAX_NUM_LAST_BRANCH, otherwise the unsigned subtraction wraps.
 #define CPUPERF_LAST_BRANCH_RECORD_SIZE(lbr) \
     (sizeof(cpuperf_last_branch_record_t) - \
      (CPUPERF_MAX_NUM_LAST_BRANCH - (lbr)->num_branches) * sizeof((lbr)->branches[0]))

 // The properties of this system.
 // This is the output type of IOCTL_CPUPERF_GET_PROPERTIES.
 typedef struct {
     // S/W API version = CPUPERF_API_VERSION.
     uint16_t api_version;
     // The H/W Performance Monitor version.
     uint16_t pm_version;
     // The number of fixed events.
     uint16_t num_fixed_events;
     // The number of programmable events.
     uint16_t num_programmable_events;
     // For fixed events that are counters, the width in bits.
     // If different counters have different widths, the choice is architecture
     // specific.
     uint16_t fixed_counter_width;
     // For programmable events that are counters, the width in bits.
     // If different counters have different widths, the choice is architecture
     // specific.
     uint16_t programmable_counter_width;
     // Various flags.
     // Bitwise OR of CPUPERF_PROPERTY_FLAG_* values.
     uint32_t flags;
 // NOTE(review): presumably set when last branch data
 // (CPUPERF_CONFIG_FLAG_LAST_BRANCH) is supported - confirm with the driver.
 #define CPUPERF_PROPERTY_FLAG_HAS_LAST_BRANCH (1u << 0)
 } cpuperf_properties_t;

 // The type of the |rate| field of cpuperf_config_t.
 // A rate is a number of event hits between samples; zero selects simple
 // counting (see the |rate| field for details).
 typedef uint32_t cpuperf_rate_t;

 // Passed to STAGE_CONFIG to select the data to be collected.
 // The three arrays are parallel: |rate[i]| and |flags[i]| apply to
 // |events[i]|.
 // Events must be consecutively allocated from the front with no holes.
 // A value of CPUPERF_EVENT_ID_NONE in |events| marks the end.
 typedef struct {
     // Events to collect data for.
     // The values are architecture specific ids: cpuperf_<arch>_event_id_t
     // Each event may appear at most once.
     // |events[0]| is special: It is used as the timebase when any other
     // event has CPUPERF_CONFIG_FLAG_TIMEBASE0 set.
     cpuperf_event_id_t events[CPUPERF_MAX_EVENTS];

     // Sampling rate for each event in |events|.
     // If zero then do simple counting (collect a tally of the count and
     // report at the end). Otherwise (non-zero) then when the event gets
     // this many hits data is collected (e.g., pc, time).
     // The value can be non-zero only for counting based events.
     // This value is ignored if CPUPERF_CONFIG_FLAG_TIMEBASE0 is set.
     // Setting CPUPERF_CONFIG_FLAG_TIMEBASE0 in |flags[0]| is redundant but ok.
     cpuperf_rate_t rate[CPUPERF_MAX_EVENTS];

     // Flags for each event in |events|.
     // Each entry is a bitwise OR of the CPUPERF_CONFIG_FLAG_* bits below.
     // TODO(dje): hypervisor, host/guest os/user
     uint32_t flags[CPUPERF_MAX_EVENTS];
 // Valid bits in |flags|.
 // This is the union of the five flag bits defined below (bits 0-4).
 #define CPUPERF_CONFIG_FLAG_MASK      0x1f
 // Collect os data.
 #define CPUPERF_CONFIG_FLAG_OS        (1u << 0)
 // Collect userspace data.
 #define CPUPERF_CONFIG_FLAG_USER      (1u << 1)
 // Collect aspace+pc values.
 #define CPUPERF_CONFIG_FLAG_PC        (1u << 2)
 // If set then use |events[0]| as the timebase: data for this event is
 // collected when data for |events[0]| is collected, and the record emitted
 // for this event is either a CPUPERF_RECORD_COUNT or CPUPERF_RECORD_VALUE
 // record (depending on what the event is).
 // It is an error to have this bit set for an event and have rate[0] be zero.
 #define CPUPERF_CONFIG_FLAG_TIMEBASE0 (1u << 3)
 // Collect the available set of last branches.
 // Branch data is emitted as CPUPERF_RECORD_LAST_BRANCH records.
 // This is only available when the underlying system supports it.
 // TODO(dje): Provide knob to specify how many branches.
 #define CPUPERF_CONFIG_FLAG_LAST_BRANCH (1u << 4)
 } cpuperf_config_t;

 ///////////////////////////////////////////////////////////////////////////////

 #ifdef __Fuchsia__

 // ioctls

 // Fetch the cpu trace properties of the system.
 // This is ioctl number 0 of IOCTL_FAMILY_CPUPERF.
 // NOTE(review): IOCTL_WRAPPER_OUT presumably defines the typed helper
 // ioctl_cpuperf_get_properties() - see ioctl-wrapper.h.
 // Output: cpuperf_properties_t
 #define IOCTL_CPUPERF_GET_PROPERTIES \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 0)
 IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_properties,
                   IOCTL_CPUPERF_GET_PROPERTIES,
                   cpuperf_properties_t);

 // The allocation configuration for a data collection run.
 // This is generally the first call to allocate resources for a trace,
 // "trace" is used generically here: == "data collection run".
 // Used as the input of IOCTL_CPUPERF_ALLOC_TRACE and the output of
 // IOCTL_CPUPERF_GET_ALLOC.
 typedef struct {
     // must be #cpus for now
     uint32_t num_buffers;

     // each cpu gets same buffer size
     uint32_t buffer_size;
 } ioctl_cpuperf_alloc_t;

 // Create a trace, allocating the needed trace buffers and other resources.
 // "other resources" is basically a catch-all for other things that will
 // be needed. This does not include reserving the events, that is done later
 // by STAGE_CONFIG.
 // This is ioctl number 1 of IOCTL_FAMILY_CPUPERF.
 // Input: ioctl_cpuperf_alloc_t
 #define IOCTL_CPUPERF_ALLOC_TRACE \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 1)
 IOCTL_WRAPPER_IN(ioctl_cpuperf_alloc_trace, IOCTL_CPUPERF_ALLOC_TRACE,
                  ioctl_cpuperf_alloc_t);

 // Free all trace buffers and any other resources allocated for the trace.
 // This is also done when the fd is closed (as well as stopping the trace).
 // This is ioctl number 2 of IOCTL_FAMILY_CPUPERF; no input or output type
 // is declared for it.
 #define IOCTL_CPUPERF_FREE_TRACE \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 2)
 IOCTL_WRAPPER(ioctl_cpuperf_free_trace, IOCTL_CPUPERF_FREE_TRACE);

 // Return trace allocation config.
 // This is ioctl number 3 of IOCTL_FAMILY_CPUPERF.
 // Output: ioctl_cpuperf_alloc_t
 #define IOCTL_CPUPERF_GET_ALLOC \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 3)
 IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_alloc, IOCTL_CPUPERF_GET_ALLOC,
                   ioctl_cpuperf_alloc_t);

 // Stage performance monitor specification for a cpu.
 // Must be called with data collection off and after ALLOC.
 // Note: This doesn't actually configure the h/w, this just stages
 // the values for subsequent use by START.
 // This is ioctl number 4 of IOCTL_FAMILY_CPUPERF.
 // Input: cpuperf_config_t
 #define IOCTL_CPUPERF_STAGE_CONFIG \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 4)
 IOCTL_WRAPPER_IN(ioctl_cpuperf_stage_config, IOCTL_CPUPERF_STAGE_CONFIG,
                  cpuperf_config_t);

 // Fetch performance monitor specification for a cpu.
 // Must be called with data collection off and after STAGE_CONFIG.
 // This is ioctl number 5 of IOCTL_FAMILY_CPUPERF.
 // Output: cpuperf_config_t
 #define IOCTL_CPUPERF_GET_CONFIG \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 5)
 IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_config, IOCTL_CPUPERF_GET_CONFIG,
                   cpuperf_config_t);

 // Input of IOCTL_CPUPERF_GET_BUFFER_HANDLE: selects which trace buffer's
 // handle to return.
 typedef struct {
     // Trace buffer descriptor (0, 1, 2, ..., |num_buffers|-1).
     uint32_t descriptor;
 } ioctl_cpuperf_buffer_handle_req_t;

 // Return a handle of a trace buffer.
 // This is ioctl number 6 of IOCTL_FAMILY_CPUPERF. Unlike the other cpuperf
 // ioctls it is declared with IOCTL_KIND_GET_HANDLE; its output is a
 // zx_handle_t.
 // Input: trace buffer descriptor (0, 1, 2, ..., |num_buffers|-1)
 // Output: handle of the vmo of the buffer
 #define IOCTL_CPUPERF_GET_BUFFER_HANDLE \
     IOCTL(IOCTL_KIND_GET_HANDLE, IOCTL_FAMILY_CPUPERF, 6)
 IOCTL_WRAPPER_INOUT(ioctl_cpuperf_get_buffer_handle,
                     IOCTL_CPUPERF_GET_BUFFER_HANDLE,
                     ioctl_cpuperf_buffer_handle_req_t, zx_handle_t);

 // Turn on data collection.
 // Must be called after ALLOC+STAGE_CONFIG and with data collection off.
 // This is ioctl number 7 of IOCTL_FAMILY_CPUPERF; no input or output type
 // is declared for it.
 #define IOCTL_CPUPERF_START \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 7)
 IOCTL_WRAPPER(ioctl_cpuperf_start, IOCTL_CPUPERF_START);

 // Turn off data collection.
 // May be called any time after ALLOC has been called and before FREE.
 // May be called multiple times.
 // This is ioctl number 8 of IOCTL_FAMILY_CPUPERF; no input or output type
 // is declared for it.
 #define IOCTL_CPUPERF_STOP \
     IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 8)
 IOCTL_WRAPPER(ioctl_cpuperf_stop, IOCTL_CPUPERF_STOP);

 #endif // __Fuchsia__

 __END_CDECLS
	// Copyright 2017 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#pragma once

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>
	#ifndef __cplusplus
	#include <stdalign.h> // for alignof
	#endif

	#include <zircon/types.h>

	#ifdef __Fuchsia__
	#include <zircon/device/ioctl.h>
	#include <zircon/device/ioctl-wrapper.h>
	#endif

	__BEGIN_CDECLS

	// API version number (useful when doing incompatible upgrades)
	#define CPUPERF_API_VERSION 3

	// Buffer format version
	#define CPUPERF_BUFFER_VERSION 0

	// The maximum number of events we support simultaneously.
	// Typically the h/w supports less than this, e.g., 7 or so.
	// TODO(dje): Have the device driver multiplex the events when more is
	// asked for than the h/w supports.
	#define CPUPERF_MAX_EVENTS 32u

	// Header for each data buffer.
	// It records the buffer format version, the architecture that produced the
	// data, status flags, the tick rate, and where the captured data ends.
	typedef struct {
	// Format version number (CPUPERF_BUFFER_VERSION).
	uint16_t version;

	// The architecture that generated the data.
	// One of the CPUPERF_BUFFER_ARCH_* values below.
	uint16_t arch;
	#define CPUPERF_BUFFER_ARCH_UNKNOWN 0u
	#define CPUPERF_BUFFER_ARCH_X86_64 1u
	#define CPUPERF_BUFFER_ARCH_ARM64 2u

	// Bitwise OR of CPUPERF_BUFFER_FLAG_* values.
	uint32_t flags;
	// The buffer filled, and records were dropped.
	#define CPUPERF_BUFFER_FLAG_FULL (1u << 0)

	// zx_ticks_per_second in the kernel
	zx_ticks_t ticks_per_second;

	// Offset into the buffer of the end of the data.
	uint64_t capture_end;
	} cpuperf_buffer_header_t;

	// The various types of emitted records.
	// The value is stored in the |type| field of |cpuperf_record_header_t| and
	// tells the reader which record struct follows the header.
	typedef enum {
	// Reserved, unused.
	CPUPERF_RECORD_RESERVED = 0,
	// The current time, in a |cpuperf_time_record_t|, to be applied to all
	// subsequent records until the next TIME record.
	CPUPERF_RECORD_TIME = 1,
	// The record is a |cpuperf_tick_record_t|.
	// TODO(dje): Rename, the name is confusing with TIME records.
	CPUPERF_RECORD_TICK = 2,
	// The record is a |cpuperf_count_record_t|.
	CPUPERF_RECORD_COUNT = 3,
	// The record is a |cpuperf_value_record_t|.
	CPUPERF_RECORD_VALUE = 4,
	// The record is a |cpuperf_pc_record_t|.
	CPUPERF_RECORD_PC = 5,
	// The record is a |cpuperf_last_branch_record_t|.
	CPUPERF_RECORD_LAST_BRANCH = 6,
	} cpuperf_record_type_t;

	// Trace buffer space is expensive, we want to keep records small.
	// Having more than 64K different events for any one arch is unlikely
	// so we use 16 bits for the event id.
	// To help each arch manage the plethora of different events, the event id
	// is split into two parts: a 6 bit event group in the upper bits, and a
	// 10 bit event number within that group in the lower bits.
	// An event id of zero is defined to be unused. To simplify things we just
	// take the whole set of |group| == 0 as reserved.
	typedef uint16_t cpuperf_event_id_t;
	// Compose an id from its two parts. The arguments are not range checked.
	#define CPUPERF_MAKE_EVENT_ID(group, event) (((group) << 10) | (event))
	// Extract the group / event part of an id.
	#define CPUPERF_EVENT_ID_GROUP(id) (((id) >> 10) & 0x3f)
	#define CPUPERF_EVENT_ID_EVENT(id) ((id) & 0x3ff)
	// Largest value representable in the group / event part of an id.
	#define CPUPERF_MAX_GROUP 0x3f
	#define CPUPERF_MAX_EVENT 0x3ff
	// The "no event" id.
	#define CPUPERF_EVENT_ID_NONE 0

	// Possible values for the |group| field of |cpuperf_event_id_t|.
	// Values must fit in the 6 bit group part of an id (CPUPERF_MAX_GROUP).
	// TODO(dje): Reorganize these into something like
	// {arch,model} -x- {fixed,programmable}, which these currently are,
	// it's just not immediately apparent.
	typedef enum {
	// Group zero is reserved: event ids in it are treated as unused.
	CPUPERF_GROUP_RESERVED = 0,
	CPUPERF_GROUP_ARCH = 1,
	CPUPERF_GROUP_FIXED = 2,
	CPUPERF_GROUP_MODEL = 3,
	CPUPERF_GROUP_MISC = 4,
	} cpuperf_group_type_t;

	// A typical record is a 4-byte header plus an 8-byte payload
	// (e.g., |cpuperf_time_record_t| is 4 + 8 = 12 bytes).
	// Aligning records to 8-byte boundaries would waste a lot of space,
	// so currently we align everything to 4-byte boundaries.
	// This attribute list is applied to every record struct in this file.
	// TODO(dje): Collect data to see what this saves. Keep it?
	#define CPUPERF_ALIGN_RECORD __PACKED __ALIGNED(4)

	// Trace record header.
	// Every record struct in this file begins with this header.
	// Note: Avoid holes in all trace records.
	typedef struct {
	// One of CPUPERF_RECORD_*.
	uint8_t type;

	// A possible usage of this field is to add some type-specific flags.
	uint8_t reserved_flags;

	// The event the record is for.
	// If there is none then use CPUPERF_EVENT_ID_NONE.
	cpuperf_event_id_t event;
	} CPUPERF_ALIGN_RECORD cpuperf_record_header_t;

	// Verify our alignment assumptions.
	// 1 (type) + 1 (reserved_flags) + 2 (event) bytes, with no padding.
	static_assert(sizeof(cpuperf_record_header_t) == 4,
	"record header not 4 bytes");

	// Record the current time of the trace.
	// The time applies to all subsequent records until the next TIME record.
	// If the event id is non-zero (!NONE) then it must be for a counting event
	// and then this record is also a "tick" record indicating the counter has
	// reached its sample rate. The counter resets to zero after this record.
	typedef struct {
	cpuperf_record_header_t header;
	// The value is architecture and possibly platform specific.
	// The |ticks_per_second| field in the buffer header provides the
	// conversion factor from this value to ticks per second.
	// For x86 this is the TSC value.
	zx_ticks_t time;
	} CPUPERF_ALIGN_RECORD cpuperf_time_record_t;

	// Verify our alignment assumptions.
	// We don't need to do this for every record, but doing it for this one
	// verifies CPUPERF_ALIGN_RECORD is working.
	// Without packing, the 8-byte |time| field would force padding after the
	// 4-byte header and the size would not be 12.
	static_assert(sizeof(cpuperf_time_record_t) == 12,
	"time record not 12 bytes");
	static_assert(alignof(cpuperf_time_record_t) == 4,
	"time record not 4-byte aligned");

	// Record that a counting event reached its sample rate.
	// It is expected that this record follows a TIME record.
	// The counter resets to zero after this record.
	// This does not include the event's value in order to keep the size small:
	// the value is the sample rate which is known from the configuration.
	// The record consists of the header only; the event is identified by
	// |header.event|.
	typedef struct {
	cpuperf_record_header_t header;
	} CPUPERF_ALIGN_RECORD cpuperf_tick_record_t;

	// Record the value of a counter at a particular time.
	// It is expected that this record follows a TIME record.
	// The counter resets to zero after this record.
	// This is used when another timebase is driving the sampling, e.g., another
	// counter. Otherwise the "tick" record is generally used as it takes less
	// space.
	typedef struct {
	cpuperf_record_header_t header;
	// The counter's value when the record was emitted.
	uint64_t count;
	} CPUPERF_ALIGN_RECORD cpuperf_count_record_t;

	// Record the value of an event.
	// It is expected that this record follows a TIME record.
	// This value is not a count and cannot be used to produce a "rate"
	// (e.g., some value per second).
	typedef struct {
	cpuperf_record_header_t header;
	// The event's value when the record was emitted.
	uint64_t value;
	} CPUPERF_ALIGN_RECORD cpuperf_value_record_t;

	// Record the aspace+pc values.
	// If the event id is not NONE, then this record also indicates that the
	// event reached its tick point, and is used instead of a tick record. This
	// record is overloaded to save space in trace buffer output.
	// It is expected that this record follows a TIME record.
	// This is used when doing gprof-like profiling.
	// The event's value is not included here as this is typically used when
	// the counter is its own trigger: the value is known from the sample rate.
	typedef struct {
	cpuperf_record_header_t header;
	// The aspace id at the time data was collected.
	// The meaning of the value is architecture-specific.
	// In the case of x86 this is the cr3 value.
	uint64_t aspace;
	// The pc value being recorded.
	uint64_t pc;
	} CPUPERF_ALIGN_RECORD cpuperf_pc_record_t;

	// Entry in a last branch record.
	// This is the element type of |cpuperf_last_branch_record_t.branches|.
	typedef struct {
	// Branch source and target addresses (presumed from the field names;
	// confirm against the arch-specific producer).
	uint64_t from;
	uint64_t to;
	// Various bits of info about this branch. See CPUPERF_LAST_BRANCH_INFO_*.
	uint64_t info;
	} CPUPERF_ALIGN_RECORD cpuperf_last_branch_t;

	// Utility to compute masks for fields in this file.
	// Produces a 64-bit mask of |len| consecutive one bits whose least
	// significant bit is at bit position |shift|.
	// Note the argument order: length first, then shift.
	// |len| must be less than 64 (shifting by the full width is undefined).
	#define CPUPERF_GEN_MASK64(len, shift) (((1ULL << (len)) - 1) << (shift))

	// Fields in |cpuperf_last_branch_t.info|.

	// Number of cycles since the last branch, or zero if unknown.
	// The unit of measurement is architecture-specific.
	// Occupies bits 0-15 (mask 0xffff).
	#define CPUPERF_LAST_BRANCH_INFO_CYCLES_SHIFT (0u)
	#define CPUPERF_LAST_BRANCH_INFO_CYCLES_LEN (16u)
	#define CPUPERF_LAST_BRANCH_INFO_CYCLES_MASK \
	CPUPERF_GEN_MASK64(CPUPERF_LAST_BRANCH_INFO_CYCLES_LEN, \
	CPUPERF_LAST_BRANCH_INFO_CYCLES_SHIFT)

	// Non-zero if branch was mispredicted.
	// Whether this bit is available is architecture-specific.
	// Occupies bit 16 (mask 0x10000).
	#define CPUPERF_LAST_BRANCH_INFO_MISPRED_SHIFT (16u)
	#define CPUPERF_LAST_BRANCH_INFO_MISPRED_LEN (1u)
	#define CPUPERF_LAST_BRANCH_INFO_MISPRED_MASK \
	CPUPERF_GEN_MASK64(CPUPERF_LAST_BRANCH_INFO_MISPRED_LEN, \
	CPUPERF_LAST_BRANCH_INFO_MISPRED_SHIFT)

	// Record a set of last branches executed.
	// It is expected that this record follows a TIME record.
	// Note that this record is variable-length: only the first |num_branches|
	// entries of |branches| are emitted. Use CPUPERF_LAST_BRANCH_RECORD_SIZE
	// to compute the size of a particular record.
	// This is used when doing gprof-like profiling.
	typedef struct {
	cpuperf_record_header_t header;
	// Number of entries in |branches|.
	uint32_t num_branches;
	// The aspace id at the time data was collected. This is not necessarily
	// the aspace id of each branch. S/W will need to determine from the
	// branch addresses how far back aspace is valid.
	// The meaning of the value is architecture-specific.
	// In the case of x86 this is the cr3 value.
	uint64_t aspace;
	// The set of last branches, in reverse chronological order:
	// The first entry is the most recent one.
	// Note that the emitted record may be smaller than this, as indicated by
	// |num_branches|.
	// Reverse order seems most useful.
	// 32 is the max value for Skylake
	#define CPUPERF_MAX_NUM_LAST_BRANCH (32u)
	cpuperf_last_branch_t branches[CPUPERF_MAX_NUM_LAST_BRANCH];
	} CPUPERF_ALIGN_RECORD cpuperf_last_branch_record_t;

	// Return the size of valid last branch record |lbr|.
	// |lbr| is a pointer to a |cpuperf_last_branch_record_t|.
	// The result is the full struct size minus the unused tail entries of
	// |branches|. |lbr->num_branches| must not exceed
	// CPUPERF_MAX_NUM_LAST_BRANCH, otherwise the unsigned subtraction wraps.
	#define CPUPERF_LAST_BRANCH_RECORD_SIZE(lbr) \
	(sizeof(cpuperf_last_branch_record_t) - \
	(CPUPERF_MAX_NUM_LAST_BRANCH - (lbr)->num_branches) * sizeof((lbr)->branches[0]))

	// The properties of this system.
	// This is the output type of IOCTL_CPUPERF_GET_PROPERTIES.
	typedef struct {
	// S/W API version = CPUPERF_API_VERSION.
	uint16_t api_version;
	// The H/W Performance Monitor version.
	uint16_t pm_version;
	// The number of fixed events.
	uint16_t num_fixed_events;
	// The number of programmable events.
	uint16_t num_programmable_events;
	// For fixed events that are counters, the width in bits.
	// If different counters have different widths, the choice is architecture
	// specific.
	uint16_t fixed_counter_width;
	// For programmable events that are counters, the width in bits.
	// If different counters have different widths, the choice is architecture
	// specific.
	uint16_t programmable_counter_width;
	// Various flags.
	// Bitwise OR of CPUPERF_PROPERTY_FLAG_* values.
	uint32_t flags;
	// NOTE(review): presumably set when last branch data
	// (CPUPERF_CONFIG_FLAG_LAST_BRANCH) is supported - confirm with the driver.
	#define CPUPERF_PROPERTY_FLAG_HAS_LAST_BRANCH (1u << 0)
	} cpuperf_properties_t;

	// The type of the |rate| field of cpuperf_config_t.
	// A rate is a number of event hits between samples; zero selects simple
	// counting (see the |rate| field for details).
	typedef uint32_t cpuperf_rate_t;

	// Passed to STAGE_CONFIG to select the data to be collected.
	// The three arrays are parallel: |rate[i]| and |flags[i]| apply to
	// |events[i]|.
	// Events must be consecutively allocated from the front with no holes.
	// A value of CPUPERF_EVENT_ID_NONE in |events| marks the end.
	typedef struct {
	// Events to collect data for.
	// The values are architecture specific ids: cpuperf_<arch>_event_id_t
	// Each event may appear at most once.
	// |events[0]| is special: It is used as the timebase when any other
	// event has CPUPERF_CONFIG_FLAG_TIMEBASE0 set.
	cpuperf_event_id_t events[CPUPERF_MAX_EVENTS];

	// Sampling rate for each event in |events|.
	// If zero then do simple counting (collect a tally of the count and
	// report at the end). Otherwise (non-zero) then when the event gets
	// this many hits data is collected (e.g., pc, time).
	// The value can be non-zero only for counting based events.
	// This value is ignored if CPUPERF_CONFIG_FLAG_TIMEBASE0 is set.
	// Setting CPUPERF_CONFIG_FLAG_TIMEBASE0 in |flags[0]| is redundant but ok.
	cpuperf_rate_t rate[CPUPERF_MAX_EVENTS];

	// Flags for each event in |events|.
	// Each entry is a bitwise OR of the CPUPERF_CONFIG_FLAG_* bits below.
	// TODO(dje): hypervisor, host/guest os/user
	uint32_t flags[CPUPERF_MAX_EVENTS];
	// Valid bits in |flags|.
	// This is the union of the five flag bits defined below (bits 0-4).
	#define CPUPERF_CONFIG_FLAG_MASK 0x1f
	// Collect os data.
	#define CPUPERF_CONFIG_FLAG_OS (1u << 0)
	// Collect userspace data.
	#define CPUPERF_CONFIG_FLAG_USER (1u << 1)
	// Collect aspace+pc values.
	#define CPUPERF_CONFIG_FLAG_PC (1u << 2)
	// If set then use |events[0]| as the timebase: data for this event is
	// collected when data for |events[0]| is collected, and the record emitted
	// for this event is either a CPUPERF_RECORD_COUNT or CPUPERF_RECORD_VALUE
	// record (depending on what the event is).
	// It is an error to have this bit set for an event and have rate[0] be zero.
	#define CPUPERF_CONFIG_FLAG_TIMEBASE0 (1u << 3)
	// Collect the available set of last branches.
	// Branch data is emitted as CPUPERF_RECORD_LAST_BRANCH records.
	// This is only available when the underlying system supports it.
	// TODO(dje): Provide knob to specify how many branches.
	#define CPUPERF_CONFIG_FLAG_LAST_BRANCH (1u << 4)
	} cpuperf_config_t;

	///////////////////////////////////////////////////////////////////////////////

	#ifdef __Fuchsia__

	// ioctls

	// Fetch the cpu trace properties of the system.
	// This is ioctl number 0 of IOCTL_FAMILY_CPUPERF.
	// NOTE(review): IOCTL_WRAPPER_OUT presumably defines the typed helper
	// ioctl_cpuperf_get_properties() - see ioctl-wrapper.h.
	// Output: cpuperf_properties_t
	#define IOCTL_CPUPERF_GET_PROPERTIES \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 0)
	IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_properties,
	IOCTL_CPUPERF_GET_PROPERTIES,
	cpuperf_properties_t);

	// The allocation configuration for a data collection run.
	// This is generally the first call to allocate resources for a trace,
	// "trace" is used generically here: == "data collection run".
	// Used as the input of IOCTL_CPUPERF_ALLOC_TRACE and the output of
	// IOCTL_CPUPERF_GET_ALLOC.
	typedef struct {
	// must be #cpus for now
	uint32_t num_buffers;

	// each cpu gets same buffer size
	uint32_t buffer_size;
	} ioctl_cpuperf_alloc_t;

	// Create a trace, allocating the needed trace buffers and other resources.
	// "other resources" is basically a catch-all for other things that will
	// be needed. This does not include reserving the events, that is done later
	// by STAGE_CONFIG.
	// This is ioctl number 1 of IOCTL_FAMILY_CPUPERF.
	// Input: ioctl_cpuperf_alloc_t
	#define IOCTL_CPUPERF_ALLOC_TRACE \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 1)
	IOCTL_WRAPPER_IN(ioctl_cpuperf_alloc_trace, IOCTL_CPUPERF_ALLOC_TRACE,
	ioctl_cpuperf_alloc_t);

	// Free all trace buffers and any other resources allocated for the trace.
	// This is also done when the fd is closed (as well as stopping the trace).
	// This is ioctl number 2 of IOCTL_FAMILY_CPUPERF; no input or output type
	// is declared for it.
	#define IOCTL_CPUPERF_FREE_TRACE \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 2)
	IOCTL_WRAPPER(ioctl_cpuperf_free_trace, IOCTL_CPUPERF_FREE_TRACE);

	// Return trace allocation config.
	// This is ioctl number 3 of IOCTL_FAMILY_CPUPERF.
	// Output: ioctl_cpuperf_alloc_t
	#define IOCTL_CPUPERF_GET_ALLOC \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 3)
	IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_alloc, IOCTL_CPUPERF_GET_ALLOC,
	ioctl_cpuperf_alloc_t);

	// Stage performance monitor specification for a cpu.
	// Must be called with data collection off and after ALLOC.
	// Note: This doesn't actually configure the h/w, this just stages
	// the values for subsequent use by START.
	// This is ioctl number 4 of IOCTL_FAMILY_CPUPERF.
	// Input: cpuperf_config_t
	#define IOCTL_CPUPERF_STAGE_CONFIG \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 4)
	IOCTL_WRAPPER_IN(ioctl_cpuperf_stage_config, IOCTL_CPUPERF_STAGE_CONFIG,
	cpuperf_config_t);

	// Fetch performance monitor specification for a cpu.
	// Must be called with data collection off and after STAGE_CONFIG.
	// This is ioctl number 5 of IOCTL_FAMILY_CPUPERF.
	// Output: cpuperf_config_t
	#define IOCTL_CPUPERF_GET_CONFIG \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 5)
	IOCTL_WRAPPER_OUT(ioctl_cpuperf_get_config, IOCTL_CPUPERF_GET_CONFIG,
	cpuperf_config_t);

	// Input of IOCTL_CPUPERF_GET_BUFFER_HANDLE: selects which trace buffer's
	// handle to return.
	typedef struct {
	// Trace buffer descriptor (0, 1, 2, ..., |num_buffers|-1).
	uint32_t descriptor;
	} ioctl_cpuperf_buffer_handle_req_t;

	// Return a handle of a trace buffer.
	// This is ioctl number 6 of IOCTL_FAMILY_CPUPERF. Unlike the other cpuperf
	// ioctls it is declared with IOCTL_KIND_GET_HANDLE; its output is a
	// zx_handle_t.
	// Input: trace buffer descriptor (0, 1, 2, ..., |num_buffers|-1)
	// Output: handle of the vmo of the buffer
	#define IOCTL_CPUPERF_GET_BUFFER_HANDLE \
	IOCTL(IOCTL_KIND_GET_HANDLE, IOCTL_FAMILY_CPUPERF, 6)
	IOCTL_WRAPPER_INOUT(ioctl_cpuperf_get_buffer_handle,
	IOCTL_CPUPERF_GET_BUFFER_HANDLE,
	ioctl_cpuperf_buffer_handle_req_t, zx_handle_t);

	// Turn on data collection.
	// Must be called after ALLOC+STAGE_CONFIG and with data collection off.
	// This is ioctl number 7 of IOCTL_FAMILY_CPUPERF; no input or output type
	// is declared for it.
	#define IOCTL_CPUPERF_START \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 7)
	IOCTL_WRAPPER(ioctl_cpuperf_start, IOCTL_CPUPERF_START);

	// Turn off data collection.
	// May be called any time after ALLOC has been called and before FREE.
	// May be called multiple times.
	// This is ioctl number 8 of IOCTL_FAMILY_CPUPERF; no input or output type
	// is declared for it.
	#define IOCTL_CPUPERF_STOP \
	IOCTL(IOCTL_KIND_DEFAULT, IOCTL_FAMILY_CPUPERF, 8)
	IOCTL_WRAPPER(ioctl_cpuperf_stop, IOCTL_CPUPERF_STOP);

	#endif // __Fuchsia__

	__END_CDECLS