// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// See the README.md in this directory for documentation.
#include <ddk/binding.h>
#include <ddk/debug.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/io-buffer.h>
#include <ddk/platform-defs.h>
#include <ddk/protocol/platform/device.h>
#include <lib/zircon-internal/device/cpu-trace/intel-pm.h>
#include <lib/zircon-internal/mtrace.h>
#include <zircon/syscalls.h>
#include <zircon/syscalls/resource.h>
#include <zircon/types.h>
#include <assert.h>
#include <cpuid.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <threads.h>
// TODO(dje): Having trouble getting this working, so just punt for now.
#define TRY_FREEZE_ON_PMI 0
// Individual bits in the fixed counter enable field.
// See Intel Volume 3, Figure 18-2 "Layout of IA32_FIXED_CTR_CTRL MSR".
#define FIXED_CTR_ENABLE_OS 1
#define FIXED_CTR_ENABLE_USR 2
// There are only a few fixed events, so handle them directly.
typedef enum {
#define DEF_FIXED_EVENT(symbol, event_name, id, regnum, flags, readable_name, description) \
symbol ## _ID = CPUPERF_MAKE_EVENT_ID(CPUPERF_GROUP_FIXED, id),
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
} fixed_event_id_t;
// Verify each fixed counter regnum < IPM_MAX_FIXED_COUNTERS.
#define DEF_FIXED_EVENT(symbol, event_name, id, regnum, flags, readable_name, description) \
&& (regnum) < IPM_MAX_FIXED_COUNTERS
static_assert(1
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
, "");
typedef enum {
#define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, description) \
symbol ## _ID = CPUPERF_MAKE_EVENT_ID(CPUPERF_GROUP_MISC, id),
#include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc>
} misc_event_id_t;
// Misc event ids needn't be consecutive.
// Build a lookup table we can use to track duplicates.
typedef enum {
#define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, description) \
symbol ## _NUMBER,
#include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc>
NUM_MISC_EVENTS
} misc_event_number_t;
// This table is sorted at startup.
static cpuperf_event_id_t misc_event_table_contents[NUM_MISC_EVENTS] = {
#define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, description) \
CPUPERF_MAKE_EVENT_ID(CPUPERF_GROUP_MISC, id),
#include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc>
};
// Const accessor to give the illusion of the table being const.
static const cpuperf_event_id_t* misc_event_table = &misc_event_table_contents[0];
static void pmu_init_misc_event_table(void);
typedef enum {
#define DEF_ARCH_EVENT(symbol, event_name, id, ebx_bit, event, umask, flags, readable_name, description) \
symbol,
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
} arch_event_t;
typedef enum {
#define DEF_SKL_EVENT(symbol, event_name, id, event, umask, flags, readable_name, description) \
symbol,
#include <lib/zircon-internal/device/cpu-trace/skylake-pm-events.inc>
} model_event_t;
typedef struct {
uint32_t event;
uint32_t umask;
uint32_t flags;
} event_details_t;
static const event_details_t kArchEvents[] = {
#define DEF_ARCH_EVENT(symbol, event_name, id, ebx_bit, event, umask, flags, readable_name, description) \
{ event, umask, flags },
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
};
static const event_details_t kModelEvents[] = {
#define DEF_SKL_EVENT(symbol, event_name, id, event, umask, flags, readable_name, description) \
{ event, umask, flags },
#include <lib/zircon-internal/device/cpu-trace/skylake-pm-events.inc>
};
static const uint16_t kArchEventMap[] = {
#define DEF_ARCH_EVENT(symbol, event_name, id, ebx_bit, event, umask, flags, readable_name, description) \
[id] = symbol,
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
};
static_assert(countof(kArchEventMap) <= CPUPERF_MAX_EVENT + 1, "");
static const uint16_t kModelEventMap[] = {
#define DEF_SKL_EVENT(symbol, event_name, id, event, umask, flags, readable_name, description) \
[id] = symbol,
#include <lib/zircon-internal/device/cpu-trace/skylake-pm-events.inc>
};
static_assert(countof(kModelEventMap) <= CPUPERF_MAX_EVENT + 1, "");
// All configuration data is staged here before writing any MSRs, etc.
// Then, when ready, the "START" ioctl writes all the necessary MSRs and
// does whatever kernel operations are required to collect data.
typedef struct pmu_per_trace_state {
// true if |config| has been set.
bool configured;
// The trace configuration as given to us via the ioctl.
cpuperf_config_t ioctl_config;
// The internalized form of |ioctl_config| that we pass to the kernel.
zx_x86_pmu_config_t config;
// # of entries in |buffers|.
// TODO(dje): This is generally the number of cpus, but it could be
// something else later.
uint32_t num_buffers;
// Each buffer is the same size (at least for now, KISS).
// There is one buffer per cpu.
// This is a uint32 instead of a uint64 as there's no point in supporting
// buffers that large.
uint32_t buffer_size;
io_buffer_t* buffers;
} pmu_per_trace_state_t;
typedef struct cpuperf_device {
mtx_t lock;
// Only one open of this device is supported at a time. KISS for now.
bool opened;
// Once tracing has started various things are not allowed until it stops.
bool active;
// One entry for each trace ("trace" == "data collection run").
// TODO(dje): At the moment we only support one trace at a time.
pmu_per_trace_state_t* per_trace_state;
zx_handle_t bti;
} cpuperf_device_t;
static bool pmu_supported = false;
// This is only valid if |pmu_supported| is true.
static zx_x86_pmu_properties_t pmu_properties;
// Maximum space, in bytes, for trace buffers (per cpu).
#define MAX_PER_TRACE_SPACE (256 * 1024 * 1024)
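// One-time initialization: sort the misc event table and query the kernel
// (via mtrace) for this machine's PMU properties. Sets |pmu_supported| and
// |pmu_properties| on success; PM version 4 or above is required.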
static zx_status_t cpuperf_init_once(void)
{
pmu_init_misc_event_table();
zx_x86_pmu_properties_t props;
zx_handle_t resource = get_root_resource();
zx_status_t status =
zx_mtrace_control(resource, MTRACE_KIND_CPUPERF, MTRACE_CPUPERF_GET_PROPERTIES,
0, &props, sizeof(props));
if (status != ZX_OK) {
if (status == ZX_ERR_NOT_SUPPORTED)
zxlogf(INFO, "%s: No PM support\n", __func__);
else
zxlogf(INFO, "%s: Error %d fetching ipm properties\n",
__func__, status);
return status;
}
// Skylake supports version 4. KISS and begin with that.
// Note: This should agree with the kernel driver's check.
if (props.pm_version < 4) {
zxlogf(INFO, "%s: PM version 4 or above is required\n", __func__);
return ZX_ERR_NOT_SUPPORTED;
}
pmu_supported = true;
pmu_properties = props;
zxlogf(TRACE, "Intel Performance Monitor configuration for this chipset:\n");
zxlogf(TRACE, "IPM: version: %u\n", pmu_properties.pm_version);
zxlogf(TRACE, "IPM: num_programmable_events: %u\n",
pmu_properties.num_programmable_events);
zxlogf(TRACE, "IPM: num_fixed_events: %u\n",
pmu_properties.num_fixed_events);
zxlogf(TRACE, "IPM: num_misc_events: %u\n",
pmu_properties.num_misc_events);
zxlogf(TRACE, "IPM: programmable_counter_width: %u\n",
pmu_properties.programmable_counter_width);
zxlogf(TRACE, "IPM: fixed_counter_width: %u\n",
pmu_properties.fixed_counter_width);
zxlogf(TRACE, "IPM: perf_capabilities: 0x%lx\n",
pmu_properties.perf_capabilities);
return ZX_OK;
}
// Helper routines for the ioctls.
static void pmu_free_buffers_for_trace(pmu_per_trace_state_t* per_trace, uint32_t num_allocated) {
// Note: This may be called with partially allocated buffers.
assert(per_trace->buffers);
assert(num_allocated <= per_trace->num_buffers);
for (uint32_t i = 0; i < num_allocated; ++i)
io_buffer_release(&per_trace->buffers[i]);
free(per_trace->buffers);
per_trace->buffers = NULL;
}
// Map a fixed counter event id to its h/w register number.
// Returns IPM_MAX_FIXED_COUNTERS if |id| is unknown.
static unsigned pmu_fixed_counter_number(cpuperf_event_id_t id) {
enum {
#define DEF_FIXED_EVENT(symbol, event_name, id, regnum, flags, readable_name, description) \
symbol ## _NUMBER = regnum,
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
};
switch (id) {
case FIXED_INSTRUCTIONS_RETIRED_ID:
return FIXED_INSTRUCTIONS_RETIRED_NUMBER;
case FIXED_UNHALTED_CORE_CYCLES_ID:
return FIXED_UNHALTED_CORE_CYCLES_NUMBER;
case FIXED_UNHALTED_REFERENCE_CYCLES_ID:
return FIXED_UNHALTED_REFERENCE_CYCLES_NUMBER;
default:
return IPM_MAX_FIXED_COUNTERS;
}
}
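// Comparison function for qsort()/bsearch() over cpuperf_event_id_t values.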
static int pmu_compare_cpuperf_event_id(const void* ap, const void* bp) {
const cpuperf_event_id_t* a = ap;
const cpuperf_event_id_t* b = bp;
if (*a < *b)
return -1;
if (*a > *b)
return 1;
return 0;
}
static void pmu_init_misc_event_table(void) {
qsort(misc_event_table_contents,
countof(misc_event_table_contents),
sizeof(misc_event_table_contents[0]),
pmu_compare_cpuperf_event_id);
}
// Map a misc event id to its ordinal (unique number in range
// 0 ... NUM_MISC_EVENTS - 1).
// Returns -1 if |id| is unknown.
static int pmu_lookup_misc_event(cpuperf_event_id_t id) {
cpuperf_event_id_t* p = bsearch(&id, misc_event_table,
countof(misc_event_table_contents),
sizeof(id),
pmu_compare_cpuperf_event_id);
if (!p)
return -1;
ptrdiff_t result = p - misc_event_table;
assert(result < NUM_MISC_EVENTS);
return (int) result;
}
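// Last Branch Record support is indicated by a non-zero LBR stack size.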
static bool pmu_lbr_supported(void) {
return pmu_properties.lbr_stack_size > 0;
}
// The userspace side of the driver.
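// Handler for IOCTL_CPUPERF_GET_PROPERTIES: report the PMU's capabilities in
// the arch-independent cpuperf_properties_t form.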
static zx_status_t pmu_get_properties(cpuperf_device_t* dev,
void* reply, size_t replymax,
size_t* out_actual) {
zxlogf(TRACE, "%s called\n", __func__);
if (!pmu_supported)
return ZX_ERR_NOT_SUPPORTED;
cpuperf_properties_t props;
if (replymax < sizeof(props))
return ZX_ERR_BUFFER_TOO_SMALL;
memset(&props, 0, sizeof(props));
props.api_version = CPUPERF_API_VERSION;
props.pm_version = pmu_properties.pm_version;
// To the arch-independent API, the misc events on Intel are currently
// all "fixed" in the sense that they don't occupy a limited number of
// programmable slots. Ultimately there could still be limitations (e.g.,
// some combination of events can't be supported) but that's ok. This
// data is for informational/debug purposes.
// TODO(dje): Something more elaborate can wait for publishing them via
// some namespace.
props.num_fixed_events = (pmu_properties.num_fixed_events +
pmu_properties.num_misc_events);
props.num_programmable_events = pmu_properties.num_programmable_events;
props.fixed_counter_width = pmu_properties.fixed_counter_width;
props.programmable_counter_width = pmu_properties.programmable_counter_width;
if (pmu_lbr_supported())
props.flags |= CPUPERF_PROPERTY_FLAG_HAS_LAST_BRANCH;
memcpy(reply, &props, sizeof(props));
*out_actual = sizeof(props);
return ZX_OK;
}
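// Handler for IOCTL_CPUPERF_ALLOC_TRACE: allocate one I/O buffer per cpu for
// a new trace.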
static zx_status_t pmu_alloc_trace(cpuperf_device_t* dev,
const void* cmd, size_t cmdlen) {
zxlogf(TRACE, "%s called\n", __func__);
if (!pmu_supported)
return ZX_ERR_NOT_SUPPORTED;
if (dev->per_trace_state)
return ZX_ERR_BAD_STATE;
// Note: The remaining API calls don't have to check |pmu_supported|
// because this will never succeed otherwise, and they all require this
// to be done first.
ioctl_cpuperf_alloc_t alloc;
if (cmdlen != sizeof(alloc))
return ZX_ERR_INVALID_ARGS;
memcpy(&alloc, cmd, sizeof(alloc));
if (alloc.buffer_size > MAX_PER_TRACE_SPACE)
return ZX_ERR_INVALID_ARGS;
uint32_t num_cpus = zx_system_get_num_cpus();
if (alloc.num_buffers != num_cpus) // TODO(dje): for now
return ZX_ERR_INVALID_ARGS;
pmu_per_trace_state_t* per_trace = calloc(1, sizeof(dev->per_trace_state[0]));
if (!per_trace) {
return ZX_ERR_NO_MEMORY;
}
per_trace->buffers = calloc(num_cpus, sizeof(per_trace->buffers[0]));
if (!per_trace->buffers) {
free(per_trace);
return ZX_ERR_NO_MEMORY;
}
uint32_t i = 0;
for ( ; i < num_cpus; ++i) {
zx_status_t status =
io_buffer_init(&per_trace->buffers[i], dev->bti, alloc.buffer_size, IO_BUFFER_RW);
if (status != ZX_OK)
break;
}
if (i != num_cpus) {
pmu_free_buffers_for_trace(per_trace, i);
free(per_trace);
return ZX_ERR_NO_MEMORY;
}
per_trace->num_buffers = alloc.num_buffers;
per_trace->buffer_size = alloc.buffer_size;
dev->per_trace_state = per_trace;
return ZX_OK;
}
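// Handler for IOCTL_CPUPERF_FREE_TRACE: release the per-trace buffers.
// Fails if tracing is still active or nothing has been allocated.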
static zx_status_t pmu_free_trace(cpuperf_device_t* dev) {
zxlogf(TRACE, "%s called\n", __func__);
if (dev->active)
return ZX_ERR_BAD_STATE;
pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
pmu_free_buffers_for_trace(per_trace, per_trace->num_buffers);
free(per_trace);
dev->per_trace_state = NULL;
return ZX_OK;
}
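// Handler for IOCTL_CPUPERF_GET_ALLOC: report the current buffer allocation.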
static zx_status_t pmu_get_alloc(cpuperf_device_t* dev,
void* reply, size_t replymax,
size_t* out_actual) {
zxlogf(TRACE, "%s called\n", __func__);
const pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
ioctl_cpuperf_alloc_t alloc;
if (replymax < sizeof(alloc))
return ZX_ERR_BUFFER_TOO_SMALL;
alloc.num_buffers = per_trace->num_buffers;
alloc.buffer_size = per_trace->buffer_size;
memcpy(reply, &alloc, sizeof(alloc));
*out_actual = sizeof(alloc);
return ZX_OK;
}
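// Handler for IOCTL_CPUPERF_GET_BUFFER_HANDLE: return a duplicate of the
// requested per-cpu buffer's VMO handle.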
static zx_status_t pmu_get_buffer_handle(cpuperf_device_t* dev,
const void* cmd, size_t cmdlen,
void* reply, size_t replymax,
size_t* out_actual) {
zxlogf(TRACE, "%s called\n", __func__);
const pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
ioctl_cpuperf_buffer_handle_req_t req;
zx_handle_t h;
if (cmdlen != sizeof(req))
return ZX_ERR_INVALID_ARGS;
if (replymax < sizeof(h))
return ZX_ERR_BUFFER_TOO_SMALL;
memcpy(&req, cmd, sizeof(req));
if (req.descriptor >= per_trace->num_buffers)
return ZX_ERR_INVALID_ARGS;
zx_status_t status = zx_handle_duplicate(per_trace->buffers[req.descriptor].vmo_handle, ZX_RIGHT_SAME_RIGHTS, &h);
if (status < 0)
return status;
memcpy(reply, &h, sizeof(h));
*out_actual = sizeof(h);
return ZX_OK;
}
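// Scratch state used while validating a client config and converting it to
// the kernel's zx_x86_pmu_config_t form.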
typedef struct {
// Maximum number of each event we can handle.
unsigned max_num_fixed;
unsigned max_num_programmable;
unsigned max_num_misc;
// The number of events in use.
unsigned num_fixed;
unsigned num_programmable;
unsigned num_misc;
// The maximum value the counter can have before overflowing.
uint64_t max_fixed_value;
uint64_t max_programmable_value;
// For catching duplicates of the fixed counters.
bool have_fixed[IPM_MAX_FIXED_COUNTERS];
// For catching duplicates of the misc events, 1 bit per event.
uint64_t have_misc[(NUM_MISC_EVENTS + 63) / 64];
bool have_timebase0_user;
} staging_state_t;
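// Stage one fixed-counter event from |icfg| slot |input_index| into |ocfg|.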
static zx_status_t pmu_stage_fixed_config(const cpuperf_config_t* icfg,
staging_state_t* ss,
unsigned input_index,
zx_x86_pmu_config_t* ocfg) {
const unsigned ii = input_index;
const cpuperf_event_id_t id = icfg->events[ii];
bool uses_timebase0 = !!(icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0);
unsigned counter = pmu_fixed_counter_number(id);
if (counter == IPM_MAX_FIXED_COUNTERS ||
counter >= countof(ocfg->fixed_ids) ||
counter >= ss->max_num_fixed) {
zxlogf(ERROR, "%s: Invalid fixed event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
if (ss->have_fixed[counter]) {
zxlogf(ERROR, "%s: Fixed event [%u] already provided\n",
__func__, counter);
return ZX_ERR_INVALID_ARGS;
}
ss->have_fixed[counter] = true;
ocfg->fixed_ids[ss->num_fixed] = id;
if ((uses_timebase0 && input_index != 0) || icfg->rate[ii] == 0) {
ocfg->fixed_initial_value[ss->num_fixed] = 0;
} else {
if (icfg->rate[ii] > ss->max_fixed_value) {
zxlogf(ERROR, "%s: Rate too large, event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
ocfg->fixed_initial_value[ss->num_fixed] =
ss->max_fixed_value - icfg->rate[ii] + 1;
}
// KISS: For now don't generate PMIs for counters that use
// another as the timebase.
if (!uses_timebase0 || ii == 0)
ocfg->fixed_ctrl |= IA32_FIXED_CTR_CTRL_PMI_MASK(counter);
unsigned enable = 0;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_OS)
enable |= FIXED_CTR_ENABLE_OS;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_USER)
enable |= FIXED_CTR_ENABLE_USR;
ocfg->fixed_ctrl |= enable << IA32_FIXED_CTR_CTRL_EN_SHIFT(counter);
ocfg->global_ctrl |= IA32_PERF_GLOBAL_CTRL_FIXED_EN_MASK(counter);
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0)
ocfg->fixed_flags[ss->num_fixed] |= IPM_CONFIG_FLAG_TIMEBASE;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_PC)
ocfg->fixed_flags[ss->num_fixed] |= IPM_CONFIG_FLAG_PC;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_LAST_BRANCH) {
if (!pmu_lbr_supported()) {
zxlogf(ERROR, "%s: Last branch not supported, event [%u]\n"
, __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
if (icfg->rate[ii] == 0 ||
((icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0) &&
ii != 0)) {
zxlogf(ERROR, "%s: Last branch requires own timebase, event [%u]\n"
, __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
ocfg->fixed_flags[ss->num_fixed] |= IPM_CONFIG_FLAG_LBR;
ocfg->debug_ctrl |= IA32_DEBUGCTL_LBR_MASK;
}
++ss->num_fixed;
return ZX_OK;
}
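// Stage one programmable (arch or model) event from |icfg| slot |input_index|
// into |ocfg|, building its IA32_PERFEVTSEL value.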
static zx_status_t pmu_stage_programmable_config(const cpuperf_config_t* icfg,
staging_state_t* ss,
unsigned input_index,
zx_x86_pmu_config_t* ocfg) {
const unsigned ii = input_index;
cpuperf_event_id_t id = icfg->events[ii];
unsigned group = CPUPERF_EVENT_ID_GROUP(id);
unsigned event = CPUPERF_EVENT_ID_EVENT(id);
bool uses_timebase0 = !!(icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0);
// TODO(dje): Verify no duplicates.
if (ss->num_programmable == ss->max_num_programmable) {
zxlogf(ERROR, "%s: Too many programmable counters provided\n",
__func__);
return ZX_ERR_INVALID_ARGS;
}
ocfg->programmable_ids[ss->num_programmable] = id;
if ((uses_timebase0 && input_index != 0) || icfg->rate[ii] == 0) {
ocfg->programmable_initial_value[ss->num_programmable] = 0;
} else {
if (icfg->rate[ii] > ss->max_programmable_value) {
zxlogf(ERROR, "%s: Rate too large, event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
ocfg->programmable_initial_value[ss->num_programmable] =
ss->max_programmable_value - icfg->rate[ii] + 1;
}
const event_details_t* details = NULL;
switch (group) {
case CPUPERF_GROUP_ARCH:
if (event >= countof(kArchEventMap)) {
zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
details = &kArchEvents[kArchEventMap[event]];
break;
case CPUPERF_GROUP_MODEL:
if (event >= countof(kModelEventMap)) {
zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
details = &kModelEvents[kModelEventMap[event]];
break;
default:
zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
if (details->event == 0 && details->umask == 0) {
zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
uint64_t evtsel = 0;
evtsel |= details->event << IA32_PERFEVTSEL_EVENT_SELECT_SHIFT;
evtsel |= details->umask << IA32_PERFEVTSEL_UMASK_SHIFT;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_OS)
evtsel |= IA32_PERFEVTSEL_OS_MASK;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_USER)
evtsel |= IA32_PERFEVTSEL_USR_MASK;
if (details->flags & IPM_REG_FLAG_EDG)
evtsel |= IA32_PERFEVTSEL_E_MASK;
if (details->flags & IPM_REG_FLAG_ANYT)
evtsel |= IA32_PERFEVTSEL_ANY_MASK;
if (details->flags & IPM_REG_FLAG_INV)
evtsel |= IA32_PERFEVTSEL_INV_MASK;
evtsel |= (details->flags & IPM_REG_FLAG_CMSK_MASK) << IA32_PERFEVTSEL_CMASK_SHIFT;
// KISS: For now don't generate PMIs for counters that use
// another as the timebase. We still generate interrupts in
// "counting mode" in case the counter overflows.
if (!uses_timebase0 || ii == 0)
evtsel |= IA32_PERFEVTSEL_INT_MASK;
evtsel |= IA32_PERFEVTSEL_EN_MASK;
ocfg->programmable_events[ss->num_programmable] = evtsel;
ocfg->global_ctrl |= IA32_PERF_GLOBAL_CTRL_PMC_EN_MASK(ss->num_programmable);
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0)
ocfg->programmable_flags[ss->num_programmable] |= IPM_CONFIG_FLAG_TIMEBASE;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_PC)
ocfg->programmable_flags[ss->num_programmable] |= IPM_CONFIG_FLAG_PC;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_LAST_BRANCH) {
if (!pmu_lbr_supported()) {
zxlogf(ERROR, "%s: Last branch not supported, event [%u]\n"
, __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
if (icfg->rate[ii] == 0 ||
((icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0) &&
ii != 0)) {
zxlogf(ERROR, "%s: Last branch requires own timebase, event [%u]\n"
, __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
ocfg->programmable_flags[ss->num_programmable] |= IPM_CONFIG_FLAG_LBR;
ocfg->debug_ctrl |= IA32_DEBUGCTL_LBR_MASK;
}
++ss->num_programmable;
return ZX_OK;
}
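// Stage one misc event from |icfg| slot |input_index| into |ocfg|.
// Misc events cannot have their own sampling rate: they either use
// timebase 0 or their rate must be zero.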
static zx_status_t pmu_stage_misc_config(const cpuperf_config_t* icfg,
staging_state_t* ss,
unsigned input_index,
zx_x86_pmu_config_t* ocfg) {
const unsigned ii = input_index;
cpuperf_event_id_t id = icfg->events[ii];
int event = pmu_lookup_misc_event(id);
if (event < 0) {
zxlogf(ERROR, "%s: Invalid misc event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
if (ss->num_misc == ss->max_num_misc) {
zxlogf(ERROR, "%s: Too many misc counters provided\n",
__func__);
return ZX_ERR_INVALID_ARGS;
}
if (ss->have_misc[event / 64] & (1ul << (event % 64))) {
zxlogf(ERROR, "%s: Misc event [%u] already provided\n",
__func__, ii);
return ZX_ERR_INVALID_ARGS;
}
ss->have_misc[event / 64] |= 1ul << (event % 64);
ocfg->misc_ids[ss->num_misc] = id;
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0) {
ocfg->misc_flags[ss->num_misc] |= IPM_CONFIG_FLAG_TIMEBASE;
} else {
if (icfg->rate[ii] != 0) {
zxlogf(ERROR, "%s: Misc event [%u] requires a timebase\n",
__func__, ii);
return ZX_ERR_INVALID_ARGS;
}
}
++ss->num_misc;
return ZX_OK;
}
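// Handler for IOCTL_CPUPERF_STAGE_CONFIG: validate the client's config and
// convert it to the internal form handed to the kernel at START time.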
static zx_status_t pmu_stage_config(cpuperf_device_t* dev,
const void* cmd, size_t cmdlen) {
zxlogf(TRACE, "%s called\n", __func__);
if (dev->active)
return ZX_ERR_BAD_STATE;
pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
// If we subsequently get an error, make sure any previous configuration
// can't be used.
per_trace->configured = false;
cpuperf_config_t ioctl_config;
cpuperf_config_t* icfg = &ioctl_config;
if (cmdlen != sizeof(*icfg))
return ZX_ERR_INVALID_ARGS;
memcpy(icfg, cmd, sizeof(*icfg));
zx_x86_pmu_config_t* ocfg = &per_trace->config;
memset(ocfg, 0, sizeof(*ocfg));
// Validate the config and convert it to our internal form.
// TODO(dje): Multiplexing support.
staging_state_t staging_state;
staging_state_t* ss = &staging_state;
ss->max_num_fixed = pmu_properties.num_fixed_events;
ss->max_num_programmable = pmu_properties.num_programmable_events;
ss->max_num_misc = pmu_properties.num_misc_events;
ss->num_fixed = 0;
ss->num_programmable = 0;
ss->num_misc = 0;
ss->max_fixed_value =
(pmu_properties.fixed_counter_width < 64
? (1ul << pmu_properties.fixed_counter_width) - 1
: ~0ul);
ss->max_programmable_value =
(pmu_properties.programmable_counter_width < 64
? (1ul << pmu_properties.programmable_counter_width) - 1
: ~0ul);
for (unsigned i = 0; i < countof(ss->have_fixed); ++i)
ss->have_fixed[i] = false;
for (unsigned i = 0; i < countof(ss->have_misc); ++i)
ss->have_misc[i] = 0;
ss->have_timebase0_user = false;
zx_status_t status;
unsigned ii; // ii: input index
for (ii = 0; ii < countof(icfg->events); ++ii) {
cpuperf_event_id_t id = icfg->events[ii];
zxlogf(TRACE, "%s: processing [%u] = %u\n", __func__, ii, id);
if (id == 0)
break;
unsigned group = CPUPERF_EVENT_ID_GROUP(id);
if (icfg->flags[ii] & ~CPUPERF_CONFIG_FLAG_MASK) {
zxlogf(ERROR, "%s: reserved flag bits set [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
switch (group) {
case CPUPERF_GROUP_FIXED:
status = pmu_stage_fixed_config(icfg, ss, ii, ocfg);
if (status != ZX_OK)
return status;
break;
case CPUPERF_GROUP_ARCH:
case CPUPERF_GROUP_MODEL:
status = pmu_stage_programmable_config(icfg, ss, ii, ocfg);
if (status != ZX_OK)
return status;
break;
case CPUPERF_GROUP_MISC:
status = pmu_stage_misc_config(icfg, ss, ii, ocfg);
if (status != ZX_OK)
return status;
break;
default:
zxlogf(ERROR, "%s: Invalid event [%u] (bad group)\n",
__func__, ii);
return ZX_ERR_INVALID_ARGS;
}
if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0)
ss->have_timebase0_user = true;
}
if (ii == 0) {
zxlogf(ERROR, "%s: No events provided\n", __func__);
return ZX_ERR_INVALID_ARGS;
}
// Ensure there are no holes.
for (; ii < countof(icfg->events); ++ii) {
if (icfg->events[ii] != 0) {
zxlogf(ERROR, "%s: Hole at event [%u]\n", __func__, ii);
return ZX_ERR_INVALID_ARGS;
}
}
if (ss->have_timebase0_user) {
ocfg->timebase_id = icfg->events[0];
}
#if TRY_FREEZE_ON_PMI
ocfg->debug_ctrl |= IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI_MASK;
#endif
// Require something to be enabled in order to start tracing.
// This is mostly a sanity check.
if (per_trace->config.global_ctrl == 0) {
zxlogf(ERROR, "%s: Requested config doesn't collect any data\n",
__func__);
return ZX_ERR_INVALID_ARGS;
}
per_trace->ioctl_config = *icfg;
per_trace->configured = true;
return ZX_OK;
}
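// Handler for IOCTL_CPUPERF_GET_CONFIG: return the currently staged config.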
static zx_status_t pmu_get_config(cpuperf_device_t* dev,
void* reply, size_t replymax,
size_t* out_actual) {
zxlogf(TRACE, "%s called\n", __func__);
const pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
if (!per_trace->configured)
return ZX_ERR_BAD_STATE;
const cpuperf_config_t* config = &per_trace->ioctl_config;
if (replymax < sizeof(*config))
return ZX_ERR_BUFFER_TOO_SMALL;
memcpy(reply, config, sizeof(*config));
*out_actual = sizeof(*config);
return ZX_OK;
}
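// Handler for IOCTL_CPUPERF_START: hand the per-cpu buffers and staged config
// to the kernel and begin data collection on every cpu.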
static zx_status_t pmu_start(cpuperf_device_t* dev) {
zxlogf(TRACE, "%s called\n", __func__);
if (dev->active)
return ZX_ERR_BAD_STATE;
pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
if (!per_trace->configured)
return ZX_ERR_BAD_STATE;
// Step 1: Get the configuration data into the kernel for use by START.
zxlogf(TRACE, "%s: global ctrl 0x%" PRIx64 ", fixed ctrl 0x%" PRIx64 "\n",
__func__, per_trace->config.global_ctrl,
per_trace->config.fixed_ctrl);
// |per_trace->configured| should not have been set if there's nothing
// to trace.
assert(per_trace->config.global_ctrl != 0);
zx_handle_t resource = get_root_resource();
zx_status_t status =
zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
MTRACE_CPUPERF_INIT, 0, NULL, 0);
if (status != ZX_OK)
return status;
uint32_t num_cpus = zx_system_get_num_cpus();
for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
zx_x86_pmu_buffer_t buffer;
io_buffer_t* io_buffer = &per_trace->buffers[cpu];
buffer.vmo = io_buffer->vmo_handle;
status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
MTRACE_CPUPERF_ASSIGN_BUFFER, cpu,
&buffer, sizeof(buffer));
if (status != ZX_OK)
goto fail;
}
status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
MTRACE_CPUPERF_STAGE_CONFIG, 0,
&per_trace->config, sizeof(per_trace->config));
if (status != ZX_OK)
goto fail;
// Step 2: Start data collection.
status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF, MTRACE_CPUPERF_START,
0, NULL, 0);
if (status != ZX_OK)
goto fail;
dev->active = true;
return ZX_OK;
fail:
{
zx_status_t status2 =
zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
MTRACE_CPUPERF_FINI, 0, NULL, 0);
if (status2 != ZX_OK)
zxlogf(TRACE, "%s: MTRACE_CPUPERF_FINI failed: %d\n", __func__, status2);
assert(status2 == ZX_OK);
return status;
}
}
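// Handler for IOCTL_CPUPERF_STOP: stop data collection and tear down the
// kernel-side state (MTRACE_CPUPERF_STOP, then MTRACE_CPUPERF_FINI).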
static zx_status_t pmu_stop(cpuperf_device_t* dev) {
zxlogf(TRACE, "%s called\n", __func__);
pmu_per_trace_state_t* per_trace = dev->per_trace_state;
if (!per_trace)
return ZX_ERR_BAD_STATE;
zx_handle_t resource = get_root_resource();
zx_status_t status =
zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
MTRACE_CPUPERF_STOP, 0, NULL, 0);
if (status == ZX_OK) {
dev->active = false;
status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
MTRACE_CPUPERF_FINI, 0, NULL, 0);
}
return status;
}
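// Dispatch one CPUPERF ioctl. A plausible client sequence, inferred from the
// state checks in the handlers above (other orderings are possible):
//   ALLOC_TRACE -> STAGE_CONFIG -> START -> ... -> STOP ->
//   GET_BUFFER_HANDLE (per cpu) -> FREE_TRACE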
zx_status_t cpuperf_ioctl_worker(cpuperf_device_t* dev, uint32_t op,
const void* cmd, size_t cmdlen,
void* reply, size_t replymax,
size_t* out_actual) {
assert(IOCTL_FAMILY(op) == IOCTL_FAMILY_CPUPERF);
switch (op) {
case IOCTL_CPUPERF_GET_PROPERTIES:
if (cmdlen != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_get_properties(dev, reply, replymax, out_actual);
case IOCTL_CPUPERF_ALLOC_TRACE:
if (replymax != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_alloc_trace(dev, cmd, cmdlen);
case IOCTL_CPUPERF_FREE_TRACE:
if (cmdlen != 0 || replymax != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_free_trace(dev);
case IOCTL_CPUPERF_GET_ALLOC:
if (cmdlen != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_get_alloc(dev, reply, replymax, out_actual);
case IOCTL_CPUPERF_GET_BUFFER_HANDLE:
return pmu_get_buffer_handle(dev, cmd, cmdlen, reply, replymax, out_actual);
case IOCTL_CPUPERF_STAGE_CONFIG:
if (replymax != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_stage_config(dev, cmd, cmdlen);
case IOCTL_CPUPERF_GET_CONFIG:
return pmu_get_config(dev, reply, replymax, out_actual);
case IOCTL_CPUPERF_START:
if (cmdlen != 0 || replymax != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_start(dev);
case IOCTL_CPUPERF_STOP:
if (cmdlen != 0 || replymax != 0)
return ZX_ERR_INVALID_ARGS;
return pmu_stop(dev);
default:
return ZX_ERR_INVALID_ARGS;
}
}
// Devhost interface.
static zx_status_t cpuperf_open(void* ctx, zx_device_t** dev_out, uint32_t flags) {
cpuperf_device_t* dev = ctx;
if (dev->opened)
return ZX_ERR_ALREADY_BOUND;
dev->opened = true;
return ZX_OK;
}
static zx_status_t cpuperf_close(void* ctx, uint32_t flags) {
cpuperf_device_t* dev = ctx;
dev->opened = false;
return ZX_OK;
}
static zx_status_t cpuperf_ioctl(void* ctx, uint32_t op,
const void* cmd, size_t cmdlen,
void* reply, size_t replymax,
size_t* out_actual) {
cpuperf_device_t* dev = ctx;
mtx_lock(&dev->lock);
ssize_t result;
switch (IOCTL_FAMILY(op)) {
case IOCTL_FAMILY_CPUPERF:
result = cpuperf_ioctl_worker(dev, op, cmd, cmdlen,
reply, replymax, out_actual);
break;
default:
result = ZX_ERR_INVALID_ARGS;
break;
}
mtx_unlock(&dev->lock);
return result;
}
static void cpuperf_release(void* ctx) {
cpuperf_device_t* dev = ctx;
// TODO(dje): None of these should fail. What to do?
// Suggest flagging things as busted and preventing further use.
pmu_stop(dev);
pmu_free_trace(dev);
zx_handle_close(dev->bti);
free(dev);
}
static zx_protocol_device_t cpuperf_device_proto = {
.version = DEVICE_OPS_VERSION,
.open = cpuperf_open,
.close = cpuperf_close,
.ioctl = cpuperf_ioctl,
.release = cpuperf_release,
};
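// Driver bind entry point: do the one-time PMU initialization, fetch the
// parent's platform-device protocol and BTI, and publish the "cpuperf" device.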
zx_status_t cpuperf_bind(void* ctx, zx_device_t* parent) {
zx_status_t status = cpuperf_init_once();
if (status != ZX_OK) {
return status;
}
pdev_protocol_t pdev;
status = device_get_protocol(parent, ZX_PROTOCOL_PDEV, &pdev);
if (status != ZX_OK) {
return status;
}
cpuperf_device_t* dev = calloc(1, sizeof(*dev));
if (!dev) {
return ZX_ERR_NO_MEMORY;
}
dev->bti = ZX_HANDLE_INVALID;
status = pdev_get_bti(&pdev, 0, &dev->bti);
if (status != ZX_OK) {
goto fail;
}
device_add_args_t args = {
.version = DEVICE_ADD_ARGS_VERSION,
.name = "cpuperf",
.ctx = dev,
.ops = &cpuperf_device_proto,
};
if ((status = device_add(parent, &args, NULL)) < 0) {
goto fail;
}
return ZX_OK;
fail:
zx_handle_close(dev->bti);
free(dev);
return status;
}