// blob: b3828471be610602389b02a2a5dd53e88f50ed09 [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
// TODO(fxbug.dev/30938): Need to be able to r/w MSRs.
// The thought is to use resources (as in ResourceDispatcher), at which point
// this will all get rewritten. Until such time, the goal here is KISS.
// This file contains the lower part of Intel Processor Trace support that must
// be done in the kernel (so that we can read/write msrs).
// The userspace driver is in system/dev/misc/cpu-trace/intel-pt.c.
//
// We currently only support Table of Physical Addresses mode:
// it supports discontiguous buffers and supports stop-on-full behavior
// in addition to wrap-around.
//
// IPT tracing has two "modes":
// - per-cpu tracing
// - thread-specific tracing
// Tracing can only be done in one mode at a time. This is because saving/
// restoring thread PT state via the xsaves/xrstors instructions is a global
// flag in the XSS msr.
// Plus once a trace has been done with IPT_MODE_THREAD one cannot go back
// to IPT_MODE_CPU: supporting this requires flushing trace state from all
// threads which is a bit of work. For now it's easy enough to just require
// the user to reboot. fxbug.dev/30840
#include "arch/x86/proc_trace.h"
#include <lib/arch/x86/boot-cpuid.h>
#include <lib/ktrace.h>
#include <lib/zircon-internal/device/cpu-trace/intel-pt.h>
#include <lib/zircon-internal/ktrace.h>
#include <lib/zircon-internal/mtrace.h>
#include <lib/zircon-internal/thread_annotations.h>
#include <pow2.h>
#include <string.h>
#include <trace.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/arch_ops.h>
#include <arch/x86.h>
#include <arch/x86/feature.h>
#include <arch/x86/mmu.h>
#include <fbl/auto_lock.h>
#include <fbl/macros.h>
#include <kernel/cpu.h>
#include <kernel/mp.h>
#include <kernel/mutex.h>
#include <kernel/thread.h>
#include <ktl/unique_ptr.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
// Local debug switch. In this file it gates the per-cpu register dumps
// printed by x86_ipt_start() and x86_ipt_stop().
#define LOCAL_TRACE 0
// Control MSRs
// Addresses of the Intel PT (IA32_RTIT_*) model-specific registers.
// OUTPUT_BASE and OUTPUT_MASK_PTRS are loaded as the ToPA configuration when
// a trace starts and read back when it stops.
#define IA32_RTIT_OUTPUT_BASE 0x560
#define IA32_RTIT_OUTPUT_MASK_PTRS 0x561
// Writing 0 to CTL disables tracing; writing the staged ctl value enables it.
#define IA32_RTIT_CTL 0x570
#define IA32_RTIT_STATUS 0x571
// Only accessed when supports_cr3_filtering is set.
#define IA32_RTIT_CR3_MATCH 0x572
// Address range MSRs. In the code visible here these are defined but not yet
// read or written (see the "addr range msrs" TODOs below).
#define IA32_RTIT_ADDR0_A 0x580
#define IA32_RTIT_ADDR0_B 0x581
#define IA32_RTIT_ADDR1_A 0x582
#define IA32_RTIT_ADDR1_B 0x583
#define IA32_RTIT_ADDR2_A 0x584
#define IA32_RTIT_ADDR2_B 0x585
#define IA32_RTIT_ADDR3_A 0x586
#define IA32_RTIT_ADDR3_B 0x587
// We need bits[15:8] to get the "maximum non-turbo ratio".
// See libipt:intel-pt.h:pt_config, and Intel Vol. 3 chapter 35.5.
// The value is emitted as sideband data in the TAG_IPT_START ktrace record.
#define IA32_PLATFORM_INFO 0xce
// Our own copy of what h/w supports, mostly for sanity checking.
// All flags default to false and are filled in by x86_processor_trace_init()
// from the boot CPUID data; they stay false when Intel PT is not present.
// supports_pt gates every public entry point in this file.
static bool supports_pt = false;
// Gates every access to IA32_RTIT_CR3_MATCH.
static bool supports_cr3_filtering = false;
// The remaining flags are recorded here but not consulted by the code visible
// in this file.
static bool supports_psb = false;
static bool supports_ip_filtering = false;
static bool supports_mtc = false;
static bool supports_ptwrite = false;
static bool supports_power_events = false;
static bool supports_output_topa = false;
static bool supports_output_topa_multi = false;
static bool supports_output_single = false;
static bool supports_output_transport = false;
// Register values for one trace (one entry per cpu in cpu mode).
//
// Entries are filled in by x86_ipt_stage_trace_data() before a trace starts,
// loaded into the MSRs by x86_ipt_start_cpu_task(), refreshed from the MSRs by
// x86_ipt_stop_cpu_task(), and handed back by x86_ipt_get_trace_data().
struct ipt_trace_state_t {
  // Value for IA32_RTIT_CTL; forced to 0 when the trace is stopped.
  uint64_t ctl;
  // Value of IA32_RTIT_STATUS.
  uint64_t status;
  // Value of IA32_RTIT_OUTPUT_BASE.
  uint64_t output_base;
  // Value of IA32_RTIT_OUTPUT_MASK_PTRS.
  uint64_t output_mask_ptrs;
  // Value for IA32_RTIT_CR3_MATCH; only loaded when cr3 filtering is supported.
  uint64_t cr3_match;
  // Address ranges. These are copied to/from the caller's register block but
  // are not yet loaded into the ADDRn_A/B MSRs by the code in this file.
  struct {
    uint64_t a;
    uint64_t b;
  } addr_ranges[IPT_MAX_NUM_ADDR_RANGES];
};
// Single lock serializing all Intel PT state below and every public entry
// point in this file.
namespace {
DECLARE_SINGLETON_MUTEX(IptLock);
} // namespace
// Array of ipt_num_traces entries allocated by x86_ipt_alloc_trace() and
// released by x86_ipt_free_trace(); null when nothing is allocated.
static ipt_trace_state_t* ipt_trace_state TA_GUARDED(IptLock::Get());
// True between a successful x86_ipt_start() and the matching x86_ipt_stop().
static bool active TA_GUARDED(IptLock::Get()) = false;
// Mode selected by the most recent successful x86_ipt_alloc_trace().
static zx_insntrace_trace_mode_t trace_mode TA_GUARDED(IptLock::Get()) = IPT_MODE_CPU;
// Number of entries in ipt_trace_state.
// In cpu mode this is arch_max_num_cpus.
// In thread mode this is provided by the user.
static uint32_t ipt_num_traces TA_GUARDED(IptLock::Get());
// Records which Intel Processor Trace features the boot cpu reports.
//
// Leaves every supports_* flag at its default (false) when CPUID does not
// advertise Intel PT at all.
void x86_processor_trace_init(void) {
  const bool has_intel_pt = arch::BootCpuid<arch::CpuidExtendedFeatureFlagsB>().intel_pt();
  if (!has_intel_pt) {
    return;
  }

  supports_pt = true;

  // Keep our own copy of these flags, mostly for potential sanity checks.

  // Capabilities reported in the "B" register of the main PT leaf.
  auto main_leaf_b = arch::BootCpuid<arch::CpuidProcessorTraceMainB>();
  supports_cr3_filtering = main_leaf_b.crc3_filtering();
  supports_psb = main_leaf_b.psb();
  supports_ip_filtering = main_leaf_b.ip_filtering();
  supports_mtc = main_leaf_b.mtc();
  supports_ptwrite = main_leaf_b.ptwrite();
  supports_power_events = main_leaf_b.power_event_trace();

  // Output-scheme capabilities reported in the "C" register of the same leaf.
  auto main_leaf_c = arch::BootCpuid<arch::CpuidProcessorTraceMainC>();
  supports_output_topa = main_leaf_c.topa();
  supports_output_topa_multi = main_leaf_c.topa_multi();
  supports_output_single = main_leaf_c.single_range_output();
  supports_output_transport = main_leaf_c.trace_transport();
}
// Intel Processor Trace support needs to be able to map cr3 values that
// appear in the trace to pids that ld.so uses to dump memory maps.
//
// Emits a TAG_IPT_PROCESS_CREATE ktrace record carrying |pid| and the
// physical address of the process's page table root, each split into
// low/high 32-bit halves.
void arch_trace_process_create(uint64_t pid, paddr_t pt_phys) {
  // The cr3 value that appears in Intel PT h/w tracing.
  const uint64_t cr3 = pt_phys;

  const uint32_t pid_lo = static_cast<uint32_t>(pid);
  const uint32_t pid_hi = static_cast<uint32_t>(pid >> 32);
  const uint32_t cr3_lo = static_cast<uint32_t>(cr3);
  const uint32_t cr3_hi = static_cast<uint32_t>(cr3 >> 32);

  ktrace(TAG_IPT_PROCESS_CREATE, pid_lo, pid_hi, cr3_lo, cr3_hi);
}
// Per-cpu worker for x86_ipt_alloc_trace(), run on every cpu.
// It is dispatched through mp_sync_exec, which thread safety analysis cannot
// follow, hence the explicit lock requirement.
//
// |raw_context| is not a real pointer: it carries the requested
// zx_insntrace_trace_mode_t value encoded as an integer.
static void x86_ipt_set_mode_task(void* raw_context) TA_REQ(IptLock::Get()) {
  DEBUG_ASSERT(arch_ints_disabled());
  DEBUG_ASSERT(!active);

  const auto requested_mode =
      static_cast<zx_insntrace_trace_mode_t>(reinterpret_cast<uintptr_t>(raw_context));

  // When changing modes make sure all PT MSRs are in the init state.
  // We don't want a value to appear in the xsave buffer and have xrstors
  // #gp because XCOMP_BV has the PT bit set that's not set in XSS.
  // We still need to do this, even with fxbug.dev/30840, when transitioning
  // from IPT_MODE_CPU to IPT_MODE_THREAD.
  // CTL goes first so that tracing is off before the rest is cleared.
  constexpr uint32_t kAlwaysResetMsrs[] = {
      IA32_RTIT_CTL,
      IA32_RTIT_STATUS,
      IA32_RTIT_OUTPUT_BASE,
      IA32_RTIT_OUTPUT_MASK_PTRS,
  };
  for (const uint32_t msr : kAlwaysResetMsrs) {
    write_msr(msr, 0);
  }
  if (supports_cr3_filtering) {
    write_msr(IA32_RTIT_CR3_MATCH, 0);
  }
  // TODO(dje): addr range msrs

  // PT state saving, if supported, was enabled during boot so there's no
  // need to recalculate the xsave space needed.
  const bool save_pt_state_with_threads = (requested_mode == IPT_MODE_THREAD);
  x86_set_extended_register_pt_state(save_pt_state_with_threads);
}
// Allocates the per-trace register state and switches every cpu to |mode|.
//
// Returns:
//   ZX_ERR_NOT_SUPPORTED  |mode| is not IPT_MODE_CPU, the h/w lacks Intel PT,
//                         or a switch from thread mode back to cpu mode was
//                         requested.
//   ZX_ERR_INVALID_ARGS   |num_traces| is not arch_max_num_cpus().
//   ZX_ERR_BAD_STATE      a trace is active or state is already allocated.
//   ZX_ERR_NO_MEMORY      the state array could not be allocated.
//   ZX_OK                 otherwise.
zx_status_t x86_ipt_alloc_trace(zx_insntrace_trace_mode_t mode, uint32_t num_traces) {
  Guard<Mutex> guard(IptLock::Get());

  DEBUG_ASSERT(mode == IPT_MODE_CPU || mode == IPT_MODE_THREAD);

  // Argument checks come first so they take precedence over state checks.
  if (mode != IPT_MODE_CPU) {
    return ZX_ERR_NOT_SUPPORTED;
  }
  if (num_traces != arch_max_num_cpus()) {
    return ZX_ERR_INVALID_ARGS;
  }

  if (!supports_pt) {
    return ZX_ERR_NOT_SUPPORTED;
  }
  if (active || ipt_trace_state) {
    return ZX_ERR_BAD_STATE;
  }

  // fxbug.dev/30840: We don't support changing the mode from IPT_MODE_THREAD to
  // IPT_MODE_CPU: We can't turn off XSS.PT until we're sure all threads
  // have no PT state, and that's too tricky to do right now. Instead,
  // require the developer to reboot.
  const bool leaving_thread_mode = (trace_mode == IPT_MODE_THREAD) && (mode == IPT_MODE_CPU);
  if (leaving_thread_mode) {
    return ZX_ERR_NOT_SUPPORTED;
  }

  auto* zeroed_state =
      static_cast<ipt_trace_state_t*>(calloc(num_traces, sizeof(ipt_trace_state_t)));
  if (!zeroed_state) {
    return ZX_ERR_NO_MEMORY;
  }
  ipt_trace_state = zeroed_state;

  // The mode is smuggled to the per-cpu worker inside the context pointer.
  void* const mode_as_context = reinterpret_cast<void*>(static_cast<uintptr_t>(mode));
  mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_set_mode_task, mode_as_context);

  trace_mode = mode;
  ipt_num_traces = num_traces;
  return ZX_OK;
}
// Free resources obtained by x86_ipt_alloc_trace().
// This doesn't care if resources have already been freed to save callers
// from having to care during any cleanup.
zx_status_t x86_ipt_free_trace() {
Guard<Mutex> guard(IptLock::Get());
// Terminating tracing in thread mode is done differently: Tracing state
// is recorded, in part, with traced threads.
// This is the only situation where this fails.
// TODO(fxbug.dev/30840): We could take a more heavy-handed approach here and
// do the work necessary to clear out tracing on all threads. It's a bit
// of work, but the resulting functionality would simplify the u/i.
if (trace_mode == IPT_MODE_THREAD) {
return ZX_ERR_BAD_STATE;
}
if (!supports_pt) {
// If tracing is not supported we're already terminated.
return ZX_OK;
}
if (active) {
[[maybe_unused]] zx_status_t status = x86_ipt_stop();
// This should succeed. The only time it can fail is in thread-mode,
// but we've already checked for that.
DEBUG_ASSERT(status == ZX_OK);
DEBUG_ASSERT(!active);
}
free(ipt_trace_state);
ipt_trace_state = nullptr;
return ZX_OK;
}
// Per-cpu worker for x86_ipt_start(): loads this cpu's staged register values
// into the PT MSRs and turns tracing on.
//
// |raw_context| is the ipt_trace_state array; it is indexed by the current
// cpu number.
static void x86_ipt_start_cpu_task(void* raw_context) TA_REQ(IptLock::Get()) {
  DEBUG_ASSERT(arch_ints_disabled());
  DEBUG_ASSERT(active && raw_context);

  const ipt_trace_state_t* const all_cpus = static_cast<ipt_trace_state_t*>(raw_context);
  const ipt_trace_state_t& staged = all_cpus[arch_curr_cpu_num()];

  // Tracing must not already be running on this cpu.
  DEBUG_ASSERT(!(read_msr(IA32_RTIT_CTL) & IPT_CTL_TRACE_EN_MASK));

  // Load the ToPA configuration
  write_msr(IA32_RTIT_OUTPUT_BASE, staged.output_base);
  write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, staged.output_mask_ptrs);

  // Load all other msrs, prior to enabling tracing.
  write_msr(IA32_RTIT_STATUS, staged.status);
  if (supports_cr3_filtering) {
    write_msr(IA32_RTIT_CR3_MATCH, staged.cr3_match);
  }

  // Enable the trace. This has to be the last write.
  write_msr(IA32_RTIT_CTL, staged.ctl);
}
// Begin the trace.
//
// Emits the sideband ktrace records the trace reader needs (TAG_IPT_START and
// TAG_IPT_CPU_INFO) and, in cpu mode, loads the staged registers on every cpu.
//
// Returns ZX_ERR_NOT_SUPPORTED without Intel PT, ZX_ERR_BAD_STATE in thread
// mode, when already active, or when nothing has been allocated, and ZX_OK
// otherwise.
zx_status_t x86_ipt_start() {
  Guard<Mutex> guard(IptLock::Get());

  if (!supports_pt) {
    return ZX_ERR_NOT_SUPPORTED;
  }
  if (trace_mode == IPT_MODE_THREAD || active || !ipt_trace_state) {
    return ZX_ERR_BAD_STATE;
  }

  const uint64_t kernel_cr3 = x86_kernel_cr3();
  TRACEF("Starting processor trace, kernel cr3: 0x%" PRIxPTR "\n", kernel_cr3);

  if (LOCAL_TRACE && trace_mode == IPT_MODE_CPU) {
    for (uint32_t idx = 0; idx < ipt_num_traces; ++idx) {
      const ipt_trace_state_t& entry = ipt_trace_state[idx];
      TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64
             "\n",
             idx, entry.ctl, entry.status, entry.output_base, entry.output_mask_ptrs);
    }
  }

  // Must be set before the per-cpu workers run; they assert on it.
  active = true;

  // Sideband info needed by the trace reader.
  // Bits [15:8] of IA32_PLATFORM_INFO hold the "maximum non-turbo ratio".
  const uint64_t platform_info = read_msr(IA32_PLATFORM_INFO);
  const uint32_t max_non_turbo_ratio = static_cast<uint32_t>((platform_info >> 8) & 0xff);
  ktrace(TAG_IPT_START, max_non_turbo_ratio, 0, static_cast<uint32_t>(kernel_cr3),
         static_cast<uint32_t>(kernel_cr3 >> 32));

  const struct x86_model_info* const cpu_model = x86_get_model();
  ktrace(TAG_IPT_CPU_INFO, cpu_model->processor_type, cpu_model->display_family,
         cpu_model->display_model, cpu_model->stepping);

  if (trace_mode == IPT_MODE_CPU) {
    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_start_cpu_task, ipt_trace_state);
  }

  return ZX_OK;
}
// Per-cpu worker that turns tracing off on the current cpu, captures the
// resulting register values, and returns the PT MSRs to their zeroed state.
//
// |raw_context| is the ipt_trace_state array; it is indexed by the current
// cpu number.
static void x86_ipt_stop_cpu_task(void* raw_context) TA_REQ(IptLock::Get()) {
  DEBUG_ASSERT(arch_ints_disabled());
  DEBUG_ASSERT(raw_context);

  ipt_trace_state_t* const all_cpus = static_cast<ipt_trace_state_t*>(raw_context);
  ipt_trace_state_t& captured = all_cpus[arch_curr_cpu_num()];

  // Disable the trace
  write_msr(IA32_RTIT_CTL, 0);

  // Retrieve msr values for later providing to userspace
  captured.ctl = 0;
  captured.status = read_msr(IA32_RTIT_STATUS);
  captured.output_base = read_msr(IA32_RTIT_OUTPUT_BASE);
  captured.output_mask_ptrs = read_msr(IA32_RTIT_OUTPUT_MASK_PTRS);

  // Zero all MSRs so that we are in the XSAVE initial configuration.
  // This allows h/w to do some optimizations regarding the state.
  constexpr uint32_t kAlwaysClearedMsrs[] = {
      IA32_RTIT_STATUS,
      IA32_RTIT_OUTPUT_BASE,
      IA32_RTIT_OUTPUT_MASK_PTRS,
  };
  for (const uint32_t msr : kAlwaysClearedMsrs) {
    write_msr(msr, 0);
  }
  if (supports_cr3_filtering) {
    write_msr(IA32_RTIT_CR3_MATCH, 0);
  }

  // TODO(dje): Make it explicit that packets have been completely written.
  // See Intel Vol 3 chapter 36.2.4.

  // TODO(teisenbe): Clear ADDR* MSRs depending on leaf 1
}
// Stops a running cpu-mode trace on every cpu and records the final register
// values in ipt_trace_state for later retrieval.
//
// This can be called while not active, so the caller doesn't have to care
// during any cleanup. In that case nothing is touched: in particular the
// staged or previously captured register values are left intact and no
// TAG_IPT_STOP record is emitted.
//
// Returns ZX_ERR_BAD_STATE in thread mode, ZX_OK otherwise.
zx_status_t x86_ipt_stop() {
  Guard<Mutex> guard(IptLock::Get());

  // Stopping tracing in thread mode is done differently: Tracing state
  // is recorded, in part, with traced threads.
  // This is the only situation where this fails.
  // TODO(fxbug.dev/30840): We could take a more heavy-handed approach here and
  // do the work necessary to clear out tracing on all threads. It's a bit
  // of work, but the resulting functionality would simplify the u/i.
  if (trace_mode == IPT_MODE_THREAD) {
    return ZX_ERR_BAD_STATE;
  }

  if (!supports_pt) {
    // If tracing is not supported we're already stopped.
    return ZX_OK;
  }
  if (!ipt_trace_state) {
    // If tracing is not enabled we're already stopped.
    return ZX_OK;
  }
  if (!active) {
    // Buffers are allocated but no trace is running. The PT MSRs were zeroed
    // by the mode-set task or by the previous stop, so running the per-cpu
    // stop task again would only overwrite the saved register state with
    // those zeros and emit an unmatched TAG_IPT_STOP record.
    return ZX_OK;
  }

  TRACEF("Stopping processor trace\n");

  if (trace_mode == IPT_MODE_CPU) {
    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_stop_cpu_task, ipt_trace_state);
  }

  ktrace(TAG_IPT_STOP, 0, 0, 0, 0);
  active = false;

  if (LOCAL_TRACE && trace_mode == IPT_MODE_CPU) {
    uint32_t num_cpus = ipt_num_traces;
    for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
      TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64
             "\n",
             cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
             ipt_trace_state[cpu].output_base, ipt_trace_state[cpu].output_mask_ptrs);
    }
  }

  return ZX_OK;
}
// Stores the register values in |regs| into trace slot |descriptor| so that
// they are loaded into the MSRs the next time tracing starts.
//
// Returns ZX_ERR_NOT_SUPPORTED without Intel PT, ZX_ERR_BAD_STATE while a
// cpu-mode trace is active or when nothing is allocated, ZX_ERR_INVALID_ARGS
// when |descriptor| is out of range, and ZX_OK otherwise.
//
// NOTE(review): the values are stored as-is and later written to MSRs;
// presumably the caller has validated them - confirm.
zx_status_t x86_ipt_stage_trace_data(zx_insntrace_buffer_descriptor_t descriptor,
                                     const zx_x86_pt_regs_t* regs) {
  Guard<Mutex> guard(IptLock::Get());

  if (!supports_pt) {
    return ZX_ERR_NOT_SUPPORTED;
  }
  const bool cpu_trace_running = (trace_mode == IPT_MODE_CPU) && active;
  if (cpu_trace_running || !ipt_trace_state) {
    return ZX_ERR_BAD_STATE;
  }
  if (descriptor >= ipt_num_traces) {
    return ZX_ERR_INVALID_ARGS;
  }

  ipt_trace_state_t& slot = ipt_trace_state[descriptor];
  slot.ctl = regs->ctl;
  slot.status = regs->status;
  slot.output_base = regs->output_base;
  slot.output_mask_ptrs = regs->output_mask_ptrs;
  slot.cr3_match = regs->cr3_match;

  // The two address range arrays are copied wholesale, so they must agree in
  // size.
  static_assert(sizeof(ipt_trace_state_t::addr_ranges) == sizeof(zx_x86_pt_regs_t::addr_ranges),
                "addr_ranges size mismatch");
  memcpy(slot.addr_ranges, regs->addr_ranges, sizeof(zx_x86_pt_regs_t::addr_ranges));

  return ZX_OK;
}
// Copies the register values held in trace slot |descriptor| out to |regs|.
//
// Returns ZX_ERR_NOT_SUPPORTED without Intel PT, ZX_ERR_BAD_STATE while a
// cpu-mode trace is active or when nothing is allocated, ZX_ERR_INVALID_ARGS
// when |descriptor| is out of range, and ZX_OK otherwise.
zx_status_t x86_ipt_get_trace_data(zx_insntrace_buffer_descriptor_t descriptor,
                                   zx_x86_pt_regs_t* regs) {
  Guard<Mutex> guard(IptLock::Get());

  if (!supports_pt) {
    return ZX_ERR_NOT_SUPPORTED;
  }
  const bool cpu_trace_running = (trace_mode == IPT_MODE_CPU) && active;
  if (cpu_trace_running || !ipt_trace_state) {
    return ZX_ERR_BAD_STATE;
  }
  if (descriptor >= ipt_num_traces) {
    return ZX_ERR_INVALID_ARGS;
  }

  const ipt_trace_state_t& slot = ipt_trace_state[descriptor];
  regs->ctl = slot.ctl;
  regs->status = slot.status;
  regs->output_base = slot.output_base;
  regs->output_mask_ptrs = slot.output_mask_ptrs;
  regs->cr3_match = slot.cr3_match;

  // The two address range arrays are copied wholesale, so they must agree in
  // size.
  static_assert(sizeof(zx_x86_pt_regs_t::addr_ranges) == sizeof(ipt_trace_state_t::addr_ranges),
                "addr_ranges size mismatch");
  memcpy(regs->addr_ranges, slot.addr_ranges, sizeof(zx_x86_pt_regs_t::addr_ranges));

  return ZX_OK;
}