| // Copyright 2017 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| // This file contains the lower part of Intel Performance Monitor support that |
| // must be done in the kernel (so that we can read/write MSRs). |
| // The common code is in kernel/lib/perfmon/perfmon.cpp. |
| // The userspace driver is in system/dev/misc/cpu-trace/intel-pm.c. |
| |
| // TODO(dje): See Intel Vol 3 18.2.3.1 for hypervisor recommendations. |
| // TODO(dje): LBR, BTS, et al. See Intel Vol 3 Chapter 17. |
| // TODO(dje): PMI mitigations |
| // TODO(dje): Eventually may wish to virtualize some/all of the MSRs, |
| // some have multiple disparate uses. |
| // TODO(dje): vmo management |
| // TODO(dje): check hyperthread handling |
| // TODO(dje): See about reducing two loops (programmable+fixed) into one. |
| // TODO(dje): If we're using one counter as the trigger, we could skip |
| // resetting the other counters and instead record the last value (so that we |
| // can continue to emit the delta into the trace buffer) - assuming the write |
| // to memory is faster than the wrmsr, which is apparently true. |
| // TODO(dje): rdpmc |
| #include "arch/x86/perf_mon.h" |
| |
| #include <assert.h> |
| #include <lib/arch/x86/boot-cpuid.h> |
| #include <lib/ktrace.h> |
| #include <lib/pci/pio.h> |
| #include <lib/perfmon.h> |
| #include <lib/zircon-internal/mtrace.h> |
| #include <lib/zircon-internal/thread_annotations.h> |
| #include <platform.h> |
| #include <pow2.h> |
| #include <string.h> |
| #include <trace.h> |
| #include <zircon/errors.h> |
| #include <zircon/types.h> |
| |
| #include <new> |
| |
| #include <arch/arch_ops.h> |
| #include <arch/regs.h> |
| #include <arch/x86.h> |
| #include <arch/x86/apic.h> |
| #include <arch/x86/feature.h> |
| #include <arch/x86/mmu.h> |
| #include <fbl/algorithm.h> |
| #include <fbl/alloc_checker.h> |
| #include <fbl/macros.h> |
| #include <fbl/ref_ptr.h> |
| #include <kernel/align.h> |
| #include <kernel/cpu.h> |
| #include <kernel/mp.h> |
| #include <kernel/mutex.h> |
| #include <kernel/stats.h> |
| #include <kernel/thread.h> |
| #include <ktl/atomic.h> |
| #include <ktl/iterator.h> |
| #include <ktl/move.h> |
| #include <ktl/unique_ptr.h> |
| #include <lk/init.h> |
| #include <vm/vm.h> |
| #include <vm/vm_address_region.h> |
| #include <vm/vm_aspace.h> |
| #include <vm/vm_object_physical.h> |
| |
| #include <ktl/enforce.h> |
| |
| #define LOCAL_TRACE 0 |
| |
| static void x86_perfmon_reset_task(void* raw_context); |
| |
| // TODO(cja): Sort out headers so the kernel can include these sorts of definitions |
| // without needing DDK access |
| #define PCI_CONFIG_VENDOR_ID 0x00 |
| #define PCI_CONFIG_DEVICE_ID 0x02 |
| |
| // There are only a few misc events, and they're non-homogeneous, |
| // so handle them directly. |
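| // Each DEF_MISC_SKL_EVENT entry in the .inc file below expands, via the |
| // macro, into one enumerator. For example (using an event that appears in |
| // the .inc file), DEF_MISC_SKL_EVENT(MISC_MEM_BYTES_READ, ...) becomes |
| // MISC_MEM_BYTES_READ_ID = perfmon::MakeEventId(perfmon::kGroupMisc, id). |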
| typedef enum { |
| #define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, \ |
| description) \ |
| symbol##_ID = perfmon::MakeEventId(perfmon::kGroupMisc, id), |
| #include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc> |
| } misc_event_id_t; |
| |
| // h/w address of misc events. |
| typedef enum { |
| #define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, \ |
| description) \ |
| symbol##_OFFSET = offset, |
| #include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc> |
| } misc_event_offset_t; |
| |
| // TODO(dje): Freeze-on-PMI doesn't work in Skylake. |
| // This is here for experimentation purposes. |
| #define TRY_FREEZE_ON_PMI 0 |
| |
| // At a minimum we require Performance Monitoring version 2 |
| #define MINIMUM_INTEL_PERFMON_VERSION 2 |
| #define PERFMON_VERSION_OVERFLOW_INDICATOR_SUPPORTED (4) |
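| // (The GLOBAL_STATUS SET/INUSE MSRs defined below and the uncore overflow |
| // indicator handled later are features of architectural PM version 4; |
| // see Intel Vol 3.) |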
| |
| // MSRs |
| |
| #define IA32_PLATFORM_INFO 0xce |
| |
| #define IA32_PERF_CAPABILITIES 0x345 |
| |
| // The counter MSR addresses are contiguous from here. |
| #define IA32_PMC_FIRST 0x0c1 |
| // The event selection MSR addresses are contiguous from here. |
| #define IA32_PERFEVTSEL_FIRST 0x186 |
| |
| #define IA32_FIXED_CTR_CTRL 0x38d |
| |
| // The fixed counter MSR addresses are contiguous from here. |
| #define IA32_FIXED_CTR0 0x309 |
| |
| #define IA32_PERF_GLOBAL_CTRL 0x38f |
| #define IA32_PERF_GLOBAL_STATUS 0x38e |
| #define IA32_PERF_GLOBAL_OVF_CTRL 0x390 |
| #define IA32_PERF_GLOBAL_STATUS_RESET 0x390 // Yes, same as OVF_CTRL. |
| #define IA32_PERF_GLOBAL_STATUS_SET 0x391 |
| #define IA32_PERF_GLOBAL_INUSE 0x392 |
| |
| #define IA32_DEBUGCTL 0x1d9 |
| |
| #define SKL_LAST_BRANCH_SELECT 0x1c8 |
| #define SKL_LAST_BRANCH_TOS 0x1c9 |
| |
| // N.B. These values have changed across models. |
| #define SKL_LAST_BRANCH_FROM_0 0x680 |
| #define SKL_LAST_BRANCH_FROM_16 0x690 |
| #define SKL_LAST_BRANCH_TO_0 0x6c0 |
| #define SKL_LAST_BRANCH_TO_16 0x6d0 |
| #define SKL_LAST_BRANCH_INFO_0 0xdc0 |
| #define SKL_LAST_BRANCH_INFO_16 0xdd0 |
| |
| // Vendor,device ids of the device with MCHBAR stats registers. |
| #define INTEL_MCHBAR_PCI_VENDOR_ID 0x8086 |
| const uint16_t supported_mem_device_ids[] = { |
| 0x1900, // docs use this value |
| 0x1904, // seen on NUC6 |
| 0x5904, // seen on NUC7 |
| 0x590c, // Amber Lake-Y/Kaby Lake-Y (Atlas) |
| }; |
| |
| // Offset in PCI config space of the BAR (base address register) of the |
| // MCHBAR stats registers. |
| #define INTEL_MCHBAR_PCI_CONFIG_OFFSET 0x48 |
| |
| // Offsets from the BAR in the memory controller hub mmio space of counters |
| // we're interested in. See the specs for MCHBAR in, e.g., |
| // "6th Generation Intel Core Processor Family Datasheet, Vol. 2". |
| // TODO(dje): These values are model specific. The current values work for |
| // currently supported platforms. Need to detect when we're on a supported |
| // platform. |
| // The BEGIN/END values are for computing the page(s) we need to map. |
| // Offset from BAR of the first byte we need to map. |
| #define UNC_IMC_STATS_BEGIN 0x5040 // MISC_MEM_GT_REQUESTS |
| // Offset from BAR of the last byte we need to map. |
| #define UNC_IMC_STATS_END 0x5983 // MISC_PKG_GT_TEMP |
| |
| // Verify all values are within [BEGIN,END]. |
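| // The .inc file expands through this macro into one "&& (range check)" term |
| // per event, appended to the leading "1" in the static_assert below, so a |
| // single out-of-range offset breaks the build. |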
| #define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, \ |
| description) \ |
| &&(offset >= UNC_IMC_STATS_BEGIN && (offset + size / 8) <= UNC_IMC_STATS_END + 1) |
| static_assert(1 |
| #include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc> |
| , |
| ""); |
| |
| // These aren't constexpr as we iterate to fill in values for each counter. |
| static uint64_t kGlobalCtrlWritableBits; |
| static uint64_t kFixedCounterCtrlWritableBits; |
| |
| // Commented out values represent currently unsupported features. |
| // They remain present for documentation purposes. |
| // Note: Making this const assumes at least PM version >= 2 (e.g., |
| // IA32_DEBUGCTL_FREEZE_LBRS_ON_PMI_MASK). |
| // Note: At least FREEZE_WHILE_SMM needs to be set based on a runtime |
| // determination (need to check PERF_CAPABILITIES). |
| static constexpr uint64_t kDebugCtrlWritableBits = (IA32_DEBUGCTL_LBR_MASK | |
| /*IA32_DEBUGCTL_BTF_MASK |*/ |
| /*IA32_DEBUGCTL_TR_MASK |*/ |
| /*IA32_DEBUGCTL_BTS_MASK |*/ |
| /*IA32_DEBUGCTL_BTINT_MASK |*/ |
| /*IA32_DEBUGCTL_BTS_OFF_OS_MASK |*/ |
| /*IA32_DEBUGCTL_BTS_OFF_USR_MASK |*/ |
| IA32_DEBUGCTL_FREEZE_LBRS_ON_PMI_MASK | |
| #if TRY_FREEZE_ON_PMI |
| IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI_MASK | |
| #endif |
| /*IA32_DEBUGCTL_FREEZE_WHILE_SMM_MASK |*/ |
| /*IA32_DEBUGCTL_RTM_MASK |*/ |
| 0); |
| static constexpr uint64_t kEventSelectWritableBits = |
| (IA32_PERFEVTSEL_EVENT_SELECT_MASK | IA32_PERFEVTSEL_UMASK_MASK | IA32_PERFEVTSEL_USR_MASK | |
| IA32_PERFEVTSEL_OS_MASK | IA32_PERFEVTSEL_E_MASK | IA32_PERFEVTSEL_PC_MASK | |
| IA32_PERFEVTSEL_INT_MASK | IA32_PERFEVTSEL_ANY_MASK | IA32_PERFEVTSEL_EN_MASK | |
| IA32_PERFEVTSEL_INV_MASK | IA32_PERFEVTSEL_CMASK_MASK); |
| |
| enum LbrFormat { |
| LBR_FORMAT_32 = 0, |
| // The format contains LBR_INFO in addition to LBR_FROM/LBR_TO. |
| LBR_FORMAT_INFO = 0b101, |
| }; |
| |
| static bool perfmon_hw_initialized = false; |
| |
| static uint16_t perfmon_version = 0; |
| |
| // The maximum number of programmable counters that can be simultaneously |
| // handled, and their maximum width. |
| static uint16_t perfmon_num_programmable_counters = 0; |
| static uint16_t perfmon_programmable_counter_width = 0; |
| |
| // The maximum number of fixed counters that can be simultaneously |
| // handled, and their maximum width. |
| static uint16_t perfmon_num_fixed_counters = 0; |
| static uint16_t perfmon_fixed_counter_width = 0; |
| |
| static uint32_t perfmon_unsupported_events = 0; |
| static uint32_t perfmon_capabilities = 0; |
| |
| // Maximum counter values, derived from their width. |
| static uint64_t perfmon_max_fixed_counter_value = 0; |
| static uint64_t perfmon_max_programmable_counter_value = 0; |
| |
| // Number of entries we can write in an LBR record. |
| static uint32_t perfmon_lbr_stack_size = 0; |
| // Format of LBR MSRs |
| static unsigned g_lbr_format = 0; |
| |
| // Counter bits in GLOBAL_STATUS to check on each interrupt. |
| static uint64_t perfmon_counter_status_bits = 0; |
| |
| // BAR (base address register) of Intel MCHBAR performance |
| // registers. These registers are accessible via mmio. |
| static uint32_t perfmon_mchbar_bar = 0; |
| |
| // The maximum number of "miscellaneous" events we can handle at once |
| // and their width. This is mostly for informational purposes; there may be |
| // additional constraints that depend on the counters in question. |
| static uint16_t perfmon_num_misc_events = 0; |
| static uint16_t perfmon_misc_counter_width = 64; |
| |
| struct MemoryControllerHubData { |
| // Where the regs are mapped. |
| fbl::RefPtr<VmMapping> mapping; |
| |
| // The address where UNC_IMC_STATS_BEGIN is mapped, or zero if not mapped. |
| volatile void* stats_addr = nullptr; |
| |
| // We can't reset the events, and even if we could it's preferable to |
| // avoid making the device writable (lots of critical stuff in there), |
| // so record the previous values so that we can emit into the trace buffer |
| // the delta since the last interrupt. |
| struct { |
| uint32_t bytes_read = 0; |
| uint32_t bytes_written = 0; |
| uint32_t gt_requests = 0; |
| uint32_t ia_requests = 0; |
| uint32_t io_requests = 0; |
| uint64_t all_active_core_cycles = 0; |
| uint64_t any_active_core_cycles = 0; |
| uint64_t active_gt_cycles = 0; |
| uint64_t active_ia_gt_cycles = 0; |
| uint64_t active_gt_slice_cycles = 0; |
| uint64_t active_gt_engine_cycles = 0; |
| // The remaining registers don't count anything. |
| } last_mem; |
| }; |
| |
| struct PerfmonState : public PerfmonStateBase { |
| static zx_status_t Create(unsigned n_cpus, ktl::unique_ptr<PerfmonState>* out_state); |
| explicit PerfmonState(unsigned n_cpus); |
| |
| // IA32_PERF_GLOBAL_CTRL |
| uint64_t global_ctrl = 0; |
| |
| // IA32_FIXED_CTR_CTRL |
| uint64_t fixed_ctrl = 0; |
| |
| // IA32_DEBUGCTL |
| uint64_t debug_ctrl = 0; |
| |
| // True if MCHBAR perf regs need to be mapped in. |
| bool need_mchbar = false; |
| |
| // See intel-pm.h:X86PmuConfig. |
| PmuEventId timebase_event = perfmon::kEventIdNone; |
| |
| // The number of each kind of event in use, so we don't have to iterate |
| // over the entire arrays. |
| unsigned num_used_fixed = 0; |
| unsigned num_used_programmable = 0; |
| unsigned num_used_misc = 0; |
| |
| // True if last branch records have been requested. |
| bool request_lbr_record = false; |
| |
| MemoryControllerHubData mchbar_data; |
| |
| // |fixed_hw_map[i]| is the h/w fixed counter number. |
| // This is used to only look at fixed counters that are used. |
| unsigned fixed_hw_map[IPM_MAX_FIXED_COUNTERS] = {}; |
| |
| // The ids for each of the in-use events, or zero if not used. |
| // These are passed in from the driver and then written to the buffer, |
| // but otherwise have no meaning to us. |
| // All in-use entries appear consecutively. |
| PmuEventId fixed_events[IPM_MAX_FIXED_COUNTERS] = {}; |
| PmuEventId programmable_events[IPM_MAX_PROGRAMMABLE_COUNTERS] = {}; |
| PmuEventId misc_events[IPM_MAX_MISC_EVENTS] = {}; |
| |
| // The counters are reset to this at the start. |
| // And again for those that are reset on overflow. |
| uint64_t fixed_initial_value[IPM_MAX_FIXED_COUNTERS] = {}; |
| uint64_t programmable_initial_value[IPM_MAX_PROGRAMMABLE_COUNTERS] = {}; |
| |
| // Flags for each event/counter, perfmon::kPmuConfigFlag*. |
| uint32_t fixed_flags[IPM_MAX_FIXED_COUNTERS] = {}; |
| uint32_t programmable_flags[IPM_MAX_PROGRAMMABLE_COUNTERS] = {}; |
| uint32_t misc_flags[IPM_MAX_MISC_EVENTS] = {}; |
| |
| // IA32_PERFEVTSEL_* |
| uint64_t programmable_hw_events[IPM_MAX_PROGRAMMABLE_COUNTERS] = {}; |
| }; |
| |
| namespace { |
| DECLARE_SINGLETON_MUTEX(PerfmonLock); |
| } // namespace |
| |
| static ktl::unique_ptr<PerfmonState> perfmon_state TA_GUARDED(PerfmonLock::Get()); |
| |
| static inline bool x86_perfmon_lbr_is_supported() { return perfmon_lbr_stack_size > 0; } |
| |
| static inline void enable_counters(PerfmonState* state) { |
| write_msr(IA32_PERF_GLOBAL_CTRL, state->global_ctrl); |
| } |
| |
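| // Writing zero clears every enable bit in IA32_PERF_GLOBAL_CTRL, halting all |
| // counters at once without disturbing the per-counter event programming. |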
| static inline void disable_counters() { write_msr(IA32_PERF_GLOBAL_CTRL, 0); } |
| |
| zx_status_t PerfmonState::Create(unsigned n_cpus, ktl::unique_ptr<PerfmonState>* out_state) { |
| fbl::AllocChecker ac; |
| auto state = ktl::unique_ptr<PerfmonState>(new (&ac) PerfmonState(n_cpus)); |
| if (!ac.check()) |
| return ZX_ERR_NO_MEMORY; |
| |
| if (!state->AllocatePerCpuData()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| *out_state = ktl::move(state); |
| return ZX_OK; |
| } |
| |
| PerfmonState::PerfmonState(unsigned n_cpus) : PerfmonStateBase(n_cpus) {} |
| |
| static bool x86_perfmon_have_mchbar_data() { |
| uint32_t vendor_id, device_id; |
| |
| auto status = Pci::PioCfgRead(0, 0, 0, PCI_CONFIG_VENDOR_ID, &vendor_id, 16); |
| if (status != ZX_OK) |
| return false; |
| if (vendor_id != INTEL_MCHBAR_PCI_VENDOR_ID) |
| return false; |
| status = Pci::PioCfgRead(0, 0, 0, PCI_CONFIG_DEVICE_ID, &device_id, 16); |
| if (status != ZX_OK) |
| return false; |
| for (auto supported_device_id : supported_mem_device_ids) { |
| if (supported_device_id == device_id) |
| return true; |
| } |
| |
| TRACEF("perfmon: unsupported pci device: 0x%x.0x%x\n", vendor_id, device_id); |
| return false; |
| } |
| |
| static void x86_perfmon_init_mchbar() { |
| uint32_t bar; |
| auto status = Pci::PioCfgRead(0, 0, 0, INTEL_MCHBAR_PCI_CONFIG_OFFSET, &bar, 32); |
| if (status == ZX_OK) { |
| LTRACEF("perfmon: mchbar: 0x%x\n", bar); |
| // TODO(dje): The lower four bits contain useful data, but punt for now. |
| // See PCI spec 6.2.5.1. |
| perfmon_mchbar_bar = bar & ~15u; |
| perfmon_num_misc_events = static_cast<uint16_t>(ktl::size(ArchPmuConfig{}.misc_events)); |
| } else { |
| TRACEF("perfmon: error %d reading mchbar\n", status); |
| } |
| } |
| |
| // Return the size of the LBR stack, or zero if not supported. |
| static unsigned x86_perfmon_lbr_stack_size() { |
| // See [intel/vol3]: Table 17-4. LBR Stack Size and TOS Pointer Range |
| static const struct { |
| x86_microarch_list microarch; |
| uint8_t stack_size; |
| } supported_chips[] = { |
| {X86_MICROARCH_INTEL_SKYLAKE, 32}, |
| {X86_MICROARCH_INTEL_CANNONLAKE, 32}, |
| }; |
| |
| // TODO(dje): KISS and only support these formats for now. |
| switch (g_lbr_format) { |
| case LBR_FORMAT_INFO: |
| break; |
| default: |
| return 0; |
| } |
| |
| for (const auto& chip : supported_chips) { |
| if (chip.microarch == x86_get_microarch_config()->x86_microarch) |
| return chip.stack_size; |
| } |
| |
| return 0; |
| } |
| |
| static void x86_perfmon_init_lbr(uint32_t lbr_stack_size) { |
| perfmon_lbr_stack_size = lbr_stack_size; |
| } |
| |
| static void x86_perfmon_lbr_clear() { |
| switch (g_lbr_format) { |
| case LBR_FORMAT_INFO: |
| for (uint i = 0; i < perfmon_lbr_stack_size; i++) { |
| write_msr(SKL_LAST_BRANCH_FROM_0 + i, 0); |
| write_msr(SKL_LAST_BRANCH_TO_0 + i, 0); |
| write_msr(SKL_LAST_BRANCH_INFO_0 + i, 0); |
| } |
| write_msr(SKL_LAST_BRANCH_TOS, 0); |
| break; |
| } |
| } |
| |
| static void x86_perfmon_init_once(uint level) { |
| if (!arch::BootCpuidSupports<arch::CpuidPerformanceMonitoringA>()) { |
| return; |
| } |
| |
| auto perfmon_a = arch::BootCpuid<arch::CpuidPerformanceMonitoringA>(); |
| auto perfmon_b = arch::BootCpuid<arch::CpuidPerformanceMonitoringB>(); |
| auto perfmon_d = arch::BootCpuid<arch::CpuidPerformanceMonitoringD>(); |
| |
| perfmon_version = static_cast<uint16_t>(perfmon_a.version()); |
| |
| perfmon_num_programmable_counters = static_cast<uint16_t>(perfmon_a.num_general_counters()); |
| if (perfmon_num_programmable_counters > IPM_MAX_PROGRAMMABLE_COUNTERS) { |
| TRACEF("perfmon: unexpected num programmable counters %u in cpuid.0AH\n", |
| perfmon_num_programmable_counters); |
| return; |
| } |
| perfmon_programmable_counter_width = static_cast<uint16_t>(perfmon_a.general_counter_width()); |
| // The <16 test is just something simple to ensure it's usable. |
| if (perfmon_programmable_counter_width < 16 || perfmon_programmable_counter_width > 64) { |
| TRACEF("perfmon: unexpected programmable counter width %u in cpuid.0AH\n", |
| perfmon_programmable_counter_width); |
| return; |
| } |
| perfmon_max_programmable_counter_value = ~0ul; |
| if (perfmon_programmable_counter_width < 64) { |
| perfmon_max_programmable_counter_value = (1ul << perfmon_programmable_counter_width) - 1; |
| } |
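| // For example, with a (typical) 48-bit wide programmable counter this yields |
| // a maximum value of 0x0000'ffff'ffff'ffff. |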
| |
| unsigned ebx_length = perfmon_a.ebx_vector_length(); |
| if (ebx_length > 7) { |
| TRACEF("perfmon: unexpected value %u in cpuid.0AH.EAH[31..24]\n", ebx_length); |
| return; |
| } |
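| // In CPUID.0AH a set EBX bit means the corresponding architectural event is |
| // not available, so this is a mask of unsupported events (only the low |
| // |ebx_length| bits are meaningful). |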
| perfmon_unsupported_events = perfmon_b.reg_value() & ((1u << ebx_length) - 1); |
| |
| perfmon_num_fixed_counters = static_cast<uint16_t>(perfmon_d.num_fixed_counters()); |
| if (perfmon_num_fixed_counters > IPM_MAX_FIXED_COUNTERS) { |
| TRACEF("perfmon: unexpected num fixed counters %u in cpuid.0AH\n", perfmon_num_fixed_counters); |
| return; |
| } |
| perfmon_fixed_counter_width = static_cast<uint16_t>(perfmon_d.fixed_counter_width()); |
| // The <16 test is just something simple to ensure it's usable. |
| if (perfmon_fixed_counter_width < 16 || perfmon_fixed_counter_width > 64) { |
| TRACEF("perfmon: unexpected fixed counter width %u in cpuid.0AH\n", |
| perfmon_fixed_counter_width); |
| return; |
| } |
| perfmon_max_fixed_counter_value = ~0ul; |
| if (perfmon_fixed_counter_width < 64) { |
| perfmon_max_fixed_counter_value = (1ul << perfmon_fixed_counter_width) - 1; |
| } |
| |
| perfmon_supported = perfmon_version >= MINIMUM_INTEL_PERFMON_VERSION; |
| |
| if (arch::BootCpuid<arch::CpuidFeatureFlagsC>().pdcm()) { |
| perfmon_capabilities = static_cast<uint32_t>(read_msr(IA32_PERF_CAPABILITIES)); |
| } |
| g_lbr_format = perfmon_capabilities & ((1u << IA32_PERF_CAPABILITIES_LBR_FORMAT_LEN) - 1); |
| |
| perfmon_counter_status_bits = 0; |
| for (unsigned i = 0; i < perfmon_num_programmable_counters; ++i) |
| perfmon_counter_status_bits |= IA32_PERF_GLOBAL_STATUS_PMC_OVF_MASK(i); |
| for (unsigned i = 0; i < perfmon_num_fixed_counters; ++i) |
| perfmon_counter_status_bits |= IA32_PERF_GLOBAL_STATUS_FIXED_OVF_MASK(i); |
| |
| kGlobalCtrlWritableBits = 0; |
| for (unsigned i = 0; i < perfmon_num_programmable_counters; ++i) |
| kGlobalCtrlWritableBits |= IA32_PERF_GLOBAL_CTRL_PMC_EN_MASK(i); |
| for (unsigned i = 0; i < perfmon_num_fixed_counters; ++i) |
| kGlobalCtrlWritableBits |= IA32_PERF_GLOBAL_CTRL_FIXED_EN_MASK(i); |
| kFixedCounterCtrlWritableBits = 0; |
| for (unsigned i = 0; i < perfmon_num_fixed_counters; ++i) { |
| kFixedCounterCtrlWritableBits |= IA32_FIXED_CTR_CTRL_EN_MASK(i); |
| kFixedCounterCtrlWritableBits |= IA32_FIXED_CTR_CTRL_ANY_MASK(i); |
| kFixedCounterCtrlWritableBits |= IA32_FIXED_CTR_CTRL_PMI_MASK(i); |
| } |
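| // For example, a part with 4 programmable and 3 fixed counters leaves PMC |
| // enable bits 0-3 and fixed-counter enable bits 32-34 writable in |
| // IA32_PERF_GLOBAL_CTRL. |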
| |
| if (x86_perfmon_have_mchbar_data()) { |
| x86_perfmon_init_mchbar(); |
| } |
| |
| unsigned lbr_stack_size = x86_perfmon_lbr_stack_size(); |
| if (lbr_stack_size != 0) { |
| // Don't crash if the h/w supports more than we do; just clip it. |
| if (lbr_stack_size > perfmon::LastBranchRecord::kMaxNumLastBranch) { |
| TRACEF("WARNING: H/W LBR stack size is %u, clipping to %u\n", lbr_stack_size, |
| perfmon::LastBranchRecord::kMaxNumLastBranch); |
| lbr_stack_size = perfmon::LastBranchRecord::kMaxNumLastBranch; |
| } |
| x86_perfmon_init_lbr(lbr_stack_size); |
| } |
| |
| printf("PMU: version %u\n", perfmon_version); |
| } |
| |
| LK_INIT_HOOK(x86_perfmon, x86_perfmon_init_once, LK_INIT_LEVEL_ARCH) |
| |
| static void x86_perfmon_clear_overflow_indicators() { |
| uint64_t value = (IA32_PERF_GLOBAL_OVF_CTRL_CLR_COND_CHGD_MASK | |
| IA32_PERF_GLOBAL_OVF_CTRL_DS_BUFFER_CLR_OVF_MASK); |
| |
| // Clear overflow indicator for uncore PMU only if it is supported. |
| // The uncore PMU overflow indicator is supported by PMU version 4 |
| // and later, but some hypervisors enumerate that version and don't |
| // support it anyway. |
| if ((perfmon_version >= PERFMON_VERSION_OVERFLOW_INDICATOR_SUPPORTED) && |
| !arch::BootCpuid<arch::CpuidFeatureFlagsC>().hypervisor()) { |
| value |= IA32_PERF_GLOBAL_OVF_CTRL_UNCORE_CLR_OVF_MASK; |
| } |
| |
| // This function isn't performance critical enough to warrant precomputing this value. |
| for (unsigned i = 0; i < perfmon_num_programmable_counters; ++i) { |
| value |= IA32_PERF_GLOBAL_OVF_CTRL_PMC_CLR_OVF_MASK(i); |
| } |
| |
| for (unsigned i = 0; i < perfmon_num_fixed_counters; ++i) { |
| value |= IA32_PERF_GLOBAL_OVF_CTRL_FIXED_CTR_CLR_OVF_MASK(i); |
| } |
| |
| write_msr(IA32_PERF_GLOBAL_OVF_CTRL, value); |
| } |
| |
| // Return the h/w register number for fixed event id |id| |
| // or IPM_MAX_FIXED_COUNTERS if not found. |
| static unsigned x86_perfmon_lookup_fixed_counter(PmuEventId id) { |
| if (perfmon::GetEventIdGroup(id) != perfmon::kGroupFixed) |
| return IPM_MAX_FIXED_COUNTERS; |
| switch (perfmon::GetEventIdEvent(id)) { |
| #define DEF_FIXED_EVENT(symbol, event_name, id, regnum, flags, readable_name, description) \ |
| case id: \ |
| return regnum; |
| #include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc> |
| default: |
| return IPM_MAX_FIXED_COUNTERS; |
| } |
| } |
| |
| static size_t get_max_space_needed_for_all_records(PerfmonState* state) { |
| size_t num_events = (state->num_used_programmable + state->num_used_fixed + state->num_used_misc); |
| size_t space_needed = (sizeof(perfmon::TimeRecord) + num_events * kMaxEventRecordSize); |
| if (state->request_lbr_record) |
| space_needed += sizeof(perfmon::LastBranchRecord); |
| return space_needed; |
| } |
| |
| zx_status_t arch_perfmon_get_properties(ArchPmuProperties* props) { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| |
| if (!perfmon_supported) |
| return ZX_ERR_NOT_SUPPORTED; |
| |
| *props = {}; |
| props->common.pm_version = perfmon_version; |
| props->common.max_num_fixed_events = perfmon_num_fixed_counters; |
| props->common.max_num_programmable_events = perfmon_num_programmable_counters; |
| props->common.max_num_misc_events = perfmon_num_misc_events; |
| props->common.max_fixed_counter_width = perfmon_fixed_counter_width; |
| props->common.max_programmable_counter_width = perfmon_programmable_counter_width; |
| props->common.max_misc_counter_width = perfmon_misc_counter_width; |
| props->perf_capabilities = perfmon_capabilities; |
| props->lbr_stack_size = perfmon_lbr_stack_size; |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t arch_perfmon_init() { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| |
| if (!perfmon_supported) |
| return ZX_ERR_NOT_SUPPORTED; |
| if (perfmon_active.load()) |
| return ZX_ERR_BAD_STATE; |
| if (perfmon_state) |
| return ZX_ERR_BAD_STATE; |
| |
| ktl::unique_ptr<PerfmonState> state; |
| auto status = PerfmonState::Create(arch_max_num_cpus(), &state); |
| if (status != ZX_OK) |
| return status; |
| |
| perfmon_state = ktl::move(state); |
| return ZX_OK; |
| } |
| |
| zx_status_t arch_perfmon_assign_buffer(uint32_t cpu, fbl::RefPtr<VmObject> vmo) { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| |
| if (!perfmon_supported) |
| return ZX_ERR_NOT_SUPPORTED; |
| if (perfmon_active.load()) |
| return ZX_ERR_BAD_STATE; |
| if (!perfmon_state) |
| return ZX_ERR_BAD_STATE; |
| if (cpu >= perfmon_state->num_cpus) |
| return ZX_ERR_INVALID_ARGS; |
| |
| // A simple safe approximation of the minimum size needed. |
| size_t min_size_needed = sizeof(perfmon::BufferHeader); |
| min_size_needed += sizeof(perfmon::TimeRecord); |
| min_size_needed += perfmon::kMaxNumEvents * kMaxEventRecordSize; |
| if (vmo->size() < min_size_needed) |
| return ZX_ERR_INVALID_ARGS; |
| |
| auto data = &perfmon_state->cpu_data[cpu]; |
| data->buffer_vmo = vmo; |
| data->buffer_size = vmo->size(); |
| // The buffer is mapped into kernelspace later. |
| |
| return ZX_OK; |
| } |
| |
| static zx_status_t x86_perfmon_verify_control_config(const ArchPmuConfig* config) { |
| #if TRY_FREEZE_ON_PMI |
| if (!(config->debug_ctrl & IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI_MASK)) { |
| // It would be nice to pass back a hint, instead of either nothing or |
| // a log message. |
| TRACEF("IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI not set\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| #else |
| if (config->debug_ctrl & IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI_MASK) { |
| TRACEF("IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI is set\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| #endif |
| |
| if (config->global_ctrl & ~kGlobalCtrlWritableBits) { |
| TRACEF("Non writable bits set in |global_ctrl|\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->fixed_ctrl & ~kFixedCounterCtrlWritableBits) { |
| TRACEF("Non writable bits set in |fixed_ctrl|\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->debug_ctrl & ~kDebugCtrlWritableBits) { |
| TRACEF("Non writable bits set in |debug_ctrl|\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| return ZX_OK; |
| } |
| |
| static zx_status_t x86_perfmon_verify_fixed_config(const ArchPmuConfig* config, |
| unsigned* out_num_used) { |
| bool seen_last = false; |
| unsigned num_used = perfmon_num_fixed_counters; |
| for (unsigned i = 0; i < perfmon_num_fixed_counters; ++i) { |
| PmuEventId id = config->fixed_events[i]; |
| if (id != 0 && seen_last) { |
| TRACEF("Active fixed events not front-filled\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| // As a rule this file is agnostic to event ids; it's the device |
| // driver's job to map them to values we use. Thus we don't |
| // validate the ID here. We are given it so that we can include |
| // this ID in the trace output. |
| if (id == 0) { |
| if (!seen_last) |
| num_used = i; |
| seen_last = true; |
| } |
| if (seen_last) { |
| if (config->fixed_initial_value[i] != 0) { |
| TRACEF("Unused |fixed_initial_value[%u]| not zero\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->fixed_flags[i] != 0) { |
| TRACEF("Unused |fixed_flags[%u]| not zero\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } else { |
| if (config->fixed_initial_value[i] > perfmon_max_fixed_counter_value) { |
| TRACEF("Initial value too large for |fixed_initial_value[%u]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->fixed_flags[i] & ~perfmon::kPmuConfigFlagMask) { |
| TRACEF("Unused bits set in |fixed_flags[%u]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (!x86_perfmon_lbr_is_supported() && |
| (config->fixed_flags[i] & perfmon::kPmuConfigFlagLastBranch) != 0) { |
| TRACEF( |
| "Last branch records requested for |fixed_flags[%u]|," |
| " but not supported\n", |
| i); |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if ((config->fixed_flags[i] & perfmon::kPmuConfigFlagUsesTimebase) && |
| config->timebase_event == perfmon::kEventIdNone) { |
| TRACEF("Timebase requested for |fixed_flags[%u]|, but not provided\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| unsigned hw_regnum = x86_perfmon_lookup_fixed_counter(id); |
| if (hw_regnum == IPM_MAX_FIXED_COUNTERS) { |
| TRACEF("Invalid fixed counter id |fixed_events[%u]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } |
| } |
| |
| *out_num_used = num_used; |
| return ZX_OK; |
| } |
| |
| static zx_status_t x86_perfmon_verify_programmable_config(const ArchPmuConfig* config, |
| unsigned* out_num_used) { |
| bool seen_last = false; |
| unsigned num_used = perfmon_num_programmable_counters; |
| for (unsigned i = 0; i < perfmon_num_programmable_counters; ++i) { |
| PmuEventId id = config->programmable_events[i]; |
| if (id != 0 && seen_last) { |
| TRACEF("Active programmable events not front-filled\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| // As a rule this file is agnostic to event ids; it's the device |
| // driver's job to map them to the hw values we use. Thus we don't |
| // validate the ID here. We are given it so that we can include |
| // this ID in the trace output. |
| if (id == 0) { |
| if (!seen_last) |
| num_used = i; |
| seen_last = true; |
| } |
| if (seen_last) { |
| if (config->programmable_hw_events[i] != 0) { |
| TRACEF("Unused |programmable_hw_events[%u]| not zero\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->programmable_initial_value[i] != 0) { |
| TRACEF("Unused |programmable_initial_value[%u]| not zero\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->programmable_flags[i] != 0) { |
| TRACEF("Unused |programmable_flags[%u]| not zero\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } else { |
| if (config->programmable_hw_events[i] & ~kEventSelectWritableBits) { |
| TRACEF("Non writable bits set in |programmable_hw_events[%u]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->programmable_initial_value[i] > perfmon_max_programmable_counter_value) { |
| TRACEF("Initial value too large for |programmable_initial_value[%u]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (config->programmable_flags[i] & ~perfmon::kPmuConfigFlagMask) { |
| TRACEF("Unused bits set in |programmable_flags[%u]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (!x86_perfmon_lbr_is_supported() && |
| (config->programmable_flags[i] & perfmon::kPmuConfigFlagLastBranch) != 0) { |
| TRACEF( |
| "Last branch records requested for |programmable_flags[%u]|," |
| " but not supported\n", |
| i); |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if ((config->programmable_flags[i] & perfmon::kPmuConfigFlagUsesTimebase) && |
| config->timebase_event == perfmon::kEventIdNone) { |
| TRACEF("Timebase requested for |programmable_flags[%u]|, but not provided\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } |
| } |
| |
| *out_num_used = num_used; |
| return ZX_OK; |
| } |
| |
| static zx_status_t x86_perfmon_verify_misc_config(const ArchPmuConfig* config, |
| unsigned* out_num_used) { |
| bool seen_last = false; |
| size_t max_num_used = ktl::size(config->misc_events); |
| size_t num_used = max_num_used; |
| for (size_t i = 0; i < max_num_used; ++i) { |
| PmuEventId id = config->misc_events[i]; |
| if (id != 0 && seen_last) { |
| TRACEF("Active misc events not front-filled\n"); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (id == 0) { |
| if (!seen_last) |
| num_used = i; |
| seen_last = true; |
| } |
| if (seen_last) { |
| if (config->misc_flags[i] != 0) { |
| TRACEF("Unused |misc_flags[%zu]| not zero\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } else { |
| if (config->misc_flags[i] & ~perfmon::kPmuConfigFlagMask) { |
| TRACEF("Unused bits set in |misc_flags[%zu]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| // Currently we only support the MCHBAR events. |
| // They cannot provide pc. We ignore the OS/USER bits. |
| if (config->misc_flags[i] & (perfmon::kPmuConfigFlagPc | perfmon::kPmuConfigFlagLastBranch)) { |
| TRACEF("Invalid bits (0x%x) in |misc_flags[%zu]|\n", config->misc_flags[i], i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if ((config->misc_flags[i] & perfmon::kPmuConfigFlagUsesTimebase) && |
| config->timebase_event == perfmon::kEventIdNone) { |
| TRACEF("Timebase requested for |misc_flags[%zu]|, but not provided\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| switch (perfmon::GetEventIdEvent(id)) { |
| #define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, \ |
| description) \ |
| case id: \ |
| break; |
| #include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc> |
| default: |
| TRACEF("Invalid misc event id |misc_events[%zu]|\n", i); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } |
| } |
| |
| *out_num_used = static_cast<unsigned>(num_used); |
| return ZX_OK; |
| } |
| |
| static zx_status_t x86_perfmon_verify_timebase_config(ArchPmuConfig* config, unsigned num_fixed, |
| unsigned num_programmable) { |
| if (config->timebase_event == perfmon::kEventIdNone) { |
| return ZX_OK; |
| } |
| |
| for (unsigned i = 0; i < num_fixed; ++i) { |
| if (config->fixed_events[i] == config->timebase_event) { |
| // The PMI code is simpler if this is the case. |
| config->fixed_flags[i] &= ~perfmon::kPmuConfigFlagUsesTimebase; |
| return ZX_OK; |
| } |
| } |
| |
| for (unsigned i = 0; i < num_programmable; ++i) { |
| if (config->programmable_events[i] == config->timebase_event) { |
| // The PMI code is simpler if this is the case. |
| config->programmable_flags[i] &= ~perfmon::kPmuConfigFlagUsesTimebase; |
| return ZX_OK; |
| } |
| } |
| |
| TRACEF("Timebase 0x%x requested but not present\n", config->timebase_event); |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| static zx_status_t x86_perfmon_verify_config(ArchPmuConfig* config, PerfmonState* state) { |
| auto status = x86_perfmon_verify_control_config(config); |
| if (status != ZX_OK) |
| return status; |
| |
| unsigned num_used_fixed; |
| status = x86_perfmon_verify_fixed_config(config, &num_used_fixed); |
| if (status != ZX_OK) |
| return status; |
| state->num_used_fixed = num_used_fixed; |
| |
| unsigned num_used_programmable; |
| status = x86_perfmon_verify_programmable_config(config, &num_used_programmable); |
| if (status != ZX_OK) |
| return status; |
| state->num_used_programmable = num_used_programmable; |
| |
| unsigned num_used_misc; |
| status = x86_perfmon_verify_misc_config(config, &num_used_misc); |
| if (status != ZX_OK) |
| return status; |
| state->num_used_misc = num_used_misc; |
| |
| status = x86_perfmon_verify_timebase_config(config, state->num_used_fixed, |
| state->num_used_programmable); |
| if (status != ZX_OK) |
| return status; |
| |
| return ZX_OK; |
| } |
| |
| static void x86_perfmon_stage_fixed_config(const ArchPmuConfig* config, PerfmonState* state) { |
| static_assert(sizeof(state->fixed_events) == sizeof(config->fixed_events), ""); |
| memcpy(state->fixed_events, config->fixed_events, sizeof(state->fixed_events)); |
| |
| static_assert(sizeof(state->fixed_initial_value) == sizeof(config->fixed_initial_value), ""); |
| memcpy(state->fixed_initial_value, config->fixed_initial_value, |
| sizeof(state->fixed_initial_value)); |
| |
| static_assert(sizeof(state->fixed_flags) == sizeof(config->fixed_flags), ""); |
| memcpy(state->fixed_flags, config->fixed_flags, sizeof(state->fixed_flags)); |
| |
| for (unsigned i = 0; i < ktl::size(state->fixed_hw_map); ++i) { |
| state->fixed_hw_map[i] = x86_perfmon_lookup_fixed_counter(config->fixed_events[i]); |
| } |
| } |
| |
| static void x86_perfmon_stage_programmable_config(const ArchPmuConfig* config, |
| PerfmonState* state) { |
| static_assert(sizeof(state->programmable_events) == sizeof(config->programmable_events), ""); |
| memcpy(state->programmable_events, config->programmable_events, |
| sizeof(state->programmable_events)); |
| |
| static_assert( |
| sizeof(state->programmable_initial_value) == sizeof(config->programmable_initial_value), ""); |
| memcpy(state->programmable_initial_value, config->programmable_initial_value, |
| sizeof(state->programmable_initial_value)); |
| |
| static_assert(sizeof(state->programmable_flags) == sizeof(config->programmable_flags), ""); |
| memcpy(state->programmable_flags, config->programmable_flags, sizeof(state->programmable_flags)); |
| |
| static_assert(sizeof(state->programmable_hw_events) == sizeof(config->programmable_hw_events), |
| ""); |
| memcpy(state->programmable_hw_events, config->programmable_hw_events, |
| sizeof(state->programmable_hw_events)); |
| } |
| |
| static void x86_perfmon_stage_misc_config(const ArchPmuConfig* config, PerfmonState* state) { |
| static_assert(sizeof(state->misc_events) == sizeof(config->misc_events), ""); |
| memcpy(state->misc_events, config->misc_events, sizeof(state->misc_events)); |
| |
| static_assert(sizeof(state->misc_flags) == sizeof(config->misc_flags), ""); |
| memcpy(state->misc_flags, config->misc_flags, sizeof(state->misc_flags)); |
| |
| state->need_mchbar = false; |
| for (unsigned i = 0; i < state->num_used_misc; ++i) { |
| // All misc events currently come from MCHBAR. |
| // When needed we can add a flag to the event to denote origin. |
| switch (perfmon::GetEventIdEvent(state->misc_events[i])) { |
| #define DEF_MISC_SKL_EVENT(symbol, event_name, id, offset, size, flags, readable_name, \ |
| description) \ |
| case id: |
| #include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc> |
| state->need_mchbar = true; |
| break; |
| default: |
| break; |
| } |
| } |
| |
| // What we'd like to do here is record the current values of these |
| // events, but they're not mapped in yet. |
| memset(&state->mchbar_data.last_mem, 0, sizeof(state->mchbar_data.last_mem)); |
| } |
| |
| // Stage the configuration for later activation by START. |
| // One of the main goals of this function is to verify the provided config |
| // is ok, e.g., it won't cause us to crash. |
| zx_status_t arch_perfmon_stage_config(ArchPmuConfig* config) { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| |
| if (!perfmon_supported) |
| return ZX_ERR_NOT_SUPPORTED; |
| if (perfmon_active.load()) |
| return ZX_ERR_BAD_STATE; |
| if (!perfmon_state) |
| return ZX_ERR_BAD_STATE; |
| |
| auto state = perfmon_state.get(); |
| |
| LTRACEF("global_ctrl 0x%" PRIx64 "\n", config->global_ctrl); |
| |
| // Note: The verification pass may also alter |config| to make things |
| // simpler for the implementation. |
| auto status = x86_perfmon_verify_config(config, state); |
| if (status != ZX_OK) |
| return status; |
| |
| state->global_ctrl = config->global_ctrl; |
| state->fixed_ctrl = config->fixed_ctrl; |
| state->debug_ctrl = config->debug_ctrl; |
| state->timebase_event = config->timebase_event; |
| |
| if (state->debug_ctrl & IA32_DEBUGCTL_LBR_MASK) { |
| if (!x86_perfmon_lbr_is_supported()) { |
| TRACEF("Last branch records requested in |debug_ctrl|, but not supported\n"); |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| state->request_lbr_record = true; |
| } |
| |
| x86_perfmon_stage_fixed_config(config, state); |
| x86_perfmon_stage_programmable_config(config, state); |
| x86_perfmon_stage_misc_config(config, state); |
| |
| return ZX_OK; |
| } |
| |
| // System statistics that come from MCHBAR. |
| // See, e.g., desktop-6th-gen-core-family-datasheet-vol-2. |
| // TODO(dje): Consider moving misc event support to a separate file |
| // when the amount of code to support them gets large enough. |
| |
| // Take advantage of the ABI's support for returning two values so that |
| // we can return both in registers. |
| struct ReadMiscResult { |
| // The value of the register. |
| uint64_t value; |
| // The record type to use, either |perfmon::kRecordTypeCount| or |
| // |perfmon::kRecordTypeValue|. |
| uint8_t type; |
| }; |
| |
| // Read the 32-bit counter from MCHBAR and return the delta |
| // since the last read. We do this in part because it's easier for clients |
| // to process and in part to catch the cases of the counter wrapping that |
| // we can (they're only 32 bits in h/w and are read-only). |
| // WARNING: This function has the side-effect of updating |*last_value_addr|. |
| static uint32_t read_mc_counter32(volatile uint32_t* addr, uint32_t* last_value_addr) { |
| uint32_t value = *addr; |
| uint32_t last_value = *last_value_addr; |
| *last_value_addr = value; |
| // Check for overflow. The code is the same in both branches; the if() |
| // exists to document the issue. |
| if (value < last_value) { |
| // Overflow, counter wrapped. |
| // We don't know how many times it wrapped, assume once. |
| // We rely on unsigned twos-complement arithmetic here. |
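| // E.g., last_value 0xfffffff0 and a new value of 0x10 gives |
| // 0x10 - 0xfffffff0 == 0x20 (mod 2^32), the correct delta for a single wrap. |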
| return value - last_value; |
| } else { |
| // The counter may still have wrapped, but we can't detect this case. |
| return value - last_value; |
| } |
| } |
| |
| // Read the 64-bit counter from MCHBAR and return the delta |
| // since the last read. We do this because it's easier for clients to process. |
| // Overflow is highly unlikely with a 64-bit counter. |
| // WARNING: This function has the side-effect of updating |*last_value_addr|. |
| static uint64_t read_mc_counter64(volatile uint64_t* addr, uint64_t* last_value_addr) { |
| uint64_t value = *addr; |
| uint64_t last_value = *last_value_addr; |
| *last_value_addr = value; |
| return value - last_value; |
| } |
| |
| // Read the 32-bit non-counter value from MCHBAR. |
| static uint32_t read_mc_value32(volatile uint32_t* addr) { return *addr; } |
| |
| static ReadMiscResult read_mc_typed_counter32(volatile uint32_t* addr, uint32_t* last_value_addr) { |
| return ReadMiscResult{read_mc_counter32(addr, last_value_addr), perfmon::kRecordTypeCount}; |
| } |
| |
| static ReadMiscResult read_mc_typed_counter64(volatile uint64_t* addr, uint64_t* last_value_addr) { |
| return ReadMiscResult{read_mc_counter64(addr, last_value_addr), perfmon::kRecordTypeCount}; |
| } |
| |
| static ReadMiscResult read_mc_typed_value32(volatile uint32_t* addr) { |
| return ReadMiscResult{read_mc_value32(addr), perfmon::kRecordTypeValue}; |
| } |
| |
| static volatile uint32_t* get_mc_addr32(PerfmonState* state, uint32_t hw_addr) { |
| return reinterpret_cast<volatile uint32_t*>( |
| reinterpret_cast<volatile char*>(state->mchbar_data.stats_addr) + hw_addr - |
| UNC_IMC_STATS_BEGIN); |
| } |
| |
| static volatile uint64_t* get_mc_addr64(PerfmonState* state, uint32_t hw_addr) { |
| return reinterpret_cast<volatile uint64_t*>( |
| reinterpret_cast<volatile char*>(state->mchbar_data.stats_addr) + hw_addr - |
| UNC_IMC_STATS_BEGIN); |
| } |
| |
| static ReadMiscResult read_mc_bytes_read(PerfmonState* state) { |
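| // The h/w counter appears to count in 64-byte (cache-line) units, hence the |
| // scaling to bytes below. |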
| uint32_t value = read_mc_counter32(get_mc_addr32(state, MISC_MEM_BYTES_READ_OFFSET), |
| &state->mchbar_data.last_mem.bytes_read); |
| // Return the value in bytes, easier for human readers of the |
| // resulting report. |
| return ReadMiscResult{value * 64ul, perfmon::kRecordTypeCount}; |
| } |
| |
| static ReadMiscResult read_mc_bytes_written(PerfmonState* state) { |
| uint32_t value = read_mc_counter32(get_mc_addr32(state, MISC_MEM_BYTES_WRITTEN_OFFSET), |
| &state->mchbar_data.last_mem.bytes_written); |
| // Return the value in bytes, easier for human readers of the |
| // resulting report. |
| return ReadMiscResult{value * 64ul, perfmon::kRecordTypeCount}; |
| } |
| |
| static ReadMiscResult read_mc_gt_requests(PerfmonState* state) { |
| return read_mc_typed_counter32(get_mc_addr32(state, MISC_MEM_GT_REQUESTS_OFFSET), |
| &state->mchbar_data.last_mem.gt_requests); |
| } |
| |
| static ReadMiscResult read_mc_ia_requests(PerfmonState* state) { |
| return read_mc_typed_counter32(get_mc_addr32(state, MISC_MEM_IA_REQUESTS_OFFSET), |
| &state->mchbar_data.last_mem.ia_requests); |
| } |
| |
| static ReadMiscResult read_mc_io_requests(PerfmonState* state) { |
| return read_mc_typed_counter32(get_mc_addr32(state, MISC_MEM_IO_REQUESTS_OFFSET), |
| &state->mchbar_data.last_mem.io_requests); |
| } |
| |
| static ReadMiscResult read_mc_all_active_core_cycles(PerfmonState* state) { |
| return read_mc_typed_counter64(get_mc_addr64(state, MISC_PKG_ALL_ACTIVE_CORE_CYCLES_OFFSET), |
| &state->mchbar_data.last_mem.all_active_core_cycles); |
| } |
| |
| static ReadMiscResult read_mc_any_active_core_cycles(PerfmonState* state) { |
| return read_mc_typed_counter64(get_mc_addr64(state, MISC_PKG_ANY_ACTIVE_CORE_CYCLES_OFFSET), |
| &state->mchbar_data.last_mem.any_active_core_cycles); |
| } |
| |
| static ReadMiscResult read_mc_active_gt_cycles(PerfmonState* state) { |
| return read_mc_typed_counter64(get_mc_addr64(state, MISC_PKG_ACTIVE_GT_CYCLES_OFFSET), |
| &state->mchbar_data.last_mem.active_gt_cycles); |
| } |
| |
| static ReadMiscResult read_mc_active_ia_gt_cycles(PerfmonState* state) { |
| return read_mc_typed_counter64(get_mc_addr64(state, MISC_PKG_ACTIVE_IA_GT_CYCLES_OFFSET), |
| &state->mchbar_data.last_mem.active_ia_gt_cycles); |
| } |
| |
| static ReadMiscResult read_mc_active_gt_slice_cycles(PerfmonState* state) { |
| return read_mc_typed_counter64(get_mc_addr64(state, MISC_PKG_ACTIVE_GT_SLICE_CYCLES_OFFSET), |
| &state->mchbar_data.last_mem.active_gt_slice_cycles); |
| } |
| |
| static ReadMiscResult read_mc_active_gt_engine_cycles(PerfmonState* state) { |
| return read_mc_typed_counter64(get_mc_addr64(state, MISC_PKG_ACTIVE_GT_ENGINE_CYCLES_OFFSET), |
| &state->mchbar_data.last_mem.active_gt_engine_cycles); |
| } |
| |
| static ReadMiscResult read_mc_peci_therm_margin(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_PECI_THERM_MARGIN_OFFSET)); |
| return ReadMiscResult{value & 0xffff, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_rapl_perf_status(PerfmonState* state) { |
| return read_mc_typed_value32(get_mc_addr32(state, MISC_PKG_RAPL_PERF_STATUS_OFFSET)); |
| } |
| |
| static ReadMiscResult read_mc_ia_freq_clamping_reasons(PerfmonState* state) { |
| // Some of the reserved bits have been observed to read as ones. Remove them |
| // to make the reported value easier to read. |
| const uint32_t kReserved = (1u << 31) | (1u << 30) | (1u << 25) | (1u << 19) | (1u << 18) | |
| (1u << 15) | (1u << 14) | (1u << 9) | (1u << 3) | (1u << 2); |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_IA_FREQ_CLAMPING_REASONS_OFFSET)); |
| return ReadMiscResult{value & ~kReserved, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_gt_freq_clamping_reasons(PerfmonState* state) { |
| // Some of the reserved bits have been observed to read as ones. Remove them |
| // to make the reported value easier to read. |
| const uint32_t kReserved = (1u << 31) | (1u << 30) | (1u << 29) | (1u << 25) | (1u << 20) | |
| (1u << 19) | (1u << 18) | (1u << 15) | (1u << 14) | (1u << 13) | |
| (1u << 9) | (1u << 4) | (1u << 3) | (1u << 2); |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_GT_FREQ_CLAMPING_REASONS_OFFSET)); |
| return ReadMiscResult{value & ~kReserved, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_rp_slice_freq(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_RP_GT_SLICE_FREQ_OFFSET)); |
| value = (value >> 17) & 0x1ff; |
| // Convert the value to MHz. |
| // We can't do floating point, and this doesn't have to be perfect. |
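| // The code scales by 16.667 MHz per step; e.g., a field value of 30 yields ~500 MHz. |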
| uint64_t scaled_value = value * 16667ul / 1000 /*16.667*/; |
| return ReadMiscResult{scaled_value, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_rp_unslice_freq(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_RP_GT_UNSLICE_FREQ_OFFSET)); |
| value = (value >> 8) & 0x1ff; |
| // Convert the value to MHz. |
| // We can't do floating point, and this doesn't have to be perfect. |
| uint64_t scaled_value = value * 16667ul / 1000 /*16.667*/; |
| return ReadMiscResult{scaled_value, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_rp_gt_volt(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_RP_GT_VOLT_OFFSET)); |
| return ReadMiscResult{value & 0xff, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_edram_temp(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_EDRAM_TEMP_OFFSET)); |
| return ReadMiscResult{value & 0xff, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_pkg_temp(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_PKG_TEMP_OFFSET)); |
| return ReadMiscResult{value & 0xff, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_ia_temp(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_IA_TEMP_OFFSET)); |
| return ReadMiscResult{value & 0xff, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_mc_gt_temp(PerfmonState* state) { |
| uint32_t value = read_mc_value32(get_mc_addr32(state, MISC_PKG_GT_TEMP_OFFSET)); |
| return ReadMiscResult{value & 0xff, perfmon::kRecordTypeValue}; |
| } |
| |
| static ReadMiscResult read_misc_event(PerfmonState* state, PmuEventId id) { |
| switch (id) { |
| case MISC_MEM_BYTES_READ_ID: |
| return read_mc_bytes_read(state); |
| case MISC_MEM_BYTES_WRITTEN_ID: |
| return read_mc_bytes_written(state); |
| case MISC_MEM_GT_REQUESTS_ID: |
| return read_mc_gt_requests(state); |
| case MISC_MEM_IA_REQUESTS_ID: |
| return read_mc_ia_requests(state); |
| case MISC_MEM_IO_REQUESTS_ID: |
| return read_mc_io_requests(state); |
| case MISC_PKG_ALL_ACTIVE_CORE_CYCLES_ID: |
| return read_mc_all_active_core_cycles(state); |
| case MISC_PKG_ANY_ACTIVE_CORE_CYCLES_ID: |
| return read_mc_any_active_core_cycles(state); |
| case MISC_PKG_ACTIVE_GT_CYCLES_ID: |
| return read_mc_active_gt_cycles(state); |
| case MISC_PKG_ACTIVE_IA_GT_CYCLES_ID: |
| return read_mc_active_ia_gt_cycles(state); |
| case MISC_PKG_ACTIVE_GT_SLICE_CYCLES_ID: |
| return read_mc_active_gt_slice_cycles(state); |
| case MISC_PKG_ACTIVE_GT_ENGINE_CYCLES_ID: |
| return read_mc_active_gt_engine_cycles(state); |
| case MISC_PKG_PECI_THERM_MARGIN_ID: |
| return read_mc_peci_therm_margin(state); |
| case MISC_PKG_RAPL_PERF_STATUS_ID: |
| return read_mc_rapl_perf_status(state); |
| case MISC_PKG_IA_FREQ_CLAMPING_REASONS_ID: |
| return read_mc_ia_freq_clamping_reasons(state); |
| case MISC_PKG_GT_FREQ_CLAMPING_REASONS_ID: |
| return read_mc_gt_freq_clamping_reasons(state); |
| case MISC_PKG_RP_GT_SLICE_FREQ_ID: |
| return read_mc_rp_slice_freq(state); |
| case MISC_PKG_RP_GT_UNSLICE_FREQ_ID: |
| return read_mc_rp_unslice_freq(state); |
| case MISC_PKG_RP_GT_VOLT_ID: |
| return read_mc_rp_gt_volt(state); |
| case MISC_PKG_EDRAM_TEMP_ID: |
| return read_mc_edram_temp(state); |
| case MISC_PKG_PKG_TEMP_ID: |
| return read_mc_pkg_temp(state); |
| case MISC_PKG_IA_TEMP_ID: |
| return read_mc_ia_temp(state); |
| case MISC_PKG_GT_TEMP_ID: |
| return read_mc_gt_temp(state); |
| default: |
| __UNREACHABLE; |
| } |
| } |
| |
| static void x86_perfmon_unmap_buffers_locked(PerfmonState* state) { |
| unsigned num_cpus = state->num_cpus; |
| for (unsigned cpu = 0; cpu < num_cpus; ++cpu) { |
| auto data = &state->cpu_data[cpu]; |
| if (data->buffer_start) { |
| data->buffer_mapping->Destroy(); |
| } |
| data->buffer_mapping.reset(); |
| data->buffer_start = nullptr; |
| data->buffer_end = nullptr; |
| data->buffer_next = nullptr; |
| } |
| |
| if (state->mchbar_data.mapping) { |
| state->mchbar_data.mapping->Destroy(); |
| } |
| state->mchbar_data.mapping.reset(); |
| state->mchbar_data.stats_addr = nullptr; |
| |
| LTRACEF("buffers unmapped"); |
| } |
| |
| static zx_status_t x86_map_mchbar_stat_registers(PerfmonState* state) { |
| DEBUG_ASSERT(perfmon_mchbar_bar != 0); |
| fbl::RefPtr<VmObjectPhysical> vmo; |
| vaddr_t begin_page = (perfmon_mchbar_bar + UNC_IMC_STATS_BEGIN) & ~(PAGE_SIZE - 1); |
| vaddr_t end_page = (perfmon_mchbar_bar + UNC_IMC_STATS_END) & ~(PAGE_SIZE - 1); |
| size_t num_bytes_to_map = end_page + PAGE_SIZE - begin_page; |
| size_t begin_offset = (perfmon_mchbar_bar + UNC_IMC_STATS_BEGIN) & (PAGE_SIZE - 1); |
| |
| // We only map in the page(s) with the data we need. |
| auto status = VmObjectPhysical::Create(begin_page, num_bytes_to_map, &vmo); |
| if (status != ZX_OK) |
| return status; |
| |
| const char name[] = "perfmon-mchbar"; |
| vmo->set_name(name, sizeof(name)); |
| status = vmo->SetMappingCachePolicy(ZX_CACHE_POLICY_UNCACHED_DEVICE); |
| if (status != ZX_OK) |
| return status; |
| |
| auto vmar = VmAspace::kernel_aspace()->RootVmar(); |
| uint32_t vmar_flags = 0; |
| uint32_t arch_mmu_flags = ARCH_MMU_FLAG_PERM_READ; |
| fbl::RefPtr<VmMapping> mapping; |
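| // With the current UNC_IMC_STATS_BEGIN/END offsets everything fits in a |
| // single 4KiB page, so the PAGE_SIZE mapping below covers |num_bytes_to_map|. |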
| status = vmar->CreateVmMapping(0, PAGE_SIZE, /*align_pow2*/ 0, vmar_flags, ktl::move(vmo), 0, |
| arch_mmu_flags, name, &mapping); |
| if (status != ZX_OK) |
| return status; |
| |
| status = mapping->MapRange(0, PAGE_SIZE, false); |
| if (status != ZX_OK) |
| return status; |
| |
| state->mchbar_data.mapping = mapping; |
| state->mchbar_data.stats_addr = reinterpret_cast<void*>(mapping->base() + begin_offset); |
| |
| // Record the current values of these so that the trace will only include |
| // the delta since tracing started. |
| #define INIT_MC_COUNT(member) \ |
| do { \ |
| state->mchbar_data.last_mem.member = 0; \ |
| (void)read_mc_##member(state); \ |
| } while (0) |
| INIT_MC_COUNT(bytes_read); |
| INIT_MC_COUNT(bytes_written); |
| INIT_MC_COUNT(gt_requests); |
| INIT_MC_COUNT(ia_requests); |
| INIT_MC_COUNT(io_requests); |
| INIT_MC_COUNT(all_active_core_cycles); |
| INIT_MC_COUNT(any_active_core_cycles); |
| INIT_MC_COUNT(active_gt_cycles); |
| INIT_MC_COUNT(active_ia_gt_cycles); |
| INIT_MC_COUNT(active_gt_slice_cycles); |
| INIT_MC_COUNT(active_gt_engine_cycles); |
| #undef INIT_MC_COUNT |
| |
| LTRACEF("memory stats mapped: begin 0x%lx, %zu bytes\n", mapping->base(), num_bytes_to_map); |
| |
| return ZX_OK; |
| } |
| |
| static zx_status_t x86_perfmon_map_buffers_locked(PerfmonState* state) { |
| unsigned num_cpus = state->num_cpus; |
| zx_status_t status = ZX_OK; |
| for (unsigned cpu = 0; cpu < num_cpus; ++cpu) { |
| auto data = &state->cpu_data[cpu]; |
| // Heads up: The logic is off if |vmo_offset| is non-zero. |
| const uint64_t vmo_offset = 0; |
| const size_t size = data->buffer_size; |
| const uint arch_mmu_flags = ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE; |
| const char* name = "ipm-buffer"; |
| status = VmAspace::kernel_aspace()->RootVmar()->CreateVmMapping( |
| 0 /* ignored */, size, 0 /* align pow2 */, 0 /* vmar flags */, data->buffer_vmo, vmo_offset, |
| arch_mmu_flags, name, &data->buffer_mapping); |
| if (status != ZX_OK) { |
| TRACEF("error %d mapping buffer: cpu %u, size 0x%zx\n", status, cpu, size); |
| break; |
| } |
| // Pass true for |commit| so that we get our pages mapped up front. |
| // Otherwise we'll need to allow for a page fault to happen in the |
| // PMI handler. |
| status = data->buffer_mapping->MapRange(vmo_offset, size, true); |
| if (status != ZX_OK) { |
| TRACEF("error %d mapping range: cpu %u, size 0x%zx\n", status, cpu, size); |
| data->buffer_mapping->Destroy(); |
| data->buffer_mapping.reset(); |
| break; |
| } |
| data->buffer_start = |
| reinterpret_cast<perfmon::BufferHeader*>(data->buffer_mapping->base() + vmo_offset); |
| data->buffer_end = reinterpret_cast<char*>(data->buffer_start) + size; |
| LTRACEF("buffer mapped: cpu %u, start %p, end %p\n", cpu, data->buffer_start, data->buffer_end); |
| |
| auto hdr = data->buffer_start; |
| hdr->version = perfmon::kBufferVersion; |
| hdr->arch = perfmon::kArchX64; |
| hdr->flags = 0; |
| hdr->ticks_per_second = ticks_per_second(); |
| hdr->capture_end = sizeof(*hdr); |
| data->buffer_next = reinterpret_cast<perfmon::RecordHeader*>( |
| reinterpret_cast<char*>(data->buffer_start) + hdr->capture_end); |
| } |
| |
| // Get access to MCHBAR stats if we can. |
| if (status == ZX_OK && state->need_mchbar) { |
| status = x86_map_mchbar_stat_registers(state); |
| } |
| |
| if (status != ZX_OK) { |
| x86_perfmon_unmap_buffers_locked(state); |
| } |
| |
| return status; |
| } |
| |
| static void x86_perfmon_start_cpu_task(void* raw_context) { |
| DEBUG_ASSERT(arch_ints_disabled()); |
| DEBUG_ASSERT(!perfmon_active.load() && raw_context); |
| |
| auto state = reinterpret_cast<PerfmonState*>(raw_context); |
| |
| for (unsigned i = 0; i < state->num_used_fixed; ++i) { |
| unsigned hw_num = state->fixed_hw_map[i]; |
| DEBUG_ASSERT(hw_num < perfmon_num_fixed_counters); |
| write_msr(IA32_FIXED_CTR0 + hw_num, state->fixed_initial_value[i]); |
| } |
| write_msr(IA32_FIXED_CTR_CTRL, state->fixed_ctrl); |
| |
| for (unsigned i = 0; i < state->num_used_programmable; ++i) { |
| // Ensure PERFEVTSEL.EN is zero before resetting the counter value, |
| // h/w requires it (apparently even if global ctrl is off). |
| write_msr(IA32_PERFEVTSEL_FIRST + i, 0); |
| // The counter must be written before PERFEVTSEL.EN is set to 1. |
| write_msr(IA32_PMC_FIRST + i, state->programmable_initial_value[i]); |
| write_msr(IA32_PERFEVTSEL_FIRST + i, state->programmable_hw_events[i]); |
| } |
| |
| x86_perfmon_lbr_clear(); |
| |
| write_msr(IA32_DEBUGCTL, state->debug_ctrl); |
| |
| apic_pmi_unmask(); |
| |
| // Enable counters as late as possible so that our setup doesn't contribute |
| // to the data. |
| enable_counters(state); |
| } |
| |
| // Begin collecting data. |
| |
| zx_status_t arch_perfmon_start() { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| |
| if (!perfmon_supported) |
| return ZX_ERR_NOT_SUPPORTED; |
| if (perfmon_active.load()) |
| return ZX_ERR_BAD_STATE; |
| if (!perfmon_state) |
| return ZX_ERR_BAD_STATE; |
| |
| // Make sure all relevant MSRs have been wiped clean. |
| if (!perfmon_hw_initialized) { |
| mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_perfmon_reset_task, nullptr); |
| perfmon_hw_initialized = true; |
| } |
| |
| // Sanity check the buffers and map them in. |
| // This is deferred until now so that the buffers are only mapped in |
| // while they are actually needed. |
| // TODO(dje): OTOH one might want to start/stop/start/stop/... and |
| // continually mapping/unmapping will be painful. Revisit when things |
| // settle down. |
| auto state = perfmon_state.get(); |
| auto status = x86_perfmon_map_buffers_locked(state); |
| if (status != ZX_OK) |
| return status; |
| |
| TRACEF("Enabling perfmon, %u fixed, %u programmable, %u misc\n", state->num_used_fixed, |
| state->num_used_programmable, state->num_used_misc); |
| if (LOCAL_TRACE) { |
| LTRACEF("global ctrl: 0x%" PRIx64 ", fixed ctrl: 0x%" PRIx64 "\n", state->global_ctrl, |
| state->fixed_ctrl); |
| for (unsigned i = 0; i < state->num_used_fixed; ++i) { |
| LTRACEF("fixed[%u]: num %u, initial 0x%" PRIx64 "\n", i, state->fixed_hw_map[i], |
| state->fixed_initial_value[i]); |
| } |
| for (unsigned i = 0; i < state->num_used_programmable; ++i) { |
| LTRACEF("programmable[%u]: id 0x%x, initial 0x%" PRIx64 "\n", i, |
| state->programmable_events[i], state->programmable_initial_value[i]); |
| } |
| } |
| |
| mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_perfmon_start_cpu_task, state); |
| perfmon_active.store(true); |
| |
| return ZX_OK; |
| } |
| |
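| // Capture the current counter values for |cpu| and append them to its |
| // trace buffer as count (or value) records. |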
| // This is invoked via mp_sync_exec, which thread safety analysis cannot follow. |
| static void x86_perfmon_write_last_records(PerfmonState* state, cpu_num_t cpu) { |
| PerfmonCpuData* data = &state->cpu_data[cpu]; |
| perfmon::RecordHeader* next = data->buffer_next; |
| |
| zx_time_t now = _rdtsc(); |
| next = arch_perfmon_write_time_record(next, perfmon::kEventIdNone, now); |
| |
| // If the counter triggers interrupts then the PMI handler will |
| // continually reset it to its initial value. To keep things simple |
| // just always subtract out the initial value from the current value |
| // and write the difference out. For non-interrupt triggering events |
| // the user should normally initialize the counter to zero to get |
| // correct results. |
| // Counters that don't trigger interrupts could overflow and we won't |
| // necessarily catch it, but there's nothing we can do about it. |
| // We can handle the overflowed-once case, which should catch the |
| // vast majority of cases. |
| // TODO(dje): Counters that trigger interrupts should never have |
| // an overflowed value here, but that's what I'm seeing. |
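| // Illustrative example, assuming a hypothetical 48-bit counter: if the |
| // initial value is 0xFFFF00000000 and the counter now reads 0x10, it |
| // wrapped once and the reported count is |
| // 0x10 + (0xFFFFFFFFFFFF - 0xFFFF00000000 + 1) = 0x100000010. |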
| |
| for (unsigned i = 0; i < state->num_used_programmable; ++i) { |
| PmuEventId id = state->programmable_events[i]; |
| DEBUG_ASSERT(id != 0); |
| uint64_t count = read_msr(IA32_PMC_FIRST + i); |
| if (count >= state->programmable_initial_value[i]) { |
| count -= state->programmable_initial_value[i]; |
| } else { |
| // The max counter value is generally not 64 bits. |
| count += (perfmon_max_programmable_counter_value - state->programmable_initial_value[i] + 1); |
| } |
| next = arch_perfmon_write_count_record(next, id, count); |
| } |
| for (unsigned i = 0; i < state->num_used_fixed; ++i) { |
| PmuEventId id = state->fixed_events[i]; |
| DEBUG_ASSERT(id != 0); |
| unsigned hw_num = state->fixed_hw_map[i]; |
| DEBUG_ASSERT(hw_num < perfmon_num_fixed_counters); |
| uint64_t count = read_msr(IA32_FIXED_CTR0 + hw_num); |
| if (count >= state->fixed_initial_value[i]) { |
| count -= state->fixed_initial_value[i]; |
| } else { |
| // The max counter value is generally not 64 bits. |
| count += (perfmon_max_fixed_counter_value - state->fixed_initial_value[i] + 1); |
| } |
| next = arch_perfmon_write_count_record(next, id, count); |
| } |
| // Misc events are currently all non-cpu-specific. |
| // Just report for cpu 0. See pmi_interrupt_handler. |
| if (cpu == 0) { |
| for (unsigned i = 0; i < state->num_used_misc; ++i) { |
| PmuEventId id = state->misc_events[i]; |
| ReadMiscResult typed_value = read_misc_event(state, id); |
| switch (typed_value.type) { |
| case perfmon::kRecordTypeCount: |
| next = arch_perfmon_write_count_record(next, id, typed_value.value); |
| break; |
| case perfmon::kRecordTypeValue: |
| next = arch_perfmon_write_value_record(next, id, typed_value.value); |
| break; |
| default: |
| __UNREACHABLE; |
| } |
| } |
| } |
| |
| data->buffer_next = next; |
| } |
| |
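| // Write any remaining records for |cpu| (unless the buffer is full) and |
| // record the final capture size in the buffer header. |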
| static void x86_perfmon_finalize_buffer(PerfmonState* state, cpu_num_t cpu) { |
| LTRACEF("Collecting last data for cpu %u\n", cpu); |
| |
| PerfmonCpuData* data = &state->cpu_data[cpu]; |
| perfmon::BufferHeader* hdr = data->buffer_start; |
| |
| // KISS. There may be enough space to write some of what we want to write |
| // here, but don't try. Just use the same simple check that |
| // |pmi_interrupt_handler()| does. |
| size_t space_needed = get_max_space_needed_for_all_records(state); |
| if (reinterpret_cast<char*>(data->buffer_next) + space_needed > data->buffer_end) { |
| hdr->flags |= perfmon::BufferHeader::kBufferFlagFull; |
| LTRACEF("Buffer overflow on cpu %u\n", cpu); |
| } else { |
| x86_perfmon_write_last_records(state, cpu); |
| } |
| |
| hdr->capture_end = |
| reinterpret_cast<char*>(data->buffer_next) - reinterpret_cast<char*>(data->buffer_start); |
| } |
| |
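| // Per-cpu worker for arch_perfmon_stop(), invoked via mp_sync_exec: |
| // disable counting, mask the PMI, flush the final counter values to the |
| // trace buffer, and clear the overflow and LBR state. |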
| static void x86_perfmon_stop_cpu_task(void* raw_context) { |
| // Disable all counters ASAP. |
| disable_counters(); |
| apic_pmi_mask(); |
| |
| DEBUG_ASSERT(arch_ints_disabled()); |
| DEBUG_ASSERT(!perfmon_active.load()); |
| DEBUG_ASSERT(raw_context); |
| |
| auto state = reinterpret_cast<PerfmonState*>(raw_context); |
| auto cpu = arch_curr_cpu_num(); |
| auto data = &state->cpu_data[cpu]; |
| |
| // Retrieve final event values and write into the trace buffer. |
| |
| if (data->buffer_start) { |
| x86_perfmon_finalize_buffer(state, cpu); |
| } |
| |
| x86_perfmon_clear_overflow_indicators(); |
| x86_perfmon_lbr_clear(); |
| } |
| |
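| // Stop data collection and unmap the trace buffers. |
| // The caller must hold the perfmon lock. |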
| static void arch_perfmon_stop_locked() TA_REQ(PerfmonLock::Get()) { |
| if (!perfmon_supported) { |
| // Nothing to do. |
| return; |
| } |
| if (!perfmon_state) { |
| // Nothing to do. |
| return; |
| } |
| if (!perfmon_active.load()) { |
| // Nothing to do. |
| return; |
| } |
| |
| TRACEF("Disabling perfmon\n"); |
| |
| // Do this before anything else so that any PMI interrupts from this point |
| // on won't try to access potentially unmapped memory. |
| perfmon_active.store(false); |
| |
| // TODO(dje): Check clobbering of values - user should be able to do |
| // multiple stops and still read register values. |
| |
| auto state = perfmon_state.get(); |
| mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_perfmon_stop_cpu_task, state); |
| |
| // arch_perfmon_start currently maps the buffers in, so we unmap them here. |
| // Make sure to do this after we've turned everything off so that we |
| // don't get another PMI after this. |
| x86_perfmon_unmap_buffers_locked(state); |
| } |
| |
| // Stop collecting data. |
| void arch_perfmon_stop() { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| arch_perfmon_stop_locked(); |
| } |
| |
| // Worker to reset the PMU h/w, executed on all cpus; used by both |
| // arch_perfmon_start (initial reset) and arch_perfmon_fini. |
| // This is invoked via mp_sync_exec, which thread safety analysis cannot follow. |
| static void x86_perfmon_reset_task(void* raw_context) { |
| DEBUG_ASSERT(arch_ints_disabled()); |
| DEBUG_ASSERT(!perfmon_active.load()); |
| DEBUG_ASSERT(!raw_context); |
| |
| disable_counters(); |
| apic_pmi_mask(); |
| x86_perfmon_clear_overflow_indicators(); |
| |
| write_msr(IA32_DEBUGCTL, 0); |
| |
| for (unsigned i = 0; i < perfmon_num_programmable_counters; ++i) { |
| write_msr(IA32_PERFEVTSEL_FIRST + i, 0); |
| write_msr(IA32_PMC_FIRST + i, 0); |
| } |
| |
| write_msr(IA32_FIXED_CTR_CTRL, 0); |
| for (unsigned i = 0; i < perfmon_num_fixed_counters; ++i) { |
| write_msr(IA32_FIXED_CTR0 + i, 0); |
| } |
| } |
| |
| // Finish data collection, reset h/w back to initial state and undo |
| // everything x86_perfmon_init did. |
| void arch_perfmon_fini() { |
| Guard<Mutex> guard(PerfmonLock::Get()); |
| |
| if (!perfmon_supported) { |
| // Nothing to do. |
| return; |
| } |
| |
| if (perfmon_active.load()) { |
| arch_perfmon_stop_locked(); |
| DEBUG_ASSERT(!perfmon_active.load()); |
| } |
| |
| mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_perfmon_reset_task, nullptr); |
| |
| perfmon_state.reset(); |
| } |
| |
| // Interrupt handling. |
| |
| // Write out a |perfmon::LastBranchRecord| record. |
| static perfmon::RecordHeader* x86_perfmon_write_last_branches(PerfmonState* state, uint64_t cr3, |
| perfmon::RecordHeader* hdr, |
| PmuEventId id) { |
| auto rec = reinterpret_cast<perfmon::LastBranchRecord*>(hdr); |
| auto num_entries = perfmon_lbr_stack_size; |
| static_assert(perfmon::LastBranchRecord::kMaxNumLastBranch == |
| ktl::size(perfmon::LastBranchRecord{}.branches)); |
| DEBUG_ASSERT(num_entries > 0 && num_entries <= perfmon::LastBranchRecord::kMaxNumLastBranch); |
| arch_perfmon_write_header(&rec->header, perfmon::kRecordTypeLastBranch, id); |
| rec->num_branches = num_entries; |
| rec->aspace = cr3; |
| |
| auto* branches = rec->branches; |
| unsigned tos = |
| ((read_msr(SKL_LAST_BRANCH_TOS) & IA32_LBR_TOS_TOS_MASK) >> IA32_LBR_TOS_TOS_SHIFT); |
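| // The LBR MSRs form a ring buffer of |num_entries| entries. Walk it |
| // backwards from the most recent entry (TOS) so that branches[0] holds |
| // the newest branch. |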
| for (unsigned i = 0; i < num_entries; ++i) { |
| unsigned msr_offset = (tos - i) % num_entries; |
| branches[i].from = read_msr(SKL_LAST_BRANCH_FROM_0 + msr_offset); |
| branches[i].to = read_msr(SKL_LAST_BRANCH_TO_0 + msr_offset); |
| uint64_t info = read_msr(SKL_LAST_BRANCH_INFO_0 + msr_offset); |
| // Only write these bits out. |
| info &= (IA32_LBR_INFO_CYCLE_COUNT_MASK | IA32_LBR_INFO_MISPRED_MASK); |
| branches[i].info = info; |
| } |
| |
| // Get a pointer to the end of this record. Since this record is |
| // variable-length, it's more complicated than just "rec + 1". |
| auto next = reinterpret_cast<perfmon::RecordHeader*>(reinterpret_cast<char*>(rec) + |
| perfmon::LastBranchRecordSize(rec)); |
| LTRACEF("LBR record: num branches %u, @%p, next @%p\n", num_entries, hdr, next); |
| return next; |
| } |
| |
| // Helper function so that there is only one place where we enable/disable |
| // interrupts (our caller). |
| // Returns true on success, false if the buffer is full. |
| |
| static bool pmi_interrupt_handler(iframe_t* frame, PerfmonState* state) { |
| cpu_num_t cpu = arch_curr_cpu_num(); |
| auto data = &state->cpu_data[cpu]; |
| |
| // On x86 zx_ticks_get uses rdtsc. |
| zx_time_t now = _rdtsc(); |
| LTRACEF("cpu %u: now %" PRIi64 ", sp %p\n", cpu, now, __GET_FRAME()); |
| |
| // Rather than continually checking if we have enough space, just |
| // conservatively check for the maximum amount we'll need. |
| size_t space_needed = get_max_space_needed_for_all_records(state); |
| if (reinterpret_cast<char*>(data->buffer_next) + space_needed > data->buffer_end) { |
| TRACEF("cpu %u: @%" PRIi64 " pmi buffer full\n", cpu, now); |
| data->buffer_start->flags |= perfmon::BufferHeader::kBufferFlagFull; |
| return false; |
| } |
| |
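| // Snapshot the overflow status once; the loops below decide which |
| // counters to process from this value, and the handled bits are cleared |
| // at the end via a write to IA32_PERF_GLOBAL_STATUS_RESET. |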
| const uint64_t status = read_msr(IA32_PERF_GLOBAL_STATUS); |
| uint64_t bits_to_clear = 0; |
| uint64_t cr3 = x86_get_cr3(); |
| |
| LTRACEF("cpu %u: status 0x%" PRIx64 "\n", cpu, status); |
| |
| if (status & perfmon_counter_status_bits) { |
| #if TRY_FREEZE_ON_PMI |
| if (!(status & IA32_PERF_GLOBAL_STATUS_CTR_FRZ_MASK)) |
| LTRACEF("Eh? status.CTR_FRZ not set\n"); |
| #else |
| if (status & IA32_PERF_GLOBAL_STATUS_CTR_FRZ_MASK) |
| LTRACEF("Eh? status.CTR_FRZ is set\n"); |
| #endif |
| |
| auto next = data->buffer_next; |
| bool saw_timebase = false; |
| // We can't record every event that requested LBR data. |
| // It is unspecified which one we pick. |
| PmuEventId lbr_id = perfmon::kEventIdNone; |
| |
| next = arch_perfmon_write_time_record(next, perfmon::kEventIdNone, now); |
| |
| // Note: We don't write "value" records here, instead preferring the |
| // smaller "tick" record. If the user is tallying the counts, they are |
| // required to recognize this and apply the tick rate. |
| // TODO(dje): Precompute mask to detect whether the interrupt is for |
| // the timebase counter, and then combine the loops. |
| |
| for (unsigned i = 0; i < state->num_used_programmable; ++i) { |
| if (!(status & IA32_PERF_GLOBAL_STATUS_PMC_OVF_MASK(i))) |
| continue; |
| PmuEventId id = state->programmable_events[i]; |
| // Counters using a separate timebase are handled below. |
| // We shouldn't get an interrupt on a counter using a timebase. |
| // TODO(dje): The counter could still overflow. Later. |
| if (id == state->timebase_event) { |
| saw_timebase = true; |
| } else if (state->programmable_flags[i] & perfmon::kPmuConfigFlagUsesTimebase) { |
| continue; |
| } |
| if (state->programmable_flags[i] & perfmon::kPmuConfigFlagPc) { |
| next = arch_perfmon_write_pc_record(next, id, cr3, frame->ip); |
| } else { |
| next = arch_perfmon_write_tick_record(next, id); |
| } |
| if (state->programmable_flags[i] & perfmon::kPmuConfigFlagLastBranch) { |
| lbr_id = id; |
| } |
| LTRACEF("cpu %u: resetting PMC %u to 0x%" PRIx64 "\n", cpu, i, |
| state->programmable_initial_value[i]); |
| write_msr(IA32_PMC_FIRST + i, state->programmable_initial_value[i]); |
| } |
| |
| for (unsigned i = 0; i < state->num_used_fixed; ++i) { |
| unsigned hw_num = state->fixed_hw_map[i]; |
| DEBUG_ASSERT(hw_num < perfmon_num_fixed_counters); |
| if (!(status & IA32_PERF_GLOBAL_STATUS_FIXED_OVF_MASK(hw_num))) |
| continue; |
| PmuEventId id = state->fixed_events[i]; |
| // Counters using a separate timebase are handled below. |
| // We shouldn't get an interrupt on a counter using a timebase. |
| // TODO(dje): The counter could still overflow. Later. |
| if (id == state->timebase_event) { |
| saw_timebase = true; |
| } else if (state->fixed_flags[i] & perfmon::kPmuConfigFlagUsesTimebase) { |
| continue; |
| } |
| if (state->fixed_flags[i] & perfmon::kPmuConfigFlagPc) { |
| next = arch_perfmon_write_pc_record(next, id, cr3, frame->ip); |
| } else { |
| next = arch_perfmon_write_tick_record(next, id); |
| } |
| if (state->fixed_flags[i] & perfmon::kPmuConfigFlagLastBranch) { |
| lbr_id = id; |
| } |
| LTRACEF("cpu %u: resetting FIXED %u to 0x%" PRIx64 "\n", cpu, hw_num, |
| state->fixed_initial_value[i]); |
| write_msr(IA32_FIXED_CTR0 + hw_num, state->fixed_initial_value[i]); |
| } |
| |
| bits_to_clear |= perfmon_counter_status_bits; |
| |
| // Now handle events that have kPmuConfigFlagUsesTimebase set. |
| if (saw_timebase) { |
| for (unsigned i = 0; i < state->num_used_programmable; ++i) { |
| if (!(state->programmable_flags[i] & perfmon::kPmuConfigFlagUsesTimebase)) |
| continue; |
| PmuEventId id = state->programmable_events[i]; |
| uint64_t count = read_msr(IA32_PMC_FIRST + i); |
| next = arch_perfmon_write_count_record(next, id, count); |
| // We could leave the counter alone, but it could overflow. |
| // Instead reduce the risk and just always reset it to its initial value. |
| LTRACEF("cpu %u: resetting PMC %u to 0x%" PRIx64 "\n", cpu, i, |
| state->programmable_initial_value[i]); |
| write_msr(IA32_PMC_FIRST + i, state->programmable_initial_value[i]); |
| } |
| for (unsigned i = 0; i < state->num_used_fixed; ++i) { |
| if (!(state->fixed_flags[i] & perfmon::kPmuConfigFlagUsesTimebase)) |
| continue; |
| PmuEventId id = state->fixed_events[i]; |
| unsigned hw_num = state->fixed_hw_map[i]; |
| DEBUG_ASSERT(hw_num < perfmon_num_fixed_counters); |
| uint64_t count = read_msr(IA32_FIXED_CTR0 + hw_num); |
| next = arch_perfmon_write_count_record(next, id, count); |
| // We could leave the counter alone, but it could overflow. |
| // Instead reduce the risk and just always reset it to its initial value. |
| LTRACEF("cpu %u: resetting FIXED %u to 0x%" PRIx64 "\n", cpu, hw_num, |
| state->fixed_initial_value[i]); |
| write_msr(IA32_FIXED_CTR0 + hw_num, state->fixed_initial_value[i]); |
| } |
| // Misc events are currently all non-cpu-specific. We have a |
| // timebase driving their collection, but useful timebases |
| // are triggered on each cpu. One thing we'd like to avoid is |
| // contention for the cache line containing these counters. |
| // For now, only collect data when we're running on cpu 0. |
| // This is not ideal; cpu 0 could be mostly idle. OTOH, some |
| // interrupts are currently only serviced on cpu 0, which |
| // ameliorates the problem somewhat. |
| if (cpu == 0) { |
| for (unsigned i = 0; i < state->num_used_misc; ++i) { |
| if (!(state->misc_flags[i] & perfmon::kPmuConfigFlagUsesTimebase)) { |
| // While a timebase is required for all current misc |
| // counters, we don't assume this here. |
| continue; |
| } |
| PmuEventId id = state->misc_events[i]; |
| ReadMiscResult typed_value = read_misc_event(state, id); |
| switch (typed_value.type) { |
| case perfmon::kRecordTypeCount: |
| next = arch_perfmon_write_count_record(next, id, typed_value.value); |
| break; |
| case perfmon::kRecordTypeValue: |
| next = arch_perfmon_write_value_record(next, id, typed_value.value); |
| break; |
| default: |
| __UNREACHABLE; |
| } |
| } |
| } |
| } |
| |
| if (lbr_id != perfmon::kEventIdNone) { |
| next = x86_perfmon_write_last_branches(state, cr3, next, lbr_id); |
| } |
| |
| data->buffer_next = next; |
| } |
| |
| // We shouldn't be seeing these set (at least not yet). |
| if (status & IA32_PERF_GLOBAL_STATUS_TRACE_TOPA_PMI_MASK) |
| LTRACEF("WARNING: GLOBAL_STATUS_TRACE_TOPA_PMI set\n"); |
| if (status & IA32_PERF_GLOBAL_STATUS_LBR_FRZ_MASK) |
| LTRACEF("WARNING: GLOBAL_STATUS_LBR_FRZ set\n"); |
| if (status & IA32_PERF_GLOBAL_STATUS_DS_BUFFER_OVF_MASK) |
| LTRACEF("WARNING: GLOBAL_STATUS_DS_BUFFER_OVF set\n"); |
| // TODO(dje): IA32_PERF_GLOBAL_STATUS_ASCI_MASK ??? |
| |
| // Note IA32_PERF_GLOBAL_STATUS_CTR_FRZ_MASK is readonly. |
| bits_to_clear |= |
| (IA32_PERF_GLOBAL_STATUS_UNCORE_OVF_MASK | IA32_PERF_GLOBAL_STATUS_COND_CHGD_MASK); |
| |
| // TODO(dje): No need to accumulate bits to clear if we're going to clear |
| // everything that's set anyway. Kept as is during development. |
| bits_to_clear |= status; |
| |
| LTRACEF("cpu %u: clearing status bits 0x%" PRIx64 "\n", cpu, bits_to_clear); |
| write_msr(IA32_PERF_GLOBAL_STATUS_RESET, bits_to_clear); |
| |
| // Writing to IA32_PERF_GLOBAL_STATUS_RESET should clear IA32_PERF_GLOBAL_STATUS indicators. |
| // If debug asserts are implemented, read the MSR to be sure; avoid the read otherwise, as |
| // reading the MSR here is expensive. |
| if (DEBUG_ASSERT_IMPLEMENTED) { |
| uint64_t end_status = read_msr(IA32_PERF_GLOBAL_STATUS); |
| if (end_status != 0) |
| TRACEF("WARNING: cpu %u: end status 0x%" PRIx64 "\n", cpu, end_status); |
| } |
| |
| return true; |
| } |
| |
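| // Handler for the performance monitor interrupt (PMI). Writes records for |
| // the counters that overflowed and then re-arms the PMU, unless the trace |
| // buffer is full, in which case everything is left turned off. |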
| void apic_pmi_interrupt_handler(iframe_t* frame) TA_REQ(PerfmonLock::Get()) { |
| if (!perfmon_active.load()) { |
| apic_issue_eoi(); |
| return; |
| } |
| |
| #if TRY_FREEZE_ON_PMI |
| // Note: We're using perfmon v4 "streamlined" processing here. |
| // See Intel vol3 table 17-3 "Legacy and Streamlined Operation with |
| // Freeze_Perfmon_On_PMI = 1, Counter Overflowed". |
| #else |
| // Turn all counters off as soon as possible so that the counters that |
| // haven't overflowed yet stop counting while we're working. |
| // TODO(dje): Is this necessary with CTR_FRZ? |
| // Otherwise, once we reset the counter that overflowed, the other |
| // counters will resume counting, and if we don't reset them too then |
| // CTR_FRZ remains set and we'll get no more PMIs. |
| disable_counters(); |
| #endif |
| |
| DEBUG_ASSERT(arch_ints_disabled()); |
| |
| CPU_STATS_INC(perf_ints); |
| |
| auto state = perfmon_state.get(); |
| |
| #if 0 |
| // TODO(dje): We may want this anyway. If we want to be able to handle |
| // page faults inside this handler we'll need to turn interrupts back |
| // on. At the moment we can't do this as we don't handle recursive PMIs. |
| arch_set_blocking_disallowed(false); |
| arch_enable_ints(); |
| #endif |
| |
| bool success = pmi_interrupt_handler(frame, state); |
| |
| #if 0 |
| arch_disable_ints(); |
| arch_set_blocking_disallowed(true); |
| #endif |
| |
| // This is done here instead of in the caller so that we have full control |
| // of when counting is restored. |
| apic_issue_eoi(); |
| |
| // If the buffer is full, leave everything turned off. |
| if (!success) { |
| #if TRY_FREEZE_ON_PMI |
| disable_counters(); |
| #else |
| // Don't restore GLOBAL_CTRL, leave everything turned off. |
| #endif |
| } else { |
| // The docs suggest this is only necessary for earlier chips |
| // (e.g., not Skylake); see Intel vol3 section 10.5.1 "Local Vector Table". |
| // However, this is needed for at least Skylake too (at least when |
| // Freeze-On-PMI is off). |
| apic_pmi_unmask(); |
| |
| #if !TRY_FREEZE_ON_PMI |
| // This is the last thing we do: Once we do this the counters |
| // will start counting again. |
| enable_counters(state); |
| #endif |
| } |
| } |