| /* Copyright 2019 Google LLC. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "ruy/pmu.h" |
| |
| #include "ruy/check_macros.h" |
| |
| #ifdef __linux__ |
| #include <asm/unistd.h> |
| #include <linux/perf_event.h> |
| #include <sys/ioctl.h> |
| #include <syscall.h> |
| #include <unistd.h> |
| |
| #include <cstdio> |
| #endif |
| |
| #include <algorithm> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <cstring> |
| |
| namespace ruy { |
| |
| // Linux-specific. Not ARM-specific. |
| #ifdef __linux__ |
| class PerfEvent { |
| public: |
| PerfEvent(std::uint32_t type, std::uint64_t config) { |
| perf_event_attr pe; |
| memset(&pe, 0, sizeof(pe)); |
| pe.size = sizeof(pe); |
| pe.type = type; |
| pe.config = config; |
| pe.disabled = 1; |
| pe.exclude_kernel = 1; |
| pe.exclude_hv = 1; |
| pe.inherit = 1; |
| fd_ = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0); |
| if (fd_ == -1) { |
| fprintf(stderr, "perf_event_open failed for config 0x%lx\n", |
| static_cast<unsigned long>(config)); |
| // abort(); |
| } |
| } |
| |
| ~PerfEvent() { |
| RUY_CHECK(!started_); |
| close(fd_); |
| } |
| |
| void Start() { |
| RUY_CHECK(!started_); |
| started_ = true; |
| ioctl(fd_, PERF_EVENT_IOC_RESET, 0); |
| ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0); |
| count_at_start_ = Read(); |
| } |
| |
| void Stop() { |
| RUY_CHECK(started_); |
| started_ = false; |
| ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0); |
| count_at_stop_ = Read(); |
| } |
| |
| std::int64_t Count() const { |
| RUY_CHECK(!started_); |
| return count_at_stop_ - count_at_start_; |
| } |
| |
| private: |
| std::int64_t Read() const { |
| std::int64_t count; |
| RUY_CHECK_NE(read(fd_, &count, sizeof(count)), -1); |
| return count; |
| } |
| std::int64_t count_at_start_ = -1; |
| std::int64_t count_at_stop_ = -1; |
| bool started_ = false; |
| int fd_ = -1; |
| }; |
| #else |
// Stub for non-Linux platforms, so that this file still compiles there.
// Every operation is a no-op and the reported count is always zero.
#define PERF_TYPE_RAW 0
class PerfEvent {
 public:
  PerfEvent(std::uint32_t /*type*/, std::uint64_t /*config*/) {}
  ~PerfEvent() {}

  void Start() {}
  void Stop() {}

  std::int64_t Count() const {
    return std::int64_t{0};
  }
};
| #endif |
| |
// ARM-specific: ARM PMU event numbers, to be queried as Linux perf events of
// type PERF_TYPE_RAW.
namespace arm_pmuv3 {

// Most of these constants are unused in this file; silence the warning.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-const-variable"

// Event numbers as listed in the ARMv8 architecture reference manual.
constexpr std::uint16_t L1I_CACHE_REFILL{0x01};
constexpr std::uint16_t L1I_TLB_REFILL{0x02};
constexpr std::uint16_t L1D_CACHE_REFILL{0x03};
constexpr std::uint16_t L1D_CACHE{0x04};
constexpr std::uint16_t L1D_TLB_REFILL{0x05};
constexpr std::uint16_t LD_RETIRED{0x06};
constexpr std::uint16_t ST_RETIRED{0x07};
constexpr std::uint16_t INST_RETIRED{0x08};
constexpr std::uint16_t EXC_TAKEN{0x09};
constexpr std::uint16_t EXC_RETURN{0x0A};
constexpr std::uint16_t CID_WRITE_RETIRED{0x0B};
constexpr std::uint16_t PC_WRITE_RETIRED{0x0C};
constexpr std::uint16_t BR_IMMED_RETIRED{0x0D};
constexpr std::uint16_t BR_RETURN_RETIRED{0x0E};
constexpr std::uint16_t UNALIGNED_LDST_RETIRED{0x0F};

constexpr std::uint16_t BR_MIS_PRED{0x10};
constexpr std::uint16_t CPU_CYCLES{0x11};
constexpr std::uint16_t BR_PRED{0x12};
constexpr std::uint16_t MEM_ACCESS{0x13};
constexpr std::uint16_t L1I_CACHE{0x14};
constexpr std::uint16_t L1D_CACHE_WB{0x15};
constexpr std::uint16_t L2D_CACHE{0x16};
constexpr std::uint16_t L2D_CACHE_REFILL{0x17};
constexpr std::uint16_t L2D_CACHE_WB{0x18};
constexpr std::uint16_t BUS_ACCESS{0x19};
constexpr std::uint16_t MEMORY_ERROR{0x1A};
constexpr std::uint16_t INST_SPEC{0x1B};
constexpr std::uint16_t TTBR_WRITE_RETIRED{0x1C};
constexpr std::uint16_t BUS_CYCLES{0x1D};
constexpr std::uint16_t CHAIN{0x1E};
constexpr std::uint16_t L1D_CACHE_ALLOCATE{0x1F};

constexpr std::uint16_t L2D_CACHE_ALLOCATE{0x20};
constexpr std::uint16_t BR_RETIRED{0x21};
constexpr std::uint16_t BR_MIS_PRED_RETIRED{0x22};
constexpr std::uint16_t STALL_FRONTEND{0x23};
constexpr std::uint16_t STALL_BACKEND{0x24};
constexpr std::uint16_t L1D_TLB{0x25};
constexpr std::uint16_t L1I_TLB{0x26};
constexpr std::uint16_t L2I_CACHE{0x27};
constexpr std::uint16_t L2I_CACHE_REFILL{0x28};
constexpr std::uint16_t L3D_CACHE_ALLOCATE{0x29};
constexpr std::uint16_t L3D_CACHE_REFILL{0x2A};
constexpr std::uint16_t L3D_CACHE{0x2B};
constexpr std::uint16_t L3D_CACHE_WB{0x2C};
constexpr std::uint16_t L2D_TLB_REFILL{0x2D};
constexpr std::uint16_t L2I_TLB_REFILL{0x2E};
constexpr std::uint16_t L2D_TLB{0x2F};

constexpr std::uint16_t L2I_TLB{0x30};
constexpr std::uint16_t LL_CACHE{0x32};
constexpr std::uint16_t LL_CACHE_MISS{0x33};
constexpr std::uint16_t DTLB_WALK{0x34};
constexpr std::uint16_t LL_CACHE_RD{0x36};
constexpr std::uint16_t LL_CACHE_MISS_RD{0x37};

// Additional implementation-defined events, gathered from assorted online
// sources rather than from the architecture reference manual.
constexpr std::uint16_t L1D_CACHE_RD{0x40};
constexpr std::uint16_t L1D_CACHE_REFILL_RD{0x42};
constexpr std::uint16_t L1D_TLB_REFILL_RD{0x4C};
constexpr std::uint16_t L1D_TLB_RD{0x4E};
constexpr std::uint16_t L2D_CACHE_RD{0x50};
constexpr std::uint16_t L2D_CACHE_REFILL_RD{0x52};
constexpr std::uint16_t BUS_ACCESS_RD{0x60};
constexpr std::uint16_t MEM_ACCESS_RD{0x66};
constexpr std::uint16_t L3D_CACHE_RD{0xA0};
constexpr std::uint16_t L3D_CACHE_REFILL_RD{0xA2};

#pragma GCC diagnostic pop

}  // namespace arm_pmuv3
| |
| class PmuEventsPrivate { |
| public: |
| PmuEventsPrivate() |
| : l1d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_REFILL), |
| l2d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_REFILL), |
| l3d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L3D_CACHE_REFILL), |
| ll_cache_miss(PERF_TYPE_RAW, arm_pmuv3::LL_CACHE_MISS), |
| l1d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_TLB_REFILL), |
| l2d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_TLB_REFILL), |
| stall_frontend(PERF_TYPE_RAW, arm_pmuv3::STALL_FRONTEND), |
| stall_backend(PERF_TYPE_RAW, arm_pmuv3::STALL_BACKEND), |
| br_mis_pred(PERF_TYPE_RAW, arm_pmuv3::BR_MIS_PRED) {} |
| |
| private: |
| friend class PmuEvents; |
| PerfEvent l1d_cache_refill; |
| PerfEvent l2d_cache_refill; |
| PerfEvent l3d_cache_refill; |
| PerfEvent ll_cache_miss; |
| PerfEvent l1d_tlb_refill; |
| PerfEvent l2d_tlb_refill; |
| PerfEvent stall_frontend; |
| PerfEvent stall_backend; |
| PerfEvent br_mis_pred; |
| }; |
| |
| PmuEvents::PmuEvents() : priv(new PmuEventsPrivate) {} |
| PmuEvents::~PmuEvents() { delete priv; } |
| |
| void PmuEvents::StartRecording() { |
| priv->l1d_cache_refill.Start(); |
| priv->l2d_cache_refill.Start(); |
| priv->l3d_cache_refill.Start(); |
| priv->ll_cache_miss.Start(); |
| priv->l1d_tlb_refill.Start(); |
| priv->l2d_tlb_refill.Start(); |
| priv->stall_frontend.Start(); |
| priv->stall_backend.Start(); |
| priv->br_mis_pred.Start(); |
| } |
| |
| void PmuEvents::StopRecording() { |
| priv->l1d_cache_refill.Stop(); |
| priv->l2d_cache_refill.Stop(); |
| priv->l3d_cache_refill.Stop(); |
| priv->ll_cache_miss.Stop(); |
| priv->l1d_tlb_refill.Stop(); |
| priv->l2d_tlb_refill.Stop(); |
| priv->stall_frontend.Stop(); |
| priv->stall_backend.Stop(); |
| priv->br_mis_pred.Stop(); |
| } |
| |
| float PmuEvents::BranchMispredictionCount() const { |
| return static_cast<float>(priv->br_mis_pred.Count()); |
| } |
| |
| float PmuEvents::FrontendStallCount() const { |
| return static_cast<float>(priv->stall_frontend.Count()); |
| } |
| |
| float PmuEvents::BackendStallCount() const { |
| return static_cast<float>(priv->stall_backend.Count()); |
| } |
| |
| float PmuEvents::L1RefillCount() const { |
| return static_cast<float>(priv->l1d_cache_refill.Count()); |
| } |
| |
| float PmuEvents::L2RefillCount() const { |
| return static_cast<float>(priv->l2d_cache_refill.Count()); |
| } |
| |
| float PmuEvents::L3RefillCount() const { |
| // Important: this was discovered in the context of the above experiments, |
| // which also tested the _RD variants of these counters. So it's possible that |
| // it's just not needed here with the default (non _RD) counters. |
| // |
| // Some CPUs implement LL_CACHE_MISS[_RD], some implement |
| // L3D_CACHE_REFILL[_RD]. It seems that either one of these two counters is |
| // zero, or they roughly both agree with each other. Therefore, taking the max |
| // of them is a reasonable way to get something more portable across various |
| // CPUs. |
| return static_cast<float>( |
| std::max(priv->l3d_cache_refill.Count(), priv->ll_cache_miss.Count())); |
| } |
| |
| float PmuEvents::L1TLBRefillCount() const { |
| return static_cast<float>(priv->l1d_tlb_refill.Count()); |
| } |
| |
| float PmuEvents::L2TLBRefillCount() const { |
| return static_cast<float>(priv->l2d_tlb_refill.Count()); |
| } |
| |
| } // namespace ruy |