| // Copyright 2022 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include <assert.h> |
| #include <lib/affine/ratio.h> |
| #include <lib/console.h> |
| #include <zircon/compiler.h> |
| #include <zircon/types.h> |
| |
| #include <arch/mp.h> |
| #include <arch/x86.h> |
| #include <arch/x86/apic.h> |
| #include <arch/x86/feature.h> |
| #include <fbl/alloc_checker.h> |
| #include <fbl/array.h> |
| #include <ktl/array.h> |
| #include <ktl/atomic.h> |
| #include <ktl/bit.h> |
| #include <ktl/iterator.h> |
| #include <ktl/limits.h> |
| |
| /* |
| * MSR Benchmark: |
| * |
| * This benchmark attempts to measure the cost of reading and writing MSR |
| * registers (specifically, the TSC Deadline register used to implement timers |
| * on x64), and the effect that doing so might have on other CPUs' performance. |
| * |
| * These measurements are meant to serve two purposes: |
| * |
| * 1) To compare the relative performance of MSR reads/writes across |
| * 1a) Native HW environments (eg; running on a 'host'); |
| * 1b) Guest VM environments running directly inside of a host. |
| * 1c) Nested guest VM environments (eg; a guest inside of a guest inside of a |
| * host) |
| * 2) To see if reading/writing MSR registers on one CPU has an effect on other |
| * CPUs. |
| * |
| * #1 helps us to understand the cost of MSR access in a VM environment, while |
| * #2 helps us to understand if a VM environment's implementation of MSR access |
| * affects other CPUs. We expected that it would not, but VMs can be tricky |
| * (esp. nested VMs). |
| * |
| * The structure of the benchmark is as follows: |
| * |
| * We will take measurements across a number of stages across all currently |
| * online CPUs. One of the online CPUs is considered to be the "primary" CPU, |
| * while the others are considered to be "secondaries". Each stage has two |
| * "actions" it will perform, one for the primary CPU, and another for the |
| * secondaries. During the measurement for a stage, each CPU will disable |
| * interrupts, and then see how many times they can complete their assigned |
| * action within a fixed measurement interval. |
| * |
| * During the first stage, all of the CPU actions will consist of simple |
| * arithmetic in order to establish a baseline. Subsequent stages will consist |
| * of the tests of MSR register reads and writes, split into two phases. In the |
| * first phase the primary CPU will perform MSR reads/writes, while the |
| * secondaries run the arithmetic action. In the second phase, all of the CPUs |
| * will perform the MSR read/writes performed by the primary CPU in the first |
| * phase. |
| * |
| * After taking measurements for each stage the test threads shut down and the |
| * results are printed. If MSR reads/writes are not having an effect on other |
| * CPUs, we expect to see the arithmetic numbers for secondaries to be basically |
| * unchanged from the baseline established in the first stage when the primary |
| * CPU is performing MSR accesses. Likewise, if MSR accesses have no effect on |
| * other CPUs, we expect all CPUs to show the same MSR performance when running |
| * concurrently as the primary CPU did when it was the only CPU performing MSR |
| * accesses. |
| * |
| * The console thread is used to sequence the benchmarks, but is not actually |
| * responsible for taking any measurements. It creates one thread per-active |
| * CPU, each of which runs with default weight and has hard affinity for one of |
| * the currently active CPUs. Each of these threads will |
| * spin-sleep until the console thread tells them to start the next measurement |
| * stage. |
| * |
| * At that point in time, all of the threads become more aggressive in their |
| * spinning behavior. Once realizing that the stage has started, each CPU |
| * disables interrupts, and then each secondary CPU signals to the primary that |
| * they are ready to start before spin-waiting on the signal from the primary |
| * CPU to start. |
| * |
| * The primary spin-waits for the secondaries to become ready, then assigns a |
| * deadline for the stage, and finally signals to everyone that the measurement |
| * is ready to start. Each thread: |
| * 1) Counts the number of times they are able to make it through their stage's |
| * measurement action before the deadline. |
| * 2) Records the result. |
| * 3) Signals to the console thread that they are finished. |
| * 4) Re-enables interrupts. |
| * 5) And finally waits for the console thread to tell them to start the next |
| * stage. |
| * |
| * Once all of the measurements have been taken, the measurement threads exit, |
| * the console thread prints the results, and finally cleans up all of the test |
| * resources. |
| * |
| */ |
| |
| namespace { |
| |
| class BenchmarkState; |
| |
// Describes one measurement stage: a printable name, the action executed by
// the primary CPU, the action executed by every secondary CPU, and a predicate
// which reports whether the stage should be measured at all.
struct TestStage {
  // Signature shared by all measurement actions.
  using Action = uint64_t (*)(uint64_t a, uint64_t b);
  // Signature of the predicate consulted by enabled().
  using EnabledTest = bool (*)();

  TestStage(const char* _name, Action _primary_action, Action _secondary_action,
            EnabledTest _enabled_test)
      : name(_name),
        primary_action(_primary_action),
        secondary_action(_secondary_action),
        enabled_test(_enabled_test) {}

  // Invokes the stage's predicate on every call; the answer is not cached.
  bool enabled() const { return (*enabled_test)(); }

  const char* const name;          // Label used when reporting the stage.
  const Action primary_action;     // Action for the primary CPU.
  const Action secondary_action;   // Action for each secondary CPU.
  const EnabledTest enabled_test;  // Predicate backing enabled().
};
| |
| // The structure which holds the result for a stage. Specifically, the start |
| // time, end time, and number of times that a CPU managed to execute its action |
| // during the stage. When results are printed, they are normalized to show the |
| // number of actions/second the CPU managed to execute. |
| struct StageResults { |
| zx_ticks_t start{0}; |
| zx_ticks_t end{0}; |
| size_t count{0}; |
| }; |
| |
// Baseline action: 1024 rounds of "add a, then multiply by b" on an
// accumulator which starts at zero.  The accumulator is volatile so that the
// compiler cannot fold the loop away; each round performs the same
// read/write sequence on it.  Returns the final accumulator value (arithmetic
// wraps modulo 2^64).
static uint64_t ArithmeticAction(uint64_t a, uint64_t b) {
  static constexpr uint32_t kCycles = 1 << 10;
  volatile uint64_t acc = 0;

  uint32_t rounds_left = kCycles;
  while (rounds_left != 0) {
    acc = acc + a;
    acc = acc * b;
    --rounds_left;
  }

  return acc;
}
| |
| // Read the TSC Deadline register 256 times. |
| static uint64_t TscDeadlineReadAction(uint64_t a, uint64_t b) { |
| static constexpr uint32_t kCycles = 1 << 8; |
| |
| for (uint32_t i = 0; i < kCycles; ++i) { |
| [[maybe_unused]] volatile const uint64_t val = read_msr(X86_MSR_IA32_TSC_DEADLINE); |
| } |
| |
| return 0; |
| } |
| |
| // Read the TSC Deadline register, then write to it 256 times before finally |
| // restoring it to the initially read value. |
| static uint64_t TscDeadlineWriteAction(uint64_t a, uint64_t b) { |
| static constexpr uint32_t kCycles = 1 << 8; |
| const uint64_t original = read_msr(X86_MSR_IA32_TSC_DEADLINE); |
| |
| for (uint32_t i = 0; i < kCycles; ++i) { |
| write_msr(X86_MSR_IA32_TSC_DEADLINE, original + i + 1); |
| } |
| |
| write_msr(X86_MSR_IA32_TSC_DEADLINE, original); |
| return original; |
| } |
| |
| // Read the LVT Timer Interrupt control register 256 times. |
| static uint64_t LvtTimerReadAction(uint64_t a, uint64_t b) { |
| static constexpr uint32_t kCycles = 1 << 8; |
| |
| for (uint32_t i = 0; i < kCycles; ++i) { |
| [[maybe_unused]] volatile const uint64_t val = read_msr(X86_MSR_IA32_X2APIC_LVT_TIMER); |
| } |
| |
| return 0; |
| } |
| |
| // Read the LVT Timer Interrupt control register, then write to it toggling the |
| // Masked bit 256 times. Make sure that we also backup and restore the value in |
| // the TSC_DEADLINE register in the process. When we perform a write to the |
| // timer interrupt control register, it will disable any armed deadline. We can |
| // re-arm the deadline by writing to the deadline register again. |
| static uint64_t LvtTimerWriteAction(uint64_t a, uint64_t b) { |
| static constexpr uint32_t kCycles = 1 << 8; |
| static constexpr uint64_t kMaskBit = 0x10000; // Intel SW Dev Manual, Vol 3, section 10.5.1 |
| // |
| const uint64_t old_deadline = read_msr(X86_MSR_IA32_TSC_DEADLINE); |
| const uint64_t original = read_msr(X86_MSR_IA32_X2APIC_LVT_TIMER); |
| uint64_t val = original; |
| |
| for (uint32_t i = 0; i < kCycles; ++i) { |
| val ^= kMaskBit; |
| write_msr(X86_MSR_IA32_X2APIC_LVT_TIMER, val); |
| } |
| |
| write_msr(X86_MSR_IA32_X2APIC_LVT_TIMER, original); |
| // Make sure we put an explicit MFENCE in-between the write to the timer |
| // interrupt control register and the deadline register. If the timer write |
| // hits the register after the deadline write, it will disable the armed |
| // deadline. |
| arch::DeviceMemoryBarrier(); |
| write_msr(X86_MSR_IA32_TSC_DEADLINE, old_deadline); |
| return original; |
| } |
| |
| static bool enable_tscd() { return x86_feature_test(X86_FEATURE_TSC_DEADLINE); } |
| static bool enable_lvtt_rd() { return is_x2apic_enabled(); } |
| static bool enable_lvtt_wr() { return is_x2apic_enabled() && enable_tscd(); } |
| |
| // The definitions of each benchmark stage. |
| static const ktl::array kStages{ |
| TestStage{"basic arithmetic", ArithmeticAction, ArithmeticAction, []() { return true; }}, |
| TestStage{"primary TSCD Rd", TscDeadlineReadAction, ArithmeticAction, enable_tscd}, |
| TestStage{"primary TSCD Wr", TscDeadlineWriteAction, ArithmeticAction, enable_tscd}, |
| TestStage{"all TSCD Rd", TscDeadlineReadAction, TscDeadlineReadAction, enable_tscd}, |
| TestStage{"all TSCD Wr", TscDeadlineWriteAction, TscDeadlineWriteAction, enable_tscd}, |
| TestStage{"primary LVTT Rd", LvtTimerReadAction, ArithmeticAction, enable_lvtt_rd}, |
| TestStage{"primary LVTT Wr", LvtTimerWriteAction, ArithmeticAction, enable_lvtt_wr}, |
| TestStage{"all LVTT Rd", LvtTimerReadAction, LvtTimerReadAction, enable_lvtt_rd}, |
| TestStage{"all LVTT Wr", LvtTimerWriteAction, LvtTimerWriteAction, enable_lvtt_wr}, |
| }; |
| |
| // A structure which holds a CPUs context. Mostly, this holds the state for a |
| // CPU's thread, and the results for that CPU's measurements. |
| struct CpuContext { |
| ~CpuContext() { DEBUG_ASSERT(thread == nullptr); } |
| |
| zx_status_t Init(BenchmarkState* _owner, cpu_num_t _cpu_id, bool _is_primary, |
| thread_start_routine entry, void* arg) { |
| DEBUG_ASSERT(thread == nullptr); |
| DEBUG_ASSERT(cpu_id == INVALID_CPU); |
| |
| cpu_id = _cpu_id; |
| is_primary = _is_primary; |
| owner = _owner; |
| |
| char name[ZX_MAX_NAME_LEN]; |
| snprintf(name, sizeof(name), "BenchmarkState %u", cpu_id); |
| |
| // Create our thread, then set its hard affinity its assigned CPU before |
| // allowing it to run. |
| thread = Thread::Create(name, entry, arg, DEFAULT_PRIORITY); |
| if (thread == nullptr) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| thread->SetCpuAffinity(cpu_num_to_mask(cpu_id)); |
| thread->Resume(); |
| |
| return ZX_OK; |
| } |
| |
| void Cleanup() { |
| if (thread != nullptr) { |
| int junk; |
| thread->Join(&junk, ZX_TIME_INFINITE); |
| thread = nullptr; |
| } |
| } |
| |
| BenchmarkState* owner{nullptr}; |
| Thread* thread{nullptr}; |
| cpu_num_t cpu_id{INVALID_CPU}; |
| bool is_primary{false}; |
| ktl::array<StageResults, ktl::size(kStages)> results; |
| }; |
| |
| // The top level state for the benchmark. This holds each of the CPU contexts, |
| // as well as the atomic variables used for advancing through each the stages, |
| // as well as synchronizing the CPU test threads during the measurement phase of |
| // each stage. |
| class BenchmarkState { |
| public: |
| constexpr BenchmarkState() = default; |
| ~BenchmarkState() { Cleanup(); } |
| |
| int Run(); |
| int RunContext(CpuContext& ctx); |
| |
| private: |
| static inline constexpr zx_duration_t kMeasurementTime = ZX_SEC(1); |
| |
| bool WaitForGate(size_t gate_id) { |
| while (!shutdown_now_.load() && (stage_gate_.load() < gate_id)) { |
| Thread::Current::SleepRelative(ZX_MSEC(1)); |
| } |
| |
| return !shutdown_now_.load(); |
| } |
| |
| void Cleanup() { |
| // Release any running threads from whatever they are doing. |
| shutdown_now_.store(true); |
| |
| // Then clean them all up. |
| for (auto& ctx : cpu_contexts_) { |
| ctx.Cleanup(); |
| } |
| } |
| |
| fbl::Array<CpuContext> cpu_contexts_; |
| ktl::atomic<bool> shutdown_now_{false}; |
| ktl::atomic<size_t> stage_gate_{0}; |
| ktl::atomic<size_t> ready_to_start_count_{0}; |
| ktl::atomic<size_t> finished_count_{0}; |
| ktl::atomic<zx_ticks_t> ticks_deadline_{0}; |
| }; |
| |
| int BenchmarkState::Run() { |
| // Figure out how many CPUs we have currently online. |
| cpu_mask_t online_cpus = mp_get_online_mask(); |
| size_t online_count = ktl::popcount(online_cpus); |
| |
| // Allocate enough context storage for the online CPUs. |
| fbl::AllocChecker ac; |
| cpu_contexts_.reset(new (&ac) CpuContext[online_count], online_count); |
| if (!ac.check()) { |
| printf("Failed to allocate %zu CpuContexts (mask 0x%08x)\n", online_count, online_cpus); |
| return -1; |
| } |
| |
| // Now start each of the test threads. |
| cpu_num_t cpu_id = 0; |
| bool is_primary = true; |
| size_t ndx = 0; |
| while (online_cpus) { |
| if (online_cpus & 0x1) { |
| zx_status_t status = cpu_contexts_[ndx].Init( |
| this, cpu_id, is_primary, |
| [](void* _ctx) -> int { |
| CpuContext& ctx = *(reinterpret_cast<CpuContext*>(_ctx)); |
| return ctx.owner->RunContext(ctx); |
| }, |
| &cpu_contexts_[ndx]); |
| |
| if (status != ZX_OK) { |
| printf("Failed to initialize CpuContext for cpu %u (status %d)\n", cpu_id, status); |
| return -1; |
| } |
| ++ndx; |
| } |
| |
| ++cpu_id; |
| is_primary = false; |
| online_cpus >>= 1; |
| } |
| |
| // Cycle all of test threads through all of the stages. |
| for (size_t stage = 0; stage < kStages.size(); ++stage) { |
| // Reset the stage sync state, and report which stage we are about to measure. |
| const TestStage& s = kStages[stage]; |
| ready_to_start_count_.store(0); |
| finished_count_.store(0); |
| printf("%s stage \"%s\".\n", s.enabled() ? "Measuring" : "Skipping", s.name); |
| Thread::Current::SleepRelative(ZX_MSEC(10)); |
| |
| // Signal the threads that they may start the next measurement stage, and |
| // wait until they have finished. |
| stage_gate_.store(stage + 1); |
| while (finished_count_.load() < cpu_contexts_.size()) { |
| Thread::Current::SleepRelative(ZX_MSEC(1)); |
| } |
| } |
| |
| // Print out results and exit, cleaning up as we go. |
| printf(" %22s |", "Stage"); |
| for (const auto& ctx : cpu_contexts_) { |
| printf(" %cCPU %2u |", ctx.is_primary ? '*' : ' ', ctx.cpu_id); |
| } |
| |
| printf("\n------------------------+"); |
| for (size_t i = 0; i < cpu_contexts_.size(); ++i) { |
| printf("--------------+"); |
| } |
| printf("\n"); |
| |
| for (size_t stage = 0; stage < kStages.size(); ++stage) { |
| if (kStages[stage].enabled() == false) { |
| continue; |
| } |
| printf(" %22s |", kStages[stage].name); |
| for (const auto& ctx : cpu_contexts_) { |
| DEBUG_ASSERT(kStages.size() == ctx.results.size()); |
| |
| const StageResults& result = ctx.results[stage]; |
| const zx_ticks_t ticks_duration = result.end - result.start; |
| const zx_time_t time_duration = platform_get_ticks_to_time_ratio().Scale(ticks_duration); |
| if ((time_duration > 0) && (time_duration <= ktl::numeric_limits<uint32_t>::max())) { |
| printf(" %12ld |", |
| affine::Ratio{ZX_SEC(1), static_cast<uint32_t>(time_duration)}.Scale(result.count)); |
| } else { |
| printf(" %12s |", "???"); |
| } |
| } |
| printf("\n"); |
| } |
| |
| return 0; |
| } |
| |
| int BenchmarkState::RunContext(CpuContext& ctx) { |
| const size_t cpu_count = cpu_contexts_.size(); |
| DEBUG_ASSERT(cpu_count >= 1); |
| |
| // Run through all of the measurement stages, syncing up with the other |
| // threads at each stage. |
| for (size_t stage = 0; stage < std::size(kStages); ++stage) { |
| // Wait until the control thread tells us it is OK to shut interrupts off |
| // and to start the next measurement. If something goes wrong, this wait |
| // will return false, and we should bail out immediately. |
| if (!WaitForGate(stage + 1)) { |
| return -1; |
| } |
| |
| // It is time to take the next stage measurements. Turn off interrupts for |
| // the duration of the measurement cycle. |
| { |
| InterruptDisableGuard irqd; |
| |
| // Only take the measurement if this stage is actually enabled. |
| if (kStages[stage].enabled()) { |
| // Are we the "primary" CPU? If so, wait until all of the secondary |
| // CPUs are ready to go. Then set up the deadline for the measurement |
| // cycle and join the group of ready threads (signaling that the |
| // measurement is ready to start). |
| // |
| // If we are a "secondary" CPU, simply indicate that we are ready to go, |
| // and wait for all of the other CPUs to be ready as well. |
| TestStage::Action action = |
| ctx.is_primary ? kStages[stage].primary_action : kStages[stage].secondary_action; |
| if (ctx.is_primary) { |
| while (ready_to_start_count_.load() < (cpu_count - 1)) { |
| arch::Yield(); |
| } |
| |
| zx_ticks_t ticks = platform_get_ticks_to_time_ratio().Inverse().Scale(kMeasurementTime); |
| ticks_deadline_.store(current_ticks() + ticks); |
| ready_to_start_count_.fetch_add(1); |
| } else { |
| ready_to_start_count_.fetch_add(1); |
| while (ready_to_start_count_.load() < cpu_count) { |
| arch::Yield(); |
| } |
| } |
| |
| // OK, time to take the actual measurement. See how many times we can |
| // make it through the measurement action before we hit the deadline, then |
| // record the start/end times, as well as the count. |
| size_t count = 0; |
| zx_ticks_t end = 0; |
| zx_ticks_t deadline = ticks_deadline_.load(); |
| zx_ticks_t start = current_ticks(); |
| |
| do { |
| action(0xc235754ef00c463d, 0x9ba8562ddc0932cf); |
| ++count; |
| } while ((end = current_ticks()) < deadline); |
| |
| // Record our results; |
| ctx.results[stage].start = start; |
| ctx.results[stage].end = end; |
| ctx.results[stage].count = count; |
| } |
| |
| // Signal that we are finished, then wait until everyone else is as well. |
| finished_count_.fetch_add(1); |
| while (finished_count_.load() < cpu_count) { |
| arch::Yield(); |
| } |
| } |
| } |
| |
| return 0; |
| } |
| |
| int msr_bench(int argc, const cmd_args* argv, uint32_t flags) { |
| fbl::AllocChecker ac; |
| ktl::unique_ptr<BenchmarkState> benchmark = fbl::make_unique_checked<BenchmarkState>(&ac); |
| |
| if (!ac.check()) { |
| printf("Failed to allocate benchmark context!\n"); |
| return -1; |
| } |
| |
| return benchmark->Run(); |
| } |
| |
| STATIC_COMMAND_START |
| STATIC_COMMAND("msr_bench", "MSR bechmarks", msr_bench) |
| STATIC_COMMAND_END(msr_x64) |
| |
| } // namespace |