// Copyright 2022 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <assert.h>
#include <lib/affine/ratio.h>
#include <lib/console.h>
#include <zircon/compiler.h>
#include <zircon/types.h>
#include <arch/mp.h>
#include <arch/x86.h>
#include <arch/x86/apic.h>
#include <arch/x86/feature.h>
#include <fbl/alloc_checker.h>
#include <fbl/array.h>
#include <ktl/array.h>
#include <ktl/atomic.h>
#include <ktl/bit.h>
#include <ktl/iterator.h>
#include <ktl/limits.h>
/*
* MSR Benchmark:
*
* This benchmark attempts to measure the cost of reading and writing MSR
* registers (specifically, the TSC Deadline register used to implement timers
* on x64), and the effect that doing so might have on other CPUs' performance.
*
* These measurements are meant to serve two purposes:
*
* 1) To compare the relative performance of MSR reads/writes across
* 1a) Native HW environments (e.g. running on a 'host');
* 1b) Guest VM environments running directly inside of a host; and
* 1c) Nested guest VM environments (e.g. a guest inside of a guest inside of a
* host).
* 2) To see if reading/writing MSR registers on one CPU has an effect on other
* CPUs.
*
* #1 helps us to understand the cost of MSR access in a VM environment, while
* #2 helps us to understand whether a VM environment's implementation of MSR
* access affects other CPUs. We expected that it would not, but VMs can be
* tricky (esp. nested VMs).
*
* The structure of the benchmark is as follows:
*
* We will take measurements over a number of stages on all of the currently
* online CPUs. One of the online CPUs is considered to be the "primary" CPU,
* while the others are considered to be "secondaries". Each stage has two
* "actions" it will perform, one for the primary CPU, and another for the
* secondaries. During the measurement for a stage, each CPU will disable
* interrupts, and then see how many times it can complete its assigned action
* within a fixed measurement interval.
*
* During the first stage, all of the CPU actions will consist of simple
* arithmetic in order to establish a baseline. Subsequent stages will consist
* of tests of MSR register reads and writes, split into two phases. In the
* first phase the primary CPU will perform MSR reads/writes, while the
* secondaries run the arithmetic action. In the second phase, all of the CPUs
* will perform the MSR read/writes performed by the primary CPU in the first
* phase.
*
* After taking measurements for each stage the test threads shut down and the
* results are printed. If MSR reads/writes do not have an effect on other
* CPUs, we expect the arithmetic numbers for the secondaries to be basically
* unchanged from the baseline established in the first stage while the primary
* CPU is performing MSR accesses. Likewise, if MSR accesses have no effect on
* other CPUs, we expect all CPUs to show the same MSR performance when running
* concurrently as the primary CPU did when it was the only CPU performing MSR
* accesses.
*
* The console thread is used to sequence the benchmarks, but is not actually
* responsible for taking any measurements. It creates one thread per active
* CPU, each of which runs with the default weight and has hard affinity for one
* of the currently active CPUs. Each of these threads will
* spin-sleep until the console thread tells them to start the next measurement
* stage.
*
* At that point in time, all of the threads become more aggressive in their
* spinning behavior. Upon realizing that the stage has started, each CPU
* disables interrupts, and then each secondary CPU signals to the primary that
* they are ready to start before spin-waiting on the signal from the primary
* CPU to start.
*
* The primary spin-waits for the secondaries to become ready, then assigns a
* deadline for the stage, and finally signals to everyone that the measurement
* is ready to start. Each thread then:
* 1) Counts the number of times it is able to make it through its stage's
* measurement action before the deadline.
* 2) Records the result.
* 3) Signals to the console thread that it is finished.
* 4) Re-enables interrupts.
* 5) And finally waits for the console thread to tell it to start the next
* stage.
*
* Once all of the measurements have been taken, the measurement threads exit,
* the console thread prints the results, and finally cleans up all of the test
* resources.
*
*/
namespace {
class BenchmarkState;
// The structure which defines the name of, and actions for, each
// measurement stage.
struct TestStage {
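// An Action is the body a CPU runs repeatedly during a measurement; it takes
// two arbitrary operands (used by the arithmetic action) and returns a value.
// An EnabledTest reports whether the stage's actions are supported on the
// current machine.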
using Action = uint64_t (*)(uint64_t a, uint64_t b);
using EnabledTest = bool (*)();
TestStage(const char* _name, Action _primary_action, Action _secondary_action,
EnabledTest _enabled_test)
: name{_name},
primary_action{_primary_action},
secondary_action{_secondary_action},
enabled_test(_enabled_test) {}
bool enabled() const { return enabled_test(); }
const char* const name;
const Action primary_action;
const Action secondary_action;
const EnabledTest enabled_test;
};
// The structure which holds the result for a stage. Specifically, the start
// time, end time, and number of times that a CPU managed to execute its action
// during the stage. When results are printed, they are normalized to show the
// number of actions/second the CPU managed to execute.
struct StageResults {
zx_ticks_t start{0};
zx_ticks_t end{0};
size_t count{0};
};
// The arithmetic action just does some simple adds and multiplies before
// exiting. Note that we need to flag our accumulator as volatile in order to
// convince the compiler to not simply optimize away this operation.
static uint64_t ArithmeticAction(uint64_t a, uint64_t b) {
static constexpr uint32_t kCycles = 1 << 10;
volatile uint64_t acc = 0;
for (uint32_t i = 0; i < kCycles; ++i) {
acc += a;
acc *= b;
}
return acc;
}
// Read the TSC Deadline register 256 times.
static uint64_t TscDeadlineReadAction(uint64_t a, uint64_t b) {
static constexpr uint32_t kCycles = 1 << 8;
for (uint32_t i = 0; i < kCycles; ++i) {
[[maybe_unused]] volatile const uint64_t val = read_msr(X86_MSR_IA32_TSC_DEADLINE);
}
return 0;
}
// Read the TSC Deadline register, then write to it 256 times before finally
// restoring it to the initially read value.
static uint64_t TscDeadlineWriteAction(uint64_t a, uint64_t b) {
static constexpr uint32_t kCycles = 1 << 8;
const uint64_t original = read_msr(X86_MSR_IA32_TSC_DEADLINE);
for (uint32_t i = 0; i < kCycles; ++i) {
write_msr(X86_MSR_IA32_TSC_DEADLINE, original + i + 1);
}
write_msr(X86_MSR_IA32_TSC_DEADLINE, original);
return original;
}
// Read the LVT Timer Interrupt control register 256 times.
static uint64_t LvtTimerReadAction(uint64_t a, uint64_t b) {
static constexpr uint32_t kCycles = 1 << 8;
for (uint32_t i = 0; i < kCycles; ++i) {
[[maybe_unused]] volatile const uint64_t val = read_msr(X86_MSR_IA32_X2APIC_LVT_TIMER);
}
return 0;
}
// Read the LVT Timer Interrupt control register, then write to it toggling the
// Masked bit 256 times. Make sure that we also back up and restore the value in
// the TSC_DEADLINE register in the process. When we perform a write to the
// timer interrupt control register, it will disable any armed deadline. We can
// re-arm the deadline by writing to the deadline register again.
static uint64_t LvtTimerWriteAction(uint64_t a, uint64_t b) {
static constexpr uint32_t kCycles = 1 << 8;
static constexpr uint64_t kMaskBit = 0x10000; // Intel SW Dev Manual, Vol 3, section 10.5.1
//
const uint64_t old_deadline = read_msr(X86_MSR_IA32_TSC_DEADLINE);
const uint64_t original = read_msr(X86_MSR_IA32_X2APIC_LVT_TIMER);
uint64_t val = original;
for (uint32_t i = 0; i < kCycles; ++i) {
val ^= kMaskBit;
write_msr(X86_MSR_IA32_X2APIC_LVT_TIMER, val);
}
write_msr(X86_MSR_IA32_X2APIC_LVT_TIMER, original);
// Make sure we put an explicit MFENCE in-between the write to the timer
// interrupt control register and the deadline register. If the timer write
// hits the register after the deadline write, it will disable the armed
// deadline.
arch::DeviceMemoryBarrier();
write_msr(X86_MSR_IA32_TSC_DEADLINE, old_deadline);
return original;
}
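// Feature tests used to decide whether a given stage can run on this machine.
// The TSC deadline stages require TSC deadline support, the LVT timer read
// stages require the local APIC to be in x2APIC mode (so that the LVT timer
// register is reachable via MSR), and the LVT timer write stages require both,
// since they also save and restore the TSC deadline register.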
static bool enable_tscd() { return x86_feature_test(X86_FEATURE_TSC_DEADLINE); }
static bool enable_lvtt_rd() { return is_x2apic_enabled(); }
static bool enable_lvtt_wr() { return is_x2apic_enabled() && enable_tscd(); }
// The definitions of each benchmark stage.
static const ktl::array kStages{
TestStage{"basic arithmetic", ArithmeticAction, ArithmeticAction, []() { return true; }},
TestStage{"primary TSCD Rd", TscDeadlineReadAction, ArithmeticAction, enable_tscd},
TestStage{"primary TSCD Wr", TscDeadlineWriteAction, ArithmeticAction, enable_tscd},
TestStage{"all TSCD Rd", TscDeadlineReadAction, TscDeadlineReadAction, enable_tscd},
TestStage{"all TSCD Wr", TscDeadlineWriteAction, TscDeadlineWriteAction, enable_tscd},
TestStage{"primary LVTT Rd", LvtTimerReadAction, ArithmeticAction, enable_lvtt_rd},
TestStage{"primary LVTT Wr", LvtTimerWriteAction, ArithmeticAction, enable_lvtt_wr},
TestStage{"all LVTT Rd", LvtTimerReadAction, LvtTimerReadAction, enable_lvtt_rd},
TestStage{"all LVTT Wr", LvtTimerWriteAction, LvtTimerWriteAction, enable_lvtt_wr},
};
// A structure which holds a CPU's context. Mostly, this holds the state for a
// CPU's thread, and the results for that CPU's measurements.
struct CpuContext {
~CpuContext() { DEBUG_ASSERT(thread == nullptr); }
zx_status_t Init(BenchmarkState* _owner, cpu_num_t _cpu_id, bool _is_primary,
thread_start_routine entry, void* arg) {
DEBUG_ASSERT(thread == nullptr);
DEBUG_ASSERT(cpu_id == INVALID_CPU);
cpu_id = _cpu_id;
is_primary = _is_primary;
owner = _owner;
char name[ZX_MAX_NAME_LEN];
snprintf(name, sizeof(name), "BenchmarkState %u", cpu_id);
// Create our thread, then set its hard affinity to its assigned CPU before
// allowing it to run.
thread = Thread::Create(name, entry, arg, DEFAULT_PRIORITY);
if (thread == nullptr) {
return ZX_ERR_NO_MEMORY;
}
thread->SetCpuAffinity(cpu_num_to_mask(cpu_id));
thread->Resume();
return ZX_OK;
}
void Cleanup() {
if (thread != nullptr) {
int junk;
thread->Join(&junk, ZX_TIME_INFINITE);
thread = nullptr;
}
}
BenchmarkState* owner{nullptr};
Thread* thread{nullptr};
cpu_num_t cpu_id{INVALID_CPU};
bool is_primary{false};
ktl::array<StageResults, ktl::size(kStages)> results;
};
// The top-level state for the benchmark. This holds each of the CPU contexts,
// as well as the atomic variables used for advancing through each of the
// stages and for synchronizing the CPU test threads during the measurement
// phase of each stage.
class BenchmarkState {
public:
constexpr BenchmarkState() = default;
~BenchmarkState() { Cleanup(); }
int Run();
int RunContext(CpuContext& ctx);
private:
static inline constexpr zx_duration_t kMeasurementTime = ZX_SEC(1);
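// Spin-sleep until either the console thread has advanced the stage gate to at
// least |gate_id|, or a shutdown has been requested. Returns true if the
// caller should proceed with the stage, or false if it should bail out.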
bool WaitForGate(size_t gate_id) {
while (!shutdown_now_.load() && (stage_gate_.load() < gate_id)) {
Thread::Current::SleepRelative(ZX_MSEC(1));
}
return !shutdown_now_.load();
}
void Cleanup() {
// Release any running threads from whatever they are doing.
shutdown_now_.store(true);
// Then clean them all up.
for (auto& ctx : cpu_contexts_) {
ctx.Cleanup();
}
}
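// Per-CPU thread state, plus the shared synchronization state. |stage_gate_|
// is advanced by the console thread to release the test threads into the next
// stage, |ready_to_start_count_| and |finished_count_| let the test threads
// rendezvous at the start and end of each measurement, and |ticks_deadline_|
// is the measurement deadline published by the primary CPU.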
fbl::Array<CpuContext> cpu_contexts_;
ktl::atomic<bool> shutdown_now_{false};
ktl::atomic<size_t> stage_gate_{0};
ktl::atomic<size_t> ready_to_start_count_{0};
ktl::atomic<size_t> finished_count_{0};
ktl::atomic<zx_ticks_t> ticks_deadline_{0};
};
int BenchmarkState::Run() {
// Figure out how many CPUs we have currently online.
cpu_mask_t online_cpus = mp_get_online_mask();
size_t online_count = ktl::popcount(online_cpus);
// Allocate enough context storage for the online CPUs.
fbl::AllocChecker ac;
cpu_contexts_.reset(new (&ac) CpuContext[online_count], online_count);
if (!ac.check()) {
printf("Failed to allocate %zu CpuContexts (mask 0x%08x)\n", online_count, online_cpus);
return -1;
}
// Now start each of the test threads.
cpu_num_t cpu_id = 0;
bool is_primary = true;
size_t ndx = 0;
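// Walk the online mask one bit at a time, creating a measurement thread for
// each online CPU. The first online CPU found is treated as the primary.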
while (online_cpus) {
if (online_cpus & 0x1) {
zx_status_t status = cpu_contexts_[ndx].Init(
this, cpu_id, is_primary,
[](void* _ctx) -> int {
CpuContext& ctx = *(reinterpret_cast<CpuContext*>(_ctx));
return ctx.owner->RunContext(ctx);
},
&cpu_contexts_[ndx]);
if (status != ZX_OK) {
printf("Failed to initialize CpuContext for cpu %u (status %d)\n", cpu_id, status);
return -1;
}
++ndx;
// Only the first online CPU we find is considered to be the primary.
is_primary = false;
}
++cpu_id;
online_cpus >>= 1;
}
// Cycle all of the test threads through all of the stages.
for (size_t stage = 0; stage < kStages.size(); ++stage) {
// Reset the stage sync state, and report which stage we are about to measure.
const TestStage& s = kStages[stage];
ready_to_start_count_.store(0);
finished_count_.store(0);
printf("%s stage \"%s\".\n", s.enabled() ? "Measuring" : "Skipping", s.name);
Thread::Current::SleepRelative(ZX_MSEC(10));
// Signal the threads that they may start the next measurement stage, and
// wait until they have finished.
stage_gate_.store(stage + 1);
while (finished_count_.load() < cpu_contexts_.size()) {
Thread::Current::SleepRelative(ZX_MSEC(1));
}
}
// Print out results and exit, cleaning up as we go.
printf(" %22s |", "Stage");
for (const auto& ctx : cpu_contexts_) {
printf(" %cCPU %2u |", ctx.is_primary ? '*' : ' ', ctx.cpu_id);
}
printf("\n------------------------+");
for (size_t i = 0; i < cpu_contexts_.size(); ++i) {
printf("--------------+");
}
printf("\n");
for (size_t stage = 0; stage < kStages.size(); ++stage) {
if (kStages[stage].enabled() == false) {
continue;
}
printf(" %22s |", kStages[stage].name);
for (const auto& ctx : cpu_contexts_) {
DEBUG_ASSERT(kStages.size() == ctx.results.size());
const StageResults& result = ctx.results[stage];
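// Convert the measured ticks to time, then normalize the raw count to
// actions/second (count scaled by one second over the measured duration).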
const zx_ticks_t ticks_duration = result.end - result.start;
const zx_time_t time_duration = platform_get_ticks_to_time_ratio().Scale(ticks_duration);
if ((time_duration > 0) && (time_duration <= ktl::numeric_limits<uint32_t>::max())) {
printf(" %12ld |",
affine::Ratio{ZX_SEC(1), static_cast<uint32_t>(time_duration)}.Scale(result.count));
} else {
printf(" %12s |", "???");
}
}
printf("\n");
}
return 0;
}
int BenchmarkState::RunContext(CpuContext& ctx) {
const size_t cpu_count = cpu_contexts_.size();
DEBUG_ASSERT(cpu_count >= 1);
// Run through all of the measurement stages, syncing up with the other
// threads at each stage.
for (size_t stage = 0; stage < kStages.size(); ++stage) {
// Wait until the control thread tells us it is OK to shut interrupts off
// and to start the next measurement. If something goes wrong, this wait
// will return false, and we should bail out immediately.
if (!WaitForGate(stage + 1)) {
return -1;
}
// It is time to take the next stage measurements. Turn off interrupts for
// the duration of the measurement cycle.
{
InterruptDisableGuard irqd;
// Only take the measurement if this stage is actually enabled.
if (kStages[stage].enabled()) {
// Are we the "primary" CPU? If so, wait until all of the secondary
// CPUs are ready to go. Then set up the deadline for the measurement
// cycle and join the group of ready threads (signaling that the
// measurement is ready to start).
//
// If we are a "secondary" CPU, simply indicate that we are ready to go,
// and wait for all of the other CPUs to be ready as well.
TestStage::Action action =
ctx.is_primary ? kStages[stage].primary_action : kStages[stage].secondary_action;
if (ctx.is_primary) {
while (ready_to_start_count_.load() < (cpu_count - 1)) {
arch::Yield();
}
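// All of the secondaries are ready. Convert the measurement interval from
// time to ticks and publish the deadline for everyone to use.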
zx_ticks_t ticks = platform_get_ticks_to_time_ratio().Inverse().Scale(kMeasurementTime);
ticks_deadline_.store(current_ticks() + ticks);
ready_to_start_count_.fetch_add(1);
} else {
ready_to_start_count_.fetch_add(1);
while (ready_to_start_count_.load() < cpu_count) {
arch::Yield();
}
}
// OK, time to take the actual measurement. See how many times we can
// make it through the measurement action before we hit the deadline, then
// record the start/end times, as well as the count.
size_t count = 0;
zx_ticks_t end = 0;
zx_ticks_t deadline = ticks_deadline_.load();
zx_ticks_t start = current_ticks();
do {
action(0xc235754ef00c463d, 0x9ba8562ddc0932cf);
++count;
} while ((end = current_ticks()) < deadline);
// Record our results.
ctx.results[stage].start = start;
ctx.results[stage].end = end;
ctx.results[stage].count = count;
}
// Signal that we are finished, then wait until everyone else is as well.
finished_count_.fetch_add(1);
while (finished_count_.load() < cpu_count) {
arch::Yield();
}
}
}
return 0;
}
int msr_bench(int argc, const cmd_args* argv, uint32_t flags) {
fbl::AllocChecker ac;
ktl::unique_ptr<BenchmarkState> benchmark = fbl::make_unique_checked<BenchmarkState>(&ac);
if (!ac.check()) {
printf("Failed to allocate benchmark context!\n");
return -1;
}
return benchmark->Run();
}
STATIC_COMMAND_START
STATIC_COMMAND("msr_bench", "MSR bechmarks", msr_bench)
STATIC_COMMAND_END(msr_x64)
} // namespace