blob: b22a3314b415b440a98d7a4673faa1441f9fa198 [file] [log] [blame]
/*
* QEMU Windows Hypervisor Platform accelerator (WHPX)
*
* Copyright Microsoft Corp. 2017
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "exec/gdbstub.h"
#include "qemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/i386/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>
#include "whpx-internal.h"
#include "whpx-accel-ops.h"
#include <WinHvPlatform.h>
#include <WinHvEmulation.h>
#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
static const WHV_REGISTER_NAME whpx_register_names[] = {
/* X64 General purpose registers */
WHvX64RegisterRax,
WHvX64RegisterRcx,
WHvX64RegisterRdx,
WHvX64RegisterRbx,
WHvX64RegisterRsp,
WHvX64RegisterRbp,
WHvX64RegisterRsi,
WHvX64RegisterRdi,
WHvX64RegisterR8,
WHvX64RegisterR9,
WHvX64RegisterR10,
WHvX64RegisterR11,
WHvX64RegisterR12,
WHvX64RegisterR13,
WHvX64RegisterR14,
WHvX64RegisterR15,
WHvX64RegisterRip,
WHvX64RegisterRflags,
/* X64 Segment registers */
WHvX64RegisterEs,
WHvX64RegisterCs,
WHvX64RegisterSs,
WHvX64RegisterDs,
WHvX64RegisterFs,
WHvX64RegisterGs,
WHvX64RegisterLdtr,
WHvX64RegisterTr,
/* X64 Table registers */
WHvX64RegisterIdtr,
WHvX64RegisterGdtr,
/* X64 Control Registers */
WHvX64RegisterCr0,
WHvX64RegisterCr2,
WHvX64RegisterCr3,
WHvX64RegisterCr4,
WHvX64RegisterCr8,
/* X64 Debug Registers */
/*
* WHvX64RegisterDr0,
* WHvX64RegisterDr1,
* WHvX64RegisterDr2,
* WHvX64RegisterDr3,
* WHvX64RegisterDr6,
* WHvX64RegisterDr7,
*/
/* X64 Floating Point and Vector Registers */
WHvX64RegisterXmm0,
WHvX64RegisterXmm1,
WHvX64RegisterXmm2,
WHvX64RegisterXmm3,
WHvX64RegisterXmm4,
WHvX64RegisterXmm5,
WHvX64RegisterXmm6,
WHvX64RegisterXmm7,
WHvX64RegisterXmm8,
WHvX64RegisterXmm9,
WHvX64RegisterXmm10,
WHvX64RegisterXmm11,
WHvX64RegisterXmm12,
WHvX64RegisterXmm13,
WHvX64RegisterXmm14,
WHvX64RegisterXmm15,
WHvX64RegisterFpMmx0,
WHvX64RegisterFpMmx1,
WHvX64RegisterFpMmx2,
WHvX64RegisterFpMmx3,
WHvX64RegisterFpMmx4,
WHvX64RegisterFpMmx5,
WHvX64RegisterFpMmx6,
WHvX64RegisterFpMmx7,
WHvX64RegisterFpControlStatus,
WHvX64RegisterXmmControlStatus,
/* X64 MSRs */
WHvX64RegisterEfer,
#ifdef TARGET_X86_64
WHvX64RegisterKernelGsBase,
#endif
WHvX64RegisterApicBase,
/* WHvX64RegisterPat, */
WHvX64RegisterSysenterCs,
WHvX64RegisterSysenterEip,
WHvX64RegisterSysenterEsp,
WHvX64RegisterStar,
#ifdef TARGET_X86_64
WHvX64RegisterLstar,
WHvX64RegisterCstar,
WHvX64RegisterSfmask,
#endif
/* Interrupt / Event Registers */
/*
* WHvRegisterPendingInterruption,
* WHvRegisterInterruptState,
* WHvRegisterPendingEvent0,
* WHvRegisterPendingEvent1
* WHvX64RegisterDeliverabilityNotifications,
*/
};
struct whpx_register_set {
WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
/*
* The current implementation of instruction stepping sets the TF flag
* in RFLAGS, causing the CPU to raise an INT1 after each instruction.
* This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
*
* This approach has a few limitations:
* 1. Stepping over a PUSHF/SAHF instruction will save the TF flag
* along with the other flags, possibly restoring it later. It would
* result in another INT1 when the flags are restored, triggering
* a stop in gdb that could be cleared by doing another step.
*
* Stepping over a POPF/LAHF instruction will let it overwrite the
* TF flags, ending the stepping mode.
*
* 2. Stepping over an instruction raising an exception (e.g. INT, DIV,
* or anything that could result in a page fault) will save the flags
* to the stack, clear the TF flag, and let the guest execute the
* handler. Normally, the guest will restore the original flags,
* that will continue single-stepping.
*
* 3. Debuggers running on the guest may wish to set TF to do instruction
* stepping. INT1 events generated by it would be intercepted by us,
* as long as the gdb is connected to QEMU.
*
* In practice this means that:
* 1. Stepping through flags-modifying instructions may cause gdb to
* continue or stop in unexpected places. This will be fully recoverable
* and will not crash the target.
*
* 2. Stepping over an instruction that triggers an exception will step
* over the exception handler, not into it.
*
* 3. Debugging the guest via gdb, while running debugger on the guest
* at the same time may lead to unexpected effects. Removing all
* breakpoints set via QEMU will prevent any further interference
* with the guest-level debuggers.
*
* The limitations can be addressed as shown below:
* 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
* stepping through them. The exact semantics of the instructions is
* defined in the "Combined Volume Set of Intel 64 and IA-32
* Architectures Software Developer's Manuals", however it involves a
* fair amount of corner cases due to compatibility with real mode,
* virtual 8086 mode, and differences between 64-bit and 32-bit modes.
*
* 2. We could step into the guest's exception handlers using the following
* sequence:
* a. Temporarily enable catching of all exception types via
* whpx_set_exception_exit_bitmap().
* b. Once an exception is intercepted, read the IDT/GDT and locate
* the original handler.
* c. Patch the original handler, injecting an INT3 at the beginning.
* d. Update the exception exit bitmap to only catch the
* WHvX64ExceptionTypeBreakpointTrap exception.
* e. Let the affected CPU run in the exclusive mode.
* f. Restore the original handler and the exception exit bitmap.
* Note that handling all corner cases related to IDT/GDT is harder
* than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
* rough idea.
*
* 3. In order to properly support guest-level debugging in parallel with
* the QEMU-level debugging, we would need to be able to pass some INT1
* events to the guest. This could be done via the following methods:
* a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
* it seems to only work for interrupts and not software
* exceptions.
* b. Locating and patching the original handler by parsing IDT/GDT.
* This involves relatively complex logic outlined in the previous
* paragraph.
* c. Emulating the exception invocation (i.e. manually updating RIP,
* RFLAGS, and pushing the old values to stack). This is even more
* complicated than the previous option, since it involves checking
* CPL, gate attributes, and doing various adjustments depending
* on the current CPU mode, whether the CPL is changing, etc.
*/
typedef enum WhpxStepMode {
WHPX_STEP_NONE = 0,
/* Halt other VCPUs */
WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
struct whpx_vcpu {
WHV_EMULATOR_HANDLE emulator;
bool window_registered;
bool interruptable;
bool ready_for_pic_interrupt;
uint64_t tpr;
uint64_t apic_base;
bool interruption_pending;
/* Must be the last field as it may have a tail */
WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
static bool whpx_has_xsave(void)
{
return whpx_xsave_cap.XsaveSupport;
}
/*
* VP support
*/
static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
{
return (struct whpx_vcpu *)cpu->hax_vcpu;
}
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
int r86)
{
WHV_X64_SEGMENT_REGISTER hs;
unsigned flags = qs->flags;
hs.Base = qs->base;
hs.Limit = qs->limit;
hs.Selector = qs->selector;
if (v86) {
hs.Attributes = 0;
hs.SegmentType = 3;
hs.Present = 1;
hs.DescriptorPrivilegeLevel = 3;
hs.NonSystemSegment = 1;
} else {
hs.Attributes = (flags >> DESC_TYPE_SHIFT);
if (r86) {
/* hs.Base &= 0xfffff; */
}
}
return hs;
}
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
SegmentCache qs;
qs.base = hs->Base;
qs.limit = hs->Limit;
qs.selector = hs->Selector;
qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
return qs;
}
/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
CPUX86State *env = cpu->env_ptr;
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
WHV_REGISTER_VALUE xcr0;
WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
if (!whpx_has_xsave()) {
return;
}
/* Only xcr0 is supported by the hypervisor currently */
xcr0.Reg64 = env->xcr0;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
if (FAILED(hr)) {
error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
}
}
static int whpx_set_tsc(CPUState *cpu)
{
CPUX86State *env = cpu->env_ptr;
WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
WHV_REGISTER_VALUE tsc_val;
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
/*
* Suspend the partition prior to setting the TSC to reduce the variance
* in TSC across vCPUs. When the first vCPU runs post suspend, the
* partition is automatically resumed.
*/
if (whp_dispatch.WHvSuspendPartitionTime) {
/*
* Unable to suspend partition while setting TSC is not a fatal
* error. It just increases the likelihood of TSC variance between
* vCPUs and some guest OS are able to handle that just fine.
*/
hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
if (FAILED(hr)) {
warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
}
}
tsc_val.Reg64 = env->tsc;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
if (FAILED(hr)) {
error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
return -1;
}
return 0;
}
/*
* The CR8 register in the CPU is mapped to the TPR register of the APIC,
* however, they use a slightly different encoding. Specifically:
*
* APIC.TPR[bits 7:4] = CR8[bits 3:0]
*
* This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
* and IA-32 Architectures Software Developer's Manual.
*
* The functions below translate the value of CR8 to TPR and vice versa.
*/
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
return tpr >> 4;
}
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
return cr8 << 4;
}
static void whpx_set_registers(CPUState *cpu, int level)
{
struct whpx_state *whpx = &whpx_global;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
CPUX86State *env = cpu->env_ptr;
X86CPU *x86_cpu = X86_CPU(cpu);
struct whpx_register_set vcxt;
HRESULT hr;
int idx;
int idx_next;
int i;
int v86, r86;
assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
/*
* Following MSRs have side effects on the guest or are too heavy for
* runtime. Limit them to full state update.
*/
if (level >= WHPX_SET_RESET_STATE) {
whpx_set_tsc(cpu);
}
memset(&vcxt, 0, sizeof(struct whpx_register_set));
v86 = (env->eflags & VM_MASK);
r86 = !(env->cr[0] & CR0_PE_MASK);
vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
idx = 0;
/* Indexes for first 16 registers match between HV and QEMU definitions */
idx_next = 16;
for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
}
idx = idx_next;
/* Same goes for RIP and RFLAGS */
assert(whpx_register_names[idx] == WHvX64RegisterRip);
vcxt.values[idx++].Reg64 = env->eip;
assert(whpx_register_names[idx] == WHvX64RegisterRflags);
vcxt.values[idx++].Reg64 = env->eflags;
/* Translate 6+4 segment registers. HV and QEMU order matches */
assert(idx == WHvX64RegisterEs);
for (i = 0; i < 6; i += 1, idx += 1) {
vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
}
assert(idx == WHvX64RegisterLdtr);
vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
assert(idx == WHvX64RegisterTr);
vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
assert(idx == WHvX64RegisterIdtr);
vcxt.values[idx].Table.Base = env->idt.base;
vcxt.values[idx].Table.Limit = env->idt.limit;
idx += 1;
assert(idx == WHvX64RegisterGdtr);
vcxt.values[idx].Table.Base = env->gdt.base;
vcxt.values[idx].Table.Limit = env->gdt.limit;
idx += 1;
/* CR0, 2, 3, 4, 8 */
assert(whpx_register_names[idx] == WHvX64RegisterCr0);
vcxt.values[idx++].Reg64 = env->cr[0];
assert(whpx_register_names[idx] == WHvX64RegisterCr2);
vcxt.values[idx++].Reg64 = env->cr[2];
assert(whpx_register_names[idx] == WHvX64RegisterCr3);
vcxt.values[idx++].Reg64 = env->cr[3];
assert(whpx_register_names[idx] == WHvX64RegisterCr4);
vcxt.values[idx++].Reg64 = env->cr[4];
assert(whpx_register_names[idx] == WHvX64RegisterCr8);
vcxt.values[idx++].Reg64 = vcpu->tpr;
/* 8 Debug Registers - Skipped */
/*
* Extended control registers needs to be handled separately depending
* on whether xsave is supported/enabled or not.
*/
whpx_set_xcrs(cpu);
/* 16 XMM registers */
assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
idx_next = idx + 16;
for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
}
idx = idx_next;
/* 8 FP registers */
assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
for (i = 0; i < 8; i += 1, idx += 1) {
vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
/* vcxt.values[idx].Fp.AsUINT128.High64 =
env->fpregs[i].mmx.MMX_Q(1);
*/
}
/* FP control status register */
assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
vcxt.values[idx].FpControlStatus.FpStatus =
(env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
vcxt.values[idx].FpControlStatus.FpTag = 0;
for (i = 0; i < 8; ++i) {
vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
}
vcxt.values[idx].FpControlStatus.Reserved = 0;
vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
idx += 1;
/* XMM control status register */
assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
idx += 1;
/* MSRs */
assert(whpx_register_names[idx] == WHvX64RegisterEfer);
vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif
assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
vcxt.values[idx++].Reg64 = vcpu->apic_base;
/* WHvX64RegisterPat - Skipped */
assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
vcxt.values[idx++].Reg64 = env->sysenter_cs;
assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
vcxt.values[idx++].Reg64 = env->sysenter_eip;
assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
vcxt.values[idx++].Reg64 = env->sysenter_esp;
assert(whpx_register_names[idx] == WHvX64RegisterStar);
vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
assert(whpx_register_names[idx] == WHvX64RegisterLstar);
vcxt.values[idx++].Reg64 = env->lstar;
assert(whpx_register_names[idx] == WHvX64RegisterCstar);
vcxt.values[idx++].Reg64 = env->cstar;
assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
vcxt.values[idx++].Reg64 = env->fmask;
#endif
/* Interrupt / Event Registers - Skipped */
assert(idx == RTL_NUMBER_OF(whpx_register_names));
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
whpx_register_names,
RTL_NUMBER_OF(whpx_register_names),
&vcxt.values[0]);
if (FAILED(hr)) {
error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
hr);
}
return;
}
static int whpx_get_tsc(CPUState *cpu)
{
CPUX86State *env = cpu->env_ptr;
WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
WHV_REGISTER_VALUE tsc_val;
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
if (FAILED(hr)) {
error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
return -1;
}
env->tsc = tsc_val.Reg64;
return 0;
}
/* X64 Extended Control Registers */
static void whpx_get_xcrs(CPUState *cpu)
{
CPUX86State *env = cpu->env_ptr;
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
WHV_REGISTER_VALUE xcr0;
WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
if (!whpx_has_xsave()) {
return;
}
/* Only xcr0 is supported by the hypervisor currently */
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
if (FAILED(hr)) {
error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
return;
}
env->xcr0 = xcr0.Reg64;
}
static void whpx_get_registers(CPUState *cpu)
{
struct whpx_state *whpx = &whpx_global;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
CPUX86State *env = cpu->env_ptr;
X86CPU *x86_cpu = X86_CPU(cpu);
struct whpx_register_set vcxt;
uint64_t tpr, apic_base;
HRESULT hr;
int idx;
int idx_next;
int i;
assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
if (!env->tsc_valid) {
whpx_get_tsc(cpu);
env->tsc_valid = !runstate_is_running();
}
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
whpx_register_names,
RTL_NUMBER_OF(whpx_register_names),
&vcxt.values[0]);
if (FAILED(hr)) {
error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
hr);
}
if (whpx_apic_in_platform()) {
/*
* Fetch the TPR value from the emulated APIC. It may get overwritten
* below with the value from CR8 returned by
* WHvGetVirtualProcessorRegisters().
*/
whpx_apic_get(x86_cpu->apic_state);
vcpu->tpr = whpx_apic_tpr_to_cr8(
cpu_get_apic_tpr(x86_cpu->apic_state));
}
idx = 0;
/* Indexes for first 16 registers match between HV and QEMU definitions */
idx_next = 16;
for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
env->regs[idx] = vcxt.values[idx].Reg64;
}
idx = idx_next;
/* Same goes for RIP and RFLAGS */
assert(whpx_register_names[idx] == WHvX64RegisterRip);
env->eip = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterRflags);
env->eflags = vcxt.values[idx++].Reg64;
/* Translate 6+4 segment registers. HV and QEMU order matches */
assert(idx == WHvX64RegisterEs);
for (i = 0; i < 6; i += 1, idx += 1) {
env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
}
assert(idx == WHvX64RegisterLdtr);
env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
assert(idx == WHvX64RegisterTr);
env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
assert(idx == WHvX64RegisterIdtr);
env->idt.base = vcxt.values[idx].Table.Base;
env->idt.limit = vcxt.values[idx].Table.Limit;
idx += 1;
assert(idx == WHvX64RegisterGdtr);
env->gdt.base = vcxt.values[idx].Table.Base;
env->gdt.limit = vcxt.values[idx].Table.Limit;
idx += 1;
/* CR0, 2, 3, 4, 8 */
assert(whpx_register_names[idx] == WHvX64RegisterCr0);
env->cr[0] = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterCr2);
env->cr[2] = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterCr3);
env->cr[3] = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterCr4);
env->cr[4] = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterCr8);
tpr = vcxt.values[idx++].Reg64;
if (tpr != vcpu->tpr) {
vcpu->tpr = tpr;
cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
}
/* 8 Debug Registers - Skipped */
/*
* Extended control registers needs to be handled separately depending
* on whether xsave is supported/enabled or not.
*/
whpx_get_xcrs(cpu);
/* 16 XMM registers */
assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
idx_next = idx + 16;
for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
}
idx = idx_next;
/* 8 FP registers */
assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
for (i = 0; i < 8; i += 1, idx += 1) {
env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
/* env->fpregs[i].mmx.MMX_Q(1) =
vcxt.values[idx].Fp.AsUINT128.High64;
*/
}
/* FP control status register */
assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
for (i = 0; i < 8; ++i) {
env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
}
env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
idx += 1;
/* XMM control status register */
assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
idx += 1;
/* MSRs */
assert(whpx_register_names[idx] == WHvX64RegisterEfer);
env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif
assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
apic_base = vcxt.values[idx++].Reg64;
if (apic_base != vcpu->apic_base) {
vcpu->apic_base = apic_base;
cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
}
/* WHvX64RegisterPat - Skipped */
assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
env->sysenter_cs = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
env->sysenter_eip = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
env->sysenter_esp = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterStar);
env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
assert(whpx_register_names[idx] == WHvX64RegisterLstar);
env->lstar = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterCstar);
env->cstar = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
env->fmask = vcxt.values[idx++].Reg64;
#endif
/* Interrupt / Event Registers - Skipped */
assert(idx == RTL_NUMBER_OF(whpx_register_names));
if (whpx_apic_in_platform()) {
whpx_apic_get(x86_cpu->apic_state);
}
x86_update_hflags(env);
return;
}
static HRESULT CALLBACK whpx_emu_ioport_callback(
void *ctx,
WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
MemTxAttrs attrs = { 0 };
address_space_rw(&address_space_io, IoAccess->Port, attrs,
&IoAccess->Data, IoAccess->AccessSize,
IoAccess->Direction);
return S_OK;
}
static HRESULT CALLBACK whpx_emu_mmio_callback(
void *ctx,
WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
ma->Direction);
return S_OK;
}
static HRESULT CALLBACK whpx_emu_getreg_callback(
void *ctx,
const WHV_REGISTER_NAME *RegisterNames,
UINT32 RegisterCount,
WHV_REGISTER_VALUE *RegisterValues)
{
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
CPUState *cpu = (CPUState *)ctx;
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
RegisterNames, RegisterCount,
RegisterValues);
if (FAILED(hr)) {
error_report("WHPX: Failed to get virtual processor registers,"
" hr=%08lx", hr);
}
return hr;
}
static HRESULT CALLBACK whpx_emu_setreg_callback(
void *ctx,
const WHV_REGISTER_NAME *RegisterNames,
UINT32 RegisterCount,
const WHV_REGISTER_VALUE *RegisterValues)
{
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
CPUState *cpu = (CPUState *)ctx;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
RegisterNames, RegisterCount,
RegisterValues);
if (FAILED(hr)) {
error_report("WHPX: Failed to set virtual processor registers,"
" hr=%08lx", hr);
}
/*
* The emulator just successfully wrote the register state. We clear the
* dirty state so we avoid the double write on resume of the VP.
*/
cpu->vcpu_dirty = false;
return hr;
}
static HRESULT CALLBACK whpx_emu_translate_callback(
void *ctx,
WHV_GUEST_VIRTUAL_ADDRESS Gva,
WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
CPUState *cpu = (CPUState *)ctx;
WHV_TRANSLATE_GVA_RESULT res;
hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
Gva, TranslateFlags, &res, Gpa);
if (FAILED(hr)) {
error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
} else {
*TranslationResult = res.ResultCode;
}
return hr;
}
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
.Size = sizeof(WHV_EMULATOR_CALLBACKS),
.WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
.WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
.WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
.WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
.WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
HRESULT hr;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
WHV_EMULATOR_STATUS emu_status;
hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
vcpu->emulator, cpu,
&vcpu->exit_ctx.VpContext, ctx,
&emu_status);
if (FAILED(hr)) {
error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
return -1;
}
if (!emu_status.EmulationSuccessful) {
error_report("WHPX: Failed to emulate MMIO access with"
" EmulatorReturnStatus: %u", emu_status.AsUINT32);
return -1;
}
return 0;
}
static int whpx_handle_portio(CPUState *cpu,
WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
HRESULT hr;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
WHV_EMULATOR_STATUS emu_status;
hr = whp_dispatch.WHvEmulatorTryIoEmulation(
vcpu->emulator, cpu,
&vcpu->exit_ctx.VpContext, ctx,
&emu_status);
if (FAILED(hr)) {
error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
return -1;
}
if (!emu_status.EmulationSuccessful) {
error_report("WHPX: Failed to emulate PortIO access with"
" EmulatorReturnStatus: %u", emu_status.AsUINT32);
return -1;
}
return 0;
}
/*
* Controls whether we should intercept various exceptions on the guest,
* namely breakpoint/single-step events.
*
* The 'exceptions' argument accepts a bitmask, e.g:
* (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
*/
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
struct whpx_state *whpx = &whpx_global;
WHV_PARTITION_PROPERTY prop = { 0, };
HRESULT hr;
if (exceptions == whpx->exception_exit_bitmap) {
return S_OK;
}
prop.ExceptionExitBitmap = exceptions;
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeExceptionExitBitmap,
&prop,
sizeof(WHV_PARTITION_PROPERTY));
if (SUCCEEDED(hr)) {
whpx->exception_exit_bitmap = exceptions;
}
return hr;
}
/*
* This function is called before/after stepping over a single instruction.
* It will update the CPU registers to arm/disarm the instruction stepping
* accordingly.
*/
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
bool set,
uint64_t *exit_context_rflags)
{
WHV_REGISTER_NAME reg_name;
WHV_REGISTER_VALUE reg_value;
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
/*
* If we are trying to step over a single instruction, we need to set the
* TF bit in rflags. Otherwise, clear it.
*/
reg_name = WHvX64RegisterRflags;
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
&reg_name,
1,
&reg_value);
if (FAILED(hr)) {
error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
return hr;
}
if (exit_context_rflags) {
assert(*exit_context_rflags == reg_value.Reg64);
}
if (set) {
/* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
reg_value.Reg64 |= TF_MASK;
} else {
reg_value.Reg64 &= ~TF_MASK;
}
if (exit_context_rflags) {
*exit_context_rflags = reg_value.Reg64;
}
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
&reg_name,
1,
&reg_value);
if (FAILED(hr)) {
error_report("WHPX: Failed to set rflags,"
" hr=%08lx",
hr);
return hr;
}
reg_name = WHvRegisterInterruptState;
reg_value.Reg64 = 0;
/* Suspend delivery of hardware interrupts during single-stepping. */
reg_value.InterruptState.InterruptShadow = set != 0;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
&reg_name,
1,
&reg_value);
if (FAILED(hr)) {
error_report("WHPX: Failed to set InterruptState,"
" hr=%08lx",
hr);
return hr;
}
if (!set) {
/*
* We have just finished stepping over a single instruction,
* and intercepted the INT1 generated by it.
* We need to now hide the INT1 from the guest,
* as it would not be expecting it.
*/
reg_name = WHvX64RegisterPendingDebugException;
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
&reg_name,
1,
&reg_value);
if (FAILED(hr)) {
error_report("WHPX: Failed to get pending debug exceptions,"
"hr=%08lx", hr);
return hr;
}
if (reg_value.PendingDebugException.SingleStep) {
reg_value.PendingDebugException.SingleStep = 0;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
&reg_name,
1,
&reg_value);
if (FAILED(hr)) {
error_report("WHPX: Failed to clear pending debug exceptions,"
"hr=%08lx", hr);
return hr;
}
}
}
return S_OK;
}
/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
struct whpx_state *whpx = &whpx_global;
int i;
if (whpx->breakpoints.breakpoints) {
for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
if (address == whpx->breakpoints.breakpoints->data[i].address) {
return &whpx->breakpoints.breakpoints->data[i];
}
}
}
return NULL;
}
/*
* Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
* debugging user-mode applications. Since the WHPX API does not offer
* an easy way to pass the intercepted exception back to the guest, we
* resort to using INT1 instead, and let the guest always handle INT3.
*/
static const uint8_t whpx_breakpoint_instruction = 0xF1;
/*
* The WHPX QEMU backend implements breakpoints by writing the INT1
* instruction into memory (ignoring the DRx registers). This raises a few
* issues that need to be carefully handled:
*
* 1. Although unlikely, other parts of QEMU may set multiple breakpoints
* at the same location, and later remove them in arbitrary order.
* This should not cause memory corruption, and should only remove the
* physical breakpoint instruction when the last QEMU breakpoint is gone.
*
* 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
* physical location. Hence, physically adding/removing a breakpoint can
* theoretically fail at any time. We need to keep track of it.
*
* The function below rebuilds a list of low-level breakpoints (one per
* address, tracking the original instruction and any errors) from the list of
* high-level breakpoints (set via cpu_breakpoint_insert()).
*
* In order to optimize performance, this function stores the list of
* high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
* low-level ones, so that it won't be re-invoked until these breakpoints
* change.
*
* Note that this function decides which breakpoints should be inserted into,
* memory, but doesn't actually do it. The memory accessing is done in
* whpx_apply_breakpoints().
*/
static void whpx_translate_cpu_breakpoints(
struct whpx_breakpoints *breakpoints,
CPUState *cpu,
int cpu_breakpoint_count)
{
CPUBreakpoint *bp;
int cpu_bp_index = 0;
breakpoints->original_addresses =
g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
breakpoints->original_address_count = cpu_breakpoint_count;
int max_breakpoints = cpu_breakpoint_count +
(breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
struct whpx_breakpoint_collection *new_breakpoints =
(struct whpx_breakpoint_collection *)g_malloc0(
sizeof(struct whpx_breakpoint_collection) +
max_breakpoints * sizeof(struct whpx_breakpoint));
new_breakpoints->allocated = max_breakpoints;
new_breakpoints->used = 0;
/*
* 1. Preserve all old breakpoints that could not be automatically
* cleared when the CPU got stopped.
*/
if (breakpoints->breakpoints) {
int i;
for (i = 0; i < breakpoints->breakpoints->used; i++) {
if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
new_breakpoints->data[new_breakpoints->used++] =
breakpoints->breakpoints->data[i];
}
}
}
/* 2. Map all CPU breakpoints to WHPX breakpoints */
QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
int i;
bool found = false;
/* This will be used to detect changed CPU breakpoints later. */
breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
for (i = 0; i < new_breakpoints->used; i++) {
/*
* WARNING: This loop has O(N^2) complexity, where N is the
* number of breakpoints. It should not be a bottleneck in
* real-world scenarios, since it only needs to run once after
* the breakpoints have been modified.
* If this ever becomes a concern, it can be optimized by storing
* high-level breakpoint objects in a tree or hash map.
*/
if (new_breakpoints->data[i].address == bp->pc) {
/* There was already a breakpoint at this address. */
if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
new_breakpoints->data[i].state = WHPX_BP_SET;
} else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
}
found = true;
break;
}
}
if (!found && new_breakpoints->used < new_breakpoints->allocated) {
/* No WHPX breakpoint at this address. Create one. */
new_breakpoints->data[new_breakpoints->used].address = bp->pc;
new_breakpoints->data[new_breakpoints->used].state =
WHPX_BP_SET_PENDING;
new_breakpoints->used++;
}
}
if (breakpoints->breakpoints) {
/*
* Free the previous breakpoint list. This can be optimized by keeping
* it as shadow buffer for the next computation instead of freeing
* it immediately.
*/
g_free(breakpoints->breakpoints);
}
breakpoints->breakpoints = new_breakpoints;
}
/*
* Physically inserts/removes the breakpoints by reading and writing the
* physical memory, keeping a track of the failed attempts.
*
* Passing resuming=true will try to set all previously unset breakpoints.
* Passing resuming=false will remove all inserted ones.
*/
static void whpx_apply_breakpoints(
struct whpx_breakpoint_collection *breakpoints,
CPUState *cpu,
bool resuming)
{
int i, rc;
if (!breakpoints) {
return;
}
for (i = 0; i < breakpoints->used; i++) {
/* Decide what to do right now based on the last known state. */
WhpxBreakpointState state = breakpoints->data[i].state;
switch (state) {
case WHPX_BP_CLEARED:
if (resuming) {
state = WHPX_BP_SET_PENDING;
}
break;
case WHPX_BP_SET_PENDING:
if (!resuming) {
state = WHPX_BP_CLEARED;
}
break;
case WHPX_BP_SET:
if (!resuming) {
state = WHPX_BP_CLEAR_PENDING;
}
break;
case WHPX_BP_CLEAR_PENDING:
if (resuming) {
state = WHPX_BP_SET;
}
break;
}
if (state == WHPX_BP_SET_PENDING) {
/* Remember the original instruction. */
rc = cpu_memory_rw_debug(cpu,
breakpoints->data[i].address,
&breakpoints->data[i].original_instruction,
1,
false);
if (!rc) {
/* Write the breakpoint instruction. */
rc = cpu_memory_rw_debug(cpu,
breakpoints->data[i].address,
(void *)&whpx_breakpoint_instruction,
1,
true);
}
if (!rc) {
state = WHPX_BP_SET;
}
}
if (state == WHPX_BP_CLEAR_PENDING) {
/* Restore the original instruction. */
rc = cpu_memory_rw_debug(cpu,
breakpoints->data[i].address,
&breakpoints->data[i].original_instruction,
1,
true);
if (!rc) {
state = WHPX_BP_CLEARED;
}
}
breakpoints->data[i].state = state;
}
}
/*
* This function is called when the a VCPU is about to start and no other
* VCPUs have been started so far. Since the VCPU start order could be
* arbitrary, it doesn't have to be VCPU#0.
*
* It is used to commit the breakpoints into memory, and configure WHPX
* to intercept debug exceptions.
*
* Note that whpx_set_exception_exit_bitmap() cannot be called if one or
* more VCPUs are already running, so this is the best place to do it.
*/
static int whpx_first_vcpu_starting(CPUState *cpu)
{
struct whpx_state *whpx = &whpx_global;
HRESULT hr;
g_assert(qemu_mutex_iothread_locked());
if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
(whpx->breakpoints.breakpoints &&
whpx->breakpoints.breakpoints->used)) {
CPUBreakpoint *bp;
int i = 0;
bool update_pending = false;
QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
if (i >= whpx->breakpoints.original_address_count ||
bp->pc != whpx->breakpoints.original_addresses[i]) {
update_pending = true;
}
i++;
}
if (i != whpx->breakpoints.original_address_count) {
update_pending = true;
}
if (update_pending) {
/*
* The CPU breakpoints have changed since the last call to
* whpx_translate_cpu_breakpoints(). WHPX breakpoints must
* now be recomputed.
*/
whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
}
/* Actually insert the breakpoints into the memory. */
whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
}
uint64_t exception_mask;
if (whpx->step_pending ||
(whpx->breakpoints.breakpoints &&
whpx->breakpoints.breakpoints->used)) {
/*
* We are either attempting to single-step one or more CPUs, or
* have one or more breakpoints enabled. Both require intercepting
* the WHvX64ExceptionTypeBreakpointTrap exception.
*/
exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
} else {
/* Let the guest handle all exceptions. */
exception_mask = 0;
}
hr = whpx_set_exception_exit_bitmap(exception_mask);
if (!SUCCEEDED(hr)) {
error_report("WHPX: Failed to update exception exit mask,"
"hr=%08lx.", hr);
return 1;
}
return 0;
}
/*
* This function is called when the last VCPU has finished running.
* It is used to remove any previously set breakpoints from memory.
*/
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
return 0;
}
/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
if (cpu->vcpu_dirty) {
/* The CPU registers have been modified by other parts of QEMU. */
CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
return env->eip;
} else if (exit_context_valid) {
/*
* The CPU registers have not been modified by neither other parts
* of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
* This is the most common case.
*/
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
return vcpu->exit_ctx.VpContext.Rip;
} else {
/*
* The CPU registers have been modified by a call to
* WHvSetVirtualProcessorRegisters() and must be re-queried from
* the target.
*/
WHV_REGISTER_VALUE reg_value;
WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
&reg_name,
1,
&reg_value);
if (FAILED(hr)) {
error_report("WHPX: Failed to get PC, hr=%08lx", hr);
return 0;
}
return reg_value.Reg64;
}
}
static int whpx_handle_halt(CPUState *cpu)
{
CPUX86State *env = cpu->env_ptr;
int ret = 0;
qemu_mutex_lock_iothread();
if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
(env->eflags & IF_MASK)) &&
!(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
cpu->exception_index = EXCP_HLT;
cpu->halted = true;
ret = 1;
}
qemu_mutex_unlock_iothread();
return ret;
}
static void whpx_vcpu_pre_run(CPUState *cpu)
{
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
CPUX86State *env = cpu->env_ptr;
X86CPU *x86_cpu = X86_CPU(cpu);
int irq;
uint8_t tpr;
WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
UINT32 reg_count = 0;
WHV_REGISTER_VALUE reg_values[3];
WHV_REGISTER_NAME reg_names[3];
memset(&new_int, 0, sizeof(new_int));
memset(reg_values, 0, sizeof(reg_values));
qemu_mutex_lock_iothread();
/* Inject NMI */
if (!vcpu->interruption_pending &&
cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
vcpu->interruptable = false;
new_int.InterruptionType = WHvX64PendingNmi;
new_int.InterruptionPending = 1;
new_int.InterruptionVector = 2;
}
if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
}
}
/*
* Force the VCPU out of its inner loop to process any INIT requests or
* commit pending TPR access.
*/
if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
!(env->hflags & HF_SMM_MASK)) {
cpu->exit_request = 1;
}
if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
cpu->exit_request = 1;
}
}
/* Get pending hard interruption or replay one that was overwritten */
if (!whpx_apic_in_platform()) {
if (!vcpu->interruption_pending &&
vcpu->interruptable && (env->eflags & IF_MASK)) {
assert(!new_int.InterruptionPending);
if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
irq = cpu_get_pic_interrupt(env);
if (irq >= 0) {
new_int.InterruptionType = WHvX64PendingInterrupt;
new_int.InterruptionPending = 1;
new_int.InterruptionVector = irq;
}
}
}
/* Setup interrupt state if new one was prepared */
if (new_int.InterruptionPending) {
reg_values[reg_count].PendingInterruption = new_int;
reg_names[reg_count] = WHvRegisterPendingInterruption;
reg_count += 1;
}
} else if (vcpu->ready_for_pic_interrupt &&
(cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
irq = cpu_get_pic_interrupt(env);
if (irq >= 0) {
reg_names[reg_count] = WHvRegisterPendingEvent;
reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
{
.EventPending = 1,
.EventType = WHvX64PendingEventExtInt,
.Vector = irq,
};
reg_count += 1;
}
}
/* Sync the TPR to the CR8 if was modified during the intercept */
tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
if (tpr != vcpu->tpr) {
vcpu->tpr = tpr;
reg_values[reg_count].Reg64 = tpr;
cpu->exit_request = 1;
reg_names[reg_count] = WHvX64RegisterCr8;
reg_count += 1;
}
/* Update the state of the interrupt delivery notification */
if (!vcpu->window_registered &&
cpu->interrupt_request & CPU_INTERRUPT_HARD) {
reg_values[reg_count].DeliverabilityNotifications =
(WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
.InterruptNotification = 1
};
vcpu->window_registered = 1;
reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
reg_count += 1;
}
qemu_mutex_unlock_iothread();
vcpu->ready_for_pic_interrupt = false;
if (reg_count) {
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
reg_names, reg_count, reg_values);
if (FAILED(hr)) {
error_report("WHPX: Failed to set interrupt state registers,"
" hr=%08lx", hr);
}
}
return;
}
static void whpx_vcpu_post_run(CPUState *cpu)
{
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
CPUX86State *env = cpu->env_ptr;
X86CPU *x86_cpu = X86_CPU(cpu);
env->eflags = vcpu->exit_ctx.VpContext.Rflags;
uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
if (vcpu->tpr != tpr) {
vcpu->tpr = tpr;
qemu_mutex_lock_iothread();
cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
qemu_mutex_unlock_iothread();
}
vcpu->interruption_pending =
vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
vcpu->interruptable =
!vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
return;
}
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
CPUX86State *env = cpu->env_ptr;
X86CPU *x86_cpu = X86_CPU(cpu);
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
!(env->hflags & HF_SMM_MASK)) {
whpx_cpu_synchronize_state(cpu);
do_cpu_init(x86_cpu);
vcpu->interruptable = true;
}
if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
apic_poll_irq(x86_cpu->apic_state);
}
if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
(env->eflags & IF_MASK)) ||
(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
cpu->halted = false;
}
if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
whpx_cpu_synchronize_state(cpu);
do_cpu_sipi(x86_cpu);
}
if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
whpx_cpu_synchronize_state(cpu);
apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
env->tpr_access_type);
}
return;
}
static int whpx_vcpu_run(CPUState *cpu)
{
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
struct whpx_breakpoint *stepped_over_bp = NULL;
WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
int ret;
g_assert(qemu_mutex_iothread_locked());
if (whpx->running_cpus++ == 0) {
/* Insert breakpoints into memory, update exception exit bitmap. */
ret = whpx_first_vcpu_starting(cpu);
if (ret != 0) {
return ret;
}
}
if (whpx->breakpoints.breakpoints &&
whpx->breakpoints.breakpoints->used > 0)
{
uint64_t pc = whpx_vcpu_get_pc(cpu, true);
stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
stepped_over_bp = NULL;
}
if (stepped_over_bp) {
/*
* We are trying to run the instruction overwritten by an active
* breakpoint. We will temporarily disable the breakpoint, suspend
* other CPUs, and step over the instruction.
*/
exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
}
}
if (exclusive_step_mode == WHPX_STEP_NONE) {
whpx_vcpu_process_async_events(cpu);
if (cpu->halted && !whpx_apic_in_platform()) {
cpu->exception_index = EXCP_HLT;
qatomic_set(&cpu->exit_request, false);
return 0;
}
}
qemu_mutex_unlock_iothread();
if (exclusive_step_mode != WHPX_STEP_NONE) {
start_exclusive();
g_assert(cpu == current_cpu);
g_assert(!cpu->running);
cpu->running = true;
hr = whpx_set_exception_exit_bitmap(
1UL << WHvX64ExceptionTypeDebugTrapOrFault);
if (!SUCCEEDED(hr)) {
error_report("WHPX: Failed to update exception exit mask, "
"hr=%08lx.", hr);
return 1;
}
if (stepped_over_bp) {
/* Temporarily disable the triggered breakpoint. */
cpu_memory_rw_debug(cpu,
stepped_over_bp->address,
&stepped_over_bp->original_instruction,
1,
true);
}
} else {
cpu_exec_start(cpu);
}
do {
if (cpu->vcpu_dirty) {
whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
cpu->vcpu_dirty = false;
}
if (exclusive_step_mode == WHPX_STEP_NONE) {
whpx_vcpu_pre_run(cpu);
if (qatomic_read(&cpu->exit_request)) {
whpx_vcpu_kick(cpu);
}
}
if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
whpx_vcpu_configure_single_stepping(cpu, true, NULL);
}
hr = whp_dispatch.WHvRunVirtualProcessor(
whpx->partition, cpu->cpu_index,
&vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
if (FAILED(hr)) {
error_report("WHPX: Failed to exec a virtual processor,"
" hr=%08lx", hr);
ret = -1;
break;
}
if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
whpx_vcpu_configure_single_stepping(cpu,
false,
&vcpu->exit_ctx.VpContext.Rflags);
}
whpx_vcpu_post_run(cpu);
switch (vcpu->exit_ctx.ExitReason) {
case WHvRunVpExitReasonMemoryAccess:
ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
break;
case WHvRunVpExitReasonX64IoPortAccess:
ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
break;
case WHvRunVpExitReasonX64InterruptWindow:
vcpu->ready_for_pic_interrupt = 1;
vcpu->window_registered = 0;
ret = 0;
break;
case WHvRunVpExitReasonX64ApicEoi:
assert(whpx_apic_in_platform());
ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
break;
case WHvRunVpExitReasonX64Halt:
/*
* WARNING: as of build 19043.1526 (21H1), this exit reason is no
* longer used.
*/
ret = whpx_handle_halt(cpu);
break;
case WHvRunVpExitReasonX64ApicInitSipiTrap: {
WHV_INTERRUPT_CONTROL ipi = {0};
uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
uint32_t delivery_mode =
(icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
int dest_shorthand =
(icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
bool broadcast = false;
bool include_self = false;
uint32_t i;
/* We only registered for INIT and SIPI exits. */
if ((delivery_mode != APIC_DM_INIT) &&
(delivery_mode != APIC_DM_SIPI)) {
error_report(
"WHPX: Unexpected APIC exit that is not a INIT or SIPI");
break;
}
if (delivery_mode == APIC_DM_INIT) {
ipi.Type = WHvX64InterruptTypeInit;
} else {
ipi.Type = WHvX64InterruptTypeSipi;
}
ipi.DestinationMode =
((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
WHvX64InterruptDestinationModeLogical :
WHvX64InterruptDestinationModePhysical;
ipi.TriggerMode =
((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
WHvX64InterruptTriggerModeLevel :
WHvX64InterruptTriggerModeEdge;
ipi.Vector = icr & APIC_VECTOR_MASK;
switch (dest_shorthand) {
/* no shorthand. Bits 56-63 contain the destination. */
case 0:
ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
&ipi, sizeof(ipi));
if (FAILED(hr)) {
error_report("WHPX: Failed to request interrupt hr=%08lx",
hr);
}
break;
/* self */
case 1:
include_self = true;
break;
/* broadcast, including self */
case 2:
broadcast = true;
include_self = true;
break;
/* broadcast, excluding self */
case 3:
broadcast = true;
break;
}
if (!broadcast && !include_self) {
break;
}
for (i = 0; i <= max_vcpu_index; i++) {
if (i == cpu->cpu_index && !include_self) {
continue;
}
/*
* Assuming that APIC Ids are identity mapped since
* WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
* are not handled yet and the hypervisor doesn't allow the
* guest to modify the APIC ID.
*/
ipi.Destination = i;
hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
&ipi, sizeof(ipi));
if (FAILED(hr)) {
error_report(
"WHPX: Failed to request SIPI for %d, hr=%08lx",
i, hr);
}
}
break;
}
case WHvRunVpExitReasonCanceled:
if (exclusive_step_mode != WHPX_STEP_NONE) {
/*
* We are trying to step over a single instruction, and
* likely got a request to stop from another thread.
* Delay it until we are done stepping
* over.
*/
ret = 0;
} else {
cpu->exception_index = EXCP_INTERRUPT;
ret = 1;
}
break;
case WHvRunVpExitReasonX64MsrAccess: {
WHV_REGISTER_VALUE reg_values[3] = {0};
WHV_REGISTER_NAME reg_names[3];
UINT32 reg_count;
reg_names[0] = WHvX64RegisterRip;
reg_names[1] = WHvX64RegisterRax;
reg_names[2] = WHvX64RegisterRdx;
reg_values[0].Reg64 =
vcpu->exit_ctx.VpContext.Rip +
vcpu->exit_ctx.VpContext.InstructionLength;
/*
* For all unsupported MSR access we:
* ignore writes
* return 0 on read.
*/
reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1 : 3;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
reg_names, reg_count,
reg_values);
if (FAILED(hr)) {
error_report("WHPX: Failed to set MsrAccess state "
" registers, hr=%08lx", hr);
}
ret = 0;
break;
}
case WHvRunVpExitReasonX64Cpuid: {
WHV_REGISTER_VALUE reg_values[5];
WHV_REGISTER_NAME reg_names[5];
UINT32 reg_count = 5;
UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
X86CPU *x86_cpu = X86_CPU(cpu);
CPUX86State *env = &x86_cpu->env;
memset(reg_values, 0, sizeof(reg_values));
rip = vcpu->exit_ctx.VpContext.Rip +
vcpu->exit_ctx.VpContext.InstructionLength;
cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
/*
* Ideally, these should be supplied to the hypervisor during VCPU
* initialization and it should be able to satisfy this request.
* But, currently, WHPX doesn't support setting CPUID values in the
* hypervisor once the partition has been setup, which is too late
* since VCPUs are realized later. For now, use the values from
* QEMU to satisfy these requests, until WHPX adds support for
* being able to set these values in the hypervisor at runtime.
*/
cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
(UINT32 *)&rcx, (UINT32 *)&rdx);
switch (cpuid_fn) {
case 0x40000000:
/* Expose the vmware cpu frequency cpuid leaf */
rax = 0x40000010;
rbx = rcx = rdx = 0;
break;
case 0x40000010:
rax = env->tsc_khz;
rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
rcx = rdx = 0;
break;
case 0x80000001:
/* Remove any support of OSVW */
rcx &= ~CPUID_EXT3_OSVW;
break;
}
reg_names[0] = WHvX64RegisterRip;
reg_names[1] = WHvX64RegisterRax;
reg_names[2] = WHvX64RegisterRcx;
reg_names[3] = WHvX64RegisterRdx;
reg_names[4] = WHvX64RegisterRbx;
reg_values[0].Reg64 = rip;
reg_values[1].Reg64 = rax;
reg_values[2].Reg64 = rcx;
reg_values[3].Reg64 = rdx;
reg_values[4].Reg64 = rbx;
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
reg_names,
reg_count,
reg_values);
if (FAILED(hr)) {
error_report("WHPX: Failed to set CpuidAccess state registers,"
" hr=%08lx", hr);
}
ret = 0;
break;
}
case WHvRunVpExitReasonException:
whpx_get_registers(cpu);
if ((vcpu->exit_ctx.VpException.ExceptionType ==
WHvX64ExceptionTypeDebugTrapOrFault) &&
(vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
(vcpu->exit_ctx.VpException.InstructionBytes[0] ==
whpx_breakpoint_instruction)) {
/* Stopped at a software breakpoint. */
cpu->exception_index = EXCP_DEBUG;
} else if ((vcpu->exit_ctx.VpException.ExceptionType ==
WHvX64ExceptionTypeDebugTrapOrFault) &&
!cpu->singlestep_enabled) {
/*
* Just finished stepping over a breakpoint, but the
* gdb does not expect us to do single-stepping.
* Don't do anything special.
*/
cpu->exception_index = EXCP_INTERRUPT;
} else {
/* Another exception or debug event. Report it to GDB. */
cpu->exception_index = EXCP_DEBUG;
}
ret = 1;
break;
case WHvRunVpExitReasonNone:
case WHvRunVpExitReasonUnrecoverableException:
case WHvRunVpExitReasonInvalidVpRegisterValue:
case WHvRunVpExitReasonUnsupportedFeature:
default:
error_report("WHPX: Unexpected VP exit code %d",
vcpu->exit_ctx.ExitReason);
whpx_get_registers(cpu);
qemu_mutex_lock_iothread();
qemu_system_guest_panicked(cpu_get_crash_info(cpu));
qemu_mutex_unlock_iothread();
break;
}
} while (!ret);
if (stepped_over_bp) {
/* Restore the breakpoint we stepped over */
cpu_memory_rw_debug(cpu,
stepped_over_bp->address,
(void *)&whpx_breakpoint_instruction,
1,
true);
}
if (exclusive_step_mode != WHPX_STEP_NONE) {
g_assert(cpu_in_exclusive_context(cpu));
cpu->running = false;
end_exclusive();
exclusive_step_mode = WHPX_STEP_NONE;
} else {
cpu_exec_end(cpu);
}
qemu_mutex_lock_iothread();
current_cpu = cpu;
if (--whpx->running_cpus == 0) {
whpx_last_vcpu_stopping(cpu);
}
qatomic_set(&cpu->exit_request, false);
return ret < 0;
}
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
if (!cpu->vcpu_dirty) {
whpx_get_registers(cpu);
cpu->vcpu_dirty = true;
}
}
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
run_on_cpu_data arg)
{
whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
cpu->vcpu_dirty = false;
}
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
run_on_cpu_data arg)
{
whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
cpu->vcpu_dirty = false;
}
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
run_on_cpu_data arg)
{
cpu->vcpu_dirty = true;
}
/*
* CPU support.
*/
void whpx_cpu_synchronize_state(CPUState *cpu)
{
if (!cpu->vcpu_dirty) {
run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
}
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
whpx_global.step_pending = step_pending;
}
/*
* Vcpu support.
*/
static Error *whpx_migration_blocker;
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
CPUX86State *env = opaque;
if (running) {
env->tsc_valid = false;
}
}
int whpx_init_vcpu(CPUState *cpu)
{
HRESULT hr;
struct whpx_state *whpx = &whpx_global;
struct whpx_vcpu *vcpu = NULL;
Error *local_error = NULL;
CPUX86State *env = cpu->env_ptr;
X86CPU *x86_cpu = X86_CPU(cpu);
UINT64 freq = 0;
int ret;
/* Add migration blockers for all unsupported features of the
* Windows Hypervisor Platform
*/
if (whpx_migration_blocker == NULL) {
error_setg(&whpx_migration_blocker,
"State blocked due to non-migratable CPUID feature support,"
"dirty memory tracking support, and XSAVE/XRSTOR support");
if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
error_report_err(local_error);
error_free(whpx_migration_blocker);
ret = -EINVAL;
goto error;
}
}
vcpu = g_new0(struct whpx_vcpu, 1);
if (!vcpu) {
error_report("WHPX: Failed to allocte VCPU context.");
ret = -ENOMEM;
goto error;
}
hr = whp_dispatch.WHvEmulatorCreateEmulator(
&whpx_emu_callbacks,
&vcpu->emulator);
if (FAILED(hr)) {
error_report("WHPX: Failed to setup instruction completion support,"
" hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvCreateVirtualProcessor(
whpx->partition, cpu->cpu_index, 0);
if (FAILED(hr)) {
error_report("WHPX: Failed to create a virtual processor,"
" hr=%08lx", hr);
whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
ret = -EINVAL;
goto error;
}
/*
* vcpu's TSC frequency is either specified by user, or use the value
* provided by Hyper-V if the former is not present. In the latter case, we
* query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
* frequency can be migrated later via this field.
*/
if (!env->tsc_khz) {
hr = whp_dispatch.WHvGetCapability(
WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
NULL);
if (hr != WHV_E_UNKNOWN_CAPABILITY) {
if (FAILED(hr)) {
printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
} else {
env->tsc_khz = freq / 1000; /* Hz to KHz */
}
}
}
env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
hr = whp_dispatch.WHvGetCapability(
WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
if (hr != WHV_E_UNKNOWN_CAPABILITY) {
if (FAILED(hr)) {
printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
} else {
env->apic_bus_freq = freq;
}
}
/*
* If the vmware cpuid frequency leaf option is set, and we have a valid
* tsc value, trap the corresponding cpuid's.
*/
if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeCpuidExitList,
cpuidExitList,
RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
if (FAILED(hr)) {
error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
hr);
ret = -EINVAL;
goto error;
}
}
vcpu->interruptable = true;
cpu->vcpu_dirty = true;
cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
return 0;
error:
g_free(vcpu);
return ret;
}
int whpx_vcpu_exec(CPUState *cpu)
{
int ret;
int fatal;
for (;;) {
if (cpu->exception_index >= EXCP_INTERRUPT) {
ret = cpu->exception_index;
cpu->exception_index = -1;
break;
}
fatal = whpx_vcpu_run(cpu);
if (fatal) {
error_report("WHPX: Failed to exec a virtual processor");
abort();
}
}
return ret;
}
void whpx_destroy_vcpu(CPUState *cpu)
{
struct whpx_state *whpx = &whpx_global;
struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
g_free(cpu->hax_vcpu);
return;
}
void whpx_vcpu_kick(CPUState *cpu)
{
struct whpx_state *whpx = &whpx_global;
whp_dispatch.WHvCancelRunVirtualProcessor(
whpx->partition, cpu->cpu_index, 0);
}
/*
* Memory support.
*/
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
void *host_va, int add, int rom,
const char *name)
{
struct whpx_state *whpx = &whpx_global;
HRESULT hr;
/*
if (add) {
printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
(void*)start_pa, (void*)size, host_va,
(rom ? "ROM" : "RAM"), name);
} else {
printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
(void*)start_pa, (void*)size, host_va, name);
}
*/
if (add) {
hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
host_va,
start_pa,
size,
(WHvMapGpaRangeFlagRead |
WHvMapGpaRangeFlagExecute |
(rom ? 0 : WHvMapGpaRangeFlagWrite)));
} else {
hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
start_pa,
size);
}
if (FAILED(hr)) {
error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
" Host:%p, hr=%08lx",
(add ? "MAP" : "UNMAP"), name,
(void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
}
}
static void whpx_process_section(MemoryRegionSection *section, int add)
{
MemoryRegion *mr = section->mr;
hwaddr start_pa = section->offset_within_address_space;
ram_addr_t size = int128_get64(section->size);
unsigned int delta;
uint64_t host_va;
if (!memory_region_is_ram(mr)) {
return;
}
delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
delta &= ~qemu_real_host_page_mask();
if (delta > size) {
return;
}
start_pa += delta;
size -= delta;
size &= qemu_real_host_page_mask();
if (!size || (start_pa & ~qemu_real_host_page_mask())) {
return;
}
host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
+ section->offset_within_region + delta;
whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
memory_region_is_rom(mr), mr->name);
}
static void whpx_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
memory_region_ref(section->mr);
whpx_process_section(section, 1);
}
static void whpx_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
whpx_process_section(section, 0);
memory_region_unref(section->mr);
}
static void whpx_transaction_begin(MemoryListener *listener)
{
}
static void whpx_transaction_commit(MemoryListener *listener)
{
}
static void whpx_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
MemoryRegion *mr = section->mr;
if (!memory_region_is_ram(mr)) {
return;
}
memory_region_set_dirty(mr, 0, int128_get64(section->size));
}
static MemoryListener whpx_memory_listener = {
.name = "whpx",
.begin = whpx_transaction_begin,
.commit = whpx_transaction_commit,
.region_add = whpx_region_add,
.region_del = whpx_region_del,
.log_sync = whpx_log_sync,
.priority = 10,
};
static void whpx_memory_init(void)
{
memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
/*
* Load the functions from the given library, using the given handle. If a
* handle is provided, it is used, otherwise the library is opened. The
* handle will be updated on return with the opened one.
*/
static bool load_whp_dispatch_fns(HMODULE *handle,
WHPFunctionList function_list)
{
HMODULE hLib = *handle;
#define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
#define WINHV_EMULATION_DLL "WinHvEmulation.dll"
#define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
whp_dispatch.function_name = \
(function_name ## _t)GetProcAddress(hLib, #function_name); \
#define WHP_LOAD_FIELD(return_type, function_name, signature) \
whp_dispatch.function_name = \
(function_name ## _t)GetProcAddress(hLib, #function_name); \
if (!whp_dispatch.function_name) { \
error_report("Could not load function %s", #function_name); \
goto error; \
} \
#define WHP_LOAD_LIB(lib_name, handle_lib) \
if (!handle_lib) { \
handle_lib = LoadLibrary(lib_name); \
if (!handle_lib) { \
error_report("Could not load library %s.", lib_name); \
goto error; \
} \
} \
switch (function_list) {
case WINHV_PLATFORM_FNS_DEFAULT:
WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
break;
case WINHV_EMULATION_FNS_DEFAULT:
WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
break;
case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
break;
}
*handle = hLib;
return true;
error:
if (hLib) {
FreeLibrary(hLib);
}
return false;
}
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
struct whpx_state *whpx = &whpx_global;
OnOffSplit mode;
if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
return;
}
switch (mode) {
case ON_OFF_SPLIT_ON:
whpx->kernel_irqchip_allowed = true;
whpx->kernel_irqchip_required = true;
break;
case ON_OFF_SPLIT_OFF:
whpx->kernel_irqchip_allowed = false;
whpx->kernel_irqchip_required = false;
break;
case ON_OFF_SPLIT_SPLIT:
error_setg(errp, "WHPX: split irqchip currently not supported");
error_append_hint(errp,
"Try without kernel-irqchip or with kernel-irqchip=on|off");
break;
default:
/*
* The value was checked in visit_type_OnOffSplit() above. If
* we get here, then something is wrong in QEMU.
*/
abort();
}
}
/*
* Partition support
*/
static int whpx_accel_init(MachineState *ms)
{
struct whpx_state *whpx;
int ret;
HRESULT hr;
WHV_CAPABILITY whpx_cap;
UINT32 whpx_cap_size;
WHV_PARTITION_PROPERTY prop;
UINT32 cpuidExitList[] = {1, 0x80000001};
WHV_CAPABILITY_FEATURES features = {0};
whpx = &whpx_global;
if (!init_whp_dispatch()) {
ret = -ENOSYS;
goto error;
}
whpx->mem_quota = ms->ram_size;
hr = whp_dispatch.WHvGetCapability(
WHvCapabilityCodeHypervisorPresent, &whpx_cap,
sizeof(whpx_cap), &whpx_cap_size);
if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
error_report("WHPX: No accelerator found, hr=%08lx", hr);
ret = -ENOSPC;
goto error;
}
hr = whp_dispatch.WHvGetCapability(
WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
if (FAILED(hr)) {
error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
if (FAILED(hr)) {
error_report("WHPX: Failed to create partition, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
/*
* Query the XSAVE capability of the partition. Any error here is not
* considered fatal.
*/
hr = whp_dispatch.WHvGetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeProcessorXsaveFeatures,
&whpx_xsave_cap,
sizeof(whpx_xsave_cap),
&whpx_cap_size);
/*
* Windows version which don't support this property will return with the
* specific error code.
*/
if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
}
if (!whpx_has_xsave()) {
printf("WHPX: Partition is not XSAVE capable\n");
}
memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
prop.ProcessorCount = ms->smp.cpus;
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeProcessorCount,
&prop,
sizeof(WHV_PARTITION_PROPERTY));
if (FAILED(hr)) {
error_report("WHPX: Failed to set partition core count to %d,"
" hr=%08lx", ms->smp.cores, hr);
ret = -EINVAL;
goto error;
}
/*
* Error out if WHP doesn't support apic emulation and user is requiring
* it.
*/
if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
!whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
error_report("WHPX: kernel irqchip requested, but unavailable. "
"Try without kernel-irqchip or with kernel-irqchip=off");
ret = -EINVAL;
goto error;
}
if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
WHvX64LocalApicEmulationModeXApic;
printf("WHPX: setting APIC emulation mode in the hypervisor\n");
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeLocalApicEmulationMode,
&mode,
sizeof(mode));
if (FAILED(hr)) {
error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
if (whpx->kernel_irqchip_required) {
error_report("WHPX: kernel irqchip requested, but unavailable");
ret = -EINVAL;
goto error;
}
} else {
whpx->apic_in_platform = true;
}
}
/* Register for MSR and CPUID exits */
memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
prop.ExtendedVmExits.X64MsrExit = 1;
prop.ExtendedVmExits.X64CpuidExit = 1;
prop.ExtendedVmExits.ExceptionExit = 1;
if (whpx_apic_in_platform()) {
prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
}
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeExtendedVmExits,
&prop,
sizeof(WHV_PARTITION_PROPERTY));
if (FAILED(hr)) {
error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeCpuidExitList,
cpuidExitList,
RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
if (FAILED(hr)) {
error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
hr);
ret = -EINVAL;
goto error;
}
/*
* We do not want to intercept any exceptions from the guest,
* until we actually start debugging with gdb.
*/
whpx->exception_exit_bitmap = -1;
hr = whpx_set_exception_exit_bitmap(0);
if (FAILED(hr)) {
error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvSetupPartition(whpx->partition);
if (FAILED(hr)) {
error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
whpx_memory_init();
printf("Windows Hypervisor Platform accelerator is operational\n");
return 0;
error:
if (NULL != whpx->partition) {
whp_dispatch.WHvDeletePartition(whpx->partition);
whpx->partition = NULL;
}
return ret;
}
int whpx_enabled(void)
{
return whpx_allowed;
}
bool whpx_apic_in_platform(void) {
return whpx_global.apic_in_platform;
}
static void whpx_accel_class_init(ObjectClass *oc, void *data)
{
AccelClass *ac = ACCEL_CLASS(oc);
ac->name = "WHPX";
ac->init_machine = whpx_accel_init;
ac->allowed = &whpx_allowed;
object_class_property_add(oc, "kernel-irqchip", "on|off|split",
NULL, whpx_set_kernel_irqchip,
NULL, NULL);
object_class_property_set_description(oc, "kernel-irqchip",
"Configure WHPX in-kernel irqchip");
}
static void whpx_accel_instance_init(Object *obj)
{
struct whpx_state *whpx = &whpx_global;
memset(whpx, 0, sizeof(struct whpx_state));
/* Turn on kernel-irqchip, by default */
whpx->kernel_irqchip_allowed = true;
}
static const TypeInfo whpx_accel_type = {
.name = ACCEL_CLASS_NAME("whpx"),
.parent = TYPE_ACCEL,
.instance_init = whpx_accel_instance_init,
.class_init = whpx_accel_class_init,
};
static void whpx_type_init(void)
{
type_register_static(&whpx_accel_type);
}
bool init_whp_dispatch(void)
{
if (whp_dispatch_initialized) {
return true;
}
if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
goto error;
}
if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
goto error;
}
assert(load_whp_dispatch_fns(&hWinHvPlatform,
WINHV_PLATFORM_FNS_SUPPLEMENTAL));
whp_dispatch_initialized = true;
return true;
error:
if (hWinHvPlatform) {
FreeLibrary(hWinHvPlatform);
}
if (hWinHvEmulation) {
FreeLibrary(hWinHvEmulation);
}
return false;
}
type_init(whpx_type_init);