blob: e01dc58c4709ea1d06def2daf5d3b7489576a3d2 [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <bits.h>
#include <lib/arch/x86/boot-cpuid.h>
#include <lib/arch/x86/speculation.h>
#include <lib/boot-options/boot-options.h>
#include <lib/fit/defer.h>
#include <lib/ktrace.h>
#include <zircon/syscalls/hypervisor.h>
#include <new>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/hypervisor/invalidate.h>
#include <arch/x86/platform_access.h>
#include <arch/x86/pv.h>
#include <hwreg/x86msr.h>
#include <hypervisor/cpu.h>
#include <hypervisor/ktrace.h>
#include <kernel/percpu.h>
#include <kernel/stats.h>
#include <vm/fault.h>
#include <vm/pmm.h>
#include <vm/vm_object.h>
#include "pv_priv.h"
#include "vcpu_priv.h"
#include "vmexit_priv.h"
#include "vmx_cpu_state_priv.h"
namespace {
// Bit layout of the 32-bit interruption-information word assembled by
// AutoVmcs::IssueInterrupt: bit 31 marks the word as valid, bit 11 requests
// delivery of an error code, and bits 10:8 carry the event type.
constexpr uint32_t kInterruptInfoValid = 1u << 31;
constexpr uint32_t kInterruptInfoDeliverErrorCode = 1u << 11;
// Event-type encodings (already shifted into bits 10:8).
constexpr uint32_t kInterruptTypeNmi = 2u << 8;
constexpr uint32_t kInterruptTypeHardwareException = 3u << 8;
constexpr uint32_t kInterruptTypeSoftwareException = 6u << 8;
// VPID value that vmcs_init compares against to recognise the base processor.
constexpr uint16_t kBaseProcessorVpid = 1;
// Executes VMPTRLD on the VMCS region at physical address `pa`.
//
// The instruction signals failure through the C or Z flag, which the asm
// statement captures in `err`. The flag is asserted on so that a failed load is
// not silently ignored.
// NOTE(review): the assertion uses DEBUG_ASSERT because that is the assertion
// macro visible elsewhere in this file; switch to an always-on assert if that
// is the project convention for VMX instruction failures.
void vmptrld(paddr_t pa) {
  uint8_t err;
  __asm__ __volatile__("vmptrld %[pa]"
                       : "=@ccna"(err)  // Set `err` on error (C or Z flag set)
                       : [pa] "m"(pa)
                       : "cc", "memory");
  DEBUG_ASSERT(!err);
}
// Executes VMCLEAR on the VMCS region at physical address `pa`.
//
// The instruction signals failure through the C or Z flag, which the asm
// statement captures in `err`. The flag is asserted on so that a failed clear
// is not silently ignored.
// NOTE(review): the assertion uses DEBUG_ASSERT because that is the assertion
// macro visible elsewhere in this file; switch to an always-on assert if that
// is the project convention for VMX instruction failures.
void vmclear(paddr_t pa) {
  uint8_t err;
  __asm__ __volatile__("vmclear %[pa]"
                       : "=@ccna"(err)  // Set `err` on error (C or Z flag set)
                       : [pa] "m"(pa)
                       : "cc", "memory");
  DEBUG_ASSERT(!err);
}
uint64_t vmread(uint64_t field) {
uint8_t err;
uint64_t val;
__asm__ __volatile__("vmread %[field], %[val]"
: [val] "=r"(val),
"=@ccna"(err) // Set `err` on error (C or Z flag set)
: [field] "r"(field)
: "cc");
return val;
void vmwrite(uint64_t field, uint64_t val) {
uint8_t err;
__asm__ __volatile__("vmwrite %[val], %[field]"
: "=@ccna"(err) // Set `err` on error (C or Z flag set)
: [val] "r"(val), [field] "r"(field)
: "cc");
// Returns true when injecting `vector` into the guest must also deliver an
// error code.
// NOTE(review): only the general-protection fault is listed in the visible
// text. Other exception vectors architecturally push an error code as well, so
// confirm whether further `case` labels were lost from this copy.
bool has_error_code(uint32_t vector) {
  switch (vector) {
    case X86_INT_GP_FAULT:
      return true;
    default:
      return false;
  }
}
// When `save` is set, stores the current XCR0 into `save_xcr0`; when `load` is
// set, programs XCR0 with `load_xcr0`.
// NOTE(review): neither register-buffer pointer is referenced and no closing
// braces appear in the visible text, so lines seem to be missing from this
// copy (presumably the extended-register save/restore calls). Confirm against
// the upstream file before relying on this function.
void swap_extended_registers(uint8_t* save_extended_registers, uint64_t& save_xcr0, bool save,
uint8_t* load_extended_registers, uint64_t& load_xcr0, bool load) {
if (save) {
save_xcr0 = x86_xgetbv(0);
if (load) {
x86_xsetbv(0, load_xcr0);
// Copies the fifteen general-purpose registers that are not tracked in the
// VMCS (everything except RSP) from `in` to `out`.
//
// `Out` and `In` may be different types; each only needs members named rax,
// rcx, rdx, rbx, rbp, rsi, rdi and r8..r15 that are assignable from one
// another. Any other members of `out` are left untouched.
template <typename Out, typename In>
void register_copy(Out& out, const In& in) {
  out.rax = in.rax;
  out.rcx = in.rcx;
  out.rdx = in.rdx;
  out.rbx = in.rbx;
  out.rbp = in.rbp;
  out.rsi = in.rsi;
  out.rdi = in.rdi;
  out.r8 = in.r8;
  out.r9 = in.r9;
  out.r10 = in.r10;
  out.r11 = in.r11;
  out.r12 = in.r12;
  out.r13 = in.r13;
  out.r14 = in.r14;
  out.r15 = in.r15;
}
// Programs the VMCS behind `vmcs` for a new VCPU: the secondary, pin-based,
// primary, exit and entry controls; the page-fault mask/match, VPID, EPT
// pointer and MSR-bitmap fields; the host state captured from the current CPU;
// and the initial guest state.
//
// `vpid` is compared with kBaseProcessorVpid (when `config.has_base_processor`
// is set) to decide whether the guest starts with protected mode, paging and
// 64-bit mode enabled and `entry` in GUEST_RIP, or without them and with
// `entry` encoded through the CS base/selector.
//
// Returns the first SetControl error, or ZX_ERR_BAD_STATE when the initial
// CR0/CR4 value fails its validity check.
//
// NOTE(review): in the visible text several statements are cut short (for
// example the pin-based and exit control expressions, the `cr_ctls`
// initializer, and the stray access-rights continuation lines) and no closing
// braces are present. The code below is reproduced unchanged; restore the
// missing lines from the upstream file before building.
zx::result<> vmcs_init(AutoVmcs& vmcs, const VcpuConfig& config, uint16_t vpid, uintptr_t entry,
paddr_t msr_bitmaps_address, paddr_t ept_pml4, VmxState* vmx_state,
uint8_t* extended_register_state) {
// Setup secondary processor-based VMCS controls.
auto result =
vmcs.SetControl(VmcsField32::PROCBASED_CTLS2, read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2), 0,
// Enable use of extended page tables.
kProcbasedCtls2Ept |
// Enable use of RDTSCP instruction.
kProcbasedCtls2Rdtscp |
// Enable X2APIC.
kProcbasedCtls2x2Apic |
// Associate cached translations of linear
// addresses with a virtual processor ID.
kProcbasedCtls2Vpid |
// If `unrestricted`, enable unrestricted guest.
(config.unrestricted ? kProcbasedCtls2UnrestrictedGuest : 0),
// If not `unrestricted`, disable unrestricted guest.
(config.unrestricted ? 0 : kProcbasedCtls2UnrestrictedGuest));
if (result.is_error()) {
return result;
// Enable use of INVPCID instruction if available.
std::ignore =
vmcs.SetControl(VmcsField32::PROCBASED_CTLS2, read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2),
vmcs.Read(VmcsField32::PROCBASED_CTLS2), kProcbasedCtls2Invpcid, 0);
// Setup pin-based VMCS controls.
result =
vmcs.SetControl(VmcsField32::PINBASED_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS),
// External interrupts cause a VM exit.
kPinbasedCtlsExtIntExiting |
// Non-maskable interrupts cause a VM exit.
if (result.is_error()) {
return result;
const uint32_t cr_ctls =
// VM exit on CR3 load.
kProcbasedCtlsCr3LoadExiting |
// VM exit on CR3 store.
kProcbasedCtlsCr3StoreExiting |
// VM exit on CR8 load.
kProcbasedCtlsCr8LoadExiting |
// VM exit on CR8 store.
// Setup primary processor-based VMCS controls.
result =
vmcs.SetControl(VmcsField32::PROCBASED_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS),
// Enable VM exit when interrupts are enabled.
kProcbasedCtlsIntWindowExiting |
// Enable VM exit on HLT instruction.
kProcbasedCtlsHltExiting |
// Enable TPR virtualization.
kProcbasedCtlsTprShadow |
// Enable VM exit on IO instructions.
kProcbasedCtlsIoExiting |
// Enable use of MSR bitmaps.
kProcbasedCtlsMsrBitmaps |
// Enable secondary processor-based controls.
kProcbasedCtlsProcbasedCtls2 |
// If `cr_exiting`, enable VM exit on CRs.
(config.cr_exiting ? cr_ctls : 0),
// If not `cr_exiting`, disable VM exit on CRs.
(config.cr_exiting ? 0 : cr_ctls));
if (result.is_error()) {
return result;
// We only enable interrupt-window exiting above to ensure that the
// processor supports it for later use. So disable it for now.
// Setup VM-exit VMCS controls.
result = vmcs.SetControl(VmcsField32::EXIT_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS),
// Logical processor is in 64-bit mode after VM
// exit. On VM exit CS.L, IA32_EFER.LME, and
// IA32_EFER.LMA is set to true.
kExitCtls64bitMode |
// Acknowledge external interrupt on exit.
kExitCtlsAckIntOnExit |
// Save the guest IA32_PAT MSR on exit.
kExitCtlsSaveIa32Pat |
// Load the host IA32_PAT MSR on exit.
kExitCtlsLoadIa32Pat |
// Save the guest IA32_EFER MSR on exit.
kExitCtlsSaveIa32Efer |
// Load the host IA32_EFER MSR on exit.
if (result.is_error()) {
return result;
// Whether we are configuring the base processor. The base processor starts in
// 64-bit mode with all features enabled. For secondary processors, they must
// be bootstrapped by the operating system.
// If there is no base processor for this VCPU type, then default to true.
const bool is_base_processor = config.has_base_processor ? vpid == kBaseProcessorVpid : true;
// Setup VM-entry VMCS controls.
// Load the guest IA32_PAT MSR and IA32_EFER MSR on entry.
uint32_t entry_ctls = kEntryCtlsLoadIa32Pat | kEntryCtlsLoadIa32Efer;
if (is_base_processor) {
// On the BSP, go straight to 64-bit mode on entry.
entry_ctls |= kEntryCtls64bitMode;
result = vmcs.SetControl(VmcsField32::ENTRY_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS),
read_msr(X86_MSR_IA32_VMX_ENTRY_CTLS), entry_ctls, 0);
if (result.is_error()) {
return result;
// From Volume 3, Section 24.6.3: The exception bitmap is a 32-bit field
// that contains one bit for each exception. When an exception occurs,
// its vector is used to select a bit in this field. If the bit is 1,
// the exception causes a VM exit. If the bit is 0, the exception is
// delivered normally through the IDT, using the descriptor
// corresponding to the exception’s vector.
// From Volume 3, Section 25.2: If software desires VM exits on all page
// faults, it can set bit 14 in the exception bitmap to 1 and set the
// page-fault error-code mask and match fields each to 00000000H.
vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MASK, 0);
vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MATCH, 0);
// From Volume 3, Section 28.1: Virtual-processor identifiers (VPIDs)
// introduce to VMX operation a facility by which a logical processor may
// cache information for multiple linear-address spaces. When VPIDs are
// used, VMX transitions may retain cached information and the logical
// processor switches to a different linear-address space.
// From Volume 3, Section If the “enable VPID” VM-execution
// control is 1, the value of the VPID VM-execution control field must not
// be 0000H.
// From Volume 3, Section If EPT is in use, the logical processor
// associates all mappings it creates with the value of bits 51:12 of
// current EPTP. If a VMM uses different EPTP values for different guests,
// it may use the same VPID for those guests.
// From Volume 3, Section Operations that architecturally
// invalidate entries in the TLBs or paging-structure caches independent of
// VMX operation (e.g., the INVLPG and INVPCID instructions) invalidate
// linear mappings and combined mappings. They are required to do so only
// for the current VPID (but, for combined mappings, all EP4TAs). Linear
// mappings for the current VPID are invalidated even if EPT is in use.
// Combined mappings for the current VPID are invalidated even if EPT is
// not in use.
vmcs.Write(VmcsField16::VPID, vpid);
invvpid(InvVpid::SINGLE_CONTEXT, vpid, 0);
// From Volume 3, Section 28.2: The extended page-table mechanism (EPT) is a
// feature that can be used to support the virtualization of physical
// memory. When EPT is in use, certain addresses that would normally be
// treated as physical addresses (and used to access memory) are instead
// treated as guest-physical addresses. Guest-physical addresses are
// translated by traversing a set of EPT paging structures to produce
// physical addresses that are used to access memory.
const auto eptp = ept_pointer_from_pml4(ept_pml4);
vmcs.Write(VmcsField64::EPT_POINTER, eptp);
// Setup MSR handling.
vmcs.Write(VmcsField64::MSR_BITMAPS_ADDRESS, msr_bitmaps_address);
// Setup VMCS host state.
// NOTE: We are pinned to a thread when executing this function, therefore
// it is acceptable to use per-CPU state.
x86_percpu* percpu = x86_get_percpu();
vmcs.Write(VmcsField32::HOST_IA32_SYSENTER_CS, 0);
vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_ESP, 0);
vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_EIP, 0);
vmcs.Write(VmcsField64::HOST_IA32_PAT, read_msr(X86_MSR_IA32_PAT));
vmcs.Write(VmcsField64::HOST_IA32_EFER, read_msr(X86_MSR_IA32_EFER));
vmcs.Write(VmcsFieldXX::HOST_CR0, x86_get_cr0());
vmcs.Write(VmcsFieldXX::HOST_CR4, x86_get_cr4());
vmcs.Write(VmcsField16::HOST_ES_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_CS_SELECTOR, CODE_64_SELECTOR);
vmcs.Write(VmcsField16::HOST_DS_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_FS_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_GS_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_TR_SELECTOR, TSS_SELECTOR(percpu->cpu_num));
vmcs.Write(VmcsFieldXX::HOST_FS_BASE, read_msr(X86_MSR_IA32_FS_BASE));
vmcs.Write(VmcsFieldXX::HOST_GS_BASE, read_msr(X86_MSR_IA32_GS_BASE));
vmcs.Write(VmcsFieldXX::HOST_TR_BASE, reinterpret_cast<uint64_t>(&percpu->default_tss));
vmcs.Write(VmcsFieldXX::HOST_GDTR_BASE, reinterpret_cast<uint64_t>(gdt_get()));
vmcs.Write(VmcsFieldXX::HOST_IDTR_BASE, reinterpret_cast<uint64_t>(idt_get_readonly()));
vmcs.Write(VmcsFieldXX::HOST_RSP, reinterpret_cast<uint64_t>(vmx_state));
vmcs.Write(VmcsFieldXX::HOST_RIP, reinterpret_cast<uint64_t>(vmx_exit_asm));
// Setup VMCS guest state.
uint64_t cr0 = X86_CR0_ET | // Enable extension type
X86_CR0_NE | // Enable internal x87 exception handling
X86_CR0_WP; // Enable supervisor write protect
if (is_base_processor) {
// Enable protected mode and paging on the primary VCPU.
cr0 |= X86_CR0_PE | // Enable protected mode
X86_CR0_PG; // Enable paging
if (cr0_is_invalid(vmcs, cr0)) {
return zx::error(ZX_ERR_BAD_STATE);
vmcs.Write(VmcsFieldXX::GUEST_CR0, cr0);
uint64_t cr4 = X86_CR4_OSFXSR | X86_CR4_VMXE | X86_CR4_FSGSBASE | X86_CR4_OSXSAVE;
if (is_base_processor) {
// Enable PAE and PGE on the BSP.
cr4 |= X86_CR4_PAE | X86_CR4_PGE;
if (cr_is_invalid(cr4, X86_MSR_IA32_VMX_CR4_FIXED0, X86_MSR_IA32_VMX_CR4_FIXED1)) {
return zx::error(ZX_ERR_BAD_STATE);
vmcs.Write(VmcsFieldXX::GUEST_CR4, cr4);
vmcs.Write(VmcsField64::GUEST_IA32_PAT, read_msr(X86_MSR_IA32_PAT));
uint64_t guest_efer = read_msr(X86_MSR_IA32_EFER);
if (!is_base_processor) {
// Disable LME and LMA on all but the BSP.
guest_efer &= ~(X86_EFER_LME | X86_EFER_LMA);
vmcs.Write(VmcsField64::GUEST_IA32_EFER, guest_efer);
uint32_t cs_access_rights =
kGuestXxAccessRightsDefault | kGuestXxAccessRightsTypeE | kGuestXxAccessRightsTypeCode;
if (is_base_processor) {
// Ensure that the BSP starts with a 64-bit code segment.
cs_access_rights |= kGuestXxAccessRightsL;
vmcs.Write(VmcsField32::GUEST_CS_ACCESS_RIGHTS, cs_access_rights);
kGuestTrAccessRightsTssBusy | kGuestXxAccessRightsP);
vmcs.Write(VmcsField32::GUEST_SS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
vmcs.Write(VmcsField32::GUEST_DS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
vmcs.Write(VmcsField32::GUEST_ES_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
vmcs.Write(VmcsField32::GUEST_FS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
vmcs.Write(VmcsField32::GUEST_GS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
kGuestXxAccessRightsTypeW | kGuestXxAccessRightsP);
if (is_base_processor) {
// Use GUEST_RIP to set the entry point on the BSP.
vmcs.Write(VmcsFieldXX::GUEST_CS_BASE, 0);
vmcs.Write(VmcsField16::GUEST_CS_SELECTOR, 0);
vmcs.Write(VmcsFieldXX::GUEST_RIP, entry);
} else {
// Use CS to set the entry point on APs.
vmcs.Write(VmcsFieldXX::GUEST_CS_BASE, entry);
vmcs.Write(VmcsField16::GUEST_CS_SELECTOR, static_cast<uint16_t>(entry >> 4));
vmcs.Write(VmcsFieldXX::GUEST_RIP, 0);
vmcs.Write(VmcsField32::GUEST_CS_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_TR_BASE, 0);
vmcs.Write(VmcsField16::GUEST_TR_SELECTOR, 0);
vmcs.Write(VmcsField32::GUEST_TR_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_DS_BASE, 0);
vmcs.Write(VmcsField32::GUEST_DS_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_SS_BASE, 0);
vmcs.Write(VmcsField32::GUEST_SS_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_ES_BASE, 0);
vmcs.Write(VmcsField32::GUEST_ES_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_FS_BASE, 0);
vmcs.Write(VmcsField32::GUEST_FS_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_GS_BASE, 0);
vmcs.Write(VmcsField32::GUEST_GS_LIMIT, 0xffff);
vmcs.Write(VmcsField32::GUEST_LDTR_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_GDTR_BASE, 0);
vmcs.Write(VmcsField32::GUEST_GDTR_LIMIT, 0xffff);
vmcs.Write(VmcsFieldXX::GUEST_IDTR_BASE, 0);
vmcs.Write(VmcsField32::GUEST_IDTR_LIMIT, 0xffff);
// Set all reserved RFLAGS bits to their correct values
vmcs.Write(VmcsField32::GUEST_ACTIVITY_STATE, 0);
// From Volume 3, Section The IA32_SYSENTER_ESP field and the
// IA32_SYSENTER_EIP field must each contain a canonical address.
vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_ESP, 0);
vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_EIP, 0);
vmcs.Write(VmcsField32::GUEST_IA32_SYSENTER_CS, 0);
vmcs.Write(VmcsFieldXX::GUEST_RSP, 0);
// From Volume 3, Section 24.4.2: If the “VMCS shadowing” VM-execution
// control is 1, the VMREAD and VMWRITE instructions access the VMCS
// referenced by this pointer (see Section 24.10). Otherwise, software
// should set this field to FFFFFFFF_FFFFFFFFH to avoid VM-entry
// failures (see Section
vmcs.Write(VmcsField64::LINK_POINTER, kLinkPointerInvalidate);
if (x86_xsave_supported()) {
// Set initial guest XCR0 to host XCR0.
vmx_state->host_state.xcr0 = x86_xgetbv(0);
vmx_state->guest_state.xcr0 =
x86_extended_register_init_state_from_bv(extended_register_state, vmx_state->guest_state.xcr0);
return zx::ok();
// Injects an interrupt into the guest, if there is one pending.
//
// An NMI is tried first via TryPop(X86_INT_NMI); otherwise Pop selects the
// vector. Returns ZX_OK on every path visible here, including when nothing is
// pending and when the chosen vector cannot be injected yet.
// NOTE(review): the two lambdas and several branches are cut short in the
// visible text (no closing braces, a missing operand before the STI/MOV-SS
// mask, and no injection call), so the code is reproduced unchanged. Restore
// the missing lines from the upstream file before building.
zx_status_t local_apic_maybe_interrupt(AutoVmcs* vmcs, LocalApicState* local_apic_state) {
// Since hardware generated exceptions are delivered to the guest directly,
// the only exceptions we see here are those we generate in the VMM, e.g. GP
// faults in vmexit handlers. Therefore we simplify interrupt priority to 1)
// NMIs, 2) interrupts, and 3) generated exceptions. See Volume 3, Section
// 6.9, Table 6-2.
uint32_t vector = X86_INT_COUNT;
bool pending = local_apic_state->interrupt_tracker.TryPop(X86_INT_NMI);
if (pending) {
vector = X86_INT_NMI;
} else {
// Pop scans vectors from highest to lowest, which will correctly pop
// interrupts before exceptions. All vectors <= X86_INT_VIRT except the NMI
// vector are exceptions.
pending = local_apic_state->interrupt_tracker.Pop(&vector);
if (!pending) {
return ZX_OK;
// If type isn't inactive, then Pop should have initialized vector to a
// valid value.
// NMI injection is blocked if an NMI is already being serviced (Volume 3,
// Section 24.4.2, Table 24-3), and mov ss blocks *all* interrupts (Volume 2
// Section 4.3 MOV-Move instruction). Note that the IF flag does not affect
// NMIs (Volume 3, Section 6.8.1).
auto can_inject_nmi = [vmcs] {
return (vmcs->Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE) &
(kInterruptibilityNmiBlocking | kInterruptibilityMovSsBlocking)) == 0;
// External interrupts can be blocked due to STI, move SS or the IF flag.
auto can_inject_external_int = [vmcs] {
return (vmcs->Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_IF) &&
(kInterruptibilityStiBlocking | kInterruptibilityMovSsBlocking)) == 0;
if (vector > X86_INT_VIRT && vector < X86_INT_PLATFORM_BASE) {
dprintf(INFO, "Invalid interrupt vector: %u\n", vector);
} else if ((vector >= X86_INT_PLATFORM_BASE && !can_inject_external_int()) ||
(vector == X86_INT_NMI && !can_inject_nmi())) {
// If interrupts are disabled, we set VM exit on interrupt enable.
return ZX_OK;
// If the vector is non-maskable or interrupts are enabled, inject interrupt.
// Volume 3, Section 6.9: Lower priority exceptions are discarded; lower
// priority interrupts are held pending. Discarded exceptions are re-generated
// when the interrupt handler returns execution to the point in the program or
// task where the exceptions and/or interrupts occurred.
local_apic_state->interrupt_tracker.Clear(0, X86_INT_NMI);
local_apic_state->interrupt_tracker.Clear(X86_INT_NMI + 1, X86_INT_VIRT + 1);
return ZX_OK;
// Sends an IPI to `last_cpu` when `thread` is non-null, currently in the
// THREAD_RUNNING state, and `last_cpu` is a valid CPU; otherwise does nothing.
// The caller must hold the thread lock (see the TA_REQ annotation).
void interrupt_cpu(Thread* thread, cpu_num_t last_cpu) TA_REQ(ThreadLock::Get()) {
  // Check if the VCPU is running and whether to send an IPI. We hold the thread
  // lock to guard against thread migration between CPUs during the check.
  //
  // NOTE: `last_cpu` may be currently set to `INVALID_CPU` due to thread
  // migration between CPUs.
  if (thread != nullptr && thread->state() == THREAD_RUNNING && last_cpu != INVALID_CPU) {
    mp_interrupt(MP_IPI_TARGET_MASK, cpu_num_to_mask(last_cpu));
  }
}
} // namespace
// Records the physical address of the VMCS and saves the current interrupt
// state through arch_interrupt_save() into `int_state_`.
// NOTE(review): the body of `if (clear)` and everything after it (including
// the closing braces) is absent from the visible text; presumably the VMCS is
// cleared and loaded here. Confirm against the upstream file.
AutoVmcs::AutoVmcs(paddr_t vmcs_address, bool clear) : vmcs_address_(vmcs_address) {
int_state_ = arch_interrupt_save();
if (clear) {
// Destructor: acts only while the object still refers to a VMCS
// (`vmcs_address_ != 0`).
// NOTE(review): the guarded body and closing braces are absent from the
// visible text; presumably the interrupt state saved by the constructor is
// restored here. Confirm against the upstream file.
AutoVmcs::~AutoVmcs() {
if (vmcs_address_ != 0) {
// Detaches this object from its VMCS by zeroing `vmcs_address_`, after which
// the accessors' `vmcs_address_ != 0` assertions no longer hold.
// NOTE(review): closing braces are absent and further statements may be
// missing from the visible text. Confirm against the upstream file.
void AutoVmcs::Invalidate() {
if (vmcs_address_ != 0) {
vmcs_address_ = 0;
// Sets (`enable == true`) or clears (`enable == false`) the interrupt-window
// exiting bit in the primary processor-based controls, using a
// read-modify-write of the PROCBASED_CTLS field. Requires a valid VMCS.
void AutoVmcs::InterruptWindowExiting(bool enable) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  uint32_t controls = Read(VmcsField32::PROCBASED_CTLS);
  if (enable) {
    controls |= kProcbasedCtlsIntWindowExiting;
  } else {
    controls &= ~kProcbasedCtlsIntWindowExiting;
  }
  Write(VmcsField32::PROCBASED_CTLS, controls);
}
// Queues `vector` for injection on the next VM entry by writing the
// interruption-information field.
//
// The low 8 bits of `vector` are combined with the valid bit and an event
// type: software exception for breakpoint/overflow, NMI for the NMI vector,
// hardware exception for every other vector up to X86_INT_VIRT, and no type
// bits for anything above that. The deliver-error-code bit is added when
// has_error_code(vector) is true. Asserts that no injection is already
// pending.
// NOTE(review): no error-code value is written in the visible text. Confirm
// whether a write of the entry error-code field was lost from this copy.
void AutoVmcs::IssueInterrupt(uint32_t vector) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  uint32_t interrupt_info = kInterruptInfoValid | (vector & UINT8_MAX);
  if (vector == X86_INT_BREAKPOINT || vector == X86_INT_OVERFLOW) {
    // From Volume 3, Section 24.8.3. A VMM should use type hardware exception for all
    // exceptions other than breakpoints and overflows, which should be software exceptions.
    interrupt_info |= kInterruptTypeSoftwareException;
  } else if (vector == X86_INT_NMI) {
    interrupt_info |= kInterruptTypeNmi;
  } else if (vector <= X86_INT_VIRT) {
    // From Volume 3, Section 6.15. All other vectors from 0 to X86_INT_VIRT are exceptions.
    interrupt_info |= kInterruptTypeHardwareException;
  }
  if (has_error_code(vector)) {
    interrupt_info |= kInterruptInfoDeliverErrorCode;
  }

  DEBUG_ASSERT((Read(VmcsField32::ENTRY_INTERRUPTION_INFORMATION) & kInterruptInfoValid) == 0);
  Write(VmcsField32::ENTRY_INTERRUPTION_INFORMATION, interrupt_info);
}
// Reads a 16-bit VMCS field; the 64-bit VMREAD result is truncated to 16 bits.
// Requires a valid VMCS.
uint16_t AutoVmcs::Read(VmcsField16 field) const {
  DEBUG_ASSERT(vmcs_address_ != 0);
  return static_cast<uint16_t>(vmread(static_cast<uint64_t>(field)));
}
// Reads a 32-bit VMCS field; the 64-bit VMREAD result is truncated to 32 bits.
// Requires a valid VMCS.
uint32_t AutoVmcs::Read(VmcsField32 field) const {
  DEBUG_ASSERT(vmcs_address_ != 0);
  return static_cast<uint32_t>(vmread(static_cast<uint64_t>(field)));
}
// Reads a 64-bit VMCS field. Requires a valid VMCS.
uint64_t AutoVmcs::Read(VmcsField64 field) const {
  DEBUG_ASSERT(vmcs_address_ != 0);
  return vmread(static_cast<uint64_t>(field));
}
// Reads a VmcsFieldXX VMCS field, returned as a 64-bit value. Requires a valid
// VMCS.
uint64_t AutoVmcs::Read(VmcsFieldXX field) const {
  DEBUG_ASSERT(vmcs_address_ != 0);
  return vmread(static_cast<uint64_t>(field));
}
// Writes `val` to a 16-bit VMCS field. Requires a valid VMCS.
void AutoVmcs::Write(VmcsField16 field, uint16_t val) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  vmwrite(static_cast<uint64_t>(field), val);
}
// Writes `val` to a 32-bit VMCS field. Requires a valid VMCS.
void AutoVmcs::Write(VmcsField32 field, uint32_t val) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  vmwrite(static_cast<uint64_t>(field), val);
}
// Writes `val` to a 64-bit VMCS field. Requires a valid VMCS.
void AutoVmcs::Write(VmcsField64 field, uint64_t val) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  vmwrite(static_cast<uint64_t>(field), val);
}
// Writes `val` to a VmcsFieldXX VMCS field. Requires a valid VMCS.
void AutoVmcs::Write(VmcsFieldXX field, uint64_t val) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  vmwrite(static_cast<uint64_t>(field), val);
}
// Computes and writes the value of the VMCS control field `controls`.
//
// `true_msr` is the capability MSR value for the field: its low 32 bits are
// always written as 1, and its high 32 bits are the bits that are permitted to
// be 1. `set` lists bits the caller requires to be 1 and `clear` lists bits
// the caller requires to be 0. Bits that may take either value and appear in
// neither list take their default from the low 32 bits of `old_msr`.
//
// Returns ZX_ERR_NOT_SUPPORTED when a requested `set` bit is not permitted to
// be 1 or a requested `clear` bit is not permitted to be 0, and
// ZX_ERR_INVALID_ARGS when `set` and `clear` overlap. Requires a valid VMCS.
zx::result<> AutoVmcs::SetControl(VmcsField32 controls, uint64_t true_msr, uint64_t old_msr,
                                  uint32_t set, uint32_t clear) {
  DEBUG_ASSERT(vmcs_address_ != 0);
  uint32_t allowed_0 = static_cast<uint32_t>(BITS(true_msr, 31, 0));
  uint32_t allowed_1 = static_cast<uint32_t>(BITS_SHIFT(true_msr, 63, 32));
  if ((allowed_1 & set) != set) {
    dprintf(INFO, "Failed to set VMCS controls %#x, %#x != %#x\n", static_cast<uint>(controls),
            allowed_1, set);
    return zx::error(ZX_ERR_NOT_SUPPORTED);
  }
  if ((~allowed_0 & clear) != clear) {
    dprintf(INFO, "Failed to clear VMCS controls %#x, %#x != %#x\n", static_cast<uint>(controls),
            ~allowed_0, clear);
    return zx::error(ZX_ERR_NOT_SUPPORTED);
  }
  if ((set & clear) != 0) {
    dprintf(INFO, "Attempted to set and clear the same VMCS controls %#x\n",
            static_cast<uint>(controls));
    return zx::error(ZX_ERR_INVALID_ARGS);
  }

  // See Volume 3, Section 31.5.1, Algorithm 3, Part C. If the control can be
  // either 0 or 1 (flexible), and the control is unknown, then refer to the
  // old MSR to find the default value.
  uint32_t flexible = allowed_0 ^ allowed_1;
  uint32_t unknown = flexible & ~(set | clear);
  uint32_t defaults = unknown & BITS(old_msr, 31, 0);
  Write(controls, allowed_0 | defaults | set);
  return zx::ok();
}
// Returns true when `cr0_value` is rejected by the CR0 fixed-bit check against
// the IA32_VMX_CR0_FIXED0/FIXED1 MSRs. When the VMCS has unrestricted guest
// enabled, PE and PG are treated as set for the purpose of the check, so a
// guest value without them is not rejected on that account.
bool cr0_is_invalid(AutoVmcs& vmcs, uint64_t cr0_value) {
  uint64_t check_value = cr0_value;
  // From Volume 3, Section PE and PG bits of CR0 are not checked when unrestricted
  // guest is enabled. Set both here to avoid clashing with X86_MSR_IA32_VMX_CR0_FIXED1.
  if (vmcs.Read(VmcsField32::PROCBASED_CTLS2) & kProcbasedCtls2UnrestrictedGuest) {
    check_value |= X86_CR0_PE | X86_CR0_PG;
  }
  return cr_is_invalid(check_value, X86_MSR_IA32_VMX_CR0_FIXED0, X86_MSR_IA32_VMX_CR0_FIXED1);
}
// static
//
// Creates a VCPU of type `V` bound to the calling thread.
//
// Fails with ZX_ERR_INVALID_ARGS when `entry` lies outside the guest's root
// VMAR, ZX_ERR_BAD_STATE when the calling thread already has a VCPU, and
// ZX_ERR_NO_MEMORY when allocation of the `V` object fails; errors from the
// VMCS page allocation and from vmcs_init are propagated. On success the VMCS
// has been initialised and the thread's migrate and context-switch callbacks
// have been installed under the ThreadLock.
// NOTE(review): closing braces and the body of the context-switch lambda are
// absent from the visible text, so the code is reproduced unchanged. Restore
// the missing lines from the upstream file before building.
template <typename V, typename G>
zx::result<ktl::unique_ptr<V>> Vcpu::Create(G& guest, uint16_t vpid, zx_vaddr_t entry) {
if (fbl::RefPtr<VmAddressRegion> root_vmar = guest.RootVmar();
entry < root_vmar->base() || entry >= root_vmar->base() + root_vmar->size()) {
return zx::error(ZX_ERR_INVALID_ARGS);
Thread* thread = Thread::Current::Get();
if (thread->vcpu()) {
return zx::error(ZX_ERR_BAD_STATE);
fbl::AllocChecker ac;
ktl::unique_ptr<V> vcpu(new (&ac) V(guest, vpid, thread));
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
VmxInfo vmx_info;
auto result = vcpu->vmcs_page_.Alloc(vmx_info, 0);
if (result.is_error()) {
return result.take_error();
VmxRegion* region = vcpu->vmcs_page_.template VirtualAddress<VmxRegion>();
region->revision_id = vmx_info.revision_id;
zx_paddr_t ept_pml4 = guest.PhysicalAspace().arch_aspace().arch_table_phys();
zx_paddr_t vmcs_address = vcpu->vmcs_page_.PhysicalAddress();
// We create the `AutoVmcs` object here, so that we ensure that interrupts are
// disabled from `vmcs_init` until `SetMigrateFn`. This is important to ensure
// that we do not migrate CPUs while setting up the VCPU.
AutoVmcs vmcs(vmcs_address, /*clear=*/true);
result = vmcs_init(vmcs, V::kConfig, vpid, entry, guest.MsrBitmapsAddress(), ept_pml4,
&vcpu->vmx_state_, vcpu->extended_register_state_);
if (result.is_error()) {
return result.take_error();
Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG};
// Only set the thread migrate function after we have initialised the VMCS.
// Otherwise, the migrate function may interact with an uninitialised VMCS.
// We have to disable thread safety analysis because it's not smart enough to
// realize that SetMigrateFn will always be called with the ThreadLock.
thread->SetMigrateFnLocked([vcpu = vcpu.get()](Thread* thread, auto stage)
TA_NO_THREAD_SAFETY_ANALYSIS { vcpu->Migrate(thread, stage); });
thread->SetContextSwitchFnLocked([vcpu = vcpu.get()]() {
if (vcpu->entered_.load()) {
// `arch_context_switch()` saves and restores GS, so we can skip it.
return zx::ok(ktl::move(vcpu));
// Constructs a VCPU for `guest`, zero-initialising the VMX and MSR state.
// NOTE(review): `vpid` and `thread` are not referenced in the visible
// initializer list and the closing brace is absent, so initializers appear to
// be missing from this copy. Confirm against the upstream file.
Vcpu::Vcpu(Guest& guest, uint16_t vpid, Thread* thread)
: guest_(guest),
vmx_state_(/* zero-init */),
msr_state_(/* zero-init */) {
// Tears down the VCPU: under the ThreadLock it reads the owning thread and the
// CPU the VCPU last ran on, and, when a VMCS page is allocated and that CPU is
// valid, arranges for VMCLEAR to run on that CPU.
// NOTE(review): closing braces, the statements that clear the thread's
// callbacks, and the call that the `MP_IPI_TARGET_MASK, ...` arguments belong
// to are absent from the visible text. The code is reproduced unchanged;
// restore the missing lines from the upstream file before building.
Vcpu::~Vcpu() {
cpu_num_t cpu;
// Taking the ThreadLock guarantees that thread_ isn't going to be freed
// while we access it.
Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG};
Thread* thread = thread_.load();
if (thread != nullptr) {
// Clear the migration function, so that |thread_| does not reference
// |this| after destruction of the VCPU.
cpu = last_cpu_;
if (vmcs_page_.IsAllocated() && cpu != INVALID_CPU) {
// Clear VMCS state from the CPU.
// The destructor may be called from a different thread, therefore we must
// IPI the CPU that last run the thread.
paddr_t paddr = vmcs_page_.PhysicalAddress();
MP_IPI_TARGET_MASK, cpu_num_to_mask(cpu),
[](void* paddr) { vmclear(reinterpret_cast<paddr_t>(paddr)); },
// Thread-migration callback, dispatched on `stage`:
//  - Before:  marks the VCPU as not loaded on any CPU (`last_cpu_` becomes
//             INVALID_CPU).
//  - After:   forces the next entry to use VMLAUNCH, records the destination
//             CPU in `last_cpu_`, rewrites the host TR selector, FS base and
//             GS base in the VMCS for the destination CPU, and invalidates
//             the TLB mappings for this VPID.
//  - Exiting: the owning thread is going away.
// NOTE(review): the VMCLEAR/VMPTRLD calls described by the comments, the
// `break` statements, the Exiting body and all closing braces are absent from
// the visible text. The code is reproduced unchanged; restore the missing
// lines from the upstream file before building.
void Vcpu::Migrate(Thread* thread, Thread::MigrateStage stage) {
// Volume 3, Section 31.8.2: An MP-aware VMM is free to assign any logical
// processor to a VM. But for performance considerations, moving a guest VMCS
// to another logical processor is slower than resuming that guest VMCS on the
// same logical processor. Certain VMX performance features (such as caching
// of portions of the VMCS in the processor) are optimized for a guest VMCS
// that runs on the same logical processor.
// If the VMCS regions are identical (same revision ID) the following sequence
// can be used to move or copy the VMCS from one logical processor to another:
switch (stage) {
// * Perform a VMCLEAR operation on the source logical processor. This
// ensures that all VMCS data that may be cached by the processor are
// flushed to memory.
case Thread::MigrateStage::Before: {
// After VMCLEAR, `last_cpu_` can be cleared to indicate this VCPU is both
// not presently running, and its state is not loaded anywhere.
last_cpu_ = INVALID_CPU;
// * Copy the VMCS region from one memory location to another location. This
// is an optional step assuming the VMM wishes to relocate the VMCS or
// move the VMCS to another system.
// * Perform a VMPTRLD of the physical address of VMCS region on the
// destination processor to establish its current VMCS pointer.
case Thread::MigrateStage::After: {
// Volume 3, Section 31.8.2: To migrate a VMCS to another logical
// processor, a VMM must use the sequence of VMCLEAR, VMPTRLD and
// We set `resume` to false so that `vmx_enter` will call VMLAUNCH when
// entering the the guest, instead of VMRESUME.
vmx_state_.resume = false;
// Before performing the VMPTRLD, update the `last_cpu_` for
// `Vcpu::Interrupt()` and `vmcs_page_` state tracking. It is assumed that
// the `Thread::MigrateStage::Before` stage already happened and that a
// VMCLEAR has been performed on `last_cpu_`, hence the previous value of
last_cpu_ = thread->LastCpuLocked();
// Load the VMCS on the destination processor.
// Update the VMCS with the per-CPU variables of the destination
// processor.
x86_percpu* percpu = x86_get_percpu();
vmwrite(static_cast<uint64_t>(VmcsField16::HOST_TR_SELECTOR), TSS_SELECTOR(percpu->cpu_num));
vmwrite(static_cast<uint64_t>(VmcsFieldXX::HOST_FS_BASE), thread->arch().fs_base);
vmwrite(static_cast<uint64_t>(VmcsFieldXX::HOST_GS_BASE), read_msr(X86_MSR_IA32_GS_BASE));
// Invalidate TLB mappings for the VPID.
invvpid(InvVpid::SINGLE_CONTEXT, vpid_, 0);
case Thread::MigrateStage::Exiting: {
// The `thread_` is exiting and so we must clear our reference to it.;
// Swaps the syscall-related MSRs between the hardware and `msr_state_`.
//
// The values currently loaded in IA32_STAR, IA32_LSTAR, IA32_FMASK and
// IA32_TSC_AUX are read, the values held in `msr_state_` are written to the
// hardware, and the previously loaded values are stored back into
// `msr_state_`. Calling the function twice therefore restores the original
// hardware state. When `include_gs` is set, IA32_KERNEL_GS_BASE is swapped the
// same way.
// NOTE(review): the STAR write-back and the `msr_state_.star` assignment were
// garbled in the text this was reconstructed from. They are restored here by
// symmetry with the other three MSRs; confirm against the upstream file.
void Vcpu::ContextSwitch(bool include_gs) {
  // Capture what is currently loaded before overwriting it.
  uint64_t star = read_msr(X86_MSR_IA32_STAR);
  uint64_t lstar = read_msr(X86_MSR_IA32_LSTAR);
  uint64_t fmask = read_msr(X86_MSR_IA32_FMASK);
  uint64_t tsc_aux = read_msr(X86_MSR_IA32_TSC_AUX);

  // Load the saved set into the hardware.
  write_msr(X86_MSR_IA32_STAR, msr_state_.star);
  write_msr(X86_MSR_IA32_LSTAR, msr_state_.lstar);
  write_msr(X86_MSR_IA32_FMASK, msr_state_.fmask);
  write_msr(X86_MSR_IA32_TSC_AUX, msr_state_.tsc_aux);

  // Remember the set that was just replaced.
  msr_state_.star = star;
  msr_state_.lstar = lstar;
  msr_state_.fmask = fmask;
  msr_state_.tsc_aux = tsc_aux;

  if (include_gs) {
    uint64_t kernel_gs_base = read_msr(X86_MSR_IA32_KERNEL_GS_BASE);
    write_msr(X86_MSR_IA32_KERNEL_GS_BASE, msr_state_.kernel_gs_base);
    msr_state_.kernel_gs_base = kernel_gs_base;
  }
}
// Switches extended-register state from host to guest: the host side is saved
// into the current thread's extended-register buffer when XSAVE is supported,
// and the guest side is loaded when the guest's CR4 has OSXSAVE set.
void Vcpu::LoadExtendedRegisters(AutoVmcs& vmcs) {
  arch_thread& thread = Thread::Current::Get()->arch();
  bool save_host = x86_xsave_supported();
  bool load_guest = vmcs.Read(VmcsFieldXX::GUEST_CR4) & X86_CR4_OSXSAVE;
  swap_extended_registers(thread.extended_register_buffer, vmx_state_.host_state.xcr0, save_host,
                          extended_register_state_, vmx_state_.guest_state.xcr0, load_guest);
}
// Switches extended-register state from guest back to host: the guest side is
// saved when the guest's CR4 has OSXSAVE set, and the host side is reloaded
// from the current thread's extended-register buffer when XSAVE is supported.
void Vcpu::SaveExtendedRegisters(AutoVmcs& vmcs) {
  arch_thread& thread = Thread::Current::Get()->arch();
  bool save_guest = vmcs.Read(VmcsFieldXX::GUEST_CR4) & X86_CR4_OSXSAVE;
  bool load_host = x86_xsave_supported();
  swap_extended_registers(extended_register_state_, vmx_state_.guest_state.xcr0, save_guest,
                          thread.extended_register_buffer, vmx_state_.host_state.xcr0, load_host);
}
// Enters the guest through vmx_enter_asm and converts its status into a
// zx::result once the guest has exited.
// NOTE(review): `selector` is computed but never used and the closing brace is
// absent in the visible text, so the statements that reload the task register
// (described by the comment below) appear to be missing from this copy.
// Confirm against the upstream file.
zx::result<> vmx_enter(VmxState* vmx_state) {
// Perform the low-level vmlaunch or vmresume, entering the guest,
// and returning when the guest exits.
zx_status_t status = vmx_enter_asm(vmx_state);
// Reload the task segment in order to restore its limit. VMX always
// restores it with a limit of 0x67, which excludes the IO bitmap.
seg_sel_t selector = TSS_SELECTOR(arch_curr_cpu_num());
return zx::make_result(status);
// Runs the guest on the calling thread until something needs attention.
//
// Fails with ZX_ERR_BAD_STATE when called from a thread other than the one
// that owns the VCPU. Each loop iteration checks for a kill/suspend signal,
// opens the VMCS, runs `pre_enter(vmcs)`, enters the guest through vmx_enter,
// and on a successful exit sets `vmx_state_.resume` and runs
// `post_exit(vmcs, packet)`. The loop repeats while the result is OK. A final
// status of ZX_ERR_NEXT is reported to the caller as success; any other status
// is returned as-is.
// NOTE(review): many statements are cut short or absent in the visible text
// (the deferred-cleanup body, the kick check whose condition is empty, the
// extended-register load, the MDS/RSB mitigation calls, and every closing
// brace). The code is reproduced unchanged; restore the missing lines from the
// upstream file before building.
template <typename PreEnterFn, typename PostExitFn>
zx::result<> Vcpu::EnterInternal(PreEnterFn pre_enter, PostExitFn post_exit,
zx_port_packet_t& packet) {
Thread* current_thread = Thread::Current::Get();
if (current_thread != thread_) {
return zx::error(ZX_ERR_BAD_STATE);
bool extended_registers_loaded = false;
auto defer = fit::defer([this, &extended_registers_loaded] {
if (extended_registers_loaded) {
AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
// Spectre V2: Ensure that code executed in the VM guest cannot influence
// indirect branch prediction in the host.
// TODO( We may be able to avoid the IBPB here; the kernel
// is either built with a retpoline or has Enhanced IBRS enabled. We
// currently execute an IBPB on context-switch to a new aspace. The IBPB is
// currently only here to protect hypervisor user threads.
if (!gBootOptions->x86_disable_spec_mitigations && x86_cpu_has_ibpb()) {
arch::IssueIbpb(arch::BootCpuidIo{}, hwreg::X86MsrIo{});
zx::result<> result = zx::ok();
do {
// If the thread was killed or suspended, then we should exit with an error.
if (zx_status_t status = current_thread->CheckKillOrSuspendSignal(); status != ZX_OK) {
return zx::error(status);
AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
// We check whether a kick was requested before entering the guest so that:
// 1. When we enter the syscall, we can return immediately without entering
// the guest.
// 2. If we have already exited the guest to handle a packet, it allows us
// to return and gives user-space a chance to handle that packet, without
// the request to kick interfering with the packet in-flight.
// We also do this after we have disabled interrupts, so if an interrupt was
// fired before we disabled interrupts, we have the opportunity to check
// whether a kick was requested, but the interrupt was lost. If an interrupt
// is fired after we have disabled interrupts, when we enter the guest we
// will exit due to the interrupt, and run this check again.
if ( {
return zx::error(ZX_ERR_CANCELED);
if (result = pre_enter(vmcs); result.is_error()) {
return result;
if (!extended_registers_loaded) {;
extended_registers_loaded = true;
if (x86_cpu_should_l1d_flush_on_vmentry()) {
// L1TF: Flush L1D$ before entering vCPU. If the CPU is affected by MDS,
// also flush microarchitectural buffers.
write_msr(X86_MSR_IA32_FLUSH_CMD, 1);
} else if (x86_cpu_should_md_clear_on_user_return()) {
// MDS: If the processor is not affected by L1TF but is affected by MDS or
// TAA, flush microarchitectural buffers.
KTRACE_DURATION_BEGIN("kernel:vcpu", "vcpu");
result = vmx_enter(&vmx_state_);
if (!gBootOptions->x86_disable_spec_mitigations) {
// Spectre V2: Ensure that code executed in the VM guest cannot influence
// return address prediction in the host.
if (result.is_ok()) {
vmx_state_.resume = true;
result = post_exit(vmcs, packet);
} else {
ktrace_vcpu_exit(VCPU_FAILURE, vmcs.Read(VmcsFieldXX::GUEST_RIP));
uint64_t error = vmcs.Read(VmcsField32::INSTRUCTION_ERROR);
dprintf(INFO, "VCPU enter failed: Instruction error %lu\n", error);
} while (result.is_ok());
return result.status_value() == ZX_ERR_NEXT ? zx::ok() : result;
// Fills `vcpu_state` with the guest's current register state.
//
// The registers held in `vmx_state_.guest_state` are transferred with
// register_copy(); RSP and RFLAGS are then read from the VMCS, with RFLAGS
// reduced to the bits selected by X86_FLAGS_USER.
//
// Returns ZX_ERR_BAD_STATE if the calling thread is not `thread_`.
zx::result<> Vcpu::ReadState(zx_vcpu_state_t& vcpu_state) {
  // Only the thread stored in `thread_` may read the state.
  if (Thread::Current::Get() != thread_) {
    return zx::error(ZX_ERR_BAD_STATE);
  }

  register_copy(vcpu_state, vmx_state_.guest_state);

  // RSP and RFLAGS are not part of `guest_state`; fetch them from the VMCS.
  AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
  const auto guest_rsp = vmcs.Read(VmcsFieldXX::GUEST_RSP);
  const auto guest_rflags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS);

  vcpu_state.rsp = guest_rsp;
  vcpu_state.rflags = guest_rflags & X86_FLAGS_USER;
  return zx::ok();
}
// Applies `vcpu_state` to the guest.
//
// The registers are transferred into `vmx_state_.guest_state` with
// register_copy() and RSP is written to the VMCS. RFLAGS is only touched when
// the supplied value has a bit of X86_FLAGS_RESERVED_ONES set; in that case
// the X86_FLAGS_USER bits of the guest's RFLAGS are replaced by the supplied
// ones and all other bits keep their current VMCS value.
//
// Returns ZX_ERR_BAD_STATE if the calling thread is not `thread_`.
zx::result<> Vcpu::WriteState(const zx_vcpu_state_t& vcpu_state) {
  // Only the thread stored in `thread_` may write the state.
  if (Thread::Current::Get() != thread_) {
    return zx::error(ZX_ERR_BAD_STATE);
  }

  register_copy(vmx_state_.guest_state, vcpu_state);

  AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
  vmcs.Write(VmcsFieldXX::GUEST_RSP, vcpu_state.rsp);

  // Leave the guest's RFLAGS alone unless the caller supplied a value with a
  // reserved-one bit set.
  const bool update_rflags = (vcpu_state.rflags & X86_FLAGS_RESERVED_ONES) != 0;
  if (!update_rflags) {
    return zx::ok();
  }

  // Merge: keep every non-user bit from the VMCS, take user bits from the caller.
  const uint64_t current = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS);
  const uint64_t preserved = current & ~X86_FLAGS_USER;
  const uint64_t requested = vcpu_state.rflags & X86_FLAGS_USER;
  vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, preserved | requested);
  return zx::ok();
}
// Reports VCPU status into `info`: sets ZX_INFO_VCPU_FLAG_KICKED in
// `info->flags` when `kicked_` is set. Other bits of `info->flags` are left
// as the caller provided them.
void Vcpu::GetInfo(zx_info_vcpu_t* info) {
  const bool was_kicked = kicked_.load();
  if (!was_kicked) {
    return;
  }
  info->flags |= ZX_INFO_VCPU_FLAG_KICKED;
}
// Creates a NormalVcpu for `guest`. `entry` is forwarded unchanged to
// Vcpu::Create (presumably the guest entry address; confirm against the
// declaration).
//
// Steps, in order:
//   1. Allocate a VPID from `guest`; on failure return that error.
//   2. Construct the VCPU through Vcpu::Create<NormalVcpu>; on failure hand the
//      VPID back to `guest` and return the construction error.
//   3. Record whether the PV clock is stable, then program the VMCS fields
//      below (PAUSE-loop exiting, CR0/CR4 masks and read shadows, host and
//      guest CR3, exception bitmap).
//
// On success, returns ownership of the new VCPU.
// static
zx::result<ktl::unique_ptr<Vcpu>> NormalVcpu::Create(NormalGuest& guest, zx_vaddr_t entry) {
auto vpid = guest.TryAllocVpid();
if (vpid.is_error()) {
return vpid.take_error();
auto vcpu = Vcpu::Create<NormalVcpu>(guest, *vpid, entry);
if (vcpu.is_error()) {
// Construction failed, so release the VPID allocated above before returning
// the error. NOTE(review): `result` is never examined in the visible code;
// confirm whether a check on it was dropped.
auto result = guest.FreeVpid(*vpid);
return vcpu.take_error();
// Setup PV clock state.
// Uses pv_clock_is_stable() when x86_hypervisor_has_pv_clock() reports a PV
// clock; otherwise falls back to the invariant-TSC CPU feature bit.
vcpu->pv_clock_state_.is_stable = x86_hypervisor_has_pv_clock()
? pv_clock_is_stable()
: x86_feature_test(X86_FEATURE_INVAR_TSC);
// All VMCS accesses below go through this AutoVmcs for the new VCPU's VMCS page.
AutoVmcs vmcs(vcpu->vmcs_page_.PhysicalAddress());
// Enable use of PAUSE-loop exiting if available.
// `result` is only consulted by the is_ok() check below; no error is returned
// from here if the control cannot be set.
auto result =
vmcs.SetControl(VmcsField32::PROCBASED_CTLS2, read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2),
vmcs.Read(VmcsField32::PROCBASED_CTLS2), kProcbasedCtls2PauseLoopExiting, 0);
if (result.is_ok()) {
// From Volume 3, Section 25.1.3: The processor determines the amount of
// time between this execution of PAUSE and the previous execution of PAUSE
// at CPL 0. If this amount of time exceeds the value of the VM-execution
// control field PLE_Gap, the processor considers this execution to be the
// first execution of PAUSE in a loop. (It also does so for the first
// execution of PAUSE at CPL 0 after VM entry.)
// Otherwise, the processor determines the amount of time since the most
// recent execution of PAUSE that was considered to be the first in a loop.
// If this amount of time exceeds the value of the VM-execution control
// field PLE_Window, a VM exit occurs.
// For purposes of these computations, time is measured based on a counter
// that runs at the same rate as the timestamp counter (TSC).
// NOTE: These values are based on KVM, which was based on empirical
// analysis.
// PLE_GAP = 128 and PLE_WINDOW = 4096, both in TSC-rate ticks per the text above.
vmcs.Write(VmcsField32::PLE_GAP, 1u << 7);
vmcs.Write(VmcsField32::PLE_WINDOW, 1u << 12);
// From Volume 3, Section 27.5.1: The following bits are not modified: For
// CR0, ET, CD, NW; [the reserved bits], and any bits that are fixed in VMX
// operation.
// Any bit that is not restored must be masked, or the guest will be able to
// affect the host's cr0. However, we do not need to mask:
// * The reserved bits, which will generate GP faults;
// * ET, which is fixed to 1 (Volume 3 Section 2.5);
// * The bits that are fixed in VMX operation aside from PE and PG for
// unrestricted guests, which will generate GP faults (Volume 3 Section
// 25.3);
// Additionally, NE is fixed in VMX operation but some guests will attempt to
// clear it without handling the GP fault. So it should also be masked.
vmcs.Write(VmcsFieldXX::CR0_GUEST_HOST_MASK, X86_CR0_ET | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD);
// From Volume 3, Section 9.1.1: Following power-up, The state of control
// register CR0 is 60000010H (CD and ET are set.)
// NOTE(review): only ET is placed in the read shadow here, although the quoted
// reset value also has CD (and NW) set — confirm this is intentional.
vmcs.Write(VmcsFieldXX::CR0_READ_SHADOW, X86_CR0_ET);
// Mask access to CR4.
// Only the VMXE bit is masked; the read shadow for CR4 is zero.
vmcs.Write(VmcsFieldXX::CR4_GUEST_HOST_MASK, X86_CR4_VMXE);
vmcs.Write(VmcsFieldXX::CR4_READ_SHADOW, 0);
// Set host and guest CR3.
// Host CR3 is the value currently returned by x86_get_cr3(); guest CR3 starts at 0.
vmcs.Write(VmcsFieldXX::HOST_CR3, x86_get_cr3());
vmcs.Write(VmcsFieldXX::GUEST_CR3, 0);
// Do not VM exit on any exception.
vmcs.Write(VmcsField32::EXCEPTION_BITMAP, 0);
// Hand the constructed VCPU to the caller.
return zx::ok(ktl::move(*vcpu));
// Constructs a NormalVcpu. All three arguments are forwarded to the Vcpu base
// class; this constructor performs no additional work of its own.
NormalVcpu::NormalVcpu(NormalGuest& guest, uint16_t vpid, Thread* thread)
    : Vcpu(guest, vpid, thread) {
}
// Returns this VCPU's VPID (`vpid_`) to its guest. `guest_` is downcast to
// NormalGuest& in order to reach FreeVpid().
// NOTE(review): `result` is not examined in the visible code; confirm whether
// a check on it was dropped.
NormalVcpu::~NormalVcpu() {
auto result = static_cast<NormalGuest&>(guest_).FreeVpid(vpid_);
// Runs the guest by delegating to EnterInternal with two NormalVcpu-specific
// hooks:
//   * `pre_enter` receives the VMCS; it calls local_apic_maybe_interrupt() and
//     then refreshes the PV clock system time.
//   * `post_exit` receives the VMCS and `packet`; it forwards to
//     vmexit_handler_normal().
// Returns whatever EnterInternal returns.
zx::result<> NormalVcpu::Enter(zx_port_packet_t& packet) {
auto pre_enter = [this](AutoVmcs& vmcs) -> zx::result<> {
zx_status_t status = local_apic_maybe_interrupt(&vmcs, &local_apic_state_);
// Propagate any failure from local_apic_maybe_interrupt() to the caller of
// this hook.
if (status != ZX_OK) {
return zx::error(status);
// Updates guest system time if the guest subscribed to updates.
auto& guest = static_cast<NormalGuest&>(guest_);
pv_clock_update_system_time(&pv_clock_state_, &guest.PhysicalAspace());
return zx::ok();
auto post_exit = [this](AutoVmcs& vmcs, zx_port_packet_t& packet) -> zx::result<> {
// The exit handler is given the guest register state, local APIC state, PV
// clock state, the guest's physical address space and traps, and the packet
// to fill in.
auto& guest = static_cast<NormalGuest&>(guest_);
return vmexit_handler_normal(vmcs, vmx_state_.guest_state, local_apic_state_, pv_clock_state_,
guest.PhysicalAspace(), guest.Traps(), packet);
return EnterInternal(ktl::move(pre_enter), ktl::move(post_exit), packet);
// Kicks the VCPU: while holding the thread lock with interrupts saved, calls
// interrupt_cpu() with the VCPU's thread and `last_cpu_`.
// NOTE(review): the stray `;` after the opening brace suggests a statement was
// lost from this view (GetInfo reads a `kicked_` flag that nothing visible
// here sets) — confirm against the full file.
void NormalVcpu::Kick() {;
// Cancel any pending or upcoming wait-for-interrupts.
Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG};
interrupt_cpu(thread_.load(), last_cpu_);
// Signals an interrupt to the VCPU: while holding the thread lock with
// interrupts saved, calls interrupt_cpu() with the VCPU's thread and
// `last_cpu_`.
// NOTE(review): `vector` is not referenced in the visible body; the step that
// records it appears to be missing from this view — confirm against the full
// file.
void NormalVcpu::Interrupt(uint32_t vector) {
Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG};
interrupt_cpu(thread_.load(), last_cpu_);
// Writes the result of an I/O access into the guest's RAX.
//
// Returns ZX_ERR_BAD_STATE if the calling thread is not `thread_`, and
// ZX_ERR_INVALID_ARGS unless `io_state.access_size` is 1, 2 or 4. Otherwise
// `access_size` bytes are copied over the start of `guest_state.rax`; the
// remaining bytes of RAX are not modified by the memcpy.
zx::result<> NormalVcpu::WriteState(const zx_vcpu_io_t& io_state) {
if (Thread::Current::Get() != thread_) {
return zx::error(ZX_ERR_BAD_STATE);
if ((io_state.access_size != 1) && (io_state.access_size != 2) && (io_state.access_size != 4)) {
return zx::error(ZX_ERR_INVALID_ARGS);
// The largest accepted access_size is 4, so RAX must hold at least 4 bytes
// for the copy below to stay in bounds.
static_assert(sizeof(vmx_state_.guest_state.rax) >= 4);
// NOTE(review): the source argument of this memcpy is missing from this view
// (`,,`); it is presumably the data carried in `io_state` — confirm against
// the full file.
memcpy(&vmx_state_.guest_state.rax,, io_state.access_size);
return zx::ok();