// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <bits.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <hypervisor/guest_physical_address_space.h>
#include <kernel/mp.h>
#include <vm/fault.h>
#include <vm/pmm.h>
#include <vm/vm_object.h>
#include <zircon/syscalls/hypervisor.h>
#include <fbl/auto_call.h>
#include "vcpu_priv.h"
#include "vmexit_priv.h"
#include "vmx_cpu_state_priv.h"
extern uint8_t _gdt[];
static const uint kPfFlags = VMM_PF_FLAG_WRITE | VMM_PF_FLAG_SW_FAULT;
static const uint32_t kInterruptInfoValid = 1u << 31;
static const uint32_t kInterruptInfoDeliverErrorCode = 1u << 11;
static const uint32_t kInterruptTypeHardwareException = 3u << 8;
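// Makes the VMCS at physical address |pa| the current and active VMCS on
// this CPU, so that subsequent vmread/vmwrite operations apply to it.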
static zx_status_t vmptrld(paddr_t pa) {
uint8_t err;
__asm__ volatile(
"vmptrld %[pa];" VMX_ERR_CHECK(err)
: [err] "=r"(err)
: [pa] "m"(pa)
: "cc", "memory");
return err ? ZX_ERR_INTERNAL : ZX_OK;
}
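// Flushes any cached state for the VMCS at physical address |pa| to memory
// and marks its launch state as clear, so that it may be reinitialized.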
static zx_status_t vmclear(paddr_t pa) {
uint8_t err;
__asm__ volatile(
"vmclear %[pa];" VMX_ERR_CHECK(err)
: [err] "=r"(err)
: [pa] "m"(pa)
: "cc", "memory");
return err ? ZX_ERR_INTERNAL : ZX_OK;
}
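// Reads |field| from the current VMCS. A VMCS must have been loaded with
// vmptrld beforehand.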
static uint64_t vmread(uint64_t field) {
uint8_t err;
uint64_t val;
__asm__ volatile(
"vmread %[field], %[val];" VMX_ERR_CHECK(err)
: [err] "=r"(err), [val] "=m"(val)
: [field] "r"(field)
: "cc");
DEBUG_ASSERT(err == ZX_OK);
return val;
}
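// Writes |val| to |field| of the current VMCS.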
static void vmwrite(uint64_t field, uint64_t val) {
uint8_t err;
__asm__ volatile(
"vmwrite %[val], %[field];" VMX_ERR_CHECK(err)
: [err] "=r"(err)
: [val] "r"(val), [field] "r"(field)
: "cc");
DEBUG_ASSERT(err == ZX_OK);
}
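// While an AutoVmcs is in scope, interrupts are disabled and the VMCS at
// |vmcs_address| is current, so the Read and Write methods below operate
// on it.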
AutoVmcs::AutoVmcs(const paddr_t vmcs_address)
: vmcs_address_(vmcs_address) {
DEBUG_ASSERT(!arch_ints_disabled());
arch_disable_ints();
__UNUSED zx_status_t status = vmptrld(vmcs_address_);
DEBUG_ASSERT(status == ZX_OK);
}
AutoVmcs::~AutoVmcs() {
DEBUG_ASSERT(arch_ints_disabled());
arch_enable_ints();
}
void AutoVmcs::Reload() {
DEBUG_ASSERT(arch_ints_disabled());
__UNUSED zx_status_t status = vmptrld(vmcs_address_);
DEBUG_ASSERT(status == ZX_OK);
}
void AutoVmcs::InterruptibleReload() {
DEBUG_ASSERT(arch_ints_disabled());
// When we VM exit due to an external interrupt, we want to handle that
// interrupt. To do that, we temporarily re-enable interrupts. However,
// we must then reload the VMCS, in case it was changed in the interim.
arch_enable_ints();
arch_disable_ints();
Reload();
}
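// Toggles interrupt-window exiting, which causes a VM exit as soon as the
// guest is ready to receive interrupts.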
void AutoVmcs::InterruptWindowExiting(bool enable) {
uint32_t controls = Read(VmcsField32::PROCBASED_CTLS);
if (enable) {
controls |= PROCBASED_CTLS_INT_WINDOW_EXITING;
} else {
controls &= ~PROCBASED_CTLS_INT_WINDOW_EXITING;
}
Write(VmcsField32::PROCBASED_CTLS, controls);
}
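// Returns whether exception |vector| pushes an error code. See Volume 3,
// Section 6.15 for the exceptions that do.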
static bool has_error_code(uint32_t vector) {
switch (vector) {
case X86_INT_DOUBLE_FAULT:
case X86_INT_INVALID_TSS:
case X86_INT_SEGMENT_NOT_PRESENT:
case X86_INT_STACK_FAULT:
case X86_INT_GP_FAULT:
case X86_INT_PAGE_FAULT:
case X86_INT_ALIGNMENT_CHECK:
return true;
default:
return false;
}
}
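// Queues |vector| for injection on the next VM entry via the VM-entry
// interruption-information field.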
void AutoVmcs::IssueInterrupt(uint32_t vector) {
uint32_t interrupt_info = kInterruptInfoValid | (vector & UINT8_MAX);
if (vector <= X86_INT_MAX_INTEL_DEFINED)
interrupt_info |= kInterruptTypeHardwareException;
if (has_error_code(vector)) {
interrupt_info |= kInterruptInfoDeliverErrorCode;
Write(VmcsField32::ENTRY_EXCEPTION_ERROR_CODE, 0);
}
Write(VmcsField32::ENTRY_INTERRUPTION_INFORMATION, interrupt_info);
}
uint16_t AutoVmcs::Read(VmcsField16 field) const {
return static_cast<uint16_t>(vmread(static_cast<uint64_t>(field)));
}
uint32_t AutoVmcs::Read(VmcsField32 field) const {
return static_cast<uint32_t>(vmread(static_cast<uint64_t>(field)));
}
uint64_t AutoVmcs::Read(VmcsField64 field) const {
return vmread(static_cast<uint64_t>(field));
}
uint64_t AutoVmcs::Read(VmcsFieldXX field) const {
return vmread(static_cast<uint64_t>(field));
}
void AutoVmcs::Write(VmcsField16 field, uint16_t val) {
vmwrite(static_cast<uint64_t>(field), val);
}
void AutoVmcs::Write(VmcsField32 field, uint32_t val) {
vmwrite(static_cast<uint64_t>(field), val);
}
void AutoVmcs::Write(VmcsField64 field, uint64_t val) {
vmwrite(static_cast<uint64_t>(field), val);
}
void AutoVmcs::Write(VmcsFieldXX field, uint64_t val) {
vmwrite(static_cast<uint64_t>(field), val);
}
zx_status_t AutoVmcs::SetControl(VmcsField32 controls, uint64_t true_msr, uint64_t old_msr,
uint32_t set, uint32_t clear) {
uint32_t allowed_0 = static_cast<uint32_t>(BITS(true_msr, 31, 0));
uint32_t allowed_1 = static_cast<uint32_t>(BITS_SHIFT(true_msr, 63, 32));
if ((allowed_1 & set) != set) {
dprintf(SPEW, "can not set vmcs controls %#x\n", static_cast<uint>(controls));
return ZX_ERR_NOT_SUPPORTED;
}
if ((~allowed_0 & clear) != clear) {
dprintf(SPEW, "can not clear vmcs controls %#x\n", static_cast<uint>(controls));
return ZX_ERR_NOT_SUPPORTED;
}
if ((set & clear) != 0) {
dprintf(SPEW, "can not set and clear the same vmcs controls %#x\n",
static_cast<uint>(controls));
return ZX_ERR_INVALID_ARGS;
}
// See Volume 3, Section 31.5.1, Algorithm 3, Part C. If a control is
// flexible (it may be either 0 or 1) and was neither set nor cleared by
// the caller, fall back to the old MSR for its default value.
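//
// For example, with allowed_0 = 0b0001 (bit 0 must be 1), allowed_1 =
// 0b0111 (bits 0 through 2 may be 1), set = 0b0010, and clear = 0:
// flexible = 0b0110, unknown = 0b0100, and bit 2 takes its default from
// the low 32 bits of old_msr.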
uint32_t flexible = allowed_0 ^ allowed_1;
uint32_t unknown = flexible & ~(set | clear);
uint32_t defaults = unknown & BITS(old_msr, 31, 0);
Write(controls, allowed_0 | defaults | set);
return ZX_OK;
}
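// Maps a VPID to the CPU it runs on; for example, with 4 CPUs, VPIDs 1, 2,
// 3, and 4 map to CPUs 1, 2, 3, and 0 respectively.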
static uint cpu_of(uint16_t vpid) {
return vpid % arch_max_num_cpus();
}
static void pin_thread(thread_t* thread, uint16_t vpid) {
uint cpu = cpu_of(vpid);
if (thread_pinned_cpu(thread) != static_cast<int>(cpu))
thread_set_pinned_cpu(thread, cpu);
if (arch_curr_cpu_num() != cpu)
thread_reschedule();
}
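// Returns whether the calling thread is the given thread and is pinned to,
// and running on, the CPU derived from |vpid|.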
static bool check_pinned_cpu_invariant(const thread_t* thread, uint16_t vpid) {
uint cpu = cpu_of(vpid);
return thread == get_current_thread() &&
thread_pinned_cpu(thread) == static_cast<int>(cpu) &&
arch_curr_cpu_num() == cpu;
}
AutoPin::AutoPin(const Vcpu* vcpu)
: thread_(get_current_thread()), prev_cpu_(thread_pinned_cpu(thread_)) {
pin_thread(thread_, vcpu->vpid());
}
AutoPin::~AutoPin() {
thread_set_pinned_cpu(thread_, prev_cpu_);
}
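// Builds the EPT pointer (EPTP) from the physical address of the PML4
// page. With write-back memory (type 6) and a 4-level page walk, the low
// bits come to 0x1e.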
static uint64_t ept_pointer(paddr_t pml4_address) {
return
// Physical address of the PML4 page, page aligned.
pml4_address |
// Use write back memory.
VMX_MEMORY_TYPE_WRITE_BACK << 0 |
// Page walk length of 4 (defined as N minus 1).
3u << 3;
}
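// Layout of an entry in the VM-exit and VM-entry MSR-load and MSR-store
// lists. See Volume 3, Section 24.7.2.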
struct MsrListEntry {
uint32_t msr;
uint32_t reserved;
uint64_t value;
} __PACKED;
static void edit_msr_list(VmxPage* msr_list_page, size_t index, uint32_t msr, uint64_t value) {
// From Volume 3, Appendix A.6: Specifically, if the value of bits 27:25 of
// IA32_VMX_MISC is N, then 512 * (N + 1) is the recommended maximum number
// of MSRs to be included in each list.
//
// From Volume 3, Section 24.7.2: This field specifies the number of MSRs to
// be stored on VM exit. It is recommended that this count not exceed 512
// bytes.
//
// Since these two statements conflict, we are taking the conservative
// minimum and asserting that: index < (512 bytes / size of MsrListEntry).
ASSERT(index < (512 / sizeof(MsrListEntry)));
MsrListEntry* entry = msr_list_page->VirtualAddress<MsrListEntry>() + index;
entry->msr = msr;
entry->value = value;
}
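// Initializes the control, host-state, and guest-state fields of the VMCS
// so that vmx_enter can launch the guest at |ip| with page tables at |cr3|.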
zx_status_t vmcs_init(paddr_t vmcs_address, uint16_t vpid, uintptr_t ip, uintptr_t cr3,
paddr_t virtual_apic_address, paddr_t apic_access_address,
paddr_t msr_bitmaps_address, paddr_t pml4_address, VmxState* vmx_state,
VmxPage* host_msr_page, VmxPage* guest_msr_page) {
zx_status_t status = vmclear(vmcs_address);
if (status != ZX_OK)
return status;
AutoVmcs vmcs(vmcs_address);
// Setup secondary processor-based VMCS controls.
status = vmcs.SetControl(VmcsField32::PROCBASED_CTLS2,
read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2),
0,
// Enable APIC access virtualization.
PROCBASED_CTLS2_APIC_ACCESS |
// Enable use of extended page tables.
PROCBASED_CTLS2_EPT |
// Enable use of RDTSCP instruction.
PROCBASED_CTLS2_RDTSCP |
// Associate cached translations of linear
// addresses with a virtual processor ID.
PROCBASED_CTLS2_VPID |
// Enable use of INVPCID instruction.
PROCBASED_CTLS2_INVPCID,
0);
if (status != ZX_OK)
return status;
// Setup pin-based VMCS controls.
status = vmcs.SetControl(VmcsField32::PINBASED_CTLS,
read_msr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS),
read_msr(X86_MSR_IA32_VMX_PINBASED_CTLS),
// External interrupts cause a VM exit.
PINBASED_CTLS_EXT_INT_EXITING |
// Non-maskable interrupts cause a VM exit.
PINBASED_CTLS_NMI_EXITING,
0);
if (status != ZX_OK)
return status;
// Setup primary processor-based VMCS controls.
status = vmcs.SetControl(VmcsField32::PROCBASED_CTLS,
read_msr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS),
read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS),
// Enable VM exit when interrupts are enabled.
PROCBASED_CTLS_INT_WINDOW_EXITING |
// Enable VM exit on HLT instruction.
PROCBASED_CTLS_HLT_EXITING |
// Enable TPR virtualization.
PROCBASED_CTLS_TPR_SHADOW |
// Enable VM exit on IO instructions.
PROCBASED_CTLS_IO_EXITING |
// Enable use of MSR bitmaps.
PROCBASED_CTLS_MSR_BITMAPS |
// Enable secondary processor-based controls.
PROCBASED_CTLS_PROCBASED_CTLS2,
// Disable VM exit on CR3 load.
PROCBASED_CTLS_CR3_LOAD_EXITING |
// Disable VM exit on CR3 store.
PROCBASED_CTLS_CR3_STORE_EXITING |
// Disable VM exit on CR8 load.
PROCBASED_CTLS_CR8_LOAD_EXITING |
// Disable VM exit on CR8 store.
PROCBASED_CTLS_CR8_STORE_EXITING);
if (status != ZX_OK)
return status;
// We enabled interrupt-window exiting above only to ensure that the
// processor supports it for later use, so disable it for now.
vmcs.InterruptWindowExiting(false);
// Setup VM-exit VMCS controls.
status = vmcs.SetControl(VmcsField32::EXIT_CTLS,
read_msr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS),
read_msr(X86_MSR_IA32_VMX_EXIT_CTLS),
// Logical processor is in 64-bit mode after VM
// exit. On VM exit, CS.L, IA32_EFER.LME, and
// IA32_EFER.LMA are each set to 1.
EXIT_CTLS_64BIT_MODE |
// Save the guest IA32_PAT MSR on exit.
EXIT_CTLS_SAVE_IA32_PAT |
// Load the host IA32_PAT MSR on exit.
EXIT_CTLS_LOAD_IA32_PAT |
// Save the guest IA32_EFER MSR on exit.
EXIT_CTLS_SAVE_IA32_EFER |
// Load the host IA32_EFER MSR on exit.
EXIT_CTLS_LOAD_IA32_EFER,
0);
if (status != ZX_OK)
return status;
// Setup VM-entry VMCS controls.
status = vmcs.SetControl(VmcsField32::ENTRY_CTLS,
read_msr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS),
read_msr(X86_MSR_IA32_VMX_ENTRY_CTLS),
// After VM entry, the logical processor is in
// IA-32e mode and IA32_EFER.LMA is set to 1.
ENTRY_CTLS_IA32E_MODE |
// Load the guest IA32_PAT MSR on entry.
ENTRY_CTLS_LOAD_IA32_PAT |
// Load the guest IA32_EFER MSR on entry.
ENTRY_CTLS_LOAD_IA32_EFER,
0);
if (status != ZX_OK)
return status;
// From Volume 3, Section 24.6.3: The exception bitmap is a 32-bit field
// that contains one bit for each exception. When an exception occurs,
// its vector is used to select a bit in this field. If the bit is 1,
// the exception causes a VM exit. If the bit is 0, the exception is
// delivered normally through the IDT, using the descriptor
// corresponding to the exception’s vector.
//
// From Volume 3, Section 25.2: If software desires VM exits on all page
// faults, it can set bit 14 in the exception bitmap to 1 and set the
// page-fault error-code mask and match fields each to 00000000H.
vmcs.Write(VmcsField32::EXCEPTION_BITMAP, 0);
vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MASK, 0);
vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MATCH, 0);
// From Volume 3, Section 28.1: Virtual-processor identifiers (VPIDs)
// introduce to VMX operation a facility by which a logical processor may
// cache information for multiple linear-address spaces. When VPIDs are
// used, VMX transitions may retain cached information even when the
// logical processor switches to a different linear-address space.
//
// From Volume 3, Section 26.2.1.1: If the “enable VPID” VM-execution
// control is 1, the value of the VPID VM-execution control field must not
// be 0000H.
//
// From Volume 3, Section 28.3.3.3: If EPT is in use, the logical processor
// associates all mappings it creates with the value of bits 51:12 of
// current EPTP. If a VMM uses different EPTP values for different guests,
// it may use the same VPID for those guests.
//
// From Volume 3, Section 28.3.3.1: Operations that architecturally
// invalidate entries in the TLBs or paging-structure caches independent of
// VMX operation (e.g., the INVLPG and INVPCID instructions) invalidate
// linear mappings and combined mappings. They are required to do so only
// for the current VPID (but, for combined mappings, all EP4TAs). Linear
// mappings for the current VPID are invalidated even if EPT is in use.
// Combined mappings for the current VPID are invalidated even if EPT is
// not in use.
vmcs.Write(VmcsField16::VPID, vpid);
// From Volume 3, Section 28.2: The extended page-table mechanism (EPT) is a
// feature that can be used to support the virtualization of physical
// memory. When EPT is in use, certain addresses that would normally be
// treated as physical addresses (and used to access memory) are instead
// treated as guest-physical addresses. Guest-physical addresses are
// translated by traversing a set of EPT paging structures to produce
// physical addresses that are used to access memory.
vmcs.Write(VmcsField64::EPT_POINTER, ept_pointer(pml4_address));
// Setup APIC handling.
vmcs.Write(VmcsField64::APIC_ACCESS_ADDRESS, apic_access_address);
vmcs.Write(VmcsField64::VIRTUAL_APIC_ADDRESS, virtual_apic_address);
// Setup MSR handling.
vmcs.Write(VmcsField64::MSR_BITMAPS_ADDRESS, msr_bitmaps_address);
edit_msr_list(host_msr_page, 0, X86_MSR_IA32_KERNEL_GS_BASE,
read_msr(X86_MSR_IA32_KERNEL_GS_BASE));
edit_msr_list(host_msr_page, 1, X86_MSR_IA32_STAR, read_msr(X86_MSR_IA32_STAR));
edit_msr_list(host_msr_page, 2, X86_MSR_IA32_LSTAR, read_msr(X86_MSR_IA32_LSTAR));
edit_msr_list(host_msr_page, 3, X86_MSR_IA32_FMASK, read_msr(X86_MSR_IA32_FMASK));
edit_msr_list(host_msr_page, 4, X86_MSR_IA32_TSC_ADJUST, read_msr(X86_MSR_IA32_TSC_ADJUST));
edit_msr_list(host_msr_page, 5, X86_MSR_IA32_TSC_AUX, read_msr(X86_MSR_IA32_TSC_AUX));
vmcs.Write(VmcsField64::EXIT_MSR_LOAD_ADDRESS, host_msr_page->PhysicalAddress());
vmcs.Write(VmcsField32::EXIT_MSR_LOAD_COUNT, 6);
edit_msr_list(guest_msr_page, 0, X86_MSR_IA32_KERNEL_GS_BASE, 0);
edit_msr_list(guest_msr_page, 1, X86_MSR_IA32_STAR, 0);
edit_msr_list(guest_msr_page, 2, X86_MSR_IA32_LSTAR, 0);
edit_msr_list(guest_msr_page, 3, X86_MSR_IA32_FMASK, 0);
edit_msr_list(guest_msr_page, 4, X86_MSR_IA32_TSC_ADJUST, 0);
edit_msr_list(guest_msr_page, 5, X86_MSR_IA32_TSC_AUX, 0);
vmcs.Write(VmcsField64::EXIT_MSR_STORE_ADDRESS, guest_msr_page->PhysicalAddress());
vmcs.Write(VmcsField32::EXIT_MSR_STORE_COUNT, 6);
vmcs.Write(VmcsField64::ENTRY_MSR_LOAD_ADDRESS, guest_msr_page->PhysicalAddress());
vmcs.Write(VmcsField32::ENTRY_MSR_LOAD_COUNT, 6);
// Setup VMCS host state.
//
// NOTE: We are pinned to a thread when executing this function, therefore
// it is acceptable to use per-CPU state.
x86_percpu* percpu = x86_get_percpu();
vmcs.Write(VmcsField64::HOST_IA32_PAT, read_msr(X86_MSR_IA32_PAT));
vmcs.Write(VmcsField64::HOST_IA32_EFER, read_msr(X86_MSR_IA32_EFER));
vmcs.Write(VmcsFieldXX::HOST_CR0, x86_get_cr0());
vmcs.Write(VmcsFieldXX::HOST_CR3, x86_get_cr3());
vmcs.Write(VmcsFieldXX::HOST_CR4, x86_get_cr4());
vmcs.Write(VmcsField16::HOST_ES_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_CS_SELECTOR, CODE_64_SELECTOR);
vmcs.Write(VmcsField16::HOST_SS_SELECTOR, DATA_SELECTOR);
vmcs.Write(VmcsField16::HOST_DS_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_FS_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_GS_SELECTOR, 0);
vmcs.Write(VmcsField16::HOST_TR_SELECTOR, TSS_SELECTOR(percpu->cpu_num));
vmcs.Write(VmcsFieldXX::HOST_FS_BASE, read_msr(X86_MSR_IA32_FS_BASE));
vmcs.Write(VmcsFieldXX::HOST_GS_BASE, read_msr(X86_MSR_IA32_GS_BASE));
vmcs.Write(VmcsFieldXX::HOST_TR_BASE, reinterpret_cast<uint64_t>(&percpu->default_tss));
vmcs.Write(VmcsFieldXX::HOST_GDTR_BASE, reinterpret_cast<uint64_t>(_gdt));
vmcs.Write(VmcsFieldXX::HOST_IDTR_BASE, reinterpret_cast<uint64_t>(idt_get_readonly()));
vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_ESP, 0);
vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_EIP, 0);
vmcs.Write(VmcsField32::HOST_IA32_SYSENTER_CS, 0);
vmcs.Write(VmcsFieldXX::HOST_RSP, reinterpret_cast<uint64_t>(vmx_state));
vmcs.Write(VmcsFieldXX::HOST_RIP, reinterpret_cast<uint64_t>(vmx_exit_entry));
// Setup VMCS guest state.
uint64_t cr0 = X86_CR0_PE | // Enable protected mode
X86_CR0_PG | // Enable paging
X86_CR0_NE; // Enable internal x87 exception handling
if (cr_is_invalid(cr0, X86_MSR_IA32_VMX_CR0_FIXED0, X86_MSR_IA32_VMX_CR0_FIXED1)) {
return ZX_ERR_BAD_STATE;
}
vmcs.Write(VmcsFieldXX::GUEST_CR0, cr0);
uint64_t cr4 = X86_CR4_PAE | // Enable PAE paging
X86_CR4_VMXE; // Enable VMX
if (cr_is_invalid(cr4, X86_MSR_IA32_VMX_CR4_FIXED0, X86_MSR_IA32_VMX_CR4_FIXED1)) {
return ZX_ERR_BAD_STATE;
}
vmcs.Write(VmcsFieldXX::GUEST_CR4, cr4);
// For now, the guest can own all of the CR4 bits except VMXE, which it shouldn't touch.
// TODO(andymutton): Implement proper CR4 handling.
vmcs.Write(VmcsFieldXX::CR4_GUEST_HOST_MASK, X86_CR4_VMXE);
vmcs.Write(VmcsFieldXX::CR4_READ_SHADOW, 0);
vmcs.Write(VmcsField64::GUEST_IA32_PAT, read_msr(X86_MSR_IA32_PAT));
vmcs.Write(VmcsField64::GUEST_IA32_EFER, read_msr(X86_MSR_IA32_EFER));
vmcs.Write(VmcsField32::GUEST_CS_ACCESS_RIGHTS,
GUEST_XX_ACCESS_RIGHTS_TYPE_A |
GUEST_XX_ACCESS_RIGHTS_TYPE_W |
GUEST_XX_ACCESS_RIGHTS_TYPE_E |
GUEST_XX_ACCESS_RIGHTS_TYPE_CODE |
GUEST_XX_ACCESS_RIGHTS_S |
GUEST_XX_ACCESS_RIGHTS_P |
GUEST_XX_ACCESS_RIGHTS_L);
vmcs.Write(VmcsField32::GUEST_TR_ACCESS_RIGHTS,
GUEST_TR_ACCESS_RIGHTS_TSS_BUSY |
GUEST_XX_ACCESS_RIGHTS_P);
// Disable all other segment selectors until we have a guest that uses them.
vmcs.Write(VmcsField32::GUEST_SS_ACCESS_RIGHTS, GUEST_XX_ACCESS_RIGHTS_UNUSABLE);
vmcs.Write(VmcsField32::GUEST_DS_ACCESS_RIGHTS, GUEST_XX_ACCESS_RIGHTS_UNUSABLE);
vmcs.Write(VmcsField32::GUEST_ES_ACCESS_RIGHTS, GUEST_XX_ACCESS_RIGHTS_UNUSABLE);
vmcs.Write(VmcsField32::GUEST_FS_ACCESS_RIGHTS, GUEST_XX_ACCESS_RIGHTS_UNUSABLE);
vmcs.Write(VmcsField32::GUEST_GS_ACCESS_RIGHTS, GUEST_XX_ACCESS_RIGHTS_UNUSABLE);
vmcs.Write(VmcsField32::GUEST_LDTR_ACCESS_RIGHTS, GUEST_XX_ACCESS_RIGHTS_UNUSABLE);
vmcs.Write(VmcsFieldXX::GUEST_GDTR_BASE, 0);
vmcs.Write(VmcsField32::GUEST_GDTR_LIMIT, 0);
vmcs.Write(VmcsFieldXX::GUEST_IDTR_BASE, 0);
vmcs.Write(VmcsField32::GUEST_IDTR_LIMIT, 0);
// Set all reserved RFLAGS bits to their correct values.
vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, X86_FLAGS_RESERVED_ONES);
vmcs.Write(VmcsField32::GUEST_ACTIVITY_STATE, 0);
vmcs.Write(VmcsField32::GUEST_INTERRUPTIBILITY_STATE, 0);
vmcs.Write(VmcsFieldXX::GUEST_PENDING_DEBUG_EXCEPTIONS, 0);
// From Volume 3, Section 26.3.1.1: The IA32_SYSENTER_ESP field and the
// IA32_SYSENTER_EIP field must each contain a canonical address.
vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_ESP, 0);
vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_EIP, 0);
vmcs.Write(VmcsField32::GUEST_IA32_SYSENTER_CS, 0);
vmcs.Write(VmcsFieldXX::GUEST_RSP, 0);
vmcs.Write(VmcsFieldXX::GUEST_RIP, ip);
vmcs.Write(VmcsFieldXX::GUEST_CR3, cr3);
// From Volume 3, Section 24.4.2: If the “VMCS shadowing” VM-execution
// control is 1, the VMREAD and VMWRITE instructions access the VMCS
// referenced by this pointer (see Section 24.10). Otherwise, software
// should set this field to FFFFFFFF_FFFFFFFFH to avoid VM-entry
// failures (see Section 26.3.1.5).
vmcs.Write(VmcsField64::LINK_POINTER, LINK_POINTER_INVALIDATE);
if (x86_feature_test(X86_FEATURE_XSAVE)) {
// Enable x87 state in guest XCR0.
vmx_state->guest_state.xcr0 = X86_XSAVE_STATE_X87;
}
return ZX_OK;
}
// static
zx_status_t Vcpu::Create(zx_vaddr_t ip, zx_vaddr_t cr3, fbl::RefPtr<VmObject> apic_vmo,
paddr_t apic_access_address, paddr_t msr_bitmaps_address,
GuestPhysicalAddressSpace* gpas, PacketMux& mux,
fbl::unique_ptr<Vcpu>* out) {
uint16_t vpid;
zx_status_t status = alloc_vpid(&vpid);
if (status != ZX_OK)
return status;
auto auto_call = fbl::MakeAutoCall([=]() { free_vpid(vpid); });
// When we create a VCPU, we bind it to the current thread and a CPU based
// on the VPID. The VCPU must always be run on the current thread and the
// given CPU, unless an explicit migration is performed.
//
// The reason we do this is that:
// 1. The state of the current thread is stored within the VMCS, to be
// restored upon a guest-to-host transition.
// 2. The state of the VMCS associated with the VCPU is cached within the
// CPU. To move to a different CPU, we must perform an explicit migration
// which will cost us performance.
thread_t* thread = get_current_thread();
pin_thread(thread, vpid);
fbl::AllocChecker ac;
fbl::unique_ptr<Vcpu> vcpu(new (&ac) Vcpu(thread, vpid, apic_vmo, gpas, mux));
if (!ac.check())
return ZX_ERR_NO_MEMORY;
timer_init(&vcpu->local_apic_state_.timer);
event_init(&vcpu->local_apic_state_.event, false, EVENT_FLAG_AUTOUNSIGNAL);
status = vcpu->local_apic_state_.interrupt_bitmap.Reset(kNumInterrupts);
if (status != ZX_OK)
return status;
paddr_t virtual_apic_address;
status = vcpu->apic_vmo_->Lookup(0, PAGE_SIZE, kPfFlags, guest_lookup_page,
&virtual_apic_address);
if (status != ZX_OK)
return status;
vcpu->local_apic_state_.apic_addr = paddr_to_kvaddr(virtual_apic_address);
VmxInfo vmx_info;
status = vcpu->host_msr_page_.Alloc(vmx_info, 0);
if (status != ZX_OK)
return status;
status = vcpu->guest_msr_page_.Alloc(vmx_info, 0);
if (status != ZX_OK)
return status;
status = vcpu->vmcs_page_.Alloc(vmx_info, 0);
if (status != ZX_OK)
return status;
VmxRegion* region = vcpu->vmcs_page_.VirtualAddress<VmxRegion>();
region->revision_id = vmx_info.revision_id;
status = vmcs_init(vcpu->vmcs_page_.PhysicalAddress(), vpid, ip, cr3, virtual_apic_address,
apic_access_address, msr_bitmaps_address, gpas->Pml4Address(),
&vcpu->vmx_state_, &vcpu->host_msr_page_, &vcpu->guest_msr_page_);
if (status != ZX_OK)
return status;
auto_call.cancel();
*out = fbl::move(vcpu);
return ZX_OK;
}
Vcpu::Vcpu(const thread_t* thread, uint16_t vpid, fbl::RefPtr<VmObject> apic_vmo,
GuestPhysicalAddressSpace* gpas, PacketMux& mux)
: thread_(thread), vpid_(vpid), apic_vmo_(apic_vmo), gpas_(gpas), mux_(mux),
vmx_state_(/* zero-init */) {}
Vcpu::~Vcpu() {
if (!vmcs_page_.IsAllocated())
return;
// The destructor may be called from a different thread, therefore we must
// pin the current thread to the same CPU as the VCPU.
AutoPin pin(this);
vmclear(vmcs_page_.PhysicalAddress());
__UNUSED zx_status_t status = free_vpid(vpid_);
DEBUG_ASSERT(status == ZX_OK);
}
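// Runs the guest in a loop: vmexit_handler returns ZX_OK to re-enter the
// guest, ZX_ERR_NEXT when |packet| has been filled for user space, or an
// error to abort.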
zx_status_t Vcpu::Resume(zx_port_packet_t* packet) {
if (!check_pinned_cpu_invariant(thread_, vpid_))
return ZX_ERR_BAD_STATE;
zx_status_t status;
do {
AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
if (x86_feature_test(X86_FEATURE_XSAVE)) {
// Save the host XCR0, and load the guest XCR0.
vmx_state_.host_state.xcr0 = x86_xgetbv(0);
x86_xsetbv(0, vmx_state_.guest_state.xcr0);
}
status = vmx_enter(&vmx_state_);
if (x86_feature_test(X86_FEATURE_XSAVE)) {
// Save the guest XCR0, and load the host XCR0.
vmx_state_.guest_state.xcr0 = x86_xgetbv(0);
x86_xsetbv(0, vmx_state_.host_state.xcr0);
}
if (status != ZX_OK) {
uint64_t error = vmcs.Read(VmcsField32::INSTRUCTION_ERROR);
dprintf(SPEW, "vmlaunch failed: %#" PRIx64 "\n", error);
} else {
vmx_state_.resume = true;
GuestState* guest_state = &vmx_state_.guest_state;
status = vmexit_handler(&vmcs, guest_state, &local_apic_state_, gpas_, mux_, packet);
}
} while (status == ZX_OK);
return status == ZX_ERR_NEXT ? ZX_OK : status;
}
void vmx_exit(VmxState* vmx_state) {
DEBUG_ASSERT(arch_ints_disabled());
// Reload the task segment in order to restore its limit. VMX always
// restores it with a limit of 0x67, which excludes the IO bitmap.
seg_sel_t selector = TSS_SELECTOR(arch_curr_cpu_num());
x86_clear_tss_busy(selector);
x86_ltr(selector);
// Reload the interrupt descriptor table in order to restore its limit. VMX
// always restores it with a limit of 0xffff, which is too large.
idt_load(idt_get_readonly());
}
zx_status_t Vcpu::Interrupt(uint32_t vector) {
if (vector > X86_MAX_INT)
return ZX_ERR_OUT_OF_RANGE;
if (!local_apic_signal_interrupt(&local_apic_state_, vector, true)) {
// If we did not signal the VCPU, it means it is currently running,
// therefore we should issue an IPI to force a VM exit.
mp_reschedule(MP_IPI_TARGET_MASK, 1u << cpu_of(vpid_), 0);
}
return ZX_OK;
}
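// Copies the general-purpose registers between the guest state and the
// user-visible zx_vcpu_state_t; templated so it works in both directions.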
template <typename Out, typename In>
static void register_copy(Out* out, const In& in) {
out->rax = in.rax;
out->rcx = in.rcx;
out->rdx = in.rdx;
out->rbx = in.rbx;
out->rbp = in.rbp;
out->rsi = in.rsi;
out->rdi = in.rdi;
out->r8 = in.r8;
out->r9 = in.r9;
out->r10 = in.r10;
out->r11 = in.r11;
out->r12 = in.r12;
out->r13 = in.r13;
out->r14 = in.r14;
out->r15 = in.r15;
}
zx_status_t Vcpu::ReadState(uint32_t kind, void* buffer, uint32_t len) const {
if (!check_pinned_cpu_invariant(thread_, vpid_))
return ZX_ERR_BAD_STATE;
switch (kind) {
case ZX_VCPU_STATE: {
if (len != sizeof(zx_vcpu_state_t))
break;
auto state = static_cast<zx_vcpu_state_t*>(buffer);
register_copy(state, vmx_state_.guest_state);
AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
state->rsp = vmcs.Read(VmcsFieldXX::GUEST_RSP);
state->flags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_USER;
return ZX_OK;
}
}
return ZX_ERR_INVALID_ARGS;
}
zx_status_t Vcpu::WriteState(uint32_t kind, const void* buffer, uint32_t len) {
if (!check_pinned_cpu_invariant(thread_, vpid_))
return ZX_ERR_BAD_STATE;
switch (kind) {
case ZX_VCPU_STATE: {
if (len != sizeof(zx_vcpu_state_t))
break;
auto state = static_cast<const zx_vcpu_state_t*>(buffer);
register_copy(&vmx_state_.guest_state, *state);
AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
vmcs.Write(VmcsFieldXX::GUEST_RSP, state->rsp);
if (state->flags & X86_FLAGS_RESERVED_ONES) {
const uint64_t rflags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS);
const uint64_t user_flags = (rflags & ~X86_FLAGS_USER) |
(state->flags & X86_FLAGS_USER);
vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, user_flags);
}
return ZX_OK;
}
case ZX_VCPU_IO: {
if (len != sizeof(zx_vcpu_io_t))
break;
auto io = static_cast<const zx_vcpu_io_t*>(buffer);
memcpy(&vmx_state_.guest_state.rax, io->data, io->access_size);
return ZX_OK;
}
}
return ZX_ERR_INVALID_ARGS;
}
zx_status_t x86_vcpu_create(zx_vaddr_t ip, zx_vaddr_t cr3, fbl::RefPtr<VmObject> apic_vmo,
paddr_t apic_access_address, paddr_t msr_bitmaps_address,
GuestPhysicalAddressSpace* gpas, PacketMux& mux,
fbl::unique_ptr<Vcpu>* out) {
return Vcpu::Create(ip, cr3, apic_vmo, apic_access_address, msr_bitmaps_address, gpas, mux,
out);
}
zx_status_t arch_vcpu_resume(Vcpu* vcpu, zx_port_packet_t* packet) {
return vcpu->Resume(packet);
}
zx_status_t arch_vcpu_interrupt(Vcpu* vcpu, uint32_t vector) {
return vcpu->Interrupt(vector);
}
zx_status_t arch_vcpu_read_state(const Vcpu* vcpu, uint32_t kind, void* buffer, uint32_t len) {
return vcpu->ReadState(kind, buffer, len);
}
zx_status_t arch_vcpu_write_state(Vcpu* vcpu, uint32_t kind, const void* buffer, uint32_t len) {
return vcpu->WriteState(kind, buffer, len);
}