// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <bits.h>
#include <inttypes.h>
#include <lib/ktrace.h>
#include <platform.h>
#include <string.h>
#include <trace.h>
#include <zircon/syscalls/hypervisor.h>
#include <zircon/time.h>
#include <zircon/types.h>

#include <arch/hypervisor.h>
#include <arch/x86/apic.h>
#include <arch/x86/feature.h>
#include <arch/x86/hypervisor/invalidate.h>
#include <arch/x86/interrupts.h>
#include <arch/x86/mmu.h>
#include <arch/x86/pv.h>
#include <hypervisor/interrupt_tracker.h>
#include <hypervisor/ktrace.h>
#include <kernel/percpu.h>
#include <kernel/stats.h>
#include <ktl/algorithm.h>
#include <platform/pc/timer.h>
#include <vm/fault.h>
#include <vm/physmap.h>
#include <vm/pmm.h>

#include "pv_priv.h"
#include "vcpu_priv.h"
#include "vmcall_priv.h"
#include "vmexit_priv.h"

#include <ktl/enforce.h>

#define LOCAL_TRACE 0

extern "C" void x86_call_external_interrupt_handler(uint64_t vector);

namespace {

constexpr uint64_t kLocalApicPhysBase =
    APIC_PHYS_BASE | IA32_APIC_BASE_XAPIC_ENABLE | IA32_APIC_BASE_X2APIC_ENABLE;

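// From Volume 3, Section 10.12.1.2: in x2APIC mode the APIC registers are
// accessed as MSRs, where MSR address = 0x800 + (xAPIC MMIO offset >> 4);
// 0x83f (SELF IPI) is the last implemented register.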
constexpr uint64_t kX2ApicMsrBase = 0x800;
constexpr uint64_t kX2ApicMsrMax = 0x83f;

constexpr uint64_t kMiscEnableFastStrings = 1u << 0;

constexpr uint32_t kFirstExtendedStateComponent = 2;
constexpr uint32_t kLastExtendedStateComponent = 9;
// From Volume 1, Section 13.4.
constexpr uint32_t kXsaveLegacyRegionSize = 512;
constexpr uint32_t kXsaveHeaderSize = 64;

// NOTE: x86 instructions are guaranteed to be 15 bytes or fewer.
constexpr uint8_t kMaxInstructionSize = 15;

alignas(uint32_t) constexpr char kHypVendorId[] = "KVMKVMKVM\0\0\0";
static_assert(sizeof(kHypVendorId) - 1 == 12, "Vendor ID must be 12 characters long");

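// From the KVM CPUID documentation (Documentation/virtual/kvm/cpuid.txt):
// KVM_FEATURE_NOP_IO_DELAY indicates that the guest need not perform delays
// on port IO accesses.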
constexpr uint64_t kKvmFeatureNoIoDelay = 1u << 1;

void dump_guest_state(const GuestState& guest_state, const ExitInfo& exit_info) {
  dprintf(INFO, " RAX: %#18lx RCX: %#18lx RDX: %#18lx RBX: %#18lx\n", guest_state.rax,
          guest_state.rcx, guest_state.rdx, guest_state.rbx);
  dprintf(INFO, " RSP: xxxxxxxx xxxxxxxx RBP: %#18lx RSI: %#18lx RDI: %#18lx\n",
          guest_state.rbp, guest_state.rsi, guest_state.rdi);
  dprintf(INFO, " R8: %#18lx R9: %#18lx R10: %#18lx R11: %#18lx\n", guest_state.r8,
          guest_state.r9, guest_state.r10, guest_state.r11);
  dprintf(INFO, " R12: %#18lx R13: %#18lx R14: %#18lx R15: %#18lx\n", guest_state.r12,
          guest_state.r13, guest_state.r14, guest_state.r15);
  dprintf(INFO, " RIP: %#18lx CR2: %#18lx XCR0: %#18lx\n", exit_info.guest_rip, guest_state.cr2,
          guest_state.xcr0);

  dprintf(INFO, "entry failure: %d\n", exit_info.entry_failure);
  dprintf(INFO, "exit instruction length: %u\n", exit_info.exit_instruction_length);
}

void next_rip(const ExitInfo& exit_info, AutoVmcs& vmcs) {
  vmcs.Write(VmcsFieldXX::GUEST_RIP, exit_info.guest_rip + exit_info.exit_instruction_length);

  // Clear any flags blocking interrupt injection for a single instruction.
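  // An STI or a MOV to SS blocks interrupts until after the next instruction
  // executes; since we have just advanced the guest RIP past that
  // instruction, the blocking no longer applies.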
  uint32_t guest_interruptibility = vmcs.Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE);
  uint32_t new_interruptibility =
      guest_interruptibility & ~(kInterruptibilityStiBlocking | kInterruptibilityMovSsBlocking);
  if (new_interruptibility != guest_interruptibility) {
    vmcs.Write(VmcsField32::GUEST_INTERRUPTIBILITY_STATE, new_interruptibility);
  }
}

zx::result<> handle_exception_or_nmi(AutoVmcs& vmcs) {
  const ExitInterruptionInfo int_info(vmcs);
  DEBUG_ASSERT(int_info.valid);
  // Only handle page faults; everything else should terminate the VCPU.
  if (int_info.interruption_type != InterruptionType::HARDWARE_EXCEPTION ||
      int_info.vector != X86_INT_PAGE_FAULT) {
    return zx::error(ZX_ERR_BAD_STATE);
  }
  auto thread = Thread::Current::Get();
  // Page fault resume should not end up here.
  if (thread->arch().page_fault_resume != 0) {
    return zx::error(ZX_ERR_INTERNAL);
  }

  const zx_vaddr_t guest_vaddr = vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION);
  DEBUG_ASSERT(int_info.error_code_valid);
  const PageFaultInfo pf_info(vmcs.Read(VmcsField32::EXIT_INTERRUPTION_ERROR_CODE));

  // We may have to block when handling the page fault.
  vmcs.Invalidate();
  zx_status_t status = vmm_page_fault_handler(guest_vaddr, pf_info.flags);
  return zx::make_result(status);
}

void handle_external_interrupt(AutoVmcs& vmcs) {
  const ExitInterruptionInfo int_info(vmcs);
  DEBUG_ASSERT(int_info.valid);
  DEBUG_ASSERT(int_info.interruption_type == InterruptionType::EXTERNAL_INTERRUPT);
  vmcs.Invalidate();
  x86_call_external_interrupt_handler(int_info.vector);
}

void handle_interrupt_window(AutoVmcs& vmcs) { vmcs.InterruptWindowExiting(false); }

// From Volume 2, Section 3.2, Table 3-8 "Processor Extended State Enumeration
// Main Leaf (EAX = 0DH, ECX = 0)".
//
// Bits 31-00: Maximum size (bytes, from the beginning of the XSAVE/XRSTOR save
// area) required by enabled features in XCR0. May be different than ECX if some
// features at the end of the XSAVE save area are not enabled.
zx::result<uint32_t> compute_xsave_size(uint64_t guest_xcr0) {
  uint32_t xsave_size = kXsaveLegacyRegionSize + kXsaveHeaderSize;
  for (uint32_t i = kFirstExtendedStateComponent; i <= kLastExtendedStateComponent; ++i) {
    cpuid_leaf leaf;
    if (!(guest_xcr0 & (1 << i))) {
      continue;
    }
    if (!x86_get_cpuid_subleaf(X86_CPUID_XSAVE, i, &leaf)) {
      return zx::error(ZX_ERR_INTERNAL);
    }
    if (leaf.a == 0 && leaf.b == 0 && leaf.c == 0 && leaf.d == 0) {
      continue;
    }
    const uint32_t component_offset = leaf.b;
    const uint32_t component_size = leaf.a;
    xsave_size = component_offset + component_size;
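    // In the standard (non-compacted) format, component offsets increase with
    // the component index, so after the loop this holds the end of the
    // highest enabled component, i.e. the total save area size.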
  }
  return zx::ok(xsave_size);
}

zx::result<> handle_cpuid(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state) {
  const uint32_t leaf = guest_state.eax();
  const uint32_t subleaf = guest_state.ecx();

  next_rip(exit_info, vmcs);
  switch (leaf) {
    case X86_CPUID_BASE:
    case X86_CPUID_EXT_BASE:
      cpuid(leaf, reinterpret_cast<uint32_t*>(&guest_state.rax),
            reinterpret_cast<uint32_t*>(&guest_state.rbx),
            reinterpret_cast<uint32_t*>(&guest_state.rcx),
            reinterpret_cast<uint32_t*>(&guest_state.rdx));
      return zx::ok();
    case X86_CPUID_BASE + 1 ... MAX_SUPPORTED_CPUID:
    case X86_CPUID_EXT_BASE + 1 ... MAX_SUPPORTED_CPUID_EXT:
      cpuid_c(leaf, subleaf, reinterpret_cast<uint32_t*>(&guest_state.rax),
              reinterpret_cast<uint32_t*>(&guest_state.rbx),
              reinterpret_cast<uint32_t*>(&guest_state.rcx),
              reinterpret_cast<uint32_t*>(&guest_state.rdx));
      switch (leaf) {
        case X86_CPUID_MODEL_FEATURES:
          // Override the initial local APIC ID. From Vol 2, Table 3-8.
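          // VPIDs are 1-based while APIC IDs are 0-based, hence the
          // subtraction.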
          guest_state.rbx &= ~(0xffu << 24);
          guest_state.rbx |= (vmcs.Read(VmcsField16::VPID) - 1) << 24;
          // Enable the hypervisor bit.
          guest_state.rcx |= 1u << X86_FEATURE_HYPERVISOR.bit;
          // Enable the x2APIC bit.
          guest_state.rcx |= 1u << X86_FEATURE_X2APIC.bit;
          // Always enable TSC deadline (this doesn't depend on the host feature).
          guest_state.rcx |= 1u << X86_FEATURE_TSC_DEADLINE.bit;
          // Disable the VMX bit.
          guest_state.rcx &= ~(1u << X86_FEATURE_VMX.bit);
          // Disable the PDCM bit.
          guest_state.rcx &= ~(1u << X86_FEATURE_PDCM.bit);
          // Disable MONITOR/MWAIT.
          guest_state.rcx &= ~(1u << X86_FEATURE_MON.bit);
          // Disable the THERM_INTERRUPT and THERM_STATUS MSRs.
          guest_state.rcx &= ~(1u << X86_FEATURE_TM2.bit);
          // Enable SEP (SYSENTER support).
          guest_state.rdx |= 1u << X86_FEATURE_SEP.bit;
          // Disable the Thermal Monitor bit.
          guest_state.rdx &= ~(1u << X86_FEATURE_TM.bit);
          // Disable the THERM_CONTROL_MSR bit.
          guest_state.rdx &= ~(1u << X86_FEATURE_ACPI.bit);
          break;
        case X86_CPUID_TOPOLOGY:
          guest_state.rax = 0;
          guest_state.rbx = 0;
          guest_state.rcx = 0;
          guest_state.rdx = vmcs.Read(VmcsField16::VPID) - 1;
          break;
        case X86_CPUID_XSAVE:
          if (subleaf == 0) {
            auto xsave_size = compute_xsave_size(guest_state.xcr0);
            if (xsave_size.is_error()) {
              return xsave_size.take_error();
            }
            guest_state.rbx = *xsave_size;
          } else if (subleaf == 1) {
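            // CPUID.(EAX=0DH, ECX=1):EAX bit 3 enumerates XSAVES/XRSTORS and
            // IA32_XSS, which we do not support.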
            guest_state.rax &= ~(1u << 3);
          }
          break;
        case X86_CPUID_THERMAL_AND_POWER:
          // Disable the performance energy bias bit.
          guest_state.rcx &= ~(1u << X86_FEATURE_PERF_BIAS.bit);
          // Disable the hardware coordination feedback bit.
          guest_state.rcx &= ~(1u << X86_FEATURE_HW_FEEDBACK.bit);
          guest_state.rax &= ~(
              // Disable the Digital Thermal Sensor.
              1u << X86_FEATURE_DTS.bit |
              // Disable the Package Thermal Status MSR.
              1u << X86_FEATURE_PTM.bit |
              // Disable THERM_STATUS MSR bits 10/11 and THERM_INTERRUPT MSR bit 24.
              1u << X86_FEATURE_PLN.bit |
              // Disable the HWP MSRs.
              1u << X86_FEATURE_HWP.bit | 1u << X86_FEATURE_HWP_NOT.bit |
              1u << X86_FEATURE_HWP_ACT.bit | 1u << X86_FEATURE_HWP_PREF.bit |
              // Don't advertise Turbo.
              1u << X86_FEATURE_TURBO.bit | 1u << X86_FEATURE_TURBO_MAX.bit);
          break;
        case X86_CPUID_PERFORMANCE_MONITORING: {
          // Disable all performance monitoring.
          // 31-07 = Reserved 0, 06-00 = 1 if event is not available.
          const uint32_t performance_monitoring_no_events = 0b1111111;
          guest_state.rax = 0;
          guest_state.rbx = performance_monitoring_no_events;
          guest_state.rcx = 0;
          guest_state.rdx = 0;
          break;
        }
        case X86_CPUID_MON:
          // MONITOR/MWAIT are not implemented.
          guest_state.rax = 0;
          guest_state.rbx = 0;
          guest_state.rcx = 0;
          guest_state.rdx = 0;
          break;
        case X86_CPUID_EXTENDED_FEATURE_FLAGS:
          // When running under KVM in nVMX mode, it is possible for host
          // CPUID to indicate that INVPCID is supported while VMX does not
          // allow the INVPCID bit to be enabled in the secondary
          // processor-based controls. Therefore, explicitly clear the INVPCID
          // bit in CPUID if the VMX flag wasn't set.
          if ((vmcs.Read(VmcsField32::PROCBASED_CTLS2) & kProcbasedCtls2Invpcid) == 0) {
            guest_state.rbx &= ~(1u << X86_FEATURE_INVPCID.bit);
          }
          // Disable:
          //  * Processor Trace bit
          //  * TSC Adjust bit
          guest_state.rbx &= ~(1u << X86_FEATURE_PT.bit | 1u << X86_FEATURE_TSC_ADJUST.bit);
          // Disable:
          //  * Indirect Branch Prediction Barrier bit
          //  * Single Thread Indirect Branch Predictors bit
          //  * Speculative Store Bypass Disable bit
          // These imply support for the IA32_SPEC_CTRL and IA32_PRED_CMD
          // MSRs, which are not implemented.
          guest_state.rdx &= ~(1u << X86_FEATURE_IBRS_IBPB.bit | 1u << X86_FEATURE_STIBP.bit |
                               1u << X86_FEATURE_SSBD.bit);
          // Disable support for the IA32_ARCH_CAPABILITIES MSR.
          guest_state.rdx &= ~(1u << X86_FEATURE_ARCH_CAPABILITIES.bit);
          // Disable support for the IA32_FLUSH_CMD MSR.
          guest_state.rdx &= ~(1u << X86_FEATURE_L1D_FLUSH.bit);

          // TODO(https://fxbug.dev/42060002): Enable AVX-512 if supported.
          //
          // Disabling this to work around invalid opcode errors trying to execute these
          // instructions.
          guest_state.rbx &= ~(1u << X86_FEATURE_AVX512F.bit | 1u << X86_FEATURE_AVX512DQ.bit |
                               1u << X86_FEATURE_AVX512IFMA.bit | 1u << X86_FEATURE_AVX512PF.bit |
                               1u << X86_FEATURE_AVX512ER.bit | 1u << X86_FEATURE_AVX512CD.bit |
                               1u << X86_FEATURE_AVX512BW.bit | 1u << X86_FEATURE_AVX512VL.bit);
          guest_state.rcx &=
              ~(1u << X86_FEATURE_AVX512VBMI.bit | 1u << X86_FEATURE_AVX512VBMI2.bit |
                1u << X86_FEATURE_AVX512VNNI.bit | 1u << X86_FEATURE_AVX512BITALG.bit |
                1u << X86_FEATURE_AVX512VPDQ.bit);
          guest_state.rdx &=
              ~(1u << X86_FEATURE_AVX512QVNNIW.bit | 1u << X86_FEATURE_AVX512QFMA.bit);

          break;
      }
      return zx::ok();
    case X86_CPUID_HYP_VENDOR: {
      // This leaf is commonly used to identify a hypervisor via ebx:ecx:edx.
      auto regs = reinterpret_cast<const uint32_t*>(kHypVendorId);
      // Since the Zircon hypervisor disguises itself as KVM, it needs to
      // return the maximum CPUID function supported by the hypervisor in EAX.
      // Zero in EAX should be interpreted as 0x40000001. Details are
      // available in the Linux kernel documentation
      // (Documentation/virtual/kvm/cpuid.txt).
      guest_state.rax = X86_CPUID_KVM_FEATURES;
      guest_state.rbx = regs[0];
      guest_state.rcx = regs[1];
      guest_state.rdx = regs[2];
      return zx::ok();
    }
    case X86_CPUID_KVM_FEATURES:
      // We support the KVM clock, and the guest need not delay port IO.
      guest_state.rax = kKvmFeatureClockSourceOld | kKvmFeatureClockSource | kKvmFeatureNoIoDelay;
      guest_state.rbx = 0;
      guest_state.rcx = 0;
      guest_state.rdx = 0;
      return zx::ok();
    // From Volume 2A, CPUID instruction reference. If the EAX value is
    // outside the range recognized by CPUID, then the information for the
    // highest supported base information leaf is returned. Any value in ECX
    // is honored.
    default:
      cpuid_c(MAX_SUPPORTED_CPUID, subleaf, reinterpret_cast<uint32_t*>(&guest_state.rax),
              reinterpret_cast<uint32_t*>(&guest_state.rbx),
              reinterpret_cast<uint32_t*>(&guest_state.rcx),
              reinterpret_cast<uint32_t*>(&guest_state.rdx));
      return zx::ok();
  }
}

zx::result<> handle_hlt(const ExitInfo& exit_info, AutoVmcs& vmcs,
                        LocalApicState& local_apic_state) {
  next_rip(exit_info, vmcs);
  return local_apic_state.interrupt_tracker.Wait(ZX_TIME_INFINITE, &vmcs);
}

zx::result<> handle_cr0_write(AutoVmcs& vmcs, uint64_t val, LocalApicState& local_apic_state) {
  // X86_CR0_NE is masked so that guests may write to it, but depending on
  // IA32_VMX_CR0_FIXED0, clearing it may be unsupported in VMX operation.
  // Allow the guest to control its value in CR0_READ_SHADOW, but not in
  // GUEST_CR0, so that GUEST_CR0 stays valid.
  uint64_t cr0 = val | X86_CR0_NE;
  if (cr0_is_invalid(vmcs, cr0)) {
    return zx::error(ZX_ERR_INVALID_ARGS);
  }

  // From Volume 3, Table 11-5: CD=0 and NW=1 is an invalid setting and should
  // generate a GP fault.
  if (!(val & X86_CR0_CD) && (val & X86_CR0_NW)) {
    local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
    return zx::ok();
  }

  // If CR0.PG is being changed, then invalidate the VPID.
  uint64_t cr0_changed = val ^ vmcs.Read(VmcsFieldXX::GUEST_CR0);
  if (cr0_changed & X86_CR0_PG) {
    uint16_t vpid = vmcs.Read(VmcsField16::VPID);
    invvpid(InvVpid::SINGLE_CONTEXT, vpid, 0);
  }

  // From Volume 3, Section 26.3.2.1: CR0 is loaded from the CR0 field with the
  // exception of the following bits, which are never modified on VM entry: ET
  // (bit 4); reserved bits ...; NW (bit 29) and CD (bit 30). The values of
  // these bits in the CR0 field are ignored.
  //
  // Even though these bits will be ignored on VM entry, to ensure that
  // GUEST_CR0 matches the actual value of CR0 while the guest is running, set
  // those bits to match the host values. This is done only to make debugging
  // simpler.
  cr0 &= ~(X86_CR0_NW | X86_CR0_CD);
  cr0 |= X86_CR0_ET;
  vmcs.Write(VmcsFieldXX::GUEST_CR0, cr0);

  // From Volume 3, Section 25.3: For each position corresponding to a bit
  // clear in the CR0 guest/host mask, the destination operand is loaded with
  // the value of the corresponding bit in CR0. For each position
  // corresponding to a bit set in the CR0 guest/host mask, the destination
  // operand is loaded with the value of the corresponding bit in the CR0 read
  // shadow.
  //
  // Allow the guest to control the shadow.
  vmcs.Write(VmcsFieldXX::CR0_READ_SHADOW, val);

  // From Volume 3, Section 26.3.1.1: If CR0.PG and EFER.LME are set, then
  // EFER.LMA and the IA-32e mode guest entry control must also be set.
  uint64_t efer = vmcs.Read(VmcsField64::GUEST_IA32_EFER);
  if (!(efer & X86_EFER_LME && cr0 & X86_CR0_PG)) {
    return zx::ok();
  }
  vmcs.Write(VmcsField64::GUEST_IA32_EFER, efer | X86_EFER_LMA);
  return vmcs.SetControl(VmcsField32::ENTRY_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS),
                         read_msr(X86_MSR_IA32_VMX_ENTRY_CTLS), kEntryCtls64bitMode, 0);
}

zx::result<uint64_t> register_value(AutoVmcs& vmcs, const GuestState& guest_state,
                                    uint8_t register_id) {
  switch (register_id) {
    // From Intel Volume 3, Table 27-3.
    case 0:
      return zx::ok(guest_state.rax);
    case 1:
      return zx::ok(guest_state.rcx);
    case 2:
      return zx::ok(guest_state.rdx);
    case 3:
      return zx::ok(guest_state.rbx);
    case 4:
      return zx::ok(vmcs.Read(VmcsFieldXX::GUEST_RSP));
    case 5:
      return zx::ok(guest_state.rbp);
    case 6:
      return zx::ok(guest_state.rsi);
    case 7:
      return zx::ok(guest_state.rdi);
    case 8:
      return zx::ok(guest_state.r8);
    case 9:
      return zx::ok(guest_state.r9);
    case 10:
      return zx::ok(guest_state.r10);
    case 11:
      return zx::ok(guest_state.r11);
    case 12:
      return zx::ok(guest_state.r12);
    case 13:
      return zx::ok(guest_state.r13);
    case 14:
      return zx::ok(guest_state.r14);
    case 15:
      return zx::ok(guest_state.r15);
    default:
      return zx::error(ZX_ERR_INVALID_ARGS);
  }
}

zx::result<> handle_control_register_access(const ExitInfo& exit_info, AutoVmcs& vmcs,
                                            const GuestState& guest_state,
                                            LocalApicState& local_apic_state) {
  const CrAccessInfo cr_access_info(vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION));
  switch (cr_access_info.access_type) {
    case CrAccessType::MOV_TO_CR: {
      // Handle CR0 only.
      if (cr_access_info.cr_number != 0) {
        return zx::error(ZX_ERR_NOT_SUPPORTED);
      }
      auto val = register_value(vmcs, guest_state, cr_access_info.reg);
      if (val.is_error()) {
        return val.take_error();
      }
      auto result = handle_cr0_write(vmcs, *val, local_apic_state);
      if (result.is_error()) {
        return result.take_error();
      }
      next_rip(exit_info, vmcs);
      return zx::ok();
    }
    default:
      return zx::error(ZX_ERR_NOT_SUPPORTED);
  }
}

zx::result<> handle_io_instruction(const ExitInfo& exit_info, AutoVmcs& vmcs,
                                   GuestState& guest_state, hypervisor::TrapMap& traps,
                                   zx_port_packet_t& packet) {
  const IoInfo io_info(vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION));
  if (io_info.string || io_info.repeat) {
    dprintf(INFO, "hypervisor: Unsupported guest IO instruction\n");
    return zx::error(ZX_ERR_NOT_SUPPORTED);
  }

  zx::result<hypervisor::Trap*> trap = traps.FindTrap(ZX_GUEST_TRAP_IO, io_info.port);
  if (trap.is_error()) {
    dprintf(INFO, "hypervisor: Unhandled guest IO port %s %#x\n", io_info.input ? "read" : "write",
            io_info.port);
    return trap.take_error();
  }
  next_rip(exit_info, vmcs);

  memset(&packet, 0, sizeof(packet));
  packet.key = (*trap)->key();
  packet.type = ZX_PKT_TYPE_GUEST_IO;
  packet.guest_io.port = io_info.port;
  packet.guest_io.access_size = io_info.access_size;
  packet.guest_io.input = io_info.input;
  if (io_info.input) {
    // From Volume 1, Section 3.4.1.1: 32-bit operands generate a 32-bit
    // result, zero-extended to a 64-bit result in the destination general-
    // purpose register.
    if (io_info.access_size == 4) {
      guest_state.rax = 0;
    }
  } else {
    memcpy(packet.guest_io.data, &guest_state.rax, io_info.access_size);
    if ((*trap)->HasPort()) {
      return (*trap)->Queue(packet, &vmcs);
    }
    // If there was no port for the range, then return to user-space.
  }

  return zx::error(ZX_ERR_NEXT);
}

void handle_apic_rdmsr(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state,
                       LocalApicState& local_apic_state) {
  switch (static_cast<X2ApicMsr>(guest_state.ecx())) {
    case X2ApicMsr::ID:
      next_rip(exit_info, vmcs);
      guest_state.rax = vmcs.Read(VmcsField16::VPID) - 1;
      break;
    case X2ApicMsr::VERSION: {
      next_rip(exit_info, vmcs);
      // We choose 15H as it causes us to be seen as a modern APIC by Linux,
      // and is the highest non-reserved value. See Volume 3 Section 10.4.8.
      const uint32_t version = 0x15;
      const uint32_t max_lvt_entry = 0x6;  // LVT entries minus 1.
      const uint32_t eoi_suppression = 0;  // Disable support for EOI-broadcast suppression.
      guest_state.rax = version | (max_lvt_entry << 16) | (eoi_suppression << 24);
      break;
    }
    case X2ApicMsr::SVR:
      // The spurious interrupt vector resets to 0xff. See Volume 3 Section 10.12.5.1.
      next_rip(exit_info, vmcs);
      guest_state.rax = 0xff;
      break;
    case X2ApicMsr::TPR:
    case X2ApicMsr::LDR:
    case X2ApicMsr::ISR_31_0 ... X2ApicMsr::ISR_255_224:
    case X2ApicMsr::TMR_31_0 ... X2ApicMsr::TMR_255_224:
    case X2ApicMsr::IRR_31_0 ... X2ApicMsr::IRR_255_224:
    case X2ApicMsr::ESR:
    case X2ApicMsr::LVT_MONITOR:
      // These registers reset to 0. See Volume 3 Section 10.12.5.1.
      next_rip(exit_info, vmcs);
      guest_state.rax = 0;
      break;
    case X2ApicMsr::LVT_LINT0:
    case X2ApicMsr::LVT_LINT1:
    case X2ApicMsr::LVT_THERMAL_SENSOR:
    case X2ApicMsr::LVT_CMCI:
      // LVT registers reset with the mask bit set. See Volume 3 Section 10.12.5.1.
      next_rip(exit_info, vmcs);
      guest_state.rax = LVT_MASKED;
      break;
    case X2ApicMsr::LVT_TIMER:
      next_rip(exit_info, vmcs);
      guest_state.rax = local_apic_state.lvt_timer;
      break;
    default:
      // Issue a general protection fault for write-only and unimplemented
      // registers.
      dprintf(INFO, "hypervisor: Unhandled guest x2APIC RDMSR %#lx\n", guest_state.rcx);
      local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
      break;
  }
}

void handle_rdmsr(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state,
                  LocalApicState& local_apic_state) {
  // On execution of rdmsr, ecx specifies the MSR and the result is stored in edx:eax.
  switch (guest_state.ecx()) {
    case X86_MSR_IA32_APIC_BASE: {
      next_rip(exit_info, vmcs);
      uint64_t result = kLocalApicPhysBase;
      if (vmcs.Read(VmcsField16::VPID) == 1) {
        result |= IA32_APIC_BASE_BSP;
      }
      guest_state.SetEdxEax(result);
      break;
    }
    // From Volume 4, Section 2.1, Table 2-2: For now, only enable fast strings.
    case X86_MSR_IA32_MISC_ENABLE:
      next_rip(exit_info, vmcs);
      guest_state.SetEdxEax(read_msr(X86_MSR_IA32_MISC_ENABLE) & kMiscEnableFastStrings);
      break;
    case X86_MSR_DRAM_ENERGY_STATUS:
    case X86_MSR_DRAM_POWER_LIMIT:
    // From Volume 3, Section 28.2.6.2: The MTRRs have no effect on the memory
    // type used for an access to a guest-physical address.
    case X86_MSR_IA32_MTRRCAP:
    case X86_MSR_IA32_MTRR_DEF_TYPE:
    case X86_MSR_IA32_MTRR_FIX64K_00000:
    case X86_MSR_IA32_MTRR_FIX16K_80000 ... X86_MSR_IA32_MTRR_FIX16K_A0000:
    case X86_MSR_IA32_MTRR_FIX4K_C0000 ... X86_MSR_IA32_MTRR_FIX4K_F8000:
    case X86_MSR_IA32_MTRR_PHYSBASE0 ... X86_MSR_IA32_MTRR_PHYSMASK9:
    // From Volume 3, Section 9.11.4: For now, 0.
    case X86_MSR_IA32_PLATFORM_ID:
    // From Volume 3, Section 9.11.7: 0 indicates no microcode update is loaded.
    case X86_MSR_IA32_BIOS_SIGN_ID:
    // From Volume 3, Section 15.3.1: 0 indicates that our machine has no
    // checking capabilities.
    case X86_MSR_IA32_MCG_CAP:
    case X86_MSR_IA32_MCG_STATUS:
    case X86_MSR_IA32_TEMPERATURE_TARGET:
    case X86_MSR_PKG_ENERGY_STATUS:
    case X86_MSR_PLATFORM_ENERGY_COUNTER:
    case X86_MSR_PLATFORM_POWER_LIMIT:
    case X86_MSR_PP0_ENERGY_STATUS:
    case X86_MSR_PP0_POWER_LIMIT:
    case X86_MSR_PP1_ENERGY_STATUS:
    case X86_MSR_PP1_POWER_LIMIT:
    case X86_MSR_RAPL_POWER_UNIT:
    // From Volume 3, Section 14.2: We've configured CPUID to report no MPERF/APERF
    // support, but Linux attempts to read stats anyhow. Just ignore it.
    case X86_MSR_PPERF:
    // From Volume 4, Table 2-15: Number of SMI interrupts since boot.
    // We report 0 interrupts.
    case X86_MSR_SMI_COUNT:
      next_rip(exit_info, vmcs);
      guest_state.SetEdxEax(0);
      break;
    case kX2ApicMsrBase ... kX2ApicMsrMax:
      handle_apic_rdmsr(exit_info, vmcs, guest_state, local_apic_state);
      break;
    default:
      dprintf(INFO, "hypervisor: Unhandled guest RDMSR %#lx\n", guest_state.rcx);
      local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
      break;
  }
}

zx_time_t lvt_deadline(LocalApicState& local_apic_state) {
  if ((local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) != LVT_TIMER_MODE_ONESHOT &&
      (local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) != LVT_TIMER_MODE_PERIODIC) {
    return 0;
  }
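  // From Volume 3, Section 10.5.4: the timer's divide configuration is
  // encoded in bits 0, 1, and 3 of the divide configuration register, and
  // selects a divisor of 2^(value + 1), where a value of 7 wraps around to a
  // divisor of 1. Shifting the initial count left by divisor_shift therefore
  // converts it to clock ticks; e.g. a divide value of 0b011 gives a divisor
  // of 16, so an initial count of 1000 becomes 16000 ticks.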
  uint32_t shift = BITS_SHIFT(local_apic_state.lvt_divide_config, 1, 0) |
                   (BIT_SHIFT(local_apic_state.lvt_divide_config, 3) << 2);
  uint32_t divisor_shift = (shift + 1) & 7;
  int64_t duration_tsc_ticks =
      static_cast<int64_t>(local_apic_state.lvt_initial_count) << divisor_shift;
  zx_duration_t duration = convert_raw_tsc_duration_to_nanoseconds(duration_tsc_ticks);
  return zx_time_add_duration(current_time(), duration);
}

void update_timer(LocalApicState& local_apic_state, zx_time_t deadline);

void deadline_callback(Timer* timer, zx_time_t now, void* arg) {
  auto& local_apic_state = *static_cast<LocalApicState*>(arg);
  if (local_apic_state.lvt_timer & LVT_MASKED) {
    return;
  }
  if ((local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) == LVT_TIMER_MODE_PERIODIC) {
    update_timer(local_apic_state, lvt_deadline(local_apic_state));
  }
  uint8_t vector = local_apic_state.lvt_timer & LVT_TIMER_VECTOR_MASK;
  local_apic_state.interrupt_tracker.Interrupt(vector);
}

void update_timer(LocalApicState& local_apic_state, zx_time_t deadline) {
  local_apic_state.timer.Cancel();
  if (deadline > 0) {
    local_apic_state.timer.SetOneshot(deadline, deadline_callback, &local_apic_state);
  }
}

uint64_t ipi_target_mask(const InterruptCommandRegister& icr, uint16_t self) {
  DEBUG_ASSERT(self < NormalGuest::kMaxGuestVcpus);

  switch (icr.destination_shorthand) {
    case InterruptDestinationShorthand::NO_SHORTHAND: {
      // Intel Volume 3, Section 10.12.9: A destination ID value of FFFF_FFFFH
      // is used for broadcast of interrupts in both logical destination and
      // physical destination modes.
      if (icr.destination == kIpiBroadcastDestination) {
        return UINT64_MAX;
      }

      // If an invalid destination was provided, just return the empty mask.
      if (unlikely(icr.destination >= NormalGuest::kMaxGuestVcpus)) {
        return 0;
      }

      // Otherwise, generate a mask for the target VCPU.
      return uint64_t{1} << icr.destination;
    }
    case InterruptDestinationShorthand::SELF:
      return uint64_t{1} << self;
    case InterruptDestinationShorthand::ALL_INCLUDING_SELF:
      return UINT64_MAX;
    case InterruptDestinationShorthand::ALL_EXCLUDING_SELF:
      return ~(uint64_t{1} << self);
  }

  return 0;
}

zx::result<> handle_ipi(const ExitInfo& exit_info, AutoVmcs& vmcs, const GuestState& guest_state,
                        zx_port_packet& packet) {
  InterruptCommandRegister icr(guest_state.edx(), guest_state.eax());
  if (icr.destination_mode == InterruptDestinationMode::LOGICAL) {
    dprintf(INFO, "hypervisor: Logical IPI destination mode requested by guest is not supported\n");
    return zx::error(ZX_ERR_NOT_SUPPORTED);
  }
  switch (icr.delivery_mode) {
    case InterruptDeliveryMode::FIXED: {
      uint16_t self = vmcs.Read(VmcsField16::VPID) - 1;
      memset(&packet, 0, sizeof(packet));
      packet.type = ZX_PKT_TYPE_GUEST_VCPU;
      packet.guest_vcpu.type = ZX_PKT_GUEST_VCPU_INTERRUPT;
      packet.guest_vcpu.interrupt.mask = ipi_target_mask(icr, self);
      packet.guest_vcpu.interrupt.vector = icr.vector;
      next_rip(exit_info, vmcs);
      return zx::error(ZX_ERR_NEXT);
    }
    case InterruptDeliveryMode::NMI: {
      uint16_t self = vmcs.Read(VmcsField16::VPID) - 1;
      memset(&packet, 0, sizeof(packet));
      packet.type = ZX_PKT_TYPE_GUEST_VCPU;
      packet.guest_vcpu.type = ZX_PKT_GUEST_VCPU_INTERRUPT;
      // Intel Volume 3a, Table 10-4 specifies that NMI to self is an invalid
      // configuration, and behavior is undefined for invalid configurations.
      //
      // For simplicity we'll just clear the self-bit in the mask.
      packet.guest_vcpu.interrupt.mask = ipi_target_mask(icr, self) & ~(uint64_t{1} << self);

      // Intel Volume 3a, Section 10.6.1 Interrupt Command Register.
      //
      // For NMI the target information is ignored since the NMI vector is already defined.
      packet.guest_vcpu.interrupt.vector = X86_INT_NMI;
      next_rip(exit_info, vmcs);
      return zx::error(ZX_ERR_NEXT);
    }
    case InterruptDeliveryMode::INIT:
      // Ignore INIT IPIs; we only need STARTUP to bring up a VCPU.
      next_rip(exit_info, vmcs);
      return zx::ok();
    case InterruptDeliveryMode::STARTUP:
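      // The SIPI vector encodes the physical page of the real-mode entry
      // point (the CPU starts at CS = vector << 8, IP = 0), so the entry
      // address is vector << 12.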
      memset(&packet, 0, sizeof(packet));
      packet.type = ZX_PKT_TYPE_GUEST_VCPU;
      packet.guest_vcpu.type = ZX_PKT_GUEST_VCPU_STARTUP;
      packet.guest_vcpu.startup.id = icr.destination;
      packet.guest_vcpu.startup.entry = icr.vector << 12;
      next_rip(exit_info, vmcs);
      return zx::error(ZX_ERR_NEXT);
    default:
      dprintf(INFO, "hypervisor: Unsupported guest IPI delivery mode %#x\n",
              static_cast<uint8_t>(icr.delivery_mode));
      return zx::error(ZX_ERR_NOT_SUPPORTED);
  }
}

zx::result<> handle_apic_wrmsr(const ExitInfo& exit_info, AutoVmcs& vmcs,
                               const GuestState& guest_state, LocalApicState& local_apic_state,
                               zx_port_packet& packet) {
  // Check for writes to reserved bits.
  //
  // From Volume 3, Section 10.12.1.2: "The upper 32-bits of all x2APIC MSRs
  // (except for the ICR) are reserved."
  X2ApicMsr reg = static_cast<X2ApicMsr>(guest_state.ecx());
  if (unlikely(guest_state.edx() != 0 && reg != X2ApicMsr::ICR)) {
    local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
    return zx::ok();
  }

  switch (reg) {
    case X2ApicMsr::EOI:
    case X2ApicMsr::ESR:
      // From Volume 3, Section 10.12.1.2: "WRMSR of a non-zero value causes #GP(0)."
      if (guest_state.eax() != 0) {
        local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
        return zx::ok();
      }
      next_rip(exit_info, vmcs);
      return zx::ok();
    case X2ApicMsr::TPR:
    case X2ApicMsr::SVR:
    case X2ApicMsr::LVT_MONITOR:
    case X2ApicMsr::LVT_ERROR:
    case X2ApicMsr::LVT_LINT0:
    case X2ApicMsr::LVT_LINT1:
    case X2ApicMsr::LVT_THERMAL_SENSOR:
    case X2ApicMsr::LVT_CMCI:
      next_rip(exit_info, vmcs);
      return zx::ok();
    case X2ApicMsr::LVT_TIMER:
      if ((guest_state.eax() & LVT_TIMER_MODE_MASK) == LVT_TIMER_MODE_RESERVED) {
        return zx::error(ZX_ERR_INVALID_ARGS);
      }
      next_rip(exit_info, vmcs);
      local_apic_state.lvt_timer = guest_state.eax();
      update_timer(local_apic_state, lvt_deadline(local_apic_state));
      return zx::ok();
    case X2ApicMsr::INITIAL_COUNT:
      next_rip(exit_info, vmcs);
      local_apic_state.lvt_initial_count = guest_state.eax();
      update_timer(local_apic_state, lvt_deadline(local_apic_state));
      return zx::ok();
    case X2ApicMsr::DCR:
      next_rip(exit_info, vmcs);
      local_apic_state.lvt_divide_config = guest_state.eax();
      update_timer(local_apic_state, lvt_deadline(local_apic_state));
      return zx::ok();
    case X2ApicMsr::SELF_IPI: {
      next_rip(exit_info, vmcs);
      uint32_t vector = guest_state.eax() & UINT8_MAX;
      local_apic_state.interrupt_tracker.Interrupt(vector);
      return zx::ok();
    }
    case X2ApicMsr::ICR:
      return handle_ipi(exit_info, vmcs, guest_state, packet);
    default:
      // Issue a general protection fault for read-only and unimplemented
      // registers.
      dprintf(INFO, "hypervisor: Unhandled guest x2APIC WRMSR %#" PRIx32 "\n", guest_state.ecx());
      local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
      return zx::ok();
  }
}

zx::result<> handle_kvm_wrmsr(const ExitInfo& exit_info, AutoVmcs& vmcs,
                              const GuestState& guest_state, LocalApicState& local_apic_state,
                              PvClockState& pv_clock, hypervisor::GuestPhysicalAspace& gpa) {
  zx_paddr_t guest_paddr = guest_state.EdxEax();

  next_rip(exit_info, vmcs);
  switch (guest_state.ecx()) {
    case kKvmSystemTimeMsrOld:
    case kKvmSystemTimeMsr:
      vmcs.Invalidate();
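      // Per the KVM MSR documentation, bit 0 of the value written to the
      // system time MSR is the enable bit, and the remaining bits hold the
      // guest-physical address of the pv_clock structure.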
      if ((guest_paddr & 1) != 0) {
        return pv_clock_reset_clock(&pv_clock, &gpa, guest_paddr & ~static_cast<zx_paddr_t>(1));
      } else {
        pv_clock_stop_clock(&pv_clock);
      }
      return zx::ok();
    case kKvmBootTimeOld:
    case kKvmBootTime:
      vmcs.Invalidate();
      return pv_clock_update_boot_time(&gpa, guest_paddr);
    default:
      local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
      return zx::ok();
  }
}

zx::result<> handle_wrmsr(const ExitInfo& exit_info, AutoVmcs& vmcs, const GuestState& guest_state,
                          LocalApicState& local_apic_state, PvClockState& pv_clock,
                          hypervisor::GuestPhysicalAspace& gpa, zx_port_packet& packet) {
  // On execution of wrmsr, rcx specifies the MSR and edx:eax contains the value to be written.
  switch (guest_state.ecx()) {
    case X86_MSR_IA32_APIC_BASE:
      if ((guest_state.EdxEax() & ~IA32_APIC_BASE_BSP) != kLocalApicPhysBase) {
        return zx::error(ZX_ERR_INVALID_ARGS);
      }
      next_rip(exit_info, vmcs);
      return zx::ok();
    // See note in handle_rdmsr.
    case X86_MSR_IA32_MTRRCAP:
    case X86_MSR_IA32_MTRR_DEF_TYPE:
    case X86_MSR_IA32_MTRR_FIX64K_00000:
    case X86_MSR_IA32_MTRR_FIX16K_80000 ... X86_MSR_IA32_MTRR_FIX16K_A0000:
    case X86_MSR_IA32_MTRR_FIX4K_C0000 ... X86_MSR_IA32_MTRR_FIX4K_F8000:
    case X86_MSR_IA32_MTRR_PHYSBASE0 ... X86_MSR_IA32_MTRR_PHYSMASK9:
    case X86_MSR_IA32_BIOS_SIGN_ID:
    case X86_MSR_DRAM_POWER_LIMIT:
    case X86_MSR_PP0_POWER_LIMIT:
    case X86_MSR_PP1_POWER_LIMIT:
    case X86_MSR_PLATFORM_POWER_LIMIT:
    // We disable the associated CPUID bits, but Linux still writes to these
    // MSRs. Just ignore it.
    case X86_MSR_IA32_SPEC_CTRL:
    case X86_MSR_IA32_PRED_CMD:
    // From AMD64 Volume 2, Section 6.1.1: CSTAR is unused, but Linux likes to
    // set a null handler, even when not in compatibility mode. Just ignore it.
    case X86_MSR_IA32_CSTAR:
      next_rip(exit_info, vmcs);
      return zx::ok();
    case X86_MSR_IA32_TSC_DEADLINE: {
      if ((local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) != LVT_TIMER_MODE_TSC_DEADLINE) {
        return zx::error(ZX_ERR_INVALID_ARGS);
      }
      next_rip(exit_info, vmcs);
      int64_t tsc_deadline = static_cast<int64_t>(guest_state.EdxEax());
      zx_time_t mono_deadline = convert_raw_tsc_timestamp_to_clock_monotonic(tsc_deadline);
      update_timer(local_apic_state, mono_deadline);
      return zx::ok();
    }
    case kX2ApicMsrBase ... kX2ApicMsrMax:
      return handle_apic_wrmsr(exit_info, vmcs, guest_state, local_apic_state, packet);
    case kKvmSystemTimeMsrOld:
    case kKvmSystemTimeMsr:
    case kKvmBootTimeOld:
    case kKvmBootTime:
      return handle_kvm_wrmsr(exit_info, vmcs, guest_state, local_apic_state, pv_clock, gpa);
    default:
      dprintf(INFO, "hypervisor: Unhandled guest WRMSR %#lx\n", guest_state.rcx);
      local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
      return zx::ok();
  }
}

uint8_t default_operand_size(uint64_t efer, uint32_t cs_access_rights) {
  // See Volume 3, Section 5.2.1.
  if ((efer & X86_EFER_LMA) && (cs_access_rights & kGuestXxAccessRightsL)) {
    // IA-32e 64-bit mode.
    return 4;
  } else if (cs_access_rights & kGuestXxAccessRightsD) {
    // CS.D set (and not 64-bit mode).
    return 4;
  } else {
    // CS.D clear (and not 64-bit mode).
    return 2;
  }
}

zx::result<> handle_trap(const ExitInfo& exit_info, AutoVmcs& vmcs, bool read,
                         zx_vaddr_t guest_paddr, hypervisor::TrapMap& traps,
                         zx_port_packet_t& packet) {
  zx::result<hypervisor::Trap*> trap = traps.FindTrap(ZX_GUEST_TRAP_BELL, guest_paddr);
  if (trap.is_error()) {
    return trap.take_error();
  }
  next_rip(exit_info, vmcs);

  switch ((*trap)->kind()) {
    case ZX_GUEST_TRAP_BELL:
      if (read) {
        return zx::error(ZX_ERR_NOT_SUPPORTED);
      }
      packet.key = (*trap)->key();
      packet.type = ZX_PKT_TYPE_GUEST_BELL;
      packet.guest_bell.addr = guest_paddr;
      if (!(*trap)->HasPort()) {
        return zx::error(ZX_ERR_BAD_STATE);
      }
      return (*trap)->Queue(packet, &vmcs);
    case ZX_GUEST_TRAP_MEM:
      if (exit_info.exit_instruction_length > kMaxInstructionSize) {
        return zx::error(ZX_ERR_INTERNAL);
      }
      packet.key = (*trap)->key();
      packet.type = ZX_PKT_TYPE_GUEST_MEM;
      packet.guest_mem.addr = guest_paddr;
      packet.guest_mem.cr3 = vmcs.Read(VmcsFieldXX::GUEST_CR3);
      packet.guest_mem.rip = exit_info.guest_rip;
      packet.guest_mem.instruction_size = static_cast<uint8_t>(exit_info.exit_instruction_length);
      packet.guest_mem.default_operand_size = default_operand_size(
          vmcs.Read(VmcsField64::GUEST_IA32_EFER), vmcs.Read(VmcsField32::GUEST_CS_ACCESS_RIGHTS));
      return zx::error(ZX_ERR_NEXT);
    default:
      return zx::error(ZX_ERR_BAD_STATE);
  }
}

zx::result<> handle_ept_violation(const ExitInfo& exit_info, AutoVmcs& vmcs,
                                  hypervisor::GuestPhysicalAspace& gpa, hypervisor::TrapMap& traps,
                                  zx_port_packet_t& packet) {
  const EptViolationInfo ept_violation_info(vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION));
  zx_gpaddr_t guest_paddr = vmcs.Read(VmcsField64::GUEST_PHYSICAL_ADDRESS);
  auto result = handle_trap(exit_info, vmcs, ept_violation_info.read, guest_paddr, traps, packet);
  if (result.status_value() != ZX_ERR_NOT_FOUND) {
    return result;
  }

  // We may have to block when handling the page fault.
  vmcs.Invalidate();

  // If there was no trap associated with this address and it is outside of
  // guest physical address space, return failure.
  if (guest_paddr >= gpa.size()) {
    return zx::error(ZX_ERR_OUT_OF_RANGE);
  }

  result = gpa.PageFault(guest_paddr);
  if (result.is_error()) {
    dprintf(CRITICAL, "hypervisor: Unhandled EPT violation %#lx\n", guest_paddr);
  }
  return result;
}

zx::result<> handle_xsetbv(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state) {
  uint64_t guest_cr4 = vmcs.Read(VmcsFieldXX::GUEST_CR4);
  if (!(guest_cr4 & X86_CR4_OSXSAVE)) {
    return zx::error(ZX_ERR_INVALID_ARGS);
  }

  // We only support XCR0.
  if (guest_state.rcx != 0) {
    return zx::error(ZX_ERR_INVALID_ARGS);
  }

  cpuid_leaf leaf;
  if (!x86_get_cpuid_subleaf(X86_CPUID_XSAVE, 0, &leaf)) {
    return zx::error(ZX_ERR_INTERNAL);
  }

  // Check that XCR0 is valid.
  uint64_t xcr0_bitmap = (static_cast<uint64_t>(leaf.d) << 32) | leaf.a;
  uint64_t xcr0 = guest_state.EdxEax();
  if (~xcr0_bitmap & xcr0 ||
      // x87 state must be enabled.
      (xcr0 & X86_XSAVE_STATE_BIT_X87) != X86_XSAVE_STATE_BIT_X87 ||
      // If AVX state is enabled, SSE state must be enabled.
      (xcr0 & (X86_XSAVE_STATE_BIT_AVX | X86_XSAVE_STATE_BIT_SSE)) == X86_XSAVE_STATE_BIT_AVX) {
    return zx::error(ZX_ERR_INVALID_ARGS);
  }

  guest_state.xcr0 = xcr0;
  next_rip(exit_info, vmcs);
  return zx::ok();
}

void handle_pause(const ExitInfo& exit_info, AutoVmcs& vmcs) { next_rip(exit_info, vmcs); }

bool is_cpl0(AutoVmcs& vmcs) {
  const uint32_t access_rights = vmcs.Read(VmcsField32::GUEST_SS_ACCESS_RIGHTS);
  // We only accept a VMCALL if CPL is 0.
  return (access_rights & kGuestXxAccessRightsDplUser) == 0;
}

void handle_vmcall_regular(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state,
                           hypervisor::GuestPhysicalAspace& gpa) {
  next_rip(exit_info, vmcs);
  if (!is_cpl0(vmcs)) {
    guest_state.rax = VmCallStatus::NOT_PERMITTED;
    return;
  }
  vmcs.Invalidate();

  // We never fail in the case of hypercalls; we just return/propagate errors
  // to the caller.
  const VmCallInfo info(guest_state);
  switch (info.type) {
    case VmCallType::CLOCK_PAIRING: {
      if (info.arg[1] != 0) {
        dprintf(INFO, "hypervisor: CLOCK_PAIRING hypercall doesn't support clock type %lu\n",
                info.arg[1]);
        guest_state.rax = VmCallStatus::NOT_SUPPORTED;
        break;
      }
      if (auto result = pv_clock_populate_offset(&gpa, info.arg[0]); result.is_error()) {
        dprintf(INFO, "hypervisor: Failed to populate clock offset with error %d\n",
                result.status_value());
        guest_state.rax = VmCallStatus::FAULT;
        break;
      }
      guest_state.rax = VmCallStatus::OK;
      break;
    }
    default:
      dprintf(INFO,
              "hypervisor: Unknown hypercall %lu (arg0=%#lx, arg1=%#lx, arg2=%#lx, arg3=%#lx)\n",
              static_cast<uint64_t>(info.type), info.arg[0], info.arg[1], info.arg[2], info.arg[3]);
      guest_state.rax = VmCallStatus::UNKNOWN_HYPERCALL;
      break;
  }
}

zx::result<> handle_vmcall_direct(const ExitInfo& exit_info, AutoVmcs& vmcs,
                                  GuestState& guest_state, uintptr_t& fs_base,
                                  zx_port_packet_t& packet) {
  next_rip(exit_info, vmcs);
  if (!is_cpl0(vmcs)) {
    guest_state.rax = ZX_ERR_ACCESS_DENIED;
    return zx::ok();
  }
  vmcs.Invalidate();
  zx_status_t status = vmcall_dispatch(guest_state, fs_base, packet);
  return zx::make_result(status);
}

}  // namespace

ExitInfo::ExitInfo(const AutoVmcs& vmcs) {
  // From Volume 3, Section 26.7.
  uint32_t full_exit_reason = vmcs.Read(VmcsField32::EXIT_REASON);
  entry_failure = BIT(full_exit_reason, 31);
  exit_reason = static_cast<ExitReason>(BITS(full_exit_reason, 15, 0));

  exit_instruction_length = vmcs.Read(VmcsField32::EXIT_INSTRUCTION_LENGTH);
  guest_rip = vmcs.Read(VmcsFieldXX::GUEST_RIP);

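  // Skip the verbose trace logging below for the most frequent exits.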
  if (exit_reason == ExitReason::EXTERNAL_INTERRUPT || exit_reason == ExitReason::IO_INSTRUCTION) {
    return;
  }

  LTRACEF("entry failure: %d\n", entry_failure);
  LTRACEF("exit reason: %#x (%s)\n", static_cast<uint32_t>(exit_reason),
          exit_reason_name(exit_reason));
  LTRACEF("exit instruction length: %#x\n", exit_instruction_length);
  LTRACEF("guest activity state: %#x\n", vmcs.Read(VmcsField32::GUEST_ACTIVITY_STATE));
  LTRACEF("guest interruptibility state: %#x\n",
          vmcs.Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE));
  LTRACEF("guest linear address: %#lx\n", vmcs.Read(VmcsFieldXX::GUEST_LINEAR_ADDRESS));
  LTRACEF("guest rip: %#lx\n", guest_rip);
}

ExitInterruptionInfo::ExitInterruptionInfo(const AutoVmcs& vmcs) {
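  // From Volume 3 (VM-exit interruption information): bits 7:0 hold the
  // vector, bits 10:8 the interruption type, bit 11 the error-code-valid
  // flag, and bit 31 the valid flag.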
  uint32_t int_info = vmcs.Read(VmcsField32::EXIT_INTERRUPTION_INFORMATION);
  vector = static_cast<uint8_t>(BITS(int_info, 7, 0));
  interruption_type = static_cast<InterruptionType>(BITS_SHIFT(int_info, 10, 8));
  error_code_valid = BIT(int_info, 11);
  valid = BIT(int_info, 31);
}

PageFaultInfo::PageFaultInfo(uint32_t error_code) {
  // From Volume 3A, Figure 4-12.
  flags = 0;
  flags |= (error_code & PFEX_W) ? VMM_PF_FLAG_WRITE : 0;
  flags |= (error_code & PFEX_U) ? VMM_PF_FLAG_USER : 0;
  flags |= (error_code & PFEX_I) ? VMM_PF_FLAG_INSTRUCTION : 0;
  flags |= (error_code & PFEX_P) ? 0 : VMM_PF_FLAG_NOT_PRESENT;
}

EptViolationInfo::EptViolationInfo(uint64_t qualification) {
  // From Volume 3C, Table 27-7.
  read = BIT(qualification, 0);
  write = BIT(qualification, 1);
  instruction = BIT(qualification, 2);
}

CrAccessInfo::CrAccessInfo(uint64_t qualification) {
  // From Volume 3, Table 27-3.
  cr_number = static_cast<uint8_t>(BITS(qualification, 3, 0));
  access_type = static_cast<CrAccessType>(BITS_SHIFT(qualification, 5, 4));
  reg = static_cast<uint8_t>(BITS_SHIFT(qualification, 11, 8));
}

IoInfo::IoInfo(uint64_t qualification) {
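  // From Volume 3, Table 27-5 (exit qualification for I/O instructions).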
  access_size = static_cast<uint8_t>(BITS(qualification, 2, 0) + 1);
  input = BIT_SHIFT(qualification, 3);
  string = BIT_SHIFT(qualification, 4);
  repeat = BIT_SHIFT(qualification, 5);
  port = static_cast<uint16_t>(BITS_SHIFT(qualification, 31, 16));
}

InterruptCommandRegister::InterruptCommandRegister(uint32_t hi, uint32_t lo) {
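  // From Volume 3, Section 10.12.9: in x2APIC mode the ICR is a single 64-bit
  // MSR, with the destination ID in the high 32 bits.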
  destination = hi;
  destination_mode = static_cast<InterruptDestinationMode>(BIT_SHIFT(lo, 11));
  delivery_mode = static_cast<InterruptDeliveryMode>(BITS_SHIFT(lo, 10, 8));
  destination_shorthand = static_cast<InterruptDestinationShorthand>(BITS_SHIFT(lo, 19, 18));
  vector = static_cast<uint8_t>(BITS(lo, 7, 0));
}

VmCallInfo::VmCallInfo(const GuestState& guest_state) {
  // The ABI is documented in the Linux kernel documentation; see
  // Documentation/virtual/kvm/hypercalls.txt.
  type = static_cast<VmCallType>(guest_state.rax);
  arg[0] = guest_state.rbx;
  arg[1] = guest_state.rcx;
  arg[2] = guest_state.rdx;
  arg[3] = guest_state.rsi;
}

zx::result<> vmexit_handler_normal(AutoVmcs& vmcs, GuestState& guest_state,
                                   LocalApicState& local_apic_state, PvClockState& pv_clock,
                                   hypervisor::GuestPhysicalAspace& gpa, hypervisor::TrapMap& traps,
                                   zx_port_packet_t& packet) {
  zx::result<> result = zx::ok();
  const ExitInfo exit_info(vmcs);
  switch (exit_info.exit_reason) {
    case ExitReason::EXTERNAL_INTERRUPT:
      ktrace_vcpu_exit(VCPU_EXTERNAL_INTERRUPT, exit_info.guest_rip);
      GUEST_STATS_INC(interrupts);
      handle_external_interrupt(vmcs);
      break;
    case ExitReason::INTERRUPT_WINDOW:
      ktrace_vcpu_exit(VCPU_INTERRUPT_WINDOW, exit_info.guest_rip);
      GUEST_STATS_INC(interrupt_windows);
      handle_interrupt_window(vmcs);
      break;
    case ExitReason::CPUID:
      ktrace_vcpu_exit(VCPU_CPUID, exit_info.guest_rip);
      GUEST_STATS_INC(cpuid_instructions);
      result = handle_cpuid(exit_info, vmcs, guest_state);
      break;
    case ExitReason::HLT:
      ktrace_vcpu_exit(VCPU_HLT, exit_info.guest_rip);
      GUEST_STATS_INC(hlt_instructions);
      result = handle_hlt(exit_info, vmcs, local_apic_state);
      break;
    case ExitReason::CONTROL_REGISTER_ACCESS:
      ktrace_vcpu_exit(VCPU_CONTROL_REGISTER_ACCESS, exit_info.guest_rip);
      GUEST_STATS_INC(control_register_accesses);
      result = handle_control_register_access(exit_info, vmcs, guest_state, local_apic_state);
      break;
    case ExitReason::IO_INSTRUCTION:
      ktrace_vcpu_exit(VCPU_IO_INSTRUCTION, exit_info.guest_rip);
      GUEST_STATS_INC(io_instructions);
      result = handle_io_instruction(exit_info, vmcs, guest_state, traps, packet);
      break;
    case ExitReason::RDMSR:
      ktrace_vcpu_exit(VCPU_RDMSR, exit_info.guest_rip);
      GUEST_STATS_INC(rdmsr_instructions);
      handle_rdmsr(exit_info, vmcs, guest_state, local_apic_state);
      break;
    case ExitReason::WRMSR:
      ktrace_vcpu_exit(VCPU_WRMSR, exit_info.guest_rip);
      GUEST_STATS_INC(wrmsr_instructions);
      result = handle_wrmsr(exit_info, vmcs, guest_state, local_apic_state, pv_clock, gpa, packet);
      break;
    case ExitReason::ENTRY_FAILURE_GUEST_STATE:
    case ExitReason::ENTRY_FAILURE_MSR_LOADING:
    case ExitReason::ENTRY_FAILURE_MACHINE_CHECK:
      ktrace_vcpu_exit(VCPU_VM_ENTRY_FAILURE, exit_info.guest_rip);
      result = zx::error(ZX_ERR_BAD_STATE);
      break;
    case ExitReason::EPT_VIOLATION:
      ktrace_vcpu_exit(VCPU_EPT_VIOLATION, exit_info.guest_rip);
      GUEST_STATS_INC(ept_violations);
      result = handle_ept_violation(exit_info, vmcs, gpa, traps, packet);
      break;
    case ExitReason::XSETBV:
      ktrace_vcpu_exit(VCPU_XSETBV, exit_info.guest_rip);
      GUEST_STATS_INC(xsetbv_instructions);
      result = handle_xsetbv(exit_info, vmcs, guest_state);
      break;
    case ExitReason::PAUSE:
      ktrace_vcpu_exit(VCPU_PAUSE, exit_info.guest_rip);
      GUEST_STATS_INC(pause_instructions);
      handle_pause(exit_info, vmcs);
      break;
    case ExitReason::VMCALL:
      ktrace_vcpu_exit(VCPU_VMCALL, exit_info.guest_rip);
      GUEST_STATS_INC(vmcall_instructions);
      handle_vmcall_regular(exit_info, vmcs, guest_state, gpa);
      break;
    case ExitReason::EXCEPTION_OR_NMI:
    // Currently all exceptions, except NMIs, are delivered directly to guests.
    // NMIs cause VM exits and are handled by the host via the IDT as any other
    // interrupt/exception.
    default:
      ktrace_vcpu_exit(VCPU_NOT_SUPPORTED, exit_info.guest_rip);
      result = zx::error(ZX_ERR_NOT_SUPPORTED);
      break;
  }
  switch (result.status_value()) {
    case ZX_OK:
    case ZX_ERR_NEXT:
    case ZX_ERR_INTERNAL_INTR_RETRY:
    case ZX_ERR_INTERNAL_INTR_KILLED:
      break;
    default:
      dprintf(CRITICAL, "hypervisor: VM exit handler (regular) for %s (%u) returned %d\n",
              exit_reason_name(exit_info.exit_reason), static_cast<uint32_t>(exit_info.exit_reason),
              result.status_value());
      dump_guest_state(guest_state, exit_info);
      break;
  }
  return result;
}

zx::result<> vmexit_handler_direct(AutoVmcs& vmcs, GuestState& guest_state, uintptr_t& fs_base,
                                   zx_port_packet_t& packet) {
  zx::result<> result = zx::ok();
  const ExitInfo exit_info(vmcs);
  switch (exit_info.exit_reason) {
    case ExitReason::EXCEPTION_OR_NMI:
      ktrace_vcpu_exit(VCPU_EXCEPTION_OR_NMI, exit_info.guest_rip);
      result = handle_exception_or_nmi(vmcs);
      break;
    case ExitReason::EXTERNAL_INTERRUPT:
      ktrace_vcpu_exit(VCPU_EXTERNAL_INTERRUPT, exit_info.guest_rip);
      GUEST_STATS_INC(interrupts);
      handle_external_interrupt(vmcs);
      break;
    case ExitReason::CPUID:
      ktrace_vcpu_exit(VCPU_CPUID, exit_info.guest_rip);
      GUEST_STATS_INC(cpuid_instructions);
      result = handle_cpuid(exit_info, vmcs, guest_state);
      break;
    case ExitReason::VMCALL:
      ktrace_vcpu_exit(VCPU_VMCALL, exit_info.guest_rip);
      GUEST_STATS_INC(vmcall_instructions);
      result = handle_vmcall_direct(exit_info, vmcs, guest_state, fs_base, packet);
      break;
    case ExitReason::ENTRY_FAILURE_GUEST_STATE:
    case ExitReason::ENTRY_FAILURE_MSR_LOADING:
    case ExitReason::ENTRY_FAILURE_MACHINE_CHECK:
      ktrace_vcpu_exit(VCPU_VM_ENTRY_FAILURE, exit_info.guest_rip);
      result = zx::error(ZX_ERR_BAD_STATE);
      break;
    default:
      ktrace_vcpu_exit(VCPU_NOT_SUPPORTED, exit_info.guest_rip);
      result = zx::error(ZX_ERR_NOT_SUPPORTED);
      break;
  }
  switch (result.status_value()) {
    case ZX_OK:
    case ZX_ERR_NEXT:
    case ZX_ERR_INTERNAL_INTR_RETRY:
    case ZX_ERR_INTERNAL_INTR_KILLED:
      break;
    default:
      dprintf(CRITICAL,
              "hypervisor: VM exit handler (direct) for %s (%u) returned %d on thread %s\n",
              exit_reason_name(exit_info.exit_reason), static_cast<uint32_t>(exit_info.exit_reason),
              result.status_value(), Thread::Current::Get()->name());
      dump_guest_state(guest_state, exit_info);
      break;
  }
  return result;
}