// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <bits.h>
#include <inttypes.h>
#include <lib/ktrace.h>
#include <platform.h>
#include <string.h>
#include <trace.h>
#include <zircon/syscalls/hypervisor.h>
#include <zircon/time.h>
#include <zircon/types.h>
#include <arch/hypervisor.h>
#include <arch/x86/apic.h>
#include <arch/x86/feature.h>
#include <arch/x86/hypervisor/invalidate.h>
#include <arch/x86/interrupts.h>
#include <arch/x86/mmu.h>
#include <arch/x86/pv.h>
#include <hypervisor/interrupt_tracker.h>
#include <hypervisor/ktrace.h>
#include <kernel/percpu.h>
#include <kernel/stats.h>
#include <ktl/algorithm.h>
#include <platform/pc/timer.h>
#include <vm/fault.h>
#include <vm/physmap.h>
#include <vm/pmm.h>
#include "pv_priv.h"
#include "vcpu_priv.h"
#include "vmcall_priv.h"
#include "vmexit_priv.h"
#include <ktl/enforce.h>
#define LOCAL_TRACE 0
extern "C" void x86_call_external_interrupt_handler(uint64_t vector);
namespace {
constexpr uint64_t kLocalApicPhysBase =
APIC_PHYS_BASE | IA32_APIC_BASE_XAPIC_ENABLE | IA32_APIC_BASE_X2APIC_ENABLE;
constexpr uint64_t kX2ApicMsrBase = 0x800;
constexpr uint64_t kX2ApicMsrMax = 0x83f;
constexpr uint64_t kMiscEnableFastStrings = 1u << 0;
constexpr uint32_t kFirstExtendedStateComponent = 2;
constexpr uint32_t kLastExtendedStateComponent = 9;
// From Volume 1, Section 13.4.
constexpr uint32_t kXsaveLegacyRegionSize = 512;
constexpr uint32_t kXsaveHeaderSize = 64;
// NOTE: x86 instructions are guaranteed to be 15 bytes or fewer.
constexpr uint8_t kMaxInstructionSize = 15;
alignas(uint32_t) constexpr char kHypVendorId[] = "KVMKVMKVM\0\0\0";
static_assert(sizeof(kHypVendorId) - 1 == 12, "Vendor ID must be 12 characters long");
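// KVM_FEATURE_NOP_IO_DELAY (bit 1): the guest does not need to add delays to port I/O.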
constexpr uint64_t kKvmFeatureNoIoDelay = 1u << 1;
void dump_guest_state(const GuestState& guest_state, const ExitInfo& exit_info) {
dprintf(INFO, " RAX: %#18lx RCX: %#18lx RDX: %#18lx RBX: %#18lx\n", guest_state.rax,
guest_state.rcx, guest_state.rdx, guest_state.rbx);
dprintf(INFO, " RSP: xxxxxxxx xxxxxxxx RBP: %#18lx RSI: %#18lx RDI: %#18lx\n",
guest_state.rbp, guest_state.rsi, guest_state.rdi);
dprintf(INFO, " R8: %#18lx R9: %#18lx R10: %#18lx R11: %#18lx\n", guest_state.r8,
guest_state.r9, guest_state.r10, guest_state.r11);
dprintf(INFO, " R12: %#18lx R13: %#18lx R14: %#18lx R15: %#18lx\n", guest_state.r12,
guest_state.r13, guest_state.r14, guest_state.r15);
dprintf(INFO, " RIP: %#18lx CR2: %#18lx XCR0: %#18lx\n", exit_info.guest_rip, guest_state.cr2,
guest_state.xcr0);
dprintf(INFO, "entry failure: %d\n", exit_info.entry_failure);
dprintf(INFO, "exit instruction length: %u\n", exit_info.exit_instruction_length);
}
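// Advances the guest RIP past the exiting instruction and clears any
// single-instruction interrupt blocking (STI / MOV SS) left behind by it.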
void next_rip(const ExitInfo& exit_info, AutoVmcs& vmcs) {
vmcs.Write(VmcsFieldXX::GUEST_RIP, exit_info.guest_rip + exit_info.exit_instruction_length);
// Clear any flags blocking interrupt injection for a single instruction.
uint32_t guest_interruptibility = vmcs.Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE);
uint32_t new_interruptibility =
guest_interruptibility & ~(kInterruptibilityStiBlocking | kInterruptibilityMovSsBlocking);
if (new_interruptibility != guest_interruptibility) {
vmcs.Write(VmcsField32::GUEST_INTERRUPTIBILITY_STATE, new_interruptibility);
}
}
zx::result<> handle_exception_or_nmi(AutoVmcs& vmcs) {
const ExitInterruptionInfo int_info(vmcs);
DEBUG_ASSERT(int_info.valid);
// Only handle page faults; everything else should terminate the VCPU.
if (int_info.interruption_type != InterruptionType::HARDWARE_EXCEPTION ||
int_info.vector != X86_INT_PAGE_FAULT) {
return zx::error(ZX_ERR_BAD_STATE);
}
auto thread = Thread::Current::Get();
// Page fault resume should not end up here.
if (thread->arch().page_fault_resume != 0) {
return zx::error(ZX_ERR_INTERNAL);
}
const zx_vaddr_t guest_vaddr = vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION);
DEBUG_ASSERT(int_info.error_code_valid);
const PageFaultInfo pf_info(vmcs.Read(VmcsField32::EXIT_INTERRUPTION_ERROR_CODE));
// We may have to block when handling the page fault.
vmcs.Invalidate();
zx_status_t status = vmm_page_fault_handler(guest_vaddr, pf_info.flags);
return zx::make_result(status);
}
void handle_external_interrupt(AutoVmcs& vmcs) {
const ExitInterruptionInfo int_info(vmcs);
DEBUG_ASSERT(int_info.valid);
DEBUG_ASSERT(int_info.interruption_type == InterruptionType::EXTERNAL_INTERRUPT);
vmcs.Invalidate();
x86_call_external_interrupt_handler(int_info.vector);
}
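// An interrupt-window exit means the guest is now able to accept pending
// interrupts, so stop requesting interrupt-window exiting.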
void handle_interrupt_window(AutoVmcs& vmcs) { vmcs.InterruptWindowExiting(false); }
// From Volume 2, Section 3.2, Table 3-8 "Processor Extended State Enumeration
// Main Leaf (EAX = 0DH, ECX = 0)".
//
// Bits 31-00: Maximum size (bytes, from the beginning of the XSAVE/XRSTOR save
// area) required by enabled features in XCR0. May be different than ECX if some
// features at the end of the XSAVE save area are not enabled.
zx::result<uint32_t> compute_xsave_size(uint64_t guest_xcr0) {
uint32_t xsave_size = kXsaveLegacyRegionSize + kXsaveHeaderSize;
for (uint32_t i = kFirstExtendedStateComponent; i <= kLastExtendedStateComponent; ++i) {
cpuid_leaf leaf;
if (!(guest_xcr0 & (1 << i))) {
continue;
}
if (!x86_get_cpuid_subleaf(X86_CPUID_XSAVE, i, &leaf)) {
return zx::error(ZX_ERR_INTERNAL);
}
if (leaf.a == 0 && leaf.b == 0 && leaf.c == 0 && leaf.d == 0) {
continue;
}
const uint32_t component_offset = leaf.b;
const uint32_t component_size = leaf.a;
xsave_size = component_offset + component_size;
}
return zx::ok(xsave_size);
}
zx::result<> handle_cpuid(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state) {
const uint32_t leaf = guest_state.eax();
const uint32_t subleaf = guest_state.ecx();
next_rip(exit_info, vmcs);
switch (leaf) {
case X86_CPUID_BASE:
case X86_CPUID_EXT_BASE:
cpuid(leaf, reinterpret_cast<uint32_t*>(&guest_state.rax),
reinterpret_cast<uint32_t*>(&guest_state.rbx),
reinterpret_cast<uint32_t*>(&guest_state.rcx),
reinterpret_cast<uint32_t*>(&guest_state.rdx));
return zx::ok();
case X86_CPUID_BASE + 1 ... MAX_SUPPORTED_CPUID:
case X86_CPUID_EXT_BASE + 1 ... MAX_SUPPORTED_CPUID_EXT:
cpuid_c(leaf, subleaf, reinterpret_cast<uint32_t*>(&guest_state.rax),
reinterpret_cast<uint32_t*>(&guest_state.rbx),
reinterpret_cast<uint32_t*>(&guest_state.rcx),
reinterpret_cast<uint32_t*>(&guest_state.rdx));
switch (leaf) {
case X86_CPUID_MODEL_FEATURES:
// Override the initial local APIC ID. From Vol 2, Table 3-8.
guest_state.rbx &= ~(0xff << 24);
guest_state.rbx |= (vmcs.Read(VmcsField16::VPID) - 1) << 24;
// Enable the hypervisor bit.
guest_state.rcx |= 1u << X86_FEATURE_HYPERVISOR.bit;
// Enable the x2APIC bit.
guest_state.rcx |= 1u << X86_FEATURE_X2APIC.bit;
// Always enable TSC deadline (this doesn't depend on the host feature).
guest_state.rcx |= 1u << X86_FEATURE_TSC_DEADLINE.bit;
// Disable the VMX bit.
guest_state.rcx &= ~(1u << X86_FEATURE_VMX.bit);
// Disable the PDCM bit.
guest_state.rcx &= ~(1u << X86_FEATURE_PDCM.bit);
// Disable MONITOR/MWAIT.
guest_state.rcx &= ~(1u << X86_FEATURE_MON.bit);
// Disable THERM_INTERRUPT and THERM_STATUS MSRs
guest_state.rcx &= ~(1u << X86_FEATURE_TM2.bit);
// Enable the SEP (SYSENTER support).
guest_state.rdx |= 1u << X86_FEATURE_SEP.bit;
// Disable the Thermal Monitor bit.
guest_state.rdx &= ~(1u << X86_FEATURE_TM.bit);
// Disable the THERM_CONTROL_MSR bit.
guest_state.rdx &= ~(1u << X86_FEATURE_ACPI.bit);
break;
case X86_CPUID_TOPOLOGY:
guest_state.rax = 0;
guest_state.rbx = 0;
guest_state.rcx = 0;
guest_state.rdx = vmcs.Read(VmcsField16::VPID) - 1;
break;
case X86_CPUID_XSAVE:
if (subleaf == 0) {
auto xsave_size = compute_xsave_size(guest_state.xcr0);
if (xsave_size.is_error()) {
return xsave_size.take_error();
}
guest_state.rbx = *xsave_size;
} else if (subleaf == 1) {
guest_state.rax &= ~(1u << 3);
}
break;
case X86_CPUID_THERMAL_AND_POWER:
// Disable the performance energy bias bit.
guest_state.rcx &= ~(1u << X86_FEATURE_PERF_BIAS.bit);
// Disable the hardware coordination feedback bit.
guest_state.rcx &= ~(1u << X86_FEATURE_HW_FEEDBACK.bit);
guest_state.rax &= ~(
// Disable Digital Thermal Sensor
1u << X86_FEATURE_DTS.bit |
// Disable Package Thermal Status MSR.
1u << X86_FEATURE_PTM.bit |
// Disable THERM_STATUS MSR bits 10/11 & THERM_INTERRUPT MSR bit 24
1u << X86_FEATURE_PLN.bit |
// Disable HWP MSRs.
1u << X86_FEATURE_HWP.bit | 1u << X86_FEATURE_HWP_NOT.bit |
1u << X86_FEATURE_HWP_ACT.bit | 1u << X86_FEATURE_HWP_PREF.bit |
// Don't advertise Turbo.
1u << X86_FEATURE_TURBO.bit | 1u << X86_FEATURE_TURBO_MAX.bit);
break;
case X86_CPUID_PERFORMANCE_MONITORING: {
// Disable all performance monitoring.
// 31-07 = Reserved 0, 06-00 = 1 if event is not available.
const uint32_t performance_monitoring_no_events = 0b1111111;
guest_state.rax = 0;
guest_state.rbx = performance_monitoring_no_events;
guest_state.rcx = 0;
guest_state.rdx = 0;
break;
}
case X86_CPUID_MON:
// MONITOR/MWAIT are not implemented.
guest_state.rax = 0;
guest_state.rbx = 0;
guest_state.rcx = 0;
guest_state.rdx = 0;
break;
case X86_CPUID_EXTENDED_FEATURE_FLAGS:
// When running under KVM in nVMX mode, it's possible that host CPUID
// indicates INVPCID is supported, but VMX doesn't allow enabling the
// INVPCID bit in the secondary processor-based controls. Therefore
// explicitly clear the INVPCID bit in CPUID if the VMX flag wasn't set.
if ((vmcs.Read(VmcsField32::PROCBASED_CTLS2) & kProcbasedCtls2Invpcid) == 0) {
guest_state.rbx &= ~(1u << X86_FEATURE_INVPCID.bit);
}
// Disable:
// * Processor Trace bit
// * TSC Adjust bit
guest_state.rbx &= ~(1u << X86_FEATURE_PT.bit | 1u << X86_FEATURE_TSC_ADJUST.bit);
// Disable:
// * Indirect Branch Prediction Barrier bit
// * Single Thread Indirect Branch Predictors bit
// * Speculative Store Bypass Disable bit
// These imply support for the IA32_SPEC_CTRL and IA32_PRED_CMD
// MSRs, which are not implemented.
guest_state.rdx &= ~(1u << X86_FEATURE_IBRS_IBPB.bit | 1u << X86_FEATURE_STIBP.bit |
1u << X86_FEATURE_SSBD.bit);
// Disable support for the IA32_ARCH_CAPABILITIES MSR.
guest_state.rdx &= ~(1u << X86_FEATURE_ARCH_CAPABILITIES.bit);
// Disable support for the IA32_FLUSH_CMD MSR.
guest_state.rdx &= ~(1u << X86_FEATURE_L1D_FLUSH.bit);
// TODO(https://fxbug.dev/42060002): Enable AVX-512 if supported.
//
// Disabling this to work around invalid opcode errors trying to execute these
// instructions.
guest_state.rbx &= ~(1u << X86_FEATURE_AVX512F.bit | 1u << X86_FEATURE_AVX512DQ.bit |
1u << X86_FEATURE_AVX512IFMA.bit | 1u << X86_FEATURE_AVX512PF.bit |
1u << X86_FEATURE_AVX512ER.bit | 1u << X86_FEATURE_AVX512CD.bit |
1u << X86_FEATURE_AVX512BW.bit | 1u << X86_FEATURE_AVX512VL.bit);
guest_state.rcx &=
~(1u << X86_FEATURE_AVX512VBMI.bit | 1u << X86_FEATURE_AVX512VBMI2.bit |
1u << X86_FEATURE_AVX512VNNI.bit | 1u << X86_FEATURE_AVX512BITALG.bit |
1u << X86_FEATURE_AVX512VPDQ.bit);
guest_state.rdx &=
~(1u << X86_FEATURE_AVX512QVNNIW.bit | 1u << X86_FEATURE_AVX512QFMA.bit);
break;
}
return zx::ok();
case X86_CPUID_HYP_VENDOR: {
// This leaf is commonly used to identify a hypervisor via ebx:ecx:edx.
auto regs = reinterpret_cast<const uint32_t*>(kHypVendorId);
// Since the Zircon hypervisor disguises itself as KVM, it needs to return
// in EAX the maximum CPUID function supported by the hypervisor. Zero in
// EAX should be interpreted as 0x40000001. Details are available in the
// Linux kernel documentation (Documentation/virtual/kvm/cpuid.txt).
guest_state.rax = X86_CPUID_KVM_FEATURES;
guest_state.rbx = regs[0];
guest_state.rcx = regs[1];
guest_state.rdx = regs[2];
return zx::ok();
}
case X86_CPUID_KVM_FEATURES:
// We support KVM clock.
guest_state.rax = kKvmFeatureClockSourceOld | kKvmFeatureClockSource | kKvmFeatureNoIoDelay;
guest_state.rbx = 0;
guest_state.rcx = 0;
guest_state.rdx = 0;
return zx::ok();
// From Volume 2A, CPUID instruction reference. If the EAX value is outside
// the range recognized by CPUID then the information for the highest
// supported base information leaf is returned. Any value in ECX is
// honored.
default:
cpuid_c(MAX_SUPPORTED_CPUID, subleaf, reinterpret_cast<uint32_t*>(&guest_state.rax),
reinterpret_cast<uint32_t*>(&guest_state.rbx),
reinterpret_cast<uint32_t*>(&guest_state.rcx),
reinterpret_cast<uint32_t*>(&guest_state.rdx));
return zx::ok();
}
}
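// HLT blocks the VCPU until an interrupt arrives for it.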
zx::result<> handle_hlt(const ExitInfo& exit_info, AutoVmcs& vmcs,
LocalApicState& local_apic_state) {
next_rip(exit_info, vmcs);
return local_apic_state.interrupt_tracker.Wait(ZX_TIME_INFINITE, &vmcs);
}
zx::result<> handle_cr0_write(AutoVmcs& vmcs, uint64_t val, LocalApicState& local_apic_state) {
// X86_CR0_NE is masked so that guests may write to it, but depending on
// IA32_VMX_CR0_FIXED0 it might be unsupported in VMX operation to set it to
// zero. Allow the guest to control its value in CR0_READ_SHADOW but not in
// GUEST_CR0 so that GUEST_CR0 stays valid.
uint64_t cr0 = val | X86_CR0_NE;
if (cr0_is_invalid(vmcs, cr0)) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
// From Volume 3, Table 11-5: CD=0 and NW=1 is an invalid setting and should
// generate a GP fault.
if (!(val & X86_CR0_CD) && (val & X86_CR0_NW)) {
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
return zx::ok();
}
// If CR0.PG is being changed, then invalidate the VPID.
uint64_t cr0_changed = val ^ vmcs.Read(VmcsFieldXX::GUEST_CR0);
if (cr0_changed & X86_CR0_PG) {
uint16_t vpid = vmcs.Read(VmcsField16::VPID);
invvpid(InvVpid::SINGLE_CONTEXT, vpid, 0);
}
// From Volume 3, Section 26.3.2.1: CR0 is loaded from the CR0 field with the
// exception of the following bits, which are never modified on VM entry: ET
// (bit 4); reserved bits ...; NW (bit 29) and CD (bit 30). The values of
// these bits in the CR0 field are ignored.
//
// Even though these bits will be ignored on VM entry, to ensure that
// GUEST_CR0 matches the actual value of CR0 while the guest is running,
// set those bits to match the host values. This is done only to make
// debugging simpler.
cr0 &= ~(X86_CR0_NW | X86_CR0_CD);
cr0 |= X86_CR0_ET;
vmcs.Write(VmcsFieldXX::GUEST_CR0, cr0);
// From Volume 3, Section 25.3: For each position corresponding to a bit clear
// in the CR0 guest/host mask, the destination operand is loaded with the
// value of the corresponding bit in CR0. For each position corresponding to a
// bit set in the CR0 guest/host mask, the destination operand is loaded with
// the value of the corresponding bit in the CR0 read shadow.
//
// Allow the guest to control the shadow.
vmcs.Write(VmcsFieldXX::CR0_READ_SHADOW, val);
// From Volume 3, Section 26.3.1.1: If CR0.PG and EFER.LME are set then
// EFER.LMA and the IA-32e mode guest entry control must also be set.
uint64_t efer = vmcs.Read(VmcsField64::GUEST_IA32_EFER);
if (!(efer & X86_EFER_LME && cr0 & X86_CR0_PG)) {
return zx::ok();
}
vmcs.Write(VmcsField64::GUEST_IA32_EFER, efer | X86_EFER_LMA);
return vmcs.SetControl(VmcsField32::ENTRY_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS),
read_msr(X86_MSR_IA32_VMX_ENTRY_CTLS), kEntryCtls64bitMode, 0);
}
zx::result<uint64_t> register_value(AutoVmcs& vmcs, const GuestState& guest_state,
uint8_t register_id) {
switch (register_id) {
// From Intel Volume 3, Table 27-3.
case 0:
return zx::ok(guest_state.rax);
case 1:
return zx::ok(guest_state.rcx);
case 2:
return zx::ok(guest_state.rdx);
case 3:
return zx::ok(guest_state.rbx);
case 4:
return zx::ok(vmcs.Read(VmcsFieldXX::GUEST_RSP));
case 5:
return zx::ok(guest_state.rbp);
case 6:
return zx::ok(guest_state.rsi);
case 7:
return zx::ok(guest_state.rdi);
case 8:
return zx::ok(guest_state.r8);
case 9:
return zx::ok(guest_state.r9);
case 10:
return zx::ok(guest_state.r10);
case 11:
return zx::ok(guest_state.r11);
case 12:
return zx::ok(guest_state.r12);
case 13:
return zx::ok(guest_state.r13);
case 14:
return zx::ok(guest_state.r14);
case 15:
return zx::ok(guest_state.r15);
default:
return zx::error(ZX_ERR_INVALID_ARGS);
}
}
zx::result<> handle_control_register_access(const ExitInfo& exit_info, AutoVmcs& vmcs,
const GuestState& guest_state,
LocalApicState& local_apic_state) {
const CrAccessInfo cr_access_info(vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION));
switch (cr_access_info.access_type) {
case CrAccessType::MOV_TO_CR: {
// Handle CR0 only.
if (cr_access_info.cr_number != 0) {
return zx::error(ZX_ERR_NOT_SUPPORTED);
}
auto val = register_value(vmcs, guest_state, cr_access_info.reg);
if (val.is_error()) {
return val.take_error();
}
auto result = handle_cr0_write(vmcs, *val, local_apic_state);
if (result.is_error()) {
return result.take_error();
}
next_rip(exit_info, vmcs);
return zx::ok();
}
default:
return zx::error(ZX_ERR_NOT_SUPPORTED);
}
}
zx::result<> handle_io_instruction(const ExitInfo& exit_info, AutoVmcs& vmcs,
GuestState& guest_state, hypervisor::TrapMap& traps,
zx_port_packet_t& packet) {
const IoInfo io_info(vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION));
if (io_info.string || io_info.repeat) {
dprintf(INFO, "hypervisor: Unsupported guest IO instruction\n");
return zx::error(ZX_ERR_NOT_SUPPORTED);
}
zx::result<hypervisor::Trap*> trap = traps.FindTrap(ZX_GUEST_TRAP_IO, io_info.port);
if (trap.is_error()) {
dprintf(INFO, "hypervisor: Unhandled guest IO port %s %#x\n", io_info.input ? "read" : "write",
io_info.port);
return trap.take_error();
}
next_rip(exit_info, vmcs);
memset(&packet, 0, sizeof(packet));
packet.key = (*trap)->key();
packet.type = ZX_PKT_TYPE_GUEST_IO;
packet.guest_io.port = io_info.port;
packet.guest_io.access_size = io_info.access_size;
packet.guest_io.input = io_info.input;
if (io_info.input) {
// From Volume 1, Section 3.4.1.1: 32-bit operands generate a 32-bit
// result, zero-extended to a 64-bit result in the destination general-
// purpose register.
if (io_info.access_size == 4) {
guest_state.rax = 0;
}
} else {
memcpy(packet.guest_io.data, &guest_state.rax, io_info.access_size);
if ((*trap)->HasPort()) {
return (*trap)->Queue(packet, &vmcs);
}
// If there was no port for the range, then return to user-space.
}
return zx::error(ZX_ERR_NEXT);
}
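// Emulates RDMSR of the x2APIC MSRs (0x800 through 0x83f).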
void handle_apic_rdmsr(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state,
LocalApicState& local_apic_state) {
switch (static_cast<X2ApicMsr>(guest_state.ecx())) {
case X2ApicMsr::ID:
next_rip(exit_info, vmcs);
guest_state.rax = vmcs.Read(VmcsField16::VPID) - 1;
break;
case X2ApicMsr::VERSION: {
next_rip(exit_info, vmcs);
// We choose 15H as it causes us to be seen as a modern APIC by Linux,
// and is the highest non-reserved value. See Volume 3 Section 10.4.8.
const uint32_t version = 0x15;
const uint32_t max_lvt_entry = 0x6; // LVT entries minus 1.
const uint32_t eoi_suppression = 0; // Disable support for EOI-broadcast suppression.
guest_state.rax = version | (max_lvt_entry << 16) | (eoi_suppression << 24);
break;
}
case X2ApicMsr::SVR:
// Spurious interrupt vector resets to 0xff. See Volume 3 Section 10.12.5.1.
next_rip(exit_info, vmcs);
guest_state.rax = 0xff;
break;
case X2ApicMsr::TPR:
case X2ApicMsr::LDR:
case X2ApicMsr::ISR_31_0... X2ApicMsr::ISR_255_224:
case X2ApicMsr::TMR_31_0... X2ApicMsr::TMR_255_224:
case X2ApicMsr::IRR_31_0... X2ApicMsr::IRR_255_224:
case X2ApicMsr::ESR:
case X2ApicMsr::LVT_MONITOR:
// These registers reset to 0. See Volume 3 Section 10.12.5.1.
next_rip(exit_info, vmcs);
guest_state.rax = 0;
break;
case X2ApicMsr::LVT_LINT0:
case X2ApicMsr::LVT_LINT1:
case X2ApicMsr::LVT_THERMAL_SENSOR:
case X2ApicMsr::LVT_CMCI:
// LVT registers reset with the mask bit set. See Volume 3 Section 10.12.5.1.
next_rip(exit_info, vmcs);
guest_state.rax = LVT_MASKED;
break;
case X2ApicMsr::LVT_TIMER:
next_rip(exit_info, vmcs);
guest_state.rax = local_apic_state.lvt_timer;
break;
default:
// Issue a general protection fault for write-only and unimplemented
// registers.
dprintf(INFO, "hypervisor: Unhandled guest x2APIC RDMSR %#lx\n", guest_state.rcx);
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
break;
}
}
void handle_rdmsr(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state,
LocalApicState& local_apic_state) {
// On execution of rdmsr, ecx specifies the MSR and the result is stored in edx:eax.
switch (guest_state.ecx()) {
case X86_MSR_IA32_APIC_BASE: {
next_rip(exit_info, vmcs);
uint64_t result = kLocalApicPhysBase;
if (vmcs.Read(VmcsField16::VPID) == 1) {
result |= IA32_APIC_BASE_BSP;
}
guest_state.SetEdxEax(result);
break;
}
// From Volume 4, Section 2.1, Table 2-2: For now, only enable fast strings.
case X86_MSR_IA32_MISC_ENABLE:
next_rip(exit_info, vmcs);
guest_state.SetEdxEax(read_msr(X86_MSR_IA32_MISC_ENABLE) & kMiscEnableFastStrings);
break;
case X86_MSR_DRAM_ENERGY_STATUS:
case X86_MSR_DRAM_POWER_LIMIT:
// From Volume 3, Section 28.2.6.2: The MTRRs have no effect on the memory
// type used for an access to a guest-physical address.
case X86_MSR_IA32_MTRRCAP:
case X86_MSR_IA32_MTRR_DEF_TYPE:
case X86_MSR_IA32_MTRR_FIX64K_00000:
case X86_MSR_IA32_MTRR_FIX16K_80000 ... X86_MSR_IA32_MTRR_FIX16K_A0000:
case X86_MSR_IA32_MTRR_FIX4K_C0000 ... X86_MSR_IA32_MTRR_FIX4K_F8000:
case X86_MSR_IA32_MTRR_PHYSBASE0 ... X86_MSR_IA32_MTRR_PHYSMASK9:
// From Volume 3, Section 9.11.4: For now, 0.
case X86_MSR_IA32_PLATFORM_ID:
// From Volume 3, Section 9.11.7: 0 indicates no microcode update is loaded.
case X86_MSR_IA32_BIOS_SIGN_ID:
// From Volume 3, Section 15.3.1: 0 indicates that our machine has no
// checking capabilities.
case X86_MSR_IA32_MCG_CAP:
case X86_MSR_IA32_MCG_STATUS:
case X86_MSR_IA32_TEMPERATURE_TARGET:
case X86_MSR_PKG_ENERGY_STATUS:
case X86_MSR_PLATFORM_ENERGY_COUNTER:
case X86_MSR_PLATFORM_POWER_LIMIT:
case X86_MSR_PP0_ENERGY_STATUS:
case X86_MSR_PP0_POWER_LIMIT:
case X86_MSR_PP1_ENERGY_STATUS:
case X86_MSR_PP1_POWER_LIMIT:
case X86_MSR_RAPL_POWER_UNIT:
// From Volume 3, Section 14.2: We've configured CPUID to report no MPERF/APERF
// support, but Linux attempts to read stats anyhow. Just ignore it.
case X86_MSR_PPERF:
// From Volume 4, Table 2-15: Number of SMI interrupts since boot.
// We report 0 interrupts.
case X86_MSR_SMI_COUNT:
next_rip(exit_info, vmcs);
guest_state.SetEdxEax(0);
break;
case kX2ApicMsrBase ... kX2ApicMsrMax:
handle_apic_rdmsr(exit_info, vmcs, guest_state, local_apic_state);
break;
default:
dprintf(INFO, "hypervisor: Unhandled guest RDMSR %#lx\n", guest_state.rcx);
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
break;
}
}
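// Computes the monotonic deadline for the local APIC timer from the LVT
// initial count and divide configuration. Returns 0 if the timer is not in
// one-shot or periodic mode.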
zx_time_t lvt_deadline(LocalApicState& local_apic_state) {
if ((local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) != LVT_TIMER_MODE_ONESHOT &&
(local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) != LVT_TIMER_MODE_PERIODIC) {
return 0;
}
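// From Volume 3, Section 10.5.4: bits 0, 1, and 3 of the divide configuration
// register select a divide value of 2^(N+1), where N = 0b111 wraps around to
// divide-by-1.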
uint32_t shift = BITS_SHIFT(local_apic_state.lvt_divide_config, 1, 0) |
(BIT_SHIFT(local_apic_state.lvt_divide_config, 3) << 2);
uint32_t divisor_shift = (shift + 1) & 7;
int64_t duration_tsc_ticks =
static_cast<int64_t>(local_apic_state.lvt_initial_count << divisor_shift);
zx_duration_t duration = convert_raw_tsc_duration_to_nanoseconds(duration_tsc_ticks);
return zx_time_add_duration(current_time(), duration);
}
void update_timer(LocalApicState& local_apic_state, zx_time_t deadline);
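// Timer callback: injects the LVT timer vector into the guest and, in periodic
// mode, re-arms the timer for the next deadline.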
void deadline_callback(Timer* timer, zx_time_t now, void* arg) {
auto& local_apic_state = *static_cast<LocalApicState*>(arg);
if (local_apic_state.lvt_timer & LVT_MASKED) {
return;
}
if ((local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) == LVT_TIMER_MODE_PERIODIC) {
update_timer(local_apic_state, lvt_deadline(local_apic_state));
}
uint8_t vector = local_apic_state.lvt_timer & LVT_TIMER_VECTOR_MASK;
local_apic_state.interrupt_tracker.Interrupt(vector);
}
void update_timer(LocalApicState& local_apic_state, zx_time_t deadline) {
local_apic_state.timer.Cancel();
if (deadline > 0) {
local_apic_state.timer.SetOneshot(deadline, deadline_callback, &local_apic_state);
}
}
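// Computes the bitmask of VCPUs targeted by an IPI, where bit N corresponds to
// the VCPU with APIC ID N.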
uint64_t ipi_target_mask(const InterruptCommandRegister& icr, uint16_t self) {
DEBUG_ASSERT(self < NormalGuest::kMaxGuestVcpus);
switch (icr.destination_shorthand) {
case InterruptDestinationShorthand::NO_SHORTHAND: {
// Intel Volume 3, Section 10.12.9: A destination ID value of FFFF_FFFFH
// is used for broadcast of interrupts in both logical destination and
// physical destination modes.
if (icr.destination == kIpiBroadcastDestination) {
return UINT64_MAX;
}
// If an invalid destination was provided, just return the empty mask.
if (unlikely(icr.destination >= NormalGuest::kMaxGuestVcpus)) {
return 0;
}
// Otherwise, generate a mask for the target VCPU.
return 1u << icr.destination;
}
case InterruptDestinationShorthand::SELF:
return 1u << self;
case InterruptDestinationShorthand::ALL_INCLUDING_SELF:
return UINT64_MAX;
case InterruptDestinationShorthand::ALL_EXCLUDING_SELF:
return ~(1u << self);
}
return 0;
}
zx::result<> handle_ipi(const ExitInfo& exit_info, AutoVmcs& vmcs, const GuestState& guest_state,
zx_port_packet& packet) {
InterruptCommandRegister icr(guest_state.edx(), guest_state.eax());
if (icr.destination_mode == InterruptDestinationMode::LOGICAL) {
dprintf(INFO, "hypervisor: Logical IPI destination mode requested by guest is not supported\n");
return zx::error(ZX_ERR_NOT_SUPPORTED);
}
switch (icr.delivery_mode) {
case InterruptDeliveryMode::FIXED: {
uint16_t self = vmcs.Read(VmcsField16::VPID) - 1;
memset(&packet, 0, sizeof(packet));
packet.type = ZX_PKT_TYPE_GUEST_VCPU;
packet.guest_vcpu.type = ZX_PKT_GUEST_VCPU_INTERRUPT;
packet.guest_vcpu.interrupt.mask = ipi_target_mask(icr, self);
packet.guest_vcpu.interrupt.vector = icr.vector;
next_rip(exit_info, vmcs);
return zx::error(ZX_ERR_NEXT);
}
case InterruptDeliveryMode::NMI: {
uint16_t self = vmcs.Read(VmcsField16::VPID) - 1;
memset(&packet, 0, sizeof(packet));
packet.type = ZX_PKT_TYPE_GUEST_VCPU;
packet.guest_vcpu.type = ZX_PKT_GUEST_VCPU_INTERRUPT;
// Intel Volume 3a, Table 10-4 specifies that NMI to self is an invalid configuration and
// behavior is undefined for invalid configurations.
//
// For simplicity we'll just clear the self-bit in the mask.
packet.guest_vcpu.interrupt.mask = ipi_target_mask(icr, self) & ~(1 << self);
// Intel Volume 3a, Section 10.6.1 Interrupt Command Register.
//
// For NMI the target information is ignored since the NMI vector is already defined.
packet.guest_vcpu.interrupt.vector = X86_INT_NMI;
next_rip(exit_info, vmcs);
return zx::error(ZX_ERR_NEXT);
}
case InterruptDeliveryMode::INIT:
// Ignore INIT IPIs; we only need STARTUP to bring up a VCPU.
next_rip(exit_info, vmcs);
return zx::ok();
case InterruptDeliveryMode::STARTUP:
memset(&packet, 0, sizeof(packet));
packet.type = ZX_PKT_TYPE_GUEST_VCPU;
packet.guest_vcpu.type = ZX_PKT_GUEST_VCPU_STARTUP;
packet.guest_vcpu.startup.id = icr.destination;
packet.guest_vcpu.startup.entry = icr.vector << 12;
next_rip(exit_info, vmcs);
return zx::error(ZX_ERR_NEXT);
default:
dprintf(INFO, "hypervisor: Unsupported guest IPI delivery mode %#x\n",
static_cast<uint8_t>(icr.delivery_mode));
return zx::error(ZX_ERR_NOT_SUPPORTED);
}
}
zx::result<> handle_apic_wrmsr(const ExitInfo& exit_info, AutoVmcs& vmcs,
const GuestState& guest_state, LocalApicState& local_apic_state,
zx_port_packet& packet) {
// Check for writes to reserved bits.
//
// From Volume 3, Section 10.12.1.2: "The upper 32-bits of all x2APIC MSRs
// (except for the ICR) are reserved."
X2ApicMsr reg = static_cast<X2ApicMsr>(guest_state.ecx());
if (unlikely(guest_state.edx() != 0 && reg != X2ApicMsr::ICR)) {
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
return zx::ok();
}
switch (reg) {
case X2ApicMsr::EOI:
case X2ApicMsr::ESR:
// From Volume 3, Section 10.12.1.2: "WRMSR of a non-zero value causes #GP(0)."
if (guest_state.eax() != 0) {
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
return zx::ok();
}
next_rip(exit_info, vmcs);
return zx::ok();
case X2ApicMsr::TPR:
case X2ApicMsr::SVR:
case X2ApicMsr::LVT_MONITOR:
case X2ApicMsr::LVT_ERROR:
case X2ApicMsr::LVT_LINT0:
case X2ApicMsr::LVT_LINT1:
case X2ApicMsr::LVT_THERMAL_SENSOR:
case X2ApicMsr::LVT_CMCI:
next_rip(exit_info, vmcs);
return zx::ok();
case X2ApicMsr::LVT_TIMER:
if ((guest_state.eax() & LVT_TIMER_MODE_MASK) == LVT_TIMER_MODE_RESERVED) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
next_rip(exit_info, vmcs);
local_apic_state.lvt_timer = guest_state.eax();
update_timer(local_apic_state, lvt_deadline(local_apic_state));
return zx::ok();
case X2ApicMsr::INITIAL_COUNT:
next_rip(exit_info, vmcs);
local_apic_state.lvt_initial_count = guest_state.eax();
update_timer(local_apic_state, lvt_deadline(local_apic_state));
return zx::ok();
case X2ApicMsr::DCR:
next_rip(exit_info, vmcs);
local_apic_state.lvt_divide_config = guest_state.eax();
update_timer(local_apic_state, lvt_deadline(local_apic_state));
return zx::ok();
case X2ApicMsr::SELF_IPI: {
next_rip(exit_info, vmcs);
uint32_t vector = guest_state.eax() & UINT8_MAX;
local_apic_state.interrupt_tracker.Interrupt(vector);
return zx::ok();
}
case X2ApicMsr::ICR:
return handle_ipi(exit_info, vmcs, guest_state, packet);
default:
// Issue a general protection fault for read-only and unimplemented
// registers.
dprintf(INFO, "hypervisor: Unhandled guest x2APIC WRMSR %#" PRIx32 "\n", guest_state.ecx());
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
return zx::ok();
}
}
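// Handles writes to the KVM paravirtualized clock MSRs, which configure the
// system-time and boot-time structures in guest physical memory.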
zx::result<> handle_kvm_wrmsr(const ExitInfo& exit_info, AutoVmcs& vmcs,
const GuestState& guest_state, LocalApicState& local_apic_state,
PvClockState& pv_clock, hypervisor::GuestPhysicalAspace& gpa) {
zx_paddr_t guest_paddr = guest_state.EdxEax();
next_rip(exit_info, vmcs);
switch (guest_state.ecx()) {
case kKvmSystemTimeMsrOld:
case kKvmSystemTimeMsr:
vmcs.Invalidate();
if ((guest_paddr & 1) != 0) {
return pv_clock_reset_clock(&pv_clock, &gpa, guest_paddr & ~static_cast<zx_paddr_t>(1));
} else {
pv_clock_stop_clock(&pv_clock);
}
return zx::ok();
case kKvmBootTimeOld:
case kKvmBootTime:
vmcs.Invalidate();
return pv_clock_update_boot_time(&gpa, guest_paddr);
default:
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
return zx::ok();
}
}
zx::result<> handle_wrmsr(const ExitInfo& exit_info, AutoVmcs& vmcs, const GuestState& guest_state,
LocalApicState& local_apic_state, PvClockState& pv_clock,
hypervisor::GuestPhysicalAspace& gpa, zx_port_packet& packet) {
// On execution of wrmsr, rcx specifies the MSR and edx:eax contains the value to be written.
switch (guest_state.ecx()) {
case X86_MSR_IA32_APIC_BASE:
if ((guest_state.EdxEax() & ~IA32_APIC_BASE_BSP) != kLocalApicPhysBase) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
next_rip(exit_info, vmcs);
return zx::ok();
// See note in handle_rdmsr.
case X86_MSR_IA32_MTRRCAP:
case X86_MSR_IA32_MTRR_DEF_TYPE:
case X86_MSR_IA32_MTRR_FIX64K_00000:
case X86_MSR_IA32_MTRR_FIX16K_80000 ... X86_MSR_IA32_MTRR_FIX16K_A0000:
case X86_MSR_IA32_MTRR_FIX4K_C0000 ... X86_MSR_IA32_MTRR_FIX4K_F8000:
case X86_MSR_IA32_MTRR_PHYSBASE0 ... X86_MSR_IA32_MTRR_PHYSMASK9:
case X86_MSR_IA32_BIOS_SIGN_ID:
case X86_MSR_DRAM_POWER_LIMIT:
case X86_MSR_PP0_POWER_LIMIT:
case X86_MSR_PP1_POWER_LIMIT:
case X86_MSR_PLATFORM_POWER_LIMIT:
// We disable the associated CPUID bits, but Linux still writes to these
// MSRs. Just ignore it.
case X86_MSR_IA32_SPEC_CTRL:
case X86_MSR_IA32_PRED_CMD:
// From AMD64 Volume 2, Section 6.1.1: CSTAR is unused, but Linux likes to
// set a null handler, even when not in compatibility mode. Just ignore it.
case X86_MSR_IA32_CSTAR:
next_rip(exit_info, vmcs);
return zx::ok();
case X86_MSR_IA32_TSC_DEADLINE: {
if ((local_apic_state.lvt_timer & LVT_TIMER_MODE_MASK) != LVT_TIMER_MODE_TSC_DEADLINE) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
next_rip(exit_info, vmcs);
int64_t tsc_deadline = static_cast<int64_t>(guest_state.EdxEax());
zx_time_t mono_deadline = convert_raw_tsc_timestamp_to_clock_monotonic(tsc_deadline);
update_timer(local_apic_state, mono_deadline);
return zx::ok();
}
case kX2ApicMsrBase ... kX2ApicMsrMax:
return handle_apic_wrmsr(exit_info, vmcs, guest_state, local_apic_state, packet);
case kKvmSystemTimeMsrOld:
case kKvmSystemTimeMsr:
case kKvmBootTimeOld:
case kKvmBootTime:
return handle_kvm_wrmsr(exit_info, vmcs, guest_state, local_apic_state, pv_clock, gpa);
default:
dprintf(INFO, "hypervisor: Unhandled guest WRMSR %#lx\n", guest_state.rcx);
local_apic_state.interrupt_tracker.Interrupt(X86_INT_GP_FAULT);
return zx::ok();
}
}
uint8_t default_operand_size(uint64_t efer, uint32_t cs_access_rights) {
// See Volume 3, Section 5.2.1.
if ((efer & X86_EFER_LMA) && (cs_access_rights & kGuestXxAccessRightsL)) {
// IA32-e 64 bit mode.
return 4;
} else if (cs_access_rights & kGuestXxAccessRightsD) {
// CS.D set (and not 64 bit mode).
return 4;
} else {
// CS.D clear (and not 64 bit mode).
return 2;
}
}
zx::result<> handle_trap(const ExitInfo& exit_info, AutoVmcs& vmcs, bool read,
zx_vaddr_t guest_paddr, hypervisor::TrapMap& traps,
zx_port_packet_t& packet) {
zx::result<hypervisor::Trap*> trap = traps.FindTrap(ZX_GUEST_TRAP_BELL, guest_paddr);
if (trap.is_error()) {
return trap.take_error();
}
next_rip(exit_info, vmcs);
switch ((*trap)->kind()) {
case ZX_GUEST_TRAP_BELL:
if (read) {
return zx::error(ZX_ERR_NOT_SUPPORTED);
}
packet.key = (*trap)->key();
packet.type = ZX_PKT_TYPE_GUEST_BELL;
packet.guest_bell.addr = guest_paddr;
if (!(*trap)->HasPort()) {
return zx::error(ZX_ERR_BAD_STATE);
}
return (*trap)->Queue(packet, &vmcs);
case ZX_GUEST_TRAP_MEM:
if (exit_info.exit_instruction_length > kMaxInstructionSize) {
return zx::error(ZX_ERR_INTERNAL);
}
packet.key = (*trap)->key();
packet.type = ZX_PKT_TYPE_GUEST_MEM;
packet.guest_mem.addr = guest_paddr;
packet.guest_mem.cr3 = vmcs.Read(VmcsFieldXX::GUEST_CR3);
packet.guest_mem.rip = exit_info.guest_rip;
packet.guest_mem.instruction_size = static_cast<uint8_t>(exit_info.exit_instruction_length);
packet.guest_mem.default_operand_size = default_operand_size(
vmcs.Read(VmcsField64::GUEST_IA32_EFER), vmcs.Read(VmcsField32::GUEST_CS_ACCESS_RIGHTS));
return zx::error(ZX_ERR_NEXT);
default:
return zx::error(ZX_ERR_BAD_STATE);
}
}
zx::result<> handle_ept_violation(const ExitInfo& exit_info, AutoVmcs& vmcs,
hypervisor::GuestPhysicalAspace& gpa, hypervisor::TrapMap& traps,
zx_port_packet_t& packet) {
const EptViolationInfo ept_violation_info(vmcs.Read(VmcsFieldXX::EXIT_QUALIFICATION));
zx_gpaddr_t guest_paddr = vmcs.Read(VmcsField64::GUEST_PHYSICAL_ADDRESS);
auto result = handle_trap(exit_info, vmcs, ept_violation_info.read, guest_paddr, traps, packet);
if (result.status_value() != ZX_ERR_NOT_FOUND) {
return result;
}
// We may have to block when handling the page fault.
vmcs.Invalidate();
// If there was no trap associated with this address and it is outside of
// guest physical address space, return failure.
if (guest_paddr >= gpa.size()) {
return zx::error(ZX_ERR_OUT_OF_RANGE);
}
result = gpa.PageFault(guest_paddr);
if (result.is_error()) {
dprintf(CRITICAL, "hypervisor: Unhandled EPT violation %#lx\n", guest_paddr);
}
return result;
}
zx::result<> handle_xsetbv(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state) {
uint64_t guest_cr4 = vmcs.Read(VmcsFieldXX::GUEST_CR4);
if (!(guest_cr4 & X86_CR4_OSXSAVE)) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
// We only support XCR0.
if (guest_state.rcx != 0) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
cpuid_leaf leaf;
if (!x86_get_cpuid_subleaf(X86_CPUID_XSAVE, 0, &leaf)) {
return zx::error(ZX_ERR_INTERNAL);
}
// Check that XCR0 is valid.
uint64_t xcr0_bitmap = (static_cast<uint64_t>(leaf.d) << 32) | leaf.a;
uint64_t xcr0 = guest_state.EdxEax();
if (~xcr0_bitmap & xcr0 ||
// x87 state must be enabled.
(xcr0 & X86_XSAVE_STATE_BIT_X87) != X86_XSAVE_STATE_BIT_X87 ||
// If AVX state is enabled, SSE state must be enabled.
(xcr0 & (X86_XSAVE_STATE_BIT_AVX | X86_XSAVE_STATE_BIT_SSE)) == X86_XSAVE_STATE_BIT_AVX) {
return zx::error(ZX_ERR_INVALID_ARGS);
}
guest_state.xcr0 = xcr0;
next_rip(exit_info, vmcs);
return zx::ok();
}
void handle_pause(const ExitInfo& exit_info, AutoVmcs& vmcs) { next_rip(exit_info, vmcs); }
bool is_cpl0(AutoVmcs& vmcs) {
const uint32_t access_rights = vmcs.Read(VmcsField32::GUEST_SS_ACCESS_RIGHTS);
// We only accept a VMCALL if CPL is 0.
return (access_rights & kGuestXxAccessRightsDplUser) == 0;
}
void handle_vmcall_regular(const ExitInfo& exit_info, AutoVmcs& vmcs, GuestState& guest_state,
hypervisor::GuestPhysicalAspace& gpa) {
next_rip(exit_info, vmcs);
if (!is_cpl0(vmcs)) {
guest_state.rax = VmCallStatus::NOT_PERMITTED;
return;
}
vmcs.Invalidate();
// We never fail on hypercalls; we just return or propagate errors to the caller.
const VmCallInfo info(guest_state);
switch (info.type) {
case VmCallType::CLOCK_PAIRING: {
if (info.arg[1] != 0) {
dprintf(INFO, "hypervisor: CLOCK_PAIRING hypercall doesn't support clock type %lu\n",
info.arg[1]);
guest_state.rax = VmCallStatus::NOT_SUPPORTED;
break;
}
if (auto result = pv_clock_populate_offset(&gpa, info.arg[0]); result.is_error()) {
dprintf(INFO, "hypervisor: Failed to populate lock offset with error %d\n",
result.status_value());
guest_state.rax = VmCallStatus::FAULT;
break;
}
guest_state.rax = VmCallStatus::OK;
break;
}
default:
dprintf(INFO,
"hypervisor: Unknown hypercall %lu (arg0=%#lx, arg1=%#lx, arg2=%#lx, arg3=%#lx)\n",
static_cast<uint64_t>(info.type), info.arg[0], info.arg[1], info.arg[2], info.arg[3]);
guest_state.rax = VmCallStatus::UNKNOWN_HYPERCALL;
break;
}
}
zx::result<> handle_vmcall_direct(const ExitInfo& exit_info, AutoVmcs& vmcs,
GuestState& guest_state, uintptr_t& fs_base,
zx_port_packet_t& packet) {
next_rip(exit_info, vmcs);
if (!is_cpl0(vmcs)) {
guest_state.rax = ZX_ERR_ACCESS_DENIED;
return zx::ok();
}
vmcs.Invalidate();
zx_status_t status = vmcall_dispatch(guest_state, fs_base, packet);
return zx::make_result(status);
}
} // namespace
ExitInfo::ExitInfo(const AutoVmcs& vmcs) {
// From Volume 3, Section 26.7.
uint32_t full_exit_reason = vmcs.Read(VmcsField32::EXIT_REASON);
entry_failure = BIT(full_exit_reason, 31);
exit_reason = static_cast<ExitReason>(BITS(full_exit_reason, 15, 0));
exit_instruction_length = vmcs.Read(VmcsField32::EXIT_INSTRUCTION_LENGTH);
guest_rip = vmcs.Read(VmcsFieldXX::GUEST_RIP);
if (exit_reason == ExitReason::EXTERNAL_INTERRUPT || exit_reason == ExitReason::IO_INSTRUCTION) {
return;
}
LTRACEF("entry failure: %d\n", entry_failure);
LTRACEF("exit reason: %#x (%s)\n", static_cast<uint32_t>(exit_reason),
exit_reason_name(exit_reason));
LTRACEF("exit instruction length: %#x\n", exit_instruction_length);
LTRACEF("guest activity state: %#x\n", vmcs.Read(VmcsField32::GUEST_ACTIVITY_STATE));
LTRACEF("guest interruptibility state: %#x\n",
vmcs.Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE));
LTRACEF("guest linear address: %#lx\n", vmcs.Read(VmcsFieldXX::GUEST_LINEAR_ADDRESS));
LTRACEF("guest rip: %#lx\n", guest_rip);
}
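// Decodes the VM-exit interruption information field: vector, interruption
// type, error-code-valid bit, and valid bit.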
ExitInterruptionInfo::ExitInterruptionInfo(const AutoVmcs& vmcs) {
uint32_t int_info = vmcs.Read(VmcsField32::EXIT_INTERRUPTION_INFORMATION);
vector = static_cast<uint8_t>(BITS(int_info, 7, 0));
interruption_type = static_cast<InterruptionType>(BITS_SHIFT(int_info, 10, 8));
error_code_valid = BIT(int_info, 11);
valid = BIT(int_info, 31);
}
PageFaultInfo::PageFaultInfo(uint32_t error_code) {
// From Volume 3A, Figure 4-12.
flags = 0;
flags |= (error_code & PFEX_W) ? VMM_PF_FLAG_WRITE : 0;
flags |= (error_code & PFEX_U) ? VMM_PF_FLAG_USER : 0;
flags |= (error_code & PFEX_I) ? VMM_PF_FLAG_INSTRUCTION : 0;
flags |= (error_code & PFEX_P) ? 0 : VMM_PF_FLAG_NOT_PRESENT;
}
EptViolationInfo::EptViolationInfo(uint64_t qualification) {
// From Volume 3C, Table 27-7.
read = BIT(qualification, 0);
write = BIT(qualification, 1);
instruction = BIT(qualification, 2);
}
CrAccessInfo::CrAccessInfo(uint64_t qualification) {
// From Volume 3, Table 27-3.
cr_number = static_cast<uint8_t>(BITS(qualification, 3, 0));
access_type = static_cast<CrAccessType>(BITS_SHIFT(qualification, 5, 4));
reg = static_cast<uint8_t>(BITS_SHIFT(qualification, 11, 8));
}
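// Decodes the exit qualification for I/O instructions (Volume 3, Table 27-5).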
IoInfo::IoInfo(uint64_t qualification) {
access_size = static_cast<uint8_t>(BITS(qualification, 2, 0) + 1);
input = BIT_SHIFT(qualification, 3);
string = BIT_SHIFT(qualification, 4);
repeat = BIT_SHIFT(qualification, 5);
port = static_cast<uint16_t>(BITS_SHIFT(qualification, 31, 16));
}
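// Decodes the x2APIC interrupt command register: EDX holds the destination and
// EAX holds the vector and control fields. See Volume 3, Section 10.12.9.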
InterruptCommandRegister::InterruptCommandRegister(uint32_t hi, uint32_t lo) {
destination = hi;
destination_mode = static_cast<InterruptDestinationMode>(BIT_SHIFT(lo, 11));
delivery_mode = static_cast<InterruptDeliveryMode>(BITS_SHIFT(lo, 10, 8));
destination_shorthand = static_cast<InterruptDestinationShorthand>(BITS_SHIFT(lo, 19, 18));
vector = static_cast<uint8_t>(BITS(lo, 7, 0));
}
VmCallInfo::VmCallInfo(const GuestState& guest_state) {
// ABI is documented in Linux kernel documentation, see
// Documents/virtual/kvm/hypercalls.txt
type = static_cast<VmCallType>(guest_state.rax);
arg[0] = guest_state.rbx;
arg[1] = guest_state.rcx;
arg[2] = guest_state.rdx;
arg[3] = guest_state.rsi;
}
zx::result<> vmexit_handler_normal(AutoVmcs& vmcs, GuestState& guest_state,
LocalApicState& local_apic_state, PvClockState& pv_clock,
hypervisor::GuestPhysicalAspace& gpa, hypervisor::TrapMap& traps,
zx_port_packet_t& packet) {
zx::result<> result = zx::ok();
const ExitInfo exit_info(vmcs);
switch (exit_info.exit_reason) {
case ExitReason::EXTERNAL_INTERRUPT:
ktrace_vcpu_exit(VCPU_EXTERNAL_INTERRUPT, exit_info.guest_rip);
GUEST_STATS_INC(interrupts);
handle_external_interrupt(vmcs);
break;
case ExitReason::INTERRUPT_WINDOW:
ktrace_vcpu_exit(VCPU_INTERRUPT_WINDOW, exit_info.guest_rip);
GUEST_STATS_INC(interrupt_windows);
handle_interrupt_window(vmcs);
break;
case ExitReason::CPUID:
ktrace_vcpu_exit(VCPU_CPUID, exit_info.guest_rip);
GUEST_STATS_INC(cpuid_instructions);
result = handle_cpuid(exit_info, vmcs, guest_state);
break;
case ExitReason::HLT:
ktrace_vcpu_exit(VCPU_HLT, exit_info.guest_rip);
GUEST_STATS_INC(hlt_instructions);
result = handle_hlt(exit_info, vmcs, local_apic_state);
break;
case ExitReason::CONTROL_REGISTER_ACCESS:
ktrace_vcpu_exit(VCPU_CONTROL_REGISTER_ACCESS, exit_info.guest_rip);
GUEST_STATS_INC(control_register_accesses);
result = handle_control_register_access(exit_info, vmcs, guest_state, local_apic_state);
break;
case ExitReason::IO_INSTRUCTION:
ktrace_vcpu_exit(VCPU_IO_INSTRUCTION, exit_info.guest_rip);
GUEST_STATS_INC(io_instructions);
result = handle_io_instruction(exit_info, vmcs, guest_state, traps, packet);
break;
case ExitReason::RDMSR:
ktrace_vcpu_exit(VCPU_RDMSR, exit_info.guest_rip);
GUEST_STATS_INC(rdmsr_instructions);
handle_rdmsr(exit_info, vmcs, guest_state, local_apic_state);
break;
case ExitReason::WRMSR:
ktrace_vcpu_exit(VCPU_WRMSR, exit_info.guest_rip);
GUEST_STATS_INC(wrmsr_instructions);
result = handle_wrmsr(exit_info, vmcs, guest_state, local_apic_state, pv_clock, gpa, packet);
break;
case ExitReason::ENTRY_FAILURE_GUEST_STATE:
case ExitReason::ENTRY_FAILURE_MSR_LOADING:
case ExitReason::ENTRY_FAILURE_MACHINE_CHECK:
ktrace_vcpu_exit(VCPU_VM_ENTRY_FAILURE, exit_info.guest_rip);
result = zx::error(ZX_ERR_BAD_STATE);
break;
case ExitReason::EPT_VIOLATION:
ktrace_vcpu_exit(VCPU_EPT_VIOLATION, exit_info.guest_rip);
GUEST_STATS_INC(ept_violations);
result = handle_ept_violation(exit_info, vmcs, gpa, traps, packet);
break;
case ExitReason::XSETBV:
ktrace_vcpu_exit(VCPU_XSETBV, exit_info.guest_rip);
GUEST_STATS_INC(xsetbv_instructions);
result = handle_xsetbv(exit_info, vmcs, guest_state);
break;
case ExitReason::PAUSE:
ktrace_vcpu_exit(VCPU_PAUSE, exit_info.guest_rip);
GUEST_STATS_INC(pause_instructions);
handle_pause(exit_info, vmcs);
break;
case ExitReason::VMCALL:
ktrace_vcpu_exit(VCPU_VMCALL, exit_info.guest_rip);
GUEST_STATS_INC(vmcall_instructions);
handle_vmcall_regular(exit_info, vmcs, guest_state, gpa);
break;
case ExitReason::EXCEPTION_OR_NMI:
// Currently all exceptions, except NMIs, are delivered directly to guests.
// NMIs cause VM exits and are handled by the host via the IDT as any other
// interrupt/exception.
default:
ktrace_vcpu_exit(VCPU_NOT_SUPPORTED, exit_info.guest_rip);
result = zx::error(ZX_ERR_NOT_SUPPORTED);
break;
}
switch (result.status_value()) {
case ZX_OK:
case ZX_ERR_NEXT:
case ZX_ERR_INTERNAL_INTR_RETRY:
case ZX_ERR_INTERNAL_INTR_KILLED:
break;
default:
dprintf(CRITICAL, "hypervisor: VM exit handler (regular) for %s (%u) returned %d\n",
exit_reason_name(exit_info.exit_reason), static_cast<uint32_t>(exit_info.exit_reason),
result.status_value());
dump_guest_state(guest_state, exit_info);
break;
}
return result;
}
zx::result<> vmexit_handler_direct(AutoVmcs& vmcs, GuestState& guest_state, uintptr_t& fs_base,
zx_port_packet_t& packet) {
zx::result<> result = zx::ok();
const ExitInfo exit_info(vmcs);
switch (exit_info.exit_reason) {
case ExitReason::EXCEPTION_OR_NMI:
ktrace_vcpu_exit(VCPU_EXCEPTION_OR_NMI, exit_info.guest_rip);
result = handle_exception_or_nmi(vmcs);
break;
case ExitReason::EXTERNAL_INTERRUPT:
ktrace_vcpu_exit(VCPU_EXTERNAL_INTERRUPT, exit_info.guest_rip);
GUEST_STATS_INC(interrupts);
handle_external_interrupt(vmcs);
break;
case ExitReason::CPUID:
ktrace_vcpu_exit(VCPU_CPUID, exit_info.guest_rip);
GUEST_STATS_INC(cpuid_instructions);
result = handle_cpuid(exit_info, vmcs, guest_state);
break;
case ExitReason::VMCALL:
ktrace_vcpu_exit(VCPU_VMCALL, exit_info.guest_rip);
GUEST_STATS_INC(vmcall_instructions);
result = handle_vmcall_direct(exit_info, vmcs, guest_state, fs_base, packet);
break;
case ExitReason::ENTRY_FAILURE_GUEST_STATE:
case ExitReason::ENTRY_FAILURE_MSR_LOADING:
case ExitReason::ENTRY_FAILURE_MACHINE_CHECK:
ktrace_vcpu_exit(VCPU_VM_ENTRY_FAILURE, exit_info.guest_rip);
result = zx::error(ZX_ERR_BAD_STATE);
break;
default:
ktrace_vcpu_exit(VCPU_NOT_SUPPORTED, exit_info.guest_rip);
result = zx::error(ZX_ERR_NOT_SUPPORTED);
break;
}
switch (result.status_value()) {
case ZX_OK:
case ZX_ERR_NEXT:
case ZX_ERR_INTERNAL_INTR_RETRY:
case ZX_ERR_INTERNAL_INTR_KILLED:
break;
default:
dprintf(CRITICAL,
"hypervisor: VM exit handler (direct) for %s (%u) returned %d on thread %s\n",
exit_reason_name(exit_info.exit_reason), static_cast<uint32_t>(exit_info.exit_reason),
result.status_value(), Thread::Current::Get()->name());
dump_guest_state(guest_state, exit_info);
break;
}
return result;
}