| // Copyright 2017 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include <bits.h> |
| #include <lib/arch/x86/boot-cpuid.h> |
| #include <lib/arch/x86/speculation.h> |
| #include <lib/boot-options/boot-options.h> |
| #include <lib/fit/defer.h> |
| #include <lib/ktrace.h> |
| #include <zircon/syscalls/hypervisor.h> |
| |
| #include <new> |
| |
| #include <arch/x86/descriptor.h> |
| #include <arch/x86/feature.h> |
| #include <arch/x86/hypervisor/invalidate.h> |
| #include <arch/x86/platform_access.h> |
| #include <arch/x86/pv.h> |
| #include <hwreg/x86msr.h> |
| #include <hypervisor/cpu.h> |
| #include <hypervisor/ktrace.h> |
| #include <kernel/percpu.h> |
| #include <kernel/stats.h> |
| #include <vm/fault.h> |
| #include <vm/pmm.h> |
| #include <vm/vm_object.h> |
| |
| #include "pv_priv.h" |
| #include "vcpu_priv.h" |
| #include "vmexit_priv.h" |
| #include "vmx_cpu_state_priv.h" |
| |
| namespace { |
| |
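| // Bits of the VM-entry interruption-information field (Volume 3, Section 24.8.3). |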
| constexpr uint32_t kInterruptInfoValid = 1u << 31; |
| constexpr uint32_t kInterruptInfoDeliverErrorCode = 1u << 11; |
| constexpr uint32_t kInterruptTypeNmi = 2u << 8; |
| constexpr uint32_t kInterruptTypeHardwareException = 3u << 8; |
| constexpr uint32_t kInterruptTypeSoftwareException = 6u << 8; |
| constexpr uint16_t kBaseProcessorVpid = 1; |
| |
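| // Makes the VMCS at physical address `pa` the current, active VMCS on this CPU (VMPTRLD). |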
| void vmptrld(paddr_t pa) { |
| uint8_t err; |
| |
| __asm__ __volatile__("vmptrld %[pa]" |
| : "=@ccna"(err) // Set `err` on error (C or Z flag set) |
| : [pa] "m"(pa) |
| : "cc", "memory"); |
| |
| ASSERT(!err); |
| } |
| |
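| // Flushes cached data for the VMCS at `pa` to memory and marks it inactive and clear (VMCLEAR). |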
| void vmclear(paddr_t pa) { |
| uint8_t err; |
| |
| __asm__ __volatile__("vmclear %[pa]" |
| : "=@ccna"(err) // Set `err` on error (C or Z flag set) |
| : [pa] "m"(pa) |
| : "cc", "memory"); |
| |
| ASSERT(!err); |
| } |
| |
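| // Reads the given field from the current VMCS (VMREAD). |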
| uint64_t vmread(uint64_t field) { |
| uint8_t err; |
| uint64_t val; |
| |
| __asm__ __volatile__("vmread %[field], %[val]" |
| : [val] "=r"(val), |
| "=@ccna"(err) // Set `err` on error (C or Z flag set) |
| : [field] "r"(field) |
| : "cc"); |
| ASSERT(!err); |
| return val; |
| } |
| |
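| // Writes `val` to the given field of the current VMCS (VMWRITE). |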
| void vmwrite(uint64_t field, uint64_t val) { |
| uint8_t err; |
| |
| __asm__ __volatile__("vmwrite %[val], %[field]" |
| : "=@ccna"(err) // Set `err` on error (C or Z flag set) |
| : [val] "r"(val), [field] "r"(field) |
| : "cc"); |
| ASSERT(!err); |
| } |
| |
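| // Returns whether the given exception vector pushes an error code (Volume 3, Section 6.15). |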
| bool has_error_code(uint32_t vector) { |
| switch (vector) { |
| case X86_INT_DOUBLE_FAULT: |
| case X86_INT_INVALID_TSS: |
| case X86_INT_SEGMENT_NOT_PRESENT: |
| case X86_INT_STACK_FAULT: |
| case X86_INT_GP_FAULT: |
| case X86_INT_PAGE_FAULT: |
| case X86_INT_ALIGNMENT_CHECK: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
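| // Saves the extended register (x87/SSE/AVX) state to `save_extended_registers`, |
| // optionally swapping XCR0, then restores `load_extended_registers`. |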
| void swap_extended_registers(uint8_t* save_extended_registers, uint64_t& save_xcr0, bool save, |
| uint8_t* load_extended_registers, uint64_t& load_xcr0, bool load) { |
| x86_extended_register_save_state(save_extended_registers); |
| if (save) { |
| save_xcr0 = x86_xgetbv(0); |
| } |
| if (load) { |
| x86_xsetbv(0, load_xcr0); |
| } |
| x86_extended_register_restore_state(load_extended_registers); |
| } |
| |
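| // Copies the general-purpose registers between the guest register state and `zx_vcpu_state_t`. |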
| template <typename Out, typename In> |
| void register_copy(Out& out, const In& in) { |
| out.rax = in.rax; |
| out.rcx = in.rcx; |
| out.rdx = in.rdx; |
| out.rbx = in.rbx; |
| out.rbp = in.rbp; |
| out.rsi = in.rsi; |
| out.rdi = in.rdi; |
| out.r8 = in.r8; |
| out.r9 = in.r9; |
| out.r10 = in.r10; |
| out.r11 = in.r11; |
| out.r12 = in.r12; |
| out.r13 = in.r13; |
| out.r14 = in.r14; |
| out.r15 = in.r15; |
| } |
| |
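| // Initializes the VMCS controls, host state, and guest state for a new VCPU. |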
| zx::result<> vmcs_init(AutoVmcs& vmcs, const VcpuConfig& config, uint16_t vpid, uintptr_t entry, |
| paddr_t msr_bitmaps_address, paddr_t ept_pml4, VmxState* vmx_state, |
| uint8_t* extended_register_state) { |
| // Setup secondary processor-based VMCS controls. |
| auto result = |
| vmcs.SetControl(VmcsField32::PROCBASED_CTLS2, read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2), 0, |
| // Enable use of extended page tables. |
| kProcbasedCtls2Ept | |
| // Enable use of RDTSCP instruction. |
| kProcbasedCtls2Rdtscp | |
| // Enable X2APIC. |
| kProcbasedCtls2x2Apic | |
| // Associate cached translations of linear |
| // addresses with a virtual processor ID. |
| kProcbasedCtls2Vpid | |
| // If `unrestricted`, enable unrestricted guest. |
| (config.unrestricted ? kProcbasedCtls2UnrestrictedGuest : 0), |
| // If not `unrestricted`, disable unrestricted guest. |
| (config.unrestricted ? 0 : kProcbasedCtls2UnrestrictedGuest)); |
| if (result.is_error()) { |
| return result; |
| } |
| |
| // Enable use of INVPCID instruction if available. |
| std::ignore = |
| vmcs.SetControl(VmcsField32::PROCBASED_CTLS2, read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2), |
| vmcs.Read(VmcsField32::PROCBASED_CTLS2), kProcbasedCtls2Invpcid, 0); |
| |
| // Setup pin-based VMCS controls. |
| result = |
| vmcs.SetControl(VmcsField32::PINBASED_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS), |
| read_msr(X86_MSR_IA32_VMX_PINBASED_CTLS), |
| // External interrupts cause a VM exit. |
| kPinbasedCtlsExtIntExiting | |
| // Non-maskable interrupts cause a VM exit. |
| kPinbasedCtlsNmiExiting, |
| 0); |
| if (result.is_error()) { |
| return result; |
| } |
| |
| const uint32_t cr_ctls = |
| // VM exit on CR3 load. |
| kProcbasedCtlsCr3LoadExiting | |
| // VM exit on CR3 store. |
| kProcbasedCtlsCr3StoreExiting | |
| // VM exit on CR8 load. |
| kProcbasedCtlsCr8LoadExiting | |
| // VM exit on CR8 store. |
| kProcbasedCtlsCr8StoreExiting; |
| // Setup primary processor-based VMCS controls. |
| result = |
| vmcs.SetControl(VmcsField32::PROCBASED_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS), |
| read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS), |
| // Enable VM exit when interrupts are enabled. |
| kProcbasedCtlsIntWindowExiting | |
| // Enable VM exit on HLT instruction. |
| kProcbasedCtlsHltExiting | |
| // Enable TPR virtualization. |
| kProcbasedCtlsTprShadow | |
| // Enable VM exit on IO instructions. |
| kProcbasedCtlsIoExiting | |
| // Enable use of MSR bitmaps. |
| kProcbasedCtlsMsrBitmaps | |
| // Enable secondary processor-based controls. |
| kProcbasedCtlsProcbasedCtls2 | |
| // If `cr_exiting`, enable VM exit on CRs. |
| (config.cr_exiting ? cr_ctls : 0), |
| // If not `cr_exiting`, disable VM exit on CRs. |
| (config.cr_exiting ? 0 : cr_ctls)); |
| if (result.is_error()) { |
| return result; |
| } |
| |
| // We only enable interrupt-window exiting above to ensure that the |
| // processor supports it for later use. So disable it for now. |
| vmcs.InterruptWindowExiting(false); |
| |
| // Setup VM-exit VMCS controls. |
| result = vmcs.SetControl(VmcsField32::EXIT_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS), |
| read_msr(X86_MSR_IA32_VMX_EXIT_CTLS), |
| // Logical processor is in 64-bit mode after VM |
| // exit. On VM exit, CS.L, IA32_EFER.LME, and |
| // IA32_EFER.LMA are set to true. |
| kExitCtls64bitMode | |
| // Acknowledge external interrupt on exit. |
| kExitCtlsAckIntOnExit | |
| // Save the guest IA32_PAT MSR on exit. |
| kExitCtlsSaveIa32Pat | |
| // Load the host IA32_PAT MSR on exit. |
| kExitCtlsLoadIa32Pat | |
| // Save the guest IA32_EFER MSR on exit. |
| kExitCtlsSaveIa32Efer | |
| // Load the host IA32_EFER MSR on exit. |
| kExitCtlsLoadIa32Efer, |
| 0); |
| if (result.is_error()) { |
| return result; |
| } |
| |
| // Whether we are configuring the base processor. The base processor starts in |
| // 64-bit mode with all features enabled. Secondary processors must be |
| // bootstrapped by the guest operating system. |
| // |
| // If there is no base processor for this VCPU type, then default to true. |
| const bool is_base_processor = config.has_base_processor ? vpid == kBaseProcessorVpid : true; |
| |
| // Setup VM-entry VMCS controls. |
| // Load the guest IA32_PAT MSR and IA32_EFER MSR on entry. |
| uint32_t entry_ctls = kEntryCtlsLoadIa32Pat | kEntryCtlsLoadIa32Efer; |
| if (is_base_processor) { |
| // On the BSP, go straight to 64-bit mode on entry. |
| entry_ctls |= kEntryCtls64bitMode; |
| } |
| result = vmcs.SetControl(VmcsField32::ENTRY_CTLS, read_msr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS), |
| read_msr(X86_MSR_IA32_VMX_ENTRY_CTLS), entry_ctls, 0); |
| if (result.is_error()) { |
| return result; |
| } |
| |
| // From Volume 3, Section 24.6.3: The exception bitmap is a 32-bit field |
| // that contains one bit for each exception. When an exception occurs, |
| // its vector is used to select a bit in this field. If the bit is 1, |
| // the exception causes a VM exit. If the bit is 0, the exception is |
| // delivered normally through the IDT, using the descriptor |
| // corresponding to the exception’s vector. |
| // |
| // From Volume 3, Section 25.2: If software desires VM exits on all page |
| // faults, it can set bit 14 in the exception bitmap to 1 and set the |
| // page-fault error-code mask and match fields each to 00000000H. |
| vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MASK, 0); |
| vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MATCH, 0); |
| |
| // From Volume 3, Section 28.1: Virtual-processor identifiers (VPIDs) |
| // introduce to VMX operation a facility by which a logical processor may |
| // cache information for multiple linear-address spaces. When VPIDs are |
| // used, VMX transitions may retain cached information even when the logical |
| // processor switches to a different linear-address space. |
| // |
| // From Volume 3, Section 26.2.1.1: If the “enable VPID” VM-execution |
| // control is 1, the value of the VPID VM-execution control field must not |
| // be 0000H. |
| // |
| // From Volume 3, Section 28.3.3.3: If EPT is in use, the logical processor |
| // associates all mappings it creates with the value of bits 51:12 of |
| // current EPTP. If a VMM uses different EPTP values for different guests, |
| // it may use the same VPID for those guests. |
| // |
| // From Volume 3, Section 28.3.3.1: Operations that architecturally |
| // invalidate entries in the TLBs or paging-structure caches independent of |
| // VMX operation (e.g., the INVLPG and INVPCID instructions) invalidate |
| // linear mappings and combined mappings. They are required to do so only |
| // for the current VPID (but, for combined mappings, all EP4TAs). Linear |
| // mappings for the current VPID are invalidated even if EPT is in use. |
| // Combined mappings for the current VPID are invalidated even if EPT is |
| // not in use. |
| vmcs.Write(VmcsField16::VPID, vpid); |
| invvpid(InvVpid::SINGLE_CONTEXT, vpid, 0); |
| |
| // From Volume 3, Section 28.2: The extended page-table mechanism (EPT) is a |
| // feature that can be used to support the virtualization of physical |
| // memory. When EPT is in use, certain addresses that would normally be |
| // treated as physical addresses (and used to access memory) are instead |
| // treated as guest-physical addresses. Guest-physical addresses are |
| // translated by traversing a set of EPT paging structures to produce |
| // physical addresses that are used to access memory. |
| const auto eptp = ept_pointer_from_pml4(ept_pml4); |
| vmcs.Write(VmcsField64::EPT_POINTER, eptp); |
| |
| // Setup MSR handling. |
| vmcs.Write(VmcsField64::MSR_BITMAPS_ADDRESS, msr_bitmaps_address); |
| |
| // Setup VMCS host state. |
| // |
| // NOTE: Interrupts are disabled while executing this function, so the thread |
| // cannot migrate between CPUs, and it is therefore acceptable to use per-CPU |
| // state. |
| x86_percpu* percpu = x86_get_percpu(); |
| vmcs.Write(VmcsField32::HOST_IA32_SYSENTER_CS, 0); |
| vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_ESP, 0); |
| vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_EIP, 0); |
| vmcs.Write(VmcsField64::HOST_IA32_PAT, read_msr(X86_MSR_IA32_PAT)); |
| vmcs.Write(VmcsField64::HOST_IA32_EFER, read_msr(X86_MSR_IA32_EFER)); |
| vmcs.Write(VmcsFieldXX::HOST_CR0, x86_get_cr0()); |
| vmcs.Write(VmcsFieldXX::HOST_CR4, x86_get_cr4()); |
| vmcs.Write(VmcsField16::HOST_ES_SELECTOR, 0); |
| vmcs.Write(VmcsField16::HOST_CS_SELECTOR, CODE_64_SELECTOR); |
| vmcs.Write(VmcsField16::HOST_SS_SELECTOR, DATA_SELECTOR); |
| vmcs.Write(VmcsField16::HOST_DS_SELECTOR, 0); |
| vmcs.Write(VmcsField16::HOST_FS_SELECTOR, 0); |
| vmcs.Write(VmcsField16::HOST_GS_SELECTOR, 0); |
| vmcs.Write(VmcsField16::HOST_TR_SELECTOR, TSS_SELECTOR(percpu->cpu_num)); |
| vmcs.Write(VmcsFieldXX::HOST_FS_BASE, read_msr(X86_MSR_IA32_FS_BASE)); |
| vmcs.Write(VmcsFieldXX::HOST_GS_BASE, read_msr(X86_MSR_IA32_GS_BASE)); |
| vmcs.Write(VmcsFieldXX::HOST_TR_BASE, reinterpret_cast<uint64_t>(&percpu->default_tss)); |
| vmcs.Write(VmcsFieldXX::HOST_GDTR_BASE, reinterpret_cast<uint64_t>(gdt_get())); |
| vmcs.Write(VmcsFieldXX::HOST_IDTR_BASE, reinterpret_cast<uint64_t>(idt_get_readonly())); |
| vmcs.Write(VmcsFieldXX::HOST_RSP, reinterpret_cast<uint64_t>(vmx_state)); |
| vmcs.Write(VmcsFieldXX::HOST_RIP, reinterpret_cast<uint64_t>(vmx_exit_asm)); |
| |
| // Setup VMCS guest state. |
| uint64_t cr0 = X86_CR0_ET | // Enable extension type |
| X86_CR0_NE | // Enable internal x87 exception handling |
| X86_CR0_WP; // Enable supervisor write protect |
| if (is_base_processor) { |
| // Enable protected mode and paging on the primary VCPU. |
| cr0 |= X86_CR0_PE | // Enable protected mode |
| X86_CR0_PG; // Enable paging |
| } |
| if (cr0_is_invalid(vmcs, cr0)) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| vmcs.Write(VmcsFieldXX::GUEST_CR0, cr0); |
| |
| // Enable FXSAVE, VMX, FSGSBASE, and XSAVE. |
| uint64_t cr4 = X86_CR4_OSFXSR | X86_CR4_VMXE | X86_CR4_FSGSBASE | X86_CR4_OSXSAVE; |
| if (is_base_processor) { |
| // Enable PAE and PGE on the BSP. |
| cr4 |= X86_CR4_PAE | X86_CR4_PGE; |
| } |
| if (cr_is_invalid(cr4, X86_MSR_IA32_VMX_CR4_FIXED0, X86_MSR_IA32_VMX_CR4_FIXED1)) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| vmcs.Write(VmcsFieldXX::GUEST_CR4, cr4); |
| |
| vmcs.Write(VmcsField64::GUEST_IA32_PAT, read_msr(X86_MSR_IA32_PAT)); |
| |
| uint64_t guest_efer = read_msr(X86_MSR_IA32_EFER); |
| if (!is_base_processor) { |
| // Disable LME and LMA on all but the BSP. |
| guest_efer &= ~(X86_EFER_LME | X86_EFER_LMA); |
| } |
| vmcs.Write(VmcsField64::GUEST_IA32_EFER, guest_efer); |
| |
| uint32_t cs_access_rights = |
| kGuestXxAccessRightsDefault | kGuestXxAccessRightsTypeE | kGuestXxAccessRightsTypeCode; |
| if (is_base_processor) { |
| // Ensure that the BSP starts with a 64-bit code segment. |
| cs_access_rights |= kGuestXxAccessRightsL; |
| } |
| vmcs.Write(VmcsField32::GUEST_CS_ACCESS_RIGHTS, cs_access_rights); |
| |
| vmcs.Write(VmcsField32::GUEST_TR_ACCESS_RIGHTS, |
| kGuestTrAccessRightsTssBusy | kGuestXxAccessRightsP); |
| |
| vmcs.Write(VmcsField32::GUEST_SS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault); |
| vmcs.Write(VmcsField32::GUEST_DS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault); |
| vmcs.Write(VmcsField32::GUEST_ES_ACCESS_RIGHTS, kGuestXxAccessRightsDefault); |
| vmcs.Write(VmcsField32::GUEST_FS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault); |
| vmcs.Write(VmcsField32::GUEST_GS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault); |
| |
| vmcs.Write(VmcsField32::GUEST_LDTR_ACCESS_RIGHTS, |
| kGuestXxAccessRightsTypeW | kGuestXxAccessRightsP); |
| |
| if (is_base_processor) { |
| // Use GUEST_RIP to set the entry point on the BSP. |
| vmcs.Write(VmcsFieldXX::GUEST_CS_BASE, 0); |
| vmcs.Write(VmcsField16::GUEST_CS_SELECTOR, 0); |
| vmcs.Write(VmcsFieldXX::GUEST_RIP, entry); |
| } else { |
| // Use CS to set the entry point on APs. |
| vmcs.Write(VmcsFieldXX::GUEST_CS_BASE, entry); |
| vmcs.Write(VmcsField16::GUEST_CS_SELECTOR, static_cast<uint16_t>(entry >> 4)); |
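| // In real mode the segment base tracks selector << 4, so with IP = 0 the AP |
| // begins executing at physical address `entry`, which must be 16-byte aligned. |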
| vmcs.Write(VmcsFieldXX::GUEST_RIP, 0); |
| } |
| vmcs.Write(VmcsField32::GUEST_CS_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_TR_BASE, 0); |
| vmcs.Write(VmcsField16::GUEST_TR_SELECTOR, 0); |
| vmcs.Write(VmcsField32::GUEST_TR_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_DS_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_DS_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_SS_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_SS_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_ES_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_ES_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_FS_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_FS_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_GS_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_GS_LIMIT, 0xffff); |
| vmcs.Write(VmcsField32::GUEST_LDTR_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_GDTR_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_GDTR_LIMIT, 0xffff); |
| vmcs.Write(VmcsFieldXX::GUEST_IDTR_BASE, 0); |
| vmcs.Write(VmcsField32::GUEST_IDTR_LIMIT, 0xffff); |
| |
| // Set all reserved RFLAGS bits to their correct values. |
| vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, X86_FLAGS_RESERVED_ONES); |
| |
| vmcs.Write(VmcsField32::GUEST_ACTIVITY_STATE, 0); |
| vmcs.Write(VmcsField32::GUEST_INTERRUPTIBILITY_STATE, 0); |
| vmcs.Write(VmcsFieldXX::GUEST_PENDING_DEBUG_EXCEPTIONS, 0); |
| |
| // From Volume 3, Section 26.3.1.1: The IA32_SYSENTER_ESP field and the |
| // IA32_SYSENTER_EIP field must each contain a canonical address. |
| vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_ESP, 0); |
| vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_EIP, 0); |
| vmcs.Write(VmcsField32::GUEST_IA32_SYSENTER_CS, 0); |
| |
| vmcs.Write(VmcsFieldXX::GUEST_RSP, 0); |
| |
| // From Volume 3, Section 24.4.2: If the “VMCS shadowing” VM-execution |
| // control is 1, the VMREAD and VMWRITE instructions access the VMCS |
| // referenced by this pointer (see Section 24.10). Otherwise, software |
| // should set this field to FFFFFFFF_FFFFFFFFH to avoid VM-entry |
| // failures (see Section 26.3.1.5). |
| vmcs.Write(VmcsField64::LINK_POINTER, kLinkPointerInvalidate); |
| |
| if (x86_xsave_supported()) { |
| // Save the host XCR0, and set the initial guest XCR0. |
| vmx_state->host_state.xcr0 = x86_xgetbv(0); |
| vmx_state->guest_state.xcr0 = |
| X86_XSAVE_STATE_BIT_X87 | X86_XSAVE_STATE_BIT_SSE | X86_XSAVE_STATE_BIT_AVX; |
| x86_extended_register_init_state_from_bv(extended_register_state, vmx_state->guest_state.xcr0); |
| } |
| |
| return zx::ok(); |
| } |
| |
| // Injects an interrupt into the guest, if there is one pending. |
| zx_status_t local_apic_maybe_interrupt(AutoVmcs* vmcs, LocalApicState* local_apic_state) { |
| // Since hardware generated exceptions are delivered to the guest directly, |
| // the only exceptions we see here are those we generate in the VMM, e.g. GP |
| // faults in vmexit handlers. Therefore we simplify interrupt priority to 1) |
| // NMIs, 2) interrupts, and 3) generated exceptions. See Volume 3, Section |
| // 6.9, Table 6-2. |
| uint32_t vector = X86_INT_COUNT; |
| bool pending = local_apic_state->interrupt_tracker.TryPop(X86_INT_NMI); |
| if (pending) { |
| vector = X86_INT_NMI; |
| } else { |
| // Pop scans vectors from highest to lowest, which will correctly pop |
| // interrupts before exceptions. All vectors <= X86_INT_VIRT except the NMI |
| // vector are exceptions. |
| pending = local_apic_state->interrupt_tracker.Pop(&vector); |
| if (!pending) { |
| return ZX_OK; |
| } |
| // Since Pop reported a pending interrupt, it should have initialized |
| // `vector` to a valid value. |
| DEBUG_ASSERT(vector != X86_INT_COUNT); |
| } |
| |
| // NMI injection is blocked if an NMI is already being serviced (Volume 3, |
| // Section 24.4.2, Table 24-3), and mov ss blocks *all* interrupts (Volume 2 |
| // Section 4.3 MOV-Move instruction). Note that the IF flag does not affect |
| // NMIs (Volume 3, Section 6.8.1). |
| auto can_inject_nmi = [vmcs] { |
| return (vmcs->Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE) & |
| (kInterruptibilityNmiBlocking | kInterruptibilityMovSsBlocking)) == 0; |
| }; |
| // External interrupts can be blocked due to STI, move SS or the IF flag. |
| auto can_inject_external_int = [vmcs] { |
| return (vmcs->Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_IF) && |
| (vmcs->Read(VmcsField32::GUEST_INTERRUPTIBILITY_STATE) & |
| (kInterruptibilityStiBlocking | kInterruptibilityMovSsBlocking)) == 0; |
| }; |
| |
| if (vector > X86_INT_VIRT && vector < X86_INT_PLATFORM_BASE) { |
| dprintf(INFO, "Invalid interrupt vector: %u\n", vector); |
| return ZX_ERR_NOT_SUPPORTED; |
| } else if ((vector >= X86_INT_PLATFORM_BASE && !can_inject_external_int()) || |
| (vector == X86_INT_NMI && !can_inject_nmi())) { |
| local_apic_state->interrupt_tracker.Track(vector); |
| // If interrupts are disabled, we set VM exit on interrupt enable. |
| vmcs->InterruptWindowExiting(true); |
| return ZX_OK; |
| } |
| |
| // If the vector is non-maskable or interrupts are enabled, inject interrupt. |
| vmcs->IssueInterrupt(vector); |
| |
| // Volume 3, Section 6.9: Lower priority exceptions are discarded; lower |
| // priority interrupts are held pending. Discarded exceptions are re-generated |
| // when the interrupt handler returns execution to the point in the program or |
| // task where the exceptions and/or interrupts occurred. |
| local_apic_state->interrupt_tracker.Clear(0, X86_INT_NMI); |
| local_apic_state->interrupt_tracker.Clear(X86_INT_NMI + 1, X86_INT_VIRT + 1); |
| |
| return ZX_OK; |
| } |
| |
| void interrupt_cpu(Thread* thread, cpu_num_t last_cpu) TA_REQ(ThreadLock::Get()) { |
| // Check if the VCPU is running and whether to send an IPI. We hold the thread |
| // lock to guard against thread migration between CPUs during the check. |
| // |
| // NOTE: `last_cpu` may be currently set to `INVALID_CPU` due to thread |
| // migration between CPUs. |
| if (thread != nullptr && thread->state() == THREAD_RUNNING && last_cpu != INVALID_CPU) { |
| mp_interrupt(MP_IPI_TARGET_MASK, cpu_num_to_mask(last_cpu)); |
| } |
| } |
| |
| } // namespace |
| |
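| // Loads the VMCS at `vmcs_address` on this CPU. Interrupts remain disabled and |
| // blocking disallowed for the lifetime of this object. |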
| AutoVmcs::AutoVmcs(paddr_t vmcs_address, bool clear) : vmcs_address_(vmcs_address) { |
| DEBUG_ASSERT(!arch_ints_disabled()); |
| int_state_ = arch_interrupt_save(); |
| arch_set_blocking_disallowed(true); |
| if (clear) { |
| vmclear(vmcs_address_); |
| } |
| vmptrld(vmcs_address_); |
| } |
| |
| AutoVmcs::~AutoVmcs() { |
| DEBUG_ASSERT(arch_ints_disabled()); |
| if (vmcs_address_ != 0) { |
| arch_set_blocking_disallowed(false); |
| } |
| arch_interrupt_restore(int_state_); |
| } |
| |
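| // Relinquishes this object's claim on the loaded VMCS; further reads and |
| // writes through it are no longer permitted. |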
| void AutoVmcs::Invalidate() { |
| if (vmcs_address_ != 0) { |
| vmcs_address_ = 0; |
| arch_set_blocking_disallowed(false); |
| } |
| } |
| |
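| // Enables or disables VM exits when the guest becomes ready to receive interrupts. |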
| void AutoVmcs::InterruptWindowExiting(bool enable) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| uint32_t controls = Read(VmcsField32::PROCBASED_CTLS); |
| if (enable) { |
| controls |= kProcbasedCtlsIntWindowExiting; |
| } else { |
| controls &= ~kProcbasedCtlsIntWindowExiting; |
| } |
| Write(VmcsField32::PROCBASED_CTLS, controls); |
| } |
| |
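| // Queues an interrupt or exception with the given vector for injection on the next VM entry. |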
| void AutoVmcs::IssueInterrupt(uint32_t vector) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| uint32_t interrupt_info = kInterruptInfoValid | (vector & UINT8_MAX); |
| if (vector == X86_INT_BREAKPOINT || vector == X86_INT_OVERFLOW) { |
| // From Volume 3, Section 24.8.3. A VMM should use type hardware exception for all |
| // exceptions other than breakpoints and overflows, which should be software exceptions. |
| interrupt_info |= kInterruptTypeSoftwareException; |
| } else if (vector == X86_INT_NMI) { |
| interrupt_info |= kInterruptTypeNmi; |
| } else if (vector <= X86_INT_VIRT) { |
| // From Volume 3, Section 6.15. All other vectors from 0 to X86_INT_VIRT are exceptions. |
| interrupt_info |= kInterruptTypeHardwareException; |
| } |
| if (has_error_code(vector)) { |
| interrupt_info |= kInterruptInfoDeliverErrorCode; |
| Write(VmcsField32::ENTRY_EXCEPTION_ERROR_CODE, 0); |
| } |
| |
| DEBUG_ASSERT((Read(VmcsField32::ENTRY_INTERRUPTION_INFORMATION) & kInterruptInfoValid) == 0); |
| Write(VmcsField32::ENTRY_INTERRUPTION_INFORMATION, interrupt_info); |
| } |
| |
| uint16_t AutoVmcs::Read(VmcsField16 field) const { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| return static_cast<uint16_t>(vmread(static_cast<uint64_t>(field))); |
| } |
| |
| uint32_t AutoVmcs::Read(VmcsField32 field) const { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| return static_cast<uint32_t>(vmread(static_cast<uint64_t>(field))); |
| } |
| |
| uint64_t AutoVmcs::Read(VmcsField64 field) const { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| return vmread(static_cast<uint64_t>(field)); |
| } |
| |
| uint64_t AutoVmcs::Read(VmcsFieldXX field) const { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| return vmread(static_cast<uint64_t>(field)); |
| } |
| |
| void AutoVmcs::Write(VmcsField16 field, uint16_t val) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| vmwrite(static_cast<uint64_t>(field), val); |
| } |
| |
| void AutoVmcs::Write(VmcsField32 field, uint32_t val) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| vmwrite(static_cast<uint64_t>(field), val); |
| } |
| |
| void AutoVmcs::Write(VmcsField64 field, uint64_t val) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| vmwrite(static_cast<uint64_t>(field), val); |
| } |
| |
| void AutoVmcs::Write(VmcsFieldXX field, uint64_t val) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| vmwrite(static_cast<uint64_t>(field), val); |
| } |
| |
| zx::result<> AutoVmcs::SetControl(VmcsField32 controls, uint64_t true_msr, uint64_t old_msr, |
| uint32_t set, uint32_t clear) { |
| DEBUG_ASSERT(vmcs_address_ != 0); |
| uint32_t allowed_0 = static_cast<uint32_t>(BITS(true_msr, 31, 0)); |
| uint32_t allowed_1 = static_cast<uint32_t>(BITS_SHIFT(true_msr, 63, 32)); |
| if ((allowed_1 & set) != set) { |
| dprintf(INFO, "Failed to set VMCS controls %#x, %#x != %#x\n", static_cast<uint>(controls), |
| allowed_1, set); |
| return zx::error(ZX_ERR_NOT_SUPPORTED); |
| } |
| if ((~allowed_0 & clear) != clear) { |
| dprintf(INFO, "Failed to clear VMCS controls %#x, %#x != %#x\n", static_cast<uint>(controls), |
| ~allowed_0, clear); |
| return zx::error(ZX_ERR_NOT_SUPPORTED); |
| } |
| if ((set & clear) != 0) { |
| dprintf(INFO, "Attempted to set and clear the same VMCS controls %#x\n", |
| static_cast<uint>(controls)); |
| return zx::error(ZX_ERR_INVALID_ARGS); |
| } |
| |
| // See Volume 3, Section 31.5.1, Algorithm 3, Part C. If the control can be |
| // either 0 or 1 (flexible), and the control is unknown, then refer to the |
| // old MSR to find the default value. |
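| // |
| // For example, with hypothetical MSR values: if true_msr = 0x0000'0013'0000'0001, |
| // then allowed_0 = 0x1 (bit 0 must be 1) and allowed_1 = 0x13 (only bits 0, 1, |
| // and 4 may be 1). Bits 1 and 4 are flexible; if neither `set` nor `clear` |
| // names them, they default to their values in `old_msr`. |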
| uint32_t flexible = allowed_0 ^ allowed_1; |
| uint32_t unknown = flexible & ~(set | clear); |
| uint32_t defaults = unknown & BITS(old_msr, 31, 0); |
| Write(controls, allowed_0 | defaults | set); |
| return zx::ok(); |
| } |
| |
| bool cr0_is_invalid(AutoVmcs& vmcs, uint64_t cr0_value) { |
| uint64_t check_value = cr0_value; |
| // From Volume 3, Section 26.3.1.1: PE and PG bits of CR0 are not checked when unrestricted |
| // guest is enabled. Set both here to avoid clashing with X86_MSR_IA32_VMX_CR0_FIXED0. |
| if (vmcs.Read(VmcsField32::PROCBASED_CTLS2) & kProcbasedCtls2UnrestrictedGuest) { |
| check_value |= X86_CR0_PE | X86_CR0_PG; |
| } |
| return cr_is_invalid(check_value, X86_MSR_IA32_VMX_CR0_FIXED0, X86_MSR_IA32_VMX_CR0_FIXED1); |
| } |
| |
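| // Creates a VCPU of type `V` for `guest`, bound to the calling thread, with |
| // guest execution starting at `entry`. |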
| // static |
| template <typename V, typename G> |
| zx::result<ktl::unique_ptr<V>> Vcpu::Create(G& guest, uint16_t vpid, zx_vaddr_t entry) { |
| if (fbl::RefPtr<VmAddressRegion> root_vmar = guest.RootVmar(); |
| entry < root_vmar->base() || entry >= root_vmar->base() + root_vmar->size()) { |
| return zx::error(ZX_ERR_INVALID_ARGS); |
| } |
| |
| Thread* thread = Thread::Current::Get(); |
| if (thread->vcpu()) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| fbl::AllocChecker ac; |
| ktl::unique_ptr<V> vcpu(new (&ac) V(guest, vpid, thread)); |
| if (!ac.check()) { |
| return zx::error(ZX_ERR_NO_MEMORY); |
| } |
| |
| VmxInfo vmx_info; |
| auto result = vcpu->vmcs_page_.Alloc(vmx_info, 0); |
| if (result.is_error()) { |
| return result.take_error(); |
| } |
| |
| VmxRegion* region = vcpu->vmcs_page_.template VirtualAddress<VmxRegion>(); |
| region->revision_id = vmx_info.revision_id; |
| |
| zx_paddr_t ept_pml4 = guest.PhysicalAspace().arch_aspace().arch_table_phys(); |
| zx_paddr_t vmcs_address = vcpu->vmcs_page_.PhysicalAddress(); |
| // We create the `AutoVmcs` object here so that interrupts are disabled from |
| // `vmcs_init` until `SetMigrateFn`, ensuring that we do not migrate CPUs |
| // while setting up the VCPU. |
| AutoVmcs vmcs(vmcs_address, /*clear=*/true); |
| result = vmcs_init(vmcs, V::kConfig, vpid, entry, guest.MsrBitmapsAddress(), ept_pml4, |
| &vcpu->vmx_state_, vcpu->extended_register_state_); |
| if (result.is_error()) { |
| return result.take_error(); |
| } |
| |
| { |
| Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG}; |
| // Only set the thread migrate function after we have initialised the VMCS. |
| // Otherwise, the migrate function may interact with an uninitialised VMCS. |
| // |
| // We have to disable thread safety analysis because it's not smart enough to |
| // realize that SetMigrateFn will always be called with the ThreadLock. |
| thread->SetMigrateFnLocked([vcpu = vcpu.get()](Thread* thread, auto stage) |
| TA_NO_THREAD_SAFETY_ANALYSIS { vcpu->Migrate(thread, stage); }); |
| thread->SetContextSwitchFnLocked([vcpu = vcpu.get()]() { |
| if (vcpu->entered_.load()) { |
| // `arch_context_switch()` saves and restores GS, so we can skip it. |
| vcpu->ContextSwitch(/*include_gs=*/false); |
| } |
| }); |
| } |
| |
| return zx::ok(ktl::move(vcpu)); |
| } |
| |
| Vcpu::Vcpu(Guest& guest, uint16_t vpid, Thread* thread) |
| : guest_(guest), |
| vpid_(vpid), |
| last_cpu_(thread->LastCpu()), |
| thread_(thread), |
| vmx_state_(/* zero-init */), |
| msr_state_(/* zero-init */) { |
| thread->set_vcpu(true); |
| } |
| |
| Vcpu::~Vcpu() { |
| cpu_num_t cpu; |
| { |
| // Taking the ThreadLock guarantees that thread_ isn't going to be freed |
| // while we access it. |
| Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG}; |
| Thread* thread = thread_.load(); |
| if (thread != nullptr) { |
| thread->set_vcpu(false); |
| // Clear the migration function, so that |thread_| does not reference |
| // |this| after destruction of the VCPU. |
| thread->SetMigrateFnLocked(nullptr); |
| thread->SetContextSwitchFnLocked(nullptr); |
| } |
| cpu = last_cpu_; |
| } |
| |
| if (vmcs_page_.IsAllocated() && cpu != INVALID_CPU) { |
| // Clear VMCS state from the CPU. |
| // |
| // The destructor may be called from a different thread, therefore we must |
| // IPI the CPU that last ran the thread. |
| paddr_t paddr = vmcs_page_.PhysicalAddress(); |
| mp_sync_exec( |
| MP_IPI_TARGET_MASK, cpu_num_to_mask(cpu), |
| [](void* paddr) { vmclear(reinterpret_cast<paddr_t>(paddr)); }, |
| reinterpret_cast<void*>(paddr)); |
| } |
| } |
| |
| void Vcpu::Migrate(Thread* thread, Thread::MigrateStage stage) { |
| // Volume 3, Section 31.8.2: An MP-aware VMM is free to assign any logical |
| // processor to a VM. But for performance considerations, moving a guest VMCS |
| // to another logical processor is slower than resuming that guest VMCS on the |
| // same logical processor. Certain VMX performance features (such as caching |
| // of portions of the VMCS in the processor) are optimized for a guest VMCS |
| // that runs on the same logical processor. |
| // |
| // If the VMCS regions are identical (same revision ID) the following sequence |
| // can be used to move or copy the VMCS from one logical processor to another: |
| switch (stage) { |
| // * Perform a VMCLEAR operation on the source logical processor. This |
| // ensures that all VMCS data that may be cached by the processor are |
| // flushed to memory. |
| case Thread::MigrateStage::Before: { |
| vmclear(vmcs_page_.PhysicalAddress()); |
| // After VMCLEAR, clear `last_cpu_` to indicate that this VCPU is not |
| // presently running and that its state is not loaded on any CPU. |
| last_cpu_ = INVALID_CPU; |
| break; |
| } |
| // * Copy the VMCS region from one memory location to another location. This |
| // is an optional step assuming the VMM wishes to relocate the VMCS or |
| // move the VMCS to another system. |
| // * Perform a VMPTRLD of the physical address of VMCS region on the |
| // destination processor to establish its current VMCS pointer. |
| case Thread::MigrateStage::After: { |
| // Volume 3, Section 31.8.2: To migrate a VMCS to another logical |
| // processor, a VMM must use the sequence of VMCLEAR, VMPTRLD and |
| // VMLAUNCH. |
| // |
| // We set `resume` to false so that `vmx_enter` will call VMLAUNCH when |
| // entering the guest, instead of VMRESUME. |
| vmx_state_.resume = false; |
| |
| // Before performing the VMPTRLD, update the `last_cpu_` for |
| // `Vcpu::Interrupt()` and `vmcs_page_` state tracking. It is assumed that |
| // the `Thread::MigrateStage::Before` stage already happened and that a |
| // VMCLEAR has been performed on `last_cpu_`, hence the previous value of |
| // `last_cpu_` must be `INVALID_CPU`. |
| DEBUG_ASSERT(last_cpu_ == INVALID_CPU); |
| last_cpu_ = thread->LastCpuLocked(); |
| |
| // Load the VMCS on the destination processor. |
| vmptrld(vmcs_page_.PhysicalAddress()); |
| |
| // Update the VMCS with the per-CPU variables of the destination |
| // processor. |
| x86_percpu* percpu = x86_get_percpu(); |
| vmwrite(static_cast<uint64_t>(VmcsField16::HOST_TR_SELECTOR), TSS_SELECTOR(percpu->cpu_num)); |
| vmwrite(static_cast<uint64_t>(VmcsFieldXX::HOST_FS_BASE), thread->arch().fs_base); |
| vmwrite(static_cast<uint64_t>(VmcsFieldXX::HOST_GS_BASE), read_msr(X86_MSR_IA32_GS_BASE)); |
| vmwrite(static_cast<uint64_t>(VmcsFieldXX::HOST_TR_BASE), |
| reinterpret_cast<uint64_t>(&percpu->default_tss)); |
| |
| // Invalidate TLB mappings for the VPID. |
| invvpid(InvVpid::SINGLE_CONTEXT, vpid_, 0); |
| break; |
| } |
| case Thread::MigrateStage::Exiting: { |
| // The `thread_` is exiting and so we must clear our reference to it. |
| thread_.store(nullptr); |
| break; |
| } |
| } |
| } |
| |
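| // Swaps the host and guest values of the fast-syscall MSRs (STAR, LSTAR, |
| // FMASK) and TSC_AUX, which VMX does not save or restore automatically, and |
| // optionally KERNEL_GS_BASE. |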
| void Vcpu::ContextSwitch(bool include_gs) { |
| uint64_t star = read_msr(X86_MSR_IA32_STAR); |
| uint64_t lstar = read_msr(X86_MSR_IA32_LSTAR); |
| uint64_t fmask = read_msr(X86_MSR_IA32_FMASK); |
| uint64_t tsc_aux = read_msr(X86_MSR_IA32_TSC_AUX); |
| |
| write_msr(X86_MSR_IA32_STAR, msr_state_.star); |
| write_msr(X86_MSR_IA32_LSTAR, msr_state_.lstar); |
| write_msr(X86_MSR_IA32_FMASK, msr_state_.fmask); |
| write_msr(X86_MSR_IA32_TSC_AUX, msr_state_.tsc_aux); |
| |
| msr_state_.star = star; |
| msr_state_.lstar = lstar; |
| msr_state_.fmask = fmask; |
| msr_state_.tsc_aux = tsc_aux; |
| |
| if (include_gs) { |
| uint64_t kernel_gs_base = read_msr(X86_MSR_IA32_KERNEL_GS_BASE); |
| write_msr(X86_MSR_IA32_KERNEL_GS_BASE, msr_state_.kernel_gs_base); |
| msr_state_.kernel_gs_base = kernel_gs_base; |
| } |
| } |
| |
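| // Saves the host extended register state and XCR0, and loads the guest's. |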
| void Vcpu::LoadExtendedRegisters(AutoVmcs& vmcs) { |
| arch_thread& thread = Thread::Current::Get()->arch(); |
| bool save_host = x86_xsave_supported(); |
| bool load_guest = vmcs.Read(VmcsFieldXX::GUEST_CR4) & X86_CR4_OSXSAVE; |
| swap_extended_registers(thread.extended_register_buffer, vmx_state_.host_state.xcr0, save_host, |
| extended_register_state_, vmx_state_.guest_state.xcr0, load_guest); |
| } |
| |
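| // Saves the guest extended register state and XCR0, and restores the host's. |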
| void Vcpu::SaveExtendedRegisters(AutoVmcs& vmcs) { |
| arch_thread& thread = Thread::Current::Get()->arch(); |
| bool save_guest = vmcs.Read(VmcsFieldXX::GUEST_CR4) & X86_CR4_OSXSAVE; |
| bool load_host = x86_xsave_supported(); |
| swap_extended_registers(extended_register_state_, vmx_state_.guest_state.xcr0, save_guest, |
| thread.extended_register_buffer, vmx_state_.host_state.xcr0, load_host); |
| } |
| |
| zx::result<> vmx_enter(VmxState* vmx_state) { |
| // Perform the low-level vmlaunch or vmresume, entering the guest, |
| // and returning when the guest exits. |
| zx_status_t status = vmx_enter_asm(vmx_state); |
| |
| DEBUG_ASSERT(arch_ints_disabled()); |
| |
| // Reload the task segment in order to restore its limit. VMX always |
| // restores it with a limit of 0x67, which excludes the IO bitmap. |
| seg_sel_t selector = TSS_SELECTOR(arch_curr_cpu_num()); |
| x86_clear_tss_busy(selector); |
| x86_ltr(selector); |
| |
| return zx::make_result(status); |
| } |
| |
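| // Repeatedly enters the guest, invoking `pre_enter` before each entry and |
| // `post_exit` after each exit, until an exit must be handled by user space |
| // (signalled by ZX_ERR_NEXT and returned as success, with `packet` populated) |
| // or an error occurs. |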
| template <typename PreEnterFn, typename PostExitFn> |
| zx::result<> Vcpu::EnterInternal(PreEnterFn pre_enter, PostExitFn post_exit, |
| zx_port_packet_t& packet) { |
| Thread* current_thread = Thread::Current::Get(); |
| if (current_thread != thread_) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| bool extended_registers_loaded = false; |
| auto defer = fit::defer([this, &extended_registers_loaded] { |
| if (extended_registers_loaded) { |
| AutoVmcs vmcs(vmcs_page_.PhysicalAddress()); |
| SaveExtendedRegisters(vmcs); |
| ContextSwitch(/*include_gs=*/true); |
| entered_.store(false); |
| } |
| // Spectre V2: Ensure that code executed in the VM guest cannot influence |
| // indirect branch prediction in the host. |
| // |
| // TODO(https://fxbug.dev/42108888): We may be able to avoid the IBPB here; the kernel |
| // is either built with a retpoline or has Enhanced IBRS enabled. We |
| // currently execute an IBPB on context-switch to a new aspace. The IBPB is |
| // currently only here to protect hypervisor user threads. |
| if (!gBootOptions->x86_disable_spec_mitigations && x86_cpu_has_ibpb()) { |
| arch::IssueIbpb(arch::BootCpuidIo{}, hwreg::X86MsrIo{}); |
| } |
| }); |
| |
| zx::result<> result = zx::ok(); |
| do { |
| // If the thread was killed or suspended, then we should exit with an error. |
| if (zx_status_t status = current_thread->CheckKillOrSuspendSignal(); status != ZX_OK) { |
| return zx::error(status); |
| } |
| AutoVmcs vmcs(vmcs_page_.PhysicalAddress()); |
| |
| // We check whether a kick was requested before entering the guest so that: |
| // 1. When we enter the syscall, we can return immediately without entering |
| // the guest. |
| // 2. If we have already exited the guest to handle a packet, it allows us |
| // to return and gives user-space a chance to handle that packet, without |
| // the request to kick interfering with the packet in-flight. |
| // |
| // We also do this after we have disabled interrupts: if a kick interrupt |
| // fired before we disabled interrupts, we still observe the kick request |
| // here, even though the interrupt itself was lost. If an interrupt fires |
| // after we have disabled interrupts, then when we enter the guest we will |
| // exit due to that interrupt and run this check again. |
| if (kicked_.exchange(false)) { |
| return zx::error(ZX_ERR_CANCELED); |
| } |
| |
| if (result = pre_enter(vmcs); result.is_error()) { |
| return result; |
| } |
| |
| if (!extended_registers_loaded) { |
| entered_.store(true); |
| ContextSwitch(/*include_gs=*/true); |
| LoadExtendedRegisters(vmcs); |
| extended_registers_loaded = true; |
| } |
| |
| if (x86_cpu_should_l1d_flush_on_vmentry()) { |
| // L1TF: Flush L1D$ before entering vCPU. If the CPU is affected by MDS, |
| // also flush microarchitectural buffers. |
| write_msr(X86_MSR_IA32_FLUSH_CMD, 1); |
| } else if (x86_cpu_should_md_clear_on_user_return()) { |
| // MDS: If the processor is not affected by L1TF but is affected by MDS or |
| // TAA, flush microarchitectural buffers. |
| mds_buff_overwrite(); |
| } |
| |
| KTRACE_DURATION_BEGIN("kernel:vcpu", "vcpu"); |
| |
| GUEST_STATS_INC(vm_entries); |
| result = vmx_enter(&vmx_state_); |
| GUEST_STATS_INC(vm_exits); |
| |
| if (!gBootOptions->x86_disable_spec_mitigations) { |
| // Spectre V2: Ensure that code executed in the VM guest cannot influence |
| // return address prediction in the host. |
| x86_ras_fill(); |
| } |
| |
| if (result.is_ok()) { |
| vmx_state_.resume = true; |
| result = post_exit(vmcs, packet); |
| } else { |
| ktrace_vcpu_exit(VCPU_FAILURE, vmcs.Read(VmcsFieldXX::GUEST_RIP)); |
| uint64_t error = vmcs.Read(VmcsField32::INSTRUCTION_ERROR); |
| dprintf(INFO, "VCPU enter failed: Instruction error %lu\n", error); |
| } |
| } while (result.is_ok()); |
| return result.status_value() == ZX_ERR_NEXT ? zx::ok() : result; |
| } |
| |
| zx::result<> Vcpu::ReadState(zx_vcpu_state_t& vcpu_state) { |
| if (Thread::Current::Get() != thread_) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| register_copy(vcpu_state, vmx_state_.guest_state); |
| AutoVmcs vmcs(vmcs_page_.PhysicalAddress()); |
| vcpu_state.rsp = vmcs.Read(VmcsFieldXX::GUEST_RSP); |
| vcpu_state.rflags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_USER; |
| return zx::ok(); |
| } |
| |
| zx::result<> Vcpu::WriteState(const zx_vcpu_state_t& vcpu_state) { |
| if (Thread::Current::Get() != thread_) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| register_copy(vmx_state_.guest_state, vcpu_state); |
| AutoVmcs vmcs(vmcs_page_.PhysicalAddress()); |
| vmcs.Write(VmcsFieldXX::GUEST_RSP, vcpu_state.rsp); |
| if (vcpu_state.rflags & X86_FLAGS_RESERVED_ONES) { |
| const uint64_t rflags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS); |
| const uint64_t user_flags = (rflags & ~X86_FLAGS_USER) | (vcpu_state.rflags & X86_FLAGS_USER); |
| vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, user_flags); |
| } |
| return zx::ok(); |
| } |
| |
| void Vcpu::GetInfo(zx_info_vcpu_t* info) { |
| info->flags = 0; |
| if (kicked_.load()) { |
| info->flags |= ZX_INFO_VCPU_FLAG_KICKED; |
| } |
| } |
| |
| // static |
| zx::result<ktl::unique_ptr<Vcpu>> NormalVcpu::Create(NormalGuest& guest, zx_vaddr_t entry) { |
| auto vpid = guest.TryAllocVpid(); |
| if (vpid.is_error()) { |
| return vpid.take_error(); |
| } |
| auto vcpu = Vcpu::Create<NormalVcpu>(guest, *vpid, entry); |
| if (vcpu.is_error()) { |
| auto result = guest.FreeVpid(*vpid); |
| DEBUG_ASSERT(result.is_ok()); |
| return vcpu.take_error(); |
| } |
| // Setup PV clock state. |
| vcpu->pv_clock_state_.is_stable = x86_hypervisor_has_pv_clock() |
| ? pv_clock_is_stable() |
| : x86_feature_test(X86_FEATURE_INVAR_TSC); |
| AutoVmcs vmcs(vcpu->vmcs_page_.PhysicalAddress()); |
| // Enable use of PAUSE-loop exiting if available. |
| auto result = |
| vmcs.SetControl(VmcsField32::PROCBASED_CTLS2, read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2), |
| vmcs.Read(VmcsField32::PROCBASED_CTLS2), kProcbasedCtls2PauseLoopExiting, 0); |
| if (result.is_ok()) { |
| // From Volume 3, Section 25.1.3: The processor determines the amount of |
| // time between this execution of PAUSE and the previous execution of PAUSE |
| // at CPL 0. If this amount of time exceeds the value of the VM-execution |
| // control field PLE_Gap, the processor considers this execution to be the |
| // first execution of PAUSE in a loop. (It also does so for the first |
| // execution of PAUSE at CPL 0 after VM entry.) |
| // |
| // Otherwise, the processor determines the amount of time since the most |
| // recent execution of PAUSE that was considered to be the first in a loop. |
| // If this amount of time exceeds the value of the VM-execution control |
| // field PLE_Window, a VM exit occurs. |
| // |
| // For purposes of these computations, time is measured based on a counter |
| // that runs at the same rate as the timestamp counter (TSC). |
| // |
| // NOTE: These values are based on KVM, which was based on empirical |
| // analysis. |
| vmcs.Write(VmcsField32::PLE_GAP, 1u << 7); |
| vmcs.Write(VmcsField32::PLE_WINDOW, 1u << 12); |
| } |
| // From Volume 3, Section 27.5.1: The following bits are not modified: For |
| // CR0, ET, CD, NW; [the reserved bits], and any bits that are fixed in VMX |
| // operation. |
| // |
| // Any bit that is not restored must be masked, or the guest will be able to |
| // affect the host's cr0. However, we do not need to mask: |
| // * The reserved bits, which will generate GP faults; |
| // * ET, which is fixed to 1 (Volume 3 Section 2.5); |
| // * The bits that are fixed in VMX operation aside from PE and PG for |
| // unrestricted guests, which will generate GP faults (Volume 3 Section |
| // 25.3); |
| // |
| // Additionally, NE is fixed in VMX operation but some guests will attempt to |
| // clear it without handling the GP fault. So it should also be masked. |
| vmcs.Write(VmcsFieldXX::CR0_GUEST_HOST_MASK, X86_CR0_ET | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD); |
| |
| // From Volume 3, Section 9.1.1: Following power-up, the state of control |
| // register CR0 is 60000010H (CD and ET are set). |
| vmcs.Write(VmcsFieldXX::CR0_READ_SHADOW, X86_CR0_ET); |
| |
| // Mask access to CR4. |
| vmcs.Write(VmcsFieldXX::CR4_GUEST_HOST_MASK, X86_CR4_VMXE); |
| vmcs.Write(VmcsFieldXX::CR4_READ_SHADOW, 0); |
| |
| // Set host and guest CR3. |
| vmcs.Write(VmcsFieldXX::HOST_CR3, x86_get_cr3()); |
| vmcs.Write(VmcsFieldXX::GUEST_CR3, 0); |
| // Do not VM exit on any exception. |
| vmcs.Write(VmcsField32::EXCEPTION_BITMAP, 0); |
| return zx::ok(ktl::move(*vcpu)); |
| } |
| |
| NormalVcpu::NormalVcpu(NormalGuest& guest, uint16_t vpid, Thread* thread) |
| : Vcpu(guest, vpid, thread) {} |
| |
| NormalVcpu::~NormalVcpu() { |
| local_apic_state_.timer.Cancel(); |
| auto result = static_cast<NormalGuest&>(guest_).FreeVpid(vpid_); |
| DEBUG_ASSERT(result.is_ok()); |
| } |
| |
| zx::result<> NormalVcpu::Enter(zx_port_packet_t& packet) { |
| auto pre_enter = [this](AutoVmcs& vmcs) -> zx::result<> { |
| zx_status_t status = local_apic_maybe_interrupt(&vmcs, &local_apic_state_); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| // Updates guest system time if the guest subscribed to updates. |
| auto& guest = static_cast<NormalGuest&>(guest_); |
| pv_clock_update_system_time(&pv_clock_state_, &guest.PhysicalAspace()); |
| return zx::ok(); |
| }; |
| auto post_exit = [this](AutoVmcs& vmcs, zx_port_packet_t& packet) -> zx::result<> { |
| auto& guest = static_cast<NormalGuest&>(guest_); |
| return vmexit_handler_normal(vmcs, vmx_state_.guest_state, local_apic_state_, pv_clock_state_, |
| guest.PhysicalAspace(), guest.Traps(), packet); |
| }; |
| return EnterInternal(ktl::move(pre_enter), ktl::move(post_exit), packet); |
| } |
| |
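| // Forces the VCPU back to user space; a kicked VCPU leaves its run loop with ZX_ERR_CANCELED. |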
| void NormalVcpu::Kick() { |
| kicked_.store(true); |
| // Cancel any pending or upcoming wait-for-interrupts. |
| local_apic_state_.interrupt_tracker.Cancel(); |
| |
| Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG}; |
| interrupt_cpu(thread_.load(), last_cpu_); |
| } |
| |
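| // Delivers the given interrupt vector to the VCPU's local APIC, sending an IPI |
| // if the VCPU is currently running. |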
| void NormalVcpu::Interrupt(uint32_t vector) { |
| local_apic_state_.interrupt_tracker.Interrupt(vector); |
| |
| Guard<MonitoredSpinLock, IrqSave> guard{ThreadLock::Get(), SOURCE_TAG}; |
| interrupt_cpu(thread_.load(), last_cpu_); |
| } |
| |
| zx::result<> NormalVcpu::WriteState(const zx_vcpu_io_t& io_state) { |
| if (Thread::Current::Get() != thread_) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| if ((io_state.access_size != 1) && (io_state.access_size != 2) && (io_state.access_size != 4)) { |
| return zx::error(ZX_ERR_INVALID_ARGS); |
| } |
| static_assert(sizeof(vmx_state_.guest_state.rax) >= 4); |
| memcpy(&vmx_state_.guest_state.rax, io_state.data, io_state.access_size); |
| return zx::ok(); |
| } |