[x86][hypervisor] Clear tracked exceptions when injecting an interrupt

Also, prioritise NMIs above other interrupts and exceptions.

This was causing flakiness in the guest integration tests on release
builds through the following order of events:
1. Linux guest begins a safe rdmsr
2. Hypervisor correctly responds by tracking a GP fault
3. On resume, a timer interrupt was prioritised before the exception
4. The guest exits again, then resumes injecting the GP fault
5. Guest panics because it receives an exception during the timer
interrupt handler.

This is likely also the cause for similar debian_guest failures in the
past.

There are two additional changes to follow: 1) Tidy up the use of
X86_INT_PLATFORM_BASE to use X86_INT_MAX_INTEL_DEFINED instead, and
handling undefined vectors. 2) Improved testing (See MAC-225).

MAC-223 #done

TEST=Run guest_integration_tests in a loop with a release build

Change-Id: Ide64857f5442b023ac49c6369a2de8aaa305e2fe
diff --git a/kernel/arch/x86/hypervisor/vcpu.cpp b/kernel/arch/x86/hypervisor/vcpu.cpp
index 8888f27..e997846 100644
--- a/kernel/arch/x86/hypervisor/vcpu.cpp
+++ b/kernel/arch/x86/hypervisor/vcpu.cpp
@@ -26,6 +26,7 @@
 
 static constexpr uint32_t kInterruptInfoValid = 1u << 31;
 static constexpr uint32_t kInterruptInfoDeliverErrorCode = 1u << 11;
+static constexpr uint32_t kInterruptTypeNmi = 2u << 8;
 static constexpr uint32_t kInterruptTypeHardwareException = 3u << 8;
 static constexpr uint32_t kInterruptTypeSoftwareException = 6u << 8;
 static constexpr uint16_t kBaseProcessorVpid = 1;
@@ -145,6 +146,8 @@
         // From Volume 3, Section 24.8.3. A VMM should use type hardware exception for all
         // exceptions other than breakpoints and overflows, which should be software exceptions.
         interrupt_info |= kInterruptTypeSoftwareException;
+    } else if (vector == X86_INT_NMI) {
+        interrupt_info |= kInterruptTypeNmi;
     } else if (vector < X86_INT_PLATFORM_BASE) {
         // From Volume 3, Section 6.15. Vectors from 0 to 32 (X86_INT_PLATFORM_BASE) are exceptions.
         interrupt_info |= kInterruptTypeHardwareException;
@@ -701,20 +704,43 @@
 
 // Injects an interrupt into the guest, if there is one pending.
 static zx_status_t local_apic_maybe_interrupt(AutoVmcs* vmcs, LocalApicState* local_apic_state) {
+    // Since hardware generated exceptions are delivered to the guest directly, the only exceptions
+    // we see here are those we generate in the VMM, e.g. GP faults in vmexit handlers. Therefore
+    // we simplify interrupt priority to 1) NMIs, 2) interrupts, and 3) generated exceptions. See
+    // Intel Volume 3, Section 6.9, Table 6-2.
     uint32_t vector;
-    hypervisor::InterruptType type = local_apic_state->interrupt_tracker.Pop(&vector);
-    if (type == hypervisor::InterruptType::INACTIVE) {
-        return ZX_OK;
+    hypervisor::InterruptType type = local_apic_state->interrupt_tracker.TryPop(X86_INT_NMI);
+    if (type != hypervisor::InterruptType::INACTIVE) {
+        vector = X86_INT_NMI;
+    } else {
+        // Pop scans vectors from highest to lowest, which will correctly pop interrupts before
+        // exceptions. All vectors < X86_INT_PLATFORM_BASE except the NMI vector are exceptions.
+        type = local_apic_state->interrupt_tracker.Pop(&vector);
+        if (type == hypervisor::InterruptType::INACTIVE) {
+            return ZX_OK;
+        }
     }
 
-    if (vector < X86_INT_PLATFORM_BASE || vmcs->Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_IF) {
-        // If the vector is non-maskable or interrupts are enabled, we inject an interrupt.
-        vmcs->IssueInterrupt(vector);
-    } else {
+    // Intel Volume 3, Section 6.8.1: The IF flag does not affect non-maskable interrupts (NMIs),
+    // [...] nor does it affect processor generated exceptions.
+    if (vector >= X86_INT_PLATFORM_BASE &&
+        !(vmcs->Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_IF)) {
         local_apic_state->interrupt_tracker.Track(vector, type);
         // If interrupts are disabled, we set VM exit on interrupt enable.
         vmcs->InterruptWindowExiting(true);
+        return ZX_OK;
     }
+
+    // If the vector is non-maskable or interrupts are enabled, we inject an interrupt.
+    vmcs->IssueInterrupt(vector);
+
+    // Intel Volume 3, Section 6.9: Lower priority exceptions are discarded; lower priority
+    // interrupts are held pending. Discarded exceptions are re-generated when the interrupt handler
+    // returns execution to the point in the program or task where the exceptions and/or interrupts
+    // occurred.
+    local_apic_state->interrupt_tracker.Clear(0, X86_INT_NMI);
+    local_apic_state->interrupt_tracker.Clear(X86_INT_NMI + 1, X86_INT_PLATFORM_BASE);
+
     return ZX_OK;
 }
 
diff --git a/kernel/arch/x86/hypervisor/vmexit.cpp b/kernel/arch/x86/hypervisor/vmexit.cpp
index 89ce9b0..1b26bcf 100644
--- a/kernel/arch/x86/hypervisor/vmexit.cpp
+++ b/kernel/arch/x86/hypervisor/vmexit.cpp
@@ -144,7 +144,7 @@
     }
 }
 
-static zx_status_t handle_external_interrupt(AutoVmcs* vmcs, LocalApicState* local_apic_state) {
+static zx_status_t handle_external_interrupt(AutoVmcs* vmcs) {
     ExitInterruptionInformation int_info(*vmcs);
     DEBUG_ASSERT(int_info.valid);
     DEBUG_ASSERT(int_info.interruption_type == InterruptionType::EXTERNAL_INTERRUPT);
@@ -1051,7 +1051,7 @@
     switch (exit_info.exit_reason) {
     case ExitReason::EXTERNAL_INTERRUPT:
         ktrace_vcpu_exit(VCPU_EXTERNAL_INTERRUPT, exit_info.guest_rip);
-        status = handle_external_interrupt(vmcs, local_apic_state);
+        status = handle_external_interrupt(vmcs);
         break;
     case ExitReason::INTERRUPT_WINDOW:
         LTRACEF("handling interrupt window\n\n");
diff --git a/kernel/lib/hypervisor/hypervisor_unittest.cpp b/kernel/lib/hypervisor/hypervisor_unittest.cpp
index 63fa3c1..12510f8 100644
--- a/kernel/lib/hypervisor/hypervisor_unittest.cpp
+++ b/kernel/lib/hypervisor/hypervisor_unittest.cpp
@@ -536,40 +536,51 @@
 
     vector = UINT32_MAX;
     bitmap.Set(0u, hypervisor::InterruptType::PHYSICAL);
-    EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Get(0), "");
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1), "");
+    EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Get(0u), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1u), "");
     EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Scan(&vector), "");
     EXPECT_EQ(0u, vector, "");
 
     vector = UINT32_MAX;
     bitmap.Set(0u, hypervisor::InterruptType::INACTIVE);
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0), "");
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0u), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1u), "");
     EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Scan(&vector), "");
     EXPECT_EQ(UINT32_MAX, vector, "");
 
     // Index 1.
     vector = UINT32_MAX;
     bitmap.Set(1u, hypervisor::InterruptType::VIRTUAL);
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0), "");
-    EXPECT_EQ(hypervisor::InterruptType::VIRTUAL, bitmap.Get(1), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0u), "");
+    EXPECT_EQ(hypervisor::InterruptType::VIRTUAL, bitmap.Get(1u), "");
     EXPECT_EQ(hypervisor::InterruptType::VIRTUAL, bitmap.Scan(&vector), "");
     EXPECT_EQ(1u, vector, "");
 
     vector = UINT32_MAX;
     bitmap.Set(1u, hypervisor::InterruptType::PHYSICAL);
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0), "");
-    EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Get(1), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0u), "");
+    EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Get(1u), "");
     EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Scan(&vector), "");
     EXPECT_EQ(1u, vector, "");
 
     vector = UINT32_MAX;
     bitmap.Set(1u, hypervisor::InterruptType::INACTIVE);
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0), "");
-    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(0u), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1u), "");
     EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Scan(&vector), "");
     EXPECT_EQ(UINT32_MAX, vector, "");
 
+    // Clear
+    bitmap.Set(0u, hypervisor::InterruptType::VIRTUAL);
+    bitmap.Set(1u, hypervisor::InterruptType::VIRTUAL);
+    bitmap.Set(2u, hypervisor::InterruptType::PHYSICAL);
+    bitmap.Set(3u, hypervisor::InterruptType::PHYSICAL);
+    bitmap.Clear(1u, 3u);
+    EXPECT_EQ(hypervisor::InterruptType::VIRTUAL, bitmap.Get(0u), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(1u), "");
+    EXPECT_EQ(hypervisor::InterruptType::INACTIVE, bitmap.Get(2u), "");
+    EXPECT_EQ(hypervisor::InterruptType::PHYSICAL, bitmap.Get(3u), "");
+
     END_TEST;
 }
 
diff --git a/kernel/lib/hypervisor/include/hypervisor/interrupt_tracker.h b/kernel/lib/hypervisor/include/hypervisor/interrupt_tracker.h
index b91710d..51b8fa7 100644
--- a/kernel/lib/hypervisor/include/hypervisor/interrupt_tracker.h
+++ b/kernel/lib/hypervisor/include/hypervisor/interrupt_tracker.h
@@ -61,6 +61,14 @@
         }
     }
 
+    void Clear(uint32_t min, uint32_t max) {
+        if (max < min || max >= N) {
+            DEBUG_ASSERT(false);
+            return;
+        }
+        bitmap_.Clear(min * 2, max * 2);
+    }
+
     InterruptType Scan(uint32_t* vector) {
         size_t bitoff;
 #if ARCH_ARM64
@@ -101,6 +109,22 @@
         return bitmap_.Scan(&vector) != InterruptType::INACTIVE;
     }
 
+    // Clears all vectors in the range [min, max).
+    void Clear(uint32_t min, uint32_t max) {
+        AutoSpinLock lock(&lock_);
+        bitmap_.Clear(min, max);
+    }
+
+    // Pops the specified vector, if it is pending.
+    InterruptType TryPop(uint32_t vector) {
+        AutoSpinLock lock(&lock_);
+        InterruptType type = bitmap_.Get(vector);
+        if (type != InterruptType::INACTIVE) {
+            bitmap_.Set(vector, InterruptType::INACTIVE);
+        }
+        return type;
+    }
+
     // Pops the highest priority interrupt.
     InterruptType Pop(uint32_t* vector) {
         AutoSpinLock lock(&lock_);
diff --git a/system/utest/hypervisor/guest.cpp b/system/utest/hypervisor/guest.cpp
index cb15390..63bfc04 100644
--- a/system/utest/hypervisor/guest.cpp
+++ b/system/utest/hypervisor/guest.cpp
@@ -27,7 +27,9 @@
                                            ZX_VM_PERM_EXECUTE | ZX_VM_SPECIFIC;
 static constexpr uint32_t kHostMapFlags = ZX_VM_PERM_READ | ZX_VM_PERM_WRITE;
 // Inject an interrupt with vector 32, the first user defined interrupt vector.
-static constexpr uint32_t kInterruptVector = 32;
+static constexpr uint32_t kInterruptVector = 32u;
+static constexpr uint32_t kNmiVector = 2u;
+static constexpr uint32_t kExceptionVector = 16u;
 static constexpr uint64_t kTrapKey = 0x1234;
 static constexpr char kSysInfoPath[] = "/dev/misc/sysinfo";
 
@@ -251,8 +253,136 @@
     }
     test.interrupts_enabled = true;
 
+#if __x86_64__
+    // Resume once and wait for the guest to set up an IDT.
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+#endif
+
     ASSERT_EQ(test.vcpu.interrupt(kInterruptVector), ZX_OK);
     ASSERT_TRUE(resume_and_clean_exit(&test));
+
+#if __x86_64__
+    zx_vcpu_state_t vcpu_state;
+    ASSERT_EQ(test.vcpu.read_state(ZX_VCPU_STATE, &vcpu_state, sizeof(vcpu_state)), ZX_OK);
+    EXPECT_EQ(vcpu_state.rax, kInterruptVector);
+#endif
+
+    ASSERT_TRUE(teardown(&test));
+
+    END_TEST;
+}
+
+static bool vcpu_interrupt_priority() {
+    BEGIN_TEST;
+
+    test_t test;
+    ASSERT_TRUE(setup(&test, vcpu_interrupt_start, vcpu_interrupt_end));
+    if (!test.supported) {
+        // The hypervisor isn't supported, so don't run the test.
+        return true;
+    }
+    test.interrupts_enabled = true;
+
+    // Resume once and wait for the guest to set up an IDT.
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+
+    // Check that interrupts have higher priority than exceptions.
+    ASSERT_EQ(test.vcpu.interrupt(kExceptionVector), ZX_OK);
+    ASSERT_EQ(test.vcpu.interrupt(kInterruptVector), ZX_OK);
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+#if __x86_64__
+    zx_vcpu_state_t vcpu_state;
+    ASSERT_EQ(test.vcpu.read_state(ZX_VCPU_STATE, &vcpu_state, sizeof(vcpu_state)), ZX_OK);
+    EXPECT_EQ(vcpu_state.rax, kInterruptVector);
+#endif
+
+    // TODO(MAC-225): Check that the exception is cleared.
+
+    ASSERT_TRUE(teardown(&test));
+
+    END_TEST;
+}
+
+static bool vcpu_nmi() {
+    BEGIN_TEST;
+
+    test_t test;
+    ASSERT_TRUE(setup(&test, vcpu_interrupt_start, vcpu_interrupt_end));
+    if (!test.supported) {
+        // The hypervisor isn't supported, so don't run the test.
+        return true;
+    }
+    test.interrupts_enabled = true;
+
+    // Resume once and wait for the guest to set up an IDT.
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+
+    // Check that NMIs are handled.
+    ASSERT_EQ(test.vcpu.interrupt(kNmiVector), ZX_OK);
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+#if __x86_64__
+    zx_vcpu_state_t vcpu_state;
+    ASSERT_EQ(test.vcpu.read_state(ZX_VCPU_STATE, &vcpu_state, sizeof(vcpu_state)), ZX_OK);
+    EXPECT_EQ(vcpu_state.rax, kNmiVector);
+#endif
+    ASSERT_TRUE(teardown(&test));
+
+    END_TEST;
+}
+
+static bool vcpu_nmi_priority() {
+    BEGIN_TEST;
+
+    test_t test;
+    ASSERT_TRUE(setup(&test, vcpu_interrupt_start, vcpu_interrupt_end));
+    if (!test.supported) {
+        // The hypervisor isn't supported, so don't run the test.
+        return true;
+    }
+    test.interrupts_enabled = true;
+
+    // Resume once and wait for the guest to set up an IDT.
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+
+    // Check that NMIs have higher priority than interrupts.
+    ASSERT_EQ(test.vcpu.interrupt(kInterruptVector), ZX_OK);
+    ASSERT_EQ(test.vcpu.interrupt(kNmiVector), ZX_OK);
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+#if __x86_64__
+    zx_vcpu_state_t vcpu_state;
+    ASSERT_EQ(test.vcpu.read_state(ZX_VCPU_STATE, &vcpu_state, sizeof(vcpu_state)), ZX_OK);
+    EXPECT_EQ(vcpu_state.rax, kNmiVector);
+#endif
+
+    // TODO(MAC-225): Check that the interrupt is queued.
+
+    ASSERT_TRUE(teardown(&test));
+
+    END_TEST;
+}
+
+static bool vcpu_exception() {
+    BEGIN_TEST;
+
+    test_t test;
+    ASSERT_TRUE(setup(&test, vcpu_interrupt_start, vcpu_interrupt_end));
+    if (!test.supported) {
+        // The hypervisor isn't supported, so don't run the test.
+        return true;
+    }
+    test.interrupts_enabled = true;
+
+    // Resume once and wait for the guest to set up an IDT.
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+
+    // Check that exceptions are handled.
+    ASSERT_EQ(test.vcpu.interrupt(kExceptionVector), ZX_OK);
+    ASSERT_TRUE(resume_and_clean_exit(&test));
+#if __x86_64__
+    zx_vcpu_state_t vcpu_state;
+    ASSERT_EQ(test.vcpu.read_state(ZX_VCPU_STATE, &vcpu_state, sizeof(vcpu_state)), ZX_OK);
+    EXPECT_EQ(vcpu_state.rax, kExceptionVector);
+#endif
     ASSERT_TRUE(teardown(&test));
 
     END_TEST;
@@ -696,6 +826,10 @@
 RUN_TEST(vcpu_fp_aarch32)
 #elif __x86_64__
 RUN_TEST(guest_set_trap_with_io)
+RUN_TEST(vcpu_interrupt_priority)
+RUN_TEST(vcpu_nmi)
+RUN_TEST(vcpu_nmi_priority)
+RUN_TEST(vcpu_exception)
 RUN_TEST(vcpu_hlt)
 RUN_TEST(vcpu_pause)
 RUN_TEST(vcpu_write_cr0)
diff --git a/system/utest/hypervisor/x86.S b/system/utest/hypervisor/x86.S
index 59480f9..f547c67 100644
--- a/system/utest/hypervisor/x86.S
+++ b/system/utest/hypervisor/x86.S
@@ -105,15 +105,13 @@
 end_of_init_gdt_for_\start:
 .endm
 
-.macro debug_isr start, n
+.macro isr start, n
 int_\n\()_for_\start:
     mov $\n, %rax
-    popq %rbx // Error code
-    popq %rcx // RIP
-    movq $-1, (EXIT_TEST_ADDR)
+    movq $0, (EXIT_TEST_ADDR)
 .endm
 
-.macro debug_idt start, n
+.macro idt start, n
     .2byte ABSOLUTE_ADDR(\start, \
                          int_\n\()_for_\start)      // Offset 15:00
     .2byte CODE_64_SELECTOR                         // Segment selector
@@ -131,27 +129,17 @@
     jmp end_of_\start
 
 .irp n, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, \
-       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-    debug_isr \start \n
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+    isr \start \n
 .endr
 
-int32_for_\start:
-    movq $0, (EXIT_TEST_ADDR)
-
 .align 8
 idt_for_\start:
 .irp n, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, \
-       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-    debug_idt \start \n
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+    idt \start \n
 .endr
 
-    .2byte ABSOLUTE_ADDR(\start, int32_for_\start)  // Offset 15:00
-    .2byte CODE_64_SELECTOR                         // Segment selector
-    .byte  0                                        // IST(000)
-    .byte  0b10001110                               // P(1) DPL(00) 0 Type(1110)
-    .2byte 0                                        // Offset 31:16
-    .4byte 0                                        // Offset 63:32
-    .4byte 0                                        // Reserved
 idtr_for_\start:
     .2byte idtr_for_\start - idt_for_\start - 1     // Limit
     .8byte ABSOLUTE_ADDR(\start, idt_for_\start)    // Address
@@ -181,6 +169,7 @@
 FUNCTION vcpu_interrupt_start
     init_interrupt_handling vcpu_interrupt_start
     sti
+    movq $0, (EXIT_TEST_ADDR)
     jmp .
 FUNCTION vcpu_interrupt_end