[vmm] Implement mask/unmask support in IO APIC

Our current IO APIC doesn't support masking / unmasking interrupts.
Instead, received interrupts will always be immediately forwarded to
whatever is configured in their redirect entry.

We have mostly gotten away with this imperfect implementation so far:
guest kernels sometimes complain about spurious interrupts, but
otherwise handle things just fine.

More recent Linux kernels however have changed how they perform IO APIC
shutdown. During shutdown, each IO APIC redirect register is set to
masked, but otherwise zeroed out. If, during shutdown, an interrupt
happens to fire, our IO APIC implementation will ignore the mask bit and
instead deliver an interrupt to vector 0, which is interpreted by guests
as a divide-by-zero exception. http://fxbug.dev/82015 is an example of
such a bug.

This CL adds support for masking/unmasking interrupts in the IO APIC:

 * If an IRQ is received while it is unmasked, nothing changes.

 * If an IRQ is received while it is masked, it is marked as pending.
   When it is finally unmasked, the current value in the IRQ's redirect
   register will be used as the destination.

Fixed: 82015

Change-Id: I0c3d3b145fb8a025b33d7d169c8022cac2c2bc87
Reviewed-on: https://fuchsia-review.googlesource.com/c/fuchsia/+/613549
Commit-Queue: Auto-Submit <auto-submit@fuchsia-infra.iam.gserviceaccount.com>
Fuchsia-Auto-Submit: David Greenaway <dgreenaway@google.com>
Reviewed-by: Abdulla Kamar <abdulla@google.com>
diff --git a/src/virtualization/bin/vmm/arch/x64/io_apic.cc b/src/virtualization/bin/vmm/arch/x64/io_apic.cc
index c52852f..598f4c6 100644
--- a/src/virtualization/bin/vmm/arch/x64/io_apic.cc
+++ b/src/virtualization/bin/vmm/arch/x64/io_apic.cc
@@ -17,57 +17,38 @@
 
 static constexpr uint64_t kMemSize = 0x1000;
 
-IoApic::IoApic(Guest* guest) : guest_(guest) {}
+IoApic::IoApic(Guest* guest, InterruptCallback interrupt)
+    : guest_(guest), interrupt_fn_(std::move(interrupt)) {}
+
+IoApic::IoApic(Guest* guest)
+    : guest_(guest), interrupt_fn_([guest](uint64_t mask, uint32_t vector) -> zx_status_t {
+        return guest->Interrupt(mask, vector);
+      }) {}
 
 zx_status_t IoApic::Init() {
   return guest_->CreateMapping(TrapType::MMIO_SYNC, kPhysBase, kMemSize, 0, this);
 }
 
 zx_status_t IoApic::Interrupt(uint32_t global_irq) {
-  if (global_irq >= kNumRedirects) {
+  if (global_irq >= kNumInterrupts) {
     return ZX_ERR_OUT_OF_RANGE;
   }
 
-  RedirectEntry entry;
+  IoApicRedirectEntry entry;
   {
     std::lock_guard<std::mutex> lock(mutex_);
-    entry = redirect_[global_irq];
-  }
+    InputInterrupt& interrupt = input_interrupts_[global_irq];
 
-  uint32_t vector = static_cast<uint32_t>(entry.vector());
-
-  // The "destination mode" (DESTMOD) determines how the dest field in the
-  // redirection entry should be interpreted.
-  //
-  // With a 'physical' mode, the destination is interpreted as the APIC ID
-  // of the target APIC to receive the interrupt.
-  //
-  // With a 'logical' mode, the target depends on the 'logical destination
-  // register'. In x2APIC mode this register is read-only and is derived from
-  // the local APIC ID.
-  //
-  // See Intel ICH10 Section 13.5.7.
-  // See Intel Volume 3, Section 10.12.10
-  if (entry.destination_mode() == kIoApicDestmodPhysical) {
-    uint64_t dest = entry.destination();
-
-    // Ensure that the top bits of dest are zero. From ICH10 Section 13.5.7:
-    // "If bit 11 of this entry is 0 (Physical), then bits 59:56 specifies an
-    // APIC ID. In this case, bits 63:59 should be programmed by software to
-    // 0."
-    if (dest >= kIoApicNumPhysicalDestinations) {
-      return ZX_ERR_NOT_SUPPORTED;
+    // If the interrupt is masked, mark it as pending, but don't deliver it.
+    if (interrupt.entry.mask()) {
+      interrupt.pending = true;
+      return ZX_OK;
     }
 
-    return guest_->Interrupt(1ul << dest, vector);
+    entry = interrupt.entry;
   }
 
-  // Logical DESTMOD. See Intel Volume 3, Section 10.12.10.2:
-  // logical ID = 1 << x2APIC ID[3:0].
-  //
-  // Note we're not currently respecting the DELMODE field and instead are only
-  // delivering to the fist local APIC that is targeted.
-  return guest_->Interrupt(entry.destination(), vector);
+  return DeliverInterrupt(entry);
 }
 
 zx_status_t IoApic::Read(uint64_t addr, IoValue* value) {
@@ -99,8 +80,22 @@
       return ZX_OK;
     }
     case kIoApicIoWin: {
-      std::lock_guard<std::mutex> lock(mutex_);
-      return WriteRegisterLocked(select_, value);
+      zx::status<Action> result;
+      {
+        std::lock_guard<std::mutex> lock(mutex_);
+        result = WriteRegisterLocked(select_, value);
+      }
+      if (!result.is_ok()) {
+        return result.status_value();
+      }
+
+      // If writing to a register caused an interrupt to fire (e.g.,
+      // unmasking an interrupt), deliver it now.
+      if (result.value()) {
+        DeliverInterrupt(*result.value());
+      }
+
+      return ZX_OK;
     }
     case kIoApicEOIR: {
       // End of interrupt.
@@ -133,7 +128,7 @@
       // the maximum redirection entry index.
       //
       // From Intel ICH10, Section 13.5.6.
-      value->u32 = (kNumRedirects - 1) << 16 | kIoApicVersion;
+      value->u32 = (kNumInterrupts - 1) << 16 | kIoApicVersion;
       return ZX_OK;
     case kIoApicRegisterArbitration:
       // Since we have a single I/O APIC, it is always the winner
@@ -141,15 +136,10 @@
       value->u32 = 0;
       return ZX_OK;
     case kFirstRedirectOffset ... kLastRedirectOffset: {
-      if (value->access_size != 4) {
-        return ZX_ERR_NOT_SUPPORTED;
-      }
       uint32_t redirect_offset = select_register - kFirstRedirectOffset;
-      const RedirectEntry& entry = redirect_[redirect_offset / 2];
-      uint32_t redirect_register =
-          static_cast<uint32_t>(redirect_offset % 2 == 0 ? entry.lower() : entry.upper());
-      value->u32 = redirect_register;
-      return ZX_OK;
+      uint32_t global_irq = redirect_offset / 2;
+      RedirectBits bits = redirect_offset % 2 == 0 ? RedirectBits::kLower : RedirectBits::kUpper;
+      return ReadRedirectEntryLocked(global_irq, bits, value);
     }
     default:
       FX_LOGS(ERROR) << "Unhandled IO APIC register read 0x" << std::hex << select_register;
@@ -157,31 +147,104 @@
   }
 }
 
-zx_status_t IoApic::WriteRegisterLocked(uint8_t select_register, const IoValue& value) {
+zx::status<IoApic::Action> IoApic::WriteRegisterLocked(uint8_t select_register,
+                                                       const IoValue& value) {
   switch (select_register) {
     case kIoApicRegisterId: {
       id_ = value.u32;
-      return ZX_OK;
+      return zx::ok(std::nullopt);
     }
     case kFirstRedirectOffset ... kLastRedirectOffset: {
-      if (value.access_size != 4) {
-        return ZX_ERR_NOT_SUPPORTED;
-      }
       uint32_t redirect_offset = select_register - kFirstRedirectOffset;
-      RedirectEntry& entry = redirect_[redirect_offset / 2];
-      if (redirect_offset % 2 == 0) {
-        entry.set_lower(value.u32);
-      } else {
-        entry.set_upper(value.u32);
-      }
-      return ZX_OK;
+      uint32_t global_irq = redirect_offset / 2;
+      RedirectBits bits = redirect_offset % 2 == 0 ? RedirectBits::kLower : RedirectBits::kUpper;
+      return WriteRedirectEntryLocked(global_irq, bits, value);
     }
     case kIoApicRegisterVer:
     case kIoApicRegisterArbitration:
       // Read-only, ignore writes.
-      return ZX_OK;
+      return zx::ok(std::nullopt);
     default:
       FX_LOGS(ERROR) << "Unhandled IO APIC register write 0x" << std::hex << select_register;
-      return ZX_ERR_NOT_SUPPORTED;
+      return zx::error(ZX_ERR_NOT_SUPPORTED);
   }
 }
+
+zx_status_t IoApic::ReadRedirectEntryLocked(uint32_t global_irq, RedirectBits bits,
+                                            IoValue* result) const {
+  if (result->access_size != 4) {
+    return ZX_ERR_NOT_SUPPORTED;
+  }
+  const IoApicRedirectEntry& entry = input_interrupts_[global_irq].entry;
+  result->u32 = static_cast<uint32_t>(bits == RedirectBits::kLower ? entry.lower() : entry.upper());
+  return ZX_OK;
+}
+
+zx::status<IoApic::Action> IoApic::WriteRedirectEntryLocked(uint32_t global_irq, RedirectBits bits,
+                                                            const IoValue& value) {
+  if (value.access_size != 4) {
+    return zx::error(ZX_ERR_NOT_SUPPORTED);
+  }
+
+  // Update the chosen 32 bits of the 64 bit register.
+  InputInterrupt& interrupt = input_interrupts_[global_irq];
+  if (bits == RedirectBits::kLower) {
+    interrupt.entry.set_lower(value.u32);
+  } else {
+    interrupt.entry.set_upper(value.u32);
+  }
+
+  // Report any pending and now unmasked interrupt to the caller.
+  //
+  // TODO(fxbug.dev/77786): We do not correctly support level-triggered interrupts
+  // here. In particular, the current IO APIC API only supports
+  // edge-triggered interrupts (i.e., when our `Interrupt` method is called),
+  // but we never find out when an interrupt stops being active.
+  //
+  // The result is that we can't be sure that the pending interrupt
+  // really is still pending. We opt to deliver it anyway, possibly
+  // generating a spurious interrupt.
+  if (interrupt.pending && interrupt.entry.mask() == 0) {
+    interrupt.pending = false;
+    return zx::ok(interrupt.entry);
+  }
+
+  return zx::ok(std::nullopt);
+}
+
+zx_status_t IoApic::DeliverInterrupt(const IoApicRedirectEntry& entry) {
+  uint32_t vector = static_cast<uint32_t>(entry.vector());
+
+  // The "destination mode" (DESTMOD) determines how the dest field in the
+  // redirection entry should be interpreted.
+  //
+  // With a 'physical' mode, the destination is interpreted as the APIC ID
+  // of the target APIC to receive the interrupt.
+  //
+  // With a 'logical' mode, the target depends on the 'logical destination
+  // register'. In x2APIC mode this register is read-only and is derived from
+  // the local APIC ID.
+  //
+  // See Intel ICH10 Section 13.5.7.
+  // See Intel Volume 3, Section 10.12.10
+  if (entry.destination_mode() == kIoApicDestmodPhysical) {
+    uint64_t dest = entry.destination();
+
+    // Ensure that the top bits of dest are zero. From ICH10 Section 13.5.7:
+    // "If bit 11 of this entry is 0 (Physical), then bits 59:56 specifies an
+    // APIC ID. In this case, bits 63:59 should be programmed by software to
+    // 0."
+    if (dest >= kIoApicNumPhysicalDestinations) {
+      return ZX_ERR_NOT_SUPPORTED;
+    }
+
+    return interrupt_fn_(1ul << dest, vector);
+  }
+
+  // Logical DESTMOD. See Intel Volume 3, Section 10.12.10.2:
+  // logical ID = 1 << x2APIC ID[3:0].
+  //
+  // Note we're not currently respecting the DELMODE field and instead are only
+  // delivering to the fist local APIC that is targeted.
+  return interrupt_fn_(entry.destination(), vector);
+}
diff --git a/src/virtualization/bin/vmm/arch/x64/io_apic.h b/src/virtualization/bin/vmm/arch/x64/io_apic.h
index b85c45b..5a3579c 100644
--- a/src/virtualization/bin/vmm/arch/x64/io_apic.h
+++ b/src/virtualization/bin/vmm/arch/x64/io_apic.h
@@ -5,6 +5,8 @@
 #ifndef SRC_VIRTUALIZATION_BIN_VMM_ARCH_X64_IO_APIC_H_
 #define SRC_VIRTUALIZATION_BIN_VMM_ARCH_X64_IO_APIC_H_
 
+#include <lib/zx/status.h>
+
 #include <mutex>
 
 #include <hwreg/bitfields.h>
@@ -14,6 +16,30 @@
 
 class Guest;
 
+// An entry in the IO APIC redirect table.
+//
+// Bit definitions for the redirect entry. See _Intel I/O Controller Hub
+// 10 (ICH10) Family Datasheet (October 2008), Section 13.5_.
+struct IoApicRedirectEntry {
+  uint64_t raw;
+
+  DEF_SUBFIELD(raw, 63, 56, destination);
+  DEF_SUBFIELD(raw, 55, 48, edid);  // Extended Destination ID
+  // Bits 47:17 reserved.
+  DEF_SUBBIT(raw, 16, mask);
+  DEF_SUBBIT(raw, 15, trigger_mode);
+  DEF_SUBBIT(raw, 14, remote_irr);
+  DEF_SUBBIT(raw, 13, interrupt_input_pin_polarity);
+  DEF_SUBBIT(raw, 12, delivery_status);
+  DEF_SUBBIT(raw, 11, destination_mode);
+  DEF_SUBFIELD(raw, 10, 8, delivery_mode);
+  DEF_SUBFIELD(raw, 7, 0, vector);
+
+  // Allow easy reading/writing to the upper/lower 32-bits of the word.
+  DEF_SUBFIELD(raw, 63, 32, upper);
+  DEF_SUBFIELD(raw, 31, 0, lower);
+};
+
 // Implements the IO APIC.
 //
 // See _82093AA (I/O APIC) datasheet_ for high-level details about the APIC,
@@ -21,34 +47,14 @@
 // Section 13.5_ for extensions to the original specification.
 class IoApic : public IoHandler, public PlatformDevice {
  public:
+  // Callback used when an interrupt is triggered.
+  using InterruptCallback = fit::function<zx_status_t(uint64_t mask, uint32_t vector)>;
+
   static constexpr uint64_t kPhysBase = 0xf8000000;
-  static constexpr uint8_t kNumRedirects = 48u;
-  static constexpr uint8_t kNumRedirectOffsets = kNumRedirects * 2;
+  static constexpr uint8_t kNumInterrupts = 48u;
 
-  // An entry in the redirect table.
-  struct RedirectEntry {
-    uint64_t raw;
-
-    // Bit definitions for the redirect entry. See _Intel I/O Controller Hub
-    // 10 (ICH10) Family Datasheet (October 2008), Section 13.5_.
-    DEF_SUBFIELD(raw, 63, 56, destination);
-    DEF_SUBFIELD(raw, 55, 48, edid);  // Extended Destination ID
-    // Bits 47:17 reserved.
-    DEF_SUBBIT(raw, 16, mask);
-    DEF_SUBBIT(raw, 15, trigger_mode);
-    DEF_SUBBIT(raw, 14, remote_irr);
-    DEF_SUBBIT(raw, 13, interrupt_input_pin_polarity);
-    DEF_SUBBIT(raw, 12, delivery_status);
-    DEF_SUBBIT(raw, 11, destination_mode);
-    DEF_SUBFIELD(raw, 10, 8, delivery_mode);
-    DEF_SUBFIELD(raw, 7, 0, vector);
-
-    // Allow easy reading/writing to the upper/lower 32-bits of the word.
-    DEF_SUBFIELD(raw, 63, 32, upper);
-    DEF_SUBFIELD(raw, 31, 0, lower);
-  };
-
-  IoApic(Guest* guest);
+  explicit IoApic(Guest* guest);
+  IoApic(Guest* guest, InterruptCallback interrupt);
 
   zx_status_t Init();
 
@@ -61,21 +67,57 @@
   zx_status_t Interrupt(uint32_t global_irq);
 
  private:
+  // State for global interrupts coming into the IO-APIC.
+  //
+  // The IO-APIC tracks which vector each IRQ should be routed to (via
+  // the IoApicRedirectEntry) and an interrupt has been received while the IRQ
+  // was masked.
+  struct InputInterrupt {
+    IoApicRedirectEntry entry;
+    bool pending;
+  };
+
+  // An action that needs to be taken by the caller.
+  //
+  // If std::nullopt, no action is required. If a IoApicRedirectEntry, the caller
+  // is responsible for deliverying the given interrupt.
+  using Action = std::optional<IoApicRedirectEntry>;
+
   // Read or write indirect registers directly.
   zx_status_t ReadRegisterLocked(uint8_t select_register, IoValue* value) const
       __TA_REQUIRES(mutex_);
-  zx_status_t WriteRegisterLocked(uint8_t select_register, const IoValue& value)
+  zx::status<Action> WriteRegisterLocked(uint8_t select_register, const IoValue& value)
       __TA_REQUIRES(mutex_);
 
-  Guest* guest_;
+  // Read/write the given redirect entry.
+  //
+  // Each redirect entry consists of two 32-bit words, which can be
+  // accessed via RedirectWord::kLower and RedirectWord::kUpper.
+  //
+  // Writes may require an interrupt to be fired (e.g., if an interrupt
+  // was unmasked). This is required to be done by the caller to avoid
+  // triggering interrupts inside the IO APIC's lock.
+  enum class RedirectBits {
+    kLower,  // Access bits [0:31]
+    kUpper,  // Access bits [32:63]
+  };
+  zx_status_t ReadRedirectEntryLocked(uint32_t global_irq, RedirectBits bits, IoValue* result) const
+      __TA_REQUIRES(mutex_);
+  zx::status<Action> WriteRedirectEntryLocked(uint32_t global_irq, RedirectBits bits,
+                                              const IoValue& value) __TA_REQUIRES(mutex_);
+
+  // Deliver an interrupt to the guest according to the given IoApicRedirectEntry.
+  zx_status_t DeliverInterrupt(const IoApicRedirectEntry& entry);
 
   mutable std::mutex mutex_;
-  // IO register-select register.
-  uint8_t select_ __TA_GUARDED(mutex_) = 0;
-  // IO APIC identification register.
-  uint32_t id_ __TA_GUARDED(mutex_) = 0;
-  // IO redirection table.
-  RedirectEntry redirect_[kNumRedirects] __TA_GUARDED(mutex_) = {};
+
+  Guest* const guest_;
+  const InterruptCallback interrupt_fn_;     // Callback for the IO APIC to trigger an interrupt
+  uint8_t select_ __TA_GUARDED(mutex_) = 0;  // IO register-select register.
+  uint32_t id_ __TA_GUARDED(mutex_) = 0;     // IO APIC identification register.
+
+  // Input global IRQs.
+  InputInterrupt input_interrupts_[kNumInterrupts] __TA_GUARDED(mutex_) = {};
 };
 
 #endif  // SRC_VIRTUALIZATION_BIN_VMM_ARCH_X64_IO_APIC_H_
diff --git a/src/virtualization/bin/vmm/arch/x64/io_apic_registers.h b/src/virtualization/bin/vmm/arch/x64/io_apic_registers.h
index 3b2212e..9e67c93 100644
--- a/src/virtualization/bin/vmm/arch/x64/io_apic_registers.h
+++ b/src/virtualization/bin/vmm/arch/x64/io_apic_registers.h
@@ -24,7 +24,7 @@
 // IO APIC configuration constants.
 constexpr uint8_t kIoApicVersion = 0x20;
 constexpr uint8_t kFirstRedirectOffset = 0x10;
-constexpr uint8_t kLastRedirectOffset = kFirstRedirectOffset + IoApic::kNumRedirectOffsets - 1;
+constexpr uint8_t kLastRedirectOffset = kFirstRedirectOffset + IoApic::kNumInterrupts * 2 - 1;
 
 // DESTMOD register.
 constexpr uint8_t kIoApicDestmodPhysical = 0x00;
diff --git a/src/virtualization/bin/vmm/arch/x64/io_apic_unittest.cc b/src/virtualization/bin/vmm/arch/x64/io_apic_unittest.cc
index dc7900d..c7677d3 100644
--- a/src/virtualization/bin/vmm/arch/x64/io_apic_unittest.cc
+++ b/src/virtualization/bin/vmm/arch/x64/io_apic_unittest.cc
@@ -4,6 +4,7 @@
 
 #include "src/virtualization/bin/vmm/arch/x64/io_apic.h"
 
+#include <lib/fit/function.h>
 #include <zircon/errors.h>
 #include <zircon/syscalls/hypervisor.h>
 
@@ -32,6 +33,18 @@
   return io_apic.Write(kIoApicIoWin, value);
 }
 
+// Write the given IoApicRedirectEntry for the given IRQ.
+zx_status_t WriteRedirectEntry(IoApic& io_apic, uint8_t global_irq,
+                               const IoApicRedirectEntry& entry) {
+  zx_status_t status = WriteRegister(io_apic, kFirstRedirectOffset + (global_irq * 2) + 0,
+                                     IoValue::FromU32(static_cast<uint32_t>(entry.lower())));
+  if (status != ZX_OK) {
+    return status;
+  }
+  return WriteRegister(io_apic, kFirstRedirectOffset + (global_irq * 2) + 1,
+                       IoValue::FromU32(static_cast<uint32_t>(entry.upper())));
+}
+
 TEST(IoApic, IndirectRegisterSelect) {
   Guest guest;
   IoApic io_apic(&guest);
@@ -121,4 +134,75 @@
   ASSERT_EQ(io_apic.Write(kIoApicEOIR, IoValue::FromU32(0x01)), ZX_OK);
 }
 
+class InterruptRecorder {
+ public:
+  IoApic::InterruptCallback Callback() {
+    return fit::bind_member(this, &InterruptRecorder::Interrupt);
+  }
+
+  uint64_t count() const { return count_; }
+  uint64_t mask() const { return mask_; }
+  uint32_t vector() const { return vector_; }
+
+ private:
+  zx_status_t Interrupt(uint64_t mask, uint32_t vector) {
+    count_++;
+    mask_ = mask;
+    vector_ = vector;
+    return ZX_OK;
+  }
+
+  uint64_t count_ = 0;
+  uint64_t mask_ = 0;
+  uint32_t vector_ = 0;
+};
+
+TEST(IoApic, SimpleInterrupt) {
+  Guest guest;
+  InterruptRecorder recorder;
+  IoApic io_apic(&guest, recorder.Callback());
+
+  // Set up a redirection from global IRQ 13 to vector 44 on CPU #1.
+  EXPECT_EQ(WriteRedirectEntry(io_apic, /*global_irq=*/13,
+                               IoApicRedirectEntry()
+                                   .set_vector(44)
+                                   .set_destination_mode(kIoApicDestmodPhysical)
+                                   .set_destination(1)),
+            ZX_OK);
+
+  // Trigger an interrupt. We expect it to be sent to vector 44 on CPU #1.
+  EXPECT_EQ(recorder.count(), 0u);
+  io_apic.Interrupt(13);
+  ASSERT_EQ(recorder.count(), 1u);
+  EXPECT_EQ(recorder.mask(), 1u << 1);
+  EXPECT_EQ(recorder.vector(), 44u);
+}
+
+TEST(IoApic, MaskedInterrupt) {
+  Guest guest;
+  InterruptRecorder recorder;
+  IoApic io_apic(&guest, recorder.Callback());
+
+  // Set up a redirection from global IRQ 13 to vector 44 on CPU #1,
+  // but keep it masked.
+  auto entry = IoApicRedirectEntry()
+                   .set_vector(44)
+                   .set_destination_mode(kIoApicDestmodPhysical)
+                   .set_destination(1)
+                   .set_mask(1);
+  EXPECT_EQ(WriteRedirectEntry(io_apic, /*global_irq=*/13, entry), ZX_OK);
+
+  // Trigger an interrupt. Because it is masked, we expect no callback.
+  EXPECT_EQ(recorder.count(), 0u);
+  io_apic.Interrupt(13);
+  EXPECT_EQ(recorder.count(), 0u);
+
+  // Unmask the interrupt. Expect to see the callback.
+  entry.set_mask(0);
+  EXPECT_EQ(WriteRedirectEntry(io_apic, /*global_irq=*/13, entry), ZX_OK);
+  ASSERT_EQ(recorder.count(), 1u);
+  EXPECT_EQ(recorder.mask(), 1u << 1);
+  EXPECT_EQ(recorder.vector(), 44u);
+}
+
 }  // namespace