[kernel][x86] Convert halt_interlock from legacy int -> ktl::atomic

|halt_interlock| is a per-CPU flag used by the HLT version of the x86
idle loop; it implements an adaptive spin before HLT, so that remote
wakeups can avoid an IPI while the target CPU is still spinning.

Specifically:
. When a CPU is idle, it runs the idle thread, which executes the HLT
  instruction in a loop until there is something else to do.
. HLT may have a long entry / exit latency.
. Before we HLT, we spin a bit, checking for work; while we're spinning,
  remote cores can wake us with a plain store. After we enter HLT, an
  IPI is needed to wake the sleeping core.

The sleep path is:
1. A core that's about to sleep sets a per-CPU variable,
   'halt_interlock', to 1.
2. It then spins a bit, checking that halt_interlock is still == 1.
3. It then uses CMPXCHG to switch halt_interlock from 1 -> 2.
4. Then it halts.

A wakeup does (both sides are sketched below):
1. CMPXCHG switches halt_interlock from 1 -> 0.
2. If that fails, we use an IPI to do a heavyweight wake.
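
For illustration, a minimal sketch of both sides of the interlock,
written against std::atomic (ktl::atomic exposes the same member
functions). In the kernel the flag lives in per-CPU state; the function
names and the named 0/1/2 constants below are placeholders, and the
spin budget matches kPauseIterations in the diff:

  #include <atomic>
  #include <cstdint>

  constexpr uint32_t kNotIdle = 0;   // not spinning (running, or already woken)
  constexpr uint32_t kSpinning = 1;  // idle spin; a remote CMPXCHG can wake us
  constexpr uint32_t kHalted = 2;    // committed to HLT; only an IPI wakes us

  std::atomic<uint32_t> halt_interlock{kNotIdle};  // per-CPU in the real kernel

  // Sleep path, run by the idle CPU.
  void idle_cpu_sleep() {
    halt_interlock.store(kSpinning, std::memory_order_relaxed);
    for (int i = 0; i < 3000; i++) {
      if (halt_interlock.load(std::memory_order_relaxed) != kSpinning) {
        return;  // a remote CPU cleared the flag; skip HLT and reschedule
      }
      // arch::Yield() / PAUSE goes here in the kernel.
    }
    uint32_t expected = kSpinning;
    if (halt_interlock.compare_exchange_strong(expected, kHalted)) {
      // x86_idle();  // i.e. HLT; from this point only an IPI can wake us
    }
  }

  // Wake path, run by a remote CPU.
  bool wake_cpu() {
    uint32_t expected = kSpinning;
    // Fast wakeup: if the target is still spinning, flip 1 -> 0, no IPI needed.
    if (halt_interlock.compare_exchange_strong(expected, kNotIdle)) {
      return true;
    }
    return false;  // target already halted (or not idle); caller sends an IPI
  }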

Convert the flag from a volatile int to a ktl::atomic<uint32_t>.
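
One behavioral detail worth noting: compare_exchange_strong() keeps the
same shape as the atomic_cmpxchg(&var, &expected, desired) calls it
replaces; it returns true on success and, on failure, writes the
observed value back into 'expected'. A standalone illustration
(std::atomic used in place of ktl::atomic; the 0/1/2 values mirror the
interlock states above):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main() {
    std::atomic<uint32_t> flag{1};

    // Fast-wakeup case: flag was 1 (spinning), swap it to 0.
    uint32_t expected = 1;
    assert(flag.compare_exchange_strong(expected, 0));
    assert(flag.load(std::memory_order_relaxed) == 0);

    // Failure case: flag is no longer 1, so the exchange fails and
    // 'expected' is updated to the value that was actually observed.
    expected = 1;
    assert(!flag.compare_exchange_strong(expected, 2));
    assert(expected == 0);  // a waker would fall back to an IPI here
    return 0;
  }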

Bug: 47117 Use only ktl::atomic in the kernel

Change-Id: I63a6b57e5d906e7d22e8b19919c4d176b8d372c2
Reviewed-on: https://fuchsia-review.googlesource.com/c/fuchsia/+/441925
Commit-Queue: Venkatesh Srinivas <venkateshs@google.com>
Reviewed-by: Nick Maniscalco <maniscalco@google.com>
Testability-Review: Nick Maniscalco <maniscalco@google.com>
diff --git a/zircon/kernel/arch/x86/include/arch/x86/mp.h b/zircon/kernel/arch/x86/include/arch/x86/mp.h
index f418c8e..ba070de 100644
--- a/zircon/kernel/arch/x86/include/arch/x86/mp.h
+++ b/zircon/kernel/arch/x86/include/arch/x86/mp.h
@@ -16,9 +16,9 @@
 //      ZX_TLS_STACK_GUARD_OFFSET      0x10
 //      ZX_TLS_UNSAFE_SP_OFFSET        0x18
 #define PERCPU_SAVED_USER_SP_OFFSET 0x20
-#define PERCPU_GPF_RETURN_OFFSET 0x48
-#define PERCPU_CPU_NUM_OFFSET 0x50
-#define PERCPU_DEFAULT_TSS_OFFSET 0x60
+#define PERCPU_GPF_RETURN_OFFSET 0x50
+#define PERCPU_CPU_NUM_OFFSET 0x58
+#define PERCPU_DEFAULT_TSS_OFFSET 0x70
 
 /* offset of default_tss.rsp0 */
 #define PERCPU_KERNEL_SP_OFFSET (PERCPU_DEFAULT_TSS_OFFSET + 4)
@@ -36,6 +36,7 @@
 #include <arch/x86/idt.h>
 #include <kernel/align.h>
 #include <kernel/cpu.h>
+#include <ktl/atomic.h>
 
 __BEGIN_CDECLS
 
@@ -59,13 +60,12 @@
   /* Whether blocking is disallowed.  See arch_blocking_disallowed(). */
   uint32_t blocking_disallowed;
 
-  union {
-    /* Memory for IPI-free rescheduling of idle CPUs with monitor/mwait. */
-    volatile uint8_t *monitor;
-    /* Interlock to avoid HLT on idle CPUs without monitor/mwait. */
-    /* halt_interlock is never used on CPUs that have enabled monitor/mwait for idle. */
-    volatile int halt_interlock;
-  };
+  /* Memory for IPI-free rescheduling of idle CPUs with monitor/mwait. */
+  volatile uint8_t *monitor;
+
+  /* Interlock to avoid HLT on idle CPUs without monitor/mwait. */
+  /* halt_interlock is never used on CPUs that have enabled monitor/mwait for idle. */
+  ktl::atomic<uint32_t> halt_interlock;
 
   /* Supported mwait C-states for idle CPUs. */
   X86IdleStates *idle_states;
diff --git a/zircon/kernel/arch/x86/mp.cc b/zircon/kernel/arch/x86/mp.cc
index d3193aed..6a0822f 100644
--- a/zircon/kernel/arch/x86/mp.cc
+++ b/zircon/kernel/arch/x86/mp.cc
@@ -80,6 +80,7 @@
 
     .blocking_disallowed = {},
     .monitor = &fake_monitor,
+    .halt_interlock = {},
     .idle_states = &fake_idle_states,
 
     // Start with an invalid ID until we know the local APIC is set up.
@@ -335,8 +336,8 @@
       cpu_num_t cpu_id = lowest_cpu_set(mask);
       cpu_mask_t cpu_mask = cpu_num_to_mask(cpu_id);
       struct x86_percpu* percpu = cpu_id ? &ap_percpus[cpu_id - 1] : &bp_percpu;
-      int expect_spin = 1;
-      bool did_fast_wakeup = atomic_cmpxchg(&percpu->halt_interlock, &expect_spin, 0);
+      uint32_t expect_spin = 1;
+      bool did_fast_wakeup = percpu->halt_interlock.compare_exchange_strong(expect_spin, 0);
       if (did_fast_wakeup) {
         needs_ipi &= ~cpu_mask;
       }
@@ -394,17 +395,18 @@
       // has woken us, avoid the halt instruction.
       LocalTraceDuration trace{"idle"_stringref};
       constexpr int kPauseIterations = 3000;
-      int halt_interlock_spinning = 1;
-      atomic_store_relaxed(&percpu->halt_interlock, halt_interlock_spinning);
+      uint32_t halt_interlock_spinning = 1;
+      percpu->halt_interlock.store(1, ktl::memory_order_relaxed);
       for (int i = 0; i < kPauseIterations; i++) {
         arch::Yield();
-        if (atomic_load_relaxed(&percpu->halt_interlock) != halt_interlock_spinning) {
+        if (percpu->halt_interlock.load(ktl::memory_order_relaxed) != 1) {
           break;
         }
       }
       // If the halt_interlock flag was changed, another CPU must have done it; avoid HLT and
       // switch to a new runnable thread.
-      bool no_fast_wakeup = atomic_cmpxchg(&percpu->halt_interlock, &halt_interlock_spinning, 2);
+      bool no_fast_wakeup = percpu->halt_interlock.compare_exchange_strong(halt_interlock_spinning,
+                                                                           2);
       if (no_fast_wakeup) {
         x86_idle();
       } else {