zircon/kernel/arch/x86/mp.cc - fuchsia - Git at Google

 // Copyright 2016 The Fuchsia Authors
 // Copyright (c) 2016 Travis Geiselbrecht
 //
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file or at
 // https://opensource.org/licenses/MIT
 #include "arch/x86/mp.h"

 #include <assert.h>
 #include <debug.h>
 #include <lib/arch/x86/boot-cpuid.h>
 #include <lib/arch/x86/bug.h>
 #include <lib/arch/x86/descriptor-regs.h>
 #include <lib/console.h>
 #include <lib/ktrace.h>
 #include <platform.h>
 #include <stdio.h>
 #include <string.h>
 #include <trace.h>
 #include <zircon/compiler.h>
 #include <zircon/errors.h>
 #include <zircon/types.h>

 #include <new>

 #include <arch/mp.h>
 #include <arch/ops.h>
 #include <arch/x86.h>
 #include <arch/x86/apic.h>
 #include <arch/x86/descriptor.h>
 #include <arch/x86/feature.h>
 #include <arch/x86/idle_states.h>
 #include <arch/x86/interrupts.h>
 #include <arch/x86/mmu.h>
 #include <dev/hw_rng.h>
 #include <dev/interrupt.h>
 #include <hwreg/x86msr.h>
 #include <kernel/auto_preempt_disabler.h>
 #include <kernel/cpu.h>
 #include <kernel/event.h>
 #include <kernel/timer.h>

 #define LOCAL_TRACE 0

 // Enable/disable ktraces local to this file.
 #define LOCAL_KTRACE_ENABLE 0 || LOCAL_TRACE

 using LocalTraceDuration =
     TraceDuration<TraceEnabled<LOCAL_KTRACE_ENABLE>, KTRACE_GRP_SCHEDULER, TraceContext::Cpu>;

 struct x86_percpu* ap_percpus;
 uint8_t x86_num_cpus = 1;
 static bool use_monitor = false;

 extern struct idt _idt;

 #if __has_feature(safe_stack)
 static uint8_t unsafe_kstack[PAGE_SIZE] __ALIGNED(16);
 #define unsafe_kstack_end (&unsafe_kstack[sizeof(unsafe_kstack)])
 #else
 #define unsafe_kstack_end nullptr
 #endif

 // Fake monitor to use until smp is initialized. The size of
 // the memory range doesn't matter, since it won't actually get
 // used in a non-smp environment.
 volatile uint8_t fake_monitor;

 // Also set up a fake table of idle states.
 x86_idle_states_t fake_supported_idle_states = {
     .states = {X86_CSTATE_C1(0)},
     .default_state_mask = kX86IdleStateMaskC1Only,
 };
 X86IdleStates fake_idle_states = X86IdleStates(&fake_supported_idle_states);

 // Pre-initialize the per cpu structure for the boot cpu. Referenced by
 // early boot code prior to being able to initialize via code.
 struct x86_percpu bp_percpu = {
     .direct = &bp_percpu,
     .current_thread = {},

     .stack_guard = {},
     .kernel_unsafe_sp = (uintptr_t)unsafe_kstack_end,
     .saved_user_sp = {},

     .blocking_disallowed = {},
     .monitor = &fake_monitor,
     .halt_interlock = {},
     .idle_states = &fake_idle_states,

     // Start with an invalid ID until we know the local APIC is set up.
     .apic_id = INVALID_APIC_ID,

     .gpf_return_target = {},

     .cpu_num = 0,
     .num_spinlocks = 0,
     .last_user_aspace = nullptr,

     .high_level_percpu = {},

     .default_tss = {},
     .interrupt_stacks = {},
 };

 zx_status_t x86_allocate_ap_structures(uint32_t* apic_ids, uint8_t cpu_count) {
   ASSERT(ap_percpus == nullptr);

   DEBUG_ASSERT(cpu_count >= 1);
   if (cpu_count == 0) {
     return ZX_ERR_INVALID_ARGS;
   }

   if (cpu_count > 1) {
     size_t len = sizeof(*ap_percpus) * (cpu_count - 1);
     ap_percpus = (x86_percpu*)memalign(MAX_CACHE_LINE, len);
     if (ap_percpus == nullptr) {
       return ZX_ERR_NO_MEMORY;
     }
     memset(ap_percpus, 0, len);

     use_monitor = arch::BootCpuid<arch::CpuidFeatureFlagsC>().monitor() &&
                   arch::BootCpuidSupports<arch::CpuidMonitorMwaitB>() &&
                   !x86_get_microarch_config()->idle_prefer_hlt;
     if (use_monitor) {
       auto monitor_size = static_cast<uint16_t>(
           arch::BootCpuid<arch::CpuidMonitorMwaitB>().largest_monitor_line_size());
       if (monitor_size < MAX_CACHE_LINE) {
         monitor_size = MAX_CACHE_LINE;
       }
       uint8_t* monitors = (uint8_t*)memalign(monitor_size, monitor_size * cpu_count);
       if (monitors == nullptr) {
         return ZX_ERR_NO_MEMORY;
       }
       bp_percpu.monitor = monitors;
       for (uint i = 1; i < cpu_count; ++i) {
         ap_percpus[i - 1].monitor = monitors + (i * monitor_size);
       }

       uint16_t idle_states_size = sizeof(X86IdleStates);
       if (idle_states_size < MAX_CACHE_LINE) {
         idle_states_size = MAX_CACHE_LINE;
       }
       X86IdleStates* idle_states =
           static_cast<X86IdleStates*>(memalign(idle_states_size, idle_states_size * cpu_count));
       if (idle_states == nullptr) {
         return ZX_ERR_NO_MEMORY;
       }
       const x86_idle_states_t* supported_idle_states = x86_get_idle_states();
       bp_percpu.idle_states = idle_states;
       // Placement new the BP idle-states table.
       new (bp_percpu.idle_states) X86IdleStates(supported_idle_states);
       for (uint i = 1; i < cpu_count; ++i) {
         ap_percpus[i - 1].idle_states = reinterpret_cast<X86IdleStates*>(
             reinterpret_cast<uintptr_t>(idle_states) + (i * idle_states_size));
         // Placement new the other idle-states tables.
         new (ap_percpus[i - 1].idle_states) X86IdleStates(supported_idle_states);
       }
     }
   }

   uint32_t bootstrap_ap = apic_local_id();
   DEBUG_ASSERT(bootstrap_ap == apic_bsp_id());

   uint apic_idx = 0;
   for (uint i = 0; i < cpu_count; ++i) {
     if (apic_ids[i] == bootstrap_ap) {
       continue;
     }
     DEBUG_ASSERT(apic_idx != (uint)(cpu_count - 1));
     if (apic_idx == (uint)cpu_count - 1) {
       /* Never found bootstrap CPU in apic id list */
       return ZX_ERR_BAD_STATE;
     }
     ap_percpus[apic_idx].cpu_num = apic_idx + 1;
     ap_percpus[apic_idx].apic_id = apic_ids[i];
     ap_percpus[apic_idx].direct = &ap_percpus[apic_idx];
     apic_idx++;
   }

   x86_num_cpus = cpu_count;
   return ZX_OK;
 }

 static struct x86_percpu* x86_percpu_for(cpu_num_t cpu_num) {
   return (cpu_num == 0) ? &bp_percpu : &ap_percpus[cpu_num - 1];
 }

 void x86_init_percpu(cpu_num_t cpu_num) {
   struct x86_percpu* const percpu = x86_percpu_for(cpu_num);
   DEBUG_ASSERT(percpu->cpu_num == cpu_num);
   DEBUG_ASSERT(percpu->direct == percpu);

   // Assembly code has already set up %gs.base so that this function's
   // own code can use it implicitly for stack-protector or safe-stack.
   DEBUG_ASSERT(read_msr(X86_MSR_IA32_GS_BASE) == (uintptr_t)percpu);

   /* set the KERNEL_GS_BASE MSR to 0 */
   /* when we enter user space, this will be populated via a swapgs */
   write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0);

   x86_feature_early_init_percpu();

   x86_extended_register_init();
   x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_SSE);
   x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_AVX);

   // This can be turned on/off later by the user. Turn it on here so that
   // the buffer size assumes it's on.
   x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_PT);
   // But then set the default mode to off.
   x86_set_extended_register_pt_state(false);

   gdt_load(gdt_get());

   // Disable the LDT so userspace cannot make
   // segment selectors that point to it.
   // See fxbug.dev/79060
   arch::DisableLdt();

   x86_initialize_percpu_tss();

   // Setup the post early boot IDT
   if (cpu_num == 0) {
     idt_setup(&_idt);
     // Setup alternate stacks to guarantee stack sanity when handling these
     // interrupts
     idt_set_ist_index(&_idt, X86_INT_NMI, NMI_IST_INDEX);
     idt_set_ist_index(&_idt, X86_INT_MACHINE_CHECK, MCE_IST_INDEX);
     idt_set_ist_index(&_idt, X86_INT_DOUBLE_FAULT, DBF_IST_INDEX);
     idt_load(&_idt);
   } else {
     // Load the read-only IDT setup on arch initialization.
     idt_load(idt_get_readonly());
   }

   /* load the syscall entry point */
   write_msr(X86_MSR_IA32_LSTAR, (uint64_t)&x86_syscall);

   /* set the STAR MSR to load the appropriate kernel code selector on syscall
    * and the appropriate user code selector on return.
    * on syscall entry the following are loaded into segment registers:
    *   CS = CODE_64_SELECTOR      (STAR[47:32])
    *   SS = DATA_SELECTOR         (STAR[47:32] + 0x8)
    * on syscall exit:
    *   CS = USER_CODE_64_SELECTOR (STAR[63:48] + 0x16)
    *   SS = USER_DATA_SELECTOR    (STAR[63:48] + 0x8)
    */
   write_msr(X86_MSR_IA32_STAR,
             (uint64_t)USER_CODE_SELECTOR << 48 | (uint64_t)CODE_64_SELECTOR << 32);

   // Set the FMASK register to mask off certain bits in RFLAGS on syscall
   // entry.  See docs/kernel_invariants.md.
   uint64_t mask = X86_FLAGS_AC |         /* disable alignment check/access control (this
                                           * prevents ring 0 from performing data access
                                           * to ring 3 if SMAP is available) */
                   X86_FLAGS_NT |         /* clear nested task */
                   X86_FLAGS_IOPL_MASK |  /* set iopl to 0 */
                   X86_FLAGS_STATUS_MASK; /* clear all status flags, interrupt disabled, trap flag */
   write_msr(X86_MSR_IA32_FMASK, mask);

   // Apply the same mask to our current flags, to ensure that flags are
   // set to known-good values, because some flags may be inherited by
   // later kernel threads.  We do this just in case any bad values were
   // left behind by firmware or the bootloader.
   x86_restore_flags(x86_save_flags() & ~mask);

   /* enable syscall instruction */
   uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER);
   efer_msr |= X86_EFER_SCE;
   write_msr(X86_MSR_IA32_EFER, efer_msr);

   uint64_t cr4 = x86_get_cr4();
   // Enable {rd,wr}{fs,gs}base instructions.
   if (x86_feature_test(X86_FEATURE_FSGSBASE)) {
     cr4 |= X86_CR4_FSGSBASE;
   }
   if (x86_feature_test(X86_FEATURE_UMIP)) {
     cr4 |= X86_CR4_UMIP;
   }
   x86_set_cr4(cr4);

   // Store the processor number in IA32_TSC_AUX, so RDTSCP/RDP can efficiently get the current CPU
   // from userspace.
   if (x86_feature_test(X86_FEATURE_RDTSCP)) {
     write_msr(X86_MSR_IA32_TSC_AUX, cpu_num);
   }

   switch (x86_vendor) {
     case X86_VENDOR_INTEL:
       x86_intel_init_percpu();
       break;
     case X86_VENDOR_AMD:
       x86_amd_init_percpu();
       break;
     default:
       break;
   }

   arch::ApplyX86ErrataWorkarounds(arch::BootCpuidIo{}, hwreg::X86MsrIo{});
 }

 void x86_set_local_apic_id(uint32_t apic_id) {
   struct x86_percpu* percpu = x86_get_percpu();
   DEBUG_ASSERT(percpu->cpu_num == 0);
   percpu->apic_id = apic_id;
 }

 int x86_apic_id_to_cpu_num(uint32_t apic_id) {
   if (bp_percpu.apic_id == apic_id) {
     return (int)bp_percpu.cpu_num;
   }

   for (uint i = 0; i < (uint)x86_num_cpus - 1; ++i) {
     if (ap_percpus[i].apic_id == apic_id) {
       return (int)ap_percpus[i].cpu_num;
     }
   }
   return -1;
 }

 void arch_mp_reschedule(cpu_mask_t mask) {
   thread_lock.AssertHeld();

   cpu_mask_t needs_ipi = 0;
   if (use_monitor) {
     while (mask) {
       cpu_num_t cpu_id = lowest_cpu_set(mask);
       cpu_mask_t cpu_mask = cpu_num_to_mask(cpu_id);
       struct x86_percpu* percpu = cpu_id ? &ap_percpus[cpu_id - 1] : &bp_percpu;

       // When a cpu see that it is about to start the idle thread, it sets its own
       // monitor flag. When a cpu is rescheduling another cpu, if it sees the monitor flag
       // set, it can clear the flag to wake up the other cpu w/o an IPI. When the other
       // cpu wakes up, the idle thread sees the cleared flag and preempts itself. Both of
       // these operations are under the scheduler lock, so there are no races where the
       // wrong signal can be sent.
       uint8_t old_val = *percpu->monitor;
       *percpu->monitor = 0;
       if (!old_val) {
         needs_ipi |= cpu_mask;
       }
       mask &= ~cpu_mask;
     }
   } else {
     needs_ipi = mask;
     // We are attempting to wake the set up CPUs in |mask| and cause them to schedule a new thread.
     // A target CPU spins for a short time before execuing halt; before it spins, it sets the
     // |halt_interlock| flag to '1'. Before a target CPU executes the halt instruction, it sets
     // the |halt_interlock| flag to '2' and skips the halt if the flag was cleared while spinning.
     // Try to clear the |halt_interlock| flag from 1 -> 0. If we do so, we can skip sending an
     // IPI and prevent an unnecessary halt instruction.
     while (mask) {
       cpu_num_t cpu_id = lowest_cpu_set(mask);
       cpu_mask_t cpu_mask = cpu_num_to_mask(cpu_id);
       struct x86_percpu* percpu = cpu_id ? &ap_percpus[cpu_id - 1] : &bp_percpu;
       uint32_t expect_spin = 1;
       bool did_fast_wakeup = percpu->halt_interlock.compare_exchange_strong(expect_spin, 0);
       if (did_fast_wakeup) {
         needs_ipi &= ~cpu_mask;
       }
       mask &= ~cpu_mask;
     }
   }

   if (needs_ipi) {
     arch_mp_send_ipi(MP_IPI_TARGET_MASK, needs_ipi, MP_IPI_RESCHEDULE);
   }
 }

 void arch_prepare_current_cpu_idle_state(bool idle) {
   thread_lock.AssertHeld();

   if (use_monitor) {
     *x86_get_percpu()->monitor = idle;
   }
 }

 __NO_RETURN int arch_idle_thread_routine(void*) {
   struct x86_percpu* percpu = x86_get_percpu();
   if (use_monitor) {
     for (;;) {
       AutoPreemptDisabler preempt_disabled;
       bool rsb_maybe_empty = false;
       while (*percpu->monitor && !Thread::Current::preemption_state().preempts_pending()) {
         X86IdleState* next_state = percpu->idle_states->PickIdleState();
         rsb_maybe_empty |= x86_intel_idle_state_may_empty_rsb(next_state);
         LocalTraceDuration trace{"idle"_stringref, next_state->MwaitHint(), 0u};
         x86_monitor(percpu->monitor);
         // Check percpu->monitor in case it was cleared between the first check and
         // the monitor being armed. Any writes after arming the monitor will trigger
         // it and cause mwait to return, so there aren't races after this check.
         if (*percpu->monitor && !Thread::Current::preemption_state().preempts_pending()) {
           auto start = current_time();
           x86_mwait(next_state->MwaitHint());
           auto duration = zx_time_sub_time(current_time(), start);

           percpu->idle_states->RecordDuration(duration);
           next_state->RecordDuration(duration);
           next_state->CountEntry();
         }
       }
       // Spectre V2: If we enter a deep sleep state, fill the RSB before RET-ing from this function.
       // (CVE-2017-5715, see Intel "Deep Dive: Retpoline: A Branch Target Injection Mitigation").
       if (x86_cpu_vulnerable_to_rsb_underflow() & rsb_maybe_empty) {
         x86_ras_fill();
       }
       Thread::Current::Reschedule();
       // Pending preemptions handled here as preempt_disabled goes out of scope.
     }
   } else {
     for (;;) {
       AutoPreemptDisabler preempt_disabled;
       // Set the halt_interlock flag and spin for a little bit, in case a wakeup happens very
       // shortly before we decide to go to sleep. If the halt_interlock flag is changed, another CPU
       // has woken us, avoid the halt instruction.
       LocalTraceDuration trace{"idle"_stringref};
       constexpr int kPauseIterations = 3000;
       uint32_t halt_interlock_spinning = 1;
       percpu->halt_interlock.store(1, ktl::memory_order_relaxed);
       for (int i = 0;
            i < kPauseIterations && !Thread::Current::preemption_state().preempts_pending(); i++) {
         arch::Yield();
         if (percpu->halt_interlock.load(ktl::memory_order_relaxed) != 1) {
           break;
         }
       }
       // Compare-exchange halt_interlock from 1 -> 2, to indicate we are no longer spinning.
       // If the halt_interlock flag was changed, another CPU must have done it; avoid HLT and
       // switch to a new runnable thread. Otherwise, setting it to '2' re-enables reschedule
       // IPIs.
       bool no_fast_wakeup =
           percpu->halt_interlock.compare_exchange_strong(halt_interlock_spinning, 2);
       if (no_fast_wakeup && !Thread::Current::preemption_state().preempts_pending()) {
         arch_disable_ints();
         if (!Thread::Current::preemption_state().preempts_pending()) {
           x86_enable_ints_and_hlt();
         } else {
           // Re-enable interrupts if a reschedule IPI, timer tick, or other PreemptSetPending
           // happened and we didn't call x86_idle.
           arch_enable_ints();
         }
       }
       Thread::Current::Reschedule();
       // Pending preemptions handled here as preempt_disabled goes out of scope.
     }
   }
 }

 void arch_mp_send_ipi(mp_ipi_target_t target, cpu_mask_t mask, mp_ipi_t ipi) {
   uint8_t vector = 0;
   switch (ipi) {
     case MP_IPI_GENERIC:
       vector = X86_INT_IPI_GENERIC;
       break;
     case MP_IPI_RESCHEDULE:
       vector = X86_INT_IPI_RESCHEDULE;
       break;
     case MP_IPI_INTERRUPT:
       vector = X86_INT_IPI_INTERRUPT;
       break;
     case MP_IPI_HALT:
       vector = X86_INT_IPI_HALT;
       break;
     default:
       panic("Unexpected MP IPI value: %u", static_cast<uint32_t>(ipi));
   }

   switch (target) {
     case MP_IPI_TARGET_ALL_BUT_LOCAL:
       apic_send_broadcast_ipi(vector, DELIVERY_MODE_FIXED);
       break;
     case MP_IPI_TARGET_ALL:
       apic_send_broadcast_self_ipi(vector, DELIVERY_MODE_FIXED);
       break;
     case MP_IPI_TARGET_MASK:
       apic_send_mask_ipi(vector, mask, DELIVERY_MODE_FIXED);
       break;
     default:
       panic("Unexpected MP IPI target: %u", static_cast<uint32_t>(target));
   }
 }

 void x86_ipi_halt_handler(void*) {
   printf("halting cpu %u\n", arch_curr_cpu_num());

   platform_halt_cpu();

   for (;;) {
     x86_cli();
     x86_hlt();
   }
 }

 // Forcibly stops all other CPUs except the current one and the BSP (which is
 // cpu 0)
 void x86_force_halt_all_but_local_and_bsp(void) {
   cpu_num_t self = arch_curr_cpu_num();
   for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
     if (i == self) {
       continue;
     }
     uint32_t dst_apic_id = ap_percpus[i - 1].apic_id;
     apic_send_ipi(0, static_cast<uint8_t>(dst_apic_id), DELIVERY_MODE_INIT);
   }
 }

 zx_status_t arch_mp_prep_cpu_unplug(cpu_num_t cpu_id) {
   if (cpu_id == 0 || cpu_id >= x86_num_cpus) {
     return ZX_ERR_INVALID_ARGS;
   }
   return ZX_OK;
 }

 zx_status_t arch_mp_cpu_unplug(cpu_num_t cpu_id) {
   /* we do not allow unplugging the bootstrap processor */
   if (cpu_id == 0 || cpu_id >= x86_num_cpus) {
     return ZX_ERR_INVALID_ARGS;
   }

   uint32_t dst_apic_id = ap_percpus[cpu_id - 1].apic_id;
   if (dst_apic_id == INVALID_APIC_ID) {
     /* This is a transient state that can occur during CPU onlining */
     return ZX_ERR_UNAVAILABLE;
   }

   DEBUG_ASSERT(dst_apic_id < UINT8_MAX);
   apic_send_ipi(0, (uint8_t)dst_apic_id, DELIVERY_MODE_INIT);
   return ZX_OK;
 }

 zx_status_t arch_mp_cpu_hotplug(cpu_num_t cpu_id) {
   if (cpu_id >= x86_num_cpus) {
     return ZX_ERR_INVALID_ARGS;
   }
   if (mp_is_cpu_online(cpu_id)) {
     return ZX_ERR_BAD_STATE;
   }
   DEBUG_ASSERT(cpu_id != 0);
   if (cpu_id == 0) {
     /* We shouldn't be able to shutoff the bootstrap CPU, so
      * no reason to be able to bring it back via this route. */
     return ZX_ERR_INVALID_ARGS;
   }

   struct x86_percpu* percpu = &ap_percpus[cpu_id - 1];
   DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID);
   return x86_bringup_aps(&percpu->apic_id, 1);
 }

 /* Used to suspend work on a CPU until it is further shutdown */
 void arch_flush_state_and_halt(Event* flush_done) {
   DEBUG_ASSERT(arch_ints_disabled());

   __asm__ volatile("wbinvd" : : : "memory");

   Thread::Current::Get()->preemption_state().PreemptDisable();
   flush_done->Signal();
   while (1) {
     __asm__ volatile("cli; hlt" : : : "memory");
   }
 }

 void arch_setup_percpu(cpu_num_t cpu_num, struct percpu* percpu) {
   x86_percpu* arch_percpu = x86_percpu_for(cpu_num);
   DEBUG_ASSERT(arch_percpu != nullptr);
   DEBUG_ASSERT(arch_percpu->high_level_percpu == nullptr ||
                arch_percpu->high_level_percpu == percpu);
   arch_percpu->high_level_percpu = percpu;
 }

 static void reset_idle_counters(X86IdleStates* idle_states) {
   for (unsigned i = 0; i < idle_states->NumStates(); ++i) {
     idle_states->States()[i].ResetCounters();
   }
 }

 static void report_idlestates(cpu_num_t cpu_num, const X86IdleStates& idle_states) {
   printf("CPU %u:\n", cpu_num);
   const X86IdleState* states = idle_states.ConstStates();
   for (unsigned i = 0; i < idle_states.NumStates(); ++i) {
     const auto& state = states[i];
     printf("  %4s (MWAIT %02X): %lu entries, %lu ns avg duration (%ld ns total)\n", state.Name(),
            state.MwaitHint(), state.TimesEntered(),
            state.TimesEntered() > 0 ? state.CumulativeDuration() / (state.TimesEntered()) : 0l,
            state.CumulativeDuration());
   }
 }

 static int cmd_idlestates(int argc, const cmd_args* argv, uint32_t flags) {
   if (argc < 2) {
   usage:
     printf("Usage: %s (printstats | resetstats | setmask)\n", argv[0].str);
     return ZX_ERR_INVALID_ARGS;
   }
   if (!use_monitor) {
     printf("%s is only supported on systems with X86_FEATURE_MON\n", argv[0].str);
     return ZX_ERR_NOT_SUPPORTED;
   }
   if (!strcmp(argv[1].str, "resetstats")) {
     reset_idle_counters(bp_percpu.idle_states);
     for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
       reset_idle_counters(ap_percpus[i - 1].idle_states);
     }
   } else if (!strcmp(argv[1].str, "printstats")) {
     report_idlestates(0, *bp_percpu.idle_states);
     for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
       report_idlestates(i, *ap_percpus[i - 1].idle_states);
     }
   } else if (!strcmp(argv[1].str, "setmask")) {
     if (argc < 3) {
       printf("Usage: %s setmask $mask\n", argv[0].str);
       return ZX_ERR_INVALID_ARGS;
     }
     bp_percpu.idle_states->SetStateMask(static_cast<uint32_t>(argv[2].u));
     for (unsigned i = 1; i < x86_num_cpus; ++i) {
       ap_percpus[i - 1].idle_states->SetStateMask(static_cast<uint32_t>(argv[2].u));
     }
   } else {
     goto usage;
   }
   return ZX_OK;
 }

 STATIC_COMMAND_START
 STATIC_COMMAND("idlestates", "control or report on CPU idle state selection", &cmd_idlestates)
 STATIC_COMMAND_END(idlestates)
	// Copyright 2016 The Fuchsia Authors
	// Copyright (c) 2016 Travis Geiselbrecht
	//
	// Use of this source code is governed by a MIT-style
	// license that can be found in the LICENSE file or at
	// https://opensource.org/licenses/MIT
	#include "arch/x86/mp.h"

	#include <assert.h>
	#include <debug.h>
	#include <lib/arch/x86/boot-cpuid.h>
	#include <lib/arch/x86/bug.h>
	#include <lib/arch/x86/descriptor-regs.h>
	#include <lib/console.h>
	#include <lib/ktrace.h>
	#include <platform.h>
	#include <stdio.h>
	#include <string.h>
	#include <trace.h>
	#include <zircon/compiler.h>
	#include <zircon/errors.h>
	#include <zircon/types.h>

	#include <new>

	#include <arch/mp.h>
	#include <arch/ops.h>
	#include <arch/x86.h>
	#include <arch/x86/apic.h>
	#include <arch/x86/descriptor.h>
	#include <arch/x86/feature.h>
	#include <arch/x86/idle_states.h>
	#include <arch/x86/interrupts.h>
	#include <arch/x86/mmu.h>
	#include <dev/hw_rng.h>
	#include <dev/interrupt.h>
	#include <hwreg/x86msr.h>
	#include <kernel/auto_preempt_disabler.h>
	#include <kernel/cpu.h>
	#include <kernel/event.h>
	#include <kernel/timer.h>

	#define LOCAL_TRACE 0

	// Enable/disable ktraces local to this file.
	#define LOCAL_KTRACE_ENABLE 0 \|\| LOCAL_TRACE

	using LocalTraceDuration =
	TraceDuration<TraceEnabled<LOCAL_KTRACE_ENABLE>, KTRACE_GRP_SCHEDULER, TraceContext::Cpu>;

	struct x86_percpu* ap_percpus;
	uint8_t x86_num_cpus = 1;
	static bool use_monitor = false;

	extern struct idt _idt;

	#if __has_feature(safe_stack)
	static uint8_t unsafe_kstack[PAGE_SIZE] __ALIGNED(16);
	#define unsafe_kstack_end (&unsafe_kstack[sizeof(unsafe_kstack)])
	#else
	#define unsafe_kstack_end nullptr
	#endif

	// Fake monitor to use until smp is initialized. The size of
	// the memory range doesn't matter, since it won't actually get
	// used in a non-smp environment.
	volatile uint8_t fake_monitor;

	// Also set up a fake table of idle states.
	x86_idle_states_t fake_supported_idle_states = {
	.states = {X86_CSTATE_C1(0)},
	.default_state_mask = kX86IdleStateMaskC1Only,
	};
	X86IdleStates fake_idle_states = X86IdleStates(&fake_supported_idle_states);

	// Pre-initialize the per cpu structure for the boot cpu. Referenced by
	// early boot code prior to being able to initialize via code.
	struct x86_percpu bp_percpu = {
	.direct = &bp_percpu,
	.current_thread = {},

	.stack_guard = {},
	.kernel_unsafe_sp = (uintptr_t)unsafe_kstack_end,
	.saved_user_sp = {},

	.blocking_disallowed = {},
	.monitor = &fake_monitor,
	.halt_interlock = {},
	.idle_states = &fake_idle_states,

	// Start with an invalid ID until we know the local APIC is set up.
	.apic_id = INVALID_APIC_ID,

	.gpf_return_target = {},

	.cpu_num = 0,
	.num_spinlocks = 0,
	.last_user_aspace = nullptr,

	.high_level_percpu = {},

	.default_tss = {},
	.interrupt_stacks = {},
	};

	zx_status_t x86_allocate_ap_structures(uint32_t* apic_ids, uint8_t cpu_count) {
	ASSERT(ap_percpus == nullptr);

	DEBUG_ASSERT(cpu_count >= 1);
	if (cpu_count == 0) {
	return ZX_ERR_INVALID_ARGS;
	}

	if (cpu_count > 1) {
	size_t len = sizeof(ap_percpus) (cpu_count - 1);
	ap_percpus = (x86_percpu*)memalign(MAX_CACHE_LINE, len);
	if (ap_percpus == nullptr) {
	return ZX_ERR_NO_MEMORY;
	}
	memset(ap_percpus, 0, len);

	use_monitor = arch::BootCpuid<arch::CpuidFeatureFlagsC>().monitor() &&
	arch::BootCpuidSupports<arch::CpuidMonitorMwaitB>() &&
	!x86_get_microarch_config()->idle_prefer_hlt;
	if (use_monitor) {
	auto monitor_size = static_cast<uint16_t>(
	arch::BootCpuid<arch::CpuidMonitorMwaitB>().largest_monitor_line_size());
	if (monitor_size < MAX_CACHE_LINE) {
	monitor_size = MAX_CACHE_LINE;
	}
	uint8_t* monitors = (uint8_t)memalign(monitor_size, monitor_size cpu_count);
	if (monitors == nullptr) {
	return ZX_ERR_NO_MEMORY;
	}
	bp_percpu.monitor = monitors;
	for (uint i = 1; i < cpu_count; ++i) {
	ap_percpus[i - 1].monitor = monitors + (i * monitor_size);
	}

	uint16_t idle_states_size = sizeof(X86IdleStates);
	if (idle_states_size < MAX_CACHE_LINE) {
	idle_states_size = MAX_CACHE_LINE;
	}
	X86IdleStates* idle_states =
	static_cast<X86IdleStates>(memalign(idle_states_size, idle_states_size cpu_count));
	if (idle_states == nullptr) {
	return ZX_ERR_NO_MEMORY;
	}
	const x86_idle_states_t* supported_idle_states = x86_get_idle_states();
	bp_percpu.idle_states = idle_states;
	// Placement new the BP idle-states table.
	new (bp_percpu.idle_states) X86IdleStates(supported_idle_states);
	for (uint i = 1; i < cpu_count; ++i) {
	ap_percpus[i - 1].idle_states = reinterpret_cast<X86IdleStates*>(
	reinterpret_cast<uintptr_t>(idle_states) + (i * idle_states_size));
	// Placement new the other idle-states tables.
	new (ap_percpus[i - 1].idle_states) X86IdleStates(supported_idle_states);
	}
	}
	}

	uint32_t bootstrap_ap = apic_local_id();
	DEBUG_ASSERT(bootstrap_ap == apic_bsp_id());

	uint apic_idx = 0;
	for (uint i = 0; i < cpu_count; ++i) {
	if (apic_ids[i] == bootstrap_ap) {
	continue;
	}
	DEBUG_ASSERT(apic_idx != (uint)(cpu_count - 1));
	if (apic_idx == (uint)cpu_count - 1) {
	/* Never found bootstrap CPU in apic id list */
	return ZX_ERR_BAD_STATE;
	}
	ap_percpus[apic_idx].cpu_num = apic_idx + 1;
	ap_percpus[apic_idx].apic_id = apic_ids[i];
	ap_percpus[apic_idx].direct = &ap_percpus[apic_idx];
	apic_idx++;
	}

	x86_num_cpus = cpu_count;
	return ZX_OK;
	}

	static struct x86_percpu* x86_percpu_for(cpu_num_t cpu_num) {
	return (cpu_num == 0) ? &bp_percpu : &ap_percpus[cpu_num - 1];
	}

	void x86_init_percpu(cpu_num_t cpu_num) {
	struct x86_percpu* const percpu = x86_percpu_for(cpu_num);
	DEBUG_ASSERT(percpu->cpu_num == cpu_num);
	DEBUG_ASSERT(percpu->direct == percpu);

	// Assembly code has already set up %gs.base so that this function's
	// own code can use it implicitly for stack-protector or safe-stack.
	DEBUG_ASSERT(read_msr(X86_MSR_IA32_GS_BASE) == (uintptr_t)percpu);

	/* set the KERNEL_GS_BASE MSR to 0 */
	/* when we enter user space, this will be populated via a swapgs */
	write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0);

	x86_feature_early_init_percpu();

	x86_extended_register_init();
	x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_SSE);
	x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_AVX);

	// This can be turned on/off later by the user. Turn it on here so that
	// the buffer size assumes it's on.
	x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_PT);
	// But then set the default mode to off.
	x86_set_extended_register_pt_state(false);

	gdt_load(gdt_get());

	// Disable the LDT so userspace cannot make
	// segment selectors that point to it.
	// See fxbug.dev/79060
	arch::DisableLdt();

	x86_initialize_percpu_tss();

	// Setup the post early boot IDT
	if (cpu_num == 0) {
	idt_setup(&_idt);
	// Setup alternate stacks to guarantee stack sanity when handling these
	// interrupts
	idt_set_ist_index(&_idt, X86_INT_NMI, NMI_IST_INDEX);
	idt_set_ist_index(&_idt, X86_INT_MACHINE_CHECK, MCE_IST_INDEX);
	idt_set_ist_index(&_idt, X86_INT_DOUBLE_FAULT, DBF_IST_INDEX);
	idt_load(&_idt);
	} else {
	// Load the read-only IDT setup on arch initialization.
	idt_load(idt_get_readonly());
	}

	/* load the syscall entry point */
	write_msr(X86_MSR_IA32_LSTAR, (uint64_t)&x86_syscall);

	/* set the STAR MSR to load the appropriate kernel code selector on syscall
	* and the appropriate user code selector on return.
	* on syscall entry the following are loaded into segment registers:
	* CS = CODE_64_SELECTOR (STAR[47:32])
	* SS = DATA_SELECTOR (STAR[47:32] + 0x8)
	* on syscall exit:
	* CS = USER_CODE_64_SELECTOR (STAR[63:48] + 0x16)
	* SS = USER_DATA_SELECTOR (STAR[63:48] + 0x8)
	*/
	write_msr(X86_MSR_IA32_STAR,
	(uint64_t)USER_CODE_SELECTOR << 48 \| (uint64_t)CODE_64_SELECTOR << 32);

	// Set the FMASK register to mask off certain bits in RFLAGS on syscall
	// entry. See docs/kernel_invariants.md.
	uint64_t mask = X86_FLAGS_AC \| /* disable alignment check/access control (this
	* prevents ring 0 from performing data access
	* to ring 3 if SMAP is available) */
	X86_FLAGS_NT \| /* clear nested task */
	X86_FLAGS_IOPL_MASK \| /* set iopl to 0 */
	X86_FLAGS_STATUS_MASK; /* clear all status flags, interrupt disabled, trap flag */
	write_msr(X86_MSR_IA32_FMASK, mask);

	// Apply the same mask to our current flags, to ensure that flags are
	// set to known-good values, because some flags may be inherited by
	// later kernel threads. We do this just in case any bad values were
	// left behind by firmware or the bootloader.
	x86_restore_flags(x86_save_flags() & ~mask);

	/* enable syscall instruction */
	uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER);
	efer_msr \|= X86_EFER_SCE;
	write_msr(X86_MSR_IA32_EFER, efer_msr);

	uint64_t cr4 = x86_get_cr4();
	// Enable {rd,wr}{fs,gs}base instructions.
	if (x86_feature_test(X86_FEATURE_FSGSBASE)) {
	cr4 \|= X86_CR4_FSGSBASE;
	}
	if (x86_feature_test(X86_FEATURE_UMIP)) {
	cr4 \|= X86_CR4_UMIP;
	}
	x86_set_cr4(cr4);

	// Store the processor number in IA32_TSC_AUX, so RDTSCP/RDP can efficiently get the current CPU
	// from userspace.
	if (x86_feature_test(X86_FEATURE_RDTSCP)) {
	write_msr(X86_MSR_IA32_TSC_AUX, cpu_num);
	}

	switch (x86_vendor) {
	case X86_VENDOR_INTEL:
	x86_intel_init_percpu();
	break;
	case X86_VENDOR_AMD:
	x86_amd_init_percpu();
	break;
	default:
	break;
	}

	arch::ApplyX86ErrataWorkarounds(arch::BootCpuidIo{}, hwreg::X86MsrIo{});
	}

	void x86_set_local_apic_id(uint32_t apic_id) {
	struct x86_percpu* percpu = x86_get_percpu();
	DEBUG_ASSERT(percpu->cpu_num == 0);
	percpu->apic_id = apic_id;
	}

	int x86_apic_id_to_cpu_num(uint32_t apic_id) {
	if (bp_percpu.apic_id == apic_id) {
	return (int)bp_percpu.cpu_num;
	}

	for (uint i = 0; i < (uint)x86_num_cpus - 1; ++i) {
	if (ap_percpus[i].apic_id == apic_id) {
	return (int)ap_percpus[i].cpu_num;
	}
	}
	return -1;
	}

	void arch_mp_reschedule(cpu_mask_t mask) {
	thread_lock.AssertHeld();

	cpu_mask_t needs_ipi = 0;
	if (use_monitor) {
	while (mask) {
	cpu_num_t cpu_id = lowest_cpu_set(mask);
	cpu_mask_t cpu_mask = cpu_num_to_mask(cpu_id);
	struct x86_percpu* percpu = cpu_id ? &ap_percpus[cpu_id - 1] : &bp_percpu;

	// When a cpu see that it is about to start the idle thread, it sets its own
	// monitor flag. When a cpu is rescheduling another cpu, if it sees the monitor flag
	// set, it can clear the flag to wake up the other cpu w/o an IPI. When the other
	// cpu wakes up, the idle thread sees the cleared flag and preempts itself. Both of
	// these operations are under the scheduler lock, so there are no races where the
	// wrong signal can be sent.
	uint8_t old_val = *percpu->monitor;
	*percpu->monitor = 0;
	if (!old_val) {
	needs_ipi \|= cpu_mask;
	}
	mask &= ~cpu_mask;
	}
	} else {
	needs_ipi = mask;
	// We are attempting to wake the set up CPUs in \|mask\| and cause them to schedule a new thread.
	// A target CPU spins for a short time before execuing halt; before it spins, it sets the
	// \|halt_interlock\| flag to '1'. Before a target CPU executes the halt instruction, it sets
	// the \|halt_interlock\| flag to '2' and skips the halt if the flag was cleared while spinning.
	// Try to clear the \|halt_interlock\| flag from 1 -> 0. If we do so, we can skip sending an
	// IPI and prevent an unnecessary halt instruction.
	while (mask) {
	cpu_num_t cpu_id = lowest_cpu_set(mask);
	cpu_mask_t cpu_mask = cpu_num_to_mask(cpu_id);
	struct x86_percpu* percpu = cpu_id ? &ap_percpus[cpu_id - 1] : &bp_percpu;
	uint32_t expect_spin = 1;
	bool did_fast_wakeup = percpu->halt_interlock.compare_exchange_strong(expect_spin, 0);
	if (did_fast_wakeup) {
	needs_ipi &= ~cpu_mask;
	}
	mask &= ~cpu_mask;
	}
	}

	if (needs_ipi) {
	arch_mp_send_ipi(MP_IPI_TARGET_MASK, needs_ipi, MP_IPI_RESCHEDULE);
	}
	}

	void arch_prepare_current_cpu_idle_state(bool idle) {
	thread_lock.AssertHeld();

	if (use_monitor) {
	*x86_get_percpu()->monitor = idle;
	}
	}

	__NO_RETURN int arch_idle_thread_routine(void*) {
	struct x86_percpu* percpu = x86_get_percpu();
	if (use_monitor) {
	for (;;) {
	AutoPreemptDisabler preempt_disabled;
	bool rsb_maybe_empty = false;
	while (*percpu->monitor && !Thread::Current::preemption_state().preempts_pending()) {
	X86IdleState* next_state = percpu->idle_states->PickIdleState();
	rsb_maybe_empty \|= x86_intel_idle_state_may_empty_rsb(next_state);
	LocalTraceDuration trace{"idle"_stringref, next_state->MwaitHint(), 0u};
	x86_monitor(percpu->monitor);
	// Check percpu->monitor in case it was cleared between the first check and
	// the monitor being armed. Any writes after arming the monitor will trigger
	// it and cause mwait to return, so there aren't races after this check.
	if (*percpu->monitor && !Thread::Current::preemption_state().preempts_pending()) {
	auto start = current_time();
	x86_mwait(next_state->MwaitHint());
	auto duration = zx_time_sub_time(current_time(), start);

	percpu->idle_states->RecordDuration(duration);
	next_state->RecordDuration(duration);
	next_state->CountEntry();
	}
	}
	// Spectre V2: If we enter a deep sleep state, fill the RSB before RET-ing from this function.
	// (CVE-2017-5715, see Intel "Deep Dive: Retpoline: A Branch Target Injection Mitigation").
	if (x86_cpu_vulnerable_to_rsb_underflow() & rsb_maybe_empty) {
	x86_ras_fill();
	}
	Thread::Current::Reschedule();
	// Pending preemptions handled here as preempt_disabled goes out of scope.
	}
	} else {
	for (;;) {
	AutoPreemptDisabler preempt_disabled;
	// Set the halt_interlock flag and spin for a little bit, in case a wakeup happens very
	// shortly before we decide to go to sleep. If the halt_interlock flag is changed, another CPU
	// has woken us, avoid the halt instruction.
	LocalTraceDuration trace{"idle"_stringref};
	constexpr int kPauseIterations = 3000;
	uint32_t halt_interlock_spinning = 1;
	percpu->halt_interlock.store(1, ktl::memory_order_relaxed);
	for (int i = 0;
	i < kPauseIterations && !Thread::Current::preemption_state().preempts_pending(); i++) {
	arch::Yield();
	if (percpu->halt_interlock.load(ktl::memory_order_relaxed) != 1) {
	break;
	}
	}
	// Compare-exchange halt_interlock from 1 -> 2, to indicate we are no longer spinning.
	// If the halt_interlock flag was changed, another CPU must have done it; avoid HLT and
	// switch to a new runnable thread. Otherwise, setting it to '2' re-enables reschedule
	// IPIs.
	bool no_fast_wakeup =
	percpu->halt_interlock.compare_exchange_strong(halt_interlock_spinning, 2);
	if (no_fast_wakeup && !Thread::Current::preemption_state().preempts_pending()) {
	arch_disable_ints();
	if (!Thread::Current::preemption_state().preempts_pending()) {
	x86_enable_ints_and_hlt();
	} else {
	// Re-enable interrupts if a reschedule IPI, timer tick, or other PreemptSetPending
	// happened and we didn't call x86_idle.
	arch_enable_ints();
	}
	}
	Thread::Current::Reschedule();
	// Pending preemptions handled here as preempt_disabled goes out of scope.
	}
	}
	}

	void arch_mp_send_ipi(mp_ipi_target_t target, cpu_mask_t mask, mp_ipi_t ipi) {
	uint8_t vector = 0;
	switch (ipi) {
	case MP_IPI_GENERIC:
	vector = X86_INT_IPI_GENERIC;
	break;
	case MP_IPI_RESCHEDULE:
	vector = X86_INT_IPI_RESCHEDULE;
	break;
	case MP_IPI_INTERRUPT:
	vector = X86_INT_IPI_INTERRUPT;
	break;
	case MP_IPI_HALT:
	vector = X86_INT_IPI_HALT;
	break;
	default:
	panic("Unexpected MP IPI value: %u", static_cast<uint32_t>(ipi));
	}

	switch (target) {
	case MP_IPI_TARGET_ALL_BUT_LOCAL:
	apic_send_broadcast_ipi(vector, DELIVERY_MODE_FIXED);
	break;
	case MP_IPI_TARGET_ALL:
	apic_send_broadcast_self_ipi(vector, DELIVERY_MODE_FIXED);
	break;
	case MP_IPI_TARGET_MASK:
	apic_send_mask_ipi(vector, mask, DELIVERY_MODE_FIXED);
	break;
	default:
	panic("Unexpected MP IPI target: %u", static_cast<uint32_t>(target));
	}
	}

	void x86_ipi_halt_handler(void*) {
	printf("halting cpu %u\n", arch_curr_cpu_num());

	platform_halt_cpu();

	for (;;) {
	x86_cli();
	x86_hlt();
	}
	}

	// Forcibly stops all other CPUs except the current one and the BSP (which is
	// cpu 0)
	void x86_force_halt_all_but_local_and_bsp(void) {
	cpu_num_t self = arch_curr_cpu_num();
	for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
	if (i == self) {
	continue;
	}
	uint32_t dst_apic_id = ap_percpus[i - 1].apic_id;
	apic_send_ipi(0, static_cast<uint8_t>(dst_apic_id), DELIVERY_MODE_INIT);
	}
	}

	zx_status_t arch_mp_prep_cpu_unplug(cpu_num_t cpu_id) {
	if (cpu_id == 0 \|\| cpu_id >= x86_num_cpus) {
	return ZX_ERR_INVALID_ARGS;
	}
	return ZX_OK;
	}

	zx_status_t arch_mp_cpu_unplug(cpu_num_t cpu_id) {
	/* we do not allow unplugging the bootstrap processor */
	if (cpu_id == 0 \|\| cpu_id >= x86_num_cpus) {
	return ZX_ERR_INVALID_ARGS;
	}

	uint32_t dst_apic_id = ap_percpus[cpu_id - 1].apic_id;
	if (dst_apic_id == INVALID_APIC_ID) {
	/* This is a transient state that can occur during CPU onlining */
	return ZX_ERR_UNAVAILABLE;
	}

	DEBUG_ASSERT(dst_apic_id < UINT8_MAX);
	apic_send_ipi(0, (uint8_t)dst_apic_id, DELIVERY_MODE_INIT);
	return ZX_OK;
	}

	zx_status_t arch_mp_cpu_hotplug(cpu_num_t cpu_id) {
	if (cpu_id >= x86_num_cpus) {
	return ZX_ERR_INVALID_ARGS;
	}
	if (mp_is_cpu_online(cpu_id)) {
	return ZX_ERR_BAD_STATE;
	}
	DEBUG_ASSERT(cpu_id != 0);
	if (cpu_id == 0) {
	/* We shouldn't be able to shutoff the bootstrap CPU, so
	* no reason to be able to bring it back via this route. */
	return ZX_ERR_INVALID_ARGS;
	}

	struct x86_percpu* percpu = &ap_percpus[cpu_id - 1];
	DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID);
	return x86_bringup_aps(&percpu->apic_id, 1);
	}

	/* Used to suspend work on a CPU until it is further shutdown */
	void arch_flush_state_and_halt(Event* flush_done) {
	DEBUG_ASSERT(arch_ints_disabled());

	__asm__ volatile("wbinvd" : : : "memory");

	Thread::Current::Get()->preemption_state().PreemptDisable();
	flush_done->Signal();
	while (1) {
	__asm__ volatile("cli; hlt" : : : "memory");
	}
	}

	void arch_setup_percpu(cpu_num_t cpu_num, struct percpu* percpu) {
	x86_percpu* arch_percpu = x86_percpu_for(cpu_num);
	DEBUG_ASSERT(arch_percpu != nullptr);
	DEBUG_ASSERT(arch_percpu->high_level_percpu == nullptr \|\|
	arch_percpu->high_level_percpu == percpu);
	arch_percpu->high_level_percpu = percpu;
	}

	static void reset_idle_counters(X86IdleStates* idle_states) {
	for (unsigned i = 0; i < idle_states->NumStates(); ++i) {
	idle_states->States()[i].ResetCounters();
	}
	}

	static void report_idlestates(cpu_num_t cpu_num, const X86IdleStates& idle_states) {
	printf("CPU %u:\n", cpu_num);
	const X86IdleState* states = idle_states.ConstStates();
	for (unsigned i = 0; i < idle_states.NumStates(); ++i) {
	const auto& state = states[i];
	printf(" %4s (MWAIT %02X): %lu entries, %lu ns avg duration (%ld ns total)\n", state.Name(),
	state.MwaitHint(), state.TimesEntered(),
	state.TimesEntered() > 0 ? state.CumulativeDuration() / (state.TimesEntered()) : 0l,
	state.CumulativeDuration());
	}
	}

	static int cmd_idlestates(int argc, const cmd_args* argv, uint32_t flags) {
	if (argc < 2) {
	usage:
	printf("Usage: %s (printstats \| resetstats \| setmask)\n", argv[0].str);
	return ZX_ERR_INVALID_ARGS;
	}
	if (!use_monitor) {
	printf("%s is only supported on systems with X86_FEATURE_MON\n", argv[0].str);
	return ZX_ERR_NOT_SUPPORTED;
	}
	if (!strcmp(argv[1].str, "resetstats")) {
	reset_idle_counters(bp_percpu.idle_states);
	for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
	reset_idle_counters(ap_percpus[i - 1].idle_states);
	}
	} else if (!strcmp(argv[1].str, "printstats")) {
	report_idlestates(0, *bp_percpu.idle_states);
	for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
	report_idlestates(i, *ap_percpus[i - 1].idle_states);
	}
	} else if (!strcmp(argv[1].str, "setmask")) {
	if (argc < 3) {
	printf("Usage: %s setmask $mask\n", argv[0].str);
	return ZX_ERR_INVALID_ARGS;
	}
	bp_percpu.idle_states->SetStateMask(static_cast<uint32_t>(argv[2].u));
	for (unsigned i = 1; i < x86_num_cpus; ++i) {
	ap_percpus[i - 1].idle_states->SetStateMask(static_cast<uint32_t>(argv[2].u));
	}
	} else {
	goto usage;
	}
	return ZX_OK;
	}

	STATIC_COMMAND_START
	STATIC_COMMAND("idlestates", "control or report on CPU idle state selection", &cmd_idlestates)
	STATIC_COMMAND_END(idlestates)