// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2016 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <assert.h>
#include <debug.h>
#include <err.h>
#include <limits.h>
#include <malloc.h>
#include <stdio.h>
#include <string.h>
#include <trace.h>
#include <zircon/compiler.h>
#include <arch/mp.h>
#include <arch/ops.h>
#include <arch/x86.h>
#include <arch/x86/apic.h>
#include <arch/x86/cpu_topology.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/interrupts.h>
#include <arch/x86/mmu.h>
#include <arch/x86/mp.h>
#include <arch/x86/tsc.h>
#include <dev/hw_rng.h>
#include <dev/interrupt.h>
#include <kernel/event.h>
#include <kernel/timer.h>
#include <platform.h>
#include <zircon/types.h>
#define LOCAL_TRACE 0
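
// Per-cpu state for the application processors (APs). The bootstrap processor
// (BSP) uses bp_percpu; the AP with cpu number N lives at ap_percpus[N - 1].
// x86_num_cpus counts all cpus, including the BSP.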
struct x86_percpu* ap_percpus;
uint8_t x86_num_cpus = 1;
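
// True when MONITOR/MWAIT is used for idle and reschedule signaling; set in
// x86_allocate_ap_structures() when the cpu reports X86_FEATURE_MON.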
static bool use_monitor = false;
extern struct idt _idt;
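
// Allocates per-cpu structures (and, when MONITOR/MWAIT is supported, one
// monitor line per cpu) for cpu_count cpus, including the BSP. Returns
// ZX_ERR_BAD_STATE if the BSP's APIC id does not appear in apic_ids.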
zx_status_t x86_allocate_ap_structures(uint32_t* apic_ids, uint8_t cpu_count) {
ASSERT(ap_percpus == nullptr);
DEBUG_ASSERT(cpu_count >= 1);
if (cpu_count == 0) {
return ZX_ERR_INVALID_ARGS;
}
if (cpu_count > 1) {
size_t len = sizeof(*ap_percpus) * (cpu_count - 1);
ap_percpus = (x86_percpu*)memalign(MAX_CACHE_LINE, len);
if (ap_percpus == nullptr) {
return ZX_ERR_NO_MEMORY;
}
memset(ap_percpus, 0, len);
if ((use_monitor = x86_feature_test(X86_FEATURE_MON))) {
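            // CPUID leaf 5 (MONITOR/MWAIT) reports the largest monitor-line size
            // in EBX[15:0]. Round it up to a full cache line so that no two cpus'
            // monitors share a line.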
uint16_t monitor_size = x86_get_cpuid_leaf(X86_CPUID_MON)->b & 0xffff;
if (monitor_size < MAX_CACHE_LINE) {
monitor_size = MAX_CACHE_LINE;
}
uint8_t* monitors = (uint8_t*)memalign(monitor_size, monitor_size * cpu_count);
if (monitors == nullptr) {
return ZX_ERR_NO_MEMORY;
}
bp_percpu.monitor = monitors;
for (uint i = 1; i < cpu_count; ++i) {
ap_percpus[i - 1].monitor = monitors + (i * monitor_size);
}
}
}
    uint32_t bsp_apic_id = apic_local_id();
    DEBUG_ASSERT(bsp_apic_id == apic_bsp_id());
    uint apic_idx = 0;
    for (uint i = 0; i < cpu_count; ++i) {
        if (apic_ids[i] == bsp_apic_id) {
continue;
}
DEBUG_ASSERT(apic_idx != (uint)(cpu_count - 1));
if (apic_idx == (uint)cpu_count - 1) {
/* Never found bootstrap CPU in apic id list */
return ZX_ERR_BAD_STATE;
}
ap_percpus[apic_idx].cpu_num = apic_idx + 1;
ap_percpus[apic_idx].apic_id = apic_ids[i];
ap_percpus[apic_idx].direct = &ap_percpus[apic_idx];
apic_idx++;
}
x86_num_cpus = cpu_count;
return ZX_OK;
}
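
// Performs per-cpu architectural initialization for |cpu_num|: cpu features and
// extended register state, GDT/TSS/IDT, the syscall MSRs (LSTAR/STAR/FMASK),
// CR4 feature bits, and optional C1E disablement, then marks the cpu online.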
void x86_init_percpu(cpu_num_t cpu_num) {
struct x86_percpu* const percpu =
cpu_num == 0 ? &bp_percpu : &ap_percpus[cpu_num - 1];
DEBUG_ASSERT(percpu->cpu_num == cpu_num);
DEBUG_ASSERT(percpu->direct == percpu);
// Assembly code has already set up %gs.base so that this function's
// own code can use it implicitly for stack-protector or safe-stack.
DEBUG_ASSERT(read_msr(X86_MSR_IA32_GS_BASE) == (uintptr_t)percpu);
/* set the KERNEL_GS_BASE MSR to 0 */
/* when we enter user space, this will be populated via a swapgs */
write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0);
x86_feature_init();
x86_cpu_topology_init();
x86_extended_register_init();
x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_SSE);
x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_AVX);
// This can be turned on/off later by the user. Turn it on here so that
// the buffer size assumes it's on.
x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_PT);
// But then set the default mode to off.
x86_set_extended_register_pt_state(false);
gdt_load(gdt_get());
x86_initialize_percpu_tss();
    // Set up the post-early-boot IDT.
if (cpu_num == 0) {
idt_setup(&_idt);
        // Set up alternate stacks to guarantee stack sanity when handling
        // these interrupts.
idt_set_ist_index(&_idt, X86_INT_NMI, NMI_IST_INDEX);
idt_set_ist_index(&_idt, X86_INT_MACHINE_CHECK, MCE_IST_INDEX);
idt_set_ist_index(&_idt, X86_INT_DOUBLE_FAULT, DBF_IST_INDEX);
idt_load(&_idt);
} else {
        // Load the read-only IDT set up during arch initialization.
idt_load(idt_get_readonly());
}
// Apply any timestamp counter adjustment to keep a continuous clock across
// suspend/resume.
x86_tsc_adjust();
/* load the syscall entry point */
write_msr(X86_MSR_IA32_LSTAR, (uint64_t)&x86_syscall);
    /* Set the STAR MSR to load the appropriate kernel code selector on syscall
     * and the appropriate user code selector on return.
     * On syscall entry the following are loaded into the segment registers:
     *   CS = CODE_64_SELECTOR      (STAR[47:32])
     *   SS = DATA_SELECTOR         (STAR[47:32] + 0x8)
     * On syscall exit (sysret to 64-bit mode):
     *   CS = USER_CODE_64_SELECTOR (STAR[63:48] + 0x10)
     *   SS = USER_DATA_SELECTOR    (STAR[63:48] + 0x8)
     */
write_msr(X86_MSR_IA32_STAR, (uint64_t)USER_CODE_SELECTOR << 48 | (uint64_t)CODE_64_SELECTOR << 32);
// Set the FMASK register to mask off certain bits in RFLAGS on syscall
// entry. See docs/kernel_invariants.md.
uint64_t mask =
X86_FLAGS_AC | /* disable alignment check/access control (this
* prevents ring 0 from performing data access
* to ring 3 if SMAP is available) */
X86_FLAGS_NT | /* clear nested task */
X86_FLAGS_IOPL_MASK | /* set iopl to 0 */
        X86_FLAGS_STATUS_MASK; /* clear all status flags, the interrupt flag, and the trap flag */
write_msr(X86_MSR_IA32_FMASK, mask);
// Apply the same mask to our current flags, to ensure that flags are
// set to known-good values, because some flags may be inherited by
// later kernel threads. We do this just in case any bad values were
// left behind by firmware or the bootloader.
x86_restore_flags(x86_save_flags() & ~mask);
/* enable syscall instruction */
uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER);
efer_msr |= X86_EFER_SCE;
write_msr(X86_MSR_IA32_EFER, efer_msr);
uint64_t cr4 = x86_get_cr4();
// Enable {rd,wr}{fs,gs}base instructions.
if (x86_feature_test(X86_FEATURE_FSGSBASE)) {
cr4 |= X86_CR4_FSGSBASE;
}
if (x86_feature_test(X86_FEATURE_UMIP)) {
cr4 |= X86_CR4_UMIP;
}
x86_set_cr4(cr4);
    // Some Intel cpus support automatically entering the C1E state when all
    // cores are in C1. In C1E the voltage is reduced on all cores and the cores
    // are clock gated, and there is a latency cost to ramp the voltage back up
    // on wake. Disable this feature here to save time on the irq path from idle
    // (5-10us on a Skylake NUC from kernel irq handler to user space handler).
if (!x86_feature_test(X86_FEATURE_HYPERVISOR) &&
x86_get_microarch_config()->disable_c1e) {
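        // 0x1fc is the Intel POWER_CTL MSR; per Intel's MSR documentation,
        // clearing bit 1 disables C1E promotion. (The raw numeric values are
        // used directly here; no named constants are defined in this file.)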
uint64_t power_ctl_msr = read_msr(0x1fc);
write_msr(0x1fc, power_ctl_msr & ~0x2);
}
mp_set_curr_cpu_online(true);
}
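
// Records the boot cpu's local APIC id once the APIC has been initialized.
// Only valid on the BSP (cpu_num 0).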
void x86_set_local_apic_id(uint32_t apic_id) {
struct x86_percpu* percpu = x86_get_percpu();
DEBUG_ASSERT(percpu->cpu_num == 0);
percpu->apic_id = apic_id;
}
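
// Translates a local APIC id into a kernel cpu number, or returns -1 if no
// known cpu has that APIC id.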
int x86_apic_id_to_cpu_num(uint32_t apic_id) {
if (bp_percpu.apic_id == apic_id) {
return (int)bp_percpu.cpu_num;
}
for (uint i = 0; i < (uint)x86_num_cpus - 1; ++i) {
if (ap_percpus[i].apic_id == apic_id) {
return (int)ap_percpus[i].cpu_num;
}
}
return -1;
}
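
// Requests a reschedule on the cpus in |mask|. When MONITOR/MWAIT is in use, an
// idle cpu is woken by clearing its monitor flag; an IPI is sent only to cpus
// that were not waiting on their monitor.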
zx_status_t arch_mp_reschedule(cpu_mask_t mask) {
DEBUG_ASSERT(thread_lock_held());
cpu_mask_t needs_ipi = 0;
if (use_monitor) {
while (mask) {
cpu_num_t cpu_id = lowest_cpu_set(mask);
cpu_mask_t cpu_mask = cpu_num_to_mask(cpu_id);
struct x86_percpu* percpu = cpu_id ? &ap_percpus[cpu_id - 1] : &bp_percpu;
            // When a cpu sees that it is about to start the idle thread, it sets its own
            // monitor flag. When a cpu is rescheduling another cpu, if it sees the monitor
            // flag set, it can clear the flag to wake up the other cpu without an IPI. When
            // the other cpu wakes up, the idle thread sees the cleared flag and preempts
            // itself. Both of these operations are under the scheduler lock, so there are
            // no races where the wrong signal can be sent.
uint8_t old_val = *percpu->monitor;
*percpu->monitor = 0;
if (!old_val) {
needs_ipi |= cpu_mask;
}
mask &= ~cpu_mask;
}
} else {
needs_ipi = mask;
}
return needs_ipi ? arch_mp_send_ipi(MP_IPI_TARGET_MASK, needs_ipi, MP_IPI_RESCHEDULE) : ZX_OK;
}
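
// Publishes this cpu's idle state through its monitor line (when MONITOR/MWAIT
// is in use) so that other cpus can wake it without an IPI. Called with the
// thread lock held.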
void arch_prepare_current_cpu_idle_state(bool idle) {
DEBUG_ASSERT(thread_lock_held());
if (use_monitor) {
*x86_get_percpu()->monitor = idle;
}
}
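
// The idle loop: MWAIT on the per-cpu monitor line when available, otherwise
// fall back to x86_idle().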
__NO_RETURN int arch_idle_thread_routine(void*) {
if (use_monitor) {
struct x86_percpu* percpu = x86_get_percpu();
for (;;) {
while (*percpu->monitor) {
x86_monitor(percpu->monitor);
// Check percpu->monitor in case it was cleared between the first check and
// the monitor being armed. Any writes after arming the monitor will trigger
// it and cause mwait to return, so there aren't races after this check.
if (*percpu->monitor) {
x86_mwait();
}
}
thread_preempt();
}
} else {
for (;;) {
x86_idle();
}
}
}
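
// Delivers the interrupt vector corresponding to |ipi| to the cpus selected by
// |target| and |mask| via the local APIC.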
zx_status_t arch_mp_send_ipi(mp_ipi_target_t target, cpu_mask_t mask, mp_ipi_t ipi) {
uint8_t vector = 0;
switch (ipi) {
case MP_IPI_GENERIC:
vector = X86_INT_IPI_GENERIC;
break;
case MP_IPI_RESCHEDULE:
vector = X86_INT_IPI_RESCHEDULE;
break;
case MP_IPI_INTERRUPT:
vector = X86_INT_IPI_INTERRUPT;
break;
case MP_IPI_HALT:
vector = X86_INT_IPI_HALT;
break;
default:
panic("Unexpected MP IPI value: %u", (uint)ipi);
}
if (target == MP_IPI_TARGET_ALL_BUT_LOCAL) {
apic_send_broadcast_ipi(vector, DELIVERY_MODE_FIXED);
return ZX_OK;
} else if (target == MP_IPI_TARGET_ALL) {
apic_send_broadcast_self_ipi(vector, DELIVERY_MODE_FIXED);
return ZX_OK;
}
ASSERT(x86_num_cpus <= sizeof(mask) * CHAR_BIT);
cpu_mask_t remaining = mask;
uint cpu_id = 0;
while (remaining && cpu_id < x86_num_cpus) {
if (remaining & 1) {
struct x86_percpu* percpu;
if (cpu_id == 0) {
percpu = &bp_percpu;
} else {
percpu = &ap_percpus[cpu_id - 1];
}
/* Reschedule IPIs may occur before all CPUs are fully up. Just
* ignore attempts to send them to down CPUs. */
if (ipi != MP_IPI_RESCHEDULE) {
DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID);
}
/* Make sure the CPU is actually up before sending the IPI */
if (percpu->apic_id != INVALID_APIC_ID) {
apic_send_ipi(vector, (uint8_t)percpu->apic_id, DELIVERY_MODE_FIXED);
}
}
remaining >>= 1;
cpu_id++;
}
return ZX_OK;
}
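
// Handler for MP_IPI_HALT: parks this cpu. If platform_halt_cpu() returns, spin
// forever with interrupts disabled.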
void x86_ipi_halt_handler(void*) {
printf("halting cpu %u\n", arch_curr_cpu_num());
platform_halt_cpu();
for (;;) {
x86_cli();
x86_hlt();
}
}

// Forcibly halts all cpus except the current one and the BSP (which is cpu 0).
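// An IPI with INIT delivery mode (the vector is ignored) puts the target core
// into a wait-for-SIPI state, stopping it without any cooperation from the
// software running on it.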
void x86_force_halt_all_but_local_and_bsp(void) {
cpu_num_t self = arch_curr_cpu_num();
for (cpu_num_t i = 1; i < x86_num_cpus; ++i) {
if (i == self) {
continue;
}
uint32_t dst_apic_id = ap_percpus[i - 1].apic_id;
apic_send_ipi(0, static_cast<uint8_t>(dst_apic_id),
DELIVERY_MODE_INIT);
}
}
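
// Validates that |cpu_id| names an AP that may be taken offline; the BSP
// (cpu 0) cannot be unplugged.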
zx_status_t arch_mp_prep_cpu_unplug(uint cpu_id) {
if (cpu_id == 0 || cpu_id >= x86_num_cpus) {
return ZX_ERR_INVALID_ARGS;
}
return ZX_OK;
}
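
// Takes the AP |cpu_id| offline by sending it an INIT IPI.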
zx_status_t arch_mp_cpu_unplug(uint cpu_id) {
/* we do not allow unplugging the bootstrap processor */
if (cpu_id == 0 || cpu_id >= x86_num_cpus) {
return ZX_ERR_INVALID_ARGS;
}
uint32_t dst_apic_id = ap_percpus[cpu_id - 1].apic_id;
if (dst_apic_id == INVALID_APIC_ID) {
/* This is a transient state that can occur during CPU onlining */
return ZX_ERR_UNAVAILABLE;
}
DEBUG_ASSERT(dst_apic_id < UINT8_MAX);
apic_send_ipi(0, (uint8_t)dst_apic_id, DELIVERY_MODE_INIT);
return ZX_OK;
}
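
// Brings a previously offline AP back online via x86_bringup_aps().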
zx_status_t arch_mp_cpu_hotplug(uint cpu_id) {
if (cpu_id >= x86_num_cpus) {
return ZX_ERR_INVALID_ARGS;
}
if (mp_is_cpu_online(cpu_id)) {
return ZX_ERR_BAD_STATE;
}
DEBUG_ASSERT(cpu_id != 0);
if (cpu_id == 0) {
/* We shouldn't be able to shutoff the bootstrap CPU, so
* no reason to be able to bring it back via this route. */
return ZX_ERR_INVALID_ARGS;
}
struct x86_percpu* percpu = &ap_percpus[cpu_id - 1];
DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID);
return x86_bringup_aps(&percpu->apic_id, 1);
}

/* Used to suspend work on a CPU until it is further shut down. */
void arch_flush_state_and_halt(event_t* flush_done) {
DEBUG_ASSERT(arch_ints_disabled());
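    // Write back and invalidate all caches so that any dirty lines reach memory
    // before this cpu stops executing.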
__asm__ volatile("wbinvd"
:
:
: "memory");
event_signal(flush_done, false);
while (1) {
__asm__ volatile("cli; hlt"
:
:
: "memory");
}
}