// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2016 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <assert.h>
#include <zircon/compiler.h>
#include <debug.h>
#include <err.h>
#include <limits.h>  // CHAR_BIT
#include <malloc.h>  // memalign (assumed to be the declaring header in this kernel's libc)
#include <stdio.h>
#include <string.h>
#include <trace.h>
#include <arch/mp.h>
#include <arch/ops.h>
#include <arch/x86.h>
#include <arch/x86/apic.h>
#include <arch/x86/cpu_topology.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/interrupts.h>
#include <arch/x86/mmu.h>
#include <arch/x86/mp.h>
#include <arch/x86/tsc.h>
#include <dev/hw_rng.h>
#include <dev/interrupt.h>
#include <kernel/event.h>
#include <kernel/timer.h>
#include <platform.h>
#define LOCAL_TRACE 0
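
// Per-CPU state for the application processors; the bootstrap processor's
// state lives in bp_percpu. Entry i of ap_percpus describes logical CPU i + 1,
// and x86_num_cpus is the total CPU count, including the bootstrap processor.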
struct x86_percpu *ap_percpus;
uint8_t x86_num_cpus = 1;
extern struct idt _idt;
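
// Allocate zeroed per-CPU structures for the application processors and assign
// them logical CPU numbers. |apic_ids| lists the local APIC ID of every CPU,
// including the bootstrap processor; |cpu_count| is the total number of CPUs.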
status_t x86_allocate_ap_structures(uint32_t *apic_ids, uint8_t cpu_count)
{
ASSERT(ap_percpus == NULL);
DEBUG_ASSERT(cpu_count >= 1);
if (cpu_count == 0) {
return ZX_ERR_INVALID_ARGS;
}
if (cpu_count > 1) {
size_t len = sizeof(*ap_percpus) * (cpu_count - 1);
ap_percpus = (x86_percpu *)memalign(MAX_CACHE_LINE, len);
if (ap_percpus == NULL) {
return ZX_ERR_NO_MEMORY;
}
memset(ap_percpus, 0, len);
}
uint32_t bootstrap_ap = apic_local_id();
uint apic_idx = 0;
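    // Walk the APIC ID list, handing out logical CPU numbers 1..cpu_count-1 to
    // the application processors. The bootstrap processor (the CPU running this
    // code) is skipped; it remains logical CPU 0.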
for (uint i = 0; i < cpu_count; ++i) {
if (apic_ids[i] == bootstrap_ap) {
continue;
}
DEBUG_ASSERT(apic_idx != (uint)(cpu_count - 1));
if (apic_idx == (uint)cpu_count - 1) {
/* Never found bootstrap CPU in apic id list */
return ZX_ERR_BAD_STATE;
}
ap_percpus[apic_idx].cpu_num = apic_idx + 1;
ap_percpus[apic_idx].apic_id = apic_ids[i];
ap_percpus[apic_idx].direct = &ap_percpus[apic_idx];
apic_idx++;
}
x86_num_cpus = cpu_count;
return ZX_OK;
}
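
// Initialize per-CPU state on the CPU that is currently executing this code:
// CPU features, extended register state, the TSS and IDT, the SYSCALL MSRs,
// and a known-good RFLAGS value.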
void x86_init_percpu(uint cpu_num)
{
struct x86_percpu *const percpu =
cpu_num == 0 ? &bp_percpu : &ap_percpus[cpu_num - 1];
DEBUG_ASSERT(percpu->cpu_num == cpu_num);
DEBUG_ASSERT(percpu->direct == percpu);
// Assembly code has already set up %gs.base so that this function's
// own code can use it implicitly for stack-protector or safe-stack.
DEBUG_ASSERT(read_msr(X86_MSR_IA32_GS_BASE) == (uintptr_t)percpu);
/* set the KERNEL_GS_BASE MSR to 0 */
/* when we enter user space, this will be populated via a swapgs */
write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0);
x86_feature_init();
x86_cpu_topology_init();
x86_extended_register_init();
x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_SSE);
x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_AVX);
    // This can be turned on/off later by the user. Turn it on here so that
    // the extended register buffer size is computed assuming it is enabled.
x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_PT);
// But then set the default mode to off.
x86_set_extended_register_pt_state(false);
x86_initialize_percpu_tss();
    // Set up the post-early-boot IDT.
if (cpu_num == 0) {
idt_setup(&_idt);
        // Set up alternate stacks to guarantee stack sanity when handling
        // these interrupts.
idt_set_ist_index(&_idt, X86_INT_NMI, NMI_IST_INDEX);
idt_set_ist_index(&_idt, X86_INT_MACHINE_CHECK, MCE_IST_INDEX);
idt_set_ist_index(&_idt, X86_INT_DOUBLE_FAULT, DBF_IST_INDEX);
idt_load(&_idt);
} else {
        // Load the read-only IDT set up during arch initialization.
idt_load(idt_get_readonly());
}
// Apply any timestamp counter adjustment to keep a continuous clock across
// suspend/resume.
x86_tsc_adjust();
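    // Program the SYSCALL/SYSRET machinery: LSTAR holds the 64-bit kernel entry
    // point, STAR the kernel/user segment selector bases, and FMASK the RFLAGS
    // bits to clear on kernel entry.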
/* load the syscall entry point */
write_msr(X86_MSR_IA32_LSTAR, (uint64_t)&x86_syscall);
/* set the STAR MSR to load the appropriate kernel code selector on syscall
* and the appropriate user code selector on return.
* on syscall entry the following are loaded into segment registers:
* CS = CODE_64_SELECTOR (STAR[47:32])
* SS = DATA_SELECTOR (STAR[47:32] + 0x8)
* on syscall exit:
     * CS = USER_CODE_64_SELECTOR (STAR[63:48] + 0x10)
* SS = USER_DATA_SELECTOR (STAR[63:48] + 0x8)
*/
write_msr(X86_MSR_IA32_STAR, (uint64_t)USER_CODE_SELECTOR << 48 | (uint64_t)CODE_64_SELECTOR << 32);
// Set the FMASK register to mask off certain bits in RFLAGS on syscall
// entry. See docs/kernel_invariants.md.
uint64_t mask =
X86_FLAGS_AC | /* disable alignment check/access control (this
* prevents ring 0 from performing data access
* to ring 3 if SMAP is available) */
X86_FLAGS_NT | /* clear nested task */
X86_FLAGS_IOPL_MASK | /* set iopl to 0 */
        X86_FLAGS_STATUS_MASK; /* clear all status flags, the interrupt enable flag, and the trap flag */
write_msr(X86_MSR_IA32_FMASK, mask);
// Apply the same mask to our current flags, to ensure that flags are
// set to known-good values, because some flags may be inherited by
// later kernel threads. We do this just in case any bad values were
// left behind by firmware or the bootloader.
x86_restore_flags(x86_save_flags() & ~mask);
/* enable syscall instruction */
uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER);
efer_msr |= X86_EFER_SCE;
write_msr(X86_MSR_IA32_EFER, efer_msr);
// Enable {rd,wr}{fs,gs}base instructions.
if (x86_feature_test(X86_FEATURE_FSGSBASE)) {
x86_set_cr4(x86_get_cr4() | X86_CR4_FSGSBASE);
}
    // These Intel CPUs can automatically enter the C1E state when all cores are
    // in C1. In C1E the core voltage is reduced and the cores are clock gated,
    // and there is latency ramping the voltage back up on wake. Disable the
    // feature here to shorten the irq path from idle (5-10us on a skylake nuc
    // from the kernel irq handler to the user space handler).
// TODO(MG-981): Look for a nicer way to handle this across different processors
const struct x86_model_info* model = x86_get_model();
if (!x86_feature_test(X86_FEATURE_HYPERVISOR) &&
x86_vendor == X86_VENDOR_INTEL && model->display_family == 0x6 && (
model->display_model == 0x1a || // nehalem
model->display_model == 0x1e ||
model->display_model == 0x1f ||
model->display_model == 0x2e ||
        model->display_model == 0x25 || // westmere
model->display_model == 0x2c ||
model->display_model == 0x2f ||
model->display_model == 0x2a || // sandy bridge
model->display_model == 0x2d ||
model->display_model == 0x3a || // ivy bridge
model->display_model == 0x3e ||
model->display_model == 0x3c || // haswell
model->display_model == 0x3f ||
model->display_model == 0x45 ||
model->display_model == 0x46 ||
model->display_model == 0x3d || // broadwell
model->display_model == 0x47 ||
model->display_model == 0x4f ||
model->display_model == 0x56 ||
model->display_model == 0x4e || // skylake
model->display_model == 0x5e)) {
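        // MSR 0x1fc is MSR_POWER_CTL; bit 1 is the C1E enable bit, so clear it.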
uint64_t power_ctl_msr = read_msr(0x1fc);
write_msr(0x1fc, power_ctl_msr & ~0x2);
}
mp_set_curr_cpu_online(true);
}
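
// Record the bootstrap processor's local APIC ID once it is known. Must only
// be called on the bootstrap processor (logical CPU 0).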
void x86_set_local_apic_id(uint32_t apic_id)
{
struct x86_percpu *percpu = x86_get_percpu();
DEBUG_ASSERT(percpu->cpu_num == 0);
percpu->apic_id = apic_id;
}
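
// Translate a local APIC ID into a logical CPU number, or return -1 if no
// known CPU has that APIC ID.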
int x86_apic_id_to_cpu_num(uint32_t apic_id)
{
if (bp_percpu.apic_id == apic_id) {
return (int)bp_percpu.cpu_num;
}
for (uint i = 0; i < (uint)x86_num_cpus - 1; ++i) {
if (ap_percpus[i].apic_id == apic_id) {
return (int)ap_percpus[i].cpu_num;
}
}
return -1;
}
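
// Send an inter-processor interrupt to the CPUs selected by |target| and
// |mask|, mapping the generic IPI type onto the x86 vector registered for it.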
status_t arch_mp_send_ipi(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_ipi_t ipi)
{
uint8_t vector = 0;
switch (ipi) {
case MP_IPI_GENERIC:
vector = X86_INT_IPI_GENERIC;
break;
case MP_IPI_RESCHEDULE:
vector = X86_INT_IPI_RESCHEDULE;
break;
case MP_IPI_HALT:
vector = X86_INT_IPI_HALT;
break;
default:
panic("Unexpected MP IPI value: %u", (uint)ipi);
}
if (target == MP_IPI_TARGET_ALL_BUT_LOCAL) {
apic_send_broadcast_ipi(vector, DELIVERY_MODE_FIXED);
return ZX_OK;
} else if (target == MP_IPI_TARGET_ALL) {
apic_send_broadcast_self_ipi(vector, DELIVERY_MODE_FIXED);
return ZX_OK;
}
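    // Targeted delivery: walk the CPU mask and send a fixed-mode IPI to each
    // selected CPU that has a valid APIC ID, i.e. has been brought up.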
ASSERT(x86_num_cpus <= sizeof(mask) * CHAR_BIT);
mp_cpu_mask_t remaining = mask;
uint cpu_id = 0;
while (remaining && cpu_id < x86_num_cpus) {
if (remaining & 1) {
struct x86_percpu *percpu;
if (cpu_id == 0) {
percpu = &bp_percpu;
} else {
percpu = &ap_percpus[cpu_id - 1];
}
/* Reschedule IPIs may occur before all CPUs are fully up. Just
* ignore attempts to send them to down CPUs. */
if (ipi != MP_IPI_RESCHEDULE) {
DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID);
}
/* Make sure the CPU is actually up before sending the IPI */
if (percpu->apic_id != INVALID_APIC_ID) {
apic_send_ipi(vector, (uint8_t)percpu->apic_id, DELIVERY_MODE_FIXED);
}
}
remaining >>= 1;
cpu_id++;
}
return ZX_OK;
}
enum handler_return x86_ipi_generic_handler(void)
{
LTRACEF("cpu %u\n", arch_curr_cpu_num());
return mp_mbx_generic_irq();
}
enum handler_return x86_ipi_reschedule_handler(void)
{
LTRACEF("cpu %u\n", arch_curr_cpu_num());
return mp_mbx_reschedule_irq();
}
void x86_ipi_halt_handler(void)
{
printf("halting cpu %u\n", arch_curr_cpu_num());
platform_halt_cpu();
for (;;) {
x86_cli();
x86_hlt();
}
}
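
// Validate a request to take a CPU offline; unplugging the bootstrap processor
// or an out-of-range CPU id is rejected.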
status_t arch_mp_prep_cpu_unplug(uint cpu_id) {
if (cpu_id == 0 || cpu_id >= x86_num_cpus) {
return ZX_ERR_INVALID_ARGS;
}
return ZX_OK;
}
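
// Take the target application processor offline by sending it an INIT IPI,
// which leaves it parked in the wait-for-SIPI state.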
status_t arch_mp_cpu_unplug(uint cpu_id)
{
/* we do not allow unplugging the bootstrap processor */
if (cpu_id == 0 || cpu_id >= x86_num_cpus) {
return ZX_ERR_INVALID_ARGS;
}
uint32_t dst_apic_id = ap_percpus[cpu_id - 1].apic_id;
if (dst_apic_id == INVALID_APIC_ID) {
/* This is a transient state that can occur during CPU onlining */
return ZX_ERR_UNAVAILABLE;
}
DEBUG_ASSERT(dst_apic_id < UINT8_MAX);
apic_send_ipi(0, (uint8_t)dst_apic_id, DELIVERY_MODE_INIT);
return ZX_OK;
}
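
// Bring a previously offline application processor back online via the AP
// bootstrap path.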
status_t arch_mp_cpu_hotplug(uint cpu_id)
{
if (cpu_id >= x86_num_cpus) {
return ZX_ERR_INVALID_ARGS;
}
if (mp_is_cpu_online(cpu_id)) {
return ZX_ERR_BAD_STATE;
}
DEBUG_ASSERT(cpu_id != 0);
if (cpu_id == 0) {
        /* We shouldn't be able to shut off the bootstrap CPU, so there is
         * no reason to be able to bring it back via this route. */
return ZX_ERR_INVALID_ARGS;
}
struct x86_percpu *percpu = &ap_percpus[cpu_id - 1];
DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID);
return x86_bringup_aps(&percpu->apic_id, 1);
}
/* Used to suspend work on a CPU until it is shut down further */
void arch_flush_state_and_halt(event_t *flush_done)
{
DEBUG_ASSERT(arch_ints_disabled());
// Enter no-fill cache mode (see Intel 3A section 11.5.3)
ulong cr0 = x86_get_cr0();
cr0 |= X86_CR0_CD;
cr0 &= ~X86_CR0_NW;
x86_set_cr0(cr0);
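    // Write back and invalidate all caches so no dirty lines are left behind
    // on a CPU that is about to go quiet.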
__asm__ volatile("wbinvd" : : : "memory");
event_signal(flush_done, false);
while (1) {
__asm__ volatile("cli; hlt" : : : "memory");
}
}