| // Copyright 2016 The Fuchsia Authors |
| // Copyright (c) 2016 Travis Geiselbrecht |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
#include <assert.h>
#include <zircon/compiler.h>
#include <debug.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <trace.h>

#include <arch/mp.h>
#include <arch/ops.h>
#include <arch/x86.h>
#include <arch/x86/apic.h>
#include <arch/x86/cpu_topology.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/interrupts.h>
#include <arch/x86/mmu.h>
#include <arch/x86/mp.h>
#include <arch/x86/tsc.h>
#include <dev/hw_rng.h>
#include <dev/interrupt.h>
#include <kernel/event.h>
#include <kernel/timer.h>
#include <platform.h>
| |
| #define LOCAL_TRACE 0 |
| |
/* Per-CPU state for every CPU other than cpu 0. Stays NULL until
 * x86_allocate_ap_structures() allocates it; the entry for cpu_num N is
 * stored at index N - 1 (cpu 0 uses bp_percpu instead). */
struct x86_percpu *ap_percpus = NULL;

/* Total number of CPUs recorded so far; 1 until
 * x86_allocate_ap_structures() succeeds with a larger count. */
uint8_t x86_num_cpus = 1;

/* IDT object defined outside this file; cpu 0 configures and loads it in
 * x86_init_percpu(). */
extern struct idt _idt;
| |
| status_t x86_allocate_ap_structures(uint32_t *apic_ids, uint8_t cpu_count) |
| { |
| ASSERT(ap_percpus == NULL); |
| |
| DEBUG_ASSERT(cpu_count >= 1); |
| if (cpu_count == 0) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (cpu_count > 1) { |
| size_t len = sizeof(*ap_percpus) * (cpu_count - 1); |
| ap_percpus = (x86_percpu *)memalign(MAX_CACHE_LINE, len); |
| if (ap_percpus == NULL) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| memset(ap_percpus, 0, len); |
| } |
| |
| uint32_t bootstrap_ap = apic_local_id(); |
| |
| uint apic_idx = 0; |
| for (uint i = 0; i < cpu_count; ++i) { |
| if (apic_ids[i] == bootstrap_ap) { |
| continue; |
| } |
| DEBUG_ASSERT(apic_idx != (uint)(cpu_count - 1)); |
| if (apic_idx == (uint)cpu_count - 1) { |
| /* Never found bootstrap CPU in apic id list */ |
| return ZX_ERR_BAD_STATE; |
| } |
| ap_percpus[apic_idx].cpu_num = apic_idx + 1; |
| ap_percpus[apic_idx].apic_id = apic_ids[i]; |
| ap_percpus[apic_idx].direct = &ap_percpus[apic_idx]; |
| apic_idx++; |
| } |
| |
| x86_num_cpus = cpu_count; |
| return ZX_OK; |
| } |
| |
| void x86_init_percpu(uint cpu_num) |
| { |
| struct x86_percpu *const percpu = |
| cpu_num == 0 ? &bp_percpu : &ap_percpus[cpu_num - 1]; |
| DEBUG_ASSERT(percpu->cpu_num == cpu_num); |
| DEBUG_ASSERT(percpu->direct == percpu); |
| |
| // Assembly code has already set up %gs.base so that this function's |
| // own code can use it implicitly for stack-protector or safe-stack. |
| DEBUG_ASSERT(read_msr(X86_MSR_IA32_GS_BASE) == (uintptr_t)percpu); |
| |
| /* set the KERNEL_GS_BASE MSR to 0 */ |
| /* when we enter user space, this will be populated via a swapgs */ |
| write_msr(X86_MSR_IA32_KERNEL_GS_BASE, 0); |
| |
| x86_feature_init(); |
| |
| x86_cpu_topology_init(); |
| x86_extended_register_init(); |
| x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_SSE); |
| x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_AVX); |
| |
| // This can be turned on/off later by the user. Turn it on here so that |
| // the buffer size assumes it's on. |
| x86_extended_register_enable_feature(X86_EXTENDED_REGISTER_PT); |
| // But then set the default mode to off. |
| x86_set_extended_register_pt_state(false); |
| |
| x86_initialize_percpu_tss(); |
| |
| // Setup the post early boot IDT |
| if (cpu_num == 0) { |
| idt_setup(&_idt); |
| // Setup alternate stacks to guarantee stack sanity when handling these |
| // interrupts |
| idt_set_ist_index(&_idt, X86_INT_NMI, NMI_IST_INDEX); |
| idt_set_ist_index(&_idt, X86_INT_MACHINE_CHECK, MCE_IST_INDEX); |
| idt_set_ist_index(&_idt, X86_INT_DOUBLE_FAULT, DBF_IST_INDEX); |
| idt_load(&_idt); |
| } else { |
| // Load the read-only IDT setup on arch initialization. |
| idt_load(idt_get_readonly()); |
| } |
| |
| // Apply any timestamp counter adjustment to keep a continuous clock across |
| // suspend/resume. |
| x86_tsc_adjust(); |
| |
| /* load the syscall entry point */ |
| write_msr(X86_MSR_IA32_LSTAR, (uint64_t)&x86_syscall); |
| |
| /* set the STAR MSR to load the appropriate kernel code selector on syscall |
| * and the appropriate user code selector on return. |
| * on syscall entry the following are loaded into segment registers: |
| * CS = CODE_64_SELECTOR (STAR[47:32]) |
| * SS = DATA_SELECTOR (STAR[47:32] + 0x8) |
| * on syscall exit: |
| * CS = USER_CODE_64_SELECTOR (STAR[63:48] + 0x16) |
| * SS = USER_DATA_SELECTOR (STAR[63:48] + 0x8) |
| */ |
| write_msr(X86_MSR_IA32_STAR, (uint64_t)USER_CODE_SELECTOR << 48 | (uint64_t)CODE_64_SELECTOR << 32); |
| |
| // Set the FMASK register to mask off certain bits in RFLAGS on syscall |
| // entry. See docs/kernel_invariants.md. |
| uint64_t mask = |
| X86_FLAGS_AC | /* disable alignment check/access control (this |
| * prevents ring 0 from performing data access |
| * to ring 3 if SMAP is available) */ |
| X86_FLAGS_NT | /* clear nested task */ |
| X86_FLAGS_IOPL_MASK | /* set iopl to 0 */ |
| X86_FLAGS_STATUS_MASK; /* clear all status flags, interrupt disabled, trap flag */ |
| write_msr(X86_MSR_IA32_FMASK, mask); |
| |
| // Apply the same mask to our current flags, to ensure that flags are |
| // set to known-good values, because some flags may be inherited by |
| // later kernel threads. We do this just in case any bad values were |
| // left behind by firmware or the bootloader. |
| x86_restore_flags(x86_save_flags() & ~mask); |
| |
| /* enable syscall instruction */ |
| uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER); |
| efer_msr |= X86_EFER_SCE; |
| write_msr(X86_MSR_IA32_EFER, efer_msr); |
| |
| // Enable {rd,wr}{fs,gs}base instructions. |
| if (x86_feature_test(X86_FEATURE_FSGSBASE)) { |
| x86_set_cr4(x86_get_cr4() | X86_CR4_FSGSBASE); |
| } |
| |
| // These intel cpus support auto-entering C1E state when all cores are at C1. In |
| // C1E state the voltage is reduced on all cores as well as clock gated. There is |
| // a latency associated with ramping the voltage on wake. Disable this feature here |
| // to save time on the irq path from idle. (5-10us on skylake nuc from kernel irq |
| // handler to user space handler). |
| // TODO(MG-981): Look for a nicer way to handle this across different processors |
| const struct x86_model_info* model = x86_get_model(); |
| if (!x86_feature_test(X86_FEATURE_HYPERVISOR) && |
| x86_vendor == X86_VENDOR_INTEL && model->display_family == 0x6 && ( |
| model->display_model == 0x1a || // nehalem |
| model->display_model == 0x1e || |
| model->display_model == 0x1f || |
| model->display_model == 0x2e || |
| model->display_model == 0x25 || // westermere |
| model->display_model == 0x2c || |
| model->display_model == 0x2f || |
| model->display_model == 0x2a || // sandy bridge |
| model->display_model == 0x2d || |
| model->display_model == 0x3a || // ivy bridge |
| model->display_model == 0x3e || |
| model->display_model == 0x3c || // haswell |
| model->display_model == 0x3f || |
| model->display_model == 0x45 || |
| model->display_model == 0x46 || |
| model->display_model == 0x3d || // broadwell |
| model->display_model == 0x47 || |
| model->display_model == 0x4f || |
| model->display_model == 0x56 || |
| model->display_model == 0x4e || // skylake |
| model->display_model == 0x5e)) { |
| uint64_t power_ctl_msr = read_msr(0x1fc); |
| write_msr(0x1fc, power_ctl_msr & ~0x2); |
| } |
| |
| mp_set_curr_cpu_online(true); |
| } |
| |
| void x86_set_local_apic_id(uint32_t apic_id) |
| { |
| struct x86_percpu *percpu = x86_get_percpu(); |
| DEBUG_ASSERT(percpu->cpu_num == 0); |
| percpu->apic_id = apic_id; |
| } |
| |
| int x86_apic_id_to_cpu_num(uint32_t apic_id) |
| { |
| if (bp_percpu.apic_id == apic_id) { |
| return (int)bp_percpu.cpu_num; |
| } |
| |
| for (uint i = 0; i < (uint)x86_num_cpus - 1; ++i) { |
| if (ap_percpus[i].apic_id == apic_id) { |
| return (int)ap_percpus[i].cpu_num; |
| } |
| } |
| return -1; |
| } |
| |
| status_t arch_mp_send_ipi(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_ipi_t ipi) |
| { |
| uint8_t vector = 0; |
| switch (ipi) { |
| case MP_IPI_GENERIC: |
| vector = X86_INT_IPI_GENERIC; |
| break; |
| case MP_IPI_RESCHEDULE: |
| vector = X86_INT_IPI_RESCHEDULE; |
| break; |
| case MP_IPI_HALT: |
| vector = X86_INT_IPI_HALT; |
| break; |
| default: |
| panic("Unexpected MP IPI value: %u", (uint)ipi); |
| } |
| |
| if (target == MP_IPI_TARGET_ALL_BUT_LOCAL) { |
| apic_send_broadcast_ipi(vector, DELIVERY_MODE_FIXED); |
| return ZX_OK; |
| } else if (target == MP_IPI_TARGET_ALL) { |
| apic_send_broadcast_self_ipi(vector, DELIVERY_MODE_FIXED); |
| return ZX_OK; |
| } |
| |
| ASSERT(x86_num_cpus <= sizeof(mask) * CHAR_BIT); |
| |
| mp_cpu_mask_t remaining = mask; |
| uint cpu_id = 0; |
| while (remaining && cpu_id < x86_num_cpus) { |
| if (remaining & 1) { |
| struct x86_percpu *percpu; |
| if (cpu_id == 0) { |
| percpu = &bp_percpu; |
| } else { |
| percpu = &ap_percpus[cpu_id - 1]; |
| } |
| /* Reschedule IPIs may occur before all CPUs are fully up. Just |
| * ignore attempts to send them to down CPUs. */ |
| if (ipi != MP_IPI_RESCHEDULE) { |
| DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID); |
| } |
| /* Make sure the CPU is actually up before sending the IPI */ |
| if (percpu->apic_id != INVALID_APIC_ID) { |
| apic_send_ipi(vector, (uint8_t)percpu->apic_id, DELIVERY_MODE_FIXED); |
| } |
| } |
| remaining >>= 1; |
| cpu_id++; |
| } |
| |
| return ZX_OK; |
| } |
| |
| enum handler_return x86_ipi_generic_handler(void) |
| { |
| LTRACEF("cpu %u\n", arch_curr_cpu_num()); |
| return mp_mbx_generic_irq(); |
| } |
| |
| enum handler_return x86_ipi_reschedule_handler(void) |
| { |
| LTRACEF("cpu %u\n", arch_curr_cpu_num()); |
| return mp_mbx_reschedule_irq(); |
| } |
| |
| void x86_ipi_halt_handler(void) |
| { |
| printf("halting cpu %u\n", arch_curr_cpu_num()); |
| |
| platform_halt_cpu(); |
| |
| for (;;) { |
| x86_cli(); |
| x86_hlt(); |
| } |
| } |
| |
| status_t arch_mp_prep_cpu_unplug(uint cpu_id) { |
| if (cpu_id == 0 || cpu_id >= x86_num_cpus) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| return ZX_OK; |
| } |
| |
| status_t arch_mp_cpu_unplug(uint cpu_id) |
| { |
| /* we do not allow unplugging the bootstrap processor */ |
| if (cpu_id == 0 || cpu_id >= x86_num_cpus) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| uint32_t dst_apic_id = ap_percpus[cpu_id - 1].apic_id; |
| if (dst_apic_id == INVALID_APIC_ID) { |
| /* This is a transient state that can occur during CPU onlining */ |
| return ZX_ERR_UNAVAILABLE; |
| } |
| |
| DEBUG_ASSERT(dst_apic_id < UINT8_MAX); |
| apic_send_ipi(0, (uint8_t)dst_apic_id, DELIVERY_MODE_INIT); |
| return ZX_OK; |
| } |
| |
| status_t arch_mp_cpu_hotplug(uint cpu_id) |
| { |
| if (cpu_id >= x86_num_cpus) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| if (mp_is_cpu_online(cpu_id)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| DEBUG_ASSERT(cpu_id != 0); |
| if (cpu_id == 0) { |
| /* We shouldn't be able to shutoff the bootstrap CPU, so |
| * no reason to be able to bring it back via this route. */ |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| struct x86_percpu *percpu = &ap_percpus[cpu_id - 1]; |
| DEBUG_ASSERT(percpu->apic_id != INVALID_APIC_ID); |
| return x86_bringup_aps(&percpu->apic_id, 1); |
| } |
| |
| /* Used to suspend work on a CPU until it is further shutdown */ |
| void arch_flush_state_and_halt(event_t *flush_done) |
| { |
| DEBUG_ASSERT(arch_ints_disabled()); |
| |
| // Enter no-fill cache mode (see Intel 3A section 11.5.3) |
| ulong cr0 = x86_get_cr0(); |
| cr0 |= X86_CR0_CD; |
| cr0 &= ~X86_CR0_NW; |
| x86_set_cr0(cr0); |
| |
| __asm__ volatile("wbinvd" : : : "memory"); |
| |
| event_signal(flush_done, false); |
| while (1) { |
| __asm__ volatile("cli; hlt" : : : "memory"); |
| } |
| } |