// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include "arch/x86/mmu.h"
#include <align.h>
#include <assert.h>
#include <lib/arch/sysreg.h>
#include <lib/arch/x86/boot-cpuid.h>
#include <lib/arch/x86/system.h>
#include <lib/counters.h>
#include <lib/id_allocator.h>
#include <lib/zircon-internal/macros.h>
#include <string.h>
#include <trace.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <new>
#include <arch/arch_ops.h>
#include <arch/x86.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/hypervisor/invalidate.h>
#include <arch/x86/mmu_mem_types.h>
#include <kernel/mp.h>
#include <vm/arch_vm_aspace.h>
#include <vm/physmap.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#define LOCAL_TRACE 0
// Count of the number of batches of TLB invalidations initiated on each CPU
KCOUNTER(tlb_invalidations_sent, "mmu.tlb_invalidation_batches_sent")
// Count of the number of batches of TLB invalidation requests received on each CPU
// Includes tlb_invalidations_full_global_received and tlb_invalidations_full_nonglobal_received
KCOUNTER(tlb_invalidations_received, "mmu.tlb_invalidation_batches_received")
// Count of the number of TLB invalidation requests received on each CPU that did not apply to it
KCOUNTER(tlb_invalidations_received_invalid, "mmu.tlb_invalidation_batches_received_invalid")
// Count of the number of TLB invalidation requests for all entries on each CPU
KCOUNTER(tlb_invalidations_full_global_received, "mmu.tlb_invalidation_full_global_received")
// Count of the number of TLB invalidation requests for all non-global entries on each CPU
KCOUNTER(tlb_invalidations_full_nonglobal_received, "mmu.tlb_invalidation_full_nonglobal_received")
// Count of the number of times an EPT TLB invalidation was performed.
KCOUNTER(ept_tlb_invalidations, "mmu.ept_tlb_invalidations")
// Count the total number of context switches on the cpu
KCOUNTER(context_switches, "mmu.context_switches")
// Count the total number of fast context switches on the cpu (using PCID feature)
KCOUNTER(context_switches_pcid, "mmu.context_switches_pcid")
/* Default virtual/physical address widths; the actual values are fetched
* from CPUID in x86_mmu_early_init() below */
static uint8_t g_max_vaddr_width = 48;
uint8_t g_max_paddr_width = 32;
/* True if the system supports 1GB pages */
static bool supports_huge_pages = false;
/* a global allocator to track which PCIDs are in use */
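/* PCIDs are 12-bit tags; ids are handed out from [1, 4095], with 0 (MMU_X86_UNUSED_PCID)
* reserved for the kernel aspace */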
using PCIDAllocator = id_allocator::IdAllocator<uint16_t, 4096, 1>;
static lazy_init::LazyInit<PCIDAllocator> pcid_allocator;
/* top level kernel page tables, initialized in start.S */
volatile pt_entry_t pml4[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
volatile pt_entry_t pdp[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); /* temporary */
volatile pt_entry_t pte[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
/* top level pdp needed to map the -512GB..0 space */
volatile pt_entry_t pdp_high[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
#if __has_feature(address_sanitizer)
volatile pt_entry_t kasan_shadow_pt[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); // Leaf page tables
volatile pt_entry_t kasan_shadow_pd[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); // Page directories
// TODO(https://fxbug.dev/42104852): Share this with the vm::zero_page
volatile uint8_t kasan_zero_page[PAGE_SIZE] __ALIGNED(PAGE_SIZE);
#endif
/* a big pile of page tables needed to map 64GB of memory into kernel space using 2MB pages */
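/* 64GB / 2MB = 32768 entries, i.e. 64 consecutive page-directory pages of 512 entries each,
* one per GB of mapped memory */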
volatile pt_entry_t linear_map_pdp[(64ULL * GB) / (2 * MB)] __ALIGNED(PAGE_SIZE);
/* which of the above variables is the top level page table */
#define KERNEL_PT pml4
// When this bit is set in the source operand of a MOV to CR3, TLB entries and paging-structure
// caches for the active PCID may be preserved. If the bit is clear, those entries are invalidated.
// See Intel Volume 3A, 4.10.4.1
#define X86_PCID_CR3_SAVE_ENTRIES (63)
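// (The context switch path below expresses this bit via arch::X86Cr3PCID::set_noflush().)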
// Static relocated base to prepare for KASLR. Used at early boot and by gdb
// script to know the target relocated address.
// TODO(thgarnie): Move to a dynamically generated base address
#if DISABLE_KASLR
uint64_t kernel_relocated_base = KERNEL_BASE - KERNEL_LOAD_OFFSET;
#else
uint64_t kernel_relocated_base = 0xffffffff00000000;
#endif
/* kernel base top level page table in physical space */
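// Computed as KERNEL_PT's offset within the kernel image plus the physical load offset.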
static const paddr_t kernel_pt_phys =
(vaddr_t)KERNEL_PT - (vaddr_t)__executable_start + KERNEL_LOAD_OFFSET;
paddr_t x86_kernel_cr3() { return kernel_pt_phys; }
/**
* @brief check if the virtual address is canonical
*/
bool x86_is_vaddr_canonical(vaddr_t vaddr) {
// If N is the number of address bits in use for a virtual address, then the
// address is canonical if bits [N - 1, 63] are all either 0 (the low half of
// the valid addresses) or 1 (the high half).
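// For example, with N = 48 the mask covers bits [47, 63]: 0x0000'7fff'ffff'ffff and
// 0xffff'8000'0000'0000 are canonical, while 0x0000'8000'0000'0000 is not.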
return ((vaddr & kX86CanonicalAddressMask) == 0) ||
((vaddr & kX86CanonicalAddressMask) == kX86CanonicalAddressMask);
}
/**
* @brief check if the virtual address is aligned and canonical
*/
static bool x86_mmu_check_vaddr(vaddr_t vaddr) {
/* Check to see if the address is PAGE aligned */
if (!IS_ALIGNED(vaddr, PAGE_SIZE))
return false;
return x86_is_vaddr_canonical(vaddr);
}
/**
* @brief check if the physical address is valid and aligned
*/
bool x86_mmu_check_paddr(paddr_t paddr) {
uint64_t max_paddr;
/* Check to see if the address is PAGE aligned */
if (!IS_ALIGNED(paddr, PAGE_SIZE))
return false;
max_paddr = (1ull << g_max_paddr_width) - 1;
return paddr <= max_paddr;
}
static void invlpg(vaddr_t addr) {
__asm__ volatile("invlpg %0" ::"m"(*reinterpret_cast<uint8_t*>(addr)));
}
struct InvpcidDescriptor {
uint64_t pcid{};
uint64_t address{};
};
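// Memory operand format for INVPCID (Intel SDM Vol. 2A): the PCID occupies bits [0, 11] of the
// first quadword (the remaining bits must be zero) and the second quadword holds the linear
// address.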
static void invpcid_va_pcid(vaddr_t addr, uint16_t pcid) {
// Mode 0 of INVPCID takes both a virtual address and a PCID, and locally shoots down
// matching non-global entries on the current CPU.
uint64_t mode = 0;
InvpcidDescriptor desc = {
.pcid = pcid,
.address = addr,
};
__asm__ volatile("invpcid %0, %1" ::"m"(desc), "r"(mode));
}
static void invpcid_pcid_all(uint16_t pcid) {
// Mode 1 of INVPCID takes only the pcid and locally shoots down all non global
// pages tagged with it on the current cpu.
uint64_t mode = 1;
InvpcidDescriptor desc = {
.pcid = pcid,
.address = 0,
};
__asm__ volatile("invpcid %0, %1" ::"m"(desc), "r"(mode));
}
static void invpcid_all_including_global() {
// Mode 2 of INVPCID shoots down all tlb entries in all pcids including global pages
// on the current cpu.
uint64_t mode = 2;
InvpcidDescriptor desc = {
.pcid = 0,
.address = 0,
};
__asm__ volatile("invpcid %0, %1" ::"m"(desc), "r"(mode));
}
static void invpcid_all_excluding_global() {
// Mode 3 of INVPCID shoots down all tlb entries in all pcids excluding global pages
// on the current cpu.
uint64_t mode = 3;
InvpcidDescriptor desc = {
.pcid = 0,
.address = 0,
};
__asm__ volatile("invpcid %0, %1" ::"m"(desc), "r"(mode));
}
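// Summary of the INVPCID types used above (Intel SDM Vol. 2A): 0 = individual address,
// 1 = single PCID context, 2 = all contexts including globals, 3 = all contexts excluding
// globals.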
/**
* @brief invalidate all TLB entries for the given PCID, excluding global entries
*/
static void x86_tlb_nonglobal_invalidate(uint16_t pcid) {
if (g_x86_feature_invpcid) {
// With INVPCID available: if given a specific PCID, invalidate just that PCID's non-global
// entries; otherwise invalidate non-global entries across all PCIDs.
if (pcid != MMU_X86_UNUSED_PCID) {
invpcid_pcid_all(pcid);
} else {
invpcid_all_excluding_global();
}
} else {
// Read CR3 and immediately write it back; this flushes all non-global TLB entries.
arch::X86Cr3::Read().Write();
}
}
/**
* @brief invalidate all TLB entries for all contexts, including global entries
*/
static void x86_tlb_global_invalidate() {
if (g_x86_feature_invpcid) {
// With INVPCID available, a single instruction flushes all entries in all PCIDs,
// including global pages.
invpcid_all_including_global();
} else {
// See Intel 3A section 4.10.4.1
auto cr4 = arch::X86Cr4::Read();
DEBUG_ASSERT(cr4.pge()); // Global pages *must* be enabled.
cr4.set_pge(false).Write();
cr4.set_pge(true).Write();
}
}
// X86PageTableMmu
bool X86PageTableMmu::check_paddr(paddr_t paddr) { return x86_mmu_check_paddr(paddr); }
bool X86PageTableMmu::check_vaddr(vaddr_t vaddr) { return x86_mmu_check_vaddr(vaddr); }
bool X86PageTableMmu::supports_page_size(PageTableLevel level) {
DEBUG_ASSERT(level != PageTableLevel::PT_L);
switch (level) {
case PageTableLevel::PD_L:
return true;
case PageTableLevel::PDP_L:
return supports_huge_pages;
case PageTableLevel::PML4_L:
return false;
default:
panic("Unreachable case in supports_page_size\n");
}
}
IntermediatePtFlags X86PageTableMmu::intermediate_flags() { return X86_MMU_PG_RW | X86_MMU_PG_U; }
PtFlags X86PageTableMmu::terminal_flags(PageTableLevel level, uint flags) {
PtFlags terminal_flags = 0;
if (flags & ARCH_MMU_FLAG_PERM_WRITE) {
terminal_flags |= X86_MMU_PG_RW;
}
if (flags & ARCH_MMU_FLAG_PERM_USER) {
terminal_flags |= X86_MMU_PG_U;
}
if (use_global_mappings_) {
terminal_flags |= X86_MMU_PG_G;
}
if (!(flags & ARCH_MMU_FLAG_PERM_EXECUTE)) {
terminal_flags |= X86_MMU_PG_NX;
}
if (level != PageTableLevel::PT_L) {
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
terminal_flags |= X86_MMU_LARGE_PAT_WRITEBACK;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
case ARCH_MMU_FLAG_UNCACHED:
terminal_flags |= X86_MMU_LARGE_PAT_UNCACHABLE;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
terminal_flags |= X86_MMU_LARGE_PAT_WRITE_COMBINING;
break;
default:
panic("Unexpected flags 0x%x\n", flags);
}
} else {
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
terminal_flags |= X86_MMU_PTE_PAT_WRITEBACK;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
case ARCH_MMU_FLAG_UNCACHED:
terminal_flags |= X86_MMU_PTE_PAT_UNCACHABLE;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
terminal_flags |= X86_MMU_PTE_PAT_WRITE_COMBINING;
break;
default:
panic("Unexpected flags 0x%x\n", flags);
}
}
return terminal_flags;
}
PtFlags X86PageTableMmu::split_flags(PageTableLevel level, PtFlags flags) {
DEBUG_ASSERT(level != PageTableLevel::PML4_L && level != PageTableLevel::PT_L);
DEBUG_ASSERT(flags & X86_MMU_PG_PS);
if (level == PageTableLevel::PD_L) {
// Note: Clear PS before the check below; the PAT bit for a PTE is the
// same as the PS bit for a higher table entry.
flags &= ~X86_MMU_PG_PS;
/* If the larger page had the PAT flag set, make sure it's
* transferred to the different index for a PTE */
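/* (Architecturally, the large-page PAT bit is bit 12, while a 4K PTE carries PAT at bit 7,
* where a PDE instead has PS; hence the swap below.) */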
if (flags & X86_MMU_PG_LARGE_PAT) {
flags &= ~X86_MMU_PG_LARGE_PAT;
flags |= X86_MMU_PG_PTE_PAT;
}
}
return flags;
}
void X86PageTableMmu::TlbInvalidate(const PendingTlbInvalidation* pending) {
AssertHeld(lock_);
if (pending->count == 0 && !pending->full_shootdown) {
return;
}
kcounter_add(tlb_invalidations_sent, 1);
const auto aspace = static_cast<X86ArchVmAspace*>(ctx());
const ulong root_ptable_phys = phys();
const uint16_t pcid = aspace->pcid();
struct TlbInvalidatePage_context {
paddr_t target_root_ptable;
const PendingTlbInvalidation* pending;
uint16_t pcid;
bool is_shared;
};
TlbInvalidatePage_context task_context = {
.target_root_ptable = root_ptable_phys,
.pending = pending,
.pcid = pcid,
.is_shared = IsShared(),
};
mp_ipi_target_t target;
cpu_mask_t target_mask = 0;
// We need to send the TLB invalidate to all CPUs if this aspace is shared because active_cpus
// is inaccurate in that case (another CPU may be running a unified aspace with these shared
// mappings).
// TODO(https://fxbug.dev/42083004): Replace this global broadcast for shared aspaces with a more
// targeted one once shared aspaces keep track of all the CPUs they are on.
if (IsShared() || pending->contains_global) {
target = MP_IPI_TARGET_ALL;
} else {
target = MP_IPI_TARGET_MASK;
// Target only CPUs this aspace is active on. It may be the case that some other CPU will
// become active in it after this load, or will have left it just before this load.
// In the absence of PCIDs there are two cases:
// 1. It is becoming active after the write to the page table, so it will see the change.
// 2. It will get a potentially spurious request to flush.
// With PCIDs we need additional handling for case (1), since an inactive CPU might have old
// entries cached and so may not see the change, and case (2) is no longer spurious. See
// additional comments in next if block.
target_mask = aspace->active_cpus();
if (g_x86_feature_pcid_enabled) {
// Only the kernel aspace uses the 0 pcid, and all its mappings are global and so would have
// forced an IPI_TARGET_ALL above.
DEBUG_ASSERT(pcid != MMU_X86_UNUSED_PCID);
// Mark all cpus as being dirty that aren't in this mask. This will force a TLB flush on the
// next context switch on that cpu.
aspace->MarkPcidDirtyCpus(~target_mask);
// At this point we have CPUs in our target_mask that we're going to IPI, and any CPUs not in
// target_mask that will at some point in the future become active and see the dirty bit.
//
// This is, however, not all CPUs, as there might be CPUs that are not in target_mask, but
// became active before we could set the dirty bit. To account for these CPUs we read
// active_cpus again and OR with the previous target_mask. It is possible that we might now
// both IPI a core and have it flush on load due to the dirty bit, however this is a very
// unlikely race condition and so will not be expensive in practice.
//
// Note that any CPU that manages to become active after we read target_mask and stop being
// active before we read it again below does not matter, since the dirty bit is still set and
// so when it eventually runs again it will still clear the PCID.
target_mask |= aspace->active_cpus();
}
}
/* Task used for invalidating a TLB entry on each CPU */
auto tlb_invalidate_page_task = [](void* raw_context) -> void {
DEBUG_ASSERT(arch_ints_disabled());
const TlbInvalidatePage_context* context = static_cast<TlbInvalidatePage_context*>(raw_context);
const paddr_t current_root_ptable = arch::X86Cr3::Read().base();
kcounter_add(tlb_invalidations_received, 1);
/* This invalidation doesn't apply to this CPU only if all of the following hold:
* - The PCID feature is not being used
* - The request doesn't contain any global pages (ie, isn't for the kernel)
* - The target aspace is different (different root page table in cr3)
* - This is not a shared mapping invalidation.
*/
if (!g_x86_feature_pcid_enabled && !context->pending->contains_global &&
context->target_root_ptable != current_root_ptable && !context->is_shared) {
tlb_invalidations_received_invalid.Add(1);
return;
}
// Handle full shootdowns of the TLB. This happens anytime full_shootdown is set or whenever
// this is a TLB invalidation of a shared entry.
if (context->pending->full_shootdown || context->is_shared) {
if (context->pending->contains_global) {
kcounter_add(tlb_invalidations_full_global_received, 1);
x86_tlb_global_invalidate();
} else {
kcounter_add(tlb_invalidations_full_nonglobal_received, 1);
if (context->is_shared) {
// The shared region runs under many different PCIDs, so instead of tracking those PCIDs
// we just invalidate all of them.
x86_tlb_nonglobal_invalidate(MMU_X86_UNUSED_PCID);
} else {
x86_tlb_nonglobal_invalidate(context->pcid);
}
}
return;
}
/* If not a full shootdown, then iterate through a list of pages and handle
* them individually.
*/
for (uint i = 0; i < context->pending->count; ++i) {
const auto& item = context->pending->item[i];
switch (static_cast<PageTableLevel>(item.page_level())) {
case PageTableLevel::PML4_L:
panic("PML4_L invld found; should not be here\n");
case PageTableLevel::PDP_L:
case PageTableLevel::PD_L:
case PageTableLevel::PT_L:
// A terminal entry is being flushed. Use the invlpg instruction if the target aspace is
// currently loaded, the page is global, or PCIDs are not in use.
if (context->target_root_ptable == current_root_ptable || item.is_global() ||
context->pcid == MMU_X86_UNUSED_PCID) {
invlpg(item.addr());
} else {
/* This is a user page tagged with a PCID that is not currently loaded; invalidate by PCID. */
invpcid_va_pcid(item.addr(), context->pcid);
}
break;
}
}
};
mp_sync_exec(target, target_mask, tlb_invalidate_page_task, &task_context);
}
uint X86PageTableMmu::pt_flags_to_mmu_flags(PtFlags flags, PageTableLevel level) {
uint mmu_flags = ARCH_MMU_FLAG_PERM_READ;
if (flags & X86_MMU_PG_RW) {
mmu_flags |= ARCH_MMU_FLAG_PERM_WRITE;
}
if (flags & X86_MMU_PG_U) {
mmu_flags |= ARCH_MMU_FLAG_PERM_USER;
}
if (!(flags & X86_MMU_PG_NX)) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
if (level != PageTableLevel::PT_L) {
switch (flags & X86_MMU_LARGE_PAT_MASK) {
case X86_MMU_LARGE_PAT_WRITEBACK:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
case X86_MMU_LARGE_PAT_UNCACHABLE:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case X86_MMU_LARGE_PAT_WRITE_COMBINING:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
default:
panic("Unexpected flags %" PRIx64, flags);
}
} else {
switch (flags & X86_MMU_PTE_PAT_MASK) {
case X86_MMU_PTE_PAT_WRITEBACK:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
case X86_MMU_PTE_PAT_UNCACHABLE:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case X86_MMU_PTE_PAT_WRITE_COMBINING:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
default:
panic("Unexpected flags %" PRIx64, flags);
}
}
return mmu_flags;
}
// X86PageTableEpt
bool X86PageTableEpt::allowed_flags(uint flags) {
if (!(flags & ARCH_MMU_FLAG_PERM_READ)) {
return false;
}
return true;
}
bool X86PageTableEpt::check_paddr(paddr_t paddr) { return x86_mmu_check_paddr(paddr); }
bool X86PageTableEpt::check_vaddr(vaddr_t vaddr) { return x86_mmu_check_vaddr(vaddr); }
bool X86PageTableEpt::supports_page_size(PageTableLevel level) {
DEBUG_ASSERT(level != PageTableLevel::PT_L);
switch (level) {
case PageTableLevel::PD_L:
return true;
case PageTableLevel::PDP_L:
return supports_huge_pages;
case PageTableLevel::PML4_L:
return false;
default:
panic("Unreachable case in supports_page_size\n");
}
}
PtFlags X86PageTableEpt::intermediate_flags() { return X86_EPT_R | X86_EPT_W | X86_EPT_X; }
PtFlags X86PageTableEpt::terminal_flags(PageTableLevel level, uint flags) {
PtFlags terminal_flags = 0;
if (flags & ARCH_MMU_FLAG_PERM_READ) {
terminal_flags |= X86_EPT_R;
}
if (flags & ARCH_MMU_FLAG_PERM_WRITE) {
terminal_flags |= X86_EPT_W;
}
if (flags & ARCH_MMU_FLAG_PERM_EXECUTE) {
terminal_flags |= X86_EPT_X;
}
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
terminal_flags |= X86_EPT_WB;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
case ARCH_MMU_FLAG_UNCACHED:
terminal_flags |= X86_EPT_UC;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
terminal_flags |= X86_EPT_WC;
break;
default:
panic("Unexpected flags 0x%x", flags);
}
return terminal_flags;
}
PtFlags X86PageTableEpt::split_flags(PageTableLevel level, PtFlags flags) {
DEBUG_ASSERT(level != PageTableLevel::PML4_L && level != PageTableLevel::PT_L);
// We don't need to relocate any flags on split for EPT.
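// EPT leaf entries keep their memory-type bits (5:3) and IPAT (bit 6) at the same positions
// for every page size, and bit 7 is ignored in a 4KiB EPT PTE, so nothing needs to move.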
return flags;
}
void X86PageTableEpt::TlbInvalidate(const PendingTlbInvalidation* pending) {
if (pending->count == 0 && !pending->full_shootdown) {
return;
}
kcounter_add(ept_tlb_invalidations, 1);
// Target all CPUs with a context invalidation since we do not know what CPUs have this EPT
// active. We cannot use active_cpus() since it is only updated by ContextSwitch, which does not get called
// for guests, and also EPT mappings persist even if a guest is not presently executing. In
// general unmap operations on EPTs should be extremely rare and not in any common path, so this
// inefficiency is not disastrous in the short term. Similarly, since this is an infrequent
// operation, we do not attempt to invalidate any individual entries, but just blow away the whole
// context.
// TODO: Track what CPUs the VCPUs using this EPT are migrated to and only IPI that subset.
uint64_t eptp = ept_pointer_from_pml4(static_cast<X86ArchVmAspace*>(ctx())->arch_table_phys());
broadcast_invept(eptp);
}
uint X86PageTableEpt::pt_flags_to_mmu_flags(PtFlags flags, PageTableLevel level) {
uint mmu_flags = 0;
if (flags & X86_EPT_R) {
mmu_flags |= ARCH_MMU_FLAG_PERM_READ;
}
if (flags & X86_EPT_W) {
mmu_flags |= ARCH_MMU_FLAG_PERM_WRITE;
}
if (flags & X86_EPT_X) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
switch (flags & X86_EPT_MEMORY_TYPE_MASK) {
case X86_EPT_WB:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
case X86_EPT_UC:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case X86_EPT_WC:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
default:
panic("Unexpected flags %" PRIx64, flags);
}
return mmu_flags;
}
// We disable analysis due to the write to |pages_| tripping it up. It is safe
// to write to |pages_| since this is part of object construction.
zx_status_t X86PageTableMmu::InitKernel(void* ctx,
page_alloc_fn_t test_paf) TA_NO_THREAD_SAFETY_ANALYSIS {
test_page_alloc_func_ = test_paf;
phys_ = kernel_pt_phys;
virt_ = (pt_entry_t*)X86_PHYS_TO_VIRT(phys_);
ctx_ = ctx;
pages_ = 1;
use_global_mappings_ = true;
return ZX_OK;
}
zx_status_t X86PageTableMmu::AliasKernelMappings() {
// Copy the kernel portion of the address space from the master kernel page table.
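// Entries [256, 512) of a PML4 cover the upper (kernel) half of the canonical address space;
// since the entries themselves are shared, later kernel mapping changes at lower levels are
// visible to every aspace.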
memcpy(virt_ + NO_OF_PT_ENTRIES / 2, const_cast<pt_entry_t*>(&KERNEL_PT[NO_OF_PT_ENTRIES / 2]),
sizeof(pt_entry_t) * NO_OF_PT_ENTRIES / 2);
return ZX_OK;
}
X86ArchVmAspace::X86ArchVmAspace(vaddr_t base, size_t size, uint mmu_flags,
page_alloc_fn_t test_paf)
: test_page_alloc_func_(test_paf), flags_(mmu_flags), base_(base), size_(size) {}
/*
* Fill in the high-level x86 arch aspace structure and allocate a top-level page table.
*/
zx_status_t X86ArchVmAspace::Init() {
canary_.Assert();
LTRACEF("aspace %p, base %#" PRIxPTR ", size 0x%zx, mmu_flags 0x%x\n", this, base_, size_,
flags_);
if (flags_ & ARCH_ASPACE_FLAG_KERNEL) {
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
zx_status_t status = mmu->InitKernel(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
LTRACEF("kernel aspace: pt phys %#" PRIxPTR ", virt %p\n", pt_->phys(), pt_->virt());
} else if (flags_ & ARCH_ASPACE_FLAG_GUEST) {
X86PageTableEpt* ept = new (&page_table_storage_.ept) X86PageTableEpt();
pt_ = ept;
zx_status_t status = ept->Init(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
LTRACEF("guest paspace: pt phys %#" PRIxPTR ", virt %p\n", pt_->phys(), pt_->virt());
} else {
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
if (g_x86_feature_pcid_enabled) {
zx_status_t status = AllocatePCID();
if (status != ZX_OK) {
return status;
}
}
zx_status_t status = mmu->Init(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
status = mmu->AliasKernelMappings();
if (status != ZX_OK) {
return status;
}
LTRACEF("user aspace: pt phys %#" PRIxPTR ", virt %p, pcid %#hx\n", pt_->phys(), pt_->virt(),
pcid_);
}
return ZX_OK;
}
zx_status_t X86ArchVmAspace::InitRestricted() {
canary_.Assert();
// Restricted ArchVmAspaces are only allowed with user address spaces.
DEBUG_ASSERT(flags_ == 0);
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
if (g_x86_feature_pcid_enabled) {
zx_status_t status = AllocatePCID();
if (status != ZX_OK) {
return status;
}
}
zx_status_t status = mmu->InitRestricted(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
status = mmu->AliasKernelMappings();
if (status != ZX_OK) {
return status;
}
LTRACEF("user restricted aspace: pt phys %#" PRIxPTR ", virt %p, pcid %#hx\n", pt_->phys(),
pt_->virt(), pcid_);
return ZX_OK;
}
zx_status_t X86ArchVmAspace::InitShared() {
canary_.Assert();
// Shared ArchVmAspaces are only allowed with user address spaces.
DEBUG_ASSERT(flags_ == 0);
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
if (g_x86_feature_pcid_enabled) {
zx_status_t status = AllocatePCID();
if (status != ZX_OK) {
return status;
}
}
zx_status_t status = mmu->InitShared(this, base_, size_, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
status = mmu->AliasKernelMappings();
if (status != ZX_OK) {
return status;
}
LTRACEF("user shared aspace: pt phys %#" PRIxPTR ", virt %p, pcid %#hx\n", pt_->phys(),
pt_->virt(), pcid_);
return ZX_OK;
}
zx_status_t X86ArchVmAspace::InitUnified(ArchVmAspaceInterface& shared,
ArchVmAspaceInterface& restricted) {
canary_.Assert();
// Unified ArchVmAspaces are only allowed with user address spaces.
DEBUG_ASSERT(flags_ == 0);
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
if (g_x86_feature_pcid_enabled) {
zx_status_t status = AllocatePCID();
if (status != ZX_OK) {
return status;
}
}
X86ArchVmAspace& sharedX86 = static_cast<X86ArchVmAspace&>(shared);
X86ArchVmAspace& restrictedX86 = static_cast<X86ArchVmAspace&>(restricted);
// Validate that the shared and restricted aspaces are correctly initialized. As this can only
// be done on MMU aspaces, it also tells us it is safe to cast.
ASSERT(sharedX86.pt_->IsShared());
ASSERT(restrictedX86.pt_->IsRestricted());
zx_status_t status =
mmu->InitUnified(this, static_cast<X86PageTableMmu*>(sharedX86.pt_), sharedX86.base_,
sharedX86.size_, static_cast<X86PageTableMmu*>(restrictedX86.pt_),
restrictedX86.base_, restrictedX86.size_, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
status = mmu->AliasKernelMappings();
if (status != ZX_OK) {
return status;
}
LTRACEF("user aspace: pt phys %#" PRIxPTR ", virt %p, pcid %#hx\n", pt_->phys(), pt_->virt(),
pcid_);
return ZX_OK;
}
zx_status_t X86ArchVmAspace::AllocatePCID() {
DEBUG_ASSERT(g_x86_feature_pcid_enabled);
zx::result<uint16_t> result = pcid_allocator->TryAlloc();
if (result.is_error()) {
// TODO(https://fxbug.dev/42075323): Implement some kind of PCID recycling.
LTRACEF("X86: ran out of PCIDs when assigning new aspace\n");
return ZX_ERR_NO_RESOURCES;
}
pcid_ = result.value();
DEBUG_ASSERT(pcid_ != MMU_X86_UNUSED_PCID && pcid_ < 4096);
// Start off with all cpus marked as dirty so the first context switch on any cpu
// invalidates the entire PCID when it's loaded.
MarkPcidDirtyCpus(CPU_MASK_ALL);
return ZX_OK;
}
X86ArchVmAspace::~X86ArchVmAspace() {
if (pt_) {
pt_->~X86PageTableBase();
}
// TODO(https://fxbug.dev/42105844): check that we've destroyed the aspace.
}
zx_status_t X86ArchVmAspace::Destroy() {
canary_.Assert();
DEBUG_ASSERT(active_cpus_.load() == 0);
if (flags_ & ARCH_ASPACE_FLAG_GUEST) {
static_cast<X86PageTableEpt*>(pt_)->Destroy(base_, size_);
} else {
static_cast<X86PageTableMmu*>(pt_)->Destroy(base_, size_);
if (pcid_ != MMU_X86_UNUSED_PCID) {
auto result = pcid_allocator->Free(pcid_);
DEBUG_ASSERT(result.is_ok());
}
}
return ZX_OK;
}
zx_status_t X86ArchVmAspace::Unmap(vaddr_t vaddr, size_t count, EnlargeOperation enlarge,
size_t* unmapped) {
DEBUG_ASSERT(!pt_->IsUnified());
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->UnmapPages(vaddr, count, enlarge, unmapped);
MarkAspaceModified();
return result;
}
zx_status_t X86ArchVmAspace::MapContiguous(vaddr_t vaddr, paddr_t paddr, size_t count,
uint mmu_flags, size_t* mapped) {
DEBUG_ASSERT(!pt_->IsUnified());
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->MapPagesContiguous(vaddr, paddr, count, mmu_flags, mapped);
MarkAspaceModified();
return result;
}
zx_status_t X86ArchVmAspace::Map(vaddr_t vaddr, paddr_t* phys, size_t count, uint mmu_flags,
ExistingEntryAction existing_action, size_t* mapped) {
DEBUG_ASSERT(ENABLE_PAGE_FAULT_UPGRADE || existing_action != ExistingEntryAction::Upgrade);
DEBUG_ASSERT(!pt_->IsUnified());
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->MapPages(vaddr, phys, count, mmu_flags, existing_action, mapped);
MarkAspaceModified();
return result;
}
zx_status_t X86ArchVmAspace::Protect(vaddr_t vaddr, size_t count, uint mmu_flags,
EnlargeOperation enlarge) {
DEBUG_ASSERT(!pt_->IsUnified());
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->ProtectPages(vaddr, count, mmu_flags);
MarkAspaceModified();
return result;
}
void X86ArchVmAspace::ContextSwitch(X86ArchVmAspace* old_aspace, X86ArchVmAspace* aspace) {
DEBUG_ASSERT(arch_ints_disabled());
cpu_mask_t cpu_bit = cpu_num_to_mask(arch_curr_cpu_num());
context_switches.Add(1);
if (aspace != nullptr) {
// Switching to another user aspace
aspace->canary_.Assert();
paddr_t phys = aspace->pt_phys();
LTRACEF_LEVEL(3, "switching to aspace %p, pt %#" PRIXPTR "\n", aspace, phys);
if (old_aspace != nullptr) {
[[maybe_unused]] uint32_t prev = old_aspace->active_cpus_.fetch_and(~cpu_bit);
// Make sure we were actually previously running on this CPU.
DEBUG_ASSERT(prev & cpu_bit);
}
// Set ourselves as active on this CPU prior to clearing the dirty bit. This ensures that TLB
// invalidation code will either see us as active, and know to IPI us, or we will see the dirty
// bit and clear the tlb here. See comment in X86PageTableMmu::TlbInvalidate for more details.
[[maybe_unused]] uint32_t prev = aspace->active_cpus_.fetch_or(cpu_bit);
// Should not already be running on this CPU.
DEBUG_ASSERT(!(prev & cpu_bit));
// Load the new cr3, add in the pcid if it's supported
if (aspace->pcid_ != MMU_X86_UNUSED_PCID) {
DEBUG_ASSERT(g_x86_feature_pcid_enabled);
DEBUG_ASSERT(aspace->pcid_ < 4096);
arch::X86Cr3PCID cr3;
// If the new aspace is marked as dirty for this cpu, force a TLB invalidate
// when loading the new cr3. Clear the dirty flag while we're at it. If
// another cpu sets the dirty flag after this point but before we load the cr3
// and invalidate it, we'll at most end up with an extraneous dirty flag set.
const cpu_mask_t dirty_mask = aspace->pcid_dirty_cpus_.fetch_and(~cpu_bit);
if (dirty_mask & cpu_bit) {
// This is a double negative, and noflush=0 -> flush.
cr3.set_noflush(0);
} else {
cr3.set_noflush(1);
context_switches_pcid.Add(1);
}
cr3.set_base(phys);
cr3.set_pcid(aspace->pcid_ & 0xfff);
cr3.Write();
} else {
arch::X86Cr3::Write(phys);
}
aspace->active_since_last_check_.store(true, ktl::memory_order_relaxed);
// If we are switching to a unified aspace, we need to mark the associated shared and
// restricted aspaces as active since the last check as well.
if (aspace->IsUnified()) {
// Being a unified aspace implies it is an MMU type.
X86PageTableMmu* aspace_pt = static_cast<X86PageTableMmu*>(aspace->pt_);
X86ArchVmAspace* shared = static_cast<X86ArchVmAspace*>(aspace_pt->get_shared_pt()->ctx());
X86ArchVmAspace* restricted =
static_cast<X86ArchVmAspace*>(aspace_pt->get_restricted_pt()->ctx());
shared->active_since_last_check_.store(true, ktl::memory_order_relaxed);
restricted->active_since_last_check_.store(true, ktl::memory_order_relaxed);
}
} else {
// Switching to the kernel aspace
LTRACEF_LEVEL(3, "switching to kernel aspace, pt %#" PRIxPTR "\n", kernel_pt_phys);
// Write the kernel top level page table. Note: even when using PCID we do not
// need to do anything special here since we are intrinsically loading PCID 0 with
// the noflush bit clear which is fine since the kernel uses global pages.
arch::X86Cr3::Write(kernel_pt_phys);
if (old_aspace != nullptr) {
[[maybe_unused]] uint32_t prev = old_aspace->active_cpus_.fetch_and(~cpu_bit);
// Make sure we were actually previously running on this CPU
DEBUG_ASSERT(prev & cpu_bit);
}
}
// Cleanup io bitmap entries from previous thread.
if (old_aspace)
x86_clear_tss_io_bitmap(old_aspace->io_bitmap());
// Set the io bitmap for this thread.
if (aspace)
x86_set_tss_io_bitmap(aspace->io_bitmap());
}
zx_status_t X86ArchVmAspace::Query(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) {
DEBUG_ASSERT(!pt_->IsUnified());
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
return pt_->QueryVaddr(vaddr, paddr, mmu_flags);
}
zx_status_t X86ArchVmAspace::HarvestAccessed(vaddr_t vaddr, size_t count,
NonTerminalAction non_terminal_action,
TerminalAction terminal_action) {
DEBUG_ASSERT(!pt_->IsUnified());
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
return pt_->HarvestAccessed(vaddr, count, non_terminal_action, terminal_action);
}
bool X86ArchVmAspace::ActiveSinceLastCheck(bool clear) {
// Read whether any CPUs are presently executing in this aspace.
bool currently_active = active_cpus_.load(ktl::memory_order_relaxed) != 0;
// Exchange the current notion of active with the previously recorded value. This is the only
// time a |false| value can potentially be written to active_since_last_check_, and doing an
// exchange means we can never 'lose' a |true| value.
bool previously_active =
clear ? active_since_last_check_.exchange(currently_active, ktl::memory_order_relaxed)
: active_since_last_check_.load(ktl::memory_order_relaxed);
// Return whether we had previously been active. It is not necessary to also consider whether we
// are currently active, since activating would also have set active_since_last_check_ to true. In the
// scenario where we race and currently_active is true, but we observe previously_active to be
// false, this means that as of the start of this function ::ContextSwitch had not completed, and
// so this aspace is still not actually active.
return previously_active;
}
vaddr_t X86ArchVmAspace::PickSpot(vaddr_t base, vaddr_t end, vaddr_t align, size_t size,
uint mmu_flags) {
canary_.Assert();
return PAGE_ALIGN(base);
}
uint32_t arch_address_tagging_features() { return 0; }
void x86_mmu_early_init() {
x86_mmu_percpu_init();
x86_mmu_mem_type_init();
// Unmap the lower identity mapping.
pml4[0] = 0;
// As we are still in early init code we cannot use the general page invalidation mechanisms,
// specifically ones that might use mp_sync_exec or kcounters, so just drop the entire tlb.
x86_tlb_global_invalidate();
/* get the address widths from the CPU */
auto vaddr_width =
static_cast<uint8_t>(arch::BootCpuid<arch::CpuidAddressSizeInfo>().linear_addr_bits());
auto paddr_width =
static_cast<uint8_t>(arch::BootCpuid<arch::CpuidAddressSizeInfo>().phys_addr_bits());
supports_huge_pages = x86_feature_test(X86_FEATURE_HUGE_PAGE);
/* if we got something meaningful, override the defaults.
* some CPU configurations on certain emulators seem to return
* nonsense paddr widths (1), so ignore anything below the defaults. */
if (paddr_width > g_max_paddr_width) {
g_max_paddr_width = paddr_width;
}
if (vaddr_width > g_max_vaddr_width) {
g_max_vaddr_width = vaddr_width;
}
LTRACEF("paddr_width %u vaddr_width %u\n", g_max_paddr_width, g_max_vaddr_width);
pcid_allocator.Initialize();
}
void x86_mmu_init() {
printf("MMU: max physical address bits %u max virtual address bits %u\n", g_max_paddr_width,
g_max_vaddr_width);
if (g_x86_feature_pcid_enabled) {
printf("MMU: Using PCID + INVPCID\n");
} else if (g_x86_feature_invpcid) {
printf("MMU: Using INVPCID\n");
}
ASSERT_MSG(g_max_vaddr_width >= kX86VAddrBits,
"Maximum number of virtual address bits (%u) is less than the assumed number of bits"
" being used (%u)\n",
g_max_vaddr_width, kX86VAddrBits);
}
void x86_mmu_feature_init() {
// Use of PCID is detected late, and on the boot CPU this detection happens after
// x86_mmu_percpu_init, so we enable it again here. For other CPUs, and when coming in and
// out of suspend, it will happen correctly in x86_mmu_percpu_init.
arch::X86Cr4 cr4 = arch::X86Cr4::Read();
cr4.set_pcide(g_x86_feature_pcid_enabled);
cr4.Write();
}
void x86_mmu_percpu_init() {
arch::X86Cr0::Read()
.set_wp(true) // Set write protect.
.set_nw(false) // Clear not-write-through.
.set_cd(false) // Clear cache-disable.
.Write();
// Set or clear the SMEP & SMAP & PCIDE bits in CR4 based on features we've detected.
// Make sure global pages are enabled.
arch::X86Cr4 cr4 = arch::X86Cr4::Read();
cr4.set_smep(x86_feature_test(X86_FEATURE_SMEP));
cr4.set_smap(g_x86_feature_has_smap);
cr4.set_pcide(g_x86_feature_pcid_enabled);
cr4.set_pge(true);
cr4.Write();
// Set NXE bit in X86_MSR_IA32_EFER.
uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER);
efer_msr |= X86_EFER_NXE;
write_msr(X86_MSR_IA32_EFER, efer_msr);
}