// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2014 Google Inc. All rights reserved
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include "arch/arm64/mmu.h"
#include <align.h>
#include <assert.h>
#include <bits.h>
#include <debug.h>
#include <inttypes.h>
#include <lib/arch/cache.h>
#include <lib/arch/intrin.h>
#include <lib/boot-options/boot-options.h>
#include <lib/counters.h>
#include <lib/fit/defer.h>
#include <lib/heap.h>
#include <lib/instrumentation/asan.h>
#include <lib/ktrace.h>
#include <lib/lazy_init/lazy_init.h>
#include <lib/page_cache.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <trace.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/arm64/hypervisor/el2_state.h>
#include <arch/aspace.h>
#include <kernel/auto_preempt_disabler.h>
#include <kernel/mutex.h>
#include <ktl/algorithm.h>
#include <lk/init.h>
#include <vm/arch_vm_aspace.h>
#include <vm/physmap.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#include "asid_allocator.h"
#include <ktl/enforce.h>
#define LOCAL_TRACE 0
#define TRACE_CONTEXT_SWITCH 0
/* ktraces just local to this file */
#define LOCAL_KTRACE_ENABLE 0
#define LOCAL_KTRACE(label, args...) \
KTRACE_CPU_INSTANT_ENABLE(LOCAL_KTRACE_ENABLE, "kernel:probe", label, ##args)
// Use one of the ignored bits for a software simulated accessed flag for non-terminal entries.
// TODO: Once hardware setting of the terminal AF is supported, usage of this for the non-terminal
// AF will have to become optional, as we currently rely on the software terminal fault to set the
// non-terminal bits.
#define MMU_PTE_ATTR_RES_SOFTWARE_AF BM(55, 1, 1)
// Ensure we picked a bit that is actually part of the software controlled bits.
static_assert((MMU_PTE_ATTR_RES_SOFTWARE & MMU_PTE_ATTR_RES_SOFTWARE_AF) ==
MMU_PTE_ATTR_RES_SOFTWARE_AF);
static_assert(((long)KERNEL_BASE >> MMU_KERNEL_SIZE_SHIFT) == -1, "");
static_assert(((long)KERNEL_ASPACE_BASE >> MMU_KERNEL_SIZE_SHIFT) == -1, "");
static_assert(MMU_KERNEL_SIZE_SHIFT <= 48, "");
static_assert(MMU_KERNEL_SIZE_SHIFT >= 25, "");
// Static relocated base to prepare for KASLR. Used at early boot and by the gdb
// script to know the target relocated address.
// TODO(https://fxbug.dev/42098994): Choose it randomly.
#if DISABLE_KASLR
uint64_t kernel_relocated_base = KERNEL_BASE;
#else
uint64_t kernel_relocated_base = 0xffffffff10000000;
#endif
// The main translation table for the kernel. Globally declared because it's reached
// from assembly.
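// The table is aligned to its own size in bytes (entries * 8), since the translation table base
// installed in the TTBR must be naturally aligned to the size of the top level table.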
pte_t arm64_kernel_translation_table[MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP] __ALIGNED(
MMU_KERNEL_PAGE_TABLE_ENTRIES_TOP * 8);
// Physical address of the above table, saved in start.S.
paddr_t arm64_kernel_translation_table_phys;
// Global accessor for the kernel page table
pte_t* arm64_get_kernel_ptable() { return arm64_kernel_translation_table; }
paddr_t arm64_get_kernel_ptable_phys() { return arm64_kernel_translation_table_phys; }
namespace {
// Whether ASID use is enabled.
bool feat_asid_enabled;
// Whether or not we allow break-before-make. Used in very early boot.
bool allow_bbm = false;
KCOUNTER(cm_flush_all, "mmu.consistency_manager.flush_all")
KCOUNTER(cm_flush_all_replacing, "mmu.consistency_manager.flush_all_replacing")
KCOUNTER(cm_single_tlb_invalidates, "mmu.consistency_manager.single_tlb_invalidate")
KCOUNTER(cm_flush, "mmu.consistency_manager.flush")
lazy_init::LazyInit<AsidAllocator> asid;
KCOUNTER(vm_mmu_protect_make_execute_calls, "vm.mmu.protect.make_execute_calls")
KCOUNTER(vm_mmu_protect_make_execute_pages, "vm.mmu.protect.make_execute_pages")
KCOUNTER(vm_mmu_page_table_alloc, "vm.mmu.pt.alloc")
KCOUNTER(vm_mmu_page_table_free, "vm.mmu.pt.free")
KCOUNTER(vm_mmu_page_table_reclaim, "vm.mmu.pt.reclaim")
page_cache::PageCache page_cache;
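// Allocate a single page for page table use, preferring the local page cache and falling back to
// the PMM if the cache has not been initialized yet.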
zx::result<vm_page_t*> CacheAllocPage() {
if (!page_cache) {
return Pmm::Node().AllocPage(PMM_ALLOC_FLAG_ANY);
}
zx::result result = page_cache.Allocate(1);
if (result.is_error()) {
return result.take_error();
}
vm_page_t* page = list_remove_head_type(&result->page_list, vm_page_t, queue_node);
DEBUG_ASSERT(page != nullptr);
DEBUG_ASSERT(result->page_list.is_empty());
return zx::ok(page);
}
void CacheFreePages(list_node_t* list) {
if (!page_cache) {
pmm_free(list);
return;
}
page_cache.Free(ktl::move(*list));
}
void CacheFreePage(vm_page_t* p) {
if (!page_cache) {
pmm_free_page(p);
return;
}
page_cache::PageCache::PageList list;
list_add_tail(&list, &p->queue_node);
page_cache.Free(ktl::move(list));
}
void InitializePageCache(uint32_t level) {
ASSERT(level < LK_INIT_LEVEL_THREADING);
const size_t reserve_pages = 8;
zx::result<page_cache::PageCache> result = page_cache::PageCache::Create(reserve_pages);
ASSERT(result.is_ok());
page_cache = ktl::move(result.value());
}
// Initialize the cache after the percpu data structures are initialized.
LK_INIT_HOOK(arm64_mmu_page_cache_init, InitializePageCache, LK_INIT_LEVEL_KERNEL + 1)
void enable_bbm(uint32_t level) {
dprintf(INFO, "ARM: enabling break-before-make\n");
allow_bbm = true;
}
// Enable break-before-make when splitting large pages once the VM has been initialized, which is
// when various pieces of the physmap and kernel are unmapped or have their permissions lowered.
LK_INIT_HOOK(arm64_mmu_enable_bbm, enable_bbm, LK_INIT_LEVEL_VM)
// Convert user level mmu flags to flags that go in L1 descriptors.
// Hypervisor flag modifies behavior to work for single translation regimes
// such as the mapping of kernel pages with ArmAspaceType::kHypervisor in EL2.
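// For example, following the switch statements below, a cached kernel read/write mapping
// (ARCH_MMU_FLAG_CACHED | ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE) yields
// MMU_PTE_ATTR_AF | MMU_PTE_ATTR_NORMAL_MEMORY | MMU_PTE_ATTR_SH_INNER_SHAREABLE |
// MMU_PTE_ATTR_AP_P_RW_U_NA | MMU_PTE_ATTR_UXN | MMU_PTE_ATTR_PXN.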
pte_t mmu_flags_to_s1_pte_attr(uint flags, bool hypervisor = false) {
pte_t attr = MMU_PTE_ATTR_AF;
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
attr |= MMU_PTE_ATTR_NORMAL_MEMORY | MMU_PTE_ATTR_SH_INNER_SHAREABLE;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
attr |= MMU_PTE_ATTR_NORMAL_UNCACHED | MMU_PTE_ATTR_SH_INNER_SHAREABLE;
break;
case ARCH_MMU_FLAG_UNCACHED:
attr |= MMU_PTE_ATTR_STRONGLY_ORDERED;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
attr |= MMU_PTE_ATTR_DEVICE;
break;
default:
panic("unexpected flags value 0x%x", flags);
}
switch (flags & (ARCH_MMU_FLAG_PERM_USER | ARCH_MMU_FLAG_PERM_WRITE)) {
case 0:
attr |= MMU_PTE_ATTR_AP_P_RO_U_NA;
break;
case ARCH_MMU_FLAG_PERM_WRITE:
attr |= MMU_PTE_ATTR_AP_P_RW_U_NA;
break;
case ARCH_MMU_FLAG_PERM_USER:
attr |= MMU_PTE_ATTR_AP_P_RO_U_RO;
break;
case ARCH_MMU_FLAG_PERM_USER | ARCH_MMU_FLAG_PERM_WRITE:
attr |= MMU_PTE_ATTR_AP_P_RW_U_RW;
break;
}
if (hypervisor) {
// For single translation regimes such as the hypervisor pages, only
// the XN bit applies.
if ((flags & ARCH_MMU_FLAG_PERM_EXECUTE) == 0) {
attr |= MMU_PTE_ATTR_XN;
}
} else {
if (flags & ARCH_MMU_FLAG_PERM_EXECUTE) {
if (flags & ARCH_MMU_FLAG_PERM_USER) {
// User executable page, marked privileged execute never.
attr |= MMU_PTE_ATTR_PXN;
} else {
// Privileged executable page, marked user execute never.
attr |= MMU_PTE_ATTR_UXN;
}
} else {
// All non executable pages are marked both privileged and user execute never.
attr |= MMU_PTE_ATTR_UXN | MMU_PTE_ATTR_PXN;
}
}
if (flags & ARCH_MMU_FLAG_NS) {
attr |= MMU_PTE_ATTR_NON_SECURE;
}
return attr;
}
uint s1_pte_attr_to_mmu_flags(pte_t pte, bool hypervisor = false) {
uint mmu_flags = 0;
switch (pte & MMU_PTE_ATTR_ATTR_INDEX_MASK) {
case MMU_PTE_ATTR_STRONGLY_ORDERED:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case MMU_PTE_ATTR_DEVICE:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED_DEVICE;
break;
case MMU_PTE_ATTR_NORMAL_UNCACHED:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
case MMU_PTE_ATTR_NORMAL_MEMORY:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
default:
panic("unexpected pte value %" PRIx64, pte);
}
mmu_flags |= ARCH_MMU_FLAG_PERM_READ;
switch (pte & MMU_PTE_ATTR_AP_MASK) {
case MMU_PTE_ATTR_AP_P_RW_U_NA:
mmu_flags |= ARCH_MMU_FLAG_PERM_WRITE;
break;
case MMU_PTE_ATTR_AP_P_RW_U_RW:
mmu_flags |= ARCH_MMU_FLAG_PERM_USER | ARCH_MMU_FLAG_PERM_WRITE;
break;
case MMU_PTE_ATTR_AP_P_RO_U_NA:
break;
case MMU_PTE_ATTR_AP_P_RO_U_RO:
mmu_flags |= ARCH_MMU_FLAG_PERM_USER;
break;
}
if (hypervisor) {
// Single translation regimes such as the hypervisor only support the XN bit.
if ((pte & MMU_PTE_ATTR_XN) == 0) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
} else {
// Based on whether or not this is a user page, check UXN or PXN bit to determine
// if it's an executable page.
if (mmu_flags & ARCH_MMU_FLAG_PERM_USER) {
if ((pte & MMU_PTE_ATTR_UXN) == 0) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
} else if ((pte & MMU_PTE_ATTR_PXN) == 0) {
// Privileged page, check the PXN bit.
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
// TODO: https://fxbug.dev/42169684
// Add additional asserts here that the translation table entries are correctly formed
// with regards to UXN and PXN bits and possibly other unhandled and/or ambiguous bits.
}
if (pte & MMU_PTE_ATTR_NON_SECURE) {
mmu_flags |= ARCH_MMU_FLAG_NS;
}
return mmu_flags;
}
pte_t mmu_flags_to_s2_pte_attr(uint flags) {
pte_t attr = MMU_PTE_ATTR_AF;
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
attr |= MMU_S2_PTE_ATTR_NORMAL_MEMORY | MMU_PTE_ATTR_SH_INNER_SHAREABLE;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
attr |= MMU_S2_PTE_ATTR_NORMAL_UNCACHED | MMU_PTE_ATTR_SH_INNER_SHAREABLE;
break;
case ARCH_MMU_FLAG_UNCACHED:
attr |= MMU_S2_PTE_ATTR_STRONGLY_ORDERED;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
attr |= MMU_S2_PTE_ATTR_DEVICE;
break;
default:
panic("unexpected flags value 0x%x", flags);
}
if (flags & ARCH_MMU_FLAG_PERM_WRITE) {
attr |= MMU_S2_PTE_ATTR_S2AP_RW;
} else {
attr |= MMU_S2_PTE_ATTR_S2AP_RO;
}
if (!(flags & ARCH_MMU_FLAG_PERM_EXECUTE)) {
attr |= MMU_S2_PTE_ATTR_XN;
}
return attr;
}
uint s2_pte_attr_to_mmu_flags(pte_t pte) {
uint mmu_flags = 0;
switch (pte & MMU_S2_PTE_ATTR_ATTR_INDEX_MASK) {
case MMU_S2_PTE_ATTR_STRONGLY_ORDERED:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case MMU_S2_PTE_ATTR_DEVICE:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED_DEVICE;
break;
case MMU_S2_PTE_ATTR_NORMAL_UNCACHED:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
case MMU_S2_PTE_ATTR_NORMAL_MEMORY:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
default:
panic("unexpected pte value %" PRIx64, pte);
}
mmu_flags |= ARCH_MMU_FLAG_PERM_READ;
switch (pte & MMU_PTE_ATTR_AP_MASK) {
case MMU_S2_PTE_ATTR_S2AP_RO:
break;
case MMU_S2_PTE_ATTR_S2AP_RW:
mmu_flags |= ARCH_MMU_FLAG_PERM_WRITE;
break;
default:
panic("unexpected pte value %" PRIx64, pte);
}
if (!(pte & MMU_S2_PTE_ATTR_XN)) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
return mmu_flags;
}
bool is_pte_valid(pte_t pte) {
return (pte & MMU_PTE_DESCRIPTOR_MASK) != MMU_PTE_DESCRIPTOR_INVALID;
}
void update_pte(volatile pte_t* pte, pte_t newval) { *pte = newval; }
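// Returns the index of the first entry in |page_table| that is not the INVALID descriptor, or -1
// if the table is empty.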
int first_used_page_table_entry(const volatile pte_t* page_table, uint page_size_shift) {
const unsigned int count = 1U << (page_size_shift - 3);
for (unsigned int i = 0; i < count; i++) {
pte_t pte = page_table[i];
if (pte != MMU_PTE_DESCRIPTOR_INVALID) {
// Although the descriptor isn't exactly the INVALID value, it might have been corrupted and
// also not a valid entry. Some forms of corruption are indistinguishable from valid entries,
// so this is really just checking for scenarios where the low type bits got set to INVALID,
// but the rest of the entry did not.
//
// TODO(https://fxbug.dev/42159319): Once https://fxbug.dev/42159319 is resolved this can be
// removed.
ASSERT_MSG(is_pte_valid(pte),
"page_table at %p has malformed invalid entry %#" PRIx64 " at %u\n", page_table,
pte, i);
return i;
}
}
return -1;
}
ArmAspaceType AspaceTypeFromFlags(uint mmu_flags) {
// Kernel/Guest flags are mutually exclusive. Ensure at most 1 is set.
DEBUG_ASSERT(((mmu_flags & ARCH_ASPACE_FLAG_KERNEL) != 0) +
((mmu_flags & ARCH_ASPACE_FLAG_GUEST) != 0) <=
1);
if (mmu_flags & ARCH_ASPACE_FLAG_KERNEL) {
return ArmAspaceType::kKernel;
}
if (mmu_flags & ARCH_ASPACE_FLAG_GUEST) {
return ArmAspaceType::kGuest;
}
return ArmAspaceType::kUser;
}
ktl::string_view ArmAspaceTypeName(ArmAspaceType type) {
switch (type) {
case ArmAspaceType::kKernel:
return "kernel";
case ArmAspaceType::kUser:
return "user";
case ArmAspaceType::kGuest:
return "guest";
case ArmAspaceType::kHypervisor:
return "hypervisor";
}
__UNREACHABLE;
}
} // namespace
// A consistency manager that tracks TLB updates, walker syncs and free pages in an effort to
// minimize DSBs (by delaying and coalescing TLB invalidations) and switching to full ASID
// invalidations if too many TLB invalidations are requested.
// The aspace lock *must* be held over the full operation of the ConsistencyManager, from
// construction to deletion. The lock must be held continuously until deletion, and specifically
// until the actual TLB invalidations occur, due to the strategy employed here of only invalidating
// vaddrs with changing entries, and not all vaddrs an operation applies to. Otherwise the following
// scenario is possible:
// 1. Thread 1 performs an Unmap and removes PTE entries, but drops the lock prior to invalidation.
// 2. Thread 2 performs an Unmap, no PTE entries are removed, no invalidations occur
// 3. Thread 2 now believes the resources (pages) for the region are no longer accessible, and
// returns them to the pmm.
// 4. Thread 3 attempts to access this region and is now able to read/write to returned pages as
// invalidations have not occurred.
// This scenario is possible as the mappings here are not the source of truth of resource
// management, but a cache of information from other parts of the system. If thread 2 wanted to
// guarantee that the pages were free it could issue its own TLB invalidations for the vaddr range,
// even though it found no entries. However this is not the strategy employed here at the moment.
class ArmArchVmAspace::ConsistencyManager {
public:
ConsistencyManager(ArmArchVmAspace& aspace) TA_REQ(aspace.lock_) : aspace_(aspace) {}
~ConsistencyManager() {
Flush();
if (!list_is_empty(&to_free_)) {
CacheFreePages(&to_free_);
}
}
void MapEntry(vaddr_t va, bool terminal) {
// We do not need to sync the walker, despite writing a new entry, as this is a
// non-terminal entry and so is irrelevant to the walker anyway.
if (!terminal) {
return;
}
// If we're mapping in the kernel aspace we may access the page shortly. DSB to make sure the
// page table walker sees it and ISB to keep the cpu from prefetching through this point.
// We do not need to do this for user pages since there will be a synchronization event before
// returning back to user space; in the case of performing a user_copy to the newly mapped page
// after this mapping, at worst there will be an extraneous page fault.
if (aspace_.type_ == ArmAspaceType::kKernel) {
__dsb(ARM_MB_ISHST);
isb_pending_ = true;
}
}
// Queue a TLB entry for flushing. This may get turned into a complete ASID flush, or even a
// complete TLB (all ASID) flush if the associated aspace is a shared one.
void FlushEntry(vaddr_t va, bool terminal) {
// Check whether we have already queued too many entries.
if (num_pending_tlbs_ >= kMaxPendingTlbs) {
// Most of the time we will now prefer to invalidate the entire ASID, the exception is if
// this aspace is using the global ASID, since we cannot perform a global TLB invalidation
// for all ASIDs. Note that there is an instruction to invalidate the entire TLB, but it is
// only available in EL2, and we are in EL1.
if (aspace_.asid_ != MMU_ARM64_GLOBAL_ASID) {
// Keep counting entries so that we can track how many TLB invalidates we saved by grouping.
num_pending_tlbs_++;
return;
}
// Flush what pages we've cached up until now and reset counter to zero.
Flush();
}
// va must be page aligned so we can safely throw away the bottom bit.
DEBUG_ASSERT(IS_PAGE_ALIGNED(va));
DEBUG_ASSERT(aspace_.IsValidVaddr(va));
pending_tlbs_[num_pending_tlbs_++] = {va, terminal};
}
// Performs any pending synchronization of TLBs and page table walkers. Includes the DSB to ensure
// TLB flushes have completed prior to returning to user.
void Flush() {
cm_flush.Add(1);
// Flush any pending ISBs.
if (isb_pending_) {
__isb(ARM_MB_SY);
isb_pending_ = false;
}
if (num_pending_tlbs_ == 0) {
return;
}
// Need a DSB to synchronize any page table updates prior to flushing the TLBs.
__dsb(ARM_MB_ISHST);
AssertHeld(aspace_.lock_);
// Check if we should just be performing a full ASID invalidation.
// If the associated aspace is shared, this will be upgraded to a full TLB invalidation across
// all ASIDs.
if (num_pending_tlbs_ > kMaxPendingTlbs || aspace_.type_ == ArmAspaceType::kHypervisor) {
cm_flush_all.Add(1);
cm_flush_all_replacing.Add(num_pending_tlbs_);
// If we're a shared aspace, we should be invalidating across all ASIDs.
if (aspace_.IsShared()) {
aspace_.FlushAllAsids();
} else {
aspace_.FlushAsid();
}
} else {
for (size_t i = 0; i < num_pending_tlbs_; i++) {
const vaddr_t va = pending_tlbs_[i].va();
DEBUG_ASSERT(aspace_.IsValidVaddr(va));
aspace_.FlushTLBEntry(va, pending_tlbs_[i].terminal());
}
cm_single_tlb_invalidates.Add(num_pending_tlbs_);
}
// DSB to ensure TLB flushes happen prior to returning to user.
__dsb(ARM_MB_ISH);
// Local flushes that the kernel may observe prior to a Context Synchronization Event
// should go ahead and get an ISB to force one.
if (aspace_.type_ == ArmAspaceType::kKernel) {
__isb(ARM_MB_SY);
}
num_pending_tlbs_ = 0;
}
// Queue a page for freeing that is dependent on TLB flushing. This is for pages that were
// previously installed as page tables and should not be reused until the non-terminal TLB
// flush has occurred.
void FreePage(vm_page_t* page) { list_add_tail(&to_free_, &page->queue_node); }
Lock<CriticalMutex>* lock() const TA_RET_CAP(aspace_.lock_) { return &aspace_.lock_; }
Lock<CriticalMutex>& lock_ref() const TA_RET_CAP(aspace_.lock_) { return aspace_.lock_; }
private:
// Maximum number of TLB entries we will queue before switching to ASID invalidation.
static constexpr size_t kMaxPendingTlbs = 16;
// Pending TLBs to flush are stored as 63 bits, with the bottom bit stolen to store the terminal
// flag. 63 bits is more than enough as these entries are page aligned at the minimum.
struct PendingTlbs {
PendingTlbs() = default;
PendingTlbs(uint64_t va, bool terminal) : va_terminal_(va | terminal) {}
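// Illustrative example: a terminal flush of the page aligned va 0xffff0000'12345000 is stored as
// va_terminal_ = 0xffff0000'12345001; va() masks the low bit back off.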
bool terminal() const { return va_terminal_ & 1; }
uint64_t va() const { return va_terminal_ & ~1UL; }
private:
// address[63:1], terminal[0]
uint64_t va_terminal_;
};
static_assert(sizeof(PendingTlbs) == 8);
// The aspace we are invalidating TLBs for.
const ArmArchVmAspace& aspace_;
// Pending ISB
bool isb_pending_ = false;
// vm_page_t's to release to the PMM after the TLB invalidation occurs.
list_node to_free_ = LIST_INITIAL_VALUE(to_free_);
// The main list of pending TLBs.
size_t num_pending_tlbs_ = 0;
PendingTlbs pending_tlbs_[kMaxPendingTlbs];
};
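// Returns the TCR flags to use when this aspace is installed; restricted aspaces get a dedicated
// variant of the user flags.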
uint64_t ArmArchVmAspace::Tcr() const {
if (IsRestricted()) {
return MMU_TCR_FLAGS_USER_RESTRICTED;
}
return MMU_TCR_FLAGS_USER;
}
uint ArmArchVmAspace::MmuFlagsFromPte(pte_t pte) {
switch (type_) {
case ArmAspaceType::kUser:
case ArmAspaceType::kKernel:
return s1_pte_attr_to_mmu_flags(pte);
case ArmAspaceType::kHypervisor:
return s1_pte_attr_to_mmu_flags(pte, true);
case ArmAspaceType::kGuest:
return s2_pte_attr_to_mmu_flags(pte);
}
__UNREACHABLE;
}
zx_status_t ArmArchVmAspace::Query(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) {
Guard<CriticalMutex> al{&lock_};
return QueryLocked(vaddr, paddr, mmu_flags);
}
zx_status_t ArmArchVmAspace::QueryLocked(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) {
vaddr_t vaddr_rem;
canary_.Assert();
LTRACEF("aspace %p, vaddr 0x%lx\n", this, vaddr);
DEBUG_ASSERT(tt_virt_);
DEBUG_ASSERT(IsValidVaddr(vaddr));
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_OUT_OF_RANGE;
}
const volatile pte_t* page_table = tt_virt_;
uint32_t index_shift = top_index_shift_;
vaddr_rem = vaddr - vaddr_base_;
while (true) {
const ulong index = vaddr_rem >> index_shift;
vaddr_rem -= (vaddr_t)index << index_shift;
const pte_t pte = page_table[index];
const uint descriptor_type = pte & MMU_PTE_DESCRIPTOR_MASK;
const paddr_t pte_addr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
LTRACEF("va %#" PRIxPTR ", index %lu, index_shift %u, rem %#" PRIxPTR ", pte %#" PRIx64 "\n",
vaddr, index, index_shift, vaddr_rem, pte);
if ((pte & MMU_PTE_VALID) == 0) {
ASSERT_MSG(pte == 0, "invalid pte should be zero %#" PRIx64 "\n", pte);
return ZX_ERR_NOT_FOUND;
}
if (descriptor_type == ((index_shift > page_size_shift_) ? MMU_PTE_L012_DESCRIPTOR_BLOCK
: MMU_PTE_L3_DESCRIPTOR_PAGE)) {
if (paddr) {
*paddr = pte_addr + vaddr_rem;
}
if (mmu_flags) {
*mmu_flags = MmuFlagsFromPte(pte);
}
LTRACEF("va 0x%lx, paddr 0x%lx, flags 0x%x\n", vaddr, paddr ? *paddr : ~0UL,
mmu_flags ? *mmu_flags : ~0U);
return ZX_OK;
}
ASSERT_MSG(index_shift > page_size_shift_ && descriptor_type == MMU_PTE_L012_DESCRIPTOR_TABLE,
"index_shift %u, page_size_shift %u, descriptor_type %#x", index_shift,
page_size_shift_, descriptor_type);
page_table = static_cast<const volatile pte_t*>(paddr_to_physmap(pte_addr));
index_shift -= page_size_shift_ - 3;
}
}
zx::result<vm_page_t*> ArmArchVmAspace::AllocPageTable() {
LTRACEF("page_size_shift %u\n", page_size_shift_);
// currently we only support allocating a single page
DEBUG_ASSERT(page_size_shift_ == PAGE_SIZE_SHIFT);
auto test_alloc = [&]() -> zx::result<vm_page_t*> {
vm_page_t* page;
paddr_t paddr;
zx_status_t status = test_page_alloc_func_(0, &page, &paddr);
if (status == ZX_OK) {
return zx::ok(page);
}
return zx::error(status);
};
// Allocate a page from the pmm via function pointer passed to us in Init().
// The default is CacheAllocPage, so test for the hook and otherwise call CacheAllocPage directly
// to avoid an unnecessary indirect call.
auto result = likely(!test_page_alloc_func_) ? CacheAllocPage() : test_alloc();
if (likely(result.is_ok())) {
(*result)->set_state(vm_page_state::MMU);
pt_pages_++;
kcounter_add(vm_mmu_page_table_alloc, 1);
(*result)->mmu.num_mappings = 0;
LOCAL_KTRACE("page table alloc");
LTRACEF("allocated %#lx\n", (*result)->paddr());
}
return result;
}
void ArmArchVmAspace::FreePageTable(void* vaddr, vm_page_t* page, ConsistencyManager& cm,
Reclaim reclaim) {
ASSERT(page);
LTRACEF("vaddr %p paddr %#lx page_size_shift %u\n", vaddr, page->paddr(), page_size_shift_);
// currently we only support freeing a single page
DEBUG_ASSERT(page_size_shift_ == PAGE_SIZE_SHIFT);
LOCAL_KTRACE("page table free");
DEBUG_ASSERT(page->state() == vm_page_state::MMU);
DEBUG_ASSERT(page->mmu.num_mappings == 0);
cm.FreePage(page);
pt_pages_--;
kcounter_add(vm_mmu_page_table_free, 1);
if (reclaim == Reclaim::Yes) {
kcounter_add(vm_mmu_page_table_reclaim, 1);
}
}
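// Replaces the block (large page) descriptor at |page_table[pt_index]| with a next level table
// that maps the same physical range with the same attributes, using break-before-make when
// permitted.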
zx_status_t ArmArchVmAspace::SplitLargePage(vaddr_t vaddr, const uint index_shift, vaddr_t pt_index,
volatile pte_t* page_table, ConsistencyManager& cm) {
DEBUG_ASSERT(index_shift > page_size_shift_);
const pte_t pte = page_table[pt_index];
DEBUG_ASSERT((pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_BLOCK);
auto result = AllocPageTable();
if (result.is_error()) {
TRACEF("failed to allocate page table\n");
return result.status_value();
}
vm_page_t* page = *result;
const uint next_shift = (index_shift - (page_size_shift_ - 3));
const auto new_page_table = static_cast<volatile pte_t*>(paddr_to_physmap(page->paddr()));
const auto new_desc_type =
(next_shift == page_size_shift_) ? MMU_PTE_L3_DESCRIPTOR_PAGE : MMU_PTE_L012_DESCRIPTOR_BLOCK;
const auto attrs = (pte & ~(MMU_PTE_OUTPUT_ADDR_MASK | MMU_PTE_DESCRIPTOR_MASK)) | new_desc_type;
const uint next_size = 1U << next_shift;
for (uint64_t i = 0, mapped_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
i < MMU_KERNEL_PAGE_TABLE_ENTRIES; i++, mapped_paddr += next_size) {
// directly write to the pte, no need to update since this is
// a completely new table
new_page_table[i] = mapped_paddr | attrs;
}
page->mmu.num_mappings = MMU_KERNEL_PAGE_TABLE_ENTRIES;
if (allow_bbm) {
// As we are changing the block size of a translation we must do a break-before-make in
// accordance with ARM requirements to avoid TLB and other inconsistency.
update_pte(&page_table[pt_index], MMU_PTE_DESCRIPTOR_INVALID);
cm.FlushEntry(vaddr, true);
AssertHeld(cm.lock_ref());
// Must force the flush to happen now before installing the new entry. This will also ensure the
// page table entries we wrote will be visible before we install it.
cm.Flush();
}
update_pte(&page_table[pt_index], page->paddr() | MMU_PTE_L012_DESCRIPTOR_TABLE);
LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 "\n", page_table, pt_index, page_table[pt_index]);
// no need to update the page table count here since we're replacing a block entry with a table
// entry.
cm.FlushEntry(vaddr, false);
return ZX_OK;
}
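// Flush the TLB entry for |vaddr| across all ASIDs (inner shareable domain). Used for the global
// kernel ASID and for shared aspaces whose entries may be cached under multiple ASIDs.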
void ArmArchVmAspace::FlushTLBEntryForAllAsids(vaddr_t vaddr, bool terminal) const {
if (terminal) {
ARM64_TLBI(vaale1is, (vaddr >> 12) & TLBI_VADDR_MASK);
} else {
ARM64_TLBI(vaae1is, (vaddr >> 12) & TLBI_VADDR_MASK);
}
}
// use the appropriate TLB flush instruction to globally flush the modified entry
// terminal is set when flushing at the final level of the page table.
void ArmArchVmAspace::FlushTLBEntry(vaddr_t vaddr, bool terminal) const {
switch (type_) {
case ArmAspaceType::kUser: {
if (IsShared()) {
// If this is a shared aspace, we need to flush this address for all ASIDs.
FlushTLBEntryForAllAsids(vaddr, terminal);
} else {
// Otherwise, flush this address for the specific ASID.
if (terminal) {
ARM64_TLBI(vale1is, ((vaddr >> 12) & TLBI_VADDR_MASK) | (vaddr_t)asid_ << 48);
} else {
ARM64_TLBI(vae1is, ((vaddr >> 12) & TLBI_VADDR_MASK) | (vaddr_t)asid_ << 48);
}
}
return;
}
case ArmAspaceType::kKernel: {
DEBUG_ASSERT(asid_ == MMU_ARM64_GLOBAL_ASID);
FlushTLBEntryForAllAsids(vaddr, terminal);
return;
}
case ArmAspaceType::kGuest: {
uint64_t vttbr = arm64_vttbr(asid_, tt_phys_);
[[maybe_unused]] zx_status_t status = arm64_el2_tlbi_ipa(vttbr, vaddr, terminal);
DEBUG_ASSERT(status == ZX_OK);
return;
}
case ArmAspaceType::kHypervisor:
PANIC("Unsupported.");
return;
}
__UNREACHABLE;
}
void ArmArchVmAspace::FlushAllAsids() const {
DEBUG_ASSERT(type_ == ArmAspaceType::kUser);
DEBUG_ASSERT(IsShared());
ARM64_TLBI_NOADDR(vmalle1is);
}
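// Invalidate all TLB entries belonging to this aspace: by ASID for user aspaces, by VMID for
// guests, and the entire EL2 TLB for the hypervisor aspace. Not supported for the kernel aspace.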
void ArmArchVmAspace::FlushAsid() const {
switch (type_) {
case ArmAspaceType::kUser: {
DEBUG_ASSERT(asid_ != MMU_ARM64_GLOBAL_ASID);
ARM64_TLBI_ASID(aside1is, asid_);
return;
}
case ArmAspaceType::kKernel: {
// The alle1is instruction that invalidates the TLBs for all ASIDs is only available in EL2,
// and not EL1.
panic("FlushAsid not available for kernel address space");
return;
}
case ArmAspaceType::kGuest: {
uint64_t vttbr = arm64_vttbr(asid_, tt_phys_);
zx_status_t status = arm64_el2_tlbi_vmid(vttbr);
DEBUG_ASSERT(status == ZX_OK);
return;
}
case ArmAspaceType::kHypervisor: {
// Flush all TLB entries in EL2.
zx_status_t status = arm64_el2_tlbi_el2();
DEBUG_ASSERT(status == ZX_OK);
return;
}
}
__UNREACHABLE;
}
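// Unmaps the range described by |cursor| from |page_table|, recursing into lower level tables and
// freeing any that become empty. Returns the status and the number of entries removed at this
// level of the table.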
ktl::pair<zx_status_t, uint> ArmArchVmAspace::UnmapPageTable(
VirtualAddressCursor& cursor, EnlargeOperation enlarge, CheckForEmptyPt pt_check,
const uint index_shift, volatile pte_t* page_table, ConsistencyManager& cm, Reclaim reclaim) {
const vaddr_t block_size = 1UL << index_shift;
const uint num_entries = (1u << (page_size_shift_ - 3));
const uint64_t index_mask = num_entries - 1;
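// For example, with a 4KiB translation granule (page_size_shift_ == 12) each table holds 512
// entries, index_mask is 0x1ff, and at index_shift == 21 each entry covers a 2MiB block.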
uint index = static_cast<uint>((cursor.vaddr_rel() >> index_shift) & index_mask);
uint unmapped = 0;
for (; index != num_entries && cursor.size() != 0; ++index) {
pte_t pte = page_table[index];
if ((pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_DESCRIPTOR_INVALID) {
cursor.SkipEntry(block_size);
continue;
}
// Check if this is a large page and we need to split it.
if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_BLOCK &&
(!IS_ALIGNED(cursor.vaddr_rel(), block_size) || cursor.size() < block_size)) {
// Splitting a large page may perform break-before-make, and during that window we will have
// temporarily unmapped beyond our range, so make sure we are permitted to do that.
if (!allow_bbm && enlarge != EnlargeOperation::Yes) {
return {ZX_ERR_NOT_SUPPORTED, unmapped};
}
zx_status_t s = SplitLargePage(cursor.vaddr(), index_shift, index, page_table, cm);
if (unlikely(s != ZX_OK)) {
// If split fails, just unmap the whole thing, and let a
// subsequent page fault clean it up.
if (enlarge == EnlargeOperation::No) {
return {s, unmapped};
}
// We must unmap here, and not in the normal block below, so that we can use SkipEntry
// instead of ConsumeVAddr on the cursor. This is necessary since the range we are having to
// unmap is, by definition, larger than our actual target cursor, and it would be an error
// to call ConsumeVAddr.
update_pte(&page_table[index], MMU_PTE_DESCRIPTOR_INVALID);
unmapped++;
cm.FlushEntry(cursor.vaddr(), true);
cursor.SkipEntry(block_size);
continue;
}
pte = page_table[index];
}
if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_TABLE) {
const paddr_t page_table_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
volatile pte_t* next_page_table =
static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr));
// Recurse a level but remember where we are unmapping from in case we need to do a second
// pass to remove a PT.
const vaddr_t unmap_vaddr = cursor.vaddr();
auto [status, lower_unmapped] =
UnmapPageTable(cursor, enlarge, pt_check, index_shift - (page_size_shift_ - 3),
next_page_table, cm, reclaim);
bool unmap_lower = false;
// Regardless of success or failure we must update the mapping count. Since this involves
// looking up the vm_page_t we take this opportunity to check if it's empty and needs
// unmapping.
vm_page_t* lower_page = nullptr;
if (lower_unmapped > 0 || pt_check == CheckForEmptyPt::Yes) {
lower_page = Pmm::Node().PaddrToPage(page_table_paddr);
DEBUG_ASSERT(lower_page->mmu.num_mappings >= lower_unmapped);
lower_page->mmu.num_mappings -= lower_unmapped;
unmap_lower = lower_page->mmu.num_mappings == 0;
}
if (unlikely(status != ZX_OK)) {
return {status, unmapped};
}
// If the unmap made the level below us empty we want to free the page table, unless it is the
// top page of an aspace with a prepopulated top page.
if (unmap_lower && !(IsShared() && index_shift == top_index_shift_)) {
LTRACEF("pte %p[0x%x] = 0 (was page table phys %#lx)\n", page_table, index,
page_table_paddr);
update_pte(&page_table[index], MMU_PTE_DESCRIPTOR_INVALID);
unmapped++;
// We can safely defer TLB flushing as the consistency manager will not return the backing
// page to the PMM until after the tlb is flushed.
cm.FlushEntry(unmap_vaddr, false);
FreePageTable(const_cast<pte_t*>(next_page_table), lower_page, cm, reclaim);
}
} else {
// Empty entries were already handled and skipped at the top of the loop
DEBUG_ASSERT(is_pte_valid(pte));
LTRACEF("pte %p[0x%x] = 0 (was phys %#lx)\n", page_table, index,
page_table[index] & MMU_PTE_OUTPUT_ADDR_MASK);
update_pte(&page_table[index], MMU_PTE_DESCRIPTOR_INVALID);
unmapped++;
cm.FlushEntry(cursor.vaddr(), true);
cursor.Consume(block_size);
}
}
return {ZX_OK, unmapped};
}
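// Maps the range described by |cursor| under |page_table| with the given attributes, allocating
// intermediate page tables as needed and using block mappings where alignment and size permit.
// Returns the status and the number of entries installed at this level of the table.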
ktl::pair<zx_status_t, uint> ArmArchVmAspace::MapPageTable(pte_t attrs, bool ro, uint index_shift,
volatile pte_t* page_table,
ExistingEntryAction existing_action,
MappingCursor& cursor,
ConsistencyManager& cm) {
const vaddr_t block_size = 1UL << index_shift;
const uint num_entries = (1u << (page_size_shift_ - 3));
const uint64_t index_mask = num_entries - 1;
uint index = static_cast<uint>((cursor.vaddr_rel() >> index_shift) & index_mask);
uint mapped = 0;
for (; index != num_entries && cursor.size() != 0; ++index) {
pte_t pte = page_table[index];
// If the virtual or physical address is unaligned for this level, the remaining range is smaller
// than a block, or block mappings are not allowed at this level, recurse one more level down the
// page table tree.
const bool level_valigned = IS_ALIGNED(cursor.vaddr_rel(), block_size);
const bool level_paligned = IS_ALIGNED(cursor.paddr(), block_size);
if (!level_valigned || !level_paligned || cursor.PageRemaining() < block_size ||
(index_shift > MMU_PTE_DESCRIPTOR_BLOCK_MAX_SHIFT)) {
// Lookup the next level page table, allocating if required.
paddr_t page_table_paddr = 0;
volatile pte_t* next_page_table = nullptr;
switch (pte & MMU_PTE_DESCRIPTOR_MASK) {
case MMU_PTE_DESCRIPTOR_INVALID: {
auto result = AllocPageTable();
if (result.is_error()) {
TRACEF("failed to allocate page table\n");
// The mapping wasn't fully updated, but there is work here that might need to be undone
// as we may have allocated various levels of page tables. By consuming a single page we
// make the cleanup operation think we have added a mapping here, causing it to check
// the page table for potential cleanup.
cursor.Consume(PAGE_SIZE);
return {result.status_value(), mapped};
}
page_table_paddr = (*result)->paddr();
void* pt_vaddr = paddr_to_physmap(page_table_paddr);
LTRACEF("allocated page table, vaddr %p, paddr 0x%lx\n", pt_vaddr, page_table_paddr);
arch_zero_page(pt_vaddr);
// Ensure that the zeroing is observable by hardware page table walkers. As this must happen
// prior to writing the pte, we cannot defer it using the consistency manager.
__dsb(ARM_MB_ISHST);
// When new pages are mapped they have their AF set, under the assumption they are being
// mapped due to being accessed, and this lets us avoid an accessed fault. Since new terminal
// mappings start with the AF flag set, we then also need to start non-terminal mappings as
// having the AF set.
pte = page_table_paddr | MMU_PTE_L012_DESCRIPTOR_TABLE | MMU_PTE_ATTR_RES_SOFTWARE_AF;
update_pte(&page_table[index], pte);
mapped++;
// Tell the consistency manager that we've mapped an inner node.
cm.MapEntry(cursor.vaddr(), false);
LTRACEF("pte %p[%u] = %#" PRIx64 "\n", page_table, index, pte);
next_page_table = static_cast<volatile pte_t*>(pt_vaddr);
break;
}
case MMU_PTE_L012_DESCRIPTOR_TABLE:
// Similar to creating a page table, if we end up mapping a page lower down in this
// hierarchy then it will start off as accessed. As such we set the accessed flag on the
// way down.
pte |= MMU_PTE_ATTR_RES_SOFTWARE_AF;
update_pte(&page_table[index], pte);
page_table_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
LTRACEF("found page table %#" PRIxPTR "\n", page_table_paddr);
next_page_table = static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr));
break;
case MMU_PTE_L012_DESCRIPTOR_BLOCK:
return {ZX_ERR_ALREADY_EXISTS, mapped};
default:
panic("unexpected pte value %" PRIx64, pte);
}
DEBUG_ASSERT(next_page_table);
auto [ret, lower_mapped] = MapPageTable(attrs, ro, index_shift - (page_size_shift_ - 3),
next_page_table, existing_action, cursor, cm);
// Regardless of success or failure we must update the mapping counts.
if (lower_mapped > 0) {
vm_page_t* lower_page = Pmm::Node().PaddrToPage(page_table_paddr);
DEBUG_ASSERT(lower_page);
lower_page->mmu.num_mappings += lower_mapped;
}
if (ret != ZX_OK) {
return {ret, mapped};
}
} else {
pte_t new_pte = cursor.paddr() | attrs;
if (index_shift > page_size_shift_) {
new_pte |= MMU_PTE_L012_DESCRIPTOR_BLOCK;
} else {
new_pte |= MMU_PTE_L3_DESCRIPTOR_PAGE;
}
const bool valid = is_pte_valid(pte);
if (unlikely(valid && existing_action == ExistingEntryAction::Error)) {
return {ZX_ERR_ALREADY_EXISTS, mapped};
} else if (valid && existing_action == ExistingEntryAction::Skip) {
// Empty case to simplify the other branches.
} else if (valid && existing_action == ExistingEntryAction::Upgrade &&
(pte & MMU_PTE_OUTPUT_ADDR_MASK) == cursor.paddr()) {
// Doing an upgrade of an existing entry where the output address is not changing. This is
// just a protect, which we can skip if either nothing is actually changing, or if we would
// potentially be reducing permissions.
if (!ro && new_pte != pte) {
update_pte(&page_table[index], new_pte);
cm.FlushEntry(cursor.vaddr(), true);
}
} else {
if (!valid) {
// As we are going to transition an entry from INVALID->VALID we must count this as an
// additional mapping. All other cases are changing an entry from VALID->VALID.
mapped++;
}
// Either no current entry, or we need to upgrade the existing one, potentially performing
// a break-before-make.
if (valid && !ro) {
// If the output address were not changing we would have hit the protect case above, so if
// the new entry is not read only then we must perform break-before-make before installing
// it. Failing to do this could result in writes being temporarily lost due to the
// different output addresses and so we must ignore the allow_bbm flag.
update_pte(&page_table[index], MMU_PTE_DESCRIPTOR_INVALID);
cm.FlushEntry(cursor.vaddr(), true);
// Must force the flush to happen now before installing the new entry. This will also
// ensure the page table entries we wrote will be visible before we install it.
cm.Flush();
}
LTRACEF("pte %p[%u] = %#" PRIx64 " (paddr %#lx)\n", page_table, index, pte, cursor.paddr());
update_pte(&page_table[index], new_pte);
// Tell the consistency manager we've mapped a new page.
cm.MapEntry(cursor.vaddr(), true);
}
cursor.Consume(block_size);
}
}
return {ZX_OK, mapped};
}
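// Applies |attrs| to every present terminal entry in the given range, recursing through
// intermediate tables and splitting large pages when the range only partially covers them.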
zx_status_t ArmArchVmAspace::ProtectPageTable(vaddr_t vaddr_in, vaddr_t vaddr_rel_in,
size_t size_in, pte_t attrs, EnlargeOperation enlarge,
const uint index_shift, volatile pte_t* page_table,
ConsistencyManager& cm) {
vaddr_t vaddr = vaddr_in;
vaddr_t vaddr_rel = vaddr_rel_in;
size_t size = size_in;
const vaddr_t block_size = 1UL << index_shift;
const vaddr_t block_mask = block_size - 1;
LTRACEF("vaddr %#" PRIxPTR ", vaddr_rel %#" PRIxPTR ", size %#" PRIxPTR ", attrs %#" PRIx64
", index shift %u, page_size_shift %u, page_table %p\n",
vaddr, vaddr_rel, size, attrs, index_shift, page_size_shift_, page_table);
// vaddr_rel and size must be page aligned
DEBUG_ASSERT(((vaddr_rel | size) & ((1UL << page_size_shift_) - 1)) == 0);
while (size) {
const vaddr_t vaddr_rem = vaddr_rel & block_mask;
const size_t chunk_size = ktl::min(size, block_size - vaddr_rem);
const vaddr_t index = vaddr_rel >> index_shift;
pte_t pte = page_table[index];
// If the input range partially covers a large page, split the page.
if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_BLOCK &&
chunk_size != block_size) {
// Splitting a large page may perform break-before-make, and during that window we will have
// temporarily unmapped beyond our range, so make sure that is permitted.
if (!allow_bbm && enlarge != EnlargeOperation::Yes) {
return ZX_ERR_NOT_SUPPORTED;
}
zx_status_t s = SplitLargePage(vaddr, index_shift, index, page_table, cm);
if (unlikely(s != ZX_OK)) {
return s;
}
pte = page_table[index];
}
if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_TABLE) {
const paddr_t page_table_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
volatile pte_t* next_page_table =
static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr));
// Recurse a level.
zx_status_t status =
ProtectPageTable(vaddr, vaddr_rem, chunk_size, attrs, enlarge,
index_shift - (page_size_shift_ - 3), next_page_table, cm);
if (unlikely(status != ZX_OK)) {
return status;
}
} else if (is_pte_valid(pte)) {
const pte_t new_pte = (pte & ~MMU_PTE_PERMISSION_MASK) | attrs;
LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 " was %#" PRIx64 "\n", page_table, index, new_pte,
pte);
// Skip updating the page table entry if the new value is the same as before.
if (new_pte != pte) {
update_pte(&page_table[index], new_pte);
cm.FlushEntry(vaddr, true);
}
} else {
LTRACEF("page table entry does not exist, index %#" PRIxPTR ", %#" PRIx64 "\n", index, pte);
}
vaddr += chunk_size;
vaddr_rel += chunk_size;
size -= chunk_size;
}
return ZX_OK;
}
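// Walks the page table rooted at |page_table|, updating page age for accessed terminal entries,
// optionally clearing their AF, and freeing unaccessed lower level tables, visiting at most
// |*entry_limit| entries across all recursion levels. Returns the number of bytes processed.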
size_t ArmArchVmAspace::HarvestAccessedPageTable(
size_t* entry_limit, vaddr_t vaddr, vaddr_t vaddr_rel_in, size_t size, const uint index_shift,
NonTerminalAction non_terminal_action, TerminalAction terminal_action,
volatile pte_t* page_table, ConsistencyManager& cm) {
const vaddr_t block_size = 1UL << index_shift;
const vaddr_t block_mask = block_size - 1;
// We always want to recursively call `HarvestAccessedPageTable` on entries in the top level page
// of shared address spaces. We have to do this because entries in these aspaces will be accessed
// via the unified aspace, which will not set the accessed bits on those entries.
const bool always_recurse = index_shift == top_index_shift_ && IsShared();
vaddr_t vaddr_rel = vaddr_rel_in;
// vaddr_rel and size must be page aligned
DEBUG_ASSERT(((vaddr_rel | size) & ((1UL << page_size_shift_) - 1)) == 0);
size_t harvested_size = 0;
vm_page_t* table_page = Pmm::Node().PaddrToPage(physmap_to_paddr((void*)page_table));
while (size > 0 && *entry_limit > 0) {
ktrace::Scope trace =
KTRACE_BEGIN_SCOPE_ENABLE(LOCAL_KTRACE_ENABLE, "kernel:vm", "page_table_loop");
const vaddr_t vaddr_rem = vaddr_rel & block_mask;
const vaddr_t index = vaddr_rel >> index_shift;
size_t chunk_size = ktl::min(size, block_size - vaddr_rem);
pte_t pte = page_table[index];
if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_BLOCK &&
chunk_size != block_size) {
// Ignore large pages; we do not support harvesting accessed bits from them. Having this empty
// if block simplifies the overall logic.
} else if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_TABLE) {
const paddr_t page_table_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
volatile pte_t* next_page_table =
static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr));
// Start with the assumption that we will unmap if we can.
bool do_unmap = non_terminal_action == NonTerminalAction::FreeUnaccessed;
// Check for our emulated non-terminal AF so we can potentially skip the recursion.
// TODO: make this optional when hardware AF is supported (see todo on
// MMU_PTE_ATTR_RES_SOFTWARE_AF for details)
bool should_recurse = always_recurse || (pte & MMU_PTE_ATTR_RES_SOFTWARE_AF);
vm_page_t* lower_page = nullptr;
if (should_recurse) {
chunk_size = HarvestAccessedPageTable(
entry_limit, vaddr, vaddr_rem, chunk_size, index_shift - (page_size_shift_ - 3),
non_terminal_action, terminal_action, next_page_table, cm);
// This was accessed so we don't necessarily want to unmap it, unless our recursive call
// caused the page table to be empty, in which case we are obligated to.
lower_page = Pmm::Node().PaddrToPage(page_table_paddr);
do_unmap = lower_page->mmu.num_mappings == 0;
// If we processed till the end of sub page table, and we are not retaining page tables,
// then we can clear the AF as we know we will not have to process entries from this one
// again.
if (!do_unmap && (vaddr_rel + chunk_size) >> index_shift != index &&
non_terminal_action != NonTerminalAction::Retain) {
pte &= ~MMU_PTE_ATTR_RES_SOFTWARE_AF;
update_pte(&page_table[index], pte);
}
}
// We can't unmap any top level page table entries in an address space with a prepopulated
// top level page.
if (index_shift == top_index_shift_ && IsShared()) {
do_unmap = false;
}
if (do_unmap) {
// Unmapping an exact block, which should not need enlarging and hence should never be able
// to fail.
VirtualAddressCursor unmap_cursor(vaddr, chunk_size);
{
[[maybe_unused]] bool result =
unmap_cursor.SetVaddrRelativeOffset(vaddr_base_, 1ull << top_size_shift_);
// This should never fail as the cursor we are building is a subset of the range we have
// already processed, which by definition must have been valid.
DEBUG_ASSERT(result);
}
auto [result, lower_unmapped] =
UnmapPageTable(unmap_cursor, EnlargeOperation::No, CheckForEmptyPt::No,
index_shift - (page_size_shift_ - 3), next_page_table, cm, Reclaim::Yes);
ASSERT(result == ZX_OK);
if (!lower_page) {
lower_page = Pmm::Node().PaddrToPage(page_table_paddr);
}
DEBUG_ASSERT(lower_page->mmu.num_mappings == lower_unmapped);
lower_page->mmu.num_mappings -= lower_unmapped;
update_pte(&page_table[index], MMU_PTE_DESCRIPTOR_INVALID);
table_page->mmu.num_mappings--;
// We can safely defer TLB flushing as the consistency manager will not return the backing
// page to the PMM until after the tlb is flushed.
cm.FlushEntry(vaddr, false);
FreePageTable(const_cast<pte_t*>(next_page_table), lower_page, cm, Reclaim::Yes);
}
} else if (is_pte_valid(pte) && (pte & MMU_PTE_ATTR_AF)) {
const paddr_t pte_addr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
const paddr_t paddr = pte_addr + vaddr_rem;
vm_page_t* page = paddr_to_vm_page(paddr);
// Mappings for physical VMOs do not have pages associated with them and so there's no state
// to update on an access.
if (likely(page)) {
pmm_page_queues()->MarkAccessedDeferredCount(page);
if (terminal_action == TerminalAction::UpdateAgeAndHarvest) {
// Modifying the access flag does not require break-before-make for correctness, and as we
// do not support hardware access flag setting at the moment, we do not have to deal with
// potential concurrent modifications.
pte = (pte & ~MMU_PTE_ATTR_AF);
LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 "\n", page_table, index, pte);
update_pte(&page_table[index], pte);
cm.FlushEntry(vaddr, true);
}
}
}
vaddr += chunk_size;
vaddr_rel += chunk_size;
size -= chunk_size;
harvested_size += chunk_size;
// Each iteration of this loop examines a PTE at the current level. The
// total number of PTEs examined is limited to avoid holding the aspace lock
// for too long. However, the remaining limit balance is updated at the end
// of the loop to ensure that harvesting makes progress, even if the initial
// limit is too small to reach a terminal PTE.
if (*entry_limit > 0) {
*entry_limit -= 1;
}
}
return harvested_size;
}
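// Sets the hardware AF on valid terminal entries and the software non-terminal AF on intermediate
// table entries covering the given range, so that later harvests treat them as accessed.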
void ArmArchVmAspace::MarkAccessedPageTable(vaddr_t vaddr, vaddr_t vaddr_rel_in, size_t size,
const uint index_shift, volatile pte_t* page_table,
ConsistencyManager& cm) {
const vaddr_t block_size = 1UL << index_shift;
const vaddr_t block_mask = block_size - 1;
vaddr_t vaddr_rel = vaddr_rel_in;
// vaddr_rel and size must be page aligned
DEBUG_ASSERT(((vaddr_rel | size) & ((1UL << page_size_shift_) - 1)) == 0);
while (size) {
const vaddr_t vaddr_rem = vaddr_rel & block_mask;
const size_t chunk_size = ktl::min(size, block_size - vaddr_rem);
const vaddr_t index = vaddr_rel >> index_shift;
pte_t pte = page_table[index];
if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_BLOCK &&
chunk_size != block_size) {
// Ignore large pages as we don't support modifying their access flags. Having this empty if
// block simplifies the overall logic.
} else if (index_shift > page_size_shift_ &&
(pte & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_TABLE) {
// Set the software bit we use to represent that this page table has been accessed.
pte |= MMU_PTE_ATTR_RES_SOFTWARE_AF;
update_pte(&page_table[index], pte);
const paddr_t page_table_paddr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
volatile pte_t* next_page_table =
static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr));
MarkAccessedPageTable(vaddr, vaddr_rem, chunk_size, index_shift - (page_size_shift_ - 3),
next_page_table, cm);
} else if (is_pte_valid(pte) && (pte & MMU_PTE_ATTR_AF) == 0) {
pte |= MMU_PTE_ATTR_AF;
update_pte(&page_table[index], pte);
}
vaddr += chunk_size;
vaddr_rel += chunk_size;
size -= chunk_size;
}
}
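// Validates the range against the aspace bounds and then applies |attrs| to it by walking the
// page tables from the top level.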
zx_status_t ArmArchVmAspace::ProtectPages(vaddr_t vaddr, size_t size, pte_t attrs,
EnlargeOperation enlarge, vaddr_t vaddr_base,
ConsistencyManager& cm) {
vaddr_t vaddr_rel = vaddr - vaddr_base;
vaddr_t vaddr_rel_max = 1UL << top_size_shift_;
LTRACEF("vaddr %#" PRIxPTR ", size %#" PRIxPTR ", attrs %#" PRIx64 ", asid %#x\n", vaddr, size,
attrs, asid_);
if (vaddr_rel > vaddr_rel_max - size || size > vaddr_rel_max) {
TRACEF("vaddr %#" PRIxPTR ", size %#" PRIxPTR " out of range vaddr %#" PRIxPTR
", size %#" PRIxPTR "\n",
vaddr, size, vaddr_base, vaddr_rel_max);
return ZX_ERR_INVALID_ARGS;
}
LOCAL_KTRACE("mmu protect", ("vaddr", vaddr), ("size", size));
zx_status_t ret =
ProtectPageTable(vaddr, vaddr_rel, size, attrs, enlarge, top_index_shift_, tt_virt_, cm);
return ret;
}
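// Translates generic ARCH_MMU_FLAG_* values into the PTE attribute bits appropriate for this
// aspace's translation regime (stage 1 user/kernel, stage 2 guest, or EL2 hypervisor).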
pte_t ArmArchVmAspace::MmuParamsFromFlags(uint mmu_flags) {
pte_t attrs = 0;
switch (type_) {
case ArmAspaceType::kUser:
attrs = mmu_flags_to_s1_pte_attr(mmu_flags);
// User pages are marked non global
attrs |= MMU_PTE_ATTR_NON_GLOBAL;
break;
case ArmAspaceType::kKernel:
attrs = mmu_flags_to_s1_pte_attr(mmu_flags);
break;
case ArmAspaceType::kGuest:
attrs = mmu_flags_to_s2_pte_attr(mmu_flags);
break;
case ArmAspaceType::kHypervisor:
attrs = mmu_flags_to_s1_pte_attr(mmu_flags, true);
break;
}
return attrs;
}
zx_status_t ArmArchVmAspace::MapContiguous(vaddr_t vaddr, paddr_t paddr, size_t count,
uint mmu_flags, size_t* mapped) {
canary_.Assert();
LTRACEF("vaddr %#" PRIxPTR " paddr %#" PRIxPTR " count %zu flags %#x\n", vaddr, paddr, count,
mmu_flags);
DEBUG_ASSERT(tt_virt_);
DEBUG_ASSERT(IsValidVaddr(vaddr));
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_OUT_OF_RANGE;
}
if (!(mmu_flags & ARCH_MMU_FLAG_PERM_READ)) {
return ZX_ERR_INVALID_ARGS;
}
// paddr and vaddr must be aligned.
DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr));
DEBUG_ASSERT(IS_PAGE_ALIGNED(paddr));
if (!IS_PAGE_ALIGNED(vaddr) || !IS_PAGE_ALIGNED(paddr)) {
return ZX_ERR_INVALID_ARGS;
}
if (count == 0) {
return ZX_OK;
}
{
Guard<CriticalMutex> a{&lock_};
ASSERT(updates_enabled_);
if ((mmu_flags & ARCH_MMU_FLAG_PERM_EXECUTE) || type_ == ArmAspaceType::kHypervisor) {
// The icache gets synced both for executable mappings, which is the expected case, as well as
// for any hypervisor mapping. For hypervisor mappings we additionally need to clean the cache
// fully to PoC (not just PoU as required for icache consistency) as guests, who can disable
// their caches at will, could otherwise see stale data that hasn't been written back to
// memory yet.
ArmVmICacheConsistencyManager cache_cm;
if (type_ == ArmAspaceType::kHypervisor) {
cache_cm.ForceCleanToPoC();
}
cache_cm.SyncAddr(reinterpret_cast<vaddr_t>(paddr_to_physmap(paddr)), count * PAGE_SIZE);
}
pte_t attrs = MmuParamsFromFlags(mmu_flags);
ConsistencyManager cm(*this);
MappingCursor cursor(/*paddrs=*/&paddr, /*paddr_count=*/1, /*page_size=*/count * PAGE_SIZE,
/*vaddr=*/vaddr);
if (!cursor.SetVaddrRelativeOffset(vaddr_base_, 1ull << top_size_shift_)) {
return ZX_ERR_OUT_OF_RANGE;
}
const bool ro = (mmu_flags & ARCH_MMU_FLAG_PERM_RWX_MASK) == ARCH_MMU_FLAG_PERM_READ;
auto [status, lower_mapped] =
MapPageTable(attrs, ro, top_index_shift_, tt_virt_, ExistingEntryAction::Error, cursor, cm);
tt_page_->mmu.num_mappings += lower_mapped;
MarkAspaceModified();
if (status != ZX_OK) {
VirtualAddressCursor unmap_cursor = cursor.ProcessedRange();
if (unmap_cursor.size() > 0) {
auto [unmap_status, unmapped] =
UnmapPageTable(unmap_cursor, EnlargeOperation::No, CheckForEmptyPt::Yes,
top_index_shift_, tt_virt_, cm, Reclaim::No);
DEBUG_ASSERT(unmap_status == ZX_OK);
tt_page_->mmu.num_mappings -= unmapped;
}
return status;
}
DEBUG_ASSERT(cursor.size() == 0);
}
if (mapped) {
*mapped = count;
}
#if __has_feature(address_sanitizer)
if (type_ == ArmAspaceType::kKernel) {
asan_map_shadow_for(vaddr, count * PAGE_SIZE);
}
#endif // __has_feature(address_sanitizer)
return ZX_OK;
}
zx_status_t ArmArchVmAspace::Map(vaddr_t vaddr, paddr_t* phys, size_t count, uint mmu_flags,
ExistingEntryAction existing_action, size_t* mapped) {
canary_.Assert();
LTRACEF("vaddr %#" PRIxPTR " count %zu flags %#x\n", vaddr, count, mmu_flags);
DEBUG_ASSERT(tt_virt_);
DEBUG_ASSERT(IsValidVaddr(vaddr));
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_OUT_OF_RANGE;
}
for (size_t i = 0; i < count; ++i) {
DEBUG_ASSERT(IS_PAGE_ALIGNED(phys[i]));
if (!IS_PAGE_ALIGNED(phys[i])) {
return ZX_ERR_INVALID_ARGS;
}
}
if (!(mmu_flags & ARCH_MMU_FLAG_PERM_READ)) {
return ZX_ERR_INVALID_ARGS;
}
// vaddr must be aligned.
DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr));
if (!IS_PAGE_ALIGNED(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
if (count == 0) {
return ZX_OK;
}
{
Guard<CriticalMutex> a{&lock_};
ASSERT(updates_enabled_);
if ((mmu_flags & ARCH_MMU_FLAG_PERM_EXECUTE) || type_ == ArmAspaceType::kHypervisor) {
ArmVmICacheConsistencyManager cache_cm;
for (size_t idx = 0; idx < count; ++idx) {
// See comment in MapContiguous for why we do this for the hypervisor.
if (type_ == ArmAspaceType::kHypervisor) {
cache_cm.ForceCleanToPoC();
}
cache_cm.SyncAddr(reinterpret_cast<vaddr_t>(paddr_to_physmap(phys[idx])), PAGE_SIZE);
}
}
pte_t attrs = MmuParamsFromFlags(mmu_flags);
ConsistencyManager cm(*this);
MappingCursor cursor(/*paddrs=*/phys, /*paddr_count=*/count, /*page_size=*/PAGE_SIZE,
/*vaddr=*/vaddr);
if (!cursor.SetVaddrRelativeOffset(vaddr_base_, 1ull << top_size_shift_)) {
return ZX_ERR_OUT_OF_RANGE;
}
const bool ro = (mmu_flags & ARCH_MMU_FLAG_PERM_RWX_MASK) == ARCH_MMU_FLAG_PERM_READ;
auto [status, lower_mapped] =
MapPageTable(attrs, ro, top_index_shift_, tt_virt_, existing_action, cursor, cm);
tt_page_->mmu.num_mappings += lower_mapped;
MarkAspaceModified();
if (status != ZX_OK) {
VirtualAddressCursor unmap_cursor = cursor.ProcessedRange();
if (unmap_cursor.size() > 0) {
auto [unmap_status, unmapped] =
UnmapPageTable(unmap_cursor, EnlargeOperation::No, CheckForEmptyPt::Yes,
top_index_shift_, tt_virt_, cm, Reclaim::No);
DEBUG_ASSERT(unmap_status == ZX_OK);
tt_page_->mmu.num_mappings -= unmapped;
}
return status;
}
DEBUG_ASSERT(cursor.size() == 0);
}
if (mapped) {
// For ExistingEntryAction::Error, we should have mapped all the addresses we were asked to.
// For ExistingEntryAction::Skip, we might have mapped fewer if we encountered existing entries,
// but skipped entries contribute towards the total as well.
*mapped = count;
}
#if __has_feature(address_sanitizer)
if (type_ == ArmAspaceType::kKernel) {
asan_map_shadow_for(vaddr, count * PAGE_SIZE);
}
#endif // __has_feature(address_sanitizer)
return ZX_OK;
}
zx_status_t ArmArchVmAspace::Unmap(vaddr_t vaddr, size_t count, EnlargeOperation enlarge,
size_t* unmapped) {
canary_.Assert();
LTRACEF("vaddr %#" PRIxPTR " count %zu\n", vaddr, count);
DEBUG_ASSERT(tt_virt_);
DEBUG_ASSERT(IsValidVaddr(vaddr));
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_OUT_OF_RANGE;
}
DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr));
if (!IS_PAGE_ALIGNED(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
Guard<CriticalMutex> a{&lock_};
ASSERT(updates_enabled_);
ConsistencyManager cm(*this);
VirtualAddressCursor cursor(vaddr, count * PAGE_SIZE);
if (!cursor.SetVaddrRelativeOffset(vaddr_base_, 1ull << top_size_shift_)) {
return ZX_ERR_OUT_OF_RANGE;
}
auto [ret, lower_unmapped] = UnmapPageTable(cursor, enlarge, CheckForEmptyPt::No,
top_index_shift_, tt_virt_, cm, Reclaim::No);
tt_page_->mmu.num_mappings -= lower_unmapped;
MarkAspaceModified();
DEBUG_ASSERT(cursor.size() == 0 || ret != ZX_OK);
if (unmapped) {
*unmapped = (ret == ZX_OK) ? count : 0u;
}
return ret;
}
zx_status_t ArmArchVmAspace::Protect(vaddr_t vaddr, size_t count, uint mmu_flags,
EnlargeOperation enlarge) {
canary_.Assert();
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
if (!IS_PAGE_ALIGNED(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
if (!(mmu_flags & ARCH_MMU_FLAG_PERM_READ)) {
return ZX_ERR_INVALID_ARGS;
}
// The stage 2 data and instruction aborts do not contain sufficient information for us to
// resolve permission faults, and these kinds of faults generate a hard error. As such we cannot
// safely perform protections and instead upgrade any protect to a complete unmap, therefore
// causing a regular translation fault that we can handle to repopulate the correct mapping.
if (type_ == ArmAspaceType::kGuest) {
return Unmap(vaddr, count, EnlargeOperation::Yes, nullptr);
}
Guard<CriticalMutex> a{&lock_};
ASSERT(updates_enabled_);
if (mmu_flags & ARCH_MMU_FLAG_PERM_EXECUTE) {
// If mappings are going to become executable then we first need to sync their caches.
// Unfortunately this needs to be done on kernel virtual addresses to avoid taking translation
// faults, and so we need to first query for the physical address to then get the kernel virtual
// address in the physmap.
// This sync could be more deeply integrated into ProtectPages, but making existing regions
// executable is a very uncommon operation and so we keep it simple.
vm_mmu_protect_make_execute_calls.Add(1);
ArmVmICacheConsistencyManager cache_cm;
size_t pages_synced = 0;
for (size_t idx = 0; idx < count; idx++) {
paddr_t paddr;
uint flags;
if (QueryLocked(vaddr + idx * PAGE_SIZE, &paddr, &flags) == ZX_OK &&
(flags & ARCH_MMU_FLAG_PERM_EXECUTE)) {
cache_cm.SyncAddr(reinterpret_cast<vaddr_t>(paddr_to_physmap(paddr)), PAGE_SIZE);
pages_synced++;
}
}
vm_mmu_protect_make_execute_pages.Add(pages_synced);
}
zx_status_t ret;
{
pte_t attrs = MmuParamsFromFlags(mmu_flags);
ConsistencyManager cm(*this);
ret = ProtectPages(vaddr, count * PAGE_SIZE, attrs, enlarge, vaddr_base_, cm);
MarkAspaceModified();
}
return ret;
}
zx_status_t ArmArchVmAspace::HarvestAccessed(vaddr_t vaddr, size_t count,
NonTerminalAction non_terminal_action,
TerminalAction terminal_action) {
VM_KTRACE_DURATION(2, "ArmArchVmAspace::HarvestAccessed", ("vaddr", vaddr), ("count", count));
canary_.Assert();
if (!IS_PAGE_ALIGNED(vaddr) || !IsValidVaddr(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
// Avoid preemption while "involuntarily" holding the arch aspace lock during
// access harvesting. The harvest loop below is O(n); however, the amount of
// work performed with the lock held and preemption disabled is limited. Other
// O(n) operations under this lock are opt-in by the user (e.g. Map, Protect)
// and are performed with preemption enabled.
Guard<CriticalMutex> guard{&lock_};
const vaddr_t vaddr_rel = vaddr - vaddr_base_;
const vaddr_t vaddr_rel_max = 1UL << top_size_shift_;
const size_t size = count * PAGE_SIZE;
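// Overflow-safe check that [vaddr_rel, vaddr_rel + size) lies within the address space.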
if (vaddr_rel > vaddr_rel_max - size || size > vaddr_rel_max) {
TRACEF("vaddr %#" PRIxPTR ", size %#" PRIxPTR " out of range vaddr %#" PRIxPTR
", size %#" PRIxPTR "\n",
vaddr, size, vaddr_base_, vaddr_rel_max);
return ZX_ERR_INVALID_ARGS;
}
LOCAL_KTRACE("mmu harvest accessed", ("vaddr", vaddr), ("size", size));
// Limit harvesting to 32 entries per iteration with the arch aspace lock held
// to avoid delays in accessed faults in the same aspace running in parallel.
//
// This limit is derived from the following observations:
// 1. Worst case runtime to harvest a terminal PTE on a low-end A53 is ~780ns.
// 2. Real workloads can result in harvesting thousands of terminal PTEs in a
// single aspace.
// 3. An access fault handler will spin up to 150us on the aspace adaptive
// mutex before blocking.
// 4. Unnecessarily blocking is costly when the system is heavily loaded,
// especially during accessed faults, which tend to occur multiple times in
// quick succession within and across threads in the same process.
//
// To keep contention between access harvesting and access faults acceptable,
// it is important to avoid exhausting the 150us mutex spin phase by holding
// the aspace mutex for too long. The selected entry limit results in a worst
// case harvest time of about 1/6 of the mutex spin phase.
//
// Ti = worst case runtime per top-level harvest iteration.
// Te = worst case runtime per terminal entry harvest.
// L = max entries per top-level harvest iteration.
//
// Ti = Te * L = 780ns * 32 = 24.96us
//
const size_t kMaxEntriesPerIteration = 32;
size_t remaining_size = size;
vaddr_t current_vaddr = vaddr;
vaddr_t current_vaddr_rel = vaddr_rel;
while (remaining_size) {
ktrace::Scope trace = KTRACE_BEGIN_SCOPE_ENABLE(
LOCAL_KTRACE_ENABLE, "kernel:vm", "harvest_loop", ("remaining_size", remaining_size));
size_t entry_limit = kMaxEntriesPerIteration;
// The consistency manager must be scoped narrowly here as it is incorrect to keep it alive
// without holding the lock, which we will drop later on.
{
ConsistencyManager cm(*this);
const size_t harvested_size = HarvestAccessedPageTable(
&entry_limit, current_vaddr, current_vaddr_rel, remaining_size, top_index_shift_,
non_terminal_action, terminal_action, tt_virt_, cm);
DEBUG_ASSERT(harvested_size > 0);
DEBUG_ASSERT(harvested_size <= remaining_size);
remaining_size -= harvested_size;
current_vaddr += harvested_size;
current_vaddr_rel += harvested_size;
}
// Release and re-acquire the lock to give contending threads a chance to
// acquire the arch aspace lock between iterations. Use arch::Yield() to
// give other CPUs spinning on the aspace mutex a slight edge in acquiring
// the mutex. Dropping the lock also re-enables preemption, flushing any
// preemption that became pending during the critical section.
guard.CallUnlocked([this] {
while (pending_access_faults_.load() != 0) {
arch::Yield();
}
});
}
return ZX_OK;
}
zx_status_t ArmArchVmAspace::MarkAccessed(vaddr_t vaddr, size_t count) {
VM_KTRACE_DURATION(2, "ArmArchVmAspace::MarkAccessed", ("vaddr", vaddr), ("count", count));
canary_.Assert();
if (!IS_PAGE_ALIGNED(vaddr) || !IsValidVaddr(vaddr)) {
return ZX_ERR_OUT_OF_RANGE;
}
AutoPendingAccessFault pending_access_fault{this};
Guard<CriticalMutex> a{&lock_};
const vaddr_t vaddr_rel = vaddr - vaddr_base_;
const vaddr_t vaddr_rel_max = 1UL << top_size_shift_;
const size_t size = count * PAGE_SIZE;
if (vaddr_rel > vaddr_rel_max - size || size > vaddr_rel_max) {
TRACEF("vaddr %#" PRIxPTR ", size %#" PRIxPTR " out of range vaddr %#" PRIxPTR
", size %#" PRIxPTR "\n",
vaddr, size, vaddr_base_, vaddr_rel_max);
return ZX_ERR_OUT_OF_RANGE;
}
LOCAL_KTRACE("mmu mark accessed", ("vaddr", vaddr), ("size", size));
ConsistencyManager cm(*this);
MarkAccessedPageTable(vaddr, vaddr_rel, size, top_index_shift_, tt_virt_, cm);
MarkAspaceModified();
return ZX_OK;
}
bool ArmArchVmAspace::ActiveSinceLastCheck(bool clear) {
// Read whether any CPUs are presently executing.
bool currently_active = num_active_cpus_.load(ktl::memory_order_relaxed) != 0;
// Exchange the current notion of active with the previously recorded value. This is the only
// time a |false| value can potentially be written to active_since_last_check_, and doing an
// exchange means we can never 'lose' a |true| value.
bool previously_active =
clear ? active_since_last_check_.exchange(currently_active, ktl::memory_order_relaxed)
: active_since_last_check_.load(ktl::memory_order_relaxed);
// Return whether we had previously been active. It is not necessary to also consider whether we
// are currently active, since activating would also have set active_since_last_check_ to true.
// In the scenario where we race and currently_active is true, but we observe previously_active
// to be false, this means that as of the start of this function ::ContextSwitch had not
// completed, and so this aspace is still not actually active.
return previously_active;
}
zx_status_t ArmArchVmAspace::Init() {
canary_.Assert();
LTRACEF("aspace %p, base %#" PRIxPTR ", size 0x%zx, type %*s\n", this, base_, size_,
static_cast<int>(ArmAspaceTypeName(type_).size()), ArmAspaceTypeName(type_).data());
Guard<CriticalMutex> a{&lock_};
// Validate that the base + size is sane and doesn't wrap.
DEBUG_ASSERT(size_ > PAGE_SIZE);
DEBUG_ASSERT(base_ + size_ - 1 > base_);
if (type_ == ArmAspaceType::kKernel) {
// At the moment we can only deal with the kernel address space as globally defined.
DEBUG_ASSERT(base_ == ~0UL << MMU_KERNEL_SIZE_SHIFT);
DEBUG_ASSERT(size_ == 1UL << MMU_KERNEL_SIZE_SHIFT);
vaddr_base_ = ~0UL << MMU_KERNEL_SIZE_SHIFT;
top_size_shift_ = MMU_KERNEL_SIZE_SHIFT;
top_index_shift_ = MMU_KERNEL_TOP_SHIFT;
page_size_shift_ = MMU_KERNEL_PAGE_SIZE_SHIFT;
tt_virt_ = (volatile pte_t*)paddr_to_physmap(arm64_kernel_translation_table_phys);
tt_phys_ = arm64_kernel_translation_table_phys;
tt_page_ = Pmm::Node().PaddrToPage(arm64_kernel_translation_table_phys);
DEBUG_ASSERT(tt_page_);
DEBUG_ASSERT(tt_page_->state() == vm_page_state::MMU);
asid_ = (uint16_t)MMU_ARM64_GLOBAL_ASID;
} else {
if (type_ == ArmAspaceType::kUser) {
DEBUG_ASSERT(base_ + size_ <= 1UL << MMU_USER_SIZE_SHIFT);
vaddr_base_ = 0;
top_size_shift_ = MMU_USER_SIZE_SHIFT;
top_index_shift_ = MMU_USER_TOP_SHIFT;
page_size_shift_ = MMU_USER_PAGE_SIZE_SHIFT;
if (feat_asid_enabled) {
auto status = asid->Alloc();
if (status.is_error()) {
printf("ARM: out of ASIDs!\n");
return status.status_value();
}
asid_ = status.value();
} else {
// Initialize to a valid value even when disabled to distinguish from the destroyed case.
asid_ = MMU_ARM64_FIRST_USER_ASID;
}
} else if (type_ == ArmAspaceType::kGuest) {
DEBUG_ASSERT(base_ + size_ <= 1UL << MMU_GUEST_SIZE_SHIFT);
vaddr_base_ = 0;
top_size_shift_ = MMU_GUEST_SIZE_SHIFT;
top_index_shift_ = MMU_GUEST_TOP_SHIFT;
page_size_shift_ = MMU_GUEST_PAGE_SIZE_SHIFT;
} else {
DEBUG_ASSERT(type_ == ArmAspaceType::kHypervisor);
DEBUG_ASSERT(base_ + size_ <= 1UL << MMU_IDENT_SIZE_SHIFT);
vaddr_base_ = 0;
top_size_shift_ = MMU_IDENT_SIZE_SHIFT;
top_index_shift_ = MMU_IDENT_TOP_SHIFT;
page_size_shift_ = MMU_IDENT_PAGE_SIZE_SHIFT;
}
// allocate a top level page table to serve as the translation table
auto result = AllocPageTable();
if (result.is_error()) {
return result.status_value();
}
paddr_t pa = (*result)->paddr();
volatile pte_t* va = static_cast<volatile pte_t*>(paddr_to_physmap(pa));
tt_virt_ = va;
tt_phys_ = pa;
tt_page_ = Pmm::Node().PaddrToPage(tt_phys_);
DEBUG_ASSERT(tt_page_);
// zero the top level translation table.
arch_zero_page(const_cast<pte_t*>(tt_virt_));
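// Ensure the zeroing is visible to the hardware page table walker before this table can be
// referenced.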
__dsb(ARM_MB_ISHST);
}
pt_pages_ = 1;
kcounter_add(vm_mmu_page_table_alloc, 1);
LTRACEF("tt_phys %#" PRIxPTR " tt_virt %p\n", tt_phys_, tt_virt_);
return ZX_OK;
}
zx_status_t ArmArchVmAspace::InitRestricted() {
role_ = ArmAspaceRole::kRestricted;
return Init();
}
zx_status_t ArmArchVmAspace::InitShared() {
zx_status_t status = Init();
if (status != ZX_OK) {
return status;
}
role_ = ArmAspaceRole::kShared;
Guard<CriticalMutex> a{&lock_};
// Prepopulate the portion of the top level page table spanned by this aspace by allocating the
// necessary second level entries.
const ulong start = base_ >> top_index_shift_;
const ulong end = (base_ + size_ - 1) >> top_index_shift_;
for (ulong i = start; i <= end; i++) {
DEBUG_ASSERT((tt_virt_[i] & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_DESCRIPTOR_INVALID);
auto result = AllocPageTable();
if (result.is_error()) {
return result.status_value();
}
paddr_t page_table_paddr = (*result)->paddr();
void* pt_vaddr = paddr_to_physmap(page_table_paddr);
arch_zero_page(pt_vaddr);
__dsb(ARM_MB_ISHST);
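// Install the table descriptor with the software AF set, marking this non-terminal entry as
// accessed.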
tt_virt_[i] = page_table_paddr | MMU_PTE_L012_DESCRIPTOR_TABLE | MMU_PTE_ATTR_RES_SOFTWARE_AF;
}
return ZX_OK;
}
zx_status_t ArmArchVmAspace::InitUnified(ArchVmAspaceInterface& s, ArchVmAspaceInterface& r) {
canary_.Assert();
LTRACEF("unified aspace %p, base %#" PRIxPTR ", size 0x%zx, type %*s\n", this, base_, size_,
static_cast<int>(ArmAspaceTypeName(type_).size()), ArmAspaceTypeName(type_).data());
ArmArchVmAspace& shared = static_cast<ArmArchVmAspace&>(s);
ArmArchVmAspace& restricted = static_cast<ArmArchVmAspace&>(r);
// Initialize this aspace.
{
Guard<CriticalMutex> a{&lock_};
DEBUG_ASSERT(base_ + size_ <= 1UL << MMU_USER_SIZE_SHIFT);
vaddr_base_ = 0;
top_size_shift_ = MMU_USER_SIZE_SHIFT;
top_index_shift_ = MMU_USER_TOP_SHIFT;
page_size_shift_ = MMU_USER_PAGE_SIZE_SHIFT;
// Assign the restricted address space's ASID to this address space.
if (feat_asid_enabled) {
asid_ = restricted.asid_;
} else {
// Initialize to a valid value even when disabled to distinguish from the destroyed case.
asid_ = MMU_ARM64_FIRST_USER_ASID;
}
// Unified aspaces use the same page table root that the restricted page table does.
tt_virt_ = restricted.tt_virt_;
tt_phys_ = restricted.tt_phys_;
tt_page_ = restricted.tt_page_;
// Set up our pointers to the restricted and shared aspaces.
restricted_aspace_ = &restricted;
shared_aspace_ = &shared;
role_ = ArmAspaceRole::kUnified;
LTRACEF("tt_phys %#" PRIxPTR " tt_virt %p\n", tt_phys_, tt_virt_);
}
const ulong restricted_start = restricted.base_ >> top_index_shift_;
const ulong restricted_end = (restricted.base_ + restricted.size_ - 1) >> top_index_shift_;
const ulong shared_start = shared.base_ >> top_index_shift_;
const ulong shared_end = (shared.base_ + shared.size_ - 1) >> top_index_shift_;
DEBUG_ASSERT(restricted_end < shared_start);
// Validate that the restricted aspace is empty and set its metadata.
{
Guard<CriticalMutex> a{&restricted.lock_};
DEBUG_ASSERT(restricted.tt_virt_);
DEBUG_ASSERT(restricted.num_references_ == 0);
DEBUG_ASSERT(!restricted.IsUnified());
for (ulong i = restricted_start; i <= restricted_end; i++) {
DEBUG_ASSERT((restricted.tt_virt_[i] & MMU_PTE_DESCRIPTOR_MASK) ==
MMU_PTE_DESCRIPTOR_INVALID);
}
restricted.num_references_++;
}
// Copy all mappings from the shared aspace and set its metadata.
{
Guard<CriticalMutex> a{&shared.lock_};
DEBUG_ASSERT(shared.tt_virt_);
DEBUG_ASSERT(shared.IsShared());
DEBUG_ASSERT(!restricted.IsUnified());
for (ulong i = shared_start; i <= shared_end; i++) {
DEBUG_ASSERT((shared.tt_virt_[i] & MMU_PTE_DESCRIPTOR_MASK) == MMU_PTE_L012_DESCRIPTOR_TABLE);
tt_virt_[i] = shared.tt_virt_[i];
}
shared.num_references_++;
}
return ZX_OK;
}
zx_status_t ArmArchVmAspace::DebugFindFirstLeafMapping(vaddr_t* out_pt, vaddr_t* out_vaddr,
pte_t* out_pte) const {
canary_.Assert();
DEBUG_ASSERT(tt_virt_);
DEBUG_ASSERT(out_vaddr);
DEBUG_ASSERT(out_pte);
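// Number of entries in one page table page: (1 << page_size_shift_) bytes divided by 8 bytes
// per PTE.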
const unsigned int count = 1U << (page_size_shift_ - 3);
const volatile pte_t* page_table = tt_virt_;
uint32_t index_shift = top_index_shift_;
vaddr_t vaddr = 0;
while (true) {
uint64_t index = 0;
pte_t pte;
// Walk the page table until we find an entry.
for (index = 0; index < count; index++) {
pte = page_table[index];
if (pte != MMU_PTE_DESCRIPTOR_INVALID) {
break;
}
}
if (index == count) {
return ZX_ERR_NOT_FOUND;
}
// Update the virtual address for the index at the current level.
vaddr += (index << index_shift);
const uint descriptor_type = pte & MMU_PTE_DESCRIPTOR_MASK;
const paddr_t pte_addr = pte & MMU_PTE_OUTPUT_ADDR_MASK;
// If we have found a leaf mapping, return it.
if (descriptor_type == ((index_shift > page_size_shift_) ? MMU_PTE_L012_DESCRIPTOR_BLOCK
: MMU_PTE_L3_DESCRIPTOR_PAGE)) {
*out_vaddr = vaddr;
*out_pte = pte;
*out_pt = reinterpret_cast<vaddr_t>(page_table);
return ZX_OK;
}
// Assume this entry could be corrupted: validate that the next table address is in the physmap
// and return a graceful error on invalid descriptor types.
if (!is_physmap_phys_addr(pte_addr) || index_shift <= page_size_shift_ ||
descriptor_type != MMU_PTE_L012_DESCRIPTOR_TABLE) {
*out_vaddr = vaddr;
*out_pte = pte;
*out_pt = reinterpret_cast<vaddr_t>(page_table);
return ZX_ERR_BAD_STATE;
}
page_table = static_cast<const volatile pte_t*>(paddr_to_physmap(pte_addr));
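// Descend one level; each level of the walk resolves (page_size_shift_ - 3) more bits of the
// virtual address.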
index_shift -= page_size_shift_ - 3;
}
}
void ArmArchVmAspace::AssertEmptyLocked() const {
// Check to see if the top level page table is empty. If not, the user didn't
// properly unmap everything before destroying the aspace.
const int index = first_used_page_table_entry(tt_virt_, page_size_shift_);
// Restricted aspaces share their top level page with their associated unified aspace, which
// maintains shared mappings beyond base_ + size_. We ignore these mappings when validating that
// the restricted aspace is empty.
const int end_index = (int)((base_ + size_ - 1) >> top_index_shift_);
if (index != -1 && index <= end_index) {
vaddr_t pt_addr = 0;
vaddr_t entry_vaddr = 0;
pte_t pte = 0;
// Attempt to walk the page table and find the first leaf-most mapping that we can. This
// represents (at least one of) the entries that is keeping this page table alive.
//
// TODO(https://fxbug.dev/42159319): Once https://fxbug.dev/42159319 is resolved this call, and
// the entire called method, can be removed.
zx_status_t status = DebugFindFirstLeafMapping(&pt_addr, &entry_vaddr, &pte);
panic(
"top level page table still in use! aspace %p pt_pages_ %zu tt_virt %p index %d entry "
"%" PRIx64 ". Leaf query status %d pt_addr %zu vaddr %zu entry %" PRIx64 "\n",
this, pt_pages_, tt_virt_, index, tt_virt_[index], status, pt_addr, entry_vaddr, pte);
}
if (pt_pages_ != 1) {
panic("allocated page table count is wrong, aspace %p count %zu (should be 1)\n", this,
pt_pages_);
}
}
void ArmArchVmAspace::DisableUpdates() {
canary_.Assert();
Guard<CriticalMutex> a{&lock_};
updates_enabled_ = false;
if (!tt_virt_) {
// Initialization must not have succeeded.
return;
}
if (!IsShared() && !IsUnified()) {
AssertEmptyLocked();
}
}
zx_status_t ArmArchVmAspace::DestroyIndividual() {
DEBUG_ASSERT(!IsUnified());
Guard<CriticalMutex> a{&lock_};
DEBUG_ASSERT(num_references_ == 0);
// If this page table has a prepopulated top level, we need to manually clean up the entries we
// created in InitShared. We know for sure that these entries are no longer referenced by
// other page tables because we expect those page tables to have been destroyed before this one.
if (IsShared()) {
const ulong start = base_ >> top_index_shift_;
const ulong end = (base_ + size_ - 1) >> top_index_shift_;
for (ulong i = start; i <= end; i++) {
const paddr_t paddr = tt_virt_[i] & MMU_PTE_OUTPUT_ADDR_MASK;
vm_page_t* page = paddr_to_vm_page(paddr);
DEBUG_ASSERT(page);
DEBUG_ASSERT(page->state() == vm_page_state::MMU);
CacheFreePage(page);
pt_pages_--;
tt_virt_[i] = MMU_PTE_DESCRIPTOR_INVALID;
}
}
AssertEmptyLocked();
// Need a DSB to synchronize any page table updates prior to flushing the TLBs.
__dsb(ARM_MB_ISH);
// Flush the ASID or VMID associated with this aspace
FlushAsid();
// Need a DSB to ensure all other cpus have fully processed the TLB flush.
__dsb(ARM_MB_ISH);
// Free any ASID.
if (type_ == ArmAspaceType::kUser) {
if (feat_asid_enabled) {
auto status = asid->Free(asid_);
ASSERT(status.is_ok());
} else {
DEBUG_ASSERT(asid_ == MMU_ARM64_FIRST_USER_ASID);
}
asid_ = MMU_ARM64_UNUSED_ASID;
}
// Free the top level page table.
vm_page_t* page = tt_page_;
DEBUG_ASSERT(page);
CacheFreePage(page);
pt_pages_--;
kcounter_add(vm_mmu_page_table_free, 1);
tt_phys_ = 0;
tt_virt_ = nullptr;
tt_page_ = nullptr;
return ZX_OK;
}
zx_status_t ArmArchVmAspace::DestroyUnified() {
{
Guard<CriticalMutex> a{&shared_aspace_->lock_};
// The shared page table should be referenced by at least this page table, and could be
// referenced by many other unified page tables.
DEBUG_ASSERT(shared_aspace_->num_references_ > 0);
shared_aspace_->num_references_--;
}
{
Guard<CriticalMutex> a{&restricted_aspace_->lock_};
// The restricted_aspace_ page table can only be referenced by a single unified page table.
DEBUG_ASSERT(restricted_aspace_->num_references_ == 1);
restricted_aspace_->num_references_--;
}
shared_aspace_ = nullptr;
restricted_aspace_ = nullptr;
asid_ = MMU_ARM64_UNUSED_ASID;
tt_phys_ = 0;
tt_page_ = nullptr;
tt_virt_ = nullptr;
return ZX_OK;
}
zx_status_t ArmArchVmAspace::Destroy() {
canary_.Assert();
LTRACEF("aspace %p\n", this);
// We cannot destroy the kernel address space.
DEBUG_ASSERT(type_ != ArmAspaceType::kKernel);
// Make sure initialization succeeded.
if (!tt_virt_) {
DEBUG_ASSERT(!tt_phys_);
DEBUG_ASSERT(!tt_page_);
return ZX_OK;
}
if (IsUnified()) {
return DestroyUnified();
}
return DestroyIndividual();
}
// Called during context switches between threads with different address spaces. Swaps the
// mmu context on hardware. Assumes old_aspace != aspace and optimizes as such.
void ArmArchVmAspace::ContextSwitch(ArmArchVmAspace* old_aspace, ArmArchVmAspace* aspace) {
uint64_t tcr;
uint64_t ttbr;
// If we're not using ASIDs, we need to trigger a TLB flush here so we don't leak entries across
// the context switch. Note that this flush is not needed when switching to the kernel-only
// aspace: the kernel's mappings are global, and the stale user entries will be flushed when the
// next user aspace is switched in.
if (aspace && !feat_asid_enabled) {
// asid_ is always set to MMU_ARM64_FIRST_USER_ASID when ASID use is disabled, so this will
// invalidate all TLB entries except the global ones.
DEBUG_ASSERT(aspace->asid_ == MMU_ARM64_FIRST_USER_ASID);
ARM64_TLBI_ASID(aside1, aspace->asid_);
}
if (likely(aspace)) {
aspace->canary_.Assert();
// Check that we are switching to a user aspace, and that the asid is in the valid range.
DEBUG_ASSERT(aspace->type_ == ArmAspaceType::kUser);
DEBUG_ASSERT(aspace->asid_ >= MMU_ARM64_FIRST_USER_ASID);
// Compute the user space TTBR with the translation table and user space ASID.
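// TTBR0_EL1 carries the ASID in bits [63:48] and the translation table base address in the
// lower bits.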
ttbr = ((uint64_t)aspace->asid_ << 48) | aspace->tt_phys_;
tcr = aspace->Tcr();
// Update TCR and TTBR0 if the new aspace uses different values, or if we're switching away
// from the kernel aspace.
if (unlikely(!old_aspace)) {
__arm_wsr64("ttbr0_el1", ttbr);
__arm_wsr64("tcr_el1", tcr);
__isb(ARM_MB_SY);
} else {
uint64_t old_ttbr = ((uint64_t)old_aspace->asid_ << 48) | old_aspace->tt_phys_;
bool needs_isb = false;
if (old_ttbr != ttbr) {
__arm_wsr64("ttbr0_el1", ttbr);
needs_isb = true;
}
if (old_aspace->Tcr() != aspace->Tcr()) {
__arm_wsr64("tcr_el1", tcr);
needs_isb = true;
}
if (needs_isb) {
__isb(ARM_MB_SY);
}
[[maybe_unused]] uint32_t prev =
old_aspace->num_active_cpus_.fetch_sub(1, ktl::memory_order_relaxed);
DEBUG_ASSERT(prev > 0);
}
[[maybe_unused]] uint32_t prev =
aspace->num_active_cpus_.fetch_add(1, ktl::memory_order_relaxed);
DEBUG_ASSERT(prev < SMP_MAX_CPUS);
aspace->active_since_last_check_.store(true, ktl::memory_order_relaxed);
// If we are switching to a unified aspace, we need to mark the associated shared and
// restricted aspaces as active since the last check as well.
if (aspace->IsUnified()) {
aspace->shared_aspace_->MarkAspaceModified();
aspace->restricted_aspace_->MarkAspaceModified();
}
} else {
// Switching to the null aspace, which means kernel address space only.
// Load a null TTBR0 and disable page table walking for user space.
tcr = MMU_TCR_FLAGS_KERNEL;
__arm_wsr64("tcr_el1", tcr);
__isb(ARM_MB_SY);
ttbr = 0; // MMU_ARM64_UNUSED_ASID
__arm_wsr64("ttbr0_el1", ttbr);
__isb(ARM_MB_SY);
if (likely(old_aspace != nullptr)) {
[[maybe_unused]] uint32_t prev =
old_aspace->num_active_cpus_.fetch_sub(1, ktl::memory_order_relaxed);
DEBUG_ASSERT(prev > 0);
}
}
if (TRACE_CONTEXT_SWITCH) {
TRACEF("old aspace %p aspace %p ttbr %#" PRIx64 ", tcr %#" PRIx64 "\n", old_aspace, aspace,
ttbr, tcr);
}
}
void arch_zero_page(void* _ptr) {
uintptr_t ptr = (uintptr_t)_ptr;
uint32_t zva_size = arm64_zva_size;
uintptr_t end_ptr = ptr + PAGE_SIZE;
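// DC ZVA zeroes one zva_size-byte block per iteration; the loop assumes zva_size evenly
// divides PAGE_SIZE so that it terminates exactly at end_ptr.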
do {
__asm__ volatile("dc zva, %0" ::"r"(ptr));
ptr += zva_size;
} while (ptr != end_ptr);
}
zx_status_t arm64_mmu_translate(vaddr_t va, paddr_t* pa, bool user, bool write) {
// disable interrupts around this operation to make the at/par instruction combination atomic
uint64_t par;
{
InterruptDisableGuard irqd;
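// AT S1E0R/S1E0W perform a stage 1 translation as if from EL0 (user) for a read/write access;
// AT S1E1R/S1E1W do the same from EL1. The result is reported in PAR_EL1.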
if (user) {
if (write) {
__asm__ volatile("at s1e0w, %0" ::"r"(va) : "memory");
} else {
__asm__ volatile("at s1e0r, %0" ::"r"(va) : "memory");
}
} else {
if (write) {
__asm__ volatile("at s1e1w, %0" ::"r"(va) : "memory");
} else {
__asm__ volatile("at s1e1r, %0" ::"r"(va) : "memory");
}
}
par = __arm_rsr64("par_el1");
}
// if bit 0 is clear, the translation succeeded
if (BIT(par, 0)) {
return ZX_ERR_NOT_FOUND;
}
// physical address is stored in bits [51..12], naturally aligned
*pa = BITS(par, 51, 12) | (va & (PAGE_SIZE - 1));
return ZX_OK;
}
ArmArchVmAspace::ArmArchVmAspace(vaddr_t base, size_t size, ArmAspaceType type, page_alloc_fn_t paf)
: test_page_alloc_func_(paf), type_(type), base_(base), size_(size) {}
ArmArchVmAspace::ArmArchVmAspace(vaddr_t base, size_t size, uint mmu_flags, page_alloc_fn_t paf)
: ArmArchVmAspace(base, size, AspaceTypeFromFlags(mmu_flags), paf) {}
ArmArchVmAspace::~ArmArchVmAspace() {
// Destroy() will have freed the final page table if it ran correctly, and further validated that
// everything else was freed.
DEBUG_ASSERT(pt_pages_ == 0);
}
vaddr_t ArmArchVmAspace::PickSpot(vaddr_t base, vaddr_t end, vaddr_t align, size_t size,
uint mmu_flags) {
canary_.Assert();
return PAGE_ALIGN(base);
}
void ArmVmICacheConsistencyManager::SyncAddr(vaddr_t start, size_t len) {
// Validate we are operating on a kernel address range.
DEBUG_ASSERT(is_kernel_address(start));
// Use the physmap to clean the range. If we have been requested to clean to the PoC then we
// must do that; otherwise we can just clean to the PoU, which is the point the instruction
// cache fetches from. Cleaning to the PoU is potentially cheaper than cleaning to the PoC.
if (clean_poc_) {
arch_clean_cache_range(start, len);
} else {
arm64_clean_cache_range_pou(start, len);
}
// We can batch the icache invalidate and just perform it once at the end.
need_invalidate_ = true;
}
void ArmVmICacheConsistencyManager::Finish() {
if (!need_invalidate_) {
return;
}
// Under the assumption that our icache is VIPT, we do not know all the virtual aliases of the
// ranges we cleaned, so our only option is to invalidate the entire icache.
arch::InvalidateGlobalInstructionCache();
__isb(ARM_MB_SY);
need_invalidate_ = false;
}
void arm64_mmu_early_init() {
// Our current ASID allocation scheme is very naive and allocates a unique ASID to every address
// space, which means that there are often not enough ASIDs when the machine uses 8-bit ASIDs.
// Therefore, if we detect that we are only given 8-bit ASIDs, disable their use.
feat_asid_enabled =
gBootOptions->arm64_enable_asid && arm64_asid_width() != arm64_asid_width::ASID_8;
// After we've probed the feature set and parsed the boot options, initialize the asid allocator.
if (feat_asid_enabled) {
asid.Initialize();
} else {
dprintf(INFO, "mmu: not using ASIDs\n");
}
}
uint32_t arch_address_tagging_features() {
static_assert(MMU_TCR_FLAGS_USER & MMU_TCR_TBI0, "Expected TBI enabled.");
return ZX_ARM64_FEATURE_ADDRESS_TAGGING_TBI;
}