blob: 2bd7550456b3fbcfcd8f3e9c03847f595fa3da3e [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
#include <align.h>
#include <lib/arch/x86/boot-cpuid.h>
#include <lib/fit/defer.h>
#include <lib/zx/result.h>
#include <arch/x86/page_tables/constants.h>
#include <fbl/canary.h>
#include <hwreg/bitfields.h>
#include <kernel/mutex.h>
#include <page_tables/x86/constants.h>
#include <vm/arch_vm_aspace.h>
#include <vm/mapping_cursor.h>
#include <vm/physmap.h>
#include <vm/pmm.h>
// Raw 64-bit value of a single hardware page table entry.
using pt_entry_t = uint64_t;
// printf-style hexadecimal conversion specifier matching pt_entry_t.
#define PRIxPTE PRIx64
// The levels of the page table management hierarchy, numbered from the lowest
// level (PT_L) upwards. The values are contiguous; lower_level() relies on this
// by subtracting 1 from the numeric value to step one level down.
enum class PageTableLevel {
// Lowest level. QueryVaddr treats a mapping that terminates here as a 4K page.
PT_L = 0,
// QueryVaddr treats a mapping that terminates here as a 2MB page.
PD_L = 1,
// QueryVaddr treats a mapping that terminates here as a 1GB page.
PDP_L = 2,
// Highest level declared here; PendingTlbInvalidation::enqueue() turns
// invalidations at this level into full shootdowns.
PML4_L = 3,
// Different roles a page table can fulfill when running with unified aspaces.
enum class PageTableRole : uint8_t {
// Type for flags used in the hardware page tables, for terminal entries.
// Note that some flags here may have meanings that depend on the level
// at which they occur (e.g. page size and PAT).
using PtFlags = uint64_t;
// Type for flags used in the hardware page tables, for non-terminal
// entries.
using IntermediatePtFlags = uint64_t;
namespace internal {
// Utility for coalescing cache line flushes when modifying page tables. This
// allows us to mutate adjacent page table entries without having to flush for
// each cache line multiple times.
//
// At most one cache line is tracked as dirty at a time (|dirty_line_|); the
// destructor calls ForceFlush() so a tracked line is not left behind.
class CacheLineFlusher {
// If |perform_invalidations| is false, this class acts as a no-op.
// The cache line mask is derived once, at construction, from the cache line
// size reported by the boot CPUID data.
explicit CacheLineFlusher(bool perform_invalidations)
: dirty_line_(0),
cl_mask_(~(arch::BootCpuid<arch::CpuidProcessorInfo>().cache_line_size_bytes() - 1ull)),
perform_invalidations_(perform_invalidations) {}
~CacheLineFlusher() { ForceFlush(); }
// Records that the page table entry at |entry| was modified. The entry's
// address is rounded down to its cache line; if that differs from the line
// currently tracked, the tracked line is switched to the new one.
// NOTE(review): presumably the previously tracked line is flushed before being
// replaced; that statement is not visible in this excerpt - confirm.
void FlushPtEntry(const volatile pt_entry_t* entry) {
uintptr_t entry_line = reinterpret_cast<uintptr_t>(entry) & cl_mask_;
if (entry_line != dirty_line_) {
dirty_line_ = entry_line;
// Issues a clflush for the tracked line, if there is one and invalidations
// are enabled, and resets the tracked line to "none" (0).
void ForceFlush() {
if (dirty_line_ && perform_invalidations_) {
__asm__ volatile("clflush %0\n" : : "m"(*reinterpret_cast<char*>(dirty_line_)) : "memory");
dirty_line_ = 0;
// The cache-line-aligned address that is currently dirty. If 0, there is no
// dirty line.
uintptr_t dirty_line_;
// Mask that rounds an address down to the start of its cache line.
const uintptr_t cl_mask_;
// When false, ForceFlush() never issues a clflush.
const bool perform_invalidations_;
// Structure for tracking an upcoming TLB invalidation
struct PendingTlbInvalidation {
// One queued invalidation, packed into a single 64-bit word:
//   bits 2:0   page_level   - level the address was translated at
//   bit  3     is_global    - entry was for a global page
//   bit  4     is_terminal  - invalidation targets the final translation step
//   bits 63:12 encoded_addr - virtual address shifted right by PAGE_SIZE_SHIFT
struct Item {
uint64_t raw;
DEF_SUBFIELD(raw, 2, 0, page_level);
DEF_SUBBIT(raw, 3, is_global);
DEF_SUBBIT(raw, 4, is_terminal);
DEF_SUBFIELD(raw, 63, 12, encoded_addr);
// Reconstructs the virtual address from its encoded (page-shifted) form.
vaddr_t addr() const { return encoded_addr() << PAGE_SIZE_SHIFT; }
// The packing above must not grow the item beyond one 64-bit word.
static_assert(sizeof(Item) == 8, "");
// If true, ignore the addresses in |item| and perform a full invalidation for
// this context.
bool full_shootdown = false;
// If true, at least one enqueued entry was for a global page.
bool contains_global = false;
// Number of valid elements in |item|
uint count = 0;
// List of addresses queued for invalidation.
// Explicitly uninitialized since the size is fairly large.
Item item[32];
// Add address |v|, translated at depth |level|, to the set of addresses to be invalidated.
// |is_terminal| should be true iff this invalidation is targeting the final step of the
// translation rather than a higher page table entry. |is_global_page| should be true iff this
// page was mapped with the global bit set.
//
// When |item| is already full, or |level| is PML4_L, the request degrades to a
// full shootdown instead of a per-address entry.
void enqueue(vaddr_t v, PageTableLevel level, bool is_global_page, bool is_terminal) {
if (is_global_page) {
contains_global = true;
// We mark PML4_L entries as full shootdowns, since it's going to be
// expensive one way or another.
if (count >= ktl::size(item) || level == PageTableLevel::PML4_L) {
full_shootdown = true;
// Store the address in page-shifted form; Item::addr() undoes the shift.
item[count].set_encoded_addr(v >> PAGE_SIZE_SHIFT);
// Clear the list of pending invalidations
void clear() {
count = 0;
full_shootdown = false;
contains_global = false;
// In debug builds, destroying an object that still has queued entries
// (i.e. without clear() having reset |count|) trips this assert.
~PendingTlbInvalidation() { DEBUG_ASSERT(count == 0); }
} // namespace internal
class X86PageTableBase {
X86PageTableBase() {}
virtual ~X86PageTableBase() {
DEBUG_ASSERT_MSG(!phys_, "page table dtor called before Destroy()");
// Physical address of the top-level translation table (0 until Init() runs).
paddr_t phys() const { return phys_; }
// Virtual address of the top-level translation table.
void* virt() const { return virt_; }
// Number of pages currently backing the translation tables. Unlike the other
// accessors this takes lock_, because |pages_| is guarded by it.
size_t pages() {
Guard<Mutex> al{AssertOrderedLock, &lock_, LockOrder()};
return pages_;
// Context pointer that was handed to Init().
void* ctx() const { return ctx_; }
using PendingTlbInvalidation = internal::PendingTlbInvalidation;
using CacheLineFlusher = internal::CacheLineFlusher;
using ExistingEntryAction = ArchVmAspaceInterface::ExistingEntryAction;
using EnlargeOperation = ArchVmAspaceInterface::EnlargeOperation;
// Returns whether this page table has the restricted role.
// Only |role_| is examined here. Whether the table has also been linked to a
// unified page table is a separate condition that callers test themselves
// (e.g. `IsRestricted() && referenced_pt_ != nullptr` in AddMapping).
bool IsRestricted() const { return role_ == PageTableRole::kRestricted; }
// Returns whether this page table has the shared role.
bool IsShared() const { return role_ == PageTableRole::kShared; }
// Returns whether this page table has the unified role.
bool IsUnified() const { return role_ == PageTableRole::kUnified; }
// Maps |count| pages starting at |vaddr| to the physical pages listed in
// |phys|. |existing_action| selects what happens when an entry is already
// present; on success |*mapped| (if non-null) receives |count|.
virtual zx_status_t MapPages(vaddr_t vaddr, paddr_t* phys, size_t count, uint mmu_flags,
ExistingEntryAction existing_action, size_t* mapped) = 0;
// Maps |count| pages starting at |vaddr| to the physically contiguous range
// starting at |paddr|; on success |*mapped| (if non-null) receives |count|.
virtual zx_status_t MapPagesContiguous(vaddr_t vaddr, paddr_t paddr, const size_t count,
uint mmu_flags, size_t* mapped) = 0;
// Removes the mappings for |count| pages starting at |vaddr|.
virtual zx_status_t UnmapPages(vaddr_t vaddr, const size_t count, EnlargeOperation enlarge,
size_t* unmapped) = 0;
// Applies |mmu_flags| to the existing mappings of |count| pages at |vaddr|.
virtual zx_status_t ProtectPages(vaddr_t vaddr, size_t count, uint mmu_flags) = 0;
// Looks up the mapping covering |vaddr|. |paddr| and |mmu_flags| are optional
// outputs; either may be null.
virtual zx_status_t QueryVaddr(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) = 0;
using NonTerminalAction = ArchVmAspaceInterface::NonTerminalAction;
using TerminalAction = ArchVmAspaceInterface::TerminalAction;
// Walks |count| pages starting at |vaddr|, applying |non_terminal_action| and
// |terminal_action| to the entries encountered.
virtual zx_status_t HarvestAccessed(vaddr_t vaddr, size_t count,
NonTerminalAction non_terminal_action,
TerminalAction terminal_action) = 0;
// Lock-ordering rank supplied to the ordered Guard whenever lock_ is taken: a unified page
// table ranks 1 and every other page table ranks 0. The restricted page table lock is
// therefore acquired first, and the unified page table lock afterwards.
uint32_t LockOrder() const {
  if (IsUnified()) {
    return 1;
  }
  return 0;
}
using page_alloc_fn_t = ArchVmAspaceInterface::page_alloc_fn_t;
// Initialize an empty page table, assigning this given context to it.
//
// |test_paf|, when non-null, replaces the default page allocator for this page
// table (see AllocatePageTable()). Thread-safety analysis is disabled because
// the guarded |pages_| is written here without holding lock_.
// Returns ZX_OK once the top-level table is allocated and recorded.
zx_status_t Init(void* ctx, page_alloc_fn_t test_paf = nullptr) TA_NO_THREAD_SAFETY_ANALYSIS {
// Must be set before the first allocation so the override is honoured.
test_page_alloc_func_ = test_paf;
/* allocate a top level page table for the new address space */
virt_ = AllocatePageTable();
// AllocatePageTable() returns nullptr when the allocation fails.
if (!virt_) {
phys_ = physmap_to_paddr(virt_);
DEBUG_ASSERT(phys_ != 0);
ctx_ = ctx;
// The top-level table is the only page allocated so far.
pages_ = 1;
return ZX_OK;
// Allocates one page for use as a page table and returns its physmap virtual
// address, or nullptr if the allocation failed. Uses |test_page_alloc_func_|
// when one was supplied to Init(), otherwise pmm_alloc_page().
pt_entry_t* AllocatePageTable() {
paddr_t pa;
vm_page* p;
zx_status_t status;
// The default allocation routine is pmm_alloc_page so test and explicitly call it
// to avoid any unnecessary virtual function calls.
if (likely(!test_page_alloc_func_)) {
status = pmm_alloc_page(0, &p, &pa);
} else {
status = test_page_alloc_func_(0, &p, &pa);
if (status != ZX_OK) {
return nullptr;
// Translate the physical address of the new page into its physmap address.
pt_entry_t* page_ptr = static_cast<pt_entry_t*>(paddr_to_physmap(pa));
return page_ptr;
fbl::Canary<fbl::magic("X86P")> canary_;
// The number of times entries in the pml4 are referenced by other page tables.
// Unified page tables increment and decrement this value on their associated shared and
// restricted page tables, so we must hold the lock_ when doing so.
uint32_t num_references_ TA_GUARDED(lock_) = 0;
// The role this page table plays in unified aspaces, if any. This should only be set by the
// Init* functions, and should not be modified anywhere else.
PageTableRole role_ = PageTableRole::kIndependent;
// Page allocate function, overridable for testing.
page_alloc_fn_t test_page_alloc_func_ = nullptr;
// Pointer to the translation table.
paddr_t phys_ = 0;
pt_entry_t* virt_ = nullptr;
// Counter of pages allocated to back the translation table.
size_t pages_ TA_GUARDED(lock_) = 0;
// A context structure that may be used by a PageTable type above as part of
// invalidation.
void* ctx_ = nullptr;
// Lock to protect the mmu code.
DECLARE_MUTEX(X86PageTableBase, lockdep::LockFlagsNestable) lock_;
// Implementation of the X86 page table code, that is expected to be derived using the recursive
// template pattern. The derived class T is expected to implement the following methods:
// Returns the highest level of the page tables
// PageTableLevel top_level();
// Returns true if the given ARCH_MMU_FLAG_* flag combination is valid.
// bool allowed_flags(uint flags);
// Returns true if the given paddr is valid
// bool check_paddr(paddr_t paddr);
// Returns true if the given vaddr is valid
// bool check_vaddr(vaddr_t vaddr);
// Whether the processor supports the page size of this level
// bool supports_page_size(PageTableLevel level);
// Return the hardware flags to use on intermediate page tables entries
// IntermediatePtFlags intermediate_flags();
// Return the hardware flags to use on terminal page table entries
// PtFlags terminal_flags(PageTableLevel level, uint flags);
// Return the hardware flags to use on smaller pages after a splitting a
// large page with flags |flags|.
// PtFlags split_flags(PageTableLevel level, PtFlags flags);
// Execute the given pending invalidation
// void TlbInvalidate(const PendingTlbInvalidation* pending);
// Convert PtFlags to ARCH_MMU_* flags.
// uint pt_flags_to_mmu_flags(PtFlags flags, PageTableLevel level);
// Returns true if a cache flush is necessary for pagetable changes to be
// visible to hardware page table walkers. On x86, this is only true for Intel IOMMU page
// tables when the IOMMU 'caching mode' bit is true.
// bool needs_cache_flushes();
template <class T>
class X86PageTableImpl : public X86PageTableBase {
X86PageTableImpl() {}
// Accessors for the shared and restricted page tables on a unified page table.
// We can turn off thread safety analysis as these accessors should only be used on unified page
// tables, for which both the shared and restricted page table pointers are notionally const.
// Returns the shared page table recorded by InitUnified().
X86PageTableBase* get_shared_pt() TA_NO_THREAD_SAFETY_ANALYSIS {
return shared_pt_;
// Returns the restricted page table. On a unified page table InitUnified()
// stores it in |referenced_pt_|, which is why that member is returned here.
X86PageTableBase* get_restricted_pt() TA_NO_THREAD_SAFETY_ANALYSIS {
return referenced_pt_;
// Maps |count| pages starting at |vaddr| to the physical pages in |phys|.
//
// |vaddr|, every element of |phys| and |mmu_flags| are validated through the
// derived class T before any page table is touched; a zero |count| returns
// ZX_OK. On success |*mapped| (if non-null) is set to |count|. If AddMapping()
// fails, its status is returned after being logged.
zx_status_t MapPages(vaddr_t vaddr, paddr_t* phys, size_t count, uint mmu_flags,
ExistingEntryAction existing_action, size_t* mapped) override final {
if (!static_cast<T*>(this)->check_vaddr(vaddr))
for (size_t i = 0; i < count; ++i) {
if (!static_cast<T*>(this)->check_paddr(phys[i]))
if (count == 0)
return ZX_OK;
if (!static_cast<T*>(this)->allowed_flags(mmu_flags))
// Declared before the lock guard, so it is destroyed after the guard
// releases lock_.
__UNINITIALIZED ConsistencyManager cm(this);
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
MappingCursor cursor(/*paddrs=*/phys, /*paddr_count=*/count, /*page_size=*/PAGE_SIZE,
zx_status_t status = AddMapping(virt_, mmu_flags, static_cast<T*>(this)->top_level(),
existing_action, cursor, &cm);
if (status != ZX_OK) {
dprintf(SPEW, "Add mapping failed with err=%d\n", status);
return status;
// A successful AddMapping() must have consumed the whole cursor.
DEBUG_ASSERT(cursor.size() == 0);
if (mapped) {
*mapped = count;
return ZX_OK;
// Maps |count| pages starting at |vaddr| to the physically contiguous range
// that begins at |paddr|.
//
// Unlike MapPages(), an already-present entry is always an error
// (ExistingEntryAction::Error). |paddr|, |vaddr| and |mmu_flags| are validated
// through the derived class T first; a zero |count| returns ZX_OK. On success
// |*mapped| (if non-null) is set to |count|.
zx_status_t MapPagesContiguous(vaddr_t vaddr, paddr_t paddr, const size_t count, uint mmu_flags,
size_t* mapped) override final {
if (!static_cast<T*>(this)->check_paddr(paddr))
if (!static_cast<T*>(this)->check_vaddr(vaddr))
if (count == 0)
return ZX_OK;
if (!static_cast<T*>(this)->allowed_flags(mmu_flags))
// The contiguous range is described to the cursor as a single physical
// "page" of count * PAGE_SIZE bytes.
MappingCursor cursor(/*paddrs=*/&paddr, /*paddr_count=*/1, /*page_size=*/count * PAGE_SIZE,
// Declared before the lock guard, so it is destroyed after the guard
// releases lock_.
__UNINITIALIZED ConsistencyManager cm(this);
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
zx_status_t status = AddMapping(virt_, mmu_flags, static_cast<T*>(this)->top_level(),
ExistingEntryAction::Error, cursor, &cm);
if (status != ZX_OK) {
dprintf(SPEW, "Add mapping failed with err=%d\n", status);
return status;
// A successful AddMapping() must have consumed the whole cursor.
DEBUG_ASSERT(cursor.size() == 0);
if (mapped)
*mapped = count;
return ZX_OK;
// Removes the mappings for |count| pages starting at |vaddr|.
//
// |enlarge| is forwarded to RemoveMapping(), which consults it when a large
// page cannot be split. |*unmapped| (if non-null) is set to |count|. Returns
// the status carried by RemoveMapping()'s result; a zero |count| returns ZX_OK.
zx_status_t UnmapPages(vaddr_t vaddr, const size_t count, EnlargeOperation enlarge,
size_t* unmapped) override final {
if (!static_cast<T*>(this)->check_vaddr(vaddr))
if (count == 0)
return ZX_OK;
MappingCursor cursor(/*vaddr=*/vaddr, /*size=*/count * PAGE_SIZE);
// Declared before the lock guard, so it is destroyed after the guard
// releases lock_.
__UNINITIALIZED ConsistencyManager cm(this);
// This needs to be initialized to some value as gcc cannot work out that it can elide the
// default constructor.
zx::result<bool> status = zx::ok(true);
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
status = RemoveMapping(virt_, static_cast<T*>(this)->top_level(), enlarge, cursor, &cm);
// The cursor may only be left partially consumed when an error is reported.
DEBUG_ASSERT(cursor.size() == 0 || status.is_error());
if (unmapped)
*unmapped = count;
return status.status_value();
// Applies |mmu_flags| to the mappings of |count| pages starting at |vaddr|.
//
// |vaddr| and |mmu_flags| are validated through the derived class T first; a
// zero |count| returns ZX_OK. Any failure from UpdateMapping() is returned
// unchanged.
zx_status_t ProtectPages(vaddr_t vaddr, size_t count, uint mmu_flags) override final {
if (!static_cast<T*>(this)->check_vaddr(vaddr))
if (count == 0)
return ZX_OK;
if (!static_cast<T*>(this)->allowed_flags(mmu_flags))
MappingCursor cursor(/*vaddr=*/vaddr, /*size=*/count * PAGE_SIZE);
// Declared before the lock guard, so it is destroyed after the guard
// releases lock_.
__UNINITIALIZED ConsistencyManager cm(this);
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
zx_status_t status =
UpdateMapping(virt_, mmu_flags, static_cast<T*>(this)->top_level(), cursor, &cm);
if (status != ZX_OK) {
return status;
// A successful UpdateMapping() must have consumed the whole cursor.
DEBUG_ASSERT(cursor.size() == 0);
return ZX_OK;
// Looks up the mapping that covers |vaddr|.
//
// |paddr| and |mmu_flags| are optional outputs; either may be null. When
// |paddr| is given, it receives the frame address taken from the terminal
// entry combined with the offset of |vaddr| inside that page, using the
// offset mask for the page size of the level the mapping was found at.
// Returns the status from GetMapping() if the lookup fails.
zx_status_t QueryVaddr(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) override final {
PageTableLevel ret_level;
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
volatile pt_entry_t* last_valid_entry;
zx_status_t status =
GetMapping(virt_, vaddr, static_cast<T*>(this)->top_level(), &ret_level, &last_valid_entry);
if (status != ZX_OK)
return status;
/* based on the return level, parse the page table entry */
if (paddr) {
switch (ret_level) {
case PageTableLevel::PDP_L: /* 1GB page */
*paddr = paddr_from_pte(PageTableLevel::PDP_L, *last_valid_entry);
*paddr |= vaddr & PAGE_OFFSET_MASK_HUGE;
case PageTableLevel::PD_L: /* 2MB page */
*paddr = paddr_from_pte(PageTableLevel::PD_L, *last_valid_entry);
*paddr |= vaddr & PAGE_OFFSET_MASK_LARGE;
case PageTableLevel::PT_L: /* 4K page */
*paddr = paddr_from_pte(PageTableLevel::PT_L, *last_valid_entry);
*paddr |= vaddr & PAGE_OFFSET_MASK_4KB;
// NOTE(review): presumably reached only for a level with no case above; the
// case terminators and default label are not visible in this excerpt.
panic("arch_mmu_query: unhandled frame level\n");
/* converting arch-specific flags to mmu flags */
if (mmu_flags) {
*mmu_flags = static_cast<T*>(this)->pt_flags_to_mmu_flags(*last_valid_entry, ret_level);
return ZX_OK;
// Walks the mappings of |count| pages starting at |vaddr| via
// HarvestMapping(), passing |non_terminal_action| and |terminal_action|
// through. |vaddr| is validated through the derived class T first; a zero
// |count| returns ZX_OK without taking the lock.
zx_status_t HarvestAccessed(vaddr_t vaddr, size_t count, NonTerminalAction non_terminal_action,
TerminalAction terminal_action) override final {
if (!static_cast<T*>(this)->check_vaddr(vaddr)) {
if (count == 0) {
return ZX_OK;
MappingCursor cursor(/*vaddr=*/vaddr, /*size=*/count * PAGE_SIZE);
// Declared before the lock guard, so it is destroyed after the guard
// releases lock_.
__UNINITIALIZED ConsistencyManager cm(this);
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
HarvestMapping(virt_, non_terminal_action, terminal_action,
static_cast<T*>(this)->top_level(), cursor, &cm);
// HarvestMapping() is expected to have walked the whole range.
DEBUG_ASSERT(cursor.size() == 0);
return ZX_OK;
// Initialize an empty page table and mark it as restricted.
// The role is recorded first and the remaining setup is delegated to Init().
// This function does not itself write |pages_|; that write happens inside
// Init(), which is the function annotated with TA_NO_THREAD_SAFETY_ANALYSIS.
// Returns whatever Init() returns.
zx_status_t InitRestricted(void* ctx, page_alloc_fn_t test_paf = nullptr) {
role_ = PageTableRole::kRestricted;
return Init(ctx, test_paf);
// Initialize a page table, assign the given context, and prepopulate the top level page table
// entries.
// We disable analysis due to the write to |pages_| tripping it up. It is safe
// to write to |pages_| since this is part of object construction.
//
// |base| and |size| describe the virtual range whose top-level entries are
// prepopulated: one next-level table is allocated for each top-level index
// that the range touches. Returns the status from Init() if that fails.
zx_status_t InitShared(void* ctx, vaddr_t base, size_t size,
page_alloc_fn_t test_paf = nullptr) TA_NO_THREAD_SAFETY_ANALYSIS {
zx_status_t status = Init(ctx, test_paf);
if (status != ZX_OK) {
return status;
role_ = PageTableRole::kShared;
PageTableLevel top = static_cast<T*>(this)->top_level();
// [start, end) is the half-open range of top-level indices to populate.
const uint start = vaddr_to_index(top, base);
uint end = vaddr_to_index(top, base + size - 1);
// Check the end if it fills out the table entry.
if (page_aligned(top, base + size)) {
end += 1;
IntermediatePtFlags flags = static_cast<T*>(this)->intermediate_flags();
for (uint i = start; i < end; i++) {
pt_entry_t* pdp = AllocatePageTable();
if (pdp == nullptr) {
// Account for the newly allocated table and install it as a present,
// non-terminal entry.
pages_ += 1;
virt_[i] = X86_VIRT_TO_PHYS(pdp) | flags | X86_MMU_PG_P;
return ZX_OK;
// Initialize a page table, assign the given context, and set it up as a unified page table with
// entries from the given page tables.
// The shared and restricted page tables must satisfy the following requirements:
// 1) The shared page table must have the shared role (IsShared() is true).
// 2) The restricted page table must have the restricted role (IsRestricted() is true).
// 3) Both the shared and restricted page tables must have been initialized prior to this call.
//
// The locks are taken one at a time in the order restricted, shared, then
// this (unified) page table. Returns the status from Init() if that fails.
zx_status_t InitUnified(void* ctx, X86PageTableImpl<T>* shared, vaddr_t shared_base,
size_t shared_size, X86PageTableImpl<T>* restricted,
vaddr_t restricted_base, size_t restricted_size,
page_alloc_fn_t test_paf = nullptr) {
// Validate that the shared and restricted page tables do not overlap and do not share a PML4
// entry.
PageTableLevel top = static_cast<T*>(this)->top_level();
// Half-open range of top-level indices covered by the restricted region.
const uint restricted_start = vaddr_to_index(top, restricted_base);
uint restricted_end = vaddr_to_index(top, restricted_base + restricted_size - 1);
if (page_aligned(top, restricted_base + restricted_size)) {
restricted_end += 1;
// Half-open range of top-level indices covered by the shared region.
const uint shared_start = vaddr_to_index(top, shared_base);
uint shared_end = vaddr_to_index(top, shared_base + shared_size - 1);
if (page_aligned(top, shared_base + shared_size)) {
shared_end += 1;
// The restricted region must lie entirely below the shared region.
DEBUG_ASSERT(restricted_end <= shared_start);
zx_status_t status = Init(ctx, test_paf);
if (status != ZX_OK) {
return status;
role_ = PageTableRole::kUnified;
// Validate the restricted page table and set its metadata.
Guard<Mutex> a{AssertOrderedLock, &restricted->lock_, restricted->LockOrder()};
// A restricted page table may be linked to at most one unified page table.
DEBUG_ASSERT(restricted->referenced_pt_ == nullptr);
// Assert that there are no entries in the restricted page table.
for (uint i = restricted_start; i < restricted_end; i++) {
restricted->referenced_pt_ = this;
// Copy all mappings from the shared page table and set its metadata.
Guard<Mutex> a{AssertOrderedLock, &shared->lock_, shared->LockOrder()};
DEBUG_ASSERT(shared->referenced_pt_ == nullptr);
// Set up the PML4 so we capture any mappings created prior to creation of this unified page
// table.
pt_entry_t curr_entry = 0;
for (uint i = shared_start; i < shared_end; i++) {
curr_entry = shared->virt_[i];
// Only present top-level entries of the shared table are copied.
if (IS_PAGE_PRESENT(curr_entry)) {
virt_[i] = curr_entry;
// Update this page table's bookkeeping.
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
// On a unified page table, |referenced_pt_| holds the restricted page table.
referenced_pt_ = restricted;
shared_pt_ = shared;
return ZX_OK;
// Calls DestroyUnified if this is a unified page table and DestroyIndividual if it is not.
// |base| and |size| are forwarded only to DestroyIndividual(); DestroyUnified()
// takes no arguments.
void Destroy(vaddr_t base, size_t size) {
if (IsUnified()) {
return DestroyUnified();
return DestroyIndividual(base, size);
// Utility for managing consistency of the page tables from a cache and TLB
// point-of-view. It ensures that memory is not freed while a TLB entry may
// refer to it, and that changes to the page tables have appropriate visibility
// to the hardware interpreting them. Finish MUST be called on this
// class, even if the page table change failed.
// The aspace lock *must* be held over the full operation of the ConsistencyManager, from
// queue_free to Flush. The lock must be held continuously, due to the strategy employed here of
// only invalidating actual vaddrs with changing entries, and not all vaddrs an operation applies
// to. Otherwise the following scenario is possible:
// 1. Thread 1 performs an Unmap and removes PTE entries, but drops the lock prior to
// invalidation.
// 2. Thread 2 performs an Unmap, no PTE entries are removed, no invalidations occur
// 3. Thread 2 now believes the resources (pages) for the region are no longer accessible, and
// returns them to the pmm.
// 4. Thread 3 attempts to access this region and is now able to read/write to returned pages as
// invalidations have not occurred.
// This scenario is possible as the mappings here are not the source of truth of resource
// management, but a cache of information from other parts of the system. If thread 2 wanted to
// guarantee that the pages were free it could issue its own TLB invalidations for the vaddr
// range, even though it found no entries. However this is not the strategy employed here at the
// moment.
class ConsistencyManager {
// Binds the manager to |pt|. The cache line flusher is enabled only when the
// derived page table type reports that it needs cache flushes.
explicit ConsistencyManager(X86PageTableImpl<T>* pt)
: pt_(pt), clf_(static_cast<T*>(pt)->needs_cache_flushes()) {}
~ConsistencyManager() {
// Finish() sets |pt_| to nullptr, so this asserts that Finish() was called.
DEBUG_ASSERT(pt_ == nullptr);
// We free the paging structures here rather than in Finish(), so that
// invoking pmm_free() can be deferred until after we've left the page
// table lock.
vm_page_t* p;
list_for_every_entry (&to_free_, p, vm_page_t, queue_node) {
DEBUG_ASSERT(p->state() == vm_page_state::MMU);
if (!list_is_empty(&to_free_)) {
// Queues |page|, which must be in the MMU state, to be released when this
// object is destroyed.
void queue_free(vm_page_t* page) {
DEBUG_ASSERT(page->state() == vm_page_state::MMU);
list_add_tail(&to_free_, &page->queue_node);
DEBUG_ASSERT(pt_->pages_ > 0);
CacheLineFlusher* cache_line_flusher() { return &clf_; }
PendingTlbInvalidation* pending_tlb() { return &tlb_; }
// This function must be called while holding pt_->lock_.
void Finish() {
if (static_cast<T*>(pt_)->needs_cache_flushes()) {
// If the hardware needs cache flushes for the tables to be visible,
// make sure we serialize the flushes before issuing the TLB
// invalidations.
if (pt_->IsRestricted() && pt_->referenced_pt_ != nullptr) {
// TODO: This TLB invalidation could be wrapped into the
// preceding one so long as we built the target mask correctly.
Guard<Mutex> a{AssertOrderedLock, &pt_->referenced_pt_->lock_,
// Clear out the pending TLB invalidations.
// Dropping the page table pointer marks this manager as finished.
pt_ = nullptr;
// Forces the pending invalidation to be a full shootdown.
void SetFullShootdown() { tlb_.full_shootdown = true; }
// Page table being modified; nullptr once Finish() has run.
X86PageTableImpl<T>* pt_;
// Cache line to flush prior to TLB invalidations
CacheLineFlusher clf_;
// TLB invalidations that need to occur
PendingTlbInvalidation tlb_;
// vm_page_t's to release to the PMM after the TLB invalidation occurs
list_node to_free_ = LIST_INITIAL_VALUE(to_free_);
// Given a page table entry, return a pointer to the next page table one level down.
// Returns nullptr when the entry is not present or is a large page, since in
// either case there is no lower-level table to descend into. Otherwise the
// frame bits of the entry are converted to a virtual address.
static inline volatile pt_entry_t* get_next_table_from_entry(pt_entry_t entry) {
if (!IS_PAGE_PRESENT(entry) || IS_LARGE_PAGE(entry))
return nullptr;
return reinterpret_cast<volatile pt_entry_t*>(X86_PHYS_TO_VIRT(entry & X86_PG_FRAME));
// Return the page size for this level, i.e. the number of bytes of virtual
// address space spanned by one entry at |level| (1 << the level's shift).
// Panics for a value that is not one of the four known levels.
static size_t page_size(PageTableLevel level) {
switch (level) {
case PageTableLevel::PT_L:
return 1ULL << PT_SHIFT;
case PageTableLevel::PD_L:
return 1ULL << PD_SHIFT;
case PageTableLevel::PDP_L:
return 1ULL << PDP_SHIFT;
case PageTableLevel::PML4_L:
return 1ULL << PML4_SHIFT;
panic("page_size: invalid level\n");
// Whether an address is aligned to the page size of this level.
// The mask test relies on page_size(level) being a power of two, which holds
// because page_size() returns 1 shifted left by the level's shift.
static bool page_aligned(PageTableLevel level, vaddr_t vaddr) {
return (vaddr & (page_size(level) - 1)) == 0;
// Extract the index needed for finding |vaddr| for the given level, i.e. the
// slot within a table at |level| that |vaddr| selects.
// Panics for a value that is not one of the four known levels.
static uint vaddr_to_index(PageTableLevel level, vaddr_t vaddr) {
switch (level) {
case PageTableLevel::PML4_L:
return VADDR_TO_PML4_INDEX(vaddr);
case PageTableLevel::PDP_L:
return VADDR_TO_PDP_INDEX(vaddr);
case PageTableLevel::PD_L:
return VADDR_TO_PD_INDEX(vaddr);
case PageTableLevel::PT_L:
return VADDR_TO_PT_INDEX(vaddr);
panic("vaddr_to_index: invalid level\n");
// Convert a PTE to a physical address by masking it with the frame mask that
// matches the page size of the level the terminal entry lives at: the huge
// page mask at PDP_L, the large page mask at PD_L and the regular page mask
// at PT_L.
// NOTE(review): presumably the panic is reached only for a level with no case
// here (e.g. PML4_L); the case terminators and default label are not visible
// in this excerpt - confirm.
static paddr_t paddr_from_pte(PageTableLevel level, pt_entry_t pte) {
paddr_t pa;
switch (level) {
case PageTableLevel::PDP_L:
pa = (pte & X86_HUGE_PAGE_FRAME);
case PageTableLevel::PD_L:
pa = (pte & X86_LARGE_PAGE_FRAME);
case PageTableLevel::PT_L:
pa = (pte & X86_PG_FRAME);
panic("paddr_from_pte at unhandled level %d\n", static_cast<int>(level));
return pa;
// Returns the level directly below |level|. PT_L is the lowest level and has
// no level below it, so passing it is a caller bug caught by the debug assert.
static PageTableLevel lower_level(PageTableLevel level) {
DEBUG_ASSERT(level != PageTableLevel::PT_L);
// Level values are contiguous, so one level down is the numeric value minus 1.
return static_cast<PageTableLevel>(static_cast<int>(level) - 1);
// Returns true iff none of the NO_OF_PT_ENTRIES entries of |page_table| is
// present. Stops at the first present entry it finds.
static bool page_table_is_clear(const volatile pt_entry_t* page_table) {
uint lower_idx;
for (lower_idx = 0; lower_idx < NO_OF_PT_ENTRIES; ++lower_idx) {
if (IS_PAGE_PRESENT(page_table[lower_idx])) {
return false;
return true;
* @brief Creates mappings for the range specified by cursor
* `level` must be top_level() when invoked from external code.
* @param table The current paging structure's virtual address.
* @param mmu_flags MMU flags describing attributes of the mapping.
* @param level Page table level which the current `table` is located at.
* @param existing_action Action to take if a mapping is already present.
* @param cursor A cursor describing the range of address space to act on.
* @param cm Object to manage consistency of page table entries and cache+TLB.
* @return ZX_OK if successful
* @return ZX_ERR_ALREADY_EXISTS if the range overlaps an existing mapping and
* `existing_action` is set to `Error`
* @return ZX_ERR_NO_MEMORY if intermediate page tables could not be allocated
// Recursive worker that creates mappings for the range described by |cursor|,
// starting in |table| at |level| and descending one level per recursive call.
// PT_L is handled by AddMappingL0(). Requires lock_ to be held.
zx_status_t AddMapping(volatile pt_entry_t* table, uint mmu_flags, PageTableLevel level,
ExistingEntryAction existing_action, MappingCursor& cursor,
ConsistencyManager* cm) TA_REQ(lock_) {
// Remembered so the rollback below knows where this call started mapping.
const vaddr_t start_vaddr = cursor.vaddr();
// Unified page tables should never be mapping entries directly; rather, their constituent page
// tables should be mapping entries on their behalf.
zx_status_t ret = ZX_OK;
if (level == PageTableLevel::PT_L) {
return AddMappingL0(table, mmu_flags, existing_action, cursor, cm);
// Rollback: at the top level only, unmap [start_vaddr, cursor.vaddr()), i.e.
// everything this call has mapped so far.
// NOTE(review): presumably this is cancelled on the success path; the cancel
// call is not visible in this excerpt - confirm.
auto abort = fit::defer([&]() {
if (level == static_cast<T*>(this)->top_level()) {
// Build an unmap cursor. cursor.size should be how much is left to be mapped still.
MappingCursor unmap_cursor(/*vaddr=*/start_vaddr,
/*size=*/cursor.vaddr() - start_vaddr);
if (unmap_cursor.size() > 0) {
auto status = RemoveMapping(table, level, EnlargeOperation::No, unmap_cursor, cm);
// Removing the exact mappings we just added should never be able to fail.
DEBUG_ASSERT(unmap_cursor.size() == 0);
IntermediatePtFlags interm_flags = static_cast<T*>(this)->intermediate_flags();
PtFlags term_flags = static_cast<T*>(this)->terminal_flags(level, mmu_flags);
size_t ps = page_size(level);
bool level_supports_large_pages = static_cast<T*>(this)->supports_page_size(level);
uint index = vaddr_to_index(level, cursor.vaddr());
// Walk the entries of this table until the table or the cursor is exhausted.
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
// See if there's a large page in our way
if (IS_PAGE_PRESENT(pt_val) && IS_LARGE_PAGE(pt_val)) {
if (existing_action == ExistingEntryAction::Error) {
// Check if this is a candidate for a new large page
bool level_valigned = page_aligned(level, cursor.vaddr());
bool level_paligned = page_aligned(level, cursor.paddr());
// A large page is used only when the level supports it, the slot is empty,
// both addresses are aligned to this level's page size, and at least one
// full page of this size remains in the cursor's current physical run.
if (level_supports_large_pages && !IS_PAGE_PRESENT(pt_val) && level_valigned &&
level_paligned && cursor.PageRemaining() >= ps) {
UpdateEntry(cm, level, cursor.vaddr(), table + index, cursor.paddr(),
term_flags | X86_MMU_PG_PS, /*was_terminal=*/false);
} else {
// See if we need to create a new table.
if (!IS_PAGE_PRESENT(pt_val)) {
// We should never need to do this in a shared PML4.
if (level == PageTableLevel::PML4_L) {
volatile pt_entry_t* m = AllocatePageTable();
if (m == nullptr) {
// The mapping wasn't fully updated, but there is work here
// that might need to be undone.
size_t partial_update = ktl::min(ps, cursor.size());
// Cancel paddr tracking so we account for the virtual range we need to
// unmap without needing to increment in page appropriate amounts.
// A restricted page table that is linked to a unified page table mirrors its
// new top-level entry into that table, under the other table's lock.
if (level == PageTableLevel::PML4_L && IsRestricted() && referenced_pt_ != nullptr) {
Guard<Mutex> a{AssertOrderedLock, &referenced_pt_->lock_, referenced_pt_->LockOrder()};
pt_entry_t* referenced_entry = (pt_entry_t*)referenced_pt_->virt() + index;
DEBUG_ASSERT(check_equal_ignore_flags(*referenced_entry, *e));
ConsistencyManager cm_referenced(referenced_pt_);
referenced_pt_->UpdateEntry(&cm_referenced, level, cursor.vaddr(), referenced_entry,
X86_VIRT_TO_PHYS(m), interm_flags,
UpdateEntry(cm, level, cursor.vaddr(), e, X86_VIRT_TO_PHYS(m), interm_flags,
// Re-read the entry so the recursion below descends into the new table.
pt_val = *e;
ret = AddMapping(get_next_table_from_entry(pt_val), mmu_flags, lower_level(level),
existing_action, cursor, cm);
if (ret != ZX_OK) {
return ret;
return ZX_OK;
// Base case of AddMapping for smallest page size.
// Fills PT_L entries of |table| from |cursor| until the table or the cursor is
// exhausted. Entries that are already present are handled according to
// |existing_action|. Requires lock_ to be held.
zx_status_t AddMappingL0(volatile pt_entry_t* table, uint mmu_flags,
ExistingEntryAction existing_action, MappingCursor& cursor,
ConsistencyManager* cm) TA_REQ(lock_) {
const PtFlags term_flags =
static_cast<T*>(this)->terminal_flags(PageTableLevel::PT_L, mmu_flags);
uint index = vaddr_to_index(PageTableLevel::PT_L, cursor.vaddr());
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* existing_entry = table + index;
if (IS_PAGE_PRESENT(*existing_entry)) {
if (existing_action == ExistingEntryAction::Upgrade) {
const paddr_t existing_paddr = (*existing_entry) & X86_PG_FRAME;
const bool remapping_same_address = existing_paddr == cursor.paddr();
const bool mmu_flags_ro =
// If the physical page we are trying to map is already present, and
// we would be marking it read only, then don't.
// Either:
// 1. it is already read-only - we can skip the work.
// 2. it is already writable - we shouldn't downgrade permissions.
if (!remapping_same_address || !mmu_flags_ro) {
UpdateEntry(cm, PageTableLevel::PT_L, cursor.vaddr(), existing_entry, cursor.paddr(),
term_flags, /*was_terminal=*/false);
} else if (existing_action == ExistingEntryAction::Error) {
} else {
// The slot was empty: install the new terminal entry.
UpdateEntry(cm, PageTableLevel::PT_L, cursor.vaddr(), existing_entry, cursor.paddr(),
term_flags, /*was_terminal=*/false);
return ZX_OK;
* @brief Unmaps the range specified by cursor.
* Level must be top_level() when invoked. The caller must, even on
* failure, free all pages in the |to_free| list and adjust the |pages_| count.
* @param table The top-level paging structure's virtual address.
* @param cursor A cursor describing the range of address space to
* unmap within table. It is advanced as work is completed, so on return it
* describes how much work was not completed.
* @return true if the caller (i.e. the next level up page table) might need to
* free this page table.
// Recursive worker that unmaps the range described by |cursor|, starting in
// |table| at |level| and descending one level per recursive call. PT_L is
// handled by RemoveMappingL0(). Requires lock_ to be held.
// |enlarge| decides what happens when a large page that is only partially
// covered by the range cannot be split: with Yes the whole large page is
// unmapped, otherwise the split error is returned.
zx::result<bool> RemoveMapping(volatile pt_entry_t* table, PageTableLevel level,
EnlargeOperation enlarge, MappingCursor& cursor,
ConsistencyManager* cm) TA_REQ(lock_) {
// Unified page tables should never be unmapping entries directly; rather, their constituent
// page tables should be unmapping entries on their behalf.
if (level == PageTableLevel::PT_L) {
return zx::ok(RemoveMappingL0(table, cursor, cm));
bool unmapped = false;
// Track if there are any entries at all. This is necessary to properly rollback if an
// attempt to map a page fails to allocate a page table, as that case can result in an
// empty non-last-level page table.
bool any_pages = false;
size_t ps = page_size(level);
uint index = vaddr_to_index(level, cursor.vaddr());
// Walk the entries of this table until the table or the cursor is exhausted.
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
// If the page isn't even mapped, just skip it
if (!IS_PAGE_PRESENT(pt_val)) {
any_pages = true;
if (IS_LARGE_PAGE(pt_val)) {
bool vaddr_level_aligned = page_aligned(level, cursor.vaddr());
// If the request covers the entire large page, just unmap it
if (vaddr_level_aligned && cursor.size() >= ps) {
UnmapEntry(cm, level, cursor.vaddr(), e, /*was_terminal=*/true);
unmapped = true;
// Otherwise, we need to split it
// Round down to the start of the large page that contains the cursor.
vaddr_t page_vaddr = cursor.vaddr() & ~(ps - 1);
zx_status_t status = SplitLargePage(level, page_vaddr, e, cm);
if (status != ZX_OK) {
// If split fails, just unmap the whole thing, and let a
// subsequent page fault clean it up.
if (enlarge == EnlargeOperation::Yes) {
UnmapEntry(cm, level, cursor.vaddr(), e, /*was_terminal=*/true);
unmapped = true;
} else {
return zx::error(status);
// Re-read the entry, which a successful split has changed.
pt_val = *e;
volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
// Remember where we are unmapping from in case we need to do a second pass to remove a PT.
const vaddr_t unmap_vaddr = cursor.vaddr();
auto status = RemoveMapping(next_table, lower_level(level), enlarge, cursor, cm);
if (status.is_error()) {
return status;
// How far the recursive call advanced the cursor.
const size_t unmap_size = cursor.vaddr() - unmap_vaddr;
bool lower_unmapped = status.value();
// If we were requesting to unmap everything in the lower page table,
// we know we can unmap the lower level page table. Otherwise, if
// we unmapped anything in the lower level, check to see if that
// level is now empty.
bool unmap_page_table = page_aligned(level, unmap_vaddr) && unmap_size >= ps;
// If the top level page is shared, we cannot unmap it here as other page tables may be
// referencing its entries.
if (IsShared() && level == PageTableLevel::PML4_L) {
unmap_page_table = false;
} else if (!unmap_page_table && lower_unmapped) {
unmap_page_table = page_table_is_clear(next_table);
if (unmap_page_table) {
paddr_t ptable_phys = X86_VIRT_TO_PHYS(next_table);
vm_page_t* page = paddr_to_vm_page(ptable_phys);
// A restricted page table that is linked to a unified page table removes the
// mirrored top-level entry from that table as well, under the other table's lock.
if (level == PageTableLevel::PML4_L && IsRestricted() && referenced_pt_ != nullptr) {
Guard<Mutex> a{AssertOrderedLock, &referenced_pt_->lock_, referenced_pt_->LockOrder()};
pt_entry_t* referenced_entry = (pt_entry_t*)referenced_pt_->virt() + index;
DEBUG_ASSERT(check_equal_ignore_flags(*referenced_entry, *e));
ConsistencyManager cm_referenced(referenced_pt_);
referenced_pt_->UnmapEntry(&cm_referenced, level, unmap_vaddr, referenced_entry, false);
UnmapEntry(cm, level, unmap_vaddr, e, /*was_terminal=*/false);
DEBUG_ASSERT_MSG(page->state() == vm_page_state::MMU,
"page %p state %u, paddr %#" PRIxPTR "\n", page,
static_cast<uint32_t>(page->state()), X86_VIRT_TO_PHYS(next_table));
unmapped = true;
// Unless the cursor is exhausted, it must have stopped on a boundary of this
// level's page size.
DEBUG_ASSERT(cursor.size() == 0 || page_aligned(level, cursor.vaddr()));
// Report "might need freeing" if anything was unmapped here, or if no present
// entry was seen at all in the walked range.
return zx::ok(unmapped || !any_pages);
// Base case of RemoveMapping for smallest page size.
// Walks the PT_L entries of |table| from the index of cursor.vaddr() until the table ends or
// cursor.size() reaches zero, calling UnmapEntry on each present entry.
// Returns true if at least one entry was unmapped.
bool RemoveMappingL0(volatile pt_entry_t* table, MappingCursor& cursor, ConsistencyManager* cm)
TA_REQ(lock_) {
bool unmapped = false;
uint index = vaddr_to_index(PageTableLevel::PT_L, cursor.vaddr());
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
// Only present entries need to be cleared; PT_L entries are always terminal.
if (IS_PAGE_PRESENT(*e)) {
UnmapEntry(cm, PageTableLevel::PT_L, cursor.vaddr(), e, /*was_terminal=*/true);
unmapped = true;
return unmapped;
* @brief Changes the permissions/caching of the range specified by start_cursor
* Level must be top_level() when invoked. The caller must, even on
* failure, free all pages in the |to_free| list and adjust the |pages_| count.
* @param table The top-level paging structure's virtual address.
* @param cursor A cursor describing the range of address space to
* act on within table. It is taken by reference and passed on to the
* recursive calls.
zx_status_t UpdateMapping(volatile pt_entry_t* table, uint mmu_flags, PageTableLevel level,
MappingCursor& cursor, ConsistencyManager* cm) TA_REQ(lock_) {
// Recursive worker: PT_L tables are delegated to UpdateMappingL0. At higher levels a large page
// that is fully covered by the request is rewritten in place with the new terminal flags, a
// partially covered large page is split first, and non-terminal entries are followed into the
// next table. Any non-ZX_OK status from SplitLargePage or the recursive call is returned as-is.
if (level == PageTableLevel::PT_L) {
return UpdateMappingL0(table, mmu_flags, cursor, cm);
zx_status_t ret = ZX_OK;
// Hardware flags for terminal entries at this level, derived from |mmu_flags| by the subclass.
PtFlags term_flags = static_cast<T*>(this)->terminal_flags(level, mmu_flags);
size_t ps = page_size(level);
uint index = vaddr_to_index(level, cursor.vaddr());
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
// Skip unmapped pages (we may encounter these due to demand paging)
if (!IS_PAGE_PRESENT(pt_val)) {
if (IS_LARGE_PAGE(pt_val)) {
bool vaddr_level_aligned = page_aligned(level, cursor.vaddr());
// If the request covers the entire large page, just change the
// permissions
if (vaddr_level_aligned && cursor.size() >= ps) {
// Keep the same physical address and re-apply the page-size bit so the entry stays large.
UpdateEntry(cm, level, cursor.vaddr(), e, paddr_from_pte(level, pt_val),
term_flags | X86_MMU_PG_PS, /*was_terminal=*/true);
// Otherwise, we need to split it
// Round cursor.vaddr() down to the start of the large page that contains it.
vaddr_t page_vaddr = cursor.vaddr() & ~(ps - 1);
ret = SplitLargePage(level, page_vaddr, e, cm);
if (ret != ZX_OK) {
return ret;
// SplitLargePage rewrites *e on success, so reload the entry before following it.
pt_val = *e;
volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
ret = UpdateMapping(next_table, mmu_flags, lower_level(level), cursor, cm);
if (ret != ZX_OK) {
return ret;
// Either the whole request was consumed, or the cursor stopped on a boundary of this level.
DEBUG_ASSERT(cursor.size() == 0 || page_aligned(level, cursor.vaddr()));
return ZX_OK;
// Base case of UpdateMapping for smallest page size.
// Walks the PT_L entries of |table| from the index of cursor.vaddr() until the table ends or
// cursor.size() reaches zero. Each present entry is passed to UpdateEntry with its existing
// physical address and the terminal flags derived from |mmu_flags|.
zx_status_t UpdateMappingL0(volatile pt_entry_t* table, uint mmu_flags, MappingCursor& cursor,
ConsistencyManager* cm) TA_REQ(lock_) {
PtFlags term_flags = static_cast<T*>(this)->terminal_flags(PageTableLevel::PT_L, mmu_flags);
uint index = vaddr_to_index(PageTableLevel::PT_L, cursor.vaddr());
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
// Skip unmapped pages (we may encounter these due to demand paging)
if (IS_PAGE_PRESENT(pt_val)) {
UpdateEntry(cm, PageTableLevel::PT_L, cursor.vaddr(), e,
paddr_from_pte(PageTableLevel::PT_L, pt_val), term_flags,
// Either the whole request was consumed, or the cursor stopped on a PT_L page boundary.
DEBUG_ASSERT(cursor.size() == 0 || page_aligned(PageTableLevel::PT_L, cursor.vaddr()));
return ZX_OK;
* @brief Removes the accessed flag on any terminal entries and calls
* pmm_page_queues()->MarkAccessed on them. For non-terminal entries any accessed bits are
* harvested, and unaccessed non-terminal entries are unmapped or retained based on the passed in
* action.
* Level must be top_level() when invoked. The caller must, even on
* failure, free all pages in the |to_free| list and adjust the |pages_| count.
* @param table The top-level paging structure's virtual address.
* @param cursor A cursor describing the range of address space to
* act on within table. It is taken by reference and passed on to the
* recursive calls.
* @return true if the caller (i.e. the next level up page table) might need to
* free this page table.
bool HarvestMapping(volatile pt_entry_t* table, NonTerminalAction non_terminal_action,
TerminalAction terminal_action, PageTableLevel level, MappingCursor& cursor,
ConsistencyManager* cm) TA_REQ(lock_) {
// Recursive worker: PT_L tables are delegated to HarvestMappingL0. At higher levels, fully
// covered large pages are rewritten with exact flags, and non-terminal entries are either
// recursed into, removed via RemoveMapping, or left alone, depending on their accessed bit and
// |non_terminal_action|. A lower-level table that ends up clear may then be unmapped here.
if (level == PageTableLevel::PT_L) {
HarvestMappingL0(table, terminal_action, cursor, cm);
// HarvestMappingL0 never actually unmaps any entries, so this is always false.
return false;
// Track if we perform any unmappings. We propagate this up to our caller, since if we performed
// any unmappings then we could now be empty, and if so our caller needs to free us.
bool unmapped = false;
size_t ps = page_size(level);
uint index = vaddr_to_index(level, cursor.vaddr());
// Computed once per call; see the numbered explanation at |should_recurse| below.
bool always_recurse = level == PageTableLevel::PML4_L && (IsShared() || IsRestricted());
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
// If the page isn't even mapped, just skip it
if (!IS_PAGE_PRESENT(pt_val)) {
if (IS_LARGE_PAGE(pt_val)) {
bool vaddr_level_aligned = page_aligned(level, cursor.vaddr());
// If the request covers the entire large page then harvest the accessed bit, otherwise we
// just skip it.
if (vaddr_level_aligned && cursor.size() >= ps) {
// Rebuild the terminal flags from the entry's current flags and write them back with
// exact_flags=true, so UpdateEntry does not ignore the accessed/dirty bits when deciding
// whether the entry changed.
const uint mmu_flags = static_cast<T*>(this)->pt_flags_to_mmu_flags(pt_val, level);
const PtFlags term_flags = static_cast<T*>(this)->terminal_flags(level, mmu_flags);
UpdateEntry(cm, level, cursor.vaddr(), e, paddr_from_pte(level, pt_val),
term_flags | X86_MMU_PG_PS, /*was_terminal=*/true, /*exact_flags=*/true);
volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
paddr_t ptable_phys = X86_VIRT_TO_PHYS(next_table);
bool lower_unmapped;
bool unmap_page_table = false;
// Remember where we are unmapping from in case we need to do a second pass to remove a PT.
const vaddr_t unmap_vaddr = cursor.vaddr();
// We should recurse and HarvestMappings at the next level if:
// 1. This page table entry is in the PML4 of a shared or restricted page table. We must
// always recurse in this case because entries in these page tables may have been accessed
// via an associated unified page table, which in turn would not set the accessed bits on
// the corresponding PML4 entries in this table.
// 2. The page table entry has been accessed. We unset the AF later should we end up not
// unmapping the page table.
bool should_recurse = always_recurse || (pt_val & X86_MMU_PG_A);
if (should_recurse) {
lower_unmapped = HarvestMapping(next_table, non_terminal_action, terminal_action,
lower_level(level), cursor, cm);
} else if (non_terminal_action == NonTerminalAction::FreeUnaccessed) {
auto status =
RemoveMapping(next_table, lower_level(level), EnlargeOperation::No, cursor, cm);
// Although we pass in EnlargeOperation::No, the unmap should never fail since we are
// unmapping an entire block and never a sub part of a page.
lower_unmapped = status.value();
// Distance the cursor moved while the lower level was being removed.
const vaddr_t unmap_size = cursor.vaddr() - unmap_vaddr;
// If we processed the entire next level then we can ignore lower_unmapped and just directly
// assume that the whole page table is empty/unaccessed and that we can unmap it.
unmap_page_table = page_aligned(level, unmap_vaddr) && unmap_size >= ps;
} else {
// No accessed flag and no request to unmap means we are done with this entry.
// If the lower page table was accessed and there is uncertainty around whether it might now
// be empty, then we have to just scan it and see.
if (!unmap_page_table && lower_unmapped) {
unmap_page_table = page_table_is_clear(next_table);
// If the top level page is shared, we cannot unmap it here as other page tables may be
// referencing its entries.
if (IsShared() && level == PageTableLevel::PML4_L) {
unmap_page_table = false;
if (unmap_page_table) {
vm_page_t* page = paddr_to_vm_page(ptable_phys);
// A restricted page table with a non-null referenced_pt_ also removes the entry at the same
// PML4 index from that referenced table, under the referenced table's lock and using a
// separate ConsistencyManager.
if (level == PageTableLevel::PML4_L && IsRestricted() && referenced_pt_ != nullptr) {
Guard<Mutex> a{AssertOrderedLock, &referenced_pt_->lock_, referenced_pt_->LockOrder()};
pt_entry_t* referenced_entry = (pt_entry_t*)referenced_pt_->virt() + index;
DEBUG_ASSERT(check_equal_ignore_flags(*referenced_entry, *e));
ConsistencyManager cm_referenced(referenced_pt_);
referenced_pt_->UnmapEntry(&cm_referenced, level, unmap_vaddr, referenced_entry, false);
UnmapEntry(cm, level, unmap_vaddr, e, /*was_terminal=*/false);
DEBUG_ASSERT_MSG(page->state() == vm_page_state::MMU,
"page %p state %u, paddr %#" PRIxPTR "\n", page,
static_cast<uint32_t>(page->state()), X86_VIRT_TO_PHYS(next_table));
unmapped = true;
} else if ((pt_val & X86_MMU_PG_A) && non_terminal_action != NonTerminalAction::Retain) {
// Since we didn't unmap, we need to unset the accessed flag.
const IntermediatePtFlags flags = static_cast<T*>(this)->intermediate_flags();
UpdateEntry(cm, level, unmap_vaddr, e, ptable_phys, flags, /*was_terminal=*/false,
// For the accessed flag to reliably reset we need to ensure that any leaf pages from here
// are not in the TLB so that a re-walk occurs. To avoid having to find every leaf page,
// which will probably exceed the consistency managers into count anyway, force trigger a
// full shootdown.
// Either the whole request was consumed, or the cursor stopped on a boundary of this level.
DEBUG_ASSERT(cursor.size() == 0 || page_aligned(level, cursor.vaddr()));
return unmapped;
// Base case of HarvestMapping for smallest page size.
// Walks the PT_L entries of |table| from the index of cursor.vaddr() until the table ends or
// cursor.size() reaches zero, and inspects every entry that is both present and has the
// accessed bit (X86_MMU_PG_A) set. Nothing is unmapped here and nothing is returned.
void HarvestMappingL0(volatile pt_entry_t* table, TerminalAction terminal_action,
MappingCursor& cursor, ConsistencyManager* cm) TA_REQ(lock_) {
uint index = vaddr_to_index(PageTableLevel::PT_L, cursor.vaddr());
for (; index != NO_OF_PT_ENTRIES && cursor.size() != 0; ++index) {
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
if (IS_PAGE_PRESENT(pt_val) && (pt_val & X86_MMU_PG_A)) {
const paddr_t paddr = paddr_from_pte(PageTableLevel::PT_L, pt_val);
// Rebuild the terminal flags from the entry's current flags so they can be written back.
const uint mmu_flags =
static_cast<T*>(this)->pt_flags_to_mmu_flags(pt_val, PageTableLevel::PT_L);
const PtFlags term_flags =
static_cast<T*>(this)->terminal_flags(PageTableLevel::PT_L, mmu_flags);
vm_page_t* page = paddr_to_vm_page(paddr);
// Mappings for physical VMOs do not have pages associated with them and so there's no state
// to update on an access. As the hardware will update any higher level accessed bits for us
// we do not even need to remove the accessed bit in that case.
if (likely(page)) {
if (terminal_action == TerminalAction::UpdateAgeAndHarvest) {
// exact_flags=true makes UpdateEntry compare the accessed/dirty bits too, so an entry that
// differs only in those bits is still rewritten.
UpdateEntry(cm, PageTableLevel::PT_L, cursor.vaddr(), e,
paddr_from_pte(PageTableLevel::PT_L, pt_val), term_flags,
/*was_terminal=*/true, /*exact_flags=*/true);
// Either the whole request was consumed, or the cursor stopped on a PT_L page boundary.
DEBUG_ASSERT(cursor.size() == 0 || page_aligned(PageTableLevel::PT_L, cursor.vaddr()));
* @brief Walk the page table structures returning the entry and level that maps the address.
* @param table The top-level paging structure's virtual address
* @param vaddr The virtual address to retrieve the mapping for
* @param ret_level The level of the table that defines the found mapping
* @param mapping The mapping that was found
* @return ZX_OK if mapping is found
* @return ZX_ERR_NOT_FOUND if mapping is not found
zx_status_t GetMapping(volatile pt_entry_t* table, vaddr_t vaddr, PageTableLevel level,
PageTableLevel* ret_level, volatile pt_entry_t** mapping) TA_REQ(lock_) {
// Recursive walk: PT_L is handled by GetMappingL0. At higher levels the entry for |vaddr| is
// read; a large-page entry terminates the walk at this level, otherwise the walk continues in
// the next-level table that the entry points to.
if (level == PageTableLevel::PT_L) {
return GetMappingL0(table, vaddr, ret_level, mapping);
uint index = vaddr_to_index(level, vaddr);
volatile pt_entry_t* e = table + index;
pt_entry_t pt_val = *e;
if (!IS_PAGE_PRESENT(pt_val))
/* if this is a large page, stop here */
if (IS_LARGE_PAGE(pt_val)) {
*mapping = e;
*ret_level = level;
return ZX_OK;
volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
return GetMapping(next_table, vaddr, lower_level(level), ret_level, mapping);
// Base case of GetMapping for the smallest page size. On the ZX_OK path, |*mapping| is set to
// the PT_L entry for |vaddr| within |table| and |*ret_level| is set to PageTableLevel::PT_L.
zx_status_t GetMappingL0(volatile pt_entry_t* table, vaddr_t vaddr,
enum PageTableLevel* ret_level, volatile pt_entry_t** mapping)
TA_REQ(lock_) {
/* do the final page table lookup */
uint index = vaddr_to_index(PageTableLevel::PT_L, vaddr);
volatile pt_entry_t* e = table + index;
*mapping = e;
*ret_level = PageTableLevel::PT_L;
return ZX_OK;
// Split the given large page into smaller pages
// A new page table is obtained from AllocatePageTable() and all NO_OF_PT_ENTRIES of its entries
// are filled so that, together, they cover the same virtual and physical range as the large
// page referenced by |pte|. |pte| is then rewritten to point at the new table.
// |vaddr| must be aligned to the page size of |level|, and |level| must not be PT_L.
zx_status_t SplitLargePage(PageTableLevel level, vaddr_t vaddr, volatile pt_entry_t* pte,
ConsistencyManager* cm) TA_REQ(lock_) {
DEBUG_ASSERT_MSG(level != PageTableLevel::PT_L, "tried splitting PT_L");
volatile pt_entry_t* m = AllocatePageTable();
if (m == nullptr) {
paddr_t paddr_base = paddr_from_pte(level, *pte);
// Flags for the new lower-level entries, derived by the subclass from the large page's flags.
PtFlags flags = static_cast<T*>(this)->split_flags(level, *pte & X86_LARGE_FLAGS_MASK);
DEBUG_ASSERT(page_aligned(level, vaddr));
vaddr_t new_vaddr = vaddr;
paddr_t new_paddr = paddr_base;
// Each new entry covers one page of the next lower level.
size_t ps = page_size(lower_level(level));
for (int i = 0; i < NO_OF_PT_ENTRIES; i++) {
volatile pt_entry_t* e = m + i;
// If this is a PDP_L (i.e. huge page), flags will include the
// PS bit still, so the new PD entries will be large pages.
UpdateEntry(cm, lower_level(level), new_vaddr, e, new_paddr, flags, /*was_terminal=*/false);
new_vaddr += ps;
new_paddr += ps;
// The new entries must span exactly one page of the original level.
DEBUG_ASSERT(new_vaddr == vaddr + page_size(level));
// Replace the large-page entry with a non-terminal entry pointing at the new table.
flags = static_cast<T*>(this)->intermediate_flags();
UpdateEntry(cm, level, vaddr, pte, X86_VIRT_TO_PHYS(m), flags, /*was_terminal=*/true);
return ZX_OK;
// Writes a new present entry (|paddr| | |flags| | X86_MMU_PG_P) to |pte| and, if the previous
// entry was present, queues a TLB invalidation for |vaddr| on |cm|.
// With the default exact_flags=false, the accessed and dirty bits of the old entry are ignored
// when checking whether the entry would actually change; exact_flags=true includes them in the
// comparison.
void UpdateEntry(ConsistencyManager* cm, PageTableLevel level, vaddr_t vaddr,
volatile pt_entry_t* pte, paddr_t paddr, PtFlags flags, bool was_terminal,
bool exact_flags = false) TA_REQ(lock_) {
pt_entry_t olde = *pte;
pt_entry_t newe = paddr | flags | X86_MMU_PG_P;
// Check if we are actually changing anything, ignoring the accessed and dirty bits unless
// exact_flags has been requested to allow for those bits to be explicitly unset.
if ((olde & ~(exact_flags ? 0 : (X86_MMU_PG_A | X86_MMU_PG_D))) == newe) {
if (level == PageTableLevel::PML4_L && IsShared()) {
// If this is a shared page table, the only possible modification should be removal of
// the accessed flag.
DEBUG_ASSERT(olde == (newe | X86_MMU_PG_A));
/* set the new entry */
*pte = newe;
/* attempt to invalidate the page */
if (IS_PAGE_PRESENT(olde)) {
cm->pending_tlb()->enqueue(vaddr, level, /*is_global_page=*/olde & X86_MMU_PG_G,
// Clears |pte| to zero and, if the previous entry was present, queues a TLB invalidation for
// |vaddr| on |cm|. The old entry's global bit (X86_MMU_PG_G) is forwarded to the invalidation
// request.
void UnmapEntry(ConsistencyManager* cm, PageTableLevel level, vaddr_t vaddr,
volatile pt_entry_t* pte, bool was_terminal) TA_REQ(lock_) {
if (level == PageTableLevel::PML4_L) {
// Capture the old value before clearing so the invalidation below can inspect its bits.
pt_entry_t olde = *pte;
*pte = 0;
/* attempt to invalidate the page */
if (IS_PAGE_PRESENT(olde)) {
cm->pending_tlb()->enqueue(vaddr, level, /*is_global_page=*/olde & X86_MMU_PG_G,
// Allocating a new page table
// Release the resources associated with this page table. |base| and |size|
// are only used for debug checks that the page tables have no more mappings.
// Acquires lock_ itself, so the caller must not already hold it. Must only be called once
// num_references_ has dropped to zero.
void DestroyIndividual(vaddr_t base, size_t size) {
// This lock should be uncontended since Destroy is not supposed to be called in parallel with
// any other operation, but hold it anyway so we can clear virt_ and attempt to surface any
// bugs.
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
DEBUG_ASSERT(num_references_ == 0);
// If this page table has a shared top level page, we need to manually clean up the entries we
// created in InitShared. We know for sure that these entries are no longer referenced by
// other page tables because we expect those page tables to have been destroyed before this one.
if (IsShared()) {
DEBUG_ASSERT(virt_ != nullptr);
PageTableLevel top = static_cast<T*>(this)->top_level();
pt_entry_t* table = static_cast<pt_entry_t*>(virt_);
// Top-level index range covering [base, base + size); |end| is exclusive after the adjustment
// below.
const uint start = vaddr_to_index(top, base);
uint end = vaddr_to_index(top, base + size - 1);
// Check the end if it fills out the table entry.
if (page_aligned(top, base + size)) {
end += 1;
for (uint i = start; i < end; i++) {
if (IS_PAGE_PRESENT(table[i])) {
// Locate the page backing the next-level table, then clear the top-level entry.
volatile pt_entry_t* next_table = get_next_table_from_entry(table[i]);
paddr_t ptable_phys = X86_VIRT_TO_PHYS(next_table);
vm_page_t* page = paddr_to_vm_page(ptable_phys);
table[i] = 0;
PageTableLevel top = static_cast<T*>(this)->top_level();
// The remaining check only applies while the top-level table is still available (virt_ set).
if (virt_) {
pt_entry_t* table = static_cast<pt_entry_t*>(virt_);
const uint start = vaddr_to_index(top, base);
uint end = vaddr_to_index(top, base + size - 1);
// Check the end if it fills out the table entry.
if (page_aligned(top, base + size)) {
end += 1;
// Scan the top-level entries covering the range; the message below reports an entry that is
// still present.
for (uint i = start; i < end; ++i) {
"Destroy() called on page table with entry 0x%" PRIx64
" still present at index %u; aspace size: %zu, is_shared_: %d\n",
table[i], i, size, IsShared());
// Releases the resources exclusively owned by this unified page table, and update the relevant
// metadata on the associated restricted and shared page tables.
// Locks are taken one at a time: this table's lock_, then the shared table's lock, then the
// restricted table's lock, and finally this table's lock_ again.
void DestroyUnified() {
X86PageTableImpl<T>* restricted = nullptr;
X86PageTableImpl<T>* shared = nullptr;
// This lock should be uncontended since Destroy is not supposed to be called in parallel with
// any other operation, but hold it anyway so we can clear virt_ and attempt to surface any
// bugs. We limit the scope in which we hold this lock when destroying unified page tables
// because holding it prior to acquiring the shared and restricted page table locks would
// violate the lock's ordering rules. We do not destroy the unified page table here, as the
// restricted page table may still reference it.
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
// We can copy these pointers to local variables and use them outside of this critical section
// because they are notionally const for unified page tables.
restricted = referenced_pt_;
shared = shared_pt_;
// Drop this table's own references to the other two tables.
shared_pt_ = nullptr;
referenced_pt_ = nullptr;
// Shared page table bookkeeping, under the shared table's lock.
Guard<Mutex> a{AssertOrderedLock, &shared->lock_, shared->LockOrder()};
// The shared page table should be referenced by at least this page table, and could be
// referenced by many other unified page tables.
DEBUG_ASSERT(shared->num_references_ > 0);
// Restricted page table bookkeeping, under the restricted table's lock.
Guard<Mutex> a{AssertOrderedLock, &restricted->lock_, restricted->LockOrder()};
// The restricted page table can only be referenced by a singular unified page table.
DEBUG_ASSERT(restricted->num_references_ == 1);
// Clear the restricted table's back-reference to this unified table.
restricted->referenced_pt_ = nullptr;
// Take this table's own lock again for the final step.
Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()};
// Frees the top level page in this page table.
// Resets phys_ to 0 (when it was non-zero) and virt_ to nullptr. Requires lock_ to be held.
void FreeTopLevelPage() TA_REQ(lock_) {
if (phys_) {
phys_ = 0;
// Clear virt_ to indicate we are now destroyed, and prevent any misuses of the ArchVmAspace API
// from performing use-after-free on the PT.
virt_ = nullptr;
// Checks that the given page table entries are equal but ignores the accessed and dirty flags.
// Returns true when |left| and |right| match in every bit other than X86_MMU_PG_A and
// X86_MMU_PG_D.
bool check_equal_ignore_flags(pt_entry_t left, pt_entry_t right) {
// Mask with every bit set except the accessed and dirty bits.
pt_entry_t no_accessed_dirty_mask = ~X86_MMU_PG_A & ~X86_MMU_PG_D;
return (left & no_accessed_dirty_mask) == (right & no_accessed_dirty_mask);
// A reference to another page table that shares entries with this one.
// If is_restricted_ is set to true, this references the associated unified page table.
// If is_unified_ is set to true, this references the associated restricted page table.
// If neither is true, this is set to null.
// Non-owning raw pointer; accesses are annotated as guarded by lock_.
X86PageTableImpl<T>* referenced_pt_ TA_GUARDED(lock_) = nullptr;
// A reference to a shared page table whose mappings are also present in this page table. This is
// only set for unified page tables.
// Non-owning raw pointer; accesses are annotated as guarded by lock_.
X86PageTableImpl<T>* shared_pt_ TA_GUARDED(lock_) = nullptr;