// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include "arch/x86/mmu.h"
#include <align.h>
#include <assert.h>
#include <lib/arch/sysreg.h>
#include <lib/arch/x86/boot-cpuid.h>
#include <lib/arch/x86/system.h>
#include <lib/boot-options/boot-options.h>
#include <lib/counters.h>
#include <lib/zircon-internal/macros.h>
#include <string.h>
#include <trace.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <new>
#include <arch/arch_ops.h>
#include <arch/x86.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/hypervisor/vmx_state.h>
#include <arch/x86/mmu_mem_types.h>
#include <kernel/mp.h>
#include <vm/arch_vm_aspace.h>
#include <vm/physmap.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#define LOCAL_TRACE 0
// Count of the number of batches of TLB invalidations initiated on each CPU
KCOUNTER(tlb_invalidations_sent, "mmu.tlb_invalidation_batches_sent")
// Count of the number of batches of TLB invalidation requests received on each CPU
// Includes tlb_invalidations_full_global_received and tlb_invalidations_full_nonglobal_received
KCOUNTER(tlb_invalidations_received, "mmu.tlb_invalidation_batches_received")
// Count of the number of TLB invalidation requests for all entries on each CPU
KCOUNTER(tlb_invalidations_full_global_received, "mmu.tlb_invalidation_full_global_received")
// Count of the number of TLB invalidation requests for all non-global entries on each CPU
KCOUNTER(tlb_invalidations_full_nonglobal_received, "mmu.tlb_invalidation_full_nonglobal_received")
// Count of the number of times an EPT TLB invalidation was performed.
KCOUNTER(ept_tlb_invalidations, "mmu.ept_tlb_invalidations")
/* Default virtual/physical address widths.
 * The actual values are read from the CPU in x86_mmu_early_init() below. */
uint8_t g_vaddr_width = 48;
uint8_t g_paddr_width = 32;
/* 1 if page table isolation should be used, 0 if not. -1 if uninitialized. */
int g_enable_isolation = -1;
/* True if the system supports 1GB pages */
static bool supports_huge_pages = false;
/* top level kernel page tables, initialized in start.S */
volatile pt_entry_t pml4[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
volatile pt_entry_t pdp[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); /* temporary */
volatile pt_entry_t pte[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
/* top level pdp needed to map the -512GB..0 space */
volatile pt_entry_t pdp_high[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
#if __has_feature(address_sanitizer)
volatile pt_entry_t kasan_shadow_pt[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); // Leaf page tables
volatile pt_entry_t kasan_shadow_pd[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); // Page directories
// TODO(fxbug.dev/30033): Share this with the vm::zero_page
volatile uint8_t kasan_zero_page[PAGE_SIZE] __ALIGNED(PAGE_SIZE);
#endif
/* a big pile of page tables needed to map 64GB of memory into kernel space using 2MB pages */
volatile pt_entry_t linear_map_pdp[(64ULL * GB) / (2 * MB)] __ALIGNED(PAGE_SIZE);
/* which of the above variables is the top level page table */
#define KERNEL_PT pml4
// Width of the PCID identifier
#define X86_PCID_BITS (12)
// When this bit is set in the source operand of a MOV CR3, TLB entries and paging structure
// caches for the active PCID may be preserved. If the bit is clear, entries will be cleared.
// See Intel Volume 3A, 4.10.4.1
#define X86_PCID_CR3_SAVE_ENTRIES (63)
// Static relocated base to prepare for KASLR. Used at early boot and by gdb
// script to know the target relocated address.
// TODO(thgarnie): Move to a dynamically generated base address
#if DISABLE_KASLR
uint64_t kernel_relocated_base = KERNEL_BASE - KERNEL_LOAD_OFFSET;
#else
uint64_t kernel_relocated_base = 0xffffffff00000000;
#endif
/* kernel base top level page table in physical space */
static const paddr_t kernel_pt_phys =
(vaddr_t)KERNEL_PT - (vaddr_t)__code_start + KERNEL_LOAD_OFFSET;
extern bool g_has_meltdown;
paddr_t x86_kernel_cr3(void) { return kernel_pt_phys; }
/**
* @brief check if the virtual address is canonical
*/
bool x86_is_vaddr_canonical(vaddr_t vaddr) {
uint64_t max_vaddr_lohalf, min_vaddr_hihalf;
/* get max address in lower-half canonical addr space */
/* e.g. if width is 48, then 0x00007FFF_FFFFFFFF */
max_vaddr_lohalf = ((uint64_t)1ull << (g_vaddr_width - 1)) - 1;
/* get min address in higher-half canonical addr space */
/* e.g. if width is 48, then 0xFFFF8000_00000000 */
min_vaddr_hihalf = ~max_vaddr_lohalf;
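/* e.g. for a 48-bit width, the non-canonical hole is 0x00008000_00000000..0xFFFF7FFF_FFFFFFFF */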
/* Reject addresses that fall inside the non-canonical hole */
if ((vaddr > max_vaddr_lohalf) && (vaddr < min_vaddr_hihalf))
return false;
return true;
}
/**
* @brief check if the virtual address is aligned and canonical
*/
static bool x86_mmu_check_vaddr(vaddr_t vaddr) {
/* Check to see if the address is PAGE aligned */
if (!IS_ALIGNED(vaddr, PAGE_SIZE))
return false;
return x86_is_vaddr_canonical(vaddr);
}
/**
* @brief check if the physical address is valid and aligned
*/
bool x86_mmu_check_paddr(paddr_t paddr) {
uint64_t max_paddr;
/* Check to see if the address is PAGE aligned */
if (!IS_ALIGNED(paddr, PAGE_SIZE))
return false;
max_paddr = ((uint64_t)1ull << g_paddr_width) - 1;
return paddr <= max_paddr;
}
/**
* @brief invalidate all TLB entries, excluding global entries
*/
static void x86_tlb_nonglobal_invalidate() {
// Read CR3 and immediately write it back.
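// Reloading CR3 invalidates all non-global TLB entries for the current address space
// (see Intel SDM Vol. 3A, 4.10.4.1), which is exactly the flush we want here.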
arch::X86Cr3::Read().Write();
}
/**
* @brief invalidate all TLB entries, including global entries
*/
static void x86_tlb_global_invalidate() {
/* See Intel 3A section 4.10.4.1 */
auto cr4 = arch::X86Cr4::Read();
if (likely(cr4.pge())) {
// Toggling PGE off and back on flushes the entire TLB, including global entries, and leaves
// global pages enabled as they were before.
cr4.set_pge(false).Write();
cr4.set_pge(true).Write();
} else {
x86_tlb_nonglobal_invalidate();
}
}
/* Context passed to TlbInvalidatePage_task on each CPU */
struct TlbInvalidatePage_context {
ulong target_cr3;
const PendingTlbInvalidation* pending;
};
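// Runs on every targeted CPU (via mp_sync_exec) with interrupts disabled and performs the
// invalidations described in |pending|, skipping the work entirely when the request is for a
// different address space and contains no global entries.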
static void TlbInvalidatePage_task(void* raw_context) {
DEBUG_ASSERT(arch_ints_disabled());
TlbInvalidatePage_context* context = (TlbInvalidatePage_context*)raw_context;
kcounter_add(tlb_invalidations_received, 1);
if (context->target_cr3 != arch::X86Cr3::Read().base() && !context->pending->contains_global) {
/* This invalidation doesn't apply to this CPU, ignore it */
return;
}
if (context->pending->full_shootdown) {
if (context->pending->contains_global) {
kcounter_add(tlb_invalidations_full_global_received, 1);
x86_tlb_global_invalidate();
} else {
kcounter_add(tlb_invalidations_full_nonglobal_received, 1);
x86_tlb_nonglobal_invalidate();
}
return;
}
for (uint i = 0; i < context->pending->count; ++i) {
const auto& item = context->pending->item[i];
switch (static_cast<PageTableLevel>(item.page_level())) {
case PageTableLevel::PML4_L:
panic("PML4_L invld found; should not be here\n");
case PageTableLevel::PDP_L:
case PageTableLevel::PD_L:
case PageTableLevel::PT_L:
__asm__ volatile("invlpg %0" ::"m"(*(uint8_t*)item.addr()));
break;
}
}
}
/**
* @brief Execute a queued TLB invalidation
*
* @param pt The page table we're invalidating for (if nullptr, assume for current one)
* @param pending The planned invalidation
*/
static void x86_tlb_invalidate_page(const X86PageTableBase* pt, PendingTlbInvalidation* pending) {
if (pending->count == 0 && !pending->full_shootdown) {
return;
}
kcounter_add(tlb_invalidations_sent, 1);
ulong cr3 = pt ? pt->phys() : x86_get_cr3();
struct TlbInvalidatePage_context task_context = {
.target_cr3 = cr3,
.pending = pending,
};
/* Target only CPUs this aspace is active on. It may be the case that some
* other CPU will become active in it after this load, or will have left it
* just before this load. In the former case, it is becoming active after
* the write to the page table, so it will see the change. In the latter
* case, it will get a spurious request to flush. */
mp_ipi_target_t target;
cpu_mask_t target_mask = 0;
if (pending->contains_global || pt == nullptr) {
target = MP_IPI_TARGET_ALL;
} else {
target = MP_IPI_TARGET_MASK;
target_mask = static_cast<X86ArchVmAspace*>(pt->ctx())->active_cpus();
}
mp_sync_exec(target, target_mask, TlbInvalidatePage_task, &task_context);
pending->clear();
}
#if 0 // TODO(mcgrathr): remove this if it isn't going to be used
bool x86_enable_pcid() {
DEBUG_ASSERT(arch_ints_disabled());
if (!g_x86_feature_pcid_good) {
return false;
}
arch::X86Cr4::Read().set_pcide(true).Write();
return true;
}
#endif
bool X86PageTableMmu::check_paddr(paddr_t paddr) { return x86_mmu_check_paddr(paddr); }
bool X86PageTableMmu::check_vaddr(vaddr_t vaddr) { return x86_mmu_check_vaddr(vaddr); }
bool X86PageTableMmu::supports_page_size(PageTableLevel level) {
DEBUG_ASSERT(level != PageTableLevel::PT_L);
switch (level) {
case PageTableLevel::PD_L:
return true;
case PageTableLevel::PDP_L:
return supports_huge_pages;
case PageTableLevel::PML4_L:
return false;
default:
panic("Unreachable case in supports_page_size\n");
}
}
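// Intermediate (non-terminal) entries are left maximally permissive; the effective permissions
// are determined by the terminal entry.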
IntermediatePtFlags X86PageTableMmu::intermediate_flags() { return X86_MMU_PG_RW | X86_MMU_PG_U; }
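// Translate generic ARCH_MMU_FLAG_* permission and cache flags into x86 page table bits. Large
// pages (PD/PDP level) encode the PAT selection in a different bit than 4 KiB PTEs, hence the
// two cases below.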
PtFlags X86PageTableMmu::terminal_flags(PageTableLevel level, uint flags) {
PtFlags terminal_flags = 0;
if (flags & ARCH_MMU_FLAG_PERM_WRITE) {
terminal_flags |= X86_MMU_PG_RW;
}
if (flags & ARCH_MMU_FLAG_PERM_USER) {
terminal_flags |= X86_MMU_PG_U;
}
if (use_global_mappings_) {
terminal_flags |= X86_MMU_PG_G;
}
if (!(flags & ARCH_MMU_FLAG_PERM_EXECUTE)) {
terminal_flags |= X86_MMU_PG_NX;
}
if (level != PageTableLevel::PT_L) {
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
terminal_flags |= X86_MMU_LARGE_PAT_WRITEBACK;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
case ARCH_MMU_FLAG_UNCACHED:
terminal_flags |= X86_MMU_LARGE_PAT_UNCACHABLE;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
terminal_flags |= X86_MMU_LARGE_PAT_WRITE_COMBINING;
break;
default:
PANIC_UNIMPLEMENTED;
}
} else {
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
terminal_flags |= X86_MMU_PTE_PAT_WRITEBACK;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
case ARCH_MMU_FLAG_UNCACHED:
terminal_flags |= X86_MMU_PTE_PAT_UNCACHABLE;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
terminal_flags |= X86_MMU_PTE_PAT_WRITE_COMBINING;
break;
default:
PANIC_UNIMPLEMENTED;
}
}
return terminal_flags;
}
PtFlags X86PageTableMmu::split_flags(PageTableLevel level, PtFlags flags) {
DEBUG_ASSERT(level != PageTableLevel::PML4_L && level != PageTableLevel::PT_L);
DEBUG_ASSERT(flags & X86_MMU_PG_PS);
if (level == PageTableLevel::PD_L) {
// Note: Clear PS before the check below; the PAT bit for a PTE is the
// same as the PS bit for a higher-level table entry.
flags &= ~X86_MMU_PG_PS;
/* If the larger page had the PAT flag set, make sure it's
* transferred to the different index for a PTE */
if (flags & X86_MMU_PG_LARGE_PAT) {
flags &= ~X86_MMU_PG_LARGE_PAT;
flags |= X86_MMU_PG_PTE_PAT;
}
}
return flags;
}
void X86PageTableMmu::TlbInvalidate(PendingTlbInvalidation* pending) {
x86_tlb_invalidate_page(this, pending);
}
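// Inverse of terminal_flags(): decode x86 page table bits back into generic ARCH_MMU_FLAG_*
// values. Read permission is implied for any present entry.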
uint X86PageTableMmu::pt_flags_to_mmu_flags(PtFlags flags, PageTableLevel level) {
uint mmu_flags = ARCH_MMU_FLAG_PERM_READ;
if (flags & X86_MMU_PG_RW) {
mmu_flags |= ARCH_MMU_FLAG_PERM_WRITE;
}
if (flags & X86_MMU_PG_U) {
mmu_flags |= ARCH_MMU_FLAG_PERM_USER;
}
if (!(flags & X86_MMU_PG_NX)) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
if (level != PageTableLevel::PT_L) {
switch (flags & X86_MMU_LARGE_PAT_MASK) {
case X86_MMU_LARGE_PAT_WRITEBACK:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
case X86_MMU_LARGE_PAT_UNCACHABLE:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case X86_MMU_LARGE_PAT_WRITE_COMBINING:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
default:
PANIC_UNIMPLEMENTED;
}
} else {
switch (flags & X86_MMU_PTE_PAT_MASK) {
case X86_MMU_PTE_PAT_WRITEBACK:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
case X86_MMU_PTE_PAT_UNCACHABLE:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case X86_MMU_PTE_PAT_WRITE_COMBINING:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
default:
PANIC_UNIMPLEMENTED;
}
}
return mmu_flags;
}
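// EPT mappings created here must be at least readable; all other permission combinations are
// accepted.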
bool X86PageTableEpt::allowed_flags(uint flags) {
if (!(flags & ARCH_MMU_FLAG_PERM_READ)) {
return false;
}
return true;
}
bool X86PageTableEpt::check_paddr(paddr_t paddr) { return x86_mmu_check_paddr(paddr); }
bool X86PageTableEpt::check_vaddr(vaddr_t vaddr) { return x86_mmu_check_vaddr(vaddr); }
bool X86PageTableEpt::supports_page_size(PageTableLevel level) {
DEBUG_ASSERT(level != PageTableLevel::PT_L);
switch (level) {
case PageTableLevel::PD_L:
return true;
case PageTableLevel::PDP_L:
return supports_huge_pages;
case PageTableLevel::PML4_L:
return false;
default:
panic("Unreachable case in supports_page_size\n");
}
}
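// EPT non-terminal entries grant read, write, and execute; permissions are narrowed at the
// terminal level.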
PtFlags X86PageTableEpt::intermediate_flags() { return X86_EPT_R | X86_EPT_W | X86_EPT_X; }
PtFlags X86PageTableEpt::terminal_flags(PageTableLevel level, uint flags) {
PtFlags terminal_flags = 0;
if (flags & ARCH_MMU_FLAG_PERM_READ) {
terminal_flags |= X86_EPT_R;
}
if (flags & ARCH_MMU_FLAG_PERM_WRITE) {
terminal_flags |= X86_EPT_W;
}
if (flags & ARCH_MMU_FLAG_PERM_EXECUTE) {
terminal_flags |= X86_EPT_X;
}
switch (flags & ARCH_MMU_FLAG_CACHE_MASK) {
case ARCH_MMU_FLAG_CACHED:
terminal_flags |= X86_EPT_WB;
break;
case ARCH_MMU_FLAG_UNCACHED_DEVICE:
case ARCH_MMU_FLAG_UNCACHED:
terminal_flags |= X86_EPT_UC;
break;
case ARCH_MMU_FLAG_WRITE_COMBINING:
terminal_flags |= X86_EPT_WC;
break;
default:
PANIC_UNIMPLEMENTED;
}
return terminal_flags;
}
PtFlags X86PageTableEpt::split_flags(PageTableLevel level, PtFlags flags) {
DEBUG_ASSERT(level != PageTableLevel::PML4_L && level != PageTableLevel::PT_L);
// We don't need to relocate any flags on split for EPT.
return flags;
}
void X86PageTableEpt::TlbInvalidate(PendingTlbInvalidation* pending) {
if (pending->count == 0 && !pending->full_shootdown) {
return;
}
kcounter_add(ept_tlb_invalidations, 1);
// Target all CPUs with a context invalidation since we do not know which CPUs have this EPT
// active. We cannot rely on active_cpus(): it is only updated by ContextSwitch, which is not
// called for guests, and EPT mappings persist even while a guest is not presently executing. In
// general, unmap operations on EPTs should be extremely rare and not on any common path, so this
// inefficiency is not disastrous in the short term. Similarly, since this is an infrequent
// operation, we do not attempt to invalidate individual entries but just blow away the whole
// context.
// TODO: Track what CPUs the VCPUs using this EPT are migrated to and only IPI that subset.
invept_from_pml4(static_cast<X86ArchVmAspace*>(ctx())->pt_phys());
pending->clear();
}
uint X86PageTableEpt::pt_flags_to_mmu_flags(PtFlags flags, PageTableLevel level) {
uint mmu_flags = 0;
if (flags & X86_EPT_R) {
mmu_flags |= ARCH_MMU_FLAG_PERM_READ;
}
if (flags & X86_EPT_W) {
mmu_flags |= ARCH_MMU_FLAG_PERM_WRITE;
}
if (flags & X86_EPT_X) {
mmu_flags |= ARCH_MMU_FLAG_PERM_EXECUTE;
}
switch (flags & X86_EPT_MEMORY_TYPE_MASK) {
case X86_EPT_WB:
mmu_flags |= ARCH_MMU_FLAG_CACHED;
break;
case X86_EPT_UC:
mmu_flags |= ARCH_MMU_FLAG_UNCACHED;
break;
case X86_EPT_WC:
mmu_flags |= ARCH_MMU_FLAG_WRITE_COMBINING;
break;
default:
PANIC_UNIMPLEMENTED;
}
return mmu_flags;
}
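// Clearing CR4.PGE disables global pages and, as a side effect, flushes the entire TLB,
// including global entries (Intel SDM Vol. 3A, 4.10.4.1).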
static void disable_global_pages() { arch::X86Cr4::Read().set_pge(false).Write(); }
void x86_mmu_early_init() {
x86_mmu_percpu_init();
x86_mmu_mem_type_init();
// Unmap the lower identity mapping.
pml4[0] = 0;
// As we are still in early init code we cannot use the general page invalidation mechanisms,
// specifically ones that might use mp_sync_exec or kcounters, so just drop the entire TLB.
x86_tlb_global_invalidate();
/* get the address width from the CPU */
auto vaddr_width =
static_cast<uint8_t>(arch::BootCpuid<arch::CpuidAddressSizeInfo>().linear_addr_bits());
auto paddr_width =
static_cast<uint8_t>(arch::BootCpuid<arch::CpuidAddressSizeInfo>().phys_addr_bits());
supports_huge_pages = x86_feature_test(X86_FEATURE_HUGE_PAGE);
/* If we got something meaningful, override the defaults. Some CPU/emulator
 * combinations seem to return nonsense paddr widths (e.g. 1), so only widen
 * the defaults, never shrink them. */
if (paddr_width > g_paddr_width)
g_paddr_width = paddr_width;
if (vaddr_width > g_vaddr_width)
g_vaddr_width = vaddr_width;
LTRACEF("paddr_width %u vaddr_width %u\n", g_paddr_width, g_vaddr_width);
}
void x86_mmu_init(void) {
g_enable_isolation =
!gBootOptions->x86_disable_spec_mitigations &&
(gBootOptions->x86_pti_enable == 1 || (gBootOptions->x86_pti_enable == 2 && g_has_meltdown));
printf("Kernel PTI %s\n", g_enable_isolation ? "enabled" : "disabled");
// TODO(crbug.com/fuchsia/31415): Currently KPTI disables global pages entirely; we might be able
// to do better by keeping global pages for user mappings, to avoid the implicit TLB invalidations
// on user<->kernel transitions.
//
// All other CPUs will do this in x86_mmu_percpu_init
if (g_enable_isolation) {
disable_global_pages();
}
}
X86PageTableBase::X86PageTableBase() {}
X86PageTableBase::~X86PageTableBase() {
DEBUG_ASSERT_MSG(!phys_, "page table dtor called before Destroy()");
}
// We disable analysis due to the write to |pages_| tripping it up. It is safe
// to write to |pages_| since this is part of object construction.
zx_status_t X86PageTableBase::Init(void* ctx,
page_alloc_fn_t test_paf) TA_NO_THREAD_SAFETY_ANALYSIS {
test_page_alloc_func_ = test_paf;
/* allocate a top level page table for the new address space */
virt_ = AllocatePageTable();
if (!virt_) {
TRACEF("error allocating top level page directory\n");
return ZX_ERR_NO_MEMORY;
}
phys_ = physmap_to_paddr(virt_);
DEBUG_ASSERT(phys_ != 0);
ctx_ = ctx;
pages_ = 1;
return ZX_OK;
}
// We disable analysis due to the write to |pages_| tripping it up. It is safe
// to write to |pages_| since this is part of object construction.
zx_status_t X86PageTableMmu::InitKernel(void* ctx,
page_alloc_fn_t test_paf) TA_NO_THREAD_SAFETY_ANALYSIS {
test_page_alloc_func_ = test_paf;
phys_ = kernel_pt_phys;
virt_ = (pt_entry_t*)X86_PHYS_TO_VIRT(phys_);
ctx_ = ctx;
pages_ = 1;
use_global_mappings_ = true;
return ZX_OK;
}
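// Give this address space visibility into the kernel: the upper half of the PML4, which holds
// the kernel's mappings, is copied from the master kernel page table.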
zx_status_t X86PageTableMmu::AliasKernelMappings() {
// Copy the kernel portion of it from the master kernel pt.
memcpy(virt_ + NO_OF_PT_ENTRIES / 2, const_cast<pt_entry_t*>(&KERNEL_PT[NO_OF_PT_ENTRIES / 2]),
sizeof(pt_entry_t) * NO_OF_PT_ENTRIES / 2);
return ZX_OK;
}
X86ArchVmAspace::X86ArchVmAspace(vaddr_t base, size_t size, uint mmu_flags,
page_alloc_fn_t test_paf)
: test_page_alloc_func_(test_paf), flags_(mmu_flags), base_(base), size_(size) {}
/*
 * Fill in the high-level x86 arch aspace structure and allocate a top-level page table.
 */
zx_status_t X86ArchVmAspace::Init() {
static_assert(sizeof(cpu_mask_t) == sizeof(active_cpus_), "err");
canary_.Assert();
LTRACEF("aspace %p, base %#" PRIxPTR ", size 0x%zx, mmu_flags 0x%x\n", this, base_, size_,
flags_);
if (flags_ & ARCH_ASPACE_FLAG_KERNEL) {
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
zx_status_t status = mmu->InitKernel(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
LTRACEF("kernel aspace: pt phys %#" PRIxPTR ", virt %p\n", pt_->phys(), pt_->virt());
} else if (flags_ & ARCH_ASPACE_FLAG_GUEST) {
X86PageTableEpt* ept = new (&page_table_storage_.ept) X86PageTableEpt();
pt_ = ept;
zx_status_t status = ept->Init(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
LTRACEF("guest paspace: pt phys %#" PRIxPTR ", virt %p\n", pt_->phys(), pt_->virt());
} else {
X86PageTableMmu* mmu = new (&page_table_storage_.mmu) X86PageTableMmu();
pt_ = mmu;
zx_status_t status = mmu->Init(this, test_page_alloc_func_);
if (status != ZX_OK) {
return status;
}
status = mmu->AliasKernelMappings();
if (status != ZX_OK) {
return status;
}
LTRACEF("user aspace: pt phys %#" PRIxPTR ", virt %p\n", pt_->phys(), pt_->virt());
}
ktl::atomic_init(&active_cpus_, 0);
return ZX_OK;
}
zx_status_t X86ArchVmAspace::Destroy() {
canary_.Assert();
DEBUG_ASSERT(active_cpus_.load() == 0);
if (flags_ & ARCH_ASPACE_FLAG_GUEST) {
static_cast<X86PageTableEpt*>(pt_)->Destroy(base_, size_);
} else {
static_cast<X86PageTableMmu*>(pt_)->Destroy(base_, size_);
}
return ZX_OK;
}
zx_status_t X86ArchVmAspace::Unmap(vaddr_t vaddr, size_t count, EnlargeOperation enlarge,
size_t* unmapped) {
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->UnmapPages(vaddr, count, enlarge, unmapped);
MarkAspaceModified();
return result;
}
zx_status_t X86ArchVmAspace::MapContiguous(vaddr_t vaddr, paddr_t paddr, size_t count,
uint mmu_flags, size_t* mapped) {
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->MapPagesContiguous(vaddr, paddr, count, mmu_flags, mapped);
MarkAspaceModified();
return result;
}
zx_status_t X86ArchVmAspace::Map(vaddr_t vaddr, paddr_t* phys, size_t count, uint mmu_flags,
ExistingEntryAction existing_action, size_t* mapped) {
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->MapPages(vaddr, phys, count, mmu_flags, existing_action, mapped);
MarkAspaceModified();
return result;
}
zx_status_t X86ArchVmAspace::Protect(vaddr_t vaddr, size_t count, uint mmu_flags) {
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
zx_status_t result = pt_->ProtectPages(vaddr, count, mmu_flags);
MarkAspaceModified();
return result;
}
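// Invoked on the local CPU during a thread switch: loads CR3 for the incoming aspace (or the
// kernel page table when |aspace| is null), maintains each aspace's active_cpus_ mask so TLB
// shootdowns can target only the CPUs that matter, and swaps the TSS I/O bitmap.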
void X86ArchVmAspace::ContextSwitch(X86ArchVmAspace* old_aspace, X86ArchVmAspace* aspace) {
cpu_mask_t cpu_bit = cpu_num_to_mask(arch_curr_cpu_num());
if (aspace != nullptr) {
aspace->canary_.Assert();
paddr_t phys = aspace->pt_phys();
LTRACEF_LEVEL(3, "switching to aspace %p, pt %#" PRIXPTR "\n", aspace, phys);
arch::X86Cr3::Write(phys);
if (old_aspace != nullptr) {
__UNUSED uint32_t prev = old_aspace->active_cpus_.fetch_and(~cpu_bit);
// Make sure we were actually previously running on this CPU
DEBUG_ASSERT(prev & cpu_bit);
}
__UNUSED uint32_t prev = aspace->active_cpus_.fetch_or(cpu_bit);
// Should not already be running on this CPU.
DEBUG_ASSERT(!(prev & cpu_bit));
aspace->active_since_last_check_.store(true, ktl::memory_order_relaxed);
} else {
LTRACEF_LEVEL(3, "switching to kernel aspace, pt %#" PRIxPTR "\n", kernel_pt_phys);
arch::X86Cr3::Write(kernel_pt_phys);
if (old_aspace != nullptr) {
__UNUSED uint32_t prev = old_aspace->active_cpus_.fetch_and(~cpu_bit);
// Make sure we were actually previously running on this CPU
DEBUG_ASSERT(prev & cpu_bit);
}
}
// Cleanup io bitmap entries from previous thread.
if (old_aspace)
x86_clear_tss_io_bitmap(old_aspace->io_bitmap());
// Set the io bitmap for this thread.
if (aspace)
x86_set_tss_io_bitmap(aspace->io_bitmap());
}
zx_status_t X86ArchVmAspace::Query(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) {
if (!IsValidVaddr(vaddr))
return ZX_ERR_INVALID_ARGS;
return pt_->QueryVaddr(vaddr, paddr, mmu_flags);
}
zx_status_t X86ArchVmAspace::HarvestAccessed(vaddr_t vaddr, size_t count,
NonTerminalAction non_terminal_action,
TerminalAction terminal_action) {
if (!IsValidVaddr(vaddr)) {
return ZX_ERR_INVALID_ARGS;
}
return pt_->HarvestAccessed(vaddr, count, non_terminal_action, terminal_action);
}
bool X86ArchVmAspace::ActiveSinceLastCheck(bool clear) {
// Read whether any CPUs are presently executing.
bool currently_active = active_cpus_.load(ktl::memory_order_relaxed) != 0;
// Exchange the current notion of active with the previously recorded information. This is the
// only time a |false| value can be written to active_since_last_check_, and using an exchange
// means we can never 'lose' a |true| value.
bool previously_active =
clear ? active_since_last_check_.exchange(currently_active, ktl::memory_order_relaxed)
: active_since_last_check_.load(ktl::memory_order_relaxed);
// Return whether we had previously been active. It is not necessary to also consider whether we
// are currently active, since becoming active would also have set active_since_last_check_ to
// true. In the scenario where we race and currently_active is true but we observe
// previously_active to be false, this means that as of the start of this function
// ::ContextSwitch had not completed, and so this aspace is still not actually active.
return previously_active;
}
void x86_mmu_percpu_init(void) {
arch::X86Cr0::Read()
.set_wp(true) // Set write protect.
.set_nw(false) // Clear not-write-through.
.set_cd(false) // Clear cache-disable.
.Write();
// Set the SMEP & SMAP bits in CR4.
arch::X86Cr4 cr4 = arch::X86Cr4::Read();
if (x86_feature_test(X86_FEATURE_SMEP)) {
cr4.set_smep(true);
}
if (g_x86_feature_has_smap) {
cr4.set_smap(true);
}
cr4.Write();
// Set NXE bit in X86_MSR_IA32_EFER.
uint64_t efer_msr = read_msr(X86_MSR_IA32_EFER);
efer_msr |= X86_EFER_NXE;
write_msr(X86_MSR_IA32_EFER, efer_msr);
// Explicitly check for 1: if this is the boot CPU, g_enable_isolation may not
// have been initialized yet (it is still -1).
if (g_enable_isolation == 1) {
disable_global_pages();
}
}
X86ArchVmAspace::~X86ArchVmAspace() {
if (pt_) {
pt_->~X86PageTableBase();
}
// TODO(fxbug.dev/30927): check that we've destroyed the aspace.
}
vaddr_t X86ArchVmAspace::PickSpot(vaddr_t base, vaddr_t end, vaddr_t align, size_t size,
uint mmu_flags) {
canary_.Assert();
return PAGE_ALIGN(base);
}
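// No address tagging features are reported on x86.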
uint32_t arch_address_tagging_features() { return 0; }