| // Copyright 2023 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include <align.h> |
| #include <assert.h> |
| #include <bits.h> |
| #include <debug.h> |
| #include <inttypes.h> |
| #include <lib/arch/riscv64/feature.h> |
| #include <lib/boot-options/boot-options.h> |
| #include <lib/counters.h> |
| #include <lib/fit/defer.h> |
| #include <lib/heap.h> |
| #include <lib/ktrace.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <trace.h> |
| #include <zircon/errors.h> |
| #include <zircon/types.h> |
| |
| #include <arch/aspace.h> |
| #include <arch/ops.h> |
| #include <arch/riscv64/feature.h> |
| #include <arch/riscv64/mmu.h> |
| #include <arch/riscv64/sbi.h> |
| #include <fbl/auto_lock.h> |
| #include <kernel/mp.h> |
| #include <kernel/mutex.h> |
| #include <ktl/algorithm.h> |
| #include <phys/arch/arch-handoff.h> |
| #include <vm/arch_vm_aspace.h> |
| #include <vm/physmap.h> |
| #include <vm/pmm.h> |
| #include <vm/vm.h> |
| |
| #include "asid_allocator.h" |
| |
| #define LOCAL_TRACE 0 |
| #define TRACE_CONTEXT_SWITCH 0 |
| |
// ktraces just local to this file
| #define LOCAL_KTRACE_ENABLE 0 |
| |
| // TODO-rvbringup: figure out why this isn't working |
| #if 0 |
#define LOCAL_KTRACE(string, args...) \
  KTRACE_CPU_INSTANT_ENABLE(LOCAL_KTRACE_ENABLE, "kernel:probe", string, ##args)
| #else |
| #define LOCAL_KTRACE(string, args...) |
| #endif |
| |
// Static relocated base to prepare for KASLR. Used at early boot and by the gdb
// script to know the target relocated address.
| // TODO(https://fxbug.dev/42098994): Choose it randomly. |
| uint64_t kernel_relocated_base = kArchHandoffVirtualAddress; |
| |
// The main translation table for the kernel. Used by the one kernel address space
// when kernel-only threads are active.
| alignas(PAGE_SIZE) pte_t riscv64_kernel_translation_table[RISCV64_MMU_PT_ENTRIES]; |
| |
| // A copy of the above table with memory identity mapped at 0. |
| alignas(PAGE_SIZE) pte_t riscv64_kernel_bootstrap_translation_table[RISCV64_MMU_PT_ENTRIES]; |
| |
| namespace { |
| |
// 256 top level page tables that are always mapped in the kernel half of all root
// page tables. This removes the need to explicitly maintain consistency between
// the official kernel page table root and all of the user address spaces as they
// come and go.
| alignas(PAGE_SIZE) pte_t |
| riscv64_kernel_top_level_page_tables[RISCV64_MMU_PT_ENTRIES / 2][RISCV64_MMU_PT_ENTRIES]; |
| |
// Track the size and capability of the hardware ASID, and whether it's in use.
| uint64_t riscv_asid_mask; |
| bool riscv_use_asid; |
| AsidAllocator asid_allocator; |
| |
| KCOUNTER(cm_flush_call, "mmu.consistency_manager.flush_call") |
| KCOUNTER(cm_asid_invalidate, "mmu.consistency_manager.asid_invalidate") |
| KCOUNTER(cm_global_invalidate, "mmu.consistency_manager.global_invalidate") |
| KCOUNTER(cm_page_run_invalidate, "mmu.consistency_manager.page_run_invalidate") |
| KCOUNTER(cm_single_page_invalidate, "mmu.consistency_manager.single_page_invalidate") |
| KCOUNTER(cm_local_page_invalidate, "mmu.consistency_manager.local_page_invalidate") |
| |
| KCOUNTER(vm_mmu_protect_make_execute_calls, "vm.mmu.protect.make_execute_calls") |
| KCOUNTER(vm_mmu_protect_make_execute_pages, "vm.mmu.protect.make_execute_pages") |
| |
| // Return the asid that should be assigned to the kernel aspace. |
| uint16_t kernel_asid() { |
  // When using ASIDs, the kernel is assigned KERNEL_ASID (1) instead of UNUSED_ASID (0)
  // for two reasons:
  // a) To keep it logically separate from UNUSED_ASID for debug and assert reasons.
  // b) A note in SiFive documentation for various cores says:
  // "Supervisor software that uses ASIDs should use a nonzero ASID value to refer to the same
  // address space across all harts in the supervisor execution environment (SEE) and should not
  // use an ASID value of 0. If supervisor software does not use ASIDs, then the ASID field in the
  // satp CSR should be set to 0."
  // It is unclear whether this is simply a suggestion or whether hardware performs some sort of
  // optimization based on it.
| return riscv_use_asid ? MMU_RISCV64_KERNEL_ASID : MMU_RISCV64_UNUSED_ASID; |
| } |
| |
// Given a virtual address and a level, compute the index into the page table at that level.
| constexpr uint vaddr_to_index(vaddr_t va, uint level) { |
| // levels count down from PT_LEVELS - 1 |
| DEBUG_ASSERT(level < RISCV64_MMU_PT_LEVELS); |
| |
| // canonicalize the address |
| va &= RISCV64_MMU_CANONICAL_MASK; |
| |
| uint index = |
| ((va >> PAGE_SIZE_SHIFT) >> (level * RISCV64_MMU_PT_SHIFT)) & (RISCV64_MMU_PT_ENTRIES - 1); |
| LTRACEF_LEVEL(3, "canonical va %#lx, level %u = index %#x\n", va, level, index); |
| |
| return index; |
| } |
| |
| constexpr uintptr_t page_size_per_level(uint level) { |
| // levels count down from PT_LEVELS - 1 |
| DEBUG_ASSERT(level < RISCV64_MMU_PT_LEVELS); |
| |
| return 1UL << (PAGE_SIZE_SHIFT + level * RISCV64_MMU_PT_SHIFT); |
| } |
| |
| constexpr uintptr_t page_mask_per_level(uint level) { return page_size_per_level(level) - 1; } |
| |
| // Convert user level mmu flags to flags that go in leaf descriptors. |
| pte_t mmu_flags_to_pte_attr(uint flags, bool global) { |
| pte_t attr = RISCV64_PTE_V; |
| attr |= RISCV64_PTE_A | RISCV64_PTE_D; |
| attr |= (flags & ARCH_MMU_FLAG_PERM_USER) ? RISCV64_PTE_U : 0; |
| attr |= (flags & ARCH_MMU_FLAG_PERM_READ) ? RISCV64_PTE_R : 0; |
| attr |= (flags & ARCH_MMU_FLAG_PERM_WRITE) ? RISCV64_PTE_W : 0; |
| attr |= (flags & ARCH_MMU_FLAG_PERM_EXECUTE) ? RISCV64_PTE_X : 0; |
| attr |= (global) ? RISCV64_PTE_G : 0; |
| |
| // Svpbmt support |
| if (gRiscvFeatures[arch::RiscvFeature::kSvpbmt]) { |
| switch (flags & ARCH_MMU_FLAG_CACHE_MASK) { |
| case ARCH_MMU_FLAG_CACHED: |
| attr |= RISCV64_PTE_PBMT_PMA; |
| break; |
| case ARCH_MMU_FLAG_UNCACHED: |
| case ARCH_MMU_FLAG_WRITE_COMBINING: |
| attr |= RISCV64_PTE_PBMT_NC; |
| break; |
| case ARCH_MMU_FLAG_UNCACHED_DEVICE: |
| attr |= RISCV64_PTE_PBMT_IO; |
| break; |
| } |
| } |
| |
| return attr; |
| } |
| |
// Construct a non-leaf page table entry.
| // For all inner page tables for the entire kernel hierarchy, set the global bit. |
| constexpr pte_t mmu_non_leaf_pte(paddr_t pa, bool global) { |
| return riscv64_pte_pa_to_pte(pa) | (global ? RISCV64_PTE_G : 0) | RISCV64_PTE_V; |
| } |
| |
| void update_pte(volatile pte_t* pte, pte_t newval) { *pte = newval; } |
| |
| zx::result<size_t> first_used_page_table_entry(const volatile pte_t* page_table) { |
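  // A page table page holds PAGE_SIZE / sizeof(pte_t) entries; with 8-byte PTEs that is
  // the 1U << (PAGE_SIZE_SHIFT - 3) computed below.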
| const size_t count = 1U << (PAGE_SIZE_SHIFT - 3); |
| |
| for (size_t i = 0; i < count; i++) { |
| pte_t pte = page_table[i]; |
| if (riscv64_pte_is_valid(pte)) { |
| return zx::ok(i); |
| } |
| } |
| return zx::error(ZX_ERR_NOT_FOUND); |
| } |
| |
| bool page_table_is_clear(const volatile pte_t* page_table) { |
| const zx::result<size_t> index_result = first_used_page_table_entry(page_table); |
| if (index_result.is_error()) { |
| LTRACEF("page table at %p is clear\n", page_table); |
| } else { |
| LTRACEF("page_table at %p still in use, index %zu is %#" PRIx64 "\n", page_table, *index_result, |
| page_table[*index_result]); |
| } |
| return index_result.is_error(); |
| } |
| |
| constexpr Riscv64AspaceType AspaceTypeFromFlags(uint mmu_flags) { |
| // Kernel/Guest flags are mutually exclusive. Ensure at most 1 is set. |
| DEBUG_ASSERT(((mmu_flags & ARCH_ASPACE_FLAG_KERNEL) != 0) + |
| ((mmu_flags & ARCH_ASPACE_FLAG_GUEST) != 0) <= |
| 1); |
| if (mmu_flags & ARCH_ASPACE_FLAG_KERNEL) { |
| return Riscv64AspaceType::kKernel; |
| } |
| if (mmu_flags & ARCH_ASPACE_FLAG_GUEST) { |
| return Riscv64AspaceType::kGuest; |
| } |
| return Riscv64AspaceType::kUser; |
| } |
| |
| constexpr ktl::string_view Riscv64AspaceTypeName(Riscv64AspaceType type) { |
| switch (type) { |
| case Riscv64AspaceType::kKernel: |
| return "kernel"; |
| case Riscv64AspaceType::kUser: |
| return "user"; |
| case Riscv64AspaceType::kGuest: |
| return "guest"; |
| } |
| __builtin_abort(); |
| } |
| |
| constexpr bool IsUserBaseSizeValid(vaddr_t base, size_t size) { |
  // Make sure size is > 0, base and size are page aligned, and base + size is contained
  // entirely within the user half of the canonical address space.
| if (size == 0) { |
| return false; |
| } |
| |
| if (!IS_PAGE_ALIGNED(base) || !IS_PAGE_ALIGNED(size)) { |
| return false; |
| } |
| |
| if (base & kRiscv64CanonicalAddressMask) { |
| return false; |
| } |
| |
| uint64_t computed_user_aspace_top = 0; |
| if (add_overflow(base, size, &computed_user_aspace_top)) { |
| return false; |
| } |
| |
| if ((computed_user_aspace_top - 1) & kRiscv64CanonicalAddressMask) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| // Converts a symbol in the kernel to its physical address based on knowledge of |
| // where the kernel is loaded virtually and physically. Only works for data within |
| // the kernel proper. |
| paddr_t kernel_virt_to_phys(const void* va) { |
| uintptr_t pa = reinterpret_cast<uintptr_t>(va); |
| pa += get_kernel_base_phys() - kernel_relocated_base; |
| |
| return pa; |
| } |
| |
| // Argument to SfenceVma. Used to perform TLB invalidation on an optional range |
| // with an optional ASID. When no range is present, the target is all |
| // addresses. When no ASID is present the target is invalidated for all ASIDs. |
| struct SfenceVmaArgs { |
| struct Range { |
| vaddr_t base; |
| size_t size; |
| }; |
| ktl::optional<Range> range; |
| ktl::optional<uint16_t> asid; |
| }; |
| |
| // Issues a sequence of sfence.vma instructions as specified by SfenceVmaArgs. |
| void SfenceVma(void* _args) { |
| DEBUG_ASSERT(arch_ints_disabled()); |
| auto* args = reinterpret_cast<SfenceVmaArgs*>(_args); |
| |
| if (args->range.has_value()) { |
| // With range. |
| const vaddr_t base = args->range->base; |
| const vaddr_t end = base + args->range->size; |
| if (args->asid.has_value()) { |
| // With range, one ASID. |
| const uint16_t asid = args->asid.value(); |
| for (vaddr_t va = base; va < end; va += PAGE_SIZE) { |
| riscv64_tlb_flush_address_one_asid(va, asid); |
| } |
| } else { |
| // With range, all ASIDs. |
| for (vaddr_t va = base; va < end; va += PAGE_SIZE) { |
| riscv64_tlb_flush_address_all_asids(va); |
| } |
| } |
| } else { |
| if (args->asid.has_value()) { |
| // All addresses, one ASID. |
| const uint16_t asid = args->asid.value(); |
| riscv64_tlb_flush_asid(asid); |
| } else { |
| // All addresses, all ASIDs. |
| riscv64_tlb_flush_all(); |
| } |
| } |
| } |
| |
| } // namespace |
| |
// A consistency manager that tracks TLB updates, walker syncs and free pages in an effort to
// minimize memory barriers (by delaying and coalescing TLB invalidations) and to switch to full
// ASID invalidations if too many TLB invalidations are requested.
// The aspace lock *must* be held over the full operation of the ConsistencyManager, from
// construction to deletion. Specifically, the lock must be held continuously until the actual TLB
// invalidations occur, due to the strategy employed here of only invalidating the vaddrs whose
// entries actually changed, and not all vaddrs an operation applies to. Otherwise the following
// scenario is possible:
//  1. Thread 1 performs an Unmap and removes PTE entries, but drops the lock prior to
//     invalidation.
//  2. Thread 2 performs an Unmap, no PTE entries are removed, no invalidations occur.
//  3. Thread 2 now believes the resources (pages) for the region are no longer accessible, and
//     returns them to the pmm.
//  4. Thread 3 attempts to access this region and is now able to read/write to returned pages as
//     invalidations have not occurred.
// This scenario is possible as the mappings here are not the source of truth of resource
// management, but a cache of information from other parts of the system. If thread 2 wanted to
// guarantee that the pages were free it could issue its own TLB invalidations for the vaddr
// range, even though it found no entries. However this is not the strategy employed here at the
// moment.
| class Riscv64ArchVmAspace::ConsistencyManager { |
| public: |
| ConsistencyManager(Riscv64ArchVmAspace& aspace) TA_REQ(aspace.lock_) : aspace_(aspace) {} |
| ~ConsistencyManager() { |
| Flush(); |
| if (!list_is_empty(&to_free_)) { |
| pmm_free(&to_free_); |
| } |
| } |
| |
| // Queue a TLB entry for flushing. This may get turned into a complete ASID flush. |
| void FlushEntry(vaddr_t va, bool terminal) { |
| AssertHeld(aspace_.lock_); |
| |
| LTRACEF("va %#lx, asid %#x, terminal %u\n", va, aspace_.asid_, terminal); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(va)); |
| DEBUG_ASSERT(aspace_.IsValidVaddr(va)); |
| |
| if (full_flush_) { |
| // If we've already decided to do a full flush, nothing more to track here. |
| return; |
| } |
| |
    // If we're asked to flush a non-terminal entry, we're going to need to dump the entire ASID,
    // so skip tracking this VA and exit now.
| if (!terminal) { |
| full_flush_ = true; |
| return; |
| } |
| |
    // Check whether we have queued too many entries already.
| if (num_pending_tlb_runs_ >= kMaxPendingTlbRuns) { |
      // Most of the time we will now prefer to invalidate the entire ASID; the exception is if
      // this aspace is for the kernel, in which case all pages are global and we need to flush
      // them one at a time.
| if (!aspace_.IsKernel()) { |
| full_flush_ = true; |
| return; |
| } |
| |
      // Kernel case: flush the runs we've accumulated so far and reset the counter to zero.
| Flush(); |
| } |
| |
| if (num_pending_tlb_runs_ > 0) { |
      // See if this entry extends the previous run or repeats the start of the previous run.
      // The latter catches a fairly common case of multiple flushes of the same page in a row.
| auto& run = pending_tlbs_[num_pending_tlb_runs_ - 1]; |
| if ((run.va + run.count * PAGE_SIZE == va)) { |
| run.count++; |
| return; |
| } |
| if (run.va == va) { |
| return; |
| } |
| } |
| |
| // Start a new run of entries to track |
| pending_tlbs_[num_pending_tlb_runs_].va = va; |
| pending_tlbs_[num_pending_tlb_runs_].count = 1; |
| num_pending_tlb_runs_++; |
| } |
| |
| // Performs any pending synchronization of TLBs and page table walkers. Includes the MB to ensure |
| // TLB flushes have completed prior to returning to user. |
| void Flush() TA_REQ(aspace_.lock_) { |
| kcounter_add(cm_flush_call, 1); |
| if (!full_flush_ && num_pending_tlb_runs_ == 0) { |
| return; |
| } |
| // Need a mb to synchronize any page table updates prior to flushing the TLBs. |
| mb(); |
| |
| // Check if we should just be performing a full ASID invalidation. |
| if (full_flush_) { |
| aspace_.FlushAsid(); |
| // If this is a restricted aspace, invalidate the associated unified aspace's ASID. |
| if (aspace_.IsRestricted() && aspace_.referenced_aspace_ != nullptr) { |
| Guard<Mutex> b{AssertOrderedLock, &aspace_.referenced_aspace_->lock_, |
| aspace_.referenced_aspace_->LockOrder()}; |
| aspace_.referenced_aspace_->FlushAsid(); |
| } |
| } else { |
| for (size_t i = 0; i < num_pending_tlb_runs_; i++) { |
| const vaddr_t va = pending_tlbs_[i].va; |
| const size_t count = pending_tlbs_[i].count; |
| |
| aspace_.FlushTLBEntryRun(va, count); |
| // If this is a restricted aspace, invalidate the same run in the unified aspace. |
| if (aspace_.IsRestricted() && aspace_.referenced_aspace_ != nullptr) { |
| Guard<Mutex> b{AssertOrderedLock, &aspace_.referenced_aspace_->lock_, |
| aspace_.referenced_aspace_->LockOrder()}; |
| aspace_.referenced_aspace_->FlushTLBEntryRun(va, count); |
| } |
| } |
| } |
| |
| // mb to ensure TLB flushes happen prior to returning to user. |
| mb(); |
| num_pending_tlb_runs_ = 0; |
| full_flush_ = false; |
| } |
| |
  // Queue a page for freeing that is dependent on TLB flushing. This is for pages that were
  // previously installed as page tables; they should not be reused until the non-terminal TLB
  // flush has occurred.
| void FreePage(vm_page_t* page) { list_add_tail(&to_free_, &page->queue_node); } |
| |
| private: |
  // Maximum number of TLB runs we will queue before switching to ASID invalidation.
| static constexpr uint32_t kMaxPendingTlbRuns = 8; |
| |
| // vm_page_t's to release to the PMM after the TLB invalidation occurs. |
| list_node to_free_ = LIST_INITIAL_VALUE(to_free_); |
| |
| // The aspace we are invalidating TLBs for. |
| const Riscv64ArchVmAspace& aspace_; |
| |
  // Perform a full flush of the entire ASID (or all ASIDs if a kernel aspace) in these cases:
  // 1) We've accumulated more than kMaxPendingTlbRuns runs of pages, which are expensive to
  //    flush individually because of cross-cpu TLB shootdowns.
  // 2) We've been asked to flush a non-terminal page, which according to the RISC-V
  //    privileged spec should involve clearing the entire ASID.
| bool full_flush_ = false; |
| |
| // Pending TLBs to flush are stored as a virtual address + a count of pages to flush in a run. |
| uint32_t num_pending_tlb_runs_ = 0; |
| |
| // A run of pages to flush. |
| struct { |
| uint64_t va; |
| size_t count; |
| } pending_tlbs_[kMaxPendingTlbRuns]; |
| }; |
| |
| uint Riscv64ArchVmAspace::MmuFlagsFromPte(pte_t pte) { |
| uint mmu_flags = 0; |
| mmu_flags |= (pte & RISCV64_PTE_U) ? ARCH_MMU_FLAG_PERM_USER : 0; |
| mmu_flags |= (pte & RISCV64_PTE_R) ? ARCH_MMU_FLAG_PERM_READ : 0; |
| mmu_flags |= (pte & RISCV64_PTE_W) ? ARCH_MMU_FLAG_PERM_WRITE : 0; |
| mmu_flags |= (pte & RISCV64_PTE_X) ? ARCH_MMU_FLAG_PERM_EXECUTE : 0; |
| |
| // Svpbmt feature |
| if (gRiscvFeatures[arch::RiscvFeature::kSvpbmt]) { |
| switch (pte & RISCV64_PTE_PBMT_MASK) { |
| case RISCV64_PTE_PBMT_PMA: |
        // PMA state basically means default cache parameters, as determined by physical address.
        // Don't actually report it as CACHED here since we can't know here what the actual
        // underlying physical range's type is.
| break; |
| case RISCV64_PTE_PBMT_NC: |
| mmu_flags |= ARCH_MMU_FLAG_UNCACHED; |
| break; |
| case RISCV64_PTE_PBMT_IO: |
| mmu_flags |= ARCH_MMU_FLAG_UNCACHED_DEVICE; |
| break; |
| default: |
| panic("unexpected pte value %" PRIx64, pte); |
| } |
| } |
| |
| return mmu_flags; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::Query(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) { |
| Guard<Mutex> al{AssertOrderedLock, &lock_, LockOrder()}; |
| return QueryLocked(vaddr, paddr, mmu_flags); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::QueryLocked(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) { |
| uint level = RISCV64_MMU_PT_LEVELS - 1; |
| |
| canary_.Assert(); |
| LTRACEF("aspace %p, vaddr 0x%lx\n", this, vaddr); |
| |
| DEBUG_ASSERT(tt_virt_); |
| |
| DEBUG_ASSERT(IsValidVaddr(vaddr)); |
| if (!IsValidVaddr(vaddr)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| const volatile pte_t* page_table = tt_virt_; |
| |
| while (true) { |
| ulong index = vaddr_to_index(vaddr, level); |
| const pte_t pte = page_table[index]; |
| const paddr_t pte_addr = riscv64_pte_pa(pte); |
| |
| LTRACEF("va %#" PRIxPTR ", index %lu, level %u, pte %#" PRIx64 "\n", vaddr, index, level, pte); |
| |
| if (!riscv64_pte_is_valid(pte)) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| |
| if (riscv64_pte_is_leaf(pte)) { |
| if (paddr) { |
| *paddr = pte_addr + (vaddr & page_mask_per_level(level)); |
| } |
| if (mmu_flags) { |
| *mmu_flags = MmuFlagsFromPte(pte); |
| } |
| LTRACEF("va 0x%lx, paddr 0x%lx, flags 0x%x\n", vaddr, paddr ? *paddr : ~0UL, |
| mmu_flags ? *mmu_flags : ~0U); |
| return ZX_OK; |
| } |
| |
| page_table = static_cast<const volatile pte_t*>(paddr_to_physmap(pte_addr)); |
| level--; |
| } |
| } |
| |
| zx::result<paddr_t> Riscv64ArchVmAspace::AllocPageTable() { |
  // Allocate a page from the pmm via the function pointer passed to us in Init().
  // The default is pmm_alloc_page, so test for that and call it directly to avoid any
  // unnecessary indirect calls.
| vm_page_t* page; |
| paddr_t paddr; |
| const zx_status_t status = likely(!test_page_alloc_func_) |
| ? pmm_alloc_page(0, &page, &paddr) |
| : test_page_alloc_func_(0, &page, &paddr); |
| if (status != ZX_OK) { |
| return zx::error_result(status); |
| } |
| DEBUG_ASSERT(is_physmap_phys_addr(paddr)); |
| |
| page->set_state(vm_page_state::MMU); |
| pt_pages_++; |
| |
| LOCAL_KTRACE("page table alloc"); |
| LTRACEF("allocated 0x%lx\n", paddr); |
| |
| return zx::ok(paddr); |
| } |
| |
| void Riscv64ArchVmAspace::FreePageTable(void* vaddr, paddr_t paddr, ConsistencyManager& cm) { |
| LTRACEF("vaddr %p paddr 0x%lx\n", vaddr, paddr); |
| LOCAL_KTRACE("page table free"); |
| |
| vm_page_t* const page = paddr_to_vm_page(paddr); |
| DEBUG_ASSERT(page != nullptr); |
| DEBUG_ASSERT(page->state() == vm_page_state::MMU); |
| |
| cm.FreePage(page); |
| pt_pages_--; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::SplitLargePage(vaddr_t vaddr, uint level, vaddr_t pt_index, |
| volatile pte_t* page_table, |
| ConsistencyManager& cm) { |
| LTRACEF("vaddr %#lx, level %u, pt_index %#lx, page_table %p\n", vaddr, level, pt_index, |
| page_table); |
| |
| const pte_t old_pte = page_table[pt_index]; |
| DEBUG_ASSERT(riscv64_pte_is_leaf(old_pte)); |
| |
| LTRACEF("old leaf table entry is %#lx\n", old_pte); |
| |
| const zx::result<paddr_t> result = AllocPageTable(); |
| if (result.is_error()) { |
| TRACEF("failed to allocate page table\n"); |
| return result.error_value(); |
| } |
| const paddr_t paddr = result.value(); |
| |
| const auto new_page_table = static_cast<volatile pte_t*>(paddr_to_physmap(paddr)); |
| |
| // Inherit all of the page table entry bits that aren't part of the address. |
| const pte_t new_page_attrs = old_pte & ~(RISCV64_PTE_PPN_MASK); |
| |
| LTRACEF("new page table filled with attrs %#lx | address\n", new_page_attrs); |
| |
| const size_t next_size = page_size_per_level(level - 1); |
| for (uint64_t i = 0, mapped_paddr = riscv64_pte_pa(old_pte); i < RISCV64_MMU_PT_ENTRIES; |
| i++, mapped_paddr += next_size) { |
    // Directly write to the pte; no special update protocol is needed since this is
    // a completely new table that is not yet visible to the hardware walker.
| new_page_table[i] = riscv64_pte_pa_to_pte(mapped_paddr) | new_page_attrs; |
| } |
| |
| // Ensure page table initialization becomes visible prior to page table installation. |
| wmb(); |
| |
| update_pte(&page_table[pt_index], mmu_non_leaf_pte(paddr, IsKernel())); |
| LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 "\n", page_table, pt_index, page_table[pt_index]); |
| |
| // no need to update the page table count here since we're replacing a block entry with a table |
| // entry. |
| |
| cm.FlushEntry(vaddr, false); |
| |
| return ZX_OK; |
| } |
| |
| // Use the appropriate TLB flush instruction to globally flush the modified run of pages |
| // in the appropriate ASID, or across all ASIDs if the run is in the kernel or in a shared aspace. |
| void Riscv64ArchVmAspace::FlushTLBEntryRun(vaddr_t vaddr, size_t count) const { |
| LTRACEF("vaddr %#lx, count %#lx, asid %#hx, kernel %u\n", vaddr, count, asid_, IsKernel()); |
| |
| kcounter_add(cm_page_run_invalidate, 1); |
| kcounter_add(cm_single_page_invalidate, static_cast<int64_t>(count)); |
| |
  // Future optimization here and in FlushAsid() when ASIDs are disabled:
  // based on which cpus have the aspace active, only send IPIs (either directly
  // or via SBI) to the cores on that list to shoot down TLBs.
| const size_t size = count * PAGE_SIZE; |
| if (IsKernel() || IsShared()) { |
| SfenceVmaArgs args{SfenceVmaArgs::Range{vaddr, size}}; |
| mp_sync_exec(MP_IPI_TARGET_ALL, /* cpu_mask */ 0, &SfenceVma, &args); |
| } else if (IsUser()) { |
| // Flush just the aspace's asid |
| SfenceVmaArgs args{SfenceVmaArgs::Range{vaddr, size}, asid_}; |
| mp_sync_exec(MP_IPI_TARGET_ALL, /* cpu_mask */ 0, &SfenceVma, &args); |
| } else { |
| PANIC_UNIMPLEMENTED; |
| } |
| } |
| |
| // Flush an entire ASID on all cpus. |
| void Riscv64ArchVmAspace::FlushAsid() const { |
| LTRACEF("asid %#hx, kernel %u\n", asid_, IsKernel()); |
| |
| if (IsKernel() || IsShared()) { |
| // Perform a full flush of all cpus across all ASIDs |
| SfenceVmaArgs args{}; |
| mp_sync_exec(MP_IPI_TARGET_ALL, /* cpu_mask */ 0, &SfenceVma, &args); |
| kcounter_add(cm_global_invalidate, 1); |
| } else { |
| // Perform a full flush of all cpus of a single ASID |
| SfenceVmaArgs args{.asid = asid_}; |
| mp_sync_exec(MP_IPI_TARGET_ALL, /* cpu_mask */ 0, &SfenceVma, &args); |
| kcounter_add(cm_asid_invalidate, 1); |
| } |
| } |
| |
| zx::result<size_t> Riscv64ArchVmAspace::UnmapPageTable(vaddr_t vaddr, vaddr_t vaddr_rel, |
| size_t size, EnlargeOperation enlarge, |
| uint level, volatile pte_t* page_table, |
| ConsistencyManager& cm) { |
| const vaddr_t block_size = page_size_per_level(level); |
| const vaddr_t block_mask = block_size - 1; |
| |
| LTRACEF("vaddr 0x%lx, vaddr_rel 0x%lx, size 0x%lx, level %u, page_table %p\n", vaddr, vaddr_rel, |
| size, level, page_table); |
| |
| size_t unmap_size = 0; |
| while (size) { |
| const vaddr_t vaddr_rem = vaddr_rel & block_mask; |
| const size_t chunk_size = ktl::min(size, block_size - vaddr_rem); |
| const vaddr_t index = vaddr_to_index(vaddr_rel, level); |
| |
| pte_t pte = page_table[index]; |
| |
| // If the input range partially covers a large page, attempt to split. |
| if (level > 0 && riscv64_pte_is_valid(pte) && riscv64_pte_is_leaf(pte) && |
| chunk_size != block_size) { |
| const zx_status_t status = SplitLargePage(vaddr, level, index, page_table, cm); |
| // If the split failed then we just fall through and unmap the entire large page. |
| if (likely(status == ZX_OK)) { |
| pte = page_table[index]; |
| } else if (enlarge == EnlargeOperation::No) { |
| return zx::error_result(status); |
| } |
| } |
| |
| // Check for an inner page table pointer. |
| if (level > 0 && riscv64_pte_is_valid(pte) && !riscv64_pte_is_leaf(pte)) { |
| const paddr_t page_table_paddr = riscv64_pte_pa(pte); |
| volatile pte_t* next_page_table = |
| static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr)); |
| |
| // Recurse a level. |
| zx::result<size_t> result = |
| UnmapPageTable(vaddr, vaddr_rem, chunk_size, enlarge, level - 1, next_page_table, cm); |
| if (result.is_error()) { |
| return result; |
| } |
| |
| LTRACEF_LEVEL(2, "exited recursion: back at level %u\n", level); |
| |
| // If this is an entry corresponding to a top level kernel page table (MMU_PT_LEVELS - 1), |
| // skip freeing it so that we always keep these kernel page tables populated in all address |
| // spaces. |
| const bool kernel_top_level_pt = (type_ == Riscv64AspaceType::kKernel) && |
| (index >= RISCV64_MMU_PT_KERNEL_BASE_INDEX) && |
| IsTopLevel(level); |
| // Similarly, if this is an entry corresponding to a top level shared page table, skip |
| // freeing it as there may be several unified aspaces referencing its contents. |
| const bool shared_top_level_pt = IsShared() && IsTopLevel(level); |
| if (!kernel_top_level_pt && !shared_top_level_pt && |
| (chunk_size == block_size || page_table_is_clear(next_page_table))) { |
| // If we unmapped an entire page table leaf and/or the unmap made the level below us empty, |
| // free the page table. |
| LTRACEF("pte %p[0x%lx] = 0 (was page table phys %#lx virt %p)\n", page_table, index, |
| page_table_paddr, next_page_table); |
| update_pte(&page_table[index], 0); |
| // If this is a restricted aspace and we are updating the top level page table, we need to |
| // update the top level page of the associated unified aspace. |
| if (IsTopLevel(level) && IsRestricted() && referenced_aspace_ != nullptr) { |
| Guard<Mutex> b{AssertOrderedLock, &referenced_aspace_->lock_, |
| referenced_aspace_->LockOrder()}; |
| update_pte(&referenced_aspace_->tt_virt_[index], 0); |
| } |
| |
| // We can safely defer TLB flushing as the consistency manager will not return the backing |
| // page to the PMM until after the tlb is flushed. |
| cm.FlushEntry(vaddr, false); |
| FreePageTable(const_cast<pte_t*>(next_page_table), page_table_paddr, cm); |
| } |
| } else if (riscv64_pte_is_valid(pte)) { |
| // Unmap this leaf page. |
| LTRACEF("pte %p[0x%lx] = 0 (was phys %#lx)\n", page_table, index, |
| riscv64_pte_pa(page_table[index])); |
| update_pte(&page_table[index], 0); |
| |
| cm.FlushEntry(vaddr, true); |
| } else { |
| LTRACEF("pte %p[0x%lx] already clear\n", page_table, index); |
| } |
| vaddr += chunk_size; |
| vaddr_rel += chunk_size; |
| size -= chunk_size; |
| unmap_size += chunk_size; |
| } |
| |
| return zx::ok(unmap_size); |
| } |
| |
| zx::result<size_t> Riscv64ArchVmAspace::MapPageTable(vaddr_t vaddr_in, vaddr_t vaddr_rel_in, |
| paddr_t paddr_in, size_t size_in, pte_t attrs, |
| uint level, volatile pte_t* page_table, |
| ConsistencyManager& cm) { |
| vaddr_t vaddr = vaddr_in; |
| vaddr_t vaddr_rel = vaddr_rel_in; |
| paddr_t paddr = paddr_in; |
| size_t size = size_in; |
| |
| const vaddr_t block_size = page_size_per_level(level); |
| const vaddr_t block_mask = block_size - 1; |
| |
| LTRACEF("vaddr %#" PRIxPTR ", vaddr_rel %#" PRIxPTR ", paddr %#" PRIxPTR |
| ", size %#zx, attrs %#" PRIx64 ", level %u, page_table %p\n", |
| vaddr, vaddr_rel, paddr, size, attrs, level, page_table); |
| |
| if ((vaddr_rel | paddr | size) & (PAGE_MASK)) { |
| TRACEF("not page aligned\n"); |
| return zx::error_result(ZX_ERR_INVALID_ARGS); |
| } |
| |
| auto cleanup = fit::defer([&]() { |
| AssertHeld(lock_); |
| zx::result<size_t> result = UnmapPageTable(vaddr_in, vaddr_rel_in, size_in - size, |
| EnlargeOperation::No, level, page_table, cm); |
| DEBUG_ASSERT(result.is_ok()); |
| }); |
| |
| size_t mapped_size = 0; |
| while (size) { |
| const vaddr_t vaddr_rem = vaddr_rel & block_mask; |
| const size_t chunk_size = ktl::min(size, block_size - vaddr_rem); |
| const vaddr_t index = vaddr_to_index(vaddr_rel, level); |
| pte_t pte = page_table[index]; |
| |
    // If we're at an unaligned address, the chunk is smaller than this level's block size, or
    // we're above the largest supported leaf level (1GB blocks at level 2), recurse one more
    // level into the page table tree.
| if (((vaddr_rel | paddr) & block_mask) || (chunk_size != block_size) || (level > 2)) { |
| bool allocated_page_table = false; |
| paddr_t page_table_paddr = 0; |
| volatile pte_t* next_page_table = nullptr; |
| |
| if (!riscv64_pte_is_valid(pte)) { |
| zx::result<paddr_t> result = AllocPageTable(); |
| if (result.is_error()) { |
| TRACEF("failed to allocate page table\n"); |
| return result.take_error(); |
| } |
| page_table_paddr = result.value(); |
| allocated_page_table = true; |
| void* pt_vaddr = paddr_to_physmap(page_table_paddr); |
| |
| LTRACEF("allocated page table, vaddr %p, paddr 0x%lx\n", pt_vaddr, page_table_paddr); |
| arch_zero_page(pt_vaddr); |
| |
        // Ensure that the zeroing is observable from hardware page table walkers; as we need to
        // do this prior to writing the pte, we cannot defer it using the consistency manager.
| mb(); |
| |
| pte = mmu_non_leaf_pte(page_table_paddr, IsKernel()); |
| update_pte(&page_table[index], pte); |
| // If this is a restricted aspace and we are mapping into the top level page, we need to |
| // add the page table entry to the top level page of the associated unified aspace as well. |
| if (IsTopLevel(level) && IsRestricted() && referenced_aspace_ != nullptr) { |
| Guard<Mutex> b{AssertOrderedLock, &referenced_aspace_->lock_, |
| referenced_aspace_->LockOrder()}; |
| update_pte(&referenced_aspace_->tt_virt_[index], pte); |
| } |
| // We do not need to sync the walker, despite writing a new entry, as this is a |
| // non-terminal entry and so is irrelevant to the walker anyway. |
| LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 " (paddr %#lx)\n", page_table, index, pte, |
| paddr); |
| next_page_table = static_cast<volatile pte_t*>(pt_vaddr); |
| } else if (!riscv64_pte_is_leaf(pte)) { |
| page_table_paddr = riscv64_pte_pa(pte); |
| LTRACEF("found page table %#" PRIxPTR "\n", page_table_paddr); |
| next_page_table = static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr)); |
| } else { |
| return zx::error_result(ZX_ERR_ALREADY_EXISTS); |
| } |
| DEBUG_ASSERT(next_page_table); |
| |
| zx::result<size_t> result = |
| MapPageTable(vaddr, vaddr_rem, paddr, chunk_size, attrs, level - 1, next_page_table, cm); |
| if (result.is_error()) { |
| if (allocated_page_table) { |
          // We just allocated this page table. The unmap in the cleanup handler will not clean
          // it up, as the size we pass in will not cause us to look at this page table. This is
          // reasonable: if we didn't allocate the page table then we shouldn't look into and
          // potentially unmap anything from that page table.
          // Since we just allocated it there should be nothing in it; otherwise the MapPageTable
          // call would not have failed.
| DEBUG_ASSERT(page_table_is_clear(next_page_table)); |
| page_table[index] = 0; |
| |
| // We can safely defer TLB flushing as the consistency manager will not return the backing |
| // page to the PMM until after the tlb is flushed. |
| cm.FlushEntry(vaddr, false); |
| FreePageTable(const_cast<pte_t*>(next_page_table), page_table_paddr, cm); |
| } |
| return result; |
| } |
| DEBUG_ASSERT(result.value() == chunk_size); |
| } else { |
| if (riscv64_pte_is_valid(pte)) { |
| LTRACEF("page table entry already in use, index %#" PRIxPTR ", %#" PRIx64 "\n", index, pte); |
| return zx::error_result(ZX_ERR_ALREADY_EXISTS); |
| } |
| |
| pte = riscv64_pte_pa_to_pte(paddr) | attrs; |
| LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 "\n", page_table, index, pte); |
| page_table[index] = pte; |
| |
| // Flush the TLB on map as well, unlike most architectures. |
| if (IsKernel()) { |
        // Normally we only need a local fence here and secondary cpus at worst would only
        // get a spurious page fault. However, since spurious PFs are not tolerated in the
        // kernel we want to do a full flush via the ConsistencyManager for kernel addresses.
| cm.FlushEntry(vaddr, true); |
| } else if (IsUser()) { |
| // Perform a local sfence.vma on the single page in the local asid. If another cpu were |
| // to page fault on this user address, it will sfence.vma in its PF handler. |
| riscv64_tlb_flush_address_one_asid(vaddr, asid_); |
| kcounter_add(cm_local_page_invalidate, 1); |
| } else [[unlikely]] { |
| PANIC_UNIMPLEMENTED; |
| } |
| } |
| vaddr += chunk_size; |
| vaddr_rel += chunk_size; |
| paddr += chunk_size; |
| size -= chunk_size; |
| mapped_size += chunk_size; |
| } |
| |
| cleanup.cancel(); |
| return zx::ok(mapped_size); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::ProtectPageTable(vaddr_t vaddr_in, vaddr_t vaddr_rel_in, |
| size_t size_in, pte_t attrs, uint level, |
| volatile pte_t* page_table, |
| ConsistencyManager& cm) { |
| vaddr_t vaddr = vaddr_in; |
| vaddr_t vaddr_rel = vaddr_rel_in; |
| size_t size = size_in; |
| |
| const vaddr_t block_size = page_size_per_level(level); |
| const vaddr_t block_mask = block_size - 1; |
| |
| LTRACEF("vaddr %#" PRIxPTR ", vaddr_rel %#" PRIxPTR ", size %#" PRIxPTR ", attrs %#" PRIx64 |
| ", level %u, page_table %p\n", |
| vaddr, vaddr_rel, size, attrs, level, page_table); |
| |
| // vaddr_rel and size must be page aligned |
| DEBUG_ASSERT(((vaddr_rel | size) & ((1UL << PAGE_SIZE_SHIFT) - 1)) == 0); |
| |
| while (size) { |
| const vaddr_t vaddr_rem = vaddr_rel & block_mask; |
| const size_t chunk_size = ktl::min(size, block_size - vaddr_rem); |
| const vaddr_t index = vaddr_to_index(vaddr_rel, level); |
| pte_t pte = page_table[index]; |
| |
| // If the input range partially covers a large page, split the page. |
| if (level > 0 && riscv64_pte_is_valid(pte) && riscv64_pte_is_leaf(pte) && |
| chunk_size != block_size) { |
| zx_status_t s = SplitLargePage(vaddr, level, index, page_table, cm); |
| if (unlikely(s != ZX_OK)) { |
| return s; |
| } |
| pte = page_table[index]; |
| } |
| |
| if (level > 0 && riscv64_pte_is_valid(pte) && !riscv64_pte_is_leaf(pte)) { |
| const paddr_t page_table_paddr = riscv64_pte_pa(pte); |
| volatile pte_t* next_page_table = |
| static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr)); |
| |
| // Recurse a level |
| zx_status_t status = |
| ProtectPageTable(vaddr, vaddr_rem, chunk_size, attrs, level - 1, next_page_table, cm); |
| if (unlikely(status != ZX_OK)) { |
| return status; |
| } |
| } else if (riscv64_pte_is_valid(pte)) { |
| const pte_t new_pte = (pte & ~RISCV64_PTE_PERM_MASK) | attrs; |
| |
| LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 " was %#" PRIx64 "\n", page_table, index, new_pte, |
| pte); |
| |
| // Skip updating the page table entry if the new value is the same as before. |
| if (new_pte != pte) { |
| update_pte(&page_table[index], new_pte); |
| |
| cm.FlushEntry(vaddr, true); |
| } |
| } else { |
| LTRACEF("page table entry does not exist, index %#" PRIxPTR ", %#" PRIx64 "\n", index, pte); |
| } |
| vaddr += chunk_size; |
| vaddr_rel += chunk_size; |
| size -= chunk_size; |
| } |
| |
| return ZX_OK; |
| } |
| |
| void Riscv64ArchVmAspace::HarvestAccessedPageTable(vaddr_t vaddr, vaddr_t vaddr_rel_in, size_t size, |
| uint level, |
| NonTerminalAction non_terminal_action, |
| TerminalAction terminal_action, |
| volatile pte_t* page_table, |
| ConsistencyManager& cm, bool* unmapped_out) { |
| const vaddr_t block_size = page_size_per_level(level); |
| const vaddr_t block_mask = block_size - 1; |
| |
| vaddr_t vaddr_rel = vaddr_rel_in; |
| |
| LTRACEF("vaddr 0x%lx, vaddr_rel 0x%lx, size 0x%lx, level %u, page_table %p\n", vaddr, vaddr_rel, |
| size, level, page_table); |
| |
| // vaddr_rel and size must be page aligned |
| DEBUG_ASSERT(((vaddr_rel | size) & ((1UL << PAGE_SIZE_SHIFT) - 1)) == 0); |
| |
| while (size) { |
| const vaddr_t vaddr_rem = vaddr_rel & block_mask; |
| const size_t chunk_size = ktl::min(size, block_size - vaddr_rem); |
| const vaddr_t index = vaddr_to_index(vaddr_rel, level); |
| |
| pte_t pte = page_table[index]; |
| |
| if (level > 0 && riscv64_pte_is_valid(pte) && riscv64_pte_is_leaf(pte) && |
| chunk_size != block_size) { |
      // Ignore large pages; we do not support harvesting accessed bits from them. Having this
      // empty if block simplifies the overall logic.
| } else if (level > 0 && riscv64_pte_is_valid(pte) && !riscv64_pte_is_leaf(pte)) { |
| // We're at an inner page table pointer node. |
| const paddr_t page_table_paddr = riscv64_pte_pa(pte); |
| volatile pte_t* next_page_table = |
| static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr)); |
| |
| // NOTE: We currently cannot honor NonTerminalAction::FreeUnaccessed since accessed |
| // information is not being tracked on inner nodes. |
| |
| // Recurse into the next level |
| HarvestAccessedPageTable(vaddr, vaddr_rel, chunk_size, level - 1, non_terminal_action, |
| terminal_action, next_page_table, cm, unmapped_out); |
| } else if (riscv64_pte_is_valid(pte) && (pte & RISCV64_PTE_A)) { |
| const paddr_t pte_addr = riscv64_pte_pa(pte); |
| const paddr_t paddr = pte_addr + vaddr_rem; |
| |
| vm_page_t* page = paddr_to_vm_page(paddr); |
| // Mappings for physical VMOs do not have pages associated with them and so there's no state |
| // to update on an access. |
| if (likely(page)) { |
| pmm_page_queues()->MarkAccessedDeferredCount(page); |
| |
| if (terminal_action == TerminalAction::UpdateAgeAndHarvest) { |
| // Modifying the access flag does not require break-before-make for correctness and as we |
| // do not support hardware access flag setting at the moment we do not have to deal with |
| // potential concurrent modifications. |
| pte = (pte & ~RISCV64_PTE_A); |
| LTRACEF("pte %p[%#" PRIxPTR "] = %#" PRIx64 "\n", page_table, index, pte); |
| update_pte(&page_table[index], pte); |
| |
| cm.FlushEntry(vaddr, true); |
| } |
| } |
| } |
| vaddr += chunk_size; |
| vaddr_rel += chunk_size; |
| size -= chunk_size; |
| } |
| } |
| |
| void Riscv64ArchVmAspace::MarkAccessedPageTable(vaddr_t vaddr, vaddr_t vaddr_rel_in, size_t size, |
| uint level, volatile pte_t* page_table, |
| ConsistencyManager& cm) { |
| const vaddr_t block_size = page_size_per_level(level); |
| const vaddr_t block_mask = block_size - 1; |
| |
| vaddr_t vaddr_rel = vaddr_rel_in; |
| |
| LTRACEF("vaddr 0x%lx, vaddr_rel 0x%lx, size 0x%lx, level %u, page_table %p\n", vaddr, vaddr_rel, |
| size, level, page_table); |
| |
| // vaddr_rel and size must be page aligned |
| DEBUG_ASSERT(((vaddr_rel | size) & ((1UL << PAGE_SIZE_SHIFT) - 1)) == 0); |
| |
| while (size) { |
| const vaddr_t vaddr_rem = vaddr_rel & block_mask; |
| const size_t chunk_size = ktl::min(size, block_size - vaddr_rem); |
| const vaddr_t index = vaddr_to_index(vaddr_rel, level); |
| |
| pte_t pte = page_table[index]; |
| |
| if (level > 0 && riscv64_pte_is_valid(pte) && riscv64_pte_is_leaf(pte) && |
| chunk_size != block_size) { |
| // Ignore large pages as we don't support modifying their access flags. Having this empty if |
| // block simplifies the overall logic. |
| } else if (level > 0 && riscv64_pte_is_valid(pte) && !riscv64_pte_is_leaf(pte)) { |
| const paddr_t page_table_paddr = riscv64_pte_pa(pte); |
| volatile pte_t* next_page_table = |
| static_cast<volatile pte_t*>(paddr_to_physmap(page_table_paddr)); |
| MarkAccessedPageTable(vaddr, vaddr_rem, chunk_size, level - 1, next_page_table, cm); |
| } else if (riscv64_pte_is_valid(pte)) { |
| pte |= RISCV64_PTE_A; |
| update_pte(&page_table[index], pte); |
| } |
| vaddr += chunk_size; |
| vaddr_rel += chunk_size; |
| size -= chunk_size; |
| } |
| } |
| |
| zx::result<size_t> Riscv64ArchVmAspace::MapPages(vaddr_t vaddr, paddr_t paddr, size_t size, |
| pte_t attrs, ConsistencyManager& cm) { |
| LOCAL_KTRACE("mmu map", (vaddr & ~PAGE_MASK) | ((size >> PAGE_SIZE_SHIFT) & PAGE_MASK)); |
| uint level = RISCV64_MMU_PT_LEVELS - 1; |
| zx::result<size_t> ret = MapPageTable(vaddr, vaddr, paddr, size, attrs, level, tt_virt_, cm); |
| mb(); |
| return ret; |
| } |
| |
| zx::result<size_t> Riscv64ArchVmAspace::UnmapPages(vaddr_t vaddr, size_t size, |
| EnlargeOperation enlarge, |
| ConsistencyManager& cm) { |
| LOCAL_KTRACE("mmu unmap", (vaddr & ~PAGE_MASK) | ((size >> PAGE_SIZE_SHIFT) & PAGE_MASK)); |
| uint level = RISCV64_MMU_PT_LEVELS - 1; |
| return UnmapPageTable(vaddr, vaddr, size, enlarge, level, tt_virt_, cm); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::ProtectPages(vaddr_t vaddr, size_t size, pte_t attrs) { |
| LOCAL_KTRACE("mmu protect", (vaddr & ~PAGE_MASK) | ((size >> PAGE_SIZE_SHIFT) & PAGE_MASK)); |
| uint level = RISCV64_MMU_PT_LEVELS - 1; |
| ConsistencyManager cm(*this); |
| return ProtectPageTable(vaddr, vaddr, size, attrs, level, tt_virt_, cm); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::MapContiguous(vaddr_t vaddr, paddr_t paddr, size_t count, |
| uint mmu_flags, size_t* mapped) { |
| canary_.Assert(); |
| LTRACEF("vaddr %#" PRIxPTR " paddr %#" PRIxPTR " count %zu flags %#x\n", vaddr, paddr, count, |
| mmu_flags); |
| |
| DEBUG_ASSERT(tt_virt_); |
| |
| DEBUG_ASSERT(IsValidVaddr(vaddr)); |
| if (!IsValidVaddr(vaddr)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!(mmu_flags & ARCH_MMU_FLAG_PERM_READ)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // paddr and vaddr must be aligned. |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(paddr)); |
| if (!IS_PAGE_ALIGNED(vaddr) || !IS_PAGE_ALIGNED(paddr)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (count == 0) { |
| return ZX_OK; |
| } |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| |
| if (mmu_flags & ARCH_MMU_FLAG_PERM_EXECUTE) { |
| Riscv64VmICacheConsistencyManager cache_cm; |
| cache_cm.SyncAddr(reinterpret_cast<vaddr_t>(paddr_to_physmap(paddr)), count * PAGE_SIZE); |
| } |
| |
| ConsistencyManager cm(*this); |
| const pte_t attrs = mmu_flags_to_pte_attr(mmu_flags, IsKernel()); |
| zx::result<size_t> result = MapPages(vaddr, paddr, count * PAGE_SIZE, attrs, cm); |
| if (mapped) { |
| *mapped = result.is_ok() ? result.value() / PAGE_SIZE : 0u; |
| DEBUG_ASSERT(*mapped <= count); |
| } |
| |
| return result.status_value(); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::Map(vaddr_t vaddr, paddr_t* phys, size_t count, uint mmu_flags, |
| ExistingEntryAction existing_action, size_t* mapped) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(ENABLE_PAGE_FAULT_UPGRADE || existing_action != ExistingEntryAction::Upgrade); |
| DEBUG_ASSERT(tt_virt_); |
| |
| DEBUG_ASSERT(IsValidVaddr(vaddr)); |
| if (!IsValidVaddr(vaddr)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| for (size_t i = 0; i < count; ++i) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(phys[i])); |
| if (!IS_PAGE_ALIGNED(phys[i])) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| } |
| |
| if (!(mmu_flags & ARCH_MMU_FLAG_PERM_READ)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // vaddr must be aligned. |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr)); |
| if (!IS_PAGE_ALIGNED(vaddr)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (count == 0) { |
| return ZX_OK; |
| } |
| |
| size_t total_mapped = 0; |
| { |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| if (mmu_flags & ARCH_MMU_FLAG_PERM_EXECUTE) { |
| Riscv64VmICacheConsistencyManager cache_cm; |
| for (size_t idx = 0; idx < count; ++idx) { |
| cache_cm.SyncAddr(reinterpret_cast<vaddr_t>(paddr_to_physmap(phys[idx])), PAGE_SIZE); |
| } |
| } |
| |
| size_t idx = 0; |
| ConsistencyManager cm(*this); |
| auto undo = fit::defer([&]() TA_NO_THREAD_SAFETY_ANALYSIS { |
| if (idx > 0) { |
| zx::result<size_t> result = UnmapPages(vaddr, idx * PAGE_SIZE, EnlargeOperation::No, cm); |
| DEBUG_ASSERT(result.is_ok()); |
| } |
| }); |
| |
| const pte_t attrs = mmu_flags_to_pte_attr(mmu_flags, IsKernel()); |
| vaddr_t v = vaddr; |
| for (; idx < count; ++idx) { |
| paddr_t paddr = phys[idx]; |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(paddr)); |
| zx::result<size_t> result = MapPages(v, paddr, PAGE_SIZE, attrs, cm); |
| if (result.is_error()) { |
| if (result.error_value() != ZX_ERR_ALREADY_EXISTS || |
| existing_action == ExistingEntryAction::Error) { |
| return result.error_value(); |
| } |
| } else { |
| total_mapped += result.value() / PAGE_SIZE; |
| } |
| |
| v += PAGE_SIZE; |
| } |
| undo.cancel(); |
| } |
| DEBUG_ASSERT(total_mapped <= count); |
| |
| if (mapped) { |
| // For ExistingEntryAction::Error, we should have mapped all the addresses we were asked to. |
| // For ExistingEntryAction::Skip, we might have mapped less if we encountered existing entries, |
| // but skipped entries contribute towards the total as well. |
| *mapped = count; |
| } |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::Unmap(vaddr_t vaddr, size_t count, EnlargeOperation enlarge, |
| size_t* unmapped) { |
| canary_.Assert(); |
| LTRACEF("vaddr %#" PRIxPTR " count %zu\n", vaddr, count); |
| |
| DEBUG_ASSERT(tt_virt_); |
| |
| DEBUG_ASSERT(IsValidVaddr(vaddr)); |
| |
| if (!IsValidVaddr(vaddr)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(vaddr)); |
| if (!IS_PAGE_ALIGNED(vaddr)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| |
| ConsistencyManager cm(*this); |
| zx::result<size_t> result = UnmapPages(vaddr, count * PAGE_SIZE, enlarge, cm); |
| |
| if (unmapped) { |
| *unmapped = result.is_ok() ? result.value() / PAGE_SIZE : 0u; |
| DEBUG_ASSERT(*unmapped <= count); |
| } |
| |
| return result.status_value(); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::Protect(vaddr_t vaddr, size_t count, uint mmu_flags, |
| EnlargeOperation enlarge) { |
| canary_.Assert(); |
| |
| if (!IsValidVaddr(vaddr)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (!IS_PAGE_ALIGNED(vaddr)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (!(mmu_flags & ARCH_MMU_FLAG_PERM_READ)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| if (mmu_flags & ARCH_MMU_FLAG_PERM_EXECUTE) { |
| // If mappings are going to become executable then we first need to sync their caches. |
| // Unfortunately this needs to be done on kernel virtual addresses to avoid taking translation |
| // faults, and so we need to first query for the physical address to then get the kernel virtual |
| // address in the physmap. |
    // This sync could be more deeply integrated into ProtectPages, but making existing regions
    // executable is a very uncommon operation and so we keep it simple.
| vm_mmu_protect_make_execute_calls.Add(1); |
| Riscv64VmICacheConsistencyManager cache_cm; |
| size_t pages_synced = 0; |
| for (size_t idx = 0; idx < count; idx++) { |
| paddr_t paddr; |
| uint flags; |
| if (QueryLocked(vaddr + idx * PAGE_SIZE, &paddr, &flags) == ZX_OK && |
| (flags & ARCH_MMU_FLAG_PERM_EXECUTE)) { |
| cache_cm.SyncAddr(reinterpret_cast<vaddr_t>(paddr_to_physmap(paddr)), PAGE_SIZE); |
| pages_synced++; |
| } |
| } |
| vm_mmu_protect_make_execute_pages.Add(pages_synced); |
| } |
| |
| const pte_t attrs = mmu_flags_to_pte_attr(mmu_flags, IsKernel()); |
| return ProtectPages(vaddr, count * PAGE_SIZE, attrs); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::HarvestAccessed(vaddr_t vaddr, size_t count, |
| NonTerminalAction non_terminal, |
| TerminalAction terminal) { |
| canary_.Assert(); |
| LTRACEF("vaddr %#" PRIxPTR " count %zu\n", vaddr, count); |
| |
| if (!IS_PAGE_ALIGNED(vaddr) || !IsValidVaddr(vaddr)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| Guard<Mutex> guard{AssertOrderedLock, &lock_, LockOrder()}; |
| |
| const size_t size = count * PAGE_SIZE; |
| LOCAL_KTRACE("mmu harvest accessed", |
| (vaddr & ~PAGE_MASK) | ((size >> PAGE_SIZE_SHIFT) & PAGE_MASK)); |
| |
| ConsistencyManager cm(*this); |
| |
| HarvestAccessedPageTable(vaddr, vaddr, size, RISCV64_MMU_PT_LEVELS - 1, non_terminal, terminal, |
| tt_virt_, cm, nullptr); |
| return ZX_OK; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::MarkAccessed(vaddr_t vaddr, size_t count) { |
| canary_.Assert(); |
| LTRACEF("vaddr %#" PRIxPTR " count %zu\n", vaddr, count); |
| |
| if (!IS_PAGE_ALIGNED(vaddr) || !IsValidVaddr(vaddr)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| |
| const size_t size = count * PAGE_SIZE; |
| LOCAL_KTRACE("mmu mark accessed", (vaddr & ~PAGE_MASK) | ((size >> PAGE_SIZE_SHIFT) & PAGE_MASK)); |
| |
| ConsistencyManager cm(*this); |
| |
| MarkAccessedPageTable(vaddr, vaddr, size, RISCV64_MMU_PT_LEVELS - 1, tt_virt_, cm); |
| |
| return ZX_OK; |
| } |
| |
| bool Riscv64ArchVmAspace::ActiveSinceLastCheck(bool clear) { |
| // Read whether any CPUs are presently executing. |
| bool currently_active = num_active_cpus_.load(ktl::memory_order_relaxed) != 0; |
| // Exchange the current notion of active, with the previously active information. This is the only |
| // time a |false| value can potentially be written to active_since_last_check_, and doing an |
| // exchange means we can never 'lose' a |true| value. |
| bool previously_active = |
| clear ? active_since_last_check_.exchange(currently_active, ktl::memory_order_relaxed) |
| : active_since_last_check_.load(ktl::memory_order_relaxed); |
  // Return whether we had previously been active. It is not necessary to also consider whether we
  // are currently active, since activating would also have set active_since_last_check_ to true.
  // In the scenario where we race and currently_active is true, but we observe previously_active
  // to be false, this means that as of the start of this function ::ContextSwitch had not
  // completed, and so this aspace is still not actually active.
| return previously_active; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::Init() { |
| canary_.Assert(); |
| LTRACEF("aspace %p, base %#" PRIxPTR ", size 0x%zx, type %*s\n", this, base_, size_, |
| static_cast<int>(Riscv64AspaceTypeName(type_).size()), |
| Riscv64AspaceTypeName(type_).data()); |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| |
| // Validate that the base + size is valid and doesn't wrap. |
| DEBUG_ASSERT(size_ > 0 || IsUnified()); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(base_)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size_)); |
| [[maybe_unused]] uintptr_t unused; |
| DEBUG_ASSERT(!add_overflow(base_, size_ - 1, &unused)); |
| |
| if (type_ == Riscv64AspaceType::kKernel) { |
    // At the moment we can only deal with the kernel address space as globally defined.
| DEBUG_ASSERT(base_ == KERNEL_ASPACE_BASE); |
| DEBUG_ASSERT(size_ == KERNEL_ASPACE_SIZE); |
| |
| tt_virt_ = riscv64_kernel_translation_table; |
| tt_phys_ = kernel_virt_to_phys(riscv64_kernel_translation_table); |
| asid_ = kernel_asid(); |
| } else { |
| if (type_ == Riscv64AspaceType::kUser) { |
| DEBUG_ASSERT_MSG(IsUnified() || IsUserBaseSizeValid(base_, size_), |
| "base %#" PRIxPTR " size 0x%zx", base_, size_); |
| if (!IsUserBaseSizeValid(base_, size_) && !IsUnified()) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
      // If using ASIDs, assign a unique ASID per process. If not, assign the UNUSED
      // ASID to this address space, which will be the same between all aspaces.
| if (riscv_use_asid) { |
| auto status = asid_allocator.Alloc(); |
| if (status.is_error()) { |
| printf("RISC-V: out of ASIDs!\n"); |
| return status.status_value(); |
| } |
| asid_ = status.value(); |
| } else { |
| asid_ = MMU_RISCV64_UNUSED_ASID; |
| } |
| } else { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // allocate a top level page table to serve as the translation table |
| const zx::result<paddr_t> result = AllocPageTable(); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| const paddr_t pa = result.value(); |
| |
| volatile pte_t* va = static_cast<volatile pte_t*>(paddr_to_physmap(pa)); |
| tt_virt_ = va; |
| tt_phys_ = pa; |
| |
    // Zero the user (lower) half of the top level translation table and copy the kernel
    // memory mapping into the upper (kernel) half.
| memset((void*)tt_virt_, 0, PAGE_SIZE / 2); |
| memcpy((void*)(tt_virt_ + RISCV64_MMU_PT_ENTRIES / 2), |
| (void*)(riscv64_kernel_translation_table + RISCV64_MMU_PT_ENTRIES / 2), PAGE_SIZE / 2); |
| } |
| pt_pages_ = 1; |
| |
| LTRACEF("tt_phys %#" PRIxPTR " tt_virt %p\n", tt_phys_, tt_virt_); |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::InitRestricted() { |
| role_ = Riscv64AspaceRole::kRestricted; |
| return Init(); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::InitShared() { |
| role_ = Riscv64AspaceRole::kShared; |
| zx_status_t status = Init(); |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| |
| // Prepopulate the portion of the top level page table spanned by this aspace by allocating the |
| // necessary second level entries. |
| const uint top_level = RISCV64_MMU_PT_LEVELS - 1; |
| const uint start = vaddr_to_index(base_, top_level); |
| const uint end = vaddr_to_index(base_ + size_, top_level) - 1; |
| for (uint i = start; i <= end; i++) { |
| zx::result<paddr_t> result = AllocPageTable(); |
| if (result.is_error()) { |
| LTRACEF("failed to allocate second level page table for shared aspace\n"); |
| return result.error_value(); |
| } |
| paddr_t page_table_paddr = result.value(); |
| void* pt_vaddr = paddr_to_physmap(page_table_paddr); |
| arch_zero_page(pt_vaddr); |
| |
    // Ensure that the zeroing is observable from hardware page table walkers; as we need to
    // do this prior to writing the pte, we cannot defer it using the consistency manager.
| mb(); |
| |
| pte_t pte = mmu_non_leaf_pte(page_table_paddr, false); |
| update_pte(&tt_virt_[i], pte); |
| } |
| return ZX_OK; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::InitUnified(ArchVmAspaceInterface& s, ArchVmAspaceInterface& r) { |
| canary_.Assert(); |
| |
| // The base_ and size_ of a unified aspace are expected to be zero. |
| DEBUG_ASSERT(size_ == 0); |
| DEBUG_ASSERT(base_ == 0); |
| |
| role_ = Riscv64AspaceRole::kUnified; |
| zx_status_t status = Init(); |
| if (status != ZX_OK) { |
| return status; |
| } |
| Riscv64ArchVmAspace& shared = static_cast<Riscv64ArchVmAspace&>(s); |
| Riscv64ArchVmAspace& restricted = static_cast<Riscv64ArchVmAspace&>(r); |
| { |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| referenced_aspace_ = &restricted; |
| shared_aspace_ = &shared; |
| } |
| |
| const uint top_level = RISCV64_MMU_PT_LEVELS - 1; |
| const uint restricted_start = vaddr_to_index(restricted.base_, top_level); |
| const uint restricted_end = vaddr_to_index(restricted.base_ + restricted.size_, top_level) - 1; |
| const uint shared_start = vaddr_to_index(shared.base_, top_level); |
| const uint shared_end = vaddr_to_index(shared.base_ + shared.size_, top_level) - 1; |
| DEBUG_ASSERT(restricted_end < shared_start); |
| |
| // Validate that the restricted aspace is empty and set its metadata. |
| { |
| Guard<Mutex> a{AssertOrderedLock, &restricted.lock_, restricted.LockOrder()}; |
| DEBUG_ASSERT(restricted.tt_virt_); |
| DEBUG_ASSERT(restricted.IsRestricted()); |
| DEBUG_ASSERT(restricted.num_references_ == 0); |
| DEBUG_ASSERT(restricted.referenced_aspace_ == nullptr); |
| for (uint i = restricted_start; i <= restricted_end; i++) { |
| DEBUG_ASSERT(restricted.tt_virt_[i] == 0); |
| } |
| restricted.num_references_++; |
| restricted.referenced_aspace_ = this; |
| } |
| |
| // Copy all mappings from the shared aspace and set its metadata. |
| { |
| Guard<Mutex> a{AssertOrderedLock, &shared.lock_, shared.LockOrder()}; |
| DEBUG_ASSERT(shared.tt_virt_); |
| DEBUG_ASSERT(shared.IsShared()); |
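| // Mirror the shared aspace's top level entries into this table. Both tables then point at |
| // the same second level tables (prepopulated in InitShared), so later updates to the shared |
| // aspace become visible here without extra maintenance. |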
| for (uint i = shared_start; i <= shared_end; i++) { |
| tt_virt_[i] = shared.tt_virt_[i]; |
| } |
| shared.num_references_++; |
| } |
| return ZX_OK; |
| } |
| |
| void Riscv64ArchVmAspace::DisableUpdates() { |
| // TODO-rvbringup: add machinery for this and the update checker logic |
| } |
| |
| void Riscv64ArchVmAspace::FreeTopLevelPage() { |
| vm_page_t* page = paddr_to_vm_page(tt_phys_); |
| DEBUG_ASSERT(page); |
| pmm_free_page(page); |
| pt_pages_--; |
| |
| tt_phys_ = 0; |
| tt_virt_ = nullptr; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::Destroy() { |
| canary_.Assert(); |
| LTRACEF("aspace %p\n", this); |
| |
| // Not okay to destroy the kernel address space. |
| DEBUG_ASSERT(type_ != Riscv64AspaceType::kKernel); |
| |
| if (IsUnified()) { |
| return DestroyUnified(); |
| } |
| return DestroyIndividual(); |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::DestroyUnified() { |
| DEBUG_ASSERT(IsUnified()); |
| |
| Riscv64ArchVmAspace* restricted = nullptr; |
| Riscv64ArchVmAspace* shared = nullptr; |
| { |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| restricted = referenced_aspace_; |
| shared = shared_aspace_; |
| shared_aspace_ = nullptr; |
| referenced_aspace_ = nullptr; |
| } |
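| // With our own lock dropped, update the shared and restricted aspaces' reference counts |
| // under their respective locks. |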
| { |
| Guard<Mutex> a{AssertOrderedLock, &shared->lock_, shared->LockOrder()}; |
| // The shared page table should be referenced by at least this page table, and could be |
| // referenced by many other unified page tables. |
| DEBUG_ASSERT(shared->num_references_ > 0); |
| shared->num_references_--; |
| } |
| { |
| Guard<Mutex> a{AssertOrderedLock, &restricted->lock_, restricted->LockOrder()}; |
| // The restricted_aspace_ page table can only be referenced by a singular unified page table. |
| DEBUG_ASSERT(restricted->num_references_ == 1); |
| restricted->num_references_--; |
| } |
| |
| Guard<Mutex> a{AssertOrderedLock, &lock_, LockOrder()}; |
| if (riscv_use_asid) { |
| // Flush all TLB entries associated with this aspace's ASID. |
| FlushAsid(); |
| |
| // Return the ASID to the allocator. |
| auto status = asid_allocator.Free(asid_); |
| ASSERT(status.is_ok()); |
| asid_ = MMU_RISCV64_UNUSED_ASID; |
| } |
| |
| FreeTopLevelPage(); |
| return ZX_OK; |
| } |
| |
| zx_status_t Riscv64ArchVmAspace::DestroyIndividual() { |
| DEBUG_ASSERT(!IsUnified()); |
| Guard<Mutex> guard{AssertOrderedLock, &lock_, LockOrder()}; |
| DEBUG_ASSERT(num_references_ == 0); |
| |
| // If this is a shared aspace, its top level page table was prepopulated in InitShared. |
| // Therefore, we need to clean up all of those entries manually here. |
| if (IsShared()) { |
| const uint top_level = RISCV64_MMU_PT_LEVELS - 1; |
| const uint start = vaddr_to_index(base_, top_level); |
| const uint end = vaddr_to_index(base_ + size_, top_level) - 1; |
| for (uint i = start; i <= end; i++) { |
| const paddr_t page_table_paddr = riscv64_pte_pa(tt_virt_[i]); |
| pmm_free_page(paddr_to_vm_page(page_table_paddr)); |
| pt_pages_--; |
| update_pte(&tt_virt_[i], 0); |
| } |
| } |
| |
| // Check to see if the top level page table is empty. If not, the user didn't properly |
| // unmap everything before destroying the aspace. The kernel (top) half of the table always |
| // holds the shared kernel entries, so only entries below RISCV64_MMU_PT_KERNEL_BASE_INDEX |
| // must be clear. |
| const zx::result<size_t> index_result = first_used_page_table_entry(tt_virt_); |
| DEBUG_ASSERT_MSG( |
| index_result.is_error() || *index_result >= RISCV64_MMU_PT_KERNEL_BASE_INDEX, |
| "Top level page table still in use: aspace %p tt_virt %p index %zu entry %" PRIx64, this, |
| tt_virt_, *index_result, index_result.is_ok() ? tt_virt_[*index_result] : 0); |
| DEBUG_ASSERT_MSG(pt_pages_ == 1, "Too many page table pages: aspace %p pt_pages_ %zu", this, |
| pt_pages_); |
| |
| if (riscv_use_asid) { |
| // Flush all TLB entries associated with this aspace's ASID. |
| FlushAsid(); |
| |
| // Return the ASID to the allocator. |
| auto status = asid_allocator.Free(asid_); |
| ASSERT(status.is_ok()); |
| asid_ = MMU_RISCV64_UNUSED_ASID; |
| } |
| |
| // Free the top level page table |
| FreeTopLevelPage(); |
| return ZX_OK; |
| } |
| |
| // Called during context switches between threads with different address spaces. Swaps the |
| // mmu context on hardware. Assumes old_aspace != aspace and optimizes as such. |
| void Riscv64ArchVmAspace::ContextSwitch(Riscv64ArchVmAspace* old_aspace, |
| Riscv64ArchVmAspace* aspace) { |
| uint64_t satp; |
| |
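| // RV64 satp layout: MODE in bits [63:60] (Sv39 = 8), ASID in bits [59:44], and the physical |
| // page number of the root page table in bits [43:0]. |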
| if (likely(aspace)) { |
| aspace->canary_.Assert(); |
| DEBUG_ASSERT(aspace->type_ == Riscv64AspaceType::kUser); |
| |
| // Load the user space SATP with the translation table and user space ASID. |
| satp = ((uint64_t)RISCV64_SATP_MODE_SV39 << RISCV64_SATP_MODE_SHIFT) | |
| ((uint64_t)aspace->asid_ << RISCV64_SATP_ASID_SHIFT) | |
| (aspace->tt_phys_ >> PAGE_SIZE_SHIFT); |
| |
| [[maybe_unused]] uint32_t prev = |
| aspace->num_active_cpus_.fetch_add(1, ktl::memory_order_relaxed); |
| DEBUG_ASSERT(prev < SMP_MAX_CPUS); |
| aspace->active_since_last_check_.store(true, ktl::memory_order_relaxed); |
| // If the aspace we are context switching to is unified, we need to mark the associated shared |
| // and restricted aspaces as active since we may access their mappings indirectly. |
| if (aspace->IsUnified()) { |
| aspace->get_shared_aspace()->active_since_last_check_.store(true, ktl::memory_order_relaxed); |
| aspace->get_restricted_aspace()->active_since_last_check_.store(true, |
| ktl::memory_order_relaxed); |
| } |
| } else { |
| // Switching to the null aspace, which means kernel address space only. |
| satp = ((uint64_t)RISCV64_SATP_MODE_SV39 << RISCV64_SATP_MODE_SHIFT) | |
| ((uint64_t)kernel_asid() << RISCV64_SATP_ASID_SHIFT) | |
| (kernel_virt_to_phys(riscv64_kernel_translation_table) >> PAGE_SIZE_SHIFT); |
| } |
| if (likely(old_aspace != nullptr)) { |
| [[maybe_unused]] uint32_t prev = |
| old_aspace->num_active_cpus_.fetch_sub(1, ktl::memory_order_relaxed); |
| DEBUG_ASSERT(prev > 0); |
| } |
| if (TRACE_CONTEXT_SWITCH) { |
| TRACEF("old aspace %p aspace %p satp %#" PRIx64 "\n", old_aspace, aspace, satp); |
| } |
| |
| riscv64_csr_write(RISCV64_CSR_SATP, satp); |
| mb(); |
| |
| // If we're not using asids, all non-global mappings are tagged with MMU_RISCV64_UNUSED_ASID, |
| // so flush that asid's TLB entries on every context switch. |
| if (!riscv_use_asid) { |
| riscv64_tlb_flush_asid(MMU_RISCV64_UNUSED_ASID); |
| } |
| } |
| |
| Riscv64ArchVmAspace::Riscv64ArchVmAspace(vaddr_t base, size_t size, Riscv64AspaceType type, |
| page_alloc_fn_t paf) |
| : test_page_alloc_func_(paf), type_(type), base_(base), size_(size) {} |
| |
| Riscv64ArchVmAspace::Riscv64ArchVmAspace(vaddr_t base, size_t size, uint mmu_flags, |
| page_alloc_fn_t paf) |
| : Riscv64ArchVmAspace(base, size, AspaceTypeFromFlags(mmu_flags), paf) {} |
| |
| Riscv64ArchVmAspace::~Riscv64ArchVmAspace() { |
| // Destroy() will have freed the final page table if it ran correctly, and further validated that |
| // everything else was freed. |
| DEBUG_ASSERT(pt_pages_ == 0); |
| } |
| |
| vaddr_t Riscv64ArchVmAspace::PickSpot(vaddr_t base, vaddr_t end, vaddr_t align, size_t size, |
| uint mmu_flags) { |
| canary_.Assert(); |
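| // No special placement policy is applied; simply return the page-aligned base of the |
| // candidate range. |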
| return PAGE_ALIGN(base); |
| } |
| |
| void riscv64_mmu_early_init() { |
| // Figure out the number of supported ASID bits by writing all 1s to |
| // the asid field in satp and seeing which ones 'stick'. |
| auto satp_orig = riscv64_csr_read(satp); |
| auto satp = satp_orig | (RISCV64_SATP_ASID_MASK << RISCV64_SATP_ASID_SHIFT); |
| riscv64_csr_write(satp, satp); |
| riscv_asid_mask = (riscv64_csr_read(satp) >> RISCV64_SATP_ASID_SHIFT) & RISCV64_SATP_ASID_MASK; |
| riscv64_csr_write(satp, satp_orig); |
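| // The asid field is WARL, so unimplemented bits read back as zero: a hart implementing all |
| // 16 asid bits yields a mask of 0xffff, while one with no asid support yields 0. |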
| |
| // Fill in all of the unused top level page table pointers for the kernel half of the kernel |
| // top level table. These entries will be copied to all new address spaces, thus ensuring the |
| // top level entries are synchronized. |
| for (size_t i = RISCV64_MMU_PT_KERNEL_BASE_INDEX; i < RISCV64_MMU_PT_ENTRIES; i++) { |
| if (!riscv64_pte_is_valid(riscv64_kernel_bootstrap_translation_table[i])) { |
| paddr_t pt_paddr = kernel_virt_to_phys( |
| riscv64_kernel_top_level_page_tables[i - RISCV64_MMU_PT_KERNEL_BASE_INDEX]); |
| |
| LTRACEF("RISCV: MMU allocating top level page table for slot %zu, pa %#lx\n", i, pt_paddr); |
| |
| pte_t pte = mmu_non_leaf_pte(pt_paddr, true); |
| update_pte(&riscv64_kernel_bootstrap_translation_table[i], pte); |
| } |
| } |
| |
| // Initialize the main kernel translation table from the bootstrap table. The bootstrap |
| // table retains the boot-time identity map in its user (bottom) half. |
| memcpy(riscv64_kernel_translation_table, riscv64_kernel_bootstrap_translation_table, PAGE_SIZE); |
| |
| // Zero the user (bottom) half of the main kernel table to remove any leftover boot mappings. |
| memset(riscv64_kernel_translation_table, 0, PAGE_SIZE / 2); |
| |
| // Make sure the table updates are visible to the cpu's page table walkers. |
| wmb(); |
| } |
| |
| namespace { |
| |
| // Load the kernel page tables and set the passed-in asid. |
| void riscv64_switch_kernel_asid(uint16_t asid) { |
| const uint64_t satp = ((uint64_t)RISCV64_SATP_MODE_SV39 << RISCV64_SATP_MODE_SHIFT) | |
| ((uint64_t)asid << RISCV64_SATP_ASID_SHIFT) | |
| (kernel_virt_to_phys(riscv64_kernel_translation_table) >> PAGE_SIZE_SHIFT); |
| riscv64_csr_write(RISCV64_CSR_SATP, satp); |
| |
| // Global TLB flush. |
| riscv64_tlb_flush_all(); |
| } |
| |
| } // anonymous namespace |
| |
| void riscv64_mmu_early_init_percpu() { |
| // Switch to the proper kernel translation table. |
| // Note: during early bringup on the boot cpu, we will not have decided whether to use asids |
| // yet, so kernel_asid() will return UNUSED_ASID. This is okay; we will decide whether to |
| // use asids on the boot cpu in riscv64_mmu_prevm_init and reload the satp. |
| // Everything will be sorted out by the time secondary cpus are brought up. |
| riscv64_switch_kernel_asid(kernel_asid()); |
| |
| // Global TLB flush. |
| riscv64_tlb_flush_all(); |
| } |
| |
| void riscv64_mmu_prevm_init() { |
| // Use asids if hardware has full 16 bit support and our command line switches allow. |
| // We decide here because before now we have not been able to read gBootOptions. |
| riscv_use_asid = gBootOptions->riscv64_enable_asid && riscv_asid_mask == 0xffff; |
| |
| // Now that we've decided to use asids, reload the kernel satp with the proper asid |
| // on the boot cpu. |
| riscv64_switch_kernel_asid(kernel_asid()); |
| } |
| |
| void riscv64_mmu_init() { |
| dprintf(INFO, "RISCV: MMU enabled sv39\n"); |
| dprintf(INFO, "RISCV: MMU ASID mask %#lx, using asids %u\n", riscv_asid_mask, riscv_use_asid); |
| } |
| |
| void Riscv64VmICacheConsistencyManager::SyncAddr(vaddr_t start, size_t len) { |
| LTRACEF("start %#lx, len %zu\n", start, len); |
| |
| // Validate we are operating on a kernel address range. |
| DEBUG_ASSERT(is_kernel_address(start)); |
| |
| // Track that we'll need to fence.i at the end; the precise address is not important. |
| need_invalidate_ = true; |
| } |
| |
| void Riscv64VmICacheConsistencyManager::Finish() { |
| LTRACEF("need_invalidate %d\n", need_invalidate_); |
| if (!need_invalidate_) { |
| return; |
| } |
| |
| // Sync any one address; fence.i invalidates the entire icache (for now). |
| arch_sync_cache_range(KERNEL_ASPACE_BASE, PAGE_SIZE); |
| |
| need_invalidate_ = false; |
| } |
| |
| uint32_t arch_address_tagging_features() { return 0; } |
| |
| void arch_zero_page(void* _ptr) { |
| const uintptr_t end_address = reinterpret_cast<uintptr_t>(_ptr) + PAGE_SIZE; |
| |
| if (gRiscvFeatures[arch::RiscvFeature::kZicboz]) { |
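| // Zicboz path: each cbo.zero zeroes one cache block of riscv_cboz_size bytes. This assumes |
| // the page is aligned to, and a multiple of, the cache block size. |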
| asm volatile( |
| R"""( |
| .balign 4 |
| 0: |
| cbo.zero 0(%0) |
| add %0,%0,%2 |
| bne %0,%1,0b |
| )""" |
| : "+r"(_ptr) |
| : "r"(end_address), "r"(riscv_cboz_size) |
| : "memory"); |
| |
| } else { |
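| // Fallback path: zero 64 bytes per iteration with eight doubleword stores; assumes |
| // PAGE_SIZE is a multiple of 64. |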
| asm volatile( |
| R"""( |
| .balign 4 |
| 0: |
| sd zero,0(%0) |
| sd zero,8(%0) |
| sd zero,16(%0) |
| sd zero,24(%0) |
| sd zero,32(%0) |
| sd zero,40(%0) |
| sd zero,48(%0) |
| sd zero,56(%0) |
| addi %0,%0,64 |
| bne %0,%1,0b |
| )""" |
| : "+r"(_ptr) |
| : "r"(end_address) |
| : "memory"); |
| } |
| } |