// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_VM_INCLUDE_VM_VM_ADDRESS_REGION_H_
#define ZIRCON_KERNEL_VM_INCLUDE_VM_VM_ADDRESS_REGION_H_
#include <assert.h>
#include <lib/crypto/prng.h>
#include <lib/zircon-internal/thread_annotations.h>
#include <stdint.h>
#include <zircon/types.h>
#include <fbl/canary.h>
#include <fbl/function.h>
#include <fbl/intrusive_double_list.h>
#include <fbl/intrusive_wavl_tree.h>
#include <fbl/ref_counted.h>
#include <fbl/ref_ptr.h>
#include <ktl/limits.h>
#include <ktl/optional.h>
#include <vm/vm_aspace.h>
#include <vm/vm_object.h>
#include <vm/vm_page_list.h>
// Creation flags for VmAddressRegion and VmMappings
// When randomly allocating subregions, reduce sprawl by placing allocations
// near each other.
#define VMAR_FLAG_COMPACT (1 << 0)
// Request that the new region be at the specified offset in its parent region.
#define VMAR_FLAG_SPECIFIC (1 << 1)
// Like VMAR_FLAG_SPECIFIC, but permits overwriting existing mappings. This
// flag will not overwrite through a subregion.
#define VMAR_FLAG_SPECIFIC_OVERWRITE (1 << 2)
// Allow VmMappings to be created inside the new region with the SPECIFIC or
// OFFSET_IS_UPPER_LIMIT flag.
#define VMAR_FLAG_CAN_MAP_SPECIFIC (1 << 3)
// When on a VmAddressRegion, allow VmMappings to be created inside the region
// with read permissions. When on a VmMapping, controls whether or not the
// mapping can gain this permission.
#define VMAR_FLAG_CAN_MAP_READ (1 << 4)
// When on a VmAddressRegion, allow VmMappings to be created inside the region
// with write permissions. When on a VmMapping, controls whether or not the
// mapping can gain this permission.
#define VMAR_FLAG_CAN_MAP_WRITE (1 << 5)
// When on a VmAddressRegion, allow VmMappings to be created inside the region
// with execute permissions. When on a VmMapping, controls whether or not the
// mapping can gain this permission.
#define VMAR_FLAG_CAN_MAP_EXECUTE (1 << 6)
// Require that the VMO backing the mapping is non-resizable.
#define VMAR_FLAG_REQUIRE_NON_RESIZABLE (1 << 7)
// Allow VMO backings that could result in faults.
#define VMAR_FLAG_ALLOW_FAULTS (1 << 8)
// Treat the offset as an upper limit when allocating a VMO or child VMAR.
#define VMAR_FLAG_OFFSET_IS_UPPER_LIMIT (1 << 9)
#define VMAR_CAN_RWX_FLAGS \
(VMAR_FLAG_CAN_MAP_READ | VMAR_FLAG_CAN_MAP_WRITE | VMAR_FLAG_CAN_MAP_EXECUTE)
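//
// Illustrative example (not part of this header): a VMAR intended to host child mappings at
// caller-chosen addresses with any RWX permissions might be created with flags such as:
//
//   uint32_t vmar_flags = VMAR_FLAG_CAN_MAP_SPECIFIC | VMAR_CAN_RWX_FLAGS;
//
// Individual mappings then request a subset of these permissions via their arch mmu flags.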
// forward declarations
class VmAddressRegion;
class VmMapping;
class VmEnumerator;
class LazyPageRequest;
// A VmAddressRegion represents a contiguous region of the virtual address
// space. It is partitioned by non-overlapping children of the following types:
// 1) child VmAddressRegion
// 2) child VmMapping (leaves that map VmObjects into the address space)
// 3) gaps (logical, not actually objects).
//
// VmAddressRegionOrMapping represents a tagged union of the two types.
//
// A VmAddressRegion/VmMapping may be in one of two states: ALIVE or DEAD. If
// it is ALIVE, then the VmAddressRegion is a description of the virtual memory
// mappings of the address range it represents in its parent VmAspace. If it is
// DEAD, then the VmAddressRegion is invalid and has no meaning.
//
// All VmAddressRegion and VmMapping state is protected by the aspace lock.
class VmAddressRegionOrMapping
: public fbl::RefCounted<VmAddressRegionOrMapping>,
public fbl::WAVLTreeContainable<fbl::RefPtr<VmAddressRegionOrMapping>> {
public:
// If this is a VmMapping, unmap all pages and drop the dependency on the VmObject it holds a
// reference to. Otherwise, recursively destroy child VMARs and transition to the DEAD state.
//
// Returns ZX_OK on success, ZX_ERR_BAD_STATE if already dead, and other
// values on error (typically unmap failure).
virtual zx_status_t Destroy();
// accessors
vaddr_t base() const { return base_; }
size_t size() const { return size_; }
uint32_t flags() const { return flags_; }
const fbl::RefPtr<VmAspace>& aspace() const { return aspace_; }
// Recursively compute the number of allocated pages within this region
virtual size_t AllocatedPages() const;
// Subtype information and safe down-casting
bool is_mapping() const { return is_mapping_; }
fbl::RefPtr<VmAddressRegion> as_vm_address_region();
fbl::RefPtr<VmMapping> as_vm_mapping();
VmAddressRegion* as_vm_address_region_ptr();
VmMapping* as_vm_mapping_ptr();
// Page fault in an address within the region. Recursively traverses
// the regions to find the target mapping, if it exists.
// If this returns ZX_ERR_SHOULD_WAIT, then the caller should wait on |page_request|
// and try again.
virtual zx_status_t PageFault(vaddr_t va, uint pf_flags, LazyPageRequest* page_request)
TA_REQ(lock()) = 0;
// WAVL tree key function
vaddr_t GetKey() const { return base(); }
// Dump debug info
virtual void DumpLocked(uint depth, bool verbose) const TA_REQ(lock()) = 0;
// Expose our backing lock for annotation purposes.
Lock<Mutex>* lock() const TA_RET_CAP(aspace_->lock()) { return aspace_->lock(); }
Lock<Mutex>& lock_ref() const TA_RET_CAP(aspace_->lock()) { return aspace_->lock_ref(); }
bool is_in_range(vaddr_t base, size_t size) const {
const size_t offset = base - base_;
return base >= base_ && offset < size_ && size_ - offset >= size;
}
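// As a worked example of the overflow-safe check above (values are illustrative): for a region
// with base_ = 0x1000 and size_ = 0x3000, is_in_range(0x2000, 0x2000) computes
// offset = 0x1000; since 0x1000 < 0x3000 and 0x3000 - 0x1000 >= 0x2000, the range
// [0x2000, 0x4000) is contained in the region. The subtraction form avoids computing
// base + size, which could overflow.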
private:
fbl::Canary<fbl::magic("VMRM")> canary_;
const bool is_mapping_;
protected:
// friend VmAddressRegion so it can access DestroyLocked
friend VmAddressRegion;
// destructor, should only be invoked from RefPtr
virtual ~VmAddressRegionOrMapping();
friend fbl::RefPtr<VmAddressRegionOrMapping>;
bool in_subregion_tree() const {
return fbl::WAVLTreeContainable<fbl::RefPtr<VmAddressRegionOrMapping>>::InContainer();
}
enum class LifeCycleState {
// Initial state: if NOT_READY, then do not invoke Destroy() in the
// destructor
NOT_READY,
// Usual state: information is representative of the address space layout
ALIVE,
// Object is invalid
DEAD
};
VmAddressRegionOrMapping(vaddr_t base, size_t size, uint32_t flags, VmAspace* aspace,
VmAddressRegion* parent, bool is_mapping);
// Check if the given *arch_mmu_flags* are allowed under this
// region's *flags_*.
bool is_valid_mapping_flags(uint arch_mmu_flags);
// Returns true if the instance is alive and reporting information that
// reflects the address space layout. |aspace()->lock()| must be held.
bool IsAliveLocked() const;
virtual zx_status_t DestroyLocked() TA_REQ(lock()) = 0;
virtual size_t AllocatedPagesLocked() const TA_REQ(lock()) = 0;
// Transition from NOT_READY to ALIVE, and add references to self to related
// structures.
virtual void Activate() TA_REQ(lock()) = 0;
// current state of the VMAR. If LifeCycleState::DEAD, then all other
// fields are invalid.
LifeCycleState state_ = LifeCycleState::ALIVE;
// address/size within the container address space
vaddr_t base_;
size_t size_;
// flags from VMAR creation time
const uint32_t flags_;
// pointer back to our member address space. The aspace's lock is used
// to serialize all modifications.
const fbl::RefPtr<VmAspace> aspace_;
// pointer back to our parent region (nullptr if root or destroyed)
VmAddressRegion* parent_;
};
// A list of regions ordered by virtual address. Templated to allow for test code to avoid needing
// to instantiate 'real' VmAddressRegionOrMapping instances.
template <typename T = VmAddressRegionOrMapping>
class RegionList final {
public:
using ChildList = fbl::WAVLTree<vaddr_t, fbl::RefPtr<T>>;
// Remove *region* from the list and return the removed region.
fbl::RefPtr<T> RemoveRegion(T* region) { return regions_.erase(*region); }
// Request the region to the left or right of the given region.
typename ChildList::iterator LeftOf(T* region) { return --regions_.make_iterator(*region); }
typename ChildList::iterator RightOf(T* region) { return ++regions_.make_iterator(*region); }
// Insert *region* into the region list.
void InsertRegion(fbl::RefPtr<T> region) { regions_.insert(region); }
// Find the region that covers addr, returns nullptr if not found.
T* FindRegion(vaddr_t addr) const {
// Find the first region with a base greater than *addr*. If a region
// containing *addr* exists, it will be immediately before that one.
auto itr = --regions_.upper_bound(addr);
if (!itr.IsValid()) {
return nullptr;
}
// Subregion sizes are never zero, except transiently during unmapping, which can never
// overlap with this operation.
DEBUG_ASSERT(itr->size() > 0);
vaddr_t region_end;
bool overflowed = add_overflow(itr->base(), itr->size() - 1, &region_end);
ASSERT(!overflowed);
if (itr->base() > addr || addr > region_end) {
return nullptr;
}
return &*itr.CopyPointer();
}
// Find the region that contains |base|, or if that doesn't exist, the first region that contains
// an address greater than |base|.
typename ChildList::iterator IncludeOrHigher(vaddr_t base) {
// Find the first region with a base greater than *base*. If a region
// containing *base* exists, it will be immediately before that one.
auto itr = regions_.upper_bound(base);
itr--;
if (!itr.IsValid()) {
itr = regions_.begin();
} else if (base >= itr->base() && base - itr->base() >= itr->size()) {
// If *base* isn't in this region, ignore it.
++itr;
}
return itr;
}
typename ChildList::iterator UpperBound(vaddr_t base) { return regions_.upper_bound(base); }
// Check whether it would be valid to create a child in the range [base, base+size).
bool IsRangeAvailable(vaddr_t base, size_t size) const {
DEBUG_ASSERT(size > 0);
// Find the first region with base > *base*. Since regions_ has no
// overlapping elements, we just need to check this one and the prior
// child.
auto prev = regions_.upper_bound(base);
auto next = prev--;
if (prev.IsValid()) {
vaddr_t prev_last_byte;
if (add_overflow(prev->base(), prev->size() - 1, &prev_last_byte)) {
return false;
}
if (prev_last_byte >= base) {
return false;
}
}
if (next.IsValid() && next != regions_.end()) {
vaddr_t last_byte;
if (add_overflow(base, size - 1, &last_byte)) {
return false;
}
if (next->base() <= last_byte) {
return false;
}
}
return true;
}
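// Illustrative example: with existing regions [0x1000, 0x2000) and [0x4000, 0x5000),
// IsRangeAvailable(0x2000, 0x2000) is true because the gap [0x2000, 0x4000) exactly fits the
// request, while IsRangeAvailable(0x1800, 0x100) is false because the prior region's last byte
// (0x1fff) is >= 0x1800.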
// Get the allocation spot that is free and large enough for the aligned size.
zx_status_t GetAllocSpot(vaddr_t* alloc_spot, uint8_t align_pow2, uint8_t entropy, size_t size,
vaddr_t parent_base, size_t parent_size, crypto::Prng* prng,
vaddr_t upper_limit = ktl::numeric_limits<vaddr_t>::max()) const {
DEBUG_ASSERT(entropy < sizeof(size_t) * 8);
const vaddr_t align = 1UL << align_pow2;
// This is the maximum number of spaces we need to consider based on our desired entropy.
const size_t max_candidate_spaces = 1ul << entropy;
vaddr_t selected_index = 0;
if (prng != nullptr) {
// We first pick an index in [0, max_candidate_spaces) and hope that a spot with that index
// exists. If the number of available spots is less than |selected_index|,
// alloc_spot_info.found would be false. This means that |selected_index| was too large, and we
// have to pick again from a smaller range and try again.
//
// Note that this is mathematically equivalent to randomly picking a spot within
// [0, candidate_spot_count) whenever selected_index < candidate_spot_count.
//
// Proof is as follows:
// Define M = candidate_spot_count
// Define N = max_candidate_spaces (assume M < N, otherwise we can randomly allocate any spot
// from [0, max_candidate_spaces), and allocating a specific slot has probability 1 / N).
// Define slot X0 where X0 belongs to [1, M].
// Define event A: randomly pick a slot X in [1, N], with X = X0.
// Define event B: randomly pick a slot X in [1, N], with X belonging to [1, M].
// Define event C: randomly pick a slot X in [1, N], with X = X0 given that X belongs to
// [1, M].
// P(C) = P(A | B)
// Since A implies B, P(AB) = P(A), so
// P(C) = P(A) / P(B) = (1 / N) / (M / N) = 1 / M
// which is equal to the probability of picking a specific spot in [1, M] uniformly.
selected_index = prng->RandInt(max_candidate_spaces);
}
AllocSpotInfo alloc_spot_info;
FindAllocSpotInGaps(size, align_pow2, selected_index, parent_base, parent_size,
&alloc_spot_info, upper_limit);
size_t candidate_spot_count = alloc_spot_info.candidate_spot_count;
if (candidate_spot_count == 0) {
DEBUG_ASSERT(!alloc_spot_info.found);
return ZX_ERR_NO_MEMORY;
}
if (!alloc_spot_info.found) {
if (candidate_spot_count > max_candidate_spaces) {
candidate_spot_count = max_candidate_spaces;
}
// The number of candidate spots is smaller than the index we picked, so pick again, this
// time restricted to the range of available spots.
DEBUG_ASSERT(prng);
selected_index = prng->RandInt(candidate_spot_count);
FindAllocSpotInGaps(size, align_pow2, selected_index, parent_base, parent_size,
&alloc_spot_info, upper_limit);
}
DEBUG_ASSERT(alloc_spot_info.found);
*alloc_spot = alloc_spot_info.alloc_spot;
ASSERT(IS_ALIGNED(*alloc_spot, align));
return ZX_OK;
}
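// A minimal usage sketch (hypothetical caller-side values): an allocator wanting an 8KiB,
// page-aligned spot with 8 bits of ASLR entropy could call:
//
//   vaddr_t spot;
//   zx_status_t status = regions.GetAllocSpot(&spot, /*align_pow2=*/12, /*entropy=*/8,
//                                             /*size=*/0x2000, parent_base, parent_size, prng);
//
// With a null |prng| the first suitable gap is chosen deterministically, as selected_index
// stays 0.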
// Utility for allocators for iterating over gaps between allocations.
// F should have a signature of bool func(vaddr_t gap_base, size_t gap_size).
// If func returns false, the iteration stops. gap_base will be aligned in accordance with
// align_pow2.
template <typename F>
void ForEachGap(F func, uint8_t align_pow2, vaddr_t parent_base, size_t parent_size) const {
const vaddr_t align = 1UL << align_pow2;
// Scan the regions list to find the gap to the left of each region. We
// round up the end of the previous region to the requested alignment, so
// all gaps reported will be for aligned ranges.
vaddr_t prev_region_end = ROUNDUP(parent_base, align);
for (const auto& region : regions_) {
if (region.base() > prev_region_end) {
const size_t gap = region.base() - prev_region_end;
if (!func(prev_region_end, gap)) {
return;
}
}
if (add_overflow(region.base(), region.size(), &prev_region_end)) {
// Overflow means this region extends to the end of the address space, so there is no
// gap after it.
return;
}
prev_region_end = ROUNDUP(prev_region_end, align);
}
// Grab the gap to the right of the last region (note that if there are no
// regions, this handles reporting the VMAR's whole span as a gap).
if (parent_size > prev_region_end - parent_base) {
// This is equal to parent_base + parent_size - prev_region_end, but guarantee no overflow.
const size_t gap = parent_size - (prev_region_end - parent_base);
func(prev_region_end, gap);
}
}
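// For example (a sketch, not used elsewhere in this header), counting the gaps that could fit
// a page-aligned allocation of |alloc_size| bytes:
//
//   size_t usable_gaps = 0;
//   region_list.ForEachGap(
//       [&](vaddr_t gap_base, size_t gap_len) -> bool {
//         if (gap_len >= alloc_size) {
//           usable_gaps++;
//         }
//         return true;  // keep iterating over all gaps
//       },
//       /*align_pow2=*/12, parent_base, parent_size);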
// Returns whether the region list is empty.
bool IsEmpty() const { return regions_.is_empty(); }
// Returns a reference to the first region in the list.
T& front() { return regions_.front(); }
typename ChildList::iterator begin() { return regions_.begin(); }
typename ChildList::const_iterator begin() const { return regions_.begin(); }
typename ChildList::const_iterator cbegin() const { return regions_.cbegin(); }
typename ChildList::iterator end() { return regions_.end(); }
typename ChildList::const_iterator end() const { return regions_.end(); }
typename ChildList::const_iterator cend() const { return regions_.cend(); }
private:
// list of memory regions, indexed by base address.
ChildList regions_;
// A structure containing the allocated spot address and the number of available spots.
struct AllocSpotInfo {
// candidate_spot_count is the number of available spots that we could allocate if we have not
// found the spot with index |selected_index| to allocate.
size_t candidate_spot_count = 0;
// found indicates whether we have found the spot with index |selected_index|.
bool found = false;
// alloc_spot is the virtual start address of the spot to allocate if we find one.
vaddr_t alloc_spot = 0;
};
// Try to find the spot with index |selected_index| among all the gaps. If |selected_index| is
// larger than the total number of candidate spots, |alloc_spot_info| reports that total so the
// caller can pick a smaller index and try again.
void FindAllocSpotInGaps(size_t size, uint8_t align_pow2, vaddr_t selected_index,
vaddr_t parent_base, vaddr_t parent_size, AllocSpotInfo* alloc_spot_info,
vaddr_t upper_limit = ktl::numeric_limits<vaddr_t>::max()) const {
const vaddr_t align = 1UL << align_pow2;
// candidate_spot_count is the number of available spots that we could allocate if we have not
// found the spot with index |selected_index| to allocate.
size_t candidate_spot_count = 0;
// found indicates whether we have found the spot with index |selected_index|.
bool found = false;
// alloc_spot is the virtual start address of the spot to allocate if we find one.
vaddr_t alloc_spot = 0;
ForEachGap(
[align, align_pow2, size, upper_limit, &candidate_spot_count, &selected_index, &alloc_spot,
&found](vaddr_t gap_base, size_t gap_len) -> bool {
DEBUG_ASSERT(IS_ALIGNED(gap_base, align));
if (gap_len < size || gap_base + size > upper_limit) {
// Ignore gap that is too small or out of range.
return true;
}
const size_t clamped_len = ClampRange(gap_base, gap_len, upper_limit);
const size_t spots = AllocationSpotsInRange(clamped_len, size, align_pow2);
candidate_spot_count += spots;
if (selected_index < spots) {
// If we are able to find the spot with index |selected_index| in this gap, then we
// have found our pick.
found = true;
alloc_spot = gap_base + (selected_index << align_pow2);
return false;
}
selected_index -= spots;
return true;
},
align_pow2, parent_base, parent_size);
alloc_spot_info->found = found;
alloc_spot_info->alloc_spot = alloc_spot;
alloc_spot_info->candidate_spot_count = candidate_spot_count;
}
// Compute the number of allocation spots that satisfy the alignment within the
// given range size, for a range that has a base that satisfies the alignment.
static constexpr size_t AllocationSpotsInRange(size_t range_size, size_t alloc_size,
uint8_t align_pow2) {
return ((range_size - alloc_size) >> align_pow2) + 1;
}
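// For instance, with align_pow2 == 12 a 0x5000-byte aligned range can host a 0x1000-byte
// allocation at offsets 0x0000, 0x1000, 0x2000, 0x3000 and 0x4000:
// ((0x5000 - 0x1000) >> 12) + 1 == 5 spots.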
// Returns the size of the given range clamped to the given upper limit. The base
// of the range must be within the upper limit.
static constexpr size_t ClampRange(vaddr_t range_base, size_t range_size, vaddr_t upper_limit) {
DEBUG_ASSERT(range_base <= upper_limit);
const size_t range_limit = range_base + range_size;
return range_limit <= upper_limit ? range_size : range_size - (range_limit - upper_limit);
}
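// For example, ClampRange(0x1000, 0x4000, 0x3000) yields 0x2000, cutting the range
// [0x1000, 0x5000) back to [0x1000, 0x3000), while ClampRange(0x1000, 0x1000, 0x3000) returns
// 0x1000 unchanged since the range already ends at or below the limit.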
};
// A representation of a contiguous range of virtual address space
class VmAddressRegion final : public VmAddressRegionOrMapping {
public:
// Create a root region. This will span the entire aspace
static zx_status_t CreateRoot(VmAspace& aspace, uint32_t vmar_flags,
fbl::RefPtr<VmAddressRegion>* out);
// Create a subregion of this region
zx_status_t CreateSubVmar(size_t offset, size_t size, uint8_t align_pow2, uint32_t vmar_flags,
const char* name, fbl::RefPtr<VmAddressRegion>* out);
// Create a VmMapping within this region
zx_status_t CreateVmMapping(size_t mapping_offset, size_t size, uint8_t align_pow2,
uint32_t vmar_flags, fbl::RefPtr<VmObject> vmo, uint64_t vmo_offset,
uint arch_mmu_flags, const char* name, fbl::RefPtr<VmMapping>* out);
// Find the child region that contains the given addr. If addr is in a gap,
// returns nullptr. This is a non-recursive search.
fbl::RefPtr<VmAddressRegionOrMapping> FindRegion(vaddr_t addr);
enum class RangeOpType {
Commit,
Decommit,
MapRange,
DontNeed,
AlwaysNeed,
};
// Apply |op| to VMO mappings in the specified range of pages.
zx_status_t RangeOp(RangeOpType op, size_t offset, size_t len, user_inout_ptr<void> buffer,
size_t buffer_size);
// Helper function used by RangeOp. Returns ZX_ERR_SHOULD_WAIT if a page needs to be faulted in
// from a pager, indicating that the caller needs to wait on the |page_request|. If that happens,
// |next_offset| will contain the faulting offset, i.e. the offset that RangeOp should resume from
// after the wait.
zx_status_t RangeOpInternal(RangeOpType op, vaddr_t base, size_t size,
LazyPageRequest* page_request, vaddr_t* next_offset);
// Unmap a subset of the region of memory in the containing address space,
// returning it to this region for future allocation. If a subregion is entirely in
// the range, that subregion is destroyed. If a subregion is partially in
// the range, Unmap() will fail.
zx_status_t Unmap(vaddr_t base, size_t size);
// Same as Unmap, but allows for subregions that are partially in the range.
// Additionally, sub-VMARs that are completely within the range will not be
// destroyed.
zx_status_t UnmapAllowPartial(vaddr_t base, size_t size);
// Change protections on a subset of the region of memory in the containing
// address space. If the requested range overlaps with a subregion,
// Protect() will fail.
zx_status_t Protect(vaddr_t base, size_t size, uint new_arch_mmu_flags);
// Reserve a memory region within this VMAR. The region is already mapped in the page table
// with |arch_mmu_flags|, and the VMAR will create a VmMapping for it even though no physical
// pages need to be allocated for the region.
zx_status_t ReserveSpace(const char* name, size_t base, size_t size, uint arch_mmu_flags);
const char* name() const { return name_; }
bool has_parent() const;
void DumpLocked(uint depth, bool verbose) const TA_REQ(lock()) override;
zx_status_t PageFault(vaddr_t va, uint pf_flags, LazyPageRequest* page_request)
TA_REQ(lock()) override;
// Constructors are public only because LazyInit cannot use them otherwise, even if friended;
// they should be considered private, and Create...() should be used instead.
VmAddressRegion(VmAspace& aspace, vaddr_t base, size_t size, uint32_t vmar_flags);
VmAddressRegion(VmAddressRegion& parent, vaddr_t base, size_t size, uint32_t vmar_flags,
const char* name);
protected:
friend class VmAspace;
friend void vm_init_preheap_vmars();
friend lazy_init::Access;
// constructor for use in creating the kernel aspace singleton
explicit VmAddressRegion(VmAspace& kernel_aspace);
// Count the allocated pages, caller must be holding the aspace lock
size_t AllocatedPagesLocked() const TA_REQ(lock()) override;
// Used to implement VmAspace::EnumerateChildren.
// |aspace_->lock()| must be held.
bool EnumerateChildrenLocked(VmEnumerator* ve);
friend class VmMapping;
private:
DISALLOW_COPY_ASSIGN_AND_MOVE(VmAddressRegion);
fbl::Canary<fbl::magic("VMAR")> canary_;
zx_status_t DestroyLocked() TA_REQ(lock()) override;
void Activate() TA_REQ(lock()) override;
// Helper to share code between CreateSubVmar and CreateVmMapping
zx_status_t CreateSubVmarInternal(size_t offset, size_t size, uint8_t align_pow2,
uint32_t vmar_flags, fbl::RefPtr<VmObject> vmo,
uint64_t vmo_offset, uint arch_mmu_flags, const char* name,
fbl::RefPtr<VmAddressRegionOrMapping>* out);
// Create a new VmMapping within this region, overwriting any existing
// mappings that are in the way. If the range crosses a subregion, the call
// fails.
zx_status_t OverwriteVmMappingLocked(vaddr_t base, size_t size, uint32_t vmar_flags,
fbl::RefPtr<VmObject> vmo, uint64_t vmo_offset,
uint arch_mmu_flags,
fbl::RefPtr<VmAddressRegionOrMapping>* out) TA_REQ(lock());
// Implementation for Unmap() and OverwriteVmMapping() that expects the aspace
// lock to already be held. If |can_destroy_regions| is true, then this may destroy
// VMARs that it completely covers. If |allow_partial_vmar| is true, then
// this can handle the situation where only part of the VMAR is contained
// within the region and will not destroy any VMARs.
zx_status_t UnmapInternalLocked(vaddr_t base, size_t size, bool can_destroy_regions,
bool allow_partial_vmar) TA_REQ(lock());
// If an allocation of |region_size| bytes can fit in the gap between the given children, this
// returns the virtual base address of that allocation; otherwise ktl::nullopt is returned.
ktl::optional<vaddr_t> CheckGapLocked(VmAddressRegionOrMapping* prev,
VmAddressRegionOrMapping* next, vaddr_t search_base,
vaddr_t align, size_t region_size, size_t min_gap,
uint arch_mmu_flags) TA_REQ(lock());
// search for a spot to allocate for a region of a given size
zx_status_t AllocSpotLocked(size_t size, uint8_t align_pow2, uint arch_mmu_flags, vaddr_t* spot,
vaddr_t upper_limit = ktl::numeric_limits<vaddr_t>::max())
TA_REQ(lock());
template <typename ON_VMAR, typename ON_MAPPING>
bool EnumerateChildrenInternalLocked(vaddr_t min_addr, vaddr_t max_addr, ON_VMAR on_vmar,
ON_MAPPING on_mapping);
RegionList<VmAddressRegionOrMapping> subregions_ TA_GUARDED(lock());
const char name_[32] = {};
};
// Helper object for managing a WAVL tree of protection ranges inside a VmMapping. For efficiency
// this object does not duplicate the base_ and size_ of the mapping, and so these values must be
// passed into most methods as |mapping_base| and |mapping_size|.
// This object is thread-compatible.
// TODO: This object could be generalized into a dense range tracker as it is not really doing
// anything mapping specific.
class MappingProtectionRanges {
public:
explicit MappingProtectionRanges(uint arch_mmu_flags)
: first_region_arch_mmu_flags_(arch_mmu_flags) {}
MappingProtectionRanges(MappingProtectionRanges&&) = default;
~MappingProtectionRanges() = default;
// Helper struct for FlagsRangeAtAddr
struct FlagsRange {
uint mmu_flags;
uint64_t region_top;
};
// Returns both the flags for the specified vaddr, as well as the end of the range those flags are
// valid for.
FlagsRange FlagsRangeAtAddr(vaddr_t mapping_base, size_t mapping_size, vaddr_t vaddr) const {
if (protect_region_list_rest_.is_empty()) {
return FlagsRange{first_region_arch_mmu_flags_, mapping_base + mapping_size};
} else {
auto region = protect_region_list_rest_.upper_bound(vaddr);
const vaddr_t region_top =
region.IsValid() ? region->region_start : (mapping_base + mapping_size);
const uint mmu_flags = FlagsForPreviousRegion(region);
return FlagsRange{mmu_flags, region_top};
}
}
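// A caller can walk every protection range in a mapping with this method, for example (sketch;
// |ranges|, |base| and |size| are assumed to come from the owning mapping):
//
//   for (vaddr_t va = base; va < base + size;) {
//     auto [mmu_flags, region_top] = ranges.FlagsRangeAtAddr(base, size, va);
//     // ... use mmu_flags for the range [va, region_top) ...
//     va = region_top;
//   }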
// Updates the specified inclusive sub range to have the given flags. On error, state is
// unchanged. During the update, the provided callback is invoked for every old range and
// value that is being modified.
template <typename F>
zx_status_t UpdateProtectionRange(vaddr_t mapping_base, size_t mapping_size, vaddr_t base,
size_t size, uint new_arch_mmu_flags, F callback);
// Returns the precise mmu flags for the given vaddr. The vaddr is assumed to be within the range
// of this mapping.
uint MmuFlagsForRegion(vaddr_t vaddr) const {
// Check the common case here inline since it doesn't generate much code. The full lookup
// requires wavl tree traversal, and so we want to avoid inlining that.
if (protect_region_list_rest_.is_empty()) {
return first_region_arch_mmu_flags_;
}
return MmuFlagsForWavlRegion(vaddr);
}
// Enumerates any different protection ranges that exist inside this mapping. The virtual range
// specified by range_base and range_size must be within this mapping's base_ and size_. The
// provided callback is called in virtual address order for each protection type. ZX_ERR_NEXT
// and ZX_ERR_STOP can be used to control iteration, with any other status becoming the return
// value of this method.
zx_status_t EnumerateProtectionRanges(
vaddr_t mapping_base, size_t mapping_size, vaddr_t base, size_t size,
fbl::Function<zx_status_t(vaddr_t region_base, size_t region_size, uint mmu_flags)>&& func)
const;
// Merges the protection ranges of |right| into |this|, leaving |right| cleared, so that |this|
// contains the information of both ranges. It is an error to call this if |this| and |right|
// are not virtually contiguous.
zx_status_t MergeRightNeighbor(MappingProtectionRanges& right, vaddr_t merge_addr);
// Splits this protection range into two ranges around the specified split point. |this| becomes
// the left range and the right range is returned.
MappingProtectionRanges SplitAt(vaddr_t split);
// Discard any protection information below the given address.
void DiscardBelow(vaddr_t addr);
// Discard any protection information above the given address.
void DiscardAbove(vaddr_t addr);
// Returns whether all the protection nodes are within the given range. Intended for asserts.
bool DebugNodesWithinRange(vaddr_t mapping_base, size_t mapping_size);
// Clears all protection information and sets the size to 0.
void clear() { protect_region_list_rest_.clear(); }
// Flags for the first protection region.
uint FirstRegionMmuFlags() const { return first_region_arch_mmu_flags_; }
private:
// If a mapping is protected so that parts of it have different protection types, we need to
// track this information. The ProtectNode represents the additional metadata that we allocate
// to track it, and these nodes get placed in the protect_region_list_rest_.
struct ProtectNode : public fbl::WAVLTreeContainable<ktl::unique_ptr<ProtectNode>> {
ProtectNode(vaddr_t start, uint flags) : region_start(start), arch_mmu_flags(flags) {}
ProtectNode() = default;
~ProtectNode() = default;
vaddr_t GetKey() const { return region_start; }
// Defines the start of the region that the flags apply to. The end of the region is determined
// implicitly by either the next region in the tree, or the end of the mapping.
vaddr_t region_start = 0;
// The mapping flags (read/write/user/etc) for this region.
uint arch_mmu_flags = 0;
};
using RegionList = fbl::WAVLTree<vaddr_t, ktl::unique_ptr<ProtectNode>>;
// Internal helper that returns the flags for the region before the given node. Templated to work
// on both iterator and const_iterator.
template <typename T>
uint FlagsForPreviousRegion(T node) const {
node--;
return node.IsValid() ? node->arch_mmu_flags : first_region_arch_mmu_flags_;
}
// Counts how many nodes would need to be allocated for a protection range. This calculation
// is based on whether there are actually changes in the protection type that require a node
// to be added.
uint NodeAllocationsForRange(vaddr_t mapping_base, size_t mapping_size, vaddr_t base, size_t size,
RegionList::iterator removal_start, RegionList::iterator removal_end,
uint new_mmu_flags) const;
// Helper method for MmuFlagsForRegion that does the wavl tree lookup. Defined this way so
// that the common case can inline efficiently, and the wavl tree traversal can stay behind a
// function call.
uint MmuFlagsForWavlRegion(vaddr_t vaddr) const;
// To efficiently track the current protection/arch mmu flags of the mapping we want to avoid
// allocating ProtectNodes as much as possible. For this the following scheme is used:
// * first_region_arch_mmu_flags_ represents the mmu flags from the start of the mapping (that
// is, base_) up to the first node in the protect_region_list_rest_. Should
// protect_region_list_rest_ be empty, the region extends all the way to base_ + size_. This
// means that when a mapping is first created no nodes need to be allocated and inserted into
// protect_region_list_rest_; we can simply set first_region_arch_mmu_flags_ to the initial
// protection flags.
// * Should ::Protect need to 'split' a region, nodes are added to protect_region_list_rest_
// such that base_ and first_region_arch_mmu_flags_ always describe the first region, and the
// last region is implicitly ended by the end of the mapping.
// As we want to avoid having redundant nodes, the following invariants apply to
// protect_region_list_rest_:
// * No node has region_start == base_
// * No node has region_start == (base_ + size_ - 1)
// * The first node in the tree cannot have arch_mmu_flags == first_region_arch_mmu_flags_
// * No two adjacent nodes in the tree can have the same arch_mmu_flags.
// As an example, consider a mapping with base_ = 0x1000, size_ = 0x5000,
// first_region_arch_mmu_flags_ = READ and a single ProtectNode with region_start = 0x3000,
// arch_mmu_flags = READ_WRITE. This describes the regions
// 0x1000-0x3000: READ (start comes from base_, end comes from the start of the first node)
// 0x3000-0x6000: READ_WRITE (start comes from the node's start, end comes from the end of the
// mapping as there is no next node).
uint first_region_arch_mmu_flags_;
RegionList protect_region_list_rest_;
};
// A representation of the mapping of a VMO into the address space
class VmMapping final : public VmAddressRegionOrMapping,
public fbl::DoublyLinkedListable<VmMapping*> {
public:
// Accessors for VMO-mapping state
// These can be read under either lock (both locks must be held for writing), so we provide
// two different accessors, one for each lock.
uint arch_mmu_flags_locked(vaddr_t offset) const
TA_REQ(aspace_->lock()) TA_NO_THREAD_SAFETY_ANALYSIS {
return protection_ranges_.MmuFlagsForRegion(offset);
}
uint arch_mmu_flags_locked_object(vaddr_t offset) const
TA_REQ(object_->lock()) TA_NO_THREAD_SAFETY_ANALYSIS {
return protection_ranges_.MmuFlagsForRegion(offset);
}
uint64_t object_offset_locked() const TA_REQ(lock()) TA_NO_THREAD_SAFETY_ANALYSIS {
return object_offset_;
}
uint64_t object_offset_locked_object() const
TA_REQ(object_->lock()) TA_NO_THREAD_SAFETY_ANALYSIS {
return object_offset_;
}
// Intended to be used from VmEnumerator callbacks where the aspace_->lock() will be held.
fbl::RefPtr<VmObject> vmo_locked() const TA_REQ(lock()) { return object_; }
fbl::RefPtr<VmObject> vmo() const TA_EXCL(lock());
// Convenience wrapper for vmo()->DecommitRange() with the necessary
// offset modification and locking.
zx_status_t DecommitRange(size_t offset, size_t len) TA_EXCL(lock());
// Map in pages from the underlying vm object, optionally committing pages as it goes
zx_status_t MapRange(size_t offset, size_t len, bool commit) TA_EXCL(lock());
zx_status_t MapRangeLocked(size_t offset, size_t len, bool commit) TA_REQ(lock());
// Unmap a subset of the region of memory in the containing address space,
// returning it to the parent region for future allocation. If all of the memory is unmapped,
// Destroy()s this mapping. If a subrange of the mapping is specified, the
// mapping may be split.
zx_status_t Unmap(vaddr_t base, size_t size);
// Change access permissions for this mapping. It is an error to specify a
// caching mode in the flags; the mapping keeps the caching mode it was
// created with. If a subrange of the mapping is specified, the
// mapping may be split.
zx_status_t Protect(vaddr_t base, size_t size, uint new_arch_mmu_flags);
void DumpLocked(uint depth, bool verbose) const TA_REQ(lock()) override;
zx_status_t PageFault(vaddr_t va, uint pf_flags, LazyPageRequest* page_request)
TA_REQ(lock()) override;
// The same as PageFault with an optional |vmo_locked_callback| that needs to be called after
// looking up the page while the VMO lock is held.
zx_status_t PageFaultWithVmoCallback(
vaddr_t va, uint pf_flags, LazyPageRequest* page_request,
ktl::optional<fbl::Function<void(VmObject* vmo_locked, vm_page_t* page)>> vmo_locked_callback)
TA_REQ(lock());
// APIs intended for use by VmObject
Lock<Mutex>* object_lock() TA_RET_CAP(object_->lock()) { return object_->lock(); }
// Unmap any pages that map the passed in vmo range from the arch aspace.
// The given range need not intersect this mapping's range.
void AspaceUnmapVmoRangeLocked(uint64_t offset, uint64_t len) const TA_REQ(object_->lock());
// Removes any writeable mappings for the passed in vmo range from the arch aspace.
// May fall back to unmapping pages from the arch aspace if necessary.
void AspaceRemoveWriteVmoRangeLocked(uint64_t offset, uint64_t len) const TA_REQ(object_->lock());
// Marks this mapping as a candidate for merging, and immediately attempts to merge with any
// neighboring mappings. Making a mapping mergeable indicates that this specific VmMapping
// instance will no longer be used to refer to the referenced region, and that the region will
// be accessed via the parent vmar in the future, so the region merely needs to remain valid
// through some VmMapping.
// To enforce this, the function requires you to hand in your last remaining RefPtr to the
// mapping.
static void MarkMergeable(fbl::RefPtr<VmMapping>&& mapping);
// Used to cache the page attribution count for this vmo range. Also tracks the vmo hierarchy
// generation count and the mapping generation count at the time of caching the attributed page
// count.
struct CachedPageAttribution {
uint64_t mapping_generation_count = 0;
uint64_t vmo_generation_count = 0;
size_t page_count = 0;
};
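// A sketch of how this cache is intended to be consulted (assumed logic; the implementation is
// authoritative): if both generation counts match the current counts, the cached page_count
// can be returned without re-walking the VMO tree; otherwise the count is recomputed and the
// cache refreshed along with the current generation counts.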
// Exposed for testing.
CachedPageAttribution GetCachedPageAttribution() {
Guard<Mutex> guard{aspace_->lock()};
return cached_page_attribution_;
}
// Exposed for testing.
uint64_t GetMappingGenerationCount() {
Guard<Mutex> guard{aspace_->lock()};
return GetMappingGenerationCountLocked();
}
// Calls MarkAsLatencySensitive on the object_.
// Exposed so that the parent aspace can call this.
void MarkObjectAsLatencySensitiveLocked() const TA_REQ(lock()) {
if (object_) {
object_->MarkAsLatencySensitive();
}
}
// Enumerates any different protection ranges that exist inside this mapping. The virtual range
// specified by range_base and range_size must be within this mapping's base_ and size_. The
// provided callback is called in virtual address order for each protection type. ZX_ERR_NEXT
// and ZX_ERR_STOP can be used to control iteration, with any other status becoming the return
// value of this method.
zx_status_t EnumerateProtectionRangesLocked(
vaddr_t base, size_t size,
fbl::Function<zx_status_t(vaddr_t region_base, size_t region_len, uint mmu_flags)>&& func)
const TA_REQ(aspace_->lock()) __TA_NO_THREAD_SAFETY_ANALYSIS {
DEBUG_ASSERT(is_in_range(base, size));
return ProtectRangesLocked().EnumerateProtectionRanges(base_, size_, base, size,
ktl::move(func));
}
protected:
~VmMapping() override;
friend fbl::RefPtr<VmMapping>;
private:
DISALLOW_COPY_ASSIGN_AND_MOVE(VmMapping);
fbl::Canary<fbl::magic("VMAP")> canary_;
enum class Mergeable : bool { YES = true, NO = false };
// allow VmAddressRegion to manipulate VmMapping internals for construction
// and bookkeeping
friend class VmAddressRegion;
// private constructors, use VmAddressRegion::Create...() instead
VmMapping(VmAddressRegion& parent, vaddr_t base, size_t size, uint32_t vmar_flags,
fbl::RefPtr<VmObject> vmo, uint64_t vmo_offset, uint arch_mmu_flags,
Mergeable mergeable);
VmMapping(VmAddressRegion& parent, vaddr_t base, size_t size, uint32_t vmar_flags,
fbl::RefPtr<VmObject> vmo, uint64_t vmo_offset, MappingProtectionRanges&& ranges,
Mergeable mergeable);
zx_status_t DestroyLocked() TA_REQ(lock()) override;
// Implementation for Unmap(). This supports partial unmapping.
zx_status_t UnmapLocked(vaddr_t base, size_t size) TA_REQ(lock());
// Implementation for Protect().
zx_status_t ProtectLocked(vaddr_t base, size_t size, uint new_arch_mmu_flags) TA_REQ(lock());
// Helper for protect and unmap.
static zx_status_t ProtectOrUnmap(const fbl::RefPtr<VmAspace>& aspace, vaddr_t base, size_t size,
uint new_arch_mmu_flags);
size_t AllocatedPagesLocked() const TA_REQ(lock()) override;
void Activate() TA_REQ(lock()) override;
void ActivateLocked() TA_REQ(lock()) TA_REQ(object_->lock());
// Takes a range relative to the vmo object_ and converts it into a virtual address range
// relative to aspace_. Returns true if a non-zero-sized intersection was found, false
// otherwise. If false is returned, |base| and |virtual_len| hold undefined contents.
bool ObjectRangeToVaddrRange(uint64_t offset, uint64_t len, vaddr_t* base,
uint64_t* virtual_len) const TA_REQ(object_->lock());
// Attempts to merge this mapping with any neighbors. It is the responsibility of the caller to
// ensure a refptr to this is being held, as on return |this| may be in the dead state and have
// removed itself from the hierarchy, dropping a refptr.
void TryMergeNeighborsLocked() TA_REQ(lock());
// Attempts to merge the given mapping into this one. This only succeeds if the candidate is
// placed just after |this|, both in the aspace and the vmo. See implementation for the full
// requirements for merging to succeed.
// The candidate must be held as a RefPtr by the caller so that this function does not trigger
// any VmMapping destructor by dropping the last reference when removing from the parent vmar.
void TryMergeRightNeighborLocked(VmMapping* right_candidate) TA_REQ(lock());
// This should be called whenever a change is made to the vmo range we are mapping that could
// result in the page attribution count of that range changing.
void IncrementMappingGenerationCountLocked() TA_REQ(lock()) {
DEBUG_ASSERT(mapping_generation_count_ != 0);
mapping_generation_count_++;
}
// Get the current generation count.
uint64_t GetMappingGenerationCountLocked() const TA_REQ(lock()) {
DEBUG_ASSERT(mapping_generation_count_ != 0);
return mapping_generation_count_;
}
// Helper function that updates the |size_| to |new_size| and also increments the mapping
// generation count. Requires both the aspace lock and the object lock to be held, since |size_|
// can be read under either of those locks.
void set_size_locked(size_t new_size) TA_REQ(lock()) TA_REQ(object_->lock()) {
// Check that if we have additional protection regions that they have already been constrained
// to the range of the new size.
DEBUG_ASSERT(protection_ranges_.DebugNodesWithinRange(base_, new_size));
size_ = new_size;
IncrementMappingGenerationCountLocked();
}
// pointer and region of the object we are mapping
fbl::RefPtr<VmObject> object_;
// This can be read with either lock held, but requires both locks to write it.
uint64_t object_offset_ TA_GUARDED(object_->lock()) TA_GUARDED(aspace_->lock()) = 0;
// This can be read with either lock held, but requires both locks to write it.
MappingProtectionRanges protection_ranges_ TA_GUARDED(object_->lock())
TA_GUARDED(aspace_->lock());
// Helpers for gaining read access to the protection information when only one of the locks is
// held.
const MappingProtectionRanges& ProtectRangesLocked() const
TA_REQ(aspace_->lock()) __TA_NO_THREAD_SAFETY_ANALYSIS {
return protection_ranges_;
}
const MappingProtectionRanges& ProtectRangesLockedObject() const
TA_REQ(object_->lock()) __TA_NO_THREAD_SAFETY_ANALYSIS {
return protection_ranges_;
}
// used to detect recursions through the vmo fault path
bool currently_faulting_ TA_GUARDED(object_->lock()) = false;
// Whether this mapping may be merged with other adjacent mappings. A mergeable mapping is just a
// region that can be represented by any VmMapping object, not specifically this one.
Mergeable mergeable_ TA_GUARDED(lock()) = Mergeable::NO;
// Tracks the last cached page attribution count for the vmo range we are mapping.
// Only used when |object_| is a VmObjectPaged.
mutable CachedPageAttribution cached_page_attribution_ TA_GUARDED(aspace_->lock()) = {};
// The mapping's generation count is incremented on any change to the vmo range that is mapped.
//
// This is used to implement caching for page attribution counts, which get queried frequently to
// periodically track memory usage on the system. Attributing pages to a VMO is an expensive
// operation and involves walking the VMO tree, quite often multiple times. If the generation
// counts for the vmo *and* the mapping do not change between two successive queries, we can avoid
// re-counting attributed pages, and simply return the previously cached value.
//
// The generation count starts at 1 to ensure that there can be no cached values initially; the
// cached generation count starts at 0.
uint64_t mapping_generation_count_ TA_GUARDED(aspace_->lock()) = 1;
};
// Interface for walking a VmAspace-rooted VmAddressRegion/VmMapping tree.
// Override this class and pass an instance to VmAspace::EnumerateChildren().
class VmEnumerator {
public:
// VmAspace::EnumerateChildren() will call the On* methods in depth-first
// pre-order. If any call returns false, the traversal will stop. The root
// VmAspace's lock will be held during the entire traversal.
// |depth| will be 0 for the root VmAddressRegion.
virtual bool OnVmAddressRegion(const VmAddressRegion* vmar, uint depth) TA_REQ(vmar->lock()) {
return true;
}
// |vmar| is the parent of |map|. The root VmAspace's lock will be held when this is called.
virtual bool OnVmMapping(const VmMapping* map, const VmAddressRegion* vmar, uint depth)
TA_REQ(map->lock()) TA_REQ(vmar->lock()) {
return true;
}
protected:
VmEnumerator() = default;
~VmEnumerator() = default;
};
#endif // ZIRCON_KERNEL_VM_INCLUDE_VM_VM_ADDRESS_REGION_H_