// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_VM_INCLUDE_VM_VM_PAGE_LIST_H_
#define ZIRCON_KERNEL_VM_INCLUDE_VM_VM_PAGE_LIST_H_
#include <align.h>
#include <bits.h>
#include <lib/fit/function.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <fbl/canary.h>
#include <fbl/intrusive_wavl_tree.h>
#include <fbl/macros.h>
#include <ktl/algorithm.h>
#include <ktl/unique_ptr.h>
#include <vm/page.h>
#include <vm/pmm.h>
#include <vm/vm.h>
class VmPageList;
class VMPLCursor;
// RAII helper for representing content in a page list node. This supports being in one of five
// states
// * Empty - Contains nothing
// * Page p - Contains a vm_page 'p'. This 'p' is considered owned by this wrapper and
// `ReleasePage` must be called to give up ownership.
// * Reference r - Contains a reference 'r' to some content. This 'r' is considered owned by this
// wrapper and `ReleaseReference` must be called to give up ownership.
// * Marker - Indicates that whilst not a page, it is also not empty. Markers can be used to
// preserve the distinction between "there's no page because we've deduped to the
// zero page" and "there's no page because our parent contains the content".
// * Interval - Indicates that this offset is part of a sparse page interval. An interval will
// have a Start sentinel, and an End sentinel, and all offsets that lie between the
// two will be empty. If the interval spans a single page, it will be represented
// as a Slot sentinel, which is conceptually the same as both a Start and an End
// sentinel.
//
// There are certain invariants that the page list tries to maintain at all times. It might not
// always be possible to enforce these as the checks involved might be expensive, however it is
// important that any code that manipulates the page list abide by them, primarily to keep the
// memory occupied by the page list nodes in check.
// 1. Page list nodes cannot be completely empty, i.e. they must contain at least one non-empty slot.
// 2. Any intervals in the page list should span a maximal range. In other words, there should not
// be consecutive intervals in the page list which it would have been possible to represent with a
// single interval instead.
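//
// A minimal illustrative sketch of the ownership rules above (not taken from a real call site;
// `page` is assumed to be a vm_page_t* the caller already owns):
//
//   VmPageOrMarker slot = VmPageOrMarker::Page(page);  // `slot` now owns `page`.
//   DEBUG_ASSERT(slot.IsPage());
//   vm_page_t* released = slot.ReleasePage();          // Ownership handed back to the caller.
//   DEBUG_ASSERT(slot.IsEmpty());
//   slot = VmPageOrMarker::Marker();                   // Markers own nothing; no release needed.
//   DEBUG_ASSERT(slot.IsMarker() && !slot.IsPageOrRef());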
class VmPageOrMarker {
public:
// A PageType that otherwise holds a null pointer is considered to be Empty.
VmPageOrMarker() : raw_(kPageType) {}
~VmPageOrMarker() { DEBUG_ASSERT(!IsPageOrRef()); }
VmPageOrMarker(VmPageOrMarker&& other) noexcept : raw_(other.Release()) {}
VmPageOrMarker(const VmPageOrMarker&) = delete;
VmPageOrMarker& operator=(const VmPageOrMarker&) = delete;
// Minimal wrapper around a uint64_t to provide stronger typing in code to prevent accidental
// mixing of references and other uint64_t values.
// Provides a way to query the required alignment of the references and does debug enforcement of
// this.
class ReferenceValue {
public:
// kAlignBits represents the number of low bits in a reference that must be zero so they can be
// used for internal metadata. This is declared here for convenience, and is asserted to be in
// sync with the private kReferenceBits.
static constexpr uint64_t kAlignBits = 4;
explicit constexpr ReferenceValue(uint64_t raw) : value_(raw) {
DEBUG_ASSERT((value_ & BIT_MASK(kAlignBits)) == 0);
}
uint64_t value() const { return value_; }
private:
uint64_t value_;
};
// Returns the underlying vm_page*. Only valid to call if `IsPage` is true.
vm_page* Page() const {
DEBUG_ASSERT(IsPage());
// Do not need to mask any bits out of raw_, since Page has 0's for the type anyway.
static_assert(kPageType == 0);
return reinterpret_cast<vm_page*>(raw_);
}
ReferenceValue Reference() const {
DEBUG_ASSERT(IsReference());
return ReferenceValue(raw_ & ~BIT_MASK(kReferenceBits));
}
// If this is a page, moves the underlying vm_page* out and returns it. After this IsPage will
// be false and IsEmpty will be true.
[[nodiscard]] vm_page* ReleasePage() {
DEBUG_ASSERT(IsPage());
// Do not need to mask any bits out of the Release since Page has 0's for the type
// anyway.
static_assert(kPageType == 0);
return reinterpret_cast<vm_page*>(Release());
}
[[nodiscard]] ReferenceValue ReleaseReference() {
DEBUG_ASSERT(IsReference());
return ReferenceValue(Release() & ~BIT_MASK(kReferenceBits));
}
// Convenience wrappers for getting and setting split bits on both pages and references.
bool PageOrRefLeftSplit() const {
DEBUG_ASSERT(IsPageOrRef());
if (IsPage()) {
return Page()->object.cow_left_split;
}
return raw_ & kReferenceLeftSplit;
}
bool PageOrRefRightSplit() const {
DEBUG_ASSERT(IsPageOrRef());
if (IsPage()) {
return Page()->object.cow_right_split;
}
return raw_ & kReferenceRightSplit;
}
void SetPageOrRefLeftSplit(bool value) {
DEBUG_ASSERT(IsPageOrRef());
if (IsPage()) {
Page()->object.cow_left_split = value;
} else {
if (value) {
raw_ |= kReferenceLeftSplit;
} else {
raw_ &= ~kReferenceLeftSplit;
}
}
}
void SetPageOrRefRightSplit(bool value) {
DEBUG_ASSERT(IsPageOrRef());
if (IsPage()) {
Page()->object.cow_right_split = value;
} else {
if (value) {
raw_ |= kReferenceRightSplit;
} else {
raw_ &= ~kReferenceRightSplit;
}
}
}
// Changes the content from a reference to a page, moving over the split bits and returning the
// original reference.
[[nodiscard]] VmPageOrMarker::ReferenceValue SwapReferenceForPage(vm_page_t* p) {
DEBUG_ASSERT(p);
// Ensure no split bits were already set.
DEBUG_ASSERT(p->object.cow_left_split == 0);
DEBUG_ASSERT(p->object.cow_right_split == 0);
p->object.cow_left_split = PageOrRefLeftSplit();
p->object.cow_right_split = PageOrRefRightSplit();
VmPageOrMarker::ReferenceValue ref = ReleaseReference();
*this = VmPageOrMarker::Page(p);
// The ReferenceValue that we return, unlike a page, has no split bit information and so at this
// point the bits have been fully moved.
return ref;
}
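// An illustrative sketch of using the helper above, e.g. when swapping a compressed reference
// back for a freshly allocated page (`slot` and `page` are hypothetical; not a real call site):
//
//   // `slot` currently holds a reference.
//   VmPageOrMarker::ReferenceValue old_ref = slot.SwapReferenceForPage(page);
//   // `slot` now owns `page` (with the split bits carried over), and `old_ref` must be returned
//   // to whatever backing store produced it, since this wrapper no longer owns it.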
// Changes the content from a page to a reference, moving over the split bits and returning the
// original page.
[[nodiscard]] vm_page_t* SwapPageForReference(VmPageOrMarker::ReferenceValue ref) {
const bool left_split = PageOrRefLeftSplit();
const bool right_split = PageOrRefRightSplit();
vm_page_t* page = ReleasePage();
// Clear the page split bits before returning it, as these are moved to the reference.
page->object.cow_left_split = 0;
page->object.cow_right_split = 0;
*this = VmPageOrMarker::Reference(ref, left_split, right_split);
return page;
}
// Changes the content from one reference to a different one, moving over the split bits and
// returning the original reference.
[[nodiscard]] VmPageOrMarker::ReferenceValue ChangeReferenceValue(
VmPageOrMarker::ReferenceValue ref) {
const bool left_split = PageOrRefLeftSplit();
const bool right_split = PageOrRefRightSplit();
const VmPageOrMarker::ReferenceValue old = ReleaseReference();
*this = VmPageOrMarker::Reference(ref, left_split, right_split);
// The ReferenceValue that we return, unlike a page, has no split bit information and so at this
// point the bits have been fully moved.
return old;
}
bool IsPage() const { return !IsEmpty() && (GetType() == kPageType); }
bool IsMarker() const { return GetType() == kZeroMarkerType; }
bool IsEmpty() const {
// A PageType that otherwise holds a null pointer is considered to be Empty.
return raw_ == kPageType;
}
bool IsReference() const { return GetType() == kReferenceType; }
bool IsPageOrRef() const { return IsPage() || IsReference(); }
bool IsInterval() const { return GetType() == kIntervalType; }
VmPageOrMarker& operator=(VmPageOrMarker&& other) noexcept {
// Forbid overriding content, as that would leak it.
DEBUG_ASSERT(!IsPageOrRef());
raw_ = other.Release();
return *this;
}
bool operator==(const VmPageOrMarker& other) const { return raw_ == other.raw_; }
bool operator!=(const VmPageOrMarker& other) const { return raw_ != other.raw_; }
// A PageType that otherwise holds a null pointer is considered to be Empty.
static VmPageOrMarker Empty() { return VmPageOrMarker{kPageType}; }
static VmPageOrMarker Marker() { return VmPageOrMarker{kZeroMarkerType}; }
[[nodiscard]] static VmPageOrMarker Page(vm_page* p) {
// A null page is incorrect for two reasons
// 1. It's a violation of the API of this method
// 2. A null page cannot be represented internally as this is used to represent Empty
DEBUG_ASSERT(p);
const uint64_t raw = reinterpret_cast<uint64_t>(p);
// A pointer should be aligned by definition, and hence the low bits should always be zero, but
// assert this anyway just in case kTypeBits is increased or someone passed an invalid pointer.
DEBUG_ASSERT((raw & BIT_MASK(kTypeBits)) == 0);
return VmPageOrMarker{raw | kPageType};
}
[[nodiscard]] static VmPageOrMarker Reference(ReferenceValue ref, bool left_split,
bool right_split) {
return VmPageOrMarker(ref.value() | (left_split ? kReferenceLeftSplit : 0) |
(right_split ? kReferenceRightSplit : 0) | kReferenceType);
}
// The types of sparse page intervals that are supported.
enum class IntervalType : uint64_t {
// Represents a range of zero pages.
Zero = 0,
NumTypes,
};
// Sentinel types that are used to represent a sparse page interval.
enum class IntervalSentinel : uint64_t {
// Represents a single page interval.
Slot = 0,
// The first page of a multi-page interval.
Start,
// The last page of a multi-page interval.
End,
NumSentinels,
};
// The remaining bits of an interval type store any information specific to the type of interval
// being tracked. The ZeroRange class is defined here to group together the encoding of these bits
// specific to IntervalType::Zero.
class ZeroRange {
public:
// This is the same as kIntervalBits. Equality is asserted later where kIntervalBits is defined.
static constexpr uint64_t kAlignBits = 6;
explicit constexpr ZeroRange(uint64_t val) : value_(val) {
DEBUG_ASSERT((value_ & BIT_MASK(kAlignBits)) == 0);
}
// The various dirty states that a zero interval can be in. Refer to VmCowPages::DirtyState for
// an explanation of the states. Note that an AwaitingClean state is not encoded in the interval
// state bits. This information is instead stored using the AwaitingCleanLength for convenience,
// where a non-zero length indicates that the interval is AwaitingClean. Doing this affords
// more convenient splitting and merging of intervals.
enum class DirtyState : uint64_t {
Untracked = 0,
Clean,
Dirty,
NumStates,
};
ZeroRange(uint64_t val, DirtyState state) : value_(val) {
DEBUG_ASSERT((value_ & BIT_MASK(kAlignBits)) == 0);
DEBUG_ASSERT(GetDirtyState() == DirtyState::Untracked);
SetDirtyState(state);
}
uint64_t value() const { return value_; }
// For zero range tracking, we also need to track dirty state information, and if the interval
// is AwaitingClean, the length that is AwaitingClean.
static constexpr uint64_t kDirtyStateBits = VM_PAGE_OBJECT_DIRTY_STATE_BITS;
static_assert(static_cast<uint64_t>(DirtyState::NumStates) <= (1 << kDirtyStateBits));
static constexpr uint64_t kDirtyStateShift = kAlignBits;
DirtyState GetDirtyState() const {
return static_cast<DirtyState>((value_ & (BIT_MASK(kDirtyStateBits) << kDirtyStateShift)) >>
kDirtyStateShift);
}
void SetDirtyState(DirtyState state) {
// Only allow dirty zero ranges for now.
DEBUG_ASSERT(state == DirtyState::Dirty);
// Clear the old state.
value_ &= ~(BIT_MASK(kDirtyStateBits) << kDirtyStateShift);
// Set the new state.
value_ |= static_cast<uint64_t>(state) << kDirtyStateShift;
}
// The AwaitingCleanLength will always be a page-aligned length, so we can mask out the low
// PAGE_SIZE_SHIFT bits and store only the upper bits.
static constexpr uint64_t kAwaitingCleanLengthShift = PAGE_SIZE_SHIFT;
// Assert that we are not overlapping with the dirty state bits.
static_assert(kAwaitingCleanLengthShift >= kDirtyStateShift + kDirtyStateBits);
void SetAwaitingCleanLength(uint64_t len) {
DEBUG_ASSERT(GetDirtyState() == DirtyState::Dirty);
DEBUG_ASSERT(IS_ALIGNED(len, (1 << kAwaitingCleanLengthShift)));
// Clear the old value.
value_ &= BIT_MASK(kAwaitingCleanLengthShift);
// Set the new value.
value_ |= (len & ~BIT_MASK(kAwaitingCleanLengthShift));
}
uint64_t GetAwaitingCleanLength() const {
return value_ & ~BIT_MASK(kAwaitingCleanLengthShift);
}
private:
uint64_t value_;
};
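// Illustrative sketch of the ZeroRange encoding helpers above (the length used is arbitrary):
//
//   ZeroRange range(0, ZeroRange::DirtyState::Dirty);
//   range.SetAwaitingCleanLength(4ul * PAGE_SIZE);
//   DEBUG_ASSERT(range.GetAwaitingCleanLength() == 4ul * PAGE_SIZE);
//   DEBUG_ASSERT(range.GetDirtyState() == ZeroRange::DirtyState::Dirty);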
using IntervalDirtyState = ZeroRange::DirtyState;
// Getters and setters for the interval type.
bool IsIntervalStart() const {
return IsInterval() && GetIntervalSentinel() == IntervalSentinel::Start;
}
bool IsIntervalEnd() const {
return IsInterval() && GetIntervalSentinel() == IntervalSentinel::End;
}
bool IsIntervalSlot() const {
return IsInterval() && GetIntervalSentinel() == IntervalSentinel::Slot;
}
bool IsIntervalZero() const { return IsInterval() && GetIntervalType() == IntervalType::Zero; }
// Getters and setter for the zero interval type.
bool IsZeroIntervalClean() const {
DEBUG_ASSERT(IsIntervalZero());
return ZeroRange(raw_ & ~BIT_MASK(kIntervalBits)).GetDirtyState() ==
ZeroRange::DirtyState::Clean;
}
bool IsZeroIntervalDirty() const {
DEBUG_ASSERT(IsIntervalZero());
return ZeroRange(raw_ & ~BIT_MASK(kIntervalBits)).GetDirtyState() ==
ZeroRange::DirtyState::Dirty;
}
ZeroRange::DirtyState GetZeroIntervalDirtyState() const {
DEBUG_ASSERT(IsIntervalZero());
return ZeroRange(raw_ & ~BIT_MASK(kIntervalBits)).GetDirtyState();
}
void SetZeroIntervalAwaitingCleanLength(uint64_t len) {
DEBUG_ASSERT(IsIntervalZero());
DEBUG_ASSERT(IsIntervalStart() || IsIntervalSlot());
DEBUG_ASSERT(IsZeroIntervalDirty());
auto interval = ZeroRange(raw_ & ~BIT_MASK(kIntervalBits));
interval.SetAwaitingCleanLength(len);
raw_ = (raw_ & BIT_MASK(kIntervalBits)) | interval.value();
}
uint64_t GetZeroIntervalAwaitingCleanLength() const {
DEBUG_ASSERT(IsIntervalZero());
DEBUG_ASSERT(IsIntervalStart() || IsIntervalSlot());
return ZeroRange(raw_ & ~BIT_MASK(kIntervalBits)).GetAwaitingCleanLength();
}
private:
explicit VmPageOrMarker(uint64_t raw) : raw_(raw) {}
// The low 2 bits of raw_ are reserved to select the type; any other data has to fit into the
// remaining high bits. Note that there is no explicit Empty type, rather a PageType with a zero
// pointer is used to represent Empty.
static constexpr uint64_t kTypeBits = 2;
static constexpr uint64_t kPageType = 0b00;
static constexpr uint64_t kZeroMarkerType = 0b01;
static constexpr uint64_t kReferenceType = 0b10;
static constexpr uint64_t kIntervalType = 0b11;
// In addition to storing the type, a reference needs to track two additional pieces of data,
// these being the left and right split bits. The split bits are normally stored in the vm_page_t
// and are used for copy-on-write tracking in hidden VMOs. Having the ability to store the split
// bits here allows these pages to be candidates for compression. The remaining bits are then
// available for the actual reference value being stored. Unlike the page type, which does not
// allow the 0 value to be stored, a reference has no restrictions and a ref value of 0 is valid
// and may be stored.
static constexpr uint64_t kReferenceBits = kTypeBits + 2;
// Due to ordering and public/private visibility ReferenceValue::kAlignBits is declared
// separately, but it should match kReferenceBits.
static_assert(ReferenceValue::kAlignBits == kReferenceBits);
static constexpr uint64_t kReferenceLeftSplit = 0b10 << kTypeBits;
static constexpr uint64_t kReferenceRightSplit = 0b01 << kTypeBits;
// In addition to storing the type for an interval, we also need to track the type of interval
// sentinel: the start, the end, or a single slot marker.
static constexpr uint64_t kIntervalSentinelBits = 2;
static_assert(static_cast<uint64_t>(IntervalSentinel::NumSentinels) <=
(1 << kIntervalSentinelBits));
static constexpr uint64_t kIntervalSentinelShift = kTypeBits;
IntervalSentinel GetIntervalSentinel() const {
return static_cast<IntervalSentinel>(
(raw_ & (BIT_MASK(kIntervalSentinelBits) << kIntervalSentinelShift)) >>
kIntervalSentinelShift);
}
void SetIntervalSentinel(IntervalSentinel sentinel) {
// Clear the old sentinel type.
raw_ &= ~(BIT_MASK(kIntervalSentinelBits) << kIntervalSentinelShift);
// Set the new sentinel type.
raw_ |= static_cast<uint64_t>(sentinel) << kIntervalSentinelShift;
}
// Next we also need to store the type of interval being represented; reserve a couple of bits for
// this. Currently we only support one type of interval: a range of zero pages, but reserving 2
// bits allows for more types in the future.
static constexpr uint64_t kIntervalTypeBits = 2;
static_assert(static_cast<uint64_t>(IntervalType::NumTypes) <= (1 << kIntervalTypeBits));
static constexpr uint64_t kIntervalTypeShift = kIntervalSentinelShift + kIntervalSentinelBits;
IntervalType GetIntervalType() const {
return static_cast<IntervalType>((raw_ & (BIT_MASK(kIntervalTypeBits) << kIntervalTypeShift)) >>
kIntervalTypeShift);
}
static constexpr uint64_t kIntervalBits = kTypeBits + kIntervalSentinelBits + kIntervalTypeBits;
static_assert(ZeroRange::kAlignBits == kIntervalBits);
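// Summary of the resulting raw_ layouts described above (bit 0 is the least significant bit):
//   Page:      [63:2] vm_page* (low bits zero)                      [1:0] kPageType (0b00)
//   Marker:    [63:2] zero                                          [1:0] kZeroMarkerType (0b01)
//   Reference: [63:4] reference value [3] left [2] right split      [1:0] kReferenceType (0b10)
//   Interval:  [63:6] ZeroRange value [5:4] interval type [3:2] sentinel [1:0] kIntervalType (0b11)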
// Only support creation of zero interval type for now.
// Private and only friended with VmPageList so that an external caller cannot arbitrarily create
// interval sentinels.
[[nodiscard]] static VmPageOrMarker ZeroInterval(IntervalSentinel sentinel,
IntervalDirtyState state) {
uint64_t sentinel_bits = static_cast<uint64_t>(sentinel) << kIntervalSentinelShift;
uint64_t type_bits = static_cast<uint64_t>(IntervalType::Zero) << kIntervalTypeShift;
return VmPageOrMarker(ZeroRange(0, state).value() | type_bits | sentinel_bits | kIntervalType);
}
// Change the interval sentinel type for an existing interval, while preserving the rest of the
// original state. Only valid to call on an existing interval type. The only permissible
// transitions are from Slot to Start/End and vice versa, as these are the only valid transitions
// when extending or clipping intervals.
// Private and only friended with VmPageList so that an external caller cannot arbitrarily
// manipulate interval sentinels.
void ChangeIntervalSentinel(IntervalSentinel new_sentinel) {
#if ZX_DEBUG_ASSERT_IMPLEMENTED
DEBUG_ASSERT(IsInterval());
auto old_sentinel = GetIntervalSentinel();
DEBUG_ASSERT(old_sentinel != new_sentinel);
if (old_sentinel == IntervalSentinel::Start || old_sentinel == IntervalSentinel::End) {
DEBUG_ASSERT(new_sentinel == IntervalSentinel::Slot);
} else {
DEBUG_ASSERT(old_sentinel == IntervalSentinel::Slot);
DEBUG_ASSERT(new_sentinel == IntervalSentinel::Start ||
new_sentinel == IntervalSentinel::End);
}
#endif
SetIntervalSentinel(new_sentinel);
}
uint64_t GetType() const { return raw_ & BIT_MASK(kTypeBits); }
uint64_t Release() {
const uint64_t p = raw_;
raw_ = 0;
return p;
}
uint64_t raw_;
friend VmPageList;
};
// Limited reference to a VmPageOrMarker. This reference provides unrestricted const access to the
// underlying VmPageOrMarker, but as it holds a non-const VmPageOrMarker* it has the ability to
// modify the underlying entry. However, the interface for modification is very limited.
//
// This allows the majority of VmPageList iterations, which are not intended to clear entries to
// the Empty state, to perform limited mutation (such as changing between different content states)
// without being completely mutable.
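//
// Illustrative sketch (`pl` is a hypothetical VmPageList and `offset` a page-aligned offset; not
// a real call site):
//
//   VmPageOrMarkerRef ref = pl.LookupMutable(offset);
//   if (ref && ref->IsPageOrRef()) {
//     // Read access is unrestricted; mutation is limited to the forwarding methods below.
//     ref.SetPageOrRefLeftSplit(true);
//   }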
class VmPageOrMarkerRef {
public:
VmPageOrMarkerRef() = default;
explicit VmPageOrMarkerRef(VmPageOrMarker* page_or_marker) : page_or_marker_(page_or_marker) {}
~VmPageOrMarkerRef() = default;
const VmPageOrMarker& operator*() const {
DEBUG_ASSERT(page_or_marker_);
return *page_or_marker_;
}
const VmPageOrMarker* operator->() const {
DEBUG_ASSERT(page_or_marker_);
return page_or_marker_;
}
explicit operator bool() const { return !!page_or_marker_; }
// Forward split bit modifications as an allowed mutation.
void SetPageOrRefLeftSplit(bool value) {
DEBUG_ASSERT(page_or_marker_);
page_or_marker_->SetPageOrRefLeftSplit(value);
}
void SetPageOrRefRightSplit(bool value) {
DEBUG_ASSERT(page_or_marker_);
page_or_marker_->SetPageOrRefRightSplit(value);
}
// Changing the kind of content is an allowed mutation and this takes ownership of the provided
// page and returns ownership of the previous reference.
[[nodiscard]] VmPageOrMarker::ReferenceValue SwapReferenceForPage(vm_page_t* p) {
DEBUG_ASSERT(page_or_marker_);
return page_or_marker_->SwapReferenceForPage(p);
}
// Similar to SwapReferenceForPage, but takes ownership of the ref and returns ownership of the
// previous page.
[[nodiscard]] vm_page_t* SwapPageForReference(VmPageOrMarker::ReferenceValue ref) {
DEBUG_ASSERT(page_or_marker_);
return page_or_marker_->SwapPageForReference(ref);
}
// Similar to SwapReferenceForPage, but changes one reference for another.
[[nodiscard]] VmPageOrMarker::ReferenceValue ChangeReferenceValue(
VmPageOrMarker::ReferenceValue ref) {
DEBUG_ASSERT(page_or_marker_);
return page_or_marker_->ChangeReferenceValue(ref);
}
// Forward dirty state updates as an allowed mutation.
void SetZeroIntervalAwaitingCleanLength(uint64_t len) {
DEBUG_ASSERT(page_or_marker_);
page_or_marker_->SetZeroIntervalAwaitingCleanLength(len);
}
private:
VmPageOrMarker* page_or_marker_ = nullptr;
};
class VmPageListNode final : public fbl::WAVLTreeContainable<ktl::unique_ptr<VmPageListNode>> {
public:
explicit VmPageListNode(uint64_t offset);
~VmPageListNode();
DISALLOW_COPY_ASSIGN_AND_MOVE(VmPageListNode);
static const size_t kPageFanOut = 16;
// accessors
uint64_t offset() const { return obj_offset_; }
uint64_t GetKey() const { return obj_offset_; }
uint64_t end_offset() const { return offset() + kPageFanOut * PAGE_SIZE; }
void set_offset(uint64_t offset) {
DEBUG_ASSERT(!InContainer());
obj_offset_ = offset;
}
// for every page or marker in the node call the passed in function.
template <typename PTR_TYPE, typename F>
zx_status_t ForEveryPage(F func, uint64_t skew) {
return ForEveryPageInRange<PTR_TYPE>(this, func, offset(), end_offset(), skew);
}
// for every page or marker in the node call the passed in function.
template <typename PTR_TYPE, typename F>
zx_status_t ForEveryPage(F func, uint64_t skew) const {
return ForEveryPageInRange<PTR_TYPE>(this, func, offset(), end_offset(), skew);
}
// for every page or marker in the node in the range call the passed in function. The range is
// assumed to be within the node's object range.
template <typename PTR_TYPE, typename F>
zx_status_t ForEveryPageInRange(F func, uint64_t start_offset, uint64_t end_offset,
uint64_t skew) {
return ForEveryPageInRange<PTR_TYPE>(this, func, start_offset, end_offset, skew);
}
// for every page or marker in the node in the range call the passed in function. The range is
// assumed to be within the node's object range.
template <typename PTR_TYPE, typename F>
zx_status_t ForEveryPageInRange(F func, uint64_t start_offset, uint64_t end_offset,
uint64_t skew) const {
return ForEveryPageInRange<PTR_TYPE>(this, func, start_offset, end_offset, skew);
}
const VmPageOrMarker& Lookup(size_t index) const {
canary_.Assert();
DEBUG_ASSERT(index < kPageFanOut);
return pages_[index];
}
VmPageOrMarker& Lookup(size_t index) {
canary_.Assert();
DEBUG_ASSERT(index < kPageFanOut);
return pages_[index];
}
// A node is empty if it contains no pages, page interval sentinels, references, or markers.
bool IsEmpty() const {
for (const auto& p : pages_) {
if (!p.IsEmpty()) {
return false;
}
}
return true;
}
// Returns true if there are no pages or references owned by this node. Meant to check whether the
// node has any resource that needs to be returned.
bool HasNoPageOrRef() const {
for (const auto& p : pages_) {
if (p.IsPageOrRef()) {
return false;
}
}
return true;
}
// Returns true if there are no interval sentinels owned by this node.
bool HasNoIntervalSentinel() const {
for (const auto& p : pages_) {
if (p.IsInterval()) {
return false;
}
}
return true;
}
private:
template <typename PTR_TYPE, typename S, typename F>
static zx_status_t ForEveryPageInRange(S self, F func, uint64_t start_offset, uint64_t end_offset,
uint64_t skew) {
// Assert that the requested range is sensible and falls within our node's actual offset range.
DEBUG_ASSERT(end_offset >= start_offset);
DEBUG_ASSERT(start_offset >= self->obj_offset_);
DEBUG_ASSERT(end_offset <= self->end_offset());
const size_t start = (start_offset - self->obj_offset_) / PAGE_SIZE;
const size_t end = (end_offset - self->obj_offset_) / PAGE_SIZE;
for (size_t i = start; i < end; i++) {
if (!self->pages_[i].IsEmpty()) {
zx_status_t status =
func(PTR_TYPE{&self->pages_[i]}, self->obj_offset_ + i * PAGE_SIZE - skew);
if (unlikely(status != ZX_ERR_NEXT)) {
return status;
}
}
}
return ZX_ERR_NEXT;
}
fbl::Canary<fbl::magic("PLST")> canary_;
uint64_t obj_offset_ = 0;
VmPageOrMarker pages_[kPageFanOut];
friend VMPLCursor;
};
// Cursor that can be used for iterating over contiguous blocks of entries in a page list. The
// underlying page list must not have any entries removed while using this cursor, as the cursor
// retains iterators into the page list. It is, however, safe to insert new entries.
// The cursor can be used to iterate over empty contiguous slots; however, iteration will always
// cease if entries are not contiguous.
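//
// Illustrative sketch of cursor iteration (`pl`, `offset` and `pages_seen` are hypothetical; not
// a real call site):
//
//   VMPLCursor cursor = pl.LookupMutableCursor(offset);
//   cursor.ForEveryContiguous([&pages_seen](VmPageOrMarkerRef slot) {
//     if (slot->IsPage()) {
//       pages_seen++;
//     }
//     return ZX_ERR_NEXT;
//   });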
class VMPLCursor {
public:
VMPLCursor() : index_(kPageFanOut) {}
// Retrieve the current VmPageOrMarker pointed at by the cursor. This will be a nullptr if the
// cursor is no longer valid. The slot pointed at may itself be empty.
// Note that it is up to the caller to know the offset, which it can track by remembering how
// many |step|s it has done.
VmPageOrMarkerRef current() const {
return VmPageOrMarkerRef(valid() ? &(node_->pages_[index_]) : nullptr);
}
// Move the cursor to the next entry. The next entry can then be retrieved by calling |current|,
// and if there is no next entry then current will return a nullptr.
void step() {
if (valid()) {
index_++;
if (index_ == kPageFanOut) {
inc_node();
}
}
}
// Calls the provided callback of type [](VmPageOrMarkerRef)->zx_status_t on every entry as long
// as they are contiguous. This is equivalent to a loop calling |step| and |current|, but can
// produce more optimal code gen with the internal loop.
// The callback can return ZX_ERR_NEXT to continue, ZX_ERR_STOP to cease iteration gracefully, or
// any other status to terminate with that status code.
template <typename F>
zx_status_t ForEveryContiguous(F func) {
while (valid()) {
while (index_ < kPageFanOut) {
zx_status_t status = func(VmPageOrMarkerRef(&node_->pages_[index_]));
if (status != ZX_ERR_NEXT) {
return status == ZX_ERR_STOP ? ZX_OK : status;
}
index_++;
}
if (!inc_node()) {
return ZX_OK;
}
}
return ZX_OK;
}
private:
static constexpr size_t kPageFanOut = VmPageListNode::kPageFanOut;
VMPLCursor(fbl::WAVLTree<uint64_t, ktl::unique_ptr<VmPageListNode>>::iterator&& node, uint index)
: node_(node), index_(index) {}
// Helper to increment the underlying node_, testing for contiguity.
bool inc_node() {
// Should only be incrementing if index is at the end, as otherwise we're not being contiguous.
DEBUG_ASSERT(index_ == kPageFanOut);
const uint64_t prev = node_->obj_offset_;
node_++;
if (node_.IsValid() && node_->obj_offset_ == prev + PAGE_SIZE * kPageFanOut) {
// node is valid and contiguous, reset the index_ to both remove the terminal sentinel, and
// resume iteration from the beginning.
index_ = 0;
// TODO: Once cursor is in use benchmark the impact of validating that the node is not empty.
return true;
}
return false;
}
// Helper to check whether the cursor is still valid by checking index_ for its sentinel value.
bool valid() const { return index_ < kPageFanOut; }
// The node in the underlying page list currently being iterated. If this is invalid then
// index_ will be kPageFanOut.
fbl::WAVLTree<uint64_t, ktl::unique_ptr<VmPageListNode>>::iterator node_;
// The index into node_ that is currently being pointed at to be returned by |current|. The
// sentinel value of kPageFanOut is used to indicate that node_ is no longer valid.
uint index_;
friend VmPageList;
};
// Class which holds the list of vm_page structs removed from a VmPageList
// by TakePages. The list includes information about uncommitted pages and markers.
// Every splice list is expected to go through the following series of states:
// 1. The splice list is created.
// 2. Pages are added to the splice list.
// 3. The list is `Finalize`d, meaning that it can no longer be modified by `Append`.
// 4. Pages are then `Pop`d from the list.
// 5. Once all the pages have been popped, the list is considered `Processed` and can be destroyed.
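//
// Illustrative sketch of that lifecycle (`pl`, `offset` and `length` are hypothetical; TakePages
// already returns a finalized list):
//
//   VmPageSpliceList splice = pl.TakePages(offset, length);
//   while (!splice.IsProcessed()) {
//     VmPageOrMarker content = splice.Pop();
//     // ... consume `content`, taking ownership of any page or reference it holds ...
//   }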
class VmPageSpliceList final {
public:
VmPageSpliceList();
VmPageSpliceList(uint64_t offset, uint64_t length, uint64_t list_skew);
VmPageSpliceList(VmPageSpliceList&& other);
VmPageSpliceList& operator=(VmPageSpliceList&& other_tree);
~VmPageSpliceList();
// For use by PhysicalPageProvider. The user-pager path doesn't use this. This returns a
// finalized list.
static VmPageSpliceList CreateFromPageList(uint64_t offset, uint64_t length, list_node* pages);
// Pops the next page off of the splice list. It is invalid to pop a page from a non-finalized
// splice list.
VmPageOrMarker Pop();
// Peeks at the head of the splice list and returns a non-null VmPageOrMarkerRef pointing to it
// if and only if it is a reference. It is invalid to peek at a non-finalized splice list.
VmPageOrMarkerRef PeekReference();
// Appends `content` to the end of the splice list.
// The splice list takes ownership of `content` after this call.
// Note that this method does not work when raw_pages_ is in use.
// It is invalid to append to a finalized splice list.
zx_status_t Append(VmPageOrMarker content);
// Returns true after the whole collection has been processed by Pop.
bool IsProcessed() const { return pos_ >= length_; }
// Returns true if this list is empty.
bool IsEmpty() const;
// Marks the list as finalized.
// See the comment at `VmPageSpliceList`'s declaration for more info on what this means and when
// to call it. Note that it is invalid to call `Finalize` twice on the same list.
void Finalize();
// Returns true if the splice list is finalized.
// See the comment at `VmPageSpliceList`'s declaration for more info on what this means.
bool IsFinalized() const { return finalized_; }
// Returns the current position in the list.
uint64_t Position() const { return pos_; }
DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(VmPageSpliceList);
private:
void FreeAllPages();
uint64_t offset_;
uint64_t length_;
uint64_t pos_ = 0;
uint64_t list_skew_ = 0;
bool finalized_ = false;
VmPageListNode head_ = VmPageListNode(0);
fbl::WAVLTree<uint64_t, ktl::unique_ptr<VmPageListNode>> middle_;
VmPageListNode tail_ = VmPageListNode(0);
// To avoid the possibility of allocation failure, we don't use head_, middle_, tail_ for
// CreateFromPageList(). With CreateFromPageList() we know that all the pages are present, so
// we can just keep a list of pages, and create VmPageListNode on the stack as pages are Pop()ed.
list_node raw_pages_ = LIST_INITIAL_VALUE(raw_pages_);
friend VmPageList;
};
class VmPageList final {
public:
VmPageList();
~VmPageList();
VmPageList& operator=(VmPageList&& other);
VmPageList(VmPageList&& other);
void InitializeSkew(uint64_t parent_skew, uint64_t offset) {
// Checking list_skew_ doesn't catch all instances of double-initialization, but
// it should catch some of them.
DEBUG_ASSERT(list_skew_ == 0);
DEBUG_ASSERT(list_.is_empty());
list_skew_ = (parent_skew + offset) % (PAGE_SIZE * VmPageListNode::kPageFanOut);
}
uint64_t GetSkew() const { return list_skew_; }
DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(VmPageList);
// walk the page tree, calling the passed in function on every page or marker.
template <typename F>
zx_status_t ForEveryPage(F per_page_func) const {
return ForEveryPage<const VmPageOrMarker*>(this, per_page_func);
}
// similar to ForEveryPage, but the per_page_func gets called with a VmPageOrMarkerRef instead of
// a const VmPageOrMarker*, allowing for limited mutation.
template <typename F>
zx_status_t ForEveryPageMutable(F per_page_func) {
return ForEveryPage<VmPageOrMarkerRef>(this, per_page_func);
}
// walk the page tree, calling the passed in function on every page or marker in the range.
template <typename F>
zx_status_t ForEveryPageInRange(F per_page_func, uint64_t start_offset,
uint64_t end_offset) const {
return ForEveryPageInRange<const VmPageOrMarker*>(this, per_page_func, start_offset,
end_offset);
}
// similar to ForEveryPageInRange, but the per_page_func gets called with a VmPageOrMarkerRef
// instead of a const VmPageOrMarker*, allowing for limited mutation.
template <typename F>
zx_status_t ForEveryPageInRangeMutable(F per_page_func, uint64_t start_offset,
uint64_t end_offset) {
return ForEveryPageInRange<VmPageOrMarkerRef>(this, per_page_func, start_offset, end_offset);
}
// walk the page tree, calling |per_page_func| on every page/marker and |per_gap_func| on every
// gap.
template <typename PAGE_FUNC, typename GAP_FUNC>
zx_status_t ForEveryPageAndGapInRange(PAGE_FUNC per_page_func, GAP_FUNC per_gap_func,
uint64_t start_offset, uint64_t end_offset) const {
return ForEveryPageAndGapInRange<const VmPageOrMarker*>(this, per_page_func, per_gap_func,
start_offset, end_offset);
}
// walk the page tree, calling |per_page_func| on every page/marker/interval that fulfills
// (returns true) the |compare_func|. Also call |contiguous_run_func| on every contiguous range of
// such pages/markers/intervals encountered, whose signature is:
// zx_status_t contiguous_run_func(uint64_t start, uint64_t end, bool is_interval)
//
// Intervals are treated as distinct contiguous runs, i.e. they won't be merged into a contiguous
// run of pages/markers for invocation of |contiguous_run_func|. For intervals,
// |contiguous_run_func| will be called with |is_interval| set to true; for other page types it
// will be false. Additionally, the entire interval should fulfill |compare_func| for
// |contiguous_run_func| to be called on the portion that falls in [start_offset, end_offset).
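//
// Illustrative sketch of the three callbacks (`pl`, `start_offset` and `end_offset` are
// hypothetical; not a real call site):
//
//   pl.ForEveryPageAndContiguousRunInRange(
//       [](const VmPageOrMarker* p, uint64_t off) { return p->IsPage(); },
//       [](const VmPageOrMarker* p, uint64_t off) { return ZX_ERR_NEXT; },
//       [](uint64_t start, uint64_t end, bool is_interval) { return ZX_ERR_NEXT; },
//       start_offset, end_offset);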
template <typename COMPARE_FUNC, typename PAGE_FUNC, typename CONTIGUOUS_RUN_FUNC>
zx_status_t ForEveryPageAndContiguousRunInRange(COMPARE_FUNC compare_func,
PAGE_FUNC per_page_func,
CONTIGUOUS_RUN_FUNC contiguous_run_func,
uint64_t start_offset,
uint64_t end_offset) const {
return ForEveryPageAndContiguousRunInRange<const VmPageOrMarker*>(
this, compare_func, per_page_func, contiguous_run_func, start_offset, end_offset);
}
// Returns true if any pages (actual pages, references, or markers) are in the given range, or if
// the range forms a part of a sparse page interval.
bool AnyPagesOrIntervalsInRange(uint64_t start_offset, uint64_t end_offset) const {
bool found_page = false;
ForEveryPageInRange(
[&found_page](const VmPageOrMarker* page, uint64_t offset) {
found_page = true;
return ZX_ERR_STOP;
},
start_offset, end_offset);
// It is possible that the range forms a part of an interval even if no nodes in the range have
// populated slots. We can determine that by checking to see if the start offset in the range
// falls in an interval (we could technically perform this check for any inclusive offset in the
// range since the range is entirely unpopulated and hence would only fall in the same interval
// if applicable).
return found_page ? true : IsOffsetInInterval(start_offset);
}
// Attempts to return a reference to the VmPageOrMarker at the specified offset. The returned
// pointer is valid until the VmPageList is destroyed or any of the Remove*/Take/Merge etc
// functions are called.
//
// Lookup may return 'nullptr' if there is no slot allocated for the given offset. If non-null
// is returned it may still be the case that IsEmpty() on the returned PageOrMarker is true.
const VmPageOrMarker* Lookup(uint64_t offset) const;
// Similar to `Lookup` but returns a VmPageOrMarkerRef that allows for limited mutation of the
// slot. General mutation requires calling `LookupOrAllocate`.
VmPageOrMarkerRef LookupMutable(uint64_t offset);
// Similar to `LookupMutable` but returns a VMPLCursor that allows for iterating over any
// contiguous slots from the provided offset.
VMPLCursor LookupMutableCursor(uint64_t offset);
// The interval handling flag to be used by LookupOrAllocate. See comments near LookupOrAllocate.
enum class IntervalHandling : uint8_t {
NoIntervals,
CheckForInterval,
SplitInterval,
};
// Similar to `Lookup` but only returns `nullptr` if a slot cannot be allocated, whether due to
// being out of memory, the offset being invalid, or |interval_handling| not allowing a slot to be
// safely returned.
//
// The returned slot, if not a `nullptr`, may generally be freely manipulated with the exception
// that if it started !Empty, then it is an error to set it to Empty. In this case the
// `RemoveContent` method must be used.
//
// If the returned slot started Empty and is not made !Empty, then the slot must be returned with
// ReturnEmptySlot, to ensure no empty nodes are retained.
//
// The bool in the ktl::pair indicates whether the offset falls inside a sparse interval. Whether
// a valid VmPageOrMarker* is returned in the ktl::pair depends on the specified
// |interval_handling|.
// - NoIntervals: The page list does not contain any intervals, so there is no special handling
// to check for or split intervals. In other words, each slot in the page list can be manipulated
// independently.
// - CheckForInterval: The page list can contain intervals, and the bool in the returned
// ktl::pair indicates whether the offset fell inside an interval. Note that this only checks for
// intervals but does not allow manipulating them, so a valid VmPageOrMarker* will be returned
// only if the offset can safely be manipulated independently.
// - SplitInterval: The page list can contain intervals and we are allowed to split intervals to
// return the required slot. The returned VmPageOrMarker* can be manipulated freely. (See
// comments near LookupOrAllocateCheckForInterval for an explanation of how splitting works.)
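// Illustrative sketch of the expected calling pattern (`pl`, `offset` and `content` are
// hypothetical; not a real call site):
//
//   auto [slot, in_interval] = pl.LookupOrAllocate(offset, VmPageList::IntervalHandling::NoIntervals);
//   if (!slot) {
//     return ZX_ERR_NO_MEMORY;
//   }
//   if (slot->IsEmpty()) {
//     *slot = ktl::move(content);
//   }
//   // Had the slot been left Empty instead, pl.ReturnEmptySlot(offset) would be required.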
ktl::pair<VmPageOrMarker*, bool> LookupOrAllocate(uint64_t offset,
IntervalHandling interval_handling) {
switch (interval_handling) {
case IntervalHandling::NoIntervals:
// The page list does not expect any intervals. Short circuit any checks for intervals.
return {LookupOrAllocateInternal(offset), false};
case IntervalHandling::CheckForInterval:
// Check for intervals but do not allow splitting them.
return LookupOrAllocateCheckForInterval(offset, false);
case IntervalHandling::SplitInterval:
// Check for intervals and also split them.
return LookupOrAllocateCheckForInterval(offset, true);
}
return {nullptr, false};
}
// Returns a slot that was empty after LookupOrAllocate, and that the caller did not end up
// filling.
// This ensures that if LookupOrAllocate allocated a new underlying list node, that list node gets
// freed; otherwise it might not get cleaned up for the lifetime of the page list.
//
// This is only correct to call on an offset for which LookupOrAllocate had just returned a non
// null slot, and that slot was Empty and is still Empty.
void ReturnEmptySlot(uint64_t offset);
// Removes any item at |offset| from the list and returns it, or VmPageOrMarker::Empty() if none.
VmPageOrMarker RemoveContent(uint64_t offset);
// Releases every item in the page list and calls free_content_fn on any content, giving it
// ownership. Any markers are cleared.
template <typename T>
void RemoveAllContent(T free_content_fn) {
// per page get a reference to the page pointer inside the page list node
auto per_page_func = [&free_content_fn](VmPageOrMarker* p, uint64_t offset) {
if (p->IsPageOrRef()) {
free_content_fn(ktl::move(*p));
}
*p = VmPageOrMarker::Empty();
return ZX_ERR_NEXT;
};
// walk the tree in order, freeing all the pages on every node
ForEveryPage<VmPageOrMarker*>(this, per_page_func);
// empty the tree
list_.clear();
}
// Calls the provided callback for every page or marker in the range [start_offset, end_offset).
// The callback can modify the VmPageOrMarker and take ownership of any pages, or leave them in
// place. The difference between this and ForEveryPage is that, because this allows modifying the
// underlying pages, any intermediate data structures can be checked and potentially freed if no
// longer needed.
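//
// Illustrative sketch (`pl`, `free_page_fn`, `start_offset` and `end_offset` are hypothetical;
// not a real call site). Emptying the slot is what lets the empty-node check free the node:
//
//   pl.RemovePages(
//       [&free_page_fn](VmPageOrMarker* p, uint64_t off) {
//         if (p->IsPage()) {
//           free_page_fn(p->ReleasePage());
//         }
//         return ZX_ERR_NEXT;
//       },
//       start_offset, end_offset);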
template <typename T>
void RemovePages(T per_page_fn, uint64_t start_offset, uint64_t end_offset) {
ForEveryPageInRange<VmPageOrMarker*, NodeCheck::CleanupEmpty>(this, per_page_fn, start_offset,
end_offset);
}
// Similar to RemovePages but also takes a |per_gap_fn| callback to allow for iterating over any
// gaps encountered as well. This can be used when the intent is to modify the underlying pages
// and/or gaps, while checking any intermediate data structures to potentially free ones that are
// no longer needed.
template <typename P, typename G>
zx_status_t RemovePagesAndIterateGaps(P per_page_fn, G per_gap_fn, uint64_t start_offset,
uint64_t end_offset) {
return ForEveryPageAndGapInRange<VmPageOrMarker*, NodeCheck::CleanupEmpty>(
this, per_page_fn, per_gap_fn, start_offset, end_offset);
}
// Returns true if there are no pages, references, markers, or intervals in the page list.
bool IsEmpty() const;
// Returns true if the page list does not own any pages or references. Meant to check whether the
// page list has any resource that needs to be returned.
bool HasNoPageOrRef() const;
// Merges the pages in |other| in the range [|offset|, |end_offset|) into |this|
// page list, starting at offset 0 in this list.
//
// For every page in |other| in the given range, if there is no corresponding page or marker
// in |this|, then they will be passed to |migrate_fn|. If |migrate_fn| leaves the page in the
// VmPageOrMarker it will be migrated into |this|, otherwise the migrate_fn is assumed to now own
// the page. Any pages or markers in |other| outside the given range, or which conflict with a
// page in |this|, will be released, giving ownership to |release_fn|.
//
// The |offset| values passed to |release_fn| and |migrate_fn| are the original offsets
// in |other|, not the adapted offsets in |this|.
//
// **NOTE** unlike MergeOnto, |other| will be empty at the end of this method.
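//
// Illustrative sketch of the two callbacks (`this_list`, `other`, `free_content_fn` and the
// offsets are hypothetical; not a real call site):
//
//   this_list.MergeFrom(
//       other, offset, end_offset,
//       [&free_content_fn](VmPageOrMarker&& content, uint64_t off) {
//         // Out-of-range or conflicting content; this callback now owns it.
//         free_content_fn(ktl::move(content));
//       },
//       [](VmPageOrMarker* content, uint64_t off) {
//         // Leaving `content` in place migrates it into |this_list|.
//       });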
void MergeFrom(
VmPageList& other, uint64_t offset, uint64_t end_offset,
fit::inline_function<void(VmPageOrMarker&&, uint64_t offset), 3 * sizeof(void*)> release_fn,
fit::inline_function<void(VmPageOrMarker*, uint64_t offset)> migrate_fn);
// Merges the pages in |this| onto |other|.
//
// For every page (or marker) in |this|, checks the same offset in |other|. If there is no
// page or marker, then it inserts the page into |other|. Otherwise, it releases the page (or
// marker) and gives ownership to |release_fn|.
//
// **NOTE** unlike MergeFrom, |this| will be empty at the end of this method.
void MergeOnto(VmPageList& other, fit::inline_function<void(VmPageOrMarker&&)> release_fn);
// Takes the pages, references and markers in the range [offset, offset + length) out of this page
// list.
// This method calls `Finalize` on the splice list prior to returning it, meaning that no more
// pages, references, or markers can be added to it.
VmPageSpliceList TakePages(uint64_t offset, uint64_t length);
uint64_t HeapAllocationBytes() const { return list_.size() * sizeof(VmPageListNode); }
// Allow the implementation to use a one-past-the-end value for VmPageListNode offsets,
// plus to account for list_skew_.
static constexpr uint64_t MAX_SIZE =
ROUNDDOWN(UINT64_MAX, 2 * VmPageListNode::kPageFanOut * PAGE_SIZE);
// Add a sparse zero interval spanning the range [start_offset, end_offset) with the specified
// dirty_state. The specified range must be previously unpopulated. This will try to merge the new
// zero interval with existing intervals to the left and/or right, if the dirty_state allows it.
zx_status_t AddZeroInterval(uint64_t start_offset, uint64_t end_offset,
VmPageOrMarker::IntervalDirtyState dirty_state) {
return AddZeroIntervalInternal(start_offset, end_offset, dirty_state, 0);
}
// Populates individual interval slots in the range [start_offset, end_offset), which must fall inside a
// sparse interval. The intent of this function is to allow the caller to prepare the range for
// overwriting (replacing with pages) by populating the required slots upfront, so that slot
// lookup does not fail after this call. Essentially simulates interval splits
// (LookupOrAllocateCheckForInterval) for every offset in the specified range, but does so more
// efficiently, instead of having to search the tree repeatedly for every single offset.
zx_status_t PopulateSlotsInInterval(uint64_t start_offset, uint64_t end_offset);
// Helper to return an unused interval slot so that it can be merged back into the interval it was
// populated/split from.
void ReturnIntervalSlot(uint64_t offset);
// Clips an interval from the start by len, i.e. moves the start from interval_start to
// interval_start + len. The total length of the interval must be larger than len.
zx_status_t ClipIntervalStart(uint64_t interval_start, uint64_t len);
// Clips an interval from the end by len, i.e. moves the end from interval_end to
// interval_end - len. The total length of the interval must be larger than len.
zx_status_t ClipIntervalEnd(uint64_t interval_end, uint64_t len);
// Returns true if the specified offset falls in a sparse zero interval.
bool IsOffsetInZeroInterval(uint64_t offset) const;
// Replace an existing page at offset with a zero interval, and return the released page. The
// caller takes ownership of the released page and is responsible for freeing it.
vm_page_t* ReplacePageWithZeroInterval(uint64_t offset,
VmPageOrMarker::IntervalDirtyState dirty_state);
private:
// Returns true if the specified offset falls in a sparse page interval.
bool IsOffsetInInterval(uint64_t offset) const;
// Internal helper used when checking whether the offset falls in an interval.
// lower_bound is the node that was queried with a lower_bound() lookup on the list using the
// offset. This node is passed in here so that we can reuse the node the callsite has looked up
// and avoid an extra lookup. The interval sentinel found in lower_bound, which is used to
// conclude that offset lies in an interval, is optionally returned. If this function returns true,
// interval_out (if not null) returns the start/end/slot sentinel of the interval that the offset
// lies in.
bool IfOffsetInIntervalHelper(uint64_t offset, const VmPageListNode& lower_bound,
const VmPageOrMarker** interval_out = nullptr) const;
// Internal helper for AddZeroInterval.
// |replace_existing_slot| can optionally be set to true if a zero interval spanning a single page
// is being added, and the slot at that offset is already populated (but Empty) and can be reused.
zx_status_t AddZeroIntervalInternal(uint64_t start_offset, uint64_t end_offset,
VmPageOrMarker::IntervalDirtyState dirty_state,
uint64_t awaiting_clean_len,
bool replace_existing_slot = false);
// Internal helper for LookupOrAllocate.
VmPageOrMarker* LookupOrAllocateInternal(uint64_t offset);
// Similar to LookupOrAllocateInternal but also checks if offset falls in a sparse page interval,
// returning true via the bool in ktl::pair if it does, along with the slot. Also splits the
// interval around offset if split_interval is set to true. This allows the caller to freely
// manipulate the slot at offset similar to LookupOrAllocate. If offset is found in an interval,
// but split_interval was false, no VmPageOrMarker* is returned, as it is not safe to manipulate
// any slot in an interval without also splitting the interval around it.
//
// In other words, the return values fall into three categories.
// 1. {page, false} : offset does not lie in an interval. |slot| is the required slot.
// 2. {page, true} : offset lies in an interval and split_interval was true. |page| is the
// required slot. The interval has been correctly split around the slot, so |page| can be treated
// similar to any non-interval type.
// 3. {nullptr, true} : offset lies in an interval but split_interval was false. No slot is
// returned.
//
// Splitting the interval would look as follows. If the interval previously was:
// [start, end) where start < offset < end.
// After the split we would have three intervals:
// [start, offset) [offset, offset + PAGE_SIZE) [offset + PAGE_SIZE, end)
// The middle interval containing offset spans only a single page, i.e. offset is an
// IntervalSentinel::Slot, which can now be manipulated independently.
ktl::pair<VmPageOrMarker*, bool> LookupOrAllocateCheckForInterval(uint64_t offset,
bool split_interval);
template <typename PTR_TYPE, typename S, typename F>
static zx_status_t ForEveryPage(S self, F per_page_func) {
for (auto& pl : self->list_) {
zx_status_t status = pl.template ForEveryPage<PTR_TYPE, F>(per_page_func, self->list_skew_);
if (unlikely(status != ZX_ERR_NEXT)) {
if (status == ZX_ERR_STOP) {
break;
}
return status;
}
}
return ZX_OK;
}
// Calls the provided callback for every page in the given range. If the NODE_CHECK template
// argument is NodeCheck::CleanupEmpty then it is assumed the per_page_func may remove pages, and
// page nodes will be checked to see if they are empty and can be cleaned up.
enum class NodeCheck : bool {
Skip = false,
CleanupEmpty = true,
};
template <typename PTR_TYPE, NodeCheck NODE_CHECK = NodeCheck::Skip, typename S, typename F>
static zx_status_t ForEveryPageInRange(S self, F per_page_func, uint64_t start_offset,
uint64_t end_offset) {
DEBUG_ASSERT(IS_PAGE_ALIGNED(start_offset));
DEBUG_ASSERT(IS_PAGE_ALIGNED(end_offset));
start_offset += self->list_skew_;
end_offset += self->list_skew_;
// Find the first node (if any) that will contain our starting offset.
auto cur =
self->list_.lower_bound(ROUNDDOWN(start_offset, VmPageListNode::kPageFanOut * PAGE_SIZE));
if (!cur) {
return ZX_OK;
}
// Handle scenario where start_offset begins not aligned to a node.
if (cur->offset() < start_offset) {
zx_status_t status = cur->template ForEveryPageInRange<PTR_TYPE, F>(
per_page_func, start_offset, ktl::min(end_offset, cur->end_offset()), self->list_skew_);
auto prev = cur++;
if constexpr (NODE_CHECK == NodeCheck::CleanupEmpty) {
if (prev->IsEmpty()) {
self->list_.erase(prev);
}
}
if (unlikely(status != ZX_ERR_NEXT)) {
if (status == ZX_ERR_STOP) {
return ZX_OK;
}
return status;
}
}
// Iterate through all full nodes contained in the range.
while (cur && cur->end_offset() < end_offset) {
DEBUG_ASSERT(start_offset <= cur->offset());
zx_status_t status = cur->template ForEveryPage<PTR_TYPE, F>(per_page_func, self->list_skew_);
auto prev = cur++;
if constexpr (NODE_CHECK == NodeCheck::CleanupEmpty) {
if (prev->IsEmpty()) {
self->list_.erase(prev);
}
}
if (unlikely(status != ZX_ERR_NEXT)) {
if (status == ZX_ERR_STOP) {
return ZX_OK;
}
return status;
}
}
// Handle scenario where the end_offset is not aligned to the end of a node.
if (cur && cur->offset() < end_offset) {
DEBUG_ASSERT(cur->end_offset() >= end_offset);
zx_status_t status = cur->template ForEveryPageInRange<PTR_TYPE, F>(
per_page_func, cur->offset(), end_offset, self->list_skew_);
if constexpr (NODE_CHECK == NodeCheck::CleanupEmpty) {
if (cur->IsEmpty()) {
self->list_.erase(cur);
}
}
if (unlikely(status != ZX_ERR_NEXT)) {
if (status == ZX_ERR_STOP) {
return ZX_OK;
}
return status;
}
}
return ZX_OK;
}
template <typename PTR_TYPE, NodeCheck NODE_CHECK = NodeCheck::Skip, typename S,
typename PAGE_FUNC, typename GAP_FUNC>
static zx_status_t ForEveryPageAndGapInRange(S self, PAGE_FUNC per_page_func,
GAP_FUNC per_gap_func, uint64_t start_offset,
uint64_t end_offset) {
uint64_t expected_next_off = start_offset;
// Set to true when we encounter an interval start but haven't yet encountered the end.
bool in_interval = false;
auto per_page_wrapper_fn = [&](auto* p, uint64_t off) {
zx_status_t status = ZX_ERR_NEXT;
// We can move ahead of expected_next_off in the case of an interval too, which represents a
// run of pages. Make sure this is not an interval before calling the per_gap_func.
if (expected_next_off != off && !p->IsIntervalEnd()) {
status = per_gap_func(expected_next_off, off);
}
if (status == ZX_ERR_NEXT) {
if (p->IsIntervalStart()) {
// We should not already have been tracking an interval.
DEBUG_ASSERT(!in_interval);
// Start and end sentinel interval types should match. Since we only support zero
// intervals currently, we can simply check for that.
DEBUG_ASSERT(p->IsIntervalZero());
in_interval = true;
} else if (p->IsIntervalEnd()) {
// If this is not the first populated slot we encountered, we should have been tracking a
// valid interval.
DEBUG_ASSERT(in_interval || expected_next_off == start_offset);
// Start and end sentinel interval types should match. Since we only support zero
// intervals currently, we can simply check for that.
DEBUG_ASSERT(p->IsIntervalZero());
// Reset interval tracking.
in_interval = false;
}
status = per_page_func(p, off);
}
expected_next_off = off + PAGE_SIZE;
// Prevent the last call to per_gap_func
if (status == ZX_ERR_STOP) {
expected_next_off = end_offset;
}
return status;
};
zx_status_t status = ForEveryPageInRange<PTR_TYPE, NODE_CHECK>(self, per_page_wrapper_fn,
start_offset, end_offset);
if (status != ZX_OK) {
return status;
}
// Handle the last gap after checking that we are not in an interval. Note that simply checking
// for in_interval is not sufficient, as it is possible to have started the traversal partway
// into an interval, in which case we would not have seen the interval start and in_interval
// would be false. So we perform a quick check for in_interval first and if that fails perform
// the more expensive IsOffsetInInterval() check. The IsOffsetInInterval() call is further gated
// by whether we encountered any page at all in the traversal above. If we saw at least one
// page in the traversal, we know that we could not be in an interval without in_interval being
// true because we would have seen the interval start.
if (expected_next_off != end_offset) {
// Traversal ended in an interval if in_interval was true, OR if the traversal did not see any
// page at all and the start_offset is in an interval (Note that in this latter case all
// offsets in the range [start_offset, end_offset) would lie in the same interval, so we can
// just check one of them).
bool ended_in_interval = in_interval || (expected_next_off == start_offset &&
self->IsOffsetInInterval(start_offset));
if (!ended_in_interval) {
status = per_gap_func(expected_next_off, end_offset);
if (status != ZX_ERR_NEXT && status != ZX_ERR_STOP) {
return status;
}
}
}
return ZX_OK;
}
// Internal helpers to return the start (or end) of an interval given the end (or start) along
// with the corresponding offset.
ktl::pair<const VmPageOrMarker*, uint64_t> FindIntervalStartForEnd(uint64_t end_offset) const;
ktl::pair<const VmPageOrMarker*, uint64_t> FindIntervalEndForStart(uint64_t start_offset) const;
template <typename PTR_TYPE, typename S, typename COMPARE_FUNC, typename PAGE_FUNC,
typename CONTIGUOUS_RUN_FUNC>
static zx_status_t ForEveryPageAndContiguousRunInRange(S self, COMPARE_FUNC compare_func,
PAGE_FUNC per_page_func,
CONTIGUOUS_RUN_FUNC contiguous_run_func,
uint64_t start_offset,
uint64_t end_offset) {
if (start_offset == end_offset) {
return ZX_OK;
}
// Track contiguous range of pages fulfilling compare_func.
uint64_t contiguous_run_start = start_offset;
uint64_t contiguous_run_len = 0;
// Tracks whether we enter the ForEveryPageAndGap traversal at all.
bool found_page_or_gap = false;
// Tracks information if we encounter an interval start, to be used when we encounter the
// corresponding end.
struct {
uint64_t interval_start_offset;
bool start_compare_status;
bool started_interval;
} interval_tracker = {.started_interval = false};
zx_status_t status = ForEveryPageAndGapInRange<PTR_TYPE>(
self,
[&](auto* p, uint64_t off) {
found_page_or_gap = true;
zx_status_t st = ZX_ERR_NEXT;
const bool compare_result = compare_func(p, off);
// Handle interval types first.
if (p->IsInterval()) {
// If we are going to start an interval, end any contiguous run being tracked, and call
// contiguous_run_func on it. This is because intervals are treated as contiguous ranges
// distinct from pages or markers. Do this before per_page_func because we don't want to
// have processed extra pages if contiguous_run_func on the range prior would have
// failed. Also do this irrespective of whether this interval passes the compare_func or
// not, since we are processing pages prior to the interval.
if (p->IsIntervalStart() || p->IsIntervalSlot()) {
if (contiguous_run_len > 0) {
st = contiguous_run_func(contiguous_run_start,
contiguous_run_start + contiguous_run_len,
/*is_interval=*/false);
// Reset contiguous range tracking.
contiguous_run_len = 0;
if (st != ZX_ERR_NEXT) {
return st;
}
}
}
DEBUG_ASSERT(contiguous_run_len == 0);
// Run the per-page function on the interval sentinel first. Then proceed to the more
// complicated logic for the contiguous function.
if (compare_result) {
st = per_page_func(p, off);
if (st != ZX_ERR_NEXT && st != ZX_ERR_STOP) {
return st;
}
}
// A slot is a contiguous run of a single page.
if (p->IsIntervalSlot()) {
// We should not have been already tracking an interval.
DEBUG_ASSERT(!interval_tracker.started_interval);
if (compare_result) {
return contiguous_run_func(off, off + PAGE_SIZE, /*is_interval=*/true);
}
return ZX_ERR_NEXT;
}
if (p->IsIntervalStart()) {
// Start tracking a new run. We should not have been already tracking an interval.
DEBUG_ASSERT(!interval_tracker.started_interval);
interval_tracker.started_interval = true;
interval_tracker.interval_start_offset = off;
// Stash the comparison result for the interval start.
interval_tracker.start_compare_status = compare_result;
return ZX_ERR_NEXT;
}
DEBUG_ASSERT(p->IsIntervalEnd());
// If the interval end does not pass the check, there is nothing more to be done.
if (!compare_result) {
interval_tracker.started_interval = false;
return ZX_ERR_NEXT;
}
            // This is the end of an interval. Call contiguous_run_func on the whole interval only
            // if compare_func passes for *both* the start and the end.
// It is possible that we don't have the interval start if we started the traversal
// partway inside an interval. Find the start and evaluate compare_func on it.
if (!interval_tracker.started_interval) {
auto [start, interval_start_offset] = self->FindIntervalStartForEnd(off);
DEBUG_ASSERT(start);
DEBUG_ASSERT(start->IsIntervalStart());
DEBUG_ASSERT(interval_start_offset < start_offset);
interval_tracker.started_interval = true;
interval_tracker.start_compare_status = compare_func(start, interval_start_offset);
// Pretend that the interval begins at start_offset since we're not considering the
// range before it.
interval_tracker.interval_start_offset = start_offset;
}
DEBUG_ASSERT(interval_tracker.started_interval);
interval_tracker.started_interval = false;
if (interval_tracker.start_compare_status) {
return contiguous_run_func(interval_tracker.interval_start_offset, off + PAGE_SIZE,
/*is_interval=*/true);
}
return ZX_ERR_NEXT;
}
// Handle any non-interval types.
DEBUG_ASSERT(!p->IsInterval());
DEBUG_ASSERT(!interval_tracker.started_interval);
if (compare_result) {
st = per_page_func(p, off);
// Return any errors early before considering this page for contiguous_run_func.
if (st != ZX_ERR_NEXT && st != ZX_ERR_STOP) {
// If there was an outstanding contiguous run, process it since it had to have ended
// before the failing offset.
if (contiguous_run_len > 0) {
zx_status_t prev_range_status = contiguous_run_func(
contiguous_run_start, contiguous_run_start + contiguous_run_len,
/*is_interval=*/false);
contiguous_run_len = 0;
// If there was an error encountered, surface that instead of st, as it occurred on
// a range prior to this offset.
if (prev_range_status != ZX_ERR_NEXT && prev_range_status != ZX_ERR_STOP) {
return prev_range_status;
}
}
return st;
}
// Start tracking a contiguous run if none was being tracked.
if (contiguous_run_len == 0) {
contiguous_run_start = off;
}
// Append this page to the contiguous range being tracked.
contiguous_run_len += PAGE_SIZE;
// In the case that st is ZX_ERR_STOP, we will include this page in the contiguous run
// and stop traversal *after* this page.
return st;
}
          // This page does not fulfill compare_func. If we were already tracking a contiguous
          // range, invoke contiguous_run_func on the range so far; a new range can be started
          // later, skipping over this page.
if (contiguous_run_len > 0) {
st =
contiguous_run_func(contiguous_run_start, contiguous_run_start + contiguous_run_len,
/*is_interval=*/false);
// Reset contiguous_run_len to zero to track a new range later if required.
// Do this irrespective of the return status to ensure we don't erroneously have a
// remaining range to process below after exiting the traversal.
contiguous_run_len = 0;
}
return st;
},
[&](uint64_t start, uint64_t end) {
found_page_or_gap = true;
// We should not encounter any gaps in the midst of an interval we were tracking.
DEBUG_ASSERT(!interval_tracker.started_interval);
zx_status_t st = ZX_ERR_NEXT;
          // If we were already tracking a contiguous range when we encountered this gap, invoke
          // contiguous_run_func on the range so far; a new range can be started later, skipping
          // over this gap.
if (contiguous_run_len > 0) {
st =
contiguous_run_func(contiguous_run_start, contiguous_run_start + contiguous_run_len,
/*is_interval=*/false);
// Reset contiguous_run_len to zero to track a new range later if required.
// Do this irrespective of the return status to ensure we don't erroneously have a
// remaining range to process below after exiting the traversal.
contiguous_run_len = 0;
}
return st;
},
start_offset, end_offset);
if (status != ZX_OK) {
return status;
}
// If we did not execute either the per-page or per-gap function, we could only have been inside
// an interval. In that case, we need to find both the start and the end of this interval and
// evaluate compare_func on them.
if (!found_page_or_gap) {
DEBUG_ASSERT(self->IsOffsetInInterval(start_offset));
DEBUG_ASSERT(self->IsOffsetInInterval(end_offset - PAGE_SIZE));
uint64_t interval_end_offset = UINT64_MAX;
bool end_compare_status = false;
status = ForEveryPageInRange<PTR_TYPE>(
self,
[&](auto* p, uint64_t off) {
// The first populated slot should be an interval end.
DEBUG_ASSERT(p->IsIntervalEnd());
interval_end_offset = off;
end_compare_status = compare_func(p, off);
return ZX_ERR_STOP;
},
end_offset, VmPageList::MAX_SIZE);
DEBUG_ASSERT(status == ZX_OK);
if (end_compare_status) {
auto [start, interval_start_offset] = self->FindIntervalStartForEnd(interval_end_offset);
DEBUG_ASSERT(start);
DEBUG_ASSERT(start->IsIntervalStart());
DEBUG_ASSERT(interval_start_offset < start_offset);
if (compare_func(start, interval_start_offset)) {
status = contiguous_run_func(start_offset, end_offset, /*is_interval=*/true);
if (status != ZX_ERR_NEXT && status != ZX_ERR_STOP) {
return status;
}
}
}
return ZX_OK;
}
    // Process the last contiguous range if there is one, or an interval whose start we tracked
    // but whose end sentinel lies at or beyond end_offset.
if (contiguous_run_len > 0) {
status = contiguous_run_func(contiguous_run_start, contiguous_run_start + contiguous_run_len,
/*is_interval=*/false);
if (status != ZX_ERR_NEXT && status != ZX_ERR_STOP) {
return status;
}
} else if (interval_tracker.started_interval && interval_tracker.start_compare_status) {
auto [end, interval_end_offset] =
self->FindIntervalEndForStart(interval_tracker.interval_start_offset);
DEBUG_ASSERT(end);
DEBUG_ASSERT(end->IsIntervalEnd());
if (compare_func(end, interval_end_offset)) {
status = contiguous_run_func(interval_tracker.interval_start_offset, end_offset,
/*is_interval=*/true);
if (status != ZX_ERR_NEXT && status != ZX_ERR_STOP) {
return status;
}
}
}
return ZX_OK;
}
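  // A minimal illustrative sketch (not part of this header) of how the helper above might be
  // driven. It assumes the call site has access to this private helper, that pl is a
  // const VmPageList*, that const VmPageOrMarker* is a suitable PTR_TYPE, and that limit is a
  // caller-provided exclusive end offset; it simply sums the bytes covered by runs of actual
  // pages:
  //
  //   uint64_t committed_bytes = 0;
  //   zx_status_t st = ForEveryPageAndContiguousRunInRange<const VmPageOrMarker*>(
  //       pl,
  //       // compare_func: only slots holding an actual vm_page_t qualify.
  //       [](const VmPageOrMarker* p, uint64_t off) { return p->IsPage(); },
  //       // per_page_func: nothing extra to do per matching slot.
  //       [](const VmPageOrMarker* p, uint64_t off) { return ZX_ERR_NEXT; },
  //       // contiguous_run_func: accumulate the length of each reported run.
  //       [&committed_bytes](uint64_t start, uint64_t end, bool is_interval) {
  //         committed_bytes += end - start;
  //         return ZX_ERR_NEXT;
  //       },
  //       /*start_offset=*/0, /*end_offset=*/limit);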
  // The tree of VmPageListNodes that backs this page list.
  fbl::WAVLTree<uint64_t, ktl::unique_ptr<VmPageListNode>> list_;
  // A skew added to offsets provided as arguments to VmPageList functions before interfacing
  // with list_. This allows all VmPageLists within a clone tree to place individual vm_page_t
  // entries at the same offsets within their nodes, so that nodes can be moved between different
  // lists without needing to be split up.
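  // As a purely illustrative sketch (the exact value is established when a clone's page list is
  // initialized): if a clone's offset 0 corresponds to offset X in its parent, choosing the
  // clone's skew so that (0 + clone skew) and (X + parent skew) fall into the same slot within a
  // node keeps the per-node layout identical across the clone tree.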
uint64_t list_skew_ = 0;
};
#endif // ZIRCON_KERNEL_VM_INCLUDE_VM_VM_PAGE_LIST_H_