| // Copyright 2020 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include <lib/arch/intrin.h> |
| #include <lib/counters.h> |
| #include <lib/fit/defer.h> |
| #include <lib/zircon-internal/macros.h> |
| |
| #include <fbl/ref_counted_upgradeable.h> |
| #include <kernel/auto_preempt_disabler.h> |
| #include <object/thread_dispatcher.h> |
| #include <vm/compression.h> |
| #include <vm/page.h> |
| #include <vm/page_queues.h> |
| #include <vm/pmm.h> |
| #include <vm/scanner.h> |
| #include <vm/stack_owned_loaned_pages_interval.h> |
| #include <vm/vm_cow_pages.h> |
| |
| namespace { |
| |
| KCOUNTER(pq_aging_reason_before_min_timeout, "pq.aging.reason_before_min_timeout") |
| KCOUNTER(pq_aging_spurious_wakeup, "pq.aging.spurious_wakeup") |
| KCOUNTER(pq_aging_reason_timeout, "pq.aging.reason.timeout") |
| KCOUNTER(pq_aging_reason_active_ratio, "pq.aging.reason.active_ratio") |
| KCOUNTER(pq_aging_reason_manual, "pq.aging.reason.manual") |
| KCOUNTER(pq_aging_blocked_on_lru, "pq.aging.blocked_on_lru") |
| KCOUNTER(pq_lru_spurious_wakeup, "pq.lru.spurious_wakeup") |
| KCOUNTER(pq_lru_pages_evicted, "pq.lru.pages_evicted") |
| KCOUNTER(pq_lru_pages_compressed, "pq.lru.pages_compressed") |
| KCOUNTER(pq_lru_pages_discarded, "pq.lru.pages_discarded") |
| KCOUNTER(pq_accessed_normal, "pq.accessed.normal") |
| KCOUNTER(pq_accessed_normal_same_queue, "pq.accessed.normal_same_queue") |
| KCOUNTER(pq_accessed_deferred_count, "pq.accessed.deferred") |
| KCOUNTER(pq_accessed_deferred_count_same_queue, "pq.accessed.deferred_same_queue") |
| |
| } // namespace |
| |
// Helper class for building an isolate list for deferred processing when acting on the LRU queues.
// Pages are added while the page queues lock is held, and processed once the lock is dropped.
// The list is statically sized with the maximum number of items it might need to hold, and it is
// an error to attempt to add more than this many items, since Flush() cannot automatically be
// called due to the incompatible locking requirements between flushing and adding items.
| template <size_t Items> |
| class PageQueues::LruIsolate { |
| public: |
| using LruAction = PageQueues::LruAction; |
| LruIsolate() = default; |
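  // Destruction flushes any remaining entries; like Flush(), this must happen without the
  // PageQueues lock held.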
| ~LruIsolate() { Flush(); } |
  // Sets the LRU action. This allows the object to be constructed without the page queues lock
  // held, whereas setting the LruAction can be done within it.
| void SetLruAction(LruAction lru_action) { lru_action_ = lru_action; } |
| |
| // Adds a page to be potentially replaced with a loaned page. |
| // Requires PageQueues lock to be held |
| void AddLoanReplacement(vm_page_t* page, PageQueues* pq) TA_REQ(pq->get_lock()) { |
| DEBUG_ASSERT(page); |
| DEBUG_ASSERT(!page->is_loaned()); |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| DEBUG_ASSERT(cow); |
| fbl::RefPtr<VmCowPages> cow_ref = fbl::MakeRefPtrUpgradeFromRaw(cow, pq->get_lock()); |
| DEBUG_ASSERT(cow_ref); |
| AddInternal(ktl::move(cow_ref), page, ListAction::ReplaceWithLoaned); |
| } |
| |
  // Add a page to be reclaimed. Actual reclamation will only be done if the LruAction set via
  // SetLruAction is compatible with the page and its VMO owner.
| // Requires PageQueues lock to be held |
| void AddReclaimable(vm_page_t* page, PageQueues* pq) TA_REQ(pq->get_lock()) { |
| DEBUG_ASSERT(page); |
| if (lru_action_ == LruAction::None) { |
| return; |
| } |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| DEBUG_ASSERT(cow); |
| // Need to get the cow refptr before we can check if our lru action is appropriate for this |
| // page. |
| fbl::RefPtr<VmCowPages> cow_ref = fbl::MakeRefPtrUpgradeFromRaw(cow, pq->get_lock()); |
| DEBUG_ASSERT(cow_ref); |
| if (lru_action_ == LruAction::EvictAndCompress || |
| ((cow_ref->can_evict() || cow_ref->is_discardable()) == |
| (lru_action_ == LruAction::EvictOnly))) { |
| AddInternal(ktl::move(cow_ref), page, ListAction::Reclaim); |
| } else { |
      // The cow RefPtr must not be dropped until after the lock is released, so even when not
      // reclaiming we must keep this entry.
| AddInternal(ktl::move(cow_ref), page, ListAction::None); |
| } |
| } |
| |
| // Performs any pending operations on the stored pages. |
| // Requires PageQueues lock NOT be held |
| void Flush() { |
| // Cannot check if the page queues lock specifically is held, but can validate that *no* |
| // spinlocks at all are held, which also needs to be true for us to acquire VMO locks. |
| DEBUG_ASSERT(arch_num_spinlocks_held() == 0); |
    // Compression state will be lazily instantiated if needed, and then used for any remaining
    // pages in the list.
| VmCompression* compression = nullptr; |
| ktl::optional<VmCompression::CompressorGuard> maybe_compressor; |
| VmCompressor* compressor = nullptr; |
| |
| for (size_t i = 0; i < items_; ++i) { |
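      // Move the entry out of the array so that the cow RefPtr held by the backlink is dropped at
      // the end of this loop iteration.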
| auto [backlink, action] = ktl::move(list_[i]); |
| DEBUG_ASSERT(backlink.cow); |
| if (action == ListAction::ReplaceWithLoaned) { |
        // We ignore the return value because the page may have moved or become pinned, we may no
        // longer have any free loaned pages, or the VmCowPages may not be able to borrow.
| backlink.cow->ReplacePageWithLoaned(backlink.page, backlink.offset); |
| } else if (action == ListAction::Reclaim) { |
        // Attempt to acquire any compressor that might exist, unless only evicting. Note that if
        // the action were LruAction::None we would not have enqueued any Reclaim pages, so it is
        // sufficient to check for EvictOnly.
| if (lru_action_ != LruAction::EvictOnly && !compression) { |
| compression = pmm_page_compression(); |
| if (compression) { |
| maybe_compressor.emplace(compression->AcquireCompressor()); |
| compressor = &maybe_compressor->get(); |
| } |
| } |
| // If using a compressor, make sure it is Armed between reclamations. |
| if (compressor) { |
| zx_status_t status = compressor->Arm(); |
| if (status != ZX_OK) { |
| // Continue processing as we might still be able to evict and we need to clear all the |
| // refptrs as well. |
| continue; |
| } |
| } |
| list_node_t freed_list = LIST_INITIAL_VALUE(freed_list); |
| if (uint64_t count = backlink.cow->ReclaimPage(backlink.page, backlink.offset, |
| VmCowPages::EvictionHintAction::Follow, |
| &freed_list, compressor); |
| count > 0) { |
| if (backlink.cow->can_evict()) { |
| pq_lru_pages_evicted.Add(count); |
| } else if (backlink.cow->is_discardable()) { |
| pq_lru_pages_discarded.Add(count); |
| } else { |
| pq_lru_pages_compressed.Add(count); |
| } |
| pmm_free(&freed_list); |
| } |
| } |
| } |
| items_ = 0; |
| } |
| |
| private: |
  // The None action is needed because, to know whether a page can be reclaimed by the current
  // LruAction, a RefPtr to the VMO must first be created. If the page should not be reclaimed
  // the RefPtr must not be dropped until outside the lock, in case it is the last reference. The
  // None action provides a way to retain these RefPtrs and have them dropped outside the lock.
| enum class ListAction { |
| None, |
| ReplaceWithLoaned, |
| Reclaim, |
| }; |
| |
| void AddInternal(fbl::RefPtr<VmCowPages>&& cow, vm_page_t* page, ListAction action) { |
| DEBUG_ASSERT(cow); |
| DEBUG_ASSERT(items_ < list_.size()); |
| if (cow) { |
| list_[items_] = {PageQueues::VmoBacklink{cow, page, page->object.get_page_offset()}, action}; |
| items_++; |
| } |
| } |
| |
| // Cache of the PageQueues LruAction for checking what to do with different reclaimable pages. |
| LruAction lru_action_ = LruAction::None; |
| // List of pages and the actions to perform on them. |
| ktl::array<ktl::pair<PageQueues::VmoBacklink, ListAction>, Items> list_; |
| // Number of items in the list_. |
| size_t items_ = 0; |
| }; |
| |
| // static |
| uint64_t PageQueues::GetLruPagesCompressed() { return pq_lru_pages_compressed.SumAcrossAllCpus(); } |
| |
| PageQueues::PageQueues() |
| : min_mru_rotate_time_(kDefaultMinMruRotateTime), |
| max_mru_rotate_time_(kDefaultMaxMruRotateTime), |
| active_ratio_multiplier_(kDefaultActiveRatioMultiplier) { |
| for (uint32_t i = 0; i < PageQueueNumQueues; i++) { |
| list_initialize(&page_queues_[i]); |
| } |
| list_initialize(&dont_need_processing_list_); |
| } |
| |
| PageQueues::~PageQueues() { |
| StopThreads(); |
| for (uint32_t i = 0; i < PageQueueNumQueues; i++) { |
| DEBUG_ASSERT(list_is_empty(&page_queues_[i])); |
| } |
| for (size_t i = 0; i < page_queue_counts_.size(); i++) { |
| DEBUG_ASSERT_MSG(page_queue_counts_[i] == 0, "i=%zu count=%zu", i, |
| page_queue_counts_[i].load()); |
| } |
| } |
| |
| void PageQueues::StartThreads(zx_duration_t min_mru_rotate_time, |
| zx_duration_t max_mru_rotate_time) { |
| // Clamp the max rotate to the minimum. |
| max_mru_rotate_time = ktl::max(min_mru_rotate_time, max_mru_rotate_time); |
| // Prevent a rotation rate that is too small. |
| max_mru_rotate_time = ktl::max(max_mru_rotate_time, ZX_SEC(1)); |
| |
| min_mru_rotate_time_ = min_mru_rotate_time; |
| max_mru_rotate_time_ = max_mru_rotate_time; |
| |
  // Thread creation cannot be performed under the lock as it requires allocations, so we create
  // the threads in temporaries first and then stash them under the lock.
| Thread* mru_thread = Thread::Create( |
| "page-queue-mru-thread", |
| [](void* arg) -> int { |
| static_cast<PageQueues*>(arg)->MruThread(); |
| return 0; |
| }, |
| this, LOW_PRIORITY); |
| DEBUG_ASSERT(mru_thread); |
| |
| mru_thread->Resume(); |
| |
| Thread* lru_thread = Thread::Create( |
| "page-queue-lru-thread", |
| [](void* arg) -> int { |
| static_cast<PageQueues*>(arg)->LruThread(); |
| return 0; |
| }, |
| this, LOW_PRIORITY); |
| DEBUG_ASSERT(lru_thread); |
| lru_thread->Resume(); |
| |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| ASSERT(!mru_thread_); |
| ASSERT(!lru_thread_); |
| mru_thread_ = mru_thread; |
| lru_thread_ = lru_thread; |
| } |
| |
| void PageQueues::StartDebugCompressor() { |
  // The debug compressor should not be enabled without debug asserts, as all usages of the debug
  // compressor are guarded with compile time checks so that it cannot impact the performance of
  // release builds.
| ASSERT(DEBUG_ASSERT_IMPLEMENTED); |
| #if DEBUG_ASSERT_IMPLEMENTED |
| fbl::AllocChecker ac; |
| ktl::unique_ptr<VmDebugCompressor> dc(new (&ac) VmDebugCompressor); |
| if (!ac.check()) { |
| panic("Failed to allocate VmDebugCompressor"); |
| } |
| zx_status_t status = dc->Init(); |
| ASSERT(status == ZX_OK); |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| debug_compressor_ = ktl::move(dc); |
| #endif |
| } |
| |
| void PageQueues::StopThreads() { |
| // Cannot wait for threads to complete with the lock held, so update state and then perform any |
| // joins outside the lock. |
| Thread* mru_thread = nullptr; |
| Thread* lru_thread = nullptr; |
| |
| { |
| DeferPendingSignals dps{*this}; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| shutdown_threads_ = true; |
| if (aging_disabled_.exchange(false)) { |
| dps.Pend(PendingSignal::AgingToken); |
| } |
| dps.Pend(PendingSignal::AgingActiveRatioEvent); |
| dps.Pend(PendingSignal::LruEvent); |
| mru_thread = mru_thread_; |
| lru_thread = lru_thread_; |
| } |
| } |
| |
| int retcode; |
| if (mru_thread) { |
| zx_status_t status = mru_thread->Join(&retcode, ZX_TIME_INFINITE); |
| ASSERT(status == ZX_OK); |
| } |
| if (lru_thread) { |
| zx_status_t status = lru_thread->Join(&retcode, ZX_TIME_INFINITE); |
| ASSERT(status == ZX_OK); |
| } |
| } |
| |
| void PageQueues::SetLruAction(LruAction action) { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| lru_action_ = action; |
| } |
| |
| void PageQueues::SetActiveRatioMultiplier(uint32_t multiplier) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| active_ratio_multiplier_ = multiplier; |
| // The change in multiplier might have caused us to need to age. |
| MaybeSignalActiveRatioAgingLocked(dps); |
| } |
| |
| void PageQueues::MaybeSignalActiveRatioAgingLocked(DeferPendingSignals& dps) { |
| if (active_ratio_triggered_) { |
| // Already triggered, nothing more to do. |
| return; |
| } |
| if (IsActiveRatioTriggeringAging()) { |
| active_ratio_triggered_ = true; |
| dps.Pend(PendingSignal::AgingActiveRatioEvent); |
| } |
| } |
| |
| bool PageQueues::IsActiveRatioTriggeringAging() const { |
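  // Aging is wanted once the active count, scaled by the multiplier, exceeds the inactive count.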
| ActiveInactiveCounts active_count = GetActiveInactiveCountsLocked(); |
| return active_count.active * active_ratio_multiplier_ > active_count.inactive; |
| } |
| |
| ktl::variant<PageQueues::AgeReason, zx_time_t> PageQueues::ConsumeAgeReason() { |
| AutoPreemptDisabler apd; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| auto reason = GetAgeReasonLocked(); |
| // If the age reason is the active ratio, consume the trigger. |
| if (const AgeReason* age_reason = ktl::get_if<AgeReason>(&reason)) { |
| no_pending_aging_signal_.Unsignal(); |
| if (*age_reason == AgeReason::ActiveRatio) { |
| active_ratio_triggered_ = false; |
| aging_active_ratio_event_.Unsignal(); |
| } |
| } else { |
| no_pending_aging_signal_.Signal(); |
| } |
| return reason; |
| } |
| |
| void PageQueues::SynchronizeWithAging() { |
| while (true) { |
| // Wait for any in progress aging to complete. This is not an Autounsignal event and so waiting |
| // on it without the lock is not manipulating its state. |
| no_pending_aging_signal_.Wait(); |
| |
| // The MruThread may not have woken up yet to clear the pending signal, so we must check |
| // ourselves. |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| if (!ktl::holds_alternative<AgeReason>(GetAgeReasonLocked())) { |
| // There is no aging reason, so there is no race to worry about, and no aging can be in |
| // progress. |
| return; |
| } |
    // We may have raced with the MruThread. Either it has already seen that there is an AgeReason
    // and cleared this signal, or it is still pending to be scheduled and clear it. If it already
    // cleared it, then clearing it again here is harmless; if it is still waiting to run, then by
    // clearing it we can Wait on the event, knowing that once the MruThread finishes performing
    // aging it will set the signal.
| // Since we hold the lock, and know there is an age reason, we know that we are not racing with |
| // the signal being set, and so cannot lose a signal here. |
| no_pending_aging_signal_.Unsignal(); |
| } |
| } |
| |
| ktl::variant<PageQueues::AgeReason, zx_time_t> PageQueues::GetAgeReasonLocked() const { |
| const zx_time_t current = current_time(); |
| // Check if there is an active ratio that wants us to age. |
| if (active_ratio_triggered_) { |
| // Need to have passed the min time though. |
| const zx_time_t min_timeout = |
| zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed), min_mru_rotate_time_); |
| if (current < min_timeout) { |
| return min_timeout; |
| } |
| // At least min time has elapsed, can age via active ratio. |
| return AgeReason::ActiveRatio; |
| } |
| |
| // Exceeding the maximum time forces aging. |
| const zx_time_t max_timeout = |
| zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed), max_mru_rotate_time_); |
| if (max_timeout <= current) { |
| return AgeReason::Timeout; |
| } |
| // With no other reason, we will age once we hit the maximum timeout. |
| return max_timeout; |
| } |
| |
| void PageQueues::MaybeTriggerLruProcessing() { |
| if (NeedsLruProcessing()) { |
| DeferPendingSignals dps{*this}; |
| dps.Pend(PendingSignal::LruEvent); |
| } |
| } |
| |
| bool PageQueues::NeedsLruProcessing() const { |
  // Currently the only reason to trigger lru processing is if the MRU needs space.
  // Performing this unlocked is just as correct as grabbing the lock, reading, and dropping the
  // lock. If a caller needs to know whether the lru queue needs processing *and* then perform an
  // action before that status could change, it should externally hold lock_ over this method and
  // its action.
| if (mru_gen_.load(ktl::memory_order_relaxed) - lru_gen_.load(ktl::memory_order_relaxed) == |
| kNumReclaim - 1) { |
| return true; |
| } |
| return false; |
| } |
| |
| void PageQueues::DisableAging() { |
| // Validate a double DisableAging is not happening. |
| if (aging_disabled_.exchange(true)) { |
| panic("Mismatched disable/enable pair"); |
| } |
| |
| // Take the aging token. This will both wait for the aging thread to complete any in progress |
| // aging, and prevent it from aging until we return it. |
| aging_token_.Wait(); |
| #if DEBUG_ASSERT_IMPLEMENTED |
  // Pause might drop the last reference to a VMO and trigger VMO destruction, which would then call
  // back into the page queues, so we must not hold the lock_ over the operation. We can utilize the
  // fact that once the debug_compressor_ is set it is never destroyed, so we can take a raw pointer
  // to it.
| VmDebugCompressor* dc = nullptr; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| if (debug_compressor_) { |
| dc = &*debug_compressor_; |
| } |
| } |
| if (dc) { |
| dc->Pause(); |
| } |
| #endif |
| } |
| |
| void PageQueues::EnableAging() { |
| DeferPendingSignals dps{*this}; |
| |
| // Validate a double EnableAging is not happening. |
| if (!aging_disabled_.exchange(false)) { |
| panic("Mismatched disable/enable pair"); |
| } |
| |
| // Return the aging token, allowing the aging thread to proceed if it was waiting. |
| dps.Pend(PendingSignal::AgingToken); |
| #if DEBUG_ASSERT_IMPLEMENTED |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| if (debug_compressor_) { |
| debug_compressor_->Resume(); |
| } |
| #endif |
| } |
| |
| const char* PageQueues::string_from_age_reason(PageQueues::AgeReason reason) { |
| switch (reason) { |
| case AgeReason::ActiveRatio: |
| return "Active ratio"; |
| case AgeReason::Timeout: |
| return "Timeout"; |
| case AgeReason::Manual: |
| return "Manual"; |
| default: |
| panic("Unreachable"); |
| } |
| } |
| |
| void PageQueues::Dump() { |
| // Need to grab a copy of all the counts and generations. As the lock is needed to acquire the |
| // active/inactive counts, also hold the lock over the copying of the counts to avoid needless |
| // races. |
| uint64_t mru_gen; |
| uint64_t lru_gen; |
| size_t counts[kNumReclaim] = {}; |
| size_t inactive_count; |
| size_t failed_reclaim; |
| size_t dirty; |
| zx_time_t last_age_time; |
| AgeReason last_age_reason; |
| ActiveInactiveCounts activeinactive; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| mru_gen = mru_gen_.load(ktl::memory_order_relaxed); |
| lru_gen = lru_gen_.load(ktl::memory_order_relaxed); |
| failed_reclaim = page_queue_counts_[PageQueueFailedReclaim].load(ktl::memory_order_relaxed); |
| inactive_count = page_queue_counts_[PageQueueReclaimDontNeed].load(ktl::memory_order_relaxed); |
| dirty = page_queue_counts_[PageQueuePagerBackedDirty].load(ktl::memory_order_relaxed); |
| for (uint32_t i = 0; i < kNumReclaim; i++) { |
| counts[i] = page_queue_counts_[PageQueueReclaimBase + i].load(ktl::memory_order_relaxed); |
| } |
| activeinactive = GetActiveInactiveCountsLocked(); |
| last_age_time = last_age_time_.load(ktl::memory_order_relaxed); |
| last_age_reason = last_age_reason_; |
| } |
| // Small arbitrary number that should be more than large enough to hold the constructed string |
| // without causing stack allocation pressure. |
| constexpr size_t kBufSize = 50; |
| // Start with the buffer null terminated. snprintf will always keep it null terminated. |
| char buf[kBufSize] __UNINITIALIZED = "\0"; |
| size_t buf_len = 0; |
| // This formats the counts of all buckets, not just those within the mru->lru range, even though |
| // any buckets not in that range should always have a count of zero. The format this generates is |
| // [active],[active],inactive,inactive,{last inactive},should-be-zero,should-be-zero |
| // Although the inactive and should-be-zero use the same formatting, they are broken up by the |
| // {last inactive}. |
| for (uint64_t i = 0; i < kNumReclaim; i++) { |
| PageQueue queue = gen_to_queue(mru_gen - i); |
| ASSERT(buf_len < kBufSize); |
| const size_t remain = kBufSize - buf_len; |
| int write_len; |
| if (i < kNumActiveQueues) { |
| write_len = snprintf(buf + buf_len, remain, "[%zu],", counts[queue - PageQueueReclaimBase]); |
| } else if (i == mru_gen - lru_gen) { |
| write_len = snprintf(buf + buf_len, remain, "{%zu},", counts[queue - PageQueueReclaimBase]); |
| } else { |
| write_len = snprintf(buf + buf_len, remain, "%zu,", counts[queue - PageQueueReclaimBase]); |
| } |
| // Negative values are returned on encoding errors, which we never expect to get. |
| ASSERT(write_len >= 0); |
| if (static_cast<uint>(write_len) >= remain) { |
| // Buffer too small, just use whatever we have constructed so far. |
| break; |
| } |
| buf_len += write_len; |
| } |
| zx_time_t current = current_time(); |
| timespec age_time = zx_timespec_from_duration(zx_time_sub_time(current, last_age_time)); |
| printf("pq: MRU generation is %" PRIu64 |
| " set %ld.%lds ago due to \"%s\", LRU generation is %" PRIu64 "\n", |
| mru_gen, age_time.tv_sec, age_time.tv_nsec, string_from_age_reason(last_age_reason), |
| lru_gen); |
| printf("pq: Pager buckets %s evict first: %zu\n", buf, inactive_count); |
| printf("pq: %s active/inactive totals: %zu/%zu dirty: %zu failed reclaim: %zu\n", |
| activeinactive.cached ? "cached" : "live", activeinactive.active, activeinactive.inactive, |
| dirty, failed_reclaim); |
| } |
| |
| // This runs the aging thread. Aging, unlike lru processing, scanning or eviction, requires very |
| // little work and is more about coordination. As such this thread is heavy on checks and signalling |
| // but generally only needs to hold any locks for the briefest of times. |
// There is, currently, one exception to that: the calls to scanner_wait_for_accessed_scan. The
// scanner will, eventually, be a separate thread that is merely synchronized with, but presently a
// full scan may happen inline in that method call, and get attributed directly to this thread.
| void PageQueues::MruThread() { |
| // Pretend that aging happens during startup to simplify the rest of the loop logic. |
| last_age_time_ = current_time(); |
| unsigned int iterations_since_last_age = 0; |
| while (!shutdown_threads_.load(ktl::memory_order_relaxed)) { |
    // Normally we should retry the loop at most once (i.e. pass this line of code twice): this
    // happens if an active ratio was triggered (kicking us out of the event) but we still needed
    // to wait for the min timeout. In that case, on the first pass we do not get an age reason,
    // wake up on the event and then perform the Sleep, come back around the loop, and can now get
    // an age reason.
| // |
    // Unfortunately, due to the way DeferredPendingSignals works, there is a race where a thread
    // can set `active_ratio_triggered_`, but fail to actually signal the event before being
    // preempted. It is possible for us to then call ConsumeAgeReason and perform the aging without
    // waiting on the event. At some later point that first thread could finally deliver the
    // signal, spuriously waking us up. In the worst case there could be multiple threads queued up
    // in this state, delivering an unbounded number of late signals. This is extremely unlikely
    // and would require some precise scheduling behavior. Nevertheless it is technically possible,
    // so we just print a warning that it has happened and do not generate any errors.
| if (iterations_since_last_age == 10) { |
| printf("%s iterated %u times, possible bug or overloaded system", __FUNCTION__, |
| iterations_since_last_age); |
| } |
    // Check if there is an age reason waiting for us, consuming it if there is, or find out if we
    // need to wait.
| auto reason_or_timeout = ConsumeAgeReason(); |
| if (const zx_time_t* age_deadline = ktl::get_if<zx_time_t>(&reason_or_timeout)) { |
| // Wait for this time, ensuring we wake up if the active ratio should change. |
| zx_status_t result = aging_active_ratio_event_.WaitDeadline(*age_deadline, Interruptible::No); |
      // Check if shutdown has been requested; we need this extra check even though it is part of
| // the main loop check to ensure that we do not perform the minimal rotate time sleep with a |
| // shutdown pending. |
| if (shutdown_threads_.load(ktl::memory_order_relaxed)) { |
| break; |
| } |
| if (result != ZX_ERR_TIMED_OUT) { |
        // We might have woken up too early, so ensure we have passed the minimal timeout. If the
        // timeout was already passed and we legitimately woke up due to an active ratio event,
        // then this sleep will short-circuit internally and immediately return.
| Thread::Current::Sleep(zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed), |
| min_mru_rotate_time_)); |
| } |
| // Due to races, there may or may not be an age reason at this point, so go back around the |
| // loop and find out, counting how many times we go around. |
| iterations_since_last_age++; |
| continue; |
| } |
| AgeReason age_reason = ktl::get<AgeReason>(reason_or_timeout); |
| |
| if (iterations_since_last_age == 0) { |
| // If we did zero iterations then this means there was an age_reason waiting for us, meaning |
      // the min rotation time had already elapsed. This is not an error, but implies that the
      // aging thread is running behind.
| pq_aging_reason_before_min_timeout.Add(1); |
| } else if (iterations_since_last_age > 1) { |
| // Typically a single iteration is expected as we might fail ConsumeAgeReason once due to |
| // needing to wait for a timeout. However, due to DeferredPendingSignals, there could be |
| // additional spurious wakeups (see comment at the top of the loop). This does not necessarily |
| // mean there is an error, but implies that other threads are running badly behind. |
| pq_aging_spurious_wakeup.Add(iterations_since_last_age - 1); |
| } |
| iterations_since_last_age = 0; |
| |
    // Take the aging token, potentially blocking if aging is disabled, and make sure to return it
    // when we are done.
| aging_token_.Wait(); |
| DeferPendingSignals dps{*this}; |
| dps.Pend(PendingSignal::AgingToken); |
| |
| // Make sure the accessed information has been harvested since the last time we aged, otherwise |
| // we are deliberately making the age information coarser, by effectively not using one of the |
| // queues, at which point we might as well not have bothered rotating. |
| // Currently this is redundant since we will explicitly harvest just after aging, however once |
| // there are additional aging triggers and harvesting is more asynchronous, this will serve as |
| // a synchronization point. |
| scanner_wait_for_accessed_scan(last_age_time_, true); |
| |
| RotateReclaimQueues(age_reason); |
| |
| // Changing mru_gen_ could have impacted the eviction logic. |
| MaybeTriggerLruProcessing(); |
| } |
| } |
| |
| // This thread should, at some point, have some of its logic and signaling merged with the Evictor. |
| // Currently it might process the lru queue whilst the evictor is already trying to evict, which is |
| // not harmful but it's a bit wasteful as it doubles the work that happens. |
| // LRU processing, via ProcessDontNeedAndLruQueues, is expensive and happens under the lock_. It is |
// expected that ProcessDontNeedAndLruQueues performs small units of work to avoid this thread
| // causing excessive lock contention. |
| void PageQueues::LruThread() { |
| while (!shutdown_threads_.load(ktl::memory_order_relaxed)) { |
| lru_event_.WaitDeadline(ZX_TIME_INFINITE, Interruptible::No); |
| // Take the lock so we can calculate (race free) a target mru-gen |
| uint64_t target_gen; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| if (!NeedsLruProcessing()) { |
| pq_lru_spurious_wakeup.Add(1); |
| continue; |
| } |
| target_gen = lru_gen_.load(ktl::memory_order_relaxed) + 1; |
| } |
| // With the lock dropped process the target. This is not racy as generations are monotonic, so |
| // worst case someone else already processed this generation and this call will be a no-op. |
| ProcessDontNeedAndLruQueues(target_gen, false); |
| } |
| } |
| |
| void PageQueues::RotateReclaimQueues(AgeReason reason) { |
| VM_KTRACE_DURATION(2, "RotatePagerBackedQueues"); |
| // We expect LRU processing to have already happened, so first poll the mru semaphore. |
| if (mru_semaphore_.Wait(Deadline::infinite_past()) == ZX_ERR_TIMED_OUT) { |
| // We should not have needed to wait for lru processing here, as it should have already been |
    // made available due to earlier triggers. Although this could reasonably happen due to races
    // or delays in scheduling, we record it in a counter since happening regularly could indicate
    // a bug.
| pq_aging_blocked_on_lru.Add(1); |
| |
| MaybeTriggerLruProcessing(); |
| |
| // The LRU thread could take an arbitrary amount of time to get scheduled and run, so we cannot |
| // enforce a deadline. However, we can assume there might be a bug and start making noise to |
| // inform the user if we have waited multiples of the expected maximum aging interval, since |
| // that implies we are starting to lose the requested fidelity of age information. |
| int64_t timeouts = 0; |
| while (mru_semaphore_.Wait(Deadline::after(max_mru_rotate_time_, TimerSlack::none())) == |
| ZX_ERR_TIMED_OUT) { |
| timeouts++; |
| printf("[pq] WARNING: Waited %" PRIi64 " seconds for LRU thread, MRU semaphore %" PRIi64 |
| ", aging is presently stalled\n", |
| (max_mru_rotate_time_ * timeouts) / ZX_SEC(1), mru_semaphore_.count()); |
| Dump(); |
| } |
| } |
| |
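  // Having taken the semaphore there must be room to increment the MRU without its generation
  // catching up to the LRU.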
| ASSERT(mru_gen_.load(ktl::memory_order_relaxed) - lru_gen_.load(ktl::memory_order_relaxed) < |
| kNumReclaim - 1); |
| |
| { |
| // Acquire the lock to increment the mru_gen_. This allows other queue logic to not worry about |
| // mru_gen_ changing whilst they hold the lock. |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| mru_gen_.fetch_add(1, ktl::memory_order_relaxed); |
| last_age_time_ = current_time(); |
| last_age_reason_ = reason; |
| // Update the active/inactive counts. We could be a bit smarter here since we know exactly which |
| // active bucket might have changed, but this will work. |
| RecalculateActiveInactiveLocked(dps); |
| } |
| // Keep a count of the different reasons we have rotated. |
| switch (reason) { |
| case AgeReason::Timeout: |
| pq_aging_reason_timeout.Add(1); |
| break; |
| case AgeReason::ActiveRatio: |
| pq_aging_reason_active_ratio.Add(1); |
| break; |
| case AgeReason::Manual: |
| pq_aging_reason_manual.Add(1); |
| break; |
| default: |
| panic("Unknown age reason"); |
| } |
| } |
| |
| template <size_t Items> |
| ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessLruQueueHelper( |
| LruIsolate<Items>& deferred_list, uint64_t target_gen, bool peek) { |
| VM_KTRACE_DURATION(2, "ProcessQueue"); |
| |
| // Only accumulate pages to try to replace with loaned pages if loaned pages are available and |
| // we're allowed to borrow at this code location. |
| const bool do_sweeping = (pmm_count_loaned_free_pages() != 0) && |
| pmm_physical_page_borrowing_config()->is_borrowing_on_mru_enabled(); |
| |
| DeferPendingSignals dps{*this}; |
| // Ensure the list is empty before we start. |
| deferred_list.Flush(); |
| |
| // Note: we need to make sure that we disable local preemption while we are |
| // holding our local lock. Otherwise, if/when we end up posting to our mru |
| // semaphore, it could result in us triggering a preemption while we are |
| // holding the spinlock, which is not something we can allow. |
| AutoPreemptDisabler apd; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| const PageQueue mru_queue = mru_gen_to_queue(); |
| const uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed); |
| // Fill in the lru action now that the lock is held. |
| deferred_list.SetLruAction(lru_action_); |
| |
| // If we're processing the lru queue and it has already hit the target gen, return early. |
| if (lru >= target_gen) { |
| return ktl::nullopt; |
| } |
| |
| uint32_t work_remain = Items; |
| const PageQueue lru_queue = gen_to_queue(lru); |
| list_node* operating_queue = &page_queues_[lru_queue]; |
| |
| while (!list_is_empty(operating_queue) && work_remain > 0) { |
| work_remain--; |
| // When moving pages around we want to maintain relative page age as far as possible. Therefore, |
    // if forcefully moving pages from LRU to LRU+1 we want all the pages from LRU to appear after
| // those already in LRU+1, as the ones in LRU are older. To achieve this we want to take from |
| // the head of LRU, and place in the tail of LRU+1. |
| // However, if peeking (and not forcefully moving), then we always want to return the oldest |
| // page, which is the tail. For any pages whose stored queue does not match, it is irrelevant |
| // which end we take from as such pages have no meaningful relative ordering. |
| vm_page_t* page = peek ? list_peek_tail_type(operating_queue, vm_page_t, queue_node) |
| : list_peek_head_type(operating_queue, vm_page_t, queue_node); |
| PageQueue page_queue = |
| (PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| DEBUG_ASSERT(page_queue >= PageQueueReclaimBase); |
| |
| // If the queue stored in the page does not match then we want to move it to its correct queue |
| // with the caveat that its queue could be invalid. The queue would be invalid if MarkAccessed |
| // had raced. Should this happen we know that the page is actually *very* old, and so we will |
| // fall back to the case of forcibly changing its age to the new lru gen. |
| if (page_queue != lru_queue && queue_is_valid(page_queue, lru_queue, mru_queue)) { |
| list_delete(&page->queue_node); |
| list_add_head(&page_queues_[page_queue], &page->queue_node); |
| |
| if (do_sweeping && !page->is_loaned() && queue_is_active(page_queue, mru_queue)) { |
| deferred_list.AddLoanReplacement(page, this); |
| } |
| } else if (peek) { |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| uint64_t page_offset = page->object.get_page_offset(); |
| DEBUG_ASSERT(cow); |
| |
      // We may be racing with destruction of the VMO. As we currently hold our lock we know that
      // our back pointer is correct in so far as the VmCowPages has not yet completed running its
      // destructor, so we know it is safe to attempt to upgrade it to a RefPtr. If upgrading
| // fails we assume the page is about to be removed from the page queue once the VMO |
| // destructor gets a chance to run. |
| return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, lock_), page, page_offset}; |
| } else { |
| // Force it into our target queue, don't care about races. If we happened to access it at |
| // the same time then too bad. |
| PageQueue new_queue = gen_to_queue(lru + 1); |
| PageQueue old_queue = (PageQueue)page->object.get_page_queue_ref().exchange(new_queue); |
| DEBUG_ASSERT(old_queue >= PageQueueReclaimBase); |
| |
| page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed); |
| page_queue_counts_[new_queue].fetch_add(1, ktl::memory_order_relaxed); |
| list_delete(&page->queue_node); |
| list_add_tail(&page_queues_[new_queue], &page->queue_node); |
| // We should only have performed this step to move from one inactive bucket to the next, |
| // so there should be no active/inactive count changes needed. |
| DEBUG_ASSERT(!queue_is_active(new_queue, mru_queue)); |
| deferred_list.AddReclaimable(page, this); |
| } |
| } |
| if (list_is_empty(operating_queue)) { |
| lru_gen_.store(lru + 1, ktl::memory_order_relaxed); |
| mru_semaphore_.Post(); |
| } |
| |
| return ktl::nullopt; |
| } |
| |
| ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessDontNeedList(list_node_t* list, |
| bool peek) { |
| // Need to move every page out of the list and either put it back in the regular DontNeed list, |
  // or in its correct queue. If we hit active pages we may need to replace them with loaned pages.
| |
| // Processing the DontNeed queue requires holding the page_queues_ lock_. The only other actions |
| // that require this lock are inserting or removing pages from the page queues. To ensure these |
| // actions can complete in a small bounded time kMaxDeferredWork is chosen to be very small so |
| // that the lock will be regularly dropped. As processing the DontNeed queue is not time critical |
| // and can be somewhat inefficient in its operation we err on the side of doing less work per lock |
| // acquisition. |
| constexpr uint64_t kMaxDeferredWork = 16; |
| // Pages in this list might be replaced with a loaned page, this must be done outside the lock_, |
| // so we accumulate pages and then act after lock_ is released. |
| LruIsolate<kMaxDeferredWork> deferred_list; |
| // Only accumulate pages to try to replace with loaned pages if loaned pages are available and |
| // we're allowed to borrow at this code location. |
| const bool do_sweeping = (pmm_count_loaned_free_pages() != 0) && |
| pmm_physical_page_borrowing_config()->is_borrowing_on_mru_enabled(); |
| |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
  // If not peeking we must be processing the dont_need_processing_list_, otherwise we would loop
  // forever taking items out and placing them back into the same list we are processing.
| DEBUG_ASSERT(peek || list == &dont_need_processing_list_); |
| // Count work done separately to all iterations so we can periodically drop the lock and process |
| // the deferred_list. |
| uint64_t work_done = 0; |
| while (!list_is_empty(list)) { |
| // Take from the tail of the list as that represents the oldest item. That way if |peek| is true |
| // pages will get returned in oldest->newest order. |
| vm_page_t* page = list_remove_tail_type(list, vm_page_t, queue_node); |
| PageQueue page_queue = |
| static_cast<PageQueue>(page->object.get_page_queue_ref().load(ktl::memory_order_relaxed)); |
| // Place in the correct list, preserving age |
| if (page_queue == PageQueueReclaimDontNeed) { |
| // As we removed from the tail we place in the head, that way overall ordering is preserved. |
| list_add_head(&page_queues_[page_queue], &page->queue_node); |
| if (peek) { |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| DEBUG_ASSERT(cow); |
        // We may be racing with destruction of the VMO. As we currently hold our lock we know that
        // our back pointer is correct in so far as the VmCowPages has not yet completed running
        // its destructor, so we know it is safe to attempt to upgrade it to a RefPtr. If upgrading
| // fails we assume the page is about to be removed from the page queue once the VMO |
| // destructor gets a chance to run. |
| return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, lock_), page, |
| page->object.get_page_offset()}; |
| } |
| } else { |
      // The only reason for a page to be in the DontNeed list and have the wrong queue is if it
      // was recently accessed. That means it is active and we can attempt to replace it with a
      // loaned page. As the entire DontNeed queue is processed each time we change the LRU, we
      // know this is a valid page queue that has not yet aged out.
| // We have no way to know the relative age of this page with respect to its target queue, so |
| // the head is as good a place as any to put it. |
| list_add_head(&page_queues_[page_queue], &page->queue_node); |
| if (do_sweeping && !page->is_loaned()) { |
| deferred_list.AddLoanReplacement(page, this); |
| } |
| } |
| work_done++; |
| if (work_done >= kMaxDeferredWork) { |
| // Drop the lock and flush the deferred_list |
| guard.CallUnlocked([&deferred_list]() { deferred_list.Flush(); }); |
| work_done = 0; |
| } |
| } |
| return ktl::nullopt; |
| } |
| |
| ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessDontNeedAndLruQueues(uint64_t target_gen, |
| bool peek) { |
| // This assertion is <=, and not strictly <, since to evict a some queue X, the target must be |
| // X+1. Hence to preserve kNumActiveQueues, we can allow target_gen to become equal to the first |
| // active queue, as this will process all the non-active queues. Although we might refresh our |
| // value for the mru_queue, since the mru_gen_ is monotonic increasing, if this assert passes once |
| // it should continue to be true. |
| ASSERT(target_gen <= mru_gen_.load(ktl::memory_order_relaxed) - (kNumActiveQueues - 1)); |
| |
| { |
| VM_KTRACE_DURATION(2, "ProcessDontNeedQueue"); |
| if (peek) { |
      // When peeking we prefer to grab from the dont_need_processing_list_ first, as its pages are
| // older, or at least were moved to the DontNeed queue further in the past. |
| ktl::optional<VmoBacklink> backlink = ProcessDontNeedList(&dont_need_processing_list_, true); |
| if (backlink != ktl::nullopt) { |
| return backlink; |
| } |
| list_node_t* list = [this]() TA_NO_THREAD_SAFETY_ANALYSIS { |
| return &page_queues_[PageQueueReclaimDontNeed]; |
| }(); |
| backlink = ProcessDontNeedList(list, true); |
| if (backlink != ktl::nullopt) { |
| return backlink; |
| } |
| } else { |
| // If not peeking then we will need to properly process the DontNeed queue, and so we must |
| // take the processing lock and move the existing pages into the processing list. |
| Guard<Mutex> dont_need_processing_guard{&dont_need_processing_lock_}; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| ASSERT(list_is_empty(&dont_need_processing_list_)); |
| list_move(&page_queues_[PageQueueReclaimDontNeed], &dont_need_processing_list_); |
| } |
| ProcessDontNeedList(&dont_need_processing_list_, false); |
| } |
| } |
| |
| // Calculate a truly worst case loop iteration count based on every page being in the LRU |
| // queue and needing to iterate the LRU multiple steps to the target_gen. Instead of reading the |
| // LRU and comparing the target_gen, just add a buffer of the maximum number of page queues. |
| ActiveInactiveCounts active_inactive = GetActiveInactiveCounts(); |
| const uint64_t max_lru_iterations = |
| active_inactive.active + active_inactive.inactive + kNumReclaim; |
| // Loop iteration counting is just for diagnostic purposes. |
| uint64_t loop_iterations = 0; |
| |
| // Processing the LRU queue requires holding the page_queues_ lock_. The only other |
| // actions that require this lock are inserting or removing pages from the page queues. To ensure |
| // these actions can complete in a small bounded time kMaxQueueWork is chosen to be very small so |
| // that the lock will be regularly dropped. As processing the DontNeed/LRU queue is not time |
| // critical and can be somewhat inefficient in its operation we err on the side of doing less work |
| // per lock acquisition. |
| // |
| // Also, we need to limit the number to avoid sweep_to_loaned taking up excessive stack space. |
| static constexpr uint32_t kMaxQueueWork = 16; |
| |
| // Pages in this list might be reclaimed or replaced with a loaned page, depending on the action |
| // specified in deferred_action. Each of these actions must be done outside the lock_, so we |
| // accumulate pages and then act after lock_ is released. |
| // The deferred_list is declared here as it is expensive to construct/destruct and we would like |
| // to reuse it between iterations. |
| LruIsolate<kMaxQueueWork> deferred_list; |
| |
| // Process the lru queue to reach target_gen. |
| while (lru_gen_.load(ktl::memory_order_relaxed) < target_gen) { |
| VM_KTRACE_DURATION(2, "ProcessLruQueue"); |
| if (loop_iterations++ == max_lru_iterations) { |
| printf("[pq]: WARNING: %s exceeded expected max LRU loop iterations %" PRIu64 "\n", |
| __FUNCTION__, max_lru_iterations); |
| } |
| auto optional_backlink = ProcessLruQueueHelper(deferred_list, target_gen, peek); |
| |
| if (optional_backlink != ktl::nullopt) { |
| return optional_backlink; |
| } |
| } |
| |
| return ktl::nullopt; |
| } |
| |
| void PageQueues::UpdateActiveInactiveLocked(PageQueue old_queue, PageQueue new_queue, |
| DeferPendingSignals& dps) { |
  // Short circuit the logic if neither queue is in the reclaimable (active/inactive) range.
| if (!queue_is_reclaim(old_queue) && !queue_is_reclaim(new_queue)) { |
| return; |
| } |
  // This just blindly updates the active/inactive counts. If accessed scanning is happening, and
  // use_cached_queue_counts_ is true, then we could be racing and setting these to garbage
| // values. That's fine as they will never get returned anywhere, and will get reset to correct |
| // values once access scanning completes. |
| PageQueue mru = mru_gen_to_queue(); |
| if (queue_is_active(old_queue, mru)) { |
| active_queue_count_--; |
| } else if (queue_is_inactive(old_queue, mru)) { |
| inactive_queue_count_--; |
| } |
| if (queue_is_active(new_queue, mru)) { |
| active_queue_count_++; |
| } else if (queue_is_inactive(new_queue, mru)) { |
| inactive_queue_count_++; |
| } |
| MaybeSignalActiveRatioAgingLocked(dps); |
| } |
| |
| void PageQueues::MarkAccessedContinued(vm_page_t* page) { |
| // Although we can get called with the zero page, it would not be in a reclaimable queue and so |
| // we should have returned in the MarkAccessed wrapper. |
| DEBUG_ASSERT(page != vm_get_zero_page()); |
| |
| pq_accessed_normal.Add(1); |
| |
| auto queue_ref = page->object.get_page_queue_ref(); |
| |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| |
| // We need to check the current queue to see if it is in the reclaimable range. Between checking |
| // this and updating the queue it could change, however it would only change as a result of |
| // MarkAccessedDeferredCount, which would only move it to another reclaimable queue. No other |
| // change is possible as we are holding lock_. |
| if (queue_ref.load(ktl::memory_order_relaxed) < PageQueueReclaimDontNeed) { |
| return; |
| } |
| |
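  // Move the page to the queue of the current MRU generation, marking it as most recently
  // accessed.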
| PageQueue queue = mru_gen_to_queue(); |
| PageQueue old_queue = (PageQueue)queue_ref.exchange(queue, ktl::memory_order_relaxed); |
| // Double check again that this was previously reclaimable |
| DEBUG_ASSERT(old_queue != PageQueueNone && old_queue >= PageQueueReclaimDontNeed); |
| if (old_queue != queue) { |
| page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed); |
| page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed); |
| UpdateActiveInactiveLocked(old_queue, queue, dps); |
| } else { |
| pq_accessed_normal_same_queue.Add(1); |
| } |
| } |
| |
| void PageQueues::MarkAccessedDeferredCount(vm_page_t* page) { |
  // Ensure that the page queues are returning the cached counts at the moment, otherwise we might
  // race.
| pq_accessed_deferred_count.Add(1); |
| DEBUG_ASSERT(use_cached_queue_counts_.load(ktl::memory_order_relaxed)); |
| auto queue_ref = page->object.get_page_queue_ref(); |
| uint8_t old_gen = queue_ref.load(ktl::memory_order_relaxed); |
| // Between loading the mru_gen and finally storing it in the queue_ref it's possible for our |
| // calculated target_queue to become invalid. This is extremely unlikely as it would require |
| // us to stall for long enough for the lru_gen to pass this point, but if it does happen then |
| // ProcessLruQueues will notice our queue is invalid and correct our age to be that of lru_gen. |
| const uint32_t target_queue = mru_gen_to_queue(); |
| if (old_gen == target_queue) { |
| pq_accessed_deferred_count_same_queue.Add(1); |
| return; |
| } |
| do { |
    // If we ever find old_gen to not be in the active/inactive range then this means the page has
    // either been racily removed from, or was never in, the reclaim queue, in which case we can
    // return as there is nothing to be marked accessed.
| if (!queue_is_reclaim(static_cast<PageQueue>(old_gen))) { |
| return; |
| } |
| } while (!queue_ref.compare_exchange_weak(old_gen, static_cast<uint8_t>(target_queue), |
| ktl::memory_order_relaxed)); |
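  // The page's queue_ref now records the target queue; move its count from the old queue to the
  // new one.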
| page_queue_counts_[old_gen].fetch_sub(1, ktl::memory_order_relaxed); |
| page_queue_counts_[target_queue].fetch_add(1, ktl::memory_order_relaxed); |
| } |
| |
| void PageQueues::SetQueueBacklinkLocked(vm_page_t* page, void* object, uintptr_t page_offset, |
| PageQueue queue, DeferPendingSignals& dps) { |
| DEBUG_ASSERT(page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(!page->is_free()); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| DEBUG_ASSERT(object); |
| DEBUG_ASSERT(!page->object.get_object()); |
| DEBUG_ASSERT(page->object.get_page_offset() == 0); |
| |
| page->object.set_object(object); |
| page->object.set_page_offset(page_offset); |
| |
| DEBUG_ASSERT(page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueNone); |
| page->object.get_page_queue_ref().store(queue, ktl::memory_order_relaxed); |
| list_add_head(&page_queues_[queue], &page->queue_node); |
| page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed); |
| UpdateActiveInactiveLocked(PageQueueNone, queue, dps); |
| } |
| |
| void PageQueues::MoveToQueueLocked(vm_page_t* page, PageQueue queue, DeferPendingSignals& dps) { |
| DEBUG_ASSERT(page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(!page->is_free()); |
| DEBUG_ASSERT(list_in_list(&page->queue_node)); |
| DEBUG_ASSERT(page->object.get_object()); |
| uint32_t old_queue = page->object.get_page_queue_ref().exchange(queue, ktl::memory_order_relaxed); |
| DEBUG_ASSERT(old_queue != PageQueueNone); |
| |
| list_delete(&page->queue_node); |
| list_add_head(&page_queues_[queue], &page->queue_node); |
| page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed); |
| page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed); |
| UpdateActiveInactiveLocked(static_cast<PageQueue>(old_queue), queue, dps); |
| } |
| |
| void PageQueues::SetWired(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| DEBUG_ASSERT(object); |
| SetQueueBacklinkLocked(page, object, page_offset, PageQueueWired, dps); |
| } |
| |
| void PageQueues::MoveToWired(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked(page, PageQueueWired, dps); |
| } |
| |
| void PageQueues::SetAnonymous(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| DEBUG_ASSERT(object); |
| SetQueueBacklinkLocked(page, object, page_offset, |
| anonymous_is_reclaimable_ ? mru_gen_to_queue() : PageQueueAnonymous, dps); |
| #if DEBUG_ASSERT_IMPLEMENTED |
| if (debug_compressor_) { |
| debug_compressor_->Add(page, object, page_offset); |
| } |
| #endif |
| } |
| |
| void PageQueues::SetHighPriority(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| DEBUG_ASSERT(object); |
| SetQueueBacklinkLocked(page, object, page_offset, PageQueueHighPriority, dps); |
| } |
| |
| void PageQueues::MoveToHighPriority(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked(page, PageQueueHighPriority, dps); |
| } |
| |
| void PageQueues::MoveToAnonymous(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked(page, anonymous_is_reclaimable_ ? mru_gen_to_queue() : PageQueueAnonymous, dps); |
| #if DEBUG_ASSERT_IMPLEMENTED |
| if (debug_compressor_) { |
| debug_compressor_->Add(page, reinterpret_cast<VmCowPages*>(page->object.get_object()), |
| page->object.get_page_offset()); |
| } |
| #endif |
| } |
| |
| void PageQueues::SetReclaim(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| DEBUG_ASSERT(object); |
| SetQueueBacklinkLocked(page, object, page_offset, mru_gen_to_queue(), dps); |
| } |
| |
| void PageQueues::MoveToReclaim(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked(page, mru_gen_to_queue(), dps); |
| } |
| |
| void PageQueues::MoveToReclaimDontNeed(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked(page, PageQueueReclaimDontNeed, dps); |
| } |
| |
| void PageQueues::SetPagerBackedDirty(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| DEBUG_ASSERT(object); |
| SetQueueBacklinkLocked(page, object, page_offset, PageQueuePagerBackedDirty, dps); |
| } |
| |
| void PageQueues::MoveToPagerBackedDirty(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked(page, PageQueuePagerBackedDirty, dps); |
| } |
| |
| void PageQueues::SetAnonymousZeroFork(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| SetQueueBacklinkLocked( |
| page, object, page_offset, |
| zero_fork_is_reclaimable_ ? mru_gen_to_queue() : PageQueueAnonymousZeroFork, dps); |
| #if DEBUG_ASSERT_IMPLEMENTED |
| if (debug_compressor_) { |
| debug_compressor_->Add(page, object, page_offset); |
| } |
| #endif |
| } |
| |
| void PageQueues::MoveToAnonymousZeroFork(vm_page_t* page) { |
| // The common case is that the |page| being moved was previously placed into the anonymous queue. |
| // If the zero fork queue is reclaimable, then most likely so is the anonymous queue, and so this |
| // move would be a no-op. As this case is common it is worth doing this quick check to |
| // short-circuit. |
| if (zero_fork_is_reclaimable_ && |
| queue_is_reclaim(static_cast<PageQueue>( |
| page->object.get_page_queue_ref().load(ktl::memory_order_relaxed)))) { |
| return; |
| } |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| MoveToQueueLocked( |
| page, zero_fork_is_reclaimable_ ? mru_gen_to_queue() : PageQueueAnonymousZeroFork, dps); |
| #if DEBUG_ASSERT_IMPLEMENTED |
| if (debug_compressor_) { |
| debug_compressor_->Add(page, reinterpret_cast<VmCowPages*>(page->object.get_object()), |
| page->object.get_page_offset()); |
| } |
| #endif |
| } |
| |
| void PageQueues::CompressFailed(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
  // Move the page if it is currently in some kind of reclaimable queue.
| if (queue_is_reclaim(static_cast<PageQueue>( |
| page->object.get_page_queue_ref().load(ktl::memory_order_relaxed)))) { |
| MoveToQueueLocked(page, PageQueueFailedReclaim, dps); |
| } |
| } |
| |
| void PageQueues::ChangeObjectOffset(vm_page_t* page, VmCowPages* object, uint64_t page_offset) { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| ChangeObjectOffsetLocked(page, object, page_offset); |
| } |
| |
| void PageQueues::ChangeObjectOffsetLocked(vm_page_t* page, VmCowPages* object, |
| uint64_t page_offset) { |
| DEBUG_ASSERT(page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(!page->is_free()); |
| DEBUG_ASSERT(list_in_list(&page->queue_node)); |
| DEBUG_ASSERT(object); |
| DEBUG_ASSERT(page->object.get_object()); |
| page->object.set_object(object); |
| page->object.set_page_offset(page_offset); |
| } |
| |
| void PageQueues::RemoveLocked(vm_page_t* page, DeferPendingSignals& dps) { |
| // Directly exchange the old gen. |
| uint32_t old_queue = |
| page->object.get_page_queue_ref().exchange(PageQueueNone, ktl::memory_order_relaxed); |
| DEBUG_ASSERT(old_queue != PageQueueNone); |
| page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed); |
| UpdateActiveInactiveLocked((PageQueue)old_queue, PageQueueNone, dps); |
| page->object.clear_object(); |
| page->object.set_page_offset(0); |
| list_delete(&page->queue_node); |
| } |
| |
| void PageQueues::Remove(vm_page_t* page) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| RemoveLocked(page, dps); |
| } |
| |
| void PageQueues::RemoveArrayIntoList(vm_page_t** pages, size_t count, list_node_t* out_list) { |
| DEBUG_ASSERT(pages); |
| DeferPendingSignals dps{*this}; |
| |
| for (size_t i = 0; i < count;) { |
| // Don't process more than kMaxBatchSize pages while holding the lock. |
| // Instead, drop out of the lock and let other operations proceed before |
| // picking the lock up again and resuming. |
| size_t end = i + ktl::min(count - i, kMaxBatchSize); |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| for (; i < end; i++) { |
| DEBUG_ASSERT(pages[i]); |
| RemoveLocked(pages[i], dps); |
| list_add_tail(out_list, &pages[i]->queue_node); |
| } |
| } |
| |
| // If we are not done yet, relax the CPU a bit just to let someone else have |
| // a chance at grabbing the spinlock. |
| // |
| // TODO(johngro): Once our spinlocks have been updated to be more fair |
| // (ticket locks, MCS locks, whatever), come back here and remove this |
| // pessimistic cpu relax. |
| if (i < count) { |
| arch::Yield(); |
| } |
| } |
| } |
| |
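| // BeginAccessScan snapshots the current active/inactive totals and switches readers over to the
| // cached values; the matching EndAccessScan switches back and recomputes the real totals. This
| // keeps the counts reported by GetActiveInactiveCounts stable while an access scan is running.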
| void PageQueues::BeginAccessScan() { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| ASSERT(!use_cached_queue_counts_.load(ktl::memory_order_relaxed)); |
| cached_active_queue_count_ = active_queue_count_; |
| cached_inactive_queue_count_ = inactive_queue_count_; |
| use_cached_queue_counts_.store(true, ktl::memory_order_relaxed); |
| } |
| |
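| // Recomputes the active/inactive totals from the per-queue counts: the reclaim generations
| // between lru and mru are split by queue_is_active/queue_is_inactive, the DontNeed queue counts
| // as inactive, and aging may be signalled if the new active ratio calls for it.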
| void PageQueues::RecalculateActiveInactiveLocked(DeferPendingSignals& dps) { |
| uint64_t active = 0; |
| uint64_t inactive = 0; |
| |
| uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed); |
| uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed); |
| |
| for (uint64_t index = lru; index <= mru; index++) { |
| uint64_t count = page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed); |
| if (queue_is_active(gen_to_queue(index), gen_to_queue(mru))) { |
| active += count; |
| } else { |
| // As we are only operating on reclaimable queues, !active should imply inactive |
| DEBUG_ASSERT(queue_is_inactive(gen_to_queue(index), gen_to_queue(mru))); |
| inactive += count; |
| } |
| } |
| inactive += page_queue_counts_[PageQueueReclaimDontNeed].load(ktl::memory_order_relaxed); |
| |
| // Update the counts. |
| active_queue_count_ = active; |
| inactive_queue_count_ = inactive; |
| |
| // New counts might mean we need to age. |
| MaybeSignalActiveRatioAgingLocked(dps); |
| } |
| |
| void PageQueues::EndAccessScan() { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| |
| ASSERT(use_cached_queue_counts_.load(ktl::memory_order_relaxed)); |
| |
| // First clear the cached counts. Although the uncached counts aren't correct yet, we hold the
| // lock so no one can observe them in the meantime.
| cached_active_queue_count_ = 0; |
| cached_inactive_queue_count_ = 0; |
| use_cached_queue_counts_.store(false, ktl::memory_order_relaxed); |
| |
| RecalculateActiveInactiveLocked(dps); |
| } |
| |
| PageQueues::ReclaimCounts PageQueues::GetReclaimQueueCounts() const { |
| ReclaimCounts counts; |
| |
| // Grab the lock to prevent LRU processing; this lets us get a slightly less racy snapshot of
| // the queue counts, although we may still double count pages that move after we count them.
| // Specifically, any parallel callers of MarkAccessed could move a page and change the counts,
| // causing us to either double count or miss that page. As these counts are not load-bearing,
| // we accept the very small chance of being off by a few pages.
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed); |
| uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed); |
| |
| counts.total = 0; |
| for (uint64_t index = lru; index <= mru; index++) { |
| uint64_t count = page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed); |
| // Distance to the MRU, and not the LRU, determines the bucket the count goes into. This is to |
| // match the logic in PeekPagerBacked, which is also based on distance to MRU. |
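| // As an illustration (the constants' actual values live in the header): if kNumActiveQueues
| // were 2, only the mru and mru-1 generations would count as |newest|, while |oldest| captures
| // the kNumOldestQueues oldest possible generations, measured from the mru rather than the lru.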
| if (index > mru - kNumActiveQueues) { |
| counts.newest += count; |
| } else if (index <= mru - (kNumReclaim - kNumOldestQueues)) { |
| counts.oldest += count; |
| } |
| counts.total += count; |
| } |
| // Account the DontNeed queue length under |oldest|, since (DontNeed + oldest LRU) pages are |
| // eligible for reclamation first. |oldest| is meant to track pages eligible for eviction first. |
| uint64_t inactive_count = |
| page_queue_counts_[PageQueueReclaimDontNeed].load(ktl::memory_order_relaxed); |
| counts.oldest += inactive_count; |
| counts.total += inactive_count; |
| return counts; |
| } |
| |
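| // Returns a racy snapshot of every per-queue count. Reclaim queues are reported by age:
| // counts.reclaim[0] is the mru generation and counts.reclaim[mru - lru] the oldest populated
| // generation.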
| PageQueues::Counts PageQueues::QueueCounts() const { |
| Counts counts = {}; |
| |
| // Grab the lock to prevent LRU processing; this lets us get a slightly less racy snapshot of
| // the queue counts. We may still double count pages that move after we count them.
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed); |
| uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed); |
| |
| for (uint64_t index = lru; index <= mru; index++) { |
| counts.reclaim[mru - index] = |
| page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed); |
| } |
| counts.reclaim_dont_need = |
| page_queue_counts_[PageQueueReclaimDontNeed].load(ktl::memory_order_relaxed); |
| counts.pager_backed_dirty = |
| page_queue_counts_[PageQueuePagerBackedDirty].load(ktl::memory_order_relaxed); |
| counts.anonymous = page_queue_counts_[PageQueueAnonymous].load(ktl::memory_order_relaxed); |
| counts.wired = page_queue_counts_[PageQueueWired].load(ktl::memory_order_relaxed); |
| counts.anonymous_zero_fork = |
| page_queue_counts_[PageQueueAnonymousZeroFork].load(ktl::memory_order_relaxed); |
| counts.failed_reclaim = |
| page_queue_counts_[PageQueueFailedReclaim].load(ktl::memory_order_relaxed); |
| counts.high_priority = page_queue_counts_[PageQueueHighPriority].load(ktl::memory_order_relaxed); |
| return counts; |
| } |
| |
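| // Debug helper: under the lock, checks that |page| currently sits in one of the reclaim queues
| // (optionally reporting its age relative to the mru), upgrades the page's backlink to a
| // VmCowPages ref, then runs |validator| on that ref outside the lock. DebugPageIsSpecificQueue
| // below does the same for one explicit queue.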
| template <typename F> |
| bool PageQueues::DebugPageIsSpecificReclaim(const vm_page_t* page, F validator, |
| size_t* queue) const { |
| fbl::RefPtr<VmCowPages> cow_pages; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| PageQueue q = (PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| if (q < PageQueueReclaimBase || q > PageQueueReclaimLast) { |
| return false; |
| } |
| if (queue) { |
| *queue = queue_age(q, mru_gen_to_queue()); |
| } |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| DEBUG_ASSERT(cow); |
| cow_pages = fbl::MakeRefPtrUpgradeFromRaw(cow, guard); |
| DEBUG_ASSERT(cow_pages); |
| } |
| return validator(cow_pages); |
| } |
| |
| template <typename F> |
| bool PageQueues::DebugPageIsSpecificQueue(const vm_page_t* page, PageQueue queue, |
| F validator) const { |
| fbl::RefPtr<VmCowPages> cow_pages; |
| { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| PageQueue q = (PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| if (q != queue) { |
| return false; |
| } |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| DEBUG_ASSERT(cow); |
| cow_pages = fbl::MakeRefPtrUpgradeFromRaw(cow, guard); |
| DEBUG_ASSERT(cow_pages); |
| } |
| return validator(cow_pages); |
| } |
| |
| bool PageQueues::DebugPageIsReclaim(const vm_page_t* page, size_t* queue) const { |
| return DebugPageIsSpecificReclaim(page, [](auto cow) { return cow->can_evict(); }, queue); |
| } |
| |
| bool PageQueues::DebugPageIsReclaimDontNeed(const vm_page_t* page) const { |
| return DebugPageIsSpecificQueue(page, PageQueueReclaimDontNeed, |
| [](auto cow) { return cow->can_evict(); }); |
| } |
| |
| bool PageQueues::DebugPageIsPagerBackedDirty(const vm_page_t* page) const { |
| return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == |
| PageQueuePagerBackedDirty; |
| } |
| |
| bool PageQueues::DebugPageIsAnonymous(const vm_page_t* page) const { |
| if (ReclaimIsOnlyPagerBacked()) { |
| return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueAnonymous; |
| } |
| return DebugPageIsSpecificReclaim(page, [](auto cow) { return !cow->can_evict(); }, nullptr); |
| } |
| |
| bool PageQueues::DebugPageIsWired(const vm_page_t* page) const { |
| return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueWired; |
| } |
| |
| bool PageQueues::DebugPageIsHighPriority(const vm_page_t* page) const { |
| return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueHighPriority; |
| } |
| |
| bool PageQueues::DebugPageIsAnonymousZeroFork(const vm_page_t* page) const { |
| if (ReclaimIsOnlyPagerBacked()) { |
| return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == |
| PageQueueAnonymousZeroFork; |
| } |
| return DebugPageIsSpecificReclaim(page, [](auto cow) { return !cow->can_evict(); }, nullptr); |
| } |
| |
| bool PageQueues::DebugPageIsAnyAnonymous(const vm_page_t* page) const { |
| return DebugPageIsAnonymous(page) || DebugPageIsAnonymousZeroFork(page); |
| } |
| |
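| // Pops the oldest (tail) page of the anonymous zero-fork queue, moves it to the regular
| // anonymous queue so that it will not be returned again, and hands back a backlink to its owning
| // VmCowPages. Returns nullopt if the queue is empty.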
| ktl::optional<PageQueues::VmoBacklink> PageQueues::PopAnonymousZeroFork() { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| |
| vm_page_t* page = |
| list_peek_tail_type(&page_queues_[PageQueueAnonymousZeroFork], vm_page_t, queue_node); |
| if (!page) { |
| return ktl::nullopt; |
| } |
| |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| uint64_t page_offset = page->object.get_page_offset(); |
| DEBUG_ASSERT(cow); |
| MoveToQueueLocked(page, PageQueueAnonymous, dps); |
| |
| return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, guard), page, page_offset}; |
| } |
| |
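| // Attempts to find a reclaim candidate that is at least |lowest_queue| queues old (measured as
| // distance from the mru) by processing the DontNeed and LRU queues. If nothing is found, it
| // synchronizes with aging and retries exactly once.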
| ktl::optional<PageQueues::VmoBacklink> PageQueues::PeekReclaim(size_t lowest_queue) { |
| // Ignore any requests to evict from the active queues as this is never allowed. |
| lowest_queue = ktl::max(lowest_queue, kNumActiveQueues); |
| // The target gen is 1 larger than the lowest queue because evicting from queue X is done by |
| // attempting to make the lru queue be X+1. |
| ktl::optional<VmoBacklink> result = ProcessDontNeedAndLruQueues( |
| mru_gen_.load(ktl::memory_order_relaxed) - (lowest_queue - 1), true); |
| if (!result) { |
| SynchronizeWithAging(); |
| result = ProcessDontNeedAndLruQueues( |
| mru_gen_.load(ktl::memory_order_relaxed) - (lowest_queue - 1), true); |
| } |
| return result; |
| } |
| |
| PageQueues::ActiveInactiveCounts PageQueues::GetActiveInactiveCounts() const { |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| return GetActiveInactiveCountsLocked(); |
| } |
| |
| PageQueues::ActiveInactiveCounts PageQueues::GetActiveInactiveCountsLocked() const { |
| if (use_cached_queue_counts_.load(ktl::memory_order_relaxed)) { |
| return ActiveInactiveCounts{.cached = true, |
| .active = cached_active_queue_count_, |
| .inactive = cached_inactive_queue_count_}; |
| } else { |
| // With use_cached_queue_counts_ false, the counts should have been updated to remove any
| // negative values that might have been caused by races.
| ASSERT(active_queue_count_ >= 0); |
| ASSERT(inactive_queue_count_ >= 0); |
| return ActiveInactiveCounts{.cached = false, |
| .active = static_cast<uint64_t>(active_queue_count_), |
| .inactive = static_cast<uint64_t>(inactive_queue_count_)}; |
| } |
| } |
| |
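| // Marks anonymous pages (and, when |zero_forks| is set, anonymous zero-fork pages) as
| // reclaimable, then migrates any pages already sitting in those queues into the current mru
| // reclaim queue.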
| void PageQueues::EnableAnonymousReclaim(bool zero_forks) { |
| DeferPendingSignals dps{*this}; |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| anonymous_is_reclaimable_ = true; |
| zero_fork_is_reclaimable_ = zero_forks; |
| |
| const PageQueue mru_queue = mru_gen_to_queue(); |
| |
| // Migrate any existing pages into the reclaimable queues. |
| |
| while (!list_is_empty(&page_queues_[PageQueueAnonymous])) { |
| vm_page_t* page = list_peek_head_type(&page_queues_[PageQueueAnonymous], vm_page_t, queue_node); |
| MoveToQueueLocked(page, mru_queue, dps); |
| } |
| while (zero_forks && !list_is_empty(&page_queues_[PageQueueAnonymousZeroFork])) { |
| vm_page_t* page = |
| list_peek_head_type(&page_queues_[PageQueueAnonymousZeroFork], vm_page_t, queue_node); |
| MoveToQueueLocked(page, mru_queue, dps); |
| } |
| } |
| |
| ktl::optional<PageQueues::VmoBacklink> PageQueues::GetCowWithReplaceablePage( |
| vm_page_t* page, VmCowPages* owning_cow) { |
| // Wait for the page to not be in a transient state. This is in a loop, since the wait happens |
| // outside the lock, so another thread doing commit/decommit on owning_cow can cause the page |
| // state to change, potentially multiple times. |
| // |
| // While another thread that is concurrently committing/decommitting this page to/from
| // owning_cow, or moving the page from one VmCowPages to another without going through FREE,
| // can interfere to some extent with this thread's progress toward a terminal state in this
| // loop (and the caller's loop), that interference is fairly similar to page eviction
| // interfering with the progress of committing a pager-backed range. We mitigate it here by
| // tracking the cases we only expect to see once in the absence of commit/decommit interference
| // by another thread; thanks to loan_cancelled, each wait-required case can occur at most once.
| // This mitigation doesn't try to maximally detect interference or minimize iterations, but it
| // does bound the loop to a finite number of iterations.
| // |
| // TODO(dustingreen): |
| // * complain on excessive loop iterations / duration looping |
| // * complain on excessive lifetime duration of StackOwnedLoanedPagesInterval, probably during |
| // destructor, but consider if there's any cheap and simple enough way to complain if it's just |
| // existing too long without any pre-existing calls on it. |
| uint loop_iterations = 0; |
| while (true) { |
| // Warn on excessive iterations. The threshold is chosen to be quite high since this isn't
| // intended to check some strict finite bound, but rather to catch pathological bugs where this
| // loop runs forever and monopolizes lock_.
| if (loop_iterations++ == 200) { |
| printf("[pq]: WARNING: %s appears to be looping excessively\n", __FUNCTION__); |
| } |
| // This is just for asserting that we don't end up trying to wait when we didn't intend to. |
| bool wait_on_stack_ownership = false; |
| { // scope guard |
| Guard<SpinLock, IrqSave> guard{&lock_}; |
| // While holding lock_, we can safely add an event to be notified, if needed. While the page
| // state transitions from ALLOC to OBJECT, and from OBJECT with no VmCowPages to OBJECT with a
| // VmCowPages, are both guarded by lock_, a transition to FREE is not. So we must check again,
| // in an ordered fashion (using the PmmNode lock, not just a "relaxed" atomic), for the page
| // being in the FREE state after we add an event, to ensure the transition to FREE doesn't miss
| // the added event. If a page transitions back out of FREE due to actions by other threads,
| // lock_ protects the page's object field from being overwritten by an event being added.
| vm_page_state state = page->state(); |
| // If owning_cow, we know the owning_cow destructor can't run, so the only valid page |
| // states while FREE or borrowed by a VmCowPages and not pinned are FREE, ALLOC, OBJECT. |
| // |
| // If !owning_cow, the set of possible states isn't constrained, and we don't try to wait for |
| // the page. |
| switch (state) { |
| case vm_page_state::FREE: |
| // No cow, but still success. The fact that we were holding lock_ while reading page |
| // state isn't relevant to the transition to FREE; we just care that we'll notice FREE |
| // somewhere in the loop. |
| // |
| // What matters is that this check will notice a transition _to_ FREE that then stays FREE
| // indefinitely. Other threads doing commit/decommit on owning_cow can cause this check to
| // miss a transient FREE state, but we avoid getting stuck waiting indefinitely.
| return ktl::nullopt; |
| case vm_page_state::OBJECT: { |
| // Sub-cases: |
| // * Using cow. |
| // * Loaning cow. |
| // * No cow (page moving from cow to cow). |
| VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| if (!cow) { |
| if (!owning_cow) { |
| // If there's not a specific owning_cow, then we can't be as certain of the states the |
| // page may reach. For example the page may get used by something other than a |
| // VmCowPages, which wouldn't trigger the event. So we can't use the event mechanism. |
| // |
| // This is a success case. We checked if there was a using cow at the moment, and |
| // there wasn't. |
| return ktl::nullopt; |
| } |
| // The page is moving from cow to cow, and/or is on the way to FREE, so wait below for the
| // page to get a new VmCowPages or become FREE. We still have to synchronize further below
| // using thread_lock, since the OBJECT to FREE transition doesn't hold the PageQueues lock_.
| wait_on_stack_ownership = true; |
| break; |
| } else if (cow == owning_cow) { |
| // This should be impossible, since PageSource guarantees that a given page will only be |
| // actively reclaimed by up to one thread at a time. If this happens, things are broken |
| // enough that we shouldn't continue. |
| panic("Requested page already in owning_cow; unexpected\n");
| } else { |
| // At this point the page may have pin_count != 0. We have to check which queue the page is
| // in here, since we can't acquire the VmCowPages lock (wrong lock order).
| if (!owning_cow) { |
| if (page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == |
| PageQueueWired) { |
| // A pinned page is not replaceable. |
| return ktl::nullopt; |
| } |
| } |
| // There is a using/borrowing cow and we know it is still alive as we hold the |
| // PageQueues lock, and the cow may not destruct while it still has pages. |
| // |
| // We're under PageQueues lock, so this value is stable at the moment, but by the time |
| // the caller acquires the cow lock this page could potentially be elsewhere, depending |
| // on whether the page is allowed to move to a different VmCowPages or to a different |
| // location in this VmCowPages, without going through FREE. |
| // |
| // The cow->RemovePageForEviction() does a re-check that this page is still at this |
| // offset. The caller's loop takes care of chasing down the page if it moves between |
| // VmCowPages or to a different offset in the same VmCowPages without going through |
| // FREE. |
| uint64_t page_offset = page->object.get_page_offset(); |
| VmoBacklink backlink{fbl::MakeRefPtrUpgradeFromRaw(cow, guard), page, page_offset}; |
| DEBUG_ASSERT(backlink.cow); |
| // We AddRef(ed) the using cow_container. Success. Return the backlink. The caller |
| // can use this to call cow->RemovePageForEviction(). |
| return backlink; |
| } |
| break; |
| } |
| case vm_page_state::ALLOC: |
| if (!owning_cow) { |
| // When there's not an owning_cow, we don't know what use the page may be put to, so |
| // we don't know if the page has a StackOwnedLoanedPagesInterval, since those are only |
| // required for intervals involving stack ownership of loaned pages. Since the caller |
| // isn't strictly required to succeed at replacing a page when !owning_cow, the caller |
| // is ok with a successful "none" here since the page isn't immediately replaceable. |
| return ktl::nullopt; |
| } |
| // Wait for ALLOC to become OBJECT or FREE. |
| wait_on_stack_ownership = true; |
| break; |
| default: |
| // If owning_cow, we know the owning_cow destructor can't run, so the only valid page |
| // states while FREE or borrowed by a VmCowPages and not pinned are FREE, ALLOC, OBJECT. |
| DEBUG_ASSERT(!owning_cow); |
| // When !owning_cow, the possible page states include all page states. The caller is only |
| // interested in pages that are both used by a VmCowPages (not transiently stack owned) |
| // and which the caller can immediately replace with a different page, so WIRED state goes |
| // along with the list of other states where the caller can't just replace the page. |
| // |
| // There is no cow with this page as an immediately-replaceable page. |
| return ktl::nullopt; |
| } |
| } // ~guard |
| // If we get here, we know that wait_on_stack_ownership is true, and we know that never happens |
| // when !owning_cow. |
| DEBUG_ASSERT(wait_on_stack_ownership); |
| DEBUG_ASSERT(owning_cow); |
| |
| StackOwnedLoanedPagesInterval::WaitUntilContiguousPageNotStackOwned(page); |
| |
| // At this point, the state of the page has changed, but we don't know how much. Another thread |
| // doing commit on owning_cow may have finished moving the page into owning_cow. Yet another |
| // thread may have decommitted the page again, and yet another thread may be using the loaned |
| // page again now despite loan_cancelled having been used. The page may have been moved to a |
| // destination cow, but may now be moving again. What we do still know is that the page still |
| // has owning_cow as its underlying owner (owning_cow is a contiguous VmCowPages), thanks to |
| // the ref on owning_cow held by the caller, and how contiguous VmCowPages keep the same |
| // physical pages from creation to Dead. |
| // |
| // It's still the goal of this method to return the borrowing cow if there is one, or return |
| // success without a borrowing cow if the page is verified to be reclaim-able by the owning_cow |
| // at some point during this method (regardless of whether that remains true). |
| // |
| // Go around again to observe new page state. |
| // |
| // ~thread_lock_guard |
| } |
| } |