// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <lib/counters.h>
#include <lib/fit/defer.h>
#include <lib/zircon-internal/macros.h>
#include <fbl/ref_counted_upgradeable.h>
#include <kernel/auto_preempt_disabler.h>
#include <object/thread_dispatcher.h>
#include <vm/page.h>
#include <vm/page_queues.h>
#include <vm/pmm.h>
#include <vm/scanner.h>
#include <vm/stack_owned_loaned_pages_interval.h>
#include <vm/vm_cow_pages.h>
namespace {
KCOUNTER(pq_aging_reason_before_min_timeout, "pq.aging.reason_before_min_timeout")
KCOUNTER(pq_aging_spurious_wakeup, "pq.aging.spurious_wakeup")
KCOUNTER(pq_aging_timeout_with_reason, "pq.aging.timeout_with_reason")
KCOUNTER(pq_aging_reason_none, "pq.aging.reason.none")
KCOUNTER(pq_aging_reason_timeout, "pq.aging.reason.timeout")
KCOUNTER(pq_aging_reason_active_ratio, "pq.aging.reason.active_ratio")
KCOUNTER(pq_aging_reason_manual, "pq.aging.reason.manual")
KCOUNTER(pq_aging_blocked_on_lru, "pq.aging.blocked_on_lru")
KCOUNTER(pq_lru_spurious_wakeup, "pq.lru.spurious_wakeup")
} // namespace
PageQueues::PageQueues()
: min_mru_rotate_time_(kDefaultMinMruRotateTime),
max_mru_rotate_time_(kDefaultMaxMruRotateTime),
active_ratio_multiplier_(kDefaultActiveRatioMultiplier) {
for (uint32_t i = 0; i < PageQueueNumQueues; i++) {
list_initialize(&page_queues_[i]);
}
list_initialize(&dont_need_processing_list_);
}
PageQueues::~PageQueues() {
StopThreads();
for (uint32_t i = 0; i < PageQueueNumQueues; i++) {
DEBUG_ASSERT(list_is_empty(&page_queues_[i]));
}
for (size_t i = 0; i < page_queue_counts_.size(); i++) {
DEBUG_ASSERT_MSG(page_queue_counts_[i] == 0, "i=%zu count=%zu", i,
page_queue_counts_[i].load());
}
}
void PageQueues::StartThreads(zx_duration_t min_mru_rotate_time,
zx_duration_t max_mru_rotate_time) {
// Clamp the max rotate to the minimum.
max_mru_rotate_time = ktl::max(min_mru_rotate_time, max_mru_rotate_time);
// Prevent a rotation rate that is too small.
max_mru_rotate_time = ktl::max(max_mru_rotate_time, ZX_SEC(1));
min_mru_rotate_time_ = min_mru_rotate_time;
max_mru_rotate_time_ = max_mru_rotate_time;
// Cannot perform all of thread creation under the lock, as thread creation requires allocations,
// so we create the threads in temporaries first and then stash them under the lock.
Thread* mru_thread = Thread::Create(
"page-queue-mru-thread",
[](void* arg) -> int {
static_cast<PageQueues*>(arg)->MruThread();
return 0;
},
this, LOW_PRIORITY);
DEBUG_ASSERT(mru_thread);
mru_thread->Resume();
Thread* lru_thread = Thread::Create(
"page-queue-lru-thread",
[](void* arg) -> int {
static_cast<PageQueues*>(arg)->LruThread();
return 0;
},
this, LOW_PRIORITY);
DEBUG_ASSERT(lru_thread);
lru_thread->Resume();
Guard<CriticalMutex> guard{&lock_};
ASSERT(!mru_thread_);
ASSERT(!lru_thread_);
mru_thread_ = mru_thread;
lru_thread_ = lru_thread;
}
void PageQueues::StopThreads() {
// Cannot wait for threads to complete with the lock held, so update state and then perform any
// joins outside the lock.
Thread* mru_thread = nullptr;
Thread* lru_thread = nullptr;
{
Guard<CriticalMutex> guard{&lock_};
shutdown_threads_ = true;
if (aging_disabled_.exchange(false)) {
aging_token_.Signal();
}
aging_event_.Signal();
lru_event_.Signal();
mru_thread = mru_thread_;
lru_thread = lru_thread_;
}
int retcode;
if (mru_thread) {
zx_status_t status = mru_thread->Join(&retcode, ZX_TIME_INFINITE);
ASSERT(status == ZX_OK);
}
if (lru_thread) {
zx_status_t status = lru_thread->Join(&retcode, ZX_TIME_INFINITE);
ASSERT(status == ZX_OK);
}
}
void PageQueues::SetActiveRatioMultiplier(uint32_t multiplier) {
Guard<CriticalMutex> guard{&lock_};
active_ratio_multiplier_ = multiplier;
// The change in multiplier might have caused us to need to age.
MaybeTriggerAgingLocked();
}
void PageQueues::MaybeTriggerAging() {
Guard<CriticalMutex> guard{&lock_};
MaybeTriggerAgingLocked();
}
void PageQueues::MaybeTriggerAgingLocked() {
if (GetAgeReasonLocked() != AgeReason::None) {
aging_event_.Signal();
}
}
PageQueues::AgeReason PageQueues::GetAgeReason() const {
Guard<CriticalMutex> guard{&lock_};
return GetAgeReasonLocked();
}
PageQueues::AgeReason PageQueues::GetAgeReasonLocked() const {
ActiveInactiveCounts active_count = GetActiveInactiveCountsLocked();
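// For example, with an active_ratio_multiplier_ of 2 this triggers aging once the active page
// count exceeds half the inactive page count.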
if (active_count.active * active_ratio_multiplier_ > active_count.inactive) {
return AgeReason::ActiveRatio;
}
return AgeReason::None;
}
void PageQueues::MaybeTriggerLruProcessing() {
if (NeedsLruProcessing()) {
lru_event_.Signal();
}
}
bool PageQueues::NeedsLruProcessing() const {
// Currently the only reason to trigger lru processing is if the MRU needs space.
// Performing this check unlocked is just as correct as grabbing the lock, reading, and dropping
// the lock. If a caller needs to know if the lru queue needs processing *and* then perform an
// action before that status could change, it should externally hold lock_ over this method and
// its action.
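// For example, once mru_gen_ - lru_gen_ reaches kNumPagerBacked - 1 every pager-backed queue
// slot is in use, so the MRU cannot rotate again until the LRU generation is advanced.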
if (mru_gen_.load(ktl::memory_order_relaxed) - lru_gen_.load(ktl::memory_order_relaxed) ==
kNumPagerBacked - 1) {
return true;
}
return false;
}
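// DisableAging() and EnableAging() must be used as a strictly matched pair. A hypothetical caller
// that needs aging to be quiescent might do:
//   page_queues->DisableAging();
//   // ... inspect or mutate state that must not race with the aging thread ...
//   page_queues->EnableAging();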
void PageQueues::DisableAging() {
// Validate a double DisableAging is not happening.
if (aging_disabled_.exchange(true)) {
panic("Mismatched disable/enable pair");
}
// Take the aging token. This will both wait for the aging thread to complete any in progress
// aging, and prevent it from aging until we return it.
aging_token_.Wait();
}
void PageQueues::EnableAging() {
// Validate a double EnableAging is not happening.
if (!aging_disabled_.exchange(false)) {
panic("Mismatched disable/enable pair");
}
// Return the aging token, allowing the aging thread to proceed if it was waiting.
aging_token_.Signal();
}
const char* PageQueues::string_from_age_reason(PageQueues::AgeReason reason) {
switch (reason) {
case AgeReason::None:
return "None";
case AgeReason::ActiveRatio:
return "Active ratio";
case AgeReason::Timeout:
return "Timeout";
case AgeReason::Manual:
return "Manual";
default:
panic("Unreachable");
}
}
void PageQueues::Dump() {
// Need to grab a copy of all the counts and generations. As the lock is needed to acquire the
// active/inactive counts, also hold the lock over the copying of the counts to avoid needless
// races.
uint64_t mru_gen;
uint64_t lru_gen;
size_t counts[kNumPagerBacked] = {};
size_t inactive_count;
zx_time_t last_age_time;
AgeReason last_age_reason;
ActiveInactiveCounts activeinactive;
{
Guard<CriticalMutex> guard{&lock_};
mru_gen = mru_gen_.load(ktl::memory_order_relaxed);
lru_gen = lru_gen_.load(ktl::memory_order_relaxed);
inactive_count =
page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
for (uint32_t i = 0; i < kNumPagerBacked; i++) {
counts[i] = page_queue_counts_[PageQueuePagerBackedBase + i].load(ktl::memory_order_relaxed);
}
activeinactive = GetActiveInactiveCountsLocked();
last_age_time = last_age_time_.load(ktl::memory_order_relaxed);
last_age_reason = last_age_reason_;
}
// Small arbitrary number that should be more than large enough to hold the constructed string
// without causing stack allocation pressure.
constexpr size_t kBufSize = 50;
// Start with the buffer null terminated. snprintf will always keep it null terminated.
char buf[kBufSize] __UNINITIALIZED = "\0";
size_t buf_len = 0;
// This formats the counts of all buckets, not just those within the mru->lru range, even though
// any buckets not in that range should always have a count of zero. The format this generates is
// [active],[active],inactive,inactive,{last inactive},should-be-zero,should-be-zero
// Although the inactive and should-be-zero use the same formatting, they are broken up by the
// {last inactive}.
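// As a purely illustrative example (assuming eight pager-backed buckets, two of them active, and
// an mru->lru distance of four) this could produce: [12],[30],7,5,{2},0,0,0,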
for (uint64_t i = 0; i < kNumPagerBacked; i++) {
PageQueue queue = gen_to_queue(mru_gen - i);
ASSERT(buf_len < kBufSize);
const size_t remain = kBufSize - buf_len;
int write_len;
if (i < kNumActiveQueues) {
write_len =
snprintf(buf + buf_len, remain, "[%zu],", counts[queue - PageQueuePagerBackedBase]);
} else if (i == mru_gen - lru_gen) {
write_len =
snprintf(buf + buf_len, remain, "{%zu},", counts[queue - PageQueuePagerBackedBase]);
} else {
write_len = snprintf(buf + buf_len, remain, "%zu,", counts[queue - PageQueuePagerBackedBase]);
}
// Negative values are returned on encoding errors, which we never expect to get.
ASSERT(write_len >= 0);
if (static_cast<uint>(write_len) >= remain) {
// Buffer too small, just use whatever we have constructed so far.
break;
}
buf_len += write_len;
}
zx_time_t current = current_time();
timespec age_time = zx_timespec_from_duration(zx_time_sub_time(current, last_age_time));
printf("pq: MRU generation is %" PRIu64
" set %ld.%lds ago due to \"%s\", LRU generation is %" PRIu64 "\n",
mru_gen, age_time.tv_sec, age_time.tv_nsec, string_from_age_reason(last_age_reason),
lru_gen);
printf("pq: Pager buckets %s evict first: %zu, %s active/inactive totals: %zu/%zu\n", buf,
inactive_count, activeinactive.cached ? "cached" : "live", activeinactive.active,
activeinactive.inactive);
}
// This runs the aging thread. Aging, unlike lru processing, scanning or eviction, requires very
// little work and is more about coordination. As such this thread is heavy on checks and signalling
// but generally only needs to hold any locks for the briefest of times.
// There is, currently, one exception to that, which is the calls to scanner_wait_for_accessed_scan.
// The scanner will, eventually, be a separate thread that this thread merely synchronizes with,
// but presently a full scan may happen inline in that method call and get attributed directly to
// this thread.
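// In outline, each loop iteration: sleeps until the minimum rotate time has elapsed, waits for
// either an age reason or the maximum rotate timeout, takes the aging token (blocking if aging is
// disabled), ensures accessed information has been harvested, rotates the pager-backed queues and
// finally pokes the LRU thread in case the rotation consumed the last free queue slot.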
void PageQueues::MruThread() {
// Pretend that aging happens during startup to simplify the rest of the loop logic.
last_age_time_ = current_time();
while (!shutdown_threads_.load(ktl::memory_order_relaxed)) {
// Wait for the min rotate time to pass.
Thread::Current::Sleep(
zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed), min_mru_rotate_time_));
// Wait on the aging_event_ in a loop to deal with spurious signals on the event. Will exit
// the loop for any of the following:
// * A legitimate age reason returned by GetAgeReason
// * Reached the maximum wait time
// * Shutdown has been requested.
// We specifically check GetAgeReason() and then wait on the aging event to catch scenarios
// where we both timed out *and* found a pending age reason. This just allows us to log this
// scenario, and is not necessary for correctness.
zx_status_t result = ZX_OK;
AgeReason age_reason = AgeReason::None;
int iterations = 0;
while (!shutdown_threads_.load(ktl::memory_order_relaxed) &&
(age_reason = GetAgeReason()) == AgeReason::None && result != ZX_ERR_TIMED_OUT) {
result = aging_event_.WaitDeadline(
zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed),
max_mru_rotate_time_),
Interruptible::No);
iterations++;
}
if (shutdown_threads_.load(ktl::memory_order_relaxed)) {
break;
}
if (iterations == 0) {
// If we did zero iterations then there was an age reason already waiting for us, meaning
// we wanted to age prior to the min timeout clearing.
pq_aging_reason_before_min_timeout.Add(1);
} else if (iterations > 1) {
// Every loop iteration above 1 indicates a spurious wake up.
pq_aging_spurious_wakeup.Add(iterations - 1);
}
// If we exited the loop without an age reason then, since the shutdown case was already handled
// above, the only remaining cause must be a timeout.
if (age_reason == AgeReason::None) {
DEBUG_ASSERT(result == ZX_ERR_TIMED_OUT);
age_reason = AgeReason::Timeout;
} else {
// Had an age reason, but check if a timeout also occurred. This could happen legitimately due
// to races or delays in scheduling this thread, but we log this case in a counter anyway
// since it happening regularly could indicate a bug.
if (result == ZX_ERR_TIMED_OUT) {
pq_aging_timeout_with_reason.Add(1);
}
}
// Take the aging token, potentially blocking if aging is disabled, and make sure to return it
// when we are done.
aging_token_.Wait();
auto return_token = fit::defer([this] { aging_token_.Signal(); });
// Make sure the accessed information has been harvested since the last time we aged, otherwise
// we are deliberately making the age information coarser, by effectively not using one of the
// queues, at which point we might as well not have bothered rotating.
// Currently this is redundant since we will explicitly harvest just after aging, however once
// there are additional aging triggers and harvesting is more asynchronous, this will serve as
// a synchronization point.
scanner_wait_for_accessed_scan(last_age_time_, true);
RotatePagerBackedQueues(age_reason);
// Changing mru_gen_ could have impacted the eviction logic.
MaybeTriggerLruProcessing();
}
}
// This thread should, at some point, have some of its logic and signaling merged with the Evictor.
// Currently it might process the lru queue whilst the evictor is already trying to evict, which
// is not harmful, but is a bit wasteful as it duplicates work.
// LRU processing, via ProcessDontNeedAndLruQueues, is expensive and happens under the lock_. It is
// expected that ProcessDontNeedAndLruQueues perform small units of work to avoid this thread
// causing excessive lock contention.
void PageQueues::LruThread() {
while (!shutdown_threads_.load(ktl::memory_order_relaxed)) {
lru_event_.WaitDeadline(ZX_TIME_INFINITE, Interruptible::No);
// Take the lock so we can calculate (race free) a target lru gen.
uint64_t target_gen;
{
Guard<CriticalMutex> guard{&lock_};
if (!NeedsLruProcessing()) {
pq_lru_spurious_wakeup.Add(1);
continue;
}
target_gen = lru_gen_.load(ktl::memory_order_relaxed) + 1;
}
// With the lock dropped process the target. This is not racy as generations are monotonic, so
// worst case someone else already processed this generation and this call will be a no-op.
ProcessDontNeedAndLruQueues(target_gen, false);
}
}
void PageQueues::RotatePagerBackedQueues(AgeReason reason) {
VM_KTRACE_DURATION(2, "RotatePagerBackedQueues");
// We expect LRU processing to have already happened, so first poll the mru semaphore.
if (mru_semaphore_.Wait(Deadline::infinite_past()) == ZX_ERR_TIMED_OUT) {
// We should not have needed to wait for lru processing here, as it should have already been
// made available due to earlier triggers. Although this could reasonably happen due to races or
// delays in scheduling, we record it in a counter since it happening regularly could indicate a bug.
pq_aging_blocked_on_lru.Add(1);
MaybeTriggerLruProcessing();
// The LRU thread could take an arbitrary amount of time to get scheduled and run, so we cannot
// enforce a deadline. However, we can assume there might be a bug and start making noise to
// inform the user if we have waited multiples of the expected maximum aging interval, since
// that implies we are starting to lose the requested fidelity of age information.
int64_t timeouts = 0;
while (mru_semaphore_.Wait(Deadline::after(max_mru_rotate_time_, TimerSlack::none())) ==
ZX_ERR_TIMED_OUT) {
timeouts++;
printf("[pq] WARNING: Waited %" PRIi64 " seconds for LRU thread, MRU semaphore %" PRIu64
",aging is presently stalled\n",
(max_mru_rotate_time_ * timeouts) / ZX_SEC(1), mru_semaphore_.count());
Dump();
}
}
ASSERT(mru_gen_.load(ktl::memory_order_relaxed) - lru_gen_.load(ktl::memory_order_relaxed) <
kNumPagerBacked - 1);
{
// Acquire the lock to increment the mru_gen_. This allows other queue logic to not worry about
// mru_gen_ changing whilst they hold the lock.
Guard<CriticalMutex> guard{&lock_};
mru_gen_.fetch_add(1, ktl::memory_order_relaxed);
last_age_time_ = current_time();
last_age_reason_ = reason;
// Update the active/inactive counts. We could be a bit smarter here since we know exactly which
// active bucket might have changed, but this will work.
RecalculateActiveInactiveLocked();
}
// Keep a count of the different reasons we have rotated.
switch (reason) {
case AgeReason::None:
pq_aging_reason_none.Add(1);
break;
case AgeReason::Timeout:
pq_aging_reason_timeout.Add(1);
break;
case AgeReason::ActiveRatio:
pq_aging_reason_active_ratio.Add(1);
break;
case AgeReason::Manual:
pq_aging_reason_manual.Add(1);
break;
default:
panic("Unknown age reason");
}
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessQueueHelper(
ProcessingQueue processing_queue, uint64_t target_gen, bool peek) {
VM_KTRACE_DURATION(2, "ProcessQueue");
// Processing the DontNeed/LRU queue requires holding the page_queues_ lock_. The only other
// actions that require this lock are inserting or removing pages from the page queues. To ensure
// these actions can complete in a small bounded time kMaxQueueWork is chosen to be very small so
// that the lock will be regularly dropped. As processing the DontNeed/LRU queue is not time
// critical and can be somewhat inefficient in its operation we err on the side of doing less work
// per lock acquisition.
//
// Also, we need to limit the number to avoid sweep_to_loaned taking up excessive stack space.
static constexpr uint32_t kMaxQueueWork = 16;
// Each page in this list can potentially be replaced with a loaned page, but we have to do this
// replacement outside the PageQueues::lock_, so we accumulate pages and then try to replace the
// pages after lock_ is released.
VmoBacklink sweep_to_loaned[kMaxQueueWork];
uint32_t sweep_to_loaned_count = 0;
// Only accumulate pages to try to replace with loaned pages if loaned pages are available and
// we're allowed to borrow at this code location.
bool do_sweeping = (pmm_count_loaned_free_pages() != 0) &&
pmm_physical_page_borrowing_config()->is_borrowing_on_mru_enabled();
auto sweep_to_loaned_outside_lock =
fit::defer([do_sweeping, &sweep_to_loaned, &sweep_to_loaned_count] {
DEBUG_ASSERT(do_sweeping || sweep_to_loaned_count == 0);
for (uint32_t i = 0; i < sweep_to_loaned_count; ++i) {
DEBUG_ASSERT(sweep_to_loaned[i].cow);
// We ignore the return value because the page may have moved, become pinned, we may not
// have any free loaned pages any more, or the VmCowPages may not be able to borrow.
sweep_to_loaned[i].cow->ReplacePageWithLoaned(sweep_to_loaned[i].page,
sweep_to_loaned[i].offset);
}
});
Guard<CriticalMutex> guard{&lock_};
const uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
const uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
// If we're processing the lru queue and it has already hit the target gen, return early.
if (processing_queue == ProcessingQueue::Lru && lru >= target_gen) {
return ktl::nullopt;
}
uint32_t work_remain = kMaxQueueWork;
PageQueue queue;
list_node* operating_queue;
if (processing_queue == ProcessingQueue::Lru) {
queue = gen_to_queue(lru);
operating_queue = &page_queues_[queue];
} else {
// Processing the DontNeed queue typically involves moving from the Processing to the regular
// DontNeed queue, unless we're peeking in which case we can also grab from the regular. When
// peeking we prefer to grab from the dont_need_processing_list_ first, as its pages are older,
// or at least were moved to the DontNeed queue further in the past.
queue = PageQueuePagerBackedDontNeed;
if (peek && list_is_empty(&dont_need_processing_list_)) {
operating_queue = &page_queues_[queue];
} else {
operating_queue = &dont_need_processing_list_;
}
}
while (!list_is_empty(operating_queue) && work_remain > 0) {
work_remain--;
// When moving pages we want to maintain relative page age as far as possible. For pages in the
// LRU queue this means the pages we are moving have overall been touched more recently than
// their destination, and so we want to take from the oldest (tail). The opposite is true for
// the don't need process queue, as they are less recently used than their target queue.
// When not moving pages, aka peeking, we always want the oldest pages though.
vm_page_t* page;
if (processing_queue == ProcessingQueue::Lru || peek) {
page = list_peek_tail_type(operating_queue, vm_page_t, queue_node);
} else {
page = list_peek_head_type(operating_queue, vm_page_t, queue_node);
}
PageQueue page_queue =
(PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed);
if (processing_queue == ProcessingQueue::DontNeed) {
DEBUG_ASSERT(page_queue >= PageQueuePagerBackedDontNeed);
} else {
DEBUG_ASSERT(page_queue >= PageQueuePagerBackedBase);
}
// If the queue stored in the page does not match then we want to move it to its correct queue
// with the caveat that its queue could be invalid. The queue would be invalid if MarkAccessed
// had raced. Should this happen we know that the page is actually *very* old, and so we will
// fall back to the case of forcibly changing its age to the new lru gen.
const PageQueue mru_queue = gen_to_queue(mru);
// A page in the DontNeed queue will never be allowed to age to the point that its queue
// becomes invalid, because we process *all* the DontNeed pages each time we change the lru.
// In other words, no DontNeed page will be left behind when lru advances.
DEBUG_ASSERT(page_queue == queue || processing_queue != ProcessingQueue::DontNeed ||
queue_is_valid(page_queue, gen_to_queue(lru), mru_queue));
if (page_queue != queue && queue_is_valid(page_queue, gen_to_queue(lru), mru_queue)) {
list_delete(&page->queue_node);
list_add_head(&page_queues_[page_queue], &page->queue_node);
if (queue_is_active(page_queue, mru_queue) && do_sweeping && !pmm_is_loaned(page)) {
DEBUG_ASSERT(sweep_to_loaned_count < kMaxQueueWork);
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
uint64_t page_offset = page->object.get_page_offset();
DEBUG_ASSERT(cow);
sweep_to_loaned[sweep_to_loaned_count] =
VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, lock_), page, page_offset};
if (sweep_to_loaned[sweep_to_loaned_count].cow) {
++sweep_to_loaned_count;
}
}
} else if (peek) {
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
uint64_t page_offset = page->object.get_page_offset();
DEBUG_ASSERT(cow);
// We may be racing with destruction of the VMO. As we currently hold our lock we know that our
// back pointer is correct insofar as the VmCowPages has not yet completed running its
// destructor, so we know it is safe to attempt to upgrade it to a RefPtr. If upgrading
// fails we assume the page is about to be removed from the page queue once the VMO
// destructor gets a chance to run.
return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, lock_), page, page_offset};
} else {
// Force it into our target queue; we don't care about races. If the page happened to be
// accessed at the same time then that access is simply lost.
PageQueue new_queue = processing_queue == ProcessingQueue::DontNeed
? PageQueuePagerBackedDontNeed
: gen_to_queue(lru + 1);
PageQueue old_queue = (PageQueue)page->object.get_page_queue_ref().exchange(new_queue);
if (processing_queue == ProcessingQueue::DontNeed) {
DEBUG_ASSERT(old_queue >= PageQueuePagerBackedDontNeed);
} else {
DEBUG_ASSERT(old_queue >= PageQueuePagerBackedBase);
}
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
page_queue_counts_[new_queue].fetch_add(1, ktl::memory_order_relaxed);
list_delete(&page->queue_node);
if (processing_queue == ProcessingQueue::DontNeed) {
// When pulling from the DontNeed processing queue we pull from the head, and so we
// conversely need to add to the tail to preserve order (see the comment at the start of the
// loop).
list_add_tail(&page_queues_[new_queue], &page->queue_node);
} else {
list_add_head(&page_queues_[new_queue], &page->queue_node);
}
// We should only have performed this step to move from one inactive bucket to the next,
// so there should be no active/inactive count changes needed.
DEBUG_ASSERT(!queue_is_active(new_queue, mru_gen_to_queue()));
}
}
if (list_is_empty(operating_queue)) {
if (processing_queue == ProcessingQueue::Lru) {
lru_gen_.store(lru + 1, ktl::memory_order_relaxed);
mru_semaphore_.Post();
// Changing the lru_gen_ might in the future trigger a scenario where we need to age, but this
// is presently a no-op.
MaybeTriggerAgingLocked();
}
}
return ktl::nullopt;
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessDontNeedAndLruQueues(uint64_t target_gen,
bool peek) {
// This assertion is <=, and not strictly <, since to evict from some queue X, the target must be
// X+1. Hence to preserve kNumActiveQueues, we can allow target_gen to become equal to the first
// active queue, as this will process all the non-active queues. Although we might refresh our
// value for the mru_queue, since the mru_gen_ is monotonically increasing, if this assert passes once
// it should continue to be true.
ASSERT(target_gen <= mru_gen_.load(ktl::memory_order_relaxed) - (kNumActiveQueues - 1));
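// For example, with kNumActiveQueues == 2 the largest permitted target_gen is mru_gen_ - 1;
// processing up to that target drains every non-active queue while leaving both active buckets
// untouched.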
// Calculate a truly worst case loop iteration count based on every page being in either the LRU
// queue or the wrong DontNeed queue and needing to iterate the LRU multiple steps to the
// target_gen. Instead of reading the LRU and comparing the target_gen, just add a buffer of the
// maximum number of page queues.
ActiveInactiveCounts active_inactive = GetActiveInactiveCounts();
const uint64_t max_dont_need_iterations = page_queue_counts_[PageQueuePagerBackedDontNeed] + 1;
const uint64_t max_lru_iterations =
active_inactive.active + active_inactive.inactive + kNumPagerBacked;
// Loop iteration counting is just for diagnostic purposes.
uint64_t loop_iterations = 0;
ktl::optional<Guard<Mutex>> dont_need_processing_guard;
if (!peek) {
// If not peeking then we will need to properly process the DontNeed queue, and so we must take
// the processing lock and move the existing pages into the processing list.
dont_need_processing_guard.emplace(&dont_need_processing_lock_);
Guard<CriticalMutex> guard{&lock_};
ASSERT(list_is_empty(&dont_need_processing_list_));
list_move(&page_queues_[PageQueuePagerBackedDontNeed], &dont_need_processing_list_);
}
while (true) {
VM_KTRACE_DURATION(2, "ProcessDontNeedQueue");
if (loop_iterations++ == max_dont_need_iterations) {
printf("[pq]: WARNING: %s exceeded expected max DontNeed loop iterations %" PRIu64 "\n",
__FUNCTION__, max_dont_need_iterations);
}
auto optional_backlink = ProcessQueueHelper(ProcessingQueue::DontNeed, 0, peek);
if (optional_backlink != ktl::nullopt) {
DEBUG_ASSERT(peek);
return optional_backlink;
}
Guard<CriticalMutex> guard{&lock_};
// If we have been asked to peek, we want to keep processing until both the DontNeed queues
// are empty. ProcessQueueHelper will process one queue and then move on to the other when
// the first is empty, so that both get processed.
if (list_is_empty(&dont_need_processing_list_) &&
(!peek || list_is_empty(&page_queues_[PageQueuePagerBackedDontNeed]))) {
break;
}
}
// Not strictly required, but we're done with the dontneed processing lock.
dont_need_processing_guard.reset();
// Reset diagnostic counter for the new loop.
loop_iterations = 0;
// Process the lru queue to reach target_gen.
while (lru_gen_.load(ktl::memory_order_relaxed) < target_gen) {
VM_KTRACE_DURATION(2, "ProcessLruQueue");
if (loop_iterations++ == max_lru_iterations) {
printf("[pq]: WARNING: %s exceeded expected max LRU loop iterations %" PRIu64 "\n",
__FUNCTION__, max_lru_iterations);
}
auto optional_backlink = ProcessQueueHelper(ProcessingQueue::Lru, target_gen, peek);
if (optional_backlink != ktl::nullopt) {
return optional_backlink;
}
}
return ktl::nullopt;
}
void PageQueues::UpdateActiveInactiveLocked(PageQueue old_queue, PageQueue new_queue) {
// Short circuit the logic if not dealing with active/inactive (pager-backed) queues.
if (!queue_is_pager_backed(old_queue) && !queue_is_pager_backed(new_queue)) {
return;
}
// This just blindly updates the active/inactive counts. If accessed scanning is happening, and
// use_cached_queue_counts_ is true, then we could be racing and setting these to garbage
// values. That's fine as they will never get returned anywhere, and will get reset to correct
// values once access scanning completes.
PageQueue mru = mru_gen_to_queue();
if (queue_is_active(old_queue, mru)) {
active_queue_count_--;
} else if (queue_is_inactive(old_queue, mru)) {
inactive_queue_count_--;
}
if (queue_is_active(new_queue, mru)) {
active_queue_count_++;
} else if (queue_is_inactive(new_queue, mru)) {
inactive_queue_count_++;
}
MaybeTriggerAgingLocked();
}
void PageQueues::MarkAccessed(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
auto queue_ref = page->object.get_page_queue_ref();
// The page can be the zero page, but in that case we'll return early below.
DEBUG_ASSERT(page != vm_get_zero_page() ||
queue_ref.load(ktl::memory_order_relaxed) < PageQueuePagerBackedDontNeed);
// We need to check the current queue to see if it is in the pager backed range. Between checking
// this and updating the queue it could change, however it would only change as a result of
// MarkAccessedDeferredCount, which would only move it to another pager backed queue. No other
// change is possible as we are holding lock_.
if (queue_ref.load(ktl::memory_order_relaxed) < PageQueuePagerBackedDontNeed) {
return;
}
PageQueue queue = mru_gen_to_queue();
PageQueue old_queue = (PageQueue)queue_ref.exchange(queue, ktl::memory_order_relaxed);
// Double check that this was previously pager backed.
DEBUG_ASSERT(old_queue != PageQueueNone && old_queue >= PageQueuePagerBackedDontNeed);
if (old_queue != queue) {
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked(old_queue, queue);
}
}
void PageQueues::SetQueueLocked(vm_page_t* page, PageQueue queue) {
SetQueueBacklinkLocked(page, nullptr, 0, queue);
}
void PageQueues::SetQueueBacklinkLocked(vm_page_t* page, void* object, uintptr_t page_offset,
PageQueue queue) {
DEBUG_ASSERT(page->state() == vm_page_state::OBJECT);
DEBUG_ASSERT(!page->is_free());
DEBUG_ASSERT(!list_in_list(&page->queue_node));
if (object) {
DEBUG_ASSERT(!page->object.get_object() || page->object.get_object() == object);
DEBUG_ASSERT(!page->object.get_object() || page->object.get_page_offset() == page_offset);
page->object.set_object(object);
page->object.set_page_offset(page_offset);
} else {
// Currently SetQueueBacklinkLocked() can be used to clear the backlink in cases where the page
// isn't being removed from a VmCowPages. This won't work for loaned pages, but we don't have
// any loaned pages yet.
page->object.clear_object();
page->object.set_page_offset(0);
}
DEBUG_ASSERT(page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueNone);
page->object.get_page_queue_ref().store(queue, ktl::memory_order_relaxed);
list_add_head(&page_queues_[queue], &page->queue_node);
page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked(PageQueueNone, queue);
}
void PageQueues::MoveToQueueLocked(vm_page_t* page, PageQueue queue) {
MoveToQueueBacklinkLocked(page, nullptr, 0, queue);
}
void PageQueues::MoveToQueueBacklinkLocked(vm_page_t* page, void* object, uintptr_t page_offset,
PageQueue queue) {
DEBUG_ASSERT(page->state() == vm_page_state::OBJECT);
DEBUG_ASSERT(!page->is_free());
DEBUG_ASSERT(list_in_list(&page->queue_node));
uint32_t old_queue = page->object.get_page_queue_ref().exchange(queue, ktl::memory_order_relaxed);
DEBUG_ASSERT(old_queue != PageQueueNone);
if (object) {
DEBUG_ASSERT(!page->object.get_object() || page->object.get_object() == object);
DEBUG_ASSERT(!page->object.get_object() || page->object.get_page_offset() == page_offset);
page->object.set_object(object);
page->object.set_page_offset(page_offset);
} else {
// Currently MoveToQueueBacklinkLocked() can be used to clear the backlink in cases where the
// page isn't being removed from a VmCowPages. This won't work for loaned pages, but we don't
// have any loaned pages yet.
page->object.clear_object();
page->object.set_page_offset(0);
}
list_delete(&page->queue_node);
list_add_head(&page_queues_[queue], &page->queue_node);
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked((PageQueue)old_queue, queue);
}
void PageQueues::SetWired(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
SetQueueLocked(page, PageQueueWired);
}
void PageQueues::MoveToWired(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
MoveToQueueLocked(page, PageQueueWired);
}
void PageQueues::MoveToWired(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, PageQueueWired);
}
void PageQueues::SetUnswappable(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
SetQueueLocked(page, PageQueueUnswappable);
}
void PageQueues::MoveToUnswappableLocked(vm_page_t* page) {
MoveToQueueLocked(page, PageQueueUnswappable);
}
void PageQueues::MoveToUnswappable(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
MoveToUnswappableLocked(page);
}
void PageQueues::SetPagerBacked(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
SetQueueBacklinkLocked(page, object, page_offset, mru_gen_to_queue());
}
void PageQueues::MoveToPagerBacked(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, mru_gen_to_queue());
}
void PageQueues::MoveToPagerBackedDontNeed(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
MoveToQueueBacklinkLocked(page, page->object.get_object(), page->object.get_page_offset(),
PageQueuePagerBackedDontNeed);
}
void PageQueues::SetPagerBackedDirty(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
SetQueueBacklinkLocked(page, object, page_offset, PageQueuePagerBackedDirty);
}
void PageQueues::MoveToPagerBackedDirty(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, PageQueuePagerBackedDirty);
}
void PageQueues::SetUnswappableZeroFork(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
SetQueueBacklinkLocked(page, object, page_offset, PageQueueUnswappableZeroFork);
}
void PageQueues::MoveToUnswappableZeroFork(vm_page_t* page, VmCowPages* object,
uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, PageQueueUnswappableZeroFork);
}
void PageQueues::RemoveLocked(vm_page_t* page) {
// Directly exchange the old gen.
uint32_t old_queue =
page->object.get_page_queue_ref().exchange(PageQueueNone, ktl::memory_order_relaxed);
DEBUG_ASSERT(old_queue != PageQueueNone);
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked((PageQueue)old_queue, PageQueueNone);
page->object.clear_object();
page->object.set_page_offset(0);
list_delete(&page->queue_node);
}
void PageQueues::Remove(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
RemoveLocked(page);
}
void PageQueues::RemoveArrayIntoList(vm_page_t** pages, size_t count, list_node_t* out_list) {
DEBUG_ASSERT(pages);
Guard<CriticalMutex> guard{&lock_};
for (size_t i = 0; i < count; i++) {
DEBUG_ASSERT(pages[i]);
RemoveLocked(pages[i]);
list_add_tail(out_list, &pages[i]->queue_node);
}
}
void PageQueues::BeginAccessScan() {
Guard<CriticalMutex> guard{&lock_};
ASSERT(!use_cached_queue_counts_.load(ktl::memory_order_relaxed));
cached_active_queue_count_ = active_queue_count_;
cached_inactive_queue_count_ = inactive_queue_count_;
use_cached_queue_counts_.store(true, ktl::memory_order_relaxed);
}
void PageQueues::RecalculateActiveInactiveLocked() {
uint64_t active = 0;
uint64_t inactive = 0;
uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
for (uint64_t index = lru; index <= mru; index++) {
uint64_t count = page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed);
if (queue_is_active(gen_to_queue(index), gen_to_queue(mru))) {
active += count;
} else {
// As we are only operating on pager backed queues, !active should imply inactive
DEBUG_ASSERT(queue_is_inactive(gen_to_queue(index), gen_to_queue(mru)));
inactive += count;
}
}
inactive += page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
// Update the counts.
active_queue_count_ = active;
inactive_queue_count_ = inactive;
// New counts might mean we need to age.
MaybeTriggerAgingLocked();
}
void PageQueues::EndAccessScan() {
Guard<CriticalMutex> guard{&lock_};
ASSERT(use_cached_queue_counts_.load(ktl::memory_order_relaxed));
// First clear the cached counts. Although the uncached counts aren't correct yet, we hold the
// lock so no one can observe them in the meantime.
cached_active_queue_count_ = 0;
cached_inactive_queue_count_ = 0;
use_cached_queue_counts_.store(false, ktl::memory_order_relaxed);
RecalculateActiveInactiveLocked();
}
PageQueues::PagerCounts PageQueues::GetPagerQueueCounts() const {
PagerCounts counts;
// Grab the lock to prevent LRU processing; this lets us get a slightly less racy snapshot of
// the queue counts, although we may still double count pages that move after we count them.
// Specifically any parallel callers of MarkAccessed could move a page and change the counts,
// causing us to either double count or miss count that page. As these counts are not load
// bearing we accept the very small chance of being off by a few pages.
Guard<CriticalMutex> guard{&lock_};
uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
counts.total = 0;
for (uint64_t index = lru; index <= mru; index++) {
uint64_t count = page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed);
// Distance to the MRU, and not the LRU, determines the bucket the count goes into. This is to
// match the logic in PeekPagerBacked, which is also based on distance to MRU.
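// For example, with kNumActiveQueues == 2 only the two generations closest to the MRU count
// towards |newest|, and with kNumOldestQueues == 1 only a generation that has fallen the maximum
// distance behind the MRU (plus the DontNeed queue accounted below) counts towards |oldest|.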
if (index > mru - kNumActiveQueues) {
counts.newest += count;
} else if (index <= mru - (kNumPagerBacked - kNumOldestQueues)) {
counts.oldest += count;
}
counts.total += count;
}
// Account the DontNeed queue length under |oldest|, since (DontNeed + oldest LRU) pages are
// eligible for reclamation first. |oldest| is meant to track pages eligible for eviction first.
uint64_t inactive_count =
page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
counts.oldest += inactive_count;
counts.total += inactive_count;
return counts;
}
PageQueues::Counts PageQueues::QueueCounts() const {
Counts counts = {};
// Grab the lock to prevent LRU processing; this lets us get a slightly less racy snapshot of
// the queue counts. We may still double count pages that move after we count them.
Guard<CriticalMutex> guard{&lock_};
uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
for (uint64_t index = lru; index <= mru; index++) {
counts.pager_backed[mru - index] =
page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed);
}
counts.pager_backed_dont_need =
page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
counts.unswappable = page_queue_counts_[PageQueueUnswappable].load(ktl::memory_order_relaxed);
counts.wired = page_queue_counts_[PageQueueWired].load(ktl::memory_order_relaxed);
counts.unswappable_zero_fork =
page_queue_counts_[PageQueueUnswappableZeroFork].load(ktl::memory_order_relaxed);
return counts;
}
bool PageQueues::DebugPageIsPagerBacked(const vm_page_t* page, size_t* queue) const {
PageQueue q = (PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed);
if (q >= PageQueuePagerBackedBase && q <= PageQueuePagerBackedLast) {
if (queue) {
*queue = queue_age(q, mru_gen_to_queue());
}
return true;
}
return false;
}
bool PageQueues::DebugPageIsPagerBackedDontNeed(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueuePagerBackedDontNeed;
}
bool PageQueues::DebugPageIsPagerBackedDirty(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueuePagerBackedDirty;
}
bool PageQueues::DebugPageIsUnswappable(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueUnswappable;
}
bool PageQueues::DebugPageIsWired(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueWired;
}
bool PageQueues::DebugPageIsUnswappableZeroFork(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueueUnswappableZeroFork;
}
bool PageQueues::DebugPageIsAnyUnswappable(const vm_page_t* page) const {
return DebugPageIsUnswappable(page) || DebugPageIsUnswappableZeroFork(page);
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::PopUnswappableZeroFork() {
Guard<CriticalMutex> guard{&lock_};
vm_page_t* page =
list_peek_tail_type(&page_queues_[PageQueueUnswappableZeroFork], vm_page_t, queue_node);
if (!page) {
return ktl::nullopt;
}
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
uint64_t page_offset = page->object.get_page_offset();
DEBUG_ASSERT(cow);
MoveToQueueLocked(page, PageQueueUnswappable);
// We may be racing with destruction of the VMO. As we currently hold our lock we know that our
// back pointer is correct insofar as the VmCowPages has not yet completed running its
// destructor, so we know it is safe to attempt to upgrade it to a RefPtr. If upgrading fails
// we assume the page is about to be removed from the page queue once the VMO destructor gets
// a chance to run.
return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, guard), page, page_offset};
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::PeekPagerBacked(size_t lowest_queue) {
// Ignore any requests to evict from the active queues as this is never allowed.
lowest_queue = ktl::max(lowest_queue, kNumActiveQueues);
// The target gen is 1 larger than the lowest queue because evicting from queue X is done by
// attempting to make the lru queue be X+1.
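// For example, a lowest_queue of kNumActiveQueues (the minimum after the clamp above) yields a
// target gen of mru_gen_ - (kNumActiveQueues - 1), i.e. everything except the active queues is
// considered for eviction.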
return ProcessDontNeedAndLruQueues(mru_gen_.load(ktl::memory_order_relaxed) - (lowest_queue - 1),
true);
}
PageQueues::ActiveInactiveCounts PageQueues::GetActiveInactiveCounts() const {
Guard<CriticalMutex> guard{&lock_};
return GetActiveInactiveCountsLocked();
}
PageQueues::ActiveInactiveCounts PageQueues::GetActiveInactiveCountsLocked() const {
if (use_cached_queue_counts_.load(ktl::memory_order_relaxed)) {
return ActiveInactiveCounts{.cached = true,
.active = cached_active_queue_count_,
.inactive = cached_inactive_queue_count_};
} else {
// With use_cached_queue_counts_ false the counts should have been updated to remove any
// negative values that might have been caused by races.
ASSERT(active_queue_count_ >= 0);
ASSERT(inactive_queue_count_ >= 0);
return ActiveInactiveCounts{.cached = false,
.active = static_cast<uint64_t>(active_queue_count_),
.inactive = static_cast<uint64_t>(inactive_queue_count_)};
}
}
ktl::optional<PageQueues::VmoContainerBacklink> PageQueues::GetCowWithReplaceablePage(
vm_page_t* page, VmCowPages* owning_cow) {
// Wait for the page to not be in a transient state. This is in a loop, since the wait happens
// outside the lock, so another thread doing commit/decommit on owning_cow can cause the page
// state to change, potentially multiple times.
//
// While it's possible for another thread that's concurrently committing/decommitting this page
// to/from owning_cow, or moving the page from one VmCowPages to another without going through
// FREE, to interfere to some extent with this thread's progress toward a terminal state in this
// loop (and the caller's loop), this interference is fairly similar to page eviction interfering
// with progress of commit of a pager-backed range. That said, we mitigate here by tracking which
// cases we've seen that we only expect to see once in the absence of commit/decommit interference
// by another thread. Thanks to loan_cancelled, we can limit all the wait required cases to a max
// of once. This mitigation doesn't try to maximally detect interference and minimize iterations
// but the mitigation does limit iterations to a finite number.
//
// TODO(dustingreen):
// * complain on excessive loop iterations / duration looping
// * complain on excessive lifetime duration of StackOwnedLoanedPagesInterval, probably during
// destructor, but consider if there's any cheap and simple enough way to complain if it's just
// existing too long without any pre-existing calls on it.
uint loop_iterations = 0;
while (true) {
// Warn on excessive iterations. The threshold is chosen to be quite high since this isn't
// intending to check some strict finite bound, but rather to find pathological bugs where this
// is looping infinitely and monopolizing the lock_.
if (loop_iterations++ == 200) {
printf("[pq]: WARNING: %s appears to be looping excessively\n", __FUNCTION__);
}
// This is just for asserting that we don't end up trying to wait when we didn't intend to.
bool wait_on_stack_ownership = false;
{ // scope guard
Guard<CriticalMutex> guard{&lock_};
// While holding lock_, we can safely add an event to be notified, if needed. While page state
// transitions from ALLOC to OBJECT, and from OBJECT with no VmCowPages to OBJECT with a
// VmCowPages, are both guarded by lock_, a transition to FREE is not. So we must check
// again, in an ordered fashion (using PmmNode lock not just "relaxed" atomic) for the page
// being in FREE state after we add an event, to ensure the transition to FREE doesn't miss
// the added event. If a page transitions back out of FREE due to actions by other threads,
// the lock_ protects the page's object field from being overwritten by an event being added.
vm_page_state state = page->state();
// If owning_cow, we know the owning_cow destructor can't run, so the only valid page
// states while FREE or borrowed by a VmCowPages and not pinned are FREE, ALLOC, OBJECT.
//
// If !owning_cow, the set of possible states isn't constrained, and we don't try to wait for
// the page.
switch (state) {
case vm_page_state::FREE:
// No cow, but still success. The fact that we were holding lock_ while reading page
// state isn't relevant to the transition to FREE; we just care that we'll notice FREE
// somewhere in the loop.
//
// We care that we will notice transition _to_ FREE that stays FREE indefinitely via this
// check. Other threads doing commit/decommit on owning_cow can cause this check to miss
// a transient FREE state, but we avoid getting stuck waiting indefinitely.
return ktl::nullopt;
case vm_page_state::OBJECT: {
// Sub-cases:
// * Using cow.
// * Loaning cow.
// * No cow (page moving from cow to cow).
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
if (!cow) {
if (!owning_cow) {
// If there's not a specific owning_cow, then we can't be as certain of the states the
// page may reach. For example the page may get used by something other than a
// VmCowPages, which wouldn't trigger the event. So we can't use the event mechanism.
//
// This is a success case. We checked if there was a using cow at the moment, and
// there wasn't.
return ktl::nullopt;
}
// Page is moving from cow to cow, and/or is on the way to FREE, so wait below for
// the page to get a new VmCowPages or become FREE. We still have to synchronize further
// below using thread_lock, since OBJECT to FREE doesn't hold PageQueues lock_.
wait_on_stack_ownership = true;
break;
} else if (cow == owning_cow) {
// This should be impossible, since PageSource guarantees that a given page will only be
// actively reclaimed by up to one thread at a time. If this happens, things are broken
// enough that we shouldn't continue.
panic("Requested page alraedy in owning_cow; unexpected\n");
} else {
// At this point the page may have pin_count != 0. We have to check in terms of which
// queue here, since we can't acquire the VmCowPages lock (wrong order).
if (!owning_cow) {
if (page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueueWired) {
// A pinned page is not replaceable.
return ktl::nullopt;
}
}
// There is a using/borrowing cow, but we may not be able to get a ref to it, if it's
// already destructing. We actually try to get a ref to the VmCowPagesContainer instead
// since that allows us to get a ref successfully up until _after_ the page has moved to
// FREE, avoiding any need to wait, and avoiding stuff needed to support such a wait.
//
// We're under PageQueues lock, so this value is stable at the moment, but by the time
// the caller acquires the cow lock this page could potentially be elsewhere, depending
// on whether the page is allowed to move to a different VmCowPages or to a different
// location in this VmCowPages, without going through FREE.
//
// The cow->RemovePageForEviction() does a re-check that this page is still at this
// offset. The caller's loop takes care of chasing down the page if it moves between
// VmCowPages or to a different offset in the same VmCowPages without going through
// FREE.
uint64_t page_offset = page->object.get_page_offset();
// We may be racing with destruction of VMO. As we currently hold PageQueues lock we
// know that our back pointer is correct as the VmCowPages has not yet completed
// running fbl_recycle() (which has to acquire PageQueues lock to remove the backlink
// and then release the ref on its VmCowPagesContainer), so we know it is safe to
// attempt to upgrade from a raw VmCowPagesContainer pointer to a VmCowPagesContainer
// RefPtr, and that this upgrade will succeed until _after_ the page is FREE. If
// upgrading fails we know the page has already become FREE; in that case we just go
// back around the loop since that's almost as efficient and less code than handling
// here.
VmCowPagesContainer* raw_cow_container = cow->raw_container();
VmoContainerBacklink backlink{fbl::MakeRefPtrUpgradeFromRaw(raw_cow_container, guard),
page, page_offset};
if (!backlink.cow_container) {
// Existing cow is at least at the end of fbl_recycle(). The page has already become
// FREE. Let the loop handle that since it's less code than handling here and not
// significantly more expensive to handle with the loop.
DEBUG_ASSERT(page->is_free());
continue;
} else {
// We AddRef(ed) the using cow_container. Success. Return the backlink. The caller
// can use this to call cow->RemovePageForEviction(), which is ok to call on a cow
// with refcount 0 as long as the caller is holding the backlink's cow_container
// VmCowPagesContainer ref.
return backlink;
}
}
break;
}
case vm_page_state::ALLOC:
if (!owning_cow) {
// When there's not an owning_cow, we don't know what use the page may be put to, so
// we don't know if the page has a StackOwnedLoanedPagesInterval, since those are only
// required for intervals involving stack ownership of loaned pages. Since the caller
// isn't strictly required to succeed at replacing a page when !owning_cow, the caller
// is ok with a successful "none" here since the page isn't immediately replaceable.
return ktl::nullopt;
}
// Wait for ALLOC to become OBJECT or FREE.
wait_on_stack_ownership = true;
break;
default:
// If owning_cow, we know the owning_cow destructor can't run, so the only valid page
// states while FREE or borrowed by a VmCowPages and not pinned are FREE, ALLOC, OBJECT.
DEBUG_ASSERT(!owning_cow);
// When !owning_cow, the possible page states include all page states. The caller is only
// interested in pages that are both used by a VmCowPages (not transiently stack owned)
// and which the caller can immediately replace with a different page, so WIRED state goes
// along with the list of other states where the caller can't just replace the page.
//
// There is no cow with this page as an immediately-replaceable page.
return ktl::nullopt;
}
} // ~guard
// If we get here, we know that wait_on_stack_ownership is true, and we know that never happens
// when !owning_cow.
DEBUG_ASSERT(wait_on_stack_ownership);
DEBUG_ASSERT(owning_cow);
StackOwnedLoanedPagesInterval::WaitUntilContiguousPageNotStackOwned(page);
// At this point, the state of the page has changed, but we don't know how much. Another thread
// doing commit on owning_cow may have finished moving the page into owning_cow. Yet another
// thread may have decommitted the page again, and yet another thread may be using the loaned
// page again now despite loan_cancelled having been used. The page may have been moved to a
// destination cow, but may now be moving again. What we do still know is that the page still
// has owning_cow as its underlying owner (owning_cow is a contiguous VmCowPages), thanks to
// the ref on owning_cow held by the caller, and how contiguous VmCowPages keep the same
// physical pages from creation to fbl_recycle().
//
// It's still the goal of this method to return the borrowing cow if there is one, or return
// success without a borrowing cow if the page is verified to be reclaim-able by the owning_cow
// at some point during this method (regardless of whether that remains true).
//
// Go around again to observe new page state.
//
}
}