// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <lib/counters.h>
#include <lib/fit/defer.h>
#include <lib/zircon-internal/macros.h>
#include <fbl/ref_counted_upgradeable.h>
#include <kernel/auto_preempt_disabler.h>
#include <object/thread_dispatcher.h>
#include <vm/page.h>
#include <vm/page_queues.h>
#include <vm/pmm.h>
#include <vm/scanner.h>
#include <vm/stack_owned_loaned_pages_interval.h>
#include <vm/vm_cow_pages.h>
namespace {
KCOUNTER(pq_aging_reason_before_min_timeout, "pq.aging.reason_before_min_timeout")
KCOUNTER(pq_aging_spurious_wakeup, "pq.aging.spurious_wakeup")
KCOUNTER(pq_aging_timeout_with_reason, "pq.aging.timeout_with_reason")
KCOUNTER(pq_aging_reason_none, "pq.aging.reason.none")
KCOUNTER(pq_aging_reason_timeout, "pq.aging.reason.timeout")
KCOUNTER(pq_aging_reason_active_ratio, "pq.aging.reason.active_ratio")
KCOUNTER(pq_aging_reason_manual, "pq.aging.reason.manual")
KCOUNTER(pq_aging_blocked_on_lru, "pq.aging.blocked_on_lru")
KCOUNTER(pq_lru_spurious_wakeup, "pq.lru.spurious_wakeup")
} // namespace
PageQueues::PageQueues()
: min_mru_rotate_time_(kDefaultMinMruRotateTime),
max_mru_rotate_time_(kDefaultMaxMruRotateTime),
active_ratio_multiplier_(kDefaultActiveRatioMultiplier) {
for (uint32_t i = 0; i < PageQueueNumQueues; i++) {
list_initialize(&page_queues_[i]);
}
list_initialize(&dont_need_processing_list_);
}
PageQueues::~PageQueues() {
StopThreads();
for (uint32_t i = 0; i < PageQueueNumQueues; i++) {
DEBUG_ASSERT(list_is_empty(&page_queues_[i]));
}
for (size_t i = 0; i < page_queue_counts_.size(); i++) {
DEBUG_ASSERT_MSG(page_queue_counts_[i] == 0, "i=%zu count=%zu", i,
page_queue_counts_[i].load());
}
}
void PageQueues::StartThreads(zx_duration_t min_mru_rotate_time,
zx_duration_t max_mru_rotate_time) {
// Clamp the max rotate to the minimum.
max_mru_rotate_time = ktl::max(min_mru_rotate_time, max_mru_rotate_time);
// Prevent a rotation rate that is too small.
max_mru_rotate_time = ktl::max(max_mru_rotate_time, ZX_SEC(1));
min_mru_rotate_time_ = min_mru_rotate_time;
max_mru_rotate_time_ = max_mru_rotate_time;
// Cannot perform all of thread creation under the lock, as thread creation requires allocations,
// so we create the threads in temporaries first and then stash them under the lock.
Thread* mru_thread = Thread::Create(
"page-queue-mru-thread",
[](void* arg) -> int {
static_cast<PageQueues*>(arg)->MruThread();
return 0;
},
this, LOW_PRIORITY);
DEBUG_ASSERT(mru_thread);
mru_thread->Resume();
Thread* lru_thread = Thread::Create(
"page-queue-lru-thread",
[](void* arg) -> int {
static_cast<PageQueues*>(arg)->LruThread();
return 0;
},
this, LOW_PRIORITY);
DEBUG_ASSERT(lru_thread);
lru_thread->Resume();
Guard<CriticalMutex> guard{&lock_};
ASSERT(!mru_thread_);
ASSERT(!lru_thread_);
mru_thread_ = mru_thread;
lru_thread_ = lru_thread;
}
void PageQueues::StopThreads() {
// Cannot wait for threads to complete with the lock held, so update state and then perform any
// joins outside the lock.
Thread* mru_thread = nullptr;
Thread* lru_thread = nullptr;
{
Guard<CriticalMutex> guard{&lock_};
shutdown_threads_ = true;
if (aging_disabled_.exchange(false)) {
aging_token_.Signal();
}
aging_event_.Signal();
lru_event_.Signal();
mru_thread = mru_thread_;
lru_thread = lru_thread_;
}
int retcode;
if (mru_thread) {
zx_status_t status = mru_thread->Join(&retcode, ZX_TIME_INFINITE);
ASSERT(status == ZX_OK);
}
if (lru_thread) {
zx_status_t status = lru_thread->Join(&retcode, ZX_TIME_INFINITE);
ASSERT(status == ZX_OK);
}
}
void PageQueues::SetActiveRatioMultiplier(uint32_t multiplier) {
Guard<CriticalMutex> guard{&lock_};
active_ratio_multiplier_ = multiplier;
// The change in multiplier might have caused us to need to age.
MaybeTriggerAgingLocked();
}
void PageQueues::MaybeTriggerAging() {
Guard<CriticalMutex> guard{&lock_};
MaybeTriggerAgingLocked();
}
void PageQueues::MaybeTriggerAgingLocked() {
if (GetAgeReasonLocked() != AgeReason::None) {
aging_event_.Signal();
}
}
PageQueues::AgeReason PageQueues::GetAgeReason() const {
Guard<CriticalMutex> guard{&lock_};
return GetAgeReasonLocked();
}
PageQueues::AgeReason PageQueues::GetAgeReasonLocked() const {
ActiveInactiveCounts active_count = GetActiveInactiveCountsLocked();
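// For example, with an active_ratio_multiplier_ of 2 this triggers aging once the active page
// count exceeds half the inactive page count.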
if (active_count.active * active_ratio_multiplier_ > active_count.inactive) {
return AgeReason::ActiveRatio;
}
return AgeReason::None;
}
void PageQueues::MaybeTriggerLruProcessing() {
if (NeedsLruProcessing()) {
lru_event_.Signal();
}
}
bool PageQueues::NeedsLruProcessing() const {
// Currently the only reason to trigger lru processing is if the MRU needs space.
// Performing this check unlocked is just as correct as grabbing the lock, reading, and dropping
// the lock. If a caller needs to know if the lru queue needs processing *and* then perform an
// action before that status could change, it should externally hold lock_ over this method and
// its action.
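// For example, once mru_gen_ - lru_gen_ reaches kNumPagerBacked - 1 every pager-backed queue
// slot is in use, so the MRU cannot rotate again until the LRU generation is advanced.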
if (mru_gen_.load(ktl::memory_order_relaxed) - lru_gen_.load(ktl::memory_order_relaxed) ==
kNumPagerBacked - 1) {
return true;
}
return false;
}
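// DisableAging() and EnableAging() must be used as a strictly matched pair. A hypothetical caller
// that needs aging to be quiescent might do:
//   page_queues->DisableAging();
//   // ... inspect or mutate state that must not race with the aging thread ...
//   page_queues->EnableAging();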
void PageQueues::DisableAging() {
// Validate a double DisableAging is not happening.
if (aging_disabled_.exchange(true)) {
panic("Mismatched disable/enable pair");
}
// Take the aging token. This will both wait for the aging thread to complete any in progress
// aging, and prevent it from aging until we return it.
aging_token_.Wait();
}
void PageQueues::EnableAging() {
// Validate a double EnableAging is not happening.
if (!aging_disabled_.exchange(false)) {
panic("Mismatched disable/enable pair");
}
// Return the aging token, allowing the aging thread to proceed if it was waiting.
aging_token_.Signal();
}
const char* PageQueues::string_from_age_reason(PageQueues::AgeReason reason) {
switch (reason) {
case AgeReason::None:
return "None";
case AgeReason::ActiveRatio:
return "Active ratio";
case AgeReason::Timeout:
return "Timeout";
case AgeReason::Manual:
return "Manual";
default:
panic("Unreachable");
}
}
void PageQueues::Dump() {
// Need to grab a copy of all the counts and generations. As the lock is needed to acquire the
// active/inactive counts, also hold the lock over the copying of the counts to avoid needless
// races.
uint64_t mru_gen;
uint64_t lru_gen;
size_t counts[kNumPagerBacked] = {};
size_t inactive_count;
zx_time_t last_age_time;
AgeReason last_age_reason;
ActiveInactiveCounts activeinactive;
{
Guard<CriticalMutex> guard{&lock_};
mru_gen = mru_gen_.load(ktl::memory_order_relaxed);
lru_gen = lru_gen_.load(ktl::memory_order_relaxed);
inactive_count =
page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
for (uint32_t i = 0; i < kNumPagerBacked; i++) {
counts[i] = page_queue_counts_[PageQueuePagerBackedBase + i].load(ktl::memory_order_relaxed);
}
activeinactive = GetActiveInactiveCountsLocked();
last_age_time = last_age_time_.load(ktl::memory_order_relaxed);
last_age_reason = last_age_reason_;
}
// Small arbitrary number that should be more than large enough to hold the constructed string
// without causing stack allocation pressure.
constexpr size_t kBufSize = 50;
// Start with the buffer null terminated. snprintf will always keep it null terminated.
char buf[kBufSize] __UNINITIALIZED = "\0";
size_t buf_len = 0;
// This formats the counts of all buckets, not just those within the mru->lru range, even though
// any buckets not in that range should always have a count of zero. The format this generates is
// [active],[active],inactive,inactive,{last inactive},should-be-zero,should-be-zero
// Although the inactive and should-be-zero use the same formatting, they are broken up by the
// {last inactive}.
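// As a purely illustrative example (assuming eight pager-backed buckets, two of them active, and
// an mru->lru distance of four) this could produce: [12],[30],7,5,{2},0,0,0,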
for (uint64_t i = 0; i < kNumPagerBacked; i++) {
PageQueue queue = gen_to_queue(mru_gen - i);
ASSERT(buf_len < kBufSize);
const size_t remain = kBufSize - buf_len;
int write_len;
if (i < kNumActiveQueues) {
write_len =
snprintf(buf + buf_len, remain, "[%zu],", counts[queue - PageQueuePagerBackedBase]);
} else if (i == mru_gen - lru_gen) {
write_len =
snprintf(buf + buf_len, remain, "{%zu},", counts[queue - PageQueuePagerBackedBase]);
} else {
write_len = snprintf(buf + buf_len, remain, "%zu,", counts[queue - PageQueuePagerBackedBase]);
}
// Negative values are returned on encoding errors, which we never expect to get.
ASSERT(write_len >= 0);
if (static_cast<uint>(write_len) >= remain) {
// Buffer too small, just use whatever we have constructed so far.
break;
}
buf_len += write_len;
}
zx_time_t current = current_time();
timespec age_time = zx_timespec_from_duration(zx_time_sub_time(current, last_age_time));
printf("pq: MRU generation is %" PRIu64
" set %ld.%lds ago due to \"%s\", LRU generation is %" PRIu64 "\n",
mru_gen, age_time.tv_sec, age_time.tv_nsec, string_from_age_reason(last_age_reason),
lru_gen);
printf("pq: Pager buckets %s evict first: %zu, %s active/inactive totals: %zu/%zu\n", buf,
inactive_count, activeinactive.cached ? "cached" : "live", activeinactive.active,
activeinactive.inactive);
}
// This runs the aging thread. Aging, unlike lru processing, scanning or eviction, requires very
// little work and is more about coordination. As such this thread is heavy on checks and signalling
// but generally only needs to hold any locks for the briefest of times.
// There is, currently, one exception to that, which is the calls to scanner_wait_for_accessed_scan.
// The scanner will, eventually, be a separate thread that this thread merely synchronizes with,
// but presently a full scan may happen inline in that method call and get attributed directly to
// this thread.
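// In outline, each loop iteration: sleeps until the minimum rotate time has elapsed, waits for
// either an age reason or the maximum rotate timeout, takes the aging token (blocking if aging is
// disabled), ensures accessed information has been harvested, rotates the pager-backed queues and
// finally pokes the LRU thread in case the rotation consumed the last free queue slot.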
void PageQueues::MruThread() {
// Pretend that aging happens during startup to simplify the rest of the loop logic.
last_age_time_ = current_time();
while (!shutdown_threads_.load(ktl::memory_order_relaxed)) {
// Wait for the min rotate time to pass.
Thread::Current::Sleep(
zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed), min_mru_rotate_time_));
// Wait on the aging_event_ in a loop to deal with spurious signals on the event. Will exit
// the loop for any of the following:
// * A legitimate age reason returned by GetAgeReason
// * Reached the maximum wait time
// * Shutdown has been requested.
// We specifically check GetAgeReason() and then wait on the aging event to catch scenarios
// where we both timed out *and* found a pending age reason. This just allows us to log this
// scenario, and is not necessary for correctness.
zx_status_t result = ZX_OK;
AgeReason age_reason = AgeReason::None;
int iterations = 0;
while (!shutdown_threads_.load(ktl::memory_order_relaxed) &&
(age_reason = GetAgeReason()) == AgeReason::None && result != ZX_ERR_TIMED_OUT) {
result = aging_event_.WaitDeadline(
zx_time_add_duration(last_age_time_.load(ktl::memory_order_relaxed),
max_mru_rotate_time_),
Interruptible::No);
iterations++;
}
if (shutdown_threads_.load(ktl::memory_order_relaxed)) {
break;
}
if (iterations == 0) {
// If we did zero iterations then there was an age reason already waiting for us, meaning
// we wanted to age prior to the min timeout clearing.
pq_aging_reason_before_min_timeout.Add(1);
} else if (iterations > 1) {
// Every loop iteration above 1 indicates a spurious wake up.
pq_aging_spurious_wakeup.Add(iterations - 1);
}
// If we exited the loop without an age reason then, since the shutdown case was already handled
// above, the only remaining cause must be a timeout.
if (age_reason == AgeReason::None) {
DEBUG_ASSERT(result == ZX_ERR_TIMED_OUT);
age_reason = AgeReason::Timeout;
} else {
// Had an age reason, but check if a timeout also occurred. This could happen legitimately due
// to races or delays in scheduling this thread, but we log this case in a counter anyway
// since it happening regularly could indicate a bug.
if (result == ZX_ERR_TIMED_OUT) {
pq_aging_timeout_with_reason.Add(1);
}
}
// Take the aging token, potentially blocking if aging is disabled, and make sure to return it
// when we are done.
aging_token_.Wait();
auto return_token = fit::defer([this] { aging_token_.Signal(); });
// Make sure the accessed information has been harvested since the last time we aged, otherwise
// we are deliberately making the age information coarser, by effectively not using one of the
// queues, at which point we might as well not have bothered rotating.
// Currently this is redundant since we will explicitly harvest just after aging, however once
// there are additional aging triggers and harvesting is more asynchronous, this will serve as
// a synchronization point.
scanner_wait_for_accessed_scan(last_age_time_, true);
RotatePagerBackedQueues(age_reason);
// Changing mru_gen_ could have impacted the eviction logic.
MaybeTriggerLruProcessing();
}
}
// This thread should, at some point, have some of its logic and signaling merged with the Evictor.
// Currently it might process the lru queue whilst the evictor is already trying to evict, which
// is not harmful, but is a bit wasteful as it duplicates work.
// LRU processing, via ProcessDontNeedAndLruQueues, is expensive and happens under the lock_. It is
// expected that ProcessDontNeedAndLruQueues perform small units of work to avoid this thread
// causing excessive lock contention.
void PageQueues::LruThread() {
while (!shutdown_threads_.load(ktl::memory_order_relaxed)) {
lru_event_.WaitDeadline(ZX_TIME_INFINITE, Interruptible::No);
// Take the lock so we can calculate (race free) a target lru gen.
uint64_t target_gen;
{
Guard<CriticalMutex> guard{&lock_};
if (!NeedsLruProcessing()) {
pq_lru_spurious_wakeup.Add(1);
continue;
}
target_gen = lru_gen_.load(ktl::memory_order_relaxed) + 1;
}
// With the lock dropped process the target. This is not racy as generations are monotonic, so
// worst case someone else already processed this generation and this call will be a no-op.
ProcessDontNeedAndLruQueues(target_gen, false);
}
}
void PageQueues::RotatePagerBackedQueues(AgeReason reason) {
VM_KTRACE_DURATION(2, "RotatePagerBackedQueues");
// We expect LRU processing to have already happened, so first poll the mru semaphore.
if (mru_semaphore_.Wait(Deadline::infinite_past()) == ZX_ERR_TIMED_OUT) {
// We should not have needed to wait for lru processing here, as it should have already been
// made available due to earlier triggers. Although this could reasonably happen due to races or
// delays in scheduling, we record it in a counter since it happening regularly could indicate a bug.
pq_aging_blocked_on_lru.Add(1);
MaybeTriggerLruProcessing();
// The LRU thread could take an arbitrary amount of time to get scheduled and run, so we cannot
// enforce a deadline. However, we can assume there might be a bug and start making noise to
// inform the user if we have waited multiples of the expected maximum aging interval, since
// that implies we are starting to lose the requested fidelity of age information.
int64_t timeouts = 0;
while (mru_semaphore_.Wait(Deadline::after(max_mru_rotate_time_, TimerSlack::none())) ==
ZX_ERR_TIMED_OUT) {
timeouts++;
printf("[pq] WARNING: Waited %" PRIi64 " seconds for LRU thread, MRU semaphore %" PRIu64
",aging is presently stalled\n",
(max_mru_rotate_time_ * timeouts) / ZX_SEC(1), mru_semaphore_.count());
Dump();
}
}
ASSERT(mru_gen_.load(ktl::memory_order_relaxed) - lru_gen_.load(ktl::memory_order_relaxed) <
kNumPagerBacked - 1);
{
// Acquire the lock to increment the mru_gen_. This allows other queue logic to not worry about
// mru_gen_ changing whilst they hold the lock.
Guard<CriticalMutex> guard{&lock_};
mru_gen_.fetch_add(1, ktl::memory_order_relaxed);
last_age_time_ = current_time();
last_age_reason_ = reason;
// Update the active/inactive counts. We could be a bit smarter here since we know exactly which
// active bucket might have changed, but this will work.
RecalculateActiveInactiveLocked();
}
// Keep a count of the different reasons we have rotated.
switch (reason) {
case AgeReason::None:
pq_aging_reason_none.Add(1);
break;
case AgeReason::Timeout:
pq_aging_reason_timeout.Add(1);
break;
case AgeReason::ActiveRatio:
pq_aging_reason_active_ratio.Add(1);
break;
case AgeReason::Manual:
pq_aging_reason_manual.Add(1);
break;
default:
panic("Unknown age reason");
}
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessQueueHelper(
ProcessingQueue processing_queue, uint64_t target_gen, bool peek) {
VM_KTRACE_DURATION(2, "ProcessQueue");
// Processing the DontNeed/LRU queue requires holding the page_queues_ lock_. The only other
// actions that require this lock are inserting or removing pages from the page queues. To ensure
// these actions can complete in a small bounded time kMaxQueueWork is chosen to be very small so
// that the lock will be regularly dropped. As processing the DontNeed/LRU queue is not time
// critical and can be somewhat inefficient in its operation we err on the side of doing less work
// per lock acquisition.
//
// Also, we need to limit the number to avoid sweep_to_loaned taking up excessive stack space.
static constexpr uint32_t kMaxQueueWork = 16;
// Each page in this list can potentially be replaced with a loaned page, but we have to do this
// replacement outside the PageQueues::lock_, so we accumulate pages and then try to replace the
// pages after lock_ is released.
VmoBacklink sweep_to_loaned[kMaxQueueWork];
uint32_t sweep_to_loaned_count = 0;
// Only accumulate pages to try to replace with loaned pages if loaned pages are available and
// we're allowed to borrow at this code location.
bool do_sweeping = (pmm_count_loaned_free_pages() != 0) &&
pmm_physical_page_borrowing_config()->is_borrowing_on_mru_enabled();
auto sweep_to_loaned_outside_lock =
fit::defer([do_sweeping, &sweep_to_loaned, &sweep_to_loaned_count] {
DEBUG_ASSERT(do_sweeping || sweep_to_loaned_count == 0);
for (uint32_t i = 0; i < sweep_to_loaned_count; ++i) {
DEBUG_ASSERT(sweep_to_loaned[i].cow);
// We ignore the return value because the page may have moved, become pinned, we may not
// have any free loaned pages any more, or the VmCowPages may not be able to borrow.
sweep_to_loaned[i].cow->ReplacePageWithLoaned(sweep_to_loaned[i].page,
sweep_to_loaned[i].offset);
}
});
Guard<CriticalMutex> guard{&lock_};
const uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
const uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
// If we're processing the lru queue and it has already hit the target gen, return early.
if (processing_queue == ProcessingQueue::Lru && lru >= target_gen) {
return ktl::nullopt;
}
uint32_t work_remain = kMaxQueueWork;
PageQueue queue;
list_node* operating_queue;
if (processing_queue == ProcessingQueue::Lru) {
queue = gen_to_queue(lru);
operating_queue = &page_queues_[queue];
} else {
// Processing the DontNeed queue typically involves moving from the Processing to the regular
// DontNeed queue, unless we're peeking in which case we can also grab from the regular. When
// peeking we prefer to grab from the dont_need_processing_list_ first, as its pages are older,
// or at least were moved to the DontNeed queue further in the past.
queue = PageQueuePagerBackedDontNeed;
if (peek && list_is_empty(&dont_need_processing_list_)) {
operating_queue = &page_queues_[queue];
} else {
operating_queue = &dont_need_processing_list_;
}
}
while (!list_is_empty(operating_queue) && work_remain > 0) {
work_remain--;
// When moving pages we want to maintain relative page age as far as possible. For pages in the
// LRU queue this means the pages we are moving have overall been touched more recently than
// their destination, and so we want to take from the oldest (tail). The opposite is true for
// the don't need process queue, as they are less recently used than their target queue.
// When not moving pages, aka peeking, we always want the oldest pages though.
vm_page_t* page;
if (processing_queue == ProcessingQueue::Lru || peek) {
page = list_peek_tail_type(operating_queue, vm_page_t, queue_node);
} else {
page = list_peek_head_type(operating_queue, vm_page_t, queue_node);
}
PageQueue page_queue =
(PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed);
if (processing_queue == ProcessingQueue::DontNeed) {
DEBUG_ASSERT(page_queue >= PageQueuePagerBackedDontNeed);
} else {
DEBUG_ASSERT(page_queue >= PageQueuePagerBackedBase);
}
// If the queue stored in the page does not match then we want to move it to its correct queue
// with the caveat that its queue could be invalid. The queue would be invalid if MarkAccessed
// had raced. Should this happen we know that the page is actually *very* old, and so we will
// fall back to the case of forcibly changing its age to the new lru gen.
const PageQueue mru_queue = gen_to_queue(mru);
// A page in the DontNeed queue will never be allowed to age to the point that its queue
// becomes invalid, because we process *all* the DontNeed pages each time we change the lru.
// In other words, no DontNeed page will be left behind when lru advances.
DEBUG_ASSERT(page_queue == queue || processing_queue != ProcessingQueue::DontNeed ||
queue_is_valid(page_queue, gen_to_queue(lru), mru_queue));
if (page_queue != queue && queue_is_valid(page_queue, gen_to_queue(lru), mru_queue)) {
list_delete(&page->queue_node);
list_add_head(&page_queues_[page_queue], &page->queue_node);
if (queue_is_active(page_queue, mru_queue) && do_sweeping && !pmm_is_loaned(page)) {
DEBUG_ASSERT(sweep_to_loaned_count < kMaxQueueWork);
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
uint64_t page_offset = page->object.get_page_offset();
DEBUG_ASSERT(cow);
sweep_to_loaned[sweep_to_loaned_count] =
VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, lock_), page, page_offset};
if (sweep_to_loaned[sweep_to_loaned_count].cow) {
++sweep_to_loaned_count;
}
}
} else if (peek) {
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
uint64_t page_offset = page->object.get_page_offset();
DEBUG_ASSERT(cow);
// We may be racing with destruction of the VMO. As we currently hold our lock we know that our
// back pointer is correct insofar as the VmCowPages has not yet completed running its
// destructor, so we know it is safe to attempt to upgrade it to a RefPtr. If upgrading
// fails we assume the page is about to be removed from the page queue once the VMO
// destructor gets a chance to run.
return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, lock_), page, page_offset};
} else {
// Force it into our target queue; we don't care about races. If the page happened to be
// accessed at the same time then that access is simply lost.
PageQueue new_queue = processing_queue == ProcessingQueue::DontNeed
? PageQueuePagerBackedDontNeed
: gen_to_queue(lru + 1);
PageQueue old_queue = (PageQueue)page->object.get_page_queue_ref().exchange(new_queue);
if (processing_queue == ProcessingQueue::DontNeed) {
DEBUG_ASSERT(old_queue >= PageQueuePagerBackedDontNeed);
} else {
DEBUG_ASSERT(old_queue >= PageQueuePagerBackedBase);
}
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
page_queue_counts_[new_queue].fetch_add(1, ktl::memory_order_relaxed);
list_delete(&page->queue_node);
if (processing_queue == ProcessingQueue::DontNeed) {
// When pulling from the DontNeed processing queue we pull from the head, and so we
// conversely need to add to the tail to preserve order (see the comment at the start of the
// loop).
list_add_tail(&page_queues_[new_queue], &page->queue_node);
} else {
list_add_head(&page_queues_[new_queue], &page->queue_node);
}
// We should only have performed this step to move from one inactive bucket to the next,
// so there should be no active/inactive count changes needed.
DEBUG_ASSERT(!queue_is_active(new_queue, mru_gen_to_queue()));
}
}
if (list_is_empty(operating_queue)) {
if (processing_queue == ProcessingQueue::Lru) {
lru_gen_.store(lru + 1, ktl::memory_order_relaxed);
mru_semaphore_.Post();
// Changing the lru_gen_ might in the future trigger a scenario where we need to age, but this
// is presently a no-op.
MaybeTriggerAgingLocked();
}
}
return ktl::nullopt;
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::ProcessDontNeedAndLruQueues(uint64_t target_gen,
bool peek) {
// This assertion is <=, and not strictly <, since to evict from some queue X, the target must be
// X+1. Hence to preserve kNumActiveQueues, we can allow target_gen to become equal to the first
// active queue, as this will process all the non-active queues. Although we might refresh our
// value for the mru_queue, since the mru_gen_ is monotonically increasing, if this assert passes once
// it should continue to be true.
ASSERT(target_gen <= mru_gen_.load(ktl::memory_order_relaxed) - (kNumActiveQueues - 1));
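// For example, with kNumActiveQueues == 2 the largest permitted target_gen is mru_gen_ - 1;
// processing up to that target drains every non-active queue while leaving both active buckets
// untouched.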
// Calculate a truly worst case loop iteration count based on every page being in either the LRU
// queue or the wrong DontNeed queue and needing to iterate the LRU multiple steps to the
// target_gen. Instead of reading the LRU and comparing the target_gen, just add a buffer of the
// maximum number of page queues.
ActiveInactiveCounts active_inactive = GetActiveInactiveCounts();
const uint64_t max_dont_need_iterations = page_queue_counts_[PageQueuePagerBackedDontNeed] + 1;
const uint64_t max_lru_iterations =
active_inactive.active + active_inactive.inactive + kNumPagerBacked;
// Loop iteration counting is just for diagnostic purposes.
uint64_t loop_iterations = 0;
ktl::optional<Guard<Mutex>> dont_need_processing_guard;
if (!peek) {
// If not peeking then we will need to properly process the DontNeed queue, and so we must take
// the processing lock and move the existing pages into the processing list.
dont_need_processing_guard.emplace(&dont_need_processing_lock_);
Guard<CriticalMutex> guard{&lock_};
ASSERT(list_is_empty(&dont_need_processing_list_));
list_move(&page_queues_[PageQueuePagerBackedDontNeed], &dont_need_processing_list_);
}
while (true) {
VM_KTRACE_DURATION(2, "ProcessDontNeedQueue");
if (loop_iterations++ == max_dont_need_iterations) {
printf("[pq]: WARNING: %s exceeded expected max DontNeed loop iterations %" PRIu64 "\n",
__FUNCTION__, max_dont_need_iterations);
}
auto optional_backlink = ProcessQueueHelper(ProcessingQueue::DontNeed, 0, peek);
if (optional_backlink != ktl::nullopt) {
DEBUG_ASSERT(peek);
return optional_backlink;
}
Guard<CriticalMutex> guard{&lock_};
// If we have been asked to peek, we want to keep processing until both the DontNeed queues
// are empty. ProcessQueueHelper will process one queue and then move on to the other when
// the first is empty, so that both get processed.
if (list_is_empty(&dont_need_processing_list_) &&
(!peek || list_is_empty(&page_queues_[PageQueuePagerBackedDontNeed]))) {
break;
}
}
// Not strictly required, but we're done with the dontneed processing lock.
dont_need_processing_guard.reset();
// Reset diagnostic counter for the new loop.
loop_iterations = 0;
// Process the lru queue to reach target_gen.
while (lru_gen_.load(ktl::memory_order_relaxed) < target_gen) {
VM_KTRACE_DURATION(2, "ProcessLruQueue");
if (loop_iterations++ == max_lru_iterations) {
printf("[pq]: WARNING: %s exceeded expected max LRU loop iterations %" PRIu64 "\n",
__FUNCTION__, max_lru_iterations);
}
auto optional_backlink = ProcessQueueHelper(ProcessingQueue::Lru, target_gen, peek);
if (optional_backlink != ktl::nullopt) {
return optional_backlink;
}
}
return ktl::nullopt;
}
void PageQueues::UpdateActiveInactiveLocked(PageQueue old_queue, PageQueue new_queue) {
// Short circuit the logic if not dealing with active/inactive (pager-backed) queues.
if (!queue_is_pager_backed(old_queue) && !queue_is_pager_backed(new_queue)) {
return;
}
// This just blindly updates the active/inactive counts. If accessed scanning is happening, and
// use_cached_queue_counts_ is true, then we could be racing and setting these to garbage
// values. That's fine as they will never get returned anywhere, and will get reset to correct
// values once access scanning completes.
PageQueue mru = mru_gen_to_queue();
if (queue_is_active(old_queue, mru)) {
active_queue_count_--;
} else if (queue_is_inactive(old_queue, mru)) {
inactive_queue_count_--;
}
if (queue_is_active(new_queue, mru)) {
active_queue_count_++;
} else if (queue_is_inactive(new_queue, mru)) {
inactive_queue_count_++;
}
MaybeTriggerAgingLocked();
}
void PageQueues::MarkAccessed(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
auto queue_ref = page->object.get_page_queue_ref();
// The page can be the zero page, but in that case we'll return early below.
DEBUG_ASSERT(page != vm_get_zero_page() ||
queue_ref.load(ktl::memory_order_relaxed) < PageQueuePagerBackedDontNeed);
// We need to check the current queue to see if it is in the pager backed range. Between checking
// this and updating the queue it could change, however it would only change as a result of
// MarkAccessedDeferredCount, which would only move it to another pager backed queue. No other
// change is possible as we are holding lock_.
if (queue_ref.load(ktl::memory_order_relaxed) < PageQueuePagerBackedDontNeed) {
return;
}
PageQueue queue = mru_gen_to_queue();
PageQueue old_queue = (PageQueue)queue_ref.exchange(queue, ktl::memory_order_relaxed);
// Double check that this was previously pager backed.
DEBUG_ASSERT(old_queue != PageQueueNone && old_queue >= PageQueuePagerBackedDontNeed);
if (old_queue != queue) {
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked(old_queue, queue);
}
}
void PageQueues::SetQueueLocked(vm_page_t* page, PageQueue queue) {
SetQueueBacklinkLocked(page, nullptr, 0, queue);
}
void PageQueues::SetQueueBacklinkLocked(vm_page_t* page, void* object, uintptr_t page_offset,
PageQueue queue) {
DEBUG_ASSERT(page->state() == vm_page_state::OBJECT);
DEBUG_ASSERT(!page->is_free());
DEBUG_ASSERT(!list_in_list(&page->queue_node));
if (object) {
DEBUG_ASSERT(!page->object.get_object() || page->object.get_object() == object);
DEBUG_ASSERT(!page->object.get_object() || page->object.get_page_offset() == page_offset);
page->object.set_object(object);
page->object.set_page_offset(page_offset);
} else {
// Currently SetQueueBacklinkLocked() can be used to clear the backlink in cases where the page
// isn't being removed from a VmCowPages. This won't work for loaned pages, but we don't have
// any loaned pages yet.
page->object.clear_object();
page->object.set_page_offset(0);
}
DEBUG_ASSERT(page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueNone);
page->object.get_page_queue_ref().store(queue, ktl::memory_order_relaxed);
list_add_head(&page_queues_[queue], &page->queue_node);
page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked(PageQueueNone, queue);
}
void PageQueues::MoveToQueueLocked(vm_page_t* page, PageQueue queue) {
MoveToQueueBacklinkLocked(page, nullptr, 0, queue);
}
void PageQueues::MoveToQueueBacklinkLocked(vm_page_t* page, void* object, uintptr_t page_offset,
PageQueue queue) {
DEBUG_ASSERT(page->state() == vm_page_state::OBJECT);
DEBUG_ASSERT(!page->is_free());
DEBUG_ASSERT(list_in_list(&page->queue_node));
uint32_t old_queue = page->object.get_page_queue_ref().exchange(queue, ktl::memory_order_relaxed);
DEBUG_ASSERT(old_queue != PageQueueNone);
if (object) {
DEBUG_ASSERT(!page->object.get_object() || page->object.get_object() == object);
DEBUG_ASSERT(!page->object.get_object() || page->object.get_page_offset() == page_offset);
page->object.set_object(object);
page->object.set_page_offset(page_offset);
} else {
// Currently MoveToQueueBacklinkLocked() can be used to clear the backlink in cases where the
// page isn't being removed from a VmCowPages. This won't work for loaned pages, but we don't
// have any loaned pages yet.
page->object.clear_object();
page->object.set_page_offset(0);
}
list_delete(&page->queue_node);
list_add_head(&page_queues_[queue], &page->queue_node);
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
page_queue_counts_[queue].fetch_add(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked((PageQueue)old_queue, queue);
}
void PageQueues::SetWired(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
SetQueueLocked(page, PageQueueWired);
}
void PageQueues::MoveToWired(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
MoveToQueueLocked(page, PageQueueWired);
}
void PageQueues::MoveToWired(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, PageQueueWired);
}
void PageQueues::SetUnswappable(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
SetQueueLocked(page, PageQueueUnswappable);
}
void PageQueues::MoveToUnswappableLocked(vm_page_t* page) {
MoveToQueueLocked(page, PageQueueUnswappable);
}
void PageQueues::MoveToUnswappable(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
MoveToUnswappableLocked(page);
}
void PageQueues::SetPagerBacked(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
SetQueueBacklinkLocked(page, object, page_offset, mru_gen_to_queue());
}
void PageQueues::MoveToPagerBacked(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, mru_gen_to_queue());
}
void PageQueues::MoveToPagerBackedDontNeed(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
MoveToQueueBacklinkLocked(page, page->object.get_object(), page->object.get_page_offset(),
PageQueuePagerBackedDontNeed);
}
void PageQueues::SetPagerBackedDirty(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
SetQueueBacklinkLocked(page, object, page_offset, PageQueuePagerBackedDirty);
}
void PageQueues::MoveToPagerBackedDirty(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, PageQueuePagerBackedDirty);
}
void PageQueues::SetUnswappableZeroFork(vm_page_t* page, VmCowPages* object, uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
SetQueueBacklinkLocked(page, object, page_offset, PageQueueUnswappableZeroFork);
}
void PageQueues::MoveToUnswappableZeroFork(vm_page_t* page, VmCowPages* object,
uint64_t page_offset) {
Guard<CriticalMutex> guard{&lock_};
DEBUG_ASSERT(object);
MoveToQueueBacklinkLocked(page, object, page_offset, PageQueueUnswappableZeroFork);
}
void PageQueues::RemoveLocked(vm_page_t* page) {
// Directly exchange the old gen.
uint32_t old_queue =
page->object.get_page_queue_ref().exchange(PageQueueNone, ktl::memory_order_relaxed);
DEBUG_ASSERT(old_queue != PageQueueNone);
page_queue_counts_[old_queue].fetch_sub(1, ktl::memory_order_relaxed);
UpdateActiveInactiveLocked((PageQueue)old_queue, PageQueueNone);
page->object.clear_object();
page->object.set_page_offset(0);
list_delete(&page->queue_node);
}
void PageQueues::Remove(vm_page_t* page) {
Guard<CriticalMutex> guard{&lock_};
RemoveLocked(page);
}
void PageQueues::RemoveArrayIntoList(vm_page_t** pages, size_t count, list_node_t* out_list) {
DEBUG_ASSERT(pages);
Guard<CriticalMutex> guard{&lock_};
for (size_t i = 0; i < count; i++) {
DEBUG_ASSERT(pages[i]);
RemoveLocked(pages[i]);
list_add_tail(out_list, &pages[i]->queue_node);
}
}
void PageQueues::BeginAccessScan() {
Guard<CriticalMutex> guard{&lock_};
ASSERT(!use_cached_queue_counts_.load(ktl::memory_order_relaxed));
cached_active_queue_count_ = active_queue_count_;
cached_inactive_queue_count_ = inactive_queue_count_;
use_cached_queue_counts_.store(true, ktl::memory_order_relaxed);
}
void PageQueues::RecalculateActiveInactiveLocked() {
uint64_t active = 0;
uint64_t inactive = 0;
uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
for (uint64_t index = lru; index <= mru; index++) {
uint64_t count = page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed);
if (queue_is_active(gen_to_queue(index), gen_to_queue(mru))) {
active += count;
} else {
// As we are only operating on pager backed queues, !active should imply inactive
DEBUG_ASSERT(queue_is_inactive(gen_to_queue(index), gen_to_queue(mru)));
inactive += count;
}
}
inactive += page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
// Update the counts.
active_queue_count_ = active;
inactive_queue_count_ = inactive;
// New counts might mean we need to age.
MaybeTriggerAgingLocked();
}
void PageQueues::EndAccessScan() {
Guard<CriticalMutex> guard{&lock_};
ASSERT(use_cached_queue_counts_.load(ktl::memory_order_relaxed));
// First clear the cached counts. Although the uncached counts aren't correct yet, we hold the
// lock so no one can observe them in the meantime.
cached_active_queue_count_ = 0;
cached_inactive_queue_count_ = 0;
use_cached_queue_counts_.store(false, ktl::memory_order_relaxed);
RecalculateActiveInactiveLocked();
}
PageQueues::PagerCounts PageQueues::GetPagerQueueCounts() const {
PagerCounts counts;
// Grab the lock to prevent LRU processing; this lets us get a slightly less racy snapshot of
// the queue counts, although we may still double count pages that move after we count them.
// Specifically any parallel callers of MarkAccessed could move a page and change the counts,
// causing us to either double count or miss count that page. As these counts are not load
// bearing we accept the very small chance of being off by a few pages.
Guard<CriticalMutex> guard{&lock_};
uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
counts.total = 0;
for (uint64_t index = lru; index <= mru; index++) {
uint64_t count = page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed);
// Distance to the MRU, and not the LRU, determines the bucket the count goes into. This is to
// match the logic in PeekPagerBacked, which is also based on distance to MRU.
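// For example, with kNumActiveQueues == 2 only the two generations closest to the MRU count
// towards |newest|, and with kNumOldestQueues == 1 only a generation that has fallen the maximum
// distance behind the MRU (plus the DontNeed queue accounted below) counts towards |oldest|.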
if (index > mru - kNumActiveQueues) {
counts.newest += count;
} else if (index <= mru - (kNumPagerBacked - kNumOldestQueues)) {
counts.oldest += count;
}
counts.total += count;
}
// Account the DontNeed queue length under |oldest|, since (DontNeed + oldest LRU) pages are
// eligible for reclamation first. |oldest| is meant to track pages eligible for eviction first.
uint64_t inactive_count =
page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
counts.oldest += inactive_count;
counts.total += inactive_count;
return counts;
}
PageQueues::Counts PageQueues::QueueCounts() const {
Counts counts = {};
// Grab the lock to prevent LRU processing; this lets us get a slightly less racy snapshot of
// the queue counts. We may still double count pages that move after we count them.
Guard<CriticalMutex> guard{&lock_};
uint64_t lru = lru_gen_.load(ktl::memory_order_relaxed);
uint64_t mru = mru_gen_.load(ktl::memory_order_relaxed);
for (uint64_t index = lru; index <= mru; index++) {
counts.pager_backed[mru - index] =
page_queue_counts_[gen_to_queue(index)].load(ktl::memory_order_relaxed);
}
counts.pager_backed_dont_need =
page_queue_counts_[PageQueuePagerBackedDontNeed].load(ktl::memory_order_relaxed);
counts.unswappable = page_queue_counts_[PageQueueUnswappable].load(ktl::memory_order_relaxed);
counts.wired = page_queue_counts_[PageQueueWired].load(ktl::memory_order_relaxed);
counts.unswappable_zero_fork =
page_queue_counts_[PageQueueUnswappableZeroFork].load(ktl::memory_order_relaxed);
return counts;
}
bool PageQueues::DebugPageIsPagerBacked(const vm_page_t* page, size_t* queue) const {
PageQueue q = (PageQueue)page->object.get_page_queue_ref().load(ktl::memory_order_relaxed);
if (q >= PageQueuePagerBackedBase && q <= PageQueuePagerBackedLast) {
if (queue) {
*queue = queue_age(q, mru_gen_to_queue());
}
return true;
}
return false;
}
bool PageQueues::DebugPageIsPagerBackedDontNeed(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueuePagerBackedDontNeed;
}
bool PageQueues::DebugPageIsPagerBackedDirty(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueuePagerBackedDirty;
}
bool PageQueues::DebugPageIsUnswappable(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueUnswappable;
}
bool PageQueues::DebugPageIsWired(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) == PageQueueWired;
}
bool PageQueues::DebugPageIsUnswappableZeroFork(const vm_page_t* page) const {
return page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueueUnswappableZeroFork;
}
bool PageQueues::DebugPageIsAnyUnswappable(const vm_page_t* page) const {
return DebugPageIsUnswappable(page) || DebugPageIsUnswappableZeroFork(page);
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::PopUnswappableZeroFork() {
Guard<CriticalMutex> guard{&lock_};
vm_page_t* page =
list_peek_tail_type(&page_queues_[PageQueueUnswappableZeroFork], vm_page_t, queue_node);
if (!page) {
return ktl::nullopt;
}
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
uint64_t page_offset = page->object.get_page_offset();
DEBUG_ASSERT(cow);
MoveToQueueLocked(page, PageQueueUnswappable);
// We may be racing with destruction of the VMO. As we currently hold our lock we know that our
// back pointer is correct insofar as the VmCowPages has not yet completed running its
// destructor, so we know it is safe to attempt to upgrade it to a RefPtr. If upgrading fails
// we assume the page is about to be removed from the page queue once the VMO destructor gets
// a chance to run.
return VmoBacklink{fbl::MakeRefPtrUpgradeFromRaw(cow, guard), page, page_offset};
}
ktl::optional<PageQueues::VmoBacklink> PageQueues::PeekPagerBacked(size_t lowest_queue) {
// Ignore any requests to evict from the active queues as this is never allowed.
lowest_queue = ktl::max(lowest_queue, kNumActiveQueues);
// The target gen is 1 larger than the lowest queue because evicting from queue X is done by
// attempting to make the lru queue be X+1.
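// For example, a lowest_queue of kNumActiveQueues (the minimum after the clamp above) yields a
// target gen of mru_gen_ - (kNumActiveQueues - 1), i.e. everything except the active queues is
// considered for eviction.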
return ProcessDontNeedAndLruQueues(mru_gen_.load(ktl::memory_order_relaxed) - (lowest_queue - 1),
true);
}
PageQueues::ActiveInactiveCounts PageQueues::GetActiveInactiveCounts() const {
Guard<CriticalMutex> guard{&lock_};
return GetActiveInactiveCountsLocked();
}
PageQueues::ActiveInactiveCounts PageQueues::GetActiveInactiveCountsLocked() const {
if (use_cached_queue_counts_.load(ktl::memory_order_relaxed)) {
return ActiveInactiveCounts{.cached = true,
.active = cached_active_queue_count_,
.inactive = cached_inactive_queue_count_};
} else {
// With use_cached_queue_counts_ false the counts should have been updated to remove any
// negative values that might have been caused by races.
ASSERT(active_queue_count_ >= 0);
ASSERT(inactive_queue_count_ >= 0);
return ActiveInactiveCounts{.cached = false,
.active = static_cast<uint64_t>(active_queue_count_),
.inactive = static_cast<uint64_t>(inactive_queue_count_)};
}
}
ktl::optional<PageQueues::VmoContainerBacklink> PageQueues::GetCowWithReplaceablePage(
vm_page_t* page, VmCowPages* owning_cow) {
// Wait for the page to not be in a transient state. This is in a loop, since the wait happens
// outside the lock, so another thread doing commit/decommit on owning_cow can cause the page
// state to change, potentially multiple times.
//
// While it's possible for another thread that's concurrently committing/decommitting this page
// to/from owning_cow, or moving the page from one VmCowPages to another without going through
// FREE, to interfere to some extent with this thread's progress toward a terminal state in this
// loop (and the caller's loop), this interference is fairly similar to page eviction interfering
// with progress of commit of a pager-backed range. That said, we mitigate here by tracking which
// cases we've seen that we only expect to see once in the absence of commit/decommit interference
// by another thread. Thanks to loan_cancelled, we can limit all the wait required cases to a max
// of once. This mitigation doesn't try to maximally detect interference and minimize iterations
// but the mitigation does limit iterations to a finite number.
//
// TODO(dustingreen):
// * complain on excessive loop iterations / duration looping
// * complain on excessive lifetime duration of StackOwnedLoanedPagesInterval, probably during
// destructor, but consider if there's any cheap and simple enough way to complain if it's just
// existing too long without any pre-existing calls on it.
uint loop_iterations = 0;
while (true) {
// Warn on excessive iterations. The threshold is chosen to be quite high since this isn't
// intending to check some strict finite bound, but rather to find pathological bugs where this
// is looping infinitely and monopolizing the lock_.
if (loop_iterations++ == 200) {
printf("[pq]: WARNING: %s appears to be looping excessively\n", __FUNCTION__);
}
// This is just for asserting that we don't end up trying to wait when we didn't intend to.
bool wait_on_stack_ownership = false;
{ // scope guard
Guard<CriticalMutex> guard{&lock_};
// While holding lock_, we can safely add an event to be notified, if needed. While page state
// transitions from ALLOC to OBJECT, and from OBJECT with no VmCowPages to OBJECT with a
// VmCowPages, are both guarded by lock_, a transition to FREE is not. So we must check
// again, in an ordered fashion (using PmmNode lock not just "relaxed" atomic) for the page
// being in FREE state after we add an event, to ensure the transition to FREE doesn't miss
// the added event. If a page transitions back out of FREE due to actions by other threads,
// the lock_ protects the page's object field from being overwritten by an event being added.
vm_page_state state = page->state();
// If owning_cow, we know the owning_cow destructor can't run, so the only valid page
// states while FREE or borrowed by a VmCowPages and not pinned are FREE, ALLOC, OBJECT.
//
// If !owning_cow, the set of possible states isn't constrained, and we don't try to wait for
// the page.
switch (state) {
case vm_page_state::FREE:
// No cow, but still success. The fact that we were holding lock_ while reading page
// state isn't relevant to the transition to FREE; we just care that we'll notice FREE
// somewhere in the loop.
//
// We care that we will notice transition _to_ FREE that stays FREE indefinitely via this
// check. Other threads doing commit/decommit on owning_cow can cause this check to miss
// a transient FREE state, but we avoid getting stuck waiting indefinitely.
return ktl::nullopt;
case vm_page_state::OBJECT: {
// Sub-cases:
// * Using cow.
// * Loaning cow.
// * No cow (page moving from cow to cow).
VmCowPages* cow = reinterpret_cast<VmCowPages*>(page->object.get_object());
if (!cow) {
if (!owning_cow) {
// If there's not a specific owning_cow, then we can't be as certain of the states the
// page may reach. For example the page may get used by something other than a
// VmCowPages, which wouldn't trigger the event. So we can't use the event mechanism.
//
// This is a success case. We checked if there was a using cow at the moment, and
// there wasn't.
return ktl::nullopt;
}
// Page is moving from cow to cow, and/or is on the way to FREE, so wait below for
// the page to get a new VmCowPages or become FREE. We still have to synchronize further
// below using thread_lock, since OBJECT to FREE doesn't hold PageQueues lock_.
wait_on_stack_ownership = true;
break;
} else if (cow == owning_cow) {
// This should be impossible, since PageSource guarantees that a given page will only be
// actively reclaimed by up to one thread at a time. If this happens, things are broken
// enough that we shouldn't continue.
panic("Requested page alraedy in owning_cow; unexpected\n");
} else {
// At this point the page may have pin_count != 0. We have to check in terms of which
// queue here, since we can't acquire the VmCowPages lock (wrong order).
if (!owning_cow) {
if (page->object.get_page_queue_ref().load(ktl::memory_order_relaxed) ==
PageQueueWired) {
// A pinned page is not replaceable.
return ktl::nullopt;
}
}
// There is a using/borrowing cow, but we may not be able to get a ref to it, if it's
// already destructing. We actually try to get a ref to the VmCowPagesContainer instead
// since that allows us to get a ref successfully up until _after_ the page has moved to
// FREE, avoiding any need to wait, and avoiding stuff needed to support such a wait.
//
// We're under PageQueues lock, so this value is stable at the moment, but by the time
// the caller acquires the cow lock this page could potentially be elsewhere, depending
// on whether the page is allowed to move to a different VmCowPages or to a different
// location in this VmCowPages, without going through FREE.
//
// The cow->RemovePageForEviction() does a re-check that this page is still at this
// offset. The caller's loop takes care of chasing down the page if it moves between
// VmCowPages or to a different offset in the same VmCowPages without going through
// FREE.
uint64_t page_offset = page->object.get_page_offset();
// We may be racing with destruction of VMO. As we currently hold PageQueues lock we
// know that our back pointer is correct as the VmCowPages has not yet completed
// running fbl_recycle() (which has to acquire PageQueues lock to remove the backlink
// and then release the ref on its VmCowPagesContainer), so we know it is safe to
// attempt to upgrade from a raw VmCowPagesContainer pointer to a VmCowPagesContainer
// RefPtr, and that this upgrade will succeed until _after_ the page is FREE. If
// upgrading fails we know the page has already become FREE; in that case we just go
// back around the loop since that's almost as efficient and less code than handling
// here.
VmCowPagesContainer* raw_cow_container = cow->raw_container();
VmoContainerBacklink backlink{fbl::MakeRefPtrUpgradeFromRaw(raw_cow_container, guard),
page, page_offset};
if (!backlink.cow_container) {
// Existing cow is at least at the end of fbl_recycle(). The page has already become
// FREE. Let the loop handle that since it's less code than handling here and not
// significantly more expensive to handle with the loop.
DEBUG_ASSERT(page->is_free());
continue;
} else {
// We AddRef(ed) the using cow_container. Success. Return the backlink. The caller
// can use this to call cow->RemovePageForEviction(), which is ok to call on a cow
// with refcount 0 as long as the caller is holding the backlink's cow_container
// VmCowPagesContainer ref.
return backlink;
}
}
break;
}
case vm_page_state::ALLOC:
if (!owning_cow) {
// When there's not an owning_cow, we don't know what use the page may be put to, so
// we don't know if the page has a StackOwnedLoanedPagesInterval, since those are only
// required for intervals involving stack ownership of loaned pages. Since the caller
// isn't strictly required to succeed at replacing a page when !owning_cow, the caller
// is ok with a successful "none" here since the page isn't immediately replaceable.
return ktl::nullopt;
}
// Wait for ALLOC to become OBJECT or FREE.
wait_on_stack_ownership = true;
break;
default:
// If owning_cow, we know the owning_cow destructor can't run, so the only valid page
// states while FREE or borrowed by a VmCowPages and not pinned are FREE, ALLOC, OBJECT.
DEBUG_ASSERT(!owning_cow);
// When !owning_cow, the possible page states include all page states. The caller is only
// interested in pages that are both used by a VmCowPages (not transiently stack owned)
// and which the caller can immediately replace with a different page, so WIRED state goes
// along with the list of other states where the caller can't just replace the page.
//
// There is no cow with this page as an immediately-replaceable page.
return ktl::nullopt;
}
} // ~guard
// If we get here, we know that wait_on_stack_ownership is true, and we know that never happens
// when !owning_cow.
DEBUG_ASSERT(wait_on_stack_ownership);
DEBUG_ASSERT(owning_cow);
StackOwnedLoanedPagesInterval::WaitUntilContiguousPageNotStackOwned(page);
// At this point, the state of the page has changed, but we don't know how much. Another thread
// doing commit on owning_cow may have finished moving the page into owning_cow. Yet another
// thread may have decommitted the page again, and yet another thread may be using the loaned
// page again now despite loan_cancelled having been used. The page may have been moved to a
// destination cow, but may now be moving again. What we do still know is that the page still
// has owning_cow as its underlying owner (owning_cow is a contiguous VmCowPages), thanks to
// the ref on owning_cow held by the caller, and how contiguous VmCowPages keep the same
// physical pages from creation to fbl_recycle().
//
// It's still the goal of this method to return the borrowing cow if there is one, or return
// success without a borrowing cow if the page is verified to be reclaim-able by the owning_cow
// at some point during this method (regardless of whether that remains true).
//
// Go around again to observe new page state.
//
}
}