// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_VM_INCLUDE_VM_PAGE_QUEUES_H_
#define ZIRCON_KERNEL_VM_INCLUDE_VM_PAGE_QUEUES_H_
#include <lib/fit/result.h>
#include <sys/types.h>
#include <zircon/listnode.h>
#include <fbl/algorithm.h>
#include <fbl/macros.h>
#include <kernel/event.h>
#include <kernel/lockdep.h>
#include <kernel/mutex.h>
#include <kernel/semaphore.h>
#include <ktl/array.h>
#include <ktl/optional.h>
#include <vm/debug_compressor.h>
#include <vm/page.h>
class VmCowPages;
// Allocated pages that are part of the cow pages in a VmObjectPaged can be placed in a page queue.
// The page queues provide a way to
// * Classify and group pages across VMO boundaries
// * Retrieve the VMO that a page is contained in (via a back reference stored in the vm_page_t)
// Once a page has been placed in a page queue its queue_node becomes owned by the page queue and
// must not be used until the page has been Remove'd. It is not sufficient to call list_delete on
// the queue_node yourself as this operation is not atomic and needs to be performed whilst holding
// the PageQueues::lock_.
class PageQueues {
public:
// The number of reclamation queues is slightly arbitrary, but to be useful you want at least 3
// representing
// * Very new pages that you probably don't want to evict as doing so probably implies you are in
// swap death
// * Slightly old pages that could be evicted if needed
// * Very old pages that you'd be happy to evict
// With two active queues, 8 page queues are used so that there is some fidelity of information in
// the inactive queues. Additional queues have reduced value as sufficiently old pages quickly
// become equivalently unlikely to be used in the future.
static constexpr size_t kNumReclaim = 8;
// Two active queues are used to allow for better fidelity of active information. This prevents
// a race between aging once and needing to collect/harvest age information.
static constexpr size_t kNumActiveQueues = 2;
static_assert(kNumReclaim > kNumActiveQueues, "Needs to be at least one non-active queue");
// In addition to active and inactive, we want to consider some of the queues as 'oldest' to
// provide an additional way to limit eviction. Presently the processing of the LRU queue to make
// room for aging is not integrated with the Evictor, and so will not trigger eviction. Therefore,
// to have a non-zero number of pages ever appear in an oldest queue for eviction, the last two
// queues are considered the oldest.
static constexpr size_t kNumOldestQueues = 2;
static_assert(kNumOldestQueues + kNumActiveQueues <= kNumReclaim);
static constexpr zx_duration_t kDefaultMinMruRotateTime = ZX_SEC(5);
static constexpr zx_duration_t kDefaultMaxMruRotateTime = ZX_SEC(5);
// This is presently an arbitrary constant, since the min and max mru rotate time are currently
// fixed at the same value, meaning that the active ratio cannot presently trigger, or prevent,
// aging.
static constexpr uint64_t kDefaultActiveRatioMultiplier = 0;
// When holding the PageQueue lock, and performing an operation on an
// arbitrary number of pages, the "max batch size" controls the maximum number
// of pages for which the lock will be held before letting it go and
// allowing other operations to proceed.
//
// For example, if someone is calling RemoveArrayIntoList, with a list of 200
// pages to release, the lock will be obtained and dropped a total of 4 times,
// removing batches of 64 pages for the first 3 iterations, and finally a
// batch of 8 pages for the final iteration.
static inline constexpr size_t kMaxBatchSize = 64;
PageQueues();
~PageQueues();
DISALLOW_COPY_ASSIGN_AND_MOVE(PageQueues);
// This is a specialized version of MarkAccessed designed to be called during accessed harvesting.
// It does not update active/inactive counts, and this needs to be done separately once harvesting
// is complete. It is only permitted to call this in between BeginAccessScan and EndAccessScan
// calls.
void MarkAccessedDeferredCount(vm_page_t* page);
// All Set operations place a page, which must not currently be in a page queue, into the
// specified queue. The backlink information of |object| and |page_offset| must be specified and
// valid. If the page is either removed from the referenced object, or moved to a different
// offset, the backlink information must be updated either by calling ChangeObjectOffsetLocked, or
// removing the page completely from the queues.
void SetWired(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
void SetAnonymous(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
void SetPagerBacked(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
void SetPagerBackedDirty(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
void SetAnonymousZeroFork(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
void SetHighPriority(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
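// A minimal usage sketch of the Set/Remove protocol from a hypothetical owning VmCowPages call
// site (|pq|, |page| and |offset| are illustrative names, not taken from real call sites):
//
// ```
// // The page's queue_node becomes owned by the queues; the backlink must remain valid from here.
// pq.SetPagerBacked(page, this, offset);
// // ...
// // Before freeing the page, or once the backlink would become stale, take it back out.
// pq.Remove(page);
// ```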
// All Move operations change the queue that a page is considered to be in, but do not change the
// object or offset backlink information. The page must currently be in a valid page queue.
void MoveToWired(vm_page_t* page);
void MoveToAnonymous(vm_page_t* page);
void MoveToPagerBacked(vm_page_t* page);
void MoveToPagerBackedDontNeed(vm_page_t* page);
void MoveToPagerBackedDirty(vm_page_t* page);
void MoveToAnonymousZeroFork(vm_page_t* page);
void MoveToHighPriority(vm_page_t* page);
// Indicates that a page has failed a compression attempt, and moves it to a separate queue to
// prevent it from being considered part of the reclaim set, which makes it neither active nor
// inactive. The specified page must be in the page queues, but if not presently in a reclaim
// queue this method will do nothing.
// TODO(https://fxbug.dev/42138396): Determine whether/how pages are moved back into the reclaim
// pool and either further generalize this to support pager backed, or specialize FailedReclaim to
// be explicitly only anonymous.
void CompressFailed(vm_page_t* page);
// Changes the backlink information for a page and should only be called by the page owner under
// its lock (that is the VMO lock). The page must currently be in a valid page queue.
void ChangeObjectOffset(vm_page_t* page, VmCowPages* object, uint64_t page_offset);
// Externally locked variant of ChangeObjectOffset that can be used for more efficient batch
// operations. In addition to the annotated lock_, the VMO lock of the owner is also required to
// be held.
void ChangeObjectOffsetLocked(vm_page_t* page, VmCowPages* object, uint64_t page_offset)
TA_REQ(lock_);
// Removes the page from any page list and returns ownership of the queue_node.
void Remove(vm_page_t* page);
// Batched version of Remove that also places all the pages in the specified list
void RemoveArrayIntoList(vm_page_t** page, size_t count, list_node_t* out_list);
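// A minimal sketch of batched removal, assuming the caller has already gathered an array of
// vm_page_t pointers (|pq|, |pages| and |count| are illustrative names):
//
// ```
// list_node_t free_list = LIST_INITIAL_VALUE(free_list);
// pq.RemoveArrayIntoList(pages, count, &free_list);
// // Internally the removal is performed in batches of at most kMaxBatchSize pages per lock
// // acquisition, so other page queue operations can interleave with a large removal.
// ```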
// Tells the page queue this page has been accessed, and it should have its position in the queues
// updated. This method will take the internal page queues lock and should not be used for
// accessed harvesting, where MarkAccessedDeferredCount should be used instead.
void MarkAccessed(vm_page_t* page) {
// Can retrieve the queue ref and do a short circuit check for whether mark accessed is
// necessary. Although this check is racy, since we do not yet hold the lock, we will either:
// * Race and fail to mark accessed - this is fine since racing implies a newly added page,
// which would be added to the mru queue anyway.
// * Race and attempt to spuriously mark accessed - this is fine as we will check again with
// the lock held.
auto queue_ref = page->object.get_page_queue_ref();
const uint8_t queue = queue_ref.load(ktl::memory_order_relaxed);
if (queue < PageQueueReclaimDontNeed) {
return;
}
// With the early check complete, continue with the non-inlined longer body.
MarkAccessedContinued(page);
}
// Provides access to the underlying lock, allowing _Locked variants to be called. Use of this is
// highly discouraged as the underlying lock is a spinlock, which disables interrupts while held.
// Preferably *Array variations should be used, but this provides a higher performance mechanism
// when needed.
Lock<SpinLock>* get_lock() TA_RET_CAP(lock_) { return &lock_; }
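// A hedged sketch of a batched backlink update using the exposed lock. This assumes the caller
// already holds the owning VMO's lock, which ChangeObjectOffsetLocked also requires; |pq|,
// |object|, |pages|, |count| and |new_offsets| are illustrative names:
//
// ```
// Guard<SpinLock, IrqSave> guard{pq.get_lock()};
// for (size_t i = 0; i < count; i++) {
//   pq.ChangeObjectOffsetLocked(pages[i], object, new_offsets[i]);
// }
// // Keep such batches small (see kMaxBatchSize above) since the lock disables interrupts.
// ```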
// Used to identify the reason that aging is triggered, mostly for debugging and informational
// purposes.
enum class AgeReason {
// There is no current age reason.
None,
// Aging occurred due to the maximum timeout being reached before any other reason could trigger it.
Timeout,
// The allowable ratio of active versus inactive pages was exceeded.
ActiveRatio,
// An explicit call to RotateReclaimQueues caused aging. This would typically occur due to
// test code or via the kernel debug console.
Manual,
};
static const char* string_from_age_reason(PageQueues::AgeReason reason);
// Rotates the reclamation queues to perform aging. Every existing queue is now considered to be
// one epoch older. To achieve this, two things are done:
// 1. A new queue, representing the current epoch, needs to be allocated to put pages that get
// accessed from here into. This just involves incrementing the MRU generation.
// 2. As there is a limited number of page queues 'allocating' one might involve cleaning up an
// old queue. See the description of ProcessDontNeedAndLruQueues for how this process works.
void RotateReclaimQueues(AgeReason reason = AgeReason::Manual);
// Used to represent and return page backlink information acquired whilst holding the page queue
// lock. As a VMO may not destruct while it has pages in it, the cow RefPtr will always be valid,
// although the page and offset contained here are not synchronized and must be separately
// validated before use. This can be done by acquiring the returned vmo's lock and then validating
// that the page is still contained at the offset.
struct VmoBacklink {
fbl::RefPtr<VmCowPages> cow;
vm_page_t* page = nullptr;
uint64_t offset = 0;
};
// Moves a page from the anonymous zero fork queue into the anonymous queue and returns
// the backlink information. If the zero fork queue is empty then a nullopt is returned, otherwise
// if it has_value the cow field may be null to indicate that the vmo is running its destructor
// (see VmoBacklink for more details).
ktl::optional<VmoBacklink> PopAnonymousZeroFork();
// Looks at the reclaimable queues and returns backlink information of the first page found. The
// queues themselves are walked from the current LRU queue up to the queue that is at most
// |lowest_queue| epochs from the most recent. |lowest_queue| therefore represents the youngest
// age that would be accepted. If no page was found a nullopt is returned, otherwise if
// it has_value the cow field may be null to indicate that the vmo is running its destructor (see
// VmoBacklink for more details). If a page is returned its location in the reclaim queue is
// not modified.
ktl::optional<VmoBacklink> PeekReclaim(size_t lowest_queue);
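// A hedged consumer sketch for PeekReclaim; the revalidation step is only described in prose here
// and the exact VmCowPages calls needed to perform it are outside this header:
//
// ```
// if (auto backlink = pq.PeekReclaim(lowest_queue); backlink && backlink->cow) {
//   // Acquire backlink->cow's lock and confirm backlink->page is still at backlink->offset
//   // before evicting or otherwise acting on the page.
// }
// ```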
// Called while the loaning VmCowPages is known to be referenced. The owning_cow parameter can be
// nullptr, if the caller doesn't care to exclude the owning cow from being returned, or if there
// isn't an owning cow.
ktl::optional<VmoBacklink> GetCowWithReplaceablePage(vm_page_t* page, VmCowPages* owning_cow);
// Helper struct to group reclaimable queue length counts returned by GetReclaimCounts.
struct ReclaimCounts {
size_t total = 0;
size_t newest = 0;
size_t oldest = 0;
};
// Returns just the reclaim queue counts. Called from the zx_object_get_info() syscall.
ReclaimCounts GetReclaimQueueCounts() const;
// Helper struct to group queue length counts returned by QueueCounts.
struct Counts {
ktl::array<size_t, kNumReclaim> reclaim = {0};
size_t reclaim_dont_need = 0;
size_t pager_backed_dirty = 0;
size_t anonymous = 0;
size_t wired = 0;
size_t anonymous_zero_fork = 0;
size_t failed_reclaim = 0;
size_t high_priority = 0;
bool operator==(const Counts& other) const {
return reclaim == other.reclaim && reclaim_dont_need == other.reclaim_dont_need &&
pager_backed_dirty == other.pager_backed_dirty && anonymous == other.anonymous &&
wired == other.wired && anonymous_zero_fork == other.anonymous_zero_fork &&
failed_reclaim == other.failed_reclaim && high_priority == other.high_priority;
}
bool operator!=(const Counts& other) const { return !(*this == other); }
};
Counts QueueCounts() const;
struct ActiveInactiveCounts {
// Whether the returned counts were cached values, or the current 'true' values. Cached values
// are returned if an accessed scan is ongoing, as the true values cannot be determined in a
// race free way.
bool cached = false;
// Pages that would normally be available for eviction, but are presently considered active and
// so will not be evicted.
size_t active = 0;
// Pages that are available for eviction due to not presently being considered active.
size_t inactive = 0;
bool operator==(const ActiveInactiveCounts& other) const {
return cached == other.cached && active == other.active && inactive == other.inactive;
}
bool operator!=(const ActiveInactiveCounts& other) const { return !(*this == other); }
};
// Retrieves the current active/inactive counts, or a cache of the last known good ones if
// accessed harvesting is happening. This method is guaranteed to return in a small window of time
// due to only needing to acquire a single lock that has very short critical sections. However,
// this means it may have to return old values if accessed scanning is happening. If blocking and
// waiting is acceptable then |scanner_synchronized_active_inactive_counts| should be used, which
// calls this when it knows accessed scanning is not happening, guaranteeing a live value.
ActiveInactiveCounts GetActiveInactiveCounts() const TA_EXCL(lock_);
void Dump() TA_EXCL(lock_);
// Returns a global count of all pages compressed at the point of LRU change. This is a global
// method and will include stats from every PageQueues that has been instantiated.
static uint64_t GetLruPagesCompressed();
// Enables reclamation of anonymous pages by causing them to be placed into the reclaimable queue
// instead of the dedicated anonymous queue. The |zero_forks| parameter controls whether the
// anonymous zero forks should also go into the general reclaimable queue or not.
// Any pages already placed into the anonymous queues will be moved over, and there is no way to
// disable this once enabled.
void EnableAnonymousReclaim(bool zero_forks);
// Returns whether the reclaim queues only include pager backed pages.
bool ReclaimIsOnlyPagerBacked() const { return !anonymous_is_reclaimable_; }
// These query functions are marked Debug as it is generally a racy way to determine a page's state
// and these are exposed for the purpose of writing tests or asserts against the pagequeue.
// This takes an optional output parameter that, if the function returns true, will contain the
// index of the queue that the page was in.
bool DebugPageIsPagerBacked(const vm_page_t* page, size_t* queue = nullptr) const;
bool DebugPageIsPagerBackedDontNeed(const vm_page_t* page) const;
bool DebugPageIsPagerBackedDirty(const vm_page_t* page) const;
bool DebugPageIsAnonymous(const vm_page_t* page) const;
bool DebugPageIsAnonymousZeroFork(const vm_page_t* page) const;
bool DebugPageIsAnyAnonymous(const vm_page_t* page) const;
bool DebugPageIsWired(const vm_page_t* page) const;
bool DebugPageIsHighPriority(const vm_page_t* page) const;
// These methods are public so that the scanner can call them. Once the scanner is an object that can
// be friended, and not a collection of anonymous functions, these can be made private.
// Creates any threads for queue management. This needs to be done separately to construction as
// there is a recursive dependency where creating threads will need to manipulate pages, which
// will call back into the page queues.
// Delaying thread creation is fine as these threads are purely for aging and eviction management,
// which is not needed during early kernel boot.
// Failure to start the threads may cause operations such as RotateReclaimQueues to block
// indefinitely as they might attempt to offload work to a nonexistent thread. This issue is only
// relevant for unittests that may wish to avoid starting the threads for some tests.
// It is the responsibility of the caller to only call this once, otherwise it will panic.
void StartThreads(zx_duration_t min_mru_rotate_time, zx_duration_t max_mru_rotate_time);
// Initializes and starts the debug compression, which attempts to immediately compress a random
// subset of pages added to the page queues. It is an error to call this if there is no compressor
// or if not running in debug mode.
void StartDebugCompressor();
// Sets the active ratio multiplier.
void SetActiveRatioMultiplier(uint32_t multiplier);
// Describes any action to take when processing the LRU queue. This is applied to pages that would
// otherwise have to be moved from the old LRU queue into the next queue when making space.
enum class LruAction {
None,
EvictOnly,
CompressOnly,
EvictAndCompress,
};
void SetLruAction(LruAction action);
// Controls for enabling and disabling the active aging system. These must be called alternately and
// not in parallel. That is, it is an error to call DisableAging twice without calling EnableAging
// in between. Similar for EnableAging.
void DisableAging() TA_EXCL(lock_);
void EnableAging() TA_EXCL(lock_);
// Called by the scanner to indicate the beginning of an accessed scan. This allows
// MarkAccessedDeferredCount to be used, and will cause the active/inactive counts returned by
// GetActiveInactiveCounts to remain unchanged until the accessed scan is complete.
void BeginAccessScan();
void EndAccessScan();
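// A minimal sketch of the accessed-harvest protocol described above (the page iteration is
// illustrative; |harvested_pages| is a hypothetical collection of pages found to be accessed):
//
// ```
// pq.BeginAccessScan();
// for (vm_page_t* page : harvested_pages) {
//   pq.MarkAccessedDeferredCount(page);
// }
// pq.EndAccessScan();  // active/inactive counts are recalculated once the scan completes
// ```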
private:
// An enumeration of the various types of signals which could become
// pending-dispatch while we are holding the main PageQueues lock.
enum class PendingSignal : uint32_t {
None = 0x0,
AgingToken = 0x1,
AgingEvent = 0x2,
LruEvent = 0x4,
};
// Specifies the indices for both the page_queues_ and the page_queue_counts_
enum PageQueue : uint8_t {
PageQueueNone = 0,
PageQueueAnonymous,
PageQueueWired,
PageQueueHighPriority,
PageQueueAnonymousZeroFork,
PageQueuePagerBackedDirty,
PageQueueFailedReclaim,
PageQueueReclaimDontNeed,
PageQueueReclaimBase,
PageQueueReclaimLast = PageQueueReclaimBase + kNumReclaim - 1,
PageQueueNumQueues,
};
// A "DeferPendingSignals" is a small RAII-style helper which assists with
// following the rules for properly handling the Events in `PageQueues`
//
// As pages are added to, removed from, and moved between, various page
// queues, it may become necessary to signal and wake up one of the background
// threads responsible for maintaining the queues. Most of the time, however,
// this happens while we are holding the PageQueues spinlock. Unfortunately,
// it is illegal to signal an Event while holding a spinlock, so the
// signalling operation must be deferred until just after the lock has been
// dropped.
//
// A DeferPendingSignals is used to accumulate the set of Events which
// need to be signalled when we (eventually) drop the lock. Methods which
// might need to have a signal asserted demand that callers pass a reference
// to a DeferPendingSignals in order to accumulate any pending signals, if
// needed.
//
// PageQueues Events have been wrapped in a thin wrapper class which allows
// them to be waited on, but not signalled by anything but a
// DeferPendingSignals. The DeferPendingSignals instances themselves cannot be constructed while
// the main `lock_` is being held, nor can they be copied, moved, or heap
// allocated. Provided that scope based guards are being used to manage the
// `lock_`, this makes it very difficult for a DeferPendingSignals instance to destruct and
// flush pending signals while `lock_` is being held.
//
// Any time that a user wants to call a locked method which might need a
// signal to fire, all they need to do is to put a DeferPendingSignals instance on the stack
// outside of the scope of the lock guard, and pass it to the method they want
// to call. For example:
//
// ```
// {
// DeferPendingSignals dps{*this};
// Guard<SpinLock, IrqSave> guard{&lock_};
// RecalculateActiveInactiveLocked(dps);
// } // pending signals auto-flushed here, after dropping the lock
//
// ```
//
class DeferPendingSignals {
public:
// We cannot create a DeferPendingSignals while holding its PageQueues' lock instance.
[[nodiscard]] explicit DeferPendingSignals(PageQueues& page_queues) TA_EXCL(page_queues.lock_)
: page_queues_(page_queues) {}
// No move, no copy.
DeferPendingSignals(const DeferPendingSignals&) = delete;
DeferPendingSignals(DeferPendingSignals&&) = delete;
DeferPendingSignals& operator=(const DeferPendingSignals&) = delete;
DeferPendingSignals& operator=(DeferPendingSignals&&) = delete;
// When the DeferPendingSignals destructs, fire any pending signals.
~DeferPendingSignals() {
using T = ktl::underlying_type_t<PendingSignal>;
T pending = static_cast<T>(pending_signals_);
if (pending) {
if (pending & static_cast<T>(PendingSignal::AgingToken)) {
page_queues_.aging_token_.Signal();
}
if (pending & static_cast<T>(PendingSignal::AgingEvent)) {
page_queues_.aging_event_.Signal();
}
if (pending & static_cast<T>(PendingSignal::LruEvent)) {
page_queues_.lru_event_.Signal();
}
}
}
// Accumulate `signal` into the set of currently pending signals for this
// DeferPendingSignals instance.
void Pend(PendingSignal signal) {
using T = ktl::underlying_type_t<PendingSignal>;
pending_signals_ =
static_cast<PendingSignal>(static_cast<T>(pending_signals_) | static_cast<T>(signal));
}
private:
// DeferPendingSignals instances may not be heap allocated. We want their scopes to always be
// defined to a function/stack scope, and never extend outside of that.
static void* operator new(size_t) = delete;
static void* operator new[](size_t) = delete;
static void operator delete(void*) = delete;
static void operator delete[](void*) = delete;
PageQueues& page_queues_;
PendingSignal pending_signals_{PendingSignal::None};
};
// A PendingSignalEvent is a thin wrapper around an AutounsignalEvent. It
// restricts the API of the AutounsignalEvent just a bit, allowing
// DeferPendingSignals instances to signal events (outside of the `lock_`, as the
// DeferPendingSignals goes out of scope), but not other code. This should make it more
// difficult for someone to accidentally signal one of the events while
// holding the PageQueues' spinlock.
class PendingSignalEvent : protected AutounsignalEvent {
public:
PendingSignalEvent() = default;
explicit PendingSignalEvent(bool initial_state) : AutounsignalEvent(initial_state) {}
using AutounsignalEvent::Wait;
using AutounsignalEvent::WaitDeadline;
private:
friend class DeferPendingSignals;
};
// A DeferPendingSignals is allowed to access the various Event signals.
friend class DeferPendingSignals;
// Ensure that the reclaim queues are always at the end.
static_assert(PageQueueReclaimLast + 1 == PageQueueNumQueues);
// The page queue index, unlike the full generation count, needs to be able to fit inside a
// uint8_t in the vm_page_t.
static_assert(PageQueueNumQueues < 256);
// Converts free running generation to reclaim queue.
static constexpr PageQueue gen_to_queue(uint64_t gen) {
return static_cast<PageQueue>((gen % kNumReclaim) + PageQueueReclaimBase);
}
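// For example, with kNumReclaim == 8 the mapping wraps around every 8 generations:
//   gen_to_queue(0)               == PageQueueReclaimBase
//   gen_to_queue(kNumReclaim - 1) == PageQueueReclaimLast
//   gen_to_queue(kNumReclaim)     == PageQueueReclaimBase  (wrapped around)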
// Checks if a candidate reclaim page queue would be valid given a specific lru and mru
// queue.
static constexpr bool queue_is_valid(PageQueue page_queue, PageQueue lru, PageQueue mru) {
DEBUG_ASSERT(page_queue >= PageQueueReclaimBase);
if (lru <= mru) {
return page_queue >= lru && page_queue <= mru;
} else {
return page_queue <= mru || page_queue >= lru;
}
}
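// For example, if the generations have wrapped so that the lru maps to a higher queue index than
// the mru (say lru == PageQueueReclaimBase + 6 and mru == PageQueueReclaimBase + 1):
//   queue_is_valid(PageQueueReclaimBase + 7, lru, mru) == true   (between lru and the wrap point)
//   queue_is_valid(PageQueueReclaimBase + 0, lru, mru) == true   (between the wrap point and mru)
//   queue_is_valid(PageQueueReclaimBase + 3, lru, mru) == false  (outside the wrapped [lru, mru])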
// Returns whether this queue is reclaimable, and hence can be active or inactive. If this
// returns false then it is guaranteed that both |queue_is_active| and |queue_is_inactive| would
// return false.
static constexpr bool queue_is_reclaim(PageQueue page_queue) {
// We check against the DontNeed queue and not the base queue so that accessing a page can
// move it from the DontNeed list into the LRU queues. To keep this case efficient we require
// that the DontNeed queue be directly before the LRU queues.
static_assert(PageQueueReclaimDontNeed + 1 == PageQueueReclaimBase);
// Ensure that the Dirty queue comes before the smallest queue that would return true for this
// function. This function is used for computing active/inactive sets for the purpose of
// eviction, and dirty pages cannot be evicted. The Dirty queue also needs to come before the
// DontNeed queue so that MarkAccessed does not try to move the page to the MRU queue on
// access.
static_assert(PageQueuePagerBackedDirty < PageQueueReclaimDontNeed);
return page_queue >= PageQueueReclaimDontNeed;
}
// Calculates the age of a queue against a given mru, with 0 meaning page_queue==mru
// This is only meaningful to call on reclaimable queues.
static constexpr uint queue_age(PageQueue page_queue, PageQueue mru) {
DEBUG_ASSERT(page_queue >= PageQueueReclaimBase);
if (page_queue <= mru) {
return mru - page_queue;
} else {
return (static_cast<uint>(kNumReclaim) - page_queue) + mru;
}
}
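// For example, with kNumReclaim == 8 and mru == PageQueueReclaimBase + 2:
//   queue_age(PageQueueReclaimBase + 2, mru) == 0  (the mru queue itself)
//   queue_age(PageQueueReclaimBase + 1, mru) == 1  (one epoch older)
//   queue_age(PageQueueReclaimBase + 7, mru) == 3  (wraps around: three epochs older than the mru)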
// Returns whether the given page queue would be considered active against a given mru.
// This is valid to call on any page queue, not just reclaimable ones, and as such this returning
// false does not imply the queue is inactive.
static constexpr bool queue_is_active(PageQueue page_queue, PageQueue mru) {
if (page_queue < PageQueueReclaimBase) {
return false;
}
return queue_age(page_queue, mru) < kNumActiveQueues;
}
// Returns whether the given page queue would be considered inactive against a given mru.
// This is valid to call on any page queue, not just reclaimable ones, and as such this returning
// false does not imply the queue is active.
static constexpr bool queue_is_inactive(PageQueue page_queue, PageQueue mru) {
// The DontNeed queue does not have an age, and so we cannot call queue_age on it, but it should
// definitely be considered part of the inactive set.
if (page_queue == PageQueueReclaimDontNeed) {
return true;
}
if (page_queue < PageQueueReclaimBase) {
return false;
}
return queue_age(page_queue, mru) >= kNumActiveQueues;
}
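// For example, with kNumActiveQueues == 2 and mru == PageQueueReclaimBase + 2:
//   queue_is_active(PageQueueReclaimBase + 2, mru)   == true   (age 0)
//   queue_is_active(PageQueueReclaimBase + 1, mru)   == true   (age 1)
//   queue_is_inactive(PageQueueReclaimBase + 0, mru) == true   (age 2)
//   queue_is_active(PageQueueWired, mru)             == false  (not reclaimable, also not inactive)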
PageQueue mru_gen_to_queue() const {
return gen_to_queue(mru_gen_.load(ktl::memory_order_relaxed));
}
PageQueue lru_gen_to_queue() const {
return gen_to_queue(lru_gen_.load(ktl::memory_order_relaxed));
}
// This processes the DontNeed queue and the LRU queue.
// For the DontNeed queue the pages are first placed in a temporary list (if |peek| is false), and
// then processed to ensure termination (see comment on dont_need_processing_list_ for full
// details). If |peek| is true we will either return a page from the DontNeed queue, or it will be
// empty, and so no temporary list is needed. For the LRU queue, the aim is to make the lru_gen_
// be the passed in target_gen. It achieves this by walking all the pages in the queue and either
// 1. Pages that have a newer accessed time and are in the wrong queue are moved into the
// correct queue.
// 2. Pages that are in the correct queue are either returned (if |peek| is true), or
// moved to another queue - pages in the DontNeed queue are moved between
// dont_need_processing_list_ and the regular list, although this is in some ways the same queue
// as the pages do not actually have their queue updated, and pages in the LRU queue have
// their age effectively decreased by being moved to the next queue.
// In the second case for LRU, pages get moved into the next queue so that the LRU queue can
// become empty, allowing the gen to be incremented to eventually reach the |target_gen|. The
// mechanism of freeing up the LRU queue is necessary to make room for new MRU queues. When |peek|
// is false, this always returns a nullopt and guarantees that it moved lru_gen_ to at least
// target_gen. If |peek| is true, then the first time it hits a page in case (2), it returns it
// instead of decreasing its age.
ktl::optional<PageQueues::VmoBacklink> ProcessDontNeedAndLruQueues(uint64_t target_gen,
bool peek);
// Helper used by ProcessDontNeedAndLruQueues. |target_gen| is the minimum value lru_gen_ should
// advance to. If |peek| is true, the first page encountered in the respective queue whose age
// does not need to be fixed up is returned.
// The passed in LruIsolate object is used to process reclamation work outside of the lock and can
// be reused across multiple calls. The |Items| parameter therefore controls how much work will be
// done within the lock acquisition before returning.
template <size_t Items>
class LruIsolate;
template <size_t Items>
ktl::optional<PageQueues::VmoBacklink> ProcessLruQueueHelper(LruIsolate<Items>& deferred_list,
uint64_t target_gen, bool peek)
TA_EXCL(lock_);
// Helper used by ProcessDontNeedAndLruQueues. Processes the provided DontNeed list (which should
// either be |dont_need_processing_list_| or |page_queues_[PageQueueReclaimDontNeed]|) and places
// items in their correct list. If |peek| is true then the first item found that is correctly in
// the DontNeed list is returned.
// As this moves pages to their correct list, it is an error to call this with the real DontNeed
// list and peek being false, as it would never terminate.
ktl::optional<VmoBacklink> ProcessDontNeedList(list_node_t* list, bool peek) TA_EXCL(lock_);
// Helpers for adding and removing to the queues. All of the public Set/Move/Remove operations
// are convenience wrappers around these.
void RemoveLocked(vm_page_t* page, DeferPendingSignals& dps) TA_REQ(lock_);
void SetQueueBacklinkLocked(vm_page_t* page, void* object, uintptr_t page_offset, PageQueue queue,
DeferPendingSignals& dps) TA_REQ(lock_);
void MoveToQueueLocked(vm_page_t* page, PageQueue queue, DeferPendingSignals& dps) TA_REQ(lock_);
// Updates the active/inactive counts assuming a single page has moved from |old_queue| to
// |new_queue|. Either of these can be PageQueueNone to simulate pages being added or removed.
void UpdateActiveInactiveLocked(PageQueue old_queue, PageQueue new_queue,
DeferPendingSignals& dps) TA_REQ(lock_);
// Recalculates |active_queue_count_| and |inactive_queue_count_|. This is pulled into a helper
// method as this needs to be done both when accessed scanning completes and when the mru_gen_ is
// changed.
void RecalculateActiveInactiveLocked(DeferPendingSignals& dps) TA_REQ(lock_);
// Internal locked version of GetActiveInactiveCounts.
ActiveInactiveCounts GetActiveInactiveCountsLocked() const TA_REQ(lock_);
// Internal helper for shutting down any threads created in |StartThreads|.
void StopThreads();
// Entry point for the thread that will perform aging and increment the mru generation.
void MruThread();
void MaybeTriggerAging() TA_EXCL(lock_);
void MaybeTriggerAgingLocked(DeferPendingSignals& dps) TA_REQ(lock_);
AgeReason GetAgeReason() const TA_EXCL(lock_);
AgeReason GetAgeReasonLocked() const TA_REQ(lock_);
void LruThread();
void MaybeTriggerLruProcessing() TA_EXCL(lock_);
bool NeedsLruProcessing() const;
// MarkAccessed is split into a small inlinable portion that attempts to short circuit, and this
// main implementation that does the actual accessed marking if needed.
void MarkAccessedContinued(vm_page_t* page);
// Returns true if a page is both in one of the Reclaim queues and satisfies the passed in
// validator, which takes a fbl::RefPtr<VmCowPages>.
template <typename F>
bool DebugPageIsSpecificReclaim(const vm_page_t* page, F validator, size_t* queue) const;
// Returns true if a page is both in the specified |queue| and satisfies the passed in validator,
// which takes a fbl::RefPtr<VmCowPages>.
template <typename F>
bool DebugPageIsSpecificQueue(const vm_page_t* page, PageQueue queue, F validator) const;
// The lock_ is needed to protect the linked list queues as these cannot be implemented with
// atomics.
DECLARE_SPINLOCK(PageQueues) mutable lock_;
// This Event is a binary semaphore and is used to control aging. It is acquired by the aging
// thread when it performs aging, and can be acquired separately to block aging. For this purpose
// it needs to start out initially signalled.
PendingSignalEvent aging_token_{true};
// Flag used to catch programming errors related to double enabling or disabling aging.
ktl::atomic<bool> aging_disabled_ = false;
// Time at which the mru_gen_ was last incremented.
ktl::atomic<zx_time_t> last_age_time_ = ZX_TIME_INFINITE_PAST;
// Reason the last aging event happened; this is purely for informational/debugging purposes.
AgeReason last_age_reason_ TA_GUARDED(lock_) = AgeReason::None;
// Used to signal the aging thread that it should wake up and see if it needs to do anything.
PendingSignalEvent aging_event_;
// Used to signal the lru thread that it should wake up and check if the lru queue needs
// processing.
PendingSignalEvent lru_event_;
// What to do with pages when processing the LRU queue.
LruAction lru_action_ TA_GUARDED(lock_) = LruAction::None;
// The page queues are placed into an array, indexed by page queue, for consistency and uniformity
// of access. This does mean that the list for PageQueueNone does not actually have any pages in
// it, and should always be empty.
// The reclaimable queues are the most complicated as, unlike the other categories, pages can be
// in any one of the queues, and can move around. The reclaimable queues themselves store pages that
// are roughly grouped by their last access time. The relationship is not precise as pages are not
// moved between queues unless it becomes strictly necessary. This is in contrast to the queue
// counts that are always up to date.
//
// What this means is that the vm_page::page_queue index is always up to date, and the
// page_queue_counts_ represent an accurate count of pages with that vm_page::page_queue index,
// but counting the pages actually in the linked list may not yield the correct number.
//
// New reclaimable pages are always placed into the queue associated with the MRU generation. If
// they get accessed the vm_page_t::page_queue gets updated along with the counts. At some point
// the LRU queue will get processed (see |ProcessDontNeedAndLruQueues|) and this will cause pages
// to get relocated to their correct list.
//
// Consider the following example:
//
// LRU MRU LRU MRU LRU MRU LRU MRU MRU LRU
// | | | | | | | | | |
// | | Insert A | | Age | | Touch A | | Age | |
// V v Queue=2 v v Queue=2 v v Queue=3 v v Queue=3 v v
// [][ ][ ][] -------> [][ ][a][] -------> [][ ][a][ ] -------> [][ ][a][ ] -------> [ ][ ][a][]
//
// At this point page A, in its vm_page_t, has its queue marked as 3, and the page_queue_counts
// are {0,0,1,0}, but the page itself remains in the linked list for queue 2. If the LRU queue is
// then processed to increment it, we would see the following:
//
// MRU LRU MRU LRU MRU LRU
// | | | | | |
// | | Move LRU | | Move LRU | |
// V v Queue=3 v v Queue=3 v v
// [ ][ ][a][] -------> [ ][][a][] -------> [][ ][][a]
//
// In the second processing of the LRU queue it gets noticed that the page, based on
// vm_page_t::page_queue, is in the wrong queue and gets moved into the correct one.
//
// For specifics on how LRU and MRU generations map to LRU and MRU queues, see comments on
// |lru_gen_| and |mru_gen_|.
ktl::array<list_node_t, PageQueueNumQueues> page_queues_ TA_GUARDED(lock_);
// The generation counts are monotonic increasing counters and used to represent the effective age
// of the oldest and newest reclaimable queues. The page queues themselves are treated as a fixed
// size circular buffer that the generations map onto (see definition of |gen_to_queue|). This
// means all pages in the system have an age somewhere in [lru_gen_, mru_gen_] and so the lru and
// mru generations cannot drift apart by more than kNumReclaim, otherwise there would not be
// enough queues.
// A page's age being between [lru_gen_, mru_gen_] is not an invariant as MarkAccessed can race
// and mark pages as being in an invalid queue. This race will get noticed when the LRU queue is
// processed (see |ProcessDontNeedAndLruQueues|) and the page will get updated at that point to
// have a valid queue. Importantly, whilst pages can
// think they are in a queue that is invalid, only valid linked lists in the page_queues_ will
// ever have pages in them. This invariant is easy to enforce as the page_queues_ are updated
// under a lock.
ktl::atomic<uint64_t> lru_gen_ = 0;
ktl::atomic<uint64_t> mru_gen_ = kNumReclaim - 1;
// This semaphore counts the amount of space remaining for the mru to grow before it would overlap
// with the lru. Having this as a semaphore (even though it can always be calculated from lru_gen_
// and mru_gen_ above) provides a way for the aging thread to block when it needs to wait for
// eviction/lru processing to happen. This allows eviction/lru processing to be happening
// concurrently in a different thread, without requiring it to happen in-line in the aging thread.
// Without this the aging thread would need to process the LRU queue directly if it needed to make
// space. Initially, with the lru_gen_ and mru_gen_ definitions above, we start with no space for
// the mru to grow, so initialize this to 0.
Semaphore mru_semaphore_ = Semaphore(0);
// Tracks the counts of pages in each queue in O(1) time complexity. As pages are moved between
// queues, the corresponding source and destination counts are decremented and incremented,
// respectively.
//
// The first entry of the array is left special: it logically represents pages not in any queue.
// For simplicity, it is initialized to zero rather than the total number of pages in the system.
// Consequently, the value of this entry is a negative number with absolute value equal to the
// total number of pages in all queues. This approach avoids unnecessary branches when updating
// counts.
ktl::array<ktl::atomic<size_t>, PageQueueNumQueues> page_queue_counts_ = {};
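// As an illustrative example of this bookkeeping: if the queues start empty and two pages are
// then inserted into some queue with none removed, the entries for the real queues sum to 2 while
// page_queue_counts_[PageQueueNone] holds static_cast<size_t>(-2).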
// These are the continuously updated active/inactive queue counts. Continuous here means updated
// by all page queue methods except for MarkAccessedDeferredCount. Due to races whilst accessed
// harvesting is happening, these could be inaccurate or even become negative, and should not be
// read from whilst use_cached_queue_counts_ is true, and need to be completely recalculated
// prior to setting |use_cached_queue_counts_| back to false.
int64_t active_queue_count_ TA_GUARDED(lock_) = 0;
int64_t inactive_queue_count_ TA_GUARDED(lock_) = 0;
// When accessed harvesting is happening these hold the last known 'good' values of the
// active/inactive queue counts.
uint64_t cached_active_queue_count_ TA_GUARDED(lock_) = 0;
uint64_t cached_inactive_queue_count_ TA_GUARDED(lock_) = 0;
// Indicates whether the cached counts should be returned in queries or not. This also indicates
// whether the page queues expect accessed harvesting to be happening. This is only an atomic
// so that MarkAccessedDeferredCount can reference it in a DEBUG_ASSERT without triggering
// memory safety issues.
ktl::atomic<bool> use_cached_queue_counts_ = false;
// Track the mru and lru threads and have a signalling mechanism to shut them down.
ktl::atomic<bool> shutdown_threads_ = false;
Thread* mru_thread_ TA_GUARDED(lock_) = nullptr;
Thread* lru_thread_ TA_GUARDED(lock_) = nullptr;
// Debug compressor is only available when debug asserts are also enabled. This ensures it can
// never have an impact on production builds.
#if DEBUG_ASSERT_IMPLEMENTED
ktl::unique_ptr<VmDebugCompressor> debug_compressor_ TA_GUARDED(lock_);
#endif
// Queue rotation parameters. These are not locked as they are only read by the mru thread, and
// are set before the mru thread is started.
zx_duration_t min_mru_rotate_time_;
zx_duration_t max_mru_rotate_time_;
// Determines if anonymous zero page forks are placed in the zero fork queue or in the reclaimable
// queue.
bool zero_fork_is_reclaimable_ TA_GUARDED(lock_) = false;
// Determines if anonymous pages are placed in the reclaimable queues, or in their own non aging
// anonymous queues.
RelaxedAtomic<bool> anonymous_is_reclaimable_ = false;
// Current active ratio multiplier.
uint64_t active_ratio_multiplier_ TA_GUARDED(lock_);
// This list is a temporary list used when updating the ages of pages in the DontNeed queue.
// Logically it can be considered the same queue as PageQueueReclaimDontNeed, just a
// different physical list, and hence whether a page is here or in the regular
// page_queues_[PageQueueReclaimDontNeed] list, the page_queue_counts_[PageQueueReclaimDontNeed]
// count is the same.
// The way this list is used is when processing the LRU queue, all the pages in the regular
// DontNeed queue are moved here, and then this list is processed until it is empty by moving them
// back to the DontNeed list. By doing this it can be guaranteed that every original page is
// checked, without new DontNeed pages being able to cause the process to not terminate.
// Due to the way list_node_t's work, whether a page is in this specific list, or the page_queues_
// list, they can still be removed with list_delete, which is why it's safe to move the pages
// here.
// Note that we cannot guard this list by the dont_need_processing_lock_ since peek actions, that
// do not take that lock, also need to be able to remove items from this list.
list_node_t dont_need_processing_list_ TA_GUARDED(lock_) = LIST_INITIAL_CLEARED_VALUE;
// The way processing works, by moving all the DontNeed pages to the dont_need_processing_list_
// and then iterating over them, means that it only makes sense for one of these operations to be
// in progress at a time. Since only one LRU rotation will realistically happen at a time, this
// lock in practice should never actually block anything.
DECLARE_MUTEX(PageQueues) dont_need_processing_lock_;
};
#endif // ZIRCON_KERNEL_VM_INCLUDE_VM_PAGE_QUEUES_H_