blob: 0a604e2431a3d923052da0b5eb52e7cb6b38cf19 [file] [log] [blame]
// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <assert.h>
#include <lib/user_copy/user_ptr.h>
#include <lib/zircon-internal/thread_annotations.h>
#include <stdint.h>
#include <zircon/listnode.h>
#include <zircon/types.h>
#include <fbl/array.h>
#include <fbl/canary.h>
#include <fbl/enum_bits.h>
#include <fbl/intrusive_double_list.h>
#include <fbl/macros.h>
#include <fbl/ref_counted.h>
#include <fbl/ref_ptr.h>
#include <kernel/mutex.h>
#include <vm/page_source.h>
#include <vm/physical_page_borrowing_config.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
#include <vm/vm_object.h>
#include <vm/vm_page_list.h>
// Forward declare these so VmCowPages helpers can accept references.
class BatchPQRemove;
class VmObjectPaged;
namespace internal {
struct DiscardableListTag {};
} // namespace internal
// Option bits that configure a VmCowPages node. Each named flag occupies its own bit so that
// flags can be combined and tested as a mask (see the `options_ & ...` tests in VmCowPages).
enum class VmCowPagesOptions : uint32_t {
// Externally-usable flags:
kNone = 0u,
// With this clear, zeroing a page tries to decommit the page. With this set, zeroing never
// decommits the page. Currently this is only set for contiguous VMOs.
// TODO(dustingreen): Once we're happy with the reliability of page borrowing, we should be able
// to relax this restriction. We may still need to flush zeroes to RAM during reclaim to mitigate
// a hypothetical client incorrectly assuming that cache-clean status will remain intact while
// pages aren't pinned, but that mitigation should be sufficient (even assuming such a client) to
// allow implicit decommit when zeroing or when zero scanning, as long as no clients are doing DMA
// to/from contiguous while not pinned.
kCannotDecommitZeroPages = (1u << 0),
// Internal-only flags:
// Tested by VmCowPages::is_hidden_locked().
kHidden = (1u << 1),
// Tested by VmCowPages::is_slice_locked().
kSlice = (1u << 2),
// NOTE(review): no use of this flag is visible in this part of the file; presumably it requests
// that pages be unpinned when the node is destroyed - confirm against the implementation.
kUnpinOnDelete = (1u << 3),
// NOTE(review): kUnpinOnDelete is listed under "Internal-only flags" above but is not included
// in this mask - confirm that the omission is intentional.
kInternalOnlyMask = kHidden | kSlice,
// Implements a copy-on-write hierarchy of pages in a VmPageList.
class VmCowPages final
: public VmHierarchyBase,
public fbl::ContainableBaseClasses<
// Guarded by lock_.
fbl::TaggedDoublyLinkedListable<VmCowPages*, internal::ChildListTag>,
// Guarded by DiscardableVmosLock::Get().
fbl::TaggedDoublyLinkedListable<VmCowPages*, internal::DiscardableListTag>>,
public fbl::Recyclable<VmCowPages> {
// Factory for a VmCowPages node. The constructor is private, so nodes are obtained through the
// Create...() functions. Reports success or failure through the returned zx_status_t and takes an
// out-pointer |cow_pages| for the resulting node.
// NOTE(review): presumably |cow_pages| is only written on ZX_OK and |size| is a page-aligned byte
// count - confirm against the implementation.
static zx_status_t Create(fbl::RefPtr<VmHierarchyState> root_lock, VmCowPagesOptions options,
uint32_t pmm_alloc_flags, uint64_t size,
fbl::RefPtr<VmCowPages>* cow_pages);
// Factory like Create(), but takes a PageSource |src| instead of pmm allocation flags.
// NOTE(review): presumably |src| becomes the page source of the new node - confirm against the
// implementation.
static zx_status_t CreateExternal(fbl::RefPtr<PageSource> src, VmCowPagesOptions options,
fbl::RefPtr<VmHierarchyState> root_lock, uint64_t size,
fbl::RefPtr<VmCowPages>* cow_pages);
// Creates a copy-on-write clone with the desired parameters. This can fail due to various
// internal states not being correct.
zx_status_t CreateCloneLocked(CloneType type, uint64_t offset, uint64_t size,
fbl::RefPtr<VmCowPages>* child_cow) TA_REQ(lock_);
// Creates a child that looks back to this VmCowPages for all operations. Once a child slice is
// created this node should not ever be Resized.
zx_status_t CreateChildSliceLocked(uint64_t offset, uint64_t size,
fbl::RefPtr<VmCowPages>* cow_slice) TA_REQ(lock_);
// Returns the size in bytes of this cow pages range. This will always be a multiple of the page
// size.
uint64_t size_locked() const TA_REQ(lock_) { return size_; }
// Returns whether this cow pages node is ultimately backed by a user pager to fulfill initial
// content, and not zero pages. Contiguous VMOs have page_source_ set, but are not pager backed
// in this sense.
// This should only be used to report to user mode whether a VMO is user-pager backed, not for any
// other purpose.
bool is_root_source_user_pager_backed_locked() const TA_REQ(lock_) {
auto root = GetRootLocked();
// The root will never be null. It will either point to a valid parent, or |this| if there's no
// parent.
return root->page_source_ && root->page_source_->properties().is_user_pager;
// Returns true when this node has a page source whose properties report is_user_pager. Unlike
// is_root_source_user_pager_backed_locked() this inspects this node's own page_source_ rather
// than the root's. Other predicates in this class use it as the reference value in their
// DEBUG_ASSERT cross-checks.
bool debug_is_user_pager_backed_locked() const TA_REQ(lock_) {
return page_source_ && page_source_->properties().is_user_pager;
// Returns true when this node has a page source whose properties report
// is_providing_specific_physical_pages. Other predicates in this class use it as the reference
// value in their DEBUG_ASSERT cross-checks.
bool debug_is_contiguous() const TA_REQ(lock_) {
return page_source_ && page_source_->properties().is_providing_specific_physical_pages;
// Returns true when the root of this node's hierarchy has a page source that preserves page
// content.
bool is_private_pager_copy_supported() const TA_REQ(lock_) {
auto root = GetRootLocked();
// The root will never be null. It will either point to a valid parent, or |this| if there's no
// parent.
bool result = root->page_source_ && root->page_source_->properties().is_preserving_page_content;
// Cross-check: the answer is expected to match whether the root is user-pager backed.
DEBUG_ASSERT(result == is_root_source_user_pager_backed_locked());
return result;
// Returns whether a copy-on-write clone may be made of this node. The answer is false when the
// root's source can evict, when this node's source supplies specific physical pages, or when this
// node is a slice; otherwise it is true.
bool is_cow_clonable_locked() const TA_REQ(lock_) {
// Copy-on-write clones of pager vmos or their descendants aren't supported as we can't
// efficiently make an immutable snapshot.
if (can_root_source_evict_locked()) {
return false;
// We also don't support COW clones for contiguous VMOs.
if (is_source_supplying_specific_physical_pages_locked()) {
return false;
// Copy-on-write clones of slices aren't supported at the moment due to the resulting VMO chains
// having non hidden VMOs between hidden VMOs. This case cannot be handled by CloneCowPageLocked
// at the moment and so we forbid the construction of such cases for the moment.
// Bug: 36841
if (is_slice_locked()) {
return false;
return true;
// Returns true when this node has a page source that preserves page content.
bool can_evict_locked() const TA_REQ(lock_) {
bool result = page_source_ && page_source_->properties().is_preserving_page_content;
// Cross-check: the answer is expected to match whether this node is user-pager backed.
DEBUG_ASSERT(result == debug_is_user_pager_backed_locked());
return result;
// Returns the result of can_evict_locked() evaluated on the root of this node's hierarchy.
bool can_root_source_evict_locked() const TA_REQ(lock_) {
auto root = GetRootLocked();
// The root will never be null. It will either point to a valid parent, or |this| if there's no
// parent.
bool result = root->can_evict_locked();
// Cross-check: the answer is expected to match whether the root is user-pager backed.
DEBUG_ASSERT(result == is_root_source_user_pager_backed_locked());
return result;
// Returns whether this node has pager backlinks. As written this is exactly can_evict_locked().
bool has_pager_backlinks_locked() const TA_REQ(lock_) {
bool result = can_evict_locked();
// Cross-check: the answer is expected to match whether this node is user-pager backed.
DEBUG_ASSERT(result == debug_is_user_pager_backed_locked());
return result;
// Returns whether this cow pages node is dirty tracked.
bool is_dirty_tracked_locked() const TA_REQ(lock_) {
// Pager-backed VMOs require dirty tracking either if:
// 1. They are directly backed by the pager, i.e. the root VMO.
// OR
// 2. They are slice children of root pager-backed VMOs, since slices directly reference the
// parent's pages.
// For a slice the decision is made on the parent's page source, otherwise on this node's own.
// NOTE(review): which_cow is dereferenced unconditionally below, so this assumes a slice always
// has a non-null parent_ - confirm.
auto* which_cow = is_slice_locked() ? parent_.get() : this;
bool result =
which_cow->page_source_ && which_cow->page_source_->properties().is_preserving_page_content;
// Cross-check: the answer is expected to match whether the inspected node is user-pager backed.
DEBUG_ASSERT(result == which_cow->debug_is_user_pager_backed_locked());
return result;
// Returns true when this node has a page source whose properties report
// is_preserving_page_content.
bool is_source_preserving_page_content_locked() const TA_REQ(lock_) {
bool result = page_source_ && page_source_->properties().is_preserving_page_content;
// Cross-check: the answer is expected to match whether this node is user-pager backed.
DEBUG_ASSERT(result == debug_is_user_pager_backed_locked());
return result;
// Returns true when this node has a page source whose properties report
// is_providing_specific_physical_pages.
bool is_source_supplying_specific_physical_pages_locked() const TA_REQ(lock_) {
bool result = page_source_ && page_source_->properties().is_providing_specific_physical_pages;
// Cross-check against the debug-only predicate that evaluates the same property.
DEBUG_ASSERT(result == debug_is_contiguous());
return result;
// When attributing pages hidden nodes must be attributed to either their left or right
// descendants. The attribution IDs of all involved determine where attribution goes. For
// historical and practical reasons actual user ids are used, although any consistent naming
// scheme will have the same effect.
void set_page_attribution_user_id_locked(uint64_t id) TA_REQ(lock_) {
page_attribution_user_id_ = id;
// See description on |pinned_page_count_| for meaning.
uint64_t pinned_page_count_locked() const TA_REQ(lock_) { return pinned_page_count_; }
// Sets the VmObjectPaged backlink for this copy-on-write node. This object has no tracking of
// mappings, but understands that they exist. When it manipulates pages in a way that could affect
// mappings it uses the backlink to notify the VmObjectPaged.
// Currently it is assumed that all nodes always have backlinks with the 1:1 hierarchy mapping.
void set_paged_backlink_locked(VmObjectPaged* ref) TA_REQ(lock_) { paged_ref_ = ref; }
// Returns page_list_.HeapAllocationBytes().
// NOTE(review): presumably this is the heap memory used by the page list's own bookkeeping rather
// than by the pages it tracks - confirm against VmPageList.
uint64_t HeapAllocationBytesLocked() const TA_REQ(lock_) {
return page_list_.HeapAllocationBytes();
// Returns the current value of eviction_event_count_.
uint64_t EvictionEventCountLocked() const TA_REQ(lock_) { return eviction_event_count_; }
// Detaches this node from its page source. According to the comment on
// UnmapAndRemovePagesLocked(), that helper is called from here to remove the committed pages.
void DetachSourceLocked() TA_REQ(lock_);
// Resizes the range of this cow pages. |size| must be a multiple of the page size and this must
// not be called on slices or nodes with slice children.
zx_status_t ResizeLocked(uint64_t size) TA_REQ(lock_);
// See VmObject::Lookup
zx_status_t LookupLocked(uint64_t offset, uint64_t len, VmObject::LookupFunction lookup_fn)
// Similar to LookupLocked, but enumerate all readable pages in the hierarchy within the requested
// range. The offset passed to the |lookup_fn| is the offset this page is visible at in this
// object, even if the page itself is committed in a parent object. The physical addresses given
// to the lookup_fn should not be retained in any way unless the range has also been pinned by the
// caller.
// Ranges of length zero are considered invalid and will return ZX_ERR_INVALID_ARGS. The lookup_fn
// can terminate iteration early by returning ZX_ERR_STOP.
using LookupReadableFunction =
fit::inline_function<zx_status_t(uint64_t offset, paddr_t pa), 4 * sizeof(void*)>;
zx_status_t LookupReadableLocked(uint64_t offset, uint64_t len, LookupReadableFunction lookup_fn)
// See VmObject::TakePages
zx_status_t TakePagesLocked(uint64_t offset, uint64_t len, VmPageSpliceList* pages) TA_REQ(lock_);
// See VmObject::SupplyPages
// The new_zeroed_pages parameter should be true if the pages are new pages that need to be
// initialized, or false if the pages are from a different VmCowPages and are being moved to this
// VmCowPages.
zx_status_t SupplyPagesLocked(uint64_t offset, uint64_t len, VmPageSpliceList* pages,
bool new_zeroed_pages) TA_REQ(lock_);
// The new_zeroed_pages parameter should be true if the pages are new pages that need to be
// initialized, or false if the pages are from a different VmCowPages and are being moved to this
// VmCowPages.
zx_status_t SupplyPages(uint64_t offset, uint64_t len, VmPageSpliceList* pages,
bool new_zeroed_pages) TA_EXCL(lock_);
// See VmObject::FailPageRequests
zx_status_t FailPageRequestsLocked(uint64_t offset, uint64_t len, zx_status_t error_status)
// Used to track dirty_state in the vm_page_t.
// The transitions between the three states can roughly be summarized as follows:
// 1. A page starts off as Clean when supplied.
// 2. A write transitions the page from Clean to Dirty.
// 3. A writeback_begin moves the Dirty page to AwaitingClean.
// 4. A writeback_end moves the AwaitingClean page to Clean.
// 5. A write that comes in while the writeback is in progress (i.e. the page is AwaitingClean)
// moves the AwaitingClean page back to Dirty.
enum class DirtyState : uint8_t {
// The page does not track dirty state. Used for non pager backed pages.
Untracked = 0,
// The page is clean, i.e. its contents have not been altered from when the page was supplied.
// The page's contents have been modified from the time of supply, and should be written back to
// the page source at some point.
// The page still has modified contents, but the page source is in the process of writing back
// the changes. This is used to ensure that a consistent version is written back, and that any
// new modifications that happen during the writeback are not lost. The page source will mark
// pages AwaitingClean before starting any writeback.
// Make sure that the state can be encoded in the vm_page_t's dirty_state field.
static_assert(static_cast<uint8_t>(DirtyState::NumStates) <= VM_PAGE_OBJECT_MAX_DIRTY_STATES);
// Returns true if |page|'s object.dirty_state, read as a DirtyState, is anything other than
// Untracked. |page| is dereferenced unconditionally and so must not be null.
static bool is_page_dirty_tracked(const vm_page_t* page) {
return DirtyState(page->object.dirty_state) != DirtyState::Untracked;
// Returns true if |page|'s object.dirty_state, read as a DirtyState, is Dirty. |page| is
// dereferenced unconditionally and so must not be null.
static bool is_page_dirty(const vm_page_t* page) {
return DirtyState(page->object.dirty_state) == DirtyState::Dirty;
// Returns true if |page|'s object.dirty_state, read as a DirtyState, is Clean. |page| is
// dereferenced unconditionally and so must not be null.
static bool is_page_clean(const vm_page_t* page) {
return DirtyState(page->object.dirty_state) == DirtyState::Clean;
// Returns true if |page|'s object.dirty_state, read as a DirtyState, is AwaitingClean. |page| is
// dereferenced unconditionally and so must not be null.
static bool is_page_awaiting_clean(const vm_page_t* page) {
return DirtyState(page->object.dirty_state) == DirtyState::AwaitingClean;
// See VmObject::DirtyPages
zx_status_t DirtyPagesLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
using DirtyRangeEnumerateFunction = VmObject::DirtyRangeEnumerateFunction;
// See VmObject::EnumerateDirtyRanges
zx_status_t EnumerateDirtyRangesLocked(uint64_t offset, uint64_t len,
DirtyRangeEnumerateFunction&& dirty_range_fn)
// See VmObject::WritebackBegin
zx_status_t WritebackBeginLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
// See VmObject::WritebackEnd
zx_status_t WritebackEndLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
using LookupInfo = VmObject::LookupInfo;
using DirtyTrackingAction = VmObject::DirtyTrackingAction;
// See VmObject::GetPage
// The pages returned from this are assumed to be used in the following ways.
// * Our VmObjectPaged backlink, or any of children's backlinks, are allowed to have readable
// mappings, and will be informed to unmap via the backlinks when needed.
// * Our VmObjectPaged backlink and our *slice* children are allowed to have writable mappings,
// and will be informed to either unmap or remove writability when needed.
zx_status_t LookupPagesLocked(uint64_t offset, uint pf_flags, DirtyTrackingAction mark_dirty,
uint64_t max_out_pages, list_node* alloc_list,
LazyPageRequest* page_request, LookupInfo* out) TA_REQ(lock_);
// Controls the type of content that can be overwritten by the Add[New]Page[s]Locked functions.
enum class CanOverwriteContent : uint8_t {
// Do not overwrite any kind of content, i.e. only add a page at the slot if there is true
// absence of content.
// Only overwrite slots that represent zeros. In the case of anonymous VMOs, both gaps and zero
// page markers represent zeros, as the entire VMO is implicitly zero on creation. For pager
// backed VMOs, zero page markers and gaps after supply_zero_offset_ represent zeros.
// Overwrite any slots, regardless of the type of content.
// Adds an allocated page to this cow pages at the specified offset, can be optionally zeroed and
// any mappings invalidated. If an error is returned the caller retains ownership of |page|.
// Offset must be page aligned.
// |overwrite| controls how the function handles pre-existing content at |offset|. If |overwrite|
// does not permit replacing the content, ZX_ERR_ALREADY_EXISTS will be returned. If a page is
// released from the page list as a result of overwriting, it is returned through |released_page|
// and the caller takes ownership of this page. If the |overwrite| action is such that a page
// cannot be released, it is valid for the caller to pass in nullptr for |released_page|.
zx_status_t AddNewPageLocked(uint64_t offset, vm_page_t* page, CanOverwriteContent overwrite,
ktl::optional<vm_page_t*>* released_page, bool zero = true,
bool do_range_update = true) TA_REQ(lock_);
// Adds a set of pages consecutively starting from the given offset. Regardless of the return
// result ownership of the pages is taken. Pages are assumed to be in the ALLOC state and can be
// optionally zeroed before inserting. start_offset must be page aligned.
// |overwrite| controls how the function handles pre-existing content in the range. If |overwrite|
// does not permit replacing the content, ZX_ERR_ALREADY_EXISTS will be returned. Pages released
// from the page list as a result of overwriting are returned through |released_pages| and the
// caller takes ownership of these pages. If the |overwrite| action is such that pages cannot be
// released, it is valid for the caller to pass in nullptr for |released_pages|.
zx_status_t AddNewPagesLocked(uint64_t start_offset, list_node_t* pages,
CanOverwriteContent overwrite, list_node_t* released_pages,
bool zero = true, bool do_range_update = true) TA_REQ(lock_);
// Attempts to release pages in the pages list causing the range to become copy-on-write again.
// For consistency if there is a parent or a backing page source, such that the range would not
// explicitly copy-on-write the zero page then this will fail. Use ZeroPagesLocked for an
// operation that is guaranteed to succeed, but may not release memory.
zx_status_t DecommitRangeLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
// After successful completion the range of pages will all read as zeros. The mechanism used to
// achieve this is not guaranteed to decommit, but it will try to.
// |page_start_base| and |page_end_base| must be page aligned offsets within the range of the
// object. |zeroed_len_out| will contain the length (in bytes) starting at |page_start_base| that
// was successfully zeroed.
// Returns one of the following:
// ZX_OK => The whole range was successfully zeroed.
// ZX_ERR_SHOULD_WAIT => The caller needs to wait on the |page_request| and then retry the
// operation. |zeroed_len_out| will contain the range that was partially zeroed, so the caller
// can advance the start offset before retrying.
// Any other error code indicates a failure to zero a part of the range or the whole range.
zx_status_t ZeroPagesLocked(uint64_t page_start_base, uint64_t page_end_base,
LazyPageRequest* page_request, uint64_t* zeroed_len_out)
// Attempts to commit a range of pages. This has three kinds of return status
// ZX_OK => The whole range was successfully committed and |len| will be written to
// |committed_len|
// ZX_ERR_SHOULD_WAIT => A partial (potentially 0) range was committed (output in
// |committed_len|) and the passed in |page_request| should be waited on before retrying
// the commit operation. The portion that was successfully committed does
// not need to be retried.
// * => Any other error, the number of pages committed is undefined.
// The |offset| and |len| are assumed to be page aligned and within the range of |size_|.
zx_status_t CommitRangeLocked(uint64_t offset, uint64_t len, uint64_t* committed_len,
LazyPageRequest* page_request) TA_REQ(lock_);
// Increases the pin count of the range of pages given by |offset| and |len|. The full range must
// already be committed and this either pins all pages in the range, or pins no pages and returns
// an error. The caller can assume that on success len / PAGE_SIZE pages were pinned.
// The |offset| and |len| are assumed to be page aligned and within the range of |size_|.
// This method also replaces any loaned pages with non-loaned pages.
zx_status_t PinRangeLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
// See VmObject::Unpin
void UnpinLocked(uint64_t offset, uint64_t len, bool allow_gaps) TA_REQ(lock_);
// Returns true if a page is not currently committed, and if the offset were to be read from, it
// would be read as zero. Requested offset must be page aligned and within range.
bool PageWouldReadZeroLocked(uint64_t page_offset) TA_REQ(lock_);
// Returns whether this node is currently suitable for having a copy-on-write child made of it.
bool IsCowClonableLocked() const TA_REQ(lock_);
// see VmObjectPaged::AttributedPagesInRange
size_t AttributedPagesInRangeLocked(uint64_t offset, uint64_t len) const TA_REQ(lock_);
// Scans this cow pages range for zero pages and frees them if |reclaim| is set to true. Returns
// the number of pages freed or scanned.
uint32_t ScanForZeroPagesLocked(bool reclaim) TA_REQ(lock_);
enum class EvictionHintAction : uint8_t {
// Asks the VMO to attempt to evict the specified page. This returns true if the page was
// actually from this VMO and was successfully evicted, at which point the caller now has
// ownership of the page. Otherwise eviction is allowed to fail for any reason, specifically
// if the page is considered in use, or the VMO has no way to recreate the page then eviction
// will fail. Although eviction may fail for any reason, if it does the caller is able to assume
// that either the page was not from this vmo, or that the page is not in any evictable page queue
// (such as the pager_backed_ queue).
// |hint_action| indicates whether the |always_need| eviction hint should be respected or ignored.
// If this page is not evicted as a result of the hint, the caller can assume that the page has
// been moved out from the evictable page queue(s) into the active queue(s).
bool RemovePageForEviction(vm_page_t* page, uint64_t offset, EvictionHintAction hint_action);
// Swap an old page for a new page. The old page must be at offset. The new page must be in
// ALLOC state. On return, the old_page is owned by the caller. Typically the caller will
// remove the old_page from pmm_page_queues() and free the old_page.
void SwapPageLocked(uint64_t offset, vm_page_t* old_page, vm_page_t* new_page) TA_REQ(lock_);
// If page is still at offset, replace it with a different page. If with_loaned is true, replace
// with a loaned page. If with_loaned is false, replace with a non-loaned page.
zx_status_t ReplacePage(vm_page_t* before_page, uint64_t offset, bool with_loaned) TA_EXCL(lock_);
zx_status_t ReplacePageLocked(vm_page_t* before_page, uint64_t offset, bool with_loaned,
vm_page_t** after_page) TA_REQ(lock_);
// Attempts to dedup the given page at the specified offset with the zero page. The only
// correctness requirement for this is that `page` must be *some* valid vm_page_t, meaning that
// all race conditions are handled internally. This function returns false if
// * page is either not from this VMO, or not found at the specified offset
// * page is pinned
// * vmo is uncached
// * page is not all zeroes
// Otherwise 'true' is returned and the page will have been returned to the pmm with a zero page
// marker put in its place.
bool DedupZeroPage(vm_page_t* page, uint64_t offset);
void DumpLocked(uint depth, bool verbose) const TA_REQ(lock_);
bool DebugValidatePageSplitsLocked() const TA_REQ(lock_);
bool DebugValidateBacklinksLocked() const TA_REQ(lock_);
// Calls DebugValidatePageSplitsLocked on this and every parent in the chain, returning true if
// all return true. Also calls DebugValidateBacklinksLocked() on every node in the hierarchy.
bool DebugValidatePageSplitsHierarchyLocked() const TA_REQ(lock_);
bool DebugValidateVmoPageBorrowingLocked() const TA_REQ(lock_);
// Different operations that RangeChangeUpdate* can perform against any VmMappings that are found.
enum class RangeChangeOp {
// Apply the specified operation to all mappings in the given range. This is applied to all
// descendants within the range.
void RangeChangeUpdateLocked(uint64_t offset, uint64_t len, RangeChangeOp op) TA_REQ(lock_);
// Promote pages in the specified range for reclamation under memory pressure. |offset| will be
// rounded down to the page boundary, and |len| will be rounded up to the page boundary.
// Currently used only for pager-backed VMOs to move their pages to the end of the
// pager-backed queue, so that they can be evicted first.
void PromoteRangeForReclamationLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
// Protect pages in the specified range from reclamation under memory pressure. |offset| will be
// rounded down to the page boundary, and |len| will be rounded up to the page boundary. Used to
// set the |always_need| hint for pages in pager-backed VMOs. Any absent pages in the range will
// be committed first, and the call will block on the fulfillment of the page request(s), dropping
// |guard| while waiting (multiple times if multiple pages need to be supplied).
void ProtectRangeFromReclamationLocked(uint64_t offset, uint64_t len, Guard<Mutex>* guard)
void MarkAsLatencySensitiveLocked() TA_REQ(lock_);
zx_status_t LockRangeLocked(uint64_t offset, uint64_t len, zx_vmo_lock_state_t* lock_state_out);
zx_status_t TryLockRangeLocked(uint64_t offset, uint64_t len);
zx_status_t UnlockRangeLocked(uint64_t offset, uint64_t len);
// Exposed for testing.
uint64_t DebugGetLockCount() const {
// Unlike the *Locked accessors, this acquires lock_ itself for the duration of the read, so the
// caller is not expected to already hold it.
Guard<Mutex> guard{&lock_};
return lock_count_;
uint64_t DebugGetPageCountLocked() const TA_REQ(lock_);
bool DebugIsReclaimable() const;
bool DebugIsUnreclaimable() const;
bool DebugIsDiscarded() const;
bool DebugIsPage(uint64_t offset) const;
bool DebugIsMarker(uint64_t offset) const;
bool DebugIsEmpty(uint64_t offset) const;
vm_page_t* DebugGetPage(uint64_t offset) const TA_EXCL(lock_);
vm_page_t* DebugGetPageLocked(uint64_t offset) const TA_REQ(lock_);
uint64_t DebugGetSupplyZeroOffset() const TA_EXCL(lock_);
// Discard all the pages from a discardable vmo in the |kReclaimable| state. For this call to
// succeed, the vmo should have been in the reclaimable state for at least
// |min_duration_since_reclaimable|. If successful, the |discardable_state_| is set to
// |kDiscarded|, and the vmo is moved from the reclaim candidates list. The pages are removed /
// discarded from the vmo and appended to the |freed_list| passed in; the caller takes ownership
// of the removed pages and is responsible for freeing them. Returns the number of pages
// discarded.
uint64_t DiscardPages(zx_duration_t min_duration_since_reclaimable, list_node_t* freed_list)
TA_EXCL(DiscardableVmosLock::Get()) TA_EXCL(lock_);
// Page totals returned by DebugDiscardablePageCounts().
struct DiscardablePageCounts {
// Total number of locked pages across the discardable vmos that were counted.
uint64_t locked;
// Total number of unlocked pages across the discardable vmos that were counted.
uint64_t unlocked;
// Returns the total number of pages locked and unlocked across all discardable vmos.
// Note that this might not be exact and we might miss some vmos, because the
// |DiscardableVmosLock| is dropped after processing each vmo on the global discardable lists.
// That is fine since these numbers are only used for accounting.
static DiscardablePageCounts DebugDiscardablePageCounts() TA_EXCL(DiscardableVmosLock::Get());
// Walks through the LRU reclaimable list of discardable vmos and discards pages from each, until
// |target_pages| have been discarded, or the list of candidates is exhausted. Only vmos that have
// become reclaimable more than |min_duration_since_reclaimable| in the past will be discarded;
// this prevents discarding reclaimable vmos that were recently accessed. The discarded pages are
// appended to the |freed_list| passed in; the caller takes ownership of the discarded pages and
// is responsible for freeing them. Returns the total number of pages discarded.
static uint64_t ReclaimPagesFromDiscardableVmos(uint64_t target_pages,
zx_duration_t min_duration_since_reclaimable,
list_node_t* freed_list)
// Walks up the parent tree and returns the root, or |this| if there is no parent.
const VmCowPages* GetRootLocked() const TA_REQ(lock_);
// Only for use by loaned page reclaim.
VmCowPagesContainer* raw_container();
// private constructor (use Create...())
VmCowPages(ktl::unique_ptr<VmCowPagesContainer> cow_container,
fbl::RefPtr<VmHierarchyState> root_lock, VmCowPagesOptions options,
uint32_t pmm_alloc_flags, uint64_t size, fbl::RefPtr<PageSource> page_source);
friend class VmCowPagesContainer;
~VmCowPages() override;
// This takes all the constructor parameters including the VmCowPagesContainer, which avoids any
// possibility of allocation failure.
template <class... Args>
static fbl::RefPtr<VmCowPages> NewVmCowPages(ktl::unique_ptr<VmCowPagesContainer> cow_container,
Args&&... args);
// This takes all the constructor parameters except for the VmCowPagesContainer which is
// allocated. The AllocChecker will reflect whether allocation was successful.
template <class... Args>
static fbl::RefPtr<VmCowPages> NewVmCowPages(fbl::AllocChecker* ac, Args&&... args);
// fbl_recycle() does all the explicit cleanup, and the destructor does all the implicit cleanup.
void fbl_recycle() override;
friend class fbl::Recyclable<VmCowPages>;
// Returns true when the kHidden option bit is set in options_.
bool is_hidden_locked() const TA_REQ(lock_) { return !!(options_ & VmCowPagesOptions::kHidden); }
// Returns true when the kSlice option bit is set in options_.
bool is_slice_locked() const TA_REQ(lock_) { return !!(options_ & VmCowPagesOptions::kSlice); }
// Returns true unless the kCannotDecommitZeroPages option bit is set in options_.
bool can_decommit_zero_pages_locked() const TA_REQ(lock_) {
bool result = !(options_ & VmCowPagesOptions::kCannotDecommitZeroPages);
// Cross-check: the option is expected to be set exactly when debug_is_contiguous() is true.
DEBUG_ASSERT(result == !debug_is_contiguous());
return result;
// can_borrow_locked() returns true if the VmCowPages is capable of borrowing pages, but whether
// the VmCowPages should actually borrow pages also depends on a borrowing-site-specific flag that
// the caller is responsible for checking (in addition to checking can_borrow_locked()). Only if
// both are true should the caller actually borrow at the caller's specific potential borrowing
// site. For example, see is_borrowing_in_supplypages_enabled() and
// is_borrowing_on_mru_enabled().
bool can_borrow_locked() const TA_REQ(lock_) {
  // TODO(dustingreen): Or rashaeqbal@. We can only borrow while the page is not dirty.
  // Currently we enforce this by checking ShouldTrapDirtyTransitions() below and leaning on the
  // fact that !ShouldTrapDirtyTransitions() dirtying isn't implemented yet. We currently evict
  // to reclaim instead of replacing the page, and we can't evict a dirty page since the contents
  // would be lost. Option 1: When a loaned page is about to become dirty, we could replace it
  // with a non-loaned page. Option 2: When reclaiming a loaned page we could replace instead of
  // evicting (this may be simpler).
  // Currently we can only borrow if we have a suitable PageSource, since this suitable page
  // source is currently 1:1 with having the needed backlinks for reclaim.
  bool source_is_suitable = page_source_ && page_source_->properties().is_preserving_page_content;
  // This ensures that if borrowing is globally disabled (no borrowing sites enabled), that we'll
  // return false. We could delete this bool without damaging correctness, but we want to
  // mitigate a call site that maybe fails to check its call-site-specific settings such as
  // is_borrowing_in_supplypages_enabled().
  // We also don't technically need to check is_any_borrowing_enabled() here since pmm will check
  // also, but by checking here, we minimize the amount of code that will run when
  // !is_any_borrowing_enabled() (in case we have it disabled due to late discovery of a problem
  // with borrowing).
  bool borrowing_is_generally_acceptable =
      pmm_physical_page_borrowing_config()->is_any_borrowing_enabled();
  // Exclude is_latency_sensitive_ to avoid adding latency due to reclaim.
  // Currently we evict instead of replacing a page when reclaiming, so we want to avoid evicting
  // pages that are latency sensitive or are fairly likely to be pinned at some point.
  // We also don't want to borrow a page that might get pinned again since we want to mitigate the
  // possibility of an invalid DMA-after-free.
  bool excluded_from_borrowing_for_latency_reasons = is_latency_sensitive_ || ever_pinned_;
  // Avoid borrowing and trapping dirty transitions overlapping for now; nothing really stops
  // these from being compatible AFAICT - we're just avoiding overlap of these two things until
  // later.
  // page_source_ is null for nodes that have no page source, so it has to be checked before it is
  // dereferenced. A node without a page source can never borrow (source_is_suitable is false), and
  // for it this term is simply false.
  bool overlapping_with_other_features =
      page_source_ && page_source_->ShouldTrapDirtyTransitions();
  bool result = source_is_suitable && borrowing_is_generally_acceptable &&
                !excluded_from_borrowing_for_latency_reasons && !overlapping_with_other_features;
  // Cross-check against the debug-only predicates. The page_source_ dereference in the last term
  // is only reached when debug_is_user_pager_backed_locked() has already found page_source_ to be
  // non-null.
  DEBUG_ASSERT(result == (debug_is_user_pager_backed_locked() &&
                          pmm_physical_page_borrowing_config()->is_any_borrowing_enabled() &&
                          !is_latency_sensitive_ && !ever_pinned_ &&
                          !page_source_->ShouldTrapDirtyTransitions()));
  return result;
}
// Returns true when this node has a page source that does not preserve page content.
bool direct_source_supplies_zero_pages_locked() const TA_REQ(lock_) {
bool result = page_source_ && !page_source_->properties().is_preserving_page_content;
// Cross-check: the answer is expected to match debug_is_contiguous().
DEBUG_ASSERT(result == debug_is_contiguous());
return result;
// Returns true when this node has no page source, or has one that does not preserve page
// content.
bool can_decommit_locked() const TA_REQ(lock_) {
bool result = !page_source_ || !page_source_->properties().is_preserving_page_content;
// Cross-check: the answer is expected to be the inverse of being user-pager backed.
DEBUG_ASSERT(result == !debug_is_user_pager_backed_locked());
return result;
// Add a page to the object at |offset|.
// |overwrite| controls how the function handles pre-existing content at |offset|. If |overwrite|
// does not permit replacing the content, ZX_ERR_ALREADY_EXISTS will be returned. If a page is
// released from the page list as a result of overwriting, it is returned through |released_page|
// and the caller takes ownership of this page. If the |overwrite| action is such that a page
// cannot be released, it is valid for the caller to pass in nullptr for |released_page|.
// This operation unmaps the corresponding offset from any existing mappings, unless
// |do_range_update| is false, in which case it will skip updating mappings.
// On success the page to add is moved out of `*p`, otherwise it is left there.
// Like the other *Locked methods of this class, this requires |lock_| to be held.
zx_status_t AddPageLocked(VmPageOrMarker* p, uint64_t offset, CanOverwriteContent overwrite,
                          ktl::optional<vm_page_t*>* released_page, bool do_range_update = true)
    TA_REQ(lock_);
// Unmaps and removes all the committed pages in the specified range.
// Called from DecommitRangeLocked() to perform the actual decommit action after some of the
// initial sanity checks have succeeded. Also called from DetachSourceLocked() when a VMO is
// detached from the page source, and from DiscardPages() to reclaim pages from a discardable VMO.
// Upon success the removed pages are placed in |freed_list|. The caller has ownership of these
// pages and is responsible for freeing them.
// Unlike DecommitRangeLocked(), this function only operates on |this| node, which must have no
// parent.
// |offset| must be page aligned. |len| must be less than or equal to |size_ - offset|. If |len|
// is less than |size_ - offset| it must be page aligned.
// Optionally returns the number of pages removed if |pages_freed_out| is not null.
zx_status_t UnmapAndRemovePagesLocked(uint64_t offset, uint64_t len, list_node_t* freed_list,
uint64_t* pages_freed_out = nullptr) TA_REQ(lock_);
// internal check if any pages in a range are pinned
bool AnyPagesPinnedLocked(uint64_t offset, size_t len) TA_REQ(lock_);
// Helper function for ::AllocatedPagesInRangeLocked. Counts the number of pages in ancestor vmos
// that should be attributed to this vmo for the specified range. It is an error to pass in a
// range that does not need attributing (i.e. |offset| must be < parent_limit_), although |size|
// is permitted to be such that the range exceeds parent_limit_.
// The return value is the length of the processed region, which will be <= |size| and is
// guaranteed to be > 0. The |count| is the number of pages in this region that should be
// attributed to this vmo, versus some other vmo.
uint64_t CountAttributedAncestorPagesLocked(uint64_t offset, uint64_t size, uint64_t* count) const
    TA_REQ(lock_);
// Searches for the initial content for |this| at |offset|. The result could be used to
// initialize a commit, or compare an existing commit with the original. The initial content
// is a reference to a VmPageOrMarker as there could be an explicit vm_page of content, an
// explicit zero page of content via a marker, or no initial content. Determining the meaning of
// no initial content (i.e. whether it is zero or something else) is left up to the caller.
// If an ancestor has a committed page which corresponds to |offset|, returns that page
// as well as the VmCowPages and offset which own the page. If no ancestor has a committed
// page for the offset, returns null as well as the VmCowPages/offset which need to be queried
// to populate the page.
// If the passed |owner_length| is not null, then the visible range of the owner is calculated and
// stored back into |owner_length| on the walk up. The |owner_length| represents the size of the
// range in the owner for which no other VMO in the chain had forked a page.
VmPageOrMarker* FindInitialPageContentLocked(uint64_t offset, VmCowPages** owner_out,
                                             uint64_t* owner_offset_out, uint64_t* owner_length)
    TA_REQ(lock_);
// LookupPagesLocked helper function that 'forks' the page at |offset| of the current vmo. If
// this function successfully inserts a page into |offset| of the current vmo, it returns ZX_OK
// and populates |out_page|. If a |page_request| is provided and ZX_ERR_SHOULD_WAIT is returned
// then this indicates a transient failure that should be resolved by waiting on the page_request.
// The source page that is being forked has already been calculated - it is |page|, which
// is currently in |page_owner| at offset |owner_offset|.
// This function is responsible for ensuring that COW clones never result in worse memory
// consumption than simply creating a new vmo and memcpying the content. It does this by
// migrating a page from a hidden vmo into one child if that page is not 'accessible' to the
// other child (instead of allocating a new page into the child and making the hidden vmo's
// page inaccessible).
// Whether a particular page in a hidden vmo is 'accessible' to a particular child is
// determined by a combination of two factors. First, if the page lies outside of the range
// in the hidden vmo the child can see (specified by parent_offset_ and parent_limit_), then
// the page is not accessible. Second, if the page has already been copied into the child,
// then the page in the hidden vmo is not accessible to that child. This is tracked by the
// cow_X_split bits in the vm_page_t structure.
// To handle memory allocation failure, this function performs the fork operation from the
// root vmo towards the leaf vmo. This allows the COW invariants to always be preserved.
// |page| must not be the zero-page, as there is no need to do the complex page
// fork logic to reduce memory consumption in that case.
zx_status_t CloneCowPageLocked(uint64_t offset, list_node_t* alloc_list, VmCowPages* page_owner,
vm_page_t* page, uint64_t owner_offset,
LazyPageRequest* page_request, vm_page_t** out_page) TA_REQ(lock_);
// This is an optimized wrapper around CloneCowPageLocked for when an initial content page needs
// to be forked to preserve the COW invariant, but you know you are immediately going to overwrite
// the forked page with zeros.
// The optimization it can make is that it can fork the page up to the parent and then, instead
// of forking here and then having to immediately free the page, it can insert a marker here and
// set the split bits in the parent page as if it had been forked.
zx_status_t CloneCowPageAsZeroLocked(uint64_t offset, list_node_t* freed_list,
VmCowPages* page_owner, vm_page_t* page,
uint64_t owner_offset) TA_REQ(lock_);
// Returns true if |page| (located at |offset| in this vmo) is only accessible by one
// child, where 'accessible' is defined by ::CloneCowPageLocked.
bool IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const TA_REQ(lock_);
// Releases this vmo's reference to any ancestor vmo's COW pages, for the range [start, end)
// in this vmo. This is done by either setting the pages' split bits (if something else
// can access the pages) or by freeing the pages using the |page_remover|.
// This function recursively invokes itself for regions of the parent vmo which are
// not accessible by the sibling vmo.
void ReleaseCowParentPagesLocked(uint64_t start, uint64_t end, BatchPQRemove* page_remover)
    TA_REQ(lock_);
// Helper function for ReleaseCowParentPagesLocked that processes pages which are visible
// to at least this VMO, and possibly its sibling, as well as updates parent_(offset_)limit_.
void ReleaseCowParentPagesLockedHelper(uint64_t start, uint64_t end, bool sibling_visible,
BatchPQRemove* page_remover) TA_REQ(lock_);
// Updates the parent limits of all children so that they will never be able to
// see above |new_size| in this vmo, even if the vmo is enlarged in the future.
void UpdateChildParentLimitsLocked(uint64_t new_size) TA_REQ(lock_);
// When cleaning up a hidden vmo, merges the hidden vmo's content (e.g. page list, view
// of the parent) into the remaining child.
void MergeContentWithChildLocked(VmCowPages* removed, bool removed_left) TA_REQ(lock_);
// Only valid to be called when is_slice_locked() is true and returns the first parent of this
// hierarchy that is not a slice. The offset of this slice within that VmObjectPaged is set as
// the output.
VmCowPages* PagedParentOfSliceLocked(uint64_t* offset) TA_REQ(lock_);
// Unpins a page and potentially moves it into a different page queue should its pin
// count reach zero.
void UnpinPageLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_);
// Moves an existing page to the wired queue, retaining backlink information if applicable.
void MoveToWiredLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_);
// Updates the page queue of an existing page, moving it to whichever non wired queue
// is appropriate.
void MoveToNotWiredLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_);
// Places a newly added page into the appropriate non wired page queue.
void SetNotWiredLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_);
// Updates any meta data for accessing a page. Currently this moves pager backed pages around in
// the page queue to track which ones were recently accessed for the purposes of eviction. In
// terms of functional correctness this never has to be called.
void UpdateOnAccessLocked(vm_page_t* page, uint pf_flags) TA_REQ(lock_);
// Updates the page's dirty state to the one specified, and also moves the page between page
// queues if required by the dirty state. |dirty_state| should be a valid dirty tracking state,
// i.e. one of Clean, AwaitingClean, or Dirty.
// |offset| is the page-aligned offset of the page in this object.
// |is_pending_add| indicates whether this page is yet to be added to this object's page list,
// false by default. If the page is yet to be added, this function will skip updating the page
// queue as an optimization, since the page queue will be updated later when the page gets added
// to the page list. |is_pending_add| also helps determine certain validation checks that can be
// performed on the page.
void UpdateDirtyStateLocked(vm_page_t* page, uint64_t offset, DirtyState dirty_state,
bool is_pending_add = false) TA_REQ(lock_);
// Prepares the specified range for a write, forwarding a DIRTY page request to the page source if
// pages are clean and need to transition to dirty, in which case ZX_ERR_SHOULD_WAIT will be
// returned and the caller should wait on |page_request|. If no page requests need to be
// generated, i.e. the pages are already dirty, or if they do not require the dirty transition to
// be trapped, ZX_OK is returned.
// |offset| and |len| should be page-aligned.
// |dirty_len_out| will return the (page-aligned) length starting at |offset| that contains dirty
// pages, either already dirty before making the call or dirtied during the call. In other words,
// the range [offset, offset + dirty_len_out) will be dirty when this call returns, where
// |dirty_len_out| <= |len|.
zx_status_t PrepareForWriteLocked(LazyPageRequest* page_request, uint64_t offset, uint64_t len,
uint64_t* dirty_len_out) TA_REQ(lock_);
// If supply_zero_offset_ falls within the specified range [start_offset, end_offset), try to
// advance supply_zero_offset_ over any pages in the range that might have been committed
// immediately following supply_zero_offset_. |start_offset| and |end_offset| should be
// page-aligned.
void TryAdvanceSupplyZeroOffsetLocked(uint64_t start_offset, uint64_t end_offset) TA_REQ(lock_);
// Initializes and adds as a child the given VmCowPages as a full clone of this one such that the
// VmObjectPaged backlink can be moved from this to the child, keeping all page offsets, sizes and
// other requirements (see VmObjectPaged::SetCowPagesReferenceLocked) valid. This also moves our
// paged_ref_ into |child| and updates the VmObjectPaged backlinks.
void CloneParentIntoChildLocked(fbl::RefPtr<VmCowPages>& child) TA_REQ(lock_);
// Removes the specified child from this object's |children_list_| and performs any hierarchy
// updates that need to happen as a result. This does not modify the |parent_| member of the
// removed child and if this is not being called due to |removed| being destructed it is the
// caller's responsibility to correct parent_.
void RemoveChildLocked(VmCowPages* removed) TA_REQ(lock_);
// Inserts a newly created VmCowPages into this hierarchy as a child of this VmCowPages.
// Initializes child members based on the passed in values that only have meaning when an object
// is a child. This updates the parent_ field in child to hold a refptr to |this|.
void AddChildLocked(VmCowPages* child, uint64_t offset, uint64_t root_parent_offset,
uint64_t parent_limit) TA_REQ(lock_);
// Outside of initialization/destruction, hidden vmos always have two children. For
// clarity, whichever child is first in the list is the 'left' child, and whichever
// child is second is the 'right' child. Children of a paged vmo will always be paged
// vmos themselves.
// Returns the 'left' child, i.e. the first entry of children_list_.
VmCowPages& left_child_locked() TA_REQ(lock_) TA_ASSERT(left_child_locked().lock()) {
  // Only meaningful when this node has exactly two children.
  DEBUG_ASSERT(children_list_len_ == 2);
  auto& ret = children_list_.front();
  return ret;
}
// Returns the 'right' child, i.e. the last (second) entry of children_list_.
VmCowPages& right_child_locked() TA_REQ(lock_) TA_ASSERT(right_child_locked().lock()) {
  // Only meaningful when this node has exactly two children.
  DEBUG_ASSERT(children_list_len_ == 2);
  auto& ret = children_list_.back();
  return ret;
}
// Const overload: returns the 'left' child, i.e. the first entry of children_list_.
const VmCowPages& left_child_locked() const TA_REQ(lock_) TA_ASSERT(left_child_locked().lock()) {
  // Only meaningful when this node has exactly two children.
  DEBUG_ASSERT(children_list_len_ == 2);
  const auto& ret = children_list_.front();
  return ret;
}
// Const overload: returns the 'right' child, i.e. the last (second) entry of children_list_.
const VmCowPages& right_child_locked() const TA_REQ(lock_)
    TA_ASSERT(right_child_locked().lock()) {
  // Only meaningful when this node has exactly two children.
  DEBUG_ASSERT(children_list_len_ == 2);
  const auto& ret = children_list_.back();
  return ret;
}
// NOTE(review): undocumented. Judging by the name and parameters this swaps |old| for |new_child|
// among this object's children - confirm against the definition. Requires |lock_|.
void ReplaceChildLocked(VmCowPages* old, VmCowPages* new_child) TA_REQ(lock_);
// NOTE(review): undocumented. Presumably drops |c| from this object's children; how this differs
// from RemoveChildLocked() is not visible here - confirm against the definition. Requires |lock_|.
void DropChildLocked(VmCowPages* c) TA_REQ(lock_);
// Types for an additional linked list over the VmCowPages for use when doing a
// RangeChangeUpdate.
// To avoid unbounded stack growth we need to reserve the memory to exist on a
// RangeChange list in our object so that we can have a flat iteration over a
// work list. RangeChangeLists should only be used by the RangeChangeUpdate
// code.
using RangeChangeNodeState = fbl::SinglyLinkedListNodeState<VmCowPages*>;
// Node-state accessor used by RangeChangeList: tells the intrusive list that a VmCowPages keeps
// its list linkage in range_change_state_.
struct RangeChangeTraits {
  static RangeChangeNodeState& node_state(VmCowPages& cow) { return cow.range_change_state_; }
};
using RangeChangeList =
fbl::SinglyLinkedListCustomTraits<VmCowPages*, VmCowPages::RangeChangeTraits>;
friend struct RangeChangeTraits;
// Given an initial list of VmCowPages performs RangeChangeUpdate on it until the list is empty.
static void RangeChangeUpdateListLocked(RangeChangeList* list, RangeChangeOp op);
// NOTE(review): undocumented. Judging by the name this applies a range change at [offset,
// offset + len) that originates from this object's parent, using |list| as the flat work list -
// confirm against the definition. Requires |lock_|.
void RangeChangeUpdateFromParentLocked(uint64_t offset, uint64_t len, RangeChangeList* list)
    TA_REQ(lock_);
// Helper to check whether the requested range for LockRangeLocked() / TryLockRangeLocked() /
// UnlockRangeLocked() is valid.
bool IsLockRangeValidLocked(uint64_t offset, uint64_t len) const TA_REQ(lock_);
// Lock that protects the global discardable lists.
// This lock can be acquired with the vmo's |lock_| held. To prevent deadlocks, if both locks are
// required the order of locking should always be 1) vmo's lock, and then 2) DiscardableVmosLock.
// State of a discardable vmo. The transitions between these values are described next to the
// discardable_state_ member.
// NOTE(review): only kUnset was present in the truncated definition; the remaining enumerators are
// the ones named in the discardable_state_ comment, and their relative order is an assumption -
// confirm.
enum class DiscardableState : uint8_t {
  kUnset = 0,
  kReclaimable,
  kUnreclaimable,
  kDiscarded,
};
using DiscardableList = fbl::TaggedDoublyLinkedList<VmCowPages*, internal::DiscardableListTag>;
// Two global lists of discardable vmos:
// - |discardable_reclaim_candidates_| tracks discardable vmos that are eligible for reclamation
// and haven't been reclaimed yet.
// - |discardable_non_reclaim_candidates_| tracks all other discardable VMOs.
// The lists are protected by the |DiscardableVmosLock|, and updated based on a discardable vmo's
// state changes (lock, unlock, or discard).
static DiscardableList discardable_reclaim_candidates_ TA_GUARDED(DiscardableVmosLock::Get());
static DiscardableList discardable_non_reclaim_candidates_ TA_GUARDED(DiscardableVmosLock::Get());
// Helper function to move an object from the |discardable_non_reclaim_candidates_| list to the
// |discardable_reclaim_candidates_| list.
void MoveToReclaimCandidatesListLocked() TA_REQ(lock_) TA_REQ(DiscardableVmosLock::Get());
// Helper function to move an object from the |discardable_reclaim_candidates_| list to the
// |discardable_non_reclaim_candidates_| list. If |new_candidate| is true, that indicates that the
// object was not yet being tracked on any list, and should only be inserted into the
// |discardable_non_reclaim_candidates_| list without a corresponding list removal.
// Mirrors MoveToReclaimCandidatesListLocked(): both |lock_| and the DiscardableVmosLock must be
// held, since the global discardable lists are guarded by the latter.
void MoveToNonReclaimCandidatesListLocked(bool new_candidate = false) TA_REQ(lock_)
    TA_REQ(DiscardableVmosLock::Get());
// Updates the |discardable_state_| of a discardable vmo, and moves it from one discardable list
// to another.
// NOTE(review): this declaration was unterminated. Whatever DiscardableVmosLock annotation
// (TA_REQ / TA_EXCL) originally followed TA_REQ(lock_) cannot be recovered from this file -
// confirm against the definition and restore it.
void UpdateDiscardableStateLocked(DiscardableState state) TA_REQ(lock_);
// Remove a discardable object from whichever global discardable list it is in. Called from the
// VmCowPages destructor.
void RemoveFromDiscardableListLocked() TA_REQ(lock_) TA_EXCL(DiscardableVmosLock::Get());
// Returns whether the vmo is in either one of the |discardable_reclaim_candidates_| or
// |discardable_non_reclaim_candidates_| lists, depending on whether it is a |reclaim_candidate|
// or not.
// NOTE(review): this declaration was unterminated. Whatever DiscardableVmosLock annotation
// (TA_REQ / TA_EXCL) originally followed TA_REQ(lock_) cannot be recovered from this file -
// confirm against the definition and restore it.
bool DebugIsInDiscardableListLocked(bool reclaim_candidate) const TA_REQ(lock_);
// Returns the DiscardablePageCounts for this object. Must be called without |lock_| held
// (TA_EXCL). NOTE(review): what each count means is defined with DiscardablePageCounts, which is
// not visible here.
DiscardablePageCounts GetDiscardablePageCounts() const TA_EXCL(lock_);
// Returns the root parent's page source.
fbl::RefPtr<PageSource> GetRootPageSourceLocked() const TA_REQ(lock_);
// Presumably frees the pages on the |pages| list. The visible condition separates the case where
// there is no page source, or the page source does not set is_handling_free, from the case where
// the page source handles frees.
// NOTE(review): the statements inside the branch and the remainder of the body are missing from
// this file - confirm the behavior against the full source.
void FreePages(list_node* pages) {
if (!page_source_ || !page_source_->properties().is_handling_free) {
// Presumably the single-page counterpart of FreePages(): the same page-source condition is
// tested, and otherwise |page| is placed on a temporary local list via its queue_node.
// NOTE(review): the statements inside the branch and the end of the body are missing from this
// file - confirm the behavior against the full source.
void FreePage(vm_page_t* page) {
if (!page_source_ || !page_source_->properties().is_handling_free) {
// NOTE(review): |list| is not visibly initialized (e.g. with list_initialize()) before being
// passed to list_add_tail() - confirm that the initialization exists in the full source.
list_node_t list;
list_add_tail(&list, &page->queue_node);
// NOTE(review): undocumented. Judging by the name and parameters this copies |src_page| into
// |dst_page| as part of replacing a page - confirm against the definition. Requires |lock_|.
void CopyPageForReplacementLocked(vm_page_t* dst_page, vm_page_t* src_page) TA_REQ(lock_);
// Update supply_zero_offset_ to the specified page-aligned |offset|, and potentially also reset
// awaiting_clean_zero_range_end_ if required. (See comments near declaration of
// awaiting_clean_zero_range_end_ for additional context.)
void UpdateSupplyZeroOffsetLocked(uint64_t offset) TA_REQ(lock_) {
  // Remember the old start of the (possible) AwaitingClean zero range before overwriting it.
  const uint64_t prev_supply_zero_offset = supply_zero_offset_;
  supply_zero_offset_ = offset;

  // If there was no zero range AwaitingClean, there is nothing more to do.
  if (awaiting_clean_zero_range_end_ == 0) {
    return;
  }
  DEBUG_ASSERT(prev_supply_zero_offset < awaiting_clean_zero_range_end_);

  // The AwaitingClean zero range we were tracking was [prev_supply_zero_offset,
  // awaiting_clean_zero_range_end_). If |offset| lies within this range, we still have a valid
  // AwaitingClean sub-range that we can continue tracking i.e. [offset,
  // awaiting_clean_zero_range_end_). Otherwise, the AwaitingClean zero range is no longer valid
  // and must be reset.
  if (!(offset >= prev_supply_zero_offset && offset < awaiting_clean_zero_range_end_)) {
    awaiting_clean_zero_range_end_ = 0;
  }

  // If awaiting_clean_zero_range_end_ is non-zero, it should be strictly greater than
  // supply_zero_offset_, as it is used to track the range [supply_zero_offset_,
  // awaiting_clean_zero_range_end_).
  DEBUG_ASSERT(awaiting_clean_zero_range_end_ == 0 ||
               supply_zero_offset_ < awaiting_clean_zero_range_end_);
}
// Consider trimming the AwaitingClean zero range (if there is one) to end at the specified
// page-aligned |end_offset|. The AwaitingClean zero range always starts at supply_zero_offset_.
// (See comments near declaration of awaiting_clean_zero_range_end_ for additional context.)
// Three scenarios are possible here:
// - If awaiting_clean_zero_range_end_ is 0, no AwaitingClean zero range is being tracked, so
// nothing needs to be done.
// - If |end_offset| lies within [supply_zero_offset_, awaiting_clean_zero_range_end_), the zero
// range should now end at |end_offset|. The new AwaitingClean zero range becomes
// [supply_zero_offset_, end_offset).
// - If |end_offset| lies outside of [supply_zero_offset_, awaiting_clean_zero_range_end_), it
// does not affect the AwaitingClean zero range.
void ConsiderTrimAwaitingCleanZeroRangeLocked(uint64_t end_offset) TA_REQ(lock_) {
  // No AwaitingClean zero range was being tracked.
  if (awaiting_clean_zero_range_end_ == 0) {
    return;
  }
  DEBUG_ASSERT(supply_zero_offset_ < awaiting_clean_zero_range_end_);

  // Trim the zero range to the new end offset.
  if (end_offset >= supply_zero_offset_ && end_offset < awaiting_clean_zero_range_end_) {
    awaiting_clean_zero_range_end_ = end_offset;
    // Reset awaiting_clean_zero_range_end_ if this leaves us with no valid range.
    if (awaiting_clean_zero_range_end_ == supply_zero_offset_) {
      awaiting_clean_zero_range_end_ = 0;
    }
  }

  // If awaiting_clean_zero_range_end_ is non-zero, it should be strictly greater than
  // supply_zero_offset_, as it is used to track the range [supply_zero_offset_,
  // awaiting_clean_zero_range_end_).
  DEBUG_ASSERT(awaiting_clean_zero_range_end_ == 0 ||
               supply_zero_offset_ < awaiting_clean_zero_range_end_);
}
// magic value
fbl::Canary<fbl::magic("VMCP")> canary_;
// VmCowPages keeps this ref on VmCowPagesContainer until the end of VmCowPages::fbl_recycle().
// This allows loaned page reclaim to upgrade a raw container pointer until _after_ all the pages
// have been removed from the VmCowPages. This way there's always something for loaned page
// reclaim to block on that'll do priority inheritance to the thread that needs to finish moving
// pages.
fbl::RefPtr<VmCowPagesContainer> container_;
VmCowPagesContainer* debug_retained_raw_container_ = nullptr;
// Option flags for this object (see VmCowPagesOptions). Guarded by |lock_|.
VmCowPagesOptions options_ TA_GUARDED(lock_);
// Size of this object; offsets into it are compared against this value. Guarded by |lock_|.
// NOTE(review): presumably in bytes and page aligned - confirm.
uint64_t size_ TA_GUARDED(lock_);
// Offset in the *parent* where this object starts.
uint64_t parent_offset_ TA_GUARDED(lock_) = 0;
// Offset in *this object* above which accesses will no longer access the parent.
uint64_t parent_limit_ TA_GUARDED(lock_) = 0;
// Offset in *this object* below which this vmo stops referring to its parent. This field
// is only useful for hidden vmos, where it is used by ::ReleaseCowParentPagesLocked
// together with parent_limit_ to reduce how often page split bits need to be set. It is
// effectively a summary of the parent_offset_ values of all descendants - unlike
// parent_limit_, this value does not directly impact page lookup. See partial_cow_release_ flag
// for more details on usage of this limit.
uint64_t parent_start_limit_ TA_GUARDED(lock_) = 0;
// Offset in our root parent where this object would start if projected onto it. This value is
// used as an efficient summation of accumulated offsets to ensure that an offset projected all
// the way to the root would not overflow a 64-bit integer. Although actual page resolution
// would never reach the root in such a case, a child's full range projected onto its parent is
// used to simplify some operations and so this invariant of not overflowing accumulated offsets
// needs to be maintained.
uint64_t root_parent_offset_ TA_GUARDED(lock_) = 0;
// Immutable for the lifetime of the object (const, so not guarded by |lock_|).
// NOTE(review): presumably the flags passed to the PMM when allocating pages for this object -
// confirm against the allocation call sites.
const uint32_t pmm_alloc_flags_;
// Flag which is true if there was a call to ::ReleaseCowParentPagesLocked which was
// not able to update the parent limits. When this is not set, it is sometimes
// possible for ::MergeContentWithChildLocked to do significantly less work. This flag acts as a
// proxy then for how precise the parent_limit_ and parent_start_limit_ are. It is always an
// absolute guarantee that descendants cannot see outside of the limits, but when this flag is
// true there is a possibility that there is a sub range inside the limits that they also cannot
// see.
// Imagine two siblings that see the parent range [0x1000-0x2000) and [0x3000-0x4000)
// respectively. The parent can have the start_limit of 0x1000 and limit of 0x4000, but without
// additional allocations it cannot track the free region 0x2000-0x3000, and so
// partial_cow_release_ must be set to indicate in the future we need to do more expensive
// processing to check for such free regions.
bool partial_cow_release_ TA_GUARDED(lock_) = false;
// parent pointer (may be null)
fbl::RefPtr<VmCowPages> parent_ TA_GUARDED(lock_);
// list of every child
fbl::TaggedDoublyLinkedList<VmCowPages*, internal::ChildListTag> children_list_ TA_GUARDED(lock_);
// length of children_list_
uint32_t children_list_len_ TA_GUARDED(lock_) = 0;
// Flag used for walking back up clone tree without recursion. See ::CloneCowPageLocked.
// NOTE(review): the enumerators and closing brace of StackDir are missing from this file and
// cannot be reconstructed from the visible text - restore them from the full source.
enum class StackDir : bool {
// Per-node traversal state, guarded by |lock_|: 63 bits of scratch storage plus the one-bit
// StackDir flag.
struct {
uint64_t scratch : 63;
StackDir dir_flag : 1;
} stack_ TA_GUARDED(lock_);
// This value is used when determining against which user-visible vmo a hidden vmo's
// pages should be attributed. It serves as a tie-breaker for pages that are accessible by
// multiple user-visible vmos. See ::HasAttributedAncestorPageLocked for more details.
// For non-hidden vmobjects, this always equals user_id_. For hidden vmobjects, this
// is the page_attribution_user_id_ of one of their children (i.e. the user_id_ of one
// of their non-hidden descendants).
uint64_t page_attribution_user_id_ TA_GUARDED(lock_) = 0;
// Counts the total number of pages pinned by ::CommitRange. If one page is pinned n times, it
// contributes n to this count.
uint64_t pinned_page_count_ TA_GUARDED(lock_) = 0;
// The page source, if any.
const fbl::RefPtr<PageSource> page_source_;
// The offset beyond which new page requests are fulfilled by supplying zero pages, rather than
// having the page source supply pages. Only relevant if there is a valid page_source_ and it
// preserves page content.
// Updating supply_zero_offset_ might affect the AwaitingClean zero range being tracked by
// [supply_zero_offset_, awaiting_clean_zero_range_end_), and so supply_zero_offset_ should not
// be directly assigned. Use the UpdateSupplyZeroOffsetLocked() helper instead. See comments near
// awaiting_clean_zero_range_end_ for more context.
uint64_t supply_zero_offset_ TA_GUARDED(lock_) = UINT64_MAX;
// If supply_zero_offset_ is relevant, and there is a zero range that is AwaitingClean, i.e. a
// zero range starting at supply_zero_offset_, on which WritebackBegin was called but not
// WritebackEnd, awaiting_clean_zero_range_end_ tracks the end of that range. In other words, if
// there exists a zero range that is AwaitingClean, that range is [supply_zero_offset_,
// awaiting_clean_zero_range_end_).
// Will be set to 0 otherwise. So awaiting_clean_zero_range_end_ will either be 0, or will be
// strictly greater than supply_zero_offset_.
// Note that there can be at most one zero range that is AwaitingClean at a time.
// The motivation for this value is to be able to transition the zero range starting at
// supply_zero_offset_ to Clean once it has been written back by the user pager, without having to
// track per-page dirty state for this zero range, which is represented in the page list by a gap.
// TODO(rashaeqbal): Consider removing this once page lists can support custom zero ranges.
uint64_t awaiting_clean_zero_range_end_ TA_GUARDED(lock_) = 0;
// Count eviction events so that we can report them to the user.
uint64_t eviction_event_count_ TA_GUARDED(lock_) = 0;
// Count of outstanding lock operations. A non-zero count prevents the kernel from discarding /
// evicting pages from the VMO to relieve memory pressure (currently only applicable if
// |kDiscardable| is set). Note that this does not prevent removal of pages by other means, like
// decommitting or resizing, since those are explicit actions driven by the user, not by the
// kernel directly.
uint64_t lock_count_ TA_GUARDED(lock_) = 0;
// Timestamp of the last unlock operation that changed a discardable vmo's state to
// |kReclaimable|. Used to determine whether the vmo was accessed too recently to be discarded.
zx_time_t last_unlock_timestamp_ TA_GUARDED(lock_) = ZX_TIME_INFINITE;
// The current state of a discardable vmo, depending on the lock count and whether it has been
// discarded.
// State transitions work as follows:
// 1. kUnreclaimable -> kReclaimable: When the lock count changes from 1 to 0.
// 2. kReclaimable -> kUnreclaimable: When the lock count changes from 0 to 1. The vmo remains
// kUnreclaimable for any non-zero lock count.
// 3. kReclaimable -> kDiscarded: When a vmo with lock count 0 is discarded.
// 4. kDiscarded -> kUnreclaimable: When a discarded vmo is locked again.
// We start off with state kUnset, so a discardable vmo must be locked at least once to opt into
// the above state transitions. For non-discardable vmos, the state will always remain kUnset.
DiscardableState discardable_state_ TA_GUARDED(lock_) = DiscardableState::kUnset;
// a tree of pages
VmPageList page_list_ TA_GUARDED(lock_);
// Intrusive list linkage that lets this object sit on a RangeChangeList; accessed through
// RangeChangeTraits::node_state().
RangeChangeNodeState range_change_state_;
// NOTE(review): presumably the offset and length of the pending range for this object while it is
// on a RangeChangeList - confirm against RangeChangeUpdateListLocked(). Guarded by |lock_|.
uint64_t range_change_offset_ TA_GUARDED(lock_);
uint64_t range_change_len_ TA_GUARDED(lock_);
// optional reference back to a VmObjectPaged so that we can perform mapping updates. This is a
// raw pointer to avoid circular references, the VmObjectPaged destructor needs to update it.
VmObjectPaged* paged_ref_ TA_GUARDED(lock_) = nullptr;
// TODO(fxb/85056): This is a temporary solution and needs to be replaced with something that is
// formalized.
// Marks whether or not this VMO is considered a latency sensitive object. For a VMO being latency
// sensitive means pages that get committed should not be decommitted (or made expensive to
// access) by any background kernel process, such as the zero page deduper.
// Note: This does not presently protect against user pager eviction, as there is already a
// separate mechanism for that. Once fxb/85056 is resolved this might change.
bool is_latency_sensitive_ TA_GUARDED(lock_) = false;
using Cursor =
VmoCursor<VmCowPages, DiscardableVmosLock, DiscardableList, DiscardableList::iterator>;
// The list of all outstanding cursors iterating over the discardable lists:
// |discardable_reclaim_candidates_| and |discardable_non_reclaim_candidates_|. The cursors should
// be advanced (by calling AdvanceIf()) before removing any element from the discardable lists.
// Guarded by the DiscardableVmosLock, like the discardable lists the cursors iterate over.
static fbl::DoublyLinkedList<Cursor*> discardable_vmos_cursors_
    TA_GUARDED(DiscardableVmosLock::Get());
// With this bool we achieve these things:
// * Avoid using loaned pages for a VMO that will just get pinned and replace the loaned pages
// with non-loaned pages again, possibly repeatedly.
// * Avoid increasing pin latency in the (more) common case of pinning a VMO the 2nd or
// subsequent times (vs the 1st time).
// * Once we have any form of active sweeping (of data from non-loaned to loaned physical pages)
// this bool is part of mitigating any potential DMA-while-not-pinned (which is not permitted
// but is also difficult to detect or prevent without an IOMMU).
bool ever_pinned_ TA_GUARDED(lock_) = false;
// VmCowPagesContainer exists to essentially split the VmCowPages ref_count_ into two counts, so
// that it remains possible to upgrade from a raw container pointer until after the VmCowPages
// fbl_recycle() has mostly completed and has removed and freed all the pages.
// This way, if we can upgrade, then we can call RemovePageForEviction() and it'll either work or
// the page will already have been removed from that location in the VmCowPages, or we can't
// upgrade, in which case all the pages have already been removed and freed.
// In contrast if we were to attempt upgrade of a raw VmCowPages pointer to VmCowPages ref, the
// ability to upgrade would disappear before the backlink is removed to make room for a
// StackOwnedLoanedPagesInterval, so loaned page reclaim would need to wait (somehow) for the page
// to be removed from the VmCowPages and at least have a backlink. That wait is problematic since
// it would also need to propagate priority inheritance properly like StackOwnedLoanedPagesInterval
// does, but the interval begins at the moment the refcount goes from 1 to 0, and reliably wrapping
// that 1 to 0 transition, while definitely possible with some RefPtr changes etc etc, is more
// complicated than having a VmCowPagesContainer whose ref can still be obtained up until after the
// pages have become FREE. There may of course be yet other options that are overall better; please
// suggest if you think of one.
// All the explicit cleanup of VmCowPages happens in VmCowPages::fbl_recycle(), with the final
// explicit fbl_recycle() step being release of the containing VmCowPagesContainer which in turn
// triggers ~VmCowPages which finishes up with implicit cleanup of VmCowPages (but possibly delayed
// slightly by loaned page reclaimer(s) that can have a VmCowPagesContainer ref transiently).
// Those paying close attention may note that under high load with potential low priority thread
// starvation (with a hypothetical scheduling policy that is assumed to let thread starvation be
// possible), each low priority loaned page reclaiming thread may essentially be thought of as
// having up to one VmCowPagesContainer + contained de-populated VmCowPages as additional memory
// overhead that can be thought of as being essentially attributed to the memory cost of the low
// priority thread. I think this is completely fine and completely analogous to many other similar
// situations. In a sense it's priority inversion of the rest of cleanup of the VmCowPages memory,
// but since it's a depopulated VmCowPages, the symptom isn't enough of a problem to justify any
// mitigation other than mentally accounting for it in the low priority thread's memory cost. We
// should be careful not to let a refcount held by a lower priority thread potentially keep
// unbounded memory allocated of course, but in this case it's well bounded.
// We restrict visibility of VmCowPages via its VmCowPagesContainer, to control which methods are
// ok to call on the VmCowPages via a VmCowPagesContainer ref while lacking any direct VmCowPages
// ref. The methods that are ok to call with only a VmCowPagesContainer ref are called via a
// corresponding method on VmCowPagesContainer.
class VmCowPagesContainer : public fbl::RefCountedUpgradeable<VmCowPagesContainer> {
// Defaulted constructor. Nothing here constructs the contained VmCowPages; per the comment on
// EmplaceCow() below, that is done separately via EmplaceCow().
VmCowPagesContainer() = default;
// These are the only VmCowPages methods that are ok to call via ref on VmCowPagesContainer while
// holding no ref on the contained VmCowPages. These will operate correctly despite potential
// concurrent VmCowPages::fbl_recycle() on a different thread and despite VmCowPages refcount_
// potentially being 0. The VmCowPagesContainer ref held by the caller keeps the actual
// VmCowPages object alive during this call.
// Container-side counterpart of VmCowPages::RemovePageForEviction(), callable while holding only a
// VmCowPagesContainer ref.
// NOTE(review): the bool presumably reports whether |page| was actually removed at |offset| -
// confirm against VmCowPages::RemovePageForEviction(), whose definition is not in this chunk.
bool RemovePageForEviction(vm_page_t* page, uint64_t offset,
VmCowPages::EvictionHintAction hint_action);
// Container-side counterpart of VmCowPages::ReplacePage(), callable while holding only a
// VmCowPagesContainer ref.
// NOTE(review): |with_loaned| presumably selects whether the replacement page may be a loaned
// page - confirm against VmCowPages::ReplacePage(), which is not visible here.
zx_status_t ReplacePage(vm_page_t* page, uint64_t offset, bool with_loaned);
// VmCowPages is granted access to this class's non-public members.
// NOTE(review): presumably so it can reach EmplaceCow()/cow() during creation and teardown -
// confirm against the VmCowPages implementation.
friend class VmCowPages;
// We'd use ktl::optional<VmCowPages> or std::variant<monostate, VmCowPages>, but both those
// require is_constructible_v<VmCowPages, ...>, which in turn requires the VmCowPages constructor
// to be public, which we don't want.
// Used for construction of contained VmCowPages.
// Accepts an arbitrary argument pack by forwarding reference.
// NOTE(review): presumably perfect-forwards |args| to the (non-public) VmCowPages constructor and
// placement-constructs into cow_space_ - the definition is not in this chunk; confirm there.
template <class... Args>
void EmplaceCow(Args&&... args);
// Accessor yielding a mutable VmCowPages reference.
// NOTE(review): presumably refers to the object living in cow_space_, and is only valid once
// EmplaceCow() has run - the definition is not visible here; confirm.
VmCowPages& cow();
// Raw storage with the size and alignment of a VmCowPages; used in place of ktl::optional /
// std::variant for the reason given in the comment above.
ktl::aligned_storage_t<sizeof(VmCowPages), alignof(VmCowPages)> cow_space_;
// Starts out false.
// NOTE(review): presumably flipped to true once EmplaceCow() has constructed the VmCowPages in
// cow_space_, and consulted before that object is accessed or destroyed - confirm in the
// implementation.
bool is_cow_present_ = false;