| // Copyright 2020 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #ifndef ZIRCON_KERNEL_VM_INCLUDE_VM_VM_COW_PAGES_H_ |
| #define ZIRCON_KERNEL_VM_INCLUDE_VM_VM_COW_PAGES_H_ |
| |
| #include <assert.h> |
| #include <lib/user_copy/user_ptr.h> |
| #include <lib/zircon-internal/thread_annotations.h> |
| #include <stdint.h> |
| #include <zircon/listnode.h> |
| #include <zircon/types.h> |
| |
| #include <fbl/array.h> |
| #include <fbl/canary.h> |
| #include <fbl/enum_bits.h> |
| #include <fbl/intrusive_double_list.h> |
| #include <fbl/macros.h> |
| #include <fbl/ref_counted.h> |
| #include <fbl/ref_ptr.h> |
| #include <kernel/mutex.h> |
| #include <vm/page_source.h> |
| #include <vm/physical_page_borrowing_config.h> |
| #include <vm/pmm.h> |
| #include <vm/vm.h> |
| #include <vm/vm_aspace.h> |
| #include <vm/vm_object.h> |
| #include <vm/vm_page_list.h> |
| |
| // Forward declare these so VmCowPages helpers can accept references. |
| class BatchPQRemove; |
| class VmObjectPaged; |
| class VmCowPagesContainer; |
| |
| namespace internal { |
| struct DiscardableListTag {}; |
| } // namespace internal |
| |
| enum class VmCowPagesOptions : uint32_t { |
| // Externally-usable flags: |
| kNone = 0u, |
| |
| // With this clear, zeroing a page tries to decommit the page. With this set, zeroing never |
| // decommits the page. Currently this is only set for contiguous VMOs. |
| // |
| // TODO(dustingreen): Once we're happy with the reliability of page borrowing, we should be able |
| // to relax this restriction. We may still need to flush zeroes to RAM during reclaim to mitigate |
| // a hypothetical client incorrectly assuming that cache-clean status will remain intact while |
| // pages aren't pinned, but that mitigation should be sufficient (even assuming such a client) to |
| // allow implicit decommit when zeroing or when zero scanning, as long as no clients are doing DMA |
| // to/from contiguous while not pinned. |
| kCannotDecommitZeroPages = (1u << 0), |
| |
| // Internal-only flags: |
| kHidden = (1u << 1), |
| kSlice = (1u << 2), |
| kUnpinOnDelete = (1u << 3), |
| |
| kInternalOnlyMask = kHidden | kSlice | kUnpinOnDelete, |
| }; |
| FBL_ENABLE_ENUM_BITS(VmCowPagesOptions) |
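| |
| // Illustrative usage (not part of the interface): FBL_ENABLE_ENUM_BITS makes the bitwise |
| // operators available on VmCowPagesOptions, so options can be combined and tested directly. |
| // A minimal sketch, assuming a hidden slice node: |
| // |
| //   VmCowPagesOptions opts = VmCowPagesOptions::kHidden | VmCowPagesOptions::kSlice; |
| //   if (!!(opts & VmCowPagesOptions::kInternalOnlyMask)) { |
| //     // opts contains at least one internal-only flag. |
| //   } |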
| |
| // Implements a copy-on-write hierarchy of pages in a VmPageList. |
| class VmCowPages final |
| : public VmHierarchyBase, |
| public fbl::ContainableBaseClasses< |
| // Guarded by lock_. |
| fbl::TaggedDoublyLinkedListable<VmCowPages*, internal::ChildListTag>, |
| // Guarded by DiscardableVmosLock::Get(). |
| fbl::TaggedDoublyLinkedListable<VmCowPages*, internal::DiscardableListTag>>, |
| public fbl::Recyclable<VmCowPages> { |
| public: |
| static zx_status_t Create(fbl::RefPtr<VmHierarchyState> root_lock, VmCowPagesOptions options, |
| uint32_t pmm_alloc_flags, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_pages); |
| |
| static zx_status_t CreateExternal(fbl::RefPtr<PageSource> src, VmCowPagesOptions options, |
| fbl::RefPtr<VmHierarchyState> root_lock, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_pages); |
| |
| // Creates a copy-on-write clone with the desired parameters. This can fail due to various |
| // internal states not being correct. |
| zx_status_t CreateCloneLocked(CloneType type, uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* child_cow) TA_REQ(lock_); |
| |
| // Creates a child that looks back to this VmCowPages for all operations. Once a child slice is |
| // created this node should not ever be Resized. |
| zx_status_t CreateChildSliceLocked(uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_slice) TA_REQ(lock_); |
| |
| // Returns the size in bytes of this cow pages range. This will always be a multiple of the page |
| // size. |
| uint64_t size_locked() const TA_REQ(lock_) { return size_; } |
| |
| // Returns whether this cow pages node is ultimately backed by a user pager to fulfill initial |
| // content, and not zero pages. Contiguous VMOs have page_source_ set, but are not pager backed |
| // in this sense. |
| // |
| // This should only be used to report to user mode whether a VMO is user-pager backed, not for any |
| // other purpose. |
| bool is_root_source_user_pager_backed_locked() const TA_REQ(lock_) { |
| auto root = GetRootLocked(); |
| // The root will never be null. It will either point to a valid parent, or |this| if there's no |
| // parent. |
| DEBUG_ASSERT(root); |
| return root->page_source_ && root->page_source_->properties().is_user_pager; |
| } |
| |
| bool debug_is_user_pager_backed_locked() const TA_REQ(lock_) { |
| return page_source_ && page_source_->properties().is_user_pager; |
| } |
| |
| bool debug_is_contiguous() const TA_REQ(lock_) { |
| return page_source_ && page_source_->properties().is_providing_specific_physical_pages; |
| } |
| |
| bool is_private_pager_copy_supported() const TA_REQ(lock_) { |
| auto root = GetRootLocked(); |
| // The root will never be null. It will either point to a valid parent, or |this| if there's no |
| // parent. |
| DEBUG_ASSERT(root); |
| bool result = root->page_source_ && root->page_source_->properties().is_preserving_page_content; |
| DEBUG_ASSERT(result == is_root_source_user_pager_backed_locked()); |
| return result; |
| } |
| |
| bool is_cow_clonable_locked() const TA_REQ(lock_) { |
| // Copy-on-write clones of pager vmos or their descendants aren't supported as we can't |
| // efficiently make an immutable snapshot. |
| if (can_root_source_evict_locked()) { |
| return false; |
| } |
| |
| // We also don't support COW clones for contiguous VMOs. |
| if (is_source_supplying_specific_physical_pages_locked()) { |
| return false; |
| } |
| |
| // Copy-on-write clones of slices aren't supported at the moment due to the resulting VMO chains |
| // having non-hidden VMOs between hidden VMOs. This case cannot currently be handled by |
| // CloneCowPageLocked, and so we forbid the construction of such cases for the moment. |
| // Bug: 36841 |
| if (is_slice_locked()) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool can_evict_locked() const TA_REQ(lock_) { |
| bool result = page_source_ && page_source_->properties().is_preserving_page_content; |
| DEBUG_ASSERT(result == debug_is_user_pager_backed_locked()); |
| return result; |
| } |
| |
| bool can_root_source_evict_locked() const TA_REQ(lock_) { |
| auto root = GetRootLocked(); |
| // The root will never be null. It will either point to a valid parent, or |this| if there's no |
| // parent. |
| DEBUG_ASSERT(root); |
| AssertHeld(root->lock_); |
| bool result = root->can_evict_locked(); |
| DEBUG_ASSERT(result == is_root_source_user_pager_backed_locked()); |
| return result; |
| } |
| |
| bool has_pager_backlinks_locked() const TA_REQ(lock_) { |
| bool result = can_evict_locked(); |
| DEBUG_ASSERT(result == debug_is_user_pager_backed_locked()); |
| return result; |
| } |
| |
| // Returns whether this cow pages node is dirty tracked. |
| bool is_dirty_tracked_locked() const TA_REQ(lock_) { |
| // Pager-backed VMOs require dirty tracking if either: |
| // 1. They are directly backed by the pager, i.e. the root VMO. |
| // OR |
| // 2. They are slice children of root pager-backed VMOs, since slices directly reference the |
| // parent's pages. |
| auto* which_cow = is_slice_locked() ? parent_.get() : this; |
| bool result = |
| which_cow->page_source_ && which_cow->page_source_->properties().is_preserving_page_content; |
| AssertHeld(which_cow->lock_); |
| DEBUG_ASSERT(result == which_cow->debug_is_user_pager_backed_locked()); |
| return result; |
| } |
| |
| bool is_source_preserving_page_content_locked() const TA_REQ(lock_) { |
| bool result = page_source_ && page_source_->properties().is_preserving_page_content; |
| DEBUG_ASSERT(result == debug_is_user_pager_backed_locked()); |
| return result; |
| } |
| |
| bool is_source_supplying_specific_physical_pages_locked() const TA_REQ(lock_) { |
| bool result = page_source_ && page_source_->properties().is_providing_specific_physical_pages; |
| DEBUG_ASSERT(result == debug_is_contiguous()); |
| return result; |
| } |
| |
| // When attributing pages, hidden nodes must be attributed to either their left or right |
| // descendants. The attribution IDs of all involved determine where attribution goes. For |
| // historical and practical reasons actual user ids are used, although any consistent naming |
| // scheme will have the same effect. |
| void set_page_attribution_user_id_locked(uint64_t id) TA_REQ(lock_) { |
| page_attribution_user_id_ = id; |
| } |
| |
| // See description on |pinned_page_count_| for meaning. |
| uint64_t pinned_page_count_locked() const TA_REQ(lock_) { return pinned_page_count_; } |
| |
| // Sets the VmObjectPaged backlink for this copy-on-write node. This object has no tracking of |
| // mappings, but understands that they exist. When it manipulates pages in a way that could affect |
| // mappings it uses the backlink to notify the VmObjectPaged. |
| // Currently it is assumed that all nodes always have backlinks with the 1:1 hierarchy mapping. |
| void set_paged_backlink_locked(VmObjectPaged* ref) TA_REQ(lock_) { paged_ref_ = ref; } |
| |
| uint64_t HeapAllocationBytesLocked() const TA_REQ(lock_) { |
| return page_list_.HeapAllocationBytes(); |
| } |
| |
| uint64_t EvictionEventCountLocked() const TA_REQ(lock_) { return eviction_event_count_; } |
| |
| void DetachSourceLocked() TA_REQ(lock_); |
| |
| // Resizes the range of this cow pages. |size| must be a multiple of the page size and this must |
| // not be called on slices or nodes with slice children. |
| zx_status_t ResizeLocked(uint64_t size) TA_REQ(lock_); |
| |
| // See VmObject::Lookup |
| zx_status_t LookupLocked(uint64_t offset, uint64_t len, VmObject::LookupFunction lookup_fn) |
| TA_REQ(lock_); |
| |
| // Similar to LookupLocked, but enumerates all readable pages in the hierarchy within the requested |
| // range. The offset passed to the |lookup_fn| is the offset this page is visible at in this |
| // object, even if the page itself is committed in a parent object. The physical addresses given |
| // to the lookup_fn should not be retained in any way unless the range has also been pinned by the |
| // caller. |
| // Ranges of length zero are considered invalid and will return ZX_ERR_INVALID_ARGS. The lookup_fn |
| // can terminate iteration early by returning ZX_ERR_STOP. |
| using LookupReadableFunction = |
| fit::inline_function<zx_status_t(uint64_t offset, paddr_t pa), 4 * sizeof(void*)>; |
| zx_status_t LookupReadableLocked(uint64_t offset, uint64_t len, LookupReadableFunction lookup_fn) |
| TA_REQ(lock_); |
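| |
| // Illustrative LookupReadableLocked call (hypothetical caller, for exposition only): record the |
| // physical address of the first readable page in the range, terminating iteration early: |
| // |
| //   zx_status_t status = LookupReadableLocked( |
| //       offset, len, [&found_pa](uint64_t off, paddr_t pa) { |
| //         found_pa = pa; |
| //         return ZX_ERR_STOP; |
| //       }); |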
| |
| // See VmObject::TakePages |
| zx_status_t TakePagesLocked(uint64_t offset, uint64_t len, VmPageSpliceList* pages) TA_REQ(lock_); |
| |
| // See VmObject::SupplyPages |
| // |
| // The new_zeroed_pages parameter should be true if the pages are new pages that need to be |
| // initialized, or false if the pages are from a different VmCowPages and are being moved to this |
| // VmCowPages. |
| zx_status_t SupplyPagesLocked(uint64_t offset, uint64_t len, VmPageSpliceList* pages, |
| bool new_zeroed_pages) TA_REQ(lock_); |
| |
| // Unlocked variant of SupplyPagesLocked above; see it for the meaning of |new_zeroed_pages|. |
| zx_status_t SupplyPages(uint64_t offset, uint64_t len, VmPageSpliceList* pages, |
| bool new_zeroed_pages) TA_EXCL(lock_); |
| |
| // See VmObject::FailPageRequests |
| zx_status_t FailPageRequestsLocked(uint64_t offset, uint64_t len, zx_status_t error_status) |
| TA_REQ(lock_); |
| |
| // Used to track dirty_state in the vm_page_t. |
| // |
| // The transitions between the three dirty tracking states can roughly be summarized as follows: |
| // 1. A page starts off as Clean when supplied. |
| // 2. A write transitions the page from Clean to Dirty. |
| // 3. A writeback_begin moves the Dirty page to AwaitingClean. |
| // 4. A writeback_end moves the AwaitingClean page to Clean. |
| // 5. A write that comes in while the writeback is in progress (i.e. the page is AwaitingClean) |
| // moves the AwaitingClean page back to Dirty. |
| enum class DirtyState : uint8_t { |
| // The page does not track dirty state. Used for non pager backed pages. |
| Untracked = 0, |
| // The page is clean, i.e. its contents have not been altered from when the page was supplied. |
| Clean, |
| // The page's contents have been modified from the time of supply, and should be written back to |
| // the page source at some point. |
| Dirty, |
| // The page still has modified contents, but the page source is in the process of writing back |
| // the changes. This is used to ensure that a consistent version is written back, and that any |
| // new modifications that happen during the writeback are not lost. The page source will mark |
| // pages AwaitingClean before starting any writeback. |
| AwaitingClean, |
| NumStates, |
| }; |
| // Make sure that the state can be encoded in the vm_page_t's dirty_state field. |
| static_assert(static_cast<uint8_t>(DirtyState::NumStates) <= VM_PAGE_OBJECT_MAX_DIRTY_STATES); |
| |
| static bool is_page_dirty_tracked(const vm_page_t* page) { |
| return DirtyState(page->object.dirty_state) != DirtyState::Untracked; |
| } |
| static bool is_page_dirty(const vm_page_t* page) { |
| return DirtyState(page->object.dirty_state) == DirtyState::Dirty; |
| } |
| static bool is_page_clean(const vm_page_t* page) { |
| return DirtyState(page->object.dirty_state) == DirtyState::Clean; |
| } |
| static bool is_page_awaiting_clean(const vm_page_t* page) { |
| return DirtyState(page->object.dirty_state) == DirtyState::AwaitingClean; |
| } |
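| |
| // Illustrative check (not part of the interface): the dirty/clean predicates above are only |
| // meaningful for pages that are dirty tracked, so callers typically gate on that first, e.g. |
| // |
| //   if (is_page_dirty_tracked(page) && is_page_dirty(page)) { |
| //     // Page has modified contents that still need to be written back. |
| //   } |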
| |
| // See VmObject::DirtyPages |
| zx_status_t DirtyPagesLocked(uint64_t offset, uint64_t len) TA_REQ(lock_); |
| |
| using DirtyRangeEnumerateFunction = VmObject::DirtyRangeEnumerateFunction; |
| // See VmObject::EnumerateDirtyRanges |
| zx_status_t EnumerateDirtyRangesLocked(uint64_t offset, uint64_t len, |
| DirtyRangeEnumerateFunction&& dirty_range_fn) |
| TA_REQ(lock_); |
| |
| // See VmObject::WritebackBegin |
| zx_status_t WritebackBeginLocked(uint64_t offset, uint64_t len) TA_REQ(lock_); |
| |
| // See VmObject::WritebackEnd |
| zx_status_t WritebackEndLocked(uint64_t offset, uint64_t len) TA_REQ(lock_); |
| |
| using LookupInfo = VmObject::LookupInfo; |
| using DirtyTrackingAction = VmObject::DirtyTrackingAction; |
| // See VmObject::GetPage |
| // The pages returned from this are assumed to be used in the following ways. |
| // * Our VmObjectPaged backlink, or any of children's backlinks, are allowed to have readable |
| // mappings, and will be informed to unmap via the backlinks when needed. |
| // * Our VmObjectPaged backlink and our *slice* children are allowed to have writable mappings, |
| // and will be informed to either unmap or remove writability when needed. |
| zx_status_t LookupPagesLocked(uint64_t offset, uint pf_flags, DirtyTrackingAction mark_dirty, |
| uint64_t max_out_pages, list_node* alloc_list, |
| LazyPageRequest* page_request, LookupInfo* out) TA_REQ(lock_); |
| |
| // Controls the type of content that can be overwritten by the Add[New]Page[s]Locked functions. |
| enum class CanOverwriteContent : uint8_t { |
| // Do not overwrite any kind of content, i.e. only add a page at the slot if there is true |
| // absence of content. |
| None, |
| // Only overwrite slots that represent zeros. In the case of anonymous VMOs, both gaps and zero |
| // page markers represent zeros, as the entire VMO is implicitly zero on creation. For pager |
| // backed VMOs, zero page markers and gaps after supply_zero_offset_ represent zeros. |
| Zero, |
| // Overwrite any slots, regardless of the type of content. |
| NonZero, |
| }; |
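| // For example (illustrative): calling AddNewPageLocked() below with CanOverwriteContent::Zero |
| // will replace an existing zero page marker at the slot, but will fail with |
| // ZX_ERR_ALREADY_EXISTS if the slot holds a committed page. |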
| // Adds an allocated page to this cow pages node at the specified offset. The page can optionally |
| // be zeroed and any mappings invalidated. If an error is returned the caller retains ownership of |
| // |page|. Offset must be page aligned. |
| // |
| // |overwrite| controls how the function handles pre-existing content at |offset|. If |overwrite| |
| // does not permit replacing the content, ZX_ERR_ALREADY_EXISTS will be returned. If a page is |
| // released from the page list as a result of overwriting, it is returned through |released_page| |
| // and the caller takes ownership of this page. If the |overwrite| action is such that a page |
| // cannot be released, it is valid for the caller to pass in nullptr for |released_page|. |
| zx_status_t AddNewPageLocked(uint64_t offset, vm_page_t* page, CanOverwriteContent overwrite, |
| ktl::optional<vm_page_t*>* released_page, bool zero = true, |
| bool do_range_update = true) TA_REQ(lock_); |
| |
| // Adds a set of pages consecutively starting from the given offset. Regardless of the return |
| // result, ownership of the pages is taken. Pages are assumed to be in the ALLOC state and can be |
| // optionally zeroed before inserting. start_offset must be page aligned. |
| // |
| // |overwrite| controls how the function handles pre-existing content in the range. If |overwrite| |
| // does not permit replacing the content, ZX_ERR_ALREADY_EXISTS will be returned. Pages released |
| // from the page list as a result of overwriting are returned through |released_pages| and the |
| // caller takes ownership of these pages. If the |overwrite| action is such that pages cannot be |
| // released, it is valid for the caller to pass in nullptr for |released_pages|. |
| zx_status_t AddNewPagesLocked(uint64_t start_offset, list_node_t* pages, |
| CanOverwriteContent overwrite, list_node_t* released_pages, |
| bool zero = true, bool do_range_update = true) TA_REQ(lock_); |
| |
| // Attempts to release pages in the page list, causing the range to become copy-on-write again. |
| // For consistency, this will fail if there is a parent or a backing page source such that the |
| // range would not explicitly copy-on-write the zero page. Use ZeroPagesLocked for an |
| // operation that is guaranteed to succeed, but may not release memory. |
| zx_status_t DecommitRangeLocked(uint64_t offset, uint64_t len) TA_REQ(lock_); |
| |
| // After successful completion the range of pages will all read as zeros. The mechanism used to |
| // achieve this is not guaranteed to decommit, but it will try to. |
| // |page_start_base| and |page_end_base| must be page aligned offsets within the range of the |
| // object. |zeroed_len_out| will contain the length (in bytes) starting at |page_start_base| that |
| // was successfully zeroed. |
| // |
| // Returns one of the following: |
| // ZX_OK => The whole range was successfully zeroed. |
| // ZX_ERR_SHOULD_WAIT => The caller needs to wait on the |page_request| and then retry the |
| // operation. |zeroed_len_out| will contain the range that was partially zeroed, so the caller |
| // can advance the start offset before retrying. |
| // Any other error code indicates a failure to zero a part of the range or the whole range. |
| zx_status_t ZeroPagesLocked(uint64_t page_start_base, uint64_t page_end_base, |
| LazyPageRequest* page_request, uint64_t* zeroed_len_out) |
| TA_REQ(lock_); |
| |
| // Attempts to commit a range of pages. This has three kinds of return status: |
| // ZX_OK => The whole range was successfully committed and |len| will be written to |
| // |committed_len| |
| // ZX_ERR_SHOULD_WAIT => A partial (potentially 0) range was committed (output in |committed_len|) |
| // and the passed in |page_request| should be waited on before retrying |
| // the commit operation. The portion that was successfully committed does |
| // not need to be retried. |
| // * => Any other error, the number of pages committed is undefined. |
| // The |offset| and |len| are assumed to be page aligned and within the range of |size_|. |
| zx_status_t CommitRangeLocked(uint64_t offset, uint64_t len, uint64_t* committed_len, |
| LazyPageRequest* page_request) TA_REQ(lock_); |
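| |
| // Illustrative retry pattern for CommitRangeLocked (hypothetical caller; real callers must also |
| // drop lock_ while waiting on the page request): |
| // |
| //   uint64_t committed = 0; |
| //   zx_status_t status = CommitRangeLocked(offset, len, &committed, &page_request); |
| //   while (status == ZX_ERR_SHOULD_WAIT) { |
| //     offset += committed; |
| //     len -= committed; |
| //     // ...wait on page_request, then retry... |
| //     status = CommitRangeLocked(offset, len, &committed, &page_request); |
| //   } |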
| |
| // Increases the pin count of the range of pages given by |offset| and |len|. The full range must |
| // already be committed and this either pins all pages in the range, or pins no pages and returns |
| // an error. The caller can assume that on success len / PAGE_SIZE pages were pinned. |
| // The |offset| and |len| are assumed to be page aligned and within the range of |size_|. |
| // |
| // This method also replaces any loaned pages with non-loaned pages. |
| zx_status_t PinRangeLocked(uint64_t offset, uint64_t len) TA_REQ(lock_); |
| |
| // See VmObject::Unpin |
| void UnpinLocked(uint64_t offset, uint64_t len, bool allow_gaps) TA_REQ(lock_); |
| |
| // Returns true if a page is not currently committed, and if the offset were to be read from, it |
| // would be read as zero. Requested offset must be page aligned and within range. |
| bool PageWouldReadZeroLocked(uint64_t page_offset) TA_REQ(lock_); |
| |
| // see VmObjectPaged::AttributedPagesInRange |
| size_t AttributedPagesInRangeLocked(uint64_t offset, uint64_t len) const TA_REQ(lock_); |
| |
| // Scans this cow pages range for zero pages and frees them if |reclaim| is set to true. Returns |
| // the number of pages freed if |reclaim| is true, or the number of zero pages found otherwise. |
| uint32_t ScanForZeroPagesLocked(bool reclaim) TA_REQ(lock_); |
| |
| enum class EvictionHintAction : uint8_t { |
| Follow, |
| Ignore, |
| }; |
| |
| // Asks the VMO to attempt to evict the specified page. This returns true if the page was |
| // actually from this VMO and was successfully evicted, at which point the caller now has |
| // ownership of the page. Otherwise eviction is allowed to fail for any reason; in particular, if |
| // the page is considered in use, or the VMO has no way to recreate the page, then eviction will |
| // fail. Although eviction may fail for any reason, if it does the caller is able to assume |
| // that either the page was not from this vmo, or that the page is not in any evictable page queue |
| // (such as the pager_backed_ queue). |
| // |hint_action| indicates whether the |always_need| eviction hint should be respected or ignored. |
| // If this page is not evicted as a result of the hint, the caller can assume that the page has |
| // been moved out from the evictable page queue(s) into the active queue(s). |
| bool RemovePageForEviction(vm_page_t* page, uint64_t offset, EvictionHintAction hint_action); |
| |
| // Swap an old page for a new page. The old page must be at offset. The new page must be in |
| // ALLOC state. On return, the old_page is owned by the caller. Typically the caller will |
| // remove the old_page from pmm_page_queues() and free the old_page. |
| void SwapPageLocked(uint64_t offset, vm_page_t* old_page, vm_page_t* new_page) TA_REQ(lock_); |
| |
| // If page is still at offset, replace it with a different page. If with_loaned is true, replace |
| // with a loaned page. If with_loaned is false, replace with a non-loaned page. |
| zx_status_t ReplacePage(vm_page_t* before_page, uint64_t offset, bool with_loaned) TA_EXCL(lock_); |
| zx_status_t ReplacePageLocked(vm_page_t* before_page, uint64_t offset, bool with_loaned, |
| vm_page_t** after_page) TA_REQ(lock_); |
| |
| // Attempts to dedup the given page at the specified offset with the zero page. The only |
| // correctness requirement for this is that `page` must be *some* valid vm_page_t, meaning that |
| // all race conditions are handled internally. This function returns false if |
| // * page is either not from this VMO, or not found at the specified offset |
| // * page is pinned |
| // * vmo is uncached |
| // * page is not all zeroes |
| // Otherwise 'true' is returned and the page will have been returned to the pmm with a zero page |
| // marker put in its place. |
| bool DedupZeroPage(vm_page_t* page, uint64_t offset); |
| |
| void DumpLocked(uint depth, bool verbose) const TA_REQ(lock_); |
| |
| // VMO_VALIDATION |
| bool DebugValidatePageSplitsLocked() const TA_REQ(lock_); |
| bool DebugValidateBacklinksLocked() const TA_REQ(lock_); |
| // Calls DebugValidatePageSplitsLocked on this and every parent in the chain, returning true if |
| // all return true. Also calls DebugValidateBacklinksLocked() on every node in the hierarchy. |
| bool DebugValidatePageSplitsHierarchyLocked() const TA_REQ(lock_); |
| |
| // VMO_FRUGAL_VALIDATION |
| bool DebugValidateVmoPageBorrowingLocked() const TA_REQ(lock_); |
| |
| // Different operations that RangeChangeUpdate* can perform against any VmMappings that are found. |
| enum class RangeChangeOp { |
| Unmap, |
| RemoveWrite, |
| }; |
| // Apply the specified operation to all mappings in the given range. This is applied to all |
| // descendants within the range. |
| void RangeChangeUpdateLocked(uint64_t offset, uint64_t len, RangeChangeOp op) TA_REQ(lock_); |
| |
| // Promote pages in the specified range for reclamation under memory pressure. |offset| will be |
| // rounded down to the page boundary, and |len| will be rounded up to the page boundary. |
| // Currently used only for pager-backed VMOs to move their pages to the end of the |
| // pager-backed queue, so that they can be evicted first. |
| void PromoteRangeForReclamationLocked(uint64_t offset, uint64_t len) TA_REQ(lock_); |
| |
| // Protect pages in the specified range from reclamation under memory pressure. |offset| will be |
| // rounded down to the page boundary, and |len| will be rounded up to the page boundary. Used to |
| // set the |always_need| hint for pages in pager-backed VMOs. Any absent pages in the range will |
| // be committed first, and the call will block on the fulfillment of the page request(s), dropping |
| // |guard| while waiting (multiple times if multiple pages need to be supplied). |
| void ProtectRangeFromReclamationLocked(uint64_t offset, uint64_t len, Guard<Mutex>* guard) |
| TA_REQ(lock_); |
| |
| void MarkAsLatencySensitiveLocked() TA_REQ(lock_); |
| |
| zx_status_t LockRangeLocked(uint64_t offset, uint64_t len, zx_vmo_lock_state_t* lock_state_out); |
| zx_status_t TryLockRangeLocked(uint64_t offset, uint64_t len); |
| zx_status_t UnlockRangeLocked(uint64_t offset, uint64_t len); |
| |
| // Exposed for testing. |
| uint64_t DebugGetLockCount() const { |
| Guard<Mutex> guard{&lock_}; |
| return lock_count_; |
| } |
| uint64_t DebugGetPageCountLocked() const TA_REQ(lock_); |
| bool DebugIsReclaimable() const; |
| bool DebugIsUnreclaimable() const; |
| bool DebugIsDiscarded() const; |
| bool DebugIsPage(uint64_t offset) const; |
| bool DebugIsMarker(uint64_t offset) const; |
| bool DebugIsEmpty(uint64_t offset) const; |
| vm_page_t* DebugGetPage(uint64_t offset) const TA_EXCL(lock_); |
| vm_page_t* DebugGetPageLocked(uint64_t offset) const TA_REQ(lock_); |
| uint64_t DebugGetSupplyZeroOffset() const TA_EXCL(lock_); |
| |
| // Discard all the pages from a discardable vmo in the |kReclaimable| state. For this call to |
| // succeed, the vmo should have been in the reclaimable state for at least |
| // |min_duration_since_reclaimable|. If successful, the |discardable_state_| is set to |
| // |kDiscarded|, and the vmo is removed from the reclaim candidates list. The pages are removed / |
| // discarded from the vmo and appended to the |freed_list| passed in; the caller takes ownership |
| // of the removed pages and is responsible for freeing them. Returns the number of pages |
| // discarded. |
| uint64_t DiscardPages(zx_duration_t min_duration_since_reclaimable, list_node_t* freed_list) |
| TA_EXCL(DiscardableVmosLock::Get()) TA_EXCL(lock_); |
| |
| struct DiscardablePageCounts { |
| uint64_t locked; |
| uint64_t unlocked; |
| }; |
| |
| // Returns the total number of pages locked and unlocked across all discardable vmos. |
| // Note that this might not be exact and we might miss some vmos, because the |
| // |DiscardableVmosLock| is dropped after processing each vmo on the global discardable lists. |
| // That is fine since these numbers are only used for accounting. |
| static DiscardablePageCounts DebugDiscardablePageCounts() TA_EXCL(DiscardableVmosLock::Get()); |
| |
| // Walks through the LRU reclaimable list of discardable vmos and discards pages from each, until |
| // |target_pages| have been discarded, or the list of candidates is exhausted. Only vmos that have |
| // become reclaimable more than |min_duration_since_reclaimable| in the past will be discarded; |
| // this prevents discarding reclaimable vmos that were recently accessed. The discarded pages are |
| // appended to the |freed_list| passed in; the caller takes ownership of the discarded pages and |
| // is responsible for freeing them. Returns the total number of pages discarded. |
| static uint64_t ReclaimPagesFromDiscardableVmos(uint64_t target_pages, |
| zx_duration_t min_duration_since_reclaimable, |
| list_node_t* freed_list) |
| TA_EXCL(DiscardableVmosLock::Get()); |
| |
| // Walks up the parent tree and returns the root, or |this| if there is no parent. |
| const VmCowPages* GetRootLocked() const TA_REQ(lock_); |
| |
| // Only for use by loaned page reclaim. |
| VmCowPagesContainer* raw_container(); |
| |
| private: |
| // private constructor (use Create...()) |
| VmCowPages(ktl::unique_ptr<VmCowPagesContainer> cow_container, |
| fbl::RefPtr<VmHierarchyState> root_lock, VmCowPagesOptions options, |
| uint32_t pmm_alloc_flags, uint64_t size, fbl::RefPtr<PageSource> page_source); |
| friend class VmCowPagesContainer; |
| |
| ~VmCowPages() override; |
| |
| // This takes all the constructor parameters including the VmCowPagesContainer, which avoids any |
| // possibility of allocation failure. |
| template <class... Args> |
| static fbl::RefPtr<VmCowPages> NewVmCowPages(ktl::unique_ptr<VmCowPagesContainer> cow_container, |
| Args&&... args); |
| |
| // This takes all the constructor parameters except for the VmCowPagesContainer which is |
| // allocated. The AllocChecker will reflect whether allocation was successful. |
| template <class... Args> |
| static fbl::RefPtr<VmCowPages> NewVmCowPages(fbl::AllocChecker* ac, Args&&... args); |
| |
| // fbl_recycle() does all the explicit cleanup, and the destructor does all the implicit cleanup. |
| void fbl_recycle() override; |
| friend class fbl::Recyclable<VmCowPages>; |
| |
| DISALLOW_COPY_ASSIGN_AND_MOVE(VmCowPages); |
| |
| bool is_hidden_locked() const TA_REQ(lock_) { return !!(options_ & VmCowPagesOptions::kHidden); } |
| bool is_slice_locked() const TA_REQ(lock_) { return !!(options_ & VmCowPagesOptions::kSlice); } |
| bool can_decommit_zero_pages_locked() const TA_REQ(lock_) { |
| bool result = !(options_ & VmCowPagesOptions::kCannotDecommitZeroPages); |
| DEBUG_ASSERT(result == !debug_is_contiguous()); |
| return result; |
| } |
| |
| // can_borrow_locked() returns true if the VmCowPages is capable of borrowing pages, but whether |
| // the VmCowPages should actually borrow pages also depends on a borrowing-site-specific flag that |
| // the caller is responsible for checking (in addition to checking can_borrow_locked()). Only if |
| // both are true should the caller actually borrow at the caller's specific potential borrowing |
| // site. For example, see is_borrowing_in_supplypages_enabled() and |
| // is_borrowing_on_mru_enabled(). |
| bool can_borrow_locked() const TA_REQ(lock_) { |
| // TODO(dustingreen): Or rashaeqbal@. We can only borrow while the page is not dirty. |
| // Currently we enforce this by checking ShouldTrapDirtyTransitions() below and leaning on the |
| // fact that dirtying isn't implemented yet when !ShouldTrapDirtyTransitions(). We currently evict |
| // to reclaim instead of replacing the page, and we can't evict a dirty page since the contents |
| // would be lost. Option 1: When a loaned page is about to become dirty, we could replace it |
| // with a non-loaned page. Option 2: When reclaiming a loaned page we could replace instead of |
| // evicting (this may be simpler). |
| |
| // Currently we can only borrow if we have a suitable PageSource, since this suitable page |
| // source is currently 1:1 with having the needed backlinks for reclaim. |
| bool source_is_suitable = page_source_ && page_source_->properties().is_preserving_page_content; |
| // This ensures that if borrowing is globally disabled (no borrowing sites enabled), we'll |
| // return false. We could delete this bool without damaging correctness, but we want to |
| // mitigate a call site that maybe fails to check its call-site-specific settings such as |
| // is_borrowing_in_supplypages_enabled(). |
| // |
| // We also don't technically need to check is_any_borrowing_enabled() here since pmm will check |
| // also, but by checking here, we minimize the amount of code that will run when |
| // !is_any_borrowing_enabled() (in case we have it disabled due to late discovery of a problem |
| // with borrowing). |
| bool borrowing_is_generally_acceptable = |
| pmm_physical_page_borrowing_config()->is_any_borrowing_enabled(); |
| // Exclude is_latency_sensitive_ to avoid adding latency due to reclaim. |
| // |
| // Currently we evict instead of replacing a page when reclaiming, so we want to avoid evicting |
| // pages that are latency sensitive or are fairly likely to be pinned at some point. |
| // |
| // We also don't want to borrow a page that might get pinned again since we want to mitigate the |
| // possibility of an invalid DMA-after-free. |
| bool excluded_from_borrowing_for_latency_reasons = is_latency_sensitive_ || ever_pinned_; |
| // Avoid borrowing and trapping dirty transitions overlapping for now; nothing really stops |
| // these from being compatible AFAICT - we're just avoiding overlap of these two things until |
| // later. |
| bool overlapping_with_other_features = page_source_ && page_source_->ShouldTrapDirtyTransitions(); |
| |
| bool result = source_is_suitable && borrowing_is_generally_acceptable && |
| !excluded_from_borrowing_for_latency_reasons && !overlapping_with_other_features; |
| |
| DEBUG_ASSERT(result == (debug_is_user_pager_backed_locked() && |
| pmm_physical_page_borrowing_config()->is_any_borrowing_enabled() && |
| !is_latency_sensitive_ && !ever_pinned_ && |
| !page_source_->ShouldTrapDirtyTransitions())); |
| |
| return result; |
| } |
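| |
| // Illustrative borrowing-site check (sketch only; assumes is_borrowing_in_supplypages_enabled() |
| // is exposed by pmm_physical_page_borrowing_config(), per the comment above): |
| // |
| //   if (can_borrow_locked() && |
| //       pmm_physical_page_borrowing_config()->is_borrowing_in_supplypages_enabled()) { |
| //     // This SupplyPages call site may actually borrow pages. |
| //   } |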
| |
| bool direct_source_supplies_zero_pages_locked() const TA_REQ(lock_) { |
| bool result = page_source_ && !page_source_->properties().is_preserving_page_content; |
| DEBUG_ASSERT(result == debug_is_contiguous()); |
| return result; |
| } |
| |
| bool can_decommit_locked() const TA_REQ(lock_) { |
| bool result = !page_source_ || !page_source_->properties().is_preserving_page_content; |
| DEBUG_ASSERT(result == !debug_is_user_pager_backed_locked()); |
| return result; |
| } |
| |
| // Add a page to the object at |offset|. |
| // |
| // |overwrite| controls how the function handles pre-existing content at |offset|. If |overwrite| |
| // does not permit replacing the content, ZX_ERR_ALREADY_EXISTS will be returned. If a page is |
| // released from the page list as a result of overwriting, it is returned through |released_page| |
| // and the caller takes ownership of this page. If the |overwrite| action is such that a page |
| // cannot be released, it is valid for the caller to pass in nullptr for |released_page|. |
| // |
| // This operation unmaps the corresponding offset from any existing mappings, unless |
| // |do_range_update| is false, in which case it will skip updating mappings. |
| // |
| // On success the page to add is moved out of `*p`, otherwise it is left there. |
| zx_status_t AddPageLocked(VmPageOrMarker* p, uint64_t offset, CanOverwriteContent overwrite, |
| ktl::optional<vm_page_t*>* released_page, bool do_range_update = true) |
| TA_REQ(lock_); |
| |
| // Unmaps and removes all the committed pages in the specified range. |
| // Called from DecommitRangeLocked() to perform the actual decommit action after some of the |
| // initial sanity checks have succeeded. Also called from DetachSourceLocked() when a VMO is |
| // detached from the page source, and from DiscardPages() to reclaim pages from a discardable VMO. |
| // Upon success the removed pages are placed in |freed_list|. The caller has ownership of these |
| // pages and is responsible for freeing them. |
| // |
| // Unlike DecommitRangeLocked(), this function only operates on |this| node, which must have no |
| // parent. |
| // |offset| must be page aligned. |len| must be less than or equal to |size_ - offset|. If |len| |
| // is less than |size_ - offset| it must be page aligned. |
| // Optionally returns the number of pages removed if |pages_freed_out| is not null. |
| zx_status_t UnmapAndRemovePagesLocked(uint64_t offset, uint64_t len, list_node_t* freed_list, |
| uint64_t* pages_freed_out = nullptr) TA_REQ(lock_); |
| |
| // internal check if any pages in a range are pinned |
| bool AnyPagesPinnedLocked(uint64_t offset, size_t len) TA_REQ(lock_); |
| |
| // Helper function for ::AttributedPagesInRangeLocked. Counts the number of pages in ancestor |
| // vmos that should be attributed to this vmo for the specified range. It is an error to pass in a |
| // range that does not need attributing (i.e. offset must be < parent_limit_), although |len| is |
| // permitted to be sized such that the range exceeds parent_limit_. |
| // The return value is the length of the processed region, which will be <= |size| and is |
| // guaranteed to be > 0. The |count| is the number of pages in this region that should be |
| // attributed to this vmo, versus some other vmo. |
| uint64_t CountAttributedAncestorPagesLocked(uint64_t offset, uint64_t size, uint64_t* count) const |
| TA_REQ(lock_); |
| |
| // Searches for the initial content for |this| at |offset|. The result could be used to |
| // initialize a commit, or compare an existing commit with the original. The initial content |
| // is a reference to a VmPageOrMarker as there could be an explicit vm_page of content, an |
| // explicit zero page of content via a marker, or no initial content. Determining the meaning of |
| // no initial content (i.e. whether it is zero or something else) is left up to the caller. |
| // |
| // If an ancestor has a committed page which corresponds to |offset|, returns that page |
| // as well as the VmCowPages and offset which own the page. If no ancestor has a committed |
| // page for the offset, returns null as well as the VmCowPages/offset which need to be queried |
| // to populate the page. |
| // |
| // If the passed |owner_length| is not null, then the visible range of the owner is calculated and |
| // stored back into |owner_length| on the walk up. The |owner_length| represents the size of the |
| // range in the owner for which no other VMO in the chain had forked a page. |
| VmPageOrMarker* FindInitialPageContentLocked(uint64_t offset, VmCowPages** owner_out, |
| uint64_t* owner_offset_out, uint64_t* owner_length) |
| TA_REQ(lock_); |
| |
| // LookupPagesLocked helper function that 'forks' the page at |offset| of the current vmo. If |
| // this function successfully inserts a page into |offset| of the current vmo, it returns ZX_OK |
| // and populates |out_page|. If a |page_request| is provided and ZX_ERR_SHOULD_WAIT is returned |
| // then this indicates a transient failure that should be resolved by waiting on the page_request. |
| // |
| // The source page that is being forked has already been calculated - it is |page|, which |
| // is currently in |page_owner| at offset |owner_offset|. |
| // |
| // This function is responsible for ensuring that COW clones never result in worse memory |
| // consumption than simply creating a new vmo and memcpying the content. It does this by |
| // migrating a page from a hidden vmo into one child if that page is not 'accessible' to the |
| // other child (instead of allocating a new page into the child and making the hidden vmo's |
| // page inaccessible). |
| // |
| // Whether a particular page in a hidden vmo is 'accessible' to a particular child is |
| // determined by a combination of two factors. First, if the page lies outside of the range |
| // in the hidden vmo the child can see (specified by parent_offset_ and parent_limit_), then |
| // the page is not accessible. Second, if the page has already been copied into the child, |
| // then the page in the hidden vmo is not accessible to that child. This is tracked by the |
| // cow_X_split bits in the vm_page_t structure. |
| // |
| // To handle memory allocation failure, this function performs the fork operation from the |
| // root vmo towards the leaf vmo. This allows the COW invariants to always be preserved. |
| // |
| // |page| must not be the zero-page, as there is no need to do the complex page |
| // fork logic to reduce memory consumption in that case. |
| zx_status_t CloneCowPageLocked(uint64_t offset, list_node_t* alloc_list, VmCowPages* page_owner, |
| vm_page_t* page, uint64_t owner_offset, |
| LazyPageRequest* page_request, vm_page_t** out_page) TA_REQ(lock_); |
| |
| // This is an optimized wrapper around CloneCowPageLocked for when an initial content page needs |
| // to be forked to preserve the COW invariant, but you know you are immediately going to overwrite |
| // the forked page with zeros. |
| // |
| // The optimization it can make is that it can fork the page up to the parent and then, instead |
| // of forking here and then having to immediately free the page, it can insert a marker here and |
| // set the split bits in the parent page as if it had been forked. |
| zx_status_t CloneCowPageAsZeroLocked(uint64_t offset, list_node_t* freed_list, |
| VmCowPages* page_owner, vm_page_t* page, |
| uint64_t owner_offset) TA_REQ(lock_); |
| |
| // Returns true if |page| (located at |offset| in this vmo) is only accessible by one |
| // child, where 'accessible' is defined by ::CloneCowPageLocked. |
| bool IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const TA_REQ(lock_); |
| |
| // Releases this vmo's reference to any ancestor vmo's COW pages, for the range [start, end) |
| // in this vmo. This is done by either setting the pages' split bits (if something else |
| // can access the pages) or by freeing the pages using the |page_remover|. |
| // |
| // This function recursively invokes itself for regions of the parent vmo which are |
| // not accessible by the sibling vmo. |
| void ReleaseCowParentPagesLocked(uint64_t start, uint64_t end, BatchPQRemove* page_remover) |
| TA_REQ(lock_); |
| |
| // Helper function for ReleaseCowParentPagesLocked that processes pages which are visible |
| // to at least this VMO, and possibly its sibling, as well as updates parent_(offset_)limit_. |
| void ReleaseCowParentPagesLockedHelper(uint64_t start, uint64_t end, bool sibling_visible, |
| BatchPQRemove* page_remover) TA_REQ(lock_); |
| |
| // Updates the parent limits of all children so that they will never be able to |
| // see above |new_size| in this vmo, even if the vmo is enlarged in the future. |
| void UpdateChildParentLimitsLocked(uint64_t new_size) TA_REQ(lock_); |
| |
| // When cleaning up a hidden vmo, merges the hidden vmo's content (e.g. page list, view |
| // of the parent) into the remaining child. |
| void MergeContentWithChildLocked(VmCowPages* removed, bool removed_left) TA_REQ(lock_); |
| |
| // Only valid to be called when is_slice_locked() is true and returns the first parent of this |
| // hierarchy that is not a slice. The offset of this slice within that ancestor is set as |
| // the output. |
| VmCowPages* PagedParentOfSliceLocked(uint64_t* offset) TA_REQ(lock_); |
| |
| // Unpins a page and potentially moves it into a different page queue should its pin |
| // count reach zero. |
| void UnpinPageLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_); |
| |
| // Moves an existing page to the wired queue, retaining backlink information if applicable. |
| void MoveToWiredLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_); |
| |
| // Updates the page queue of an existing page, moving it to whichever non wired queue |
| // is appropriate. |
| void MoveToNotWiredLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_); |
| |
| // Places a newly added page into the appropriate non wired page queue. |
| void SetNotWiredLocked(vm_page_t* page, uint64_t offset) TA_REQ(lock_); |
| |
| // Updates any meta data for accessing a page. Currently this moves pager backed pages around in |
| // the page queue to track which ones were recently accessed for the purposes of eviction. In |
| // terms of functional correctness this never has to be called. |
| void UpdateOnAccessLocked(vm_page_t* page, uint pf_flags) TA_REQ(lock_); |
| |
| // Updates the page's dirty state to the one specified, and also moves the page between page |
| // queues if required by the dirty state. |dirty_state| should be a valid dirty tracking state, |
| // i.e. one of Clean, AwaitingClean, or Dirty. |
| // |
| // |offset| is the page-aligned offset of the page in this object. |
| // |
| // |is_pending_add| indicates whether this page is yet to be added to this object's page list, |
| // false by default. If the page is yet to be added, this function will skip updating the page |
| // queue as an optimization, since the page queue will be updated later when the page gets added |
| // to the page list. |is_pending_add| also helps determine certain validation checks that can be |
| // performed on the page. |
| void UpdateDirtyStateLocked(vm_page_t* page, uint64_t offset, DirtyState dirty_state, |
| bool is_pending_add = false) TA_REQ(lock_); |
| |
| // Prepares the specified range for a write, forwarding a DIRTY page request to the page source if |
| // pages are clean and need to transition to dirty, in which case ZX_ERR_SHOULD_WAIT will be |
| // returned and the caller should wait on |page_request|. If no page requests need to be |
| // generated, i.e. the pages are already dirty, or if they do not require the dirty transition to |
| // be trapped, ZX_OK is returned. |
| // |
| // |offset| and |len| should be page-aligned. |
| // |
| // |dirty_len_out| will return the (page-aligned) length starting at |offset| that contains dirty |
| // pages, either already dirty before making the call or dirtied during the call. In other words, |
| // the range [offset, offset + dirty_len_out) will be dirty when this call returns, where |
| // |dirty_len_out| <= |len|. |
| zx_status_t PrepareForWriteLocked(LazyPageRequest* page_request, uint64_t offset, uint64_t len, |
| uint64_t* dirty_len_out) TA_REQ(lock_); |
| |
| // If supply_zero_offset_ falls within the specified range [start_offset, end_offset), try to |
| // advance supply_zero_offset_ over any pages in the range that might have been committed |
| // immediately following supply_zero_offset_. |start_offset| and |end_offset| should be |
| // page-aligned. |
| void TryAdvanceSupplyZeroOffsetLocked(uint64_t start_offset, uint64_t end_offset) TA_REQ(lock_); |
| |
| // Initializes and adds as a child the given VmCowPages as a full clone of this one such that the |
| // VmObjectPaged backlink can be moved from this to the child, keeping all page offsets and sizes |
| // valid and other requirements (see VmObjectPaged::SetCowPagesReferenceLocked) satisfied. This |
| // also moves our paged_ref_ into the child and updates the VmObjectPaged backlinks. |
| void CloneParentIntoChildLocked(fbl::RefPtr<VmCowPages>& child) TA_REQ(lock_); |
| |
| // Removes the specified child from this object's |children_list_| and performs any hierarchy |
| // updates that need to happen as a result. This does not modify the |parent_| member of the |
| // removed child, and if this is not being called due to |removed| being destructed it is the |
| // caller's responsibility to correct parent_. |
| void RemoveChildLocked(VmCowPages* removed) TA_REQ(lock_); |
| |
| // Inserts a newly created VmCowPages into this hierarchy as a child of this VmCowPages. |
| // Initializes child members based on the passed in values that only have meaning when an object |
| // is a child. This updates the parent_ field in child to hold a refptr to |this|. |
| void AddChildLocked(VmCowPages* child, uint64_t offset, uint64_t root_parent_offset, |
| uint64_t parent_limit) TA_REQ(lock_); |
| |
| // Outside of initialization/destruction, hidden vmos always have two children. For |
| // clarity, whichever child is first in the list is the 'left' child, and whichever |
| // child is second is the 'right' child. Children of a paged vmo will always be paged |
| // vmos themselves. |
| VmCowPages& left_child_locked() TA_REQ(lock_) TA_ASSERT(left_child_locked().lock()) { |
| DEBUG_ASSERT(is_hidden_locked()); |
| DEBUG_ASSERT(children_list_len_ == 2); |
| |
| auto& ret = children_list_.front(); |
| AssertHeld(ret.lock_); |
| return ret; |
| } |
| VmCowPages& right_child_locked() TA_REQ(lock_) TA_ASSERT(right_child_locked().lock()) { |
| DEBUG_ASSERT(is_hidden_locked()); |
| DEBUG_ASSERT(children_list_len_ == 2); |
| auto& ret = children_list_.back(); |
| AssertHeld(ret.lock_); |
| return ret; |
| } |
| const VmCowPages& left_child_locked() const TA_REQ(lock_) TA_ASSERT(left_child_locked().lock()) { |
| DEBUG_ASSERT(is_hidden_locked()); |
| DEBUG_ASSERT(children_list_len_ == 2); |
| const auto& ret = children_list_.front(); |
| AssertHeld(ret.lock_); |
| return ret; |
| } |
| const VmCowPages& right_child_locked() const TA_REQ(lock_) |
| TA_ASSERT(right_child_locked().lock()) { |
| DEBUG_ASSERT(is_hidden_locked()); |
| DEBUG_ASSERT(children_list_len_ == 2); |
| const auto& ret = children_list_.back(); |
| AssertHeld(ret.lock_); |
| return ret; |
| } |
| |
| void ReplaceChildLocked(VmCowPages* old, VmCowPages* new_child) TA_REQ(lock_); |
| |
| void DropChildLocked(VmCowPages* c) TA_REQ(lock_); |
| |
| // Types for an additional linked list over the VmCowPages for use when doing a |
| // RangeChangeUpdate. |
| // |
| // To avoid unbounded stack growth we need to reserve the memory to exist on a |
| // RangeChange list in our object so that we can have a flat iteration over a |
| // work list. RangeChangeLists should only be used by the RangeChangeUpdate |
| // code. |
| using RangeChangeNodeState = fbl::SinglyLinkedListNodeState<VmCowPages*>; |
| struct RangeChangeTraits { |
| static RangeChangeNodeState& node_state(VmCowPages& cow) { return cow.range_change_state_; } |
| }; |
| using RangeChangeList = |
| fbl::SinglyLinkedListCustomTraits<VmCowPages*, VmCowPages::RangeChangeTraits>; |
| friend struct RangeChangeTraits; |
| |
| // Given an initial list of VmCowPages performs RangeChangeUpdate on it until the list is empty. |
| static void RangeChangeUpdateListLocked(RangeChangeList* list, RangeChangeOp op); |
| |
| void RangeChangeUpdateFromParentLocked(uint64_t offset, uint64_t len, RangeChangeList* list) |
| TA_REQ(lock_); |
| |
| // Helper to check whether the requested range for LockRangeLocked() / TryLockRangeLocked() / |
| // UnlockRangeLocked() is valid. |
| bool IsLockRangeValidLocked(uint64_t offset, uint64_t len) const TA_REQ(lock_); |
| |
| // Lock that protects the global discardable lists. |
| // This lock can be acquired with the vmo's |lock_| held. To prevent deadlocks, if both locks are |
| // required the order of locking should always be 1) vmo's lock, and then 2) DiscardableVmosLock. |
| DECLARE_SINGLETON_MUTEX(DiscardableVmosLock); |
| |
| enum class DiscardableState : uint8_t { |
| kUnset = 0, |
| kReclaimable, |
| kUnreclaimable, |
| kDiscarded, |
| }; |
| |
| using DiscardableList = fbl::TaggedDoublyLinkedList<VmCowPages*, internal::DiscardableListTag>; |
| |
| // Two global lists of discardable vmos: |
| // - |discardable_reclaim_candidates_| tracks discardable vmos that are eligible for reclamation |
| // and haven't been reclaimed yet. |
| // - |discardable_non_reclaim_candidates_| tracks all other discardable VMOs. |
| // The lists are protected by the |DiscardableVmosLock|, and updated based on a discardable vmo's |
| // state changes (lock, unlock, or discard). |
| static DiscardableList discardable_reclaim_candidates_ TA_GUARDED(DiscardableVmosLock::Get()); |
| static DiscardableList discardable_non_reclaim_candidates_ TA_GUARDED(DiscardableVmosLock::Get()); |
| |
| // Helper function to move an object from the |discardable_non_reclaim_candidates_| list to the |
| // |discardable_reclaim_candidates_| list. |
| void MoveToReclaimCandidatesListLocked() TA_REQ(lock_) TA_REQ(DiscardableVmosLock::Get()); |
| |
| // Helper function to move an object from the |discardable_reclaim_candidates_| list to the |
| // |discardable_non_reclaim_candidates_| list. If |new_candidate| is true, that indicates that the |
| // object was not yet being tracked on any list, and should only be inserted into the |
| // |discardable_non_reclaim_candidates_| list without a corresponding list removal. |
| void MoveToNonReclaimCandidatesListLocked(bool new_candidate = false) TA_REQ(lock_) |
| TA_REQ(DiscardableVmosLock::Get()); |
| |
| // Updates the |discardable_state_| of a discardable vmo, and moves it from one discardable list |
| // to another. |
| void UpdateDiscardableStateLocked(DiscardableState state) TA_REQ(lock_) |
| TA_EXCL(DiscardableVmosLock::Get()); |
| |
| // Remove a discardable object from whichever global discardable list it is in. Called from the |
| // VmCowPages destructor. |
| void RemoveFromDiscardableListLocked() TA_REQ(lock_) TA_EXCL(DiscardableVmosLock::Get()); |
| |
| // Returns whether the vmo is in either one of the |discardable_reclaim_candidates_| or |
| // |discardable_non_reclaim_candidates_| lists, depending on whether it is a |reclaim_candidate| |
| // or not. |
| bool DebugIsInDiscardableListLocked(bool reclaim_candidate) const TA_REQ(lock_) |
| TA_EXCL(DiscardableVmosLock::Get()); |
| |
| DiscardablePageCounts GetDiscardablePageCounts() const TA_EXCL(lock_); |
| |
| // Returns the root parent's page source. |
| fbl::RefPtr<PageSource> GetRootPageSourceLocked() const TA_REQ(lock_); |
| |
| void FreePages(list_node* pages) { |
| if (!page_source_ || !page_source_->properties().is_handling_free) { |
| pmm_free(pages); |
| return; |
| } |
| page_source_->FreePages(pages); |
| } |
| |
| void FreePage(vm_page_t* page) { |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| if (!page_source_ || !page_source_->properties().is_handling_free) { |
| pmm_free_page(page); |
| return; |
| } |
| list_node_t list; |
| list_initialize(&list); |
| list_add_tail(&list, &page->queue_node); |
| page_source_->FreePages(&list); |
| } |
| |
| void CopyPageForReplacementLocked(vm_page_t* dst_page, vm_page_t* src_page) TA_REQ(lock_); |
| |
| // Update supply_zero_offset_ to the specified page-aligned |offset|, and potentially also reset |
| // awaiting_clean_zero_range_end_ if required. (See comments near declaration of |
| // awaiting_clean_zero_range_end_ for additional context.) |
| void UpdateSupplyZeroOffsetLocked(uint64_t offset) TA_REQ(lock_) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| uint64_t prev_supply_zero_offset = supply_zero_offset_; |
| supply_zero_offset_ = offset; |
| |
| // If there was no zero range AwaitingClean, there is nothing more to do. |
| if (awaiting_clean_zero_range_end_ == 0) { |
| return; |
| } |
| DEBUG_ASSERT(prev_supply_zero_offset < awaiting_clean_zero_range_end_); |
| |
| // The AwaitingClean zero range we were tracking was [prev_supply_zero_offset, |
| // awaiting_clean_zero_range_end_). If |offset| lies within this range, we still have a valid |
| // AwaitingClean sub-range that we can continue tracking i.e. [offset, |
| // awaiting_clean_zero_range_end_). Otherwise, the AwaitingClean zero range is no longer valid |
| // and must be reset. |
| if (!(offset >= prev_supply_zero_offset && offset < awaiting_clean_zero_range_end_)) { |
| awaiting_clean_zero_range_end_ = 0; |
| } |
| |
| // If awaiting_clean_zero_range_end_ is non-zero, it should be strictly greater than |
| // supply_zero_offset_, as it is used to track the range [supply_zero_offset_, |
| // awaiting_clean_zero_range_end_). |
| DEBUG_ASSERT(awaiting_clean_zero_range_end_ == 0 || |
| supply_zero_offset_ < awaiting_clean_zero_range_end_); |
| } |
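| |
| // Worked example (illustrative): if the tracked AwaitingClean zero range was |
| // [0x3000, 0x8000), then UpdateSupplyZeroOffsetLocked(0x5000) keeps tracking the sub-range |
| // [0x5000, 0x8000), while UpdateSupplyZeroOffsetLocked(0x9000) or |
| // UpdateSupplyZeroOffsetLocked(0x2000) resets awaiting_clean_zero_range_end_ to 0. |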
| |
| // Consider trimming the AwaitingClean zero range (if there is one) to end at the specified |
| // page-aligned |end_offset|. The AwaitingClean zero range always starts at supply_zero_offset_. |
| // (See comments near declaration of awaiting_clean_zero_range_end_ for additional context.) |
| // |
| // Three scenarios are possible here: |
| // - If awaiting_clean_zero_range_end_ is 0, no AwaitingClean zero range is being tracked, so |
| // nothing needs to be done. |
| // - If |end_offset| lies within [supply_zero_offset_, awaiting_clean_zero_range_end_), the zero |
| // range should now end at |end_offset|. The new AwaitingClean zero range becomes |
| // [supply_zero_offset_, end_offset). |
| // - If |end_offset| lies outside of [supply_zero_offset_, awaiting_clean_zero_range_end_), it |
| // does not affect the AwaitingClean zero range. |
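| // For example (illustrative offsets): with a tracked range of [0x1000, 0x4000), an |end_offset| |
| // of 0x2000 trims the range to [0x1000, 0x2000); an |end_offset| of exactly 0x1000 leaves an |
| // empty range and resets awaiting_clean_zero_range_end_ to 0; an |end_offset| of 0x5000 leaves |
| // the range unchanged. |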
| void ConsiderTrimAwaitingCleanZeroRangeLocked(uint64_t end_offset) TA_REQ(lock_) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(end_offset)); |
| |
| // No AwaitingClean zero range was being tracked. |
| if (awaiting_clean_zero_range_end_ == 0) { |
| return; |
| } |
| DEBUG_ASSERT(supply_zero_offset_ < awaiting_clean_zero_range_end_); |
| |
| // Trim the zero range to the new end offset. |
| if (end_offset >= supply_zero_offset_ && end_offset < awaiting_clean_zero_range_end_) { |
| awaiting_clean_zero_range_end_ = end_offset; |
| // Reset awaiting_clean_zero_range_end_ if this leaves us with no valid range. |
| if (awaiting_clean_zero_range_end_ == supply_zero_offset_) { |
| awaiting_clean_zero_range_end_ = 0; |
| } |
| } |
| |
| // If awaiting_clean_zero_range_end_ is non-zero, it should be strictly greater than |
| // supply_zero_offset_, as it is used to track the range [supply_zero_offset_, |
| // awaiting_clean_zero_range_end_). |
| DEBUG_ASSERT(awaiting_clean_zero_range_end_ == 0 || |
| supply_zero_offset_ < awaiting_clean_zero_range_end_); |
| } |
| |
| // magic value |
| fbl::Canary<fbl::magic("VMCP")> canary_; |
| |
| // VmCowPages keeps this ref on VmCowPagesContainer until the end of VmCowPages::fbl_recycle(). |
| // This allows loaned page reclaim to upgrade a raw container pointer until _after_ all the pages |
| // have been removed from the VmCowPages. This way there's always something for loaned page |
| // reclaim to block on that'll do priority inheritance to the thread that needs to finish moving |
| // pages. |
| fbl::RefPtr<VmCowPagesContainer> container_; |
| VmCowPagesContainer* debug_retained_raw_container_ = nullptr; |
| |
| VmCowPagesOptions options_ TA_GUARDED(lock_); |
| |
| uint64_t size_ TA_GUARDED(lock_); |
| // Offset in the *parent* where this object starts. |
| uint64_t parent_offset_ TA_GUARDED(lock_) = 0; |
| // Offset in *this object* above which accesses will no longer access the parent. |
| uint64_t parent_limit_ TA_GUARDED(lock_) = 0; |
| // Offset in *this object* below which this vmo stops referring to its parent. This field |
| // is only useful for hidden vmos, where it is used by ::ReleaseCowParentPagesLocked |
| // together with parent_limit_ to reduce how often page split bits need to be set. It is |
| // effectively a summary of the parent_offset_ values of all descendants - unlike |
| // parent_limit_, this value does not directly impact page lookup. See partial_cow_release_ flag |
| // for more details on usage of this limit. |
| uint64_t parent_start_limit_ TA_GUARDED(lock_) = 0; |
| // Offset in our root parent where this object would start if projected onto it. This value is |
| // used as an efficient summation of accumulated offsets to ensure that an offset projected all |
| // the way to the root would not overflow a 64-bit integer. Although actual page resolution |
| // would never reach the root in such a case, a child's full range projected onto its parent is |
| // used to simplify some operations and so this invariant of not overflowing accumulated offsets |
| // needs to be maintained. |
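| // For example (illustrative offsets): a child created at parent_offset_ 0x1000 of a parent |
| // whose own root_parent_offset_ is 0x2000 gets a root_parent_offset_ of 0x3000, i.e. its |
| // parent_offset_ plus the parent's root_parent_offset_. |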
| uint64_t root_parent_offset_ TA_GUARDED(lock_) = 0; |
| const uint32_t pmm_alloc_flags_; |
| |
| // Flag which is true if there was a call to ::ReleaseCowParentPagesLocked which was |
| // not able to update the parent limits. When this is not set, it is sometimes |
| // possible for ::MergeContentWithChildLocked to do significantly less work. This flag then acts |
| // as a proxy for how precise parent_limit_ and parent_start_limit_ are. It is always an |
| // absolute guarantee that descendants cannot see outside of the limits, but when this flag is |
| // true there is a possibility that there is a sub-range inside the limits that they also cannot |
| // see. |
| // Imagine two siblings that see the parent ranges [0x1000-0x2000) and [0x3000-0x4000) |
| // respectively. The parent can have a start limit of 0x1000 and a limit of 0x4000, but without |
| // additional allocations it cannot track the free region [0x2000-0x3000), and so |
| // partial_cow_release_ must be set to indicate that in the future we need to do more expensive |
| // processing to check for such free regions. |
| bool partial_cow_release_ TA_GUARDED(lock_) = false; |
| |
| // parent pointer (may be null) |
| fbl::RefPtr<VmCowPages> parent_ TA_GUARDED(lock_); |
| |
| // list of every child |
| fbl::TaggedDoublyLinkedList<VmCowPages*, internal::ChildListTag> children_list_ TA_GUARDED(lock_); |
| |
| // length of children_list_ |
| uint32_t children_list_len_ TA_GUARDED(lock_) = 0; |
| |
| // Flag used for walking back up clone tree without recursion. See ::CloneCowPageLocked. |
| enum class StackDir : bool { |
| Left, |
| Right, |
| }; |
| struct { |
| uint64_t scratch : 63; |
| StackDir dir_flag : 1; |
| } stack_ TA_GUARDED(lock_); |
| |
| // This value is used when determining against which user-visible vmo a hidden vmo's |
| // pages should be attributed. It serves as a tie-breaker for pages that are accessible by |
| // multiple user-visible vmos. See ::HasAttributedAncestorPageLocked for more details. |
| // |
| // For non-hidden vmobjects, this always equals user_id_. For hidden vmobjects, this |
| // is the page_attribution_user_id_ of one of their children (i.e. the user_id_ of one |
| // of their non-hidden descendants). |
| uint64_t page_attribution_user_id_ TA_GUARDED(lock_) = 0; |
| |
| // Counts the total number of pages pinned by ::CommitRange. If one page is pinned n times, it |
| // contributes n to this count. |
| uint64_t pinned_page_count_ TA_GUARDED(lock_) = 0; |
| |
| // The page source, if any. |
| const fbl::RefPtr<PageSource> page_source_; |
| |
| // The offset beyond which new page requests are fulfilled by supplying zero pages, rather than |
| // having the page source supply pages. Only relevant if there is a valid page_source_ and it |
| // preserves page content. |
| // |
| // Updating supply_zero_offset_ might affect the AwaitingClean zero range being tracked by |
| // [supply_zero_offset_, awaiting_clean_zero_range_end_), and so supply_zero_offset_ should not |
| // be directly assigned. Use the UpdateSupplyZeroOffsetLocked() helper instead. See comments near |
| // awaiting_clean_zero_range_end_ for more context. |
| uint64_t supply_zero_offset_ TA_GUARDED(lock_) = UINT64_MAX; |
| |
| // If supply_zero_offset_ is relevant, and there is a zero range that is AwaitingClean, i.e. a |
| // zero range starting at supply_zero_offset_, on which WritebackBegin was called but not |
| // WritebackEnd, awaiting_clean_zero_range_end_ tracks the end of that range. In other words, if |
| // there exists a zero range that is AwaitingClean, that range is [supply_zero_offset_, |
| // awaiting_clean_zero_range_end_). |
| // |
| // Will be set to 0 otherwise. So awaiting_clean_zero_range_end_ will either be 0, or will be |
| // strictly greater than supply_zero_offset_. |
| // |
| // Note that there can be at most one zero range that is AwaitingClean at a time. |
| // |
| // The motivation for this value is to be able to transition the zero range starting at |
| // supply_zero_offset_ to Clean once it has been written back by the user pager, without having to |
| // track per-page dirty state for this zero range, which is represented in the page list by a gap. |
| // TODO(rashaeqbal): Consider removing this once page lists can support custom zero ranges. |
| uint64_t awaiting_clean_zero_range_end_ TA_GUARDED(lock_) = 0; |
| |
| // Count eviction events so that we can report them to the user. |
| uint64_t eviction_event_count_ TA_GUARDED(lock_) = 0; |
| |
| // Count of outstanding lock operations. A non-zero count prevents the kernel from discarding / |
| // evicting pages from the VMO to relieve memory pressure (currently only applicable if |
| // |kDiscardable| is set). Note that this does not prevent removal of pages by other means, like |
| // decommitting or resizing, since those are explicit actions driven by the user, not by the |
| // kernel directly. |
| uint64_t lock_count_ TA_GUARDED(lock_) = 0; |
| |
| // Timestamp of the last unlock operation that changed a discardable vmo's state to |
| // |kReclaimable|. Used to determine whether the vmo was accessed too recently to be discarded. |
| zx_time_t last_unlock_timestamp_ TA_GUARDED(lock_) = ZX_TIME_INFINITE; |
| |
| // The current state of a discardable vmo, depending on the lock count and whether it has been |
| // discarded. |
| // State transitions work as follows: |
| // 1. kUnreclaimable -> kReclaimable: When the lock count changes from 1 to 0. |
| // 2. kReclaimable -> kUnreclaimable: When the lock count changes from 0 to 1. The vmo remains |
| // kUnreclaimable for any non-zero lock count. |
| // 3. kReclaimable -> kDiscarded: When a vmo with lock count 0 is discarded. |
| // 4. kDiscarded -> kUnreclaimable: When a discarded vmo is locked again. |
| // |
| // We start off with state kUnset, so a discardable vmo must be locked at least once to opt into |
| // the above state transitions. For non-discardable vmos, the state will always remain kUnset. |
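| // For example, a typical lifetime implied by the transitions above is: first Lock (kUnset -> |
| // kUnreclaimable), Unlock (-> kReclaimable), discard (-> kDiscarded), Lock (-> kUnreclaimable). |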
| DiscardableState discardable_state_ TA_GUARDED(lock_) = DiscardableState::kUnset; |
| |
| // a tree of pages |
| VmPageList page_list_ TA_GUARDED(lock_); |
| |
| RangeChangeNodeState range_change_state_; |
| uint64_t range_change_offset_ TA_GUARDED(lock_); |
| uint64_t range_change_len_ TA_GUARDED(lock_); |
| |
| // optional reference back to a VmObjectPaged so that we can perform mapping updates. This is a |
| // raw pointer to avoid circular references; the VmObjectPaged destructor needs to update it. |
| VmObjectPaged* paged_ref_ TA_GUARDED(lock_) = nullptr; |
| |
| // TODO(fxb/85056): This is a temporary solution and needs to be replaced with something that is |
| // formalized. |
| // Marks whether or not this VMO is considered a latency sensitive object. For a VMO, being |
| // latency sensitive means that pages which get committed should not be decommitted (or made |
| // expensive to access) by any background kernel process, such as the zero page deduper. |
| // Note: This does not presently protect against user pager eviction, as there is already a |
| // separate mechanism for that. Once fxb/85056 is resolved this might change. |
| bool is_latency_sensitive_ TA_GUARDED(lock_) = false; |
| |
| using Cursor = |
| VmoCursor<VmCowPages, DiscardableVmosLock, DiscardableList, DiscardableList::iterator>; |
| |
| // The list of all outstanding cursors iterating over the discardable lists: |
| // |discardable_reclaim_candidates_| and |discardable_non_reclaim_candidates_|. The cursors should |
| // be advanced (by calling AdvanceIf()) before removing any element from the discardable lists. |
| static fbl::DoublyLinkedList<Cursor*> discardable_vmos_cursors_ |
| TA_GUARDED(DiscardableVmosLock::Get()); |
| |
| // With this bool we achieve these things: |
| // * Avoid using loaned pages for a VMO that will just get pinned and replace the loaned pages |
| // with non-loaned pages again, possibly repeatedly. |
| // * Avoid increasing pin latency in the (more) common case of pinning a VMO for the 2nd or |
| //   subsequent time (vs. the 1st time). |
| // * Once we have any form of active sweeping (of data from non-loaned to loaned physical pages) |
| // this bool is part of mitigating any potential DMA-while-not-pinned (which is not permitted |
| // but is also difficult to detect or prevent without an IOMMU). |
| bool ever_pinned_ TA_GUARDED(lock_) = false; |
| }; |
| |
| // VmCowPagesContainer exists to essentially split the VmCowPages ref_count_ into two counts, so |
| // that it remains possible to upgrade from a raw container pointer until after the VmCowPages |
| // fbl_recycle() has mostly completed and has removed and freed all the pages. |
| // |
| // This way, if we can upgrade, we can call RemovePageForEviction() and it will either work or |
| // the page will already have been removed from that location in the VmCowPages. If we can't |
| // upgrade, all the pages have already been removed and freed. |
| // |
| // In contrast, if we were to attempt an upgrade of a raw VmCowPages pointer to a VmCowPages |
| // ref, the ability to upgrade would disappear before the backlink is removed to make room for a |
| // StackOwnedLoanedPagesInterval, so loaned page reclaim would need to wait (somehow) for the page |
| // to be removed from the VmCowPages and at least have a backlink. That wait is problematic since |
| // it would also need to propagate priority inheritance properly like StackOwnedLoanedPagesInterval |
| // does, but the interval begins at the moment the refcount goes from 1 to 0, and reliably wrapping |
| // that 1 to 0 transition, while definitely possible with some RefPtr changes, is more |
| // complicated than having a VmCowPagesContainer whose ref can still be obtained up until after the |
| // pages have become FREE. There may of course be yet other options that are overall better; please |
| // suggest if you think of one. |
| // |
| // All the explicit cleanup of VmCowPages happens in VmCowPages::fbl_recycle(), with the final |
| // explicit fbl_recycle() step being release of the containing VmCowPagesContainer which in turn |
| // triggers ~VmCowPages which finishes up with implicit cleanup of VmCowPages (but possibly delayed |
| // slightly by loaned page reclaimer(s) that can have a VmCowPagesContainer ref transiently). |
| // |
| // Those paying close attention may note that under high load with potential low priority thread |
| // starvation (assuming a hypothetical scheduling policy that lets threads starve), each low |
| // priority loaned page reclaiming thread can hold up to one VmCowPagesContainer + contained |
| // depopulated VmCowPages as additional memory overhead, effectively attributed to the memory |
| // cost of that low priority thread. This is fine, and analogous to many other similar |
| // situations. In a sense it's priority inversion of the rest of the cleanup of the VmCowPages |
| // memory, but since the VmCowPages is depopulated, the symptom isn't enough of a problem to |
| // justify any mitigation other than mentally accounting for it in the low priority thread's |
| // memory cost. We should of course be careful not to let a refcount held by a lower priority |
| // thread keep unbounded memory allocated, but in this case it's well bounded. |
| // |
| // We restrict visibility of VmCowPages via its VmCowPagesContainer, to control which methods are |
| // ok to call on the VmCowPages via a VmCowPagesContainer ref while lacking any direct VmCowPages |
| // ref. The methods that are ok to call with only a VmCowPagesContainer ref are called via a |
| // corresponding method on VmCowPagesContainer. |
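| // |
| // As an illustrative sketch (hypothetical call site, not actual reclaim code), assuming |
| // fbl::MakeRefPtrUpgradeFromRaw as the upgrade primitive and that the caller holds a lock that |
| // serializes with the container's destruction, loaned page reclaim can do: |
| // |
| //   fbl::RefPtr<VmCowPagesContainer> container = |
| //       fbl::MakeRefPtrUpgradeFromRaw(raw_container, guard); |
| //   if (container) { |
| //     // The VmCowPages is still alive; this either evicts the page, or the page has already |
| //     // been removed from this VmCowPages by a concurrent fbl_recycle(). |
| //     container->RemovePageForEviction(page, offset, hint_action); |
| //   } else { |
| //     // Upgrade failed: all pages have already been removed and freed; nothing to reclaim. |
| //   } |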
| class VmCowPagesContainer : public fbl::RefCountedUpgradeable<VmCowPagesContainer> { |
| public: |
| VmCowPagesContainer() = default; |
| ~VmCowPagesContainer(); |
| |
| // These are the only VmCowPages methods that are ok to call via ref on VmCowPagesContainer while |
| // holding no ref on the contained VmCowPages. These will operate correctly despite potential |
| // concurrent VmCowPages::fbl_recycle() on a different thread and despite VmCowPages refcount_ |
| // potentially being 0. The VmCowPagesContainer ref held by the caller keeps the actual |
| // VmCowPages object alive during this call. |
| bool RemovePageForEviction(vm_page_t* page, uint64_t offset, |
| VmCowPages::EvictionHintAction hint_action); |
| |
| zx_status_t ReplacePage(vm_page_t* page, uint64_t offset, bool with_loaned); |
| |
| private: |
| friend class VmCowPages; |
| |
| // We'd use ktl::optional<VmCowPages> or std::variant<monostate, VmCowPages>, but both those |
| // require is_constructible_v<VmCowPages, ...>, which in turn requires the VmCowPages constructor |
| // to be public, which we don't want. |
| |
| // Used for construction of contained VmCowPages. |
| template <class... Args> |
| void EmplaceCow(Args&&... args); |
| |
| VmCowPages& cow(); |
| |
| ktl::aligned_storage_t<sizeof(VmCowPages), alignof(VmCowPages)> cow_space_; |
| bool is_cow_present_ = false; |
| }; |
| |
| #endif // ZIRCON_KERNEL_VM_INCLUDE_VM_VM_COW_PAGES_H_ |