| // Copyright 2020 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include "vm/vm_cow_pages.h" |
| |
| #include <lib/arch/intrin.h> |
| #include <lib/boot-options/boot-options.h> |
| #include <lib/counters.h> |
| #include <lib/fit/defer.h> |
| #include <trace.h> |
| |
| #include <cstdint> |
| |
| #include <kernel/range_check.h> |
| #include <ktl/type_traits.h> |
| #include <ktl/utility.h> |
| #include <lk/init.h> |
| #include <vm/compression.h> |
| #include <vm/discardable_vmo_tracker.h> |
| #include <vm/fault.h> |
| #include <vm/page.h> |
| #include <vm/physmap.h> |
| #include <vm/pmm.h> |
| #include <vm/vm_object.h> |
| #include <vm/vm_object_paged.h> |
| #include <vm/vm_page_list.h> |
| |
| #include "ktl/optional.h" |
| #include "vm_priv.h" |
| |
| #include <ktl/enforce.h> |
| |
| #define LOCAL_TRACE VM_GLOBAL_TRACE(0) |
| |
| // Add expensive code to do a full validation of the VMO at various points. |
| #define VMO_VALIDATION (0 || (LK_DEBUGLEVEL > 2)) |
| |
| // Assertion that is only enabled if VMO_VALIDATION is enabled. |
| #define VMO_VALIDATION_ASSERT(x) \ |
| do { \ |
| if (VMO_VALIDATION) { \ |
| ASSERT(x); \ |
| } \ |
| } while (0) |
| |
| // Add not-as-expensive code to do some extra validation at various points. This is off in normal |
| // debug builds because it can add O(n) validation to an O(1) operation, so it can still make things |
| // slower, despite not being as slow as VMO_VALIDATION. |
| #define VMO_FRUGAL_VALIDATION (0 || (LK_DEBUGLEVEL > 2)) |
| |
| // Assertion that is only enabled if VMO_FRUGAL_VALIDATION is enabled. |
| #define VMO_FRUGAL_VALIDATION_ASSERT(x) \ |
| do { \ |
| if (VMO_FRUGAL_VALIDATION) { \ |
| ASSERT(x); \ |
| } \ |
| } while (0) |
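| |
| // Example (mirrors usage later in this file): both macros are used like ordinary assertions, e.g. |
| // |
| //   VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| //   VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| // |
| // so the potentially expensive validation branch is only taken when the corresponding flag is |
| // enabled. |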
| |
| namespace { |
| |
| KCOUNTER(vm_vmo_high_priority, "vm.vmo.high_priority") |
| KCOUNTER(vm_vmo_dont_need, "vm.vmo.dont_need") |
| KCOUNTER(vm_vmo_always_need, "vm.vmo.always_need") |
| KCOUNTER(vm_vmo_compression_zero_slot, "vm.vmo.compression.zero_empty_slot") |
| KCOUNTER(vm_vmo_compression_marker, "vm.vmo.compression_zero_marker") |
| KCOUNTER(vm_vmo_range_update_from_parent_skipped, "vm.vmo.range_updated_from_parent.skipped") |
| KCOUNTER(vm_vmo_range_update_from_parent_performed, "vm.vmo.range_updated_from_parent.performed") |
| |
| KCOUNTER(vm_reclaim_evict_accessed, "vm.reclaim.evict_accessed") |
| KCOUNTER(vm_reclaim_compress_accessed, "vm.reclaim.compress_accessed") |
| KCOUNTER(vm_reclaim_no_reclamation_strategy, "vm.reclaim.no_reclamation_strategy") |
| KCOUNTER(vm_reclaim_always_need_skipped, "vm.reclaim.always_need_skipped") |
| KCOUNTER(vm_reclaim_discardable_failed, "vm.reclaim.discardable_failed") |
| KCOUNTER(vm_reclaim_incorrect_page, "vm.reclaim.incorrect_page") |
| KCOUNTER(vm_reclaim_high_priority, "vm.reclaim.high_priority") |
| KCOUNTER(vm_reclaim_pinned, "vm.reclaim.pinned") |
| KCOUNTER(vm_reclaim_dirty, "vm.reclaim.dirty") |
| KCOUNTER(vm_reclaim_uncached, "vm.reclaim.uncached") |
| KCOUNTER(vm_reclaim_compress_success, "vm.reclaim.compress.success") |
| KCOUNTER(vm_reclaim_compress_zero, "vm.reclaim.compress.zero") |
| KCOUNTER(vm_reclaim_compress_fail, "vm.reclaim.compress.fail") |
| KCOUNTER(vm_reclaim_compress_race, "vm.reclaim.compress.race") |
| |
| template <typename T> |
| uint32_t GetShareCount(T p) { |
| DEBUG_ASSERT(p->IsPageOrRef()); |
| |
| uint32_t share_count = 0; |
| if (p->IsPage()) { |
| share_count = p->Page()->object.share_count; |
| } else if (p->IsReference()) { |
| share_count = Pmm::Node().GetPageCompression()->GetMetadata(p->Reference()); |
| } |
| |
| return share_count; |
| } |
| |
| void ZeroPage(paddr_t pa) { |
| void* ptr = paddr_to_physmap(pa); |
| DEBUG_ASSERT(ptr); |
| |
| arch_zero_page(ptr); |
| } |
| |
| void ZeroPage(vm_page_t* p) { |
| paddr_t pa = p->paddr(); |
| ZeroPage(pa); |
| } |
| |
| bool IsZeroPage(vm_page_t* p) { |
| uint64_t* base = (uint64_t*)paddr_to_physmap(p->paddr()); |
| for (size_t i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++) { |
| if (base[i] != 0) |
| return false; |
| } |
| return true; |
| } |
| |
| void InitializeVmPage(vm_page_t* p) { |
| DEBUG_ASSERT(p); |
| DEBUG_ASSERT(!list_in_list(&p->queue_node)); |
| // Page should be in the ALLOC state so we can transition it to the OBJECT state. |
| DEBUG_ASSERT(p->state() == vm_page_state::ALLOC); |
| p->set_state(vm_page_state::OBJECT); |
| p->object.share_count = 0; |
| p->object.pin_count = 0; |
| p->object.always_need = 0; |
| p->object.dirty_state = uint8_t(VmCowPages::DirtyState::Untracked); |
| p->object.set_object(nullptr); |
| p->object.set_page_offset(0); |
| } |
| |
| inline uint64_t CheckedAdd(uint64_t a, uint64_t b) { |
| uint64_t result; |
| bool overflow = add_overflow(a, b, &result); |
| DEBUG_ASSERT(!overflow); |
| return result; |
| } |
| |
| inline uint64_t CheckedSub(uint64_t a, uint64_t b) { |
| DEBUG_ASSERT(b <= a); |
| return a - b; |
| } |
| |
| inline uint64_t ClampedLimit(uint64_t offset, uint64_t limit, uint64_t max_limit) { |
| // Return a clamped `limit` value such that `offset + clamped_limit <= max_limit`. |
| // If `offset > max_limit` to begin with, then clamp `limit` to 0 to avoid underflow. |
| // |
| // This is typically used to update a child node's parent limit when its parent is resized or the |
| // child moves to a new parent. This guarantees that the child cannot see any ancestor content |
| // beyond what it could before the resize or move operation. |
| uint64_t offset_limit = CheckedAdd(offset, limit); |
| return ktl::max(ktl::min(offset_limit, max_limit), offset) - offset; |
| } |
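| |
| // Worked example (comment only): ClampedLimit(3 * PAGE_SIZE, 4 * PAGE_SIZE, 5 * PAGE_SIZE) computes |
| // offset_limit = 7 * PAGE_SIZE, clamps it to max_limit = 5 * PAGE_SIZE, and returns 2 * PAGE_SIZE. |
| // With offset beyond max_limit, e.g. ClampedLimit(6 * PAGE_SIZE, 4 * PAGE_SIZE, 5 * PAGE_SIZE), the |
| // inner max() clamps back to offset and the returned limit is 0. |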
| |
| ktl::optional<vm_page_t*> MaybeDecompressReference(VmCompression* compression, |
| VmCompression::CompressedRef ref) { |
| if (auto maybe_page_and_metadata = compression->MoveReference(ref)) { |
| InitializeVmPage(maybe_page_and_metadata->page); |
| // Ensure the share count is propagated from the compressed page. |
| maybe_page_and_metadata->page->object.share_count = maybe_page_and_metadata->metadata; |
| |
| return maybe_page_and_metadata->page; |
| } |
| |
| return ktl::nullopt; |
| } |
| |
| void FreeReference(VmPageOrMarker::ReferenceValue content) { |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| DEBUG_ASSERT(compression); |
| compression->Free(content); |
| } |
| |
| // Helper to allow for accessing the VmCowPages::paged_ref_ without needing to manually assert the |
| // lock. Declared as a local helper here instead of a method in VmCowPages due to VmCowPages being |
| // defined prior to VmObjectPaged. |
| VmObjectPaged* paged_backlink_locked(VmCowPages* cow) TA_REQ(cow->lock()) |
| TA_ASSERT(paged_backlink_locked(cow)->lock()) { |
| return cow->get_paged_backlink_locked(); |
| } |
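| |
| // Example (mirrors usage later in this file): with the VmCowPages lock held the backlink can be |
| // used without additional lock assertions, e.g. |
| // |
| //   if (paged_ref_) { |
| //     paged_backlink_locked(this)->CanDedupZeroPagesLocked(); |
| //   } |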
| |
| } // namespace |
| |
| // static |
| void VmCowPages::DebugDumpReclaimCounters() { |
| printf("Failed reclaim evict_accessed %ld\n", vm_reclaim_evict_accessed.SumAcrossAllCpus()); |
| printf("Failed reclaim compress_accessed %ld\n", vm_reclaim_compress_accessed.SumAcrossAllCpus()); |
| printf("Failed reclaim no_strategy %ld\n", vm_reclaim_no_reclamation_strategy.SumAcrossAllCpus()); |
| printf("Failed reclaim always_need %ld\n", vm_reclaim_always_need_skipped.SumAcrossAllCpus()); |
| printf("Failed reclaim discardable %ld\n", vm_reclaim_discardable_failed.SumAcrossAllCpus()); |
| printf("Failed reclaim incorrect_page %ld\n", vm_reclaim_incorrect_page.SumAcrossAllCpus()); |
| printf("Failed reclaim high_priority %ld\n", vm_reclaim_high_priority.SumAcrossAllCpus()); |
| printf("Failed reclaim pinned %ld\n", vm_reclaim_pinned.SumAcrossAllCpus()); |
| printf("Failed reclaim dirty %ld\n", vm_reclaim_dirty.SumAcrossAllCpus()); |
| printf("Failed reclaim uncached %ld\n", vm_reclaim_uncached.SumAcrossAllCpus()); |
| } |
| |
| // Helper for walking up a VmCowPages hierarchy where the start node is locked, and the immediate |
| // parent may or may not be locked. |
| class LockedParentWalker { |
| public: |
| // Construct the parent walker with a reference to a LockedPtr of any locked parent. The |
| // referenced LockedPtr can be empty if the immediate parent is either not locked, or does not |
| // exist. It is the caller's responsibility to ensure the LockedPtr lives long enough. |
| explicit LockedParentWalker(const VmCowPages::LockedPtr& maybe_locked_parent) |
| : pre_locked_parent_(maybe_locked_parent) {} |
| |
| // Returns a locked reference to the current node in the walk. The TA_ASSERT is deemed correct as |
| // all code paths return via `LockedPtr::locked*` methods, which themselves have a TA_ASSERT. |
| VmCowPages& current(VmCowPages* self) const TA_REQ(self->lock()) TA_ASSERT(current(self).lock()) { |
| if (current_is_pre_locked_parent_) { |
| return pre_locked_parent_.locked(); |
| } |
| return current_.locked_or(self); |
| } |
| const VmCowPages& current(const VmCowPages* self) const TA_REQ(self->lock()) |
| TA_ASSERT(current(self).lock()) { |
| if (current_is_pre_locked_parent_) { |
| return pre_locked_parent_.locked(); |
| } |
| return current_.locked_or(self); |
| } |
| |
| // Resets the walker to its initial state, allowing for a new walk. |
| void reset() { |
| current_.release(); |
| current_is_pre_locked_parent_ = false; |
| } |
| |
| // Walk up the hierarchy, changing the current node to the current node's parent. It is an error to |
| // call this if current has no parent. |
| void WalkUp(const VmCowPages* self) TA_REQ(self->lock()) { |
| VmCowPages* next = current(self).parent_.get(); |
| DEBUG_ASSERT(next); |
| // If the next node in the chain matches the pre-locked parent, then use that; otherwise move |
| // current_ up and acquire the lock. |
| if (next == pre_locked_parent_.get()) { |
| // Double check that the pre_locked_parent_ is actually the immediate parent. |
| DEBUG_ASSERT(self->parent_.get() == next); |
| current_is_pre_locked_parent_ = true; |
| } else { |
| current_is_pre_locked_parent_ = false; |
| current_ = VmCowPages::LockedPtr(next, next->lock_order()); |
| } |
| } |
| |
| private: |
| // Tracks whether a call to |current| should return the |pre_locked_parent_|, or the normal |
| // |current_| tracker. |
| bool current_is_pre_locked_parent_ = false; |
| const VmCowPages::LockedPtr& pre_locked_parent_; |
| VmCowPages::LockedPtr current_; |
| }; |
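| |
| // Illustrative sketch (comment only; |node| and |maybe_parent| are hypothetical locals): a caller |
| // holding |node|'s lock, and optionally an already locked immediate parent, can walk towards the |
| // root with |
| // |
| //   LockedParentWalker walker(maybe_parent); |
| //   while (walker.current(node).parent_) { |
| //     walker.WalkUp(node); |
| //   } |
| // |
| // Each WalkUp either reuses the pre-locked parent or acquires the next ancestor's lock itself. |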
| |
| // Helper class for collecting pages to perform batched Removes from the page queue to not incur |
| // its spinlock overhead for every single page. Pages that it removes from the page queue get placed |
| // into a provided list. Note that pages are not moved into the list until *after* Flush has been |
| // called and Flush must be called prior to object destruction. |
| // |
| // This class has a large internal array and should be marked uninitialized. |
| class BatchPQRemove { |
| public: |
| explicit BatchPQRemove(ScopedPageFreedList& freed_list) : freed_list_(freed_list) {} |
| ~BatchPQRemove() { DEBUG_ASSERT(count_ == 0); } |
| DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(BatchPQRemove); |
| |
| // Add a page to the batch set. Automatically calls |Flush| if the limit is reached. |
| void Push(vm_page_t* page) { |
| DEBUG_ASSERT(page); |
| ASSERT(page->object.pin_count == 0); |
| DEBUG_ASSERT(count_ < kMaxPages); |
| if (count_ != 0 && page->is_loaned() != is_loaned_) { |
| Flush(); |
| } |
| if (count_ == 0) { |
| is_loaned_ = page->is_loaned(); |
| } |
| |
| pages_[count_] = page; |
| count_++; |
| if (count_ == kMaxPages) { |
| Flush(); |
| } |
| } |
| |
| // Removes any content from the supplied |page_or_marker| and either calls |Push| or otherwise |
| // frees it. Always leaves the |page_or_marker| in the empty state. |
| // Automatically calls |Flush| if the limit on pages is reached. |
| void PushContent(VmPageOrMarker* page_or_marker) { |
| if (page_or_marker->IsPage()) { |
| Push(page_or_marker->ReleasePage()); |
| } else if (page_or_marker->IsReference()) { |
| // TODO(https://fxbug.dev/42138396): Consider whether it is worth batching these. |
| FreeReference(page_or_marker->ReleaseReference()); |
| } else { |
| *page_or_marker = VmPageOrMarker::Empty(); |
| } |
| } |
| |
| // Performs |Remove| on any pending pages. This ensures that all pushed pages are in the provided |
| // freed list so that you can do operations on the list. |
| void Flush() { |
| if (count_ > 0) { |
| if (is_loaned_) { |
| Pmm::Node().BeginFreeLoanedArray( |
| pages_, count_, |
| [](vm_page_t** pages, size_t count, list_node_t* free_list) { |
| pmm_page_queues()->RemoveArrayIntoList(pages, count, free_list); |
| }, |
| freed_list_.Flph()); |
| } else { |
| pmm_page_queues()->RemoveArrayIntoList(pages_, count_, freed_list_.List()); |
| freed_count_ += count_; |
| } |
| count_ = 0; |
| } |
| } |
| |
| // Returns the number of pages that were added to |freed_list_| by calls to Flush(). The |
| // |freed_count_| counter keeps a running count of freed pages as they are removed and added to |
| // |freed_list_|, avoiding having to walk |freed_list_| to compute its length. |
| size_t freed_count() const { return freed_count_; } |
| |
| // Produces a callback suitable for passing to VmPageList::RemovePages that will |PushContent| all |
| // items. |
| auto RemovePagesCallback() { |
| return [this](VmPageOrMarker* p, uint64_t off) { |
| PushContent(p); |
| return ZX_ERR_NEXT; |
| }; |
| } |
| |
| private: |
| // The value of 64 was chosen as only minimal performance gains were originally measured when |
| // using higher values. There is an incentive to keep this as small as possible since it is |
| // typically created on the stack, and our stack space is limited. |
| static constexpr size_t kMaxPages = 64; |
| |
| size_t count_ = 0; |
| size_t freed_count_ = 0; |
| vm_page_t* pages_[kMaxPages]; |
| ScopedPageFreedList& freed_list_; |
| bool is_loaned_ = false; |
| }; |
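| |
| // Illustrative sketch (comment only; |start| and |end| are hypothetical offsets): the typical |
| // pattern is to push or release content into the batcher and then Flush before it goes out of |
| // scope, e.g. |
| // |
| //   __UNINITIALIZED ScopedPageFreedList freed_list; |
| //   __UNINITIALIZED BatchPQRemove page_remover(freed_list); |
| //   page_list_.RemovePages(page_remover.RemovePagesCallback(), start, end); |
| //   page_remover.Flush(); |
| // |
| // Flush must be called before destruction; the destructor asserts that no pages are pending. |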
| |
| // Helper class for collecting pages to perform batched calls of |ChangeObjectOffset| on the page |
| // queue in order to avoid incurring its spinlock overhead for every single page. Note that pages |
| // are not modified until *after* Flush has been called and Flush must be called prior to object |
| // destruction. |
| // |
| // This class has a large internal array and should be marked uninitialized. |
| class BatchPQUpdateBacklink { |
| public: |
| explicit BatchPQUpdateBacklink(VmCowPages* object) : object_(object) {} |
| ~BatchPQUpdateBacklink() { DEBUG_ASSERT(count_ == 0); } |
| DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(BatchPQUpdateBacklink); |
| |
| // Add a page to the batch set. Automatically calls |Flush| if the limit is reached. |
| void Push(vm_page_t* page, uint64_t offset) { |
| DEBUG_ASSERT(page); |
| DEBUG_ASSERT(count_ < kMaxPages); |
| |
| pages_[count_] = page; |
| offsets_[count_] = offset; |
| count_++; |
| |
| if (count_ == kMaxPages) { |
| Flush(); |
| } |
| } |
| |
| // Performs |ChangeObjectOffset| on any pending pages. |
| void Flush() { |
| if (count_ > 0) { |
| pmm_page_queues()->ChangeObjectOffsetArray(pages_, object_, offsets_, count_); |
| count_ = 0; |
| } |
| } |
| |
| private: |
| // Batch size is selected to balance performance and size of the object that gets allocated on the |
| // stack. |
| static constexpr size_t kMaxPages = 64; |
| |
| VmCowPages* object_ = nullptr; |
| |
| size_t count_ = 0; |
| vm_page_t* pages_[kMaxPages]; |
| uint64_t offsets_[kMaxPages]; |
| }; |
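| |
| // Illustrative sketch (comment only; |target|, |page| and |offset| are hypothetical): the batcher |
| // accumulates (page, offset) pairs and applies them to the page queues in bulk, e.g. |
| // |
| //   __UNINITIALIZED BatchPQUpdateBacklink backlink_updater(target); |
| //   backlink_updater.Push(page, offset); |
| //   backlink_updater.Flush();  // Must be called before destruction. |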
| |
| // Helper class for iterating over a subtree while respecting the child->parent lock ordering |
| // requirement. |
| // Cursor is constructed with a root, i.e. the starting point, and will iterate over at least |
| // every node that existed at the point of construction. Nodes that are racily created mid |
| // iteration may or may not be visited. Utilizes the cursor lists in the VmCowPages to coordinate |
| // with any destruction. |
| // A cursor is logically at a 'current' location, which is initially the root the cursor was |
| // constructed at. As the current location is always held locked, the cursor can be assumed to be |
| // initially valid, and is valid as long as any iteration request (NextChild / NextSibling) returns |
| // true. The cursor explicitly performs a pre-order walk, allowing subtrees of a given node to be |
| // skipped during the iteration. |
| class VmCowPages::TreeWalkCursor |
| : public fbl::ContainableBaseClasses< |
| fbl::TaggedDoublyLinkedListable<TreeWalkCursor*, VmCowPages::RootListTag>, |
| fbl::TaggedDoublyLinkedListable<TreeWalkCursor*, VmCowPages::CurListTag>> { |
| public: |
| explicit TreeWalkCursor(LockedPtr root) |
| : root_(root.get()), cur_(root.get()), cur_locked_(ktl::move(root)) { |
| DEBUG_ASSERT(cur_locked_.locked().life_cycle_ == LifeCycle::Alive); |
| cur_locked_.locked().root_cursor_list_.push_back(this); |
| cur_locked_.locked().cur_cursor_list_.push_back(this); |
| } |
| ~TreeWalkCursor() { |
| if (root_) { |
| reset(); |
| } |
| } |
| // These static methods exist to simplify the call sites in VmCowPages in such a way that the lock |
| // annotations are preserved. A generic 'perform arbitrary lambda on all cursors' helper would |
| // reduce the code duplication here, but it would lose the annotations. |
| // See the descriptions of the non-static methods for what these do. |
| |
| static void MoveToSibling(fbl::TaggedDoublyLinkedList<TreeWalkCursor*, CurListTag>& cursor_list, |
| VmCowPages* cur, VmCowPages* sibling) TA_REQ(cur->lock()) |
| TA_REQ(sibling->lock()) { |
| while (!cursor_list.is_empty()) { |
| cursor_list.front().MoveToSibling(cur, sibling); |
| } |
| } |
| static void MoveToSiblingOfParent( |
| fbl::TaggedDoublyLinkedList<TreeWalkCursor*, CurListTag>& cursor_list, VmCowPages* cur, |
| VmCowPages* parent) TA_REQ(cur->lock()) TA_REQ(parent->lock()) { |
| while (!cursor_list.is_empty()) { |
| cursor_list.front().MoveToSiblingOfParent(cur, parent); |
| } |
| } |
| static void Erase(fbl::TaggedDoublyLinkedList<TreeWalkCursor*, RootListTag>& cursor_list, |
| VmCowPages* leaf) TA_REQ(leaf->lock()) { |
| while (!cursor_list.is_empty()) { |
| cursor_list.front().Erase(leaf); |
| } |
| } |
| static void MergeToChild(fbl::TaggedDoublyLinkedList<TreeWalkCursor*, CurListTag>& cur_list, |
| fbl::TaggedDoublyLinkedList<TreeWalkCursor*, RootListTag>& root_list, |
| VmCowPages* cur, VmCowPages* child) TA_REQ(cur->lock()) |
| TA_REQ(child->lock()) { |
| while (!root_list.is_empty()) { |
| root_list.front().MergeRootToChild(cur, child); |
| } |
| while (!cur_list.is_empty()) { |
| cur_list.front().MergeToChild(cur, child); |
| } |
| } |
| |
| // Inform the cursor that its current node is going away, and it should re-home to its sibling. |
| void MoveToSibling(VmCowPages* cur, VmCowPages* sibling) TA_REQ(cur->lock()) |
| TA_REQ(sibling->lock()) { |
| Guard<CriticalMutex> guard{&lock_}; |
| DEBUG_ASSERT(cur->parent_ && cur->parent_ == sibling->parent_); |
| // If current was the root, then do not move to the sibling, as that would be outside our |
| // iteration tree; erase instead. |
| if (cur == root_) { |
| EraseLocked(cur, cur); |
| return; |
| } |
| MoveCurLocked( |
| cur, sibling, |
| CheckedSub(cumulative_parent_offset_, cur->parent_offset_) + sibling->parent_offset_, |
| debug_depth_); |
| } |
| |
| // Inform the cursor that the root node is going away. Since a node can only be removed if it has |
| // no children, this implies that the cursor is still at the root, and so the entire cursor should |
| // be removed. |
| void Erase(VmCowPages* root) TA_REQ(root->lock()) { |
| DEBUG_ASSERT(root->children_list_len_ == 0); |
| Guard<CriticalMutex> guard{&lock_}; |
| EraseLocked(root, root); |
| } |
| |
| // Inform the cursor that the root node is being merged into the child, and the cursor should be |
| // moved. |
| void MergeRootToChild(VmCowPages* root, VmCowPages* child) TA_REQ(root->lock()) |
| TA_REQ(child->lock()) { |
| Guard<CriticalMutex> guard{&lock_}; |
| DEBUG_ASSERT(root == root_); |
| DEBUG_ASSERT(child->parent_.get() == root); |
| // If the cursor was still pointing at the root then also move it. Although this would get |
| // updated by a separate call to MergeToChild anyway, it's preferable to maintain the invariant. |
| if (cur_ == root) { |
| MoveCurLocked(root, child, cumulative_parent_offset_ + child->parent_offset_, debug_depth_); |
| } else { |
| debug_depth_--; |
| } |
| root->root_cursor_list_.erase(*this); |
| child->root_cursor_list_.push_back(this); |
| root_ = child; |
| } |
| |
| // Inform the cursor that the current node is merging with its child. |
| void MergeToChild(VmCowPages* cur, VmCowPages* child) TA_REQ(cur->lock()) TA_REQ(child->lock()) { |
| Guard<CriticalMutex> guard{&lock_}; |
| DEBUG_ASSERT(child->parent_.get() == cur); |
| |
| DEBUG_ASSERT(cur != root_); |
| uint32_t new_depth = (cur == root_) ? debug_depth_ : (debug_depth_ - 1); |
| |
| MoveCurLocked(cur, child, cumulative_parent_offset_ + child->parent_offset_, new_depth); |
| } |
| |
| // Inform the cursor that both the current node and its parent are going away and the cursor |
| // should be moved to the next available sibling of the parent, assuming that is still within the |
| // subtree to be walked. |
| // This method will logically end up at the same final node as just MoveToNextSibling, and it is |
| // specialized not for performance, but rather for the scenario where the lock of |parent| is |
| // already held, and hence directly using MoveToNextSibling would cause a double lock acquisition. |
| void MoveToSiblingOfParent(VmCowPages* cur, VmCowPages* parent) TA_REQ(cur->lock()) |
| TA_REQ(parent->lock()) { |
| DEBUG_ASSERT(cur->parent_.get() == parent); |
| // Not trying to be efficient, as this method is only used for cleaning up when racing |
| // deletion with a cursor traversal, so just move the cursor to the parent, then move to the |
| // sibling. |
| { |
| Guard<CriticalMutex> guard{&lock_}; |
| if (cur == root_) { |
| EraseLocked(cur, cur); |
| return; |
| } |
| if (parent == root_) { |
| EraseLocked(cur, parent); |
| return; |
| } |
| MoveCurLocked(cur, parent, CheckedSub(cumulative_parent_offset_, cur->parent_offset_), |
| debug_depth_); |
| } |
| MoveToNextSibling(parent); |
| } |
| |
| // Move the cursor to the next un-visited child, or the next sibling if there are none. Returns false |
| // if iteration has completed and the cursor is now invalid. This may not be called on an invalid |
| // cursor. |
| bool NextChild() { |
| DEBUG_ASSERT(cur_locked_); |
| do { |
| // If no child then find a sibling instead. |
| if (cur_locked_.locked().children_list_len_ == 0) { |
| return NextSibling(); |
| } |
| |
| // To acquire the child lock we need to release the current lock, so first take a refptr to |
| // the child. |
| fbl::RefPtr<VmCowPages> child_ref = fbl::MakeRefPtrUpgradeFromRaw( |
| &cur_locked_.locked().children_list_.front(), cur_locked_.locked().lock()); |
| cur_locked_.release(); |
| |
| { |
| LockedPtr child(child_ref.get()); |
| // While the locks were dropped things could have changed, so check that the child still has |
| // a parent before attempting to acquire the parent's lock. |
| if (child.locked().parent_) { |
| LockedPtr parent(child.locked().parent_.get()); |
| Guard<CriticalMutex> guard{&lock_}; |
| // If nothing raced then the parent of child should still be cur_. |
| if (parent.get() == cur_) { |
| // Both cur_ and child must be in the alive state, otherwise cur_ would have been |
| // updated on a dead transition. The fact that a dead transition has not occurred, and |
| // that child lock must be acquired to perform said transition, is why it is safe for us |
| // to drop child_ref and store a raw LockedPtr of child. |
| DEBUG_ASSERT(parent.locked().life_cycle_ == LifeCycle::Alive && |
| child.locked().life_cycle_ == LifeCycle::Alive); |
| |
| MoveCurLocked(&parent.locked(), &child.locked(), |
| cumulative_parent_offset_ + child.locked().parent_offset_, |
| (debug_depth_ + 1)); |
| cur_locked_ = ktl::move(child); |
| // cur_ is updated and cur_locked_ holds a lock acquired with the correct order so we |
| // can directly return and do not need to use UpdateCurLocked to reacquire. |
| return true; |
| } |
| } |
| } |
| // We raced with a modification to the tree. This modification will have set the new value of |
| // cur_ (possibly to nullptr if the cursor has been deleted), and we call UpdateCurLocked to |
| // retrieve this and then go around the loop and check again for a child. |
| } while (UpdateCurLocked()); |
| // Only reach here if UpdateCurLocked returns false, which only happens if the cursor was |
| // deleted, in which case we definitely have no child. |
| return false; |
| } |
| |
| // Move the cursor to the next un-visited sibling, skipping any children of the current node. |
| // Returns false if iteration has completed and the cursor is now invalid. This may not be called |
| // on an invalid cursor. |
| bool NextSibling() { |
| DEBUG_ASSERT(cur_locked_); |
| { |
| LockedPtr cur = ktl::move(cur_locked_); |
| // Due to the way the sibling lock gets acquired we always need to re-acquire it as a first |
| // acquisition with its normal lock order. For this reason there is no point in attempting to |
| // retain the lock of the updated cur_, and so we use a common helper and then re-read (and |
| // re-lock) cur_. |
| MoveToNextSibling(&cur.locked()); |
| } |
| return UpdateCurLocked(); |
| } |
| |
| // Retrieves the offset that projects an offset from the starting node into an offset in the |
| // current node. This does not imply that the current node can 'see' the content at that offset, |
| // just that if it could that is the offset that would do it. |
| // May only be called while the cursor is valid. |
| uint64_t GetCurrentOffset() const { |
| // As long as we hold cur_locked_ then no one can be altering cur_ and so we own the offset. |
| DEBUG_ASSERT(cur_locked_); |
| return cumulative_parent_offset_; |
| } |
| |
| // Retrieve a reference to the current node. |
| const LockedPtr& GetCur() const { return cur_locked_; } |
| |
| int32_t DebugGetDepth() const { return debug_depth_; } |
| |
| private: |
| // Helper for moving cur_ to the next sibling. The |start| location, which must be equal to cur_ |
| // and held locked externally, must be passed in. This allows |cur_locked_| to be set by this |
| // method without having to release its lock. |
| // Walking the next sibling involves walking both 'up' and 'right' until we either find a node or |
| // we encounter root_ and terminate. |
| void MoveToNextSibling(VmCowPages* start) TA_REQ(start->lock()) { |
| DEBUG_ASSERT(!cur_locked_); |
| uint64_t offset; |
| { |
| Guard<CriticalMutex> guard{&lock_}; |
| DEBUG_ASSERT(start == cur_); |
| // The later loop wants to assume that we have a parent (in order to be finding a sibling), |
| // which could be false if we are presently at the root_ and there is otherwise no parent. |
| if (start == root_) { |
| EraseLocked(start, start); |
| return; |
| } |
| // As we hold the lock to cur_, the offset cannot change, so we can cache it outside the lock. |
| offset = cumulative_parent_offset_; |
| } |
| LockedPtr cur; |
| while (true) { |
| // If we aren't at the root then, by definition, we are in a subtree and must have a parent. |
| DEBUG_ASSERT(cur.locked_or(start).parent_.get()); |
| fbl::RefPtr<VmCowPages> sibling_ref; |
| { |
| // Acquire the parent lock and check for a sibling. |
| LockedPtr parent(cur.locked_or(start).parent_.get()); |
| auto iter = ++parent.locked().children_list_.make_iterator(cur.locked_or(start)); |
| if (!iter.IsValid()) { |
| // If no sibling then walk up to the parent, ensuring we do not walk past the root. |
| Guard<CriticalMutex> guard{&lock_}; |
| // Although we checked this previously, the root can get moved into its child, and so we |
| // must re-check. |
| if (start == root_) { |
| EraseLocked(start, start); |
| return; |
| } |
| if (parent.get() == root_) { |
| EraseLocked(start, &parent.locked()); |
| return; |
| } |
| offset = CheckedSub(offset, cur.locked_or(start).parent_offset_); |
| cur = ktl::move(parent); |
| continue; |
| } |
| // Make a ref to the sibling; we have to drop the parent lock before acquiring the sibling |
| // lock. |
| sibling_ref = fbl::MakeRefPtrUpgradeFromRaw(&*iter, parent.locked().lock()); |
| } |
| |
| LockedPtr sibling(sibling_ref.get(), cur.locked_or(start).lock_order() + 1); |
| // If the sibling is still from the same parent then no race occurred and sibling must still |
| // be alive. |
| if (sibling.locked().parent_ == cur.locked_or(start).parent_) { |
| Guard<CriticalMutex> guard{&lock_}; |
| DEBUG_ASSERT(start == cur_); |
| MoveCurLocked(start, &sibling.locked(), |
| CheckedSub(offset, cur.locked_or(start).parent_offset_) + |
| sibling.locked().parent_offset_, |
| debug_depth_); |
| return; |
| } |
| // Raced with a modification, need to go around again and see what the state of the tree is |
| // now and try again. The only way our sibling's parent could have changed is if it got |
| // deleted, and since new siblings will be placed at the head of the list (whereas we are |
| // iterating towards the tail), the number of times we can race is strictly bounded. |
| } |
| } |
| |
| // Updates cur_locked_ to be what is in cur_. This is used to resolve scenarios where the lock to |
| // current needs to be dropped, and hence a racing deletion might move it. |
| bool UpdateCurLocked() TA_EXCL(lock_) { |
| // We must do this loop as the lock ordering is vmo->cursor and so in between dropping the |
| // cursor lock to acquire cur_locked_, cur_ could move again. |
| Guard<CriticalMutex> guard{&lock_}; |
| fbl::RefPtr<VmCowPages> cur; |
| // Use a local cur_locked while we are looping and only update cur_locked_ at the end once we |
| // are certain we have the correct lock. |
| LockedPtr cur_locked = ktl::move(cur_locked_); |
| do { |
| // Clear any previous lock. |
| cur_locked.release(); |
| // Cursor was deleted. |
| if (!cur_) { |
| return false; |
| } |
| cur = fbl::MakeRefPtrUpgradeFromRaw(cur_, lock_); |
| guard.CallUnlocked([&cur, &cur_locked]() { cur_locked = LockedPtr(cur.get()); }); |
| } while (cur_locked.get() != cur_); |
| // We have the lock to cur_ and so we can safely drop the RefPtr, knowing that the object cannot be |
| // destroyed without our backlink being updated, which would require someone else to acquire the |
| // lock first. All this is only true if the object is presently in the Alive state. |
| DEBUG_ASSERT(cur_locked.locked().life_cycle_ == LifeCycle::Alive); |
| cur_locked_ = ktl::move(cur_locked); |
| return true; |
| } |
| |
| // Erase the cursor, removing all the backlinks. |
| void EraseLocked(VmCowPages* cur, VmCowPages* root) TA_REQ(cur->lock()) TA_REQ(root->lock()) |
| TA_REQ(lock_) { |
| DEBUG_ASSERT(cur == cur_); |
| DEBUG_ASSERT(root == root_); |
| cur->cur_cursor_list_.erase(*this); |
| root->root_cursor_list_.erase(*this); |
| cur_ = root_ = nullptr; |
| debug_depth_ = 0; |
| } |
| |
| // Helper to update the current location of the cursor. |
| void MoveCurLocked(VmCowPages* old_cur, VmCowPages* new_cur, uint64_t new_offset, |
| uint32_t new_depth) TA_REQ(lock_) TA_REQ(old_cur->lock()) |
| TA_REQ(new_cur->lock()) { |
| DEBUG_ASSERT(old_cur == cur_); |
| DEBUG_ASSERT(new_cur != root_); |
| // Validate there is no cur_locked_, and so we can update this without racing with any readers |
| // as we hold the lock of cur_. |
| DEBUG_ASSERT(!cur_locked_); |
| cumulative_parent_offset_ = new_offset; |
| old_cur->cur_cursor_list_.erase(*this); |
| new_cur->cur_cursor_list_.push_back(this); |
| debug_depth_ = new_depth; |
| cur_ = new_cur; |
| } |
| |
| // Reset and invalidate the cursor. |
| void reset() { |
| LockedPtr cur = ktl::move(cur_locked_); |
| Guard<CriticalMutex> guard{&lock_}; |
| LockedPtr root_locked; |
| fbl::RefPtr<VmCowPages> root; |
| // We must do this loop as the lock ordering is vmo->cursor and so in between dropping the |
| // cursor lock to acquire root_locked, root_ could move again. |
| do { |
| root_locked.release(); |
| if (!root_) { |
| return; |
| } |
| if (root_ == cur_) { |
| EraseLocked(&cur.locked(), &cur.locked()); |
| return; |
| } |
| root = fbl::MakeRefPtrUpgradeFromRaw(root_, lock_); |
| guard.CallUnlocked([&]() { root_locked = LockedPtr(root.get()); }); |
| } while (root_locked.get() != root_); |
| EraseLocked(&cur.locked(), &root_locked.locked()); |
| } |
| |
| // Modifying any item, such as root_ or cur_, requires holding the lock of the respective object, |
| // but to support being able to non-racily read the current value we define an additional lock_. |
| // Reading any value can be performed by holding either lock_, or the respective object lock_, but |
| // both must be held to modify. |
| DECLARE_CRITICAL_MUTEX(TreeWalkCursor) lock_; |
| // Tracks the offset that projects offsets from the original root, to the current node. This is |
| // logically locked by cur_->lock(), but this annotation cannot be properly expressed. Although we |
| // can say TA_REQ(cur_->lock()), there are times when we want to read this value knowing that |
| // cur_locked_ is valid when we do not hold lock_, hence we cannot even write |
| // AssertHeld(cur_->lock()), as we do not hold lock_ to dereference cur_, and hence cannot explain |
| // to the static analysis that cur_locked_ is an alias of cur_. |
| uint64_t cumulative_parent_offset_ = 0; |
| // The invariant that we maintain is that if root_ or cur_ is not null, then the object they point |
| // to must be in the Alive state, and this cursor must be in the respective cursor_list_. |
| // Modifying these can only be done when holding the respective object lock, as well as lock_. |
| // Attempting to annotate these with something like TA_GUARDED(cur_->lock()) is not useful since |
| // the static analysis cannot resolve the pointer aliasing, and since these are pointers that can |
| // change, using AssertHeld is dangerous as it can provide a false sense of correctness. |
| VmCowPages* root_ TA_GUARDED(lock_) = nullptr; |
| VmCowPages* cur_ TA_GUARDED(lock_) = nullptr; |
| |
| // Probable depth of cur_ with respect to root_. As the cow-pages hierarchy uses fine-grained |
| // locking, there is a chance that a node outside of the held locks races and this value becomes |
| // inaccurate. |
| int32_t debug_depth_ = 0; |
| |
| // Whenever the cursor is valid, then cur_locked_ is a LockedPtr to cur_. This lock is only |
| // dropped internally when walking between nodes. Storing this internally, instead of returning it |
| // to the user on successful calls to NextChild or NextSibling is merely to ensure that they do |
| // not release the lock at all, allowing us to make assumptions when resuming iteration. |
| LockedPtr cur_locked_; |
| }; |
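| |
| // Illustrative sketch (comment only; |start| is a hypothetical subtree root): a pre-order walk |
| // looks like |
| // |
| //   auto cursor = TreeWalkCursor{LockedPtr(start)}; |
| //   do { |
| //     // cursor.GetCur() is the locked current node and cursor.GetCurrentOffset() projects |
| //     // offsets from |start| into it. |
| //   } while (cursor.NextChild()); |
| // |
| // NextSibling() can be used instead of NextChild() to skip the current node's subtree. |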
| |
| bool VmCowRange::IsBoundedBy(uint64_t max) const { return InRange(offset, len, max); } |
| |
| // Allocates a new page and populates it with the data at |parent_paddr|. |
| zx_status_t VmCowPages::AllocateCopyPage(paddr_t parent_paddr, list_node_t* alloc_list, |
| AnonymousPageRequest* request, vm_page_t** clone) { |
| DEBUG_ASSERT(request || !(pmm_alloc_flags_ & PMM_ALLOC_FLAG_CAN_WAIT)); |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| |
| vm_page_t* p_clone = nullptr; |
| |
| if (request->has_page()) { |
| p_clone = request->take_page(); |
| } else if (alloc_list) { |
| p_clone = list_remove_head_type(alloc_list, vm_page, queue_node); |
| } |
| |
| if (p_clone) { |
| InitializeVmPage(p_clone); |
| } else { |
| zx_status_t status = AllocPage(&p_clone, request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| DEBUG_ASSERT(p_clone); |
| } |
| |
| void* dst = paddr_to_physmap(p_clone->paddr()); |
| DEBUG_ASSERT(dst); |
| |
| if (parent_paddr != vm_get_zero_page_paddr()) { |
| // do a direct copy of the two pages |
| const void* src = paddr_to_physmap(parent_paddr); |
| DEBUG_ASSERT(src); |
| memcpy(dst, src, PAGE_SIZE); |
| } else { |
| // avoid pointless fetches by directly zeroing dst |
| arch_zero_page(dst); |
| } |
| |
| *clone = p_clone; |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::AllocUninitializedPage(vm_page_t** page, AnonymousPageRequest* request) { |
| paddr_t paddr = 0; |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| // Another layer has already allocated a page for us. |
| if (request->has_page()) { |
| *page = request->take_page(); |
| return ZX_OK; |
| } |
| |
| zx_status_t status = CacheAllocPage(pmm_alloc_flags_, page, &paddr); |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| request->MakeActive(); |
| } |
| return status; |
| } |
| |
| zx_status_t VmCowPages::AllocPage(vm_page_t** page, AnonymousPageRequest* request) { |
| zx_status_t status = AllocUninitializedPage(page, request); |
| if (status == ZX_OK) { |
| InitializeVmPage(*page); |
| } |
| return status; |
| } |
| |
| template <typename F> |
| zx::result<vm_page_t*> VmCowPages::AllocLoanedPage(F allocated) { |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| return Pmm::Node().AllocLoanedPage([allocated](vm_page_t* page) { |
| InitializeVmPage(page); |
| allocated(page); |
| }); |
| } |
| |
| void VmCowPages::RemovePageLocked(vm_page_t* page, DeferredOps& ops) { |
| if (page->is_loaned()) { |
| Pmm::Node().BeginFreeLoanedPage( |
| page, [](vm_page_t* page) { pmm_page_queues()->Remove(page); }, ops.FreedList(this).Flph()); |
| } else { |
| pmm_page_queues()->Remove(page); |
| list_add_tail(ops.FreedList(this).List(), &page->queue_node); |
| } |
| } |
| |
| zx_status_t VmCowPages::CacheAllocPage(uint alloc_flags, vm_page_t** p, paddr_t* pa) { |
| if (!page_cache_) { |
| return pmm_alloc_page(alloc_flags, p, pa); |
| } |
| |
| zx::result result = page_cache_.Allocate(1, alloc_flags); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| |
| vm_page_t* page = list_remove_head_type(&result->page_list, vm_page_t, queue_node); |
| DEBUG_ASSERT(page != nullptr); |
| DEBUG_ASSERT(result->page_list.is_empty()); |
| |
| *p = page; |
| *pa = page->paddr(); |
| return ZX_OK; |
| } |
| |
| void VmCowPages::CacheFree(list_node_t* list) { |
| if (!page_cache_) { |
| pmm_free(list); |
| return; |
| } |
| |
| page_cache_.Free(ktl::move(*list)); |
| } |
| |
| void VmCowPages::CacheFree(vm_page_t* p) { |
| if (!page_cache_) { |
| pmm_free_page(p); |
| return; |
| } |
| |
| page_cache::PageCache::PageList list; |
| list_add_tail(&list, &p->queue_node); |
| |
| page_cache_.Free(ktl::move(list)); |
| } |
| |
| zx_status_t VmCowPages::MakePageFromReference(VmPageOrMarkerRef page_or_mark, |
| AnonymousPageRequest* page_request) { |
| DEBUG_ASSERT(page_or_mark->IsReference()); |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| DEBUG_ASSERT(compression); |
| |
| vm_page_t* p; |
| zx_status_t status = AllocPage(&p, page_request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| const auto ref = page_or_mark.SwapReferenceForPage(p); |
| void* page_data = paddr_to_physmap(p->paddr()); |
| uint32_t page_metadata; |
| compression->Decompress(ref, page_data, &page_metadata); |
| // Ensure the share count is propagated from the compressed page. |
| p->object.share_count = page_metadata; |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::ReplaceReferenceWithPageLocked(VmPageOrMarkerRef page_or_mark, |
| uint64_t offset, |
| AnonymousPageRequest* page_request) { |
| // First replace the ref with a page. |
| zx_status_t status = MakePageFromReference(page_or_mark, page_request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| // Add the new page to the page queues for tracking. References are by definition not pinned, so |
| // we know this is not wired. |
| SetNotPinnedLocked(page_or_mark->Page(), offset); |
| return ZX_OK; |
| } |
| |
| VmCowPages::VmCowPages(VmCowPagesOptions options, uint32_t pmm_alloc_flags, uint64_t size, |
| fbl::RefPtr<PageSource> page_source, |
| ktl::unique_ptr<DiscardableVmoTracker> discardable_tracker, |
| uint64_t lock_order) |
| : pmm_alloc_flags_(pmm_alloc_flags), |
| options_(options), |
| // If both local and shared locks are defined then there is still only one true lock, the shared |
| // one, with the local lock existing to increase the tracking ability of lockdep. The local lock |
| // therefore needs to be pointed at the shared lock to forward the actual locking actions. |
| #if VMO_USE_LOCAL_LOCK && VMO_USE_SHARED_LOCK |
| lock_(hierarchy_state_ptr_->lock()->lock()), |
| #endif |
| #if (LOCK_DEP_ENABLED_FEATURE_LEVEL > 0) |
| lock_order_(lock_order), |
| #endif |
| size_(size), |
| page_source_(ktl::move(page_source)), |
| discardable_tracker_(ktl::move(discardable_tracker)) { |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(size)); |
| // If we are tracking correct lock orders then add some asserts that nodes are created with lock |
| // orders that at least vaguely make sense. |
| #if (LOCK_DEP_ENABLED_FEATURE_LEVEL > 0) |
| // Nodes with a page source must always be the root, and have the respective lock order. |
| DEBUG_ASSERT(!page_source_ || lock_order_ == kLockOrderRoot); |
| // Hidden nodes must always have a lock order above the anonymous numbering area. |
| DEBUG_ASSERT(!is_hidden() || lock_order_ > kLockOrderFirstAnon); |
| // First anonymous nodes (i.e. not hidden and not with a direct page source) should fall into the |
| // anonymous numbering area. |
| DEBUG_ASSERT(page_source_ || is_hidden() || lock_order_ <= kLockOrderFirstAnon); |
| #endif |
| } |
| |
| void VmCowPages::TransitionToAliveLocked() { |
| ASSERT(life_cycle_ == LifeCycle::Init); |
| life_cycle_ = LifeCycle::Alive; |
| } |
| |
| fbl::RefPtr<VmCowPages> VmCowPages::MaybeDeadTransition() { |
| // We perform a dead transition if |should_dead_transition_locked| is true, but in order to do the |
| // transition we require holding multiple locks. Due to races with either other attempts at dead |
| // transitions, or other creation and deletions modifying the tree, we may need to attempt the |
| // lock acquisitions multiple times until we can get a stable snapshot. |
| // The purpose of acquiring all the locks here is to ensure that once we begin a dead transition |
| // we can continuously hold all the locks that lead to that decision (namely our own), otherwise |
| // we would need to reason about our state potentially changing mid way through after dropping our |
| // lock. |
| // The locks we need to be holding to do a dead transition are: our own, our parent (if we have |
| // one) and our sibling (if we have one). The sibling is a bit nuanced as we generally only want |
| // the right sibling (i.e. next in the parent's child list), which can be skipped if absent. The |
| // exception being when our parent is hidden and has exactly two children, in which case the left |
| // sibling is required to perform the hidden parent merge step. |
| while (true) { |
| fbl::RefPtr<VmCowPages> sibling_ref; |
| VmCowPages* parent_raw; |
| // Use a subscope as we potentially need to drop and then reacquire the locks. |
| { |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| // With the lock now held check if we even need to do a dead transition. |
| if (!should_dead_transition_locked()) { |
| return nullptr; |
| } |
| // If no parent, then there can be no sibling, so can just do the transition. |
| if (!parent_) { |
| return DeadTransitionLocked(LockedPtr(), LockedPtr()); |
| } |
| LockedPtr parent(parent_.get()); |
| // If we are the only child, then no need to check for siblings. |
| if (parent.locked().children_list_len_ == 1) { |
| return DeadTransitionLocked(ktl::move(parent), LockedPtr()); |
| } |
| // First check if there is a sibling to our right. |
| auto sibling_iter = ++parent.locked().children_list_.make_iterator(*this); |
| if (sibling_iter.IsValid()) { |
| // We found a sibling to our right, and so we can acquire its lock without dropping our own. |
| // However, we do need to drop the parent lock to do so. To do this we take a RefPtr to the |
| // sibling to ensure it stays alive, before dropping the parent lock, acquiring the sibling |
| // lock and reacquiring the parent lock. A new LockedPtr is used for the parent acquisition |
| // simply to allow the default destruction order to correctly release the locks in order. |
| sibling_ref = fbl::MakeRefPtrUpgradeFromRaw(&*sibling_iter, parent.locked().lock()); |
| parent.release(); |
| LockedPtr sibling = LockedPtr(sibling_ref.get(), lock_order() + 1); |
| LockedPtr parent2(parent_.get()); |
| // We have continuously held our lock, so we know that parent_ is unchanged for us, but |
| // check if this is still our sibling or not by recalculating and comparing. |
| sibling_iter = ++parent2.locked().children_list_.make_iterator(*this); |
| if (!sibling_iter.IsValid() || sibling.get() != &*sibling_iter) { |
| // We raced and this sibling has gone away. For simplicity we just try again from the top. |
| continue; |
| } |
| return DeadTransitionLocked(parent2, sibling); |
| } |
| // There is no right sibling, so check if we need to get the left sibling. The left sibling is |
| // needed only if the parent is hidden and we are one of exactly two children. |
| if (!parent->is_hidden() || parent.locked().children_list_len_ != 2) { |
| return DeadTransitionLocked(parent, LockedPtr()); |
| } |
| // Create a RefPtr to hold the sibling alive and stash the current raw value of parent_ (so we |
| // can detect any races later) then drop all the locks. |
| sibling_ref = fbl::MakeRefPtrUpgradeFromRaw(&parent.locked().children_list_.front(), |
| parent.locked().lock()); |
| DEBUG_ASSERT(sibling_ref.get() != this); |
| parent_raw = parent_.get(); |
| } |
| |
| // Reacquire the locks, sibling first as it is to the 'left' in list order. |
| LockedPtr sibling = LockedPtr(sibling_ref.get()); |
| // We could have the same lock order as our sibling, so we use the gap in the lock orders to |
| // acquire. |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), sibling_ref->lock_order() + 1}; |
| // With our lock reacquired, check that this still needs a dead transition, as it could already |
| // have been done by someone else. |
| if (!should_dead_transition_locked()) { |
| return nullptr; |
| } |
| |
| // With both us and our sibling locked check that they are indeed still our sibling by ensuring |
| // we both have the same original parent. This check failing would imply that our sibling got |
| // dead transitioned and we merged with the parent. We might still need a dead transition, but |
| // the locks we need are now all different so we just retry from the top. |
| if (parent_.get() != parent_raw || sibling.locked().parent_.get() != parent_raw) { |
| continue; |
| } |
| LockedPtr parent(parent_.get()); |
| // Even if the parent didn't change, it could have gained new children and we might need to |
| // acquire a right sibling instead. For simplicity just retry. |
| if (parent.locked().children_list_len_ != 2) { |
| continue; |
| } |
| |
| return DeadTransitionLocked(parent, sibling); |
| } |
| } |
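| |
| // Illustrative sketch (an assumed calling pattern, not a definitive contract): since a dead |
| // transition can return a parent that itself now needs cleaning, a caller would typically loop |
| // until no further node is returned, e.g. |
| // |
| //   fbl::RefPtr<VmCowPages> next = cow->MaybeDeadTransition(); |
| //   while (next) { |
| //     next = next->MaybeDeadTransition(); |
| //   } |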
| |
| fbl::RefPtr<VmCowPages> VmCowPages::DeadTransitionLocked(const LockedPtr& parent, |
| const LockedPtr& sibling) { |
| canary_.Assert(); |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Alive); |
| // Change our life cycle to the dying state so that if we need to drop the lock no other attempts |
| // are made at performing a DeadTransition. |
| life_cycle_ = LifeCycle::Dying; |
| |
| // Close any PageSource. It does not matter if we do this before or after removing the pages, as |
| // we hold the lock continuously, but it makes more sense (and is slightly more efficient for the |
| // PhysicalPageProvider) to notify the close before. |
| if (page_source_) { |
| page_source_->Close(); |
| } |
| |
| // To prevent races with a hidden parent creation or merging, it is necessary to hold the lock |
| // over the is_hidden and parent_ check and into the subsequent removal call. |
| |
| // At the point of destruction we should no longer have any mappings or children still |
| // referencing us, and by extension our priority count must therefore be back to zero. |
| DEBUG_ASSERT(high_priority_count_ == 0); |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| fbl::RefPtr<VmCowPages> deferred; |
| |
| // If we're not a hidden vmo then we need to remove ourselves from our parent and free any pages |
| // that we own. |
| if (!is_hidden()) { |
| // Clear out all content that we can see. This means dropping references to any pages in our |
| // parents, as well as removing any pages in our own page list. |
| __UNINITIALIZED ScopedPageFreedList freed_list; |
| ReleaseOwnedPagesLocked(0, parent, freed_list); |
| freed_list.FreePages(this); |
| |
| DEBUG_ASSERT(parent.get() == parent_.get()); |
| if (parent_) { |
| parent.locked().RemoveChildLocked(this, sibling); |
| |
| // We removed a child from the parent, and so it may also need to be cleaned. |
| // Avoid recursing destructors and dead transitions when we delete our parent by using the |
| // deferred deletion method, i.e. return the parent_ and have the caller call dead transition |
| // on it. |
| deferred = ktl::move(parent_); |
| } else { |
| // If we had a parent then RemoveChildLocked would have cleaned up any cursors, but otherwise |
| // we must erase from any lists. As we have no parent and cannot have children the root and |
| // current cursor list must be equivalent, and so only need to process one. |
| TreeWalkCursor::Erase(root_cursor_list_, this); |
| } |
| } else { |
| // Most of the hidden vmo's state should have already been cleaned up when it merged |
| // itself into its child in ::RemoveChildLocked. |
| DEBUG_ASSERT(children_list_len_ == 0); |
| DEBUG_ASSERT(page_list_.HasNoPageOrRef()); |
| DEBUG_ASSERT(!parent_); |
| } |
| |
| DEBUG_ASSERT(page_list_.IsEmpty()); |
| DEBUG_ASSERT(root_cursor_list_.is_empty()); |
| DEBUG_ASSERT(cur_cursor_list_.is_empty()); |
| |
| // Due to the potential lock dropping earlier double check our life_cycle_ is what we expect. |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Dying); |
| life_cycle_ = LifeCycle::Dead; |
| return deferred; |
| } |
| |
| VmCowPages::~VmCowPages() { |
| // Most of the explicit cleanup happens in DeadTransition() with asserts and some remaining |
| // cleanup happening here in the destructor. |
| canary_.Assert(); |
| DEBUG_ASSERT(page_list_.HasNoPageOrRef()); |
| // A cow pages can only be destructed if it is either still in the Init state, suggesting that |
| // something went wrong with completing construction, or if it is fully in the Dead state; nothing |
| // in between. |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Init || life_cycle_ == LifeCycle::Dead); |
| // The discardable tracker is unlinked explicitly in the destructor to ensure that no RefPtrs can |
| // be constructed to the VmCowPages from here. See comment in |
| // DiscardableVmoTracker::DebugDiscardablePageCounts that depends upon this being here instead of |
| // during the dead transition. |
| if (discardable_tracker_) { |
| Guard<CriticalMutex> guard{lock()}; |
| discardable_tracker_->assert_cow_pages_locked(); |
| discardable_tracker_->RemoveFromDiscardableListLocked(); |
| } |
| } |
| |
| template <typename T> |
| zx_status_t VmCowPages::ForEveryOwnedHierarchyPageInRangeLocked(T func, uint64_t offset, |
| uint64_t size, |
| const LockedPtr& parent) const { |
| return ForEveryOwnedHierarchyPageInRange<const VmPageOrMarker*>(const_cast<VmCowPages*>(this), |
| func, offset, size, parent); |
| } |
| |
| template <typename T> |
| zx_status_t VmCowPages::ForEveryOwnedMutableHierarchyPageInRangeLocked(T func, uint64_t offset, |
| uint64_t size, |
| const LockedPtr& parent) { |
| return ForEveryOwnedHierarchyPageInRange<VmPageOrMarkerRef>(this, func, offset, size, parent); |
| } |
| |
| template <typename T> |
| zx_status_t VmCowPages::RemoveOwnedHierarchyPagesInRangeLocked(T func, uint64_t offset, |
| uint64_t size, |
| const LockedPtr& parent) { |
| return ForEveryOwnedHierarchyPageInRange<VmPageOrMarker*>(this, func, offset, size, parent); |
| } |
| |
| template <typename P, typename S, typename T> |
| zx_status_t VmCowPages::ForEveryOwnedHierarchyPageInRange(S* self, T func, uint64_t offset, |
| uint64_t size, const LockedPtr& parent) { |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(size)); |
| |
| uint64_t start_in_self = offset; |
| uint64_t end_in_self = CheckedAdd(offset, size); |
| uint64_t start_in_cur = start_in_self; |
| uint64_t end_in_cur = end_in_self; |
| |
| LockedParentWalker walker(parent); |
| |
| while (start_in_self < end_in_self) { |
| // We attempt to always inline these lambdas, as it's a huge performance benefit and has minimal |
| // impact on code size. |
| bool stopped_early = false; |
| uint64_t parent_content_start = UINT64_MAX; |
| uint64_t parent_content_end = 0; |
| auto page_callback = [&](auto p, uint64_t page_offset) __ALWAYS_INLINE { |
| AssertHeld(self->lock_ref()); |
| uint64_t cur_to_self = start_in_cur - start_in_self; |
| // If we had started tracking a run of contiguous parent content then we must walk up once it |
| // stops, either due to a gap or a switch to some other entry type. |
| if (parent_content_end != 0 && (page_offset != parent_content_end || !p->IsParentContent())) { |
| return ZX_ERR_STOP; |
| } |
| if (p->IsParentContent()) { |
| // ParentContent markers can exist spuriously (see explanation on |
| // tree_has_parent_content_markers) and so only consider walking up if within the |
| // parent_limit_. |
| if (page_offset < walker.current(self).parent_limit_) { |
| // Either adding to or starting a new contiguous parent content run. |
| parent_content_start = ktl::min(parent_content_start, page_offset); |
| parent_content_end = page_offset + PAGE_SIZE; |
| } |
| return ZX_ERR_NEXT; |
| } |
| zx_status_t status = func(p, &walker.current(self), page_offset - cur_to_self, page_offset); |
| if (status == ZX_ERR_STOP) { |
| stopped_early = true; |
| } |
| return status; |
| }; |
| auto gap_callback = [&](uint64_t gap_start_offset, uint64_t gap_end_offset) __ALWAYS_INLINE { |
| // The gap is empty, so walk up if the parent is accessible from any part of it. |
| // Mark the range immediately preceding the gap as processed. |
| AssertHeld(self->lock_ref()); |
| // Gaps will never be considered on nodes that have parent content markers, so should never be |
| // in the middle of calculating a parent content run. |
| DEBUG_ASSERT(parent_content_end == 0); |
| if (gap_start_offset < walker.current(self).parent_limit_) { |
| parent_content_start = gap_start_offset; |
| parent_content_end = gap_end_offset; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }; |
| |
| zx_status_t status = ZX_OK; |
| if (walker.current(self).is_parent_hidden_locked() && |
| start_in_cur < walker.current(self).parent_limit_ && |
| !walker.current(self).node_has_parent_content_markers()) { |
| // We can see into a hidden parent, and cannot use content markers to optimize the walk up, so |
| // need to consider any gaps. |
| if constexpr (ktl::is_same_v<P, VmPageOrMarker*>) { |
| status = walker.current(self).page_list_.RemovePagesAndIterateGaps( |
| page_callback, gap_callback, start_in_cur, end_in_cur); |
| } else if constexpr (ktl::is_same_v<P, VmPageOrMarkerRef>) { |
| status = walker.current(self).page_list_.ForEveryPageAndGapInRangeMutable( |
| page_callback, gap_callback, start_in_cur, end_in_cur); |
| } else { |
| status = walker.current(self).page_list_.ForEveryPageAndGapInRange( |
| page_callback, gap_callback, start_in_cur, end_in_cur); |
| } |
| } else { |
| // Either we cannot see into a hidden parent, or we are able to utilize parent content |
| // markers, and so do not need to consider gaps and can just directly process the pages. |
| if constexpr (ktl::is_same_v<P, VmPageOrMarker*>) { |
| status = |
| walker.current(self).page_list_.RemovePages(page_callback, start_in_cur, end_in_cur); |
| } else if constexpr (ktl::is_same_v<P, VmPageOrMarkerRef>) { |
| status = walker.current(self).page_list_.ForEveryPageInRangeMutable( |
| page_callback, start_in_cur, end_in_cur); |
| } else { |
| status = walker.current(self).page_list_.ForEveryPageInRange(page_callback, start_in_cur, |
| end_in_cur); |
| } |
| } |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // If the page callback wanted to stop early, then do so. |
| if (stopped_early) { |
| return ZX_OK; |
| } |
| |
| if (parent_content_end != 0) { |
| // If we found a run of parent content, either via parent content markers or from a gap, then |
| // we need to walk up and look for it. |
| start_in_self += parent_content_start - start_in_cur; |
| start_in_cur = parent_content_start + walker.current(self).parent_offset_; |
| end_in_cur = ktl::min(parent_content_end, walker.current(self).parent_limit_) + |
| walker.current(self).parent_offset_; |
| walker.WalkUp(self); |
| } else { |
| // Otherwise no walk up is needed; mark the entire range as processed and begin another walk |
| // from `self` for the remainder of the range. |
| start_in_self += end_in_cur - start_in_cur; |
| start_in_cur = start_in_self; |
| end_in_cur = end_in_self; |
| walker.reset(); |
| } |
| } |
| |
| return ZX_OK; |
| } |
| |
| // Walks |this| and all of its descendants in a preorder traversal. Stops if visit returns |
| // anything other than ZX_OK. |
| zx_status_t VmCowPages::DebugForEachDescendant( |
| fit::function<zx_status_t(VmCowPages* cow, uint depth)> visit) { |
| auto cursor = TreeWalkCursor{LockedPtr(this)}; |
| |
| do { |
| AssertHeld(cursor.GetCur()->lock_ref()); |
| int32_t approx_depth = cursor.DebugGetDepth(); |
| uint32_t depth = (approx_depth < 0) ? 0 : approx_depth; |
| auto status = visit(cursor.GetCur().get(), depth); |
| |
| if (status != ZX_OK) { |
| return status; |
| } |
| } while (cursor.NextChild()); |
| |
| return ZX_OK; |
| } |
| |
| bool VmCowPages::DedupZeroPage(vm_page_t* page, uint64_t offset) { |
| canary_.Assert(); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{lock()}; |
| |
| // Forbid zero page deduping if this is high priority. |
| if (high_priority_count_ != 0) { |
| return false; |
| } |
| |
| // The VmObjectPaged could have been destroyed, or this could be a hidden node. Check if the |
| // paged_ref_ is valid first. |
| if (paged_ref_) { |
| if (!paged_backlink_locked(this)->CanDedupZeroPagesLocked()) { |
| return false; |
| } |
| } |
| |
| // Check this page is still a part of this VMO. object.page_offset could be wrong, but there's no |
| // harm in looking up a random slot as we'll then notice it's the wrong page. |
| // Also ignore any references since we cannot efficiently scan them, and they should presumably |
| // already be deduped. |
| // Pinned pages cannot be decommitted and so must not be deduped. We must also not decommit |
| // pages from kernel VMOs, as the kernel cannot fault them back in, but all kernel pages will be |
| // pinned. |
| VmPageOrMarkerRef page_or_marker = page_list_.LookupMutable(offset); |
| if (!page_or_marker || !page_or_marker->IsPage() || page_or_marker->Page() != page || |
| page->object.pin_count > 0 || (is_page_dirty_tracked(page) && !is_page_clean(page))) { |
| return false; |
| } |
| |
| // We expect most pages not to be zero, so we will first do a 'racy' zero page check where we |
| // leave write permissions on the page. If the page isn't zero, which is the expected case, then |
| // we haven't paid the price of modifying page tables. |
| if (!IsZeroPage(page_or_marker->Page())) { |
| return false; |
| } |
| |
| RangeChangeUpdateLocked(VmCowRange(offset, PAGE_SIZE), RangeChangeOp::RemoveWrite, nullptr); |
| // No range change needs to be processed for the children since children, by virtue of being |
| // copy-on-write, cannot have a writable mapping. |
| |
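| // Re-check now that write permissions have been removed; if the page is still zero then, with |
| // no writable mappings remaining and the VMO lock held, it can safely be replaced with zero |
| // content. |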
| if (IsZeroPage(page_or_marker->Page())) { |
| VmPageOrMarker old_page; |
| |
| if (node_has_parent_content_markers()) { |
| // If using parent content markers then we do not need to, and are not permitted to, insert a |
| // regular marker. Instead just clear the slot, which indicates zero content regardless of any |
| // parents above us. |
| RangeChangeUpdateLocked(VmCowRange(offset, PAGE_SIZE), RangeChangeOp::Unmap, &deferred); |
| old_page = page_list_.RemoveContent(offset); |
| } else { |
| // Replace the slot with a marker. |
| __UNINITIALIZED auto result = |
| BeginAddPageWithSlotLocked(offset, page_or_marker, CanOverwriteContent::NonZero); |
| DEBUG_ASSERT(result.is_ok()); |
| old_page = CompleteAddPageLocked(*result, VmPageOrMarker::Marker(), ParentContent::Unknown, |
| &deferred); |
| } |
| DEBUG_ASSERT(old_page.IsPage()); |
| |
| // Free the old page. |
| vm_page_t* released_page = old_page.ReleasePage(); |
| RemovePageLocked(released_page, deferred); |
| |
| reclamation_event_count_++; |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return true; |
| } |
| return false; |
| } |
| |
| zx_status_t VmCowPages::Create(VmCowPagesOptions options, uint32_t pmm_alloc_flags, uint64_t size, |
| ktl::unique_ptr<DiscardableVmoTracker> discardable_tracker, |
| fbl::RefPtr<VmCowPages>* cow_pages) { |
| DEBUG_ASSERT(!(options & VmCowPagesOptions::kInternalOnlyMask)); |
| fbl::AllocChecker ac; |
| auto cow = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages(options, pmm_alloc_flags, size, nullptr, |
| ktl::move(discardable_tracker), |
| kLockOrderFirstAnon)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| if (cow->discardable_tracker_) { |
| cow->discardable_tracker_->InitCowPages(cow.get()); |
| } |
| |
| *cow_pages = ktl::move(cow); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CreateExternal(fbl::RefPtr<PageSource> src, VmCowPagesOptions options, |
| uint64_t size, fbl::RefPtr<VmCowPages>* cow_pages) { |
| DEBUG_ASSERT(!(options & VmCowPagesOptions::kInternalOnlyMask)); |
| fbl::AllocChecker ac; |
| auto cow = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| options, PMM_ALLOC_FLAG_CAN_WAIT, size, ktl::move(src), nullptr, kLockOrderRoot)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| *cow_pages = ktl::move(cow); |
| return ZX_OK; |
| } |
| |
| void VmCowPages::ReplaceChildLocked(VmCowPages* old, VmCowPages* new_child) { |
| canary_.Assert(); |
| |
| [[maybe_unused]] VmCowPages* replaced = children_list_.replace(*old, new_child); |
| DEBUG_ASSERT(replaced == old); |
| } |
| |
| void VmCowPages::DropChildLocked(VmCowPages* child) { |
| canary_.Assert(); |
| |
| [[maybe_unused]] VmCowPages* erased = children_list_.erase(*child); |
| DEBUG_ASSERT(erased == child); |
| DEBUG_ASSERT(children_list_len_ > 0); |
| --children_list_len_; |
| } |
| |
| void VmCowPages::AddChildLocked(VmCowPages* child, uint64_t offset, uint64_t parent_limit) { |
| canary_.Assert(); |
| |
| // This function must succeed, as failure here requires the caller to roll back allocations. |
| |
| // The child should definitely stop seeing into the parent at the limit of its size. |
| DEBUG_ASSERT(parent_limit <= child->size_); |
| // The child's offsets must not overflow when projected onto the root. |
| // Callers should validate this externally and report errors as appropriate. |
| const uint64_t root_parent_offset = CheckedAdd(offset, root_parent_offset_); |
| CheckedAdd(root_parent_offset, child->size_); |
| |
| // Write in the parent view values. |
| child->root_parent_offset_ = root_parent_offset; |
| child->parent_offset_ = offset; |
| child->parent_limit_ = parent_limit; |
| |
| // The child's page list should skew by the child's offset relative to the parent. This allows |
| // fast copies of page list entries when merging the lists later (entire blocks of entries can be |
| // copied at once). |
| child->page_list_.InitializeSkew(page_list_.GetSkew(), offset); |
| |
| // If the child has a non-zero high priority count, then it is counting as an incoming edge to our |
| // count. |
| if (child->high_priority_count_ > 0) { |
| ChangeSingleHighPriorityCountLocked(1); |
| } |
| |
| child->parent_ = fbl::RefPtr(this); |
| children_list_.push_front(child); |
| children_list_len_++; |
| } |
| |
| VmCowPages::ParentAndRange VmCowPages::FindParentAndRangeForCloneLocked( |
| uint64_t offset, uint64_t size, bool parent_must_be_hidden) { |
| DEBUG_ASSERT(!is_hidden()); |
| |
| // The clone's parent limit starts out equal to its size, but it can't exceed the parent's size. |
| // This ensures that any clone pages beyond the parent's range get initialized from zeroes. |
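| // For example (assuming ClampedLimit caps the limit so that offset + limit does not exceed the |
| // final argument): a 4-page clone created at offset 2 pages into a 5-page parent would start |
| // with a parent limit of 3 pages, and its final page would read as zeroes. |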
| uint64_t parent_limit = ClampedLimit(offset, size, size_); |
| |
| LockedPtr parent; |
| LockedPtr grandparent; |
| |
| // Walk up the hierarchy until we find the last node which can correctly be the clone's parent. |
| while (VmCowPages* next_parent = parent.locked_or(this).parent_.get()) { |
| grandparent = LockedPtr(next_parent); |
| |
| // `parent` will always satisfy `parent_must_be_hidden` at this point. |
| // |
| // If `next_parent` doesn't satisfy `parent_must_be_hidden` then we must use `parent` as the |
| // clone's parent, even if it doesn't have any pages for the clone to snapshot. |
| if (parent_must_be_hidden && !next_parent->is_hidden()) { |
| break; |
| } |
| |
| // If `parent` owns any pages in the clone's range then we must use it as the clone's parent. |
| // If we continued iterating, the clone couldn't snapshot all ancestor pages that it would be |
| // able to if `this` had been the parent. |
| // This will specifically walk through any parent content markers, since they indicate the |
| // presence of content *above* this node, not held specifically by this node. |
| if (parent_limit > 0 && parent.locked_or(this).page_list_.AnyOwnedPagesOrIntervalsInRange( |
| offset, offset + parent_limit)) { |
| break; |
| } |
| |
| // Before the loop the caller validated that the clone's offsets cannot overflow when projected |
| // onto the root. Verify this will remain true. |
| // |
| // Each iteration of this loop must leave the clone's ultimate `root_parent_offset_` unchanged. |
| // We will increase the clone's `offset` by the current parent's `parent_offset_` but the new |
| // parent's `root_parent_offset_` is smaller by the same amount. |
| DEBUG_ASSERT(CheckedAdd(grandparent.locked().root_parent_offset_, |
| parent.locked_or(this).parent_offset_) == |
| parent.locked_or(this).root_parent_offset_); |
| |
| // To move to `next_parent` we need to translate the clone's window to be relative to it. |
| // |
| // The clone's last visible offset into `next_parent` cannot exceed `parent`'s parent limit, as |
| // it shouldn't be able to see more pages than it could see if `parent` had been the parent. |
| parent_limit = ClampedLimit(offset, parent_limit, parent.locked_or(this).parent_limit_); |
| offset = CheckedAdd(parent.locked_or(this).parent_offset_, offset); |
| |
| parent = ktl::move(grandparent); |
| } |
| |
| return ParentAndRange{ktl::move(parent), ktl::move(grandparent), offset, parent_limit, size}; |
| } |
| |
| zx::result<VmCowPages::LockedRefPtr> VmCowPages::CloneNewHiddenParentLocked( |
| uint64_t offset, uint64_t limit, uint64_t size, VmPageList&& initial_page_list, |
| const LockedPtr& parent) { |
| canary_.Assert(); |
| |
| const VmCowPagesOptions options = inheritable_options(); |
| |
| fbl::AllocChecker ac; |
| LockedRefPtr cow_clone; |
| // Use a sub-scope to limit visibility of cow_clone_ref as it's just a temporary. |
| { |
| auto cow_clone_ref = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| options, pmm_alloc_flags_, size, nullptr, nullptr, kLockOrderFirstAnon)); |
| if (!ac.check()) { |
| return zx::error(ZX_ERR_NO_MEMORY); |
| } |
| // As this node was just constructed we know the lock is free, use one of the lock order gap |
| // values to acquire without a lockdep violation. If we have a parent, and hence hold its lock, |
| // then we must set the lock order after it. |
| DEBUG_ASSERT(parent_.get() == parent.get()); |
| const uint64_t order = (parent ? parent->lock_order() : lock_order()) + 1; |
| cow_clone = LockedRefPtr(ktl::move(cow_clone_ref), order); |
| } |
| |
| DEBUG_ASSERT(!is_hidden()); |
| // If `this` is to supply content to the new child then that content must live in a hidden node. |
| // That requires creating a new hidden node and rotating `this` to be its child. |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Alive); |
| DEBUG_ASSERT(children_list_len_ == 0); |
| |
| // Invalidate everything, both the pages the clone will and will not be able to see. As hidden |
| // nodes are immutable, even for pages that the clone cannot see we want |this|, once it is a |
| // child of the hidden node, to move them back out before modifying them. |
| // Note: We could eagerly move those pages back into |this| instead. |
| // Bi-directional clones may not themselves already have children, so we can assume their |
| // absence here when performing the range update. |
| RangeChangeUpdateLocked(VmCowRange(0, size_), RangeChangeOp::RemoveWrite, nullptr); |
| |
| LockedRefPtr hidden_parent; |
| // Use a sub-scope to limit visibility of hidden_parent_ref as it's just a temporary. |
| { |
| // The lock order for a new hidden parent is either derived from its parent or, if there is no |
| // parent, starts at kLockOrderRoot. Cow creation rules state that our parent is either hidden |
| // or a root node with a page source, ensuring that the derived lock order will still be in the |
| // hidden range. |
| DEBUG_ASSERT(!parent_ || parent_->is_hidden() || parent_->page_source_); |
| const uint64_t hidden_lock_order = |
| parent_ ? parent_->lock_order() - kLockOrderDelta : kLockOrderRoot; |
| auto hidden_parent_ref = fbl::AdoptRef<VmCowPages>( |
| new (&ac) VmCowPages(options | VmCowPagesOptions::kHidden, pmm_alloc_flags_, size_, nullptr, |
| nullptr, hidden_lock_order)); |
| if (!ac.check()) { |
| return zx::error(ZX_ERR_NO_MEMORY); |
| } |
| // If we have a parent (which will become the parent of the new hidden node) then since its |
| // lock is already acquired we cannot acquire the new hidden parent using its normal lock |
| // order. As we just created this node we know that no one else can be acquiring it, so we use |
| // the gap in the regular lock orders, taking into account that the new leaf node was already |
| // acquired into the same gap. |
| const uint64_t order = parent ? parent->lock_order() + 2 : hidden_parent_ref->lock_order(); |
| hidden_parent = LockedRefPtr(ktl::move(hidden_parent_ref), order); |
| } |
| |
| // Create a temporary page list to collect the parent content markers we might need to make. |
| // This will eventually become our page_list_, but not until we've updated the backlinks and |
| // moved the current page_list_ into the hidden parent. |
| VmPageList temp_list; |
| temp_list.InitializeSkew(page_list_.GetSkew(), 0); |
| |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| zx_status_t status = ZX_OK; |
| |
| { |
| __UNINITIALIZED BatchPQUpdateBacklink page_backlink_updater(&hidden_parent.locked()); |
| status = page_list_.RemovePages( |
| [&](VmPageOrMarker* p, uint64_t offset) { |
| if (tree_has_parent_content_markers()) { |
| // If the tree uses parent content markers then, since we are a leaf node, we know that there |
| // can be no markers and no intervals, hence this is either content or a parent content |
| // marker. In either case we need to retain a ParentContent marker in |this|, and since |
| // the page list being iterated will be moved into |hidden_parent|, add a slot to the |
| // |temp_list|. |
| DEBUG_ASSERT(node_has_parent_content_markers()); |
| DEBUG_ASSERT(p->IsParentContent() || p->IsPageOrRef()); |
| auto [slot, _] = |
| temp_list.LookupOrAllocate(offset, VmPageList::IntervalHandling::NoIntervals); |
| if (!slot) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| *slot = VmPageOrMarker::ParentContent(); |
| if (p->IsParentContent()) { |
| // Hidden nodes do not themselves have parent content markers; as we have effectively moved |
| // this marker to ourselves via |temp_list|, we can clear this slot and continue. |
| *p = VmPageOrMarker::Empty(); |
| return ZX_ERR_NEXT; |
| } |
| } |
| if (p->IsReference()) { |
| // A regular reference we can move, a temporary reference we need to turn back into |
| // its page so we can move it. To determine if we have a temporary reference we can |
| // just attempt to move it, and if it was a temporary reference we will get a page |
| // returned. |
| if (auto maybe_page = MaybeDecompressReference(compression, p->Reference())) { |
| // For simplicity, since this is a very uncommon edge case, just update the page in |
| // place in this page list, then move it as a regular page. |
| AssertHeld(lock_ref()); |
| SetNotPinnedLocked(*maybe_page, offset); |
| VmPageOrMarker::ReferenceValue ref = p->SwapReferenceForPage(*maybe_page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } |
| // Not an else-if to intentionally perform this if the previous block turned a reference |
| // into a page. |
| if (p->IsPage()) { |
| page_backlink_updater.Push(p->Page(), offset); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| 0, size_); |
| |
| page_backlink_updater.Flush(); |
| } |
| |
| // On error we need to roll back any partial modifications. |
| if (status != ZX_OK) { |
| DEBUG_ASSERT_MSG(status == ZX_ERR_NO_MEMORY, "status: %d", status); |
| // Reset all the backlinks back to |this|. For any backlinks that hadn't yet been moved this is |
| // a harmless no-op. |
| __UNINITIALIZED BatchPQUpdateBacklink page_backlink_updater(this); |
| page_list_.ForEveryPage([&](const VmPageOrMarker* p, uint64_t offset) { |
| if (p->IsPage()) { |
| page_backlink_updater.Push(p->Page(), offset); |
| } |
| return ZX_ERR_NEXT; |
| }); |
| // Need to put back any ParentContent markers we had deleted. |
| temp_list.MergeRangeOntoAndClear( |
| [](VmPageOrMarker* src, VmPageOrMarker* dst, uint64_t) { |
| // The only items in temp_list are parent content markers we just put in. |
| DEBUG_ASSERT(src->IsParentContent()); |
| // If dst is empty then it used to hold a ParentContent marker, but we deleted it, so put |
| // it back. A non-empty dst we leave alone, as that indicates where we created a |
| // ParentContent marker for content that we did not modify, and hence do not need to roll |
| // back. |
| if (dst->IsEmpty()) { |
| *dst = ktl::move(*src); |
| } |
| }, |
| page_list_, 0, size_); |
| // temp_list just contains ParentContent markers, which can be safely dropped. |
| return zx::error(status); |
| } |
| |
| // Move our pagelist before adding ourselves as its child, because we cannot be added as a child |
| // unless we have no pages. |
| hidden_parent.locked().page_list_ = ktl::move(page_list_); |
| |
| hidden_parent.locked().TransitionToAliveLocked(); |
| |
| // If the current object is not the root of the tree, then we need to replace ourselves in our |
| // parent's child list with the new hidden node before we can become its child. |
| if (parent_) { |
| DEBUG_ASSERT(parent && parent.get() == parent_.get()); |
| // Copy the offsets and limits from the current node to the newly created parent. |
| // This logic is similar to AddChildLocked, except that we don't need to recompute these |
| // values. |
| hidden_parent.locked().root_parent_offset_ = root_parent_offset_; |
| hidden_parent.locked().parent_offset_ = parent_offset_; |
| hidden_parent.locked().parent_limit_ = parent_limit_; |
| |
| // We do not need to set high_priority_count_ because the call to AddChildLocked below |
| // will initialize high_priority_count_ for hidden_parent. |
| |
| parent.locked().ReplaceChildLocked(this, hidden_parent.get()); |
| hidden_parent.locked().parent_ = ktl::move(parent_); |
| |
| // We have lost our parent, which means we could now be violating the invariant that |
| // parent_limit_ being non-zero implies we have a parent. In practice this assignment |
| // shouldn't matter because we are about to add ourselves as a child of `hidden_parent`. |
| parent_offset_ = parent_limit_ = 0; |
| } |
| |
| // Add the children and then populate their initial page lists. |
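| // |this| keeps its full view of the hidden parent, while the clone only sees the parent range |
| // [offset, offset + limit). |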
| hidden_parent.locked().AddChildLocked(this, 0, size_); |
| hidden_parent.locked().AddChildLocked(&cow_clone.locked(), offset, limit); |
| DEBUG_ASSERT(temp_list.GetSkew() == page_list_.GetSkew()); |
| page_list_ = ktl::move(temp_list); |
| DEBUG_ASSERT(cow_clone.locked().page_list_.GetSkew() == initial_page_list.GetSkew()); |
| cow_clone.locked().page_list_ = ktl::move(initial_page_list); |
| |
| // Checking this node's hierarchy will also check the parent's hierarchy. |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| |
| return zx::ok(ktl::move(cow_clone)); |
| } |
| |
| zx::result<VmCowPages::LockedRefPtr> VmCowPages::CloneChildLocked(uint64_t offset, uint64_t limit, |
| uint64_t size, |
| VmPageList&& initial_page_list, |
| const LockedPtr& parent) { |
| canary_.Assert(); |
| |
| VmCowPagesOptions options = inheritable_options(); |
| |
| LockedRefPtr cow_clone; |
| // Use a sub-scope to limit visibility of cow_clone_ref as it's just a temporary. |
| { |
| fbl::AllocChecker ac; |
| // We are either constructing the first visible anonymous node in a chain, which gets |
| // kLockOrderFirstAnon, or this is part of a unidirectional clone chain and takes a lock order |
| // derived from ourselves. In full these possibilities are: |
| // * This is the user pager root (we have no parent and are not hidden); we are creating the |
| //   first visible anonymous node. |
| // * This is a hidden node; we are creating the first visible anonymous node. |
| // * This is part of a unidirectional clone chain (we have a parent and are not hidden); we are |
| //   creating a derived visible anonymous node. |
| // See comment above lock_order_ definition for more details. |
| const uint64_t clone_order = |
| (parent_ && !is_hidden()) ? lock_order() - kLockOrderDelta : kLockOrderFirstAnon; |
| auto cow_clone_ref = fbl::AdoptRef<VmCowPages>( |
| new (&ac) VmCowPages(options, pmm_alloc_flags_, size, nullptr, nullptr, clone_order)); |
| if (!ac.check()) { |
| return zx::error(ZX_ERR_NO_MEMORY); |
| } |
| // As this node was just constructed we know the lock is free, use one of the lock order gap |
| // values to acquire without a lockdep violation. If we have a parent, and hence hold its lock, |
| // then we must set the lock order after it. |
| DEBUG_ASSERT(parent_.get() == parent.get()); |
| cow_clone = |
| LockedRefPtr(ktl::move(cow_clone_ref), (parent ? parent->lock_order() : lock_order()) + 1); |
| } |
| |
| AddChildLocked(&cow_clone.locked(), offset, limit); |
| // If given a non-empty initial_page_list then place it in the clone. |
| if (!initial_page_list.IsEmpty()) { |
| DEBUG_ASSERT(cow_clone.locked().page_list_.GetSkew() == initial_page_list.GetSkew()); |
| cow_clone.locked().page_list_ = ktl::move(initial_page_list); |
| } |
| |
| // Checking this node's hierarchy will also check the parent's hierarchy. |
| // It will not check the child's page sharing however, so check that independently. |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_VALIDATION_ASSERT(cow_clone.locked().DebugValidatePageSharingLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(cow_clone.locked().DebugValidateVmoPageBorrowingLocked()); |
| |
| return zx::ok(ktl::move(cow_clone)); |
| } |
| |
| zx::result<VmCowPages::LockedRefPtr> VmCowPages::CreateCloneLocked(SnapshotType type, |
| bool require_unidirectional, |
| VmCowRange range, |
| DeferredOps& ops) { |
| canary_.Assert(); |
| |
| // When creating a clone the DeferredOps is not used beyond acting to serialize operations on |
| // pager backed hierarchies via the page_source_lock that it holds. For why this is important see |
| // the comments in ::Resize. |
| DEBUG_ASSERT(ops.self_ == this); |
| |
| LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, range.offset, range.len); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(!is_hidden()); |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| // A full snapshot is not compatible with there being a root page source. More specifically a |
| // full snapshot requires that there be no unidirectional clones in the tree, and this invariant |
| // is maintained by limiting unidirectional clones to only existing if there *is* a root page |
| // source. Any unidirectional clones in the tree would be able to introduce / modify content, |
| // which is not compatible with the notion of a full snapshot. |
| if (type == SnapshotType::Full && can_root_source_evict()) { |
| return zx::error(ZX_ERR_NOT_SUPPORTED); |
| } |
| |
| // Determine whether the snapshot type requires a bidirectional clone. |
| const bool require_bidirectional = [&]() TA_REQ(lock()) { |
| switch (type) { |
| case SnapshotType::Full: |
| // As per the above check, a full snapshot is incompatible with unidirectional clones, and |
| // so this type insists on bidirectional. |
| return true; |
| case SnapshotType::Modified: |
| // If there is a parent then a bidirectional clone is required in order to produce a |
| // snapshot of any of the pages we have modified with respect to our parent. In the absence |
| // of a parent there is no restriction. |
| return !!parent_; |
| case SnapshotType::OnWrite: |
| // Any kind of clone implements copy-on-write, so no restriction. |
| return false; |
| } |
| return false; |
| }(); |
| |
| // Offsets within the new clone must not overflow when projected onto the root. |
| { |
| uint64_t child_root_parent_offset; |
| bool overflow; |
| overflow = add_overflow(root_parent_offset_, range.offset, &child_root_parent_offset); |
| if (overflow) { |
| return zx::error(ZX_ERR_INVALID_ARGS); |
| } |
| uint64_t child_root_parent_end; |
| overflow = add_overflow(child_root_parent_offset, range.len, &child_root_parent_end); |
| if (overflow) { |
| return zx::error(ZX_ERR_INVALID_ARGS); |
| } |
| } |
| |
| if (require_bidirectional && require_unidirectional) { |
| return zx::error(ZX_ERR_NOT_SUPPORTED); |
| } |
| const bool unidirectional = !require_bidirectional && can_unidirectional_clone_locked(); |
| |
| // Only contiguous VMOs have a source that handles free, and those may not have cow clones made |
| // of them. Once there is a cow hierarchy, tracking exactly which node a page came from in order |
| // to free it is not performed, and it is therefore assumed that we do not need to free owned |
| // pages to their 'correct' object. |
| ASSERT(!is_source_handling_free()); |
| |
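| // A unidirectional clone hangs directly off the chosen ancestor and sees its content on demand, |
| // so no hidden node is created and no share counts need to be adjusted here. |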
| if (unidirectional) { |
| ParentAndRange child_range = FindParentAndRangeForCloneLocked(range.offset, range.len, false); |
| |
| return child_range.parent.locked_or(this).CloneChildLocked( |
| child_range.parent_offset, child_range.parent_limit, child_range.size, VmPageList(), |
| child_range.grandparent); |
| } |
| |
| if (require_unidirectional) { |
| return zx::error(ZX_ERR_NOT_SUPPORTED); |
| } |
| |
| // If this is non-zero, that means that there are pages which hardware can |
| // touch, so the vmo can't be safely cloned. |
| // TODO: consider immediately forking these pages. |
| if (pinned_page_count_locked()) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| |
| // For any content that we have partial or full ownership of in the range to be cloned, the |
| // child, regardless of which node it ends up hanging off, will gain partial ownership of said |
| // content. Therefore we first want to find all such content, incrementing the share counts, and |
| // populating a new page list with parent content markers if needed. |
| // We explicitly need to do this *before* walking up because, if using parent content markers, the |
| // content we are able to see is possibly determined by content markers in *this* node, even if we |
| // will be able to mechanically hang the new node higher up. |
| VmPageList page_list; |
| page_list.InitializeSkew(page_list_.GetSkew(), range.offset); |
| |
| // To account for any errors that result in needing to roll back we remember the range we have |
| // processed the share counts for. |
| uint64_t shared_end = range.offset; |
| auto rollback = fit::defer([this, &range, &shared_end, compression]() { |
| AssertHeld(lock_ref()); |
| |
| // Decrement the share count on all pages. As every page we can see is also owned by this, and |
| // we have continuously held our lock, no page should need to be freed as a result. |
| zx_status_t status = RemoveOwnedHierarchyPagesInRangeLocked( |
| [&](VmPageOrMarker* p, const VmCowPages* owner, uint64_t this_offset, |
| uint64_t owner_offset) { |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(page->object.share_count > 0); |
| page->object.share_count--; |
| } else if (p->IsReference()) { |
| const uint32_t share_count = compression->GetMetadata(p->Reference()); |
| DEBUG_ASSERT(share_count > 0); |
| compression->SetMetadata(p->Reference(), share_count - 1); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| range.offset, shared_end - range.offset, LockedPtr()); |
| DEBUG_ASSERT(status == ZX_OK); |
| }); |
| |
| // Update any share counts for content the clone will be able to see, and populate a temporary |
| // page list with any parent content markers if needed. |
| zx_status_t status = ForEveryOwnedMutableHierarchyPageInRangeLocked( |
| [&](VmPageOrMarkerRef p, VmCowPages* owner, uint64_t cow_clone_offset, |
| uint64_t owner_offset) { |
| if (tree_has_parent_content_markers() && p->IsPageOrRef()) { |
| const uint64_t off = cow_clone_offset - range.offset; |
| auto [slot, _] = |
| page_list.LookupOrAllocate(off, VmPageList::IntervalHandling::NoIntervals); |
| if (!slot) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| *slot = VmPageOrMarker::ParentContent(); |
| } |
| if (p->IsPage()) { |
| p->Page()->object.share_count++; |
| } else if (p->IsReference()) { |
| VmPageOrMarker::ReferenceValue ref = p->Reference(); |
| compression->SetMetadata(ref, compression->GetMetadata(ref) + 1); |
| } |
| shared_end = owner_offset + PAGE_SIZE; |
| |
| return ZX_ERR_NEXT; |
| }, |
| range.offset, range.len, LockedPtr()); |
| |
| if (status != ZX_OK) { |
| // However far we got is recorded in |shared_end|, and |rollback| will clean it up. |
| return zx::error(status); |
| } |
| |
| ParentAndRange child_range = FindParentAndRangeForCloneLocked(range.offset, range.len, true); |
| |
| // The bidirectional clone check requires looking at the parent of where we want to hang the |
| // node, which is represented by |child_range.grandparent|. |
| if (!can_bidirectional_clone_locked(child_range.grandparent)) { |
| return zx::error(ZX_ERR_NOT_SUPPORTED); |
| } |
| |
| // If we found a hidden node to be our parent, then we can just hang a new node under that, |
| // otherwise we need to also create a new hidden node to place this and the new child under. |
| auto result = child_range.parent.locked_or(this).is_hidden() |
| ? child_range.parent.locked().CloneChildLocked( |
| child_range.parent_offset, child_range.parent_limit, child_range.size, |
| ktl::move(page_list), child_range.grandparent) |
| : child_range.parent.locked_or(this).CloneNewHiddenParentLocked( |
| child_range.parent_offset, child_range.parent_limit, child_range.size, |
| ktl::move(page_list), child_range.grandparent); |
| // If everything went well then we can finally cancel the rollback and let the clone own the |
| // content we added the share counts for. |
| if (result.is_ok()) { |
| rollback.cancel(); |
| } |
| return result; |
| } |
| |
| void VmCowPages::RemoveChildLocked(VmCowPages* removed, const LockedPtr& sibling) { |
| canary_.Assert(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| // If we have a sibling to the right of the removed node then update any cursors to point there, |
| // otherwise find the next valid sibling starting from our parent, which we already hold the lock |
| // for. |
| const bool removed_left = removed == &children_list_.front(); |
| if (removed_left && sibling) { |
| TreeWalkCursor::MoveToSibling(removed->cur_cursor_list_, removed, &sibling.locked()); |
| } else { |
| TreeWalkCursor::MoveToSiblingOfParent(removed->cur_cursor_list_, removed, this); |
| } |
| // Moving the cursors should have implicitly cleared any root references since cursors can never |
| // be positioned outside their subtree. |
| DEBUG_ASSERT(removed->root_cursor_list_.is_empty()); |
| |
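| // A non-hidden node, or a hidden node with more than two children, can simply drop the removed |
| // child; no merging is required. |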
| if (!is_hidden() || children_list_len_ > 2) { |
| DropChildLocked(removed); |
| // Things should be consistent after dropping the child. |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return; |
| } |
| |
| // Hidden vmos have 0, 2 or more children. If we had more we would have already returned, and we |
| // cannot be here with 0 children, therefore we must have 2, including the one we are removing. |
| DEBUG_ASSERT(children_list_len_ == 2); |
| |
| // Merge any cursors into the remaining child. |
| TreeWalkCursor::MergeToChild(cur_cursor_list_, root_cursor_list_, this, &sibling.locked()); |
| |
| DropChildLocked(removed); |
| MergeContentWithChildLocked(); |
| |
| DEBUG_ASSERT(sibling.get() == &children_list_.front()); |
| |
| // The child which removed itself and led to the invocation should have a reference |
| // to us, in addition to child.parent_ which we are about to clear. |
| DEBUG_ASSERT(ref_count_debug() >= 2); |
| |
| // We can have a priority count of at most 1, and only if the remaining child is the one |
| // contributing to it. |
| DEBUG_ASSERT(high_priority_count_ == 0 || |
| (high_priority_count_ == 1 && sibling.locked().high_priority_count_ > 0)); |
| // Similarly if we have a priority count, and we have a parent, then our parent must have a |
| // non-zero count. |
| LockedPtr locked_parent; |
| if (parent_) { |
| locked_parent = LockedPtr(parent_.get()); |
| } |
| if (locked_parent) { |
| DEBUG_ASSERT(high_priority_count_ == 0 || locked_parent.locked().high_priority_count_ != 0); |
| } |
| // If our child has a non-zero count, then it is propagating a +1 count to us, and we in turn are |
| // propagating a +1 count to our parent. In the final arrangement after ReplaceChildLocked the |
| // +1 count the child was giving to us needs to go to the parent, but as we were already giving |
| // a +1 count to the parent, everything is correct. |
| // Although the final hierarchy has correct counts, there is still an assertion in our destructor |
| // that our count is zero, so subtract off any count that we might have. |
| ChangeSingleHighPriorityCountLocked(-high_priority_count_); |
| |
| // Drop the child from our list, but don't recurse back into this function. Then |
| // remove ourselves from the clone tree and dead transition ourselves. |
| DropChildLocked(&sibling.locked()); |
| if (locked_parent) { |
| locked_parent.locked().ReplaceChildLocked(this, &sibling.locked()); |
| } |
| sibling.locked().parent_ = ktl::move(parent_); |
| // We just removed our parent, and so we have no parent and no sibling. Performing this dead |
| // transition here ensures that we are not in an alive state, despite being detached from the |
| // rest of the tree. |
| fbl::RefPtr<VmCowPages> deferred = DeadTransitionLocked(LockedPtr(), LockedPtr()); |
| ASSERT(!deferred); |
| |
| // Things should be consistent after dropping one child and merging with the other. |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_VALIDATION_ASSERT(sibling.locked().DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(sibling.locked().DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| void VmCowPages::MergeContentWithChildLocked() { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(is_hidden()); |
| // There's no technical reason why this merging code cannot be run if there is a page source; |
| // however, a bi-directional clone will never have a page source, so in case there are any |
| // consequences that have not been considered, ensure we are not in this case. |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| DEBUG_ASSERT(children_list_len_ == 1); |
| |
| VmCowPages& child = children_list_.front(); |
| AssertHeld(child.lock_ref()); |
| // We don't check the hierarchy because it is inconsistent at this point. |
| // It will be made consistent by the caller and checked then. |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(child.DebugValidateVmoPageBorrowingLocked()); |
| |
| const uint64_t merge_start_offset = child.parent_offset_; |
| const uint64_t merge_end_offset = child.parent_offset_ + child.parent_limit_; |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| |
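| // Pages that move from this node into the child need their page queue backlinks updated to |
| // point at the child; batch those updates so they can be applied efficiently. |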
| __UNINITIALIZED BatchPQUpdateBacklink page_backlink_updater(&child); |
| page_list_.MergeRangeOntoAndClear( |
| [&](VmPageOrMarker* src, VmPageOrMarker* dst, uint64_t off) __ALWAYS_INLINE { |
| // Never overwrite any actual content in the destination. |
| if (dst->IsPageOrRef()) { |
| return; |
| } |
| // If using parent content markers then any marker we are moving from src can become an |
| // empty slot in the destination. We already know that dst does not have any page or ref so |
| // clearing dst is guaranteed to not delete content. |
| if (src->IsMarker() && child.node_has_parent_content_markers()) { |
| DEBUG_ASSERT(dst->IsEmpty() || dst->IsParentContent()); |
| *dst = VmPageOrMarker::Empty(); |
| return; |
| } |
| |
| // Either moving some content that the child was referring to in the parent from the parent |
| // into the child, or both parent and child ended up with a marker, in which case the move |
| // is a safe no-op. |
| DEBUG_ASSERT(dst->IsEmpty() || dst->IsParentContent() || |
| (dst->IsMarker() && src->IsMarker())); |
| if (src->IsReference()) { |
| // A regular reference we can move, a temporary reference we need to turn back into its |
| // page so we can move it. To determine if we have a temporary reference we can just |
| // attempt to move it, and if it was a temporary reference we will get a page returned. |
| if (auto maybe_page = MaybeDecompressReference(compression, src->Reference())) { |
| // For simplicity, since this is a very uncommon edge case, just update the page in |
| // place in this page list, then move it as a regular page. |
| AssertHeld(lock_ref()); |
| SetNotPinnedLocked(*maybe_page, off); |
| VmPageOrMarker::ReferenceValue ref = src->SwapReferenceForPage(*maybe_page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } |
| // Not an else-if to intentionally perform this if the previous block turned a reference |
| // into a page. |
| if (src->IsPage()) { |
| page_backlink_updater.Push(src->Page(), off); |
| } |
| *dst = ktl::move(*src); |
| }, |
| child.page_list_, merge_start_offset, merge_end_offset); |
| |
| page_backlink_updater.Flush(); |
| |
| // MergeRangeOntoAndClear clears out the page_list_ for us. |
| DEBUG_ASSERT(page_list_.IsEmpty()); |
| |
| // Adjust the child's offset and limit so it will still see the correct range after it replaces |
| // this node. The limit must be adjusted before the offset. |
| child.parent_limit_ = ClampedLimit(child.parent_offset_, child.parent_limit_, parent_limit_); |
| child.parent_offset_ = CheckedAdd(parent_offset_, child.parent_offset_); |
| |
| // The child's last visible offset into this node's parent must be no larger than this node's last |
| // visible offset, unless the child can't see anything in this node's parent - in which case its |
| // limit will be 0. |
| DEBUG_ASSERT(child.parent_limit_ == 0 || |
| (parent_offset_ + parent_limit_ >= child.parent_offset_ + child.parent_limit_)); |
| |
| // We don't check the hierarchy because it is inconsistent at this point. |
| // It will be made consistent by the caller and checked then. |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(child.DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| void VmCowPages::DumpLocked(uint depth, bool verbose) const { |
| canary_.Assert(); |
| |
| size_t page_count = 0; |
| size_t compressed_count = 0; |
| page_list_.ForEveryPage([&page_count, &compressed_count](const auto* p, uint64_t) { |
| if (p->IsPage()) { |
| page_count++; |
| } else if (p->IsReference()) { |
| compressed_count++; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| |
| const char* node_type = ""; |
| if (is_hidden()) { |
| node_type = "(hidden) "; |
| } |
| |
| for (uint i = 0; i < depth; ++i) { |
| printf(" "); |
| } |
| printf("cow_pages %p %ssize %#" PRIx64 " offset %#" PRIx64 " limit %#" PRIx64 |
| " content pages %zu compressed pages %zu ref %d parent %p num children %u\n", |
| this, node_type, size_, parent_offset_, parent_limit_, page_count, compressed_count, |
| ref_count_debug(), parent_.get(), children_list_len_); |
| |
| if (page_source_) { |
| for (uint i = 0; i < depth + 1; ++i) { |
| printf(" "); |
| } |
| printf("page_source preserves content %d\n", is_source_preserving_page_content()); |
| page_source_->Dump(depth + 1, UINT32_MAX); |
| } |
| |
| if (verbose) { |
| auto f = [depth](const auto* p, uint64_t offset) { |
| for (uint i = 0; i < depth + 1; ++i) { |
| printf(" "); |
| } |
| if (p->IsMarker()) { |
| printf("offset %#" PRIx64 " zero page marker\n", offset); |
| } else if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| printf("offset %#" PRIx64 " page %p paddr %#" PRIxPTR " share %" PRIu32 "(%c)\n", offset, |
| page, page->paddr(), page->object.share_count, page->object.always_need ? 'A' : '.'); |
| } else if (p->IsReference()) { |
| const uint64_t cookie = p->Reference().value(); |
| printf("offset %#" PRIx64 " reference %#" PRIx64 " share %" PRIu32 "\n", offset, cookie, |
| Pmm::Node().GetPageCompression()->GetMetadata(p->Reference())); |
| } else if (p->IsIntervalStart()) { |
| printf("offset %#" PRIx64 " page interval start\n", offset); |
| } else if (p->IsIntervalEnd()) { |
| printf("offset %#" PRIx64 " page interval end\n", offset); |
| } else if (p->IsIntervalSlot()) { |
| printf("offset %#" PRIx64 " single page interval slot\n", offset); |
| } else if (p->IsParentContent()) { |
| printf("offset %#" PRIx64 " parent content marker\n", offset); |
| } |
| return ZX_ERR_NEXT; |
| }; |
| page_list_.ForEveryPage(f); |
| } |
| } |
| |
| uint32_t VmCowPages::DebugLookupDepthLocked() const { |
| canary_.Assert(); |
| |
| // Count the number of parents we need to traverse to find the root, and call this our lookup |
| // depth. |
| uint32_t depth = 0; |
| LockedPtr ptr; |
| while (VmCowPages* parent = ptr.locked_or(this).parent_.get()) { |
| depth++; |
| ptr = LockedPtr(parent); |
| } |
| return depth; |
| } |
| |
| VmCowPages::AttributionCounts VmCowPages::GetAttributedMemoryInRangeLocked(VmCowRange range) const { |
| canary_.Assert(); |
| |
| // Due to the need to manipulate fields in AttributionCounts that only exist based on the #define |
| // we cannot use the normal if constexpr guard and instead need a preprocessor guard. |
| DEBUG_ASSERT(!is_hidden()); |
| |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| |
| // Accumulate bytes for all pages and references this node has ownership over. |
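| // For example, a page owned by a hidden ancestor and shared with one other node (share_count of |
| // 1) adds PAGE_SIZE to uncompressed_bytes, PAGE_SIZE / 2 to scaled_uncompressed_bytes, and |
| // nothing to private_uncompressed_bytes. |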
| AttributionCounts counts; |
| zx_status_t status = ForEveryOwnedHierarchyPageInRangeLocked( |
| [&](const VmPageOrMarker* p, const VmCowPages* owner, uint64_t this_offset, |
| uint64_t owner_offset) { |
| auto do_attribution = [&](auto get_share_count, auto& bytes, auto& private_bytes, |
| auto& scaled_bytes) { |
| // The short-circuit condition of (owner == this) improves performance by removing the need to |
| // dereference 'random' vm_page_ts/references in the common case, greatly reducing memory |
| // stalls. For this reason get_share_count is a callback, and not a value. |
| const uint32_t share_count = (owner == this) ? 0 : get_share_count(); |
| if (share_count == 0) { |
| bytes += PAGE_SIZE; |
| private_bytes += PAGE_SIZE; |
| scaled_bytes += PAGE_SIZE; |
| } else { |
| // An unshared (i.e. private) page has a share count of 0, so add 1 to get the number of owners |
| // and scale the full page by this. |
| const vm::FractionalBytes scaled_contribution = |
| vm::FractionalBytes(PAGE_SIZE, share_count + 1); |
| bytes += PAGE_SIZE; |
| scaled_bytes += scaled_contribution; |
| } |
| }; |
| if (p->IsPage()) { |
| do_attribution([&]() { return p->Page()->object.share_count; }, counts.uncompressed_bytes, |
| counts.private_uncompressed_bytes, counts.scaled_uncompressed_bytes); |
| } else if (p->IsReference()) { |
| do_attribution([&]() { return compression->GetMetadata(p->Reference()); }, |
| counts.compressed_bytes, counts.private_compressed_bytes, |
| counts.scaled_compressed_bytes); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| range.offset, range.len, LockedPtr()); |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| return counts; |
| } |
| |
| VmPageOrMarker VmCowPages::AddPageTransaction::Complete(VmPageOrMarker p) { |
| VmPageOrMarker ret = slot_.SwapContent(ktl::move(p)); |
| slot_ = VmPageOrMarkerRef(); |
| return ret; |
| } |
| |
| void VmCowPages::AddPageTransaction::Cancel(VmPageList& pl) { |
| DEBUG_ASSERT(slot_); |
| if (slot_->IsEmpty()) { |
| pl.ReturnEmptySlot(offset_); |
| } |
| slot_ = VmPageOrMarkerRef(); |
| } |
| |
| zx::result<VmCowPages::AddPageTransaction> VmCowPages::BeginAddPageWithSlotLocked( |
| uint64_t offset, VmPageOrMarkerRef slot, CanOverwriteContent overwrite) { |
| canary_.Assert(); |
| zx_status_t status = CheckOverwriteConditionsLocked(offset, slot, overwrite); |
| if (unlikely(status != ZX_OK)) { |
| return zx::error(status); |
| } |
| // Do additional checks. The IsOffsetInZeroInterval check is expensive, but the assumption is that |
| // this method is not used when is_source_preserving_page_content is true, so the assertion should |
| // short circuit. |
| DEBUG_ASSERT(!is_source_preserving_page_content() || !slot->IsEmpty() || |
| !page_list_.IsOffsetInZeroInterval(offset)); |
| return zx::ok(AddPageTransaction(slot, offset, overwrite)); |
| } |
| |
| zx::result<VmCowPages::AddPageTransaction> VmCowPages::BeginAddPageLocked( |
| uint64_t offset, CanOverwriteContent overwrite) { |
| canary_.Assert(); |
| auto interval_handling = VmPageList::IntervalHandling::NoIntervals; |
| // If we're backed by a page source that preserves content (user pager), we cannot directly update |
| // empty slots in the page list. An empty slot might lie in a sparse zero interval, which would |
| // require splitting the interval around the required offset before it can be manipulated. |
| if (is_source_preserving_page_content()) { |
| // We can overwrite zero intervals if we're allowed to overwrite zeros (or non-zeros). |
| interval_handling = overwrite != CanOverwriteContent::None |
| ? VmPageList::IntervalHandling::SplitInterval |
| : VmPageList::IntervalHandling::CheckForInterval; |
| } |
| auto [slot, is_in_interval] = page_list_.LookupOrAllocate(offset, interval_handling); |
| if (is_in_interval) { |
| // We should not have found an interval if we were not expecting any. |
| DEBUG_ASSERT(interval_handling != VmPageList::IntervalHandling::NoIntervals); |
| // Return error if the offset lies in an interval but we cannot overwrite intervals. |
| if (interval_handling != VmPageList::IntervalHandling::SplitInterval) { |
| // The lookup should not have returned a slot for us to manipulate if it was in an interval |
| // that cannot be overwritten, even if that slot was already populated (by an interval |
| // sentinel). |
| DEBUG_ASSERT(!slot); |
| return zx::error(ZX_ERR_ALREADY_EXISTS); |
| } |
| // If offset was in an interval, we should have an interval slot to overwrite at this point. |
| DEBUG_ASSERT(slot && slot->IsIntervalSlot()); |
| } |
| |
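| // A null slot means the page list could not allocate the node needed to track this offset. |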
| if (unlikely(!slot)) { |
| return zx::error(ZX_ERR_NO_MEMORY); |
| } |
| |
| zx_status_t status = CheckOverwriteConditionsLocked(offset, VmPageOrMarkerRef(slot), overwrite); |
| if (unlikely(status != ZX_OK)) { |
| if (slot->IsEmpty()) { |
| page_list_.ReturnEmptySlot(offset); |
| } |
| return zx::error(status); |
| } |
| |
| return zx::ok(AddPageTransaction(VmPageOrMarkerRef(slot), offset, overwrite)); |
| } |
| |
| zx_status_t VmCowPages::CheckOverwriteConditionsLocked(uint64_t offset, VmPageOrMarkerRef slot, |
| CanOverwriteContent overwrite) { |
| // Pages can be added as part of Init, but not once we transition to dead. |
| DEBUG_ASSERT(life_cycle_ != LifeCycle::Dead); |
| |
| if (offset >= size_) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // We cannot overwrite any kind of content. |
| if (overwrite == CanOverwriteContent::None) { |
| // An anonymous VMO starts off with all its content set to zero, i.e. at no point can it have |
| // absence of content. |
| if (!page_source_) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| // This VMO is backed by a page source, so empty slots represent absence of content. Fail if the |
| // slot is not empty. |
| if (!slot->IsEmpty()) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| } |
| |
| // We're only permitted to overwrite zero content. This has different meanings based on whether |
| // the VMO is anonymous or is backed by a pager. |
| // |
| // * For anonymous VMOs, the initial content for the entire VMO is implicitly all zeroes at the |
| // time of creation. So both zero page markers and empty slots represent zero content. Therefore |
| // the only content type that cannot be overwritten in this case is an actual page. |
| // |
| // * For pager backed VMOs, content is either explicitly supplied by the user pager, or |
| // implicitly supplied as zeros by the kernel. Zero content is represented by either zero page |
| // markers (supplied by the user pager), or by sparse zero intervals (supplied by the kernel). |
| // Therefore the only content type that cannot be overwritten in this case as well is an actual |
| // page. |
| if (overwrite == CanOverwriteContent::Zero && slot->IsPageOrRef()) { |
| // If we have a page source, the page source should be able to validate the page. |
| // Note that having a page source implies that any content must be an actual page and so |
| // although we return an error for any kind of content, the debug check only gets run for page |
| // sources where it will be a real page. |
| DEBUG_ASSERT(!page_source_ || page_source_->DebugIsPageOk(slot->Page(), offset)); |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| // If the old entry is actual content then we must have been permitted to overwrite any kind of |
| // content (zero or non-zero). |
| DEBUG_ASSERT(overwrite == CanOverwriteContent::NonZero || !slot->IsPageOrRef()); |
| return ZX_OK; |
| } |
| |
| VmPageOrMarker VmCowPages::CompleteAddPageLocked(AddPageTransaction& transaction, |
| VmPageOrMarker&& p, ParentContent parent, |
| DeferredOps* deferred) { |
| if (p.IsPage()) { |
| LTRACEF("vmo %p, offset %#" PRIx64 ", page %p (%#" PRIxPTR ")\n", this, transaction.offset(), |
| p.Page(), p.Page()->paddr()); |
| } else if (p.IsReference()) { |
| [[maybe_unused]] const uint64_t cookie = p.Reference().value(); |
| LTRACEF("vmo %p, offset %#" PRIx64 ", reference %#" PRIx64 "\n", this, transaction.offset(), |
| cookie); |
| } else { |
| DEBUG_ASSERT(p.IsMarker()); |
| LTRACEF("vmo %p, offset %#" PRIx64 ", marker\n", this, transaction.offset()); |
| } |
| |
| // If the new page is an actual page and we have a page source, the page source should be able to |
| // validate the page. |
| // Note that having a page source implies that any content must be an actual page, and so the |
| // debug check only gets run for page sources where it will be a real page. |
| DEBUG_ASSERT(!p.IsPageOrRef() || !page_source_ || |
| page_source_->DebugIsPageOk(p.Page(), transaction.offset())); |
| |
| // Markers should never be placed in a node that uses parent content markers, since doing so is |
| // completely redundant and any attempt to do so represents a logic bug somewhere. |
| DEBUG_ASSERT(!p.IsMarker() || !node_has_parent_content_markers()); |
| |
| // If this is actually a real page, we need to place it into the appropriate queue. |
| if (p.IsPage()) { |
| vm_page_t* low_level_page = p.Page(); |
| DEBUG_ASSERT(low_level_page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(low_level_page->object.pin_count == 0); |
| SetNotPinnedLocked(low_level_page, transaction.offset()); |
| } |
| VmPageOrMarker old = transaction.Complete(ktl::move(p)); |
| |
| if (deferred) { |
| // If the old entry is a reference then we know that there can be no mappings to it, since a |
| // reference cannot be mapped in, and we can skip the range update. |
| if (!old.IsReference()) { |
| if (old.IsEmpty() && is_source_preserving_page_content()) { |
| // An empty slot where the page source is preserving content cannot have any mappings, |
| // either in self or the children, since the content is unknown (i.e. not the zero page), |
| // and so we do not need to perform any range change update. |
| // However, as we are modifying the contents we still must synchronize with any other |
| // modification to this hierarchy, which we know is true because a non-null |deferred| was |
| // passed in. |
| } else { |
| // other mappings may have covered this offset into the vmo, so unmap those ranges. |
| // If we are both overwriting zero content *and* the caller has promised us that any visible |
| // parent contains zero content, then we can use a more optimal unmap request. |
| const RangeChangeOp op = transaction.overwrite() == CanOverwriteContent::NonZero || |
| parent == ParentContent::Unknown |
| ? RangeChangeOp::Unmap |
| : RangeChangeOp::UnmapZeroPage; |
| RangeChangeUpdateLocked(VmCowRange(transaction.offset(), PAGE_SIZE), op, deferred); |
| } |
| } |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return old; |
| } |
| |
| void VmCowPages::CancelAddPageLocked(AddPageTransaction& transaction) { |
| transaction.Cancel(page_list_); |
| } |
| |
| zx::result<VmPageOrMarker> VmCowPages::AddPageLocked(uint64_t offset, VmPageOrMarker&& p, |
| CanOverwriteContent overwrite, |
| DeferredOps* deferred) { |
| __UNINITIALIZED auto result = BeginAddPageLocked(offset, overwrite); |
| if (unlikely(result.is_error())) { |
| if (p.IsPage()) { |
| FreePage(p.ReleasePage()); |
| } else if (p.IsReference()) { |
| FreeReference(p.ReleaseReference()); |
| } |
| return result.take_error(); |
| } |
| return zx::ok(CompleteAddPageLocked(*result, ktl::move(p), ParentContent::Unknown, deferred)); |
| } |
| |
| zx_status_t VmCowPages::AddNewPageLocked(uint64_t offset, vm_page_t* page, |
| CanOverwriteContent overwrite, |
| VmPageOrMarker* released_page, bool zero, |
| DeferredOps* deferred) { |
| canary_.Assert(); |
| |
| __UNINITIALIZED auto result = BeginAddPageLocked(offset, overwrite); |
| if (result.is_error()) { |
| return result.status_value(); |
| } |
| VmPageOrMarker old = CompleteAddNewPageLocked(*result, page, zero, deferred); |
| if (released_page) { |
| *released_page = ktl::move(old); |
| } else { |
| DEBUG_ASSERT(!old.IsPageOrRef()); |
| } |
| return ZX_OK; |
| } |
| |
| VmPageOrMarker VmCowPages::CompleteAddNewPageLocked(AddPageTransaction& transaction, |
| vm_page_t* page, bool zero, |
| DeferredOps* deferred) { |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(transaction.offset())); |
| |
| InitializeVmPage(page); |
| if (zero) { |
| ZeroPage(page); |
| } |
| |
| // Pages being added to pager backed VMOs should have a valid dirty_state before being added to |
| // the page list, so that they can be inserted in the correct page queue. New pages start off |
| // clean. |
| if (is_source_preserving_page_content()) { |
| // Only zero pages can be added as new pages to pager backed VMOs. |
| DEBUG_ASSERT(zero || IsZeroPage(page)); |
| UpdateDirtyStateLocked(page, transaction.offset(), DirtyState::Clean, /*is_pending_add=*/true); |
| } |
| return CompleteAddPageLocked(transaction, VmPageOrMarker::Page(page), ParentContent::Unknown, |
| deferred); |
| } |
| |
| zx_status_t VmCowPages::AddNewPagesLocked(uint64_t start_offset, list_node_t* pages, |
| CanOverwriteContent overwrite, bool zero, |
| DeferredOps* deferred) { |
| ASSERT(overwrite != CanOverwriteContent::NonZero); |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(start_offset)); |
| |
| uint64_t offset = start_offset; |
| while (vm_page_t* p = list_remove_head_type(pages, vm_page_t, queue_node)) { |
| // Defer the range change update by passing a null DeferredOps, as we will do it in bulk at the |
| // end if needed. |
| zx_status_t status = AddNewPageLocked(offset, p, overwrite, nullptr, zero, nullptr); |
| if (status != ZX_OK) { |
| // Put the page back on the list so that someone owns it and it'll get freed. |
| list_add_head(pages, &p->queue_node); |
| // Remove any pages we already placed. |
| if (offset > start_offset) { |
| __UNINITIALIZED ScopedPageFreedList freed_list; |
| __UNINITIALIZED BatchPQRemove page_remover(freed_list); |
| |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), start_offset, offset); |
| page_remover.Flush(); |
| freed_list.FreePages(this); |
| } |
| |
| // Free all the pages back as we had ownership of them. |
| FreePages(pages); |
| return status; |
| } |
| offset += PAGE_SIZE; |
| } |
| |
| if (deferred) { |
| // other mappings may have covered this offset into the vmo, so unmap those ranges |
| RangeChangeUpdateLocked(VmCowRange(start_offset, offset - start_offset), RangeChangeOp::Unmap, |
| deferred); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CloneCowPageLocked(uint64_t offset, list_node_t* alloc_list, |
| VmCowPages* page_owner, vm_page_t* page, |
| uint64_t owner_offset, DeferredOps& deferred, |
| AnonymousPageRequest* page_request, |
| vm_page_t** out_page) { |
| DEBUG_ASSERT(page != vm_get_zero_page()); |
| DEBUG_ASSERT(parent_); |
| DEBUG_ASSERT(page_request); |
| // We only clone pages from hidden to visible nodes. |
| DEBUG_ASSERT(page_owner->is_hidden()); |
| DEBUG_ASSERT(!is_hidden()); |
| // We don't want to handle intervals here. They should only be present when this node is backed by |
| // a user pager, and such nodes don't have parents so cannot be the target of a forked page. |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| |
| // Ensure this node is ready to accept a newly-allocated page. If a subsequent step fails (such as |
| // allocating the page itself), cancelling the `page_transaction` will handle any rollback logic. |
| // |
| // By the time this function returns, the transaction will be either completed or canceled. |
| __UNINITIALIZED auto page_transaction = BeginAddPageLocked(offset, CanOverwriteContent::Zero); |
| auto cancel_transaction = fit::defer([this, out_page, &page_transaction] { |
| AssertHeld(lock_ref()); |
| |
| if (!page_transaction.is_error()) { |
| CancelAddPageLocked(*page_transaction); |
| } |
| *out_page = nullptr; // Ensure the `out_page` is initialized if we fail at any point. |
| }); |
| if (page_transaction.is_error()) { |
| return page_transaction.status_value(); |
| } |
| |
| // If the page is shared we must fork it, otherwise we can migrate it. |
| if (page->object.share_count > 0) { |
| // Create a fork of the page. This may fail due to inability to allocate a new page. |
| // The page is not writable so there is no need to unmap or protect it before reading it for the |
| // fork. |
| vm_page_t* forked_page = nullptr; |
| zx_status_t status = AllocateCopyPage(page->paddr(), alloc_list, page_request, &forked_page); |
| if (unlikely(status != ZX_OK)) { |
| return status; |
| } |
| |
| // The page is now shared one less time. |
| page->object.share_count--; |
| |
| *out_page = forked_page; |
| } else { |
| // Remove the page from the owner. |
| VmPageOrMarker removed = page_owner->page_list_.RemoveContent(owner_offset); |
| vm_page* removed_page = removed.ReleasePage(); |
| DEBUG_ASSERT(removed_page == page); |
| // TODO: This could be optimized to a ChangeObjectOffset instead of doing a Remove here and an |
| // insert in CompleteAddPageLocked. |
| pmm_page_queues()->Remove(removed_page); |
| |
| *out_page = removed_page; |
| } |
| |
| // Now that we can no longer fail to insert the new page into this node, complete the add page |
| // transaction. |
| // |
| // If the new page is different from the original page, then we must remove the original page |
| // from any mappings that reference this node or its descendants. |
| const bool do_range_update = (*out_page != page); |
| [[maybe_unused]] VmPageOrMarker prev_content = |
| CompleteAddPageLocked(*page_transaction, VmPageOrMarker::Page(*out_page), |
| ParentContent::Unknown, do_range_update ? &deferred : nullptr); |
| // We should not have been trying to fork at this offset if something already existed. |
| DEBUG_ASSERT(prev_content.IsEmpty() || prev_content.IsParentContent()); |
| // Transaction completed successfully, so it should no longer be cancelled. |
| cancel_transaction.cancel(); |
| |
| return ZX_OK; |
| } |
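| // A note on the share_count arithmetic above (and in DecrementCowContentShareCount below): the |
| // count records how many references to the content exist beyond the first. When it is zero the |
| // requesting node is the last one able to see the page, so the page is migrated out of the |
| // hidden owner rather than copied. As a hypothetical example, a page visible to three leaf |
| // nodes carries a share_count of 2: the first two forks each copy the page and decrement the |
| // count, and the third fork simply migrates the original page. |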
| |
| void VmCowPages::DecrementCowContentShareCount(VmPageOrMarkerRef content, uint64_t offset, |
| ScopedPageFreedList& list, |
| VmCompression* compression) { |
| // Only hidden nodes have content with a non-zero share count. |
| DEBUG_ASSERT(is_hidden()); |
| |
| // Release the reference we held to the forked page. |
| if (content->IsPage()) { |
| vm_page_t* page = content->Page(); |
| if (page->object.share_count > 0) { |
| // The page is now shared one less time. |
| page->object.share_count--; |
| } else { |
| // Remove the page from the owner. |
| VmPageOrMarker removed = page_list_.RemoveContent(offset); |
| vm_page* removed_page = removed.ReleasePage(); |
| DEBUG_ASSERT(removed_page == page); |
| Pmm::Node().GetPageQueues()->Remove(removed_page); |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| list_add_tail(list.List(), &page->queue_node); |
| } |
| } else { |
| DEBUG_ASSERT(content->IsReference()); |
| uint32_t prev = compression->GetMetadata(content->Reference()); |
| if (prev > 0) { |
| compression->SetMetadata(content->Reference(), prev - 1); |
| } else { |
| VmPageOrMarker removed = page_list_.RemoveContent(offset); |
| compression->Free(removed.ReleaseReference()); |
| } |
| } |
| } |
| |
| zx_status_t VmCowPages::CloneCowContentAsZeroLocked(uint64_t offset, ScopedPageFreedList& list, |
| VmCowPages* content_owner, |
| VmPageOrMarkerRef owner_content, |
| uint64_t owner_offset) { |
| DEBUG_ASSERT(parent_); |
| // We only clone pages from hidden to visible nodes. |
| DEBUG_ASSERT(content_owner->is_hidden()); |
| DEBUG_ASSERT(!is_hidden()); |
| // We don't want to handle intervals here. They should only be present when this node is backed by |
| // a user pager, and such nodes don't have parents so cannot be the target of a forked page. |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| |
| if (owner_content->IsMarker()) { |
| // Markers do not have ref counts, so there is nothing else to do; this node will already see |
| // this offset as zero. |
| return ZX_OK; |
| } |
| // Only other valid items should be pages or references. |
| DEBUG_ASSERT(owner_content->IsPageOrRef()); |
| // Performing a cow zero of a parent content marker would require clearing a slot in |this| page |
| // list, which is a problem for our caller who might be iterating that same page list. As such |
| // this method may not be used if there might be parent content markers. |
| DEBUG_ASSERT(!node_has_parent_content_markers()); |
| |
| // Go ahead and insert the new zero marker into the target. We don't have anything to rollback |
| // if this fails so we can just bail immediately. |
| // |
| // We expect the caller to update any mappings as it can more efficiently do this in bulk. |
| zx::result<VmPageOrMarker> prev_content = |
| AddPageLocked(offset, VmPageOrMarker::Marker(), CanOverwriteContent::Zero, nullptr); |
| if (prev_content.is_error()) { |
| return prev_content.status_value(); |
| } |
| DEBUG_ASSERT(prev_content->IsEmpty()); |
| content_owner->DecrementCowContentShareCount(owner_content, owner_offset, list, |
| Pmm::Node().GetPageCompression()); |
| |
| return ZX_OK; |
| } |
| |
| void VmCowPages::ReleaseOwnedPagesRangeLocked(uint64_t offset, uint64_t len, |
| const LockedPtr& parent, |
| ScopedPageFreedList& freed_list) { |
| DEBUG_ASSERT(!is_hidden()); |
| DEBUG_ASSERT(offset <= size_); |
| DEBUG_ASSERT(offset + len <= size_); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(freed_list); |
| |
| // If we know that the only pages in this range that need to be freed are from our own page list, |
| // and we no longer need to consider our parent, then just remove them. |
| if (!is_parent_hidden_locked() || offset >= parent_limit_) { |
| if (offset == 0 && len == size_) { |
| page_list_.RemoveAllContent( |
| [&page_remover](VmPageOrMarker&& p) { page_remover.PushContent(&p); }); |
| } else { |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), offset, offset + len); |
| } |
| page_remover.Flush(); |
| // Potentially trim the parent limit to reflect the range that has been freed. |
| if (offset + len >= parent_limit_) { |
| parent_limit_ = ktl::min(parent_limit_, offset); |
| } |
| return; |
| } |
| |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| |
| // Decrement the share count on all pages, both directly owned by us and shared via our parents, |
| // that this node can see, and free any pages with a zero ref count. |
| zx_status_t status = RemoveOwnedHierarchyPagesInRangeLocked( |
| [&](VmPageOrMarker* p, const VmCowPages* owner, uint64_t this_offset, uint64_t owner_offset) { |
| // Explicitly handle content owned by 'this' separately: although we would naturally find such |
| // pages to have a share_count of 0 and free them, we also always want to free any markers, and |
| // markers can only be freed when they are precisely in 'this' since markers have no refcount. |
| if (this == owner) { |
| page_remover.PushContent(p); |
| return ZX_ERR_NEXT; |
| } |
| |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| if (page->object.share_count == 0) { |
| page_remover.PushContent(p); |
| } else { |
| page->object.share_count--; |
| } |
| } else if (p->IsReference()) { |
| const uint32_t share_count = compression->GetMetadata(p->Reference()); |
| if (share_count == 0) { |
| page_remover.PushContent(p); |
| } else { |
| compression->SetMetadata(p->Reference(), share_count - 1); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }, |
| offset, len, parent); |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| if (node_has_parent_content_markers()) { |
| // Any parent content markers covering the pages whose reference counts we just dropped need to |
| // be removed separately. |
| page_list_.RemovePages( |
| [&](VmPageOrMarker* slot, uint64_t offset) { |
| DEBUG_ASSERT(slot->IsParentContent()); |
| *slot = VmPageOrMarker::Empty(); |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| } |
| |
| // This node can no longer see into its parent in the range we just released. |
| DEBUG_ASSERT(offset < parent_limit_); |
| if (offset + len >= parent_limit_) { |
| parent_limit_ = offset; |
| } |
| |
| page_remover.Flush(); |
| } |
| |
| void VmCowPages::FindPageContentLocked(uint64_t offset, uint64_t max_owner_length, |
| PageLookup* out) { |
| const uint64_t this_offset = offset; |
| |
| // Search up the clone chain for any committed pages. cur_offset is the offset |
| // into cur we care about. The loop terminates either when that offset contains |
| // a committed page or when that offset can't reach into the parent. |
| LockedPtr cur; |
| while (offset < cur.locked_or(this).parent_limit_) { |
| VmCowPages* parent = cur.locked_or(this).parent_.get(); |
| DEBUG_ASSERT(parent); |
| |
| __UNINITIALIZED VMPLCursor cursor = |
| cur.locked_or(this).page_list_.LookupNearestMutableCursor(offset); |
| VmPageOrMarkerRef p = cursor.current(); |
| const bool cursor_correct_offset = |
| p && cursor.offset(cur.locked_or(this).page_list_.GetSkew()) == offset; |
| // If this slot has any actual content, then we can immediately return it. |
| if (cursor_correct_offset && !p->IsEmpty() && !p->IsParentContent()) { |
| *out = {cursor, ktl::move(cur), offset, max_owner_length + this_offset}; |
| return; |
| } |
| // If using parent content markers then unless there is a marker we can skip walking up, as we |
| // know there is no content above us. |
| if (cur.locked_or(this).node_has_parent_content_markers() && |
| (!cursor_correct_offset || !p->IsParentContent())) { |
| *out = {VMPLCursor(), ktl::move(cur), offset, max_owner_length + this_offset}; |
| return; |
| } |
| |
| // Need to walk up, see if we need to trim the owner length. |
| if (max_owner_length > PAGE_SIZE) { |
| // First trim to the parent limit. |
| max_owner_length = ktl::min(max_owner_length, cur.locked_or(this).parent_limit_ - offset); |
| if (max_owner_length > PAGE_SIZE) { |
| // There are three cases to consider for determining the range of the parent that we can |
| // actually see. The cases are considered in order, with each case also assuming the |
| // negation of the condition of all cases above it. |
| // 1. Leaf node using parent content markers - Here the current cursor must be valid and be |
| // a ParentContent marker, otherwise we would have already returned with content and |
| // would not be walking up. In this case the visible length is the number of contiguous |
| // ParentContent markers. |
| // 2. The cursor is valid - We know that the current offset does not have content, but |
| // there is some content later on, and so we must find its offset to determine if it |
| // limits the visible range or not. |
| // 3. The cursor is invalid - There is no content from here till the end of the page list, |
| // in which case the visible length extends to the parent limit (i.e. what was just |
| // calculated in max_owner_length), and we know there is no content to look for to trim |
| // this length. |
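| // Worked example of case 2 (hypothetical values, assuming 4K pages): with offset == 0x1000 and |
| // max_owner_length already trimmed to 0x4000 by the parent limit, content found in this node's |
| // page list at 0x3000 trims max_owner_length to 0x3000 - 0x1000 = 0x2000, so only the two |
| // pages below that content can be provided by the parent. |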
| if (cur.locked_or(this).node_has_parent_content_markers()) { |
| uint64_t new_owner_length = 0; |
| cursor.ForEveryContiguous([&new_owner_length, max_owner_length](VmPageOrMarkerRef p) { |
| if (p->IsParentContent() && new_owner_length < max_owner_length) { |
| new_owner_length += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| return ZX_ERR_STOP; |
| }); |
| // The first slot in the cursor was parent content, so should always have incremented at |
| // least once. |
| DEBUG_ASSERT(new_owner_length > 0); |
| max_owner_length = ktl::min(new_owner_length, max_owner_length); |
| } else if (p) { |
| cur.locked_or(this).page_list_.ForEveryPageInCursorRange( |
| [&offset, &max_owner_length](const VmPageOrMarker* slot, uint64_t slot_offset) { |
| DEBUG_ASSERT(!slot->IsEmpty() && slot_offset >= offset); |
| const uint64_t new_owner_length = slot_offset - offset; |
| DEBUG_ASSERT(new_owner_length > 0 && new_owner_length <= max_owner_length); |
| max_owner_length = new_owner_length; |
| return ZX_ERR_STOP; |
| }, |
| cursor, offset + max_owner_length); |
| } |
| } |
| } |
| |
| offset += cur.locked_or(this).parent_offset_; |
| cur = LockedPtr(parent); |
| } |
| *out = {cur.locked_or(this).page_list_.LookupMutableCursor(offset), ktl::move(cur), offset, |
| max_owner_length + this_offset}; |
| } |
| |
| void VmCowPages::FindInitialPageContentLocked(uint64_t offset, PageLookup* out) { |
| if (parent_ && offset < parent_limit_) { |
| LockedPtr parent = LockedPtr(parent_.get()); |
| parent.locked().FindPageContentLocked(offset + parent_offset_, PAGE_SIZE, out); |
| if (!out->owner) { |
| out->owner = ktl::move(parent); |
| } |
| } else { |
| *out = {VMPLCursor(), LockedPtr(), offset, offset + PAGE_SIZE}; |
| } |
| } |
| |
| void VmCowPages::UpdateDirtyStateLocked(vm_page_t* page, uint64_t offset, DirtyState dirty_state, |
| bool is_pending_add) { |
| ASSERT(page); |
| ASSERT(is_source_preserving_page_content()); |
| |
| // If the page is not pending being added to the page list, it should have valid object info. |
| DEBUG_ASSERT(is_pending_add || page->object.get_object() == this); |
| DEBUG_ASSERT(is_pending_add || page->object.get_page_offset() == offset); |
| |
| // If the page is Dirty or AwaitingClean, it should not be loaned. |
| DEBUG_ASSERT(!(is_page_dirty(page) || is_page_awaiting_clean(page)) || !page->is_loaned()); |
| |
| // Perform state-specific checks. We will finally update the state below. |
| bool update_page_queues = false; |
| switch (dirty_state) { |
| case DirtyState::Clean: |
| // If the page is not in the process of being added, we can only see a transition to Clean |
| // from AwaitingClean. |
| ASSERT(is_pending_add || is_page_awaiting_clean(page)); |
| |
| // If we are expecting a pending Add[New]PageLocked, we can defer updating the page queue. |
| if (!is_pending_add) { |
| update_page_queues = true; |
| } |
| break; |
| case DirtyState::Dirty: |
| // If the page is not in the process of being added, we can only see a transition to Dirty |
| // from Clean or AwaitingClean. |
| ASSERT(is_pending_add || (is_page_clean(page) || is_page_awaiting_clean(page))); |
| |
| // A loaned page cannot be marked Dirty as loaned pages are reclaimed by eviction; Dirty pages |
| // cannot be evicted. |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| // If we are expecting a pending Add[New]PageLocked, we can defer updating the page queue. |
| if (!is_pending_add) { |
| update_page_queues = true; |
| } |
| break; |
| case DirtyState::AwaitingClean: |
| // A newly added page cannot start off as AwaitingClean. |
| ASSERT(!is_pending_add); |
| // A pinned page will be kept Dirty as long as it is pinned. |
| // |
| // Note that there isn't a similar constraint when setting the Clean state as it is possible |
| // to pin a page for read after it has been marked AwaitingClean. Since it is a pinned read it |
| // does not need to dirty the page. So when the writeback is done it can transition from |
| // AwaitingClean -> Clean with a non-zero pin count. |
| // |
| // It is also possible for us to observe an intermediate pin count for a write-pin that has |
| // not fully completed yet, as we will only attempt to dirty pages after pinning them. So it |
| // is possible for a thread to be waiting on a DIRTY request on a pinned page, while a racing |
| // writeback transitions the page from AwaitingClean -> Clean with a non-zero pin count. |
| ASSERT(page->object.pin_count == 0); |
| // We can only transition to AwaitingClean from Dirty. |
| ASSERT(is_page_dirty(page)); |
| // A loaned page cannot be marked AwaitingClean as loaned pages are reclaimed by eviction; |
| // AwaitingClean pages cannot be evicted. |
| DEBUG_ASSERT(!page->is_loaned()); |
| // No page queue update. Leave the page in the Dirty queue for now as it is not clean yet; |
| // it will be moved out on WritebackEnd. |
| DEBUG_ASSERT(pmm_page_queues()->DebugPageIsPagerBackedDirty(page)); |
| break; |
| default: |
| ASSERT(false); |
| } |
| page->object.dirty_state = static_cast<uint8_t>(dirty_state) & VM_PAGE_OBJECT_DIRTY_STATES_MASK; |
| if (update_page_queues && page->object.pin_count == 0) { |
| // Move the page to the appropriate page queue, checking for global state such as high priority |
| // count etc. |
| // |
| // If Clean: |
| // Move to evictable pager backed queue to start tracking age information. |
| // |
| // If Dirty: |
| // Move the page to the Dirty queue, which does not track page age. While the page is in the |
| // Dirty queue, age information is not required (yet). It will be required when the page |
| // becomes Clean (and hence evictable) again, at which point it will get moved to the MRU |
| // pager backed queue and will age as normal. |
| // TODO(rashaeqbal): We might want age tracking for the Dirty queue in the future when the |
| // kernel generates writeback pager requests. |
| MoveToNotPinnedLocked(page, offset); |
| } |
| } |
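| // Summary of the dirty state transitions enforced above (ignoring the is_pending_add cases, |
| // where a page may start out directly as Clean or Dirty): |
| // |
| //   Clean          -> Dirty          (page is written) |
| //   Dirty          -> AwaitingClean  (writeback begins) |
| //   AwaitingClean  -> Clean          (writeback completes) |
| //   AwaitingClean  -> Dirty          (page is written while a writeback is in flight) |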
| |
| zx_status_t VmCowPages::PrepareForWriteLocked(VmCowRange range, LazyPageRequest* page_request, |
| uint64_t* dirty_len_out) { |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(range.IsBoundedBy(size_)); |
| |
| DEBUG_ASSERT(page_source_); |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| |
| uint64_t dirty_len = 0; |
| const uint64_t start_offset = range.offset; |
| const uint64_t end_offset = range.end(); |
| |
| // If the VMO does not require us to trap dirty transitions, simply mark the pages dirty, and move |
| // them to the dirty page queue. Do this only for the first consecutive run of committed pages |
| // within the range starting at offset. Any absent pages will need to be provided by the page |
| // source, which might fail and terminate the lookup early. Any zero page markers and zero |
| // intervals might need to be forked, which can fail too. Only mark those pages dirty that the |
| // lookup is guaranteed to return successfully. |
| if (!page_source_->ShouldTrapDirtyTransitions()) { |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [this, &dirty_len, start_offset](const VmPageOrMarker* p, uint64_t off) { |
| // TODO(johngro): remove this explicit unused-capture warning suppression |
| // when https://bugs.llvm.org/show_bug.cgi?id=35450 gets fixed. |
| (void)start_offset; // used only in DEBUG_ASSERT |
| if (p->IsMarker() || p->IsIntervalZero()) { |
| // Found a marker or zero interval. End the traversal. |
| return ZX_ERR_STOP; |
| } |
| // VMOs with a page source will never have compressed references, so this should be a |
| // real page. |
| DEBUG_ASSERT(p->IsPage()); |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(page->object.get_object() == this); |
| DEBUG_ASSERT(page->object.get_page_offset() == off); |
| |
| // End the traversal if we encounter a loaned page. We reclaim loaned pages by evicting |
| // them, and dirty pages cannot be evicted. |
| if (page->is_loaned()) { |
| // If this is a loaned page, it should be clean. |
| DEBUG_ASSERT(is_page_clean(page)); |
| return ZX_ERR_STOP; |
| } |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| // Mark the page dirty. |
| if (!is_page_dirty(page)) { |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(page, off, DirtyState::Dirty); |
| } |
| // The page was either already dirty, or we just marked it dirty. Proceed to the next one. |
| DEBUG_ASSERT(start_offset + dirty_len == off); |
| dirty_len += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [](uint64_t start, uint64_t end) { |
| // We found a gap. End the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| *dirty_len_out = dirty_len; |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| // Otherwise, generate a DIRTY page request for pages in the range which need to transition to |
| // Dirty. Pages that qualify are: |
| // - Any contiguous run of non-Dirty pages (committed pages as well as zero page markers). |
| // For the purpose of generating DIRTY requests, both Clean and AwaitingClean pages are |
| // considered equivalent. This is because pages that are in AwaitingClean will need another |
| // acknowledgment from the user pager before they can be made Dirty (the filesystem might need to |
| // reserve additional space for them etc.). |
| // - Any zero intervals are implicit zero pages, i.e. the kernel supplies zero pages when they |
| // are accessed. Since these pages are not supplied by the user pager via zx_pager_supply_pages, |
| // we will need to wait on a DIRTY request before the sparse range can be replaced by an actual |
| // page for writing (the filesystem might need to reserve additional space). |
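| // For example (hypothetical layout): if the first pages in the range are Dirty, Dirty, Clean, |
| // the traversal below accumulates dirty_len == 2 * PAGE_SIZE, stops at the Clean page and |
| // returns success without generating a request. If instead they are Clean, Marker, Dirty, it |
| // accumulates pages_to_dirty_len == 2 * PAGE_SIZE and generates a single DIRTY request for that |
| // run; the trailing Dirty page is picked up by a later lookup once the request is resolved. |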
| uint64_t pages_to_dirty_len = 0; |
| |
| // Helper lambda used in the page list traversal below. Try to add pages in the range |
| // [dirty_pages_start, dirty_pages_end) to the run of dirty pages being tracked. Return codes are |
| // the same as those used by VmPageList::ForEveryPageAndGapInRange to continue or terminate |
| // traversal. |
| auto accumulate_dirty_pages = [&pages_to_dirty_len, &dirty_len, start_offset]( |
| uint64_t dirty_pages_start, |
| uint64_t dirty_pages_end) -> zx_status_t { |
| // Bail if we were tracking a non-zero run of pages to be dirtied as we cannot extend |
| // pages_to_dirty_len anymore. |
| if (pages_to_dirty_len > 0) { |
| return ZX_ERR_STOP; |
| } |
| // Append the page to the dirty range being tracked if it immediately follows it. |
| if (start_offset + dirty_len == dirty_pages_start) { |
| dirty_len += (dirty_pages_end - dirty_pages_start); |
| return ZX_ERR_NEXT; |
| } |
| // Otherwise we cannot accumulate any more contiguous dirty pages. |
| return ZX_ERR_STOP; |
| }; |
| |
| // Helper lambda used in the page list traversal below. Try to add pages in the range |
| // [to_dirty_start, to_dirty_end) to the run of to-be-dirtied pages being tracked. Return codes |
| // are the same as those used by VmPageList::ForEveryPageAndGapInRange to continue or terminate |
| // traversal. |
| auto accumulate_pages_to_dirty = [&pages_to_dirty_len, &dirty_len, start_offset]( |
| uint64_t to_dirty_start, |
| uint64_t to_dirty_end) -> zx_status_t { |
| // Bail if we were already accumulating a non-zero run of Dirty pages. |
| if (dirty_len > 0) { |
| return ZX_ERR_STOP; |
| } |
| // Append the pages to the range being tracked if they immediately follow it. |
| if (start_offset + pages_to_dirty_len == to_dirty_start) { |
| pages_to_dirty_len += (to_dirty_end - to_dirty_start); |
| return ZX_ERR_NEXT; |
| } |
| // Otherwise we cannot accumulate any more contiguous to-dirty pages. |
| return ZX_ERR_STOP; |
| }; |
| |
| // This tracks the beginning of an interval that falls in the specified range. Since we might |
| // start partway inside an interval, this is initialized to start_offset so that we only consider |
| // the portion of the interval inside the range. If we did not start inside an interval, we will |
| // end up reinitializing this when we do find an interval start, before this value is used, so it |
| // is safe to initialize to start_offset in all cases. |
| uint64_t interval_start_off = start_offset; |
| // This tracks whether we saw an interval start sentinel in the traversal, but have not yet |
| // encountered a matching interval end sentinel. Should we end the traversal partway in an |
| // interval, we will need to handle the portion of the interval between the interval start and the |
| // end of the specified range. |
| bool unmatched_interval_start = false; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&accumulate_dirty_pages, &accumulate_pages_to_dirty, &interval_start_off, |
| &unmatched_interval_start, &found_page_or_gap, this](const VmPageOrMarker* p, uint64_t off) { |
| found_page_or_gap = true; |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // VMOs that trap dirty transitions should not have loaned pages. |
| DEBUG_ASSERT(!page->is_loaned()); |
| // Page is already dirty. Try to add it to the dirty run. |
| if (is_page_dirty(page)) { |
| return accumulate_dirty_pages(off, off + PAGE_SIZE); |
| } |
| // If the page is clean, mark it accessed to grant it some protection from eviction |
| // until the pager has a chance to respond to the DIRTY request. |
| if (is_page_clean(page)) { |
| AssertHeld(lock_ref()); |
| pmm_page_queues()->MarkAccessed(page); |
| } |
| } else if (p->IsIntervalZero()) { |
| if (p->IsIntervalStart() || p->IsIntervalSlot()) { |
| unmatched_interval_start = true; |
| interval_start_off = off; |
| } |
| if (p->IsIntervalEnd() || p->IsIntervalSlot()) { |
| unmatched_interval_start = false; |
| // We need to commit pages if this is an interval, irrespective of the dirty state. |
| return accumulate_pages_to_dirty(interval_start_off, off + PAGE_SIZE); |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| // We don't compress pages in pager-backed VMOs. |
| DEBUG_ASSERT(!p->IsReference()); |
| // Parent content markers do not appear in pager-backed hierarchies. |
| DEBUG_ASSERT(!p->IsParentContent()); |
| // This is either a zero page marker (which represents a clean zero page) or a committed |
| // page which is not already Dirty. Try to add it to the range of pages to be dirtied. |
| DEBUG_ASSERT(p->IsMarker() || !is_page_dirty(p->Page())); |
| return accumulate_pages_to_dirty(off, off + PAGE_SIZE); |
| }, |
| [&found_page_or_gap](uint64_t start, uint64_t end) { |
| found_page_or_gap = true; |
| // We found a gap. End the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start_offset, end_offset); |
| |
| // We don't expect an error from the traversal above. If an incompatible contiguous page or |
| // a gap is encountered, we will simply terminate early. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // Process the last remaining interval if there is one. |
| if (unmatched_interval_start) { |
| accumulate_pages_to_dirty(interval_start_off, end_offset); |
| } |
| |
| // Account for the case where we started and ended in unpopulated slots inside an interval, i.e. |
| // we did not find either a page or a gap in the traversal. We would not have accumulated any |
| // pages in that case. |
| if (!found_page_or_gap) { |
| DEBUG_ASSERT(page_list_.IsOffsetInZeroInterval(start_offset)); |
| DEBUG_ASSERT(page_list_.IsOffsetInZeroInterval(end_offset - PAGE_SIZE)); |
| DEBUG_ASSERT(dirty_len == 0); |
| DEBUG_ASSERT(pages_to_dirty_len == 0); |
| // The entire range falls in an interval so it needs a DIRTY request. |
| pages_to_dirty_len = end_offset - start_offset; |
| } |
| |
| // We should either have found dirty pages or pages that need to be dirtied, but not both. |
| DEBUG_ASSERT(dirty_len == 0 || pages_to_dirty_len == 0); |
| // Check that dirty_len and pages_to_dirty_len both specify valid ranges. |
| DEBUG_ASSERT(start_offset + dirty_len <= end_offset); |
| DEBUG_ASSERT(pages_to_dirty_len == 0 || start_offset + pages_to_dirty_len <= end_offset); |
| |
| *dirty_len_out = dirty_len; |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| |
| // No pages need to transition to Dirty. |
| if (pages_to_dirty_len == 0) { |
| return ZX_OK; |
| } |
| |
| // Found a contiguous run of pages that need to transition to Dirty. There might be more such |
| // pages later in the range, but we will come into this call again for them via another |
| // LookupCursor call after the waiting caller is unblocked for this range. |
| |
| VmoDebugInfo vmo_debug_info{}; |
| // We have a page source so this cannot be a hidden node, but the VmObjectPaged could have been |
| // destroyed. We could be looking up a page via a lookup in a child (slice) after the parent |
| // VmObjectPaged has gone away, so paged_ref_ could be null. Let the page source handle any |
| // failures requesting the dirty transition. |
| if (paged_ref_) { |
| vmo_debug_info.vmo_id = paged_ref_->user_id(); |
| paged_ref_->get_name(vmo_debug_info.vmo_name, sizeof(vmo_debug_info.vmo_name)); |
| } |
| status = page_source_->RequestDirtyTransition(page_request->get(), start_offset, |
| pages_to_dirty_len, vmo_debug_info); |
| // The page source will never succeed synchronously. |
| DEBUG_ASSERT(status != ZX_OK); |
| return status; |
| } |
| |
| void VmCowPages::LookupCursor::EstablishCursor() { |
| // Check if the cursor needs recalculating. |
| if (IsCursorValid()) { |
| return; |
| } |
| // Release any lock held from any previous cursor. As per the comment on InvalidateCursor, the |
| // lock is dropped here to ensure that up until the next page is required, the caller can assume |
| // the lock of the owning VMO is held. |
| owner_info_.owner.release(); |
| |
| // Ensure still in the valid range. |
| DEBUG_ASSERT(offset_ < end_offset_); |
| |
| target_->FindPageContentLocked(offset_, end_offset_ - offset_, &owner_info_); |
| owner_cursor_ = owner_info_.cursor.current(); |
| is_valid_ = true; |
| } |
| |
| inline VmCowPages::LookupCursor::RequireResult VmCowPages::LookupCursor::PageAsResultNoIncrement( |
| vm_page_t* page, bool in_target) { |
| // The page is writable if it's present in the target (non owned pages are never writable) and it |
| // does not need a dirty transition. A page doesn't need a dirty transition if the target isn't |
| // preserving page contents, or if the page is just already dirty. |
| RequireResult result{page, |
| (in_target && (!target_preserving_page_content_ || is_page_dirty(page)))}; |
| return result; |
| } |
| |
| void VmCowPages::LookupCursor::IncrementOffsetAndInvalidateCursor(uint64_t delta) { |
| offset_ += delta; |
| InvalidateCursor(); |
| } |
| |
| bool VmCowPages::LookupCursor::CursorIsContentZero() const { |
| // Markers are always zero. |
| if (CursorIsMarker()) { |
| return true; |
| } |
| |
| if (owner_info_.owner.locked_or(target_).page_source_) { |
| // With a page source emptiness implies needing to request content, however we can have zero |
| // intervals which do start as zero content. |
| return CursorIsInIntervalZero(); |
| } |
| // Without a page source emptiness is filled with zeros and intervals are only permitted if there |
| // is a page source. |
| // We consider parent content to be empty since a parent content marker can be spurious, and the |
| // only time the cursor would actually point to the parent content marker is if there is no |
| // content to be found in the parent. |
| return CursorIsEmpty() || CursorIsParentContent(); |
| } |
| |
| bool VmCowPages::LookupCursor::TargetZeroContentSupplyDirty(bool writing) const { |
| if (!TargetDirtyTracked()) { |
| return false; |
| } |
| if (writing) { |
| return true; |
| } |
| // Markers start clean. |
| if (CursorIsMarker()) { |
| return false; |
| } |
| // The only way this offset can have been zero content and reach here is if we are in an |
| // interval. If this slot were empty then, since we are dirty tracked and hence must have a |
| // page source, we would not consider this zero. |
| DEBUG_ASSERT(CursorIsInIntervalZero()); |
| // Zero intervals are considered implicitly dirty and allocating them, even for reading, causes |
| // them to be supplied as new dirty pages. |
| return true; |
| } |
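| // Decision summary when the target is dirty tracked (when it is not, the answer is always |
| // false): |
| // |
| //   writing?  zero content kind  supplied page state |
| //   --------  -----------------  ------------------- |
| //   yes       marker / interval  Dirty |
| //   no        marker             Clean |
| //   no        zero interval      Dirty |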
| |
| zx::result<VmCowPages::LookupCursor::RequireResult> |
| VmCowPages::LookupCursor::TargetAllocateCopyPageAsResult(vm_page_t* source, DirtyState dirty_state, |
| VmCowPages::DeferredOps& deferred, |
| AnonymousPageRequest* page_request) { |
| vm_page_t* out_page = nullptr; |
| zx_status_t status = |
| target_->AllocateCopyPage(source->paddr(), alloc_list_, page_request, &out_page); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| // The forked page was just allocated, and so cannot be a loaned page. |
| DEBUG_ASSERT(!out_page->is_loaned()); |
| |
| // We could be allocating a page to replace a zero page marker in a pager-backed VMO. If so then |
| // set its dirty state to what was requested, AddPageLocked below will then insert the page into |
| // the appropriate page queue. |
| if (target_preserving_page_content_) { |
| // The only page we can be forking here is the zero page. |
| DEBUG_ASSERT(source == vm_get_zero_page()); |
| // The object directly owns the page. |
| DEBUG_ASSERT(TargetIsOwner()); |
| |
| target_->UpdateDirtyStateLocked(out_page, offset_, dirty_state, |
| /*is_pending_add=*/true); |
| } |
| |
| // For efficiency we would like to use the slot we already have in our cursor if possible, |
| // however that can only be done if all of the following hold: |
| // * TargetIsOwner() - If not true then we do not even have a cursor (and hence slot) for where |
| // the insertion is happening. |
| // * owner_info_.cursor.current() != nullptr - Must be an actual node and slot already |
| // allocated, even if the slot itself is just Empty(). |
| // * !is_source_preserving_page_content() - A source preserving page content may have intervals, |
| // which are zeroes that we could be overwriting here, but the slot itself we have found could |
| // be empty and the interval may need splitting. For simplicity we do not attempt to check for |
| // and handle interval splitting, and just skip reusing our slot in this case. |
| const bool can_reuse_slot = |
| (TargetIsOwner() && owner_info_.cursor.current() && |
| !owner_info_.owner.locked_or(target_).is_source_preserving_page_content()); |
| __UNINITIALIZED auto page_transaction = |
| can_reuse_slot ? target_->BeginAddPageWithSlotLocked(offset_, owner_info_.cursor.current(), |
| CanOverwriteContent::Zero) |
| : target_->BeginAddPageLocked(offset_, CanOverwriteContent::Zero); |
| if (page_transaction.is_error()) { |
| target_->FreePage(out_page); |
| return page_transaction.take_error(); |
| } |
| |
| [[maybe_unused]] VmPageOrMarker old = target_->CompleteAddPageLocked( |
| *page_transaction, VmPageOrMarker::Page(out_page), |
| source == vm_get_zero_page() ? ParentContent::Zero : ParentContent::Unknown, &deferred); |
| DEBUG_ASSERT(!old.IsPageOrRef()); |
| |
| // If asked to explicitly mark zero forks, and this is actually a fork of the zero page, move it |
| // to the correct queue. Discardable pages are not considered zero forks as they are always in the |
| // reclaimable page queues. |
| if (zero_fork_ && source == vm_get_zero_page() && !target_->is_discardable()) { |
| pmm_page_queues()->MoveAnonymousToAnonymousZeroFork(out_page); |
| } |
| |
| // This is the only path where we can allocate a new page without being a clone (clones are |
| // always cached). So we check here if we are not fully cached and if so perform a |
| // clean/invalidate to flush our zeroes. After doing this we will not touch the page via the |
| // physmap and so we can pretend there isn't an aliased mapping. |
| // There are three potential states that may exist |
| // * VMO is cached, paged_ref_ might be null, we might have children -> no cache op needed |
| // * VMO is uncached, paged_ref_ is not null, we have no children -> cache op needed |
| // * VMO is uncached, paged_ref_ is null, we have no children -> cache op not needed / |
| // state cannot happen |
| // In the uncached case we know we have no children, since it is by definition not valid to |
| // have copy-on-write children of uncached pages. The third case cannot happen, but even if it |
| // could with no children and no paged_ref_ the pages cannot actually be referenced so any |
| // cache operation is pointless. |
| // The paged_ref_ could be null if the VmObjectPaged has been destroyed. |
| if (target_->paged_ref_) { |
| if (paged_backlink_locked(target_)->GetMappingCachePolicyLocked() != ARCH_MMU_FLAG_CACHED) { |
| arch_clean_invalidate_cache_range((vaddr_t)paddr_to_physmap(out_page->paddr()), PAGE_SIZE); |
| } |
| } |
| |
| // Need to increment the cursor, but we have also potentially modified the page lists in the |
| // process of inserting the page. |
| if (TargetIsOwner()) { |
| // In the case of TargetIsOwner() we may have to create a node and need to establish a cursor. |
| // However, if we already had a node, i.e. the cursor was valid, then it would have had the page |
| // inserted into it. |
| if (!owner_info_.cursor.current()) { |
| IncrementOffsetAndInvalidateCursor(PAGE_SIZE); |
| } else { |
| // Cursor should have been updated to the new page |
| DEBUG_ASSERT(CursorIsPage()); |
| DEBUG_ASSERT(owner_cursor_->Page() == out_page); |
| IncrementCursor(); |
| } |
| } else { |
| // If !TargetIsOwner() then the owner's page list will not have been modified, so safe to just |
| // increment. |
| IncrementCursor(); |
| } |
| |
| // Return the page. We know it's in the target, since we just put it there, but let PageAsResult |
| // determine if that means it is actually writable or not. |
| return zx::ok(PageAsResultNoIncrement(out_page, true)); |
| } |
| |
| zx_status_t VmCowPages::LookupCursor::CursorReferenceToPage(AnonymousPageRequest* page_request) { |
| DEBUG_ASSERT(CursorIsReference()); |
| |
| return owner_info_.owner.locked_or(target_).ReplaceReferenceWithPageLocked( |
| owner_cursor_, owner_info_.owner_offset, page_request); |
| } |
| |
| zx_status_t VmCowPages::LookupCursor::ReadRequest(uint max_request_pages, |
| PageRequest* page_request) { |
| // The owner must have a page_source_ to be doing a read request. |
| DEBUG_ASSERT(owner_info_.owner.locked_or(target_).page_source_); |
| // The cursor should be explicitly empty as read requests are only for complete content absence. |
| DEBUG_ASSERT(CursorIsEmpty()); |
| DEBUG_ASSERT(!CursorIsInIntervalZero()); |
| // The total range requested should not be beyond the cursors valid range. |
| DEBUG_ASSERT(offset_ + PAGE_SIZE * max_request_pages <= end_offset_); |
| DEBUG_ASSERT(max_request_pages > 0); |
| |
| VmoDebugInfo vmo_debug_info{}; |
| // The page owner has a page source so it cannot be a hidden node, but the VmObjectPaged |
| // could have been destroyed. We could be looking up a page via a lookup in a child after |
| // the parent VmObjectPaged has gone away, so paged_ref_ could be null. Let the page source |
| // handle any failures requesting the pages. |
| if (owner_info_.owner.locked_or(target_).paged_ref_) { |
| vmo_debug_info.vmo_id = owner_info_.owner.locked_or(target_).paged_ref_->user_id(); |
| owner_info_.owner.locked_or(target_).paged_ref_->get_name(vmo_debug_info.vmo_name, |
| sizeof(vmo_debug_info.vmo_name)); |
| } |
| |
| // Try and batch more pages up to |max_request_pages|. |
| uint64_t request_size = static_cast<uint64_t>(max_request_pages) * PAGE_SIZE; |
| if (!TargetIsOwner()) { |
| DEBUG_ASSERT(owner_info_.visible_end > offset_); |
| // Limit the request by the number of pages that are actually visible from the target_ to |
| // owner. |
| request_size = ktl::min(request_size, owner_info_.visible_end - offset_); |
| } |
| // Limit |request_size| to the first page visible in the page owner to avoid requesting pages |
| // that are already present. If there is one page present in an otherwise long run of absent pages |
| // then it might be preferable to have one big page request, but for now only request absent |
| // pages. If already requesting a single page then we can avoid the page list operation. |
| if (request_size > PAGE_SIZE) { |
| owner_info_.owner.locked_or(target_).page_list_.ForEveryPageInRange( |
| [&](const VmPageOrMarker* p, uint64_t offset) { |
| DEBUG_ASSERT(!p->IsParentContent()); |
| // Content should have been empty initially, so should not find anything at the start |
| // offset. |
| DEBUG_ASSERT(offset > owner_info_.owner_offset); |
| // If this is an interval sentinel, it can only be a start or slot, since we know we |
| // started in a true gap outside of an interval. |
| DEBUG_ASSERT(!p->IsInterval() || p->IsIntervalSlot() || p->IsIntervalStart()); |
| const uint64_t new_size = offset - owner_info_.owner_offset; |
| // Due to the limited range of the operation, the only way this callback ever fires is if |
| // the range is actually getting trimmed. |
| DEBUG_ASSERT(new_size < request_size); |
| request_size = new_size; |
| return ZX_ERR_STOP; |
| }, |
| owner_info_.owner_offset, owner_info_.owner_offset + request_size); |
| } |
| DEBUG_ASSERT(request_size >= PAGE_SIZE); |
| |
| zx_status_t status = owner_info_.owner.locked_or(target_).page_source_->GetPages( |
| owner_info_.owner_offset, request_size, page_request, vmo_debug_info); |
| // Pager page sources will never synchronously return a page. |
| DEBUG_ASSERT(status != ZX_OK); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::LookupCursor::DirtyRequest(uint max_request_pages, |
| LazyPageRequest* page_request) { |
| // Dirty requests, unlike read requests, happen directly against the target, and not the owner. |
| // This is because to make something dirty you must own it. Simply checking for TargetIsOwner() is |
| // insufficient, since the cursor may have been made invalid (clearing the owner) just prior to |
| // generating this dirty request, and we do not otherwise need the cursor here. |
| // So we also validate that we have no parent, and that we have a page source. |
| DEBUG_ASSERT(TargetIsOwner()); |
| DEBUG_ASSERT(!target_->parent_); |
| DEBUG_ASSERT(target_->page_source_); |
| DEBUG_ASSERT(max_request_pages > 0); |
| DEBUG_ASSERT(offset_ + PAGE_SIZE * max_request_pages <= end_offset_); |
| |
| // As we know target_ is the owner there is no need to trim the requested range to any kind of |
| // visible range, so just attempt to dirty the entire range. |
| uint64_t dirty_len = 0; |
| zx_status_t status = target_->PrepareForWriteLocked( |
| VmCowRange(offset_, PAGE_SIZE * max_request_pages), page_request, &dirty_len); |
| if (status == ZX_OK) { |
| // If success is claimed then it must be the case that at least one page was dirtied, allowing |
| // us to make progress. |
| DEBUG_ASSERT(dirty_len != 0 && dirty_len <= max_request_pages * PAGE_SIZE); |
| } else { |
| DEBUG_ASSERT(dirty_len == 0); |
| } |
| return status; |
| } |
| |
| vm_page_t* VmCowPages::LookupCursor::MaybePage(bool will_write) { |
| EstablishCursor(); |
| |
| // If the page is immediately usable, i.e. no dirty transitions etc needed, then we can provide |
| // it. Otherwise just increment the cursor and return nullptr. |
| vm_page_t* page = CursorIsUsablePage(will_write) ? owner_cursor_->Page() : nullptr; |
| |
| if (page && mark_accessed_) { |
| pmm_page_queues()->MarkAccessed(page); |
| } |
| |
| IncrementCursor(); |
| |
| return page; |
| } |
| |
| uint64_t VmCowPages::LookupCursor::SkipMissingPages() { |
| EstablishCursor(); |
| |
| // Check if the cursor is truly empty |
| if (!CursorIsEmpty() || CursorIsInIntervalZero()) { |
| return 0; |
| } |
| |
| uint64_t possibly_empty = owner_info_.visible_end - offset_; |
| // Limit possibly_empty by the first page visible in the owner which, since our cursor is empty, |
| // would also be the root vmo. |
| if (possibly_empty > PAGE_SIZE) { |
| owner_info_.owner.locked_or(target_).page_list_.ForEveryPageInRange( |
| [&](const VmPageOrMarker* p, uint64_t offset) { |
| // Content should have been empty initially, so should not find anything at the start |
| // offset. |
| DEBUG_ASSERT(offset > owner_info_.owner_offset); |
| // If this is an interval sentinel, it can only be a start or slot, since we know we |
| // started in a true gap outside of an interval. |
| DEBUG_ASSERT(!p->IsInterval() || p->IsIntervalSlot() || p->IsIntervalStart()); |
| const uint64_t new_size = offset - owner_info_.owner_offset; |
| // Due to the limited range of the operation, the only way this callback ever fires is if |
| // the range is actually getting trimmed. |
| DEBUG_ASSERT(new_size < possibly_empty); |
| possibly_empty = new_size; |
| return ZX_ERR_STOP; |
| }, |
| owner_info_.owner_offset, owner_info_.owner_offset + possibly_empty); |
| } |
| // The cursor was empty, so we should have ended up with at least one page. |
| DEBUG_ASSERT(possibly_empty >= PAGE_SIZE); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(possibly_empty)); |
| DEBUG_ASSERT(possibly_empty + offset_ <= end_offset_); |
| IncrementOffsetAndInvalidateCursor(possibly_empty); |
| return possibly_empty / PAGE_SIZE; |
| } |
| |
| uint VmCowPages::LookupCursor::IfExistPages(bool will_write, uint max_pages, paddr_t* paddrs) { |
| // Ensure that the requested range is valid. |
| DEBUG_ASSERT(offset_ + PAGE_SIZE * max_pages <= end_offset_); |
| DEBUG_ASSERT(paddrs); |
| |
| EstablishCursor(); |
| |
| // We only return actual pages that are ready to use right now without any dirty transitions or |
| // copy-on-write or needing to mark them accessed. |
| if (!CursorIsUsablePage(will_write) || mark_accessed_) { |
| return 0; |
| } |
| |
| // Trim max pages to the visible length of the current owner. This only has an effect when |
| // target_ is not the owner as otherwise the visible_end is the same as end_offset_ and we already |
| // validated that we are within that range. |
| if (!TargetIsOwner()) { |
| max_pages = |
| ktl::min(max_pages, static_cast<uint>((owner_info_.visible_end - offset_) / PAGE_SIZE)); |
| } |
| DEBUG_ASSERT(max_pages > 0); |
| |
| // Take up to the max_pages as long as they exist contiguously. |
| uint pages = 0; |
| owner_info_.cursor.ForEveryContiguous([&](VmPageOrMarkerRef page) { |
| if (page->IsPage()) { |
| paddrs[pages] = page->PageAsPaddr(); |
| pages++; |
| return pages == max_pages ? ZX_ERR_STOP : ZX_ERR_NEXT; |
| } |
| return ZX_ERR_STOP; |
| }); |
| // Update the cursor to reflect the number of pages we found and are returning. |
| // We could check if cursor is still valid, but it's more efficient to just invalidate it and let |
| // any potential next page request recalculate it. |
| IncrementOffsetAndInvalidateCursor(pages * PAGE_SIZE); |
| return pages; |
| } |
| |
| zx::result<VmCowPages::LookupCursor::RequireResult> VmCowPages::LookupCursor::RequireOwnedPage( |
| bool will_write, uint max_request_pages, DeferredOps& deferred, |
| MultiPageRequest* page_request) { |
| DEBUG_ASSERT(page_request); |
| |
| // Make sure the cursor is valid. |
| EstablishCursor(); |
| |
| // Convert any references to pages. |
| if (CursorIsReference()) { |
| // Decompress in place. |
| zx_status_t status = CursorReferenceToPage(page_request->GetAnonymous()); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| } |
| |
| // If the page exists in the target, i.e. the owner is the target, then we handle this case |
| // separately as it's the only scenario where we might be dirtying an existing committed page. |
| if (TargetIsOwner() && CursorIsPage()) { |
| // If we're writing to a root VMO backed by a user pager, i.e. a VMO whose page source preserves |
| // page contents, we might need to mark pages Dirty so that they can be written back later. This |
| // is the only path that can result in a write to such a page; if the page was not present, we |
| // would have already blocked on a read request the first time, and ended up here when |
| // unblocked, at which point the page would be present. |
| if (will_write && target_preserving_page_content_) { |
| // If this page was loaned, it should be replaced with a non-loaned page, so that we can make |
| // progress with marking pages dirty. PrepareForWriteLocked terminates its page walk when it |
| // encounters a loaned page; loaned pages are reclaimed by evicting them and we cannot evict |
| // dirty pages. |
| if (owner_cursor_->Page()->is_loaned()) { |
| vm_page_t* res_page = nullptr; |
| DEBUG_ASSERT(is_page_clean(owner_cursor_->Page())); |
| zx_status_t status = |
| target_->ReplacePageLocked(owner_cursor_->Page(), offset_, /*with_loaned=*/false, |
| &res_page, deferred, page_request->GetAnonymous()); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| // Cursor should remain valid and have been replaced with the page. |
| DEBUG_ASSERT(CursorIsPage()); |
| DEBUG_ASSERT(owner_cursor_->Page() == res_page); |
| DEBUG_ASSERT(!owner_cursor_->Page()->is_loaned()); |
| } |
| // If the page is not already dirty, then generate a dirty request. The dirty request code can |
| // handle the page already being dirty, this is just a short circuit optimization. |
| if (!is_page_dirty(owner_cursor_->Page())) { |
| zx_status_t status = DirtyRequest(max_request_pages, page_request->GetLazyDirtyRequest()); |
| if (status != ZX_OK) { |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| page_request->MadeDirtyRequest(); |
| } |
| return zx::error(status); |
| } |
| } |
| } |
| // Return the page. |
| return zx::ok(CursorAsResult()); |
| } |
| |
| // Should there be a page, but it is not owned by the target, then we are performing copy on write |
| // into the target. As the target cannot have a page source, we do not need to worry about writes |
| // or dirtying. |
| if (CursorIsPage()) { |
| DEBUG_ASSERT(!TargetIsOwner()); |
| vm_page_t* res_page = nullptr; |
| // Although we are not returning the page, the act of forking counts as an access, and this is |
| // an access regardless of whether the final returned page should be considered accessed, so |
| // ignore the mark_accessed_ check here. |
| pmm_page_queues()->MarkAccessed(owner_cursor_->Page()); |
| if (!owner_info_.owner.locked_or(target_).is_hidden()) { |
| // Directly copying the page from the owner into the target. |
| return TargetAllocateCopyPageAsResult(owner_cursor_->Page(), DirtyState::Untracked, deferred, |
| page_request->GetAnonymous()); |
| } |
| zx_status_t result = target_->CloneCowPageLocked( |
| offset_, alloc_list_, &owner_info_.owner.locked_or(target_), owner_cursor_->Page(), |
| owner_info_.owner_offset, deferred, page_request->GetAnonymous(), &res_page); |
| if (result != ZX_OK) { |
| return zx::error(result); |
| } |
| // Cloning the cow page may have impacted our cursor due to a page being moved so invalidate the |
| // cursor to perform a fresh lookup on the next page requested. |
| IncrementOffsetAndInvalidateCursor(PAGE_SIZE); |
| // This page was just allocated so there is no need to update access times; we can just return. |
| return zx::ok(RequireResult{res_page, true}); |
| } |
| |
| // Zero content is the most complicated case where, even if reading, dirty requests might need to |
| // be performed and the resulting committed pages may or may not be dirty. |
| if (CursorIsContentZero()) { |
| // If the page source is preserving content (is a PagerProxy), and is configured to trap dirty |
| // transitions, we first need to generate a DIRTY request *before* the zero page can be forked |
| // and marked dirty. If dirty transitions are not trapped, we will fall through to allocate the |
| // page and then mark it dirty below. |
| // |
| // Note that the check for ShouldTrapDirtyTransitions() is an optimization here. |
| // PrepareForWriteLocked() would do the right thing depending on ShouldTrapDirtyTransitions(), |
| // however we choose to avoid the extra work only to have it be a no-op if dirty transitions |
| // should not be trapped. |
| const bool target_page_dirty = TargetZeroContentSupplyDirty(will_write); |
| if (target_page_dirty && target_->page_source_->ShouldTrapDirtyTransitions()) { |
| zx_status_t status = DirtyRequest(max_request_pages, page_request->GetLazyDirtyRequest()); |
| // Since we know we have a page source that traps, and page sources will never succeed |
| // synchronously, our dirty request must have 'failed'. |
| DEBUG_ASSERT(status != ZX_OK); |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| page_request->MadeDirtyRequest(); |
| } |
| return zx::error(status); |
| } |
| // Allocate the page and mark it dirty or clean as previously determined. |
| return TargetAllocateCopyPageAsResult(vm_get_zero_page(), |
| target_page_dirty ? DirtyState::Dirty : DirtyState::Clean, |
| deferred, page_request->GetAnonymous()); |
| } |
| DEBUG_ASSERT(CursorIsEmpty()); |
| |
| // Generate a read request to populate the content in the owner. Even if this is a write, we still |
| // populate content first, then perform any dirty transitions / requests. |
| return zx::error(ReadRequest(max_request_pages, page_request->GetReadRequest())); |
| } |
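| // In brief, the decision tree above is: content already in the target is returned directly |
| // (after a loaned-page replacement and/or a DIRTY request for writes where needed); content |
| // found in an ancestor is copied, or copy-on-write forked if the ancestor is hidden; zero |
| // content is materialized via TargetAllocateCopyPageAsResult, preceded by a DIRTY request when |
| // it must be supplied dirty and the page source traps dirty transitions; truly absent content |
| // generates a read request. |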
| |
| zx::result<VmCowPages::LookupCursor::RequireResult> VmCowPages::LookupCursor::RequireReadPage( |
| uint max_request_pages, DeferredOps& deferred, MultiPageRequest* page_request) { |
| DEBUG_ASSERT(page_request); |
| |
| // Make sure the cursor is valid. |
| EstablishCursor(); |
| |
| // If there's a page or reference, return it. |
| if (CursorIsPage() || CursorIsReference()) { |
| if (CursorIsReference()) { |
| zx_status_t status = CursorReferenceToPage(page_request->GetAnonymous()); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| DEBUG_ASSERT(CursorIsPage()); |
| } |
| return zx::ok(CursorAsResult()); |
| } |
| |
| // Check for zero page options. |
| if (CursorIsContentZero()) { |
| IncrementCursor(); |
| return zx::ok(RequireResult{vm_get_zero_page(), false}); |
| } |
| |
| // No available content, need to fetch it from the page source. ReadRequest performs all the |
| // requisite asserts to ensure we are not doing this mistakenly. |
| return zx::error(ReadRequest(max_request_pages, page_request->GetReadRequest())); |
| } |
| |
| zx::result<VmCowPages::LookupCursor> VmCowPages::GetLookupCursorLocked(VmCowRange range) { |
| canary_.Assert(); |
| DEBUG_ASSERT(!is_hidden()); |
| DEBUG_ASSERT(!range.is_empty()); |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Alive); |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| |
| if (unlikely(range.offset >= size_ || !range.IsBoundedBy(size_))) { |
| return zx::error{ZX_ERR_OUT_OF_RANGE}; |
| } |
| |
| if (discardable_tracker_) { |
| discardable_tracker_->assert_cow_pages_locked(); |
| // This vmo was discarded and has not been locked yet after the discard. Do not return any |
| // pages. |
| if (discardable_tracker_->WasDiscardedLocked()) { |
| return zx::error{ZX_ERR_NOT_FOUND}; |
| } |
| } |
| |
| return zx::ok(LookupCursor(this, range)); |
| } |
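| // A typical use of the cursor can be seen in CommitRangeLocked below: acquire a cursor for the |
| // range, optionally hand it an allocation list via GiveAllocList, then repeatedly call one of |
| // the Require* / MaybePage accessors, stopping at the first error, which may be a page request |
| // the caller must wait on before retrying. |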
| |
| zx_status_t VmCowPages::CommitRangeLocked(VmCowRange range, DeferredOps& deferred, |
| uint64_t* committed_len, MultiPageRequest* page_request) { |
| canary_.Assert(); |
| LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", range.offset, range.len); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(range.IsBoundedBy(size_)); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| // If this vmo has a direct page source, then the source will provide the backing memory. For |
| // children that eventually depend on a page source, we skip preallocating memory to avoid |
| // potentially overallocating pages if something else touches the vmo while we're blocked on the |
| // request. Otherwise we optimize things by preallocating all the pages. |
| list_node page_list; |
| list_initialize(&page_list); |
| if (!root_has_page_source()) { |
| // make a pass through the list to find out how many pages we need to allocate |
| size_t count = range.len / PAGE_SIZE; |
| page_list_.ForEveryPageInRange( |
| [&count](const auto* p, auto off) { |
| if (p->IsPage()) { |
| count--; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| range.offset, range.end()); |
| |
| if (count == 0) { |
| *committed_len = range.len; |
| return ZX_OK; |
| } |
| |
| zx_status_t status = pmm_alloc_pages(count, pmm_alloc_flags_, &page_list); |
| // Ignore ZX_ERR_SHOULD_WAIT since the loop below will fall back to page-by-page allocation, |
| // allowing us to wait for single pages should we need to. |
| if (status != ZX_OK && status != ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| } |
| |
| auto list_cleanup = fit::defer([&page_list, this]() { |
| if (!list_is_empty(&page_list)) { |
| FreePages(&page_list); |
| } |
| }); |
| |
| const uint64_t start_offset = range.offset; |
| const uint64_t end = range.end(); |
| __UNINITIALIZED auto cursor = GetLookupCursorLocked(range); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| // Commit represents an explicit desire to have pages and should not be deduped back to the zero |
| // page. |
| cursor->DisableZeroFork(); |
| cursor->GiveAllocList(&page_list); |
| |
| zx_status_t status = ZX_OK; |
| uint64_t offset = start_offset; |
| while (offset < end) { |
| __UNINITIALIZED zx::result<VmCowPages::LookupCursor::RequireResult> result = |
| cursor->RequireOwnedPage(false, static_cast<uint>((end - offset) / PAGE_SIZE), deferred, |
| page_request); |
| |
| if (result.is_error()) { |
| status = result.error_value(); |
| break; |
| } |
| offset += PAGE_SIZE; |
| } |
| // Record how much we were able to process. |
| *committed_len = offset - start_offset; |
| |
| // Clear the alloc list from the cursor and let list_cleanup free any remaining pages. |
| cursor->ClearAllocList(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::PinRangeLocked(VmCowRange range) { |
| canary_.Assert(); |
| LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", range.offset, range.len); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(range.IsBoundedBy(size_)); |
| |
| ever_pinned_ = true; |
| |
| // Tracks our expected page offset when iterating to ensure all pages are present. |
| uint64_t next_offset = range.offset; |
| |
| // Should any errors occur we need to unpin everything. |
| auto pin_cleanup = fit::defer([this, offset = range.offset, &next_offset]() { |
| if (next_offset > offset) { |
| AssertHeld(*lock()); |
| UnpinLocked(VmCowRange(offset, next_offset - offset), nullptr); |
| } |
| }); |
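| // For reference, the cleanup above uses the standard fit::defer idiom from <lib/fit/defer.h>: |
| // run a rollback lambda on scope exit unless the operation fully succeeds, in which case the |
| // rollback is cancelled. A minimal illustrative sketch, with DoPartialWork() being a |
| // hypothetical helper and not part of this file: |
| // |
| //   zx_status_t DoWorkWithRollback() { |
| //     auto cleanup = fit::defer([]() { /* undo any partial work */ }); |
| //     zx_status_t status = DoPartialWork(); |
| //     if (status != ZX_OK) { |
| //       return status;  // |cleanup| runs here and rolls back. |
| //     } |
| //     cleanup.cancel();  // Success: keep the work and skip the rollback. |
| //     return ZX_OK; |
| //   } |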
| |
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [this, &next_offset](const VmPageOrMarker* p, uint64_t page_offset) { |
| AssertHeld(lock_ref()); |
| if (page_offset != next_offset || !p->IsPage()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| if (page->object.pin_count == VM_PAGE_OBJECT_MAX_PIN_COUNT) { |
| return ZX_ERR_UNAVAILABLE; |
| } |
| |
| page->object.pin_count++; |
| if (page->object.pin_count == 1) { |
| MoveToPinnedLocked(page, page_offset); |
| } |
| |
| // Pinning every page in the largest vmo possible as many times as possible can't overflow |
| static_assert(VmPageList::MAX_SIZE / PAGE_SIZE < UINT64_MAX / VM_PAGE_OBJECT_MAX_PIN_COUNT); |
| next_offset += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| range.offset, range.end()); |
| |
| const uint64_t actual = (next_offset - range.offset) / PAGE_SIZE; |
| // Count whatever pages we pinned; in the failure scenario this will get decremented on the unpin. |
| pinned_page_count_ += actual; |
| |
| if (status == ZX_OK) { |
| // If the missing pages were at the end of the range (or the range was empty) then our iteration |
| // will have just returned ZX_OK. Perform one final check that we actually pinned the number of |
| // pages we expected to. |
| const uint64_t expected = range.len / PAGE_SIZE; |
| if (actual != expected) { |
| status = ZX_ERR_BAD_STATE; |
| } else { |
| pin_cleanup.cancel(); |
| } |
| } |
| return status; |
| } |
| |
| zx_status_t VmCowPages::DecommitRange(VmCowRange range) { |
| canary_.Assert(); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| // Validate the size and perform our zero-length hot-path check before we recurse |
| // up to our top-level ancestor. Size bounding needs to take place relative |
| // to the child the operation was originally targeted against. |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // was in range, just zero length |
| if (range.is_empty()) { |
| return ZX_OK; |
| } |
| |
| // Currently, we can't decommit if the absence of a page doesn't imply zeroes. |
| if (parent_ || is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // VmObjectPaged::DecommitRange() rejects is_contiguous() VMOs (for now). |
| DEBUG_ASSERT(can_decommit()); |
| |
| // Demand offset and length be correctly aligned to not give surprising user semantics. |
| if (!range.is_page_aligned()) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| return UnmapAndFreePagesLocked(range.offset, range.len, deferred).status_value(); |
| } |
| |
| zx::result<uint64_t> VmCowPages::UnmapAndFreePagesLocked(uint64_t offset, uint64_t len, |
| DeferredOps& deferred) { |
| canary_.Assert(); |
| |
| if (AnyPagesPinnedLocked(offset, len)) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| LTRACEF("start offset %#" PRIx64 ", end %#" PRIx64 "\n", offset, offset + len); |
| |
| // We've already trimmed the range in DecommitRange(). |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| // Verify page alignment. |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(len) || (offset + len == size_)); |
| |
| // DecommitRange() will call this function only on a VMO with no parent. |
| DEBUG_ASSERT(!parent_); |
| |
| // unmap all of the pages in this range on all the mapping regions |
| RangeChangeUpdateLocked(VmCowRange(offset, len), RangeChangeOp::Unmap, &deferred); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(deferred.FreedList(this)); |
| |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), offset, offset + len); |
| page_remover.Flush(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return zx::ok(page_remover.freed_count()); |
| } |
| |
| bool VmCowPages::PageWouldReadZeroLocked(uint64_t page_offset) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(page_offset)); |
| DEBUG_ASSERT(page_offset < size_); |
| const VmPageOrMarker* slot = page_list_.Lookup(page_offset); |
| if (node_has_parent_content_markers()) { |
| if (slot && slot->IsPageOrRef()) { |
| return false; |
| } |
| if (!slot || !slot->IsParentContent()) { |
| return true; |
| } |
| } |
| |
| if (slot && slot->IsMarker()) { |
| // This is already considered zero as there's a marker. |
| return true; |
| } |
| if (is_source_preserving_page_content() && |
| ((slot && slot->IsIntervalZero()) || page_list_.IsOffsetInZeroInterval(page_offset))) { |
| // Pages in zero intervals are supplied as zero by the kernel. |
| return true; |
| } |
| // If we don't have a page or reference here we need to check our parent. |
| if (!slot || !slot->IsPageOrRef()) { |
| PageLookup content; |
| FindInitialPageContentLocked(page_offset, &content); |
| if (!content.cursor.current()) { |
| // Parent doesn't have a page either, so would also read as zero, assuming no page source. |
| return !is_root_source_user_pager_backed(); |
| } |
| } |
| // There is content either locally or in our parent; assume it is non-zero and return false. |
| return false; |
| } |
| |
| zx_status_t VmCowPages::ZeroPagesPreservingContentLocked(uint64_t page_start_base, |
| uint64_t page_end_base, bool dirty_track, |
| DeferredOps& deferred, |
| MultiPageRequest* page_request, |
| uint64_t* processed_len_out) { |
| // Validate inputs. |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(page_start_base) && IS_PAGE_ROUNDED(page_end_base)); |
| DEBUG_ASSERT(page_end_base <= size_); |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| |
| // Give us easier names for our range. |
| const uint64_t start = page_start_base; |
| const uint64_t end = page_end_base; |
| |
| if (start == end) { |
| return ZX_OK; |
| } |
| |
| // If we're not asked to dirty track, we will need to drop pages, because if a page is present it |
| // is going to be in one of the dirty tracked states (Clean, Dirty, AwaitingClean). So check for |
| // any pinned pages first. |
| if (!dirty_track && AnyPagesPinnedLocked(start, end - start)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // Inserting zero intervals can modify the page list such that new nodes are added and deleted. |
| // So we cannot safely insert zero intervals while iterating the page list. The pattern we |
| // follow here is: |
| // 1. Traverse the page list to find a range that can be represented by a zero interval instead. |
| // 2. When such a range is found, break out of the traversal, and insert the zero interval. |
| // 3. Advance past the zero interval we inserted and resume the traversal from there, until |
| // we've covered the entire range. |
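| // In rough pseudocode, the do/while loop below has the following shape (illustrative only; |
| // TraverseUntilCandidate and InsertZeroInterval are hypothetical stand-ins for the state |
| // tracking and page list calls used in the real loop): |
| // |
| //   uint64_t next = start; |
| //   do { |
| //     // (1) Traverse from |next| until a range that can become a zero interval is found. |
| //     auto candidate = TraverseUntilCandidate(next, end); |
| //     if (candidate) { |
| //       // (2) The traversal has stopped, so it is now safe to mutate the page list. |
| //       InsertZeroInterval(*candidate); |
| //       // (3) Resume past the newly inserted interval. |
| //       next = candidate->end + PAGE_SIZE; |
| //     } |
| //   } while (next < end); |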
| |
| // The start offset at which to start the next traversal loop. |
| uint64_t next_start_offset = start; |
| // Dirty state for zero intervals we insert. |
| const VmPageOrMarker::IntervalDirtyState required_state = |
| dirty_track ? VmPageOrMarker::IntervalDirtyState::Dirty |
| : VmPageOrMarker::IntervalDirtyState::Untracked; |
| do { |
| // Track whether we find ourselves in a zero interval. |
| bool in_interval = false; |
| // The start of the zero interval if we are in one. |
| uint64_t interval_start = next_start_offset; |
| const uint64_t prev_start_offset = next_start_offset; |
| // State tracking information for inserting a new zero interval. |
| struct { |
| bool add_zero_interval; |
| uint64_t start; |
| uint64_t end; |
| bool replace_page; |
| bool overwrite_interval; |
| } state = {.add_zero_interval = false}; |
| |
| zx_status_t status = page_list_.RemovePagesAndIterateGaps( |
| [&](VmPageOrMarker* p, uint64_t off) { |
| // We cannot have references in pager-backed VMOs. |
| DEBUG_ASSERT(!p->IsReference()); |
| |
| // If this is a page, see if we can remove it and absorb it into a zero interval. |
| if (p->IsPage()) { |
| AssertHeld(lock_ref()); |
| if (p->Page()->object.pin_count > 0) { |
| DEBUG_ASSERT(dirty_track); |
| // Cannot remove this page if it is pinned. Lookup the page and zero it. Looking up |
| // ensures that we request dirty transition if needed by the pager. |
| LookupCursor cursor(this, VmCowRange(off, PAGE_SIZE)); |
| AssertHeld(cursor.lock_ref()); |
| zx::result<LookupCursor::RequireResult> result = |
| cursor.RequireOwnedPage(true, 1, deferred, page_request); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| DEBUG_ASSERT(result->page == p->Page()); |
| // Zero the page we looked up. |
| ZeroPage(result->page->paddr()); |
| *processed_len_out += PAGE_SIZE; |
| next_start_offset = off + PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| // Break out of the traversal. We can release the page and add a zero interval |
| // instead. |
| state = {.add_zero_interval = true, |
| .start = off, |
| .end = off, |
| .replace_page = true, |
| .overwrite_interval = false}; |
| return ZX_ERR_STOP; |
| } |
| |
| // Otherwise this is a marker or zero interval, in which case we already have zeroes, but |
| // we might need to change the dirty state. |
| DEBUG_ASSERT(p->IsMarker() || p->IsIntervalZero()); |
| if (p->IsIntervalStart()) { |
| // Track the interval start so we know how much to add to processed_len_out later. |
| interval_start = off; |
| in_interval = true; |
| if (p->GetZeroIntervalDirtyState() != required_state) { |
| // If we find the matching end, we will update state.end with the correct offset. |
| // Do not terminate the traversal yet. |
| state = {.add_zero_interval = true, |
| .start = interval_start, |
| .end = UINT64_MAX, |
| .replace_page = false, |
| .overwrite_interval = true}; |
| } |
| } else if (p->IsIntervalEnd()) { |
| if (p->GetZeroIntervalDirtyState() != required_state) { |
| state = {.add_zero_interval = true, |
| .start = in_interval ? interval_start : UINT64_MAX, |
| .end = off, |
| .replace_page = false, |
| .overwrite_interval = true}; |
| return ZX_ERR_STOP; |
| } |
| // Add the range from interval start to end. |
| *processed_len_out += (off + PAGE_SIZE - interval_start); |
| in_interval = false; |
| } else { |
| // This is either a single interval slot or a marker. Terminate the traversal to |
| // overwrite with a zero interval if: |
| // - this is an interval slot with a different dirty state, OR |
| // - this is a marker and we're asked to not dirty track, since a marker is a clean |
| // zero page. |
| if (p->IsMarker() && !dirty_track) { |
| // Release the marker so that the traversal loop first sees a gap here, which will then |
| // be filled in with the new zero interval. |
| *p = VmPageOrMarker::Empty(); |
| } |
| if (p->IsEmpty() || |
| (p->IsIntervalSlot() && p->GetZeroIntervalDirtyState() != required_state)) { |
| state = {.add_zero_interval = true, |
| .start = off, |
| .end = off, |
| .replace_page = false, |
| .overwrite_interval = p->IsIntervalSlot()}; |
| return ZX_ERR_STOP; |
| } |
| *processed_len_out += PAGE_SIZE; |
| } |
| next_start_offset = off + PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [&](uint64_t gap_start, uint64_t gap_end) { |
| AssertHeld(lock_ref()); |
| // This gap will be replaced with a zero interval. Invalidate any read requests in this |
| // range. Since we have just validated that this is a gap in the page list, we can directly |
| // call OnPagesSupplied instead of iterating through the gaps using |
| // InvalidateReadRequestsLocked. |
| page_source_->OnPagesSupplied(gap_start, gap_end - gap_start); |
| // We have found a new zero interval to insert. Break out of the traversal. |
| state = {.add_zero_interval = true, |
| .start = gap_start, |
| .end = gap_end - PAGE_SIZE, |
| .replace_page = false, |
| .overwrite_interval = false}; |
| return ZX_ERR_STOP; |
| }, |
| next_start_offset, end); |
| // Bubble up any errors from LookupCursor. |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // Add any new zero interval. |
| if (state.add_zero_interval) { |
| if (state.replace_page) { |
| DEBUG_ASSERT(state.start == state.end); |
| vm_page_t* page = page_list_.ReplacePageWithZeroInterval(state.start, required_state); |
| DEBUG_ASSERT(page->object.pin_count == 0); |
| RemovePageLocked(page, deferred); |
| } else if (state.overwrite_interval) { |
| uint64_t old_start = state.start; |
| uint64_t old_end = state.end; |
| if (state.start == UINT64_MAX) { |
| state.start = next_start_offset; |
| } |
| if (state.end == UINT64_MAX) { |
| state.end = end - PAGE_SIZE; |
| } |
| status = page_list_.OverwriteZeroInterval(old_start, old_end, state.start, state.end, |
| required_state); |
| } else { |
| status = page_list_.AddZeroInterval(state.start, state.end + PAGE_SIZE, required_state); |
| } |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| *processed_len_out += (state.end - state.start + PAGE_SIZE); |
| next_start_offset = state.end + PAGE_SIZE; |
| } else { |
| // Handle the last partial interval, or the case where we did not advance next_start_offset at |
| // all, which can only happen if the range fell entirely inside an interval. |
| if (in_interval || next_start_offset == prev_start_offset) { |
| // If the range fell entirely inside an interval, verify that it was indeed a zero interval. |
| DEBUG_ASSERT(next_start_offset != prev_start_offset || |
| page_list_.IsOffsetInZeroInterval(next_start_offset)); |
| // If entirely inside an interval, we have one of two possibilities: |
| // (1) The interval is already in required_state in which case we don't need to do |
| // anything. |
| // (2) The interval is not in required_state. We do not expect this case in practice, so |
| // instead of splitting up a zero interval in the middle just to change its dirty state, |
| // claim that we processed the range. |
| *processed_len_out += (end - interval_start); |
| next_start_offset = end; |
| } |
| } |
| // Ensure we're making progress. |
| DEBUG_ASSERT(next_start_offset > prev_start_offset); |
| } while (next_start_offset < end); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::ZeroPagesLocked(VmCowRange range, bool dirty_track, DeferredOps& deferred, |
| MultiPageRequest* page_request, uint64_t* zeroed_len_out) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.IsBoundedBy(size_)); |
| DEBUG_ASSERT(range.is_page_aligned()); |
| // This function is only valid on a visible node as it will not handle zeroing children. |
| DEBUG_ASSERT(!is_hidden()); |
| ASSERT(zeroed_len_out); |
| |
| // This function tries to zero pages as optimally as possible for most cases, so we attempt |
| // increasingly expensive actions only if certain preconditions do not allow us to perform the |
| // cheaper action. Broadly speaking, the sequence of actions that are attempted are as follows. |
| // 1) Try to decommit each page if the VMO allows it and doing so doesn't expose content in the |
| // parent (if any) that shouldn't be visible. |
| // 2) Otherwise, if this is a child VMO and there is no committed page yet, allocate a zero page. |
| // 3) Otherwise, look up the page, faulting it in if necessary, and zero the page. If the page |
| // source needs to supply or dirty track the page, a page request is initialized and we return |
| // early with ZX_ERR_SHOULD_WAIT. The caller is expected to wait on the page request, and then |
| // retry. On the retry, we should be able to look up the page successfully and zero it. |
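| // Expressed as a decision cascade, the above is roughly (an illustrative summary, not |
| // additional logic): |
| // |
| //   if (decommit is allowed and does not expose parent content)  -> leave or create an empty |
| //                                                                   slot / marker |
| //   else if (child VMO with no committed page at this offset)    -> allocate a fresh zero page |
| //   else                                                          -> look up (possibly fault |
| //                                                                   in) the page and zero it, |
| //                                                                   returning ZX_ERR_SHOULD_WAIT |
| //                                                                   if the page source must be |
| //                                                                   waited on |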
| |
| // Unmap any page that is touched by this range in any of our, or our children's, mapping |
| // regions. We do this on the assumption that we will be able to free pages, either completely |
| // or by turning them into markers, and it's more efficient to unmap once in bulk here. |
| RangeChangeUpdateLocked(range, RangeChangeOp::Unmap, &deferred); |
| |
| // Give us easier names for our range. |
| const uint64_t start = range.offset; |
| const uint64_t end = range.end(); |
| |
| // If the VMO is directly backed by a page source that preserves content, it should be the root |
| // VMO of the hierarchy. |
| DEBUG_ASSERT(!is_source_preserving_page_content() || !parent_); |
| |
| // If the page source preserves content, we can perform efficient zeroing by inserting dirty zero |
| // intervals. Handle this case separately. |
| if (is_source_preserving_page_content()) { |
| return ZeroPagesPreservingContentLocked(start, end, dirty_track, deferred, page_request, |
| zeroed_len_out); |
| } |
| // dirty_track has no meaning for VMOs without page sources that preserve content, so ignore it |
| // for the remainder of the function. |
| |
| // Helper lambda to determine if this VMO can see parent contents at |offset|, or, if a length |
| // is specified as well, over the range [offset, offset + length). |
| auto can_see_parent = [this](uint64_t offset, uint64_t length = PAGE_SIZE) TA_REQ(lock()) { |
| if (!parent_) { |
| return false; |
| } |
| return offset < parent_limit_ && offset + length <= parent_limit_; |
| }; |
| |
| // This is a lambda as it only makes sense to talk about parent mutability when we have a parent |
| // for the offset being considered. |
| auto parent_immutable = [can_see_parent, this](uint64_t offset) TA_REQ(lock()) { |
| // TODO(johngro): remove this explicit unused-capture warning suppression |
| // when https://bugs.llvm.org/show_bug.cgi?id=35450 gets fixed. |
| (void)can_see_parent; // used only in DEBUG_ASSERT |
| DEBUG_ASSERT(can_see_parent(offset)); |
| return parent_->is_hidden(); |
| }; |
| |
| // Finding the initial page content is expensive, but we only need to call it under certain |
| // circumstances scattered in the code below. The lambda get_initial_page_content() will lazily |
| // fetch and cache the details. This avoids us calling it when we don't need to, or calling it |
| // more than once. |
| struct InitialPageContent { |
| bool inited = false; |
| LockedPtr page_owner; |
| uint64_t owner_offset; |
| uint64_t cached_offset; |
| VmPageOrMarkerRef page_or_marker; |
| } initial_content_; |
| auto get_initial_page_content = [&initial_content_, can_see_parent, this](uint64_t offset) |
| TA_REQ(lock()) -> const InitialPageContent& { |
| // TODO(johngro): remove this explicit unused-capture warning suppression |
| // when https://bugs.llvm.org/show_bug.cgi?id=35450 gets fixed. |
| (void)can_see_parent; // used only in DEBUG_ASSERT |
| |
| // If there is no cached page content or if we're looking up a different offset from the cached |
| // one, perform the lookup. |
| if (!initial_content_.inited || offset != initial_content_.cached_offset) { |
| DEBUG_ASSERT(can_see_parent(offset)); |
| PageLookup content; |
| initial_content_.page_owner.release(); |
| FindInitialPageContentLocked(offset, &content); |
| initial_content_.page_owner = ktl::move(content.owner); |
| initial_content_.owner_offset = content.owner_offset; |
| initial_content_.page_or_marker = content.cursor.current(); |
| // We only care about the parent having a 'true' vm_page for content. If the parent has a |
| // marker then it's as if the parent has no content since that's a zero page anyway, which is |
| // what we are trying to achieve. |
| initial_content_.inited = true; |
| initial_content_.cached_offset = offset; |
| } |
| DEBUG_ASSERT(offset == initial_content_.cached_offset); |
| return initial_content_; |
| }; |
| |
| // Helper lambda to determine if parent has content at the specified offset. |
| auto parent_has_content = [&](uint64_t offset) TA_REQ(lock()) { |
| if (node_has_parent_content_markers()) { |
| // Unless there is a parent content marker, we know the parent has no content for us. |
| const VmPageOrMarker* slot = page_list_.Lookup(offset); |
| if (!slot || !slot->IsParentContent()) { |
| return false; |
| } |
| } |
| const VmPageOrMarkerRef& page_or_marker = get_initial_page_content(offset).page_or_marker; |
| return page_or_marker && page_or_marker->IsPageOrRef(); |
| }; |
| |
| // In the ideal case we can zero by making there be an Empty slot in our page list. This is true |
| // when we're not specifically avoiding decommit on zero and there is nothing pinned. |
| // |
| // Note that this lambda is only checking for pre-conditions in *this* VMO which allow us to |
| // represent zeros with an empty slot. We will combine this check with additional checks for |
| // contents visible through the parent, if applicable. |
| auto can_decommit_slot = [this](const VmPageOrMarker* slot, uint64_t offset) TA_REQ(lock()) { |
| if (!can_decommit_zero_pages() || |
| (slot && slot->IsPage() && slot->Page()->object.pin_count > 0)) { |
| return false; |
| } |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| return true; |
| }; |
| |
| // Like can_decommit_slot but for a range. |
| auto can_decommit_slots_in_range = [this](uint64_t offset, uint64_t length) TA_REQ(lock()) { |
| if (!can_decommit_zero_pages() || AnyPagesPinnedLocked(offset, length)) { |
| return false; |
| } |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| return true; |
| }; |
| |
| // Helper lambda to zero the slot at offset either by inserting a marker or by zeroing the actual |
| // page as applicable. The return codes match those expected for VmPageList traversal. |
| auto zero_slot = [&](VmPageOrMarker* slot, uint64_t offset) TA_REQ(lock()) { |
| // Ideally we will use a marker, but we can only do this if we can point to a committed page |
| // to justify the allocation of the marker (i.e. we cannot allocate infinite markers with no |
| // committed pages). A committed page in this case exists if the parent has any content. |
| // Otherwise, we'll need to zero an actual page. |
| if (!can_decommit_slot(slot, offset) || !parent_has_content(offset)) { |
| // If we're here because of !parent_has_content() and slot doesn't have a page, we can simply |
| // allocate a zero page to replace the empty slot. Otherwise, we'll have to look up the page |
| // and zero it. |
| // |
| // We could technically fall through to GetLookupCursorLocked even for an empty slot and let |
| // RequirePage allocate a new page and zero it, but we want to avoid having to redundantly |
| // zero a newly forked zero page. |
| if (!slot && can_see_parent(offset) && !parent_has_content(offset)) { |
| // We could only have ended up here if the parent was mutable or if there is a pager-backed |
| // root, otherwise we should have been able to treat an empty slot as zero (decommit a |
| // committed page) and return early above. |
| DEBUG_ASSERT(!parent_immutable(offset) || is_root_source_user_pager_backed()); |
| // We will try to insert a new zero page below. Note that at this point we know that this is |
| // not a contiguous VMO (which cannot have arbitrary zero pages inserted into it). We |
| // checked for can_see_parent just now and contiguous VMOs do not support clones. Besides, |
| // if the slot was empty we should have moved on when we found the gap in the page list |
| // traversal as the contiguous page source zeroes supplied pages by default. |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| |
| // Allocate a new page, it will be zeroed in the process. |
| vm_page_t* p; |
| // Do not pass our freed_list here as this takes an |alloc_list| list to allocate from. |
| zx_status_t status = |
| AllocateCopyPage(vm_get_zero_page_paddr(), nullptr, page_request->GetAnonymous(), &p); |
| if (status != ZX_OK) { |
| return status; |
| } |
| auto result = |
| AddPageLocked(offset, VmPageOrMarker::Page(p), CanOverwriteContent::Zero, nullptr); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY. |
| if (result.is_error()) { |
| ASSERT(result.status_value() == ZX_ERR_NO_MEMORY); |
| } |
| DEBUG_ASSERT(!result->IsPageOrRef()); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Lookup the page which will potentially fault it in via the page source. Zeroing is |
| // equivalent to a VMO write with zeros, so simulate a write fault. |
| zx::result<VmCowPages::LookupCursor> cursor = |
| GetLookupCursorLocked(VmCowRange(offset, PAGE_SIZE)); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| auto result = cursor->RequirePage(true, 1, deferred, page_request); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| ZeroPage(result->page->paddr()); |
| return ZX_ERR_NEXT; |
| } |
| |
| DEBUG_ASSERT(parent_ && parent_has_content(offset) && (!slot || !slot->IsParentContent())); |
| // Validate we can insert our own pages/content. |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| |
| // We are able to insert a marker, but if our page content is from a hidden owner we need to |
| // perform slightly more complex cow forking. |
| const InitialPageContent& content = get_initial_page_content(offset); |
| if (!slot && content.page_owner.locked_or(this).is_hidden()) { |
| zx_status_t result = CloneCowContentAsZeroLocked( |
| offset, deferred.FreedList(this), &content.page_owner.locked_or(this), |
| content.page_or_marker, content.owner_offset); |
| if (result != ZX_OK) { |
| return result; |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| // Remove any page that could be hanging around in the slot and replace it with a marker. |
| auto result = |
| AddPageLocked(offset, VmPageOrMarker::Marker(), CanOverwriteContent::NonZero, nullptr); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY. |
| if (result.is_error()) { |
| ASSERT(result.status_value() == ZX_ERR_NO_MEMORY); |
| return result.status_value(); |
| } |
| VmPageOrMarker& released_page = *result; |
| // Free the old page. |
| if (released_page.IsPage()) { |
| vm_page_t* page = released_page.ReleasePage(); |
| RemovePageLocked(page, deferred); |
| } else if (released_page.IsReference()) { |
| FreeReference(released_page.ReleaseReference()); |
| } |
| return ZX_ERR_NEXT; |
| }; |
| |
| *zeroed_len_out = 0; |
| // Main page list traversal loop to remove any existing pages / markers, zero existing pages, and |
| // also insert any new markers / zero pages in gaps as applicable. We use the VmPageList traversal |
| // helper here instead of iterating over each offset in the range so we can efficiently skip over |
| // gaps if possible. |
| zx_status_t status = page_list_.RemovePagesAndIterateGaps( |
| [&](VmPageOrMarker* slot, uint64_t offset) { |
| AssertHeld(lock_ref()); |
| |
| // We don't expect intervals in non pager-backed VMOs. |
| DEBUG_ASSERT(!slot->IsInterval()); |
| |
| // Contiguous VMOs cannot have markers. |
| DEBUG_ASSERT(!direct_source_supplies_zero_pages() || !slot->IsMarker()); |
| |
| // First see if we can simply get done with an empty slot in the page list. This VMO should |
| // allow decommitting a page at this offset when zeroing. Additionally, one of the following |
| // conditions should hold w.r.t. the parent: |
| // * This offset does not relate to our parent, or we don't have a parent. |
| // * This offset does relate to our parent, but our parent is immutable, currently |
| // zero at this offset and there is no pager-backed root VMO. |
| if (can_decommit_slot(slot, offset) && |
| (!can_see_parent(offset) || (parent_immutable(offset) && !parent_has_content(offset) && |
| !is_root_source_user_pager_backed()))) { |
| if (slot->IsPage()) { |
| vm_page_t* page = slot->ReleasePage(); |
| RemovePageLocked(page, deferred); |
| } else if (slot->IsReference()) { |
| FreeReference(slot->ReleaseReference()); |
| } else { |
| // If this is a marker, simply make the slot empty. |
| *slot = VmPageOrMarker::Empty(); |
| } |
| // We successfully zeroed this offset. Move on to the next offset. |
| *zeroed_len_out += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| if (slot->IsParentContent()) { |
| // If the slot is a parent content marker then we can zero by clearing the slot, but to do |
| // so we must also remove our ref count of said content. |
| DEBUG_ASSERT(can_see_parent(offset) && parent_has_content(offset) && |
| !root_has_page_source()); |
| const InitialPageContent& content = get_initial_page_content(offset); |
| content.page_owner.locked_or(this).DecrementCowContentShareCount( |
| content.page_or_marker, content.owner_offset, deferred.FreedList(this), |
| Pmm::Node().GetPageCompression()); |
| *slot = VmPageOrMarker::Empty(); |
| *zeroed_len_out += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| |
| // If there's already a marker then we can avoid any second guessing and leave the marker |
| // alone. |
| if (slot->IsMarker()) { |
| *zeroed_len_out += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| |
| // The only time we would reach here and *not* have a parent is if we could not decommit a |
| // page at this offset when zeroing. |
| DEBUG_ASSERT(!can_decommit_slot(slot, offset) || parent_); |
| |
| // Now we know that we need to do something active to make this zero, either through a |
| // marker or a page. |
| zx_status_t status = zero_slot(slot, offset); |
| if (status == ZX_ERR_NEXT) { |
| // If we were able to successfully zero this slot, move on to the next offset. |
| *zeroed_len_out += PAGE_SIZE; |
| } |
| return status; |
| }, |
| [&](uint64_t gap_start, uint64_t gap_end) { |
| AssertHeld(lock_ref()); |
| if (node_has_parent_content_markers()) { |
| // Gaps are already zero when using parent content markers. |
| *zeroed_len_out += (gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| } |
| if (direct_source_supplies_zero_pages()) { |
| // Already logically zero - don't commit pages to back the zeroes if they're not already |
| // committed. This is important for contiguous VMOs, as we don't use markers for |
| // contiguous VMOs, and allocating a page below to hold zeroes would not be asking the |
| // page_source_ for the proper physical page. This prevents allocating an arbitrary |
| // physical page to back the zeroes. |
| *zeroed_len_out += (gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| } |
| |
| // If empty slots imply zeroes, and the gap does not see parent contents, we already have |
| // zeroes. |
| if (can_decommit_slots_in_range(gap_start, gap_end - gap_start) && |
| !can_see_parent(gap_start, gap_end - gap_start)) { |
| *zeroed_len_out += (gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Otherwise fall back to examining each offset in the gap to determine the action to |
| // perform. |
| for (uint64_t offset = gap_start; offset < gap_end; |
| offset += PAGE_SIZE, *zeroed_len_out += PAGE_SIZE) { |
| // First see if we can simply get done with an empty slot in the page list. This VMO |
| // should allow decommitting a page at this offset when zeroing. Additionally, one of the |
| // following conditions should hold w.r.t. the parent: |
| // * This offset does not relate to our parent, or we don't have a parent. |
| // * This offset does relate to our parent, but our parent is immutable, currently |
| // zero at this offset and there is no pager-backed root VMO. |
| if (can_decommit_slot(nullptr, offset) && |
| (!can_see_parent(offset) || |
| (parent_immutable(offset) && !parent_has_content(offset) && |
| !is_root_source_user_pager_backed()))) { |
| continue; |
| } |
| |
| // The only time we would reach here and *not* have a parent is if we could not decommit a |
| // page at this offset when zeroing. |
| DEBUG_ASSERT(!can_decommit_slot(nullptr, offset) || parent_); |
| |
| // Now we know that we need to do something active to make this zero, either through a |
| // marker or a page. |
| zx_status_t status = zero_slot(nullptr, offset); |
| if (status != ZX_ERR_NEXT) { |
| return status; |
| } |
| } |
| |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return status; |
| } |
| |
| void VmCowPages::MoveToPinnedLocked(vm_page_t* page, uint64_t offset) { |
| pmm_page_queues()->MoveToWired(page); |
| } |
| |
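| // Summary of the placement decisions made by MoveToNotPinnedLocked and SetNotPinnedLocked below |
| // (an illustrative overview derived from their branches, not an exhaustive specification): |
| // |
| //   pager-backed, Clean                  -> reclaim queue (high priority queue if the VMO is |
| //                                           high priority) |
| //   pager-backed, Dirty / AwaitingClean  -> pager backed dirty queue |
| //   anonymous (zero pages decommittable) -> anonymous queue, or high priority queue, with |
| //                                           skip-reclaim for locked discardable or uncached VMOs |
| //   otherwise (e.g. contiguous)          -> wired queue |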
| void VmCowPages::MoveToNotPinnedLocked(vm_page_t* page, uint64_t offset) { |
| PageQueues* pq = pmm_page_queues(); |
| if (is_source_preserving_page_content()) { |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // We can only move Clean pages to the pager backed queues as they track age information for |
| // eviction; only Clean pages can be evicted. Pages in AwaitingClean and Dirty are protected |
| // from eviction in the Dirty queue. |
| if (is_page_clean(page)) { |
| if (high_priority_count_ != 0) { |
| // If this VMO is high priority then do not place it in the pager backed queue as that is |
| // reclaimable; place it in the high priority queue instead. |
| pq->MoveToHighPriority(page); |
| } else { |
| pq->MoveToReclaim(page); |
| } |
| } else { |
| DEBUG_ASSERT(!page->is_loaned()); |
| pq->MoveToPagerBackedDirty(page); |
| } |
| } else { |
| // Place pages from contiguous VMOs in the wired queue, as they are notionally pinned until the |
| // owner explicitly releases them. |
| if (can_decommit_zero_pages()) { |
| if (high_priority_count_ != 0 && !pq->ReclaimIsOnlyPagerBacked()) { |
| // If anonymous pages are reclaimable, and this VMO is high priority, then place our pages |
| // in the high priority queue instead of the anonymous one to avoid reclamation. |
| pq->MoveToHighPriority(page); |
| } else { |
| bool cannot_reclaim = false; |
| // If this is a discardable VMO but not currently unlocked, it cannot be reclaimed. The |
| // reclamation code is tolerant to this, but avoid wasted work. |
| if (is_discardable()) { |
| discardable_tracker_->assert_cow_pages_locked(); |
| cannot_reclaim = !discardable_tracker_->IsEligibleForReclamationLocked(); |
| } |
| // If the VMO is mapped uncached, it cannot be reclaimed. The reclamation code is tolerant |
| // to this and will skip the page anyway, but uncached memory is typically used by drivers |
| // and tends to back large buffers, so avoid wasted work. |
| if (!cannot_reclaim && paged_ref_) { |
| cannot_reclaim = (paged_backlink_locked(this)->GetMappingCachePolicyLocked() & |
| ZX_CACHE_POLICY_MASK) != ZX_CACHE_POLICY_CACHED; |
| } |
| pq->MoveToAnonymous(page, /*skip_reclaim=*/cannot_reclaim); |
| } |
| } else { |
| pq->MoveToWired(page); |
| } |
| } |
| } |
| |
| void VmCowPages::SetNotPinnedLocked(vm_page_t* page, uint64_t offset) { |
| PageQueues* pq = pmm_page_queues(); |
| if (is_source_preserving_page_content()) { |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // We can only move Clean pages to the pager backed queues as they track age information for |
| // eviction; only Clean pages can be evicted. Pages in AwaitingClean and Dirty are protected |
| // from eviction in the Dirty queue. |
| if (is_page_clean(page)) { |
| if (high_priority_count_ != 0) { |
| // If this VMO is high priority then do not place it in the pager backed queue as that is |
| // reclaimable; place it in the high priority queue instead. |
| pq->SetHighPriority(page, this, offset); |
| } else { |
| pq->SetReclaim(page, this, offset); |
| } |
| } else { |
| DEBUG_ASSERT(!page->is_loaned()); |
| pq->SetPagerBackedDirty(page, this, offset); |
| } |
| } else { |
| // Place pages from contiguous VMOs in the wired queue, as they are notionally pinned until the |
| // owner explicitly releases them. |
| if (can_decommit_zero_pages()) { |
| if (high_priority_count_ != 0 && !pq->ReclaimIsOnlyPagerBacked()) { |
| // If anonymous pages are reclaimable, and this VMO is high priority, then place our pages |
| // in the high priority queue instead of the anonymous one to avoid reclamation. |
| pq->SetHighPriority(page, this, offset); |
| } else { |
| bool cannot_reclaim = false; |
| // If this is a discardable VMO but not currently unlocked, it cannot be reclaimed. The |
| // reclamation code is tolerant to this, but avoid wasted work. |
| if (is_discardable()) { |
| discardable_tracker_->assert_cow_pages_locked(); |
| cannot_reclaim = !discardable_tracker_->IsEligibleForReclamationLocked(); |
| } |
| // If the VMO is mapped uncached, it cannot be reclaimed. The reclamation code is tolerant |
| // to this and will skip the page anyway, but uncached memory is typically used by drivers |
| // and tends to back large buffers, so avoid wasted work. |
| if (!cannot_reclaim && paged_ref_) { |
| cannot_reclaim = (paged_backlink_locked(this)->GetMappingCachePolicyLocked() & |
| ZX_CACHE_POLICY_MASK) != ZX_CACHE_POLICY_CACHED; |
| } |
| pq->SetAnonymous(page, this, offset, /*skip_reclaim=*/cannot_reclaim); |
| } |
| } else { |
| pq->SetWired(page, this, offset); |
| } |
| } |
| } |
| |
| zx_status_t VmCowPages::PromoteRangeForReclamation(VmCowRange range) { |
| canary_.Assert(); |
| |
| // Hints only apply to pager backed VMOs. |
| if (!can_root_source_evict()) { |
| return ZX_OK; |
| } |
| // Zero lengths have no work to do. |
| if (range.is_empty()) { |
| return ZX_OK; |
| } |
| |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| uint64_t start_offset = ROUNDDOWN_PAGE_SIZE(range.offset); |
| uint64_t end_offset = ROUNDUP_PAGE_SIZE(range.end()); |
| |
| __UNINITIALIZED zx::result<VmCowPages::LookupCursor> cursor = |
| GetLookupCursorLocked(VmCowRange(start_offset, end_offset - start_offset)); |
| if (cursor.is_error()) { |
| return cursor.status_value(); |
| } |
| // Do not consider pages accessed, as the goal is to reclaim them, not to mark them as used. |
| cursor->DisableMarkAccessed(); |
| AssertHeld(cursor->lock_ref()); |
| while (start_offset < end_offset) { |
| // Lookup the page if it exists, but do not let it get allocated or say we are writing to it. |
| // On success or failure this causes the cursor to go to the next offset. |
| vm_page_t* page = cursor->MaybePage(false); |
| if (page) { |
| // Check to see if the page is owned by the root VMO. Hints only apply to the root, as that is |
| // where the page source is. |
| // Don't move a pinned page or a dirty page to the DontNeed queue. |
| // Note that this does not unset the always_need bit if it has been previously set. The |
| // always_need hint is sticky. |
| VmCowPages* owner = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| DEBUG_ASSERT(owner); |
| if (owner->page_source_ && page->object.pin_count == 0 && is_page_clean(page)) { |
| pmm_page_queues()->MoveToReclaimDontNeed(page); |
| vm_vmo_dont_need.Add(1); |
| } |
| } |
| // Can't really do anything in case an error is encountered while looking up the page. Simply |
| // ignore it and move on to the next page. Hints are best effort anyway. |
| start_offset += PAGE_SIZE; |
| } |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::ProtectRangeFromReclamation(VmCowRange range, bool set_always_need, |
| bool ignore_errors) { |
| canary_.Assert(); |
| |
| // Hints only apply to pager backed VMOs. |
| if (!can_root_source_evict()) { |
| return ZX_OK; |
| } |
| |
| // Validate that the range is completely in range at the start of the operation. Although we |
| // tolerate the VMO shrinking during the operation, the range must be valid at the point we |
| // started. |
| { |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| // Zero lengths have no work to do. |
| if (range.is_empty()) { |
| return ZX_OK; |
| } |
| } |
| |
| range = range.ExpandTillPageAligned(); |
| |
| __UNINITIALIZED MultiPageRequest page_request; |
| while (!range.is_empty()) { |
| // Any loaned page replacement needs to happen outside the main lock acquisition, so if a loaned |
| // page is found we use these variables to record its information and process it after dropping |
| // the lock. |
| fbl::RefPtr<VmCowPages> loaned_page_owner; |
| uint64_t loaned_page_offset = 0; |
| vm_page_t* loaned_page = nullptr; |
| zx_status_t status; |
| { |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| // The size might have changed since we dropped the lock. Adjust the range if required. |
| if (range.offset >= size_) { |
| // No more pages to hint. |
| return ZX_OK; |
| } |
| // Shrink the range if required. Proceed with hinting on the remaining pages in the range; |
| // we've already hinted on the preceding pages, so just go on ahead instead of returning an |
| // error. The range was valid at the time we started hinting. |
| if (!range.IsBoundedBy(size_)) { |
| range = range.WithLength(size_ - range.offset); |
| } |
| |
| __UNINITIALIZED zx::result<VmCowPages::LookupCursor> cursor = |
| GetLookupCursorLocked(VmCowRange(range.offset, range.len)); |
| if (cursor.is_error()) { |
| return cursor.status_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| for (; !range.is_empty(); range = range.TrimedFromStart(PAGE_SIZE)) { |
| // Lookup the page; this will fault in the page from the parent if necessary, but will not |
| // allocate pages directly in this VMO if it is a child. |
| auto result = cursor->RequirePage(false, static_cast<uint>(range.len / PAGE_SIZE), deferred, |
| &page_request); |
| status = result.status_value(); |
| if (status != ZX_OK) { |
| break; |
| } |
| // If we reached here, we successfully found a page at the current offset. |
| vm_page_t* page = result->page; |
| |
| // The root might have gone away when the lock was dropped while waiting above. Compute the |
| // root again and check if we still have a page source backing it before applying the hint. |
| if (!can_root_source_evict()) { |
| // Hinting is not applicable anymore. No more pages to hint. |
| return ZX_OK; |
| } |
| |
| // Check to see if the page is owned by the root VMO. Hints only apply to the root, as that |
| // is where the page source is. There could equivalently be no owner if this is the zero |
| // page, which should also be ignored. |
| VmCowPages* owner = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| if (!owner || !owner->page_source_) { |
| // Hinting is not applicable to this page, but it might apply to following ones. |
| continue; |
| } |
| |
| // If the page is loaned, replace it with a non-loaned page. Loaned pages are reclaimed by |
| // eviction, and hinted pages should not be evicted. |
| if (page->is_loaned()) { |
| DEBUG_ASSERT(is_page_clean(page)); |
| // The lock of |owner| may or may not be held depending on the current state of the |
| // LookupCursor; however, we do not need the owner lock in order to take a RefPtr. Since we |
| // were able to get a reference to the page, the page cannot be removed or changed in |
| // owner without informing us, as we might have a mapping to it. Holding our lock blocks |
| // that and prevents it from completing, meaning that owner must still be a live object. |
| // The page could already have been removed from owner, but we will deal with that race |
| // in the ReplacePage step down below. |
| loaned_page_owner = fbl::MakeRefPtrUpgradeFromRaw<VmCowPages>(owner, lock()); |
| loaned_page = page; |
| loaned_page_offset = page->object.get_page_offset(); |
| break; |
| } |
| if (status != ZX_OK) { |
| break; |
| } |
| |
| DEBUG_ASSERT(!page->is_loaned()); |
| if (set_always_need) { |
| page->object.always_need = 1; |
| vm_vmo_always_need.Add(1); |
| // Nothing more to do beyond marking the page always_need true. The lookup must have |
| // already marked the page accessed, moving it to the head of the first page queue. |
| } |
| } |
| } |
| // Check if we exited to swap a loaned page. |
| if (loaned_page) { |
| vm_page_t* after; |
| status = loaned_page_owner->ReplacePage(loaned_page, loaned_page_offset, false, &after, |
| page_request.GetAnonymous()); |
| if (status != ZX_ERR_SHOULD_WAIT) { |
| // Between finding the loaned page and attempting to replace it the lock was dropped and so |
| // ReplacePage could spuriously fail, hence ignore any other failure and go around the loop |
| // and retry. |
| status = ZX_OK; |
| } |
| } |
| if (status != ZX_OK) { |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| status = page_request.Wait(); |
| |
| // If the wait succeeded, the current offset will now have a backing page, so we need to try |
| // the same offset again with a new cursor. |
| if (status == ZX_OK) { |
| continue; |
| } |
| } |
| |
| // Should only get here if an error was encountered, check if we should ignore or return it. |
| DEBUG_ASSERT(status != ZX_OK); |
| if (!ignore_errors) { |
| return status; |
| } |
| |
| // Ignore the error, move to the next offset. |
| range = range.TrimedFromStart(PAGE_SIZE); |
| } |
| } |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::DecompressInRange(VmCowRange range) { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| if (range.is_empty()) { |
| return ZX_OK; |
| } |
| |
| uint64_t cur_offset = ROUNDDOWN_PAGE_SIZE(range.offset); |
| uint64_t end_offset = ROUNDUP_PAGE_SIZE(range.end()); |
| |
| zx_status_t status; |
| do { |
| __UNINITIALIZED AnonymousPageRequest page_request; |
| status = ForEveryOwnedMutableHierarchyPageInRangeLocked( |
| [&cur_offset, &page_request](VmPageOrMarkerRef p, VmCowPages* owner, uint64_t this_offset, |
| uint64_t owner_offset) { |
| if (!p->IsReference()) { |
| return ZX_ERR_NEXT; |
| } |
| AssertHeld(owner->lock_ref()); |
| zx_status_t status = owner->ReplaceReferenceWithPageLocked(VmPageOrMarkerRef(p), |
| owner_offset, &page_request); |
| if (status == ZX_OK) { |
| cur_offset = this_offset + PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| return status; |
| }, |
| cur_offset, end_offset - cur_offset, LockedPtr()); |
| if (status == ZX_OK) { |
| return ZX_OK; |
| } |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| guard.CallUnlocked( |
| [&page_request, &status]() { status = page_request.Allocate().status_value(); }); |
| } |
| } while (status == ZX_OK); |
| return status; |
| } |
| |
| int64_t VmCowPages::ChangeSingleHighPriorityCountLocked(int64_t delta) { |
| const bool was_zero = high_priority_count_ == 0; |
| high_priority_count_ += delta; |
| DEBUG_ASSERT(high_priority_count_ >= 0); |
| const bool is_zero = high_priority_count_ == 0; |
| // Any change to or from zero means we need to add or remove a count from our parent (if we have |
| // one) and potentially move pages in the page queues. |
| if (is_zero && !was_zero) { |
| delta = -1; |
| } else if (was_zero && !is_zero) { |
| delta = 1; |
| } else { |
| delta = 0; |
| } |
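| // Worked example (illustrative): if high_priority_count_ was 0 and the incoming delta is +3, |
| // the count becomes 3 and the returned delta is +1, i.e. the parent sees a single transition. |
| // If the count was non-zero and stays non-zero, the returned delta is 0 and the propagation in |
| // ChangeHighPriorityCountLocked stops. |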
| if (delta != 0) { |
| // If we moved to or from zero then update every page into the correct page queue for tracking. |
| // MoveToNotPinnedLocked will check the high_priority_count_, which has already been updated, so |
| // we can just call that on every page. |
| page_list_.ForEveryPage([this](const VmPageOrMarker* page_or_marker, uint64_t offset) { |
| if (page_or_marker->IsPage()) { |
| vm_page_t* page = page_or_marker->Page(); |
| if (page->object.pin_count == 0) { |
| AssertHeld(lock_ref()); |
| MoveToNotPinnedLocked(page, offset); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| } |
| vm_vmo_high_priority.Add(delta); |
| return delta; |
| } |
| |
| void VmCowPages::ChangeHighPriorityCountLocked(int64_t delta) { |
| canary_.Assert(); |
| |
| LockedPtr cur; |
| // Any change to or from zero requires updating a count in the parent, so we need to walk up the |
| // parent chain as long as a transition is happening. |
| while (delta != 0) { |
| delta = cur.locked_or(this).ChangeSingleHighPriorityCountLocked(delta); |
| VmCowPages* parent = cur.locked_or(this).parent_.get(); |
| if (!parent) { |
| break; |
| } |
| cur = LockedPtr(parent); |
| } |
| } |
| |
| void VmCowPages::UnpinLocked(VmCowRange range, DeferredOps* deferred) { |
| canary_.Assert(); |
| |
| // verify that the range is within the object |
| ASSERT(range.IsBoundedBy(size_)); |
| // forbid zero length unpins as zero length pins return errors. |
| ASSERT(!range.is_empty()); |
| |
| const uint64_t start_page_offset = ROUNDDOWN_PAGE_SIZE(range.offset); |
| const uint64_t end_page_offset = ROUNDUP_PAGE_SIZE(range.end()); |
| |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| // For any pages that have their pin count transition to 0, i.e. become unpinned, we want to |
| // perform a range change op. For efficiency track contiguous ranges. |
| uint64_t completely_unpin_start = 0; |
| uint64_t completely_unpin_len = 0; |
| #endif |
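| // Illustrative example of the range tracking below: if offsets 0x1000, 0x2000 and 0x4000 each |
| // drop to a pin count of zero, [0x1000, 0x3000) is accumulated as one contiguous range, flushed |
| // as a single DebugUnpin range change (when |deferred| is provided) once 0x4000 is seen, and a |
| // new range is started at 0x4000 which is flushed as the leftover range after the traversal. |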
| |
| uint64_t unpin_count = 0; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&](const auto* page, uint64_t off) { |
| AssertHeld(lock_ref()); |
| // Only real pages can be pinned. |
| ASSERT(page->IsPage()); |
| |
| vm_page_t* p = page->Page(); |
| ASSERT(p->object.pin_count > 0); |
| p->object.pin_count--; |
| if (p->object.pin_count == 0) { |
| MoveToNotPinnedLocked(p, range.offset); |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| // Check if the current range can be extended. |
| if (completely_unpin_start + completely_unpin_len == off) { |
| completely_unpin_len += PAGE_SIZE; |
| } else { |
| // Complete any existing range and then start again at this offset. |
| if (completely_unpin_len > 0 && deferred) { |
| const VmCowRange range_update = |
| VmCowRange(completely_unpin_start, completely_unpin_len); |
| RangeChangeUpdateLocked(range_update, RangeChangeOp::DebugUnpin, deferred); |
| } |
| completely_unpin_start = off; |
| completely_unpin_len = PAGE_SIZE; |
| } |
| #endif |
| } |
| ++unpin_count; |
| return ZX_ERR_NEXT; |
| }, |
| [](uint64_t gap_start, uint64_t gap_end) { return ZX_ERR_NOT_FOUND; }, start_page_offset, |
| end_page_offset); |
| ASSERT_MSG(status == ZX_OK, "Tried to unpin an uncommitted page"); |
| |
| // It is possible that we were entirely inside a sparse interval without any committed pages, in |
| // which case neither the page nor the gap callback would have triggered, and the assert above |
| // would succeed. This is still an error though, and we can catch this, and any other mistakes, |
| // by ensuring we found and decremented the pin counts of the exact expected number of pages. |
| ASSERT(unpin_count == (end_page_offset - start_page_offset) / PAGE_SIZE); |
| |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| // Check any leftover range. |
| if (completely_unpin_len > 0 && deferred) { |
| const VmCowRange range_update = VmCowRange(completely_unpin_start, completely_unpin_len); |
| RangeChangeUpdateLocked(range_update, RangeChangeOp::DebugUnpin, deferred); |
| } |
| #endif |
| |
| bool overflow = sub_overflow(pinned_page_count_, unpin_count, &pinned_page_count_); |
| ASSERT(!overflow); |
| |
| return; |
| } |
| |
| bool VmCowPages::DebugIsRangePinnedLocked(VmCowRange range) { |
| canary_.Assert(); |
| DEBUG_ASSERT(range.is_page_aligned()); |
| |
| uint64_t pinned_count = 0; |
| page_list_.ForEveryPageInRange( |
| [&pinned_count](const auto* p, uint64_t off) { |
| if (p->IsPage() && p->Page()->object.pin_count > 0) { |
| pinned_count++; |
| return ZX_ERR_NEXT; |
| } |
| return ZX_ERR_STOP; |
| }, |
| range.offset, range.end()); |
| return pinned_count == range.len / PAGE_SIZE; |
| } |
| |
| bool VmCowPages::AnyPagesPinnedLocked(uint64_t offset, size_t len) { |
| canary_.Assert(); |
| DEBUG_ASSERT(lock_ref().lock().IsHeld()); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(len)); |
| |
| const uint64_t start_page_offset = offset; |
| const uint64_t end_page_offset = offset + len; |
| |
| if (pinned_page_count_ == 0) { |
| return false; |
| } |
| |
| bool found_pinned = false; |
| page_list_.ForEveryPageInRange( |
| [&found_pinned, start_page_offset, end_page_offset](const auto* p, uint64_t off) { |
| DEBUG_ASSERT(off >= start_page_offset && off < end_page_offset); |
| if (p->IsPage() && p->Page()->object.pin_count > 0) { |
| found_pinned = true; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| start_page_offset, end_page_offset); |
| |
| return found_pinned; |
| } |
| |
| void VmCowPages::InvalidateReadRequestsLocked(uint64_t offset, uint64_t len) { |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| DEBUG_ASSERT(page_source_); |
| |
| const uint64_t start = offset; |
| const uint64_t end = offset + len; |
| |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [](const auto* p, uint64_t off) { return ZX_ERR_NEXT; }, |
| [this](uint64_t gap_start, uint64_t gap_end) { |
| page_source_->OnPagesSupplied(gap_start, gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
| void VmCowPages::InvalidateDirtyRequestsLocked(uint64_t offset, uint64_t len) { |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| DEBUG_ASSERT(page_source_->ShouldTrapDirtyTransitions()); |
| |
| const uint64_t start = offset; |
| const uint64_t end = offset + len; |
| |
| zx_status_t status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // A marker is a clean zero page and might have an outstanding DIRTY request. |
| if (p->IsMarker()) { |
| return true; |
| } |
| // An interval is an uncommitted zero page and might have an outstanding DIRTY request |
| // irrespective of dirty state. |
| if (p->IsIntervalZero()) { |
| return true; |
| } |
| // Although a reference is implied to be clean, a VMO backed by a page source should never |
| // have references. |
| DEBUG_ASSERT(!p->IsReference()); |
| // Parent content markers are not expected in pager-backed VMOs. |
| DEBUG_ASSERT(!p->IsParentContent()); |
| |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| |
| // A page that is not Dirty already might have an outstanding DIRTY request. |
| if (!is_page_dirty(page)) { |
| return true; |
| } |
| // Otherwise the page should already be Dirty. |
| DEBUG_ASSERT(is_page_dirty(page)); |
| return false; |
| }, |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // Nothing to update for the page as we're not actually marking it Dirty. |
| return ZX_ERR_NEXT; |
| }, |
| [this](uint64_t start, uint64_t end, bool unused) { |
| // Resolve any DIRTY requests in this contiguous range. |
| page_source_->OnPagesDirtied(start, end - start); |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| // We don't expect an error from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // Now resolve DIRTY requests for any gaps. After request generation, pages could either |
| // have been evicted, or zero intervals written back, leading to gaps. So it is possible for gaps |
| // to have outstanding DIRTY requests. |
| status = page_list_.ForEveryPageAndGapInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // Nothing to do for pages. We already handled them above. |
| return ZX_ERR_NEXT; |
| }, |
| [this](uint64_t gap_start, uint64_t gap_end) { |
| // Resolve any DIRTY requests in this gap. |
| page_source_->OnPagesDirtied(gap_start, gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| // We don't expect an error from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
| zx_status_t VmCowPages::Resize(uint64_t s) { |
| canary_.Assert(); |
| |
| LTRACEF("vmcp %p, size %" PRIu64 "\n", this, s); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| // In the case where we are shrinking, any child limits may need to be updated, but the locking |
| // order requires their locks to be acquired without our lock held, so we do this after dropping |
| // the main lock but before any pages are freed from the deferred ops. See the comment and |
| // checks where this is set to true for details on the correctness. |
| bool update_child_limits = false; |
| { |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
| // make sure everything is aligned before we get started |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(size_)); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(s)); |
| |
| // see if we're shrinking or expanding the vmo |
| if (s < size_) { |
| // shrinking |
| const uint64_t start = s; |
| const uint64_t end = size_; |
| const uint64_t len = end - start; |
| |
| // bail if there are any pinned pages in the range we're trimming |
| if (AnyPagesPinnedLocked(start, len)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // unmap all of the pages in this range on all the mapping regions |
| RangeChangeUpdateLocked(VmCowRange(start, len), RangeChangeOp::Unmap, &deferred); |
| |
| // Resolve any outstanding page requests tracked by the page source that are now |
| // out-of-bounds. |
| if (page_source_) { |
| // Tell the page source that any non-resident pages that are now out-of-bounds |
| // were supplied, to ensure that any reads of those pages get woken up. |
| InvalidateReadRequestsLocked(start, len); |
| |
| // If DIRTY requests are supported, also tell the page source that any non-Dirty pages that |
| // are now out-of-bounds were dirtied (without actually dirtying them), to ensure that any |
| // threads blocked on DIRTY requests for those pages get woken up. |
| if (is_source_preserving_page_content() && page_source_->ShouldTrapDirtyTransitions()) { |
| InvalidateDirtyRequestsLocked(start, len); |
| } |
| } |
| |
| // If pager-backed and the new size falls partway in an interval, we will need to clip the |
| // interval. |
| if (is_source_preserving_page_content()) { |
| // Check if the first populated slot we find in the now-invalid range is an interval end. |
| uint64_t interval_end = UINT64_MAX; |
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [&interval_end](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsIntervalEnd()) { |
| interval_end = off; |
| } |
| // We found the first populated slot. Stop the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start, size_); |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| if (interval_end != UINT64_MAX) { |
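| // Clip everything from the new size through the end sentinel. For example, with a new size |
| // (start) of 0x3000 and an interval end found at 0x5000, the clipped length is |
| // 0x5000 - 0x3000 + PAGE_SIZE, i.e. the three pages at 0x3000, 0x4000 and 0x5000 that now lie |
| // at or beyond the new size. |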
| status = page_list_.ClipIntervalEnd(interval_end, interval_end - start + PAGE_SIZE); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| } |
| } |
| |
| // Clip the parent limit and release any pages, if any, in this node or the parents. |
| // |
| // It should never exceed this node's size; at this point size_ is still the old size (which is |
| // `end`), and the call below will clip it to the new size (which is `start`). |
| DEBUG_ASSERT(parent_limit_ <= end); |
| |
| ReleaseOwnedPagesLocked(start, LockedPtr(), deferred.FreedList(this)); |
| |
| // If the tail of a parent disappears, the children shouldn't be able to see that region |
| // again, even if the parent is later reenlarged. So update the children's parent limits. |
| if (children_list_len_ != 0) { |
| // The only scenario where we can have children is if this is a pager backed hierarchy, in |
| // which case the DeferredOps constructed at the top of this function holds the pager |
| // hierarchy lock, which is held over all resize operations. Due to this lock being held we |
| // know that, even once the VMO lock is dropped, no resize operation to reenlarge can occur |
| // till after we have completed updating the child limits. |
| // In the present state, with our size_ reduced but the child parent_limit_s not yet updated, |
| // the children will just walk up to us, see that the offset is beyond our size_, and substitute |
| // a zero page. Once the child parent_limit_s are updated they will instead not walk up to us |
| // and will substitute a zero page directly, so the result is the same either way. |
| ASSERT(root_has_page_source()); |
| update_child_limits = true; |
| } |
| } else if (s > size_) { |
| uint64_t temp; |
| // Check that this VMO's new size would not cause it to overflow if projected onto the root. |
| bool overflow = add_overflow(root_parent_offset_, s, &temp); |
| if (overflow) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| // expanding |
| // figure the starting and ending page offset that is affected |
| const uint64_t start = size_; |
| const uint64_t end = s; |
| const uint64_t len = end - start; |
| |
| // inform all our children and mappings that there are new bits |
| RangeChangeUpdateLocked(VmCowRange(start, len), RangeChangeOp::Unmap, &deferred); |
| |
| // If pager-backed, need to insert a dirty zero interval beyond the old size. |
| if (is_source_preserving_page_content()) { |
| zx_status_t status = |
| page_list_.AddZeroInterval(start, end, VmPageOrMarker::IntervalDirtyState::Dirty); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| } |
| } |
| |
| // save bytewise size |
| size_ = s; |
| |
| // We were able to successfully resize. Mark as modified. |
| mark_modified_locked(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| } |
| // Now that the lock is dropped, check if we need to update the child limits before the |
| // DeferredOps get finalized. When iterating over our children it is important that we iterate |
| // precisely over *all* of our children and exactly our direct children (i.e. not our children's |
| // children). The TreeWalkCursor is able to provide these guarantees in this case since clone |
| // creation is serialized with the page_source_lock in the DeferredOps, just like here. |
| // Serializing the clone calls with resize ensures that any child we are iterating cannot move |
| // down in the tree and gain a new parent, which happens when a hidden node needs to be inserted. |
| // The deletion path is not an issue since if the node we are iterating at gets deleted then the |
| // cursor will just move to its sibling (or get deleted if no sibling), which is the behavior that |
| // we want anyway. |
| if (update_child_limits) { |
| // Use a TreeWalkCursor to walk all our children. |
| // A child's parent limit will also limit that child's descendants' views into this node, so |
| // this method only needs to touch the direct children. |
| TreeWalkCursor cursor(LockedPtr(this)); |
| // Go to the first child, if we still have one. |
| if (cursor.NextChild()) { |
| // Update this child and all its siblings. |
| do { |
| // Ensure that we are only modifying direct descendants. |
| DEBUG_ASSERT(cursor.GetCur().locked().parent_.get() == this); |
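| // Clamp the child's parent limit against the new size so it can never see content beyond the |
| // region this node still covers. |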
| cursor.GetCur().locked().parent_limit_ = ClampedLimit( |
| cursor.GetCur().locked().parent_offset_, cursor.GetCur().locked().parent_limit_, s); |
| } while (cursor.NextSibling()); |
| } |
| } |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::LookupLocked(VmCowRange range, VmObject::LookupFunction lookup_fn) { |
| canary_.Assert(); |
| if (unlikely(range.is_empty())) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // verify that the range is within the object |
| if (unlikely(!range.IsBoundedBy(size_))) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| const uint64_t start_page_offset = ROUNDDOWN_PAGE_SIZE(range.offset); |
| const uint64_t end_page_offset = ROUNDUP_PAGE_SIZE(range.end()); |
| |
| return page_list_.ForEveryPageInRange( |
| [&lookup_fn](const auto* p, uint64_t off) { |
| if (!p->IsPage()) { |
| // Skip non pages. |
| return ZX_ERR_NEXT; |
| } |
| paddr_t pa = p->Page()->paddr(); |
| return lookup_fn(off, pa); |
| }, |
| start_page_offset, end_page_offset); |
| } |
| |
| zx_status_t VmCowPages::LookupReadableLocked(VmCowRange range, LookupReadableFunction lookup_fn) { |
| canary_.Assert(); |
| if (unlikely(range.is_empty())) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // verify that the range is within the object |
| if (unlikely(!range.IsBoundedBy(size_))) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| uint64_t current_page_offset = ROUNDDOWN_PAGE_SIZE(range.offset); |
| const uint64_t end_page_offset = ROUNDUP_PAGE_SIZE(range.end()); |
| |
| DEBUG_ASSERT(!is_hidden()); |
| |
| while (current_page_offset != end_page_offset) { |
| // Attempt to process any pages we have first. Skip over anything that's not a page since the |
| // lookup_fn only applies to actual pages. |
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [&lookup_fn, &current_page_offset, this](const VmPageOrMarker* page_or_marker, |
| uint64_t offset) { |
| // The offset can advance ahead if we encounter gaps or sparse intervals. |
| if (offset != current_page_offset) { |
| if (!page_or_marker->IsIntervalEnd() && !node_has_parent_content_markers()) { |
| // There was a gap before this offset and the tree does not use parent content markers |
| // so we must walk up to find the content. |
| return ZX_ERR_STOP; |
| } |
| // Otherwise, we can advance our cursor to the interval/gap end. |
| current_page_offset = offset; |
| } |
| // Parent content is like a gap and so we need to exit and find the content. |
| if (page_or_marker->IsParentContent()) { |
| return ZX_ERR_STOP; |
| } |
| DEBUG_ASSERT(offset == current_page_offset); |
| current_page_offset = offset + PAGE_SIZE; |
| if (!page_or_marker->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| return lookup_fn(offset, page_or_marker->Page()->paddr()); |
| }, |
| current_page_offset, end_page_offset); |
| |
| // Check if we've processed the whole range. |
| if (current_page_offset == end_page_offset) { |
| break; |
| } |
| |
| // See if any of our parents have the content. |
| // Note that page intervals are only supported in root VMOs, so if we ended the page list |
| // traversal above partway into an interval, we will be able to continue the traversal over the |
| // rest of the interval after this call - since we're the root, we will be the owner and the |
| // owner length won't be clipped. |
| PageLookup content; |
| FindPageContentLocked(current_page_offset, end_page_offset - current_page_offset, &content); |
| |
| // This should always get filled out. |
| DEBUG_ASSERT(content.visible_end > current_page_offset); |
| const uint64_t owner_length = content.visible_end - current_page_offset; |
| |
| // Iterate over any potential content. |
| status = content.owner.locked_or(this).page_list_.ForEveryPageInRange( |
| [&lookup_fn, current_page_offset, &content](const VmPageOrMarker* page_or_marker, |
| uint64_t offset) { |
| if (!page_or_marker->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| return lookup_fn(offset - content.owner_offset + current_page_offset, |
| page_or_marker->Page()->paddr()); |
| }, |
| content.owner_offset, content.owner_offset + owner_length); |
| if (status != ZX_OK && status != ZX_ERR_NEXT) { |
| return status; |
| } |
| |
| current_page_offset += owner_length; |
| } |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::TakePages(VmCowRange range, uint64_t splice_offset, VmPageSpliceList* pages, |
| uint64_t* taken_len, MultiPageRequest* page_request) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
| if (!range.IsBoundedBy(size_)) { |
| pages->Finalize(); |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (page_source_) { |
| pages->Finalize(); |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| if (AnyPagesPinnedLocked(range.offset, range.len)) { |
| pages->Finalize(); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // On the assumption of success, unmap the entire range we are going to process. This ensures |
| // that in the unlikely event of a failure midway through, the unmap of the portion that was |
| // modified is not lost. |
| RangeChangeUpdateLocked(range, RangeChangeOp::Unmap, &deferred); |
| |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| |
| // If we do not have a parent, and the page splice list is empty, then we can use TakePages to |
| // directly move the page list nodes into the splice list. It is possible to both have no parent |
| // and not have an empty splice list if the parent was concurrently closed while performing this |
| // operation, in which case, as it's an infrequent race condition, we fall through to the less |
| // efficient code below. |
| if (!parent_ && pages->IsEmpty() && splice_offset == 0) { |
| bool found_page = false; |
| page_list_.ForEveryPageInRangeMutable( |
| [&compression, &found_page](VmPageOrMarkerRef p, uint64_t off) { |
| found_page = true; |
| // Splice lists do not support page intervals. |
| ASSERT(!p->IsInterval()); |
| // Have no parent and so should not see parent content. |
| DEBUG_ASSERT(!p->IsParentContent()); |
| if (p->IsPage()) { |
| DEBUG_ASSERT(p->Page()->object.pin_count == 0); |
| // Cannot be taking pages from a pager backed VMO, hence cannot be taking a loaned page. |
| DEBUG_ASSERT(!p->Page()->is_loaned()); |
| pmm_page_queues()->Remove(p->Page()); |
| } else if (p->IsReference()) { |
| // Regular references are permitted in the VmPageSpliceList and can be moved as-is; it is up |
| // to the receiver of the pages to reject or otherwise deal with them. A temporary reference |
| // needs to be turned back into its page so we can move it. |
| if (auto maybe_page = MaybeDecompressReference(compression, p->Reference())) { |
| // Don't insert the page in the page queues, since we're trying to remove the pages, |
| // just update the page list reader for TakePages below. |
| VmPageOrMarker::ReferenceValue ref = p.SwapReferenceForPage(*maybe_page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }, |
| range.offset, range.end()); |
| |
| // If we did not find any pages, we could either be entirely inside a gap or an interval. Make |
| // sure we're not inside an interval; checking a single offset for membership should suffice. |
| ASSERT(found_page || !page_list_.IsOffsetInZeroInterval(range.offset)); |
| |
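| // Move the page list nodes for this range directly into the splice list. |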
| zx_status_t status = page_list_.TakePages(range.offset, pages); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| *taken_len = range.len; |
| return ZX_OK; |
| } |
| |
| // Removing pages is performed in a loop to more easily handle a couple of potential edge cases. |
| // If the VMO has a pager backed parent then any gaps in our page list may presently be seen as |
| // non zero content, but need to be zero after taking. To achieve this we can perform |
| // copy-on-write on the gap, giving us both pages we can take, and page list node slots we can |
| // insert markers into. However, we need to perform a new iteration in order to re-walk these |
| // offsets that are now committed. Having a loop therefore allows us to populate the gap and then |
| // restart the operation, potentially repeatedly if there are multiple gaps in the overall range. |
| uint64_t processed = 0; |
| do { |
| // Helper callback, compatible with being given to VmPageList::RemovePages, that moves content |
| // into the splice list and replaces it with zero content. On error it sets *taken_len to the |
| // current offset. |
| auto remove_page_callback = [&](VmPageOrMarker* slot, uint64_t offset) { |
| if (slot->IsMarker()) { |
| // Already zero. Can leave a gap, which is implied zero, in the splice list. |
| return ZX_ERR_NEXT; |
| } |
| if (slot->IsParentContent()) { |
| AssertHeld(lock_ref()); |
| auto cursor = GetLookupCursorLocked(VmCowRange(offset, PAGE_SIZE)); |
| if (cursor.is_error()) { |
| *taken_len = offset - range.offset; |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| auto result = cursor->RequireOwnedPage(true, 1, deferred, page_request); |
| if (result.is_error()) { |
| *taken_len = offset - range.offset; |
| return result.error_value(); |
| } |
| DEBUG_ASSERT(slot->Page() == result->page); |
| } |
| if (slot->IsReference()) { |
| // Regular references are permitted in the VmPageSpliceList and can be moved as-is; it is up to |
| // the receiver of the pages to reject or otherwise deal with them. A temporary reference needs |
| // to be turned back into its page so we can move it. |
| if (auto maybe_page = MaybeDecompressReference(compression, slot->Reference())) { |
| // Don't insert the page in the page queues, since we're trying to remove the pages, |
| // just update the page list reader for TakePages below. |
| VmPageOrMarker::ReferenceValue ref = slot->SwapReferenceForPage(*maybe_page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } else if (slot->IsPage()) { |
| DEBUG_ASSERT(slot->Page()->object.pin_count == 0); |
| // Cannot be taking pages from a pager backed VMO, hence cannot be taking a loaned page. |
| DEBUG_ASSERT(!slot->Page()->is_loaned()); |
| Pmm::Node().GetPageQueues()->Remove(slot->Page()); |
| } |
| // Take the content and place it in the splice list. |
| DEBUG_ASSERT(slot->IsPageOrRef()); |
| zx_status_t status = pages->Insert(offset - range.offset + splice_offset, ktl::move(*slot)); |
| if (status != ZX_OK) { |
| ASSERT(status == ZX_ERR_NO_MEMORY); |
| *taken_len = offset - range.offset; |
| return status; |
| } |
| // Check if we need to insert a marker to zero the current location. |
| auto parent_has_content = [this](uint64_t offset) { |
| PageLookup content; |
| AssertHeld(lock_ref()); |
| FindInitialPageContentLocked(offset, &content); |
| return !!content.cursor.current(); |
| }; |
| if (!node_has_parent_content_markers() && |
| (root_has_page_source() || parent_has_content(offset))) { |
| *slot = VmPageOrMarker::Marker(); |
| } |
| return ZX_ERR_NEXT; |
| }; |
| |
| // Process any pages and record any gap we find that needs processing. Start with the assumption |
| // that there is a gap at the end of the range to process in order to simplify termination logic |
| // later on. |
| uint64_t found_gap_start = range.end(); |
| uint64_t found_gap_end = found_gap_start; |
| zx_status_t status = page_list_.RemovePagesAndIterateGaps( |
| remove_page_callback, |
| [&](uint64_t gap_start, uint64_t gap_end) { |
| if (node_has_parent_content_markers()) { |
| // Gaps imply zero content so we can just leave a gap in the splice list and continue. |
| return ZX_ERR_NEXT; |
| } |
| found_gap_start = gap_start; |
| found_gap_end = gap_end; |
| return ZX_ERR_STOP; |
| }, |
| range.offset + processed, range.end()); |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| if (found_gap_start < found_gap_end) { |
| // Most likely this range is pager backed, since TakePages cannot be called on a hidden node, |
| // and if we do not have parent content markers then we are pager backed. As such this entire |
| // gap has non-zero content as determined by the page source or one of our intermediate |
| // parents, so just perform copy-on-write on the whole range so we can then take those pages. |
| const uint64_t gap_len = found_gap_end - found_gap_start; |
| AssertHeld(lock_ref()); |
| auto cursor = GetLookupCursorLocked(VmCowRange(found_gap_start, gap_len)); |
| if (cursor.is_error()) { |
| *taken_len = found_gap_start - range.offset; |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| for (uint64_t offset = 0; offset < gap_len; offset += PAGE_SIZE) { |
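| // Commit a page at this offset, passing the number of pages remaining in the gap so that any |
| // generated page request can cover the rest of the gap as well. |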
| auto result = cursor->RequireOwnedPage( |
| true, static_cast<uint>((gap_len - offset) / PAGE_SIZE), deferred, page_request); |
| // In the case of an error we want to take any pages we may have successfully committed in |
| // this loop in order to ensure forward progress. |
| if (result.is_error()) { |
| *taken_len = found_gap_start + offset - range.offset; |
| // The only error we need to handle forward progress for is ZX_ERR_SHOULD_WAIT, anything |
| // else doesn't matter as it will not retry. |
| if (result.error_value() != ZX_ERR_SHOULD_WAIT || offset == 0) { |
| return result.error_value(); |
| } |
| status = page_list_.RemovePages(remove_page_callback, found_gap_start, |
| found_gap_start + offset); |
| if (status == ZX_OK) { |
| // If RemovePages completed successfully then we can return our original error. |
| status = ZX_ERR_SHOULD_WAIT; |
| } else if (status == ZX_ERR_SHOULD_WAIT) { |
| // The remove_page_callback will have updated `taken_len` to reflect exactly how much |
| // progress it made before encountering an allocation failure, ensuring we make forwards |
| // progress. |
| } else { |
| // We received a different error, most likely ZX_ERR_NO_MEMORY. This takes precedence |
| // over the previous ZX_ERR_SHOULD_WAIT, so we need to cancel any page requests. |
| page_request->CancelRequests(); |
| } |
| return status; |
| } |
| } |
| } |
| // Set the amount we have processed such that we retry from the start of the gap we potentially |
| // just committed. If no gap is found then due to the way found_gap_start was initialized we |
| // will terminate the loop. |
| processed = found_gap_start - range.offset; |
| // Keep going as long as we have something to process. |
| } while (processed < range.len); |
| |
| pages->Finalize(); |
| *taken_len = range.len; |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::SupplyPagesLocked(VmCowRange range, VmPageSpliceList* pages, |
| SupplyOptions options, uint64_t* supplied_len, |
| DeferredOps& deferred, MultiPageRequest* page_request) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(supplied_len); |
| ASSERT(options != SupplyOptions::PagerSupply || page_source_); |
| |
| if (!range.IsBoundedBy(size_)) { |
| *supplied_len = 0; |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (options == SupplyOptions::TransferData) { |
| if (page_source_) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if (AnyPagesPinnedLocked(range.offset, range.len)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| } |
| |
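| // A detached page source cannot have new content supplied to it. |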
| if (page_source_ && page_source_->is_detached()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| const uint64_t start = range.offset; |
| const uint64_t end = range.end(); |
| |
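| // A data transfer is allowed to overwrite existing non-zero content, whereas a pager supply |
| // must only populate offsets that currently have no content. |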
| const CanOverwriteContent overwrite_policy = options == SupplyOptions::TransferData |
| ? CanOverwriteContent::NonZero |
| : CanOverwriteContent::None; |
| |
| // If this node is utilizing parent content markers then we can perform a very efficient supply |
| // as we can freely clear existing content and leave gaps to indicate zero content. |
| // TODO(https://fxbug.dev/434536251): Deduplicate this into a more general solution for all kinds |
| // of supply pages. |
| if (node_has_parent_content_markers()) { |
| DEBUG_ASSERT(!page_source_); |
| DEBUG_ASSERT(options == SupplyOptions::TransferData); |
| DEBUG_ASSERT(overwrite_policy == CanOverwriteContent::NonZero); |
| VmCompression* compression = Pmm::Node().GetPageCompression(); |
| |
| RangeChangeUpdateLocked(range, RangeChangeOp::Unmap, &deferred); |
| |
| // Any content in the splice list is inserted into our page list, overwriting (i.e. freeing) |
| // any existing content. Any gaps in the splice list imply zeroes which, given this node uses |
| // parent content markers, can be represented by ensuring the corresponding range in this VMO |
| // is empty. |
| zx_status_t status = pages->RemovePagesAndIterateGaps( |
| [&](VmPageOrMarker slot, uint64_t src_offset) { |
| AssertHeld(lock_ref()); |
| DEBUG_ASSERT(!slot.IsInterval()); |
| const uint64_t dst_offset = start + src_offset; |
| zx::result<VmPageOrMarker> result = |
| AddPageLocked(dst_offset, ktl::move(slot), overwrite_policy, nullptr); |
| if (result.is_error()) { |
| *supplied_len = src_offset; |
| return result.error_value(); |
| } |
| if (result->IsPage()) { |
| vm_page_t* page = result->ReleasePage(); |
| Pmm::Node().GetPageQueues()->Remove(page); |
| list_add_tail(deferred.FreedList(this).List(), &page->queue_node); |
| } else if (result->IsReference()) { |
| compression->Free(result->ReleaseReference()); |
| } else if (result->IsParentContent()) { |
| // In the case of parent content we need to find the original owner and release our share |
| // count for the content. |
| PageLookup lookup_info; |
| FindInitialPageContentLocked(dst_offset, &lookup_info); |
| DEBUG_ASSERT(lookup_info.cursor.current() && !lookup_info.cursor.current()->IsEmpty()); |
| DEBUG_ASSERT(lookup_info.owner); |
| lookup_info.owner.locked().DecrementCowContentShareCount( |
| lookup_info.cursor.current(), dst_offset, deferred.FreedList(this), compression); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [&](uint64_t gap_start, uint64_t gap_end) { |
| const uint64_t gap_dst_start = gap_start + start; |
| const uint64_t gap_dst_end = gap_end + start; |
| AssertHeld(lock_ref()); |
| ReleaseOwnedPagesRangeLocked(gap_dst_start, gap_dst_end - gap_dst_start, LockedPtr(), |
| deferred.FreedList(this)); |
| return ZX_ERR_NEXT; |
| }); |
| if (status == ZX_OK) { |
| *supplied_len = range.len; |
| } |
| return status; |
| } |
| |
| // If this VMO has a parent, we need to make sure we take ownership of all of the pages in the |
| // input range. |
| // TODO(https://fxbug.dev/42076904): This is suboptimal, as we take ownership of a page just to |
| // free it immediately when we replace it with the supplied page. |
| if (parent_) { |
| uint64_t position = start; |
| auto cursor = GetLookupCursorLocked(range); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| while (position < end) { |
| auto result = cursor->RequireOwnedPage(true, static_cast<uint>((end - position) / PAGE_SIZE), |
| deferred, page_request); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| position += PAGE_SIZE; |
| } |
| } |
| |
| // [new_pages_start, new_pages_start + new_pages_len) tracks the current run of |
| // consecutive new pages added to this vmo. |
| uint64_t offset = range.offset; |
| uint64_t new_pages_start = offset; |
| uint64_t new_pages_len = 0; |
| zx_status_t status = ZX_OK; |
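| // Record the starting position of the splice list so we can check at the end that the number |
| // of entries consumed matches the supplied length. |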
| [[maybe_unused]] uint64_t initial_list_position = pages->Position(); |
| while (!pages->IsProcessed()) { |
| // With a PageSource only Pages are supported, so convert any refs to real pages. |
| // We do this without popping a page from the splice list as `MakePageFromReference` may return |
| // ZX_ERR_SHOULD_WAIT. This could lead the caller to wait on the page request and call |
| // `SupplyPagesLocked` again, at which point it would expect the operation to continue at the |
| // exact same page. |
| VmPageOrMarkerRef src_page_ref = pages->PeekReference(); |
| // The src_page_ref can be null if the head of the page list is not a reference or if the page |
| // list is empty. |
| if (src_page_ref) { |
| DEBUG_ASSERT(src_page_ref->IsReference()); |
| status = MakePageFromReference(src_page_ref, page_request->GetAnonymous()); |
| if (status != ZX_OK) { |
| break; |
| } |
| } |
| VmPageOrMarker src_page = pages->Pop(); |
| DEBUG_ASSERT(!src_page.IsReference()); |
| |
| // The pager API does not allow the source VMO of supply pages to have a page source, so we can |
| // assume that any empty pages are zeroes and insert explicit markers here. We need to insert |
| // explicit markers to actually resolve the pager fault. |
| // If we are using parent content markers then we do not want to insert redundant markers |
| // into a node. This would only happen when performing a data transfer, where this is not |
| // actually pager backed and so we do not actually need to insert anything as there is no |
| // fault to resolve. We will have to make the slot read as zero though, which is handled later |
| // on by clearing the slot. |
| if (src_page.IsEmpty() && !node_has_parent_content_markers()) { |
| src_page = VmPageOrMarker::Marker(); |
| } |
| |
| // A newly supplied page starts off as Clean. |
| if (src_page.IsPage() && is_source_preserving_page_content()) { |
| UpdateDirtyStateLocked(src_page.Page(), offset, DirtyState::Clean, |
| /*is_pending_add=*/true); |
| } |
| |
| VmPageOrMarker old_page; |
| // Defer individual range updates so we can do them in blocks. |
| if (src_page.IsEmpty()) { |
| DEBUG_ASSERT(node_has_parent_content_markers()); |
| DEBUG_ASSERT(overwrite_policy == CanOverwriteContent::NonZero); |
| // If the src page is empty this implies we want the content to be zero, which can be achieved |
| // when using parent content markers by just clearing the slot. |
| old_page = page_list_.RemoveContent(offset); |
| // If we had a parent, and hence could have any parent content markers, then RequireOwnedPage |
| // should have transformed them into actual pages and so we should never see a parent content |
| // marker at this point. |
| DEBUG_ASSERT(!old_page.IsParentContent()); |
| } else { |
| auto page_transaction = BeginAddPageLocked(offset, overwrite_policy); |
| if (page_transaction.is_error()) { |
| // Unable to insert anything at this slot, cleanup any existing src_page and handle a |
| // completed run. |
| if (src_page.IsPageOrRef()) { |
| DEBUG_ASSERT(src_page.IsPage()); |
| vm_page_t* page = src_page.ReleasePage(); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(deferred.FreedList(this).List(), &page->queue_node); |
| } |
| |
| if (likely(page_transaction.status_value() == ZX_ERR_ALREADY_EXISTS)) { |
| // We hit the end of a run of absent pages, so notify the page source |
| // of any new pages that were added and reset the tracking variables. |
| if (new_pages_len) { |
| RangeChangeUpdateLocked(VmCowRange(new_pages_start, new_pages_len), |
| RangeChangeOp::Unmap, &deferred); |
| if (page_source_) { |
| page_source_->OnPagesSupplied(new_pages_start, new_pages_len); |
| } |
| } |
| new_pages_start = offset + PAGE_SIZE; |
| new_pages_len = 0; |
| offset += PAGE_SIZE; |
| continue; |
| } else { |
| // The only cause for this should be an out of memory error from the kernel heap when |
| // attempting to allocate a page list node. |
| status = page_transaction.status_value(); |
| ASSERT(status == ZX_ERR_NO_MEMORY); |
| break; |
| } |
| } |
| if (options == SupplyOptions::PhysicalPageProvider) { |
| // When being called from the physical page provider, we need to call InitializeVmPage(), |
| // which CompleteAddNewPageLocked() will do below. |
| // We only want to populate offsets that have true absence of content, so do not overwrite |
| // anything in the page list. |
| old_page = CompleteAddNewPageLocked(*page_transaction, src_page.Page(), |
| /*zero=*/false, nullptr); |
| // The page was successfully added, but we still have a copy in src_page, so we need to |
| // release it; however, we need to store the result in a temporary as we are required to use |
| // the result of ReleasePage. |
| [[maybe_unused]] vm_page_t* unused = src_page.ReleasePage(); |
| } else { |
| // When not being called from the physical page provider, we don't need InitializeVmPage(), |
| // so we use CompleteAddPageLocked(). |
| // We only want to populate offsets that have true absence of content, so do not overwrite |
| // anything in the page list. |
| old_page = CompleteAddPageLocked(*page_transaction, ktl::move(src_page), |
| ParentContent::Unknown, nullptr); |
| } |
| } |
| // If the content overwrite policy was None, the old page should be empty. |
| DEBUG_ASSERT(overwrite_policy != CanOverwriteContent::None || old_page.IsEmpty()); |
| // Clean up the old_page if necessary. The action taken is different depending on the state of |
| // old_page: |
| // 1. Page: If old_page is backed by an actual page, remove it from the page queues and free |
| // the page. |
| // 2. Reference: If old_page is a reference, free the reference. |
| // 3. Interval: We should not be overwriting data in a pager-backed VMO, so assert that |
| // old_page is not an interval. |
| // 4. Marker: There are no resources to free here, so do nothing. |
| if (old_page.IsPage()) { |
| vm_page_t* released_page = old_page.ReleasePage(); |
| // We do not overwrite content in pager backed VMOs, the only place where loaned pages can be, |
| // so any old page must never have been loaned. |
| DEBUG_ASSERT(!released_page->is_loaned()); |
| pmm_page_queues()->Remove(released_page); |
| DEBUG_ASSERT(!list_in_list(&released_page->queue_node)); |
| list_add_tail(deferred.FreedList(this).List(), &released_page->queue_node); |
| } else if (old_page.IsReference()) { |
| FreeReference(old_page.ReleaseReference()); |
| } else { |
| DEBUG_ASSERT(!old_page.IsInterval()); |
| DEBUG_ASSERT(!old_page.IsParentContent()); |
| } |
| new_pages_len += PAGE_SIZE; |
| DEBUG_ASSERT(new_pages_start + new_pages_len <= end); |
| |
| offset += PAGE_SIZE; |
| } |
| // Unless there was an error and we exited the loop early, there should have been the correct |
| // number of pages in the splice list. |
| DEBUG_ASSERT(offset == end || status != ZX_OK); |
| if (new_pages_len) { |
| RangeChangeUpdateLocked(VmCowRange(new_pages_start, new_pages_len), RangeChangeOp::Unmap, |
| &deferred); |
| if (page_source_) { |
| page_source_->OnPagesSupplied(new_pages_start, new_pages_len); |
| } |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| *supplied_len = offset - start; |
| // In the case of ZX_OK or ZX_ERR_SHOULD_WAIT we should have supplied exactly as many pages as we |
| // processed. In any other case the value is undefined. |
| DEBUG_ASSERT(((pages->Position() - initial_list_position) == *supplied_len) || |
| (status != ZX_OK && status != ZX_ERR_SHOULD_WAIT)); |
| return status; |
| } |
| |
| // This is a transient operation used only to fail currently outstanding page requests. It does not |
| // alter the state of the VMO, or any pages that might have already been populated within the |
| // specified range. |
| // |
| // If certain pages in this range are populated, we must have done so via a previous SupplyPages() |
| // call that succeeded. So it might be fine for clients to continue accessing them, despite the |
| // larger range having failed. |
| // |
| // TODO(rashaeqbal): If we support a more permanent failure mode in the future, we will need to free |
| // populated pages in the specified range, and possibly detach the VMO from the page source. |
| zx_status_t VmCowPages::FailPageRequestsLocked(VmCowRange range, zx_status_t error_status) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| |
| ASSERT(page_source_); |
| |
| if (!PageSource::IsValidInternalFailureCode(error_status)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (page_source_->is_detached()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| page_source_->OnPagesFailed(range.offset, range.len, error_status); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::DirtyPages(VmCowRange range, list_node_t* alloc_list, |
| AnonymousPageRequest* page_request) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| |
| ASSERT(page_source_); |
| |
| if (!page_source_->ShouldTrapDirtyTransitions()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| |
| const uint64_t start_offset = range.offset; |
| const uint64_t end_offset = range.end(); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
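| // The VMO could have been resized since the DIRTY request was generated, so validate the |
| // range against the current size now that the lock is held. |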
| if (start_offset > size_locked()) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // Overflow check. |
| if (end_offset < start_offset) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // After the above checks, the page source has tried to respond correctly to a range of dirty |
| // requests, so the kernel should resolve those outstanding dirty requests, even in the failure |
| // case. From a returned error, the page source currently has no ability to detect which ranges |
| // caused the error, so the kernel should either completely succeed or fail the request instead of |
| // holding onto a partial outstanding request that will block pager progress. |
| auto invalidate_requests_on_error = fit::defer([this, len = range.len, start_offset] { |
| AssertHeld(lock_ref()); |
| DEBUG_ASSERT(size_locked() >= start_offset); |
| |
| uint64_t invalidate_len = ktl::min(size_locked() - start_offset, len); |
| InvalidateDirtyRequestsLocked(start_offset, invalidate_len); |
| }); |
| |
| // The page source may have tried to mark a larger range than necessary as dirty. Invalidate the |
| // requests and return an error. |
| if (end_offset > size_locked()) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (page_source_->is_detached()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // If any of the pages in the range are zero page markers (Clean zero pages), they need to be |
| // forked in order to be dirtied (written to). Find the number of such pages that need to be |
| // allocated. We also need to allocate zero pages to replace sparse zero intervals. |
| size_t zero_pages_count = 0; |
| // This tracks the beginning of an interval that falls in the specified range. Since we might |
| // start partway inside an interval, this is initialized to start_offset so that we only consider |
| // the portion of the interval inside the range. If we did not start inside an interval, we will |
| // end up reinitializing this when we do find an interval start, before this value is used, so it |
| // is safe to initialize to start_offset in all cases. |
| uint64_t interval_start = start_offset; |
| // This tracks whether we saw an interval start sentinel in the traversal, but have not yet |
| // encountered a matching interval end sentinel. Should we end the traversal partway in an |
| // interval, we will need to handle the portion of the interval between the interval start and the |
| // end of the specified range. |
| bool unmatched_interval_start = false; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&zero_pages_count, &interval_start, &unmatched_interval_start, &found_page_or_gap]( |
| const VmPageOrMarker* p, uint64_t off) { |
| found_page_or_gap = true; |
| if (p->IsMarker()) { |
| zero_pages_count++; |
| return ZX_ERR_NEXT; |
| } |
| if (p->IsIntervalZero()) { |
| if (p->IsIntervalStart()) { |
| interval_start = off; |
| unmatched_interval_start = true; |
| } else if (p->IsIntervalEnd()) { |
| zero_pages_count += (off - interval_start + PAGE_SIZE) / PAGE_SIZE; |
| unmatched_interval_start = false; |
| } else { |
| DEBUG_ASSERT(p->IsIntervalSlot()); |
| zero_pages_count++; |
| } |
| return ZX_ERR_NEXT; |
| } |
| // Pager-backed VMOs cannot have compressed references, so the only other type is a page. |
| DEBUG_ASSERT(p->IsPage()); |
| return ZX_ERR_NEXT; |
| }, |
| [&found_page_or_gap](uint64_t start, uint64_t end) { |
| found_page_or_gap = true; |
| // A gap indicates a page that has not been supplied yet. It will need to be supplied |
| // first. Although we will never generate a DIRTY request for absent pages in the first |
| // place, it is still possible for a clean page to get evicted after the DIRTY request was |
| // generated. It is also possible for a dirty zero interval to have been written back such |
| // that we have an old DIRTY request for the interval. |
| // |
| // Spuriously resolve the DIRTY page request, and let the waiter(s) retry looking up the |
| // page, which will generate a READ request first to supply the missing page. |
| return ZX_ERR_NOT_FOUND; |
| }, |
| start_offset, end_offset); |
| |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // Handle the last interval or if we did not enter the traversal callbacks at all. |
| if (unmatched_interval_start || !found_page_or_gap) { |
| DEBUG_ASSERT(found_page_or_gap || interval_start == start_offset); |
| zero_pages_count += (end_offset - interval_start) / PAGE_SIZE; |
| } |
| |
| // If we have found any zero pages to populate, then we need to allocate and transition them to |
| // the dirty state. |
| if (zero_pages_count > 0) { |
| // Allocate the number of zero pages required upfront, so that we can fail the call early if the |
| // page allocation fails. First determine how many pages we still need to allocate, based on the |
| // number of existing pages in the list. |
| uint64_t alloc_list_len = list_length(alloc_list); |
| zero_pages_count = zero_pages_count > alloc_list_len ? zero_pages_count - alloc_list_len : 0; |
| |
| // First try to allocate all the pages at once. This is an optimization and avoids repeated |
| // calls to the PMM to allocate single pages. If the PMM returns ZX_ERR_SHOULD_WAIT, fall back |
| // to allocating one page at a time below, giving reclamation strategies a better chance to |
| // catch up with incoming allocation requests. |
| status = pmm_alloc_pages(zero_pages_count, pmm_alloc_flags_, alloc_list); |
| if (status == ZX_OK) { |
| // All requested pages allocated. |
| zero_pages_count = 0; |
| } else { |
| if (status != ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| |
| // Fall back to allocating a single page at a time. We want to do this before we can start |
| // inserting pages into the page list, to avoid rolling back any pages we inserted but could |
| // not dirty in case we fail partway after having inserted some pages into the page list. |
| // Rolling back like this can lead to a livelock where we are constantly allocating some |
| // pages, freeing them, waiting on the page_request, and then repeating. |
| // |
| // If allocations do fail partway here, we will have accumulated the allocated pages in |
| // alloc_list, so we will be able to reuse them on a subsequent call to DirtyPagesLocked. This |
| // ensures we are making forward progress across successive calls. |
| while (zero_pages_count > 0) { |
| vm_page_t* new_page; |
| // We will initialize this page later when passing it to AddNewPageLocked |
| status = AllocUninitializedPage(&new_page, page_request); |
| // If single page allocation fails, bubble up the failure. |
| if (status != ZX_OK) { |
| // If propagating up ZX_ERR_SHOULD_WAIT do not consider this an error that requires |
| // invalidating the dirty request as we are going to retry it. |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| invalidate_requests_on_error.cancel(); |
| } |
| return status; |
| } |
| list_add_tail(alloc_list, &new_page->queue_node); |
| zero_pages_count--; |
| } |
| } |
| DEBUG_ASSERT(zero_pages_count == 0); |
| |
| // We have to mark all the requested pages Dirty *atomically*. The user pager might be tracking |
| // filesystem space reservations based on the success / failure of this call. So if we fail |
| // partway, the user pager might think that no pages in the specified range have been dirtied, |
| // which would be incorrect. If there are any conditions that would cause us to fail, evaluate |
| // those before actually adding the pages, so that we can return the failure early before |
| // starting to mark pages Dirty. |
| // |
| // Install page slots for all the intervals we'll be adding zero pages in. Page insertion will |
| // only proceed once we've allocated all the slots without any errors. |
| // Populating slots will alter the page list. So break out of the traversal upon finding an |
| // interval, populate slots in it, and then resume the traversal after the interval. |
| uint64_t next_start_offset = start_offset; |
| do { |
| struct { |
| bool found_interval; |
| uint64_t start; |
| uint64_t end; |
| } state = {.found_interval = false, .start = 0, .end = 0}; |
| status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| return p->IsIntervalStart() || p->IsIntervalEnd(); |
| }, |
| [](const VmPageOrMarker* p, uint64_t off) { |
| DEBUG_ASSERT(p->IsIntervalZero()); |
| return ZX_ERR_NEXT; |
| }, |
| [&state](uint64_t start, uint64_t end, bool is_interval) { |
| DEBUG_ASSERT(is_interval); |
| state = {.found_interval = true, .start = start, .end = end}; |
| return ZX_ERR_STOP; |
| }, |
| next_start_offset, end_offset); |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // No intervals remain. |
| if (!state.found_interval) { |
| break; |
| } |
| // Ensure we're making forward progress. |
| DEBUG_ASSERT(state.end - state.start >= PAGE_SIZE); |
| zx_status_t st = page_list_.PopulateSlotsInInterval(state.start, state.end); |
| if (st != ZX_OK) { |
| DEBUG_ASSERT(st == ZX_ERR_NO_MEMORY); |
| // Before returning, we need to undo any slots we might have populated in intervals we |
| // previously encountered. This is a rare error case and can be inefficient. |
| for (uint64_t off = start_offset; off < state.start; off += PAGE_SIZE) { |
| auto slot = page_list_.Lookup(off); |
| if (slot) { |
| // If this is an interval slot, return it. Note that even though we did populate all |
| // slots until this point, not all will remain slots in this for-loop. When returning |
| // slots, they can merge with intervals both before and after, so it's possible that the |
| // next slot we were expecting has already been consumed. |
| if (slot->IsIntervalSlot()) { |
| page_list_.ReturnIntervalSlot(off); |
| } |
| } |
| } |
| return st; |
| } |
| next_start_offset = state.end; |
| } while (next_start_offset < end_offset); |
| |
| // All operations from this point on must succeed so we can atomically mark pages dirty. |
| |
| // Install newly allocated pages in place of the zero page markers and interval sentinels. Start |
| // with clean zero pages even for the intervals, so that the dirty transition logic below can |
| // uniformly transition them to dirty along with pager supplied pages. |
| status = page_list_.ForEveryPageInRange( |
| [this, &alloc_list, &deferred](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsMarker() || p->IsIntervalSlot()) { |
| DEBUG_ASSERT(!list_is_empty(alloc_list)); |
| AssertHeld(lock_ref()); |
| |
| // AddNewPageLocked will also zero the page and update any mappings. |
| // |
| // TODO(rashaeqbal): Depending on how often we end up forking zero markers, we might |
| // want to pass a nullptr here instead of &deferred and perform a single batch update |
| // later. |
| zx_status_t status = |
| AddNewPageLocked(off, list_remove_head_type(alloc_list, vm_page, queue_node), |
| CanOverwriteContent::Zero, nullptr, true, &deferred); |
| // AddNewPageLocked will not fail with ZX_ERR_ALREADY_EXISTS as we can overwrite |
| // markers and interval slots since they are zero, nor with ZX_ERR_NO_MEMORY as we don't |
| // need to allocate a new slot in the page list, we're simply replacing its content. |
| ASSERT(status == ZX_OK); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| |
| // We don't expect an error from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
| status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| DEBUG_ASSERT(!p->IsReference()); |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(is_page_clean(page) || !page->is_loaned()); |
| return !is_page_dirty(page); |
| } |
| return false; |
| }, |
| [this](const VmPageOrMarker* p, uint64_t off) { |
| DEBUG_ASSERT(p->IsPage()); |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(!is_page_dirty(page)); |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(page, off, DirtyState::Dirty); |
| return ZX_ERR_NEXT; |
| }, |
| [this](uint64_t start, uint64_t end, bool unused) { |
| page_source_->OnPagesDirtied(start, end - start); |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // All pages have been dirtied successfully, so cancel the cleanup on error. |
| invalidate_requests_on_error.cancel(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::EnumerateDirtyRangesLocked(VmCowRange range, |
| DirtyRangeEnumerateFunction&& dirty_range_fn) { |
| canary_.Assert(); |
| |
| // Dirty pages are only tracked if the page source preserves content. |
| if (!is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| const uint64_t start_offset = ROUNDDOWN_PAGE_SIZE(range.offset); |
| const uint64_t end_offset = ROUNDUP_PAGE_SIZE(range.end()); |
| |
| zx_status_t status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // Enumerate both AwaitingClean and Dirty pages, i.e. anything that is not Clean. |
| // AwaitingClean pages are "dirty" too for the purposes of this enumeration, since their |
| // modified contents are still in the process of being written back. |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(is_page_clean(page) || !page->is_loaned()); |
| return !is_page_clean(page); |
| } |
| // Enumerate any dirty zero intervals. |
| if (p->IsIntervalZero()) { |
| // For now we do not support clean intervals. |
| DEBUG_ASSERT(!p->IsZeroIntervalClean()); |
| return p->IsZeroIntervalDirty(); |
| } |
| // Pager-backed VMOs cannot have compressed references, so the only other type is a marker. |
| DEBUG_ASSERT(p->IsMarker()); |
| return false; |
| }, |
| [](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(!is_page_clean(page)); |
| DEBUG_ASSERT(!page->is_loaned()); |
| DEBUG_ASSERT(page->object.get_page_offset() == off); |
| } else if (p->IsIntervalZero()) { |
| DEBUG_ASSERT(p->IsZeroIntervalDirty()); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [&dirty_range_fn](uint64_t start, uint64_t end, bool is_interval) { |
| // Zero intervals are enumerated as zero ranges. |
| return dirty_range_fn(start, end - start, /*range_is_zero=*/is_interval); |
| }, |
| start_offset, end_offset); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::WritebackBeginLocked(VmCowRange range, bool is_zero_range) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| |
| ASSERT(page_source_); |
| |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| const uint64_t start_offset = range.offset; |
| const uint64_t end_offset = range.end(); |
| // We only need to consider transitioning committed pages if the caller has specified that this is |
| // not a zero range. For a zero range, we cannot start cleaning any pages because the caller has |
| // expressed intent to write back zeros in this range; any pages we clean might get evicted and |
| // incorrectly supplied again as zero pages, leading to data loss. |
| // |
| // When querying dirty ranges, zero page intervals are indicated as dirty zero ranges. So it's |
| // perfectly reasonable for the user pager to write back these zero ranges efficiently without |
| // having to read the actual contents of the range, which would read zeroes anyway. There can |
| // exist a race however, where the user pager has just discovered a dirty zero range, and before |
| // it starts writing it out, an actual page gets dirtied in that range. Consider the following |
| // example that demonstrates the race: |
| // 1. The zero interval [5, 10) is indicated as a dirty zero range when the user pager queries |
| // dirty ranges. |
| // 2. A write comes in for page 7 and it is marked Dirty. The interval is split up into two: [5, |
| // 7) and [8, 10). |
| // 3. The user pager prepares to write the range [5, 10) with WritebackBegin. |
| // 4. Both the intervals as well as page 7 are marked AwaitingClean. |
| // 5. The user pager still thinks that [5, 10) is zero and writes back zeroes for the range. |
| // 6. The user pager does a WritebackEnd on [5, 10), and page 7 gets marked Clean. |
| // 7. At some point in the future, page 7 gets evicted. The data on page 7 (which was prematurely |
| // marked Clean) is now lost. |
| // |
| // This race occurred because there was a mismatch between what the user pager and the kernel |
| // think the contents of the range being written back are. The user pager intended to mark only |
| // zero ranges clean, not actual pages. The is_zero_range flag captures this intent, so that the |
| // kernel does not incorrectly clean actual committed pages. Committed dirty pages will be |
| // returned as actual dirty pages (not dirty zero ranges) on a subsequent call to query dirty |
| // ranges, and can be cleaned then. |
| |
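| // Track the start sentinel of the dirty zero interval currently being traversed, if any, so |
| // that the whole interval can be transitioned once its end is encountered. |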
| auto interval_start = VmPageOrMarkerRef(nullptr); |
| uint64_t interval_start_off; |
| zx_status_t status = page_list_.ForEveryPageInRangeMutable( |
| [is_zero_range, &interval_start, &interval_start_off, this](VmPageOrMarkerRef p, |
| uint64_t off) { |
| // VMOs with a page source should never have references. |
| DEBUG_ASSERT(!p->IsReference()); |
| // If the page is pinned we have to leave it Dirty in case it is still being written to |
| // via DMA. The VM system will be unaware of these writes, and so we choose to be |
| // conservative here and might end up with pinned pages being left dirty for longer, until |
| // a writeback is attempted after the unpin. |
| // If the caller indicates that they're only cleaning zero pages, any committed pages need |
| // to be left dirty. |
| if (p->IsPage() && (p->Page()->object.pin_count > 0 || is_zero_range)) { |
| return ZX_ERR_NEXT; |
| } |
| // Transition pages from Dirty to AwaitingClean. |
| if (p->IsPage() && is_page_dirty(p->Page())) { |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(p->Page(), off, DirtyState::AwaitingClean); |
| return ZX_ERR_NEXT; |
| } |
| // Transition dirty zero intervals to AwaitingClean. |
| if (p->IsIntervalZero()) { |
| if (!p->IsZeroIntervalDirty()) { |
| // The only other state we support is Untracked. |
| DEBUG_ASSERT(p->IsZeroIntervalUntracked()); |
| return ZX_ERR_NEXT; |
| } |
| if (p->IsIntervalStart() || p->IsIntervalSlot()) { |
| // Start tracking a dirty interval. It will only transition once the end is encountered. |
| DEBUG_ASSERT(!interval_start); |
| interval_start = p; |
| interval_start_off = off; |
| } |
| if (p->IsIntervalEnd() || p->IsIntervalSlot()) { |
| // Now that we've encountered the end, the entire interval can be transitioned to |
| // AwaitingClean. This is done by setting the AwaitingCleanLength of the start sentinel. |
| // TODO: If the writeback began partway into the interval, try to coalesce the start's |
| // awaiting clean length with the range being cleaned here if it immediately follows. |
| if (interval_start) { |
| // Set the new AwaitingClean length to the max of the old value and the new one. |
| // See comments in WritebackEndLocked for an explanation. |
| const uint64_t old_len = interval_start->GetZeroIntervalAwaitingCleanLength(); |
| interval_start.SetZeroIntervalAwaitingCleanLength( |
| ktl::max(off - interval_start_off + PAGE_SIZE, old_len)); |
| } |
| // Reset the interval start so we can track a new one later. |
| interval_start = VmPageOrMarkerRef(nullptr); |
| } |
| return ZX_ERR_NEXT; |
| } |
| // This was either a marker (which is already clean), or a non-Dirty page. |
| DEBUG_ASSERT(p->IsMarker() || !is_page_dirty(p->Page())); |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // Process the last partial interval. |
| if (interval_start) { |
| DEBUG_ASSERT(interval_start->IsIntervalStart()); |
| const uint64_t old_len = interval_start->GetZeroIntervalAwaitingCleanLength(); |
| interval_start.SetZeroIntervalAwaitingCleanLength( |
| ktl::max(end_offset - interval_start_off, old_len)); |
| } |
| |
| // Set any mappings for this range to read-only, so that a permission fault is triggered the next |
| // time the page is written to in order for us to track it as dirty. This might cover more pages |
| // than the Dirty pages found in the page list traversal above, but we choose to do this once for |
| // the entire range instead of per page; pages in the AwaitingClean and Clean states will already |
| // have their write permission removed, so this is a no-op for them. |
| const VmCowRange range_update = VmCowRange(start_offset, end_offset - start_offset); |
| RangeChangeUpdateLocked(range_update, RangeChangeOp::RemoveWrite, nullptr); |
| // No range change needs to be processed for the children since children, by virtue of being |
| // copy-on-write, cannot have a writable mapping. |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::WritebackEndLocked(VmCowRange range) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| |
| ASSERT(page_source_); |
| |
| if (!range.IsBoundedBy(size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| const uint64_t start_offset = range.offset; |
| const uint64_t end_offset = range.end(); |
| |
| // Mark any AwaitingClean pages Clean. Remove AwaitingClean intervals that can be fully cleaned, |
| // otherwise clip the interval start removing the part that has been cleaned. Note that deleting |
| // an interval start is delayed until the corresponding end is encountered, and to ensure safe |
| // continued traversal, the start should always be released before the end, i.e. in the expected |
| // forward traversal order for RemovePages. |
| VmPageOrMarker* interval_start = nullptr; |
| uint64_t interval_start_off; |
| // This tracks the end offset until which all zero intervals can be marked clean. This is a |
| // running counter that is maintained across multiple zero intervals. Each time we encounter |
| // a new interval start, we take the max of the existing value and the AwaitingCleanLength of the |
| // new interval. This is because when zero intervals are truncated at the end or split, their |
| // AwaitingCleanLength does not get updated, even if it's larger than the current interval length. |
| // This is an optimization to avoid having to potentially walk to another node to find the |
| // relevant start to update. The reason it is safe to leave the AwaitingCleanLength unchanged is |
| // that it should be possible to apply the AwaitingCleanLength to any new zero intervals that get |
| // added later beyond the truncated interval. The user pager has indicated its intent to write a |
| // range as zeros, so until the point that it actually completes the writeback, it doesn't matter |
| // if zero intervals are removed and re-added, as long as they fall in the range that was |
| // initially indicated as being written back as zeros. |
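| // Illustrative example (hypothetical offsets): if a dirty interval starting at 0x1000 carries an |
| // AwaitingCleanLength of 0x3000, interval_awaiting_clean_end advances to at least 0x4000, and an |
| // interval whose end is encountered before that offset can be treated as fully clean and removed. |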
| uint64_t interval_awaiting_clean_end = start_offset; |
| page_list_.RemovePages( |
| [&interval_start, &interval_start_off, &interval_awaiting_clean_end, this](VmPageOrMarker* p, |
| uint64_t off) { |
| // VMOs with a page source should never have references. |
| DEBUG_ASSERT(!p->IsReference()); |
| // Transition pages from AwaitingClean to Clean. |
| if (p->IsPage() && is_page_awaiting_clean(p->Page())) { |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(p->Page(), off, DirtyState::Clean); |
| return ZX_ERR_NEXT; |
| } |
| // Handle zero intervals. |
| if (p->IsIntervalZero()) { |
| if (!p->IsZeroIntervalDirty()) { |
| // The only other state we support is Untracked. |
| DEBUG_ASSERT(p->IsZeroIntervalUntracked()); |
| return ZX_ERR_NEXT; |
| } |
| if (p->IsIntervalStart() || p->IsIntervalSlot()) { |
| DEBUG_ASSERT(!interval_start); |
| // Start tracking an interval. |
| interval_start = p; |
| interval_start_off = off; |
| // See if we can advance interval_awaiting_clean_end to include the AwaitingCleanLength |
| // of this interval. |
| interval_awaiting_clean_end = ktl::max(interval_awaiting_clean_end, |
| off + p->GetZeroIntervalAwaitingCleanLength()); |
| } |
| if (p->IsIntervalEnd() || p->IsIntervalSlot()) { |
| // Can only transition the end if we saw the corresponding start. |
| if (interval_start) { |
| AssertHeld(lock_ref()); |
| if (off < interval_awaiting_clean_end) { |
| // The entire interval is clean, so can remove it. |
| if (interval_start_off != off) { |
| *interval_start = VmPageOrMarker::Empty(); |
| // Return the start slot as it could have come from an earlier page list node. |
| // If the start slot came from the same node, we know that we still have a |
| // non-empty slot in that node (the current interval end we're looking at), and so |
| // the current node cannot be freed up, making it safe to continue traversal. The |
| // interval start should always be released before the end, which is consistent |
| // with forward traversal done by RemovePages. |
| page_list_.ReturnEmptySlot(interval_start_off); |
| } |
| // This empty slot will be returned by the RemovePages iterator. |
| *p = VmPageOrMarker::Empty(); |
| } else { |
| // The entire interval cannot be marked clean. Move the interval start forward by the |
| // awaiting clean length, which will also set the AwaitingCleanLength for the resulting |
| // interval. |
| // Ignore any errors. Cleaning is best effort. If this fails, the interval will |
| // remain as is and get retried on another writeback attempt. |
| page_list_.ClipIntervalStart(interval_start_off, |
| interval_awaiting_clean_end - interval_start_off); |
| } |
| // Either way, the interval start tracking needs to be reset. |
| interval_start = nullptr; |
| } |
| } |
| return ZX_ERR_NEXT; |
| } |
| // This was either a marker (which is already clean), or a non-AwaitingClean page. |
| DEBUG_ASSERT(p->IsMarker() || !is_page_awaiting_clean(p->Page())); |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| |
| // Handle the last partial interval. |
| if (interval_start) { |
| // Ignore any errors. Cleaning is best effort. If this fails, the interval will remain as is and |
| // get retried on another writeback attempt. |
| page_list_.ClipIntervalStart( |
| interval_start_off, ktl::min(interval_awaiting_clean_end, end_offset) - interval_start_off); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| fbl::RefPtr<VmCowPages> VmCowPages::DebugGetParent() { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
| return parent_; |
| } |
| |
| void VmCowPages::DetachSource() { |
| canary_.Assert(); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
| DEBUG_ASSERT(page_source_); |
| page_source_->Detach(); |
| |
| // We would like to remove all committed pages so that all future page faults on this VMO and its |
| // clones can fail in a deterministic manner. However, if the page source is preserving content |
| // (is a userpager), we need to hold on to un-Clean (Dirty and AwaitingClean) pages so that they |
| // can be written back by the page source. If the page source is not preserving content, its pages |
| // will not be dirty tracked to begin with, i.e. their dirty state will be Untracked, so we will |
| // end up removing all pages. |
| |
| // We should only be removing pages from the root VMO. |
| DEBUG_ASSERT(!parent_); |
| |
| // Even though we might end up removing only a subset of the pages, unmap them all at once as an |
| // optimization. Only the userpager is expected to access (dirty) pages beyond this point, in |
| // order to write back their contents, where the cost of the writeback is presumably much larger |
| // than page faults to update hardware page table mappings for resident pages. |
| RangeChangeUpdateLocked(VmCowRange(0, size_), RangeChangeOp::Unmap, &deferred); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(deferred.FreedList(this)); |
| |
| // Remove all clean (or untracked) pages. |
| // TODO(rashaeqbal): Pages that linger after this will be written back and marked clean at some |
| // point, and will age through the pager-backed queues and eventually get evicted. We could |
| // adopt an eager approach instead, and decommit those pages as soon as they get marked clean. |
| // If we do that, we could also extend the eager approach to supply_pages, where pages get |
| // decommitted on supply, i.e. the supply is a no-op. |
| page_list_.RemovePages( |
| [&page_remover](VmPageOrMarker* p, uint64_t off) { |
| // A marker is a clean zero page. Replace it with an empty slot. |
| if (p->IsMarker()) { |
| *p = VmPageOrMarker::Empty(); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Zero intervals are dirty so they cannot be removed. |
| if (p->IsIntervalZero()) { |
| // TODO: Remove clean intervals once they are supported. |
| DEBUG_ASSERT(!p->IsZeroIntervalClean()); |
| return ZX_ERR_NEXT; |
| } |
| |
| // VMOs with a page source cannot have references. |
| DEBUG_ASSERT(p->IsPage()); |
| |
| // We cannot remove the page if it is dirty-tracked but not clean. |
| if (is_page_dirty_tracked(p->Page()) && !is_page_clean(p->Page())) { |
| DEBUG_ASSERT(!p->Page()->is_loaned()); |
| return ZX_ERR_NEXT; |
| } |
| |
| // This is a page that we're going to remove; we don't expect it to be pinned. |
| DEBUG_ASSERT(p->Page()->object.pin_count == 0); |
| |
| page_remover.Push(p->ReleasePage()); |
| return ZX_ERR_NEXT; |
| }, |
| 0, size_); |
| |
| page_remover.Flush(); |
| } |
| |
| void VmCowPages::RangeChangeUpdateLocked(VmCowRange range, RangeChangeOp op, |
| DeferredOps* deferred) { |
| canary_.Assert(); |
| // If we have children (or this is a pager-backed hierarchy) then we potentially need to perform |
| // deferred operations. |
| if (children_list_len_ != 0 || root_has_page_source()) { |
| if (deferred) { |
| deferred->AddRange(this, range, op); |
| } else { |
| // If the operation was RemoveWrite then, since children are copy-on-write and cannot have |
| // writable mappings, they do not require a deferred operation. This is still true for pager |
| // hierarchies as, since no content is actually changing, there is no need for serialization. |
| DEBUG_ASSERT(op == RangeChangeOp::RemoveWrite); |
| } |
| } |
| if (paged_ref_ && !range.is_empty()) { |
| paged_backlink_locked(this)->RangeChangeUpdateLocked(range, op); |
| } |
| } |
| |
| // static |
| void VmCowPages::RangeChangeUpdateCowChildren(LockedPtr self, VmCowRange range, RangeChangeOp op) { |
| self->canary_.Assert(); |
| |
| // Helper for checking and performing a range change on a single candidate node. Although it is |
| // only used once, it is split out here to keep the loops that actually walk the tree as easy to |
| // read as possible. |
| // Returns true if the passed in |candidate| had some overlap with the operation range, and hence |
| // its children also need to be walked. If false is returned the children of |candidate| can be |
| // skipped. Since we cannot continuously hold locks while walking the subtree we are racing with |
| // concurrent modifications to the tree, but it is still correct to skip subtrees. To explain |
| // why, first consider the following (impossible) scenario: |
| // A |
| // | |
| // |---| |
| // B ... |
| // | |
| // |---| |
| // C D |
| // 1. Thread 1 performs an unmap on a page in A (offset X), that can be seen by B, C and D |
| // 2. Thread 1 drops the lock of A to prepare to acquire lock of B |
| // 3. Thread 2 inserts a page into B at offset X, and starts its own child range change update. |
| // 4. Thread 2 drops the lock of B to prepare to acquire lock of C |
| // 5. Thread 1 acquires the lock of B, observes that B cannot see X in A and skips the subtree |
| // of C and D. |
| // At this point neither of the threads has performed an unmap on C or D, so how can thread 1 |
| // guarantee that neither can see the page at offset X in A? |
| // The reason this cannot happen, and why this is an impossible scenario, is that it would require |
| // B to not be a hidden node, i.e. to be part of a user pager hierarchy. However, user pager |
| // hierarchies have an additional lock used to serialize all such operations, and so the operation |
| // in thread 2 would not actually be able to start until thread 1 completely finished its range |
| // update and released this serialization lock. |
| auto check_candidate = [range, op](VmCowPages* candidate, uint64_t cur_accumulative_offset) |
| TA_REQ(candidate->lock()) -> bool { |
| uint64_t candidate_offset = 0; |
| uint64_t candidate_len = 0; |
| if (!GetIntersect(cur_accumulative_offset, candidate->size_, range.offset, range.len, |
| &candidate_offset, &candidate_len)) { |
| // No intersection, so we can skip this node and its subtree. |
| return false; |
| } |
| // If they intersect with us, then by definition the new offset must be >= the accumulated parent offset. |
| DEBUG_ASSERT(candidate_offset >= cur_accumulative_offset); |
| |
| // subtract our offset |
| candidate_offset -= cur_accumulative_offset; |
| |
| // verify that it's still within range of us |
| DEBUG_ASSERT(candidate_offset + candidate_len <= candidate->size_); |
| |
| // Check if there are any gaps in this range where we would actually see the parent. |
| uint64_t first_gap_start = UINT64_MAX; |
| uint64_t last_gap_end = 0; |
| candidate->page_list_.ForEveryPageAndGapInRange( |
| [&](auto page, uint64_t offset) { |
| // If we have found a parent content marker then we can specifically see the parent at |
| // this location, and can consider this like a gap. For anything else we know we do not |
| // see the parent for this offset, so regardless of what it is just keep looking for a |
| // gap. Additionally any children that we have will see this content instead of our |
| // parents, and so we know it is also safe to skip them as well. |
| if (page->IsParentContent()) { |
| first_gap_start = ktl::min(first_gap_start, offset); |
| last_gap_end = ktl::max(last_gap_end, offset + PAGE_SIZE); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [&](uint64_t start, uint64_t end) { |
| // A gap in the page list indicates a range where the parent can be seen, unless this is a |
| // leaf node using parent content markers, in which case a gap indicates a range where we |
| // do *not* see the parent. |
| if (!candidate->node_has_parent_content_markers()) { |
| first_gap_start = ktl::min(first_gap_start, start); |
| last_gap_end = ktl::max(last_gap_end, end); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| candidate_offset, candidate_offset + candidate_len); |
| |
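| // If nothing in this range can see the parent (no gaps, or no parent content markers for nodes |
| // that use them), then this node and its entire subtree can be skipped. |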
| if (first_gap_start >= last_gap_end) { |
| vm_vmo_range_update_from_parent_skipped.Add(1); |
| return false; |
| } |
| // Invalidate the new, potentially smaller, range that covers the gaps. Due to the inability to |
| // store state we cannot use this smaller range for processing any of our children, as we would |
| // not be able to restore the original range when walking back up, but it still limits the range |
| // we process here and might have elided this subtree altogether if no gap was found. This will |
| // still result in potentially processing pages that are locally covered, but they are limited |
| // to a single range here. |
| if (candidate->paged_ref_) { |
| AssertHeld(candidate->paged_ref_->lock_ref()); |
| candidate->paged_ref_->RangeChangeUpdateLocked( |
| VmCowRange(first_gap_start, last_gap_end - first_gap_start), op); |
| } |
| vm_vmo_range_update_from_parent_performed.Add(1); |
| // We processed this node and may need to walk the subtree. |
| return true; |
| }; |
| |
| if (range.is_empty()) { |
| return; |
| } |
| |
| if (self.locked().children_list_len_ == 0) { |
| return; |
| } |
| TreeWalkCursor cursor(ktl::move(self)); |
| |
| bool candidate = cursor.NextChild(); |
| |
| while (candidate) { |
| if (check_candidate(&cursor.GetCur().locked(), cursor.GetCurrentOffset())) { |
| candidate = cursor.NextChild(); |
| } else { |
| candidate = cursor.NextSibling(); |
| } |
| } |
| } |
| |
| void VmCowPages::FinishCachePolicyTransitionLocked() { |
| // No need to perform clean/invalidate if size is zero because there can be no pages. |
| if (size_ == 0) { |
| return; |
| } |
| |
| page_list_.ForEveryPage([this](const VmPageOrMarker* p, uint64_t off) { |
| if (!p->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(page->object.pin_count == 0); |
| // Refreshing the page queue will move the page to an unreclaimable one if applicable. |
| AssertHeld(lock_ref()); |
| MoveToNotPinnedLocked(page, off); |
| arch_clean_invalidate_cache_range((vaddr_t)paddr_to_physmap(page->paddr()), PAGE_SIZE); |
| return ZX_ERR_NEXT; |
| }); |
| } |
| |
| template <typename T> |
| ktl::optional<VmCowReclaimFailure> VmCowPages::CannotReclaimPageLocked(vm_page_t* page, T actual) { |
| // Check this page is still a part of this VMO. After this any failures should mark the page as |
| // accessed to prevent the page from remaining a reclamation candidate. |
| if (!actual || !actual->IsPage() || actual->Page() != page) { |
| vm_reclaim_incorrect_page.Add(1); |
| return VmCowReclaimFailure::IncorrectPage; |
| } |
| // Pinned pages could be in use by DMA so we cannot safely reclaim them. |
| if (page->object.pin_count != 0) { |
| // Loaned pages should never end up pinned. |
| DEBUG_ASSERT(!page->is_loaned()); |
| pmm_page_queues()->MarkAccessed(page); |
| vm_reclaim_pinned.Add(1); |
| return VmCowReclaimFailure::Other; |
| } |
| return ktl::nullopt; |
| } |
| |
| VmCowReclaimResult VmCowPages::ReclaimPageForEviction(vm_page_t* page, uint64_t offset, |
| EvictionAction eviction_action) { |
| canary_.Assert(); |
| // Without a page source to bring the page back in we cannot even think about eviction. |
| DEBUG_ASSERT(can_evict()); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
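| // Look up the slot at |offset| and verify that |page| is still the content installed there; |
| // otherwise the page cannot be reclaimed from this VMO. |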
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| if (auto reason = CannotReclaimPageLocked(page, page_or_marker)) { |
| return fit::error(reason.value()); |
| } |
| // Since the CannotReclaimPageLocked() check passed, we know that this page is owned by us at the provided |
| // offset. So it should be safe to call MarkAccessed() on the page if reclamation fails, provided |
| // we don't drop the lock. |
| |
| // Not allowed to reclaim if high priority, unless we are being required to do so. |
| if (high_priority_count_ != 0 && (eviction_action != EvictionAction::Require)) { |
| pmm_page_queues()->MarkAccessed(page); |
| vm_reclaim_high_priority.Add(1); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| |
| // We cannot evict the page unless it is clean. If the page is dirty, it will already have been |
| // moved to the dirty page queue. |
| if (!is_page_clean(page)) { |
| DEBUG_ASSERT(pmm_page_queues()->DebugPageIsPagerBackedDirty(page)); |
| DEBUG_ASSERT(!page->is_loaned()); |
| vm_reclaim_dirty.Add(1); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| |
| // Do not evict if the |always_need| hint is set, unless we are told to ignore the eviction hint. |
| if (page->object.always_need == 1 && eviction_action == EvictionAction::FollowHint) { |
| DEBUG_ASSERT(!page->is_loaned()); |
| // We still need to move the page from the tail of the LRU page queue(s) so that the eviction |
| // loop can make progress. Since this page is always needed, move it out of the way and into the |
| // MRU queue. Do this here while we hold the lock, instead of at the callsite. |
| // |
| // TODO(rashaeqbal): Since we're essentially simulating an access here, this page may not |
| // qualify for eviction if we do decide to override the hint soon after (i.e. if an OOM follows |
| // shortly after). Investigate adding a separate queue once we have some more data around hints |
| // usage. A possible approach might involve moving to a separate queue when we skip the page for |
| // eviction. Pages move out of said queue when accessed, and continue aging as other pages. |
| // Pages in the queue are considered for eviction pre-OOM, but ignored otherwise. |
| pmm_page_queues()->MarkAccessed(page); |
| vm_reclaim_always_need_skipped.Add(1); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| |
| // Remove any mappings to this page before we remove it. |
| uint8_t old_queue = page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| RangeChangeUpdateLocked(VmCowRange(offset, PAGE_SIZE), RangeChangeOp::UnmapAndHarvest, &deferred); |
| const uint8_t new_queue = page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| // If the queue has changed, the accessed bit will have been set by the unmap. |
| // The page has been accessed, so don't evict. |
| // TODO(https://fxbug.dev/412464435): don't unmap & return accessed status to avoid checking page |
| // queues. |
| if ((old_queue != new_queue) && (eviction_action != EvictionAction::Require)) { |
| vm_reclaim_evict_accessed.Add(1); |
| return fit::error(VmCowReclaimFailure::EvictAccessed); |
| } |
| |
| char vmo_name[ZX_MAX_NAME_LEN] __UNINITIALIZED = "\0"; |
| // Lambda so that vmo_name is only filled out if tracing is enabled. |
| auto get_vmo_name = [&]() __ALWAYS_INLINE { |
| AssertHeld(lock_); |
| if (paged_ref_) { |
| paged_ref_->get_name(vmo_name, sizeof(vmo_name)); |
| } |
| return vmo_name; |
| }; |
| VM_KTRACE_INSTANT(1, "evict_page", ("vmo_id", paged_ref_ ? paged_ref_->user_id() : 0), |
| ("offset", offset), ("vmo_name", get_vmo_name())); |
| |
| // Use RemoveContent over just writing to page_or_marker so that the page list has the opportunity |
| // to release any now empty intermediate nodes. |
| vm_page_t* p = page_list_.RemoveContent(offset).ReleasePage(); |
| DEBUG_ASSERT(p == page); |
| const bool loaned = page->is_loaned(); |
| RemovePageLocked(page, deferred); |
| |
| reclamation_event_count_++; |
| VMO_VALIDATION_ASSERT(DebugValidateHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return fit::ok(VmCowReclaimSuccess{.type = loaned ? VmCowReclaimSuccess::Type::EvictLoaned |
| : VmCowReclaimSuccess::Type::EvictNonLoaned, |
| .num_pages = 1}); |
| } |
| |
| VmCowReclaimResult VmCowPages::ReclaimPageForCompression(vm_page_t* page, uint64_t offset, |
| VmCompressor* compressor) { |
| DEBUG_ASSERT(compressor); |
| DEBUG_ASSERT(!page_source_); |
| DEBUG_ASSERT(!discardable_tracker_); |
| DEBUG_ASSERT(can_decommit_zero_pages()); |
| |
| // Track whether we should tell the caller we reclaimed a page or not. |
| bool reclaimed = false; |
| { |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
| // Use a sub-scope as the page_or_marker will become invalid as we will drop the lock later. |
| { |
| VmPageOrMarkerRef page_or_marker = page_list_.LookupMutable(offset); |
| if (auto reason = CannotReclaimPageLocked(page, page_or_marker)) { |
| return fit::error(reason.value()); |
| } |
| // Since the CannotReclaimPageLocked() check passed, we know that this page is owned by us at the |
| // provided offset. So it should be safe to call MarkAccessed() on the page if reclamation |
| // fails, provided we don't drop the lock. |
| |
| // Not allowed to reclaim if uncached. |
| if ((paged_ref_ && (paged_backlink_locked(this)->GetMappingCachePolicyLocked() & |
| ZX_CACHE_POLICY_MASK) != ZX_CACHE_POLICY_CACHED)) { |
| // To avoid this page remaining in the reclamation list we simulate an access. |
| pmm_page_queues()->MarkAccessed(page); |
| vm_reclaim_uncached.Add(1); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| |
| // Not allowed to reclaim if high priority. |
| if (high_priority_count_ != 0) { |
| pmm_page_queues()->MarkAccessed(page); |
| vm_reclaim_high_priority.Add(1); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| DEBUG_ASSERT(!page->is_loaned()); |
| // Perform the unmap of the page on our mappings while we hold the lock. This removes all |
| // possible writable mappings, although our children could still have read-only mappings. |
| // These read-only mappings will be dealt with later, for now the page will at least be |
| // immutable. |
| uint8_t old_queue = page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| RangeChangeUpdateLocked(VmCowRange(offset, PAGE_SIZE), RangeChangeOp::UnmapAndHarvest, |
| &deferred); |
| const uint8_t new_queue = page->object.get_page_queue_ref().load(ktl::memory_order_relaxed); |
| // If the queue has changed, the accessed bit will have been set by the unmap. |
| // The page has been accessed, so don't compress. |
| // TODO(https://fxbug.dev/412464435): don't unmap & return accessed status to avoid checking |
| // page queues. |
| if (old_queue != new_queue) { |
| vm_reclaim_compress_accessed.Add(1); |
| return fit::error(VmCowReclaimFailure::CompressAccessed); |
| } |
| |
| // Start compression of the page by swapping the page list to contain the temporary reference. |
| // Ensure the compression system is aware of the page's current share_count so it can track |
| // any changes we make to that value while compression is running. |
| VmPageOrMarker::ReferenceValue temp_ref = compressor->Start( |
| VmCompressor::PageAndMetadata{.page = page, .metadata = page->object.share_count}); |
| [[maybe_unused]] vm_page_t* compress_page = page_or_marker.SwapPageForReference(temp_ref); |
| DEBUG_ASSERT(compress_page == page); |
| } |
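| // The page list now holds the temporary reference instead of the page, so remove the page from |
| // the page queues while we stack-own it for compression. |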
| pmm_page_queues()->Remove(page); |
| |
| // We now have stack ownership of the page (and guarantee to the compressor that it will not be modified) and |
| // the VMO owns the temporary reference. We can safely drop the VMO lock and perform the |
| // remaining range updates and the compression step. |
| } |
| compressor->Compress(); |
| bool compression_failed = false; |
| |
| { |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
| // Retrieve the result of compression now that we hold the VMO lock again. |
| VmCompressor::CompressResult compression_result = compressor->TakeCompressionResult(); |
| |
| // We hold the VMO lock again and need to reclaim the temporary reference. Either the |
| // temporary reference is still installed, and since we hold the VMO lock we now own both the |
| // temp reference and the page, or the temporary reference got replaced, in which case it no |
| // longer exists and is not referring to the page, and so we own the page. |
| // |
| // Determining what state we are in just requires re-looking up the slot and seeing if the |
| // temporary reference we installed is still there. |
| auto [slot, is_in_interval] = |
| page_list_.LookupOrAllocate(offset, VmPageList::IntervalHandling::NoIntervals); |
| DEBUG_ASSERT(!is_in_interval); |
| if (slot && slot->IsReference() && compressor->IsTempReference(slot->Reference())) { |
| // Slot still holds the original reference; need to replace it with the result of compression. |
| VmPageOrMarker::ReferenceValue old_ref{0}; |
| if (const VmPageOrMarker::ReferenceValue* ref = |
| ktl::get_if<VmPageOrMarker::ReferenceValue>(&compression_result)) { |
| // Compression succeeded, put the new reference in. |
| // When compression succeeded, the |compressor| internally copied the page's metadata from |
| // the temp reference to the new reference so we don't need to manually copy it here. |
| old_ref = VmPageOrMarkerRef(slot).SwapReferenceForReference(*ref); |
| reclamation_event_count_++; |
| reclaimed = true; |
| vm_reclaim_compress_success.Add(1); |
| } else if (VmCompressor::FailTag* fail = |
| ktl::get_if<VmCompressor::FailTag>(&compression_result)) { |
| // Compression failed, put the page back in the slot. |
| // The |compressor| doesn't know how to update the |page| with any changes we made to its |
| // metadata while compression was running, so we need to manually copy the metadata over to |
| // the page's share_count here. |
| DEBUG_ASSERT(page == fail->src_page.page); |
| page->object.share_count = fail->src_page.metadata; |
| old_ref = VmPageOrMarkerRef(slot).SwapReferenceForPage(page); |
| // TODO(https://fxbug.dev/42138396): Placing in a queue and then moving it is inefficient, |
| // but avoids needing to reason about whether reclamation could be manually attempted on |
| // pages that might otherwise not end up in the reclaimable queues. |
| SetNotPinnedLocked(page, offset); |
| // TODO(https://fxbug.dev/42138396): Marking this page as failing reclamation will prevent |
| // it from ever being tried again. As compression might succeed if the contents changes, we |
| // should consider moving the page out of this queue if it is modified. |
| pmm_page_queues()->CompressFailed(page); |
| // Page stays owned by the VMO. |
| vm_reclaim_compress_fail.Add(1); |
| page = nullptr; |
| compression_failed = true; |
| } else { |
| ASSERT(ktl::holds_alternative<VmCompressor::ZeroTag>(compression_result)); |
| old_ref = slot->ReleaseReference(); |
| // Check if we can clear the slot, or if we need to insert a marker. Unlike zeroing full |
| // pages, this simply needs to check if there's any visible content above us, and then, if |
| // there isn't, whether the root is immutable or not (i.e. whether it has a page source). |
| auto parent_has_content = [this](uint64_t offset) TA_REQ(lock()) { |
| PageLookup content; |
| FindInitialPageContentLocked(offset, &content); |
| return !!content.cursor.current(); |
| }; |
| if (node_has_parent_content_markers() || |
| (!root_has_page_source() && !parent_has_content(offset))) { |
| *slot = VmPageOrMarker::Empty(); |
| page_list_.ReturnEmptySlot(offset); |
| vm_vmo_compression_zero_slot.Add(1); |
| } else { |
| *slot = VmPageOrMarker::Marker(); |
| vm_vmo_compression_marker.Add(1); |
| } |
| reclamation_event_count_++; |
| reclaimed = true; |
| vm_reclaim_compress_zero.Add(1); |
| } |
| // Temporary reference has been replaced, can return it to the compressor. |
| compressor->ReturnTempReference(old_ref); |
| } else { |
| vm_reclaim_compress_race.Add(1); |
| // The temporary reference is no longer there. We know nothing else about the state of the VMO |
| // at this point and will just free any compression result and exit. |
| if (const VmPageOrMarker::ReferenceValue* ref = |
| ktl::get_if<VmPageOrMarker::ReferenceValue>(&compression_result)) { |
| compressor->Free(*ref); |
| } |
| // If the slot is allocated, but empty, then make sure we properly return it. |
| if (slot && slot->IsEmpty()) { |
| page_list_.ReturnEmptySlot(offset); |
| } |
| // In this case we are still going to free the page, but it doesn't count as a reclamation as |
| // there is now something new in the slot we were trying to free. |
| } |
| } |
| // One way or another the temporary reference has been returned, and so we can finalize. |
| compressor->Finalize(); |
| |
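| // Unless compression failed (in which case the page was reinstalled in the slot and |page| was |
| // cleared above), the page is no longer referenced by the VMO and can be freed. |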
| if (page) { |
| FreePage(page); |
| page = nullptr; |
| } |
| |
| if (compression_failed) { |
| return fit::error(VmCowReclaimFailure::CompressFailed); |
| } |
| return fit::ok(VmCowReclaimSuccess{.type = VmCowReclaimSuccess::Type::Compress, |
| .num_pages = reclaimed ? 1u : 0u}); |
| } |
| |
| VmCowReclaimResult VmCowPages::ReclaimPage(vm_page_t* page, uint64_t offset, |
| EvictionAction hint_action, VmCompressor* compressor) { |
| canary_.Assert(); |
| |
| // See if we can reclaim by eviction. |
| if (can_evict()) { |
| return ReclaimPageForEviction(page, offset, hint_action); |
| } |
| if (compressor && !page_source_ && !discardable_tracker_) { |
| return ReclaimPageForCompression(page, offset, compressor); |
| } |
| if (discardable_tracker_) { |
| // On any errors touch the page so we stop trying to reclaim it. In particular for discardable |
| // reclamation attempts, if the page we are passing is not the first page in the discardable |
| // VMO then the discard will fail, so touching it will stop us from continuously trying to |
| // trigger a discard with it. |
| auto result = ReclaimDiscardable(page, offset); |
| if (result.is_ok()) { |
| return fit::ok( |
| VmCowReclaimSuccess{.type = VmCowReclaimSuccess::Type::Discard, .num_pages = *result}); |
| } |
| vm_reclaim_discardable_failed.Add(1); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| |
| // Keep a count, as having no reclamation strategy is probably a sign of misconfiguration. |
| vm_reclaim_no_reclamation_strategy.Add(1); |
| |
| // Either there are no other strategies, or reclamation failed, so to avoid this page remaining |
| // in a reclamation list we simulate an access. We do not want to place it in the ReclaimFailed |
| // queue since our failure was not based on page contents. |
| // Before touching it, double check that this page is part of this VMO, as otherwise we cannot |
| // safely know its state to call MarkAccessed. |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| if (!page_or_marker || !page_or_marker->IsPage() || page_or_marker->Page() != page) { |
| return fit::error(VmCowReclaimFailure::IncorrectPage); |
| } |
| pmm_page_queues()->MarkAccessed(page); |
| return fit::error(VmCowReclaimFailure::Other); |
| } |
| |
| zx_status_t VmCowPages::ReplacePagesWithNonLoanedLocked(VmCowRange range, DeferredOps& deferred, |
| AnonymousPageRequest* page_request, |
| uint64_t* non_loaned_len) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(range.is_page_aligned()); |
| DEBUG_ASSERT(range.IsBoundedBy(size_)); |
| DEBUG_ASSERT(non_loaned_len); |
| |
| *non_loaned_len = 0; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [page_request, non_loaned_len, &found_page_or_gap, &deferred, this](const VmPageOrMarker* p, |
| uint64_t off) { |
| found_page_or_gap = true; |
| // We only expect committed pages in the specified range. |
| if (!p->IsPage()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| vm_page_t* page = p->Page(); |
| // If the page is loaned, replace it with a non-loaned page. |
| if (page->is_loaned()) { |
| AssertHeld(lock_ref()); |
| // A loaned page could only have been clean. |
| DEBUG_ASSERT(!is_page_dirty_tracked(page) || is_page_clean(page)); |
| DEBUG_ASSERT(page_request); |
| zx_status_t status = |
| ReplacePageLocked(page, off, /*with_loaned=*/false, &page, deferred, page_request); |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| if (status != ZX_OK) { |
| return ZX_ERR_BAD_STATE; |
| } |
| } |
| DEBUG_ASSERT(!page->is_loaned()); |
| *non_loaned_len += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [&found_page_or_gap](uint64_t start, uint64_t end) { |
| found_page_or_gap = true; |
| // We only expect committed pages in the specified range. |
| return ZX_ERR_BAD_STATE; |
| }, |
| range.offset, range.end()); |
| |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // If we did not find a page or a gap, the entire range fell inside an interval. We only expect |
| // committed pages in the range. |
| if (!found_page_or_gap) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::ReplacePageWithLoaned(vm_page_t* before_page, uint64_t offset) { |
| canary_.Assert(); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{lock()}; |
| return ReplacePageLocked(before_page, offset, true, nullptr, deferred, nullptr); |
| } |
| |
| zx_status_t VmCowPages::ReplacePage(vm_page_t* before_page, uint64_t offset, bool with_loaned, |
| vm_page_t** after_page, AnonymousPageRequest* page_request) { |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{lock()}; |
| return ReplacePageLocked(before_page, offset, with_loaned, after_page, deferred, page_request); |
| } |
| |
| zx_status_t VmCowPages::ReplacePageLocked(vm_page_t* before_page, uint64_t offset, bool with_loaned, |
| vm_page_t** after_page, DeferredOps& deferred, |
| AnonymousPageRequest* page_request) { |
| // If not replacing with loaned it is required that a page_request be provided. |
| DEBUG_ASSERT(with_loaned || page_request); |
| |
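| // Find the slot for the page being replaced; it must currently hold exactly |before_page|, and |
| // that page must be neither pinned nor marked always_need. |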
| VmPageOrMarkerRef p = page_list_.LookupMutable(offset); |
| if (!p) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| if (!p->IsPage()) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| vm_page_t* old_page = p->Page(); |
| if (old_page != before_page) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| DEBUG_ASSERT(old_page != vm_get_zero_page()); |
| if (old_page->object.pin_count != 0) { |
| DEBUG_ASSERT(!old_page->is_loaned()); |
| return ZX_ERR_BAD_STATE; |
| } |
| if (old_page->object.always_need) { |
| DEBUG_ASSERT(!old_page->is_loaned()); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // unmap before removing old page |
| RangeChangeUpdateLocked(VmCowRange(offset, PAGE_SIZE), RangeChangeOp::Unmap, &deferred); |
| |
| VmPageOrMarker released_page; |
| auto replace_page_in_list = [&](vm_page_t* new_page) { |
| AssertHeld(lock_ref()); |
| DEBUG_ASSERT(new_page->state() == vm_page_state::OBJECT); |
| |
| CopyPageMetadataForReplacementLocked(new_page, old_page); |
| |
| // Add replacement page in place of old page. |
| __UNINITIALIZED auto result = |
| BeginAddPageWithSlotLocked(offset, p, CanOverwriteContent::NonZero); |
| // Absent bugs, BeginAddPageWithSlotLocked() can only return ZX_ERR_NO_MEMORY, but that failure |
| // can only occur if page_list_ had to allocate. Here, page_list_ hasn't yet had a chance to |
| // clean up any internal structures, so BeginAddPageWithSlotLocked() didn't need to allocate, so |
| // we know that BeginAddPageWithSlotLocked() will succeed. |
| DEBUG_ASSERT(result.is_ok()); |
| released_page = CompleteAddPageLocked(*result, VmPageOrMarker::Page(new_page), |
| ParentContent::Unknown, nullptr); |
| }; |
| |
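| // Allocate the replacement page, either loaned or regular, swapping it into the page list in |
| // place of the old page via replace_page_in_list above. |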
| vm_page_t* new_page = nullptr; |
| zx_status_t status = ZX_OK; |
| if (with_loaned) { |
| if (!should_borrow_locked()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if (is_page_dirty_tracked(old_page) && !is_page_clean(old_page)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| auto result = |
| AllocLoanedPage([&replace_page_in_list](vm_page_t* page) { replace_page_in_list(page); }); |
| status = result.status_value(); |
| if (result.is_ok()) { |
| new_page = *result; |
| } |
| } else { |
| status = AllocPage(&new_page, page_request); |
| if (status == ZX_OK) { |
| replace_page_in_list(new_page); |
| } |
| } |
| |
| if (status != ZX_OK) { |
| return status; |
| } |
| CopyPageContentsForReplacementLocked(new_page, old_page); |
| |
| // Need to take the page out of |released_page| to avoid a [[nodiscard]] error. Since we just |
| // checked that this matches the target page, which is now owned by the caller, this is not |
| // leaking. |
| [[maybe_unused]] vm_page_t* released = released_page.ReleasePage(); |
| // The page released was the old page. |
| DEBUG_ASSERT(released == old_page); |
| |
| RemovePageLocked(old_page, deferred); |
| if (after_page) { |
| *after_page = new_page; |
| } |
| |
| return ZX_OK; |
| } |
| |
| bool VmCowPages::DebugValidateHierarchyLocked() TA_REQ(lock()) { |
| canary_.Assert(); |
| |
| VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| VmCowPages* parent_most = cur; |
| do { |
| if (!cur->DebugValidatePageSharingLocked()) { |
| return false; |
| } |
| cur = cur->parent_.get(); |
| if (cur) { |
| parent_most = cur; |
| } |
| } while (cur); |
| // Iterate the whole hierarchy; the iteration order doesn't matter. Since there are cases with |
| // >2 children, in-order isn't well defined, so we choose pre-order, but post-order would also |
| // be fine. |
| zx_status_t status = parent_most->DebugForEachDescendant([this](VmCowPages* cur, uint depth) { |
| AssertHeld(cur->lock_ref()); |
| if (!cur->DebugValidateBacklinksLocked()) { |
| dprintf(INFO, "cur: %p this: %p\n", cur, this); |
| return ZX_ERR_BAD_STATE; |
| } |
| return ZX_OK; |
| }); |
| return status == ZX_OK; |
| } |
| |
| bool VmCowPages::DebugValidatePageSharingLocked() const { |
| canary_.Assert(); |
| |
| // Visible nodes should never contain shared pages. |
| if (!is_hidden()) { |
| zx_status_t status = |
| page_list_.ForEveryPage([this](const VmPageOrMarker* page, uint64_t offset) { |
| if (!page->IsPageOrRef()) { |
| return ZX_ERR_NEXT; |
| } |
| AssertHeld(lock_ref()); |
| |
| const uint32_t share_count = GetShareCount(page); |
| if (share_count != 0) { |
| if (page->IsPage()) { |
| printf("Found shared page in visible node %p (page %p) (off %#" PRIx64 |
| ") (share %" PRIu32 "), but expected it to be private\n", |
| this, page->Page(), offset, share_count); |
| } else { |
| printf("Found shared reference in visible node %p (off %#" PRIx64 ") (share %" PRIu32 |
| "), but expected it to be private\n", |
| this, offset, share_count); |
| } |
| DumpLocked(1, true); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| return ZX_ERR_NEXT; |
| }); |
| |
| // Nothing else to check for visible nodes |
| return status == ZX_OK; |
| } |
| |
| // Hidden nodes should share their pages with the correct number of visible nodes. |
| DEBUG_ASSERT(is_hidden()); |
| DEBUG_ASSERT(!children_list_.is_empty()); // Hidden nodes must always have children |
| zx_status_t status = page_list_.ForEveryPage([this](const VmPageOrMarker* page, uint64_t offset) { |
| if (!page->IsPageOrRef()) { |
| return ZX_ERR_NEXT; |
| } |
| AssertHeld(lock_ref()); |
| |
| const uint32_t share_count = GetShareCount(page); |
| const VmCowPages* cur = &children_list_.front(); |
| uint64_t offset_in_parent = offset; |
| uint32_t found_count = 0; |
| // For hidden nodes, check that the share counts on their pages and references are correct. |
| // For a page with a share count of N, there should be N + 1 visible nodes that can access the |
| // page. |
| // |
| // Walk the subtree rooted at this node. At each visible node we encounter, search back up to |
| // see if it can access `page`. |
| // |
| // We start with cur being an immediate child of 'this', so we can perform subtree traversal |
| // until we end up back in 'this'. |
| while (cur != this) { |
| AssertHeld(cur->lock_ref()); |
| DEBUG_ASSERT(cur->is_parent_hidden_locked()); |
| |
| // Check that we can see this page in the parent. Importantly this first checks if |
| // |offset_in_parent < cur->parent_offset_| allowing us to safely perform that subtraction |
| // from then on. |
| if (offset_in_parent < cur->parent_offset_ || |
| offset_in_parent - cur->parent_offset_ >= cur->parent_limit_) { |
| // This blank case is used to capture the scenario where |cur| does not see the target |
| // offset in the parent, in which case there is no point traversing into the children. |
| } else if (cur->is_hidden()) { |
| // The children of a hidden node can only access the page if the hidden node isn't |
| // covering it with anything, so only walk down if this offset is empty in the hidden node. |
| const VmPageOrMarker* l = cur->page_list_.Lookup(offset_in_parent - cur->parent_offset_); |
| if (!l || l->IsEmpty()) { |
| // Page not found, we need to recurse down into our children. |
| DEBUG_ASSERT(!cur->children_list_.is_empty()); |
| offset_in_parent -= cur->parent_offset_; |
| cur = &cur->children_list_.front(); |
| continue; |
| } |
| } else { |
| // `cur` is a visible node, so search up and see if it has partial ownership over the page. |
| cur->ForEveryOwnedHierarchyPageInRangeLocked( |
| [&](const VmPageOrMarker* p, const VmCowPages* owner, uint64_t this_offset, |
| uint64_t owner_offset) { |
| if (p == page) { |
| DEBUG_ASSERT(owner == this); |
| DEBUG_ASSERT(owner_offset == offset); |
| found_count++; |
| return ZX_ERR_STOP; |
| } |
| |
| return ZX_ERR_NEXT; |
| }, |
| offset_in_parent - cur->parent_offset_, PAGE_SIZE, LockedPtr()); |
| } |
| |
| // Our next node should be the next available child in some `children_list_`. We will walk up |
| // until `cur` is not the last child in its parent's `children_list_`. |
| do { |
| const VmCowPages* parent = cur->parent_.get(); |
| AssertHeld(parent->lock_ref()); |
| |
| // Check for next child after `cur`. |
| auto children_iter = parent->children_list_.make_iterator(*cur); |
| children_iter++; |
| if (children_iter.IsValid()) { |
| cur = children_iter.CopyPointer(); |
| // Parent shouldn't have changed, so `offset_in_parent` doesn't need to. |
| AssertHeld(cur->lock_ref()); |
| DEBUG_ASSERT(cur->parent_.get() == parent); |
| break; |
| } |
| |
| // Otherwise keep walking up. |
| cur = parent; |
| offset_in_parent += parent->parent_offset_; |
| if (cur == this) { |
| break; |
| } |
| } while (1); |
| } |
| |
| // Ensure we found the page the correct number of times in the subtree. |
| if (found_count != share_count + 1) { |
| if (page->IsPage()) { |
| printf("Found shared page in hidden node %p (page %p) (off %#" PRIx64 ") (share %" PRIu32 |
| "), but accessible by wrong number of visible nodes %" PRIu32 "\n", |
| this, page->Page(), offset, share_count, found_count); |
| } else { |
| printf("Found shared reference in hidden node %p (off %#" PRIx64 ") (share %" PRIu32 |
| "), but accessible by wrong number of visible nodes %" PRIu32 "\n", |
| this, offset, share_count, found_count); |
| } |
| DumpLocked(1, true); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| return ZX_ERR_NEXT; |
| }); |
| |
| return status == ZX_OK; |
| } |
| |
| bool VmCowPages::DebugValidateBacklinksLocked() const { |
| canary_.Assert(); |
| bool result = true; |
| page_list_.ForEveryPage([this, &result](const auto* p, uint64_t offset) { |
| // Markers, references, and intervals don't have backlinks. |
| if (p->IsReference() || p->IsMarker() || p->IsInterval()) { |
| return ZX_ERR_NEXT; |
| } |
| vm_page_t* page = p->Page(); |
| vm_page_state state = page->state(); |
| if (state != vm_page_state::OBJECT) { |
| dprintf(INFO, "unexpected page state: %u\n", static_cast<uint32_t>(state)); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| const VmCowPages* object = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| if (!object) { |
| dprintf(INFO, "missing object\n"); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (object != this) { |
| dprintf(INFO, "incorrect object - object: %p this: %p\n", object, this); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| uint64_t page_offset = page->object.get_page_offset(); |
| if (page_offset != offset) { |
| dprintf(INFO, "incorrect offset - page_offset: %" PRIx64 " offset: %" PRIx64 "\n", |
| page_offset, offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| return result; |
| } |
| |
| bool VmCowPages::DebugValidateVmoPageBorrowingLocked() const { |
| canary_.Assert(); |
| // Skip checking larger VMOs to avoid slowing things down too much, since the things being |
| // verified will typically assert from incorrect behavior on smaller VMOs (and we can always |
| // remove this filter if we suspect otherwise). |
| if (size_ >= 2 * 1024 * 1024) { |
| return true; |
| } |
| bool result = true; |
| page_list_.ForEveryPage([this, &result](const auto* p, uint64_t offset) { |
| AssertHeld(lock_ref()); |
| if (!p->IsPage()) { |
| // If we don't have a page, this is either a marker or reference, both of which are not |
| // allowed with contiguous VMOs. |
| DEBUG_ASSERT(!direct_source_supplies_zero_pages()); |
| return ZX_ERR_NEXT; |
| } |
| vm_page_t* page = p->Page(); |
| if (page->is_loaned()) { |
| if (!can_borrow()) { |
| dprintf(INFO, "!can_borrow() but page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (page->object.pin_count) { |
| dprintf(INFO, "pinned page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (page->object.always_need) { |
| dprintf(INFO, "always_need page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (is_page_dirty_tracked(page) && !is_page_clean(page)) { |
| dprintf(INFO, "!clean page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| if (!result) { |
| dprintf(INFO, "DebugValidateVmoPageBorrowingLocked() failing\n"); |
| } |
| return result; |
| } |
| |
| bool VmCowPages::DebugValidateZeroIntervalsLocked() const { |
| canary_.Assert(); |
| bool in_interval = false; |
| auto dirty_state = VmPageOrMarker::IntervalDirtyState::Untracked; |
| zx_status_t status = page_list_.ForEveryPage( |
| [&in_interval, &dirty_state, pager_backed = is_source_preserving_page_content()]( |
| const VmPageOrMarker* p, uint64_t off) { |
| if (!pager_backed) { |
| if (p->IsInterval()) { |
| dprintf(INFO, "found interval at offset 0x%" PRIx64 " in non pager backed vmo\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| if (p->IsInterval()) { |
| DEBUG_ASSERT(p->IsIntervalZero()); |
| DEBUG_ASSERT(p->IsZeroIntervalDirty() || p->IsZeroIntervalUntracked()); |
| if (p->IsIntervalStart()) { |
| if (in_interval) { |
| dprintf(INFO, "interval start at 0x%" PRIx64 " while already in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| in_interval = true; |
| dirty_state = p->GetZeroIntervalDirtyState(); |
| } else if (p->IsIntervalEnd()) { |
| if (!in_interval) { |
| dprintf(INFO, "interval end at 0x%" PRIx64 " while not in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| if (p->GetZeroIntervalDirtyState() != dirty_state) { |
| dprintf(INFO, "dirty state mismatch - start %" PRIu64 ", end %" PRIu64 "\n", |
| static_cast<uint64_t>(dirty_state), static_cast<uint64_t>(p->GetZeroIntervalDirtyState())); |
| return ZX_ERR_BAD_STATE; |
| } |
| in_interval = false; |
| dirty_state = VmPageOrMarker::IntervalDirtyState::Untracked; |
| } else { |
| if (in_interval) { |
| dprintf(INFO, "interval slot at 0x%" PRIx64 " while already in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| if (p->IsReference()) { |
| dprintf(INFO, "found compressed ref at offset 0x%" PRIx64 " in pager backed vmo\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| if (p->IsPage() && in_interval) { |
| dprintf(INFO, "found page at 0x%" PRIx64 " in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| if (p->IsMarker() && in_interval) { |
| dprintf(INFO, "found marker at 0x%" PRIx64 " in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| return status == ZX_OK; |
| } |
| |
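| // Lock and unlock operations on discardable VMOs are only supported over the entire VMO range. |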
| bool VmCowPages::IsLockRangeValidLocked(VmCowRange range) const { |
| return range.offset == 0 && range.len == size_locked(); |
| } |
| |
| zx_status_t VmCowPages::LockRangeLocked(VmCowRange range, zx_vmo_lock_state_t* lock_state_out) { |
| canary_.Assert(); |
| ASSERT(discardable_tracker_); |
| |
| if (!IsLockRangeValidLocked(range)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!lock_state_out) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| lock_state_out->offset = range.offset; |
| lock_state_out->size = range.len; |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| |
| bool was_discarded = false; |
| auto ret = discardable_tracker_->LockDiscardableLocked(/*try_lock=*/false, &was_discarded); |
| zx_status_t status = ret.first; |
| // Locking must succeed if try_lock was false. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // If the VMO just became unreclaimable as a result of this lock, refresh the page queue state of |
| // all of its pages, which will move them out of any reclaimable queue. |
| if (ret.second) { |
| page_list_.ForEveryPage([this](const VmPageOrMarker* page_or_marker, uint64_t offset) { |
| if (page_or_marker->IsPage()) { |
| vm_page_t* page = page_or_marker->Page(); |
| if (page->object.pin_count == 0) { |
| AssertHeld(lock_ref()); |
| MoveToNotPinnedLocked(page, offset); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| } |
| |
| lock_state_out->discarded_offset = 0; |
| lock_state_out->discarded_size = was_discarded ? size_locked() : 0; |
| |
| return status; |
| } |
| |
| zx_status_t VmCowPages::TryLockRangeLocked(VmCowRange range) { |
| canary_.Assert(); |
| ASSERT(discardable_tracker_); |
| |
| if (!IsLockRangeValidLocked(range)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| bool unused; |
| auto ret = discardable_tracker_->LockDiscardableLocked(/*try_lock=*/true, &unused); |
| zx_status_t status = ret.first; |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // If the VMO just became unreclaimable as a result of this lock, refresh the page queue state of |
| // all of its pages, which will move them out of any reclaimable queue. |
| if (ret.second) { |
| page_list_.ForEveryPage([this](const VmPageOrMarker* page_or_marker, uint64_t offset) { |
| if (page_or_marker->IsPage()) { |
| vm_page_t* page = page_or_marker->Page(); |
| if (page->object.pin_count == 0) { |
| AssertHeld(lock_ref()); |
| MoveToNotPinnedLocked(page, offset); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| } |
| |
| return status; |
| } |
| |
| zx_status_t VmCowPages::UnlockRangeLocked(VmCowRange range) { |
| canary_.Assert(); |
| ASSERT(discardable_tracker_); |
| |
| if (!IsLockRangeValidLocked(range)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| auto ret = discardable_tracker_->UnlockDiscardableLocked(); |
| zx_status_t status = ret.first; |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // If the VMO just became reclaimable as a result of this unlock, refresh the page queue state of |
| // all of its pages, which will move them into the reclaimable queue. |
| if (ret.second) { |
| DEBUG_ASSERT(discardable_tracker_->IsEligibleForReclamationLocked()); |
| page_list_.ForEveryPage([this](const VmPageOrMarker* page_or_marker, uint64_t offset) { |
| if (page_or_marker->IsPage()) { |
| vm_page_t* page = page_or_marker->Page(); |
| if (page->object.pin_count == 0) { |
| AssertHeld(lock_ref()); |
| MoveToNotPinnedLocked(page, offset); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| } |
| |
| return status; |
| } |
| |
| uint64_t VmCowPages::DebugGetPageCountLocked() const { |
| canary_.Assert(); |
| uint64_t page_count = 0; |
| zx_status_t status = page_list_.ForEveryPage([&page_count](auto* p, uint64_t offset) { |
| if (!p->IsPageOrRef()) { |
| return ZX_ERR_NEXT; |
| } |
| ++page_count; |
| return ZX_ERR_NEXT; |
| }); |
| // We never stop early in lambda above. |
| DEBUG_ASSERT(status == ZX_OK); |
| return page_count; |
| } |
| |
| bool VmCowPages::DebugIsPage(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| return p && p->IsPage(); |
| } |
| |
| bool VmCowPages::DebugIsMarker(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| return p && p->IsMarker(); |
| } |
| |
| bool VmCowPages::DebugIsEmpty(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| return !p || p->IsEmpty(); |
| } |
| |
| vm_page_t* VmCowPages::DebugGetPage(uint64_t offset) const { |
| canary_.Assert(); |
| Guard<CriticalMutex> guard{lock()}; |
| return DebugGetPageLocked(offset); |
| } |
| |
| vm_page_t* VmCowPages::DebugGetPageLocked(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ROUNDED(offset)); |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| if (p && p->IsPage()) { |
| return p->Page(); |
| } |
| return nullptr; |
| } |
| |
| bool VmCowPages::DebugIsHighMemoryPriority() const { |
| canary_.Assert(); |
| Guard<CriticalMutex> guard{lock()}; |
| return is_high_memory_priority_locked(); |
| } |
| |
| VmCowPages::DiscardablePageCounts VmCowPages::DebugGetDiscardablePageCounts() const { |
| canary_.Assert(); |
| DiscardablePageCounts counts = {}; |
| |
| // Not a discardable VMO. |
| if (!discardable_tracker_) { |
| return counts; |
| } |
| |
| Guard<CriticalMutex> guard{lock()}; |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| const DiscardableVmoTracker::DiscardableState state = |
| discardable_tracker_->discardable_state_locked(); |
| // This is a discardable VMO but hasn't opted into locking / unlocking yet. |
| if (state == DiscardableVmoTracker::DiscardableState::kUnset) { |
| return counts; |
| } |
| |
| uint64_t pages = 0; |
| page_list_.ForEveryPage([&pages](const auto* p, uint64_t) { |
| // TODO(https://fxbug.dev/42138396) Figure out attribution between pages and references. |
| if (p->IsPageOrRef()) { |
| ++pages; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| |
| switch (state) { |
| case DiscardableVmoTracker::DiscardableState::kReclaimable: |
| counts.unlocked = pages; |
| break; |
| case DiscardableVmoTracker::DiscardableState::kUnreclaimable: |
| counts.locked = pages; |
| break; |
| case DiscardableVmoTracker::DiscardableState::kDiscarded: |
| DEBUG_ASSERT(pages == 0); |
| break; |
| default: |
| break; |
| } |
| |
| return counts; |
| } |
| |
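// Discards every page in the VMO, provided it is a discardable VMO that is currently eligible for
// reclamation, and then moves it to the discarded state.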
| zx::result<uint64_t> VmCowPages::DiscardPagesLocked(DeferredOps& deferred) { |
| // Not a discardable VMO. |
| if (!discardable_tracker_) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| if (!discardable_tracker_->IsEligibleForReclamationLocked()) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| // Remove all pages. |
| zx::result<uint64_t> result = UnmapAndFreePagesLocked(0, size_, deferred); |
| |
| if (result.is_ok()) { |
| reclamation_event_count_++; |
| |
| // Set state to discarded. |
| discardable_tracker_->SetDiscardedLocked(); |
| } |
| return result; |
| } |
| |
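// Attempts to reclaim this discardable VMO via |page| at |offset|. The discard is only performed
// when |page| is the first committed page in the VMO; otherwise the page is marked accessed and
// an error is returned.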
| zx::result<uint64_t> VmCowPages::ReclaimDiscardable(vm_page_t* page, uint64_t offset) { |
| DEBUG_ASSERT(discardable_tracker_); |
| |
| __UNINITIALIZED DeferredOps deferred(this); |
| Guard<CriticalMutex> guard{AssertOrderedLock, lock(), lock_order()}; |
| |
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| if (CannotReclaimPageLocked(page, page_or_marker)) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
  // Since the CannotReclaimPageLocked() check passed, we know that this page is owned by us at
  // the provided offset, so it should be safe to call MarkAccessed() on the page if reclamation
  // fails, provided we don't drop the lock.
| |
  // Check whether |page| at |offset| is the first committed page in this VMO; discarding is only
  // triggered via the first page.
| bool first = false; |
| page_list_.ForEveryPage([&first, &offset, &page](auto* p, uint64_t off) { |
| if (!p->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| first = (p->Page() == page) && off == offset; |
| return ZX_ERR_STOP; |
| }); |
| zx::result<uint64_t> result = |
| first ? DiscardPagesLocked(deferred) : zx::error(ZX_ERR_INVALID_ARGS); |
| if (result.is_error()) { |
    // Mark the page accessed so that it's no longer a reclamation candidate. The other error path
    // above already does this inside the CannotReclaimPageLocked() helper.
| pmm_page_queues()->MarkAccessed(page); |
| } |
| return result; |
| } |
| |
| void VmCowPages::CopyPageContentsForReplacementLocked(vm_page_t* dst_page, vm_page_t* src_page) { |
| DEBUG_ASSERT(!src_page->object.pin_count); |
| void* src = paddr_to_physmap(src_page->paddr()); |
| DEBUG_ASSERT(src); |
| void* dst = paddr_to_physmap(dst_page->paddr()); |
| DEBUG_ASSERT(dst); |
| memcpy(dst, src, PAGE_SIZE); |
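  // If the VMO is mapped with a non-cached policy, clean/invalidate the destination range so the
  // newly copied contents are visible to non-cached accesses.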
| if (paged_ref_) { |
| if (paged_backlink_locked(this)->GetMappingCachePolicyLocked() != ARCH_MMU_FLAG_CACHED) { |
| arch_clean_invalidate_cache_range((vaddr_t)dst, PAGE_SIZE); |
| } |
| } |
| } |
| |
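// Copies the object metadata (share count, always_need and dirty state) that must be preserved
// when |dst_page| replaces |src_page| in this VMO.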
| void VmCowPages::CopyPageMetadataForReplacementLocked(vm_page_t* dst_page, vm_page_t* src_page) { |
| dst_page->object.share_count = src_page->object.share_count; |
| dst_page->object.always_need = src_page->object.always_need; |
| DEBUG_ASSERT(!dst_page->object.always_need || (!dst_page->is_loaned() && !src_page->is_loaned())); |
| dst_page->object.dirty_state = src_page->object.dirty_state; |
| } |
| |
| VmCowPages::DeferredOps::DeferredOps(VmCowPages* self) : self_(self) { |
| // If we are referencing a pager backed object then we must acquire the pager hierarchy lock, |
| // which requires walking up to the root to find the page_source_. |
| if (self_->root_has_page_source()) { |
| fbl::RefPtr<PageSource> source; |
| { |
| Guard<CriticalMutex> guard{AssertOrderedLock, self_->lock(), self_->lock_order()}; |
| if (self_->life_cycle_ != LifeCycle::Alive) { |
        // Although the C++ object is guaranteed to be valid by the caller, it's possible that the
        // VMO has transitioned into a dead state. This race typically occurs because reclamation
        // must first acquire a RefPtr, then acquire the lock, and only then check whether the
        // page is still present in the VMO. If the VMO has transitioned to dead then its pages
        // will have been cleared, and so the operation will get skipped.
        // Unfortunately at this point the main lock acquisition and check have not been performed.
        // This is a problem because the dead transition clears the parent_ reference, meaning we
        // would find a 'fake' root that consequently does not have a valid page_source_. So, to
        // avoid failing to find a root page_source_, we make sure to terminate here if this
        // object is dead.
        // As the object is dead and no longer connected to the tree, there is no rest of the
        // hierarchy to synchronize with, and so skipping acquisition of the pager lock is safe.
| return; |
| } |
| LockedPtr current; |
| while (current.locked_or(self_).parent_) { |
| current = LockedPtr(current.locked_or(self_).parent_.get()); |
| } |
| source = current.locked_or(self_).page_source_; |
| } |
| DEBUG_ASSERT(source); |
| page_source_lock_.emplace(source->paged_vmo_lock(), ktl::move(source)); |
| } |
| } |
| |
| VmCowPages::DeferredOps::~DeferredOps() { |
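  // Apply any range change that was recorded via AddRange() to the children of the cow pages
  // hierarchy.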
| if (range_op_.has_value()) { |
| LockedPtr self(self_); |
| VmCowPages::RangeChangeUpdateCowChildren(ktl::move(self), range_op_->range, range_op_->op); |
| } |
  // The pages must be freed *after* any range update is performed, but *before* dropping the
  // |page_source_lock_|. In the case where the page source is handling the free, this is still a
  // logical operation involving the cow pages and must remain serialized, as demonstrated by
  // FreePages itself taking a reference to the VmCowPages.
| freed_list_.FreePages(self_); |
| if (page_source_lock_.has_value()) { |
    // When dropping the page_source_lock we could be holding the last reference to the page
    // source, so the mutex must be released first, prior to potentially destroying the object by
    // releasing the RefPtr.
| page_source_lock_->first.Release(); |
| page_source_lock_->second.reset(); |
| page_source_lock_.reset(); |
| } |
| } |
| |
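// Records a range change to be applied when this DeferredOps is destroyed. Multiple calls are
// merged into a single operation whose range covers all of the requested ranges, with
// UnmapZeroPage being upgraded to Unmap when both kinds are requested.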
| void VmCowPages::DeferredOps::AddRange(VmCowPages* self, VmCowRange range, RangeChangeOp op) { |
| DEBUG_ASSERT(self == self_); |
| if (range_op_.has_value()) { |
| if (range_op_->op != op) { |
      // Permit an UnmapZeroPage to be upgraded to an Unmap. If already an Unmap, then ignore any
      // UnmapZeroPage.
| if (range_op_->op == RangeChangeOp::UnmapZeroPage && op == RangeChangeOp::Unmap) { |
| range_op_->op = op; |
| } else { |
| DEBUG_ASSERT(range_op_->op == RangeChangeOp::Unmap && op == RangeChangeOp::UnmapZeroPage); |
| } |
| } |
| range_op_->range = range_op_->range.Cover(range); |
| } else { |
| range_op_ = DeferredRangeOp{.op = op, .range = range}; |
| } |
| } |
| |
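// Creates the VmCowPages page cache with a small reserve of pages and, if requested by the boot
// options, enables its random should-wait behavior.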
| void VmCowPages::InitializePageCache(uint32_t level) { |
| ASSERT(level < LK_INIT_LEVEL_THREADING); |
| |
| const size_t reserve_pages = 64; |
| zx::result<page_cache::PageCache> result = page_cache::PageCache::Create(reserve_pages); |
| |
| ASSERT(result.is_ok()); |
| page_cache_ = ktl::move(result.value()); |
| |
| if (gBootOptions->pmm_alloc_random_should_wait) { |
| page_cache_.SeedRandomShouldWait(); |
| } |
| } |
| |
| // Initialize the cache after the percpu data structures are initialized. |
| LK_INIT_HOOK(vm_cow_pages_cache_init, VmCowPages::InitializePageCache, LK_INIT_LEVEL_KERNEL) |