| // Copyright 2020 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include "vm/vm_cow_pages.h" |
| |
| #include <lib/arch/intrin.h> |
| #include <lib/counters.h> |
| #include <lib/fit/defer.h> |
| #include <trace.h> |
| |
| #include <kernel/range_check.h> |
| #include <ktl/move.h> |
| #include <lk/init.h> |
| #include <vm/anonymous_page_requester.h> |
| #include <vm/compression.h> |
| #include <vm/discardable_vmo_tracker.h> |
| #include <vm/fault.h> |
| #include <vm/physmap.h> |
| #include <vm/pmm.h> |
| #include <vm/stack_owned_loaned_pages_interval.h> |
| #include <vm/vm_object.h> |
| #include <vm/vm_object_paged.h> |
| #include <vm/vm_page_list.h> |
| |
| #include "vm_priv.h" |
| |
| #include <ktl/enforce.h> |
| |
| #define LOCAL_TRACE VM_GLOBAL_TRACE(0) |
| |
| // Add expensive code to do a full validation of the VMO at various points. |
| #define VMO_VALIDATION (0 || (LK_DEBUGLEVEL > 2)) |
| |
| // Assertion that is only enabled if VMO_VALIDATION is enabled. |
| #define VMO_VALIDATION_ASSERT(x) \ |
| do { \ |
| if (VMO_VALIDATION) { \ |
| ASSERT(x); \ |
| } \ |
| } while (0) |
| |
| // Add not-as-expensive code to do some extra validation at various points. This is off in normal |
| // debug builds because it can add O(n) validation to an O(1) operation, so it can still make |
| // things slower, despite not being as slow as VMO_VALIDATION. |
| #define VMO_FRUGAL_VALIDATION (0 || (LK_DEBUGLEVEL > 2)) |
| |
| // Assertion that is only enabled if VMO_FRUGAL_VALIDATION is enabled. |
| #define VMO_FRUGAL_VALIDATION_ASSERT(x) \ |
| do { \ |
| if (VMO_FRUGAL_VALIDATION) { \ |
| ASSERT(x); \ |
| } \ |
| } while (0) |
| |
| namespace { |
| |
| KCOUNTER(vm_vmo_high_priority, "vm.vmo.high_priority") |
| KCOUNTER(vm_vmo_no_reclamation_strategy, "vm.vmo.no_reclamation_strategy") |
| KCOUNTER(vm_vmo_dont_need, "vm.vmo.dont_need") |
| KCOUNTER(vm_vmo_always_need, "vm.vmo.always_need") |
| KCOUNTER(vm_vmo_always_need_skipped_reclaim, "vm.vmo.always_need_skipped_reclaim") |
| KCOUNTER(vm_vmo_compression_zero_slot, "vm.vmo.compression.zero_empty_slot") |
| KCOUNTER(vm_vmo_compression_marker, "vm.vmo.compression.marker") |
| KCOUNTER(vm_vmo_discardable_failed_reclaim, "vm.vmo.discardable_failed_reclaim") |
| KCOUNTER(vm_vmo_range_update_from_parent_skipped, "vm.vmo.range_update_from_parent.skipped") |
| KCOUNTER(vm_vmo_range_update_from_parent_performed, "vm.vmo.range_update_from_parent.performed") |
| |
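| // Zeros the page at |pa| by writing through its physmap mapping. |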
| void ZeroPage(paddr_t pa) { |
| void* ptr = paddr_to_physmap(pa); |
| DEBUG_ASSERT(ptr); |
| |
| arch_zero_page(ptr); |
| } |
| |
| void ZeroPage(vm_page_t* p) { |
| paddr_t pa = p->paddr(); |
| ZeroPage(pa); |
| } |
| |
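| // Returns true if every byte of |p| is zero, read via the physmap. |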
| bool IsZeroPage(vm_page_t* p) { |
| uint64_t* base = (uint64_t*)paddr_to_physmap(p->paddr()); |
| for (int i = 0; i < PAGE_SIZE / (int)sizeof(uint64_t); i++) { |
| if (base[i] != 0) |
| return false; |
| } |
| return true; |
| } |
| |
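| // Transitions a freshly allocated page into the OBJECT state and resets the object metadata |
| // (pin count, COW split bits, dirty tracking) used by VmCowPages. |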
| void InitializeVmPage(vm_page_t* p) { |
| DEBUG_ASSERT(p->state() == vm_page_state::ALLOC); |
| p->set_state(vm_page_state::OBJECT); |
| p->object.pin_count = 0; |
| p->object.cow_left_split = 0; |
| p->object.cow_right_split = 0; |
| p->object.always_need = 0; |
| p->object.dirty_state = uint8_t(VmCowPages::DirtyState::Untracked); |
| } |
| |
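| // Adds two values, asserting in debug builds that the addition does not overflow. |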
| inline uint64_t CheckedAdd(uint64_t a, uint64_t b) { |
| uint64_t result; |
| bool overflow = add_overflow(a, b, &result); |
| DEBUG_ASSERT(!overflow); |
| return result; |
| } |
| |
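| // Returns a compressed page reference to the system compressor to be freed. |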
| void FreeReference(VmPageOrMarker::ReferenceValue content) { |
| VmCompression* compression = pmm_page_compression(); |
| DEBUG_ASSERT(compression); |
| compression->Free(content); |
| } |
| |
| } // namespace |
| |
| // Helper class for collecting pages in order to perform batched Removes from the page queue, |
| // avoiding its spinlock overhead for every single page. Pages that it removes from the page queue |
| // get placed into a provided list. Note that pages are not moved into the list until *after* Flush |
| // has been called, and Flush must be called prior to object destruction. |
| // |
| // This class has a large internal array and should be marked uninitialized. |
| class BatchPQRemove { |
| public: |
| BatchPQRemove(list_node_t* freed_list) : freed_list_(freed_list) {} |
| ~BatchPQRemove() { DEBUG_ASSERT(count_ == 0); } |
| DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(BatchPQRemove); |
| |
| // Add a page to the batch set. Automatically calls |Flush| if the limit is reached. |
| void Push(vm_page_t* page) { |
| DEBUG_ASSERT(page); |
| DEBUG_ASSERT(count_ < kMaxPages); |
| pages_[count_] = page; |
| count_++; |
| if (count_ == kMaxPages) { |
| Flush(); |
| } |
| } |
| |
| // Removes any content from the supplied |page_or_marker| and either calls |Push| or otherwise |
| // frees it. Always leaves the |page_or_marker| in the empty state. |
| // Automatically calls |Flush| if the limit on pages is reached. |
| void PushContent(VmPageOrMarker* page_or_marker) { |
| if (page_or_marker->IsPage()) { |
| Push(page_or_marker->ReleasePage()); |
| } else if (page_or_marker->IsReference()) { |
| // TODO(https://fxbug.dev/42138396): Consider whether it is worth batching these. |
| FreeReference(page_or_marker->ReleaseReference()); |
| } else { |
| *page_or_marker = VmPageOrMarker::Empty(); |
| } |
| } |
| |
| // Performs |Remove| on any pending pages. This allows you to know that all pages are in the |
| // original list so that you can do operations on the list. |
| void Flush() { |
| if (count_ > 0) { |
| pmm_page_queues()->RemoveArrayIntoList(pages_, count_, freed_list_); |
| freed_count_ += count_; |
| count_ = 0; |
| } |
| } |
| |
| // Returns the number of pages that were added to |freed_list_| by calls to Flush(). The |
| // |freed_count_| counter keeps a running count of freed pages as they are removed and added to |
| // |freed_list_|, avoiding having to walk |freed_list_| to compute its length. |
| size_t freed_count() const { return freed_count_; } |
| |
| // Produces a callback suitable for passing to VmPageList::RemovePages that will |PushContent| all |
| // items. |
| auto RemovePagesCallback() { |
| return [this](VmPageOrMarker* p, uint64_t off) { |
| PushContent(p); |
| return ZX_ERR_NEXT; |
| }; |
| } |
| |
| private: |
| // The value of 64 was chosen as only minimal performance gains were originally measured when |
| // using higher values. There is an incentive to keep this as small as possible since it is |
| // typically created on the stack, and our stack space is limited. |
| static constexpr size_t kMaxPages = 64; |
| |
| size_t count_ = 0; |
| size_t freed_count_ = 0; |
| vm_page_t* pages_[kMaxPages]; |
| list_node_t* freed_list_ = nullptr; |
| }; |
| |
| // Allocates a new page and populates it with the data at |parent_paddr|. |
| zx_status_t VmCowPages::AllocateCopyPage(paddr_t parent_paddr, list_node_t* alloc_list, |
| LazyPageRequest* request, vm_page_t** clone) { |
| DEBUG_ASSERT(request || !(pmm_alloc_flags_ & PMM_ALLOC_FLAG_CAN_WAIT)); |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| |
| vm_page_t* p_clone = nullptr; |
| if (alloc_list) { |
| p_clone = list_remove_head_type(alloc_list, vm_page, queue_node); |
| } |
| |
| if (p_clone) { |
| InitializeVmPage(p_clone); |
| } else { |
| zx_status_t status = AllocPage(&p_clone, request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| DEBUG_ASSERT(p_clone); |
| } |
| |
| void* dst = paddr_to_physmap(p_clone->paddr()); |
| DEBUG_ASSERT(dst); |
| |
| if (parent_paddr != vm_get_zero_page_paddr()) { |
| // do a direct copy of the two pages |
| const void* src = paddr_to_physmap(parent_paddr); |
| DEBUG_ASSERT(src); |
| memcpy(dst, src, PAGE_SIZE); |
| } else { |
| // avoid pointless fetches by directly zeroing dst |
| arch_zero_page(dst); |
| } |
| |
| *clone = p_clone; |
| |
| return ZX_OK; |
| } |
| |
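| // Allocates a page without initializing its object metadata. If the allocation must wait, the |
| // provided |request| is filled out so the caller can retry once memory becomes available. |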
| zx_status_t VmCowPages::AllocUninitializedPage(vm_page_t** page, LazyPageRequest* request) { |
| paddr_t paddr = 0; |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| zx_status_t status = CacheAllocPage(pmm_alloc_flags_, page, &paddr); |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| status = AnonymousPageRequester::Get().FillRequest(request->get()); |
| } |
| return status; |
| } |
| |
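| // As |AllocUninitializedPage|, but additionally initializes the page for use in this VMO. |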
| zx_status_t VmCowPages::AllocPage(vm_page_t** page, LazyPageRequest* request) { |
| zx_status_t status = AllocUninitializedPage(page, request); |
| if (status == ZX_OK) { |
| InitializeVmPage(*page); |
| } |
| return status; |
| } |
| |
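| // Allocates a page from the set of loaned physical pages. Never waits; either succeeds or |
| // fails immediately. |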
| zx_status_t VmCowPages::AllocLoanedPage(vm_page_t** page) { |
| uint32_t pmm_alloc_flags = pmm_alloc_flags_; |
| // Loaned page allocations always either succeed or fail immediately, and the CAN_WAIT flag |
| // cannot be combined with loaned allocations, so we remove it if it exists. |
| pmm_alloc_flags &= ~PMM_ALLOC_FLAG_CAN_WAIT; |
| pmm_alloc_flags |= PMM_ALLOC_FLAG_LOANED; |
| zx_status_t status = pmm_alloc_page(pmm_alloc_flags, page); |
| if (status == ZX_OK) { |
| InitializeVmPage(*page); |
| } |
| return status; |
| } |
| |
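| // Allocates a single page, preferring the page cache (when configured) over a direct PMM |
| // allocation. |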
| zx_status_t VmCowPages::CacheAllocPage(uint alloc_flags, vm_page_t** p, paddr_t* pa) { |
| if (!page_cache_) { |
| return pmm_alloc_page(alloc_flags, p, pa); |
| } |
| |
| zx::result result = page_cache_.Allocate(1, alloc_flags); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| |
| vm_page_t* page = list_remove_head_type(&result->page_list, vm_page_t, queue_node); |
| DEBUG_ASSERT(page != nullptr); |
| DEBUG_ASSERT(result->page_list.is_empty()); |
| |
| *p = page; |
| *pa = page->paddr(); |
| return ZX_OK; |
| } |
| |
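| // Frees pages to the page cache if present, otherwise directly to the PMM. |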
| void VmCowPages::CacheFree(list_node_t* list) { |
| if (!page_cache_) { |
| pmm_free(list); |
| return; |
| } |
| |
| page_cache_.Free(ktl::move(*list)); |
| } |
| |
| void VmCowPages::CacheFree(vm_page_t* p) { |
| if (!page_cache_) { |
| pmm_free_page(p); |
| return; |
| } |
| |
| page_cache::PageCache::PageList list; |
| list_add_tail(&list, &p->queue_node); |
| |
| page_cache_.Free(ktl::move(list)); |
| } |
| |
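| // Decompresses the reference in |page_or_mark| into a newly allocated page, replacing the |
| // reference in the slot with that page. |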
| zx_status_t VmCowPages::MakePageFromReference(VmPageOrMarkerRef page_or_mark, |
| LazyPageRequest* page_request) { |
| DEBUG_ASSERT(page_or_mark->IsReference()); |
| VmCompression* compression = pmm_page_compression(); |
| DEBUG_ASSERT(compression); |
| vm_page_t* p; |
| zx_status_t status = AllocPage(&p, page_request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| const auto ref = page_or_mark.SwapReferenceForPage(p); |
| compression->Decompress(ref, paddr_to_physmap(p->paddr())); |
| return ZX_OK; |
| } |
| |
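| // As |MakePageFromReference|, but also bumps the hierarchy generation count and places the new |
| // page in the page queues. |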
| zx_status_t VmCowPages::ReplaceReferenceWithPageLocked(VmPageOrMarkerRef page_or_mark, |
| uint64_t offset, |
| LazyPageRequest* page_request) { |
| // First replace the ref with a page. |
| zx_status_t status = MakePageFromReference(page_or_mark, page_request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| IncrementHierarchyGenerationCountLocked(); |
| // Add the new page to the page queues for tracking. References are by definition not pinned, so |
| // we know this is not wired. |
| SetNotPinnedLocked(page_or_mark->Page(), offset); |
| return ZX_OK; |
| } |
| |
| VmCowPages::VmCowPages(const fbl::RefPtr<VmHierarchyState> hierarchy_state_ptr, |
| VmCowPagesOptions options, uint32_t pmm_alloc_flags, uint64_t size, |
| fbl::RefPtr<PageSource> page_source, |
| ktl::unique_ptr<DiscardableVmoTracker> discardable_tracker) |
| : VmHierarchyBase(ktl::move(hierarchy_state_ptr)), |
| pmm_alloc_flags_(pmm_alloc_flags), |
| options_(options), |
| size_(size), |
| page_source_(ktl::move(page_source)), |
| discardable_tracker_(ktl::move(discardable_tracker)) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| DEBUG_ASSERT(!(pmm_alloc_flags & PMM_ALLOC_FLAG_LOANED)); |
| } |
| |
| void VmCowPages::TransitionToAliveLocked() { |
| ASSERT(life_cycle_ == LifeCycle::Init); |
| life_cycle_ = LifeCycle::Alive; |
| } |
| |
| void VmCowPages::MaybeDeadTransitionLocked(Guard<CriticalMutex>& guard) { |
| if (!paged_ref_ && children_list_len_ == 0 && life_cycle_ == LifeCycle::Alive) { |
| DeadTransition(guard); |
| } |
| } |
| |
| void VmCowPages::MaybeDeadTransition() { |
| Guard<CriticalMutex> guard{lock()}; |
| MaybeDeadTransitionLocked(guard); |
| } |
| |
| void VmCowPages::DeadTransition(Guard<CriticalMutex>& guard) { |
| canary_.Assert(); |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Alive); |
| |
| // To prevent races with a hidden parent creation or merging, it is necessary to hold the lock |
| // over the is_hidden and parent_ check and into the subsequent removal call. |
| |
| // We'll be making changes to the hierarchy we're part of. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| // At the point of destruction we should no longer have any mappings or children still |
| // referencing us, and by extension our priority count must therefore be back to zero. |
| DEBUG_ASSERT(high_priority_count_ == 0); |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| // If we're not a hidden vmo, then we need to remove ourself from our parent. This needs |
| // to be done before emptying the page list so that a hidden parent can't merge into this |
| // vmo and repopulate the page list. |
| if (!is_hidden_locked()) { |
| if (parent_) { |
| parent_locked().RemoveChildLocked(this); |
| } |
| |
| // Before potentially dropping the lock to perform any long running deletions over our parents, |
| // clear out our page list. Any page (or reference) that links back to us is linking back to a |
| // VMO that is partially dead (our parent_ pointer still exists, but our parent does not link |
| // back to us, etc). |
| |
| { |
| // We stack-own loaned pages between removing the page from PageQueues and freeing the page |
| // via call to FreePagesLocked(). |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| // Cleanup page lists and page sources. |
| list_node_t list; |
| list_initialize(&list); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(&list); |
| // free all of the pages attached to us |
| page_list_.RemoveAllContent([&page_remover](VmPageOrMarker&& p) { |
| ASSERT(!p.IsPage() || p.Page()->object.pin_count == 0); |
| page_remover.PushContent(&p); |
| }); |
| page_remover.Flush(); |
| |
| FreePagesLocked(&list, /*freeing_owned_pages=*/true); |
| } |
| |
| if (parent_) { |
| // We removed a child from the parent, and so it may also need to be cleaned. |
| // Avoid recursing destructors and dead transitions when we delete our parent by using the |
| // deferred deletion method. See the comment in the else branch below for why we can avoid this |
| // on a hidden parent. |
| if (!parent_locked().is_hidden_locked()) { |
| guard.CallUnlocked([this, parent = ktl::move(parent_)]() mutable { |
| hierarchy_state_ptr_->DoDeferredDelete(ktl::move(parent)); |
| }); |
| } else { |
| parent_locked().MaybeDeadTransitionLocked(guard); |
| } |
| } |
| } else { |
| // Most of the hidden vmo's state should have already been cleaned up when it merged |
| // itself into its child in ::RemoveChildLocked. |
| DEBUG_ASSERT(children_list_len_ == 0); |
| DEBUG_ASSERT(page_list_.HasNoPageOrRef()); |
| // Even though we are hidden we might have a parent. Unlike in the other branch of this if we |
| // do not need to perform any deferred deletion. The reason for this is that the deferred |
| // deletion mechanism is intended to resolve the scenario where there is a chain of 'one ref' |
| // parent pointers that will chain delete. However, with hidden parents we *know* that a |
| // hidden parent has two children (and hence at least one other ref to it) and so we cannot be |
| // in a one ref chain. Even if N threads all tried to remove children from the hierarchy at |
| // once, this would ultimately get serialized through the lock and the hierarchy would go from |
| // |
| // [..] |
| // / |
| // A [..] |
| // / \ / |
| // B E TO B A |
| // / \ / / \. |
| // C D C D E |
| // |
| // And so each serialized deletion breaks off a discrete two-VMO chain that can be safely |
| // finalized with one recursive step. |
| if (parent_) { |
| DEBUG_ASSERT(!parent_locked().parent_); |
| // We explicitly call DeadTransition on our parent (even though we are still a child of it) as |
| // otherwise its destructor will run without this transition happening, which is an error. |
| // This otherwise does not cause any actual cleanup to happen, since our parent is hidden and |
| // will have had all its pages removed already. |
| parent_locked().DeadTransition(guard); |
| } |
| } |
| |
| DEBUG_ASSERT(page_list_.HasNoPageOrRef()); |
| // We must Close() after removing pages, so that all pages will be loaned by the time |
| // PhysicalPageProvider::OnClose() calls pmm_delete_lender() on the whole physical range. |
| if (page_source_) { |
| page_source_->Close(); |
| } |
| life_cycle_ = LifeCycle::Dead; |
| } |
| |
| VmCowPages::~VmCowPages() { |
| // Most of the explicit cleanup happens in DeadTransition() with asserts and some remaining |
| // cleanup happening here in the destructor. |
| canary_.Assert(); |
| DEBUG_ASSERT(page_list_.HasNoPageOrRef()); |
| DEBUG_ASSERT(life_cycle_ != LifeCycle::Alive); |
| // The discardable tracker is unlinked explicitly in the destructor to ensure that no RefPtrs can |
| // be constructed to the VmCowPages from here. See comment in |
| // DiscardableVmoTracker::DebugDiscardablePageCounts that depends upon this being here instead of |
| // during the dead transition. |
| if (discardable_tracker_) { |
| Guard<CriticalMutex> guard{lock()}; |
| discardable_tracker_->assert_cow_pages_locked(); |
| discardable_tracker_->RemoveFromDiscardableListLocked(); |
| } |
| } |
| |
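| // If |page| at |offset| has zero contents, replaces it with a zero marker and frees the page. |
| // Returns true if the dedup was performed. |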
| bool VmCowPages::DedupZeroPage(vm_page_t* page, uint64_t offset) { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
| |
| // Forbid zero page deduping if this is high priority. |
| if (high_priority_count_ != 0) { |
| return false; |
| } |
| |
| // The VmObjectPaged could have been destroyed, or this could be a hidden node. Check if the |
| // paged_ref_ is valid first. |
| if (paged_ref_) { |
| AssertHeld(paged_ref_->lock_ref()); |
| if (!paged_ref_->CanDedupZeroPagesLocked()) { |
| return false; |
| } |
| } |
| |
| // Check this page is still a part of this VMO. object.page_offset could be wrong, but there's no |
| // harm in looking up a random slot as we'll then notice it's the wrong page. |
| // Also ignore any references since we cannot efficiently scan them, and they should presumably |
| // already be deduped. |
| // Pinned pages cannot be decommitted, and deduping decommits the page, so pinned pages must not |
| // be deduped. We must also not decommit pages from kernel VMOs, as the kernel cannot fault them |
| // back in, but all kernel pages will be pinned. |
| VmPageOrMarkerRef page_or_marker = page_list_.LookupMutable(offset); |
| if (!page_or_marker || !page_or_marker->IsPage() || page_or_marker->Page() != page || |
| page->object.pin_count > 0 || (is_page_dirty_tracked(page) && !is_page_clean(page))) { |
| return false; |
| } |
| |
| // We expect most pages to not be zero, so we first do a 'racy' zero page check where we leave |
| // write permissions on the page. If the page isn't zero, which is our hope, then we haven't |
| // paid the price of modifying page tables. |
| if (!IsZeroPage(page_or_marker->Page())) { |
| return false; |
| } |
| |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::RemoveWrite); |
| |
| if (IsZeroPage(page_or_marker->Page())) { |
| // We stack-own loaned pages from when they're removed until they're freed. |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| // Replace the slot with a marker. |
| VmPageOrMarker new_marker = VmPageOrMarker::Marker(); |
| VmPageOrMarker old_page; |
| zx_status_t status = |
| AddPageLocked(&new_marker, offset, CanOverwriteContent::NonZero, &old_page); |
| DEBUG_ASSERT(status == ZX_OK); |
| DEBUG_ASSERT(old_page.IsPage()); |
| |
| // Free the old page. |
| vm_page_t* released_page = old_page.ReleasePage(); |
| pmm_page_queues()->Remove(released_page); |
| |
| DEBUG_ASSERT(!list_in_list(&released_page->queue_node)); |
| FreePageLocked(released_page, /*freeing_owned_page=*/true); |
| |
| reclamation_event_count_++; |
| IncrementHierarchyGenerationCountLocked(); |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return true; |
| } |
| return false; |
| } |
| |
| zx_status_t VmCowPages::Create(fbl::RefPtr<VmHierarchyState> root_lock, VmCowPagesOptions options, |
| uint32_t pmm_alloc_flags, uint64_t size, |
| ktl::unique_ptr<DiscardableVmoTracker> discardable_tracker, |
| fbl::RefPtr<VmCowPages>* cow_pages) { |
| DEBUG_ASSERT(!(options & VmCowPagesOptions::kInternalOnlyMask)); |
| fbl::AllocChecker ac; |
| auto cow = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages(ktl::move(root_lock), options, |
| pmm_alloc_flags, size, nullptr, |
| ktl::move(discardable_tracker))); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| if (cow->discardable_tracker_) { |
| cow->discardable_tracker_->InitCowPages(cow.get()); |
| } |
| |
| *cow_pages = ktl::move(cow); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CreateExternal(fbl::RefPtr<PageSource> src, VmCowPagesOptions options, |
| fbl::RefPtr<VmHierarchyState> root_lock, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_pages) { |
| DEBUG_ASSERT(!(options & VmCowPagesOptions::kInternalOnlyMask)); |
| fbl::AllocChecker ac; |
| auto cow = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| ktl::move(root_lock), options, PMM_ALLOC_FLAG_CAN_WAIT, size, ktl::move(src), nullptr)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| *cow_pages = ktl::move(cow); |
| return ZX_OK; |
| } |
| |
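| // Replaces |old| with |new_child| in the children list, leaving the list length unchanged. |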
| void VmCowPages::ReplaceChildLocked(VmCowPages* old, VmCowPages* new_child) { |
| canary_.Assert(); |
| children_list_.replace(*old, new_child); |
| } |
| |
| void VmCowPages::DropChildLocked(VmCowPages* child) { |
| canary_.Assert(); |
| DEBUG_ASSERT(children_list_len_ > 0); |
| children_list_.erase(*child); |
| --children_list_len_; |
| } |
| |
| void VmCowPages::AddChildLocked(VmCowPages* child, uint64_t offset, uint64_t root_parent_offset, |
| uint64_t parent_limit) { |
| canary_.Assert(); |
| |
| // As we do not want to have to return failure from this function, we require the caller to |
| // calculate root_parent_offset and validate externally that it does not overflow, but we can |
| // still assert that it has been calculated correctly to prevent accidents. |
| AssertHeld(child->lock_ref()); |
| DEBUG_ASSERT(CheckedAdd(root_parent_offset_, offset) == root_parent_offset); |
| |
| // The child should definitely stop seeing into the parent at the limit of its size. |
| DEBUG_ASSERT(parent_limit <= child->size_); |
| |
| // Write in the parent view values. |
| child->root_parent_offset_ = root_parent_offset; |
| child->parent_offset_ = offset; |
| child->parent_limit_ = parent_limit; |
| |
| // This child should be in an initial state and these members should be clear. |
| DEBUG_ASSERT(!child->partial_cow_release_); |
| DEBUG_ASSERT(child->parent_start_limit_ == 0); |
| |
| child->page_list_.InitializeSkew(page_list_.GetSkew(), offset); |
| |
| // If the child has a non-zero high priority count, then it is counting as an incoming edge to our |
| // count. |
| if (child->high_priority_count_ > 0) { |
| ChangeSingleHighPriorityCountLocked(1); |
| } |
| |
| child->parent_ = fbl::RefPtr(this); |
| children_list_.push_front(child); |
| children_list_len_++; |
| } |
| |
| zx_status_t VmCowPages::CreateChildSliceLocked(uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_slice) { |
| LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size); |
| |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| DEBUG_ASSERT(CheckedAdd(offset, size) <= size_); |
| |
| // If this is a slice re-home this on our parent. Due to this logic we can guarantee that any |
| // slice parent is, itself, not a slice. |
| // We are able to do this for two reasons: |
| // * Slices are subsets and so every position in a slice always maps back to the paged parent. |
| // * Slices are not permitted to be resized and so nothing can be done on the intermediate parent |
| // that requires us to ever look at it again. |
| if (is_slice_locked()) { |
| return slice_parent_locked().CreateChildSliceLocked(offset + parent_offset_, size, cow_slice); |
| } |
| |
| fbl::AllocChecker ac; |
| // Slices just need the slice option and default alloc flags since they will propagate any |
| // operation up to a parent and use their options and alloc flags. |
| auto slice = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| hierarchy_state_ptr_, VmCowPagesOptions::kSlice, PMM_ALLOC_FLAG_ANY, size, nullptr, nullptr)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| // At this point slice must *not* be destructed in this function, as doing so would cause a |
| // deadlock. That means from this point on we *must* succeed and any future error checking needs |
| // to be added prior to creation. |
| |
| AssertHeld(slice->lock_ref()); |
| |
| // As our slice must be in range of the parent it is impossible to have the accumulated parent |
| // offset overflow. |
| uint64_t root_parent_offset = CheckedAdd(offset, root_parent_offset_); |
| CheckedAdd(root_parent_offset, size); |
| |
| AddChildLocked(slice.get(), offset, root_parent_offset, size); |
| |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(slice->DebugValidateVmoPageBorrowingLocked()); |
| |
| *cow_slice = slice; |
| return ZX_OK; |
| } |
| |
| void VmCowPages::CloneParentIntoChildLocked(fbl::RefPtr<VmCowPages>& child) { |
| AssertHeld(child->lock_ref()); |
| // This function is invalid to call if any pages are pinned as the unpin after we change the |
| // backlink will not work. |
| DEBUG_ASSERT(pinned_page_count_ == 0); |
| // We are going to change our linked VmObjectPaged to eventually point to our left child instead |
| // of us, so we need to make the left child look equivalent. To do this it inherits our |
| // children, attribution id and eviction count and is sized to completely cover us. |
| for (auto& c : children_list_) { |
| AssertHeld(c.lock_ref()); |
| c.parent_ = child; |
| } |
| child->children_list_ = ktl::move(children_list_); |
| child->children_list_len_ = children_list_len_; |
| children_list_len_ = 0; |
| child->reclamation_event_count_ = reclamation_event_count_; |
| child->page_attribution_user_id_ = page_attribution_user_id_; |
| child->high_priority_count_ = high_priority_count_; |
| high_priority_count_ = 0; |
| AddChildLocked(child.get(), 0, root_parent_offset_, size_); |
| |
| // Time to change the VmCowPages that our paged_ref_ is pointing to. |
| // We could only have gotten here from a valid VmObjectPaged since we're trying to create a child. |
| // The paged_ref_ should therefore be valid. |
| DEBUG_ASSERT(paged_ref_); |
| child->paged_ref_ = paged_ref_; |
| AssertHeld(paged_ref_->lock_ref()); |
| DEBUG_ASSERT(child->life_cycle_ == LifeCycle::Init); |
| child->life_cycle_ = LifeCycle::Alive; |
| [[maybe_unused]] fbl::RefPtr<VmCowPages> previous = |
| paged_ref_->SetCowPagesReferenceLocked(ktl::move(child)); |
| // Validate that we replaced a reference to ourself as we expected, this ensures we can safely |
| // drop the refptr without triggering our own destructor, since we know someone else must be |
| // holding a refptr to us to be in this function. |
| DEBUG_ASSERT(previous.get() == this); |
| paged_ref_ = nullptr; |
| } |
| |
| zx_status_t VmCowPages::CloneBidirectionalLocked(uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_child, |
| uint64_t new_root_parent_offset, |
| uint64_t child_parent_limit) { |
| // We need two new VmCowPages for our two children. |
| fbl::AllocChecker ac; |
| fbl::RefPtr<VmCowPages> left_child = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| hierarchy_state_ptr_, VmCowPagesOptions::kNone, pmm_alloc_flags_, size_, nullptr, nullptr)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| AssertHeld(left_child->lock_ref()); |
| fbl::RefPtr<VmCowPages> right_child = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| hierarchy_state_ptr_, VmCowPagesOptions::kNone, pmm_alloc_flags_, size, nullptr, nullptr)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| AssertHeld(right_child->lock_ref()); |
| |
| // The left child becomes a full clone of us, inheriting our children, paged backref etc. |
| CloneParentIntoChildLocked(left_child); |
| |
| // The right child is the (potentially subset) view into the parent and so has a variable offset. If |
| // this view would extend beyond us then we need to clip the parent_limit to our size_, which |
| // will ensure any pages in that range just get initialized from zeroes. |
| AddChildLocked(right_child.get(), offset, new_root_parent_offset, child_parent_limit); |
| |
| // Transition into being the hidden node. |
| options_ |= VmCowPagesOptions::kHidden; |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Alive); |
| DEBUG_ASSERT(children_list_len_ == 2); |
| |
| *cow_child = ktl::move(right_child); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CloneUnidirectionalLocked(uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_child, |
| uint64_t new_root_parent_offset, |
| uint64_t child_parent_limit) { |
| fbl::AllocChecker ac; |
| auto cow_pages = fbl::AdoptRef<VmCowPages>(new (&ac) VmCowPages( |
| hierarchy_state_ptr_, VmCowPagesOptions::kNone, pmm_alloc_flags_, size, nullptr, nullptr)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| // Walk up the parent chain until we find a good place to hang this new cow clone. A good |
| // place here means the first place that has committed pages that we actually need to |
| // snapshot. In doing so we need to ensure that the limits of the child we create do not end |
| // up seeing more of the final parent than it would have been able to see from here. |
| VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| while (cur->parent_) { |
| // There's a parent, check if there are any pages in the current range. Unless we've moved |
| // outside the range of our parent, in which case we can just walk up. |
| if (child_parent_limit > 0 && |
| cur->page_list_.AnyPagesOrIntervalsInRange(offset, offset + child_parent_limit)) { |
| break; |
| } |
| // To move to the parent we need to translate our window into |cur|. |
| if (offset >= cur->parent_limit_) { |
| child_parent_limit = 0; |
| } else { |
| child_parent_limit = ktl::min(child_parent_limit, cur->parent_limit_ - offset); |
| } |
| offset += cur->parent_offset_; |
| cur = cur->parent_.get(); |
| } |
| new_root_parent_offset = CheckedAdd(offset, cur->root_parent_offset_); |
| cur->AddChildLocked(cow_pages.get(), offset, new_root_parent_offset, child_parent_limit); |
| |
| *cow_child = ktl::move(cow_pages); |
| |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| AssertHeld((*cow_child)->lock_ref()); |
| VMO_FRUGAL_VALIDATION_ASSERT((*cow_child)->DebugValidateVmoPageBorrowingLocked()); |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CreateCloneLocked(CloneType type, uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_child) { |
| LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size); |
| |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| DEBUG_ASSERT(!is_hidden_locked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| // Upgrade clone type, if possible. |
| if (type == CloneType::SnapshotAtLeastOnWrite && !is_snapshot_at_least_on_write_supported()) { |
| if (can_snapshot_modified_locked()) { |
| type = CloneType::SnapshotModified; |
| } else { |
| type = CloneType::Snapshot; |
| } |
| } else if (type == CloneType::SnapshotModified) { |
| if (!can_snapshot_modified_locked()) { |
| type = CloneType::Snapshot; |
| } |
| } |
| |
| // All validation *must* be performed here prior to constructing the VmCowPages, as the |
| // destructor for VmCowPages may acquire the lock, which we are already holding. |
| |
| switch (type) { |
| case CloneType::Snapshot: { |
| if (!is_cow_clonable_locked()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // If this is non-zero, that means that there are pages which hardware can |
| // touch, so the vmo can't be safely cloned. |
| // TODO: consider immediately forking these pages. |
| if (pinned_page_count_locked()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| break; |
| } |
| case CloneType::SnapshotAtLeastOnWrite: { |
| if (!is_snapshot_at_least_on_write_supported()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| break; |
| } |
| case CloneType::SnapshotModified: { |
| if (!can_snapshot_modified_locked()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if (pinned_page_count_locked()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| break; |
| } |
| } |
| |
| uint64_t new_root_parent_offset; |
| bool overflow; |
| overflow = add_overflow(offset, root_parent_offset_, &new_root_parent_offset); |
| if (overflow) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| uint64_t temp; |
| overflow = add_overflow(new_root_parent_offset, size, &temp); |
| if (overflow) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| uint64_t child_parent_limit = offset >= size_ ? 0 : ktl::min(size, size_ - offset); |
| |
| // Invalidate everything the clone will be able to see. They're COW pages now, |
| // so any existing mappings can no longer directly write to the pages. |
| RangeChangeUpdateLocked(offset, size, RangeChangeOp::RemoveWrite); |
| |
| switch (type) { |
| case CloneType::Snapshot: { |
| return CloneBidirectionalLocked(offset, size, cow_child, new_root_parent_offset, |
| child_parent_limit); |
| } |
| case CloneType::SnapshotAtLeastOnWrite: { |
| return CloneUnidirectionalLocked(offset, size, cow_child, new_root_parent_offset, |
| child_parent_limit); |
| } |
| case CloneType::SnapshotModified: { |
| // If at the root of the VMO hierarchy, or a slice of the root VMO, create a unidirectional |
| // clone. |
| // TODO(https://fxbug.dev/42074633): consider extending this to take unidirectional clones of |
| // snapshot-modified leaves if possible. |
| if (!parent_ || is_slice_locked()) { |
| if (is_slice_locked()) { |
| // ZX_ERR_NOT_SUPPORTED should have already been returned in the case of a non-root slice |
| // clone. |
| AssertHeld(parent_->lock_ref()); |
| DEBUG_ASSERT(!parent_->parent_); |
| } |
| return CloneUnidirectionalLocked(offset, size, cow_child, new_root_parent_offset, |
| child_parent_limit); |
| // Else, take a snapshot. |
| } else { |
| return CloneBidirectionalLocked(offset, size, cow_child, new_root_parent_offset, |
| child_parent_limit); |
| } |
| } |
| } |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| void VmCowPages::RemoveChildLocked(VmCowPages* removed) { |
| canary_.Assert(); |
| |
| AssertHeld(removed->lock_ref()); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| if (!is_hidden_locked()) { |
| DropChildLocked(removed); |
| return; |
| } |
| |
| // Hidden vmos always have 0 or 2 children, but we can't be here with 0 children. |
| DEBUG_ASSERT(children_list_len_ == 2); |
| bool removed_left = &left_child_locked() == removed; |
| |
| DropChildLocked(removed); |
| |
| VmCowPages* child = &children_list_.front(); |
| DEBUG_ASSERT(child); |
| |
| MergeContentWithChildLocked(removed, removed_left); |
| |
| // The child which removed itself and led to the invocation should have a reference |
| // to us, in addition to child.parent_ which we are about to clear. |
| DEBUG_ASSERT(ref_count_debug() >= 2); |
| |
| AssertHeld(child->lock_ref()); |
| if (child->page_attribution_user_id_ != page_attribution_user_id_) { |
| // If the attribution user id of this vmo doesn't match that of its remaining child, |
| // then the vmo with the matching attribution user id was just closed. In that case, we |
| // need to reattribute the pages of any ancestor hidden vmos to vmos that still exist. |
| // |
| // The syscall API doesn't specify how pages are to be attributed among a group of COW |
| // clones. One option is to pick a remaining vmo 'arbitrarily' and attribute everything to |
| // that vmo. However, it seems fairer to reattribute each remaining hidden vmo with |
| // its child whose user id doesn't match the vmo that was just closed. So walk up the |
| // clone chain and attribute each hidden vmo to the vmo we didn't just walk through. |
| auto cur = this; |
| AssertHeld(cur->lock_ref()); |
| uint64_t user_id_to_skip = page_attribution_user_id_; |
| while (cur->parent_ != nullptr) { |
| auto parent = cur->parent_.get(); |
| AssertHeld(parent->lock_ref()); |
| |
| // Snapshot-modified case: hidden node with non-hidden parent. |
| // Pages will be attributed to the visible root. |
| if (!parent->is_hidden_locked()) { |
| // Parent must be root & pager-backed. |
| DEBUG_ASSERT(!parent->parent_); |
| DEBUG_ASSERT(parent->is_source_preserving_page_content()); |
| break; |
| } |
| |
| if (parent->page_attribution_user_id_ == page_attribution_user_id_) { |
| uint64_t new_user_id = parent->left_child_locked().page_attribution_user_id_; |
| if (new_user_id == user_id_to_skip) { |
| new_user_id = parent->right_child_locked().page_attribution_user_id_; |
| } |
| // Although user IDs can be unset for VMOs that do not have a dispatcher, copy-on-write |
| // VMOs always have user level dispatchers, and should have a valid user-id set, hence we |
| // should never end up re-attributing a hidden parent with an unset id. |
| DEBUG_ASSERT(new_user_id != 0); |
| // The 'if' above should mean that the new_user_id isn't the ID we are trying to remove |
| // and isn't one we just used. For this to fail we either need a corrupt VMO hierarchy, or |
| // to have labeled two leaf nodes with the same user_id, which would also be incorrect as |
| // leaf nodes have unique dispatchers and hence unique ids. |
| DEBUG_ASSERT(new_user_id != page_attribution_user_id_ && new_user_id != user_id_to_skip); |
| parent->page_attribution_user_id_ = new_user_id; |
| user_id_to_skip = new_user_id; |
| |
| cur = parent; |
| } else { |
| break; |
| } |
| } |
| } |
| |
| // We can have a priority count of at most 1, and only if the remaining child is the one |
| // contributing to it. |
| DEBUG_ASSERT(high_priority_count_ == 0 || |
| (high_priority_count_ == 1 && child->high_priority_count_ > 0)); |
| // Similarly if we have a priority count, and we have a parent, then our parent must have a |
| // non-zero count. |
| if (parent_) { |
| DEBUG_ASSERT(high_priority_count_ == 0 || parent_locked().high_priority_count_ != 0); |
| } |
| // If our child has a non-zero count, then it is propagating a +1 count to us, and we in turn are |
| // propagating a +1 count to our parent. In the final arrangement after ReplaceChildLocked then |
| // the +1 count child was giving to us needs to go to parent, but as we were already giving a +1 |
| // count to parent, everything is correct. |
| // Although the final hierarchy has correct counts, there is still an assertion in our destructor |
| // that our count is zero, so subtract off any count that we might have. |
| ChangeSingleHighPriorityCountLocked(-high_priority_count_); |
| |
| // Drop the child from our list, but don't recurse back into this function. Then |
| // remove ourselves from the clone tree. |
| DropChildLocked(child); |
| if (parent_) { |
| parent_locked().ReplaceChildLocked(this, child); |
| } |
| child->parent_ = ktl::move(parent_); |
| // We have lost our parent which, if we had a parent, could lead us to now be violating the |
| // invariant that parent_limit_ being non-zero implies we have a parent. Although this generally |
| // should not matter, we have not transitioned to being dead yet, so we should maintain the |
| // correct invariants. |
| parent_offset_ = parent_limit_ = parent_start_limit_ = 0; |
| |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| void VmCowPages::MergeContentWithChildLocked(VmCowPages* removed, bool removed_left) { |
| DEBUG_ASSERT(children_list_len_ == 1); |
| VmCowPages& child = children_list_.front(); |
| AssertHeld(child.lock_ref()); |
| AssertHeld(removed->lock_ref()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| list_node freed_pages; |
| list_initialize(&freed_pages); |
| __UNINITIALIZED BatchPQRemove page_remover(&freed_pages); |
| |
| const uint64_t visibility_start_offset = child.parent_offset_ + child.parent_start_limit_; |
| const uint64_t merge_start_offset = child.parent_offset_; |
| const uint64_t merge_end_offset = child.parent_offset_ + child.parent_limit_; |
| |
| // There's no technical reason why this merging code cannot be run if there is a page source; |
| // however, a bidirectional clone will never have a page source, so in case there are any |
| // consequences that have not been considered, ensure we are not in this case. |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), 0, visibility_start_offset); |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), merge_end_offset, |
| VmPageList::MAX_SIZE); |
| |
| if (child.parent_offset_ + child.parent_limit_ > parent_limit_) { |
| // Update the child's parent limit to ensure that it won't be able to see more |
| // of its new parent than this hidden vmo was able to see. |
| if (parent_limit_ < child.parent_offset_) { |
| child.parent_limit_ = 0; |
| child.parent_start_limit_ = 0; |
| } else { |
| child.parent_limit_ = parent_limit_ - child.parent_offset_; |
| child.parent_start_limit_ = ktl::min(child.parent_start_limit_, child.parent_limit_); |
| } |
| } else { |
| // The child will be able to see less of its new parent than this hidden vmo was |
| // able to see, so release any parent pages in that range. |
| ReleaseCowParentPagesLocked(merge_end_offset, parent_limit_, &page_remover); |
| } |
| |
| if (removed->parent_offset_ + removed->parent_start_limit_ < visibility_start_offset) { |
| // If the removed former child has a smaller offset, then there are retained |
| // ancestor pages that will no longer be visible and thus should be freed. |
| ReleaseCowParentPagesLocked(removed->parent_offset_ + removed->parent_start_limit_, |
| visibility_start_offset, &page_remover); |
| } |
| |
| // Adjust the child's offset so it will still see the correct range. |
| bool overflow = add_overflow(parent_offset_, child.parent_offset_, &child.parent_offset_); |
| // Overflow here means that something went wrong when setting up parent limits. |
| DEBUG_ASSERT(!overflow); |
| |
| if (child.is_hidden_locked()) { |
| // After the merge, either |child| can't see anything in parent (in which case |
| // the parent limits could be anything), or |child|'s first visible offset will be |
| // at least as large as |this|'s first visible offset. |
| DEBUG_ASSERT(child.parent_start_limit_ == child.parent_limit_ || |
| parent_offset_ + parent_start_limit_ <= |
| child.parent_offset_ + child.parent_start_limit_); |
| } else { |
| // non-hidden vmos should always have zero parent_start_limit_ |
| DEBUG_ASSERT(child.parent_start_limit_ == 0); |
| } |
| |
| // At this point, we need to merge |this|'s page list and |child|'s page list. |
| // |
| // In general, COW clones are expected to share most of their pages (i.e. to fork a relatively |
| // small number of pages). Because of this, it is preferable to do work proportional to the |
| // number of pages which were forked into |removed|. However, there are a few things that can |
| // prevent this: |
| // - If |child|'s offset is non-zero then the offsets of all of |this|'s pages will |
| // need to be updated when they are merged into |child|. |
| // - If there has been a call to ReleaseCowParentPagesLocked which was not able to |
| // update the parent limits, then there can exist pages in this vmo's page list |
| // which are not visible to |child| but can't be easily freed based on its parent |
| // limits. Finding these pages requires examining the split bits of all pages. |
| // - If |child| is hidden, then there can exist pages in this vmo which were split into |
| // |child|'s subtree and then migrated out of |child|. Those pages need to be freed, and |
| // the simplest way to find those pages is to examine the split bits. |
| bool fast_merge = merge_start_offset == 0 && !partial_cow_release_ && !child.is_hidden_locked(); |
| |
| if (fast_merge) { |
| // Only leaf vmos can be directly removed, so this must always be true. This guarantees |
| // that there are no pages that were split into |removed| that have since been migrated |
| // to its children. |
| DEBUG_ASSERT(!removed->is_hidden_locked()); |
| |
| // Before merging, find any pages that are present in both |removed| and |this|. Those |
| // pages are visible to |child| but haven't been written to through |child|, so |
| // their split bits need to be cleared. Note that ::ReleaseCowParentPagesLocked ensures |
| // that pages outside of the parent limit range won't have their split bits set. |
| removed->page_list_.ForEveryPageInRange( |
| [removed_offset = removed->parent_offset_, this](auto* page, uint64_t offset) { |
| // Hidden VMO hierarchies do not support intervals. |
| ASSERT(!page->IsInterval()); |
| AssertHeld(lock_ref()); |
| // Whether this is a true page, or a marker, we must check |this| for a page as either |
| // represents a potential fork, even if we subsequently changed it to a marker. |
| VmPageOrMarkerRef page_or_mark = page_list_.LookupMutable(offset + removed_offset); |
| if (page_or_mark && page_or_mark->IsPageOrRef()) { |
| // The page was definitely forked into |removed|, but |
| // shouldn't be forked twice. |
| DEBUG_ASSERT(page_or_mark->PageOrRefLeftSplit() ^ page_or_mark->PageOrRefRightSplit()); |
| page_or_mark.SetPageOrRefLeftSplit(false); |
| page_or_mark.SetPageOrRefRightSplit(false); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| removed->parent_start_limit_, removed->parent_limit_); |
| |
| // These will be freed, but accumulate them separately for use in asserts before adding these to |
| // freed_pages. |
| list_node covered_pages; |
| list_initialize(&covered_pages); |
| __UNINITIALIZED BatchPQRemove covered_remover(&covered_pages); |
| |
| // Although not all pages in page_list_ will end up existing in child, we don't know which ones |
| // will get replaced, so we must update all of the backlinks. |
| { |
| size_t batch_count{0}; |
| PageQueues* pq = pmm_page_queues(); |
| VmCompression* compression = pmm_page_compression(); |
| Guard<SpinLock, IrqSave> guard{pq->get_lock()}; |
| |
| page_list_.ForEveryPageMutable([this, pq, &child, &guard, &compression, &batch_count]( |
| VmPageOrMarkerRef p, uint64_t off) { |
| // Hidden VMO hierarchies do not support intervals. |
| ASSERT(!p->IsInterval()); |
| // If we have processed our batch limit, drop the page_queue lock to |
| // give other threads a chance to perform operations, before |
| // re-acquiring the lock and continuing. |
| if (batch_count >= PageQueues::kMaxBatchSize) { |
| batch_count = 0; |
| guard.CallUnlocked([]() { |
| // TODO(johngro): Once our spinlocks have been updated to be more fair |
| // (ticket locks, MCS locks, whatever), come back here and remove this |
| // pessimistic cpu relax. |
| arch::Yield(); |
| }); |
| } |
| if (p->IsReference()) { |
| // A regular reference we can move, a temporary reference we need to turn back into |
| // its page so we can move it. To determine if we have a temporary reference we can just |
| // attempt to move it, and if it was a temporary reference we will get a page returned. |
| if (auto page = compression->MoveReference(p->Reference())) { |
| InitializeVmPage(*page); |
| // Dropping the page queues lock is inefficient, but this is an unlikely edge case that |
| // can happen exactly once (due to only one temporary reference). |
| guard.CallUnlocked([this, page, off] { |
| AssertHeld(lock_ref()); |
| SetNotPinnedLocked(*page, off); |
| }); |
| VmPageOrMarker::ReferenceValue ref = p.SwapReferenceForPage(*page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } |
| if (p->IsPage()) { |
| AssertHeld<Lock<SpinLock>, IrqSave>(*pq->get_lock()); |
| |
| vm_page_t* page = p->Page(); |
| pq->ChangeObjectOffsetLocked(page, &child, off); |
| } |
| |
| ++batch_count; |
| return ZX_ERR_NEXT; |
| }); |
| } |
| |
| // Now merge |child|'s pages into |this|, overwriting any pages present in |this|, and |
| // then move that list to |child|. |
| // We are going to perform a delayed free on pages removed here by concatenating |covered_pages| |
| // to |freed_pages|. As a result |freed_pages| will end up with mixed ownership of pages, so |
| // FreePagesLocked() will simply free the pages to the PMM. Make sure that the |child| did not |
| // have a source that was handling frees, which would require more work than simply freeing |
| // pages to the PMM. |
| DEBUG_ASSERT(!child.is_source_handling_free_locked()); |
| child.page_list_.MergeOnto( |
| page_list_, [&covered_remover](VmPageOrMarker&& p) { covered_remover.PushContent(&p); }); |
| child.page_list_ = ktl::move(page_list_); |
| |
| vm_page_t* p; |
| covered_remover.Flush(); |
| list_for_every_entry (&covered_pages, p, vm_page_t, queue_node) { |
| // The page was already present in |child|, so it should be split at least |
| // once. And being split twice is obviously bad. |
| ASSERT(p->object.cow_left_split ^ p->object.cow_right_split); |
| ASSERT(p->object.pin_count == 0); |
| } |
| list_splice_after(&covered_pages, &freed_pages); |
| } else { |
| // Merge our page list into the child page list and update all the necessary metadata. |
| struct { |
| PageQueues* pq; |
| bool removed_left; |
| uint64_t merge_start_offset; |
| VmCowPages* child; |
| BatchPQRemove* page_remover; |
| VmCompression* compression; |
| } state = {pmm_page_queues(), removed_left, merge_start_offset, &child, |
| &page_remover, pmm_page_compression()}; |
| child.page_list_.MergeFrom( |
| page_list_, merge_start_offset, merge_end_offset, |
| [&page_remover](VmPageOrMarker&& p, uint64_t offset) { page_remover.PushContent(&p); }, |
| [this, &state](VmPageOrMarker* page_or_marker, uint64_t offset) { |
| DEBUG_ASSERT(page_or_marker->IsPageOrRef()); |
| DEBUG_ASSERT(page_or_marker->IsReference() || |
| page_or_marker->Page()->object.pin_count == 0); |
| |
| if (state.removed_left ? page_or_marker->PageOrRefRightSplit() |
| : page_or_marker->PageOrRefLeftSplit()) { |
| // This happens when the page was already migrated into the child but then |
| // was migrated further into the child's descendants. The page can be freed. |
| state.page_remover->PushContent(page_or_marker); |
| } else { |
| // Since we recursively fork on write, if the child doesn't have the |
| // page, then neither of its children do. |
| page_or_marker->SetPageOrRefLeftSplit(false); |
| page_or_marker->SetPageOrRefRightSplit(false); |
| if (page_or_marker->IsReference()) { |
| // A regular reference we can move, a temporary reference we need to turn back into |
| // its page so we can move it. To determine if we have a temporary reference we can |
| // just attempt to move it, and if it was a temporary reference we will get a page |
| // returned. |
| if (auto page = state.compression->MoveReference(page_or_marker->Reference())) { |
| InitializeVmPage(*page); |
| // For simplicity, since this is a very uncommon edge case, just update the page in |
| // place in this page list, then move it as a regular page. |
| AssertHeld(lock_ref()); |
| SetNotPinnedLocked(*page, offset); |
| VmPageOrMarker::ReferenceValue ref = |
| VmPageOrMarkerRef(page_or_marker).SwapReferenceForPage(*page); |
| ASSERT(state.compression->IsTempReference(ref)); |
| } |
| } |
| // Not an else-if to intentionally perform this if the previous block turned a reference |
| // into a page. |
| if (page_or_marker->IsPage()) { |
| state.pq->ChangeObjectOffset(page_or_marker->Page(), state.child, |
| offset - state.merge_start_offset); |
| } |
| } |
| }); |
| } |
| |
| page_remover.Flush(); |
| if (!list_is_empty(&freed_pages)) { |
| // |freed_pages| might also contain pages removed from a child or an ancestor, so we do not own |
| // all the pages. Make sure we did not have a page source that was handling frees which would |
| // require additional work on the owned pages on top of a simple free to the PMM. |
| DEBUG_ASSERT(!is_source_handling_free_locked()); |
| FreePagesLocked(&freed_pages, /*freeing_owned_pages=*/false); |
| } |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| void VmCowPages::DumpLocked(uint depth, bool verbose) const { |
| canary_.Assert(); |
| |
| size_t page_count = 0; |
| size_t compressed_count = 0; |
| page_list_.ForEveryPage([&page_count, &compressed_count](const auto* p, uint64_t) { |
| if (p->IsPage()) { |
| page_count++; |
| } else if (p->IsReference()) { |
| compressed_count++; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| |
| for (uint i = 0; i < depth; ++i) { |
| printf(" "); |
| } |
| printf("cow_pages %p size %#" PRIx64 " offset %#" PRIx64 " start limit %#" PRIx64 |
| " limit %#" PRIx64 " content pages %zu compressed pages %zu ref %d parent %p\n", |
| this, size_, parent_offset_, parent_start_limit_, parent_limit_, page_count, |
| compressed_count, ref_count_debug(), parent_.get()); |
| |
| if (page_source_) { |
| for (uint i = 0; i < depth + 1; ++i) { |
| printf(" "); |
| } |
| printf("page_source preserves content %d\n", is_source_preserving_page_content()); |
| page_source_->Dump(depth + 1); |
| } |
| |
| if (verbose) { |
| auto f = [depth](const auto* p, uint64_t offset) { |
| for (uint i = 0; i < depth + 1; ++i) { |
| printf(" "); |
| } |
| if (p->IsMarker()) { |
| printf("offset %#" PRIx64 " zero page marker\n", offset); |
| } else if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| printf("offset %#" PRIx64 " page %p paddr %#" PRIxPTR "(%c%c%c)\n", offset, page, |
| page->paddr(), page->object.cow_left_split ? 'L' : '.', |
| page->object.cow_right_split ? 'R' : '.', page->object.always_need ? 'A' : '.'); |
| } else if (p->IsReference()) { |
| const uint64_t cookie = p->Reference().value(); |
| printf("offset %#" PRIx64 " reference %#" PRIx64 "(%c%c)\n", offset, cookie, |
| p->PageOrRefLeftSplit() ? 'L' : '.', p->PageOrRefRightSplit() ? 'R' : '.'); |
| } else if (p->IsIntervalStart()) { |
| printf("offset %#" PRIx64 " page interval start\n", offset); |
| } else if (p->IsIntervalEnd()) { |
| printf("offset %#" PRIx64 " page interval end\n", offset); |
| } else if (p->IsIntervalSlot()) { |
| printf("offset %#" PRIx64 " single page interval slot\n", offset); |
| } |
| return ZX_ERR_NEXT; |
| }; |
| page_list_.ForEveryPage(f); |
| } |
| } |
| |
| uint32_t VmCowPages::DebugLookupDepthLocked() const { |
| canary_.Assert(); |
| |
| // Count the number of parents we need to traverse to find the root, and call this our lookup |
| // depth. Slices don't need to be explicitly handled as they are just a parent. |
| uint32_t depth = 0; |
| const VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| while (cur->parent_) { |
| depth++; |
| cur = cur->parent_.get(); |
| } |
| return depth; |
| } |
| |
| VmCowPages::AttributionCounts VmCowPages::GetAttributedMemoryInRangeLocked( |
| uint64_t offset_bytes, uint64_t len_bytes) const { |
| canary_.Assert(); |
| |
| if (is_hidden_locked()) { |
| return AttributionCounts{}; |
| } |
| |
| AttributionCounts counts; |
| // TODO(https://g-issues.fuchsia.dev/issues/338300808): Formalize attribution model. |
| page_list_.ForEveryPageAndGapInRange( |
| [&counts](const auto* p, uint64_t off) { |
| if (p->IsPage()) { |
| counts.uncompressed_bytes += PAGE_SIZE; |
| } else if (p->IsReference()) { |
| counts.compressed_bytes += PAGE_SIZE; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [this, &counts](uint64_t gap_start, uint64_t gap_end) { |
| AssertHeld(lock_ref()); |
| |
| // If there's no parent, there are no pages to care about. If there is a non-hidden |
| // parent, then that owns any pages in the gap, not us. |
| if (!parent_) { |
| return ZX_ERR_NEXT; |
| } |
| if (!parent_locked().is_hidden_locked()) { |
| return ZX_ERR_NEXT; |
| } |
| |
| // Count any ancestor pages that should be attributed to us in the range. Ideally the whole |
| // range gets processed in one attempt, but in order to prevent unbounded stack growth with |
| // recursion we instead process partial ranges and recalculate the intermediate results. |
| // As a result, instead of being O(n) in the number of committed pages, it could |
| // pathologically become O(nd) where d is our depth in the vmo hierarchy. |
| uint64_t off = gap_start; |
| while (off < parent_limit_ && off < gap_end) { |
| AttributionCounts local_count; |
| uint64_t attributed = |
| CountAttributedAncestorBytesLocked(off, gap_end - off, &local_count); |
| // |CountAttributedAncestorBytesLocked| guarantees that it will make progress. |
| DEBUG_ASSERT(attributed > 0); |
| off += attributed; |
| counts += local_count; |
| } |
| |
| return ZX_ERR_NEXT; |
| }, |
| offset_bytes, offset_bytes + len_bytes); |
| |
| return counts; |
| } |
| |
| uint64_t VmCowPages::CountAttributedAncestorBytesLocked(uint64_t offset, uint64_t size, |
| AttributionCounts* count) const |
| TA_REQ(lock()) { |
| // We need to walk up the ancestor chain to see if there are any pages that should be attributed |
| // to this vmo. We attempt to operate on the entire range given to us, but should we need to |
| // query the next parent for a range we trim our operating range. Trimming the range is necessary |
| // as we cannot recurse and otherwise have no way to remember where we were up to after processing |
| // the range in the parent. The solution then is to return all the way back up to the caller with |
| // a partial range and then effectively recompute the metadata at the point we were up to. |
| |
| // Note that we cannot stop just because the page_attribution_user_id_ changes. This is because |
| // there might still be a forked page at the offset in question which should be attributed to |
| // this vmo. Whenever the attribution user id changes while walking up the ancestors, we need |
| // to determine if there is a 'closer' vmo in the sibling subtree to which the offset in |
| // question can be attributed, or if it should still be attributed to the current vmo. |
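| // |
| // Illustrative example (hypothetical hierarchy, for exposition only): suppose a hidden parent H |
| // has children A and B, H holds a page at offset X visible to both, and this walk runs on |
| // behalf of A. If H shares A's attribution id, if the page already has a split bit set, or if X |
| // lies outside [B.parent_offset_ + B.parent_start_limit_, B.parent_offset_ + B.parent_limit_), |
| // then B can never be the closer owner and the page is counted against A; otherwise some vmo in |
| // B's subtree is 'closer' and the page is skipped here. |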
| |
| DEBUG_ASSERT(offset < parent_limit_); |
| const VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| uint64_t cur_offset = offset; |
| uint64_t cur_size = size; |
| // Count of how many bytes we attributed as being owned by this vmo. |
| AttributionCounts attributed_ours; |
| // Count how much we've processed; this is needed to remember where we were up to when we |
| // iterate up the parent chain. |
| uint64_t attributed = 0; |
| while (cur_offset < cur->parent_limit_) { |
| // For cur->parent_limit_ to be non-zero, it must have a parent. |
| DEBUG_ASSERT(cur->parent_); |
| |
| const auto parent = cur->parent_.get(); |
| AssertHeld(parent->lock_ref()); |
| uint64_t parent_offset; |
| bool overflowed = add_overflow(cur->parent_offset_, cur_offset, &parent_offset); |
| DEBUG_ASSERT(!overflowed); // vmo creation should have failed |
| DEBUG_ASSERT(parent_offset <= parent->size_); // parent_limit_ prevents this |
| |
| // We are the child of a snapshot-modified root; pages will be attributed to the parent. |
| if (!parent->is_hidden_locked()) { |
| // Parent must be root & pager-backed. |
| DEBUG_ASSERT(!parent->parent_); |
| DEBUG_ASSERT(parent->is_source_preserving_page_content()); |
| break; |
| } |
| |
| const bool left = cur == &parent->left_child_locked(); |
| const auto& sib = left ? parent->right_child_locked() : parent->left_child_locked(); |
| |
| // Work out how much of the desired size is actually visible to us in the parent; we just use |
| // this to walk the correct amount of the page_list_. |
| const uint64_t parent_size = ktl::min(cur_size, cur->parent_limit_ - cur_offset); |
| |
| // By default we expect to process the entire range, hence our next_size is 0. Should we need to |
| // iterate up the stack then these will be set by one of the callbacks. |
| uint64_t next_parent_offset = parent_offset + cur_size; |
| uint64_t next_size = 0; |
| parent->page_list_.ForEveryPageAndGapInRange( |
| [&parent, &cur, &attributed_ours, &sib](const auto* p, uint64_t off) { |
| AssertHeld(cur->lock_ref()); |
| AssertHeld(sib.lock_ref()); |
| AssertHeld(parent->lock_ref()); |
| // Hidden VMO hierarchies don't support page intervals. |
| ASSERT(!p->IsInterval()); |
| if (p->IsMarker()) { |
| return ZX_ERR_NEXT; |
| } |
| if ( |
| // Page is explicitly owned by us |
| (parent->page_attribution_user_id_ == cur->page_attribution_user_id_) || |
| // If the page has already been split and we can see it, then we know |
| // the sibling subtree can't see the page and thus it should be |
| // attributed to this vmo. |
| (p->PageOrRefLeftSplit() || p->PageOrRefRightSplit()) || |
| // If the sibling cannot access this page then it's ours; otherwise we know there's |
| // a vmo in the sibling subtree which is 'closer' to this offset, and to which we will |
| // attribute the page. |
| !(sib.parent_offset_ + sib.parent_start_limit_ <= off && |
| off < sib.parent_offset_ + sib.parent_limit_)) { |
| if (p->IsPage()) { |
| attributed_ours.uncompressed_bytes += PAGE_SIZE; |
| } else if (p->IsReference()) { |
| attributed_ours.compressed_bytes += PAGE_SIZE; |
| } |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [&parent, &cur, &next_parent_offset, &next_size, &sib](uint64_t gap_start, |
| uint64_t gap_end) { |
| // Process a gap in the parent VMO. |
| // |
| // A gap in the parent VMO doesn't necessarily mean there are no pages |
| // in this range: our parent's ancestors may have pages, so we need to |
| // walk up the tree to find out. |
| // |
| // We don't always need to walk the tree though: in this gap, both this VMO |
| // and our sibling VMO will share the same set of ancestor pages. However, the |
| // pages will only be accounted to one of the two VMOs. |
| // |
| // If the parent page_attribution_user_id is the same as us, we need to |
| // keep walking up the tree to perform a more accurate count. |
| // |
| // If the parent page_attribution_user_id matches our sibling's, however, we |
| // can just ignore the overlapping range: pages may or may not exist in |
| // the range, but either way they would be accounted to our sibling. |
| // Instead, we need only walk up ranges not visible to our sibling. |
| AssertHeld(cur->lock_ref()); |
| AssertHeld(sib.lock_ref()); |
| AssertHeld(parent->lock_ref()); |
| uint64_t gap_size = gap_end - gap_start; |
| if (parent->page_attribution_user_id_ == cur->page_attribution_user_id_) { |
| // We don't need to consider siblings as we own this range, but we do need to |
| // keep looking up the stack to find any actual pages. |
| next_parent_offset = gap_start; |
| next_size = gap_size; |
| return ZX_ERR_STOP; |
| } |
| // For this entire range we know that the offset is visible to the current vmo, and there |
| // are no committed or migrated pages. We need to check though for what portion of this |
| // range we should attribute to the sibling. Any range that we can attribute to the |
| // sibling we can skip, otherwise we have to keep looking up the stack to see if there are |
| // any pages that could be attributed to us. |
| uint64_t sib_offset, sib_len; |
| if (!GetIntersect(gap_start, gap_size, sib.parent_offset_ + sib.parent_start_limit_, |
| sib.parent_limit_ - sib.parent_start_limit_, &sib_offset, &sib_len)) { |
| // No sibling ownership, so we need to look at the whole range in the parent to find |
| // any pages. |
| next_parent_offset = gap_start; |
| next_size = gap_size; |
| return ZX_ERR_STOP; |
| } |
| // If the whole range is owned by the sibling, any pages that might be in |
| // it won't be accounted to us anyway. Skip the segment. |
| if (sib_len == gap_size) { |
| DEBUG_ASSERT(sib_offset == gap_start); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Otherwise, inspect the range not visible to our sibling. |
| if (sib_offset == gap_start) { |
| next_parent_offset = sib_offset + sib_len; |
| next_size = gap_end - next_parent_offset; |
| } else { |
| next_parent_offset = gap_start; |
| next_size = sib_offset - gap_start; |
| } |
| return ZX_ERR_STOP; |
| }, |
| parent_offset, parent_offset + parent_size); |
| if (next_size == 0) { |
| // If next_size wasn't set then we don't need to keep looking up the chain as we successfully |
| // looked at the entire range. |
| break; |
| } |
| // Count anything up to the next starting point as being processed. |
| attributed += next_parent_offset - parent_offset; |
| // Size should have been reduced by at least the amount we just attributed |
| DEBUG_ASSERT(next_size <= cur_size && |
| cur_size - next_size >= next_parent_offset - parent_offset); |
| |
| cur = parent; |
| cur_offset = next_parent_offset; |
| cur_size = next_size; |
| } |
| // Exiting the loop means we either ceased finding a relevant parent for the range, or we were |
| // able to process the entire range without needing to look up to a parent; in either case we |
| // can consider the entire range as attributed. |
| // |
| // The cur_size can be larger than the value of parent_size from the last loop iteration. This |
| // is fine as we trivially know that range has zero pages in it, and therefore has no pages to |
| // determine attribution from. |
| attributed += cur_size; |
| |
| *count = attributed_ours; |
| return attributed; |
| } |
| |
| zx_status_t VmCowPages::AddPageLocked(VmPageOrMarker* p, uint64_t offset, |
| CanOverwriteContent overwrite, VmPageOrMarker* released_page, |
| bool do_range_update) { |
| canary_.Assert(); |
| |
| // Pages can be added as part of Init, but not once we transition to dead. |
| DEBUG_ASSERT(life_cycle_ != LifeCycle::Dead); |
| |
| if (p->IsPage()) { |
| LTRACEF("vmo %p, offset %#" PRIx64 ", page %p (%#" PRIxPTR ")\n", this, offset, p->Page(), |
| p->Page()->paddr()); |
| } else if (p->IsReference()) { |
| [[maybe_unused]] const uint64_t cookie = p->Reference().value(); |
| LTRACEF("vmo %p, offset %#" PRIx64 ", reference %#" PRIx64 "\n", this, offset, cookie); |
| } else { |
| DEBUG_ASSERT(p->IsMarker()); |
| LTRACEF("vmo %p, offset %#" PRIx64 ", marker\n", this, offset); |
| } |
| |
| if (released_page != nullptr) { |
| *released_page = VmPageOrMarker::Empty(); |
| } |
| |
| if (offset >= size_) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| VmPageOrMarker* page; |
| auto interval_handling = VmPageList::IntervalHandling::NoIntervals; |
| // If we're backed by a page source that preserves content (user pager), we cannot directly update |
| // empty slots in the page list. An empty slot might lie in a sparse zero interval, which would |
| // require splitting the interval around the required offset before it can be manipulated. |
| if (is_source_preserving_page_content()) { |
| // We can overwrite zero intervals if we're allowed to overwrite zeros (or non-zeros). |
| interval_handling = overwrite != CanOverwriteContent::None |
| ? VmPageList::IntervalHandling::SplitInterval |
| : VmPageList::IntervalHandling::CheckForInterval; |
| } |
| auto [slot, is_in_interval] = page_list_.LookupOrAllocate(offset, interval_handling); |
| if (is_in_interval) { |
| // We should not have found an interval if we were not expecting any. |
| DEBUG_ASSERT(interval_handling != VmPageList::IntervalHandling::NoIntervals); |
| // Return error if the offset lies in an interval but we cannot overwrite intervals. |
| if (interval_handling != VmPageList::IntervalHandling::SplitInterval) { |
| // The lookup should not have returned a slot for us to manipulate if it was in an interval |
| // that cannot be overwritten, even if that slot was already populated (by an interval |
| // sentinel). |
| DEBUG_ASSERT(!slot); |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| // If offset was in an interval, we should have an interval slot to overwrite at this point. |
| DEBUG_ASSERT(slot && slot->IsIntervalSlot()); |
| } |
| page = slot; |
| |
| if (!page) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| // The slot might have started empty and in error paths we will not fill it, so make sure it gets |
| // returned in that case. |
| auto return_slot = fit::defer([page, offset, this] { |
| // If we started with an interval slot to manipulate, we should have been able to overwrite it. |
| DEBUG_ASSERT(!page->IsIntervalSlot()); |
| if (unlikely(page->IsEmpty())) { |
| AssertHeld(lock_ref()); |
| page_list_.ReturnEmptySlot(offset); |
| } |
| }); |
| |
| // We cannot overwrite any kind of content. |
| if (overwrite == CanOverwriteContent::None) { |
| // An anonymous VMO starts off with all its content set to zero, i.e. at no point can it have |
| // absence of content. |
| if (!page_source_) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| // This VMO is backed by a page source, so empty slots represent absence of content. Fail if the |
| // slot is not empty. |
| if (!page->IsEmpty()) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| } |
| |
| // We're only permitted to overwrite zero content. This has different meanings based on whether |
| // the VMO is anonymous or is backed by a pager. |
| // |
| // * For anonymous VMOs, the initial content for the entire VMO is implicitly all zeroes at the |
| // time of creation. So both zero page markers and empty slots represent zero content. Therefore |
| // the only content type that cannot be overwritten in this case is an actual page. |
| // |
| // * For pager backed VMOs, content is either explicitly supplied by the user pager, or |
| // implicitly supplied as zeros by the kernel. Zero content is represented by either zero page |
| // markers (supplied by the user pager), or by sparse zero intervals (supplied by the kernel). |
| // Therefore, in this case too, the only content type that cannot be overwritten is an actual |
| // page. |
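| // |
| // For reference, the cases handled by this function can be summarized as follows (a |
| // restatement of the rules above, not new policy): |
| // |
| //   overwrite | empty slot           | marker / zero interval | page / reference |
| //   ----------+----------------------+------------------------+--------------------------- |
| //   None      | error if anonymous,  | error                  | error |
| //             | ok if pager-backed   |                        | |
| //   Zero      | ok                   | ok                     | error |
| //   NonZero   | ok                   | ok                     | ok (old content released) |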
| if (overwrite == CanOverwriteContent::Zero && page->IsPageOrRef()) { |
| // If we have a page source, the page source should be able to validate the page. |
| // Note that having a page source implies that any content must be an actual page and so |
| // although we return an error for any kind of content, the debug check only gets run for page |
| // sources where it will be a real page. |
| DEBUG_ASSERT(!page_source_ || page_source_->DebugIsPageOk(page->Page(), offset)); |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| |
| // If the old entry is actual content, release it. |
| if (page->IsPageOrRef()) { |
| // We should be permitted to overwrite any kind of content (zero or non-zero). |
| DEBUG_ASSERT(overwrite == CanOverwriteContent::NonZero); |
| // The caller should have passed in an optional to hold the released page. |
| DEBUG_ASSERT(released_page != nullptr); |
| *released_page = ktl::move(*page); |
| } |
| |
| // If the new page is an actual page and we have a page source, the page source should be able to |
| // validate the page. |
| // Note that having a page source implies that any content must be an actual page and so |
| // although we return an error for any kind of content, the debug check only gets run for page |
| // sources where it will be a real page. |
| DEBUG_ASSERT(!p->IsPageOrRef() || !page_source_ || |
| page_source_->DebugIsPageOk(p->Page(), offset)); |
| |
| // If this is actually a real page, we need to place it into the appropriate queue. |
| if (p->IsPage()) { |
| vm_page_t* low_level_page = p->Page(); |
| DEBUG_ASSERT(low_level_page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(low_level_page->object.pin_count == 0); |
| SetNotPinnedLocked(low_level_page, offset); |
| } |
| *page = ktl::move(*p); |
| |
| if (do_range_update) { |
| // other mappings may have covered this offset into the vmo, so unmap those ranges |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::AddNewPageLocked(uint64_t offset, vm_page_t* page, |
| CanOverwriteContent overwrite, |
| VmPageOrMarker* released_page, bool zero, |
| bool do_range_update) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| |
| InitializeVmPage(page); |
| if (zero) { |
| ZeroPage(page); |
| } |
| |
| // Pages being added to pager backed VMOs should have a valid dirty_state before being added to |
| // the page list, so that they can be inserted in the correct page queue. New pages start off |
| // clean. |
| if (is_source_preserving_page_content()) { |
| // Only zero pages can be added as new pages to pager backed VMOs. |
| DEBUG_ASSERT(zero || IsZeroPage(page)); |
| UpdateDirtyStateLocked(page, offset, DirtyState::Clean, /*is_pending_add=*/true); |
| } |
| |
| VmPageOrMarker p = VmPageOrMarker::Page(page); |
| zx_status_t status = AddPageLocked(&p, offset, overwrite, released_page, do_range_update); |
| |
| if (status != ZX_OK) { |
| // Release the page from 'p'; as we are returning failure, 'page' is still owned by the caller. |
| // Store the result in a temporary as we are required to use the result of ReleasePage. |
| [[maybe_unused]] vm_page_t* unused = p.ReleasePage(); |
| } |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::AddNewPagesLocked(uint64_t start_offset, list_node_t* pages, |
| CanOverwriteContent overwrite, bool zero, |
| bool do_range_update) { |
| ASSERT(overwrite != CanOverwriteContent::NonZero); |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(start_offset)); |
| |
| uint64_t offset = start_offset; |
| while (vm_page_t* p = list_remove_head_type(pages, vm_page_t, queue_node)) { |
| // Defer the range change update by passing false as we will do it in bulk at the end if needed. |
| zx_status_t status = AddNewPageLocked(offset, p, overwrite, nullptr, zero, false); |
| if (status != ZX_OK) { |
| // Put the page back on the list so that someone owns it and it'll get freed. |
| list_add_head(pages, &p->queue_node); |
| // Decommit any pages we already placed. |
| if (offset > start_offset) { |
| DecommitRangeLocked(start_offset, offset - start_offset); |
| } |
| |
| // Free all the pages back as we had ownership of them. |
| FreePagesLocked(pages, /*freeing_owned_pages=*/true); |
| return status; |
| } |
| offset += PAGE_SIZE; |
| } |
| |
| if (do_range_update) { |
| // other mappings may have covered this offset into the vmo, so unmap those ranges |
| RangeChangeUpdateLocked(start_offset, offset - start_offset, RangeChangeOp::Unmap); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
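| // A page in a hidden node is 'uni-accessible' if at most one child subtree can still reach it. |
| // Restating the checks below: a page with a split bit set has already been forked in one |
| // direction, and a page outside a child's visible window [parent_offset_ + parent_start_limit_, |
| // parent_offset_ + parent_limit_) can never be seen by that child. |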
| bool VmCowPages::IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const { |
| DEBUG_ASSERT(page_list_.Lookup(offset)->Page() == page); |
| |
| if (page->object.cow_right_split || page->object.cow_left_split) { |
| return true; |
| } |
| |
| if (offset < left_child_locked().parent_offset_ + left_child_locked().parent_start_limit_ || |
| offset >= left_child_locked().parent_offset_ + left_child_locked().parent_limit_) { |
| return true; |
| } |
| |
| if (offset < right_child_locked().parent_offset_ + right_child_locked().parent_start_limit_ || |
| offset >= right_child_locked().parent_offset_ + right_child_locked().parent_limit_) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| zx_status_t VmCowPages::CloneCowPageLocked(uint64_t offset, list_node_t* alloc_list, |
| VmCowPages* page_owner, vm_page_t* page, |
| uint64_t owner_offset, LazyPageRequest* page_request, |
| vm_page_t** out_page) { |
| DEBUG_ASSERT(page != vm_get_zero_page()); |
| DEBUG_ASSERT(parent_); |
| DEBUG_ASSERT(page_request); |
| |
| // Stash the paddr of the page that is going to be copied across loop iterations. |
| const paddr_t page_paddr = page->paddr(); |
| |
| // To avoid the need for rollback logic on allocation failure, we start the forking |
| // process from the root-most vmo and work our way towards the leaf vmo. This allows |
| // us to maintain the hidden vmo invariants through the whole operation, so that we |
| // can stop at any point. |
| // |
| // To set this up, walk from the leaf to |page_owner|, and keep track of the |
| // path via |stack_.dir_flag|. |
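| // |
| // For example (hypothetical three-level chain, for exposition only): with root |page_owner| R, |
| // hidden intermediate H, and leaf |this| C, the walk below records in H.stack_.dir_flag which |
| // side C is on and in R.stack_.dir_flag which side H is on, so the migration loop can replay |
| // the path R -> H -> C from the top down. |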
| VmCowPages* cur = this; |
| do { |
| AssertHeld(cur->lock_ref()); |
| VmCowPages* next = cur->parent_.get(); |
| // We should always be able to find |page_owner| in the ancestor chain. |
| DEBUG_ASSERT(next); |
| AssertHeld(next->lock_ref()); |
| |
| next->stack_.dir_flag = &next->left_child_locked() == cur ? StackDir::Left : StackDir::Right; |
| if (next->stack_.dir_flag == StackDir::Right) { |
| DEBUG_ASSERT(&next->right_child_locked() == cur); |
| } |
| cur = next; |
| } while (cur != page_owner); |
| uint64_t cur_offset = owner_offset; |
| |
| // |target_page| is the page we're considering for migration. Cache it |
| // across loop iterations. |
| vm_page_t* target_page = page; |
| |
| zx_status_t alloc_status = ZX_OK; |
| |
| // As long as we're simply migrating |page|, there's no need to update any vmo mappings, since |
| // that means the other side of the clone tree has already covered |page| and the current side |
| // of the clone tree will still see |page|. As soon as we insert a new page, we'll need to |
| // update all mappings at or below that level. |
| bool skip_range_update = true; |
| do { |
| // |target_page| is always located in |cur| at |cur_offset| at the start of the loop. |
| VmCowPages* target_page_owner = cur; |
| AssertHeld(target_page_owner->lock_ref()); |
| uint64_t target_page_offset = cur_offset; |
| |
| cur = cur->stack_.dir_flag == StackDir::Left ? &cur->left_child_locked() |
| : &cur->right_child_locked(); |
| DEBUG_ASSERT(cur_offset >= cur->parent_offset_); |
| cur_offset -= cur->parent_offset_; |
| |
| // We're either going to migrate the page or copy the page from |target_page_owner| to |cur|. |
| // Lookup the page list slot in |cur| that we're going to manipulate so that when we add the |
| // page later it does not encounter an allocation failure in the page list. We need to do this |
| // *before* we've made any changes to the |target_page_owner| page list so that we do not need |
| // to roll back in case of a failed migration. |
| auto [slot, is_in_interval] = |
| cur->page_list_.LookupOrAllocate(cur_offset, VmPageList::IntervalHandling::NoIntervals); |
| DEBUG_ASSERT(!is_in_interval); |
| // Bail if we could not allocate the slot. |
| if (!slot) { |
| *out_page = nullptr; |
| return ZX_ERR_NO_MEMORY; |
| } |
| // We should not be trying to fork at this offset if something already existed. |
| DEBUG_ASSERT(slot->IsEmpty()); |
| |
| // From this point on, we should ensure that the slot gets used to hold a page, or it is |
| // returned if empty. |
| const VmPageOrMarker* cur_page = slot; |
| auto return_empty_slot = fit::defer([cur, cur_offset, cur_page] { |
| if (!cur_page->IsPage()) { |
| AssertHeld(cur->lock_ref()); |
| // If we did not use the slot to hold a page, it could only have remained empty. |
| cur->page_list_.ReturnEmptySlot(cur_offset); |
| } |
| }); |
| |
| if (target_page_owner->IsUniAccessibleLocked(target_page, target_page_offset)) { |
| // If the page we're covering in the parent is uni-accessible, then we |
| // can directly move the page. |
| |
| // Assert that we're not trying to split the page the same direction two times. Either |
| // some tracking state got corrupted or a page in the subtree we're trying to |
| // migrate to got improperly migrated/freed. If we did this migration, then the |
| // opposite subtree would lose access to this page. |
| DEBUG_ASSERT(!(target_page_owner->stack_.dir_flag == StackDir::Left && |
| target_page->object.cow_left_split)); |
| DEBUG_ASSERT(!(target_page_owner->stack_.dir_flag == StackDir::Right && |
| target_page->object.cow_right_split)); |
| // For now, we won't see a loaned page here. |
| DEBUG_ASSERT(!target_page->is_loaned()); |
| |
| target_page->object.cow_left_split = 0; |
| target_page->object.cow_right_split = 0; |
| VmPageOrMarker removed = target_page_owner->page_list_.RemoveContent(target_page_offset); |
| // We know this is a true page since it is just our |target_page|, which is a true page. |
| vm_page* removed_page = removed.ReleasePage(); |
| pmm_page_queues()->Remove(removed_page); |
| DEBUG_ASSERT(removed_page == target_page); |
| } else { |
| // Otherwise we need to fork the page. The page has no writable mappings so we don't need to |
| // remove write or unmap before copying the contents. |
| vm_page_t* cover_page; |
| alloc_status = AllocateCopyPage(page_paddr, alloc_list, page_request, &cover_page); |
| if (alloc_status != ZX_OK) { |
| break; |
| } |
| |
| // We're going to cover target_page with cover_page, so set appropriate split bit. |
| if (target_page_owner->stack_.dir_flag == StackDir::Left) { |
| target_page->object.cow_left_split = 1; |
| DEBUG_ASSERT(target_page->object.cow_right_split == 0); |
| } else { |
| target_page->object.cow_right_split = 1; |
| DEBUG_ASSERT(target_page->object.cow_left_split == 0); |
| } |
| target_page = cover_page; |
| |
| skip_range_update = false; |
| } |
| |
| // Skip the automatic range update so we can do it ourselves more efficiently. |
| VmPageOrMarker add_page = VmPageOrMarker::Page(target_page); |
| zx_status_t status = |
| cur->AddPageLocked(&add_page, cur_offset, CanOverwriteContent::Zero, nullptr, false); |
| // Since we have allocated the slot already, we know this cannot fail. |
| DEBUG_ASSERT_MSG(status == ZX_OK, "AddPageLocked returned %d\n", status); |
| DEBUG_ASSERT(cur_page->Page() == target_page); |
| |
| if (!skip_range_update) { |
| if (cur != this) { |
| // In this case, cur is a hidden vmo and has no direct mappings. Also, its |
| // descendants along the page stack will be dealt with by subsequent iterations |
| // of this loop. That means that any mappings that need to be touched now are |
| // owned by the children on the opposite side of stack_.dir_flag. |
| VmCowPages& other = cur->stack_.dir_flag == StackDir::Left ? cur->right_child_locked() |
| : cur->left_child_locked(); |
| AssertHeld(other.lock_ref()); |
| RangeChangeList list; |
| other.RangeChangeUpdateFromParentLocked(cur_offset, PAGE_SIZE, &list); |
| RangeChangeUpdateListLocked(&list, RangeChangeOp::Unmap); |
| } else { |
| // In this case, cur is the last vmo being changed, so update its whole subtree. |
| DEBUG_ASSERT(offset == cur_offset); |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| } |
| } |
| } while (cur != this); |
| DEBUG_ASSERT(alloc_status != ZX_OK || cur_offset == offset); |
| |
| if (unlikely(alloc_status != ZX_OK)) { |
| *out_page = nullptr; |
| return alloc_status; |
| } else { |
| *out_page = target_page; |
| return ZX_OK; |
| } |
| } |
| |
| zx_status_t VmCowPages::CloneCowPageAsZeroLocked(uint64_t offset, list_node_t* freed_list, |
| VmCowPages* page_owner, vm_page_t* page, |
| uint64_t owner_offset, |
| LazyPageRequest* page_request) { |
| DEBUG_ASSERT(parent_); |
| |
| DEBUG_ASSERT(!page_source_ || page_source_->DebugIsPageOk(page, offset)); |
| |
| // Need to make sure the page is duplicated as far as our parent. Then we can pretend |
| // that we have forked it into us by setting the marker. |
| if (page_owner != parent_.get()) { |
| // Do not pass our freed_list here as this wants an alloc_list to allocate from. |
| zx_status_t result = parent_locked().CloneCowPageLocked( |
| offset + parent_offset_, nullptr, page_owner, page, owner_offset, page_request, &page); |
| if (result != ZX_OK) { |
| return result; |
| } |
| } |
| |
| // Before forking/moving the page, ensure a slot is available so that we know AddPageLocked cannot |
| // fail below. In the scenario where |slot| is empty, we do not need to worry about calling |
| // ReturnEmptySlot, since there are no failure paths from here and we are guaranteed to fill the |
| // slot. |
| auto [slot, is_in_interval] = |
| page_list_.LookupOrAllocate(offset, VmPageList::IntervalHandling::NoIntervals); |
| DEBUG_ASSERT(!is_in_interval); |
| |
| if (!slot) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| // We cannot be forking a page to here if there's already something. |
| DEBUG_ASSERT(slot->IsEmpty()); |
| |
| bool left = this == &(parent_locked().left_child_locked()); |
| // Page is in our parent. Check if it's uni-accessible; if so we can free it. |
| if (parent_locked().IsUniAccessibleLocked(page, offset + parent_offset_)) { |
| // Make sure we didn't already merge the page in this direction. |
| DEBUG_ASSERT(!(left && page->object.cow_left_split)); |
| DEBUG_ASSERT(!(!left && page->object.cow_right_split)); |
| // We are going to be inserting removed pages into a shared free list. So make sure the parent |
| // did not have a page source that was handling frees which would require additional work on the |
| // owned pages on top of a simple free to the PMM. |
| DEBUG_ASSERT(!parent_locked().is_source_handling_free_locked()); |
| // We know this is a true page since it is just our target |page|. |
| vm_page* removed = |
| parent_locked().page_list_.RemoveContent(offset + parent_offset_).ReleasePage(); |
| DEBUG_ASSERT(removed == page); |
| pmm_page_queues()->Remove(removed); |
| DEBUG_ASSERT(!list_in_list(&removed->queue_node)); |
| list_add_tail(freed_list, &removed->queue_node); |
| } else { |
| if (left) { |
| page->object.cow_left_split = 1; |
| } else { |
| page->object.cow_right_split = 1; |
| } |
| } |
| // Insert the zero marker. |
| VmPageOrMarker new_marker = VmPageOrMarker::Marker(); |
| // We know that the slot is empty, so we know we won't be overwriting an actual page. |
| // We expect the caller to update any mappings. |
| zx_status_t status = AddPageLocked(&new_marker, offset, CanOverwriteContent::Zero, nullptr, |
| /*do_range_update=*/false); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY, but that failure can only |
| // occur if we had to allocate a slot in the page list. Since we allocated a slot above, we |
| // know that can't be the case. |
| DEBUG_ASSERT(status == ZX_OK); |
| return ZX_OK; |
| } |
| |
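| // Walks up the clone chain from |this| at |offset| to locate the vmo that provides the initial |
| // content for that offset. Sketching the loop below: the walk stops at the first non-empty |
| // ancestor slot, or when the offset can no longer reach into a parent (past parent_limit_, or |
| // past the parent's size), and the final vmo and offset are reported via |owner_out| and |
| // |owner_offset_out|. |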
| VMPLCursor VmCowPages::FindInitialPageContentLocked(uint64_t offset, VmCowPages** owner_out, |
| uint64_t* owner_offset_out, |
| uint64_t* owner_length) { |
| // Search up the clone chain for any committed pages. cur_offset is the offset |
| // into cur we care about. The loop terminates either when that offset contains |
| // a committed page or when that offset can't reach into the parent. |
| VMPLCursor page; |
| VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| uint64_t cur_offset = offset; |
| while (cur_offset < cur->parent_limit_) { |
| VmCowPages* parent = cur->parent_.get(); |
| // If there's no parent, then parent_limit_ is 0 and we'll never enter the loop |
| DEBUG_ASSERT(parent); |
| AssertHeld(parent->lock_ref()); |
| |
| uint64_t parent_offset; |
| bool overflowed = add_overflow(cur->parent_offset_, cur_offset, &parent_offset); |
| ASSERT(!overflowed); |
| if (parent_offset >= parent->size_) { |
| // The offset is off the end of the parent, so cur is the VmCowPages |
| // which will provide the page. |
| break; |
| } |
| if (owner_length) { |
| // Before we walk up, we need to check whether there are any forked pages that require us to |
| // restrict the owner length. Additionally we need to restrict the owner length to the actual |
| // parent limit. |
| *owner_length = ktl::min(*owner_length, cur->parent_limit_ - cur_offset); |
| cur->page_list_.ForEveryPageInRange( |
| [owner_length, cur_offset](const VmPageOrMarker* p, uint64_t off) { |
| // VMO children do not support page intervals. |
| ASSERT(!p->IsInterval()); |
| *owner_length = off - cur_offset; |
| return ZX_ERR_STOP; |
| }, |
| cur_offset, cur_offset + *owner_length); |
| } |
| |
| cur = parent; |
| cur_offset = parent_offset; |
| VMPLCursor next_cursor = cur->page_list_.LookupMutableCursor(parent_offset); |
| VmPageOrMarkerRef p = next_cursor.current(); |
| if (p && !p->IsEmpty()) { |
| page = ktl::move(next_cursor); |
| break; |
| } |
| } |
| |
| *owner_out = cur; |
| *owner_offset_out = cur_offset; |
| |
| return page; |
| } |
| |
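| // Summary of the legal dirty-state transitions asserted below (a restatement of the assertions, |
| // not a new contract): |
| //   - to Clean:         from AwaitingClean (writeback end), or as a pending page add. |
| //   - to Dirty:         from Clean or AwaitingClean (a write), or as a pending page add. |
| //   - to AwaitingClean: from Dirty only (writeback begin); never as a pending add. |
| // Loaned pages are never allowed to become Dirty or AwaitingClean. |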
| void VmCowPages::UpdateDirtyStateLocked(vm_page_t* page, uint64_t offset, DirtyState dirty_state, |
| bool is_pending_add) { |
| ASSERT(page); |
| ASSERT(is_source_preserving_page_content()); |
| |
| // If the page is not pending being added to the page list, it should have valid object info. |
| DEBUG_ASSERT(is_pending_add || page->object.get_object() == this); |
| DEBUG_ASSERT(is_pending_add || page->object.get_page_offset() == offset); |
| |
| // If the page is Dirty or AwaitingClean, it should not be loaned. |
| DEBUG_ASSERT(!(is_page_dirty(page) || is_page_awaiting_clean(page)) || !page->is_loaned()); |
| |
| // Perform state-specific checks and actions. We will finally update the state below. |
| switch (dirty_state) { |
| case DirtyState::Clean: |
| // If the page is not in the process of being added, we can only see a transition to Clean |
| // from AwaitingClean. |
| ASSERT(is_pending_add || is_page_awaiting_clean(page)); |
| |
| // If we are expecting a pending Add[New]PageLocked, we can defer updating the page queue. |
| if (!is_pending_add) { |
| // Move to evictable pager backed queue to start tracking age information. |
| pmm_page_queues()->MoveToReclaim(page); |
| } |
| break; |
| case DirtyState::Dirty: |
| // If the page is not in the process of being added, we can only see a transition to Dirty |
| // from Clean or AwaitingClean. |
| ASSERT(is_pending_add || (is_page_clean(page) || is_page_awaiting_clean(page))); |
| |
| // A loaned page cannot be marked Dirty as loaned pages are reclaimed by eviction; Dirty pages |
| // cannot be evicted. |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| // If we are expecting a pending Add[New]PageLocked, we can defer updating the page queue. |
| if (!is_pending_add) { |
| // Move the page to the Dirty queue, which does not track page age. While the page is in the |
| // Dirty queue, age information is not required (yet). It will be required when the page |
| // becomes Clean (and hence evictable) again, at which point it will get moved to the MRU |
| // pager backed queue and will age as normal. |
| // TODO(rashaeqbal): We might want age tracking for the Dirty queue in the future when the |
| // kernel generates writeback pager requests. |
| pmm_page_queues()->MoveToPagerBackedDirty(page); |
| } |
| break; |
| case DirtyState::AwaitingClean: |
| // A newly added page cannot start off as AwaitingClean. |
| ASSERT(!is_pending_add); |
| // A pinned page will be kept Dirty as long as it is pinned. |
| // |
| // Note that there isn't a similar constraint when setting the Clean state as it is possible |
| // to pin a page for read after it has been marked AwaitingClean. Since it is a pinned read it |
| // does not need to dirty the page. So when the writeback is done it can transition from |
| // AwaitingClean -> Clean with a non-zero pin count. |
| // |
| // It is also possible for us to observe an intermediate pin count for a write-pin that has |
| // not fully completed yet, as we will only attempt to dirty pages after pinning them. So it |
| // is possible for a thread to be waiting on a DIRTY request on a pinned page, while a racing |
| // writeback transitions the page from AwaitingClean -> Clean with a non-zero pin count. |
| ASSERT(page->object.pin_count == 0); |
| // We can only transition to AwaitingClean from Dirty. |
| ASSERT(is_page_dirty(page)); |
| // A loaned page cannot be marked AwaitingClean as loaned pages are reclaimed by eviction; |
| // AwaitingClean pages cannot be evicted. |
| DEBUG_ASSERT(!page->is_loaned()); |
| // No page queue update. Leave the page in the Dirty queue for now as it is not clean yet; |
| // it will be moved out on WritebackEnd. |
| DEBUG_ASSERT(pmm_page_queues()->DebugPageIsPagerBackedDirty(page)); |
| break; |
| default: |
| ASSERT(false); |
| } |
| page->object.dirty_state = static_cast<uint8_t>(dirty_state) & VM_PAGE_OBJECT_DIRTY_STATES_MASK; |
| } |
| |
| zx_status_t VmCowPages::PrepareForWriteLocked(uint64_t offset, uint64_t len, |
| LazyPageRequest* page_request, |
| uint64_t* dirty_len_out) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().PrepareForWriteLocked(offset + parent_offset_, len, page_request, |
| dirty_len_out); |
| } |
| |
| DEBUG_ASSERT(page_source_); |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| |
| uint64_t dirty_len = 0; |
| const uint64_t start_offset = offset; |
| const uint64_t end_offset = offset + len; |
| |
| // If the VMO does not require us to trap dirty transitions, simply mark the pages dirty, and move |
| // them to the dirty page queue. Do this only for the first consecutive run of committed pages |
| // within the range starting at offset. Any absent pages will need to be provided by the page |
| // source, which might fail and terminate the lookup early. Any zero page markers and zero |
| // intervals might need to be forked, which can fail too. Only mark those pages dirty that the |
| // lookup is guaranteed to return successfully. |
| if (!page_source_->ShouldTrapDirtyTransitions()) { |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [this, &dirty_len, start_offset](const VmPageOrMarker* p, uint64_t off) { |
| // TODO(johngro): remove this explicit unused-capture warning suppression |
| // when https://bugs.llvm.org/show_bug.cgi?id=35450 gets fixed. |
| (void)start_offset; // used only in DEBUG_ASSERT |
| if (p->IsMarker() || p->IsIntervalZero()) { |
| // Found a marker or zero interval. End the traversal. |
| return ZX_ERR_STOP; |
| } |
| // VMOs with a page source will never have compressed references, so this should be a |
| // real page. |
| DEBUG_ASSERT(p->IsPage()); |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(page->object.get_object() == this); |
| DEBUG_ASSERT(page->object.get_page_offset() == off); |
| |
| // End the traversal if we encounter a loaned page. We reclaim loaned pages by evicting |
| // them, and dirty pages cannot be evicted. |
| if (page->is_loaned()) { |
| // If this is a loaned page, it should be clean. |
| DEBUG_ASSERT(is_page_clean(page)); |
| return ZX_ERR_STOP; |
| } |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| // Mark the page dirty. |
| if (!is_page_dirty(page)) { |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(page, off, DirtyState::Dirty); |
| } |
| // The page was either already dirty, or we just marked it dirty. Proceed to the next one. |
| DEBUG_ASSERT(start_offset + dirty_len == off); |
| dirty_len += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [](uint64_t start, uint64_t end) { |
| // We found a gap. End the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| *dirty_len_out = dirty_len; |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| // Otherwise, generate a DIRTY page request for pages in the range which need to transition to |
| // Dirty. Pages that qualify are: |
| // - Any contiguous run of non-Dirty pages (committed pages as well as zero page markers). |
| // For the purpose of generating DIRTY requests, both Clean and AwaitingClean pages are |
| // considered equivalent. This is because pages that are in AwaitingClean will need another |
| // acknowledgment from the user pager before they can be made Dirty (the filesystem might need to |
| // reserve additional space for them etc.). |
| // - Any zero intervals are implicit zero pages, i.e. the kernel supplies zero pages when they |
| // are accessed. Since these pages are not supplied by the user pager via zx_pager_supply_pages, |
| // we will need to wait on a DIRTY request before the sparse range can be replaced by an actual |
| // page for writing (the filesystem might need to reserve additional space). |
| uint64_t pages_to_dirty_len = 0; |
| |
| // Helper lambda used in the page list traversal below. Try to add pages in the range |
| // [dirty_pages_start, dirty_pages_end) to the run of dirty pages being tracked. Return codes are |
| // the same as those used by VmPageList::ForEveryPageAndGapInRange to continue or terminate |
| // traversal. |
| auto accumulate_dirty_pages = [&pages_to_dirty_len, &dirty_len, start_offset]( |
| uint64_t dirty_pages_start, |
| uint64_t dirty_pages_end) -> zx_status_t { |
| // Bail if we were tracking a non-zero run of pages to be dirtied as we cannot extend |
| // pages_to_dirty_len anymore. |
| if (pages_to_dirty_len > 0) { |
| return ZX_ERR_STOP; |
| } |
| // Append the pages to the dirty range being tracked if they immediately follow it. |
| if (start_offset + dirty_len == dirty_pages_start) { |
| dirty_len += (dirty_pages_end - dirty_pages_start); |
| return ZX_ERR_NEXT; |
| } |
| // Otherwise we cannot accumulate any more contiguous dirty pages. |
| return ZX_ERR_STOP; |
| }; |
| |
| // Helper lambda used in the page list traversal below. Try to add pages in the range |
| // [to_dirty_start, to_dirty_end) to the run of to-be-dirtied pages being tracked. Return codes |
| // are the same as those used by VmPageList::ForEveryPageAndGapInRange to continue or terminate |
| // traversal. |
| auto accumulate_pages_to_dirty = [&pages_to_dirty_len, &dirty_len, start_offset]( |
| uint64_t to_dirty_start, |
| uint64_t to_dirty_end) -> zx_status_t { |
| // Bail if we were already accumulating a non-zero run of Dirty pages. |
| if (dirty_len > 0) { |
| return ZX_ERR_STOP; |
| } |
| // Append the pages to the range being tracked if they immediately follow it. |
| if (start_offset + pages_to_dirty_len == to_dirty_start) { |
| pages_to_dirty_len += (to_dirty_end - to_dirty_start); |
| return ZX_ERR_NEXT; |
| } |
| // Otherwise we cannot accumulate any more contiguous to-dirty pages. |
| return ZX_ERR_STOP; |
| }; |
| |
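| // For example (hypothetical page states, for exposition only): for a range containing |
| // [Dirty, Dirty, Clean, Clean] the traversal below accumulates dirty_len = 2 pages and stops at |
| // the first Clean page; for [Clean, Clean, Dirty] it instead accumulates pages_to_dirty_len = 2 |
| // pages for a single DIRTY request and stops at the Dirty page. At most one of the two runs |
| // ever becomes non-zero. |
| |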
| // This tracks the beginning of an interval that falls in the specified range. Since we might |
| // start partway inside an interval, this is initialized to start_offset so that we only consider |
| // the portion of the interval inside the range. If we did not start inside an interval, we will |
| // end up reinitializing this when we do find an interval start, before this value is used, so it |
| // is safe to initialize to start_offset in all cases. |
| uint64_t interval_start_off = start_offset; |
| // This tracks whether we saw an interval start sentinel in the traversal, but have not yet |
| // encountered a matching interval end sentinel. Should we end the traversal partway in an |
| // interval, we will need to handle the portion of the interval between the interval start and the |
| // end of the specified range. |
| bool unmatched_interval_start = false; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&accumulate_dirty_pages, &accumulate_pages_to_dirty, &interval_start_off, |
| &unmatched_interval_start, &found_page_or_gap, this](const VmPageOrMarker* p, uint64_t off) { |
| found_page_or_gap = true; |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // VMOs that trap dirty transitions should not have loaned pages. |
| DEBUG_ASSERT(!page->is_loaned()); |
| // Page is already dirty. Try to add it to the dirty run. |
| if (is_page_dirty(page)) { |
| return accumulate_dirty_pages(off, off + PAGE_SIZE); |
| } |
| // If the page is clean, mark it accessed to grant it some protection from eviction |
| // until the pager has a chance to respond to the DIRTY request. |
| if (is_page_clean(page)) { |
| AssertHeld(lock_ref()); |
| pmm_page_queues()->MarkAccessed(page); |
| } |
| } else if (p->IsIntervalZero()) { |
| if (p->IsIntervalStart() || p->IsIntervalSlot()) { |
| unmatched_interval_start = true; |
| interval_start_off = off; |
| } |
| if (p->IsIntervalEnd() || p->IsIntervalSlot()) { |
| unmatched_interval_start = false; |
| // We need to commit pages if this is an interval, irrespective of the dirty state. |
| return accumulate_pages_to_dirty(interval_start_off, off + PAGE_SIZE); |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| // We don't compress pages in pager-backed VMOs. |
| DEBUG_ASSERT(!p->IsReference()); |
| // This is either a zero page marker (which represents a clean zero page) or a committed |
| // page which is not already Dirty. Try to add it to the range of pages to be dirtied. |
| DEBUG_ASSERT(p->IsMarker() || !is_page_dirty(p->Page())); |
| return accumulate_pages_to_dirty(off, off + PAGE_SIZE); |
| }, |
| [&found_page_or_gap](uint64_t start, uint64_t end) { |
| found_page_or_gap = true; |
| // We found a gap. End the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start_offset, end_offset); |
| |
| // We don't expect an error from the traversal above. If an incompatible contiguous page or |
| // a gap is encountered, we will simply terminate early. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // Process the last remaining interval if there is one. |
| if (unmatched_interval_start) { |
| accumulate_pages_to_dirty(interval_start_off, end_offset); |
| } |
| |
| // Account for the case where we started and ended in unpopulated slots inside an interval, |
| // i.e. we did not find either a page or a gap in the traversal. We would not have accumulated |
| // any pages in that case. |
| if (!found_page_or_gap) { |
| DEBUG_ASSERT(page_list_.IsOffsetInZeroInterval(start_offset)); |
| DEBUG_ASSERT(page_list_.IsOffsetInZeroInterval(end_offset - PAGE_SIZE)); |
| DEBUG_ASSERT(dirty_len == 0); |
| DEBUG_ASSERT(pages_to_dirty_len == 0); |
| // The entire range falls in an interval so it needs a DIRTY request. |
| pages_to_dirty_len = end_offset - start_offset; |
| } |
| |
| // We should either have found dirty pages or pages that need to be dirtied, but not both. |
| DEBUG_ASSERT(dirty_len == 0 || pages_to_dirty_len == 0); |
| // Check that dirty_len and pages_to_dirty_len both specify valid ranges. |
| DEBUG_ASSERT(start_offset + dirty_len <= end_offset); |
| DEBUG_ASSERT(pages_to_dirty_len == 0 || start_offset + pages_to_dirty_len <= end_offset); |
| |
| *dirty_len_out = dirty_len; |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| |
| // No pages need to transition to Dirty. |
| if (pages_to_dirty_len == 0) { |
| return ZX_OK; |
| } |
| |
| // Found a contiguous run of pages that need to transition to Dirty. There might be more such |
| // pages later in the range, but we will come into this call again for them via another |
| // LookupCursor call after the waiting caller is unblocked for this range. |
| |
| VmoDebugInfo vmo_debug_info{}; |
| // We have a page source so this cannot be a hidden node, but the VmObjectPaged could have been |
| // destroyed. We could be looking up a page via a lookup in a child (slice) after the parent |
| // VmObjectPaged has gone away, so paged_ref_ could be null. Let the page source handle any |
| // failures requesting the dirty transition. |
| if (paged_ref_) { |
| AssertHeld(paged_ref_->lock_ref()); |
| vmo_debug_info = {.vmo_ptr = reinterpret_cast<uintptr_t>(paged_ref_), |
| .vmo_id = paged_ref_->user_id_locked()}; |
| } |
| status = page_source_->RequestDirtyTransition(page_request->get(), start_offset, |
| pages_to_dirty_len, vmo_debug_info); |
| // The page source will never succeed synchronously. |
| DEBUG_ASSERT(status != ZX_OK); |
| return status; |
| } |
| |
| inline VmCowPages::LookupCursor::RequireResult VmCowPages::LookupCursor::PageAsResultNoIncrement( |
| vm_page_t* page, bool in_target) { |
| // The page is writable if it's present in the target (non owned pages are never writable) and it |
| // does not need a dirty transition. A page doesn't need a dirty transition if the target isn't |
| // preserving page contents, or if the page is just already dirty. |
| RequireResult result{page, |
| (in_target && (!target_preserving_page_content_ || is_page_dirty(page)))}; |
| return result; |
| } |
| |
| void VmCowPages::LookupCursor::IncrementOffsetAndInvalidateCursor(uint64_t delta) { |
| offset_ += delta; |
| owner_ = nullptr; |
| } |
| |
| bool VmCowPages::LookupCursor::CursorIsContentZero() const { |
| // Markers are always zero. |
| if (CursorIsMarker()) { |
| return true; |
| } |
| |
| if (owner_->page_source_) { |
| // With a page source, emptiness implies needing to request content; however, we can have zero |
| // intervals, which do start as zero content. |
| return CursorIsInIntervalZero(); |
| } |
| // Without a page source, emptiness is filled with zeros, and intervals are only permitted if |
| // there is a page source. |
| return CursorIsEmpty(); |
| } |
| |
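| // Decision summary for the checks below (a restatement of the logic, not new policy): zero |
| // content is supplied dirty iff the target is dirty tracked and either the access is a write, |
| // or the zero content is a zero interval rather than a marker, since intervals are implicitly |
| // dirty and get allocated as new dirty pages even for reads. |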
| bool VmCowPages::LookupCursor::TargetZeroContentSupplyDirty(bool writing) const { |
| if (!TargetDirtyTracked()) { |
| return false; |
| } |
| if (writing) { |
| return true; |
| } |
| // Markers start clean |
| if (CursorIsMarker()) { |
| return false; |
| } |
| // The only way this offset can have zero content and reach here is if we are in an |
| // interval. If this slot were empty then, since we are dirty tracked and hence must have a |
| // page source, we would not consider this zero. |
| DEBUG_ASSERT(CursorIsInIntervalZero()); |
| // Zero intervals are considered implicitly dirty and allocating them, even for reading, causes |
| // them to be supplied as new dirty pages. |
| return true; |
| } |
| |
| zx::result<VmCowPages::LookupCursor::RequireResult> |
| VmCowPages::LookupCursor::TargetAllocateCopyPageAsResult(vm_page_t* source, DirtyState dirty_state, |
| LazyPageRequest* page_request) { |
| // The general pmm_alloc_flags_ are not allowed to contain the LOANED option, and this is relied |
| // upon below to assume the page allocated cannot be loaned. |
| DEBUG_ASSERT(!(target_->pmm_alloc_flags_ & PMM_ALLOC_FLAG_LOANED)); |
| |
| vm_page_t* out_page = nullptr; |
| zx_status_t status = |
| target_->AllocateCopyPage(source->paddr(), alloc_list_, page_request, &out_page); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| // The forked page was just allocated, and so cannot be a loaned page. |
| DEBUG_ASSERT(!out_page->is_loaned()); |
| |
| // We could be allocating a page to replace a zero page marker in a pager-backed VMO. If so then |
| // set its dirty state to what was requested, AddPageLocked below will then insert the page into |
| // the appropriate page queue. |
| if (target_preserving_page_content_) { |
| // The only page we can be forking here is the zero page. |
| DEBUG_ASSERT(source == vm_get_zero_page()); |
| // The object directly owns the page. |
| DEBUG_ASSERT(owner_ == target_); |
| |
| target_->UpdateDirtyStateLocked(out_page, offset_, dirty_state, |
| /*is_pending_add=*/true); |
| } |
| VmPageOrMarker insert = VmPageOrMarker::Page(out_page); |
| status = target_->AddPageLocked(&insert, offset_, CanOverwriteContent::Zero, nullptr); |
| if (status != ZX_OK) { |
| // AddPageLocked failing for any other reason is a programming error. |
| DEBUG_ASSERT_MSG(status == ZX_ERR_NO_MEMORY, "status=%d\n", status); |
| // We are freeing a page we just got from the PMM (or from the alloc_list), so we do not own |
| // it yet. |
| target_->FreePageLocked(insert.ReleasePage(), /*freeing_owned_page=*/false); |
| return zx::error(status); |
| } |
| target_->IncrementHierarchyGenerationCountLocked(); |
| |
| // If asked to explicitly mark zero forks, and this is actually fork of the zero page, move to the |
| // correct queue. |
| if (zero_fork_ && source == vm_get_zero_page()) { |
| pmm_page_queues()->MoveToAnonymousZeroFork(out_page); |
| } |
| |
| // This is the only path where we can allocate a new page without being a clone (clones are |
| // always cached). So we check here if we are not fully cached and if so perform a |
| // clean/invalidate to flush our zeroes. After doing this we will not touch the page via the |
| // physmap and so we can pretend there isn't an aliased mapping. |
| // There are three potential states that may exist |
| // * VMO is cached, paged_ref_ might be null, we might have children -> no cache op needed |
| // * VMO is uncached, paged_ref_ is not null, we have no children -> cache op needed |
| // * VMO is uncached, paged_ref_ is null, we have no children -> cache op not needed / |
| // state cannot happen |
| // In the uncached case we know we have no children, since it is by definition not valid to |
| // have copy-on-write children of uncached pages. The third case cannot happen, but even if it |
| // could with no children and no paged_ref_ the pages cannot actually be referenced so any |
| // cache operation is pointless. |
| // The paged_ref_ could be null if the VmObjectPaged has been destroyed. |
| if (target_->paged_ref_) { |
| AssertHeld(target_->paged_ref_->lock_ref()); |
| if (target_->paged_ref_->GetMappingCachePolicyLocked() != ARCH_MMU_FLAG_CACHED) { |
| arch_clean_invalidate_cache_range((vaddr_t)paddr_to_physmap(out_page->paddr()), PAGE_SIZE); |
| } |
| } |
| |
| // Need to increment the cursor, but we have also potentially modified the page lists in the |
| // process of inserting the page. |
| if (owner_ == target_) { |
| // In the case of owner_ == target_ we may have created a node and need to establish a cursor. |
| // However, if we already had a node, i.e. the cursor was valid, then it would have had the page |
| // inserted into it. |
| if (!owner_pl_cursor_.current()) { |
| IncrementOffsetAndInvalidateCursor(PAGE_SIZE); |
| } else { |
| // Cursor should have been updated to the new page |
| DEBUG_ASSERT(CursorIsPage()); |
| DEBUG_ASSERT(owner_cursor_->Page() == out_page); |
| IncrementCursor(); |
| } |
| } else { |
| // If owner_ != target_ then owner_ page list will not have been modified, so safe to just |
| // increment. |
| IncrementCursor(); |
| } |
| |
| // Return the page. We know it's in the target, since we just put it there, but let PageAsResult |
| // determine if that means it is actually writable or not. |
| return zx::ok(PageAsResultNoIncrement(out_page, true)); |
| } |
| |
| zx_status_t VmCowPages::LookupCursor::CursorReferenceToPage(LazyPageRequest* page_request) { |
| DEBUG_ASSERT(CursorIsReference()); |
| |
| return owner()->ReplaceReferenceWithPageLocked(owner_cursor_, owner_offset_, page_request); |
| } |
| |
| zx_status_t VmCowPages::LookupCursor::ReadRequest(uint max_request_pages, |
| LazyPageRequest* page_request) { |
| // The owner must have a page_source_ to be doing a read request. |
| DEBUG_ASSERT(owner_->page_source_); |
| // The cursor should be explicitly empty as read requests are only for complete content absence. |
| DEBUG_ASSERT(CursorIsEmpty()); |
| DEBUG_ASSERT(!CursorIsInIntervalZero()); |
| // The total range requested should not be beyond the cursors valid range. |
| DEBUG_ASSERT(offset_ + PAGE_SIZE * max_request_pages <= end_offset_); |
| DEBUG_ASSERT(max_request_pages > 0); |
| |
| VmoDebugInfo vmo_debug_info{}; |
| // The page owner has a page source so it cannot be a hidden node, but the VmObjectPaged |
| // could have been destroyed. We could be looking up a page via a lookup in a child after |
| // the parent VmObjectPaged has gone away, so paged_ref_ could be null. Let the page source |
| // handle any failures requesting the pages. |
| if (owner()->paged_ref_) { |
| AssertHeld(owner()->paged_ref_->lock_ref()); |
| vmo_debug_info = {.vmo_ptr = reinterpret_cast<uintptr_t>(owner()->paged_ref_), |
| .vmo_id = owner()->paged_ref_->user_id_locked()}; |
| } |
| |
| // Try and batch more pages up to |max_request_pages|. |
| uint64_t request_size = static_cast<uint64_t>(max_request_pages) * PAGE_SIZE; |
| if (owner_ != target_) { |
| DEBUG_ASSERT(visible_end_ > offset_); |
| // Limit the request by the number of pages that are actually visible from the target_ to |
| // owner_ |
| request_size = ktl::min(request_size, visible_end_ - offset_); |
| } |
| // Limit |request_size| to the first page visible in the page owner to avoid requesting pages |
| // that are already present. If there is one page present in an otherwise long run of absent |
| // pages then it might be preferable to have one big page request, but for now only request |
| // absent pages. If already requesting a single page then we can avoid the page list operation. |
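| // For example (hypothetical numbers, for exposition only): for a 16 page request starting at |
| // owner_offset_, if the owner already has content 4 pages in, the callback below trims |
| // request_size to 4 pages so the page source is only asked for the absent run. |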
| if (request_size > PAGE_SIZE) { |
| owner()->page_list_.ForEveryPageInRange( |
| [&](const VmPageOrMarker* p, uint64_t offset) { |
| // Content should have been empty initially, so should not find anything at the start |
| // offset. |
| DEBUG_ASSERT(offset > owner_offset_); |
| // If this is an interval sentinel, it can only be a start or slot, since we know we |
| // started in a true gap outside of an interval. |
| DEBUG_ASSERT(!p->IsInterval() || p->IsIntervalSlot() || p->IsIntervalStart()); |
| const uint64_t new_size = offset - owner_offset_; |
| // Due to the limited range of the operation, the only way this callback ever fires is if |
| // the range is actually getting trimmed. |
| DEBUG_ASSERT(new_size < request_size); |
| request_size = new_size; |
| return ZX_ERR_STOP; |
| }, |
| owner_offset_, owner_offset_ + request_size); |
| } |
| DEBUG_ASSERT(request_size >= PAGE_SIZE); |
| |
| zx_status_t status = owner_->page_source_->GetPages(owner_offset_, request_size, |
| page_request->get(), vmo_debug_info); |
| // Pager page sources will never synchronously return a page. |
| DEBUG_ASSERT(status != ZX_OK); |
| return status; |
| } |
| |
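// Generates a dirty request for up to |max_request_pages| pages starting at the current offset.
// Unlike read requests this is performed directly against the target, which must also be the
// owner. Returns ZX_OK if at least one page could be dirtied without needing to wait.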
| zx_status_t VmCowPages::LookupCursor::DirtyRequest(uint max_request_pages, |
| LazyPageRequest* page_request) { |
| // Dirty requests, unlike read requests, happen directly against the target, and not the owner. |
| // This is because to make something dirty you must own it, i.e. target_ is already equal to |
| // owner_. |
| DEBUG_ASSERT(target_ == owner_); |
| DEBUG_ASSERT(target_->page_source_); |
| DEBUG_ASSERT(max_request_pages > 0); |
| DEBUG_ASSERT(offset_ + PAGE_SIZE * max_request_pages <= end_offset_); |
| |
| // As we know target_==owner_ there is no need to trim the requested range to any kind of visible |
| // range, so just attempt to dirty the entire range. |
| uint64_t dirty_len = 0; |
| zx_status_t status = target_->PrepareForWriteLocked(offset_, PAGE_SIZE * max_request_pages, |
| page_request, &dirty_len); |
| if (status == ZX_OK) { |
| // If success is claimed then it must be the case that at least one page was dirtied, allowing |
| // us to make progress. |
| DEBUG_ASSERT(dirty_len != 0 && dirty_len <= max_request_pages * PAGE_SIZE); |
| } else { |
| DEBUG_ASSERT(dirty_len == 0); |
| } |
| return status; |
| } |
| |
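// Returns the page at the current cursor position if it is immediately usable for the requested
// kind of access, otherwise nullptr. The cursor is advanced in either case.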
| vm_page_t* VmCowPages::LookupCursor::MaybePage(bool will_write) { |
| EstablishCursor(); |
| |
  // If the page is immediately usable, i.e. no dirty transitions etc. are needed, then we can
  // provide it. Otherwise just increment the cursor and return nullptr.
| vm_page_t* page = CursorIsUsablePage(will_write) ? owner_cursor_->Page() : nullptr; |
| |
| if (page && mark_accessed_) { |
| pmm_page_queues()->MarkAccessed(page); |
| } |
| |
| IncrementCursor(); |
| |
| return page; |
| } |
| |
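// Skips over the run of contiguous absent content starting at the current cursor position,
// returning the number of pages skipped. Returns 0 if any content, including a zero interval, is
// present at the cursor.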
| uint64_t VmCowPages::LookupCursor::SkipMissingPages() { |
| EstablishCursor(); |
| |
  // Check if the cursor is truly empty.
| if (!CursorIsEmpty() || CursorIsInIntervalZero()) { |
| return 0; |
| } |
| |
| uint64_t possibly_empty = visible_end_ - offset_; |
| // Limit possibly_empty by the first page visible in the owner which, since our cursor is empty, |
| // would also be the root vmo. |
| if (possibly_empty > PAGE_SIZE) { |
| owner()->page_list_.ForEveryPageInRange( |
| [&](const VmPageOrMarker* p, uint64_t offset) { |
          // Content should have been empty initially, so we should not find anything at the start
          // offset.
| DEBUG_ASSERT(offset > owner_offset_); |
| // If this is an interval sentinel, it can only be a start or slot, since we know we |
| // started in a true gap outside of an interval. |
| DEBUG_ASSERT(!p->IsInterval() || p->IsIntervalSlot() || p->IsIntervalStart()); |
| const uint64_t new_size = offset - owner_offset_; |
| // Due to the limited range of the operation, the only way this callback ever fires is if |
| // the range is actually getting trimmed. |
| DEBUG_ASSERT(new_size < possibly_empty); |
| possibly_empty = new_size; |
| return ZX_ERR_STOP; |
| }, |
| owner_offset_, owner_offset_ + possibly_empty); |
| } |
| // The cursor was empty, so we should have ended up with at least one page. |
| DEBUG_ASSERT(possibly_empty >= PAGE_SIZE); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(possibly_empty)); |
| DEBUG_ASSERT(possibly_empty + offset_ <= end_offset_); |
| IncrementOffsetAndInvalidateCursor(possibly_empty); |
| return possibly_empty / PAGE_SIZE; |
| } |
| |
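// Fills |paddrs| with the physical addresses of up to |max_pages| contiguously present pages at
// the cursor that are usable for the requested kind of access. Since the returned pages are not
// marked accessed, this returns 0 (even for present pages) unless access marking is disabled.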
| uint VmCowPages::LookupCursor::IfExistPages(bool will_write, uint max_pages, paddr_t* paddrs) { |
| // Ensure that the requested range is valid. |
| DEBUG_ASSERT(offset_ + PAGE_SIZE * max_pages <= end_offset_); |
| DEBUG_ASSERT(paddrs); |
| |
| EstablishCursor(); |
| |
| // We only return actual pages that are ready to use right now without any dirty transitions or |
| // copy-on-write or needing to mark them accessed. |
| if (!CursorIsUsablePage(will_write) || mark_accessed_) { |
| return 0; |
| } |
| |
| // Trim max pages to the visible length of the current owner. This only has an effect when |
| // target_ != owner_ as otherwise the visible_end_ is the same as end_offset_ and we already |
| // validated that we are within that range. |
| if (owner_ != target_) { |
| max_pages = ktl::min(max_pages, static_cast<uint>((visible_end_ - offset_) / PAGE_SIZE)); |
| } |
| DEBUG_ASSERT(max_pages > 0); |
| |
| // Take up to the max_pages as long as they exist contiguously. |
| uint pages = 0; |
| owner_pl_cursor_.ForEveryContiguous([&](VmPageOrMarkerRef page) { |
| if (page->IsPage()) { |
| paddrs[pages] = page->Page()->paddr(); |
| pages++; |
| return pages == max_pages ? ZX_ERR_STOP : ZX_ERR_NEXT; |
| } |
| return ZX_ERR_STOP; |
| }); |
| // Update the cursor to reflect the number of pages we found and are returning. |
| // We could check if cursor is still valid, but it's more efficient to just invalidate it and let |
| // any potential next page request recalculate it. |
| IncrementOffsetAndInvalidateCursor(pages * PAGE_SIZE); |
| return pages; |
| } |
| |
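// Returns a page at the current cursor position that is owned by the target, decompressing,
// copy-on-write forking, or allocating as needed. May generate read or dirty requests against a
// page source; a ZX_ERR_SHOULD_WAIT error indicates the caller should wait on |page_request| and
// retry.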
| zx::result<VmCowPages::LookupCursor::RequireResult> VmCowPages::LookupCursor::RequireOwnedPage( |
| bool will_write, uint max_request_pages, LazyPageRequest* page_request) { |
| DEBUG_ASSERT(page_request); |
| |
| // Make sure the cursor is valid. |
| EstablishCursor(); |
| |
| // Convert any references to pages. |
| if (CursorIsReference()) { |
| // Decompress in place. |
| zx_status_t status = CursorReferenceToPage(page_request); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| } |
| |
  // If the page exists in the target, i.e. the owner is the target, then we handle this case
  // separately as it's the only scenario where we might be dirtying an existing committed page.
| if (owner_ == target_ && CursorIsPage()) { |
| // If we're writing to a root VMO backed by a user pager, i.e. a VMO whose page source preserves |
| // page contents, we might need to mark pages Dirty so that they can be written back later. This |
| // is the only path that can result in a write to such a page; if the page was not present, we |
| // would have already blocked on a read request the first time, and ended up here when |
| // unblocked, at which point the page would be present. |
| if (will_write && target_preserving_page_content_) { |
| // If this page was loaned, it should be replaced with a non-loaned page, so that we can make |
| // progress with marking pages dirty. PrepareForWriteLocked terminates its page walk when it |
| // encounters a loaned page; loaned pages are reclaimed by evicting them and we cannot evict |
| // dirty pages. |
| if (owner_cursor_->Page()->is_loaned()) { |
| vm_page_t* res_page = nullptr; |
| DEBUG_ASSERT(is_page_clean(owner_cursor_->Page())); |
| zx_status_t status = target_->ReplacePageLocked( |
| owner_cursor_->Page(), offset_, /*with_loaned=*/false, &res_page, page_request); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| // Cursor should remain valid and have been replaced with the page. |
| DEBUG_ASSERT(CursorIsPage()); |
| DEBUG_ASSERT(owner_cursor_->Page() == res_page); |
| DEBUG_ASSERT(!owner_cursor_->Page()->is_loaned()); |
| } |
      // If the page is not already dirty, then generate a dirty request. The dirty request code
      // can handle the page already being dirty; this is just a short-circuit optimization.
| if (!is_page_dirty(owner_cursor_->Page())) { |
| zx_status_t status = DirtyRequest(max_request_pages, page_request); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| } |
| } |
| // Return the page. |
| return zx::ok(CursorAsResult()); |
| } |
| |
  // Should there be a page that is not owned by the target, then we are performing copy-on-write
  // into the target. As the target cannot have a page source, we do not need to worry about
  // writes or dirtying.
| if (CursorIsPage()) { |
| DEBUG_ASSERT(owner_ != target_); |
| vm_page_t* res_page = nullptr; |
| // Although we are not returning the page, the act of forking counts as an access, and this is |
| // an access regardless of whether the final returned page should be considered accessed, so |
| // ignore the mark_accessed_ check here. |
| pmm_page_queues()->MarkAccessed(owner_cursor_->Page()); |
| if (!owner()->is_hidden_locked()) { |
| // Directly copying the page from the owner into the target. |
| return TargetAllocateCopyPageAsResult(owner_cursor_->Page(), DirtyState::Untracked, |
| page_request); |
| } |
| zx_status_t result = |
| target_->CloneCowPageLocked(offset_, alloc_list_, owner_, owner_cursor_->Page(), |
| owner_offset_, page_request, &res_page); |
| if (result != ZX_OK) { |
| return zx::error(result); |
| } |
| target_->IncrementHierarchyGenerationCountLocked(); |
| // Cloning the cow page may have impacted our cursor due to a split page being moved so |
| // invalidate the cursor to perform a fresh lookup on the next page requested. |
| IncrementOffsetAndInvalidateCursor(PAGE_SIZE); |
    // This page was just allocated so there is no need to worry about updating access times; we
    // can just return.
| return zx::ok(RequireResult{res_page, true}); |
| } |
| |
  // Zero content is the most complicated case: even if reading, dirty requests might need to be
  // performed and the resulting committed pages may or may not be dirty.
| if (CursorIsContentZero()) { |
| // If the page source is preserving content (is a PagerProxy), and is configured to trap dirty |
| // transitions, we first need to generate a DIRTY request *before* the zero page can be forked |
| // and marked dirty. If dirty transitions are not trapped, we will fall through to allocate the |
| // page and then mark it dirty below. |
| // |
| // Note that the check for ShouldTrapDirtyTransitions() is an optimization here. |
| // PrepareForWriteLocked() would do the right thing depending on ShouldTrapDirtyTransitions(), |
| // however we choose to avoid the extra work only to have it be a no-op if dirty transitions |
| // should not be trapped. |
| const bool target_page_dirty = TargetZeroContentSupplyDirty(will_write); |
| if (target_page_dirty && target_->page_source_->ShouldTrapDirtyTransitions()) { |
| zx_status_t status = DirtyRequest(max_request_pages, page_request); |
| // Since we know we have a page source that traps, and page sources will never succeed |
| // synchronously, our dirty request must have 'failed'. |
| DEBUG_ASSERT(status != ZX_OK); |
| return zx::error(status); |
| } |
| // Allocate the page and mark it dirty or clean as previously determined. |
| return TargetAllocateCopyPageAsResult(vm_get_zero_page(), |
| target_page_dirty ? DirtyState::Dirty : DirtyState::Clean, |
| page_request); |
| } |
| DEBUG_ASSERT(CursorIsEmpty()); |
| |
| // Generate a read request to populate the content in the owner. Even if this is a write, we still |
| // populate content first, then perform any dirty transitions / requests. |
| return zx::error(ReadRequest(max_request_pages, page_request)); |
| } |
| |
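// Returns a page at the current cursor position suitable for reading only. Unlike
// RequireOwnedPage this never commits a new page in the target: zero content is served with the
// shared zero page, and absent content generates a read request.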
| zx::result<VmCowPages::LookupCursor::RequireResult> VmCowPages::LookupCursor::RequireReadPage( |
| uint max_request_pages, LazyPageRequest* page_request) { |
| DEBUG_ASSERT(page_request); |
| |
| // Make sure the cursor is valid. |
| EstablishCursor(); |
| |
| // If there's a page or reference, return it. |
| if (CursorIsPage() || CursorIsReference()) { |
| if (CursorIsReference()) { |
| zx_status_t status = CursorReferenceToPage(page_request); |
| if (status != ZX_OK) { |
| return zx::error(status); |
| } |
| DEBUG_ASSERT(CursorIsPage()); |
| } |
| return zx::ok(CursorAsResult()); |
| } |
| |
  // Zero content can be served directly with the shared zero page.
| if (CursorIsContentZero()) { |
| IncrementCursor(); |
| return zx::ok(RequireResult{vm_get_zero_page(), false}); |
| } |
| |
| // No available content, need to fetch it from the page source. ReadRequest performs all the |
| // requisite asserts to ensure we are not doing this mistakenly. |
| return zx::error(ReadRequest(max_request_pages, page_request)); |
| } |
| |
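// Constructs a LookupCursor over the range [offset, offset + max_len), validating the range,
// forwarding slices to their parent, and refusing to return pages from a discarded VMO.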
| zx::result<VmCowPages::LookupCursor> VmCowPages::GetLookupCursorLocked(uint64_t offset, |
| uint64_t max_len) { |
| canary_.Assert(); |
| DEBUG_ASSERT(!is_hidden_locked()); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset) && max_len > 0 && IS_PAGE_ALIGNED(max_len)); |
| DEBUG_ASSERT(life_cycle_ == LifeCycle::Alive); |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| |
| if (unlikely(offset >= size_ || !InRange(offset, max_len, size_))) { |
| return zx::error{ZX_ERR_OUT_OF_RANGE}; |
| } |
| |
| if (discardable_tracker_) { |
| discardable_tracker_->assert_cow_pages_locked(); |
| // This vmo was discarded and has not been locked yet after the discard. Do not return any |
| // pages. |
| if (discardable_tracker_->WasDiscardedLocked()) { |
| return zx::error{ZX_ERR_NOT_FOUND}; |
| } |
| } |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().GetLookupCursorLocked(offset + parent_offset_, max_len); |
| } |
| |
| return zx::ok(LookupCursor(this, offset, max_len)); |
| } |
| |
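// Commits pages in the range [offset, offset + len), preallocating from the PMM when no root page
// source exists, and otherwise faulting pages in one at a time via the lookup cursor. Reports the
// length of the prefix of the range that was successfully committed via |committed_len|.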
| zx_status_t VmCowPages::CommitRangeLocked(uint64_t offset, uint64_t len, uint64_t* committed_len, |
| LazyPageRequest* page_request) { |
| canary_.Assert(); |
| LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().CommitRangeLocked(offset + parent_offset_, len, committed_len, |
| page_request); |
| } |
| |
| fbl::RefPtr<PageSource> root_source = GetRootPageSourceLocked(); |
| |
| // If this vmo has a direct page source, then the source will provide the backing memory. For |
| // children that eventually depend on a page source, we skip preallocating memory to avoid |
| // potentially overallocating pages if something else touches the vmo while we're blocked on the |
| // request. Otherwise we optimize things by preallocating all the pages. |
| list_node page_list; |
| list_initialize(&page_list); |
| if (root_source == nullptr) { |
| // make a pass through the list to find out how many pages we need to allocate |
| size_t count = len / PAGE_SIZE; |
| page_list_.ForEveryPageInRange( |
| [&count](const auto* p, auto off) { |
| if (p->IsPage()) { |
| count--; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| |
| if (count == 0) { |
| *committed_len = len; |
| return ZX_OK; |
| } |
| |
| zx_status_t status = pmm_alloc_pages(count, pmm_alloc_flags_, &page_list); |
| // Ignore ZX_ERR_SHOULD_WAIT since the loop below will fall back to a page by page allocation, |
| // allowing us to wait for single pages should we need to. |
| if (status != ZX_OK && status != ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| } |
| |
| auto list_cleanup = fit::defer([&page_list, this]() { |
| if (!list_is_empty(&page_list)) { |
| AssertHeld(lock_ref()); |
| // We are freeing pages we got from the PMM and did not end up using, so we do not own them. |
| FreePagesLocked(&page_list, /*freeing_owned_pages=*/false); |
| } |
| }); |
| |
| const uint64_t start_offset = offset; |
| const uint64_t end = offset + len; |
| __UNINITIALIZED auto cursor = GetLookupCursorLocked(start_offset, len); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| // Commit represents an explicit desire to have pages and should not be deduped back to the zero |
| // page. |
| cursor->DisableZeroFork(); |
| cursor->GiveAllocList(&page_list); |
| |
| zx_status_t status = ZX_OK; |
| while (offset < end) { |
| __UNINITIALIZED zx::result<VmCowPages::LookupCursor::RequireResult> result = |
| cursor->RequireOwnedPage(false, static_cast<uint>((end - offset) / PAGE_SIZE), |
| page_request); |
| |
| if (result.is_error()) { |
| status = result.error_value(); |
| break; |
| } |
| offset += PAGE_SIZE; |
| } |
| // Record how much we were able to process. |
| *committed_len = offset - start_offset; |
| |
| // Clear the alloc list from the cursor and let list_cleanup free any remaining pages. |
| cursor->ClearAllocList(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return status; |
| } |
| |
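// Pins every page in the range [offset, offset + len), which must already be committed. Any gap
// or non-page slot fails with ZX_ERR_BAD_STATE, and on failure any pages pinned so far are
// unpinned again.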
| zx_status_t VmCowPages::PinRangeLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().PinRangeLocked(offset + parent_offset_, len); |
| } |
| |
| ever_pinned_ = true; |
| |
| // Tracks our expected page offset when iterating to ensure all pages are present. |
| uint64_t next_offset = offset; |
| |
| // Should any errors occur we need to unpin everything. |
| auto pin_cleanup = fit::defer([this, offset, &next_offset]() { |
| if (next_offset > offset) { |
| AssertHeld(*lock()); |
| UnpinLocked(offset, next_offset - offset, /*allow_gaps=*/false); |
| } |
| }); |
| |
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [this, &next_offset](const VmPageOrMarker* p, uint64_t page_offset) { |
| AssertHeld(lock_ref()); |
| if (page_offset != next_offset || !p->IsPage()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(!page->is_loaned()); |
| |
| if (page->object.pin_count == VM_PAGE_OBJECT_MAX_PIN_COUNT) { |
| return ZX_ERR_UNAVAILABLE; |
| } |
| |
| page->object.pin_count++; |
| if (page->object.pin_count == 1) { |
| MoveToPinnedLocked(page, page_offset); |
| } |
| |
        // Pinning every page in the largest vmo possible as many times as possible can't overflow.
| static_assert(VmPageList::MAX_SIZE / PAGE_SIZE < UINT64_MAX / VM_PAGE_OBJECT_MAX_PIN_COUNT); |
| next_offset += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| |
| const uint64_t actual = (next_offset - offset) / PAGE_SIZE; |
| // Count whatever pages we pinned, in the failure scenario this will get decremented on the unpin. |
| pinned_page_count_ += actual; |
| |
| if (status == ZX_OK) { |
| // If the missing pages were at the end of the range (or the range was empty) then our iteration |
| // will have just returned ZX_OK. Perform one final check that we actually pinned the number of |
| // pages we expected to. |
| const uint64_t expected = len / PAGE_SIZE; |
| if (actual != expected) { |
| status = ZX_ERR_BAD_STATE; |
| } else { |
| pin_cleanup.cancel(); |
| } |
| } |
| return status; |
| } |
| |
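// Decommits the range [offset, offset + len), releasing committed pages back to the PMM. Only
// supported when the absence of a page implies zeroes, i.e. there is no parent and no page source
// that preserves content.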
| zx_status_t VmCowPages::DecommitRangeLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| |
| // Validate the size and perform our zero-length hot-path check before we recurse |
| // up to our top-level ancestor. Size bounding needs to take place relative |
| // to the child the operation was originally targeted against. |
| if (!InRange(offset, len, size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // was in range, just zero length |
| if (len == 0) { |
| return ZX_OK; |
| } |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().DecommitRangeLocked(offset + parent_offset_, len); |
| } |
| |
| // Currently, we can't decommit if the absence of a page doesn't imply zeroes. |
| if (parent_ || is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // VmObjectPaged::DecommitRange() rejects is_contiguous() VMOs (for now). |
| DEBUG_ASSERT(can_decommit()); |
| |
| // Demand offset and length be correctly aligned to not give surprising user semantics. |
| if (!IS_PAGE_ALIGNED(offset) || !IS_PAGE_ALIGNED(len)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| zx_status_t status = UnmapAndRemovePagesLocked(offset, len, &freed_list); |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // We were successfully able to remove pages. Increment the gen count. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/true); |
| |
| return status; |
| } |
| |
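// Unmaps the range [offset, offset + len) from all mapping regions and removes the pages from
// this VMO's page list, accumulating them on |freed_list| for the caller to free. Fails with
// ZX_ERR_BAD_STATE if any page in the range is pinned.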
| zx_status_t VmCowPages::UnmapAndRemovePagesLocked(uint64_t offset, uint64_t len, |
| list_node_t* freed_list, |
| uint64_t* pages_freed_out) { |
| canary_.Assert(); |
| |
| if (AnyPagesPinnedLocked(offset, len)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| LTRACEF("start offset %#" PRIx64 ", end %#" PRIx64 "\n", offset, offset + len); |
| |
| // We've already trimmed the range in DecommitRangeLocked(). |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| // Verify page alignment. |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len) || (offset + len == size_)); |
| |
| // DecommitRangeLocked() will call this function only on a VMO with no parent. The only clone |
| // types that support OP_DECOMMIT are slices, for which we will recurse up to the root. |
| DEBUG_ASSERT(!parent_); |
| |
| // unmap all of the pages in this range on all the mapping regions |
| RangeChangeUpdateLocked(offset, len, RangeChangeOp::Unmap); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(freed_list); |
| |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), offset, offset + len); |
| page_remover.Flush(); |
| |
| if (pages_freed_out) { |
| *pages_freed_out = page_remover.freed_count(); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
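// Returns true if reading at |page_offset| would currently produce zeros, i.e. there is a marker,
// a zero interval under a content-preserving page source, or no content anywhere in the hierarchy
// and no root page source to supply it.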
| bool VmCowPages::PageWouldReadZeroLocked(uint64_t page_offset) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_offset)); |
| DEBUG_ASSERT(page_offset < size_); |
| const VmPageOrMarker* slot = page_list_.Lookup(page_offset); |
| if (slot && slot->IsMarker()) { |
| // This is already considered zero as there's a marker. |
| return true; |
| } |
| if (is_source_preserving_page_content() && |
| ((slot && slot->IsIntervalZero()) || page_list_.IsOffsetInZeroInterval(page_offset))) { |
| // Pages in zero intervals are supplied as zero by the kernel. |
| return true; |
| } |
| // If we don't have a page or reference here we need to check our parent. |
| if (!slot || !slot->IsPageOrRef()) { |
| VmCowPages* page_owner; |
| uint64_t owner_offset; |
| if (!FindInitialPageContentLocked(page_offset, &page_owner, &owner_offset, nullptr).current()) { |
| // Parent doesn't have a page either, so would also read as zero, assuming no page source. |
| return GetRootPageSourceLocked() == nullptr; |
| } |
| } |
  // There is content either locally or in our parent; assume it is non-zero and return false.
| return false; |
| } |
| |
| zx_status_t VmCowPages::ZeroPagesLocked(uint64_t page_start_base, uint64_t page_end_base, |
| LazyPageRequest* page_request, uint64_t* zeroed_len_out) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(page_start_base <= page_end_base); |
| DEBUG_ASSERT(page_end_base <= size_); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_start_base)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_end_base)); |
| ASSERT(zeroed_len_out); |
| |
| // Forward any operations on slices up to the original non slice parent. |
| if (is_slice_locked()) { |
| return slice_parent_locked().ZeroPagesLocked(page_start_base + parent_offset_, |
| page_end_base + parent_offset_, page_request, |
| zeroed_len_out); |
| } |
| |
| // This function tries to zero pages as optimally as possible for most cases, so we attempt |
| // increasingly expensive actions only if certain preconditions do not allow us to perform the |
| // cheaper action. Broadly speaking, the sequence of actions that are attempted are as follows. |
| // 1) Try to decommit the entire range at once if the VMO allows it. |
| // 2) Otherwise, try to decommit each page if the VMO allows it and doing so doesn't expose |
| // content in the parent (if any) that shouldn't be visible. |
| // 3) Otherwise, if this is a child VMO and there is no committed page yet, allocate a zero page. |
| // 4) Otherwise, look up the page, faulting it in if necessary, and zero the page. If the page |
| // source needs to supply or dirty track the page, a page request is initialized and we return |
| // early with ZX_ERR_SHOULD_WAIT. The caller is expected to wait on the page request, and then |
| // retry. On the retry, we should be able to look up the page successfully and zero it. |
| |
  // First try to do the more efficient decommit. We prefer decommit as it performs work in the
  // order of the number of committed pages, instead of in the order of the size of the range. An
| // error from DecommitRangeLocked indicates that the VMO is not of a form that decommit can safely |
| // be performed without exposing data that we shouldn't between children and parents, but no |
| // actual state will have been changed. Should decommit succeed we are done, otherwise we will |
| // have to handle each offset individually. |
| // |
| // Zeroing doesn't decommit pages of contiguous VMOs. |
| if (can_decommit_zero_pages_locked()) { |
| zx_status_t status = DecommitRangeLocked(page_start_base, page_end_base - page_start_base); |
| if (status == ZX_OK) { |
| *zeroed_len_out = page_end_base - page_start_base; |
| return ZX_OK; |
| } |
| |
    // Unmap any page that is touched by this range in any of our, or our children's, mapping
| // regions. We do this on the assumption we are going to be able to free pages either completely |
| // or by turning them into markers and it's more efficient to unmap once in bulk here. |
| RangeChangeUpdateLocked(page_start_base, page_end_base - page_start_base, RangeChangeOp::Unmap); |
| } |
| |
| // Increment the gen count early as it's possible to fail part way through and this function |
| // doesn't unroll its actions. If we were able to successfully decommit pages above, |
| // DecommitRangeLocked would have incremented the gen count already, so we can do this after the |
| // decommit attempt. |
| // |
| // Zeroing pages of a contiguous VMO doesn't commit or decommit any pages currently, but we |
| // increment the generation count anyway in case that changes in future, and to keep the tests |
| // more consistent. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| // We stack-own loaned pages from when they're removed until they're freed. |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| // Pages removed from this object are put into freed_list, while pages removed from any ancestor |
| // are put into ancestor_freed_list. This is so that freeing of both the lists can be handled |
| // correctly, by passing the correct value for freeing_owned_pages in the call to |
| // FreePagesLocked(). |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| list_node_t ancestor_freed_list; |
| list_initialize(&ancestor_freed_list); |
| |
| // See also free_any_pages below, which intentionally frees incrementally. |
| auto auto_free = fit::defer([this, &freed_list, &ancestor_freed_list]() { |
| AssertHeld(lock_ref()); |
| if (!list_is_empty(&freed_list)) { |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/true); |
| } |
| if (!list_is_empty(&ancestor_freed_list)) { |
| FreePagesLocked(&ancestor_freed_list, /*freeing_owned_pages=*/false); |
| } |
| }); |
| |
| // Ideally we just collect up pages and hand them over to the pmm all at the end, but if we need |
| // to allocate any pages then we would like to ensure that we do not cause total memory to peak |
| // higher due to squirreling these pages away. |
| auto free_any_pages = [this, &freed_list, &ancestor_freed_list] { |
| AssertHeld(lock_ref()); |
| if (!list_is_empty(&freed_list)) { |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/true); |
| } |
| if (!list_is_empty(&ancestor_freed_list)) { |
| FreePagesLocked(&ancestor_freed_list, /*freeing_owned_pages=*/false); |
| } |
| }; |
| |
| // Give us easier names for our range. |
| const uint64_t start = page_start_base; |
| const uint64_t end = page_end_base; |
| |
| // If the VMO is directly backed by a page source that preserves content, it should be the root |
| // VMO of the hierarchy. |
| DEBUG_ASSERT(!is_source_preserving_page_content() || !parent_); |
| |
| // If the page source preserves content, we can perform efficient zeroing by inserting dirty zero |
| // intervals. Handle this case separately. |
| if (is_source_preserving_page_content()) { |
| // Inserting zero intervals can modify the page list such that new nodes are added and deleted. |
| // So we cannot safely insert zero intervals while iterating the page list. The pattern we |
| // follow here is: |
| // 1. Traverse the page list to find a range that can be represented by a zero interval instead. |
| // 2. When such a range is found, break out of the traversal, and insert the zero interval. |
| // 3. Advance past the zero interval we inserted and resume the traversal from there, until |
| // we've covered the entire range. |
| |
| // The start offset at which to start the next traversal loop. |
| uint64_t next_start_offset = start; |
| do { |
| // Zeroing a zero interval is a no-op. Track whether we find ourselves in a zero interval. |
| bool in_interval = false; |
| // The start of the zero interval if we are in one. |
| uint64_t interval_start = next_start_offset; |
| const uint64_t prev_start_offset = next_start_offset; |
| // State tracking information for inserting a new zero interval. |
| struct { |
| bool add_zero_interval; |
| uint64_t start; |
| uint64_t end; |
| bool replace_page; |
| } state = {.add_zero_interval = false, .start = 0, .end = 0, .replace_page = false}; |
| |
| zx_status_t status = page_list_.RemovePagesAndIterateGaps( |
| [&](VmPageOrMarker* p, uint64_t off) { |
| // We cannot have references in pager-backed VMOs. |
| DEBUG_ASSERT(!p->IsReference()); |
| |
| // If this is a page, see if we can remove it and absorb it into a zero interval. |
| if (p->IsPage()) { |
| AssertHeld(lock_ref()); |
| if (p->Page()->object.pin_count > 0) { |
| // Cannot remove this page if it is pinned. Lookup the page and zero it. Looking up |
| // ensures that we request dirty transition if needed by the pager. |
| LookupCursor cursor(this, off, PAGE_SIZE); |
| AssertHeld(cursor.lock_ref()); |
| zx::result<LookupCursor::RequireResult> result = |
| cursor.RequireOwnedPage(true, 1, page_request); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| DEBUG_ASSERT(result->page == p->Page()); |
| // Zero the page we looked up. |
| ZeroPage(result->page->paddr()); |
| *zeroed_len_out += PAGE_SIZE; |
| next_start_offset = off + PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| // Break out of the traversal. We can release the page and add a zero interval |
| // instead. |
| state = {.add_zero_interval = true, |
| .start = off, |
| .end = off + PAGE_SIZE, |
| .replace_page = true}; |
| return ZX_ERR_STOP; |
| } |
| |
| // Otherwise this is a marker or zero interval, in which case we already have zeroes. |
| DEBUG_ASSERT(p->IsMarker() || p->IsIntervalZero()); |
| if (p->IsIntervalStart()) { |
| // Track the interval start so we know how much to add to zeroed_len_out later. |
| interval_start = off; |
| in_interval = true; |
| } else if (p->IsIntervalEnd()) { |
| // Add the range from interval start to end. |
| *zeroed_len_out += (off + PAGE_SIZE - interval_start); |
| in_interval = false; |
| } else { |
| // This is either a single interval slot or a marker. |
| *zeroed_len_out += PAGE_SIZE; |
| } |
| next_start_offset = off + PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [&](uint64_t gap_start, uint64_t gap_end) { |
| AssertHeld(lock_ref()); |
| // This gap will be replaced with a zero interval. Invalidate any read requests in this |
| // range. |
| InvalidateReadRequestsLocked(gap_start, gap_end - gap_start); |
| // We have found a new zero interval to insert. Break out of the traversal. |
| state = {.add_zero_interval = true, |
| .start = gap_start, |
| .end = gap_end, |
| .replace_page = false}; |
| return ZX_ERR_STOP; |
| }, |
| next_start_offset, end); |
| // Bubble up any errors from LookupCursor. |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // Add any new zero interval. |
| if (state.add_zero_interval) { |
| if (state.replace_page) { |
| DEBUG_ASSERT(state.start + PAGE_SIZE == state.end); |
| vm_page_t* page = page_list_.ReplacePageWithZeroInterval( |
| state.start, VmPageOrMarker::IntervalDirtyState::Dirty); |
| DEBUG_ASSERT(page->object.pin_count == 0); |
| pmm_page_queues()->Remove(page); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(&freed_list, &page->queue_node); |
| } else { |
| status = page_list_.AddZeroInterval(state.start, state.end, |
| VmPageOrMarker::IntervalDirtyState::Dirty); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| } |
| *zeroed_len_out += (state.end - state.start); |
| next_start_offset = state.end; |
| } |
| |
| // Handle the last partial interval. Or the case where we did not advance next_start_offset at |
| // all, which can only happen if the range fell entirely inside an interval. |
| if (in_interval || next_start_offset == prev_start_offset) { |
| // If the range fell entirely inside an interval, verify that it was indeed a zero interval. |
| DEBUG_ASSERT(next_start_offset != prev_start_offset || |
| page_list_.IsOffsetInZeroInterval(next_start_offset)); |
| *zeroed_len_out += (end - interval_start); |
| next_start_offset = end; |
| } |
| } while (next_start_offset < end); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| // We've already handled this case above and returned early. |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| |
| // If we're zeroing at the end of our parent range we can update to reflect this similar to a |
| // resize. This does not work if we are a slice, but we checked for that earlier. Whilst this does |
| // not actually zero the range in question, it makes future zeroing of the range far more |
| // efficient, which is why we do it first. |
| if (start < parent_limit_ && end >= parent_limit_) { |
| bool hidden_parent = false; |
| if (parent_) { |
| hidden_parent = parent_locked().is_hidden_locked(); |
| } |
| if (hidden_parent) { |
| // Release any COW pages that are no longer necessary. This will also |
| // update the parent limit. |
| __UNINITIALIZED BatchPQRemove page_remover(&ancestor_freed_list); |
| ReleaseCowParentPagesLocked(start, parent_limit_, &page_remover); |
| page_remover.Flush(); |
| } else { |
| parent_limit_ = start; |
| } |
| } |
| |
  // Helper lambda to determine if this VMO can see parent contents at |offset|, or over the whole
  // range [offset, offset + length) if a length is specified as well.
| auto can_see_parent = [this](uint64_t offset, uint64_t length = PAGE_SIZE) TA_REQ(lock()) { |
| if (!parent_) { |
| return false; |
| } |
| return offset < parent_limit_ && offset + length <= parent_limit_; |
| }; |
| |
| // This is a lambda as it only makes sense to talk about parent mutability when we have a parent |
| // for the offset being considered. |
| auto parent_immutable = [can_see_parent, this](uint64_t offset) TA_REQ(lock()) { |
| // TODO(johngro): remove this explicit unused-capture warning suppression |
| // when https://bugs.llvm.org/show_bug.cgi?id=35450 gets fixed. |
| (void)can_see_parent; // used only in DEBUG_ASSERT |
| DEBUG_ASSERT(can_see_parent(offset)); |
| return parent_locked().is_hidden_locked(); |
| }; |
| |
| // Finding the initial page content is expensive, but we only need to call it under certain |
| // circumstances scattered in the code below. The lambda get_initial_page_content() will lazily |
| // fetch and cache the details. This avoids us calling it when we don't need to, or calling it |
| // more than once. |
| struct InitialPageContent { |
| bool inited = false; |
| VmCowPages* page_owner; |
| uint64_t owner_offset; |
| uint64_t cached_offset; |
| VmPageOrMarkerRef page_or_marker; |
| } initial_content_; |
| auto get_initial_page_content = [&initial_content_, can_see_parent, this](uint64_t offset) |
| TA_REQ(lock()) -> const InitialPageContent& { |
| // TODO(johngro): remove this explicit unused-capture warning suppression |
| // when https://bugs.llvm.org/show_bug.cgi?id=35450 gets fixed. |
| (void)can_see_parent; // used only in DEBUG_ASSERT |
| |
| // If there is no cached page content or if we're looking up a different offset from the cached |
| // one, perform the lookup. |
| if (!initial_content_.inited || offset != initial_content_.cached_offset) { |
| DEBUG_ASSERT(can_see_parent(offset)); |
| VmPageOrMarkerRef page_or_marker = |
| FindInitialPageContentLocked(offset, &initial_content_.page_owner, |
| &initial_content_.owner_offset, nullptr) |
| .current(); |
| // We only care about the parent having a 'true' vm_page for content. If the parent has a |
| // marker then it's as if the parent has no content since that's a zero page anyway, which is |
| // what we are trying to achieve. |
| initial_content_.page_or_marker = page_or_marker; |
| initial_content_.inited = true; |
| initial_content_.cached_offset = offset; |
| } |
| DEBUG_ASSERT(offset == initial_content_.cached_offset); |
| return initial_content_; |
| }; |
| |
| // Helper lambda to determine if parent has content at the specified offset. |
| auto parent_has_content = [get_initial_page_content](uint64_t offset) TA_REQ(lock()) { |
| const VmPageOrMarkerRef& page_or_marker = get_initial_page_content(offset).page_or_marker; |
| return page_or_marker && page_or_marker->IsPageOrRef(); |
| }; |
| |
| // In the ideal case we can zero by making there be an Empty slot in our page list. This is true |
| // when we're not specifically avoiding decommit on zero and there is nothing pinned. |
| // |
| // Note that this lambda is only checking for pre-conditions in *this* VMO which allow us to |
| // represent zeros with an empty slot. We will combine this check with additional checks for |
| // contents visible through the parent, if applicable. |
| auto can_decommit_slot = [this](const VmPageOrMarker* slot, uint64_t offset) TA_REQ(lock()) { |
| if (!can_decommit_zero_pages_locked() || |
| (slot && slot->IsPage() && slot->Page()->object.pin_count > 0)) { |
| return false; |
| } |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| return true; |
| }; |
| |
| // Like can_decommit_slot but for a range. |
| auto can_decommit_slots_in_range = [this](uint64_t offset, uint64_t length) TA_REQ(lock()) { |
| if (!can_decommit_zero_pages_locked() || AnyPagesPinnedLocked(offset, length)) { |
| return false; |
| } |
| DEBUG_ASSERT(!is_source_preserving_page_content()); |
| return true; |
| }; |
| |
| // Helper lambda to zero the slot at offset either by inserting a marker or by zeroing the actual |
| // page as applicable. The return codes match those expected for VmPageList traversal. |
| auto zero_slot = [&](VmPageOrMarker* slot, uint64_t offset) TA_REQ(lock()) { |
| // Ideally we will use a marker, but we can only do this if we can point to a committed page |
| // to justify the allocation of the marker (i.e. we cannot allocate infinite markers with no |
| // committed pages). A committed page in this case exists if the parent has any content. |
| // Otherwise, we'll need to zero an actual page. |
| if (!can_decommit_slot(slot, offset) || !parent_has_content(offset)) { |
| // We might allocate a new page below. Free any pages we've accumulated first. |
| free_any_pages(); |
| |
| // If we're here because of !parent_has_content() and slot doesn't have a page, we can simply |
| // allocate a zero page to replace the empty slot. Otherwise, we'll have to look up the page |
| // and zero it. |
| // |
| // We could technically fall through to GetLookupCursorLocked even for an empty slot and let |
| // RequirePage allocate a new page and zero it, but we want to avoid having to redundantly |
| // zero a newly forked zero page. |
| if (!slot && can_see_parent(offset) && !parent_has_content(offset)) { |
| // We could only have ended up here if the parent was mutable or if there is a pager-backed |
| // root, otherwise we should have been able to treat an empty slot as zero (decommit a |
| // committed page) and return early above. |
| DEBUG_ASSERT(!parent_immutable(offset) || is_root_source_user_pager_backed_locked()); |
| // We will try to insert a new zero page below. Note that at this point we know that this is |
| // not a contiguous VMO (which cannot have arbitrary zero pages inserted into it). We |
| // checked for can_see_parent just now and contiguous VMOs do not support (non-slice) |
| // clones. Besides, if the slot was empty we should have moved on when we found the gap in |
| // the page list traversal as the contiguous page source zeroes supplied pages by default. |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| |
| // Allocate a new page, it will be zeroed in the process. |
| vm_page_t* p; |
| // Do not pass our freed_list here as this takes an |alloc_list| list to allocate from. |
| zx_status_t status = AllocateCopyPage(vm_get_zero_page_paddr(), nullptr, page_request, &p); |
| if (status != ZX_OK) { |
| return status; |
| } |
| VmPageOrMarker new_page = VmPageOrMarker::Page(p); |
| status = AddPageLocked(&new_page, offset, CanOverwriteContent::Zero, nullptr, |
| /*do_range_update=*/false); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY. |
| if (status == ZX_ERR_NO_MEMORY) { |
| return status; |
| } |
| DEBUG_ASSERT(status == ZX_OK); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Lookup the page which will potentially fault it in via the page source. Zeroing is |
| // equivalent to a VMO write with zeros, so simulate a write fault. |
| zx::result<VmCowPages::LookupCursor> cursor = GetLookupCursorLocked(offset, PAGE_SIZE); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| auto result = cursor->RequirePage(true, 1, page_request); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| ZeroPage(result->page->paddr()); |
| return ZX_ERR_NEXT; |
| } |
| |
| DEBUG_ASSERT(parent_ && parent_has_content(offset)); |
| // Validate we can insert our own pages/content. |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| |
| // We are able to insert a marker, but if our page content is from a hidden owner we need to |
| // perform slightly more complex cow forking. |
| const InitialPageContent& content = get_initial_page_content(offset); |
| AssertHeld(content.page_owner->lock_ref()); |
| if (!slot && content.page_owner->is_hidden_locked()) { |
| free_any_pages(); |
| // TODO(https://fxbug.dev/42138396): This could be more optimal since unlike a regular cow |
| // clone, we are not going to actually need to read the target page we are cloning, and hence |
| // it does not actually need to get converted. |
| if (content.page_or_marker->IsReference()) { |
| zx_status_t result = content.page_owner->ReplaceReferenceWithPageLocked( |
| content.page_or_marker, content.owner_offset, page_request); |
| if (result != ZX_OK) { |
| return result; |
| } |
| } |
| zx_status_t result = CloneCowPageAsZeroLocked( |
| offset, &ancestor_freed_list, content.page_owner, content.page_or_marker->Page(), |
| content.owner_offset, page_request); |
| if (result != ZX_OK) { |
| return result; |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| // Remove any page that could be hanging around in the slot and replace it with a marker. |
| VmPageOrMarker new_marker = VmPageOrMarker::Marker(); |
| VmPageOrMarker released_page; |
| zx_status_t status = AddPageLocked(&new_marker, offset, CanOverwriteContent::NonZero, |
| &released_page, /*do_range_update=*/false); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY. |
| if (status == ZX_ERR_NO_MEMORY) { |
| return status; |
| } |
| DEBUG_ASSERT(status == ZX_OK); |
| // Free the old page. |
| if (released_page.IsPage()) { |
| vm_page_t* page = released_page.ReleasePage(); |
| DEBUG_ASSERT(page->object.pin_count == 0); |
| pmm_page_queues()->Remove(page); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(&freed_list, &page->queue_node); |
| } else if (released_page.IsReference()) { |
| FreeReference(released_page.ReleaseReference()); |
| } |
| return ZX_ERR_NEXT; |
| }; |
| |
| *zeroed_len_out = 0; |
| // Main page list traversal loop to remove any existing pages / markers, zero existing pages, and |
| // also insert any new markers / zero pages in gaps as applicable. We use the VmPageList traversal |
| // helper here instead of iterating over each offset in the range so we can efficiently skip over |
| // gaps if possible. |
| zx_status_t status = page_list_.RemovePagesAndIterateGaps( |
| [&](VmPageOrMarker* slot, uint64_t offset) { |
| AssertHeld(lock_ref()); |
| |
| // We don't expect intervals in non pager-backed VMOs. |
| DEBUG_ASSERT(!slot->IsInterval()); |
| |
| // Contiguous VMOs cannot have markers. |
| DEBUG_ASSERT(!direct_source_supplies_zero_pages() || !slot->IsMarker()); |
| |
| // First see if we can simply get done with an empty slot in the page list. This VMO should |
| // allow decommitting a page at this offset when zeroing. Additionally, one of the following |
        // conditions should hold w.r.t. the parent:
| // * This offset does not relate to our parent, or we don't have a parent. |
| // * This offset does relate to our parent, but our parent is immutable, currently |
| // zero at this offset and there is no pager-backed root VMO. |
| if (can_decommit_slot(slot, offset) && |
| (!can_see_parent(offset) || (parent_immutable(offset) && !parent_has_content(offset) && |
| !is_root_source_user_pager_backed_locked()))) { |
| if (slot->IsPage()) { |
| vm_page_t* page = slot->ReleasePage(); |
| pmm_page_queues()->Remove(page); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(&freed_list, &page->queue_node); |
| } else if (slot->IsReference()) { |
| FreeReference(slot->ReleaseReference()); |
| } else { |
| // If this is a marker, simply make the slot empty. |
| *slot = VmPageOrMarker::Empty(); |
| } |
| // We successfully zeroed this offset. Move on to the next offset. |
| *zeroed_len_out += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| |
| // If there's already a marker then we can avoid any second guessing and leave the marker |
| // alone. |
| if (slot->IsMarker()) { |
| *zeroed_len_out += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| |
| // The only time we would reach here and *not* have a parent is if we could not decommit a |
| // page at this offset when zeroing. |
| DEBUG_ASSERT(!can_decommit_slot(slot, offset) || parent_); |
| |
| // Now we know that we need to do something active to make this zero, either through a |
| // marker or a page. |
| zx_status_t status = zero_slot(slot, offset); |
| if (status == ZX_ERR_NEXT) { |
| // If we were able to successfully zero this slot, move on to the next offset. |
| *zeroed_len_out += PAGE_SIZE; |
| } |
| return status; |
| }, |
| [&](uint64_t gap_start, uint64_t gap_end) { |
| AssertHeld(lock_ref()); |
| if (direct_source_supplies_zero_pages()) { |
| // Already logically zero - don't commit pages to back the zeroes if they're not already |
| // committed. This is important for contiguous VMOs, as we don't use markers for |
| // contiguous VMOs, and allocating a page below to hold zeroes would not be asking the |
| // page_source_ for the proper physical page. This prevents allocating an arbitrary |
| // physical page to back the zeroes. |
| *zeroed_len_out += (gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| } |
| |
| // If empty slots imply zeroes, and the gap does not see parent contents, we already have |
| // zeroes. |
| if (can_decommit_slots_in_range(gap_start, gap_end - gap_start) && |
| !can_see_parent(gap_start, gap_end - gap_start)) { |
| *zeroed_len_out += (gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Otherwise fall back to examining each offset in the gap to determine the action to |
| // perform. |
| for (uint64_t offset = gap_start; offset < gap_end; |
| offset += PAGE_SIZE, *zeroed_len_out += PAGE_SIZE) { |
| // First see if we can simply get done with an empty slot in the page list. This VMO |
| // should allow decommitting a page at this offset when zeroing. Additionally, one of the |
          // following conditions should hold w.r.t. the parent:
| // * This offset does not relate to our parent, or we don't have a parent. |
| // * This offset does relate to our parent, but our parent is immutable, currently |
| // zero at this offset and there is no pager-backed root VMO. |
| if (can_decommit_slot(nullptr, offset) && |
| (!can_see_parent(offset) || |
| (parent_immutable(offset) && !parent_has_content(offset) && |
| !is_root_source_user_pager_backed_locked()))) { |
| continue; |
| } |
| |
| // The only time we would reach here and *not* have a parent is if we could not decommit a |
| // page at this offset when zeroing. |
| DEBUG_ASSERT(!can_decommit_slot(nullptr, offset) || parent_); |
| |
| // Now we know that we need to do something active to make this zero, either through a |
| // marker or a page. |
| zx_status_t status = zero_slot(nullptr, offset); |
| if (status != ZX_ERR_NEXT) { |
| return status; |
| } |
| } |
| |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return status; |
| } |
| |
| void VmCowPages::MoveToPinnedLocked(vm_page_t* page, uint64_t offset) { |
| pmm_page_queues()->MoveToWired(page); |
| } |
| |
| void VmCowPages::MoveToNotPinnedLocked(vm_page_t* page, uint64_t offset) { |
| PageQueues* pq = pmm_page_queues(); |
| if (is_source_preserving_page_content()) { |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // We can only move Clean pages to the pager backed queues as they track age information for |
| // eviction; only Clean pages can be evicted. Pages in AwaitingClean and Dirty are protected |
| // from eviction in the Dirty queue. |
| if (is_page_clean(page)) { |
| if (high_priority_count_ != 0) { |
| // If this VMO is high priority then do not place in the pager backed queue as that is |
| // reclaimable, place in the high priority queue instead. |
| pq->MoveToHighPriority(page); |
| } else { |
| pq->MoveToReclaim(page); |
| } |
| } else { |
| DEBUG_ASSERT(!page->is_loaned()); |
| pq->MoveToPagerBackedDirty(page); |
| } |
| } else { |
    // Contiguous VMOs cannot decommit zero pages; their pages are placed in the wired queue
    // below, as they are notionally pinned until the owner explicitly releases them.
| if (can_decommit_zero_pages_locked()) { |
| if (high_priority_count_ != 0 && !pq->ReclaimIsOnlyPagerBacked()) { |
        // If anonymous pages are reclaimable, and this VMO is high priority, then place our
        // pages in the high priority queue instead of the anonymous one to avoid reclamation.
| pq->MoveToHighPriority(page); |
| } else if (is_discardable()) { |
| pq->MoveToReclaim(page); |
| } else { |
| pq->MoveToAnonymous(page); |
| } |
| } else { |
| pq->MoveToWired(page); |
| } |
| } |
| } |
| |
| void VmCowPages::SetNotPinnedLocked(vm_page_t* page, uint64_t offset) { |
| PageQueues* pq = pmm_page_queues(); |
| if (is_source_preserving_page_content()) { |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // We can only move Clean pages to the pager backed queues as they track age information for |
| // eviction; only Clean pages can be evicted. Pages in AwaitingClean and Dirty are protected |
| // from eviction in the Dirty queue. |
| if (is_page_clean(page)) { |
| if (high_priority_count_ != 0) { |
| // If this VMO is high priority then do not place in the pager backed queue as that is |
| // reclaimable, place in the high priority queue instead. |
| pq->SetHighPriority(page, this, offset); |
| } else { |
| pq->SetReclaim(page, this, offset); |
| } |
| } else { |
| DEBUG_ASSERT(!page->is_loaned()); |
| pq->SetPagerBackedDirty(page, this, offset); |
| } |
| } else { |
    // Contiguous VMOs cannot decommit zero pages; their pages are placed in the wired queue
    // below, as they are notionally pinned until the owner explicitly releases them.
| if (can_decommit_zero_pages_locked()) { |
| if (high_priority_count_ != 0 && !pq->ReclaimIsOnlyPagerBacked()) { |
        // If anonymous pages are reclaimable, and this VMO is high priority, then place our
        // pages in the high priority queue instead of the anonymous one to avoid reclamation.
| pq->SetHighPriority(page, this, offset); |
| } else if (is_discardable()) { |
| pq->SetReclaim(page, this, offset); |
| } else { |
| pq->SetAnonymous(page, this, offset); |
| } |
| } else { |
| pq->SetWired(page, this, offset); |
| } |
| } |
| } |
| |
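// Best-effort hint that promotes pages in the given range for reclamation by moving them to the
// DontNeed reclaim queue. Only clean, unpinned pages owned by the root pager-backed VMO are
// affected, and any previously set always_need bit is left intact.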
| void VmCowPages::PromoteRangeForReclamationLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| |
| // Hints only apply to pager backed VMOs. |
| if (!can_root_source_evict_locked()) { |
| return; |
| } |
| // Zero lengths have no work to do. |
| if (len == 0) { |
| return; |
| } |
| |
| // Walk up the tree to get to the root parent. A raw pointer is fine as we're holding the lock and |
| // won't drop it in this function. |
| // We need the root to check if the pages are owned by the root below. Hints only apply to pages |
| // in the root that are visible to this child, not to pages the child might have forked. |
| const VmCowPages* const root = GetRootLocked(); |
| |
| uint64_t start_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| uint64_t end_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| __UNINITIALIZED zx::result<VmCowPages::LookupCursor> cursor = |
| GetLookupCursorLocked(start_offset, end_offset - start_offset); |
| if (cursor.is_error()) { |
| return; |
| } |
  // Do not consider pages accessed, as the goal is to reclaim them, not to mark them used.
| cursor->DisableMarkAccessed(); |
| AssertHeld(cursor->lock_ref()); |
| while (start_offset < end_offset) { |
| // Lookup the page if it exists, but do not let it get allocated or say we are writing to it. |
| // On success or failure this causes the cursor to go to the next offset. |
| vm_page_t* page = cursor->MaybePage(false); |
| if (page) { |
| // Check to see if the page is owned by the root VMO. Hints only apply to the root. |
| // Don't move a pinned page or a dirty page to the DontNeed queue. |
| // Note that this does not unset the always_need bit if it has been previously set. The |
| // always_need hint is sticky. |
| if (page->object.get_object() == root && page->object.pin_count == 0 && is_page_clean(page)) { |
| pmm_page_queues()->MoveToReclaimDontNeed(page); |
| vm_vmo_dont_need.Add(1); |
| } |
| } |
| // Can't really do anything in case an error is encountered while looking up the page. Simply |
| // ignore it and move on to the next page. Hints are best effort anyway. |
| start_offset += PAGE_SIZE; |
| } |
| } |
| |
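// Best-effort hint that protects pages in the given range from reclamation by committing them
// (which may drop the lock via |guard| while waiting on page requests), replacing loaned pages
// with non-loaned ones, and, if |set_always_need| is true, marking pages owned by the root VMO
// always_need so that eviction skips them.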
| zx_status_t VmCowPages::ProtectRangeFromReclamationLocked(uint64_t offset, uint64_t len, |
| bool set_always_need, bool ignore_errors, |
| Guard<CriticalMutex>* guard) { |
| canary_.Assert(); |
| |
| // Hints only apply to pager backed VMOs. |
| if (!can_root_source_evict_locked()) { |
| return ZX_OK; |
| } |
| // Zero lengths have no work to do. |
| if (len == 0) { |
| return ZX_OK; |
| } |
| |
| uint64_t cur_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| uint64_t end_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| __UNINITIALIZED LazyPageRequest page_request; |
| __UNINITIALIZED zx::result<VmCowPages::LookupCursor> cursor = |
| GetLookupCursorLocked(cur_offset, end_offset - cur_offset); |
| // Track the validity of the cursor as we would like to efficiently look up runs where possible, |
| // but due to both errors and lock drops will need to acquire new cursors on occasion. |
| bool cursor_valid = true; |
| for (; cur_offset < end_offset; cur_offset += PAGE_SIZE) { |
| const uint64_t remaining = end_offset - cur_offset; |
| if (!cursor_valid) { |
| cursor = GetLookupCursorLocked(cur_offset, remaining); |
| if (cursor.is_error()) { |
| return cursor.status_value(); |
| } |
| cursor_valid = true; |
| } |
| AssertHeld(cursor->lock_ref()); |
| // Look up the page; this will fault in the page from the parent if necessary, but will not |
| // allocate pages directly in this object if it is a child. |
| auto result = |
| cursor->RequirePage(false, static_cast<uint>(remaining / PAGE_SIZE), &page_request); |
| zx_status_t status = result.status_value(); |
| if (status == ZX_OK) { |
| // If we reached here, we successfully found a page at the current offset. |
| vm_page_t* page = result->page; |
| |
| // The root might have gone away when the lock was dropped while waiting above. Compute the |
| // root again and check if we still have a page source backing it before applying the hint. |
| if (!can_root_source_evict_locked()) { |
| // Hinting is not applicable anymore. No more pages to hint. |
| return ZX_OK; |
| } |
| |
| // Check to see if the page is owned by the root VMO. Hints only apply to the root. |
| VmCowPages* owner = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| if (owner != GetRootLocked()) { |
| // Hinting is not applicable to this page, but it might apply to following ones. |
| continue; |
| } |
| |
| // If the page is loaned, replace it with a non-loaned page. Loaned pages are reclaimed by |
| // eviction, and hinted pages should not be evicted. |
| if (page->is_loaned()) { |
| DEBUG_ASSERT(is_page_clean(page)); |
| AssertHeld(owner->lock_ref()); |
| status = owner->ReplacePageLocked(page, page->object.get_page_offset(), |
| /*with_loaned=*/false, &page, &page_request); |
| // Let the status fall through below to have success, waiting and errors handled. |
| } |
| |
| if (status == ZX_OK) { |
| DEBUG_ASSERT(!page->is_loaned()); |
| if (set_always_need) { |
| page->object.always_need = 1; |
| vm_vmo_always_need.Add(1); |
| // Nothing more to do beyond marking the page always_need true. The lookup must have |
| // already marked the page accessed, moving it to the head of the first page queue. |
| } |
| continue; |
| } |
| } |
| // There was an error either in the original RequirePage, or in processing what was looked up. |
| // Either way, when we go back around the loop we are going to need a new cursor. |
| cursor_valid = false; |
| |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| guard->CallUnlocked([&status, &page_request]() { status = page_request->Wait(); }); |
| |
| // The size might have changed since we dropped the lock. Adjust the range if required. |
| if (cur_offset >= size_locked()) { |
| // No more pages to hint. |
| return ZX_OK; |
| } |
| // Shrink the range if required. Proceed with hinting on the remaining pages in the range; |
| // we've already hinted on the preceding pages, so just go on ahead instead of returning an |
| // error. The range was valid at the time we started hinting. |
| if (end_offset > size_locked()) { |
| end_offset = size_locked(); |
| } |
| |
| // If the wait succeeded, cur_offset will now have a backing page, so we need to try the |
| // same offset again. Move back a page so the loop increment keeps us at the same offset. In |
| // case of failure, simply continue on to the next page, as hints are best effort only. |
| if (status == ZX_OK) { |
| cur_offset -= PAGE_SIZE; |
| continue; |
| } |
| } |
| // Should only get here if an error was encountered, check if we should ignore or return it. |
| DEBUG_ASSERT(status != ZX_OK); |
| if (!ignore_errors) { |
| return status; |
| } |
| } |
| return ZX_OK; |
| } |
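| |
| // Worked example (illustrative only, not part of the kernel) of the ZX_ERR_SHOULD_WAIT path |
| // above: suppose RequirePage at cur_offset = 0x5000 needs to wait. The lock is dropped for |
| // page_request->Wait(), so the cursor becomes stale (cursor_valid = false). If the wait |
| // succeeds, cur_offset is rewound to 0x4000 so that the loop increment retries 0x5000 with a |
| // freshly acquired cursor; if the wait fails and ignore_errors is set, the loop simply moves |
| // on to the next page. |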
| |
| zx_status_t VmCowPages::DecompressInRangeLocked(uint64_t offset, uint64_t len, |
| Guard<CriticalMutex>* guard) { |
| canary_.Assert(); |
| |
| if (len == 0) { |
| return ZX_OK; |
| } |
| |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| uint64_t cur_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| uint64_t end_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| while (cur_offset < end_offset) { |
| VmPageOrMarkerRef ref; |
| uint64_t ref_offset = 0; |
| page_list_.ForEveryPageInRangeMutable( |
| [&](VmPageOrMarkerRef page_or_marker, uint64_t offset) { |
| if (page_or_marker->IsReference()) { |
| ref = page_or_marker; |
| ref_offset = offset; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| cur_offset, end_offset); |
| if (!ref) { |
| return ZX_OK; |
| } |
| __UNINITIALIZED LazyPageRequest page_request; |
| zx_status_t status = ReplaceReferenceWithPageLocked(ref, ref_offset, &page_request); |
| if (status == ZX_OK) { |
| cur_offset = ref_offset + PAGE_SIZE; |
| } else if (status == ZX_ERR_SHOULD_WAIT) { |
| guard->CallUnlocked([&page_request, &status]() { status = page_request->Wait(); }); |
| // With the lock dropped it's possible that our cur/end_offset are no longer within the range |
| // of the VMO, but if this is the case we will immediately find no pages in the page_list_ |
| // for this range and return. |
| } |
| if (status != ZX_OK) { |
| return status; |
| } |
| } |
| return ZX_OK; |
| } |
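| |
| // Illustrative sketch (not part of the kernel): the traversal pattern used above, where |
| // ZX_ERR_NEXT continues the walk and ZX_ERR_STOP ends it early, is the standard way to find |
| // the first slot of interest in a VmPageList. A hypothetical "count references" helper would |
| // use the same shape: |
| // |
| //   size_t count = 0; |
| //   page_list_.ForEveryPageInRange( |
| //       [&count](const VmPageOrMarker* p, uint64_t off) { |
| //         if (p->IsReference()) { |
| //           count++;           // tally and keep walking |
| //         } |
| //         return ZX_ERR_NEXT;  // ZX_ERR_STOP would end the walk here |
| //       }, |
| //       start, end); |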
| |
| int64_t VmCowPages::ChangeSingleHighPriorityCountLocked(int64_t delta) { |
| const bool was_zero = high_priority_count_ == 0; |
| high_priority_count_ += delta; |
| DEBUG_ASSERT(high_priority_count_ >= 0); |
| const bool is_zero = high_priority_count_ == 0; |
| // Any change to or from zero means we need to add or remove a count from our parent (if we have |
| // one) and potentially move pages in the page queues. |
| if (is_zero && !was_zero) { |
| delta = -1; |
| } else if (was_zero && !is_zero) { |
| delta = 1; |
| } else { |
| delta = 0; |
| } |
| if (delta != 0) { |
| // If we moved to or from zero then update every page into the correct page queue for tracking. |
| // MoveToNotPinnedLocked will check the high_priority_count_, which has already been updated, so |
| // we can just call that on every page. |
| page_list_.ForEveryPage([this](const VmPageOrMarker* page_or_marker, uint64_t offset) { |
| if (page_or_marker->IsPage()) { |
| vm_page_t* page = page_or_marker->Page(); |
| if (page->object.pin_count == 0) { |
| AssertHeld(lock_ref()); |
| MoveToNotPinnedLocked(page, offset); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| } |
| vm_vmo_high_priority.Add(delta); |
| return delta; |
| } |
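| |
| // Worked example (illustrative): if high_priority_count_ is 3 and delta is -3, the count |
| // transitions to zero, pages are moved back into their normal queues, and -1 is returned so |
| // the caller decrements the parent as well. If instead delta is -2 (3 -> 1), no zero |
| // transition occurs and 0 is returned, which terminates the parent walk in |
| // ChangeHighPriorityCountLocked below. |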
| |
| void VmCowPages::ChangeHighPriorityCountLocked(int64_t delta) { |
| canary_.Assert(); |
| |
| VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| // Any change to or from zero requires updating a count in the parent, so we need to walk up the |
| // parent chain as long as a transition is happening. |
| while (cur && delta != 0) { |
| delta = cur->ChangeSingleHighPriorityCountLocked(delta); |
| cur = cur->parent_.get(); |
| } |
| } |
| |
| void VmCowPages::UnpinLocked(uint64_t offset, uint64_t len, bool allow_gaps) { |
| canary_.Assert(); |
| |
| // verify that the range is within the object |
| ASSERT(InRange(offset, len, size_)); |
| // forbid zero length unpins as zero length pins return errors. |
| ASSERT(len != 0); |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().UnpinLocked(offset + parent_offset_, len, allow_gaps); |
| } |
| |
| const uint64_t start_page_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| const uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| // For any pages that have their pin count transition to 0, i.e. become unpinned, we want to |
| // perform a range change op. For efficiency track contiguous ranges. |
| uint64_t completely_unpin_start = 0; |
| uint64_t completely_unpin_len = 0; |
| #endif |
| |
| uint64_t unpin_count = 0; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&](const auto* page, uint64_t off) { |
| found_page_or_gap = true; |
| if (page->IsMarker()) { |
| // So far, allow_gaps is only used on contiguous VMOs which have no markers. We'd need |
| // to decide if a marker counts as a gap to allow before removing this assert. |
| DEBUG_ASSERT(!allow_gaps); |
| return ZX_ERR_NOT_FOUND; |
| } |
| AssertHeld(lock_ref()); |
| |
| // Reference content is not pinned by definition, and so we cannot unpin it. |
| ASSERT(!page->IsReference()); |
| // Intervals are sparse ranges without any committed pages, so cannot be pinned/unpinned. |
| ASSERT(!page->IsInterval()); |
| |
| vm_page_t* p = page->Page(); |
| ASSERT(p->object.pin_count > 0); |
| p->object.pin_count--; |
| if (p->object.pin_count == 0) { |
| MoveToNotPinnedLocked(p, off); |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| // Check if the current range can be extended. |
| if (completely_unpin_start + completely_unpin_len == off) { |
| completely_unpin_len += PAGE_SIZE; |
| } else { |
| // Complete any existing range and then start again at this offset. |
| if (completely_unpin_len > 0) { |
| RangeChangeUpdateLocked(completely_unpin_start, completely_unpin_len, |
| RangeChangeOp::DebugUnpin); |
| } |
| completely_unpin_start = off; |
| completely_unpin_len = PAGE_SIZE; |
| } |
| #endif |
| } |
| ++unpin_count; |
| return ZX_ERR_NEXT; |
| }, |
| [allow_gaps, &found_page_or_gap](uint64_t gap_start, uint64_t gap_end) { |
| found_page_or_gap = true; |
| if (!allow_gaps) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| start_page_offset, end_page_offset); |
| ASSERT_MSG(status == ZX_OK, "Tried to unpin an uncommitted page with allow_gaps false"); |
| |
| // If we did not find a page or a gap, we were entirely inside a sparse interval, which has no |
| // committed pages and so could not have had anything pinned. |
| ASSERT(found_page_or_gap); |
| |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| // Check any leftover range. |
| if (completely_unpin_len > 0) { |
| RangeChangeUpdateLocked(completely_unpin_start, completely_unpin_len, |
| RangeChangeOp::DebugUnpin); |
| } |
| #endif |
| |
| bool overflow = sub_overflow(pinned_page_count_, unpin_count, &pinned_page_count_); |
| ASSERT(!overflow); |
| |
| return; |
| } |
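| |
| // Worked example (illustrative) of the DebugUnpin batching above: if pages at offsets 0x0000, |
| // 0x1000 and 0x3000 all drop to pin_count 0, the first two extend a single run |
| // [0x0000, 0x2000), which is flushed when the non-contiguous 0x3000 is reached; the final run |
| // [0x3000, 0x4000) is flushed by the leftover check. Two DebugUnpin range ops are issued |
| // instead of three. |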
| |
| bool VmCowPages::DebugIsRangePinnedLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| uint64_t pinned_count = 0; |
| page_list_.ForEveryPageInRange( |
| [&pinned_count](const auto* p, uint64_t off) { |
| if (p->IsPage() && p->Page()->object.pin_count > 0) { |
| pinned_count++; |
| return ZX_ERR_NEXT; |
| } |
| return ZX_ERR_STOP; |
| }, |
| offset, offset + len); |
| return pinned_count == len / PAGE_SIZE; |
| } |
| |
| bool VmCowPages::AnyPagesPinnedLocked(uint64_t offset, size_t len) { |
| canary_.Assert(); |
| DEBUG_ASSERT(lock_ref().lock().IsHeld()); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| const uint64_t start_page_offset = offset; |
| const uint64_t end_page_offset = offset + len; |
| |
| if (pinned_page_count_ == 0) { |
| return false; |
| } |
| |
| bool found_pinned = false; |
| page_list_.ForEveryPageInRange( |
| [&found_pinned, start_page_offset, end_page_offset](const auto* p, uint64_t off) { |
| DEBUG_ASSERT(off >= start_page_offset && off < end_page_offset); |
| if (p->IsPage() && p->Page()->object.pin_count > 0) { |
| found_pinned = true; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| start_page_offset, end_page_offset); |
| |
| return found_pinned; |
| } |
| |
| // Helper function which processes the region visible by both children. |
| void VmCowPages::ReleaseCowParentPagesLockedHelper(uint64_t start, uint64_t end, |
| bool sibling_visible, |
| BatchPQRemove* page_remover) { |
| // Compute the range in the parent that cur no longer will be able to see. |
| const uint64_t parent_range_start = CheckedAdd(start, parent_offset_); |
| const uint64_t parent_range_end = CheckedAdd(end, parent_offset_); |
| |
| bool skip_split_bits = true; |
| if (parent_limit_ <= end) { |
| parent_limit_ = ktl::min(start, parent_limit_); |
| if (parent_limit_ <= parent_start_limit_) { |
| // Setting both to zero is cleaner and makes some asserts easier. |
| parent_start_limit_ = 0; |
| parent_limit_ = 0; |
| } |
| } else if (start == parent_start_limit_) { |
| parent_start_limit_ = end; |
| } else if (sibling_visible) { |
| // Split bits and partial cow release are only an issue if this range is also visible to our |
| // sibling. If it's not visible then we will always be freeing all pages anyway, no need to |
| // worry about split bits. Otherwise if the vmo limits can't be updated, this function will need |
| // to use the split bits to release pages in the parent. It also means that ancestor pages in |
| // the specified range might end up being released based on their current split bits, instead of |
| // through subsequent calls to this function. Therefore parent and all ancestors need to have |
| // the partial_cow_release_ flag set to prevent fast merge issues in ::RemoveChildLocked. |
| auto cur = this; |
| AssertHeld(cur->lock_ref()); |
| uint64_t cur_start = start; |
| uint64_t cur_end = end; |
| while (cur->parent_ && cur_start < cur_end) { |
| auto parent = cur->parent_.get(); |
| AssertHeld(parent->lock_ref()); |
| parent->partial_cow_release_ = true; |
| cur_start = ktl::max(CheckedAdd(cur_start, cur->parent_offset_), parent->parent_start_limit_); |
| cur_end = ktl::min(CheckedAdd(cur_end, cur->parent_offset_), parent->parent_limit_); |
| cur = parent; |
| } |
| skip_split_bits = false; |
| } |
| |
| // Free any pages that either aren't visible, or were already split into the other child. For |
| // pages that haven't been split into the other child, we need to ensure they're uniaccessible. |
| // We are going to be inserting removed pages into a shared free list. So make sure the parent did |
| // not have a page source that was handling frees which would require additional work on the owned |
| // pages on top of a simple free to the PMM. |
| DEBUG_ASSERT(!parent_locked().is_source_handling_free_locked()); |
| parent_locked().page_list_.RemovePages( |
| [skip_split_bits, sibling_visible, page_remover, |
| left = this == &parent_locked().left_child_locked()](VmPageOrMarker* page_or_mark, |
| uint64_t offset) { |
| // Hidden VMO hierarchies do not support intervals. |
| ASSERT(!page_or_mark->IsInterval()); |
| |
| if (page_or_mark->IsMarker()) { |
| // If this marker is in a range still visible to the sibling then we just leave it, no |
| // split bits or anything to be updated. If the sibling cannot see it, then we can clear |
| // it. |
| if (!sibling_visible) { |
| *page_or_mark = VmPageOrMarker::Empty(); |
| } |
| return ZX_ERR_NEXT; |
| } |
| // If the sibling can still see this page then we need to keep it around, otherwise we can |
| // free it. The sibling can see the page if this range is |sibling_visible| and if the |
| // sibling hasn't already forked the page, which is recorded in the split bits. |
| if (!sibling_visible || (left ? page_or_mark->PageOrRefRightSplit() |
| : page_or_mark->PageOrRefLeftSplit())) { |
| page_remover->PushContent(page_or_mark); |
| return ZX_ERR_NEXT; |
| } |
| if (skip_split_bits) { |
| // If we were able to update this vmo's parent limit, that made the pages |
| // uniaccessible. We clear the split bits to allow ::RemoveChildLocked to efficiently |
| // merge vmos without having to worry about pages above parent_limit_. |
| page_or_mark->SetPageOrRefLeftSplit(false); |
| page_or_mark->SetPageOrRefRightSplit(false); |
| } else { |
| // Otherwise set the appropriate split bit to make the page uniaccessible. |
| if (left) { |
| page_or_mark->SetPageOrRefLeftSplit(true); |
| } else { |
| page_or_mark->SetPageOrRefRightSplit(true); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }, |
| parent_range_start, parent_range_end); |
| } |
| |
| void VmCowPages::ReleaseCowParentPagesLocked(uint64_t start, uint64_t end, |
| BatchPQRemove* page_remover) { |
| // This function releases |this| references to any ancestor vmo's COW pages. |
| // |
| // To do so, we divide |this| parent into three (possibly 0-length) regions: the region |
| // which |this| sees but before what the sibling can see, the region where both |this| |
| // and its sibling can see, and the region |this| can see but after what the sibling can |
| // see. Processing the 2nd region only requires touching the direct parent, since the sibling |
| // can see ancestor pages in the region. However, processing the 1st and 3rd regions requires |
| // recursively releasing |this| parent's ancestor pages, since those pages are no longer |
| // visible through |this| parent. |
| // |
| // This function processes region 3 (incl. recursively processing the parent), then region 2, |
| // then region 1 (incl. recursively processing the parent). Processing is done in reverse order |
| // to ensure parent_limit_ is reduced correctly. When processing either regions of type 1 or 3 we |
| // 1. walk up the parent and find the largest common slice that all nodes in the hierarchy see |
| // as being of the same type. |
| // 2. walk back down (using stack_ direction flags) applying the range update using that final |
| // calculated size |
| // 3. reduce the range we are operating on to not include the section we just processed |
| // 4. repeat steps 1-3 until range is empty |
| // In the worst case it is possible for this algorithm to be O(N^2) in the depth of the tree. |
| // More optimal algorithms probably exist, but this one is sufficient for the moment, as these |
| // suboptimal scenarios do not occur in practice. |
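| // |
| // Worked example (illustrative, offsets in page-sized units): suppose |this| sees parent |
| // offsets [0, 100) and the sibling sees [30, 60) of the same parent. Region 1 is [0, 30) |
| // (before the sibling), region 2 is [30, 60) (shared), and region 3 is [60, 100) (after the |
| // sibling). The loop below first processes [60, 100) (walking up the hierarchy), then |
| // [30, 60) (direct parent only), then [0, 30), shrinking |end| after each pass. |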
| |
| // At the top level we continuously attempt to process the range until it is empty. |
| while (end > start) { |
| // cur_start / cur_end get adjusted as cur moves up/down the parent chain. |
| uint64_t cur_start = start; |
| uint64_t cur_end = end; |
| VmCowPages* cur = this; |
| |
| AssertHeld(cur->lock_ref()); |
| // First walk up the parent chain as long as there is a visible parent that does not overlap |
| // with its sibling. |
| while (cur->parent_ && cur->parent_start_limit_ < cur_end && cur_start < cur->parent_limit_) { |
| if (cur_end > cur->parent_limit_) { |
| // Part of the range sees the parent, and part of it doesn't. As we only process ranges of |
| // a single type we first trim the range down to the portion that doesn't see the parent; |
| // next time around the top level loop we will process the portion that does see the parent. |
| cur_start = cur->parent_limit_; |
| DEBUG_ASSERT(cur_start < cur_end); |
| break; |
| } |
| // Trim the start to the portion of the parent it can see. |
| cur_start = ktl::max(cur_start, cur->parent_start_limit_); |
| DEBUG_ASSERT(cur_start < cur_end); |
| |
| // Work out what the overlap with our sibling is |
| auto parent = cur->parent_.get(); |
| AssertHeld(parent->lock_ref()); |
| // Stop processing if we are the child of a snapshot-modified root, as any pages in the parent |
| // are owned by the root and should remain accessible to the pager. |
| if (!parent->is_hidden_locked()) { |
| // Parent must be root & pager-backed. |
| DEBUG_ASSERT(!parent->parent_); |
| DEBUG_ASSERT(parent->is_source_preserving_page_content()); |
| break; |
| } |
| bool left = cur == &parent->left_child_locked(); |
| auto& other = left ? parent->right_child_locked() : parent->left_child_locked(); |
| AssertHeld(other.lock_ref()); |
| |
| // Project our operating range into our parent. |
| const uint64_t our_parent_start = CheckedAdd(cur_start, cur->parent_offset_); |
| const uint64_t our_parent_end = CheckedAdd(cur_end, cur->parent_offset_); |
| // Project our siblings full range into our parent. |
| const uint64_t other_parent_start = |
| CheckedAdd(other.parent_offset_, other.parent_start_limit_); |
| const uint64_t other_parent_end = CheckedAdd(other.parent_offset_, other.parent_limit_); |
| |
| if (other_parent_end >= our_parent_end && other_parent_start < our_parent_end) { |
| // At least some of the end of our range overlaps with the sibling. First move up our start |
| // to ensure our range is 100% overlapping. |
| if (other_parent_start > our_parent_start) { |
| cur_start = CheckedAdd(cur_start, other_parent_start - our_parent_start); |
| DEBUG_ASSERT(cur_start < cur_end); |
| } |
| // Free the range that overlaps with the sibling, then we are done walking up as this is the |
| // type 2 kind of region. It is safe to process this right now since we are in a terminal |
| // state and are leaving the loop, thus we know that this is the final size of the region. |
| cur->ReleaseCowParentPagesLockedHelper(cur_start, cur_end, true, page_remover); |
| break; |
| } |
| // End of our range does not see the sibling. First move up our start to ensure we are dealing |
| // with a range that is 100% no sibling, and then keep on walking up. |
| if (other_parent_end > our_parent_start && other_parent_end < our_parent_end) { |
| cur_start = CheckedAdd(cur_start, other_parent_end - our_parent_start); |
| DEBUG_ASSERT(cur_start < cur_end); |
| } |
| |
| // Record the direction so we can walk back down later. |
| parent->stack_.dir_flag = left ? StackDir::Left : StackDir::Right; |
| // Don't use our_parent_start, as we may have updated cur_start. |
| cur_start = CheckedAdd(cur_start, cur->parent_offset_); |
| cur_end = our_parent_end; |
| DEBUG_ASSERT(cur_start < cur_end); |
| cur = parent; |
| } |
| |
| // Every parent that we walked up had no overlap with its siblings. Now that we know the size |
| // of the range that we can process we just walk back down processing. |
| while (cur != this) { |
| // Although we free pages in the parent we operate on the *child*, as that is whose limits |
| // we will actually adjust. The ReleaseCowParentPagesLockedHelper will then reach back up to |
| // the parent to actually free any pages. |
| cur = cur->stack_.dir_flag == StackDir::Left ? &cur->left_child_locked() |
| : &cur->right_child_locked(); |
| AssertHeld(cur->lock_ref()); |
| DEBUG_ASSERT(cur_start >= cur->parent_offset_); |
| DEBUG_ASSERT(cur_end >= cur->parent_offset_); |
| cur_start -= cur->parent_offset_; |
| cur_end -= cur->parent_offset_; |
| |
| cur->ReleaseCowParentPagesLockedHelper(cur_start, cur_end, false, page_remover); |
| } |
| |
| // Update the end with the portion we managed to process, and check some basic sanity of the |
| // range; most importantly that we processed a non-zero portion to ensure progress. |
| DEBUG_ASSERT(cur_start >= start); |
| DEBUG_ASSERT(cur_start < end); |
| DEBUG_ASSERT(cur_end == end); |
| end = cur_start; |
| } |
| } |
| |
| void VmCowPages::InvalidateReadRequestsLocked(uint64_t offset, uint64_t len) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| DEBUG_ASSERT(page_source_); |
| |
| const uint64_t start = offset; |
| const uint64_t end = offset + len; |
| |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [](const auto* p, uint64_t off) { return ZX_ERR_NEXT; }, |
| [this](uint64_t gap_start, uint64_t gap_end) { |
| page_source_->OnPagesSupplied(gap_start, gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
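| |
| // Worked example (illustrative): ForEveryPageAndGapInRange invokes the first callback for |
| // populated slots and the second for unpopulated ranges. With pages present at |
| // [0x0000, 0x1000) and [0x3000, 0x4000) in a range of [0, 0x4000), the gap callback fires |
| // once with [0x1000, 0x3000), and OnPagesSupplied wakes any readers blocked on that |
| // now out-of-bounds range. |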
| |
| void VmCowPages::InvalidateDirtyRequestsLocked(uint64_t offset, uint64_t len) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| DEBUG_ASSERT(page_source_->ShouldTrapDirtyTransitions()); |
| |
| const uint64_t start = offset; |
| const uint64_t end = offset + len; |
| |
| zx_status_t status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // A marker is a clean zero page and might have an outstanding DIRTY request. |
| if (p->IsMarker()) { |
| return true; |
| } |
| // An interval is an uncommitted zero page and might have an outstanding DIRTY request |
| // irrespective of dirty state. |
| if (p->IsIntervalZero()) { |
| return true; |
| } |
| // Although a reference is implied to be clean, VMO backed by a page source should never |
| // have references. |
| DEBUG_ASSERT(!p->IsReference()); |
| |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| |
| // A page that is not Dirty already might have an outstanding DIRTY request. |
| if (!is_page_dirty(page)) { |
| return true; |
| } |
| // Otherwise the page should already be Dirty. |
| DEBUG_ASSERT(is_page_dirty(page)); |
| return false; |
| }, |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // Nothing to update for the page as we're not actually marking it Dirty. |
| return ZX_ERR_NEXT; |
| }, |
| [this](uint64_t start, uint64_t end, bool unused) { |
| // Resolve any DIRTY requests in this contiguous range. |
| page_source_->OnPagesDirtied(start, end - start); |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| // We don't expect an error from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // Now resolve DIRTY requests for any gaps. After request generation, pages could either |
| // have been evicted, or zero intervals written back, leading to gaps. So it is possible for gaps |
| // to have outstanding DIRTY requests. |
| status = page_list_.ForEveryPageAndGapInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // Nothing to do for pages. We already handled them above. |
| return ZX_ERR_NEXT; |
| }, |
| [this](uint64_t gap_start, uint64_t gap_end) { |
| // Resolve any DIRTY requests in this gap. |
| page_source_->OnPagesDirtied(gap_start, gap_end - gap_start); |
| return ZX_ERR_NEXT; |
| }, |
| start, end); |
| // We don't expect an error from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
| zx_status_t VmCowPages::ResizeLocked(uint64_t s) { |
| canary_.Assert(); |
| |
| LTRACEF("vmcp %p, size %" PRIu64 "\n", this, s); |
| |
| // make sure everything is aligned before we get started |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size_)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(s)); |
| DEBUG_ASSERT(!is_slice_locked()); |
| |
| // We stack-own loaned pages from removal until freed. |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| // see if we're shrinking or expanding the vmo |
| if (s < size_) { |
| // shrinking |
| const uint64_t start = s; |
| const uint64_t end = size_; |
| const uint64_t len = end - start; |
| |
| // bail if there are any pinned pages in the range we're trimming |
| if (AnyPagesPinnedLocked(start, len)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // unmap all of the pages in this range on all the mapping regions |
| RangeChangeUpdateLocked(start, len, RangeChangeOp::Unmap); |
| |
| // Resolve any outstanding page requests tracked by the page source that are now out-of-bounds. |
| if (page_source_) { |
| // Tell the page source that any non-resident pages that are now out-of-bounds |
| // were supplied, to ensure that any reads of those pages get woken up. |
| InvalidateReadRequestsLocked(start, len); |
| |
| // If DIRTY requests are supported, also tell the page source that any non-Dirty pages that |
| // are now out-of-bounds were dirtied (without actually dirtying them), to ensure that any |
| // threads blocked on DIRTY requests for those pages get woken up. |
| if (is_source_preserving_page_content() && page_source_->ShouldTrapDirtyTransitions()) { |
| InvalidateDirtyRequestsLocked(start, len); |
| } |
| } |
| |
| // If pager-backed and the new size falls partway in an interval, we will need to clip the |
| // interval. |
| if (is_source_preserving_page_content()) { |
| // Check if the first populated slot we find in the now-invalid range is an interval end. |
| uint64_t interval_end = UINT64_MAX; |
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [&interval_end](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsIntervalEnd()) { |
| interval_end = off; |
| } |
| // We found the first populated slot. Stop the traversal. |
| return ZX_ERR_STOP; |
| }, |
| s, size_); |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| if (interval_end != UINT64_MAX) { |
| status = page_list_.ClipIntervalEnd(interval_end, interval_end - s + PAGE_SIZE); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| } |
| } |
| |
| // We might need to free pages from an ancestor and/or this object. |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| __UNINITIALIZED BatchPQRemove page_remover(&freed_list); |
| |
| bool hidden_parent = false; |
| if (parent_) { |
| hidden_parent = parent_locked().is_hidden_locked(); |
| } |
| if (hidden_parent) { |
| // Release any COW pages that are no longer necessary. This will also |
| // update the parent limit. |
| ReleaseCowParentPagesLocked(start, end, &page_remover); |
| |
| // Flush the page remover and free the pages, so that we don't mix ownership of ancestor pages |
| // with pages removed from this object below. |
| page_remover.Flush(); |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/false); |
| |
| // Validate that the parent limit was correctly updated as it should never remain larger than |
| // our actual size. |
| DEBUG_ASSERT(parent_limit_ <= s); |
| } else { |
| parent_limit_ = ktl::min(parent_limit_, s); |
| } |
| // If the tail of a parent disappears, the children shouldn't be able to see that region |
| // again, even if the parent is later reenlarged. So update the child parent limits. |
| UpdateChildParentLimitsLocked(s); |
| |
| // We should not have any outstanding pages to free as we flushed ancestor pages already. So |
| // this flush should be a no-op. |
| page_remover.Flush(); |
| DEBUG_ASSERT(list_length(&freed_list) == 0); |
| |
| // Remove and free pages from this object. |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), start, end); |
| page_remover.Flush(); |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/true); |
| |
| } else if (s > size_) { |
| uint64_t temp; |
| // Check that this VMO's new size would not cause it to overflow if projected onto the root. |
| bool overflow = add_overflow(root_parent_offset_, s, &temp); |
| if (overflow) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| // expanding |
| // figure the starting and ending page offset that is affected |
| const uint64_t start = size_; |
| const uint64_t end = s; |
| const uint64_t len = end - start; |
| |
| // inform all our children and mappings that there are new bits |
| RangeChangeUpdateLocked(start, len, RangeChangeOp::Unmap); |
| |
| // If pager-backed, need to insert a dirty zero interval beyond the old size. |
| if (is_source_preserving_page_content()) { |
| zx_status_t status = |
| page_list_.AddZeroInterval(start, end, VmPageOrMarker::IntervalDirtyState::Dirty); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| return status; |
| } |
| } |
| } |
| |
| // save bytewise size |
| size_ = s; |
| |
| IncrementHierarchyGenerationCountLocked(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
| void VmCowPages::UpdateChildParentLimitsLocked(uint64_t new_size) { |
| // Note that a child's parent_limit_ will limit that child's descendants' views into |
| // this vmo, so this method only needs to touch the direct children. |
| for (auto& child : children_list_) { |
| AssertHeld(child.lock_ref()); |
| if (new_size < child.parent_offset_) { |
| child.parent_limit_ = 0; |
| } else { |
| child.parent_limit_ = ktl::min(child.parent_limit_, new_size - child.parent_offset_); |
| } |
| } |
| } |
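| |
| // Worked example (illustrative): if this vmo shrinks to new_size = 0x3000 and a child has |
| // parent_offset_ = 0x1000 and parent_limit_ = 0x4000, the child can now see at most |
| // new_size - parent_offset_ = 0x2000 bytes of the parent, so its parent_limit_ is clamped |
| // from 0x4000 to 0x2000. A child whose parent_offset_ is beyond new_size has its |
| // parent_limit_ set to 0. |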
| |
| zx_status_t VmCowPages::LookupLocked(uint64_t offset, uint64_t len, |
| VmObject::LookupFunction lookup_fn) { |
| canary_.Assert(); |
| if (unlikely(len == 0)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // verify that the range is within the object |
| if (unlikely(!InRange(offset, len, size_))) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().LookupLocked( |
| offset + parent_offset_, len, |
| [&lookup_fn, parent_offset = parent_offset_](uint64_t offset, paddr_t pa) { |
| // Need to undo the parent_offset before forwarding to the lookup_fn, who is ignorant of |
| // slices. |
| return lookup_fn(offset - parent_offset, pa); |
| }); |
| } |
| |
| const uint64_t start_page_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| const uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| return page_list_.ForEveryPageInRange( |
| [&lookup_fn](const auto* p, uint64_t off) { |
| if (!p->IsPage()) { |
| // Skip non pages. |
| return ZX_ERR_NEXT; |
| } |
| paddr_t pa = p->Page()->paddr(); |
| return lookup_fn(off, pa); |
| }, |
| start_page_offset, end_page_offset); |
| } |
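| |
| // Illustrative usage sketch (not part of the kernel), assuming the caller already holds the |
| // lock: counting the committed pages in the first four pages of a VmCowPages |cow|. |
| // |
| //   size_t committed = 0; |
| //   zx_status_t status = cow->LookupLocked( |
| //       0, 4ul * PAGE_SIZE, [&committed](uint64_t offset, paddr_t pa) { |
| //         committed++;         // invoked once per committed page |
| //         return ZX_ERR_NEXT;  // continue the traversal |
| //       }); |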
| |
| zx_status_t VmCowPages::LookupReadableLocked(uint64_t offset, uint64_t len, |
| LookupReadableFunction lookup_fn) { |
| canary_.Assert(); |
| if (unlikely(len == 0)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // verify that the range is within the object |
| if (unlikely(!InRange(offset, len, size_))) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().LookupReadableLocked( |
| offset + parent_offset_, len, |
| [&lookup_fn, parent_offset = parent_offset_](uint64_t offset, paddr_t pa) { |
| // Need to undo the parent_offset before forwarding to the lookup_fn, who is ignorant of |
| // slices. |
| return lookup_fn(offset - parent_offset, pa); |
| }); |
| } |
| |
| uint64_t current_page_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| const uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| while (current_page_offset != end_page_offset) { |
| // Attempt to process any pages we have first. Skip over anything that's not a page since the |
| // lookup_fn only applies to actual pages. |
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [&lookup_fn, ¤t_page_offset](const VmPageOrMarker* page_or_marker, uint64_t offset) { |
| // The offset can advance ahead if we encounter gaps or sparse intervals. |
| if (offset != current_page_offset) { |
| if (!page_or_marker->IsIntervalEnd()) { |
| // There was a gap before this offset. End the traversal. |
| return ZX_ERR_STOP; |
| } |
| // Otherwise, we can advance our cursor to the interval end. |
| current_page_offset = offset; |
| } |
| DEBUG_ASSERT(offset == current_page_offset); |
| current_page_offset = offset + PAGE_SIZE; |
| if (!page_or_marker->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| return lookup_fn(offset, page_or_marker->Page()->paddr()); |
| }, |
| current_page_offset, end_page_offset); |
| |
| // Check if we've processed the whole range. |
| if (current_page_offset == end_page_offset) { |
| break; |
| } |
| |
| // See if any of our parents have the content. |
| VmCowPages* owner = nullptr; |
| uint64_t owner_offset = 0; |
| uint64_t owner_length = end_page_offset - current_page_offset; |
| |
| // We do not care about the return value, all we are interested in is the populated out |
| // variables that we pass in. |
| // |
| // Note that page intervals are only supported in root VMOs, so if we ended the page list |
| // traversal above partway into an interval, we will be able to continue the traversal over the |
| // rest of the interval after this call - since we're the root, we will be the owner and the |
| // owner length won't be clipped. |
| FindInitialPageContentLocked(current_page_offset, &owner, &owner_offset, &owner_length) |
| .current(); |
| |
| // This should always get filled out. |
| DEBUG_ASSERT(owner_length > 0); |
| DEBUG_ASSERT(owner); |
| |
| // Iterate over any potential content. |
| AssertHeld(owner->lock_ref()); |
| status = owner->page_list_.ForEveryPageInRange( |
| [&lookup_fn, current_page_offset, owner_offset](const VmPageOrMarker* page_or_marker, |
| uint64_t offset) { |
| if (!page_or_marker->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| return lookup_fn(offset - owner_offset + current_page_offset, |
| page_or_marker->Page()->paddr()); |
| }, |
| owner_offset, owner_offset + owner_length); |
| if (status != ZX_OK && status != ZX_ERR_NEXT) { |
| return status; |
| } |
| |
| current_page_offset += owner_length; |
| } |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::TakePagesWithParentLocked(uint64_t offset, uint64_t len, |
| VmPageSpliceList* pages, uint64_t* taken_len, |
| LazyPageRequest* page_request) { |
| DEBUG_ASSERT(parent_); |
| |
| // Set up a cursor that will help us take pages from the parent. |
| const uint64_t end = offset + len; |
| uint64_t position = offset; |
| auto cursor = GetLookupCursorLocked(offset, len); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| |
| VmCompression* compression = pmm_page_compression(); |
| |
| // This loop attempts to take pages from the VMO one page at a time. For each page, it: |
| // 1. Allocates a zero page to replace the existing page. |
| // 2. Takes ownership of the page. |
| // 3. Replaces the existing page with the zero page. |
| // 4. Adds the existing page to the splice list. |
| // We perform this operation page-by-page to ensure that we can always make forward progress. |
| // For example, if we tried to take ownership of the entire range of pages but encounter a |
| // ZX_ERR_SHOULD_WAIT, we would need to drop the lock, wait on the page request, and then attempt |
| // to take ownership of all of the pages again. On highly contended VMOs, this could lead to a |
| // situation in which we get stuck in this loop and no forward progress is made. |
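| // |
| // Worked example (illustrative): for a 16-page range, taking ownership of all 16 pages in one |
| // step could fail with ZX_ERR_SHOULD_WAIT near the end, forcing a lock drop and a restart of |
| // the entire batch; on a contended VMO that restart could repeat indefinitely. Taking one page |
| // per iteration means every successful wait advances |position| by PAGE_SIZE, so retries are |
| // bounded per page rather than per range. |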
| zx_status_t status = ZX_OK; |
| uint64_t new_pages_len = 0; |
| while (position < end) { |
| // Allocate a zero page to replace the content at position. |
| // TODO(https://fxbug.dev/42076904): Inserting a full zero page is inefficient. We should |
| // replace this logic with something a bit more efficient; this could mean using the same logic |
| // that `ZeroPages` uses and insert markers, or generalizing the concept of intervals and using |
| // those instead. |
| vm_page_t* p; |
| status = AllocateCopyPage(vm_get_zero_page_paddr(), nullptr, page_request, &p); |
| if (status != ZX_OK) { |
| break; |
| } |
| VmPageOrMarker zeroed_out_page = VmPageOrMarker::Page(p); |
| VmPageOrMarker* zero_page_ptr = &zeroed_out_page; |
| auto free_zeroed_page = fit::defer([zero_page_ptr, this] { |
| // If the zeroed out page is not incorporated into this VMO, free it. |
| if (!zero_page_ptr->IsEmpty()) { |
| vm_page_t* p = zero_page_ptr->ReleasePage(); |
| AssertHeld(lock_ref()); |
| // The zero page is not part of any VMO at this point, so it should not be in a page queue. |
| FreePageLocked(p, false); |
| } |
| }); |
| |
| // Once we have a zero page ready to go, require an owned page at the current position. |
| auto result = cursor->RequireOwnedPage(true, static_cast<uint>((end - position) / PAGE_SIZE), |
| page_request); |
| if (result.is_error()) { |
| status = result.error_value(); |
| break; |
| } |
| |
| // Replace the content at `position` with the zeroed out page. |
| VmPageOrMarker content; |
| status = AddPageLocked(&zeroed_out_page, position, CanOverwriteContent::NonZero, &content, |
| /*do_range_update=*/false); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY. |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY); |
| break; |
| } |
| new_pages_len += PAGE_SIZE; |
| ASSERT(!content.IsInterval()); |
| |
| // Before adding the content to the splice list, we need to make sure that it: |
| // 1. Is not in any page queues if it is a page. |
| // 2. Is not a temporary reference. |
| if (content.IsPage()) { |
| DEBUG_ASSERT(content.Page()->object.pin_count == 0); |
| pmm_page_queues()->Remove(content.Page()); |
| } else if (content.IsReference()) { |
| if (auto page = compression->MoveReference(content.Reference())) { |
| InitializeVmPage(*page); |
| AssertHeld(lock_ref()); |
| // Don't insert the page in the page queues, since we're trying to remove the pages. |
| VmPageOrMarker::ReferenceValue ref = content.SwapReferenceForPage(*page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } |
| |
| // Add the content to the splice list. |
| status = pages->Append(ktl::move(content)); |
| if (status == ZX_ERR_NO_MEMORY) { |
| break; |
| } |
| DEBUG_ASSERT(status == ZX_OK); |
| position += PAGE_SIZE; |
| *taken_len += PAGE_SIZE; |
| } |
| |
| if (new_pages_len) { |
| RangeChangeUpdateLocked(offset, new_pages_len, RangeChangeOp::Unmap); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| // We need to finalize the splice page list as soon as we know that we will not be adding pages |
| // to it. This is true in any case that does not return ZX_ERR_SHOULD_WAIT. |
| if (status != ZX_ERR_SHOULD_WAIT) { |
| pages->Finalize(); |
| } |
| |
| return status; |
| } |
| |
| zx_status_t VmCowPages::TakePagesLocked(uint64_t offset, uint64_t len, VmPageSpliceList* pages, |
| uint64_t* taken_len, LazyPageRequest* page_request) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| if (!InRange(offset, len, size_)) { |
| pages->Finalize(); |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (page_source_) { |
| pages->Finalize(); |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| if (AnyPagesPinnedLocked(offset, len)) { |
| pages->Finalize(); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // If this is a child slice, propagate the operation to the parent. |
| if (is_slice_locked()) { |
| return slice_parent_locked().TakePagesLocked(offset + parent_offset_, len, pages, taken_len, |
| page_request); |
| } |
| |
| // Now that all early checks are done, increment the gen count since we're going to remove pages. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| // If this is a child of any other kind, we need to handle it specially. |
| if (parent_) { |
| return TakePagesWithParentLocked(offset, len, pages, taken_len, page_request); |
| } |
| |
| VmCompression* compression = pmm_page_compression(); |
| bool found_page = false; |
| page_list_.ForEveryPageInRangeMutable( |
| [&compression, &found_page, this](VmPageOrMarkerRef p, uint64_t off) { |
| found_page = true; |
| // Splice lists do not support page intervals. |
| ASSERT(!p->IsInterval()); |
| if (p->IsPage()) { |
| DEBUG_ASSERT(p->Page()->object.pin_count == 0); |
| pmm_page_queues()->Remove(p->Page()); |
| } else if (p->IsReference()) { |
| // Regular references, which we can move, are permitted in the VmPageSpliceList; it is up to |
| // the receiver of the pages to reject or otherwise deal with them. A temporary reference |
| // needs to be turned back into its page so we can move it. |
| if (auto page = compression->MoveReference(p->Reference())) { |
| InitializeVmPage(*page); |
| AssertHeld(lock_ref()); |
| // Don't insert the page in the page queues, since we're trying to remove the pages, |
| // just update the page list reader for TakePages below. |
| VmPageOrMarker::ReferenceValue ref = p.SwapReferenceForPage(*page); |
| ASSERT(compression->IsTempReference(ref)); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| |
| // If we did not find any pages, we could either be entirely inside a gap or an interval. Make |
| // sure we're not inside an interval; checking a single offset for membership should suffice. |
| ASSERT(found_page || !page_list_.IsOffsetInZeroInterval(offset)); |
| |
| // The VmPageSpliceList should not have been modified by anything up to this point. |
| DEBUG_ASSERT(pages->IsEmpty()); |
| |
| *pages = page_list_.TakePages(offset, len); |
| *taken_len = len; |
| RangeChangeUpdateLocked(offset, len, RangeChangeOp::Unmap); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::SupplyPages(uint64_t offset, uint64_t len, VmPageSpliceList* pages, |
| SupplyOptions options, uint64_t* supplied_len, |
| LazyPageRequest* page_request) { |
| canary_.Assert(); |
| Guard<CriticalMutex> guard{lock()}; |
| return SupplyPagesLocked(offset, len, pages, options, supplied_len, page_request); |
| } |
| |
| zx_status_t VmCowPages::SupplyPagesLocked(uint64_t offset, uint64_t len, VmPageSpliceList* pages, |
| SupplyOptions options, uint64_t* supplied_len, |
| LazyPageRequest* page_request) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(supplied_len); |
| ASSERT(options != SupplyOptions::PagerSupply || page_source_); |
| |
| if (!InRange(offset, len, size_)) { |
| *supplied_len = 0; |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (options == SupplyOptions::TransferData) { |
| if (page_source_) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if (AnyPagesPinnedLocked(offset, len)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| } |
| |
| if (page_source_ && page_source_->is_detached()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // If this is a child slice, propagate the operation to the parent. |
| if (is_slice_locked()) { |
| return slice_parent_locked().SupplyPagesLocked(offset + parent_offset_, len, pages, options, |
| supplied_len, page_request); |
| } |
| |
| // If this VMO has a parent, we need to make sure we take ownership of all of the pages in the |
| // input range. |
| // TODO(https://fxbug.dev/42076904): This is suboptimal, as we take ownership of a page just to |
| // free it immediately when we replace it with the supplied page. |
| if (parent_) { |
| const uint64_t end = offset + len; |
| uint64_t position = offset; |
| auto cursor = GetLookupCursorLocked(offset, len); |
| if (cursor.is_error()) { |
| return cursor.error_value(); |
| } |
| AssertHeld(cursor->lock_ref()); |
| while (position < end) { |
| auto result = cursor->RequireOwnedPage(true, static_cast<uint>((end - position) / PAGE_SIZE), |
| page_request); |
| if (result.is_error()) { |
| return result.error_value(); |
| } |
| position += PAGE_SIZE; |
| } |
| } |
| |
| // It is possible that we fail to insert pages below and we increment the gen count needlessly, |
| // but the user is certainly expecting it to succeed. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| const uint64_t start = offset; |
| const uint64_t end = offset + len; |
| |
| // We stack-own loaned pages below from allocation for page replacement to AddPageLocked(). |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| list_node freed_list; |
| list_initialize(&freed_list); |
| |
| // [new_pages_start, new_pages_start + new_pages_len) tracks the current run of |
| // consecutive new pages added to this vmo. |
| uint64_t new_pages_start = offset; |
| uint64_t new_pages_len = 0; |
| zx_status_t status = ZX_OK; |
| [[maybe_unused]] uint64_t initial_list_position = pages->Position(); |
| while (!pages->IsProcessed()) { |
| // With a PageSource only Pages are supported, so convert any refs to real pages. |
| // We do this without popping a page from the splice list as `MakePageFromReference` may return |
| // ZX_ERR_SHOULD_WAIT. This could lead the caller to wait on the page request and call |
| // `SupplyPagesLocked` again, at which point it would expect the operation to continue at the |
| // exact same page. |
| VmPageOrMarkerRef src_page_ref = pages->PeekReference(); |
| // The src_page_ref can be null if the head of the page list is not a reference or if the page |
| // list is empty. |
| if (src_page_ref) { |
| DEBUG_ASSERT(src_page_ref->IsReference()); |
| status = MakePageFromReference(src_page_ref, page_request); |
| if (status != ZX_OK) { |
| break; |
| } |
| } |
| VmPageOrMarker src_page = pages->Pop(); |
| DEBUG_ASSERT(!src_page.IsReference()); |
| |
| // The pager API does not allow the source VMO of supply pages to have a page source, so we can |
| // assume that any empty pages are zeroes and insert explicit markers here. We need to insert |
| // explicit markers to actually resolve the pager fault. |
| if (src_page.IsEmpty()) { |
| src_page = VmPageOrMarker::Marker(); |
| } |
| |
| // A newly supplied page starts off as Clean. |
| if (src_page.IsPage() && is_source_preserving_page_content()) { |
| UpdateDirtyStateLocked(src_page.Page(), offset, DirtyState::Clean, |
| /*is_pending_add=*/true); |
| } |
| |
| if (can_borrow_locked() && src_page.IsPage() && |
| pmm_physical_page_borrowing_config()->is_borrowing_in_supplypages_enabled()) { |
| // Assert some things we implicitly know are true (currently). We can avoid explicitly |
| // checking these in the if condition for now. |
| DEBUG_ASSERT(!is_source_supplying_specific_physical_pages()); |
| DEBUG_ASSERT(!src_page.Page()->is_loaned()); |
| DEBUG_ASSERT(options != SupplyOptions::PhysicalPageProvider); |
| // Try to replace src_page with a loaned page. We allocate the loaned page one page at a time |
| // to avoid failing the allocation due to asking for more loaned pages than there are free |
| // loaned pages. |
| vm_page_t* new_page = nullptr; |
| zx_status_t alloc_status = AllocLoanedPage(&new_page); |
| // If we got a loaned page, replace the page in src_page, else just continue with src_page |
| // unmodified since pmm has no more loaned free pages or |
| // !is_borrowing_in_supplypages_enabled(). |
| if (alloc_status == ZX_OK) { |
| CopyPageForReplacementLocked(new_page, src_page.Page()); |
| vm_page_t* old_page = src_page.ReleasePage(); |
| list_add_tail(&freed_list, &old_page->queue_node); |
| src_page = VmPageOrMarker::Page(new_page); |
| } |
| DEBUG_ASSERT(src_page.IsPage()); |
| } |
| |
| // Defer individual range updates so we can do them in blocks. |
| const CanOverwriteContent overwrite_policy = options == SupplyOptions::TransferData |
| ? CanOverwriteContent::NonZero |
| : CanOverwriteContent::None; |
| VmPageOrMarker old_page; |
| if (options == SupplyOptions::PhysicalPageProvider) { |
| // When being called from the physical page provider, we need to call InitializeVmPage(), |
| // which AddNewPageLocked() will do. |
| // We only want to populate offsets that have true absence of content, so do not overwrite |
| // anything in the page list. |
| DEBUG_ASSERT(src_page.IsPage()); |
| status = AddNewPageLocked(offset, src_page.Page(), overwrite_policy, &old_page, |
| /*zero=*/false, /*do_range_update=*/false); |
| if (status == ZX_OK) { |
| // The page was successfully added, but we still have a copy in src_page, so we need to |
| // release it. We store the result in a temporary, as we are required to use the return |
| // value of ReleasePage. |
| [[maybe_unused]] vm_page_t* unused = src_page.ReleasePage(); |
| } |
| } else { |
| // When not being called from the physical page provider, we don't need InitializeVmPage(), |
| // so we use AddPageLocked(). |
| // We only want to populate offsets that have true absence of content, so do not overwrite |
| // anything in the page list. |
| status = AddPageLocked(&src_page, offset, overwrite_policy, &old_page, |
| /*do_range_update=*/false); |
| } |
| |
| // If the content overwrite policy was None, the old page should be empty. |
| DEBUG_ASSERT(overwrite_policy != CanOverwriteContent::None || old_page.IsEmpty()); |
| |
| // Clean up the old_page if necessary. The action taken is different depending on the state of |
| // old_page: |
| // 1. Page: If old_page is backed by an actual page, remove it from the page queues and free |
| // the page. |
| // 2. Reference: If old_page is a reference, free the reference. |
| // 3. Interval: We should not be overwriting data in a pager-backed VMO, so assert that |
| // old_page is not an interval. |
| // 4. Marker: There are no resources to free here, so do nothing. |
| if (old_page.IsPage()) { |
| vm_page_t* released_page = old_page.ReleasePage(); |
| pmm_page_queues()->Remove(released_page); |
| DEBUG_ASSERT(!list_in_list(&released_page->queue_node)); |
| list_add_tail(&freed_list, &released_page->queue_node); |
| } else if (old_page.IsReference()) { |
| FreeReference(old_page.ReleaseReference()); |
| } |
| DEBUG_ASSERT(!old_page.IsInterval()); |
| |
| if (status == ZX_OK) { |
| new_pages_len += PAGE_SIZE; |
| } else { |
| if (src_page.IsPageOrRef()) { |
| DEBUG_ASSERT(src_page.IsPage()); |
| vm_page_t* page = src_page.ReleasePage(); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(&freed_list, &page->queue_node); |
| } |
| |
| if (likely(status == ZX_ERR_ALREADY_EXISTS)) { |
| status = ZX_OK; |
| |
| // We hit the end of a run of absent pages, so notify the page source |
| // of any new pages that were added and reset the tracking variables. |
| if (new_pages_len) { |
| RangeChangeUpdateLocked(new_pages_start, new_pages_len, RangeChangeOp::Unmap); |
| if (page_source_) { |
| page_source_->OnPagesSupplied(new_pages_start, new_pages_len); |
| } |
| } |
| new_pages_start = offset + PAGE_SIZE; |
| new_pages_len = 0; |
| } else { |
| break; |
| } |
| } |
| offset += PAGE_SIZE; |
| |
| DEBUG_ASSERT(new_pages_start + new_pages_len <= end); |
| } |
| // Unless there was an error and we exited the loop early, there should have been the correct |
| // number of pages in the splice list. |
| DEBUG_ASSERT(offset == end || status != ZX_OK); |
| if (new_pages_len) { |
| RangeChangeUpdateLocked(new_pages_start, new_pages_len, RangeChangeOp::Unmap); |
| if (page_source_) { |
| page_source_->OnPagesSupplied(new_pages_start, new_pages_len); |
| } |
| } |
| |
| if (!list_is_empty(&freed_list)) { |
| // Even though we did not insert these pages successfully, we had logical ownership of them. |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/true); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| *supplied_len = offset - start; |
| // Assert we have only popped as many pages from the splice list as we have supplied. |
| DEBUG_ASSERT((pages->Position() - initial_list_position) == *supplied_len); |
| return status; |
| } |
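| |
| // Worked example (illustrative) of the run batching in SupplyPagesLocked above: if new pages |
| // are added at offsets 0x0000 and 0x1000 but the slot at 0x2000 is already populated |
| // (ZX_ERR_ALREADY_EXISTS), the accumulated run [0x0000, 0x2000) is flushed with a single |
| // Unmap range op and a single OnPagesSupplied call, and tracking restarts at 0x3000 for any |
| // further new pages. |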
| |
| // This is a transient operation used only to fail currently outstanding page requests. It does not |
| // alter the state of the VMO, or any pages that might have already been populated within the |
| // specified range. |
| // |
| // If certain pages in this range are populated, they must have been populated via a previous |
| // SupplyPages() call that succeeded. So it might be fine for clients to continue accessing |
| // them, despite the larger range having failed. |
| // |
| // TODO(rashaeqbal): If we support a more permanent failure mode in the future, we will need to free |
| // populated pages in the specified range, and possibly detach the VMO from the page source. |
| zx_status_t VmCowPages::FailPageRequestsLocked(uint64_t offset, uint64_t len, |
| zx_status_t error_status) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| ASSERT(page_source_); |
| |
| if (!PageSource::IsValidInternalFailureCode(error_status)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| if (!InRange(offset, len, size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (page_source_->is_detached()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| page_source_->OnPagesFailed(offset, len, error_status); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::DirtyPagesLocked(uint64_t offset, uint64_t len, list_node_t* alloc_list, |
| LazyPageRequest* page_request) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| ASSERT(page_source_); |
| |
| if (!page_source_->ShouldTrapDirtyTransitions()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| DEBUG_ASSERT(is_source_preserving_page_content()); |
| |
| const uint64_t start_offset = offset; |
| const uint64_t end_offset = offset + len; |
| |
| if (start_offset > size_locked()) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // Overflow check. |
| if (end_offset < start_offset) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // After the above checks, the page source has attempted to respond correctly to a range of |
| // dirty requests, so the kernel should resolve those outstanding requests even in the failure |
| // case. The page source cannot tell from a returned error which ranges caused it, so the |
| // kernel should either completely succeed or completely fail the request, instead of holding |
| // onto a partial outstanding request that would block pager progress. |
| auto invalidate_requests_on_error = fit::defer([this, len, start_offset] { |
| AssertHeld(lock_ref()); |
| DEBUG_ASSERT(size_locked() >= start_offset); |
| |
| uint64_t invalidate_len = ktl::min(size_locked() - start_offset, len); |
| InvalidateDirtyRequestsLocked(start_offset, invalidate_len); |
| }); |
| |
| // The page source may have tried to mark a larger range than necessary as dirty. Invalidate the |
| // requests and return an error. |
| if (end_offset > size_locked()) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (page_source_->is_detached()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // If any of the pages in the range are zero page markers (Clean zero pages), they need to be |
| // forked in order to be dirtied (written to). Find the number of such pages that need to be |
| // allocated. We also need to allocate zero pages to replace sparse zero intervals. |
| size_t zero_pages_count = 0; |
| // This tracks the beginning of an interval that falls in the specified range. Since we might |
| // start partway inside an interval, this is initialized to start_offset so that we only consider |
| // the portion of the interval inside the range. If we did not start inside an interval, we will |
| // end up reinitializing this when we do find an interval start, before this value is used, so it |
| // is safe to initialize to start_offset in all cases. |
| uint64_t interval_start = start_offset; |
| // This tracks whether we saw an interval start sentinel in the traversal, but have not yet |
| // encountered a matching interval end sentinel. Should we end the traversal partway in an |
| // interval, we will need to handle the portion of the interval between the interval start and the |
| // end of the specified range. |
| bool unmatched_interval_start = false; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&zero_pages_count, &interval_start, &unmatched_interval_start, &found_page_or_gap]( |
| const VmPageOrMarker* p, uint64_t off) { |
| found_page_or_gap = true; |
| if (p->IsMarker()) { |
| zero_pages_count++; |
| return ZX_ERR_NEXT; |
| } |
| if (p->IsIntervalZero()) { |
| if (p->IsIntervalStart()) { |
| interval_start = off; |
| unmatched_interval_start = true; |
| } else if (p->IsIntervalEnd()) { |
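            // Count the pages from the tracked interval start through this end sentinel
            // inclusive, hence the extra PAGE_SIZE.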
| zero_pages_count += (off - interval_start + PAGE_SIZE) / PAGE_SIZE; |
| unmatched_interval_start = false; |
| } else { |
| DEBUG_ASSERT(p->IsIntervalSlot()); |
| zero_pages_count++; |
| } |
| return ZX_ERR_NEXT; |
| } |
| // Pager-backed VMOs cannot have compressed references, so the only other type is a page. |
| DEBUG_ASSERT(p->IsPage()); |
| return ZX_ERR_NEXT; |
| }, |
| [&found_page_or_gap](uint64_t start, uint64_t end) { |
| found_page_or_gap = true; |
| // A gap indicates a page that has not been supplied yet. It will need to be supplied |
| // first. Although we will never generate a DIRTY request for absent pages in the first |
| // place, it is still possible for a clean page to get evicted after the DIRTY request was |
| // generated. It is also possible for a dirty zero interval to have been written back such |
| // that we have an old DIRTY request for the interval. |
| // |
| // Spuriously resolve the DIRTY page request, and let the waiter(s) retry looking up the |
| // page, which will generate a READ request first to supply the missing page. |
| return ZX_ERR_NOT_FOUND; |
| }, |
| start_offset, end_offset); |
| |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
  // Handle the last interval, or the case where we did not enter the traversal callbacks at all.
| if (unmatched_interval_start || !found_page_or_gap) { |
| DEBUG_ASSERT(found_page_or_gap || interval_start == start_offset); |
| zero_pages_count += (end_offset - interval_start) / PAGE_SIZE; |
| } |
| |
| // If we have found any zero pages to populate, then we need to allocate and transition them to |
| // the dirty state. |
| if (zero_pages_count > 0) { |
| // Allocate the number of zero pages required upfront, so that we can fail the call early if the |
| // page allocation fails. First determine how many pages we still need to allocate, based on the |
| // number of existing pages in the list. |
| uint64_t alloc_list_len = list_length(alloc_list); |
| zero_pages_count = zero_pages_count > alloc_list_len ? zero_pages_count - alloc_list_len : 0; |
| |
| // First try to allocate all the pages at once. This is an optimization and avoids repeated |
| // calls to the PMM to allocate single pages. If the PMM returns ZX_ERR_SHOULD_WAIT, fall back |
| // to allocating one page at a time below, giving reclamation strategies a better chance to |
| // catch up with incoming allocation requests. |
| status = pmm_alloc_pages(zero_pages_count, pmm_alloc_flags_, alloc_list); |
| if (status == ZX_OK) { |
| // All requested pages allocated. |
| zero_pages_count = 0; |
| } else { |
| if (status != ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| |
| // Fall back to allocating a single page at a time. We want to do this before we can start |
| // inserting pages into the page list, to avoid rolling back any pages we inserted but could |
| // not dirty in case we fail partway after having inserted some pages into the page list. |
| // Rolling back like this can lead to a livelock where we are constantly allocating some |
| // pages, freeing them, waiting on the page_request, and then repeating. |
| // |
| // If allocations do fail partway here, we will have accumulated the allocated pages in |
| // alloc_list, so we will be able to reuse them on a subsequent call to DirtyPagesLocked. This |
| // ensures we are making forward progress across successive calls. |
| while (zero_pages_count > 0) { |
| vm_page_t* new_page; |
        // We will initialize this page later when passing it to AddNewPageLocked.
| status = AllocUninitializedPage(&new_page, page_request); |
| // If single page allocation fails, bubble up the failure. |
| if (status != ZX_OK) { |
| return status; |
| } |
| list_add_tail(alloc_list, &new_page->queue_node); |
| zero_pages_count--; |
| } |
| } |
| DEBUG_ASSERT(zero_pages_count == 0); |
| |
| // We have to mark all the requested pages Dirty *atomically*. The user pager might be tracking |
| // filesystem space reservations based on the success / failure of this call. So if we fail |
| // partway, the user pager might think that no pages in the specified range have been dirtied, |
| // which would be incorrect. If there are any conditions that would cause us to fail, evaluate |
| // those before actually adding the pages, so that we can return the failure early before |
| // starting to mark pages Dirty. |
| // |
| // Install page slots for all the intervals we'll be adding zero pages in. Page insertion will |
| // only proceed once we've allocated all the slots without any errors. |
| // Populating slots will alter the page list. So break out of the traversal upon finding an |
| // interval, populate slots in it, and then resume the traversal after the interval. |
| uint64_t next_start_offset = start_offset; |
| do { |
| struct { |
| bool found_interval; |
| uint64_t start; |
| uint64_t end; |
| } state = {.found_interval = false, .start = 0, .end = 0}; |
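      // Find the next run of interval sentinels in [next_start_offset, end_offset). The first
      // callback selects interval starts and ends, and the run callback records the interval's
      // extent and stops the traversal.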
| status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| return p->IsIntervalStart() || p->IsIntervalEnd(); |
| }, |
| [](const VmPageOrMarker* p, uint64_t off) { |
| DEBUG_ASSERT(p->IsIntervalZero()); |
| return ZX_ERR_NEXT; |
| }, |
| [&state](uint64_t start, uint64_t end, bool is_interval) { |
| DEBUG_ASSERT(is_interval); |
| state = {.found_interval = true, .start = start, .end = end}; |
| return ZX_ERR_STOP; |
| }, |
| next_start_offset, end_offset); |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // No intervals remain. |
| if (!state.found_interval) { |
| break; |
| } |
| // Ensure we're making forward progress. |
| DEBUG_ASSERT(state.end - state.start >= PAGE_SIZE); |
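      // Turn every page in this interval into an individual slot, so that each page can be
      // committed separately below.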
| zx_status_t st = page_list_.PopulateSlotsInInterval(state.start, state.end); |
| if (st != ZX_OK) { |
| DEBUG_ASSERT(st == ZX_ERR_NO_MEMORY); |
| // Before returning, we need to undo any slots we might have populated in intervals we |
| // previously encountered. This is a rare error case and can be inefficient. |
| for (uint64_t off = start_offset; off < state.start; off += PAGE_SIZE) { |
| auto slot = page_list_.Lookup(off); |
| if (slot) { |
            // If this is an interval slot, return it. Note that even though we populated all
            // slots up to this point, not all of them will still be slots when this for-loop
            // reaches them. When returning slots, they can merge with intervals both before and
            // after, so it's possible that the next slot we were expecting has already been
            // consumed.
| if (slot->IsIntervalSlot()) { |
| page_list_.ReturnIntervalSlot(off); |
| } |
| } |
| } |
| return st; |
| } |
| next_start_offset = state.end; |
| } while (next_start_offset < end_offset); |
| |
| // All operations from this point on must succeed so we can atomically mark pages dirty. |
| |
| // Increment the generation count as we're going to be inserting new pages. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| // Install newly allocated pages in place of the zero page markers and interval sentinels. Start |
| // with clean zero pages even for the intervals, so that the dirty transition logic below can |
| // uniformly transition them to dirty along with pager supplied pages. |
| status = page_list_.ForEveryPageInRange( |
| [this, &alloc_list](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsMarker() || p->IsIntervalSlot()) { |
| DEBUG_ASSERT(!list_is_empty(alloc_list)); |
| AssertHeld(lock_ref()); |
| |
| // AddNewPageLocked will also zero the page and update any mappings. |
| // |
| // TODO(rashaeqbal): Depending on how often we end up forking zero markers, we might |
            // want to pass do_range_update = false, and defer updates until later, so we can
| // perform a single batch update. |
| zx_status_t status = |
| AddNewPageLocked(off, list_remove_head_type(alloc_list, vm_page, queue_node), |
| CanOverwriteContent::Zero, nullptr); |
| // AddNewPageLocked will not fail with ZX_ERR_ALREADY_EXISTS as we can overwrite |
| // markers and interval slots since they are zero, nor with ZX_ERR_NO_MEMORY as we don't |
            // markers and interval slots since they are zero, nor with ZX_ERR_NO_MEMORY as we
            // don't need to allocate a new slot in the page list; we're simply replacing its
            // content.
| ASSERT(status == ZX_OK); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| |
| // We don't expect an error from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
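  // Transition any remaining non-Dirty pages to Dirty, and notify the page source of each
  // contiguous run that was dirtied so it can resolve the outstanding DIRTY requests.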
| status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| DEBUG_ASSERT(!p->IsReference()); |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(is_page_clean(page) || !page->is_loaned()); |
| return !is_page_dirty(page); |
| } |
| return false; |
| }, |
| [this](const VmPageOrMarker* p, uint64_t off) { |
| DEBUG_ASSERT(p->IsPage()); |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(!is_page_dirty(page)); |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(page, off, DirtyState::Dirty); |
| return ZX_ERR_NEXT; |
| }, |
| [this](uint64_t start, uint64_t end, bool unused) { |
| page_source_->OnPagesDirtied(start, end - start); |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // All pages have been dirtied successfully, so cancel the cleanup on error. |
| invalidate_requests_on_error.cancel(); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::EnumerateDirtyRangesLocked(uint64_t offset, uint64_t len, |
| DirtyRangeEnumerateFunction&& dirty_range_fn) { |
| canary_.Assert(); |
| |
| // Dirty pages are only tracked if the page source preserves content. |
| if (!is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| if (!InRange(offset, len, size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| const uint64_t start_offset = ROUNDDOWN(offset, PAGE_SIZE); |
| const uint64_t end_offset = ROUNDUP(offset + len, PAGE_SIZE); |
| |
| zx_status_t status = page_list_.ForEveryPageAndContiguousRunInRange( |
| [](const VmPageOrMarker* p, uint64_t off) { |
| // Enumerate both AwaitingClean and Dirty pages, i.e. anything that is not Clean. |
| // AwaitingClean pages are "dirty" too for the purposes of this enumeration, since their |
| // modified contents are still in the process of being written back. |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(is_page_clean(page) || !page->is_loaned()); |
| return !is_page_clean(page); |
| } |
| // Enumerate any dirty zero intervals. |
| if (p->IsIntervalZero()) { |
| // For now we only support dirty intervals. |
| DEBUG_ASSERT(!p->IsZeroIntervalClean()); |
| return !p->IsZeroIntervalClean(); |
| } |
| // Pager-backed VMOs cannot have compressed references, so the only other type is a marker. |
| DEBUG_ASSERT(p->IsMarker()); |
| return false; |
| }, |
| [](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(!is_page_clean(page)); |
| DEBUG_ASSERT(!page->is_loaned()); |
| DEBUG_ASSERT(page->object.get_page_offset() == off); |
| } else if (p->IsIntervalZero()) { |
| DEBUG_ASSERT(!p->IsZeroIntervalClean()); |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [&dirty_range_fn](uint64_t start, uint64_t end, bool is_interval) { |
| // Zero intervals are enumerated as zero ranges. |
| return dirty_range_fn(start, end - start, /*range_is_zero=*/is_interval); |
| }, |
| start_offset, end_offset); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::WritebackBeginLocked(uint64_t offset, uint64_t len, bool is_zero_range) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| ASSERT(page_source_); |
| |
| if (!InRange(offset, len, size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| const uint64_t start_offset = offset; |
| const uint64_t end_offset = offset + len; |
| // We only need to consider transitioning committed pages if the caller has specified that this is |
| // not a zero range. For a zero range, we cannot start cleaning any pages because the caller has |
| // expressed intent to write back zeros in this range; any pages we clean might get evicted and |
| // incorrectly supplied again as zero pages, leading to data loss. |
| // |
| // When querying dirty ranges, zero page intervals are indicated as dirty zero ranges. So it's |
| // perfectly reasonable for the user pager to write back these zero ranges efficiently without |
| // having to read the actual contents of the range, which would read zeroes anyway. There can |
| // exist a race however, where the user pager has just discovered a dirty zero range, and before |
| // it starts writing it out, an actual page gets dirtied in that range. Consider the following |
| // example that demonstrates the race: |
| // 1. The zero interval [5, 10) is indicated as a dirty zero range when the user pager queries |
| // dirty ranges. |
| // 2. A write comes in for page 7 and it is marked Dirty. The interval is split up into two: [5, |
| // 7) and [8, 10). |
| // 3. The user pager prepares to write the range [5, 10) with WritebackBegin. |
| // 4. Both the intervals as well as page 7 are marked AwaitingClean. |
| // 5. The user pager still thinks that [5, 10) is zero and writes back zeroes for the range. |
| // 6. The user pager does a WritebackEnd on [5, 10), and page 7 gets marked Clean. |
| // 7. At some point in the future, page 7 gets evicted. The data on page 7 (which was prematurely |
| // marked Clean) is now lost. |
| // |
| // This race occurred because there was a mismatch between what the user pager and the kernel |
| // think the contents of the range being written back are. The user pager intended to mark only |
| // zero ranges clean, not actual pages. The is_zero_range flag captures this intent, so that the |
| // kernel does not incorrectly clean actual committed pages. Committed dirty pages will be |
| // returned as actual dirty pages (not dirty zero ranges) on a subsequent call to query dirty |
| // ranges, and can be cleaned then. |
| |
| auto interval_start = VmPageOrMarkerRef(nullptr); |
| uint64_t interval_start_off; |
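  // Track the start sentinel of any dirty zero interval we are partway through, so that the whole
  // interval can be transitioned once its end sentinel is encountered.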
| zx_status_t status = page_list_.ForEveryPageInRangeMutable( |
| [is_zero_range, &interval_start, &interval_start_off, this](VmPageOrMarkerRef p, |
| uint64_t off) { |
| // VMOs with a page source should never have references. |
| DEBUG_ASSERT(!p->IsReference()); |
| // If the page is pinned we have to leave it Dirty in case it is still being written to |
| // via DMA. The VM system will be unaware of these writes, and so we choose to be |
| // conservative here and might end up with pinned pages being left dirty for longer, until |
| // a writeback is attempted after the unpin. |
| // If the caller indicates that they're only cleaning zero pages, any committed pages need |
| // to be left dirty. |
| if (p->IsPage() && (p->Page()->object.pin_count > 0 || is_zero_range)) { |
| return ZX_ERR_NEXT; |
| } |
| // Transition pages from Dirty to AwaitingClean. |
| if (p->IsPage() && is_page_dirty(p->Page())) { |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(p->Page(), off, DirtyState::AwaitingClean); |
| return ZX_ERR_NEXT; |
| } |
| if (p->IsIntervalZero()) { |
| // Transition zero intervals to AwaitingClean. |
| DEBUG_ASSERT(p->IsZeroIntervalDirty()); |
| if (p->IsIntervalStart() || p->IsIntervalSlot()) { |
| // Start tracking a dirty interval. It will only transition once the end is encountered. |
| DEBUG_ASSERT(!interval_start); |
| interval_start = p; |
| interval_start_off = off; |
| } |
| if (p->IsIntervalEnd() || p->IsIntervalSlot()) { |
| // Now that we've encountered the end, the entire interval can be transitioned to |
| // AwaitingClean. This is done by setting the AwaitingCleanLength of the start sentinel. |
| // TODO: If the writeback began partway into the interval, try to coalesce the start's |
| // awaiting clean length with the range being cleaned here if it immediately follows. |
| if (interval_start) { |
| // Set the new AwaitingClean length to the max of the old value and the new one. |
| // See comments in WritebackEndLocked for an explanation. |
| const uint64_t old_len = interval_start->GetZeroIntervalAwaitingCleanLength(); |
| interval_start.SetZeroIntervalAwaitingCleanLength( |
| ktl::max(off - interval_start_off + PAGE_SIZE, old_len)); |
| } |
| // Reset the interval start so we can track a new one later. |
| interval_start = VmPageOrMarkerRef(nullptr); |
| } |
| return ZX_ERR_NEXT; |
| } |
| // This was either a marker (which is already clean), or a non-Dirty page. |
| DEBUG_ASSERT(p->IsMarker() || !is_page_dirty(p->Page())); |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| // Process the last partial interval. |
| if (interval_start) { |
| DEBUG_ASSERT(interval_start->IsIntervalStart()); |
| const uint64_t old_len = interval_start->GetZeroIntervalAwaitingCleanLength(); |
| interval_start.SetZeroIntervalAwaitingCleanLength( |
| ktl::max(end_offset - interval_start_off, old_len)); |
| } |
| |
| // Set any mappings for this range to read-only, so that a permission fault is triggered the next |
| // time the page is written to in order for us to track it as dirty. This might cover more pages |
| // than the Dirty pages found in the page list traversal above, but we choose to do this once for |
| // the entire range instead of per page; pages in the AwaitingClean and Clean states will already |
| // have their write permission removed, so this is a no-op for them. |
| RangeChangeUpdateLocked(start_offset, end_offset - start_offset, RangeChangeOp::RemoveWrite); |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::WritebackEndLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| |
| ASSERT(page_source_); |
| |
| if (!InRange(offset, len, size_)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!is_source_preserving_page_content()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // We might end up removing / clipping zero intervals, so update the generation count. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| const uint64_t start_offset = offset; |
| const uint64_t end_offset = offset + len; |
| |
| // Mark any AwaitingClean pages Clean. Remove AwaitingClean intervals that can be fully cleaned, |
| // otherwise clip the interval start removing the part that has been cleaned. Note that deleting |
| // an interval start is delayed until the corresponding end is encountered, and to ensure safe |
| // continued traversal, the start should always be released before the end, i.e. in the expected |
| // forward traversal order for RemovePages. |
| VmPageOrMarker* interval_start = nullptr; |
| uint64_t interval_start_off; |
| // This tracks the end offset until which all zero intervals can be marked clean. This is a |
| // running counter that is maintained across multiple zero intervals. Each time we encounter |
| // a new interval start, we take the max of the existing value and the AwaitingCleanLength of the |
| // new interval. This is because when zero intervals are truncated at the end or split, their |
| // AwaitingCleanLength does not get updated, even if it's larger than the current interval length. |
| // This is an optimization to avoid having to potentially walk to another node to find the |
| // relevant start to update. The reason it is safe to leave the AwaitingCleanLength unchanged is |
| // that it should be possible to apply the AwaitingCleanLength to any new zero intervals that get |
| // added later beyond the truncated interval. The user pager has indicated its intent to write a |
| // range as zeros, so until the point that it actually completes the writeback, it doesn't matter |
| // if zero intervals are removed and re-added, as long as they fall in the range that was |
| // initially indicated as being written back as zeros. |
| uint64_t interval_awaiting_clean_end = start_offset; |
| page_list_.RemovePages( |
| [&interval_start, &interval_start_off, &interval_awaiting_clean_end, this](VmPageOrMarker* p, |
| uint64_t off) { |
| // VMOs with a page source should never have references. |
| DEBUG_ASSERT(!p->IsReference()); |
| // Transition pages from AwaitingClean to Clean. |
| if (p->IsPage() && is_page_awaiting_clean(p->Page())) { |
| AssertHeld(lock_ref()); |
| UpdateDirtyStateLocked(p->Page(), off, DirtyState::Clean); |
| return ZX_ERR_NEXT; |
| } |
| if (p->IsIntervalZero()) { |
| // Handle zero intervals. |
| DEBUG_ASSERT(p->IsZeroIntervalDirty()); |
| if (p->IsIntervalStart() || p->IsIntervalSlot()) { |
| DEBUG_ASSERT(!interval_start); |
| // Start tracking an interval. |
| interval_start = p; |
| interval_start_off = off; |
| // See if we can advance interval_awaiting_clean_end to include the AwaitingCleanLength |
| // of this interval. |
| interval_awaiting_clean_end = ktl::max(interval_awaiting_clean_end, |
| off + p->GetZeroIntervalAwaitingCleanLength()); |
| } |
| if (p->IsIntervalEnd() || p->IsIntervalSlot()) { |
| // Can only transition the end if we saw the corresponding start. |
| if (interval_start) { |
| AssertHeld(lock_ref()); |
| if (off < interval_awaiting_clean_end) { |
              // The entire interval is clean, so we can remove it.
| if (interval_start_off != off) { |
| *interval_start = VmPageOrMarker::Empty(); |
| // Return the start slot as it could have come from an earlier page list node. |
| // If the start slot came from the same node, we know that we still have a |
| // non-empty slot in that node (the current interval end we're looking at), and so |
| // the current node cannot be freed up, making it safe to continue traversal. The |
| // interval start should always be released before the end, which is consistent |
| // with forward traversal done by RemovePages. |
| page_list_.ReturnEmptySlot(interval_start_off); |
| } |
              // This empty slot will be returned by the RemovePages iterator.
| *p = VmPageOrMarker::Empty(); |
| } else { |
              // The entire interval cannot be marked clean. Move the interval start forward by
              // the awaiting clean length, which will also set the AwaitingCleanLength for the
              // resulting interval.
| // Ignore any errors. Cleaning is best effort. If this fails, the interval will |
| // remain as is and get retried on another writeback attempt. |
| page_list_.ClipIntervalStart(interval_start_off, |
| interval_awaiting_clean_end - interval_start_off); |
| } |
| // Either way, the interval start tracking needs to be reset. |
| interval_start = nullptr; |
| } |
| } |
| return ZX_ERR_NEXT; |
| } |
| // This was either a marker (which is already clean), or a non-AwaitingClean page. |
| DEBUG_ASSERT(p->IsMarker() || !is_page_awaiting_clean(p->Page())); |
| return ZX_ERR_NEXT; |
| }, |
| start_offset, end_offset); |
| |
| // Handle the last partial interval. |
| if (interval_start) { |
| // Ignore any errors. Cleaning is best effort. If this fails, the interval will remain as is and |
| // get retried on another writeback attempt. |
| page_list_.ClipIntervalStart( |
| interval_start_off, ktl::min(interval_awaiting_clean_end, end_offset) - interval_start_off); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidateZeroIntervalsLocked()); |
| return ZX_OK; |
| } |
| |
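// Walks up the parent chain and returns the root of the hierarchy, which is |this| if there is no
// parent.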
| const VmCowPages* VmCowPages::GetRootLocked() const { |
| auto cow_pages = this; |
| AssertHeld(cow_pages->lock_ref()); |
| while (cow_pages->parent_) { |
| cow_pages = cow_pages->parent_.get(); |
| // We just checked that this is not null in the loop conditional. |
| DEBUG_ASSERT(cow_pages); |
| } |
| DEBUG_ASSERT(cow_pages); |
| return cow_pages; |
| } |
| |
| fbl::RefPtr<VmCowPages> VmCowPages::DebugGetParent() { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
| return parent_; |
| } |
| |
| fbl::RefPtr<PageSource> VmCowPages::GetRootPageSourceLocked() const { |
| auto root = GetRootLocked(); |
  // The root will never be null. It is either the most distant ancestor, or |this| if there's no
  // parent.
| DEBUG_ASSERT(root); |
| return root->page_source_; |
| } |
| |
| void VmCowPages::DetachSourceLocked() { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(page_source_); |
| page_source_->Detach(); |
| |
| // We stack-own loaned pages from UnmapAndRemovePagesLocked() to FreePagesLocked(). |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| |
| // We would like to remove all committed pages so that all future page faults on this VMO and its |
| // clones can fail in a deterministic manner. However, if the page source is preserving content |
  // (i.e. is a userpager), we need to hold on to un-Clean (Dirty and AwaitingClean) pages so that
  // they can be written back by the page source. If the page source is not preserving content, its
  // pages will not be dirty tracked to begin with, i.e. their dirty state will be Untracked, so we
  // will end up removing all pages.
| |
| // We should only be removing pages from the root VMO. |
| DEBUG_ASSERT(!parent_); |
| |
| // Even though we might end up removing only a subset of the pages, unmap them all at once as an |
| // optimization. Only the userpager is expected to access (dirty) pages beyond this point, in |
| // order to write back their contents, where the cost of the writeback is presumably much larger |
| // than page faults to update hardware page table mappings for resident pages. |
| RangeChangeUpdateLocked(0, size_, RangeChangeOp::Unmap); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(&freed_list); |
| |
| // Remove all clean (or untracked) pages. |
| // TODO(rashaeqbal): Pages that linger after this will be written back and marked clean at some |
| // point, and will age through the pager-backed queues and eventually get evicted. We could |
| // adopt an eager approach instead, and decommit those pages as soon as they get marked clean. |
| // If we do that, we could also extend the eager approach to supply_pages, where pages get |
| // decommitted on supply, i.e. the supply is a no-op. |
| page_list_.RemovePages( |
| [&page_remover](VmPageOrMarker* p, uint64_t off) { |
| // A marker is a clean zero page. Replace it with an empty slot. |
| if (p->IsMarker()) { |
| *p = VmPageOrMarker::Empty(); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Zero intervals are dirty so they cannot be removed. |
| if (p->IsIntervalZero()) { |
| // TODO: Remove clean intervals once they are supported. |
| DEBUG_ASSERT(p->IsZeroIntervalDirty()); |
| return ZX_ERR_NEXT; |
| } |
| |
| // VMOs with a page source cannot have references. |
| DEBUG_ASSERT(p->IsPage()); |
| |
| // We cannot remove the page if it is dirty-tracked but not clean. |
| if (is_page_dirty_tracked(p->Page()) && !is_page_clean(p->Page())) { |
| DEBUG_ASSERT(!p->Page()->is_loaned()); |
| return ZX_ERR_NEXT; |
| } |
| |
| // This is a page that we're going to remove; we don't expect it to be pinned. |
| DEBUG_ASSERT(p->Page()->object.pin_count == 0); |
| |
| page_remover.Push(p->ReleasePage()); |
| return ZX_ERR_NEXT; |
| }, |
| 0, size_); |
| |
| page_remover.Flush(); |
| FreePagesLocked(&freed_list, /*freeing_owned_pages=*/true); |
| |
| IncrementHierarchyGenerationCountLocked(); |
| } |
| |
| void VmCowPages::RangeChangeUpdateFromParentLocked(const uint64_t offset, const uint64_t len, |
| RangeChangeList* list) { |
| canary_.Assert(); |
| |
| LTRACEF("offset %#" PRIx64 " len %#" PRIx64 " p_offset %#" PRIx64 " size_ %#" PRIx64 "\n", offset, |
| len, parent_offset_, size_); |
| |
| // our parent is notifying that a range of theirs changed, see where it intersects |
| // with our offset into the parent and pass it on |
| uint64_t offset_new; |
| uint64_t len_new; |
| if (!GetIntersect(parent_offset_, size_, offset, len, &offset_new, &len_new)) { |
| return; |
| } |
| |
| // if they intersect with us, then by definition the new offset must be >= parent_offset_ |
| DEBUG_ASSERT(offset_new >= parent_offset_); |
| |
| // subtract our offset |
| offset_new -= parent_offset_; |
| |
| // verify that it's still within range of us |
| DEBUG_ASSERT(offset_new + len_new <= size_); |
| |
| LTRACEF("new offset %#" PRIx64 " new len %#" PRIx64 "\n", offset_new, len_new); |
| |
| // Check if there are any gaps in this range where we would actually see the parent. |
| uint64_t first_gap_start = UINT64_MAX; |
| uint64_t last_gap_end = 0; |
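  // These initial values ensure that first_gap_start < last_gap_end if and only if the traversal
  // below finds at least one gap.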
| page_list_.ForEveryPageAndGapInRange( |
| [](auto page, uint64_t offset) { |
        // For anything in the page list we know we do not see the parent for this offset, so
        // regardless of what it is just keep looking for a gap. Additionally, any children we
        // have will see this content instead of our parent's, so it is safe to skip them as well.
| return ZX_ERR_NEXT; |
| }, |
| [&first_gap_start, &last_gap_end](uint64_t start, uint64_t end) { |
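        // Accumulate the union of all gaps; only the outermost bounds are needed to construct the
        // covering range below.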
| first_gap_start = ktl::min(first_gap_start, start); |
| last_gap_end = ktl::max(last_gap_end, end); |
| return ZX_ERR_NEXT; |
| }, |
| offset_new, offset_new + len_new); |
| |
| if (first_gap_start >= last_gap_end) { |
    // Entire range was traversed and no gaps found. Neither we nor our children can see the
    // parent's content for this range, so we can skip the range update.
| vm_vmo_range_update_from_parent_skipped.Add(1); |
| return; |
| } |
| |
| // Construct a new, potentially smaller, range that covers the gaps. This will still result in |
| // potentially processing pages that are locally covered, but are limited to a single range here. |
| offset_new = first_gap_start; |
| len_new = last_gap_end - first_gap_start; |
| vm_vmo_range_update_from_parent_performed.Add(1); |
| |
  // Pass it on. To prevent unbounded recursion we package up our desired offset and len and add
  // ourselves to the list. RangeChangeUpdateListLocked will then process it later.
| range_change_offset_ = offset_new; |
| range_change_len_ = len_new; |
| list->push_front(this); |
| } |
| |
| void VmCowPages::RangeChangeUpdateListLocked(RangeChangeList* list, RangeChangeOp op) { |
| while (!list->is_empty()) { |
| VmCowPages* object = list->pop_front(); |
| AssertHeld(object->lock_ref()); |
| |
| // Check if there is an associated backlink, and if so pass the operation over. |
| if (object->paged_ref_) { |
| AssertHeld(object->paged_ref_->lock_ref()); |
| object->paged_ref_->RangeChangeUpdateLocked(object->range_change_offset_, |
| object->range_change_len_, op); |
| } |
| |
    // inform all our children of this as well, so they can inform their mappings
| for (auto& child : object->children_list_) { |
| AssertHeld(child.lock_ref()); |
| child.RangeChangeUpdateFromParentLocked(object->range_change_offset_, |
| object->range_change_len_, list); |
| } |
| } |
| } |
| |
| void VmCowPages::RangeChangeUpdateLocked(uint64_t offset, uint64_t len, RangeChangeOp op) { |
| canary_.Assert(); |
| |
| if (len == 0) { |
| return; |
| } |
| |
| RangeChangeList list; |
| this->range_change_offset_ = offset; |
| this->range_change_len_ = len; |
| list.push_front(this); |
| RangeChangeUpdateListLocked(&list, op); |
| } |
| |
| bool VmCowPages::RemovePageForEviction(vm_page_t* page, uint64_t offset) { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
| |
| // Check this page is still a part of this VMO. |
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| if (!page_or_marker || !page_or_marker->IsPage() || page_or_marker->Page() != page) { |
| return false; |
| } |
| |
| // We shouldn't have been asked to evict a pinned page. |
| ASSERT(page->object.pin_count == 0); |
| |
| // Ignore any hints, we were asked directly to evict. |
| return RemovePageForEvictionLocked(page, offset, EvictionHintAction::Ignore); |
| } |
| |
| bool VmCowPages::RemovePageForEvictionLocked(vm_page_t* page, uint64_t offset, |
| EvictionHintAction hint_action) { |
| // Without a page source to bring the page back in we cannot even think about eviction. |
| if (!can_evict()) { |
| return false; |
| } |
| |
| // We can assume this page is in the VMO. |
| #if (DEBUG_ASSERT_IMPLEMENTED) |
| { |
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| DEBUG_ASSERT(page_or_marker); |
| DEBUG_ASSERT(page_or_marker->IsPage()); |
| DEBUG_ASSERT(page_or_marker->Page() == page); |
| } |
| #endif |
| |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| |
| // We cannot evict the page unless it is clean. If the page is dirty, it will already have been |
| // moved to the dirty page queue. |
| if (!is_page_clean(page)) { |
| DEBUG_ASSERT(!page->is_loaned()); |
| return false; |
| } |
| |
| // Do not evict if the |always_need| hint is set, unless we are told to ignore the eviction hint. |
| if (page->object.always_need == 1 && hint_action == EvictionHintAction::Follow) { |
| DEBUG_ASSERT(!page->is_loaned()); |
| // We still need to move the page from the tail of the LRU page queue(s) so that the eviction |
| // loop can make progress. Since this page is always needed, move it out of the way and into the |
| // MRU queue. Do this here while we hold the lock, instead of at the callsite. |
| // |
| // TODO(rashaeqbal): Since we're essentially simulating an access here, this page may not |
| // qualify for eviction if we do decide to override the hint soon after (i.e. if an OOM follows |
| // shortly after). Investigate adding a separate queue once we have some more data around hints |
| // usage. A possible approach might involve moving to a separate queue when we skip the page for |
| // eviction. Pages move out of said queue when accessed, and continue aging as other pages. |
| // Pages in the queue are considered for eviction pre-OOM, but ignored otherwise. |
| pmm_page_queues()->MarkAccessed(page); |
| vm_vmo_always_need_skipped_reclaim.Add(1); |
| return false; |
| } |
| |
| // Remove any mappings to this page before we remove it. |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| |
  // Use RemoveContent over just writing to the slot so that the page list has the opportunity
  // to release any now empty intermediate nodes.
| vm_page_t* p = page_list_.RemoveContent(offset).ReleasePage(); |
| DEBUG_ASSERT(p == page); |
| pmm_page_queues()->Remove(page); |
| |
| reclamation_event_count_++; |
| IncrementHierarchyGenerationCountLocked(); |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| // |page| is now owned by the caller. |
| return true; |
| } |
| |
| bool VmCowPages::RemovePageForCompressionLocked(vm_page_t* page, uint64_t offset, |
| VmCompressor* compressor, |
| Guard<CriticalMutex>& guard) { |
| DEBUG_ASSERT(compressor); |
| DEBUG_ASSERT(!page_source_); |
| ASSERT(page->object.pin_count == 0); |
| DEBUG_ASSERT(!page->is_loaned()); |
| DEBUG_ASSERT(!discardable_tracker_); |
| DEBUG_ASSERT(can_decommit_zero_pages_locked()); |
| if (paged_ref_) { |
| AssertHeld(paged_ref_->lock_ref()); |
| if ((paged_ref_->GetMappingCachePolicyLocked() & ZX_CACHE_POLICY_MASK) != |
| ZX_CACHE_POLICY_CACHED) { |
| // Cannot compress uncached mappings. To avoid this page remaining in the reclamation list we |
| // simulate an access. |
| pmm_page_queues()->MarkAccessed(page); |
| return false; |
| } |
| } |
| |
  // Use a sub-scope, since the page_or_marker will become invalid when we drop the lock later.
| { |
| VmPageOrMarkerRef page_or_marker = page_list_.LookupMutable(offset); |
| DEBUG_ASSERT(page_or_marker); |
| DEBUG_ASSERT(page_or_marker->IsPage()); |
| DEBUG_ASSERT(page_or_marker->Page() == page); |
| |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| |
| // Start compression of the page by swapping the page list to contain the temporary reference. |
| [[maybe_unused]] vm_page_t* compress_page = |
| page_or_marker.SwapPageForReference(compressor->Start(page)); |
| DEBUG_ASSERT(compress_page == page); |
| } |
| pmm_page_queues()->Remove(page); |
| // Going to drop the lock so need to indicate that we've modified the hierarchy by putting in the |
| // temporary reference. |
| IncrementHierarchyGenerationCountLocked(); |
| |
  // We now stack-own the page (and guarantee to the compressor that it will not be modified) and
| // the VMO owns the temporary reference. We can safely drop the VMO lock and perform the |
| // compression step. |
| VmCompressor::CompressResult compression_result = VmCompressor::FailTag{}; |
| guard.CallUnlocked( |
| [compressor, &compression_result] { compression_result = compressor->Compress(); }); |
| |
  // We hold the VMO lock again and need to reclaim the temporary reference. Either the temporary
  // reference is still installed, in which case (since we hold the VMO lock) we own both the
  // temporary reference and the page, or the temporary reference got replaced, in which case it
  // no longer exists, does not refer to |page|, and so we own the page.
  //
  // Determining which state we are in just requires re-looking up the slot and seeing if the
  // temporary reference we installed is still there.
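  // Note that the lookup may allocate an empty slot, which must be properly returned before we
  // are done (see below).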
| auto [slot, is_in_interval] = |
| page_list_.LookupOrAllocate(offset, VmPageList::IntervalHandling::NoIntervals); |
| DEBUG_ASSERT(!is_in_interval); |
| if (slot && slot->IsReference() && compressor->IsTempReference(slot->Reference())) { |
| // Still the original reference, need to replace it with the result of compression. |
| VmPageOrMarker::ReferenceValue old_ref{0}; |
| if (const VmPageOrMarker::ReferenceValue* ref = |
| ktl::get_if<VmPageOrMarker::ReferenceValue>(&compression_result)) { |
| // Compression succeeded, put the new reference in. |
| old_ref = VmPageOrMarkerRef(slot).ChangeReferenceValue(*ref); |
| reclamation_event_count_++; |
| } else if (ktl::holds_alternative<VmCompressor::FailTag>(compression_result)) { |
      // Compression failed, put the page back in.
| old_ref = VmPageOrMarkerRef(slot).SwapReferenceForPage(page); |
| // TODO(https://fxbug.dev/42138396): Placing in a queue and then moving it is inefficient, but |
| // avoids needing to reason about whether reclamation could be manually attempted on pages |
| // that might otherwise not end up in the reclaimable queues. |
| SetNotPinnedLocked(page, offset); |
| // TODO(https://fxbug.dev/42138396): Marking this page as failing reclamation will prevent it |
| // from ever being tried again. As compression might succeed if the contents changes, we |
| // should consider moving the page out of this queue if it is modified. |
| pmm_page_queues()->CompressFailed(page); |
| // Page stays owned by the VMO. |
| page = nullptr; |
| } else { |
| ASSERT(ktl::holds_alternative<VmCompressor::ZeroTag>(compression_result)); |
| old_ref = slot->ReleaseReference(); |
      // Check if we can clear the slot, or if we need to insert a marker. Unlike full page
      // zeroing, this simply needs to check if there's any visible content above us and, if there
      // isn't, whether the root is immutable (i.e. whether it has a page source).
| VmCowPages* page_owner; |
| uint64_t owner_offset; |
| if (!FindInitialPageContentLocked(offset, &page_owner, &owner_offset, nullptr).current() && |
| !page_owner->page_source_) { |
| *slot = VmPageOrMarker::Empty(); |
| page_list_.ReturnEmptySlot(offset); |
| vm_vmo_compression_zero_slot.Add(1); |
| } else { |
| *slot = VmPageOrMarker::Marker(); |
| vm_vmo_compression_marker.Add(1); |
| } |
| reclamation_event_count_++; |
| } |
| // Temporary reference has been replaced, can return it to the compressor. |
| compressor->ReturnTempReference(old_ref); |
| // Have done a modification. |
| IncrementHierarchyGenerationCountLocked(); |
| } else { |
| // The temporary reference is no longer there. We know nothing else about the state of the VMO |
| // at this point and will just free any compression result and exit. |
| if (const VmPageOrMarker::ReferenceValue* ref = |
| ktl::get_if<VmPageOrMarker::ReferenceValue>(&compression_result)) { |
| compressor->Free(*ref); |
| } |
| // To avoid claiming that |page| got reclaimed when it didn't, separately free it. |
| FreePageLocked(page, true); |
| page = nullptr; |
| // If the slot is allocated, but empty, then make sure we properly return it. |
| if (slot && slot->IsEmpty()) { |
| page_list_.ReturnEmptySlot(offset); |
| } |
| } |
| // One way or another the temporary reference has been returned, and so we can finalize. |
| compressor->Finalize(); |
| |
| // Return whether we ended up reclaiming the page or not. That is, whether we currently own it and |
| // it needs to be freed. |
| return page != nullptr; |
| } |
| |
| uint64_t VmCowPages::ReclaimPage(vm_page_t* page, uint64_t offset, EvictionHintAction hint_action, |
| list_node* freed_list, VmCompressor* compressor) { |
| DEBUG_ASSERT(freed_list); |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
| // Check this page is still a part of this VMO. |
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| if (!page_or_marker || !page_or_marker->IsPage() || page_or_marker->Page() != page) { |
| return 0; |
| } |
| |
| // Pinned pages could be in use by DMA so we cannot safely reclaim them. |
| if (page->object.pin_count != 0) { |
| return 0; |
| } |
| |
| if (high_priority_count_ != 0) { |
| // Not allowed to reclaim. To avoid this page remaining in a reclamation list we simulate an |
| // access. |
| pmm_page_queues()->MarkAccessed(page); |
| return 0; |
| } |
| |
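  // Helper to convert the boolean result of a single-page reclaim attempt into a reclaimed page
  // count, queueing the page onto |freed_list| on success.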
| auto single_page_wrapper = [&](bool reclaimed) { |
| if (reclaimed) { |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(freed_list, &page->queue_node); |
| return 1; |
| } |
| return 0; |
| }; |
| |
| // See if we can reclaim by eviction. |
| if (can_evict()) { |
| return single_page_wrapper(RemovePageForEvictionLocked(page, offset, hint_action)); |
| } else if (compressor && !page_source_ && !discardable_tracker_) { |
| return single_page_wrapper(RemovePageForCompressionLocked(page, offset, compressor, guard)); |
| } else if (discardable_tracker_) { |
| // On any errors touch the page so we stop trying to reclaim it. In particular for discardable |
| // reclamation attempts, if the page we are passing is not the first page in the discardable |
| // VMO then the discard will fail, so touching it will stop us from continuously trying to |
| // trigger a discard with it. |
| auto result = ReclaimDiscardableLocked(page, offset, freed_list); |
| if (result.is_error()) { |
| pmm_page_queues()->MarkAccessed(page); |
| vm_vmo_discardable_failed_reclaim.Add(1); |
| return 0; |
| } |
| return *result; |
| } |
| // No other reclamation strategies, so to avoid this page remaining in a reclamation list we |
| // simulate an access. Do not want to place it in the ReclaimFailed queue since our failure was |
| // not based on page contents. |
| pmm_page_queues()->MarkAccessed(page); |
  // Keep a count, as having no reclamation strategy is probably a sign of misconfiguration.
| vm_vmo_no_reclamation_strategy.Add(1); |
| return 0; |
| } |
| |
| void VmCowPages::SwapPageLocked(uint64_t offset, vm_page_t* old_page, vm_page_t* new_page) { |
| DEBUG_ASSERT(!old_page->object.pin_count); |
| DEBUG_ASSERT(new_page->state() == vm_page_state::OBJECT); |
| |
| // unmap before removing old page |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| DEBUG_ASSERT(p); |
| DEBUG_ASSERT(p->IsPage()); |
| |
| CopyPageForReplacementLocked(new_page, old_page); |
| |
| // Add replacement page in place of old page. |
| // |
| // We could optimize this by doing what's needed to *p directly, but for now call this |
| // common code. |
| VmPageOrMarker new_vm_page = VmPageOrMarker::Page(new_page); |
| VmPageOrMarker released_page; |
| zx_status_t status = AddPageLocked(&new_vm_page, offset, CanOverwriteContent::NonZero, |
| &released_page, /*do_range_update=*/false); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY, but that failure can only occur |
| // if page_list_ had to allocate. Here, page_list_ hasn't yet had a chance to clean up any |
| // internal structures, so AddPageLocked() didn't need to allocate, so we know that |
| // AddPageLocked() will succeed. |
| DEBUG_ASSERT(status == ZX_OK); |
| // The page released was the old page. |
| DEBUG_ASSERT(released_page.IsPage() && released_page.Page() == old_page); |
| // Need to take the page out of |released_page| to avoid a [[nodiscard]] error. Since we just |
| // checked that this matches the target page, which is now owned by the caller, this is not |
| // leaking. |
| [[maybe_unused]] vm_page_t* released = released_page.ReleasePage(); |
| } |
| |
| zx_status_t VmCowPages::ReplacePagesWithNonLoanedLocked(uint64_t offset, uint64_t len, |
| LazyPageRequest* page_request, |
| uint64_t* non_loaned_len) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| DEBUG_ASSERT(non_loaned_len); |
| |
| if (is_slice_locked()) { |
| return slice_parent_locked().ReplacePagesWithNonLoanedLocked(offset + parent_offset_, len, |
| page_request, non_loaned_len); |
| } |
| |
| *non_loaned_len = 0; |
| bool found_page_or_gap = false; |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [page_request, non_loaned_len, &found_page_or_gap, this](const VmPageOrMarker* p, |
| uint64_t off) { |
| found_page_or_gap = true; |
| // We only expect committed pages in the specified range. |
| if (p->IsMarker() || p->IsReference() || p->IsInterval()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| vm_page_t* page = p->Page(); |
        // If the page is loaned, replace it with a non-loaned page.
| if (page->is_loaned()) { |
| AssertHeld(lock_ref()); |
| // A loaned page could only have been clean. |
| DEBUG_ASSERT(!is_page_dirty_tracked(page) || is_page_clean(page)); |
| DEBUG_ASSERT(page_request); |
| zx_status_t status = |
| ReplacePageLocked(page, off, /*with_loaned=*/false, &page, page_request); |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| if (status != ZX_OK) { |
| return ZX_ERR_BAD_STATE; |
| } |
| } |
| DEBUG_ASSERT(!page->is_loaned()); |
| *non_loaned_len += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [&found_page_or_gap](uint64_t start, uint64_t end) { |
| found_page_or_gap = true; |
| // We only expect committed pages in the specified range. |
| return ZX_ERR_BAD_STATE; |
| }, |
| offset, offset + len); |
| |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| // If we did not find a page or a gap, the entire range fell inside an interval. We only expect |
| // committed pages in the range. |
| if (!found_page_or_gap) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::ReplacePageWithLoaned(vm_page_t* before_page, uint64_t offset) { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
| return ReplacePageLocked(before_page, offset, true, nullptr, nullptr); |
| } |
| |
| zx_status_t VmCowPages::ReplacePageLocked(vm_page_t* before_page, uint64_t offset, bool with_loaned, |
| vm_page_t** after_page, LazyPageRequest* page_request) { |
| // If not replacing with loaned it is required that a page_request be provided. |
| DEBUG_ASSERT(with_loaned || page_request); |
| |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| if (!p) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| if (!p->IsPage()) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| vm_page_t* old_page = p->Page(); |
| if (old_page != before_page) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| DEBUG_ASSERT(old_page != vm_get_zero_page()); |
| if (old_page->object.pin_count != 0) { |
| DEBUG_ASSERT(!old_page->is_loaned()); |
| return ZX_ERR_BAD_STATE; |
| } |
| if (old_page->object.always_need) { |
| DEBUG_ASSERT(!old_page->is_loaned()); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| // We stack-own a loaned page from pmm_alloc_page() to SwapPageLocked() OR from SwapPageLocked() |
| // until FreePageLocked(). |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| vm_page_t* new_page = nullptr; |
| zx_status_t status = ZX_OK; |
| if (with_loaned) { |
| if (!can_borrow_locked()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| if (is_page_dirty_tracked(old_page) && !is_page_clean(old_page)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| status = AllocLoanedPage(&new_page); |
| } else { |
| status = AllocPage(&new_page, page_request); |
| } |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| SwapPageLocked(offset, old_page, new_page); |
| pmm_page_queues()->Remove(old_page); |
| FreePageLocked(old_page, /*freeing_owned_page=*/true); |
| if (after_page) { |
| *after_page = new_page; |
| } |
| |
| // We've changed a page in the page list. Update the generation count. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| return ZX_OK; |
| } |
| |
| bool VmCowPages::DebugValidatePageSplitsHierarchyLocked() const { |
| canary_.Assert(); |
| |
| const VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| const VmCowPages* parent_most = cur; |
| do { |
| if (!cur->DebugValidatePageSplitsLocked()) { |
| return false; |
| } |
| cur = cur->parent_.get(); |
| if (cur) { |
| parent_most = cur; |
| } |
| } while (cur); |
  // Iterate the whole hierarchy; the iteration order doesn't matter. Since there are cases with
| // >2 children, in-order isn't well defined, so we choose pre-order, but post-order would also |
| // be fine. |
| const VmCowPages* prev = nullptr; |
| cur = parent_most; |
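  // |prev| tracks the node we arrived from, distinguishing a descent from the parent from a
  // return out of a child.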
| while (cur) { |
| uint32_t children = cur->children_list_len_; |
| if (!prev || prev == cur->parent_.get()) { |
| // Visit cur |
| if (!cur->DebugValidateBacklinksLocked()) { |
| dprintf(INFO, "cur: %p this: %p\n", cur, this); |
| return false; |
| } |
| |
| if (!children) { |
| // no children; move to parent (or nullptr) |
| prev = cur; |
| cur = cur->parent_.get(); |
| continue; |
| } else { |
| // move to first child |
| prev = cur; |
| cur = &cur->children_list_.front(); |
| continue; |
| } |
| } |
| // At this point we know we came up from a child, not down from the parent. |
| DEBUG_ASSERT(prev && prev != cur->parent_.get()); |
| // The children are linked together, so we can move from one child to the next. |
| |
| auto iterator = cur->children_list_.make_iterator(*prev); |
| ++iterator; |
| if (iterator == cur->children_list_.end()) { |
| // no more children; move back to parent |
| prev = cur; |
| cur = cur->parent_.get(); |
| continue; |
| } |
| |
| // descend to next child |
| prev = cur; |
| cur = &(*iterator); |
| DEBUG_ASSERT(cur); |
| } |
| return true; |
| } |
| |
| bool VmCowPages::DebugValidatePageSplitsLocked() const { |
| canary_.Assert(); |
| |
| // Assume this is valid until we prove otherwise. |
| bool valid = true; |
| page_list_.ForEveryPage([this, &valid](const VmPageOrMarker* page, uint64_t offset) { |
| if (!page->IsPageOrRef()) { |
| return ZX_ERR_NEXT; |
| } |
| AssertHeld(this->lock_ref()); |
| |
    // Pages in non-hidden VMOs should never be split, as split tracking is meaningless there and
    // indicates a bookkeeping error somewhere else.
| if (!this->is_hidden_locked()) { |
| if (page->PageOrRefLeftSplit() || page->PageOrRefRightSplit()) { |
| if (page->IsPage()) { |
| printf("Found split page %p (off %p) in non-hidden node %p\n", page->Page(), |
| (void*)offset, this); |
| } else { |
| printf("Found split reference off %p in non-hidden node%p\n", (void*)offset, this); |
| } |
| this->DumpLocked(1, true); |
| valid = false; |
| return ZX_ERR_STOP; |
| } |
| // Nothing else to test for non-hidden VMOs. |
| return ZX_ERR_NEXT; |
| } |
| |
    // We found a page in the hidden VMO. If it has been forked in either direction then we
    // expect that if we search down that path we will find the forked page, and that no
    // descendant can 'see' back to this page.
| const VmCowPages* expected = nullptr; |
| if (page->PageOrRefLeftSplit()) { |
| expected = &left_child_locked(); |
| } else if (page->PageOrRefRightSplit()) { |
| expected = &right_child_locked(); |
| } else { |
| return ZX_ERR_NEXT; |
| } |
| |
| // We know this must be true as this is a hidden vmo and so left_child_locked and |
| // right_child_locked will never have returned null. |
| DEBUG_ASSERT(expected); |
| |
    // No leaf VMO in expected should be able to 'see' this page and potentially re-fork it. To
    // validate this we need to walk the entire subtree.
| const VmCowPages* cur = expected; |
| uint64_t off = offset; |
    // We start with cur being an immediate child of 'this', so we can perform subtree traversal
    // until we end up back in 'this'.
| while (cur != this) { |
| AssertHeld(cur->lock_ref()); |
| // Check that we can see this page in the parent. Importantly this first checks if |
| // |off < cur->parent_offset_| allowing us to safely perform that subtraction from then on. |
| if (off < cur->parent_offset_ || off - cur->parent_offset_ < cur->parent_start_limit_ || |
| off - cur->parent_offset_ >= cur->parent_limit_) { |
| // This blank case is used to capture the scenario where current does not see the target |
| // offset in the parent, in which case there is no point traversing into the children. |
| } else if (cur->is_hidden_locked()) { |
| // A hidden VMO *may* have the page, but not necessarily if both children forked it out. |
| const VmPageOrMarker* l = cur->page_list_.Lookup(off - cur->parent_offset_); |
| if (!l || l->IsEmpty()) { |
| // Page not found, we need to recurse down into our children. |
| off -= cur->parent_offset_; |
| cur = &cur->left_child_locked(); |
| continue; |
| } |
| } else { |
| // We already checked in the first 'if' branch that this offset was visible, and so this |
| // leaf VMO *must* have a page or marker to prevent it 'seeing' the already forked original. |
| const VmPageOrMarker* l = cur->page_list_.Lookup(off - cur->parent_offset_); |
| if (!l || l->IsEmpty()) { |
| if (page->IsPage()) { |
| printf("Failed to find fork of page %p (off %p) from %p in leaf node %p (off %p)\n", |
| page->Page(), (void*)offset, this, cur, (void*)(off - cur->parent_offset_)); |
| } else { |
| printf("Failed to find fork of reference (off %p) from %p in leaf node %p (off %p)\n", |
| (void*)offset, this, cur, (void*)(off - cur->parent_offset_)); |
| } |
| cur->DumpLocked(1, true); |
| this->DumpLocked(1, true); |
| valid = false; |
| return ZX_ERR_STOP; |
| } |
| } |
| |
| // Find our next node by walking up until we see we have come from a left path, then go right. |
| do { |
| VmCowPages* next = cur->parent_.get(); |
| AssertHeld(next->lock_ref()); |
| off += next->parent_offset_; |
| if (next == this) { |
| cur = next; |
| break; |
| } |
| |
| // If we came from the left, go back down on the right, otherwise just keep going up. |
| if (cur == &next->left_child_locked()) { |
| off -= next->parent_offset_; |
| cur = &next->right_child_locked(); |
| break; |
| } |
| cur = next; |
| } while (1); |
| } |
| |
    // The inverse case must also hold: the side that hasn't forked the page must still be able
    // to see it. It can be seen either by a leaf VMO that does not have a page committed at that
    // offset, or by a hidden VMO that has partial_cow_release_ set. To validate this we need to
    // walk the entire subtree on the other side.
| if (page->PageOrRefLeftSplit()) { |
| cur = &right_child_locked(); |
| } else if (page->PageOrRefRightSplit()) { |
| cur = &left_child_locked(); |
| } else { |
| return ZX_ERR_NEXT; |
| } |
| off = offset; |
| // Initially we haven't seen the page, unless this VMO itself has done a partial cow release, in |
| // which case we ourselves can see it. Logic is structured this way to avoid indenting this |
| // whole code block in an if, whilst preserving the ability to add future checks below. |
| bool seen = partial_cow_release_; |
    // We start with cur being an immediate child of 'this', so we can perform a subtree traversal
    // until we end up back in 'this'.
| while (cur != this && !seen) { |
| AssertHeld(cur->lock_ref()); |
      // Check that we can see this page in the parent. Importantly, this first checks
      // |off < cur->parent_offset_|, allowing us to safely perform that subtraction from then on.
| if (off < cur->parent_offset_ || off - cur->parent_offset_ < cur->parent_start_limit_ || |
| off - cur->parent_offset_ >= cur->parent_limit_) { |
        // This empty case captures the scenario where |cur| does not see the target offset in
        // the parent, in which case there is no point traversing into its children.
| } else if (cur->is_hidden_locked()) { |
| // A hidden VMO can see the page if it performed a partial cow release. |
| if (cur->partial_cow_release_) { |
| seen = true; |
| break; |
| } |
| // Otherwise recurse into the children. |
| off -= cur->parent_offset_; |
| cur = &cur->left_child_locked(); |
| continue; |
| } else { |
| // We already checked in the first 'if' branch that this offset was visible, and so if this |
| // leaf has no committed page then it is able to see it. |
| const VmPageOrMarker* l = cur->page_list_.Lookup(off - cur->parent_offset_); |
| if (!l || l->IsEmpty()) { |
| seen = true; |
| break; |
| } |
| } |
      // Find our next node: walk up until we arrive from a left path, then go right.
| do { |
| VmCowPages* next = cur->parent_.get(); |
| AssertHeld(next->lock_ref()); |
| off += next->parent_offset_; |
| if (next == this) { |
| cur = next; |
| break; |
| } |
| |
| // If we came from the left, go back down on the right, otherwise just keep going up. |
| if (cur == &next->left_child_locked()) { |
| off -= next->parent_offset_; |
| cur = &next->right_child_locked(); |
| break; |
| } |
| cur = next; |
| } while (1); |
| } |
| if (!seen) { |
      if (page->IsPage()) {
        printf(
            "Failed to find any child that could fork the remaining split page %p (off %p) in "
            "node %p\n",
            page->Page(), (void*)offset, this);
      } else {
        printf(
            "Failed to find any child that could fork the remaining split reference (off %p) in "
            "node %p\n",
            (void*)offset, this);
      }
| this->DumpLocked(1, true); |
| printf("Left:\n"); |
| left_child_locked().DumpLocked(1, true); |
| printf("Right:\n"); |
| right_child_locked().DumpLocked(1, true); |
| valid = false; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| |
| return valid; |
| } |
| |
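// Validates that every committed page in the page list has a correct backlink: the page must be
// in the OBJECT state, its owning object must be |this|, and its recorded page offset must match
// the offset at which it is stored in the page list.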
| bool VmCowPages::DebugValidateBacklinksLocked() const { |
| canary_.Assert(); |
| bool result = true; |
| page_list_.ForEveryPage([this, &result](const auto* p, uint64_t offset) { |
| // Markers, references, and intervals don't have backlinks. |
| if (p->IsReference() || p->IsMarker() || p->IsInterval()) { |
| return ZX_ERR_NEXT; |
| } |
| vm_page_t* page = p->Page(); |
| vm_page_state state = page->state(); |
| if (state != vm_page_state::OBJECT) { |
| dprintf(INFO, "unexpected page state: %u\n", static_cast<uint32_t>(state)); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| const VmCowPages* object = reinterpret_cast<VmCowPages*>(page->object.get_object()); |
| if (!object) { |
| dprintf(INFO, "missing object\n"); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (object != this) { |
| dprintf(INFO, "incorrect object - object: %p this: %p\n", object, this); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| uint64_t page_offset = page->object.get_page_offset(); |
| if (page_offset != offset) { |
| dprintf(INFO, "incorrect offset - page_offset: %" PRIx64 " offset: %" PRIx64 "\n", |
| page_offset, offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| return result; |
| } |
| |
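// Validates invariants on loaned pages: a loaned page may only be present if this node is
// allowed to borrow pages, and a loaned page must never be pinned, marked always_need, or dirty
// (when dirty tracked). Larger VMOs are skipped, as noted below, to bound the validation cost.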
| bool VmCowPages::DebugValidateVmoPageBorrowingLocked() const { |
| canary_.Assert(); |
| // Skip checking larger VMOs to avoid slowing things down too much, since the things being |
| // verified will typically assert from incorrect behavior on smaller VMOs (and we can always |
| // remove this filter if we suspect otherwise). |
| if (size_ >= 2 * 1024 * 1024) { |
| return true; |
| } |
| bool result = true; |
| page_list_.ForEveryPage([this, &result](const auto* p, uint64_t offset) { |
| AssertHeld(lock_ref()); |
| if (!p->IsPage()) { |
      // If we don't have a page, this is either a marker or a reference, neither of which is
      // allowed with contiguous VMOs.
| DEBUG_ASSERT(!direct_source_supplies_zero_pages()); |
| return ZX_ERR_NEXT; |
| } |
| vm_page_t* page = p->Page(); |
| if (page->is_loaned()) { |
| if (!can_borrow_locked()) { |
| dprintf(INFO, "!can_borrow_locked() but page is loaned?? - offset: 0x%" PRIx64 "\n", |
| offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (page->object.pin_count) { |
| dprintf(INFO, "pinned page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (page->object.always_need) { |
| dprintf(INFO, "always_need page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| if (is_page_dirty_tracked(page) && !is_page_clean(page)) { |
| dprintf(INFO, "!clean page is loaned?? - offset: 0x%" PRIx64 "\n", offset); |
| result = false; |
| return ZX_ERR_STOP; |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| if (!result) { |
| dprintf(INFO, "DebugValidateVmoPageBorrowingLocked() failing - slice: %d\n", is_slice_locked()); |
| } |
| return result; |
| } |
| |
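// Validates zero interval bookkeeping: intervals may only appear in VMOs whose page source
// preserves page content, must be dirty zero intervals whose start and end sentinels pair up
// with matching dirty states, and must not contain pages or markers. Compressed references are
// disallowed entirely in such VMOs.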
| bool VmCowPages::DebugValidateZeroIntervalsLocked() const { |
| canary_.Assert(); |
| bool in_interval = false; |
| auto dirty_state = VmPageOrMarker::IntervalDirtyState::Untracked; |
| zx_status_t status = page_list_.ForEveryPage( |
| [&in_interval, &dirty_state, pager_backed = is_source_preserving_page_content()]( |
| const VmPageOrMarker* p, uint64_t off) { |
| if (!pager_backed) { |
| if (p->IsInterval()) { |
| dprintf(INFO, "found interval at offset 0x%" PRIx64 " in non pager backed vmo\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| if (p->IsInterval()) { |
| DEBUG_ASSERT(p->IsIntervalZero()); |
| DEBUG_ASSERT(p->IsZeroIntervalDirty()); |
| if (p->IsIntervalStart()) { |
| if (in_interval) { |
| dprintf(INFO, "interval start at 0x%" PRIx64 " while already in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| in_interval = true; |
| dirty_state = p->GetZeroIntervalDirtyState(); |
| } else if (p->IsIntervalEnd()) { |
| if (!in_interval) { |
| dprintf(INFO, "interval end at 0x%" PRIx64 " while not in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| if (p->GetZeroIntervalDirtyState() != dirty_state) { |
| dprintf(INFO, "dirty state mismatch - start %lu, end %lu\n", (uint64_t)(dirty_state), |
| (uint64_t)(p->GetZeroIntervalDirtyState())); |
| return ZX_ERR_BAD_STATE; |
| } |
| in_interval = false; |
| dirty_state = VmPageOrMarker::IntervalDirtyState::Untracked; |
| } else { |
| if (in_interval) { |
| dprintf(INFO, "interval slot at 0x%" PRIx64 " while already in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| } |
| return ZX_ERR_NEXT; |
| } |
| |
| if (p->IsReference()) { |
| dprintf(INFO, "found compressed ref at offset 0x%" PRIx64 " in pager backed vmo\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| if (p->IsPage() && in_interval) { |
| dprintf(INFO, "found page at 0x%" PRIx64 " in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| if (p->IsMarker() && in_interval) { |
| dprintf(INFO, "found marker at 0x%" PRIx64 " in interval\n", off); |
| return ZX_ERR_BAD_STATE; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| return status == ZX_OK; |
| } |
| |
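// Lock and unlock operations on discardable VMOs are only supported over the entire VMO range.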
| bool VmCowPages::IsLockRangeValidLocked(uint64_t offset, uint64_t len) const { |
| return offset == 0 && len == size_locked(); |
| } |
| |
| zx_status_t VmCowPages::LockRangeLocked(uint64_t offset, uint64_t len, |
| zx_vmo_lock_state_t* lock_state_out) { |
| canary_.Assert(); |
| ASSERT(discardable_tracker_); |
| |
| AssertHeld(lock_ref()); |
| if (!IsLockRangeValidLocked(offset, len)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| if (!lock_state_out) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| lock_state_out->offset = offset; |
| lock_state_out->size = len; |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| |
| bool was_discarded = false; |
| zx_status_t status = |
| discardable_tracker_->LockDiscardableLocked(/*try_lock=*/false, &was_discarded); |
| // Locking must succeed if try_lock was false. |
| DEBUG_ASSERT(status == ZX_OK); |
| lock_state_out->discarded_offset = 0; |
| lock_state_out->discarded_size = was_discarded ? size_locked() : 0; |
| |
| return status; |
| } |
| |
| zx_status_t VmCowPages::TryLockRangeLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| ASSERT(discardable_tracker_); |
| |
| AssertHeld(lock_ref()); |
| if (!IsLockRangeValidLocked(offset, len)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| bool unused; |
| return discardable_tracker_->LockDiscardableLocked(/*try_lock=*/true, &unused); |
| } |
| |
| zx_status_t VmCowPages::UnlockRangeLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| ASSERT(discardable_tracker_); |
| |
| AssertHeld(lock_ref()); |
| if (!IsLockRangeValidLocked(offset, len)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| zx_status_t status = discardable_tracker_->UnlockDiscardableLocked(); |
| if (status != ZX_OK) { |
| return status; |
| } |
| if (discardable_tracker_->IsEligibleForReclamationLocked()) { |
    // Simulate an access to the first page. We use the first page as the discardable trigger, so
    // simulating an access ensures that a newly unlocked VMO is treated as recently accessed,
    // just as if all of its pages had been touched. Touching only the first page, instead of all
    // pages, is an optimization: attempts to trigger discard via any of the other pages can
    // simply be ignored.
| page_list_.ForEveryPage([](auto* p, uint64_t offset) { |
      // Skip over anything that is not a committed page.
| if (!p->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| pmm_page_queues()->MarkAccessed(p->Page()); |
| return ZX_ERR_STOP; |
| }); |
| } |
| return status; |
| } |
| |
| uint64_t VmCowPages::DebugGetPageCountLocked() const { |
| canary_.Assert(); |
| uint64_t page_count = 0; |
| zx_status_t status = page_list_.ForEveryPage([&page_count](auto* p, uint64_t offset) { |
| if (!p->IsPageOrRef()) { |
| return ZX_ERR_NEXT; |
| } |
| ++page_count; |
| return ZX_ERR_NEXT; |
| }); |
  // We never stop early in the lambda above.
| DEBUG_ASSERT(status == ZX_OK); |
| return page_count; |
| } |
| |
| bool VmCowPages::DebugIsPage(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| return p && p->IsPage(); |
| } |
| |
| bool VmCowPages::DebugIsMarker(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| return p && p->IsMarker(); |
| } |
| |
| bool VmCowPages::DebugIsEmpty(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| Guard<CriticalMutex> guard{lock()}; |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| return !p || p->IsEmpty(); |
| } |
| |
| vm_page_t* VmCowPages::DebugGetPage(uint64_t offset) const { |
| canary_.Assert(); |
| Guard<CriticalMutex> guard{lock()}; |
| return DebugGetPageLocked(offset); |
| } |
| |
| vm_page_t* VmCowPages::DebugGetPageLocked(uint64_t offset) const { |
| canary_.Assert(); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| if (p && p->IsPage()) { |
| return p->Page(); |
| } |
| return nullptr; |
| } |
| |
| bool VmCowPages::DebugIsHighMemoryPriority() const { |
| canary_.Assert(); |
| Guard<CriticalMutex> guard{lock()}; |
| return is_high_memory_priority_locked(); |
| } |
| |
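// Counts the committed pages of a discardable VMO, attributing them as locked or unlocked based
// on the current discardable state. Returns zero counts if the VMO is not discardable or has not
// yet opted into locking / unlocking.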
| VmCowPages::DiscardablePageCounts VmCowPages::DebugGetDiscardablePageCounts() const { |
| canary_.Assert(); |
| DiscardablePageCounts counts = {}; |
| |
| // Not a discardable VMO. |
| if (!discardable_tracker_) { |
| return counts; |
| } |
| |
| Guard<CriticalMutex> guard{lock()}; |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| const DiscardableVmoTracker::DiscardableState state = |
| discardable_tracker_->discardable_state_locked(); |
| // This is a discardable VMO but hasn't opted into locking / unlocking yet. |
| if (state == DiscardableVmoTracker::DiscardableState::kUnset) { |
| return counts; |
| } |
| |
| uint64_t pages = 0; |
| page_list_.ForEveryPage([&pages](const auto* p, uint64_t) { |
| // TODO(https://fxbug.dev/42138396) Figure out attribution between pages and references. |
| if (p->IsPageOrRef()) { |
| ++pages; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| |
| switch (state) { |
| case DiscardableVmoTracker::DiscardableState::kReclaimable: |
| counts.unlocked = pages; |
| break; |
| case DiscardableVmoTracker::DiscardableState::kUnreclaimable: |
| counts.locked = pages; |
| break; |
| case DiscardableVmoTracker::DiscardableState::kDiscarded: |
| DEBUG_ASSERT(pages == 0); |
| break; |
| default: |
| break; |
| } |
| |
| return counts; |
| } |
| |
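// Discards all pages of an eligible discardable VMO, appending them to |freed_list|. Returns the
// number of pages freed, with any failure collapsed to a count of 0.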
| uint64_t VmCowPages::DiscardPages(list_node_t* freed_list) { |
| canary_.Assert(); |
| |
| Guard<CriticalMutex> guard{lock()}; |
  // Discard any errors, mapping failures to a 0 return value (no pages discarded).
| return DiscardPagesLocked(freed_list).value_or(0); |
| } |
| |
| zx::result<uint64_t> VmCowPages::DiscardPagesLocked(list_node_t* freed_list) { |
| // Not a discardable VMO. |
| if (!discardable_tracker_) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| discardable_tracker_->assert_cow_pages_locked(); |
| if (!discardable_tracker_->IsEligibleForReclamationLocked()) { |
| return zx::error(ZX_ERR_BAD_STATE); |
| } |
| |
| // Remove all pages. |
| uint64_t pages_freed = 0; |
| zx_status_t status = UnmapAndRemovePagesLocked(0, size_, freed_list, &pages_freed); |
| |
| if (status != ZX_OK) { |
| ASSERT(pages_freed == 0); |
| return zx::error(status); |
| } |
| |
| reclamation_event_count_++; |
| IncrementHierarchyGenerationCountLocked(); |
| |
| // Set state to discarded. |
| discardable_tracker_->SetDiscardedLocked(); |
| |
| return zx::ok(pages_freed); |
| } |
| |
| zx::result<uint64_t> VmCowPages::ReclaimDiscardableLocked(vm_page_t* page, uint64_t offset, |
| list_node_t* freed_list) { |
| DEBUG_ASSERT(discardable_tracker_); |
| // Check if this is the first page. |
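  // The first committed page acts as the discard trigger (see the access simulation in
  // UnlockRangeLocked), so reclamation is only attempted when |page| is that trigger page.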
| bool first = false; |
| page_list_.ForEveryPage([&first, &offset, &page](auto* p, uint64_t off) { |
| if (!p->IsPage()) { |
| return ZX_ERR_NEXT; |
| } |
| first = (p->Page() == page) && off == offset; |
| return ZX_ERR_STOP; |
| }); |
| if (!first) { |
| return zx::error(ZX_ERR_INVALID_ARGS); |
| } |
| |
| return DiscardPagesLocked(freed_list); |
| } |
| |
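// Copies the contents of |src_page| into |dst_page|, cleaning the cache when the owning VMO's
// mapping is not cached, and carries over the copy-on-write split bits, the always_need flag and
// the dirty state.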
| void VmCowPages::CopyPageForReplacementLocked(vm_page_t* dst_page, vm_page_t* src_page) { |
| DEBUG_ASSERT(!src_page->object.pin_count); |
| void* src = paddr_to_physmap(src_page->paddr()); |
| DEBUG_ASSERT(src); |
| void* dst = paddr_to_physmap(dst_page->paddr()); |
| DEBUG_ASSERT(dst); |
| memcpy(dst, src, PAGE_SIZE); |
| if (paged_ref_) { |
| AssertHeld(paged_ref_->lock_ref()); |
| if (paged_ref_->GetMappingCachePolicyLocked() != ARCH_MMU_FLAG_CACHED) { |
| arch_clean_invalidate_cache_range((vaddr_t)dst, PAGE_SIZE); |
| } |
| } |
| dst_page->object.cow_left_split = src_page->object.cow_left_split; |
| dst_page->object.cow_right_split = src_page->object.cow_right_split; |
| dst_page->object.always_need = src_page->object.always_need; |
| DEBUG_ASSERT(!dst_page->object.always_need || (!dst_page->is_loaned() && !src_page->is_loaned())); |
| dst_page->object.dirty_state = src_page->object.dirty_state; |
| } |
| |
| void VmCowPages::InitializePageCache(uint32_t level) { |
| ASSERT(level < LK_INIT_LEVEL_THREADING); |
| |
| const size_t reserve_pages = 64; |
| zx::result<page_cache::PageCache> result = page_cache::PageCache::Create(reserve_pages); |
| |
| ASSERT(result.is_ok()); |
| page_cache_ = ktl::move(result.value()); |
| } |
| |
| // Initialize the cache after the percpu data structures are initialized. |
| LK_INIT_HOOK(vm_cow_pages_cache_init, VmCowPages::InitializePageCache, LK_INIT_LEVEL_KERNEL + 1) |