| // Copyright 2020 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include "vm/vm_cow_pages.h" |
| |
| #include <lib/counters.h> |
| #include <lib/fit/defer.h> |
| #include <trace.h> |
| |
| #include <kernel/range_check.h> |
| #include <ktl/move.h> |
| #include <vm/anonymous_page_requester.h> |
| #include <vm/fault.h> |
| #include <vm/physmap.h> |
| #include <vm/pmm.h> |
| #include <vm/stack_owned_loaned_pages_interval.h> |
| #include <vm/vm_object.h> |
| #include <vm/vm_object_paged.h> |
| #include <vm/vm_page_list.h> |
| |
| #include "vm_priv.h" |
| |
| #include <ktl/enforce.h> |
| |
| #define LOCAL_TRACE VM_GLOBAL_TRACE(0) |
| |
| // Add expensive code to do a full validation of the VMO at various points. |
| #define VMO_VALIDATION (0 || (LK_DEBUGLEVEL > 2)) |
| |
| // Assertion that is only enabled if VMO_VALIDATION is enabled. |
| #define VMO_VALIDATION_ASSERT(x) \ |
| do { \ |
| if (VMO_VALIDATION) { \ |
| ASSERT(x); \ |
| } \ |
| } while (0) |
| |
| // Add not-as-expensive code to do some extra validation at various points. This is off in normal |
| // debug builds because it can add O(n) validation to an O(1) operation, so can still make things |
| // slower, despite not being as slow as VMO_VALIDATION. |
| #define VMO_FRUGAL_VALIDATION (0 || (LK_DEBUGLEVEL > 2)) |
| |
| // Assertion that is only enabled if VMO_FRUGAL_VALIDATION is enabled. |
| #define VMO_FRUGAL_VALIDATION_ASSERT(x) \ |
| do { \ |
| if (VMO_FRUGAL_VALIDATION) { \ |
| ASSERT(x); \ |
| } \ |
| } while (0) |
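| |
| // These assertion macros are used throughout this file to wrap the Debug* validators; a |
| // representative usage (taken from call sites below, not new behavior) looks like: |
| // |
| //   VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| //   VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| // |
| // so the potentially expensive checks compile away unless the corresponding switch above is |
| // enabled. |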
| |
| namespace { |
| |
| KCOUNTER(vm_vmo_marked_latency_sensitive, "vm.vmo.latency_sensitive.marked") |
| KCOUNTER(vm_vmo_latency_sensitive_destroyed, "vm.vmo.latency_sensitive.destroyed") |
| |
| void ZeroPage(paddr_t pa) { |
| void* ptr = paddr_to_physmap(pa); |
| DEBUG_ASSERT(ptr); |
| |
| arch_zero_page(ptr); |
| } |
| |
| void ZeroPage(vm_page_t* p) { |
| paddr_t pa = p->paddr(); |
| ZeroPage(pa); |
| } |
| |
| bool IsZeroPage(vm_page_t* p) { |
| uint64_t* base = (uint64_t*)paddr_to_physmap(p->paddr()); |
| for (int i = 0; i < PAGE_SIZE / (int)sizeof(uint64_t); i++) { |
| if (base[i] != 0) |
| return false; |
| } |
| return true; |
| } |
| |
| void InitializeVmPage(vm_page_t* p) { |
| DEBUG_ASSERT(p->state() == vm_page_state::ALLOC); |
| p->set_state(vm_page_state::OBJECT); |
| p->object.pin_count = 0; |
| p->object.cow_left_split = 0; |
| p->object.cow_right_split = 0; |
| p->object.always_need = 0; |
| p->object.dirty_state = uint8_t(VmCowPages::DirtyState::Untracked); |
| } |
| |
| // Allocates a new page and populates it with the data at |parent_paddr|. |
| zx_status_t AllocateCopyPage(uint32_t pmm_alloc_flags, paddr_t parent_paddr, |
| list_node_t* alloc_list, LazyPageRequest* request, vm_page_t** clone) { |
| DEBUG_ASSERT(request || !(pmm_alloc_flags & PMM_ALLOC_FLAG_CAN_WAIT)); |
| |
| vm_page_t* p_clone = nullptr; |
| if (alloc_list) { |
| p_clone = list_remove_head_type(alloc_list, vm_page, queue_node); |
| } |
| |
| paddr_t pa_clone; |
| if (p_clone) { |
| pa_clone = p_clone->paddr(); |
| } else { |
| zx_status_t status = pmm_alloc_page(pmm_alloc_flags, &p_clone, &pa_clone); |
| if (status != ZX_OK) { |
| DEBUG_ASSERT(!p_clone); |
| if (status == ZX_ERR_SHOULD_WAIT) { |
| status = AnonymousPageRequester::Get().FillRequest(request->get()); |
| } |
| return status; |
| } |
| DEBUG_ASSERT(p_clone); |
| } |
| |
| InitializeVmPage(p_clone); |
| |
| void* dst = paddr_to_physmap(pa_clone); |
| DEBUG_ASSERT(dst); |
| |
| if (parent_paddr != vm_get_zero_page_paddr()) { |
| // do a direct copy of the two pages |
| const void* src = paddr_to_physmap(parent_paddr); |
| DEBUG_ASSERT(src); |
| memcpy(dst, src, PAGE_SIZE); |
| } else { |
| // avoid pointless fetches by directly zeroing dst |
| arch_zero_page(dst); |
| } |
| |
| *clone = p_clone; |
| |
| return ZX_OK; |
| } |
| |
| inline uint64_t CheckedAdd(uint64_t a, uint64_t b) { |
| uint64_t result; |
| bool overflow = add_overflow(a, b, &result); |
| DEBUG_ASSERT(!overflow); |
| return result; |
| } |
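| |
| // Note: CheckedAdd is used below for offset arithmetic that callers have already validated |
| // cannot overflow (for example when accumulating root_parent_offset_ in AddChildLocked and |
| // CreateChildSliceLocked), so an overflow here indicates a bug rather than a recoverable error. |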
| |
| } // namespace |
| |
| VmCowPages::DiscardableList VmCowPages::discardable_reclaim_candidates_ = {}; |
| VmCowPages::DiscardableList VmCowPages::discardable_non_reclaim_candidates_ = {}; |
| |
| fbl::DoublyLinkedList<VmCowPages::Cursor*> VmCowPages::discardable_vmos_cursors_ = {}; |
| |
| // Helper class for collecting pages to perform batched Removes from the page queue to not incur |
| // its spinlock overhead for every single page. Pages that it removes from the page queue get placed |
| // into a provided list. Note that pages are not moved into the list until *after* Flush has been |
| // called and Flush must be called prior to object destruction. |
| // |
| // This class has a large internal array and should be marked uninitialized. |
| class BatchPQRemove { |
| public: |
| BatchPQRemove(list_node_t* freed_list) : freed_list_(freed_list) {} |
| ~BatchPQRemove() { DEBUG_ASSERT(count_ == 0); } |
| DISALLOW_COPY_AND_ASSIGN_ALLOW_MOVE(BatchPQRemove); |
| |
| // Add a page to the batch set. Automatically calls |Flush| if the limit is reached. |
| void Push(vm_page_t* page) { |
| DEBUG_ASSERT(page); |
| DEBUG_ASSERT(count_ < kMaxPages); |
| pages_[count_] = page; |
| count_++; |
| if (count_ == kMaxPages) { |
| Flush(); |
| } |
| } |
| |
| // Performs |Remove| on any pending pages. This allows you to know that all pages are in the |
| // original list so that you can do operations on the list. |
| void Flush() { |
| if (count_ > 0) { |
| pmm_page_queues()->RemoveArrayIntoList(pages_, count_, freed_list_); |
| freed_count_ += count_; |
| count_ = 0; |
| } |
| } |
| |
| // Returns the number of pages that were added to |freed_list_| by calls to Flush(). The |
| // |freed_count_| counter keeps a running count of freed pages as they are removed and added to |
| // |freed_list_|, avoiding having to walk |freed_list_| to compute its length. |
| size_t freed_count() const { return freed_count_; } |
| |
| // Produces a callback suitable for passing to VmPageList::RemovePages that will |Push| any pages |
| // it encounters, leaving an empty slot behind. |
| auto RemovePagesCallback() { |
| return [this](VmPageOrMarker* p, uint64_t off) { |
| if (p->IsPage()) { |
| vm_page_t* page = p->ReleasePage(); |
| Push(page); |
| } |
| *p = VmPageOrMarker::Empty(); |
| return ZX_ERR_NEXT; |
| }; |
| } |
| |
| private: |
| // The value of 64 was chosen as only minimal performance gains were originally measured when |
| // using higher values. There is an incentive to keep this as small as possible, since this is |
| // typically created on the stack and our stack space is limited. |
| static constexpr size_t kMaxPages = 64; |
| |
| size_t count_ = 0; |
| size_t freed_count_ = 0; |
| vm_page_t* pages_[kMaxPages]; |
| list_node_t* freed_list_ = nullptr; |
| }; |
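| |
| // A typical BatchPQRemove usage pattern, mirroring the existing call sites in fbl_recycle() and |
| // MergeContentWithChildLocked() below (sketch only): |
| // |
| //   list_node_t freed_list; |
| //   list_initialize(&freed_list); |
| //   __UNINITIALIZED BatchPQRemove page_remover(&freed_list); |
| //   page_list_.RemoveAllPages([&page_remover](vm_page_t* page) { |
| //     page_remover.Push(page); |
| //   }); |
| //   page_remover.Flush();   // Pages only land in freed_list after Flush(). |
| //   FreePages(&freed_list); |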
| |
| VmCowPages::VmCowPages(ktl::unique_ptr<VmCowPagesContainer> cow_container, |
| const fbl::RefPtr<VmHierarchyState> hierarchy_state_ptr, |
| VmCowPagesOptions options, uint32_t pmm_alloc_flags, uint64_t size, |
| fbl::RefPtr<PageSource> page_source) |
| : VmHierarchyBase(ktl::move(hierarchy_state_ptr)), |
| container_(fbl::AdoptRef(cow_container.release())), |
| debug_retained_raw_container_(container_.get()), |
| options_(options), |
| size_(size), |
| pmm_alloc_flags_(pmm_alloc_flags), |
| page_source_(ktl::move(page_source)) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| DEBUG_ASSERT(!(pmm_alloc_flags & PMM_ALLOC_FLAG_CAN_BORROW)); |
| } |
| |
| void VmCowPages::fbl_recycle() { |
| canary_.Assert(); |
| |
| // To prevent races with a hidden parent creation or merging, it is necessary to hold the lock |
| // over the is_hidden and parent_ check and into the subsequent removal call. |
| // It is safe to grab the lock here because we are careful to never cause the last reference to |
| // a VmCowPages to be dropped in this code whilst holding the lock. The single place we drop a |
| // VmCowPages reference that could trigger a deletion is in this destructor when parent_ is |
| // dropped, but that is always done without holding the lock. |
| { // scope guard |
| Guard<Mutex> guard{&lock_}; |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| // If we're not a hidden vmo, then we need to remove ourself from our parent. This needs |
| // to be done before emptying the page list so that a hidden parent can't merge into this |
| // vmo and repopulate the page list. |
| if (!is_hidden_locked()) { |
| if (parent_) { |
| AssertHeld(parent_->lock_); |
| parent_->RemoveChildLocked(this); |
| // Avoid recursing destructors when we delete our parent by using the deferred deletion |
| // method. See the comment in the parent else branch for why we can avoid this on a hidden parent. |
| if (!parent_->is_hidden_locked()) { |
| guard.CallUnlocked([this, parent = ktl::move(parent_)]() mutable { |
| hierarchy_state_ptr_->DoDeferredDelete(ktl::move(parent)); |
| }); |
| } |
| } |
| } else { |
| // Most of the hidden vmo's state should have already been cleaned up when it merged |
| // itself into its child in ::RemoveChildLocked. |
| DEBUG_ASSERT(children_list_len_ == 0); |
| DEBUG_ASSERT(page_list_.HasNoPages()); |
| // Even though we are hidden we might have a parent. Unlike in the other branch of this if we |
| // do not need to perform any deferred deletion. The reason for this is that the deferred |
| // deletion mechanism is intended to resolve the scenario where there is a chain of 'one ref' |
| // parent pointers that will chain delete. However, with hidden parents we *know* that a |
| // hidden parent has two children (and hence at least one other ref to it) and so we cannot be |
| // in a one ref chain. Even if N threads all tried to remove children from the hierarchy at |
| // once, this would ultimately get serialized through the lock and the hierarchy would go from |
| // |
| //        [..] |
| //        / |
| //       A                            [..] |
| //      / \                          / |
| //     B   E          TO        B   A |
| //    / \                      /   / \. |
| //   C   D                    C   D   E |
| // |
| // And so each serialized deletion breaks off a discrete two-VMO chain that can be safely |
| // finalized with one recursive step. |
| } |
| |
| RemoveFromDiscardableListLocked(); |
| |
| // We stack-own loaned pages between removing the page from PageQueues and freeing the page via |
| // call to FreePages(). |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| // Cleanup page lists and page sources. |
| list_node_t list; |
| list_initialize(&list); |
| |
| __UNINITIALIZED BatchPQRemove page_remover(&list); |
| // free all of the pages attached to us |
| page_list_.RemoveAllPages([&page_remover](vm_page_t* page) { |
| ASSERT(page->object.pin_count == 0); |
| page_remover.Push(page); |
| }); |
| page_remover.Flush(); |
| |
| FreePages(&list); |
| |
| // We must Close() after removing pages, so that all pages will be loaned by the time |
| // PhysicalPageProvider::OnClose() calls pmm_delete_lender() on the whole physical range. |
| if (page_source_) { |
| page_source_->Close(); |
| } |
| |
| // Update counters |
| if (is_latency_sensitive_) { |
| vm_vmo_latency_sensitive_destroyed.Add(1); |
| } |
| } // ~guard |
| |
| // Release the ref that VmCowPages keeps on VmCowPagesContainer. |
| container_.reset(); |
| } |
| |
| VmCowPages::~VmCowPages() { |
| // All the explicit cleanup happens in fbl_recycle(). Only asserts and implicit cleanup happens |
| // in the destructor. |
| canary_.Assert(); |
| // While we use a ktl::optional<VmCowPages> in VmCowPagesContainer, we don't intend to reset() it |
| // early. |
| DEBUG_ASSERT(0 == ref_count_debug()); |
| // We only intend to delete VmCowPages when the container is also deleting, and the container |
| // won't be deleting unless its ref is 0. |
| DEBUG_ASSERT(!container_); |
| DEBUG_ASSERT(0 == debug_retained_raw_container_->ref_count_debug()); |
| } |
| |
| bool VmCowPages::DedupZeroPage(vm_page_t* page, uint64_t offset) { |
| canary_.Assert(); |
| |
| Guard<Mutex> guard{&lock_}; |
| |
| // TODO(fxb/85056): Formalize this. |
| // Forbid zero page deduping if this is latency sensitive. |
| if (is_latency_sensitive_) { |
| return false; |
| } |
| |
| if (paged_ref_) { |
| AssertHeld(paged_ref_->lock_ref()); |
| if (!paged_ref_->CanDedupZeroPagesLocked()) { |
| return false; |
| } |
| } |
| |
| // Check this page is still a part of this VMO. object.page_offset could be wrong, but there's no |
| // harm in looking up a random slot as we'll then notice it's the wrong page. |
| const VmPageOrMarker* page_or_marker = page_list_.Lookup(offset); |
| if (!page_or_marker || !page_or_marker->IsPage() || page_or_marker->Page() != page || |
| page->object.pin_count > 0) { |
| return false; |
| } |
| |
| // We expect most pages to not be zero, as such we will first do a 'racy' zero page check where |
| // we leave write permissions on the page. If the page isn't zero, which is our hope, then we |
| // haven't paid the price of modifying page tables. |
| if (!IsZeroPage(page_or_marker->Page())) { |
| return false; |
| } |
| |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::RemoveWrite); |
| |
| if (IsZeroPage(page_or_marker->Page())) { |
| // Replace the slot with a marker. |
| VmPageOrMarker new_marker = VmPageOrMarker::Marker(); |
| ktl::optional<vm_page_t*> old_page = ktl::nullopt; |
| zx_status_t status = |
| AddPageLocked(&new_marker, offset, CanOverwriteContent::NonZero, &old_page); |
| DEBUG_ASSERT(status == ZX_OK); |
| DEBUG_ASSERT(old_page.has_value()); |
| |
| // Free the old page. |
| vm_page_t* released_page = old_page.value(); |
| pmm_page_queues()->Remove(released_page); |
| DEBUG_ASSERT(!list_in_list(&released_page->queue_node)); |
| FreePage(released_page); |
| |
| eviction_event_count_++; |
| IncrementHierarchyGenerationCountLocked(); |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return true; |
| } |
| return false; |
| } |
| |
| uint32_t VmCowPages::ScanForZeroPagesLocked(bool reclaim) { |
| canary_.Assert(); |
| |
| if (!can_decommit_zero_pages_locked()) { |
| // Even if !reclaim, we don't count zero pages in contiguous VMOs because we can't reclaim them |
| // anyway (at least not just due to them being zero; user mode can decommit). We also don't |
| // add contiguous VMO pages to the ZeroFork queue ever, so counting these would only create |
| // false hope that we could potentially loan the pages despite explicitly not wanting to |
| // auto-decommit zero pages as expressed by !can_decommit_zero_pages_locked(). In future we |
| // may relax this restriction on contiguous VMOs at which point it'd be fine to remove zero |
| // pages (assuming other criteria are met, like not being pinned). |
| return 0; |
| } |
| |
| // Check if we have any slice children. Slice children may have writable mappings to our pages, |
| // and so we need to also remove any mappings from them. Non-slice children could only have |
| // read-only mappings, which is the state we already want, and so we don't need to touch them. |
| for (auto& child : children_list_) { |
| AssertHeld(child.lock_); |
| if (child.is_slice_locked()) { |
| // Slices are strict subsets of their parents so we don't need to bother looking at parent |
| // limits etc and can just operate on the entire range. |
| child.RangeChangeUpdateLocked(0, child.size_, RangeChangeOp::RemoveWrite); |
| } |
| } |
| |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| |
| uint32_t count = 0; |
| page_list_.RemovePages( |
| [&count, &freed_list, reclaim, this](VmPageOrMarker* p, uint64_t off) { |
| // Pinned pages cannot be decommitted so do not consider them. |
| if (p->IsPage() && p->Page()->object.pin_count == 0 && IsZeroPage(p->Page())) { |
| count++; |
| if (reclaim) { |
| // Need to remove all mappings (including read-only ones) to this range before we remove the |
| // page. |
| AssertHeld(this->lock_); |
| RangeChangeUpdateLocked(off, PAGE_SIZE, RangeChangeOp::Unmap); |
| vm_page_t* page = p->ReleasePage(); |
| pmm_page_queues()->Remove(page); |
| DEBUG_ASSERT(!list_in_list(&page->queue_node)); |
| list_add_tail(&freed_list, &page->queue_node); |
| *p = VmPageOrMarker::Marker(); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }, |
| 0, VmPageList::MAX_SIZE); |
| |
| FreePages(&freed_list); |
| |
| if (reclaim && count > 0) { |
| IncrementHierarchyGenerationCountLocked(); |
| // A batch free is counted as a single eviction event. |
| eviction_event_count_++; |
| } |
| |
| return count; |
| } |
| |
| zx_status_t VmCowPages::Create(fbl::RefPtr<VmHierarchyState> root_lock, VmCowPagesOptions options, |
| uint32_t pmm_alloc_flags, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_pages) { |
| DEBUG_ASSERT(!(options & VmCowPagesOptions::kInternalOnlyMask)); |
| fbl::AllocChecker ac; |
| auto cow = NewVmCowPages(&ac, ktl::move(root_lock), options, pmm_alloc_flags, size, nullptr); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| *cow_pages = ktl::move(cow); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CreateExternal(fbl::RefPtr<PageSource> src, VmCowPagesOptions options, |
| fbl::RefPtr<VmHierarchyState> root_lock, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_pages) { |
| DEBUG_ASSERT(!(options & VmCowPagesOptions::kInternalOnlyMask)); |
| fbl::AllocChecker ac; |
| auto cow = |
| NewVmCowPages(&ac, ktl::move(root_lock), options, PMM_ALLOC_FLAG_ANY, size, ktl::move(src)); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| { |
| // If the page source preserves content, initialize supply_zero_offset_ to size. All initial |
| // content for a newly created VMO is provided by the page source, i.e. there is no content that |
| // the kernel implicitly supplies with zero. |
| Guard<Mutex> guard{&cow->lock_}; |
| if (cow->is_source_preserving_page_content_locked()) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| cow->UpdateSupplyZeroOffsetLocked(size); |
| } |
| } |
| |
| *cow_pages = ktl::move(cow); |
| return ZX_OK; |
| } |
| |
| void VmCowPages::ReplaceChildLocked(VmCowPages* old, VmCowPages* new_child) { |
| canary_.Assert(); |
| children_list_.replace(*old, new_child); |
| } |
| |
| void VmCowPages::DropChildLocked(VmCowPages* child) { |
| canary_.Assert(); |
| DEBUG_ASSERT(children_list_len_ > 0); |
| children_list_.erase(*child); |
| --children_list_len_; |
| } |
| |
| void VmCowPages::AddChildLocked(VmCowPages* child, uint64_t offset, uint64_t root_parent_offset, |
| uint64_t parent_limit) { |
| canary_.Assert(); |
| |
| // As we do not want to have to return failure from this function we require root_parent_offset to |
| // be calculated, and validated not to overflow, externally; but we can still assert that it has |
| // been calculated correctly to prevent accidents. |
| AssertHeld(child->lock_ref()); |
| DEBUG_ASSERT(CheckedAdd(root_parent_offset_, offset) == root_parent_offset); |
| |
| // The child should definitely stop seeing into the parent at the limit of its size. |
| DEBUG_ASSERT(parent_limit <= child->size_); |
| |
| // Write in the parent view values. |
| child->root_parent_offset_ = root_parent_offset; |
| child->parent_offset_ = offset; |
| child->parent_limit_ = parent_limit; |
| |
| // This child should be in an initial state and these members should be clear. |
| DEBUG_ASSERT(!child->partial_cow_release_); |
| DEBUG_ASSERT(child->parent_start_limit_ == 0); |
| |
| child->page_list_.InitializeSkew(page_list_.GetSkew(), offset); |
| |
| child->parent_ = fbl::RefPtr(this); |
| children_list_.push_front(child); |
| children_list_len_++; |
| } |
| |
| zx_status_t VmCowPages::CreateChildSliceLocked(uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_slice) { |
| LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size); |
| |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| DEBUG_ASSERT(CheckedAdd(offset, size) <= size_); |
| |
| // If this is a slice, re-home the new slice onto our parent. Due to this logic we can guarantee |
| // that any slice parent is, itself, not a slice. |
| // We are able to do this for two reasons: |
| // * Slices are subsets and so every position in a slice always maps back to the paged parent. |
| // * Slices are not permitted to be resized and so nothing can be done on the intermediate parent |
| // that requires us to ever look at it again. |
| if (is_slice_locked()) { |
| DEBUG_ASSERT(parent_); |
| AssertHeld(parent_->lock_ref()); |
| DEBUG_ASSERT(!parent_->is_slice_locked()); |
| return parent_->CreateChildSliceLocked(offset + parent_offset_, size, cow_slice); |
| } |
| |
| fbl::AllocChecker ac; |
| // Slices just need the slice option and default alloc flags since they will propagate any |
| // operation up to a parent and use their options and alloc flags. |
| auto slice = NewVmCowPages(&ac, hierarchy_state_ptr_, VmCowPagesOptions::kSlice, |
| PMM_ALLOC_FLAG_ANY, size, nullptr); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| // At this point slice must *not* be destructed in this function, as doing so would cause a |
| // deadlock. That means from this point on we *must* succeed and any future error checking needs |
| // to be added prior to creation. |
| |
| AssertHeld(slice->lock_); |
| |
| // As our slice must be in range of the parent it is impossible to have the accumulated parent |
| // offset overflow. |
| uint64_t root_parent_offset = CheckedAdd(offset, root_parent_offset_); |
| CheckedAdd(root_parent_offset, size); |
| |
| AddChildLocked(slice.get(), offset, root_parent_offset, size); |
| |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(slice->DebugValidateVmoPageBorrowingLocked()); |
| |
| *cow_slice = slice; |
| return ZX_OK; |
| } |
| |
| void VmCowPages::CloneParentIntoChildLocked(fbl::RefPtr<VmCowPages>& child) { |
| AssertHeld(child->lock_ref()); |
| // This function is invalid to call if any pages are pinned as the unpin after we change the |
| // backlink will not work. |
| DEBUG_ASSERT(pinned_page_count_ == 0); |
| // We are going to change our linked VmObjectPaged to eventually point to our left child instead |
| // of us, so we need to make the left child look equivalent. To do this it inherits our |
| // children, attribution id and eviction count and is sized to completely cover us. |
| for (auto& c : children_list_) { |
| AssertHeld(c.lock_ref()); |
| c.parent_ = child; |
| } |
| child->children_list_ = ktl::move(children_list_); |
| child->children_list_len_ = children_list_len_; |
| children_list_len_ = 0; |
| child->eviction_event_count_ = eviction_event_count_; |
| child->page_attribution_user_id_ = page_attribution_user_id_; |
| AddChildLocked(child.get(), 0, root_parent_offset_, size_); |
| |
| // Time to change the VmCowPages that our paged_ref_ points to. |
| if (paged_ref_) { |
| child->paged_ref_ = paged_ref_; |
| AssertHeld(paged_ref_->lock_ref()); |
| fbl::RefPtr<VmCowPages> __UNUSED previous = |
| paged_ref_->SetCowPagesReferenceLocked(ktl::move(child)); |
| // Validate that we replaced a reference to ourself as we expected, this ensures we can safely |
| // drop the refptr without triggering our own destructor, since we know someone else must be |
| // holding a refptr to us to be in this function. |
| DEBUG_ASSERT(previous.get() == this); |
| paged_ref_ = nullptr; |
| } |
| } |
| |
| zx_status_t VmCowPages::CreateCloneLocked(CloneType type, uint64_t offset, uint64_t size, |
| fbl::RefPtr<VmCowPages>* cow_child) { |
| LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size); |
| |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(size)); |
| DEBUG_ASSERT(!is_hidden_locked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| // All validation *must* be performed here prior to constructing the VmCowPages, as the |
| // destructor for VmCowPages may acquire the lock, which we are already holding. |
| |
| switch (type) { |
| case CloneType::Snapshot: { |
| if (!is_cow_clonable_locked()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // If this is non-zero, that means that there are pages which hardware can |
| // touch, so the vmo can't be safely cloned. |
| // TODO: consider immediately forking these pages. |
| if (pinned_page_count_locked()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| break; |
| } |
| case CloneType::PrivatePagerCopy: |
| if (!is_private_pager_copy_supported()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| break; |
| } |
| |
| uint64_t new_root_parent_offset; |
| bool overflow; |
| overflow = add_overflow(offset, root_parent_offset_, &new_root_parent_offset); |
| if (overflow) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| uint64_t temp; |
| overflow = add_overflow(new_root_parent_offset, size, &temp); |
| if (overflow) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| uint64_t child_parent_limit = offset >= size_ ? 0 : ktl::min(size, size_ - offset); |
| |
| // Invalidate everything the clone will be able to see. They're COW pages now, |
| // so any existing mappings can no longer directly write to the pages. |
| RangeChangeUpdateLocked(offset, size, RangeChangeOp::RemoveWrite); |
| |
| if (type == CloneType::Snapshot) { |
| // We need two new VmCowPages for our two children. To avoid the destructor of the first being |
| // invoked if the second fails, we perform the allocations and construction separately. It's fine |
| // for the destructor of VmCowPagesContainer to run since the optional VmCowPages isn't emplaced |
| // yet, so the VmCowPages destructor doesn't run if the second allocation fails. |
| fbl::AllocChecker ac; |
| ktl::unique_ptr<VmCowPagesContainer> left_child_placeholder = |
| ktl::make_unique<VmCowPagesContainer>(&ac); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| ktl::unique_ptr<VmCowPagesContainer> right_child_placeholder = |
| ktl::make_unique<VmCowPagesContainer>(&ac); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| // At this point cow_pages must *not* be destructed in this function, as doing so would cause a |
| // deadlock. That means from this point on we *must* succeed and any future error checking needs |
| // to be added prior to creation. |
| |
| fbl::RefPtr<VmCowPages> left_child = |
| NewVmCowPages(ktl::move(left_child_placeholder), hierarchy_state_ptr_, |
| VmCowPagesOptions::kNone, pmm_alloc_flags_, size_, nullptr); |
| fbl::RefPtr<VmCowPages> right_child = |
| NewVmCowPages(ktl::move(right_child_placeholder), hierarchy_state_ptr_, |
| VmCowPagesOptions::kNone, pmm_alloc_flags_, size, nullptr); |
| |
| AssertHeld(left_child->lock_ref()); |
| AssertHeld(right_child->lock_ref()); |
| |
| // The left child becomes a full clone of us, inheriting our children, paged backref etc. |
| CloneParentIntoChildLocked(left_child); |
| |
| // The right child is the, potentially partial, view into the parent and so has a variable offset. |
| // If this view would extend beyond us then we need to clip the parent_limit to our size_, which |
| // will ensure any pages in that range just get initialized from zeroes. |
| AddChildLocked(right_child.get(), offset, new_root_parent_offset, child_parent_limit); |
| |
| // Transition into being the hidden node. |
| options_ |= VmCowPagesOptions::kHidden; |
| DEBUG_ASSERT(children_list_len_ == 2); |
| |
| *cow_child = ktl::move(right_child); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } else { |
| fbl::AllocChecker ac; |
| auto cow_pages = NewVmCowPages(&ac, hierarchy_state_ptr_, VmCowPagesOptions::kNone, |
| pmm_alloc_flags_, size, nullptr); |
| if (!ac.check()) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| // Walk up the parent chain until we find a good place to hang this new cow clone. A good place |
| // here means the first place that has committed pages that we actually need to snapshot. In |
| // doing so we need to ensure that the limits of the child we create do not let it see more of |
| // the final parent than it would have been able to see from here. |
| VmCowPages* cur = this; |
| AssertHeld(cur->lock_ref()); |
| while (cur->parent_) { |
| // There's a parent, check if there are any pages in the current range. Unless we've moved |
| // outside the range of our parent, in which case we can just walk up. |
| if (child_parent_limit > 0 && |
| cur->page_list_.AnyPagesInRange(offset, offset + child_parent_limit)) { |
| break; |
| } |
| // To move to the parent we need to translate our window into |cur|. |
| if (offset >= cur->parent_limit_) { |
| child_parent_limit = 0; |
| } else { |
| child_parent_limit = ktl::min(child_parent_limit, cur->parent_limit_ - offset); |
| } |
| offset += cur->parent_offset_; |
| cur = cur->parent_.get(); |
| } |
| new_root_parent_offset = CheckedAdd(offset, cur->root_parent_offset_); |
| cur->AddChildLocked(cow_pages.get(), offset, new_root_parent_offset, child_parent_limit); |
| |
| *cow_child = ktl::move(cow_pages); |
| } |
| |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| AssertHeld((*cow_child)->lock_ref()); |
| VMO_FRUGAL_VALIDATION_ASSERT((*cow_child)->DebugValidateVmoPageBorrowingLocked()); |
| |
| return ZX_OK; |
| } |
| |
| void VmCowPages::RemoveChildLocked(VmCowPages* removed) { |
| canary_.Assert(); |
| |
| AssertHeld(removed->lock_); |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| if (!is_hidden_locked()) { |
| DropChildLocked(removed); |
| return; |
| } |
| |
| // Hidden vmos always have 0 or 2 children, but we can't be here with 0 children. |
| DEBUG_ASSERT(children_list_len_ == 2); |
| bool removed_left = &left_child_locked() == removed; |
| |
| DropChildLocked(removed); |
| |
| VmCowPages* child = &children_list_.front(); |
| DEBUG_ASSERT(child); |
| |
| MergeContentWithChildLocked(removed, removed_left); |
| |
| // The child which removed itself and led to the invocation should have a reference |
| // to us, in addition to child.parent_ which we are about to clear. |
| DEBUG_ASSERT(ref_count_debug() >= 2); |
| |
| AssertHeld(child->lock_); |
| if (child->page_attribution_user_id_ != page_attribution_user_id_) { |
| // If the attribution user id of this vmo doesn't match that of its remaining child, |
| // then the vmo with the matching attribution user id was just closed. In that case, we |
| // need to reattribute the pages of any ancestor hidden vmos to vmos that still exist. |
| // |
| // The syscall API doesn't specify how pages are to be attributed among a group of COW |
| // clones. One option is to pick a remaining vmo 'arbitrarily' and attribute everything to |
| // that vmo. However, it seems fairer to reattribute each remaining hidden vmo with |
| // its child whose user id doesn't match the vmo that was just closed. So walk up the |
| // clone chain and attribute each hidden vmo to the vmo we didn't just walk through. |
| auto cur = this; |
| AssertHeld(cur->lock_); |
| uint64_t user_id_to_skip = page_attribution_user_id_; |
| while (cur->parent_ != nullptr) { |
| auto parent = cur->parent_.get(); |
| AssertHeld(parent->lock_); |
| DEBUG_ASSERT(parent->is_hidden_locked()); |
| |
| if (parent->page_attribution_user_id_ == page_attribution_user_id_) { |
| uint64_t new_user_id = parent->left_child_locked().page_attribution_user_id_; |
| if (new_user_id == user_id_to_skip) { |
| new_user_id = parent->right_child_locked().page_attribution_user_id_; |
| } |
| // Although user IDs can be unset for VMOs that do not have a dispatcher, copy-on-write |
| // VMOs always have user level dispatchers, and should have a valid user-id set, hence we |
| // should never end up re-attributing a hidden parent with an unset id. |
| DEBUG_ASSERT(new_user_id != 0); |
| // The 'if' above should mean that the new_user_id isn't the ID we are trying to remove |
| // and isn't one we just used. For this to fail we either need a corrupt VMO hierarchy, or |
| // to have labeled two leaf nodes with the same user_id, which would also be incorrect as |
| // leaf nodes have unique dispatchers and hence unique ids. |
| DEBUG_ASSERT(new_user_id != page_attribution_user_id_ && new_user_id != user_id_to_skip); |
| parent->page_attribution_user_id_ = new_user_id; |
| user_id_to_skip = new_user_id; |
| |
| cur = parent; |
| } else { |
| break; |
| } |
| } |
| } |
| |
| // Drop the child from our list, but don't recurse back into this function. Then |
| // remove ourselves from the clone tree. |
| DropChildLocked(child); |
| if (parent_) { |
| AssertHeld(parent_->lock_ref()); |
| parent_->ReplaceChildLocked(this, child); |
| } |
| child->parent_ = ktl::move(parent_); |
| |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| void VmCowPages::MergeContentWithChildLocked(VmCowPages* removed, bool removed_left) { |
| DEBUG_ASSERT(children_list_len_ == 1); |
| VmCowPages& child = children_list_.front(); |
| AssertHeld(child.lock_); |
| AssertHeld(removed->lock_); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| list_node freed_pages; |
| list_initialize(&freed_pages); |
| __UNINITIALIZED BatchPQRemove page_remover(&freed_pages); |
| |
| const uint64_t visibility_start_offset = child.parent_offset_ + child.parent_start_limit_; |
| const uint64_t merge_start_offset = child.parent_offset_; |
| const uint64_t merge_end_offset = child.parent_offset_ + child.parent_limit_; |
| |
| // Hidden parents are not supposed to have page sources, but we assert it here anyway because a |
| // page source would make the way we move pages between objects incorrect, as we would break any |
| // potential back links. |
| DEBUG_ASSERT(!has_pager_backlinks_locked()); |
| |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), 0, visibility_start_offset); |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), merge_end_offset, |
| VmPageList::MAX_SIZE); |
| |
| if (child.parent_offset_ + child.parent_limit_ > parent_limit_) { |
| // Update the child's parent limit to ensure that it won't be able to see more |
| // of its new parent than this hidden vmo was able to see. |
| if (parent_limit_ < child.parent_offset_) { |
| child.parent_limit_ = 0; |
| child.parent_start_limit_ = 0; |
| } else { |
| child.parent_limit_ = parent_limit_ - child.parent_offset_; |
| child.parent_start_limit_ = ktl::min(child.parent_start_limit_, child.parent_limit_); |
| } |
| } else { |
| // The child will be able to see less of its new parent than this hidden vmo was |
| // able to see, so release any parent pages in that range. |
| ReleaseCowParentPagesLocked(merge_end_offset, parent_limit_, &page_remover); |
| } |
| |
| if (removed->parent_offset_ + removed->parent_start_limit_ < visibility_start_offset) { |
| // If the removed former child has a smaller offset, then there are retained |
| // ancestor pages that will no longer be visible and thus should be freed. |
| ReleaseCowParentPagesLocked(removed->parent_offset_ + removed->parent_start_limit_, |
| visibility_start_offset, &page_remover); |
| } |
| |
| // Adjust the child's offset so it will still see the correct range. |
| bool overflow = add_overflow(parent_offset_, child.parent_offset_, &child.parent_offset_); |
| // Overflow here means that something went wrong when setting up parent limits. |
| DEBUG_ASSERT(!overflow); |
| |
| if (child.is_hidden_locked()) { |
| // After the merge, either |child| can't see anything in parent (in which case |
| // the parent limits could be anything), or |child|'s first visible offset will be |
| // at least as large as |this|'s first visible offset. |
| DEBUG_ASSERT(child.parent_start_limit_ == child.parent_limit_ || |
| parent_offset_ + parent_start_limit_ <= |
| child.parent_offset_ + child.parent_start_limit_); |
| } else { |
| // non-hidden vmos should always have zero parent_start_limit_ |
| DEBUG_ASSERT(child.parent_start_limit_ == 0); |
| } |
| |
| // As we are moving pages between objects we need to make sure no backlinks are broken. We know |
| // there's no page_source_ and hence no pages will be in the pager_backed queue, but we could |
| // have pages in the unswappable_zero_forked queue. We do know that pages in this queue cannot |
| // have been pinned, so we can just move (or re-move potentially) any page that is not pinned |
| // into the unswappable queue. |
| { |
| PageQueues* pq = pmm_page_queues(); |
| Guard<CriticalMutex> guard{pq->get_lock()}; |
| page_list_.ForEveryPage([pq](auto* p, uint64_t off) { |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| if (page->object.pin_count == 0) { |
| AssertHeld<Lock<CriticalMutex>>(*pq->get_lock()); |
| pq->MoveToUnswappableLocked(page); |
| } |
| } |
| return ZX_ERR_NEXT; |
| }); |
| } |
| |
| // At this point, we need to merge |this|'s page list and |child|'s page list. |
| // |
| // In general, COW clones are expected to share most of their pages (i.e. to fork a relatively |
| // small number of pages). Because of this, it is preferable to do work proportional to the |
| // number of pages which were forked into |removed|. However, there are a few things that can |
| // prevent this: |
| // - If |child|'s offset is non-zero then the offsets of all of |this|'s pages will |
| // need to be updated when they are merged into |child|. |
| // - If there has been a call to ReleaseCowParentPagesLocked which was not able to |
| // update the parent limits, then there can exist pages in this vmo's page list |
| // which are not visible to |child| but can't be easily freed based on its parent |
| // limits. Finding these pages requires examining the split bits of all pages. |
| // - If |child| is hidden, then there can exist pages in this vmo which were split into |
| // |child|'s subtree and then migrated out of |child|. Those pages need to be freed, and |
| // the simplest way to find those pages is to examine the split bits. |
| bool fast_merge = merge_start_offset == 0 && !partial_cow_release_ && !child.is_hidden_locked(); |
| |
| if (fast_merge) { |
| // Only leaf vmos can be directly removed, so this must always be true. This guarantees |
| // that there are no pages that were split into |removed| that have since been migrated |
| // to its children. |
| DEBUG_ASSERT(!removed->is_hidden_locked()); |
| |
| // Before merging, find any pages that are present in both |removed| and |this|. Those |
| // pages are visible to |child| but haven't been written to through |child|, so |
| // their split bits need to be cleared. Note that ::ReleaseCowParentPagesLocked ensures |
| // that pages outside of the parent limit range won't have their split bits set. |
| removed->page_list_.ForEveryPageInRange( |
| [removed_offset = removed->parent_offset_, this](auto* page, uint64_t offset) { |
| AssertHeld(lock_); |
| // Whether this is a true page, or a marker, we must check |this| for a page as either |
| // represents a potential fork, even if we subsequently changed it to a marker. |
| const VmPageOrMarker* page_or_mark = page_list_.Lookup(offset + removed_offset); |
| if (page_or_mark && page_or_mark->IsPage()) { |
| vm_page* p_page = page_or_mark->Page(); |
| // The page was definitely forked into |removed|, but |
| // shouldn't be forked twice. |
| DEBUG_ASSERT(p_page->object.cow_left_split ^ p_page->object.cow_right_split); |
| p_page->object.cow_left_split = 0; |
| p_page->object.cow_right_split = 0; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| removed->parent_start_limit_, removed->parent_limit_); |
| |
| // These will be freed, but we accumulate them separately so they can be used in asserts before |
| // being added to freed_pages. |
| list_node covered_pages; |
| list_initialize(&covered_pages); |
| __UNINITIALIZED BatchPQRemove covered_remover(&covered_pages); |
| |
| // Now merge |child|'s pages into |this|, overwriting any pages present in |this|, and |
| // then move that list to |child|. |
| child.page_list_.MergeOnto(page_list_, |
| [&covered_remover](vm_page_t* p) { covered_remover.Push(p); }); |
| child.page_list_ = ktl::move(page_list_); |
| |
| vm_page_t* p; |
| covered_remover.Flush(); |
| list_for_every_entry (&covered_pages, p, vm_page_t, queue_node) { |
| // The page was already present in |child|, so it should be split at least |
| // once. And being split twice is obviously bad. |
| ASSERT(p->object.cow_left_split ^ p->object.cow_right_split); |
| ASSERT(p->object.pin_count == 0); |
| } |
| list_splice_after(&covered_pages, &freed_pages); |
| } else { |
| // Merge our page list into the child page list and update all the necessary metadata. |
| child.page_list_.MergeFrom( |
| page_list_, merge_start_offset, merge_end_offset, |
| [&page_remover](vm_page* page, uint64_t offset) { page_remover.Push(page); }, |
| [&page_remover, removed_left](VmPageOrMarker* page_or_marker, uint64_t offset) { |
| DEBUG_ASSERT(page_or_marker->IsPage()); |
| vm_page_t* page = page_or_marker->Page(); |
| DEBUG_ASSERT(page->object.pin_count == 0); |
| |
| if (removed_left ? page->object.cow_right_split : page->object.cow_left_split) { |
| // This happens when the page was already migrated into the child but then |
| // was migrated further into the child's descendants. The page can be freed. |
| page = page_or_marker->ReleasePage(); |
| page_remover.Push(page); |
| } else { |
| // Since we recursively fork on write, if the child doesn't have the |
| // page, then neither of its children do. |
| page->object.cow_left_split = 0; |
| page->object.cow_right_split = 0; |
| } |
| }); |
| } |
| |
| page_remover.Flush(); |
| if (!list_is_empty(&freed_pages)) { |
| FreePages(&freed_pages); |
| } |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| void VmCowPages::DumpLocked(uint depth, bool verbose) const { |
| canary_.Assert(); |
| |
| size_t count = 0; |
| page_list_.ForEveryPage([&count](const auto* p, uint64_t) { |
| if (p->IsPage()) { |
| count++; |
| } |
| return ZX_ERR_NEXT; |
| }); |
| |
| for (uint i = 0; i < depth; ++i) { |
| printf(" "); |
| } |
| printf("cow_pages %p size %#" PRIx64 " offset %#" PRIx64 " start limit %#" PRIx64 |
| " limit %#" PRIx64 " pages %zu ref %d parent %p\n", |
| this, size_, parent_offset_, parent_start_limit_, parent_limit_, count, ref_count_debug(), |
| parent_.get()); |
| |
| if (page_source_) { |
| for (uint i = 0; i < depth + 1; ++i) { |
| printf(" "); |
| } |
| page_source_->Dump(); |
| } |
| |
| if (verbose) { |
| auto f = [depth](const auto* p, uint64_t offset) { |
| for (uint i = 0; i < depth + 1; ++i) { |
| printf(" "); |
| } |
| if (p->IsMarker()) { |
| printf("offset %#" PRIx64 " zero page marker\n", offset); |
| } else { |
| vm_page_t* page = p->Page(); |
| printf("offset %#" PRIx64 " page %p paddr %#" PRIxPTR "(%c%c%c)\n", offset, page, |
| page->paddr(), page->object.cow_left_split ? 'L' : '.', |
| page->object.cow_right_split ? 'R' : '.', page->object.always_need ? 'A' : '.'); |
| } |
| return ZX_ERR_NEXT; |
| }; |
| page_list_.ForEveryPage(f); |
| } |
| } |
| |
| size_t VmCowPages::AttributedPagesInRangeLocked(uint64_t offset, uint64_t len) const { |
| canary_.Assert(); |
| |
| if (is_hidden_locked()) { |
| return 0; |
| } |
| |
| size_t page_count = 0; |
| // TODO: Decide who pages should actually be attributed to. |
| page_list_.ForEveryPageAndGapInRange( |
| [&page_count](const auto* p, uint64_t off) { |
| if (p->IsPage()) { |
| page_count++; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [this, &page_count](uint64_t gap_start, uint64_t gap_end) { |
| AssertHeld(lock_); |
| |
| // If there's no parent, there's no pages to care about. If there is a non-hidden |
| // parent, then that owns any pages in the gap, not us. |
| if (!parent_) { |
| return ZX_ERR_NEXT; |
| } |
| AssertHeld(parent_->lock_ref()); |
| if (!parent_->is_hidden_locked()) { |
| return ZX_ERR_NEXT; |
| } |
| |
| // Count any ancestor pages that should be attributed to us in the range. Ideally the whole |
| // range gets processed in one attempt, but in order to prevent unbounded stack growth with |
| // recursion we instead process partial ranges and recalculate the intermediate results. |
| // As a result instead of being O(n) in the number of committed pages it could |
| // pathologically become O(nd) where d is our depth in the vmo hierarchy. |
| uint64_t off = gap_start; |
| while (off < parent_limit_ && off < gap_end) { |
| uint64_t local_count = 0; |
| uint64_t attributed = |
| CountAttributedAncestorPagesLocked(off, gap_end - off, &local_count); |
| // |CountAttributedAncestorPagesLocked| guarantees that it will make progress. |
| DEBUG_ASSERT(attributed > 0); |
| off += attributed; |
| page_count += local_count; |
| } |
| |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| |
| return page_count; |
| } |
| |
| uint64_t VmCowPages::CountAttributedAncestorPagesLocked(uint64_t offset, uint64_t size, |
| uint64_t* count) const TA_REQ(lock_) { |
| // We need to walk up the ancestor chain to see if there are any pages that should be attributed |
| // to this vmo. We attempt to operate on the entire range given to us, but should we need to query |
| // the next parent for a range we trim our operating range. Trimming the range is necessary as |
| // we cannot recurse and otherwise have no way to remember where we were up to after processing |
| // the range in the parent. The solution then is to return all the way back up to the caller with |
| // a partial range and then effectively recompute the metadata at the point we were up to. |
| |
| // Note that we cannot stop just because the page_attribution_user_id_ changes. This is because |
| // there might still be a forked page at the offset in question which should be attributed to |
| // this vmo. Whenever the attribution user id changes while walking up the ancestors, we need |
| // to determine if there is a 'closer' vmo in the sibling subtree to which the offset in |
| // question can be attributed, or if it should still be attributed to the current vmo. |
| |
| DEBUG_ASSERT(offset < parent_limit_); |
| const VmCowPages* cur = this; |
| AssertHeld(cur->lock_); |
| uint64_t cur_offset = offset; |
| uint64_t cur_size = size; |
| // Count of how many pages we attributed as being owned by this vmo. |
| uint64_t attributed_ours = 0; |
| // Count how much we've processed. This is needed so we can remember how far we have gotten when |
| // we iterate up the parent list at an offset. |
| uint64_t attributed = 0; |
| while (cur_offset < cur->parent_limit_) { |
| // For cur->parent_limit_ to be non-zero, it must have a parent. |
| DEBUG_ASSERT(cur->parent_); |
| |
| const auto parent = cur->parent_.get(); |
| AssertHeld(parent->lock_); |
| uint64_t parent_offset; |
| bool overflowed = add_overflow(cur->parent_offset_, cur_offset, &parent_offset); |
| DEBUG_ASSERT(!overflowed); // vmo creation should have failed |
| DEBUG_ASSERT(parent_offset <= parent->size_); // parent_limit_ prevents this |
| |
| const bool left = cur == &parent->left_child_locked(); |
| const auto& sib = left ? parent->right_child_locked() : parent->left_child_locked(); |
| |
| // Work out how much of the desired size is actually visible to us in the parent; we just use |
| // this to walk the correct amount of the page_list_. |
| const uint64_t parent_size = ktl::min(cur_size, cur->parent_limit_ - cur_offset); |
| |
| // By default we expect to process the entire range, hence our next_size is 0. Should we need to |
| // iterate up the stack then these will be set by one of the callbacks. |
| uint64_t next_parent_offset = parent_offset + cur_size; |
| uint64_t next_size = 0; |
| parent->page_list_.ForEveryPageAndGapInRange( |
| [&parent, &cur, &attributed_ours, &sib](const auto* p, uint64_t off) { |
| AssertHeld(cur->lock_); |
| AssertHeld(sib.lock_); |
| AssertHeld(parent->lock_); |
| if (p->IsMarker()) { |
| return ZX_ERR_NEXT; |
| } |
| vm_page* page = p->Page(); |
| if ( |
| // Page is explicitly owned by us |
| (parent->page_attribution_user_id_ == cur->page_attribution_user_id_) || |
| // If page has already been split and we can see it, then we know |
| // the sibling subtree can't see the page and thus it should be |
| // attributed to this vmo. |
| (page->object.cow_left_split || page->object.cow_right_split) || |
| // If the sibling cannot access this page then it's ours, otherwise we know there's |
| // a vmo in the sibling subtree which is 'closer' to this offset, and to which we will |
| // attribute the page. |
| !(sib.parent_offset_ + sib.parent_start_limit_ <= off && |
| off < sib.parent_offset_ + sib.parent_limit_)) { |
| attributed_ours++; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| [&parent, &cur, &next_parent_offset, &next_size, &sib](uint64_t gap_start, |
| uint64_t gap_end) { |
| // Process a gap in the parent VMO. |
| // |
| // A gap in the parent VMO doesn't necessarily mean there are no pages |
| // in this range: our parent's ancestors may have pages, so we need to |
| // walk up the tree to find out. |
| // |
| // We don't always need to walk the tree though: in this gap, both this VMO |
| // and our sibling VMO will share the same set of ancestor pages. However, the |
| // pages will only be accounted to one of the two VMOs. |
| // |
| // If the parent page_attribution_user_id is the same as us, we need to |
| // keep walking up the tree to perform a more accurate count. |
| // |
| // If the parent page_attribution_user_id is our sibling, however, we |
| // can just ignore the overlapping range: pages may or may not exist in |
| // the range --- but either way, they would be accounted to our sibling. |
| // Instead, we need only walk up ranges not visible to our sibling. |
| AssertHeld(cur->lock_); |
| AssertHeld(sib.lock_); |
| AssertHeld(parent->lock_); |
| uint64_t gap_size = gap_end - gap_start; |
| if (parent->page_attribution_user_id_ == cur->page_attribution_user_id_) { |
| // don't need to consider siblings as we own this range, but we do need to |
| // keep looking up the stack to find any actual pages. |
| next_parent_offset = gap_start; |
| next_size = gap_size; |
| return ZX_ERR_STOP; |
| } |
| // For this entire range we know that the offset is visible to the current vmo, and there |
| // are no committed or migrated pages. We need to check though for what portion of this |
| // range we should attribute to the sibling. Any range that we can attribute to the |
| // sibling we can skip, otherwise we have to keep looking up the stack to see if there are |
| // any pages that could be attributed to us. |
| uint64_t sib_offset, sib_len; |
| if (!GetIntersect(gap_start, gap_size, sib.parent_offset_ + sib.parent_start_limit_, |
| sib.parent_limit_ - sib.parent_start_limit_, &sib_offset, &sib_len)) { |
| // No sibling ownership, so need to look at the whole range in the parent to find any |
| // pages. |
| next_parent_offset = gap_start; |
| next_size = gap_size; |
| return ZX_ERR_STOP; |
| } |
| // If the whole range is owned by the sibling, any pages that might be in |
| // it won't be accounted to us anyway. Skip the segment. |
| if (sib_len == gap_size) { |
| DEBUG_ASSERT(sib_offset == gap_start); |
| return ZX_ERR_NEXT; |
| } |
| |
| // Otherwise, inspect the range not visible to our sibling. |
| if (sib_offset == gap_start) { |
| next_parent_offset = sib_offset + sib_len; |
| next_size = gap_end - next_parent_offset; |
| } else { |
| next_parent_offset = gap_start; |
| next_size = sib_offset - gap_start; |
| } |
| return ZX_ERR_STOP; |
| }, |
| parent_offset, parent_offset + parent_size); |
| if (next_size == 0) { |
| // If next_size wasn't set then we don't need to keep looking up the chain as we successfully |
| // looked at the entire range. |
| break; |
| } |
| // Count anything up to the next starting point as being processed. |
| attributed += next_parent_offset - parent_offset; |
| // Size should have been reduced by at least the amount we just attributed |
| DEBUG_ASSERT(next_size <= cur_size && |
| cur_size - next_size >= next_parent_offset - parent_offset); |
| |
| cur = parent; |
| cur_offset = next_parent_offset; |
| cur_size = next_size; |
| } |
| // Exiting the loop means we either ceased finding a relevant parent for the range, or we were |
| // able to process the entire range without needing to look up to a parent, in either case we |
| // can consider the entire range as attributed. |
| // |
| // The cur_size can be larger than the value of parent_size from the last loop iteration. This is |
| // fine as we trivially know that range has zero pages in it, and therefore there are zero pages |
| // whose attribution needs to be determined. |
| attributed += cur_size; |
| |
| *count = attributed_ours; |
| return attributed; |
| } |
| |
| zx_status_t VmCowPages::AddPageLocked(VmPageOrMarker* p, uint64_t offset, |
| CanOverwriteContent overwrite, |
| ktl::optional<vm_page_t*>* released_page, |
| bool do_range_update) { |
| canary_.Assert(); |
| |
| if (p->IsPage()) { |
| LTRACEF("vmo %p, offset %#" PRIx64 ", page %p (%#" PRIxPTR ")\n", this, offset, p->Page(), |
| p->Page()->paddr()); |
| } else { |
| DEBUG_ASSERT(p->IsMarker()); |
| LTRACEF("vmo %p, offset %#" PRIx64 ", marker\n", this, offset); |
| } |
| |
| if (released_page != nullptr) { |
| *released_page = ktl::nullopt; |
| } |
| |
| if (offset >= size_) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| VmPageOrMarker* page = page_list_.LookupOrAllocate(offset); |
| if (!page) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| // We cannot overwrite any kind of content. |
| if (overwrite == CanOverwriteContent::None) { |
| // An anonymous VMO starts off with all its content set to zero, i.e. at no point can it have |
| // absence of content. |
| if (!page_source_) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| // This VMO is backed by a page source, so empty slots represent absence of content. Fail if the |
| // slot is not empty. |
| if (!page->IsEmpty()) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| // This VMO is backed by a page source and the slot is empty. Check if this empty slot |
| // represents zero content. For page sources that preserve content (pager backed VMOs), pages |
| // starting at the supply_zero_offset_ have an implicit initial content of zero. These pages are |
| // not supplied by the user pager, and are instead supplied by the kernel as zero pages. So for |
| // pager backed VMOs, we should not overwrite this zero content. |
| // |
| // TODO(rashaeqbal): Consider replacing supply_zero_offset_ with a single zero range in the page |
| // list itself, so that all content resides in the page list. This might require supporting |
| // custom sized ranges in the page list; we don't want to pay the cost of individual zero page |
| // markers per page or multiple fixed sized zero ranges. |
| if (is_source_preserving_page_content_locked() && offset >= supply_zero_offset_) { |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| } |
| |
| // We're only permitted to overwrite zero content. This has different meanings based on whether |
| // the VMO is anonymous or is backed by a pager. |
| // |
| // * For anonymous VMOs, the initial content for the entire VMO is implicitly all zeroes at the |
| // time of creation. So both zero page markers and empty slots represent zero content. Therefore |
| // the only content type that cannot be overwritten in this case is an actual page. |
| // |
| // * For pager backed VMOs, content is either explicitly supplied by the user pager before |
| // supply_zero_offset_, or implicitly supplied as zeros beyond supply_zero_offset_. So zero |
| // content is represented by either zero page markers before supply_zero_offset_ (supplied by the |
| // user pager), or by gaps after supply_zero_offset_ (supplied by the kernel). Therefore the only |
| // content type that cannot be overwritten in this case as well is an actual page. |
| if (overwrite == CanOverwriteContent::Zero && page->IsPage()) { |
| // If we have a page source, the page source should be able to validate the page. |
| DEBUG_ASSERT(!page_source_ || page_source_->DebugIsPageOk(page->Page(), offset)); |
| return ZX_ERR_ALREADY_EXISTS; |
| } |
| |
| // If the old entry is an actual page, release it. |
| if (page->IsPage()) { |
| // We should be permitted to overwrite any kind of content (zero or non-zero). |
| DEBUG_ASSERT(overwrite == CanOverwriteContent::NonZero); |
| // The caller should have passed in an optional to hold the released page. |
| DEBUG_ASSERT(released_page != nullptr); |
| *released_page = page->ReleasePage(); |
| } |
| |
| // If the new page is an actual page and we have a page source, the page source should be able to |
| // validate the page. |
| DEBUG_ASSERT(!p->IsPage() || !page_source_ || page_source_->DebugIsPageOk(p->Page(), offset)); |
| |
| // If this is actually a real page, we need to place it into the appropriate queue. |
| if (p->IsPage()) { |
| vm_page_t* low_level_page = p->Page(); |
| DEBUG_ASSERT(low_level_page->state() == vm_page_state::OBJECT); |
| DEBUG_ASSERT(low_level_page->object.pin_count == 0); |
| SetNotWiredLocked(low_level_page, offset); |
| } |
| *page = ktl::move(*p); |
| |
| if (do_range_update) { |
| // other mappings may have covered this offset into the vmo, so unmap those ranges |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::AddNewPageLocked(uint64_t offset, vm_page_t* page, |
| CanOverwriteContent overwrite, |
| ktl::optional<vm_page_t*>* released_page, bool zero, |
| bool do_range_update) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| |
| InitializeVmPage(page); |
| if (zero) { |
| ZeroPage(page); |
| } |
| |
| // Pages being added to pager backed VMOs should have a valid dirty_state before being added to |
| // the page list, so that they can be inserted in the correct page queue. New pages start off |
| // clean. |
| if (has_pager_backlinks_locked()) { |
| // Only zero pages can be added as new pages to pager backed VMOs. |
| DEBUG_ASSERT(zero || IsZeroPage(page)); |
| UpdateDirtyStateLocked(page, offset, DirtyState::Clean, /*is_pending_add=*/true); |
| } |
| |
| VmPageOrMarker p = VmPageOrMarker::Page(page); |
| zx_status_t status = AddPageLocked(&p, offset, overwrite, released_page, do_range_update); |
| |
| if (status != ZX_OK) { |
| // Release the page from 'p'; as we are returning failure, 'page' is still owned by the caller. |
| p.ReleasePage(); |
| } |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return status; |
| } |
| |
| zx_status_t VmCowPages::AddNewPagesLocked(uint64_t start_offset, list_node_t* pages, |
| CanOverwriteContent overwrite, |
| list_node_t* released_pages, bool zero, |
| bool do_range_update) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(start_offset)); |
| |
| uint64_t offset = start_offset; |
| while (vm_page_t* p = list_remove_head_type(pages, vm_page_t, queue_node)) { |
| ktl::optional<vm_page_t*> released_page = ktl::nullopt; |
| // Defer the range change update by passing false as we will do it in bulk at the end if needed. |
| zx_status_t status = AddNewPageLocked(offset, p, overwrite, &released_page, zero, false); |
| if (released_page.has_value()) { |
| DEBUG_ASSERT(released_pages != nullptr); |
| vm_page_t* released = released_page.value(); |
| list_add_tail(released_pages, &released->queue_node); |
| } |
| if (status != ZX_OK) { |
| // Put the page back on the list so that someone owns it and it'll get freed. |
| list_add_head(pages, &p->queue_node); |
| // Decommit any pages we already placed. |
| if (offset > start_offset) { |
| DecommitRangeLocked(start_offset, offset - start_offset); |
| } |
| |
| // Free all the pages back as we had ownership of them. |
| FreePages(pages); |
| return status; |
| } |
| offset += PAGE_SIZE; |
| } |
| |
| if (do_range_update) { |
| // other mappings may have covered this offset into the vmo, so unmap those ranges |
| RangeChangeUpdateLocked(start_offset, offset - start_offset, RangeChangeOp::Unmap); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
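| // Returns true if |page|, which must reside in this VMO's page list at |offset|, can be accessed |
| // by at most one of this VMO's two children: either the page has already been forked in one |
| // direction (a cow split bit is set), or |offset| lies outside the parent-visible range of one of |
| // the children. |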
| bool VmCowPages::IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const { |
| DEBUG_ASSERT(page_list_.Lookup(offset)->Page() == page); |
| |
| if (page->object.cow_right_split || page->object.cow_left_split) { |
| return true; |
| } |
| |
| if (offset < left_child_locked().parent_offset_ + left_child_locked().parent_start_limit_ || |
| offset >= left_child_locked().parent_offset_ + left_child_locked().parent_limit_) { |
| return true; |
| } |
| |
| if (offset < right_child_locked().parent_offset_ + right_child_locked().parent_start_limit_ || |
| offset >= right_child_locked().parent_offset_ + right_child_locked().parent_limit_) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
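| // Forks |page|, currently provided by |page_owner| at |owner_offset|, down the chain of hidden |
| // VMOs into this VMO at |offset|. At each level the page is either migrated directly (when only |
| // one side can still access it) or copied, with the cow split bits recording which side has been |
| // covered, so that a partial walk never needs to be rolled back. On success *out_page is the page |
| // now resident in this VMO at |offset|. |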
| zx_status_t VmCowPages::CloneCowPageLocked(uint64_t offset, list_node_t* alloc_list, |
| VmCowPages* page_owner, vm_page_t* page, |
| uint64_t owner_offset, LazyPageRequest* page_request, |
| vm_page_t** out_page) { |
| DEBUG_ASSERT(page != vm_get_zero_page()); |
| DEBUG_ASSERT(parent_); |
| DEBUG_ASSERT(page_request); |
| |
| // To avoid the need for rollback logic on allocation failure, we start the forking |
| // process from the root-most vmo and work our way towards the leaf vmo. This allows |
| // us to maintain the hidden vmo invariants through the whole operation, so that we |
| // can stop at any point. |
| // |
| // To set this up, walk from the leaf to |page_owner|, and keep track of the |
| // path via |stack_.dir_flag|. |
| VmCowPages* cur = this; |
| do { |
| AssertHeld(cur->lock_); |
| VmCowPages* next = cur->parent_.get(); |
| // We can't make COW clones of physical vmos, so this can only happen if we |
| // somehow don't find |page_owner| in the ancestor chain. |
| DEBUG_ASSERT(next); |
| AssertHeld(next->lock_); |
| |
| next->stack_.dir_flag = &next->left_child_locked() == cur ? StackDir::Left : StackDir::Right; |
| if (next->stack_.dir_flag == StackDir::Right) { |
| DEBUG_ASSERT(&next->right_child_locked() == cur); |
| } |
| cur = next; |
| } while (cur != page_owner); |
| uint64_t cur_offset = owner_offset; |
| |
| // |target_page| is the page we're considering for migration. Cache it |
| // across loop iterations. |
| vm_page_t* target_page = page; |
| |
| zx_status_t alloc_status = ZX_OK; |
| |
| // As long as we're simply migrating |page|, there's no need to update any vmo mappings, since |
| // that means the other side of the clone tree has already covered |page| and the current side |
| // of the clone tree will still see |page|. As soon as we insert a new page, we'll need to |
| // update all mappings at or below that level. |
| bool skip_range_update = true; |
| do { |
| // |target_page| is always located in |cur| at |cur_offset| at the start of the loop. |
| VmCowPages* target_page_owner = cur; |
| AssertHeld(target_page_owner->lock_); |
| uint64_t target_page_offset = cur_offset; |
| |
| cur = cur->stack_.dir_flag == StackDir::Left ? &cur->left_child_locked() |
| : &cur->right_child_locked(); |
| DEBUG_ASSERT(cur_offset >= cur->parent_offset_); |
| cur_offset -= cur->parent_offset_; |
| |
| if (target_page_owner->IsUniAccessibleLocked(target_page, target_page_offset)) { |
| // If the page we're covering in the parent is uni-accessible, then we |
| // can directly move the page. |
| |
| // Assert that we're not trying to split the page the same direction two times. Either |
| // some tracking state got corrupted or a page in the subtree we're trying to |
| // migrate to got improperly migrated/freed. If we did this migration, then the |
| // opposite subtree would lose access to this page. |
| DEBUG_ASSERT(!(target_page_owner->stack_.dir_flag == StackDir::Left && |
| target_page->object.cow_left_split)); |
| DEBUG_ASSERT(!(target_page_owner->stack_.dir_flag == StackDir::Right && |
| target_page->object.cow_right_split)); |
| // For now, we won't see a loaned page here. |
| DEBUG_ASSERT(!pmm_is_loaned(target_page)); |
| |
| target_page->object.cow_left_split = 0; |
| target_page->object.cow_right_split = 0; |
| VmPageOrMarker removed = target_page_owner->page_list_.RemovePage(target_page_offset); |
| vm_page* removed_page = removed.ReleasePage(); |
| pmm_page_queues()->Remove(removed_page); |
| DEBUG_ASSERT(removed_page == target_page); |
| } else { |
| // Otherwise we need to fork the page. The page has no writable mappings so we don't need to |
| // remove write or unmap before copying the contents. |
| vm_page_t* cover_page; |
| alloc_status = |
| AllocateCopyPage(pmm_alloc_flags_, page->paddr(), alloc_list, page_request, &cover_page); |
| if (alloc_status != ZX_OK) { |
| break; |
| } |
| |
| // We're going to cover target_page with cover_page, so set appropriate split bit. |
| if (target_page_owner->stack_.dir_flag == StackDir::Left) { |
| target_page->object.cow_left_split = 1; |
| DEBUG_ASSERT(target_page->object.cow_right_split == 0); |
| } else { |
| target_page->object.cow_right_split = 1; |
| DEBUG_ASSERT(target_page->object.cow_left_split == 0); |
| } |
| target_page = cover_page; |
| |
| skip_range_update = false; |
| } |
| |
| // Skip the automatic range update so we can do it ourselves more efficiently. |
| VmPageOrMarker add_page = VmPageOrMarker::Page(target_page); |
| zx_status_t status = |
| cur->AddPageLocked(&add_page, cur_offset, CanOverwriteContent::Zero, nullptr, false); |
| DEBUG_ASSERT_MSG(status == ZX_OK, "AddPageLocked returned %d\n", status); |
| |
| if (!skip_range_update) { |
| if (cur != this) { |
| // In this case, cur is a hidden vmo and has no direct mappings. Also, its |
| // descendants along the page stack will be dealt with by subsequent iterations |
| // of this loop. That means that any mappings that need to be touched now are |
| // owned by the children on the opposite side of stack_.dir_flag. |
| VmCowPages& other = cur->stack_.dir_flag == StackDir::Left ? cur->right_child_locked() |
| : cur->left_child_locked(); |
| AssertHeld(other.lock_); |
| RangeChangeList list; |
| other.RangeChangeUpdateFromParentLocked(cur_offset, PAGE_SIZE, &list); |
| RangeChangeUpdateListLocked(&list, RangeChangeOp::Unmap); |
| } else { |
| // In this case, cur is the last vmo being changed, so update its whole subtree. |
| DEBUG_ASSERT(offset == cur_offset); |
| RangeChangeUpdateLocked(offset, PAGE_SIZE, RangeChangeOp::Unmap); |
| } |
| } |
| } while (cur != this); |
| DEBUG_ASSERT(alloc_status != ZX_OK || cur_offset == offset); |
| |
| if (unlikely(alloc_status != ZX_OK)) { |
| *out_page = nullptr; |
| return alloc_status; |
| } else { |
| *out_page = target_page; |
| return ZX_OK; |
| } |
| } |
| |
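| // Similar to CloneCowPageLocked, except that this VMO ends up with a zero page marker at |offset| |
| // instead of a copy of |page|. The page is first forked as far as our immediate parent (if it is |
| // not already there), after which this VMO can pretend to have forked it by inserting the marker. |
| // If only this VMO's side could still access the page in the parent, the page is removed and |
| // appended to |freed_list|; otherwise the cow split bit for this side is set. |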
| zx_status_t VmCowPages::CloneCowPageAsZeroLocked(uint64_t offset, list_node_t* freed_list, |
| VmCowPages* page_owner, vm_page_t* page, |
| uint64_t owner_offset, |
| LazyPageRequest* page_request) { |
| DEBUG_ASSERT(parent_); |
| |
| // Ensure we have a slot as we'll need it later. |
| const VmPageOrMarker* slot = page_list_.LookupOrAllocate(offset); |
| |
| if (!slot) { |
| return ZX_ERR_NO_MEMORY; |
| } |
| |
| // We cannot be forking a page to here if there's already something. |
| DEBUG_ASSERT(slot->IsEmpty()); |
| |
| DEBUG_ASSERT(!page_source_ || page_source_->DebugIsPageOk(page, offset)); |
| |
| // Need to make sure the page is duplicated as far as our parent. Then we can pretend |
| // that we have forked it into us by setting the marker. |
| AssertHeld(parent_->lock_); |
| if (page_owner != parent_.get()) { |
| // Do not pass our freed_list here as this wants an alloc_list to allocate from. |
| zx_status_t result = parent_->CloneCowPageLocked(offset + parent_offset_, nullptr, page_owner, |
| page, owner_offset, page_request, &page); |
| if (result != ZX_OK) { |
| return result; |
| } |
| } |
| |
| bool left = this == &(parent_->left_child_locked()); |
| // Page is in our parent. Check if it's uni-accessible; if so we can free it. |
| if (parent_->IsUniAccessibleLocked(page, offset + parent_offset_)) { |
| // Make sure we didn't already merge the page in this direction. |
| DEBUG_ASSERT(!(left && page->object.cow_left_split)); |
| DEBUG_ASSERT(!(!left && page->object.cow_right_split)); |
| vm_page* removed = parent_->page_list_.RemovePage(offset + parent_offset_).ReleasePage(); |
| DEBUG_ASSERT(removed == page); |
| pmm_page_queues()->Remove(removed); |
| DEBUG_ASSERT(!list_in_list(&removed->queue_node)); |
| list_add_tail(freed_list, &removed->queue_node); |
| } else { |
| if (left) { |
| page->object.cow_left_split = 1; |
| } else { |
| page->object.cow_right_split = 1; |
| } |
| } |
| // Insert the zero marker. |
| VmPageOrMarker new_marker = VmPageOrMarker::Marker(); |
| // We know that the slot is empty, so we know we won't be overwriting an actual page. |
| // We expect the caller to update any mappings. |
| zx_status_t status = AddPageLocked(&new_marker, offset, CanOverwriteContent::Zero, nullptr, |
| /*do_range_update=*/false); |
| // Absent bugs, AddPageLocked() can only return ZX_ERR_NO_MEMORY, but that failure can only |
| // occur if we had to allocate a slot in the page list. Since we allocated a slot above, we |
| // know that can't be the case. |
| DEBUG_ASSERT(status == ZX_OK); |
| return ZX_OK; |
| } |
| |
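| // Walks up the parent chain starting at |offset| in this VMO looking for the first slot with |
| // content (a page or marker). Returns that slot, or nullptr if the walk runs off the end of a |
| // parent or past a parent limit first. |*owner_out| and |*owner_offset_out| identify the VMO and |
| // offset where the walk stopped, i.e. the owner of the initial content for |offset|. If |
| // |owner_length| is non-null it is clipped, from its incoming value, to the range starting at |
| // |offset| over which that owner's content is visible to this VMO, i.e. not shadowed by pages in |
| // intermediate VMOs or cut off by parent limits. |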
| const VmPageOrMarker* VmCowPages::FindInitialPageContentLocked(uint64_t offset, |
| VmCowPages** owner_out, |
| uint64_t* owner_offset_out, |
| uint64_t* owner_length) { |
| // Search up the clone chain for any committed pages. cur_offset is the offset |
| // into cur we care about. The loop terminates either when that offset contains |
| // a committed page or when that offset can't reach into the parent. |
| const VmPageOrMarker* page = nullptr; |
| VmCowPages* cur = this; |
| AssertHeld(cur->lock_); |
| uint64_t cur_offset = offset; |
| while (cur_offset < cur->parent_limit_) { |
| VmCowPages* parent = cur->parent_.get(); |
| // If there's no parent, then parent_limit_ is 0 and we'll never enter the loop |
| DEBUG_ASSERT(parent); |
| AssertHeld(parent->lock_ref()); |
| |
| uint64_t parent_offset; |
| bool overflowed = add_overflow(cur->parent_offset_, cur_offset, &parent_offset); |
| ASSERT(!overflowed); |
| if (parent_offset >= parent->size_) { |
| // The offset is off the end of the parent, so cur is the VmObjectPaged |
| // which will provide the page. |
| break; |
| } |
| if (owner_length) { |
| // Before we walk up, we need to check whether there are any forked pages that require us to |
| // restrict the owner length. Additionally we need to restrict the owner length to the actual |
| // parent limit. |
| *owner_length = ktl::min(*owner_length, cur->parent_limit_ - cur_offset); |
| cur->page_list_.ForEveryPageInRange( |
| [owner_length, cur_offset](const VmPageOrMarker*, uint64_t off) { |
| *owner_length = off - cur_offset; |
| return ZX_ERR_STOP; |
| }, |
| cur_offset, cur_offset + *owner_length); |
| } |
| |
| cur = parent; |
| cur_offset = parent_offset; |
| const VmPageOrMarker* p = cur->page_list_.Lookup(parent_offset); |
| if (p && !p->IsEmpty()) { |
| page = p; |
| break; |
| } |
| } |
| |
| *owner_out = cur; |
| *owner_offset_out = cur_offset; |
| |
| return page; |
| } |
| |
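| // Transitions |page|, which must be dirty tracked and owned by this VMO at |offset|, to |
| // |dirty_state|, asserting that the transition is valid and moving the page to the matching page |
| // queue (pager backed for Clean, pager backed dirty for Dirty; AwaitingClean pages stay in the |
| // Dirty queue until writeback ends). When |is_pending_add| is true the page is about to be |
| // inserted via Add[New]PageLocked, so the backlink checks are skipped and the page queue update |
| // is deferred to that path. |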
| void VmCowPages::UpdateDirtyStateLocked(vm_page_t* page, uint64_t offset, DirtyState dirty_state, |
| bool is_pending_add) { |
| ASSERT(page); |
| ASSERT(has_pager_backlinks_locked()); |
| ASSERT(is_source_preserving_page_content_locked()); |
| |
| // If the page is not pending being added to the page list, it should have valid object info. |
| DEBUG_ASSERT(is_pending_add || page->object.get_object() == this); |
| DEBUG_ASSERT(is_pending_add || page->object.get_page_offset() == offset); |
| |
| // If the page is Dirty or AwaitingClean, it should not be loaned. |
| DEBUG_ASSERT(!(is_page_dirty(page) || is_page_awaiting_clean(page)) || !pmm_is_loaned(page)); |
| |
| // Perform state-specific checks and actions. We will finally update the state below. |
| switch (dirty_state) { |
| case DirtyState::Clean: |
| // If the page is not in the process of being added, we can only see a transition to Clean |
| // from AwaitingClean. |
| ASSERT(is_pending_add || is_page_awaiting_clean(page)); |
| |
| // If we are expecting a pending Add[New]PageLocked, we can defer updating the page queue. |
| if (!is_pending_add) { |
| // Move to evictable pager backed queue to start tracking age information. |
| pmm_page_queues()->MoveToPagerBacked(page, this, offset); |
| } |
| break; |
| case DirtyState::Dirty: |
| // If the page is not in the process of being added, we can only see a transition to Dirty |
| // from Clean or AwaitingClean. |
| ASSERT(is_pending_add || (is_page_clean(page) || is_page_awaiting_clean(page))); |
| |
| // A loaned page cannot be marked Dirty as loaned pages are reclaimed by eviction; Dirty pages |
| // cannot be evicted. |
| DEBUG_ASSERT(!pmm_is_loaned(page)); |
| |
| // If we are expecting a pending Add[New]PageLocked, we can defer updating the page queue. |
| if (!is_pending_add) { |
| // Move the page to the Dirty queue, which does not track page age. While the page is in the |
| // Dirty queue, age information is not required (yet). It will be required when the page |
| // becomes Clean (and hence evictable) again, at which point it will get moved to the MRU |
| // pager backed queue and will age as normal. |
| // TODO(rashaeqbal): We might want age tracking for the Dirty queue in the future when the |
| // kernel generates writeback pager requests. |
| pmm_page_queues()->MoveToPagerBackedDirty(page, this, offset); |
| } |
| |
| // We might need to trim the AwaitingClean zero range [supply_zero_offset_, |
| // awaiting_clean_zero_range_end_) if the newly dirtied page falls within that range. |
| ConsiderTrimAwaitingCleanZeroRangeLocked(offset); |
| break; |
| case DirtyState::AwaitingClean: |
| // A newly added page cannot start off as AwaitingClean. |
| ASSERT(!is_pending_add); |
| // We can only transition to AwaitingClean from Dirty. |
| ASSERT(is_page_dirty(page)); |
| // A loaned page cannot be marked AwaitingClean as loaned pages are reclaimed by eviction; |
| // AwaitingClean pages cannot be evicted. |
| DEBUG_ASSERT(!pmm_is_loaned(page)); |
| // No page queue update. Leave the page in the Dirty queue for now as it is not clean yet; |
| // it will be moved out on WritebackEnd. |
| DEBUG_ASSERT(pmm_page_queues()->DebugPageIsPagerBackedDirty(page)); |
| break; |
| default: |
| ASSERT(false); |
| } |
| page->object.dirty_state = static_cast<uint8_t>(dirty_state) & VM_PAGE_OBJECT_DIRTY_STATES_MASK; |
| } |
| |
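| // Prepares pages in [offset, offset + len) of this pager-backed VMO for a write by ensuring they |
| // are Dirty. If the page source does not trap dirty transitions, the first run of committed pages |
| // starting at |offset| is simply marked Dirty. Otherwise a DIRTY page request is generated via |
| // |page_request| for the first run of pages that still need to transition. On return |
| // |*dirty_len_out| is the length of the run starting at |offset| that is now Dirty and can be |
| // written immediately; when a DIRTY request had to be generated, no pages are reported dirty and |
| // the (always non-OK) status of the request is returned. |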
| zx_status_t VmCowPages::PrepareForWriteLocked(LazyPageRequest* page_request, uint64_t offset, |
| uint64_t len, uint64_t* dirty_len_out) { |
| DEBUG_ASSERT(page_source_); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| DEBUG_ASSERT(is_source_preserving_page_content_locked()); |
| |
| uint64_t dirty_len = 0; |
| const uint64_t start_offset = offset; |
| const uint64_t end_offset = offset + len; |
| |
| // If the VMO does not require us to trap dirty transitions, simply mark the pages dirty, and move |
| // them to the dirty page queue. Do this only for the first consecutive run of committed pages |
| // within the range starting at offset. Any absent pages will need to be provided by the page |
| // source, which might fail and terminate the lookup early. Any zero page markers might need to be |
| // forked, which can fail too. Only mark those pages dirty that the lookup is guaranteed to return |
| // successfully. |
| if (!page_source_->ShouldTrapDirtyTransitions()) { |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [this, &dirty_len, start_offset](const VmPageOrMarker* p, uint64_t off) { |
| if (p->IsMarker()) { |
| // Found a marker. End the traversal. |
| return ZX_ERR_STOP; |
| } |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| DEBUG_ASSERT(page->object.get_object() == this); |
| DEBUG_ASSERT(page->object.get_page_offset() == off); |
| |
| // End the traversal if we encounter a loaned page. We reclaim loaned pages by evicting |
| // them, and dirty pages cannot be evicted. |
| if (pmm_is_loaned(page)) { |
| // If this is a loaned page, it should be clean. |
| DEBUG_ASSERT(is_page_clean(page)); |
| return ZX_ERR_STOP; |
| } |
| DEBUG_ASSERT(!pmm_is_loaned(page)); |
| |
| // Mark the page dirty. |
| if (!is_page_dirty(page)) { |
| AssertHeld(lock_); |
| UpdateDirtyStateLocked(page, off, DirtyState::Dirty); |
| } |
| // The page was either already dirty, or we just marked it dirty. Proceed to the next one. |
| DEBUG_ASSERT(start_offset + dirty_len == off); |
| dirty_len += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| [](uint64_t start, uint64_t end) { |
| // We found a gap. End the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start_offset, end_offset); |
| // We don't expect a failure from the traversal. |
| DEBUG_ASSERT(status == ZX_OK); |
| |
| *dirty_len_out = dirty_len; |
| return ZX_OK; |
| } |
| |
| // Otherwise, generate a DIRTY page request for pages in the range which need to transition to |
| // Dirty. The eligibility criteria is different depending on which side of supply_zero_offset_ the |
| // page lies. |
| // |
| // - For pages before supply_zero_offset_: |
| // Find a contiguous run of non-Dirty pages (committed pages as well as zero page markers). |
| // For the purpose of generating DIRTY requests, both Clean and AwaitingClean pages are |
| // considered equivalent. This is because pages that are in AwaitingClean will need another |
| // acknowledgment from the user pager before they can be made Dirty (the filesystem might need to |
| // reserve additional space for them etc.). |
| // |
| // - For pages at and after supply_zero_offset_: |
| // - Any gaps are implicit zero pages, i.e. the kernel supplies zero pages when they are |
| // accessed. Since these pages are not supplied by the user pager via zx_pager_supply_pages, we |
| // will need to wait on a DIRTY request before the gap can be replaced by an actual page for |
| // writing (the filesystem might need to reserve additional space). |
| // - There can exist actual pages beyond supply_zero_offset_ from previous writes, but these |
| // will either be Dirty or AwaitingClean, since we cannot mark a page Clean beyond |
| // supply_zero_offset_ without also advancing supply_zero_offset_ after the Clean page. This is |
| // because the range after supply_zero_offset_ is supplied by the kernel, not the user pager, |
| // so if we were to Clean a page beyond supply_zero_offset_, it might get evicted, and then |
| // incorrectly supplied by the kernel as a zero page. It is possible for pages to be in |
| // AwaitingClean if the user pager is attempting to write them back, in which case a future |
| // write to the page is treated the same as before supply_zero_offset_. It must be trapped so |
| // that the filesystem can acknowledge it again (it might need to reserve additional space |
| // again). |
| uint64_t pages_to_dirty_len = 0; |
| |
| // Helper lambda used in the page list traversal below. Try to add page at |dirty_page_offset| to |
| // the run of dirty pages being tracked. Return codes are the same as those used by |
| // VmPageList::ForEveryPageAndGapInRange to continue or terminate traversal. |
| auto accumulate_dirty_page = [&pages_to_dirty_len, &dirty_len, |
| start_offset](uint64_t dirty_page_offset) -> zx_status_t { |
| // Bail if we were tracking a non-zero run of pages to be dirtied as we cannot extend |
| // pages_to_dirty_len anymore. |
| if (pages_to_dirty_len > 0) { |
| return ZX_ERR_STOP; |
| } |
| // Append the page to the dirty range being tracked if it immediately follows it. |
| if (start_offset + dirty_len == dirty_page_offset) { |
| dirty_len += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| } |
| // Otherwise we cannot accumulate any more contiguous dirty pages. |
| return ZX_ERR_STOP; |
| }; |
| |
| // Helper lambda used in the page list traversal below. Try to add pages in the range |
| // [to_dirty_start, to_dirty_end) to the run of to-be-dirtied pages being tracked. Return codes |
| // are the same as those used by VmPageList::ForEveryPageAndGapInRange to continue or terminate |
| // traversal. |
| auto accumulate_pages_to_dirty = [&pages_to_dirty_len, &dirty_len, start_offset]( |
| uint64_t to_dirty_start, |
| uint64_t to_dirty_end) -> zx_status_t { |
| // Bail if we were already accumulating a non-zero run of Dirty pages. |
| if (dirty_len > 0) { |
| return ZX_ERR_STOP; |
| } |
| // Append the pages to the range being tracked if they immediately follow it. |
| if (start_offset + pages_to_dirty_len == to_dirty_start) { |
| pages_to_dirty_len += (to_dirty_end - to_dirty_start); |
| return ZX_ERR_NEXT; |
| } |
| // Otherwise we cannot accumulate any more contiguous to-dirty pages. |
| return ZX_ERR_STOP; |
| }; |
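| |
| // Illustrative example of how the two helpers above interact: if the pages starting at |
| // |start_offset| are in states [Dirty, Dirty, Clean], the first two extend |dirty_len| to two |
| // pages and the Clean page then stops the traversal (|dirty_len| is already non-zero), so no |
| // DIRTY request is needed for the run. Conversely for [Clean, Dirty, ...] the Clean page starts |
| // |pages_to_dirty_len| and the Dirty page stops the traversal, so a DIRTY request is generated |
| // for just the first page. At most one of the two lengths ends up non-zero, matching the |
| // assertion further below. |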
| |
| // First consider the portion of the range that ends before supply_zero_offset_. |
| // We don't have a range to consider here if offset was greater than supply_zero_offset_. |
| if (start_offset < supply_zero_offset_) { |
| const uint64_t end = ktl::min(supply_zero_offset_, end_offset); |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&accumulate_dirty_page, &accumulate_pages_to_dirty](const VmPageOrMarker* p, |
| uint64_t off) { |
| if (p->IsPage()) { |
| vm_page_t* page = p->Page(); |
| DEBUG_ASSERT(is_page_dirty_tracked(page)); |
| // VMOs that trap dirty transitions should not have loaned pages. |
| DEBUG_ASSERT(!pmm_is_loaned(page)); |
| // Page is already dirty. Try to add it to the dirty run. |
| if (is_page_dirty(page)) { |
| return accumulate_dirty_page(off); |
| } |
| } |
| // This is either a zero page marker (which represents a clean zero page) or a committed |
| // page which is not already Dirty. Try to add it to the range of pages to be dirtied. |
| return accumulate_pages_to_dirty(off, off + PAGE_SIZE); |
| }, |
| [](uint64_t start, uint64_t end) { |
| // We found a gap. End the traversal. |
| return ZX_ERR_STOP; |
| }, |
| start_offset, end); |
| |
| // We don't expect an error from the traversal above. If an incompatible contiguous page or |
| // a gap is encountered, we will simply terminate early. |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
| // Now consider the portion of the range that starts at/after supply_zero_offset_, and see if we |
| // can extend an already existing to-dirty range, or start a new one. [offset, offset + len) might |
| // have fallen entirely before supply_zero_offset_, in which case we have no remaining portion to |
| // consider here. |
| if (supply_zero_offset_ < end_offset) { |
| const uint64_t start = ktl::max(start_offset, supply_zero_offset_); |
| zx_status_t status = page_list_.ForEveryPageAndGapInRange( |
| [&accumulate_dirty_page, &accumulate_pages_to_dirty](const VmPageOrMarker* p, |
| uint64_t off) { |
| // We can only find un-Clean committed pages beyond supply_zero_offset_. There can be no |
| // markers as well as they represent Clean zero pages. |
| ASSERT(p->IsPage()); |
| vm_page_t* page = p->Page(); |
| ASSERT(is_page_dirty_tracked(page)); |
| ASSERT(!is_page_clean(page)); |
| DEBUG_ASSERT(!pmm_is_loaned(page)); |
| |
| // Page is already dirty. Try to add it to the dirty run. |
| if (is_page_dirty(page)) { |
| return accumulate_dirty_page(off); |
| } |
| |
| // This page was not Dirty, the only other state a page beyond supply_zero_offset_ could |
| // be in is AwaitingClean. |
| ASSERT(is_page_awaiting_clean(page)); |
| // Try to add this page to the range of pages to be dirtied. |
| return accumulate_pages_to_dirty(off, off + PAGE_SIZE); |
| }, |
| [&accumulate_pages_to_dirty](uint64_t start, uint64_t end) { |
| // We need to request a Dirty transition for the gap. Try to add it to the range of pages |
| // to be dirtied. |
| return accumulate_pages_to_dirty(start, end); |
| }, |
| start, end_offset); |
| |
| // We don't expect an error from the traversal above. If an already dirty page or a |
| // non-contiguous page/gap is encountered, we will simply terminate early. |
| DEBUG_ASSERT(status == ZX_OK); |
| } |
| |
| // We should either have found dirty pages or pages that need to be dirtied, but not both. |
| DEBUG_ASSERT(dirty_len == 0 || pages_to_dirty_len == 0); |
| // Check that dirty_len and pages_to_dirty_len both specify valid ranges. |
| DEBUG_ASSERT(start_offset + dirty_len <= end_offset); |
| DEBUG_ASSERT(pages_to_dirty_len == 0 || start_offset + pages_to_dirty_len <= end_offset); |
| |
| *dirty_len_out = dirty_len; |
| |
| // No pages need to transition to Dirty. |
| if (pages_to_dirty_len == 0) { |
| return ZX_OK; |
| } |
| |
| // Found a contiguous run of pages that need to transition to Dirty. There might be more such |
| // pages later in the range, but we will come into this call again for them via another |
| // LookupPagesLocked after the waiting caller is unblocked for this range. |
| AssertHeld(paged_ref_->lock_ref()); |
| VmoDebugInfo vmo_debug_info = {.vmo_ptr = reinterpret_cast<uintptr_t>(paged_ref_), |
| .vmo_id = paged_ref_->user_id_locked()}; |
| zx_status_t status = page_source_->RequestDirtyTransition(page_request->get(), start_offset, |
| pages_to_dirty_len, vmo_debug_info); |
| // The page source will never succeed synchronously. |
| DEBUG_ASSERT(status != ZX_OK); |
| return status; |
| } |
| |
| void VmCowPages::UpdateOnAccessLocked(vm_page_t* page, uint pf_flags) { |
| // We only care about updating on access if we can evict pages. We can skip if eviction isn't |
| // possible. |
| if (!can_evict_locked()) { |
| return; |
| } |
| |
| // Don't make the page accessed for hardware faults. These accesses, if any actually end up |
| // happening, will be detected by the accessed bits in the page tables. |
| // For non hardware faults, the kernel might use the page directly through the physmap, which will |
| // not cause accessed information to be updated and so we consider it accessed at this point. |
| if (pf_flags & VMM_PF_FLAG_HW_FAULT) { |
| return; |
| } |
| |
| pmm_page_queues()->MarkAccessed(page); |
| } |
| |
| // Looks up the page at the requested offset, faulting it in if requested and necessary. If |
| // this VMO has a parent and the requested page isn't found, the parent will be searched. |
| // |
| // Both VMM_PF_FLAG_HW_FAULT and VMM_PF_FLAG_SW_FAULT are treated identically with respect to the |
| // values that get returned; they only differ in which internal metadata gets updated. For a SW or |
| // HW fault, unless there is some other error condition, a page of some kind will always be |
| // returned, performing allocations as required. |
| // The rules for non faults are: |
| // * A reference to the zero page will never be returned, whether due to reading from an |
| // uncommitted offset or from a marker. Uncommitted offsets and markers will always result in |
| // ZX_ERR_NOT_FOUND. |
| // * Writes to real committed pages (i.e. non markers) in parent VMOs will cause a copy-on-write |
| // fork to be allocated into this VMO and returned. |
| // This means that |
| // * Reads or writes to committed real (non marker) pages in this VMO will always succeed. |
| // * Reads to committed real (non marker) pages in parents will succeed |
| // * Writes to real pages in parents will trigger a COW fork and succeed |
| // * All other cases, that is reads or writes to markers in this VMO or the parent and uncommitted |
| // offsets, will not trigger COW forks or allocations and will fail. |
| // |
| // |alloc_list|, if not NULL, is a list of allocated but unused vm_page_t that |
| // this function may allocate from. This function will need at most one entry, |
| // and will not fail if |alloc_list| is a non-empty list, faulting in was requested, |
| // and offset is in range. |
| zx_status_t VmCowPages::LookupPagesLocked(uint64_t offset, uint pf_flags, |
| DirtyTrackingAction mark_dirty, uint64_t max_out_pages, |
| list_node* alloc_list, LazyPageRequest* page_request, |
| LookupInfo* out) { |
| VM_KTRACE_DURATION(2, "VmCowPages::LookupPagesLocked", page_attribution_user_id_, offset); |
| canary_.Assert(); |
| DEBUG_ASSERT(!is_hidden_locked()); |
| DEBUG_ASSERT(out); |
| DEBUG_ASSERT(max_out_pages > 0); |
| DEBUG_ASSERT(page_request || !(pf_flags & VMM_PF_FLAG_FAULT_MASK)); |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| |
| if (offset >= size_) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // This vmo was discarded and has not been locked yet after the discard. Do not return any pages. |
| if (discardable_state_ == DiscardableState::kDiscarded) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| |
| offset = ROUNDDOWN(offset, PAGE_SIZE); |
| |
| // Trim the number of output pages to the size of this VMO. This ensures any range calculation |
| // can never overflow. |
| max_out_pages = ktl::min(static_cast<uint64_t>(max_out_pages), ((size_ - offset) / PAGE_SIZE)); |
| |
| if (is_slice_locked()) { |
| uint64_t parent_offset = 0; |
| VmCowPages* parent = PagedParentOfSliceLocked(&parent_offset); |
| AssertHeld(parent->lock_); |
| return parent->LookupPagesLocked(offset + parent_offset, pf_flags, mark_dirty, max_out_pages, |
| alloc_list, page_request, out); |
| } |
| |
| // Ensure we're adding pages to an empty list so we don't risk overflowing it. |
| out->num_pages = 0; |
| |
| // Helper to find contiguous runs of pages in a page list and add them to the output pages. |
| auto collect_pages = [out, pf_flags](VmCowPages* cow, uint64_t offset, uint64_t max_len) { |
| DEBUG_ASSERT(max_len > 0); |
| |
| AssertHeld(cow->lock_); |
| cow->page_list_.ForEveryPageAndGapInRange( |
| [out, cow, pf_flags](const VmPageOrMarker* page, uint64_t off) { |
| if (page->IsMarker()) { |
| // Never pre-map in zero pages. |
| return ZX_ERR_STOP; |
| } |
| vm_page_t* p = page->Page(); |
| AssertHeld(cow->lock_); |
| cow->UpdateOnAccessLocked(p, pf_flags); |
| out->add_page(p->paddr()); |
| return ZX_ERR_NEXT; |
| }, |
| [](uint64_t start, uint64_t end) { |
| // This is a gap, and we never want to pre-map in zero pages. |
| return ZX_ERR_STOP; |
| }, |
| offset, CheckedAdd(offset, max_len)); |
| }; |
| |
| // We perform an exact Lookup and not something more fancy as a trade off between three scenarios |
| // * Page is in this page list and max_out_pages == 1 |
| // * Page is not in this page list |
| // * Page is in this page list and max_out_pages > 1 |
| // In the first two cases an exact Lookup is the most optimal choice, and in the third scenario |
| // although we have to re-walk the page_list_ 'needlessly', the cost should be somewhat amortized |
| // by the fact that we return multiple pages. |
| const VmPageOrMarker* page_or_mark = page_list_.Lookup(offset); |
| if (page_or_mark && page_or_mark->IsPage()) { |
| // This is the common case where we have the page and don't need to do anything more, so |
| // return it straight away, collecting any additional pages if possible. |
| vm_page_t* p = page_or_mark->Page(); |
| |
| // If we're writing to a root VMO backed by a user pager, i.e. a VMO whose page source preserves |
| // page contents, we might need to mark pages Dirty so that they can be written back later. This |
| // is the only path that can result in a write to such a page; if the page was not present, we |
| // would have already blocked on a read request the first time, and ended up here when |
| // unblocked, at which point the page would be present. |
| uint64_t dirty_len = 0; |
| if ((pf_flags & VMM_PF_FLAG_WRITE) && is_source_preserving_page_content_locked() && |
| mark_dirty == DirtyTrackingAction::DirtyAllPagesOnWrite) { |
| // If this page was loaned, it should be replaced with a non-loaned page, so that we can make |
| // progress with marking pages dirty. PrepareForWriteLocked terminates its page walk when it |
| // encounters a loaned page; loaned pages are reclaimed by evicting them and we cannot evict |
| // dirty pages. |
| if (pmm_is_loaned(p)) { |
| DEBUG_ASSERT(is_page_clean(p)); |
| DEBUG_ASSERT(page_request); |
| zx_status_t status = ReplacePageLocked(p, offset, /*with_loaned=*/false, &p, page_request); |
| if (status != ZX_OK) { |
| return status; |
| } |
| } |
| DEBUG_ASSERT(!pmm_is_loaned(p)); |
| |
| // Pass in max_out_pages for the requested length. If the VMO traps dirty transitions, this |
| // will allow extending the DIRTY request to also include other consecutive markers / |
| // non-dirty pages in the entire lookup range. This is an optimization to reduce the number of |
| // DIRTY page requests generated overall. |
| zx_status_t status = |
| PrepareForWriteLocked(page_request, offset, max_out_pages * PAGE_SIZE, &dirty_len); |
| if (status != ZX_OK) { |
| // We were not able to dirty any pages. |
| DEBUG_ASSERT(dirty_len == 0); |
| // No pages to return. |
| out->num_pages = 0; |
| return status; |
| } |
| |
| // PrepareForWriteLocked was successful, so we should have some dirty pages, and they should |
| // be within the requested range. |
| DEBUG_ASSERT(dirty_len >= PAGE_SIZE); |
| DEBUG_ASSERT(dirty_len <= max_out_pages * PAGE_SIZE); |
| // PrepareForWriteLocked returned successfully, so we know that pages in the range [offset, |
| // offset + dirty_len) have been dirtied. We need to clip the maximum range collect_pages |
| // iterates over below to dirty_len, so that only pages that have been prepared for the write |
| // (by marking dirty) are returned. |
| max_out_pages = dirty_len / PAGE_SIZE; |
| } |
| |
| // This is writable if either of these conditions is true: |
| // 1) This is a write fault. |
| // 2) This is a read fault and we do not need to do dirty tracking, i.e. it is fine to retain |
| // the write permission on mappings since we don't need to generate a permission fault. We only |
| // need to dirty track pages owned by a root user-pager-backed VMO, i.e. a VMO with a page |
| // source that preserves page contents. |
| out->writable = pf_flags & VMM_PF_FLAG_WRITE || !is_source_preserving_page_content_locked(); |
| |
| UpdateOnAccessLocked(p, pf_flags); |
| out->add_page(p->paddr()); |
| if (max_out_pages > 1) { |
| collect_pages(this, offset + PAGE_SIZE, (max_out_pages - 1) * PAGE_SIZE); |
| } |
| |
| // If dirtiness was applicable i.e. we reached here after calling PrepareForWriteLocked, we |
| // should have dirtied exactly the same number of pages that is being returned. |
| DEBUG_ASSERT_MSG(dirty_len == 0 || dirty_len == out->num_pages * PAGE_SIZE, |
| "dirty pages %zu, looked up pages %zu\n", dirty_len / PAGE_SIZE, |
| out->num_pages); |
| |
| return ZX_OK; |
| } |
| |
| // The only time we will say something is writable when the fault is a read is if the page is |
| // already in this VMO. That scenario is the above if block, and so if we get here then writable |
| // mirrors the fault flag. |
| const bool writing = (pf_flags & VMM_PF_FLAG_WRITE) != 0; |
| out->writable = writing; |
| |
| // If we are reading we track the visible length of pages in the owner. We don't bother tracking |
| // this for writing, since when writing we will fork the page into ourselves anyway. |
| uint64_t visible_length = writing ? PAGE_SIZE : PAGE_SIZE * max_out_pages; |
| // Get content from parent if available, otherwise accept we are the owner of the yet to exist |
| // page. |
| VmCowPages* page_owner = nullptr; |
| uint64_t owner_offset = 0; |
| if ((!page_or_mark || page_or_mark->IsEmpty()) && parent_) { |
| // Pass nullptr if visible_length is PAGE_SIZE to allow the lookup to short-circuit the length |
| // calculation, as the calculation involves additional page lookups at every level. |
| page_or_mark = FindInitialPageContentLocked( |
| offset, &page_owner, &owner_offset, visible_length > PAGE_SIZE ? &visible_length : nullptr); |
| } else { |
| page_owner = this; |
| owner_offset = offset; |
| } |
| |
| // At this point we might not have an actual page, but we should at least have a notional owner. |
| DEBUG_ASSERT(page_owner); |
| |
| __UNUSED char pf_string[5]; |
| LTRACEF("vmo %p, offset %#" PRIx64 ", pf_flags %#x (%s)\n", this, offset, pf_flags, |
| vmm_pf_flags_to_string(pf_flags, pf_string)); |
| |
| // We need to turn this potential page or marker into a real vm_page_t. This means failing cases |
| // that we cannot handle, determining whether we can substitute the zero_page and potentially |
| // consulting a page_source. |
| vm_page_t* p = nullptr; |
| if (page_or_mark && page_or_mark->IsPage()) { |
| p = page_or_mark->Page(); |
| } else { |
| // If we don't have a real page and we're not sw or hw faulting in the page, return not found. |
| if ((pf_flags & VMM_PF_FLAG_FAULT_MASK) == 0) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| |
| // We need to get a real page as our initial content. At this point we are either starting from |
| // the zero page, or something supplied from a page source. The page source only fills in if we |
| // have a true absence of content. |
| // |
| // We treat a page source that always supplies zeroes (does not preserve page content) as an |
| // absence of content (given the lack of a page), but we can only use the zero page if we're not |
| // writing, since we can't (or in case of not providing specific physical pages, shouldn't) let |
| // an arbitrary physical page get added below - we need to only add the specific physical pages |
| // supplied by the source. |
| // |
| // In the case of a (hypothetical) page source that's both always providing zeroes and not |
| // supplying specific physical pages, we intentionally ask the page source to supply the pages |
| // here since otherwise there's no point in having such a page source. We have no such page |
| // sources currently. |
| // |
| // Contiguous VMOs don't use markers and always have a page source, so the first two conditions |
| // won't be true for a contiguous VMO. |
| AssertHeld(page_owner->lock_); |
| if ((page_or_mark && page_or_mark->IsMarker()) || !page_owner->page_source_ || |
| (!writing && !page_owner->is_source_preserving_page_content_locked())) { |
| // We can use the zero page, since we have a marker, or no page source, or we're not adding |
| // a page to the VmCowPages (due to !writing) and the page source always provides zeroes so |
| // reading zeroes is consistent with what the page source would provide. |
| p = vm_get_zero_page(); |
| } else { |
| // We will attempt to get the page from the page source. |
| |
| AssertHeld(page_owner->lock_); |
| // Before requesting the page source, check if we can implicitly supply a zero page. Pages in |
| // the range [supply_zero_offset_, size_) can be supplied with zeros. |
| if (owner_offset >= page_owner->supply_zero_offset_) { |
| // The supply_zero_offset_ is only relevant for page sources preserving page content. For |
| // other types of VMOs, the supply_zero_offset_ will be set to UINT64_MAX, so we can never |
| // end up here. |
| DEBUG_ASSERT(page_owner->is_source_preserving_page_content_locked()); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_owner->supply_zero_offset_)); |
| DEBUG_ASSERT(page_owner->supply_zero_offset_ <= page_owner->size_); |
| |
| // Set p to the zero page and fall through. We will correctly fork the zero page if we're |
| // writing to it. |
| p = vm_get_zero_page(); |
| } else { |
| // Otherwise request the page from the page source. |
| uint64_t user_id = 0; |
| if (page_owner->paged_ref_) { |
| AssertHeld(page_owner->paged_ref_->lock_ref()); |
| user_id = page_owner->paged_ref_->user_id_locked(); |
| } |
| VmoDebugInfo vmo_debug_info = { |
| .vmo_ptr = reinterpret_cast<uintptr_t>(page_owner->paged_ref_), .vmo_id = user_id}; |
| zx_status_t status = page_owner->page_source_->GetPage(owner_offset, page_request->get(), |
| vmo_debug_info, &p, nullptr); |
| // Pager page sources will never synchronously return a page. |
| DEBUG_ASSERT(status != ZX_OK); |
| |
| return status; |
| } |
| } |
| } |
| |
| // If we made it this far we must have some valid vm_page in |p|. Although this may be the zero |
| // page, the rest of this function is tolerant towards correctly forking it. |
| DEBUG_ASSERT(p); |
| // It's possible that we are going to fork the page, and the user isn't actually going to directly |
| // use `p`, but creating the fork still uses `p` so we want to consider it accessed. |
| AssertHeld(page_owner->lock_); |
| page_owner->UpdateOnAccessLocked(p, pf_flags); |
| |
| if (!writing) { |
| // If we're read-only faulting, return the page so they can map or read from it directly, |
| // grabbing any additional pages if visible. |
| out->add_page(p->paddr()); |
| if (visible_length > PAGE_SIZE) { |
| collect_pages(page_owner, owner_offset + PAGE_SIZE, visible_length - PAGE_SIZE); |
| } |
| LTRACEF("read only faulting in page %p, pa %#" PRIxPTR " from parent\n", p, p->paddr()); |
| return ZX_OK; |
| } |
| |
| // From here we must allocate additional pages, which we may only do if acting on a software or |
| // hardware fault. |
| if ((pf_flags & VMM_PF_FLAG_FAULT_MASK) == 0) { |
| return ZX_ERR_NOT_FOUND; |
| } |
| |
| vm_page_t* res_page; |
| if (!page_owner->is_hidden_locked() || p == vm_get_zero_page()) { |
| // If the page source is preserving content (is a PagerProxy), and is configured to trap dirty |
| // transitions, we first need to generate a DIRTY request *before* the zero page can be forked |
| // and marked dirty. If dirty transitions are not trapped, we will fall through to allocate the |
| // page and then mark it dirty below. |
| // |
| // Note that the check for ShouldTrapDirtyTransitions() is an optimization here. |
| // PrepareForWriteLocked() would do the right thing depending on ShouldTrapDirtyTransitions(), |
| // however we choose to avoid the extra work only to have it be a no-op if dirty transitions |
| // should not be trapped. |
| if (is_source_preserving_page_content_locked() && page_source_->ShouldTrapDirtyTransitions()) { |
| // The only page we can be forking here is the zero page. A non-slice child VMO does not |
| // support dirty page tracking. |
| DEBUG_ASSERT(p == vm_get_zero_page()); |
| // This object directly owns the page. |
| DEBUG_ASSERT(page_owner == this); |
| |
| // When generating the DIRTY request, try to extend the range beyond the immediate page, to |
| // include other non-dirty pages and markers within the requested range. This is an |
| // optimization aimed at reducing the number of distinct calls to LookupPagesLocked, and hence |
| // the number of distinct DIRTY page requests generated for consecutive pages that need DIRTY |
| // requests. |
| uint64_t dirty_len = 0; |
| zx_status_t status = |
| PrepareForWriteLocked(page_request, offset, max_out_pages * PAGE_SIZE, &dirty_len); |
| // The page source will never succeed synchronously. |
| DEBUG_ASSERT(status != ZX_OK); |
| // No pages will have been dirtied. The range starts with a marker, so we won't be able to |
| // accumulate any committed dirty pages. |
| DEBUG_ASSERT(dirty_len == 0); |
| // No pages to return yet. |
| out->num_pages = 0; |
| return status; |
| } |
| |
| // The general pmm_alloc_flags_ are not allowed to contain the BORROW option, and this is relied |
| // upon below to assume the page allocated cannot be loaned. |
| DEBUG_ASSERT(!(pmm_alloc_flags_ & PMM_ALLOC_FLAG_CAN_BORROW)); |
| |
| // If the vmo isn't hidden, we can't move the page. If the page is the zero |
| // page, there's no need to try to move the page. In either case, we need to |
| // allocate a writable page for this vmo. |
| DEBUG_ASSERT(page_request); |
| zx_status_t alloc_status = |
| AllocateCopyPage(pmm_alloc_flags_, p->paddr(), alloc_list, page_request, &res_page); |
| if (unlikely(alloc_status != ZX_OK)) { |
| return alloc_status; |
| } |
| VmPageOrMarker insert = VmPageOrMarker::Page(res_page); |
| |
| // We could be allocating a page to replace a zero page marker in a pager-backed VMO. We're |
| // going to write to the page, so mark it Dirty. AddPageLocked below will then insert the page |
| // into the appropriate page queue. |
| if (is_source_preserving_page_content_locked()) { |
| // The only page we can be forking here is the zero page. A non-slice child VMO does not |
| // support dirty page tracking. |
| DEBUG_ASSERT(p == vm_get_zero_page()); |
| // This object directly owns the page. |
| DEBUG_ASSERT(page_owner == this); |
| |
| // The forked page was just allocated, and so cannot be a loaned page. |
| DEBUG_ASSERT(!pmm_is_loaned(res_page)); |
| |
| // Mark the forked page dirty. |
| UpdateDirtyStateLocked(res_page, offset, DirtyState::Dirty, /*is_pending_add=*/true); |
| } |
| |
| zx_status_t status = AddPageLocked(&insert, offset, CanOverwriteContent::Zero, nullptr); |
| if (status != ZX_OK) { |
| // AddPageLocked failing for any other reason is a programming error. |
| DEBUG_ASSERT_MSG(status == ZX_ERR_NO_MEMORY, "status=%d\n", status); |
| FreePage(insert.ReleasePage()); |
| return status; |
| } |
| // Interpret a software fault as an explicit desire to have potential zero pages and don't |
| // consider them for cleaning; this is an optimization. |
| // |
| // We explicitly must *not* place pages from a page_source_ that's using pager queues into the |
| // zero scanning queue, as the pager queues are already using the backlink. |
| // |
| // We don't need to scan for zeroes if on finding zeroes we wouldn't be able to remove the page |
| // anyway. |
| if (p == vm_get_zero_page() && !has_pager_backlinks_locked() && |
| can_decommit_zero_pages_locked() && !(pf_flags & VMM_PF_FLAG_SW_FAULT)) { |
| pmm_page_queues()->MoveToUnswappableZeroFork(res_page, this, offset); |
| } |
| |
| // This is the only path where we can allocate a new page without being a clone (clones are |
| // always cached). So we check here if we are not fully cached and if so perform a |
| // clean/invalidate to flush our zeroes. After doing this we will not touch the page via the |
| // physmap and so we can pretend there isn't an aliased mapping. |
| // There are three potential states that may exist |
| // * VMO is cached, paged_ref_ might be null, we might have children -> no cache op needed |
| // * VMO is uncached, paged_ref_ is not null, we have no children -> cache op needed |
| // * VMO is uncached, paged_ref_ is null, we have no children -> cache op not needed / |
| // state cannot happen |
| // In the uncached case we know we have no children, since it is by definition not valid to |
| // have copy-on-write children of uncached pages. The third case cannot happen, but even if it |
| // could with no children and no paged_ref_ the pages cannot actually be referenced so any |
| // cache operation is pointless. |
| if (paged_ref_) { |
| AssertHeld(paged_ref_->lock_ref()); |
| if (paged_ref_->GetMappingCachePolicyLocked() != ARCH_MMU_FLAG_CACHED) { |
| arch_clean_invalidate_cache_range((vaddr_t)paddr_to_physmap(res_page->paddr()), PAGE_SIZE); |
| } |
| } |
| } else { |
| // We need a writable page; let ::CloneCowPageLocked handle inserting one. |
| zx_status_t result = CloneCowPageLocked(offset, alloc_list, page_owner, p, owner_offset, |
| page_request, &res_page); |
| if (result != ZX_OK) { |
| return result; |
| } |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| } |
| |
| LTRACEF("faulted in page %p, pa %#" PRIxPTR "\n", res_page, res_page->paddr()); |
| |
| out->add_page(res_page->paddr()); |
| |
| // If we made it here, we committed a new page in this VMO. |
| IncrementHierarchyGenerationCountLocked(); |
| |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::CommitRangeLocked(uint64_t offset, uint64_t len, uint64_t* committed_len, |
| LazyPageRequest* page_request) { |
| canary_.Assert(); |
| LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| |
| if (is_slice_locked()) { |
| uint64_t parent_offset; |
| VmCowPages* parent = PagedParentOfSliceLocked(&parent_offset); |
| AssertHeld(parent->lock_); |
| |
| // PagedParentOfSliceLocked will walk all of the way up the VMO hierarchy |
| // until it hits a non-slice VMO. This guarantees that we should only ever |
| // recurse once instead of an unbounded number of times. DEBUG_ASSERT this so |
| // that we don't actually end up with unbounded recursion just in case the |
| // property changes. |
| DEBUG_ASSERT(!parent->is_slice_locked()); |
| |
| return parent->CommitRangeLocked(offset + parent_offset, len, committed_len, page_request); |
| } |
| |
| fbl::RefPtr<PageSource> root_source = GetRootPageSourceLocked(); |
| |
| // If this vmo has a direct page source, then the source will provide the backing memory. For |
| // children that eventually depend on a page source, we skip preallocating memory to avoid |
| // potentially overallocating pages if something else touches the vmo while we're blocked on the |
| // request. Otherwise we optimize things by preallocating all the pages. |
| list_node page_list; |
| list_initialize(&page_list); |
| if (root_source == nullptr) { |
| // make a pass through the list to find out how many pages we need to allocate |
| size_t count = len / PAGE_SIZE; |
| page_list_.ForEveryPageInRange( |
| [&count](const auto* p, auto off) { |
| if (p->IsPage()) { |
| count--; |
| } |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| |
| if (count == 0) { |
| *committed_len = len; |
| return ZX_OK; |
| } |
| |
| zx_status_t status = pmm_alloc_pages(count, pmm_alloc_flags_, &page_list); |
| // Ignore ZX_ERR_SHOULD_WAIT since the loop below will fall back to a page by page allocation, |
| // allowing us to wait for single pages should we need to. |
| if (status != ZX_OK && status != ZX_ERR_SHOULD_WAIT) { |
| return status; |
| } |
| } |
| |
| auto list_cleanup = fit::defer([this, &page_list]() { |
| if (!list_is_empty(&page_list)) { |
| FreePages(&page_list); |
| } |
| }); |
| |
| const uint64_t start_offset = offset; |
| const uint64_t end = offset + len; |
| bool have_page_request = false; |
| LookupInfo lookup_info; |
| while (offset < end) { |
| // Don't commit if we already have this page |
| const VmPageOrMarker* p = page_list_.Lookup(offset); |
| if (!p || !p->IsPage()) { |
| // Check if our parent has the page |
| const uint flags = VMM_PF_FLAG_SW_FAULT | VMM_PF_FLAG_WRITE; |
| // A commit does not imply that pages are being dirtied, they are just being populated. |
| zx_status_t res = LookupPagesLocked(offset, flags, DirtyTrackingAction::None, 1, &page_list, |
| page_request, &lookup_info); |
| if (unlikely(res == ZX_ERR_SHOULD_WAIT)) { |
| if (page_request->get()->BatchAccepting()) { |
| // In batch mode, will need to finalize the request later. |
| if (!have_page_request) { |
| // Stash how much we have committed right now, as we are going to have to reprocess this |
| // range so we do not want to claim it was committed. |
| *committed_len = offset - start_offset; |
| have_page_request = true; |
| } |
| } else { |
| // We can end up here in two cases: |
| // 1. We were in batch mode but had to terminate the batch early. |
| // 2. We hit the first missing page and we were not in batch mode. |
| // |
| // If we do have a page request, that means the batch was terminated early by |
| // pre-populated pages (case 1). Return immediately. |
| // |
| // Do not update the |committed_len| for case 1 as we are returning on encountering |
| // pre-populated pages while processing a batch. When that happens, we will terminate the |
| // batch we were processing and send out a page request for the contiguous range we've |
| // accumulated in the batch so far. And we will need to come back into this function again |
| // to reprocess the range the page request spanned, so we cannot claim any pages have been |
| // committed yet. |
| if (!have_page_request) { |
| // Not running in batch mode, and this is the first missing page (case 2). Update the |
| // committed length we have so far and return. |
| *committed_len = offset - start_offset; |
| } |
| return ZX_ERR_SHOULD_WAIT; |
| } |
| } else if (unlikely(res != ZX_OK)) { |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return res; |
| } |
| } |
| |
| offset += PAGE_SIZE; |
| } |
| |
| if (have_page_request) { |
| // committed_len was set when have_page_request was set, so we can just return. |
| return page_request->get()->FinalizeRequest(); |
| } |
| |
| // Processed the full range successfully |
| *committed_len = len; |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
| zx_status_t VmCowPages::PinRangeLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len); |
| |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len)); |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| if (is_slice_locked()) { |
| uint64_t parent_offset; |
| VmCowPages* parent = PagedParentOfSliceLocked(&parent_offset); |
| AssertHeld(parent->lock_); |
| |
|     // PagedParentOfSliceLocked will walk all of the way up the VMO hierarchy |
|     // until it hits a non-slice VMO. This guarantees that we should only ever |
|     // recurse once instead of an unbounded number of times. DEBUG_ASSERT this |
|     // so that we don't actually end up with unbounded recursion in case the |
|     // property ever changes. |
| DEBUG_ASSERT(!parent->is_slice_locked()); |
| |
| return parent->PinRangeLocked(offset + parent_offset, len); |
| } |
| |
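|   // Record that a pin has been attempted on this VMO before doing any of the work; this is not |
|   // undone if the pin subsequently fails. |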
| ever_pinned_ = true; |
| |
| // Tracks our expected page offset when iterating to ensure all pages are present. |
| uint64_t next_offset = offset; |
| |
| // Should any errors occur we need to unpin everything. |
| auto pin_cleanup = fit::defer([this, offset, &next_offset]() { |
| if (next_offset > offset) { |
| AssertHeld(*lock()); |
| UnpinLocked(offset, next_offset - offset, /*allow_gaps=*/false); |
| } |
| }); |
| |
| // We stack-own loaned pages from SwapPageLocked() to pmm_free(). |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| // This is separate from pin_cleanup because we never cancel this one. |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| __UNINITIALIZED BatchPQRemove page_remover(&freed_list); |
| auto freed_pages_cleanup = fit::defer([&freed_list, &page_remover] { |
| page_remover.Flush(); |
| pmm_free(&freed_list); |
| }); |
| |
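|   // Walk the pages in the range. Any gap (page_offset != next_offset) or slot that is not an |
|   // actual page fails the walk with ZX_ERR_BAD_STATE, since pinning requires the entire range to |
|   // already be committed. |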
| zx_status_t status = page_list_.ForEveryPageInRange( |
| [this, &next_offset, &page_remover](const VmPageOrMarker* p, uint64_t page_offset) { |
| AssertHeld(lock_); |
| if (page_offset != next_offset || !p->IsPage()) { |
| return ZX_ERR_BAD_STATE; |
| } |
| vm_page_t* old_page = p->Page(); |
| DEBUG_ASSERT(old_page->state() == vm_page_state::OBJECT); |
| |
| vm_page_t* page = old_page; |
| if (pmm_is_loaned(old_page)) { |
| DEBUG_ASSERT(!old_page->object.pin_count); |
| DEBUG_ASSERT(!is_page_dirty_tracked(old_page) || is_page_clean(old_page)); |
| vm_page_t* new_page; |
|           // It's possible for old_page to become non-loaned by the time we call |
|           // pmm_alloc_page(), but that's fine; we'll just replace it with new_page anyway, |
|           // which we know isn't loaned. |
| DEBUG_ASSERT(!(pmm_alloc_flags_ & PMM_ALLOC_FLAG_CAN_BORROW)); |
| // TODO(fxbug.dev/99890): Support delayed allocations here. |
| zx_status_t status = |
| pmm_alloc_page(pmm_alloc_flags_ & ~PMM_ALLOC_FLAG_CAN_WAIT, &new_page); |
| if (status != ZX_OK) { |
| return status; |
| } |
| DEBUG_ASSERT(!new_page->is_loaned()); |
| SwapPageLocked(page_offset, old_page, new_page); |
| page_remover.Push(old_page); |
| page = new_page; |
| } |
| |
| if (page->object.pin_count == VM_PAGE_OBJECT_MAX_PIN_COUNT) { |
| return ZX_ERR_UNAVAILABLE; |
| } |
| |
| page->object.pin_count++; |
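|         // The first pin is responsible for moving the page into the wired page queue. |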
| if (page->object.pin_count == 1) { |
| MoveToWiredLocked(page, page_offset); |
| } |
| |
|         // Pinning every page in the largest possible VMO as many times as possible can't overflow. |
| static_assert(VmPageList::MAX_SIZE / PAGE_SIZE < UINT64_MAX / VM_PAGE_OBJECT_MAX_PIN_COUNT); |
| next_offset += PAGE_SIZE; |
| return ZX_ERR_NEXT; |
| }, |
| offset, offset + len); |
| |
| const uint64_t actual = (next_offset - offset) / PAGE_SIZE; |
|   // Count whatever pages we pinned; in the failure scenario this will get decremented on the unpin. |
| pinned_page_count_ += actual; |
| |
| if (status == ZX_OK) { |
| // If the missing pages were at the end of the range (or the range was empty) then our iteration |
| // will have just returned ZX_OK. Perform one final check that we actually pinned the number of |
| // pages we expected to. |
| const uint64_t expected = len / PAGE_SIZE; |
| if (actual != expected) { |
| status = ZX_ERR_BAD_STATE; |
| } else { |
| pin_cleanup.cancel(); |
| } |
| } |
| return status; |
| } |
| |
| zx_status_t VmCowPages::DecommitRangeLocked(uint64_t offset, uint64_t len) { |
| canary_.Assert(); |
| |
| // Trim the size and perform our zero-length hot-path check before we recurse |
| // up to our top-level ancestor. Size bounding needs to take place relative |
| // to the child the operation was originally targeted against. |
| uint64_t new_len; |
| if (!TrimRange(offset, len, size_, &new_len)) { |
| return ZX_ERR_OUT_OF_RANGE; |
| } |
| |
| // was in range, just zero length |
| if (new_len == 0) { |
| return ZX_OK; |
| } |
| |
| // If this is a child slice of a VMO, then find our way up to our root |
| // ancestor (taking our offset into account as we do), and then recurse, |
| // running the operation against our ancestor. Note that |
| // PagedParentOfSliceLocked will iteratively walk all the way up to our |
|   // non-slice ancestor, not just our immediate parent, so we are guaranteed |
|   // bounded recursion. |
| if (is_slice_locked()) { |
| uint64_t parent_offset; |
| VmCowPages* parent = PagedParentOfSliceLocked(&parent_offset); |
| AssertHeld(parent->lock_); |
| DEBUG_ASSERT(!parent->is_slice_locked()); // assert bounded recursion. |
| return parent->DecommitRangeLocked(offset + parent_offset, new_len); |
| } |
| |
| // Currently, we can't decommit if the absence of a page doesn't imply zeroes. |
| if (parent_ || is_source_preserving_page_content_locked()) { |
| return ZX_ERR_NOT_SUPPORTED; |
| } |
| |
| // VmObjectPaged::DecommitRange() rejects is_contiguous() VMOs (for now). |
| DEBUG_ASSERT(can_decommit_locked()); |
| |
|   // Demand that offset and length be page-aligned so as not to give surprising user semantics. |
| if (!IS_PAGE_ALIGNED(offset) || !IS_PAGE_ALIGNED(len)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
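|   // Collect the removed pages into a list so they can be freed as a single batch afterwards. |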
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| zx_status_t status = UnmapAndRemovePagesLocked(offset, new_len, &freed_list); |
| if (status != ZX_OK) { |
| return status; |
| } |
| |
| FreePages(&freed_list); |
| |
| return status; |
| } |
| |
| zx_status_t VmCowPages::UnmapAndRemovePagesLocked(uint64_t offset, uint64_t len, |
| list_node_t* freed_list, |
| uint64_t* pages_freed_out) { |
| canary_.Assert(); |
| |
| if (AnyPagesPinnedLocked(offset, len)) { |
| return ZX_ERR_BAD_STATE; |
| } |
| |
| LTRACEF("start offset %#" PRIx64 ", end %#" PRIx64 "\n", offset, offset + len); |
| |
| // We've already trimmed the range in DecommitRangeLocked(). |
| DEBUG_ASSERT(InRange(offset, len, size_)); |
| |
| // Verify page alignment. |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(offset)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(len) || (offset + len == size_)); |
| |
| // DecommitRangeLocked() will call this function only on a VMO with no parent. The only clone |
| // types that support OP_DECOMMIT are slices, for which we will recurse up to the root. |
| DEBUG_ASSERT(!parent_); |
| |
| // unmap all of the pages in this range on all the mapping regions |
| RangeChangeUpdateLocked(offset, len, RangeChangeOp::Unmap); |
| |
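|   // Remove the pages from the page list, accumulating them onto |freed_list| via the batch |
|   // remover for the caller to free. |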
| __UNINITIALIZED BatchPQRemove page_remover(freed_list); |
| |
| page_list_.RemovePages(page_remover.RemovePagesCallback(), offset, offset + len); |
| page_remover.Flush(); |
| |
| if (pages_freed_out) { |
| *pages_freed_out = page_remover.freed_count(); |
| } |
| |
| VMO_VALIDATION_ASSERT(DebugValidatePageSplitsHierarchyLocked()); |
| VMO_FRUGAL_VALIDATION_ASSERT(DebugValidateVmoPageBorrowingLocked()); |
| return ZX_OK; |
| } |
| |
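| // Returns true if a read of |page_offset| from this VMO would currently observe zeros. |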
| bool VmCowPages::PageWouldReadZeroLocked(uint64_t page_offset) { |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_offset)); |
| DEBUG_ASSERT(page_offset < size_); |
| const VmPageOrMarker* slot = page_list_.Lookup(page_offset); |
| if (slot && slot->IsMarker()) { |
| // This is already considered zero as there's a marker. |
| return true; |
| } |
| if (is_source_preserving_page_content_locked() && page_offset >= supply_zero_offset_) { |
| // Uncommitted pages beyond supply_zero_offset_ are supplied as zeros by the kernel. |
| if (!slot || slot->IsEmpty()) { |
| return true; |
| } |
| } |
| // If we don't have a committed page we need to check our parent. |
| if (!slot || !slot->IsPage()) { |
| VmCowPages* page_owner; |
| uint64_t owner_offset; |
| if (!FindInitialPageContentLocked(page_offset, &page_owner, &owner_offset, nullptr)) { |
| // Parent doesn't have a page either, so would also read as zero, assuming no page source. |
| return GetRootPageSourceLocked() == nullptr; |
| } |
| } |
|   // There is content either locally or in our parent; assume it is non-zero and return false. |
| return false; |
| } |
| |
| zx_status_t VmCowPages::ZeroPagesLocked(uint64_t page_start_base, uint64_t page_end_base, |
| LazyPageRequest* page_request, uint64_t* zeroed_len_out) { |
| canary_.Assert(); |
| |
| DEBUG_ASSERT(page_start_base <= page_end_base); |
| DEBUG_ASSERT(page_end_base <= size_); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_start_base)); |
| DEBUG_ASSERT(IS_PAGE_ALIGNED(page_end_base)); |
| ASSERT(zeroed_len_out); |
| |
| // Forward any operations on slices up to the original non slice parent. |
| if (is_slice_locked()) { |
| uint64_t parent_offset; |
| VmCowPages* parent = PagedParentOfSliceLocked(&parent_offset); |
| AssertHeld(parent->lock_); |
| return parent->ZeroPagesLocked(page_start_base + parent_offset, page_end_base + parent_offset, |
| page_request, zeroed_len_out); |
| } |
| |
| // This function tries to zero pages as optimally as possible for most cases, so we attempt |
| // increasingly expensive actions only if certain preconditions do not allow us to perform the |
| // cheaper action. Broadly speaking, the sequence of actions that are attempted are as follows. |
| // 1) Try to decommit the entire range at once if the VMO allows it. |
| // 2) Otherwise, try to decommit each page if the VMO allows it and doing so doesn't expose |
| // content in the parent (if any) that shouldn't be visible. |
| // 3) Otherwise, if this is a child VMO and there is no committed page yet, allocate a zero page. |
| // 4) Otherwise, look up the page, faulting it in if necessary, and zero the page. If the page |
| // source needs to supply or dirty track the page, a page request is initialized and we return |
| // early with ZX_ERR_SHOULD_WAIT. The caller is expected to wait on the page request, and then |
| // retry. On the retry, we should be able to look up the page successfully and zero it. |
| |
|   // First try to do the more efficient decommit. We prefer decommit as it performs work in the |
| // order of the number of committed pages, instead of work in the order of size of the range. An |
| // error from DecommitRangeLocked indicates that the VMO is not of a form that decommit can safely |
| // be performed without exposing data that we shouldn't between children and parents, but no |
| // actual state will have been changed. Should decommit succeed we are done, otherwise we will |
| // have to handle each offset individually. |
| // |
| // Zeroing doesn't decommit pages of contiguous VMOs. |
| if (can_decommit_zero_pages_locked()) { |
| zx_status_t status = DecommitRangeLocked(page_start_base, page_end_base - page_start_base); |
| if (status == ZX_OK) { |
| *zeroed_len_out = page_end_base - page_start_base; |
| return ZX_OK; |
| } |
| |
|   // Unmap any page that is touched by this range in any of our, or our children's, mapping |
| // regions. We do this on the assumption we are going to be able to free pages either completely |
| // or by turning them into markers and it's more efficient to unmap once in bulk here. |
| RangeChangeUpdateLocked(page_start_base, page_end_base - page_start_base, RangeChangeOp::Unmap); |
| } |
| |
| // We stack-own loaned pages from when they're removed until they're freed. |
| __UNINITIALIZED StackOwnedLoanedPagesInterval raii_interval; |
| |
| list_node_t freed_list; |
| list_initialize(&freed_list); |
| |
| // See also free_any_pages below, which intentionally frees incrementally. |
| auto auto_free = fit::defer([this, &freed_list]() { |
| if (!list_is_empty(&freed_list)) { |
| FreePages(&freed_list); |
| } |
| }); |
| |
| // Give us easier names for our range. |
| const uint64_t start = page_start_base; |
| const uint64_t end = page_end_base; |
| |
|   // If we're zeroing at the end of our parent range we can update our parent limit to reflect |
|   // this, similar to a resize. This does not work if we are a slice, but we checked for that |
|   // earlier. While this does not actually zero the range in question, it makes future zeroing of |
|   // the range far more efficient, which is why we do it first. |
| if (start < parent_limit_ && end >= parent_limit_) { |
| bool hidden_parent = false; |
| if (parent_) { |
| AssertHeld(parent_->lock_ref()); |
| hidden_parent = parent_->is_hidden_locked(); |
| } |
| if (hidden_parent) { |
| // Release any COW pages that are no longer necessary. This will also |
| // update the parent limit. |
| __UNINITIALIZED BatchPQRemove page_remover(&freed_list); |
| ReleaseCowParentPagesLocked(start, parent_limit_, &page_remover); |
| page_remover.Flush(); |
| } else { |
| parent_limit_ = start; |
| } |
| } |
| |
| *zeroed_len_out = 0; |
| uint64_t offset = start; |
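|   // Process the range page by page. |zeroed_len_out| is advanced as each page is handled so the |
|   // caller knows how much progress was made if we have to return early (e.g. with |
|   // ZX_ERR_SHOULD_WAIT). |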
| for (; offset < end; offset += PAGE_SIZE, *zeroed_len_out += PAGE_SIZE) { |
| const VmPageOrMarker* slot = page_list_.Lookup(offset); |
| |
| DEBUG_ASSERT(!direct_source_supplies_zero_pages_locked() || (!slot || !slot->IsMarker())); |
| if (direct_source_supplies_zero_pages_locked() && (!slot || slot->IsEmpty())) { |
| // Already logically zero - don't commit pages to back the zeroes if they're not already |
| // committed. This is important for contiguous VMOs, as we don't use markers for contiguous |
| // VMOs, and allocating a page below to hold zeroes would not be asking the page_source_ for |
| // the proper physical page. This prevents allocating an arbitrary physical page to back the |
| // zeroes. |
| continue; |
| } |
| |
| // If the source preserves page content, empty slots beyond supply_zero_offset_ are implicitly |
| // zero, and any committed pages beyond supply_zero_offset_ need to be removed. Exit the loop |
| // for this case, so that we can perform this operation later more optimally in the order of |
| // committed pages, instead of having to iterate over empty slots too. |
| // |
| // TODO(rashaeqbal): Optimize for the common case too so that we do not need to handle this case |
| // separately out of the common loop. Specialized code increases the likelihood of forgetting |
| // one of the many checks in this common loop, as is evidenced by fxbug.dev/101608. |
| if (is_source_preserving_page_content_locked() && offset >= supply_zero_offset_) { |
| break; |
| } |
| |
| // If there's already a marker then we can avoid any second guessing and leave the marker alone. |
| if (slot && slot->IsMarker()) { |
| continue; |
| } |
|