blob: df36be044e5940ffacf70455b50896ad76b0c827 [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include "vm/vm_object_paged.h"
#include <align.h>
#include <assert.h>
#include <inttypes.h>
#include <lib/console.h>
#include <lib/counters.h>
#include <lib/fit/defer.h>
#include <stdlib.h>
#include <string.h>
#include <trace.h>
#include <zircon/compiler.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/ops.h>
#include <fbl/alloc_checker.h>
#include <ktl/algorithm.h>
#include <ktl/array.h>
#include <ktl/move.h>
#include <vm/bootreserve.h>
#include <vm/discardable_vmo_tracker.h>
#include <vm/fault.h>
#include <vm/page_source.h>
#include <vm/physical_page_provider.h>
#include <vm/physmap.h>
#include <vm/vm.h>
#include <vm/vm_address_region.h>
#include <vm/vm_cow_pages.h>
#include "vm_priv.h"
#include <ktl/enforce.h>
// File-local kernel counters, declared with KCOUNTER under the "vm.attributed_memory.object.*"
// names: attribution queries, attribution cache hits and attribution cache misses.
// NOTE(review): no increments of these counters are visible in the portion of the file reviewed;
// confirm where they are updated.
namespace {
KCOUNTER(vmo_attribution_queries, "vm.attributed_memory.object.queries")
KCOUNTER(vmo_attribution_cache_hits, "vm.attributed_memory.object.cache_hits")
KCOUNTER(vmo_attribution_cache_misses, "vm.attributed_memory.object.cache_misses")
} // namespace
// Constructor. Forwards VMOType::Paged and |hierarchy_state| to the VmObject base and stores
// |options| in options_. The visible body only traces the address of the new object; cow_pages_
// is wired up afterwards by the Create* factory functions.
// NOTE(review): the closing brace of this constructor is not present in this copy of the file.
VmObjectPaged::VmObjectPaged(uint32_t options, fbl::RefPtr<VmHierarchyState> hierarchy_state)
: VmObject(VMOType::Paged, ktl::move(hierarchy_state)), options_(options) {
LTRACEF("%p\n", this);
// Destructor. From the visible code, in order:
//   1. If cow_pages_ was never set, the complex teardown is skipped (per the comment below).
//   2. kAlwaysPinned VMOs unpin their whole range.
//   3. Under the lock, the VmCowPages backlink and the reference list are fixed up.
//   4. Children are re-homed onto parent_.
//   5. This object is removed from its parent, if the parent can still be upgraded to a RefPtr.
// NOTE(review): this copy of the function is missing lines (closing braces, the early return,
// and the backlink/list updates that several comments describe). Confirm against the upstream
// file before changing any logic here; statement order matters.
VmObjectPaged::~VmObjectPaged() {
LTRACEF("%p\n", this);
if (!cow_pages_) {
// Initialization didn't finish. This is not in the global list and any complex destruction can
// all be skipped.
// CreateCommon pins [0, size) for kAlwaysPinned VMOs before constructing the object; this is the
// matching unpin.
if (options_ & kAlwaysPinned) {
Unpin(0, size());
Guard<CriticalMutex> guard{lock()};
// Only clear the backlink if we are not a reference. A reference does not "own" the VmCowPages,
// so in the typical case, the VmCowPages will not have its backlink set to a reference. There
// does exist an edge case where the backlink can be a reference, which is handled by the else
// block below.
// NOTE(review): the body of this branch (clearing the backlink) is not visible in this copy.
if (!is_reference()) {
} else {
// If this is a reference, we need to remove it from the original (parent) VMO's reference list.
VmObjectPaged* root_ref = cow_pages_locked()->get_paged_backlink_locked();
// The VmCowPages will have a valid backlink, either to the original VmObjectPaged or a
// reference VmObjectPaged, as long as there is a reference that is alive. We know that this is
// a reference.
if (likely(root_ref != this)) {
VmObjectPaged* removed = root_ref->reference_list_.erase(*this);
DEBUG_ASSERT(removed == this);
} else {
// It is possible for the backlink to point to |this| if the original parent went away at some
// point and the rest of the reference list had to be re-homed to |this|, and the backlink set
// to |this|.
// The VmCowPages was pointing to us, so clear the backlink. The backlink will get reset below
// if other references remain.
// If this VMO had references, pick one of the references as the paged backlink from the shared
// VmCowPages. Also, move the remainder of the reference list to the chosen reference. Note that
// we're only moving the reference list over without adding the references to the children list;
// we do not want these references to be counted as children of the chosen VMO. We simply want a
// safe way to propagate mapping updates and VmCowPages changes on hidden node addition.
if (!reference_list_.is_empty()) {
// We should only be attempting to reset the backlink if the owner is going away and has reset
// the backlink above.
DEBUG_ASSERT(cow_pages_locked()->get_paged_backlink_locked() == nullptr);
// The first remaining reference becomes the new owner; the rest of our list is appended to its
// own reference list.
VmObjectPaged* paged_backlink = reference_list_.pop_front();
paged_backlink->reference_list_.splice(paged_backlink->reference_list_.end(), reference_list_);
// Re-home all our children with any parent that we have.
// NOTE(review): the loop condition requires each iteration to remove the front child from
// children_list_; that removal is not visible in this copy.
while (!children_list_.is_empty()) {
VmObject* c = &children_list_.front();
// NOTE(review): reinterpret_cast assumes every child of a VmObjectPaged is itself a
// VmObjectPaged; confirm that invariant.
VmObjectPaged* child = reinterpret_cast<VmObjectPaged*>(c);
child->parent_ = parent_;
if (parent_) {
// Ignore the return since 'this' is a child so we know we are not transitioning from 0->1
// children.
[[maybe_unused]] bool notify = parent_->AddChildLocked(child);
if (parent_) {
// As parent_ is a raw pointer we must ensure that if we call a method on it that it lives long
// enough. To do so we attempt to upgrade it to a refptr, which could fail if it's already
// slated for deletion.
fbl::RefPtr<VmObjectPaged> parent = fbl::MakeRefPtrUpgradeFromRaw(parent_, guard);
if (parent) {
// Holding refptr, can safely pass in the guard to RemoveChild.
parent->RemoveChild(this, guard.take());
// As we constructed a RefPtr to our parent, and we are in our own destructor, there is now
// the potential for recursive destruction if we need to delete the parent due to holding the
// last ref, hit this same path, etc.
} else {
// parent is up for deletion and so there's no need to use RemoveChild since there is no
// user dispatcher to notify anyway and so just drop ourselves to keep the hierarchy correct.
// Applies an eviction hint to the byte range [offset, offset + len).
//
// |offset|, |len|: range the hint applies to; checked for arithmetic overflow and against the
//                  current size of the VMO.
// |hint|: EvictionHint::DontNeed promotes the range for reclamation; EvictionHint::AlwaysNeed
//         protects it from reclamation (with set_always_need and ignore_errors both true).
//
// Returns ZX_OK when the hint was applied, and also when the root page source cannot evict, in
// which case the hint is silently ignored.
// NOTE(review): the bodies of the overflow check, the can_block_on_page_requests() check and the
// range check are not visible in this copy, and neither are the `break`s / closing braces of the
// switch; confirm the error codes and fall-through behaviour against upstream.
zx_status_t VmObjectPaged::HintRange(uint64_t offset, uint64_t len, EvictionHint hint) {
uint64_t end_offset;
if (add_overflow(offset, len, &end_offset)) {
if (can_block_on_page_requests() && hint == EvictionHint::AlwaysNeed) {
Guard<CriticalMutex> guard{lock()};
// Ignore hints for non user-pager-backed VMOs. We choose to silently ignore hints for
// incompatible combinations instead of failing. This is because the kernel does not make any
// explicit guarantees on hints; since they are just hints, the kernel is always free to ignore
// them.
if (!cow_pages_locked()->can_root_source_evict_locked()) {
return ZX_OK;
if (!InRange(offset, len, size_locked())) {
switch (hint) {
case EvictionHint::DontNeed: {
cow_pages_locked()->PromoteRangeForReclamationLocked(offset, len);
case EvictionHint::AlwaysNeed: {
// Hints are best effort, so ignore any errors in the paging in process.
// The guard is passed by pointer; presumably so the callee can drop the lock while waiting on
// page requests (TODO confirm against VmCowPages).
cow_pages_locked()->ProtectRangeFromReclamationLocked(offset, len, /*set_always_need=*/true,
/*ignore_errors=*/true, &guard);
return ZX_OK;
// Prefetches [offset, offset + len) with the lock already held.
//
// |offset|, |len|: range to prefetch; must lie within the current size. A zero-length range
//                  that is in range returns ZX_OK immediately.
// |guard|: the held lock guard, forwarded to the VmCowPages calls.
//
// When the root source is user-pager backed, the range is protected from reclamation with
// ignore_errors=false. Otherwise any compressed pages in the range are decompressed. In both
// cases the status of the VmCowPages call is returned to the caller.
// NOTE(review): the out-of-range return and closing braces are not visible in this copy.
zx_status_t VmObjectPaged::PrefetchRangeLocked(uint64_t offset, uint64_t len,
Guard<CriticalMutex>* guard) {
if (!InRange(offset, len, size_locked())) {
// Cannot overflow otherwise InRange would have failed.
if (len == 0) {
return ZX_OK;
if (cow_pages_locked()->is_root_source_user_pager_backed_locked()) {
// NOTE(review): HintRange calls this method with an explicit /*set_always_need=*/ argument
// before /*ignore_errors=*/; an argument line appears to be missing from this copy.
return cow_pages_locked()->ProtectRangeFromReclamationLocked(offset, len,
/*ignore_errors=*/false, guard);
} else {
// Committing high priority pages is best effort, so ignore any errors from decompressing.
// NOTE(review): the status is nevertheless returned here; it is the caller
// (CommitHighPriorityPages) that discards it. PrefetchRange propagates it.
return cow_pages_locked()->DecompressInRangeLocked(offset, len, guard);
// Public prefetch entry point: takes the lock, expands [offset, offset + len) outwards to page
// boundaries and forwards to PrefetchRangeLocked, returning its status.
//
// |offset|, |len|: byte range to prefetch; need not be page aligned.
// NOTE(review): the bodies of the can_block_on_page_requests() check and of the two overflow
// checks (offset + len, and rounding the end up to a page) are not visible in this copy.
zx_status_t VmObjectPaged::PrefetchRange(uint64_t offset, uint64_t len) {
if (can_block_on_page_requests()) {
Guard<CriticalMutex> guard{lock()};
// Round offset and len to be page aligned. Use a sub-scope to validate that temporary end
// calculations cannot be accidentally used later on.
uint64_t end;
if (add_overflow(offset, len, &end)) {
const uint64_t end_page = ROUNDUP_PAGE_SIZE(end);
// A rounded-up end that is smaller than the unrounded end means the round-up wrapped.
if (end_page < end) {
DEBUG_ASSERT(end_page >= offset);
offset = ROUNDDOWN(offset, PAGE_SIZE);
len = end_page - offset;
return PrefetchRangeLocked(offset, len, &guard);
// Best-effort prefetch of [offset, offset + len) for VMOs whose cow pages are marked high memory
// priority. Takes the lock, checks the priority flag and calls PrefetchRangeLocked, discarding
// its status.
// NOTE(review): the body of the priority check (presumably an early return for non-high-priority
// VMOs) is not visible in this copy.
void VmObjectPaged::CommitHighPriorityPages(uint64_t offset, uint64_t len) {
Guard<CriticalMutex> guard{lock()};
if (!cow_pages_locked()->is_high_memory_priority_locked()) {
// Ignore the result of the prefetch, high priority commit is best effort.
PrefetchRangeLocked(offset, len, &guard);
// Reports whether zero pages may be deduplicated from this VMO: false for VMOs that should be
// skipped, true otherwise.
// NOTE(review): the condition guarding `return false` is not visible in this copy. Per the
// comment it tests for an uncached VMO (cf. the cache_policy_ != ARCH_MMU_FLAG_CACHED checks
// elsewhere in this file); confirm against upstream.
bool VmObjectPaged::CanDedupZeroPagesLocked() {
// Skip uncached VMOs as we cannot efficiently scan them.
return false;
// Okay to dedup from this VMO.
return true;
// Shared creation path for paged VMOs that are created without a PageSource (contrast
// CreateWithSourceCommon).
//
// |pmm_alloc_flags|: PMM allocation flags; PMM_ALLOC_FLAG_CAN_WAIT additionally sets
//                    kCanBlockOnPageRequests on the new VMO.
// |options|: VmObjectPaged option bits. Must not contain kContiguous or kCanBlockOnPageRequests
//            (debug-asserted). kResizable combined with kAlwaysPinned is rejected.
// |size|: size in bytes; checked for page alignment and against MAX_SIZE.
// |obj|: receives the new VMO on success.
//
// Returns ZX_OK on success, otherwise the failing status from VmCowPages::Create,
// pmm_alloc_pages or AddNewPagesLocked.
// NOTE(review): the return statements for the argument checks and allocation failures, and the
// closing braces, are not visible in this copy of the function.
zx_status_t VmObjectPaged::CreateCommon(uint32_t pmm_alloc_flags, uint32_t options, uint64_t size,
fbl::RefPtr<VmObjectPaged>* obj) {
DEBUG_ASSERT(!(options & (kContiguous | kCanBlockOnPageRequests)));
// Cannot be resizable and pinned, otherwise we will lose track of the pinned range.
if ((options & kResizable) && (options & kAlwaysPinned)) {
if (pmm_alloc_flags & PMM_ALLOC_FLAG_CAN_WAIT) {
options |= kCanBlockOnPageRequests;
// make sure size is page aligned
if (!IS_PAGE_ALIGNED(size)) {
if (size > MAX_SIZE) {
fbl::AllocChecker ac;
auto state = fbl::MakeRefCountedChecked<VmHierarchyState>(&ac);
if (!ac.check()) {
// The discardable tracker is only allocated for kDiscardable VMOs; otherwise a null tracker is
// handed to VmCowPages::Create.
ktl::unique_ptr<DiscardableVmoTracker> discardable = nullptr;
if (options & kDiscardable) {
discardable = ktl::make_unique<DiscardableVmoTracker>(&ac);
if (!ac.check()) {
fbl::RefPtr<VmCowPages> cow_pages;
zx_status_t status = VmCowPages::Create(state, VmCowPagesOptions::kNone, pmm_alloc_flags, size,
ktl::move(discardable), &cow_pages);
if (status != ZX_OK) {
return status;
// If this VMO will always be pinned, allocate and pin the pages in the VmCowPages prior to
// creating the VmObjectPaged. This ensures the VmObjectPaged destructor can assume that the pages
// are committed and pinned.
if (options & kAlwaysPinned) {
list_node_t prealloc_pages;
status = pmm_alloc_pages(size / PAGE_SIZE, pmm_alloc_flags, &prealloc_pages);
if (status != ZX_OK) {
return status;
Guard<CriticalMutex> guard{cow_pages->lock()};
// Add all the preallocated pages to the object, this takes ownership of all pages regardless
// of the outcome. This is a new VMO, but this call could fail due to OOM.
status = cow_pages->AddNewPagesLocked(0, &prealloc_pages, VmCowPages::CanOverwriteContent::Zero,
true, false);
if (status != ZX_OK) {
return status;
// With all the pages in place, pin them.
status = cow_pages->PinRangeLocked(0, size);
ASSERT(status == ZX_OK);
auto vmo = fbl::AdoptRef<VmObjectPaged>(new (&ac) VmObjectPaged(options, ktl::move(state)));
if (!ac.check()) {
// Allocation of the VmObjectPaged failed, so its destructor will never run; undo the pin taken
// above before bailing out.
if (options & kAlwaysPinned) {
Guard<CriticalMutex> guard{cow_pages->lock()};
cow_pages->UnpinLocked(0, size, false);
// This creation has succeeded. Must wire up the cow pages and *then* place in the globals list.
Guard<CriticalMutex> guard{vmo->lock()};
vmo->cow_pages_ = ktl::move(cow_pages);
*obj = ktl::move(vmo);
return ZX_OK;
// Public factory for a paged VMO without a page source. Checks |options| for kContiguous and
// kCanBlockOnPageRequests, which callers may not pass here, then forwards every argument to
// CreateCommon and returns its status.
// NOTE(review): the body of the options check (the error return) is not visible in this copy.
zx_status_t VmObjectPaged::Create(uint32_t pmm_alloc_flags, uint32_t options, uint64_t size,
fbl::RefPtr<VmObjectPaged>* obj) {
if (options & (kContiguous | kCanBlockOnPageRequests)) {
// Force callers to use CreateContiguous() instead.
return CreateCommon(pmm_alloc_flags, options, size, obj);
// Creates a physically contiguous VMO backed by a PhysicalPageProvider wrapped in a PageSource.
//
// |pmm_alloc_flags|: flags passed to pmm_alloc_contiguous and CreateWithSourceCommon.
// |size|: size in bytes; checked for page alignment and against MAX_SIZE. A size of 0 yields a
//         VMO with no pages allocated.
// |alignment_log2|: log2 of the required physical alignment; must be < 64 (debug-asserted).
// |obj|: receives the new VMO on success.
//
// Returns ZX_OK on success, otherwise the status from CreateWithSourceCommon or
// AddNewPagesLocked.
// NOTE(review): several lines are missing from this copy: the returns for the argument and
// allocation checks, the page-source close that the comment on the CreateWithSourceCommon
// failure path describes, the return after the pmm_alloc_contiguous failure trace, and the
// trailing arguments of the AddNewPagesLocked call.
zx_status_t VmObjectPaged::CreateContiguous(uint32_t pmm_alloc_flags, uint64_t size,
uint8_t alignment_log2,
fbl::RefPtr<VmObjectPaged>* obj) {
DEBUG_ASSERT(alignment_log2 < sizeof(uint64_t) * 8);
// make sure size is page aligned
if (!IS_PAGE_ALIGNED(size)) {
if (size > MAX_SIZE) {
fbl::AllocChecker ac;
// For contiguous VMOs, we need a PhysicalPageProvider to reclaim specific loaned physical pages
// on commit.
auto page_provider = fbl::AdoptRef(new (&ac) PhysicalPageProvider(size));
if (!ac.check()) {
// Keep raw pointers to the provider and the source: ownership of both is moved away below, but
// they are still needed for the Init() call at the end.
PhysicalPageProvider* physical_page_provider_ptr = page_provider.get();
fbl::RefPtr<PageSource> page_source =
fbl::AdoptRef(new (&ac) PageSource(ktl::move(page_provider)));
if (!ac.check()) {
auto* page_source_ptr = page_source.get();
fbl::RefPtr<VmObjectPaged> vmo;
zx_status_t status =
CreateWithSourceCommon(page_source, pmm_alloc_flags, kContiguous, size, &vmo);
if (status != ZX_OK) {
// Ensure to close the page source we created, as it will not get closed by the VmCowPages since
// that creation failed.
return status;
if (size == 0) {
*obj = ktl::move(vmo);
return ZX_OK;
// allocate the pages
list_node page_list;
size_t num_pages = size / PAGE_SIZE;
paddr_t pa;
status = pmm_alloc_contiguous(num_pages, pmm_alloc_flags, alignment_log2, &pa, &page_list);
if (status != ZX_OK) {
LTRACEF("failed to allocate enough pages (asked for %zu)\n", num_pages);
Guard<CriticalMutex> guard{vmo->lock()};
// Add them to the appropriate range of the object, this takes ownership of all the pages
// regardless of outcome.
// This is a newly created VMO with a page source, so we don't expect to be overwriting anything
// in its page list.
status = vmo->cow_pages_locked()->AddNewPagesLocked(0, &page_list,
if (status != ZX_OK) {
return status;
// Hand the provider the cow pages, its page source and the physical base address of the run.
physical_page_provider_ptr->Init(vmo->cow_pages_locked(), page_source_ptr, pa);
*obj = ktl::move(vmo);
return ZX_OK;
// Creates a VMO whose pages are the existing WIRED physical pages backing the kernel virtual
// range [data, data + size).
//
// |data|: kernel virtual address of the start of the range; translated with vaddr_to_paddr and
//         asserted to map to a non-zero physical address when size > 0.
// |size|: size in bytes; size / PAGE_SIZE pages are inserted.
// |exclusive|: when true and |data| is not a physmap address, the range is unmapped from the
//              kernel address space. When false, the whole range is pinned instead.
// |obj|: receives the new VMO on success.
//
// Returns ZX_OK on success, or the status from CreateCommon. Later failures are fatal: a
// non-WIRED page panics, and failures to add, unmap or pin are ASSERTed.
// NOTE(review): the trailing arguments of the panic() and Unmap() calls, the WIRED-branch body
// and the closing braces are not visible in this copy.
zx_status_t VmObjectPaged::CreateFromWiredPages(const void* data, size_t size, bool exclusive,
fbl::RefPtr<VmObjectPaged>* obj) {
LTRACEF("data %p, size %zu\n", data, size);
fbl::RefPtr<VmObjectPaged> vmo;
zx_status_t status = CreateCommon(PMM_ALLOC_FLAG_ANY, 0, size, &vmo);
if (status != ZX_OK) {
return status;
if (size > 0) {
// Do a direct lookup of the physical pages backing the range of
// the kernel that these addresses belong to and jam them directly
// into the VMO.
// NOTE: This relies on the kernel not otherwise owning the pages.
// If the setup of the kernel's address space changes so that the
// pages are attached to a kernel VMO, this will need to change.
paddr_t start_paddr = vaddr_to_paddr(data);
ASSERT(start_paddr != 0);
Guard<CriticalMutex> guard{vmo->lock()};
// The range is treated as physically contiguous: page |count| is looked up at
// start_paddr + count * PAGE_SIZE.
for (size_t count = 0; count < size / PAGE_SIZE; count++) {
paddr_t pa = start_paddr + count * PAGE_SIZE;
vm_page_t* page = paddr_to_vm_page(pa);
if (page->state() == vm_page_state::WIRED) {
} else {
// This function is only valid for memory in the boot image,
// which should all be wired.
panic("page used to back static vmo in unusable state: paddr %#" PRIxPTR " state %zu\n", pa,
// This is a newly created anonymous VMO, so we expect to be overwriting zeros. A newly
// created anonymous VMO with no committed pages has all its content implicitly zero.
status = vmo->cow_pages_locked()->AddNewPageLocked(
count * PAGE_SIZE, page, VmCowPages::CanOverwriteContent::Zero, nullptr, false, false);
ASSERT_MSG(status == ZX_OK,
"AddNewPageLocked failed on page %zu of %zu at %#" PRIx64 " from [%#" PRIx64
", %#" PRIx64 ")",
count, size / PAGE_SIZE, pa, start_paddr, start_paddr + size);
if (exclusive && !is_physmap_addr(data)) {
// unmap it from the kernel
// NOTE: this means the image can no longer be referenced from original pointer
status = VmAspace::kernel_aspace()->arch_aspace().Unmap(
reinterpret_cast<vaddr_t>(data), size / PAGE_SIZE, ArchVmAspace::EnlargeOperation::No,
ASSERT(status == ZX_OK);
if (!exclusive) {
// Pin all the pages as we must never decommit any of them since they are shared elsewhere.
status = vmo->cow_pages_locked()->PinRangeLocked(0, size);
ASSERT(status == ZX_OK);
*obj = ktl::move(vmo);
return ZX_OK;
// Creates a VMO backed by the external PageSource |src|.
//
// |src|: page source that supplies the VMO's pages; ownership is moved into the new VMO.
// |options|: VmObjectPaged option bits; kDiscardable, kCanBlockOnPageRequests and kAlwaysPinned
//            are checked for and may not be supplied by the caller.
// |size|: size in bytes; checked for page alignment and against MAX_SIZE.
// |obj|: receives the new VMO on success.
//
// Forwards to CreateWithSourceCommon with PMM_ALLOC_FLAG_ANY | PMM_ALLOC_FLAG_CAN_WAIT and with
// kCanBlockOnPageRequests added to |options|, and returns its status.
// NOTE(review): the error returns for the three checks are not visible in this copy.
zx_status_t VmObjectPaged::CreateExternal(fbl::RefPtr<PageSource> src, uint32_t options,
uint64_t size, fbl::RefPtr<VmObjectPaged>* obj) {
if (options & (kDiscardable | kCanBlockOnPageRequests | kAlwaysPinned)) {
// make sure size is page aligned
if (!IS_PAGE_ALIGNED(size)) {
if (size > MAX_SIZE) {
// External VMOs always support delayed PMM allocations, since they already have to tolerate
// arbitrary waits for pages due to the PageSource.
return CreateWithSourceCommon(ktl::move(src), PMM_ALLOC_FLAG_ANY | PMM_ALLOC_FLAG_CAN_WAIT,
options | kCanBlockOnPageRequests, size, obj);
// Shared creation path for VMOs that have a PageSource (used by CreateContiguous and
// CreateExternal).
//
// |src|: page source handed to VmCowPages::CreateExternal.
// |pmm_alloc_flags|: PMM allocation flags.
//                    NOTE(review): not referenced in the visible body; confirm whether a use was
//                    dropped from this copy.
// |options|: VmObjectPaged option bits; must not contain kAlwaysPinned (debug-asserted).
//            kCanBlockOnPageRequests is always added. kContiguous additionally selects
//            VmCowPagesOptions::kCannotDecommitZeroPages.
// |size|: size in bytes; the caller is responsible for checking page alignment.
// |obj|: receives the new VMO on success.
//
// Returns ZX_OK on success, or the status from VmCowPages::CreateExternal.
// NOTE(review): the allocation-failure returns and closing braces are not visible in this copy.
zx_status_t VmObjectPaged::CreateWithSourceCommon(fbl::RefPtr<PageSource> src,
uint32_t pmm_alloc_flags, uint32_t options,
uint64_t size, fbl::RefPtr<VmObjectPaged>* obj) {
// Caller must check that size is page aligned.
DEBUG_ASSERT(!(options & kAlwaysPinned));
fbl::AllocChecker ac;
auto state = fbl::AdoptRef<VmHierarchyState>(new (&ac) VmHierarchyState);
if (!ac.check()) {
// The cow pages will have a page source, so blocking is always possible.
options |= kCanBlockOnPageRequests;
VmCowPagesOptions cow_options = VmCowPagesOptions::kNone;
if (options & kContiguous) {
cow_options |= VmCowPagesOptions::kCannotDecommitZeroPages;
fbl::RefPtr<VmCowPages> cow_pages;
zx_status_t status =
VmCowPages::CreateExternal(ktl::move(src), cow_options, state, size, &cow_pages);
if (status != ZX_OK) {
return status;
auto vmo = fbl::AdoptRef<VmObjectPaged>(new (&ac) VmObjectPaged(options, ktl::move(state)));
if (!ac.check()) {
// This creation has succeeded. Must wire up the cow pages and *then* place in the globals list.
Guard<CriticalMutex> guard{vmo->lock()};
vmo->cow_pages_ = ktl::move(cow_pages);
*obj = ktl::move(vmo);
return ZX_OK;
// Creates a slice child covering [offset, offset + size) of this VMO.
//
// |offset|: byte offset into this VMO; must be page aligned.
// |size|: slice size in bytes; must be page aligned, at most MAX_SIZE, and the slice must lie
//         wholly within this VMO.
// |copy_name|: when true, the child's name_ is copied from this VMO.
// |child_vmo|: receives the new child on success.
//
// The child has the kSlice option, shares this VMO's hierarchy state, and inherits kContiguous
// and kCanBlockOnPageRequests from this VMO. Resizable VMOs cannot have slices.
// Returns ZX_OK on success, or the status from CreateChildSliceLocked.
// NOTE(review): the error returns for the argument checks, the body of the uncached check, and
// the steps that add the child to this VMO's child list and to the global list are not visible
// in this copy.
zx_status_t VmObjectPaged::CreateChildSlice(uint64_t offset, uint64_t size, bool copy_name,
fbl::RefPtr<VmObject>* child_vmo) {
LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size);
// Offset must be page aligned.
if (!IS_PAGE_ALIGNED(offset)) {
// Make sure size is page aligned.
if (!IS_PAGE_ALIGNED(size)) {
if (size > MAX_SIZE) {
// Slice must be wholly contained. |size()| will read the size holding the lock. This extra
// acquisition is correct as we must drop the lock in order to perform the allocations.
uint64_t our_size = this->size();
if (!InRange(offset, size, our_size)) {
// Forbid creating children of resizable VMOs. This restriction may be lifted in the future.
if (is_resizable()) {
uint32_t options = kSlice;
if (is_contiguous()) {
options |= kContiguous;
if (can_block_on_page_requests()) {
options |= kCanBlockOnPageRequests;
fbl::AllocChecker ac;
auto vmo = fbl::AdoptRef<VmObjectPaged>(new (&ac) VmObjectPaged(options, hierarchy_state_ptr_));
if (!ac.check()) {
Guard<CriticalMutex> guard{lock()};
// If this VMO is contiguous then we allow creating an uncached slice. When zeroing pages that
// are reclaimed from having been loaned from a contiguous VMO, we will zero the pages and flush
// the zeroes to RAM.
if (cache_policy_ != ARCH_MMU_FLAG_CACHED && !is_contiguous()) {
// The slice takes on the parent's cache policy.
vmo->cache_policy_ = cache_policy_;
fbl::RefPtr<VmCowPages> cow_pages;
zx_status_t status = cow_pages_locked()->CreateChildSliceLocked(offset, size, &cow_pages);
if (status != ZX_OK) {
return status;
// Now that everything has succeeded, link up the cow pages and our parents/children.
// Both child notification and inserting into the globals list has to happen outside the lock.
vmo->cow_pages_ = ktl::move(cow_pages);
vmo->parent_ = this;
if (copy_name) {
vmo->name_ = name_;
// Add to the global list now that fully initialized.
*child_vmo = ktl::move(vmo);
return ZX_OK;
// Creates a reference child: a VmObjectPaged that shares this VMO's VmCowPages rather than
// getting cow pages of its own.
//
// |resizable|: Resizability::Resizable is only accepted when this VMO is itself resizable.
// |offset|, |size|: must both be 0, since a reference always spans the whole parent.
// |copy_name|: when true, the child's name_ is copied from this VMO.
// |first_child|: optional out-parameter; when non-null it receives the result of AddChildLocked
//                for the new child.
// |child_vmo|: receives the new child on success.
//
// The child has the kReference option and inherits kCanBlockOnPageRequests and kResizable from
// this VMO. Contiguous and uncached VMOs are checked for before the reference is wired up.
// NOTE(review): the error returns for the argument checks, and the insertion of the child into
// |paged_owner|'s reference list that the comments describe, are not visible in this copy.
zx_status_t VmObjectPaged::CreateChildReference(Resizability resizable, uint64_t offset,
uint64_t size, bool copy_name, bool* first_child,
fbl::RefPtr<VmObject>* child_vmo) {
LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size);
// A reference spans the entirety of the parent. The specified range has no meaning, require it
// to be zero.
if (offset != 0 || size != 0) {
// Not supported for contiguous VMOs. Can use slices instead as contiguous VMOs are non-resizable
// and support slices.
if (is_contiguous()) {
if (resizable == Resizability::Resizable) {
// Cannot create a resizable reference from a non-resizable VMO.
if (!is_resizable()) {
uint32_t options = kReference;
if (can_block_on_page_requests()) {
options |= kCanBlockOnPageRequests;
// Reference inherits resizability from parent.
if (is_resizable()) {
options |= kResizable;
fbl::AllocChecker ac;
auto vmo = fbl::AdoptRef<VmObjectPaged>(new (&ac) VmObjectPaged(options, hierarchy_state_ptr_));
if (!ac.check()) {
Guard<CriticalMutex> guard{lock()};
// We know that we are not contiguous so we should not be uncached either.
if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
DEBUG_ASSERT(vmo->cache_policy_ == ARCH_MMU_FLAG_CACHED);
// Reference shares the same VmCowPages as the parent.
auto cow_pages = fbl::RefPtr<VmCowPages>(this->cow_pages_locked());
// Link up the cow pages and our parent/children. Both child notification and inserting into
// the globals list has to happen outside the lock.
vmo->cow_pages_ = ktl::move(cow_pages);
vmo->parent_ = this;
const bool first = AddChildLocked(vmo.get());
if (first_child) {
*first_child = first;
// Also insert into the reference list. The reference should only be inserted in the list of the
// object that the cow_pages_locked() has the backlink to, i.e. the notional "owner" of the
// VmCowPages.
// As a consequence of this, in the case of nested references, the reference relationship can
// look different from the parent->child relationship, which instead mirrors the child creation
// calls as specified by the user (this is true for all child types).
VmObjectPaged* paged_owner = cow_pages_locked()->get_paged_backlink_locked();
// The VmCowPages we point to should have a valid backlink, either to us or to our parent (if we
// are a reference).
// If this object is not a reference, the |paged_owner| we computed should be the same as
// |this|.
DEBUG_ASSERT(is_reference() || paged_owner == this);
if (copy_name) {
vmo->name_ = name_;
// Add to the global list now that fully initialized.
*child_vmo = ktl::move(vmo);
return ZX_OK;
// Creates a copy-on-write clone of [offset, offset + size) of this VMO.
//
// |resizable|: Resizability::Resizable sets kResizable on the clone.
// |type|: clone type, forwarded unchanged to VmCowPages::CreateCloneLocked.
// |offset|: byte offset into this VMO; must be page aligned.
// |size|: clone size in bytes; must be page aligned and at most MAX_SIZE.
// |copy_name|: when true, the clone's name_ is copied from this VMO.
// |child_vmo|: receives the new clone on success.
//
// The clone shares this VMO's hierarchy state and inherits kCanBlockOnPageRequests. Contiguous
// and uncached VMOs are checked for before the clone is created.
// Returns ZX_OK on success, or the status from CreateCloneLocked.
// NOTE(review): the error returns for the argument checks, and the call that adds the clone to
// this VMO's child list (described by the comment near the end), are not visible in this copy.
zx_status_t VmObjectPaged::CreateClone(Resizability resizable, CloneType type, uint64_t offset,
uint64_t size, bool copy_name,
fbl::RefPtr<VmObject>* child_vmo) {
LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size);
// Copy-on-write clones of contiguous VMOs do not have meaningful semantics, so forbid them.
if (is_contiguous()) {
// offset must be page aligned
if (!IS_PAGE_ALIGNED(offset)) {
// size must be page aligned and not too large.
if (!IS_PAGE_ALIGNED(size)) {
if (size > MAX_SIZE) {
uint32_t options = 0;
if (resizable == Resizability::Resizable) {
options |= kResizable;
if (can_block_on_page_requests()) {
options |= kCanBlockOnPageRequests;
fbl::AllocChecker ac;
auto vmo = fbl::AdoptRef<VmObjectPaged>(new (&ac) VmObjectPaged(options, hierarchy_state_ptr_));
if (!ac.check()) {
// Declare these prior to the guard so that any failure paths destroy these without holding
// the lock.
fbl::RefPtr<VmCowPages> clone_cow_pages;
Guard<CriticalMutex> guard{lock()};
// check that we're not uncached in some way
if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
DEBUG_ASSERT(vmo->cache_policy_ == ARCH_MMU_FLAG_CACHED);
zx_status_t status =
cow_pages_locked()->CreateCloneLocked(type, offset, size, &clone_cow_pages);
if (status != ZX_OK) {
return status;
// Now that everything has succeeded we can wire up cow pages references. VMO will be placed in
// the global list later once lock has been dropped.
vmo->cow_pages_ = ktl::move(clone_cow_pages);
// Install the parent.
vmo->parent_ = this;
// add the new vmo as a child before we do anything, since its
// dtor expects to find it in its parent's child list
if (copy_name) {
vmo->name_ = name_;
// Add to the global list now that fully initialized.
*child_vmo = ktl::move(vmo);
return ZX_OK;
// Prints a debug description of this VMO with printf: its address, user id, reference count and
// parent (address and user id), then its name if non-empty, then the VmCowPages dump.
//
// |depth|: nesting level. One indent string is printed per level before the "vmo" line and
//          depth + 1 before the "name" line.
// |verbose|: forwarded unchanged to VmCowPages::DumpLocked.
// NOTE(review): closing braces are missing from this copy, so the exact extent of the two
// indent loops should be confirmed against upstream.
void VmObjectPaged::DumpLocked(uint depth, bool verbose) const {
// parent_id stays 0 when there is no parent.
uint64_t parent_id = 0;
if (parent_) {
parent_id = parent_->user_id_locked();
for (uint i = 0; i < depth; ++i) {
printf(" ");
printf("vmo %p/k%" PRIu64 " ref %d parent %p/k%" PRIu64 "\n", this, user_id_, ref_count_debug(),
parent_, parent_id);
char name[ZX_MAX_NAME_LEN];
get_name(name, sizeof(name));
if (strlen(name) > 0) {
for (uint i = 0; i < depth + 1; ++i) {
printf(" ");
printf("name %s\n", name);
cow_pages_locked()->DumpLocked(depth, verbose);
// Returns the memory attributed to this VMO within [offset_bytes, offset_bytes + len_bytes).
//
// The range is first trimmed to the current size with TrimRange. Empty AttributionCounts are
// returned when trimming fails and when this VMO is a reference. For a query covering the whole
// VMO, the result is cached together with the hierarchy generation count and reused while that
// count is unchanged. Partial-range queries always go to the VmCowPages.
// NOTE(review): this method is const but assigns cached_memory_attribution_, which is presumably
// declared mutable (its declaration is not in this file's visible portion).
// NOTE(review): closing braces are missing from this copy.
VmObject::AttributionCounts VmObjectPaged::GetAttributedMemoryInRangeLocked(
uint64_t offset_bytes, uint64_t len_bytes) const {
uint64_t new_len_bytes;
if (!TrimRange(offset_bytes, len_bytes, size_locked(), &new_len_bytes)) {
return AttributionCounts{};
// A reference never has memory attributed to it. It points to the parent's VmCowPages, and we
// need to hold the invariant that we don't double-count attributed memory.
// TODO: Consider attributing memory to the current VmCowPages backlink for the case where the
// parent has gone away. (The tracking-bug reference is missing from this copy of the file.)
if (is_reference()) {
return AttributionCounts{};
// gen_count is only assigned on the whole-VMO path below; update_cached_attribution is only set
// on that same path and guards every later read of gen_count.
uint64_t gen_count;
bool update_cached_attribution = false;
// Use cached value if generation count has not changed since the last time we attributed memory.
// Only applicable for attribution over the entire VMO, not a partial range.
if (offset_bytes == 0 && new_len_bytes == size_locked()) {
gen_count = GetHierarchyGenerationCountLocked();
if (cached_memory_attribution_.generation_count == gen_count) {
return cached_memory_attribution_.attribution_counts;
} else {
update_cached_attribution = true;
AttributionCounts counts =
cow_pages_locked()->GetAttributedMemoryInRangeLocked(offset_bytes, new_len_bytes);
if (update_cached_attribution) {
// Cache attribution counts along with current generation count.
DEBUG_ASSERT(cached_memory_attribution_.generation_count != gen_count);
cached_memory_attribution_.generation_count = gen_count;
cached_memory_attribution_.attribution_counts = counts;
return counts;
zx_status_t VmObjectPaged::CommitRangeInternal(uint64_t offset, uint64_t len, bool pin,
bool write) {
LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len);
if (can_block_on_page_requests()) {
// We only expect write to be set if this a pin. All non-pin commits are reads.
DEBUG_ASSERT(!write || pin);
Guard<CriticalMutex> guard{lock()};
// Child slices of VMOs are currently not resizable, nor can they be made
// from resizable parents. If this ever changes, the logic surrounding what
// to do if a VMO gets resized during a Commit or Pin operation will need to
// be revisited. Right now, we can just rely on the fact that the initial
// vetting/trimming of the offset and length of the operation will never
// change if the operation is being executed against a child slice.
DEBUG_ASSERT(!is_resizable() || !is_slice());
// Round offset and len to be page aligned. Use a sub-scope to validate that temporary end
// calculations cannot be accidentally used later on.
uint64_t end;
if (add_overflow(offset, len, &end)) {
const uint64_t end_page = ROUNDUP_PAGE_SIZE(end);
if (end_page < end) {
DEBUG_ASSERT(end_page >= offset);
offset = ROUNDDOWN(offset, PAGE_SIZE);
len = end_page - offset;
// If a pin is requested the entire range must exist and be valid.
if (pin) {
// If pinning we explicitly forbid zero length pins as we cannot guarantee consistent semantics.
// For example pinning a zero length range outside the range of the VMO is an error, and so
// pinning a zero length range inside the vmo and then resizing the VMO smaller than the pin
// region should also be an error. To enforce this without having to have new metadata to track
// zero length pin regions is to just forbid them. Note that the user entry points for pinning
// already forbid zero length ranges.
if (unlikely(len == 0)) {
// verify that the range is within the object
if (unlikely(!InRange(offset, len, size_locked()))) {
} else {
// verify that the range is within the object
if (!InRange(offset, len, size_locked())) {
// was in range, just zero length
if (len == 0) {
return ZX_OK;
// Tracks the end of the pinned range to unpin in case of failure. The |offset| might lag behind
// the pinned range, as it tracks the range that has been completely processed, which would
// also include dirtying the page after pinning in case of a write.
uint64_t pinned_end_offset = offset;
// Should any errors occur we need to unpin everything. If we were asked to write, we need to mark
// the VMO modified if any pages were committed.
auto deferred_cleanup =
fit::defer([this, original_offset = offset, &offset, &len, &pinned_end_offset, pin, write]() {
// If we were not able to pin the entire range, i.e. len is not 0, we need to unpin
// everything. Regardless of any resizes or other things that may have happened any pinned
// pages *must* still be within a valid range, and so we know Unpin should succeed. The edge
// case is if we had failed to pin *any* pages and so our original offset may be outside the
// current range of the vmo. Additionally, as pinning a zero length range is invalid, so is
// unpinning, and so we must avoid.
if (pin && len > 0 && pinned_end_offset > original_offset) {
cow_pages_locked()->UnpinLocked(original_offset, pinned_end_offset - original_offset,
} else if (write && offset > original_offset) {
// Mark modified as we successfully committed pages for writing *and* we did not end up
// undoing a partial pin (the if-block above).
__UNINITIALIZED LazyPageRequest page_request;
// Convenience lambda to advance offset by processed_len, indicating that all pages in the range
// [offset, offset + processed_len) have been processed, then potentially wait on the page_request
// (if wait_on_page_request is set to true), and revalidate range checks after waiting.
auto advance_processed_range = [&](uint64_t processed_len,
bool wait_on_page_request) -> zx_status_t {
offset += processed_len;
len -= processed_len;
if (wait_on_page_request) {
zx_status_t wait_status = ZX_OK;
[&page_request, &wait_status]() mutable { wait_status = page_request->Wait(); });
if (wait_status != ZX_OK) {
if (wait_status == ZX_ERR_TIMED_OUT) {
DumpLocked(0, false);
return wait_status;
// Re-run the range checks, since size_ could have changed while we were blocked. This
// is not a failure, since the arguments were valid when the syscall was made. It's as
// if the commit was successful but then the pages were thrown away. Unless we are pinning,
// in which case pages being thrown away is explicitly an error.
if (pin) {
// verify that the range is within the object
if (unlikely(!InRange(offset, len, size_locked()))) {
} else {
uint64_t new_len = len;
if (!TrimRange(offset, len, size_locked(), &new_len)) {
// No remaining range to process. Set len to 0 so that the top level loop can exit.
len = 0;
return ZX_OK;
len = new_len;
return ZX_OK;
// As we may need to wait on arbitrary page requests we just keep running this as long as there is
// a non-zero range to process.
while (len > 0) {
uint64_t committed_len = 0;
zx_status_t commit_status =
cow_pages_locked()->CommitRangeLocked(offset, len, &committed_len, &page_request);
DEBUG_ASSERT(committed_len <= len);
// Now we can exit if we received any error states.
if (commit_status != ZX_OK && commit_status != ZX_ERR_SHOULD_WAIT) {
return commit_status;
// Handle the contiguous case separately because most of the following code (replacing with
// non-loaned pages and dirtying pages) does not apply to contiguous VMOs anyway. More
// importantly that code will cancel page requests if required. Contiguous VMOs are backed by a
// physical page provider which does not handle page request cancelation well, more specifically
// a page request regeneration after cancelation breaks the assumption of all processed page
// requests being unique. So avoid cancelation altogether, which is not needed for contiguous
// VMOs anyway, as the only page request type we can encounter here are read page requests. More
// details can be found in
if (is_contiguous()) {
// Pages owned by contiguous VMOs are by definition non-loaned, so we can directly pin any
// committed pages.
if (pin && committed_len > 0) {
// Verify that we are starting the pin after the previously pinned range, as we do not want
// to repeatedly pin the same pages.
ASSERT(pinned_end_offset == offset);
zx_status_t pin_status = cow_pages_locked()->PinRangeLocked(offset, committed_len);
if (pin_status != ZX_OK) {
return pin_status;
pinned_end_offset = offset + committed_len;
// Update how much was committed, and then wait on the page request (if any).
zx_status_t wait_status = advance_processed_range(
committed_len, /*wait_on_page_request=*/commit_status == ZX_ERR_SHOULD_WAIT);
if (wait_status != ZX_OK) {
return wait_status;
// Continue to the top of the while loop.
// We've already handled the contiguous case above.
// If we're required to pin, try to pin the committed range before waiting on the page_request,
// which has been populated to request pages beyond the committed range.
// Even though the page_request has already been initialized, we choose to first completely
// process the committed range, which could end up canceling the already initialized page
// request. This allows us to keep making forward progress as we will potentially pin a few
// pages before trying to fault in further pages, thereby preventing the already committed (and
// pinned) pages from being evicted while we wait with the lock dropped.
if (pin && committed_len > 0) {
// We need to replace any loaned pages in the committed range with non-loaned pages first,
// since pinning expects all pages to be non-loaned. Replacing loaned pages requires a page
// request too. At any time we'll only be able to wait on a single page request, and after the
// wait the conditions that resulted in the previous request might have changed, so we can
// just cancel and reuse the existing page_request.
uint64_t non_loaned_len = 0;
zx_status_t replace_status = cow_pages_locked()->ReplacePagesWithNonLoanedLocked(
offset, committed_len, &page_request, &non_loaned_len);
DEBUG_ASSERT(non_loaned_len <= committed_len);
if (replace_status == ZX_OK) {
DEBUG_ASSERT(non_loaned_len == committed_len);
} else if (replace_status != ZX_ERR_SHOULD_WAIT) {
return replace_status;
// We can safely pin the non-loaned range before waiting on the page request.
if (non_loaned_len > 0) {
// Verify that we are starting the pin after the previously pinned range, as we do not want
// to repeatedly pin the same pages.
ASSERT(pinned_end_offset == offset);
zx_status_t pin_status = cow_pages_locked()->PinRangeLocked(offset, non_loaned_len);
if (pin_status != ZX_OK) {
return pin_status;
// At this point we have successfully committed and pinned non_loaned_len.
uint64_t pinned_len = non_loaned_len;
pinned_end_offset = offset + pinned_len;
// If this is a write and the VMO supports dirty tracking, we also need to mark the pinned
// pages Dirty.
// We pin the pages first before marking them dirty in order to guarantee forward progress.
// Pinning the pages will prevent them from getting decommitted while we are waiting on the
// dirty page request without the lock held.
if (write && pinned_len > 0 && is_dirty_tracked_locked()) {
// Prepare the committed range for writing. We need a page request for this too, so cancel
// any existing one and reuse it.
// We want to dirty the entire pinned range.
uint64_t to_dirty_len = pinned_len;
while (to_dirty_len > 0) {
uint64_t dirty_len = 0;
zx_status_t write_status = cow_pages_locked()->PrepareForWriteLocked(
offset, to_dirty_len, &page_request, &dirty_len);
DEBUG_ASSERT(dirty_len <= to_dirty_len);
if (write_status != ZX_OK && write_status != ZX_ERR_SHOULD_WAIT) {
return write_status;
// Account for the pages that were dirtied during this attempt.
to_dirty_len -= dirty_len;
// At this point we have successfully committed, pinned, and dirtied dirty_len. This is
// where we need to restart the next call to PrepareForWriteLocked. Advance the offset to
// reflect that, and then wait on the page request beyond dirty_len (if any).
zx_status_t wait_status = advance_processed_range(
dirty_len, /*wait_on_page_request=*/write_status == ZX_ERR_SHOULD_WAIT);
if (wait_status != ZX_OK) {
return wait_status;
// Retry dirtying pages beyond dirty_len. Note that it is fine to resume the inner loop
// here and directly call PrepareForWriteLocked after advancing the offset because the
// pages were pinned previously, and so they could not have gotten decommitted while we
// waited on the page request.
if (write_status == ZX_ERR_SHOULD_WAIT) {
// Resume the loop that repeatedly calls PrepareForWriteLocked until all the pinned
// pages have been marked dirty.
} else {
// We did not need to perform any dirty tracking. So we can advance the offset over the
// pinned length. Now that we've dealt with all the pages in the non-loaned range, wait on
// the page request for offsets beyond (if any).
zx_status_t wait_status = advance_processed_range(
pinned_len, /*wait_on_page_request=*/replace_status == ZX_ERR_SHOULD_WAIT);
if (wait_status != ZX_OK) {
return wait_status;
// Since we dropped the lock while waiting, things might have changed, so reattempt
// committing beyond the length we had successfully pinned before waiting.
if (replace_status == ZX_ERR_SHOULD_WAIT) {
} else {
// We were either not required to pin, or committed_len was 0. We need to update how much was
// committed, and then wait on the page request (if any).
zx_status_t wait_status = advance_processed_range(
committed_len, /*wait_on_page_request=*/commit_status == ZX_ERR_SHOULD_WAIT);
if (wait_status != ZX_OK) {
return wait_status;
// After we're done waiting on the page request, we loop around with the same |offset| and
// |len|, so that we can reprocess the range populated by the page request, with another
// call to VmCowPages::CommitRangeLocked(). This is required to make any COW copies of pages
// that were just supplied.
// - The first call to VmCowPages::CommitRangeLocked() returns early from
// LookupCursor::RequireOwnedPage with ZX_ERR_SHOULD_WAIT after queueing a page request
// for the absent page.
// - The second call to VmCowPages::CommitRangeLocked() calls LookupCursor::RequireOwnedPage
// which copies out the now present page (if required).
if (commit_status == ZX_ERR_SHOULD_WAIT) {
// If commit was successful we should have no more to process.
DEBUG_ASSERT(commit_status != ZX_OK || len == 0);
return ZX_OK;
// Decommits the pages backing [offset, offset + len) of this VMO. Acquires the VMO lock and
// forwards the request to DecommitRangeLocked(), returning its status.
zx_status_t VmObjectPaged::DecommitRange(uint64_t offset, uint64_t len) {
LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len);
Guard<CriticalMutex> guard{lock()};
// Contiguous VMOs are special-cased when the PMM physical page borrowing config has loaning
// disabled.
// NOTE(review): the statement guarded by this check is not visible in this view; presumably the
// decommit is refused in that configuration - confirm.
if (is_contiguous() && !pmm_physical_page_borrowing_config()->is_loaning_enabled()) {
return DecommitRangeLocked(offset, len);
// Variant of DecommitRange() for callers that already hold the VMO lock (DecommitRange() calls this
// with its guard held). Delegates the decommit of [offset, offset + len) to the backing VmCowPages
// and returns its status unchanged.
zx_status_t VmObjectPaged::DecommitRangeLocked(uint64_t offset, uint64_t len) {
// Decommit of pages from a contiguous VMO relies on contiguous VMOs not being resizable.
DEBUG_ASSERT(!is_resizable() || !is_contiguous());
return cow_pages_locked()->DecommitRangeLocked(offset, len);
// Zeroes the bytes [zero_start_offset, zero_end_offset) of the single page located at
// |page_base_offset| in the VMO. The two zero offsets are relative to the start of that page and,
// per the assertions below, must satisfy zero_start_offset <= zero_end_offset <= PAGE_SIZE, with
// |page_base_offset| lying inside the VMO.
// |guard| is the guard through which the caller holds the VMO lock.
// Returns ZX_OK without touching the page if it would already read as zero; otherwise returns the
// status of the ReadWriteInternalLocked() call that performs the memset.
zx_status_t VmObjectPaged::ZeroPartialPageLocked(uint64_t page_base_offset,
uint64_t zero_start_offset,
uint64_t zero_end_offset,
Guard<CriticalMutex>* guard) {
DEBUG_ASSERT(zero_start_offset <= zero_end_offset);
DEBUG_ASSERT(zero_end_offset <= PAGE_SIZE);
DEBUG_ASSERT(page_base_offset < size_locked());
// TODO: Consider replacing this with a more appropriate generic API when one is available.
if (cow_pages_locked()->PageWouldReadZeroLocked(page_base_offset)) {
// This is already considered zero so no need to redundantly zero again.
return ZX_OK;
// Need to actually zero out bytes in the page.
// The operation is issued as a write (third argument is true) so that the copy routine is handed
// a destination pointer it may modify.
return ReadWriteInternalLocked(
page_base_offset + zero_start_offset, zero_end_offset - zero_start_offset, true,
[](void* dst, size_t offset, size_t len, Guard<CriticalMutex>* guard) -> zx_status_t {
// We're memsetting the *kernel* address of an allocated page, so we know that this
// cannot fault. memset may not be the most efficient, but we don't expect to be doing
// this very often.
memset(dst, 0, len);
return ZX_OK;
// Zeroes the byte range [offset, offset + len) of the VMO.
// An unaligned head and/or tail is zeroed in place with ZeroPartialPageLocked(); the remaining page
// aligned middle is handed to VmCowPages::ZeroPagesLocked(), waiting on its page request with the
// lock dropped whenever it reports ZX_ERR_SHOULD_WAIT. Because the lock can be dropped, the range
// and cache policy checks are repeated after each such step.
// Returns ZX_OK once the whole range has been processed, or the first failing status.
zx_status_t VmObjectPaged::ZeroRange(uint64_t offset, uint64_t len) {
if (can_block_on_page_requests()) {
Guard<CriticalMutex> guard{lock()};
// Zeroing a range behaves as if it were an efficient zx_vmo_write. As we cannot write to uncached
// vmo, we also cannot zero an uncached vmo.
if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
// Validate the length is in range of the vmo.
if (!InRange(offset, len, size_locked())) {
// Construct our initial range. Already checked the range above so we know it cannot overflow.
uint64_t start = offset;
uint64_t end = start + len;
// Helper that checks and establishes our invariants. We use this after calling functions that
// may have temporarily released the lock.
// |end| is captured by reference, so the check always uses the current end of the range.
auto establish_invariants = [this, &end]() TA_REQ(lock()) {
if (end > size_locked()) {
if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
return ZX_OK;
uint64_t start_page_base = ROUNDDOWN(start, PAGE_SIZE);
uint64_t end_page_base = ROUNDDOWN(end, PAGE_SIZE);
// Handle an unaligned start: zero from |start| to the end of its page (or to |end| if both fall
// on the same page).
if (unlikely(start_page_base != start)) {
// Need to handle the case where end is unaligned and on the same page as start
if (unlikely(start_page_base == end_page_base)) {
return ZeroPartialPageLocked(start_page_base, start - start_page_base, end - start_page_base,
zx_status_t status =
ZeroPartialPageLocked(start_page_base, start - start_page_base, PAGE_SIZE, &guard);
if (status == ZX_OK) {
status = establish_invariants();
if (status != ZX_OK) {
return status;
// The partial head page is done; continue from the next page boundary.
start = start_page_base + PAGE_SIZE;
// Handle an unaligned end: zero from the start of the last page up to |end|.
if (unlikely(end_page_base != end)) {
zx_status_t status = ZeroPartialPageLocked(end_page_base, 0, end - end_page_base, &guard);
if (status == ZX_OK) {
status = establish_invariants();
if (status != ZX_OK) {
return status;
// The partial tail page is done; the remaining range now ends on a page boundary.
end = end_page_base;
// Now that we have a page aligned range we can try hand over to the cow pages zero method.
// Currently we want ZeroPagesLocked() to not decommit any pages from a contiguous VMO. In debug
// we can assert that (not a super fast assert, but seems worthwhile; it's debug only).
uint64_t page_count_before = is_contiguous() ? cow_pages_locked()->DebugGetPageCountLocked() : 0;
// Runs on every exit path from here on; |start| is captured by reference so progress made by the
// loop below is observed.
auto mark_modified = fit::defer([this, original_start = start, &start]() {
if (start > original_start) {
// Mark modified since we wrote zeros.
// We might need a page request if the VMO is backed by a page source.
__UNINITIALIZED LazyPageRequest page_request;
while (start < end) {
uint64_t zeroed_len = 0;
zx_status_t status =
cow_pages_locked()->ZeroPagesLocked(start, end, &page_request, &zeroed_len);
if (status == ZX_ERR_SHOULD_WAIT) {
// Wait for the page request with the lock released; |status| receives the wait result.
guard.CallUnlocked([&status, &page_request]() { status = page_request->Wait(); });
if (status != ZX_OK) {
if (status == ZX_ERR_TIMED_OUT) {
DumpLocked(0, false);
return status;
// We dropped the lock while waiting. Check the invariants again.
status = establish_invariants();
if (status != ZX_OK) {
return status;
} else if (status != ZX_OK) {
return status;
// Advance over pages that had already been zeroed.
start += zeroed_len;
if (is_contiguous()) {
uint64_t page_count_after = cow_pages_locked()->DebugGetPageCountLocked();
DEBUG_ASSERT(page_count_after == page_count_before);
return ZX_OK;
// Resizes the VMO to |s| bytes.
// The VMO must be resizable, and |s| is checked for page alignment and against MAX_SIZE before the
// lock is taken. The resize itself is performed by VmCowPages::ResizeLocked().
// Returns the status of ResizeLocked() if it fails, ZX_OK on success.
// NOTE(review): the statuses returned by the three early checks are not visible in this view.
zx_status_t VmObjectPaged::Resize(uint64_t s) {
LTRACEF("vmo %p, size %" PRIu64 "\n", this, s);
// A VMO is never both contiguous and resizable.
DEBUG_ASSERT(!is_contiguous() || !is_resizable());
// Also rejects contiguous VMOs.
if (!is_resizable()) {
// ensure the size is valid and that we will not wrap.
if (!IS_PAGE_ALIGNED(s)) {
if (s > MAX_SIZE) {
Guard<CriticalMutex> guard{lock()};
zx_status_t status = cow_pages_locked()->ResizeLocked(s);
if (status != ZX_OK) {
return status;
// We were able to successfully resize. Mark as modified.
return ZX_OK;
// Perform some sort of copy in/out on the range [offset, offset + len) of the object using a
// passed in callable for the copy routine.
// The copy routine is invoked as copyfunc(page_ptr, dest_offset, len, guard) -> zx_status_t, where
// page_ptr is the kernel (physmap) address of the first byte to copy within the current page,
// dest_offset is the number of bytes already copied by this call, len is the number of bytes to
// copy for this page, and guard is the guard passed to this function. The passed in guard may have
// its CallUnlocked member used, but if it does then ZX_OK must not be the return value. A return of
// ZX_ERR_SHOULD_WAIT implies that the attempted copy should be tried again at the exact same
// offsets.
// |write| is forwarded to the page lookup, and |options| may contain TrimLength, in which case a
// range that extends past the end of the VMO is clamped to the VMO size.
template <typename T>
zx_status_t VmObjectPaged::ReadWriteInternalLocked(uint64_t offset, size_t len, bool write,
VmObjectReadWriteOptions options, T copyfunc,
Guard<CriticalMutex>* guard) {
uint64_t end_offset;
if (add_overflow(offset, len, &end_offset)) {
// Declare a lambda that will check any object properties we require to be true and, if can_trim
// is set, reduce the requested length if it exceeds the VMO size. We place these in a lambda
// so that we can perform them any time the lock is dropped.
const bool can_trim = !!(options & VmObjectReadWriteOptions::TrimLength);
auto check_and_trim = [this, can_trim, &end_offset]() -> zx_status_t {
if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
const uint64_t size = size_locked();
if (end_offset > size) {
if (can_trim) {
end_offset = size;
} else {
return ZX_OK;
// Perform initial check.
if (zx_status_t status = check_and_trim(); status != ZX_OK) {
return status;
// Track our two offsets.
// src_offset is the current position within the VMO; dest_offset is the current position within
// the caller's buffer and doubles as the count of bytes copied so far.
uint64_t src_offset = offset;
size_t dest_offset = 0;
// Runs on every exit path; |dest_offset| is captured by reference so the final progress is seen.
auto mark_modified = fit::defer([this, &dest_offset, write]() {
if (write && dest_offset > 0) {
// We wrote something, so mark as modified.
// The PageRequest is a non-trivial object so we declare it outside the loop to avoid having to
// construct and deconstruct it each iteration. It is tolerant of being reused and will
// reinitialize itself if needed.
__UNINITIALIZED LazyPageRequest page_request;
// Outer loop: (re)acquire a lookup cursor covering all pages that remain to be copied.
while (src_offset < end_offset) {
const size_t first_page_offset = ROUNDDOWN(src_offset, PAGE_SIZE);
const size_t last_page_offset = ROUNDDOWN(end_offset - 1, PAGE_SIZE);
size_t remaining_pages = (last_page_offset - first_page_offset) / PAGE_SIZE + 1;
size_t pages_since_last_unlock = 0;
__UNINITIALIZED zx::result<VmCowPages::LookupCursor> cursor =
GetLookupCursorLocked(first_page_offset, remaining_pages * PAGE_SIZE);
if (cursor.is_error()) {
return cursor.status_value();
// Performing explicit accesses by request of the user, so disable zero forking.
// Inner loop: process one page per iteration using the current cursor.
while (remaining_pages > 0) {
// If we need to wait on pages then we would like to wait on as many as possible, up to the
// actual limit of the read/write operation. As we would otherwise have to wait for all pages
// before resuming the copy, cap the maximum number to limit the latency before we start
// making progress.
constexpr uint64_t kMaxWaitPages = 16;
const uint64_t max_waitable_pages = ktl::min(remaining_pages, kMaxWaitPages);
// Attempt to lookup a page
__UNINITIALIZED zx::result<VmCowPages::LookupCursor::RequireResult> result =
cursor->RequirePage(write, static_cast<uint>(max_waitable_pages), &page_request);
zx_status_t status = result.status_value();
if (status == ZX_ERR_SHOULD_WAIT) {
// Wait for the page request with the lock released; |status| receives the wait result.
guard->CallUnlocked([&status, &page_request]() { status = page_request->Wait(); });
if (status != ZX_OK) {
if (status == ZX_ERR_TIMED_OUT) {
DumpLocked(0, false);
return status;
// Recheck properties and if all is good go back to the top of the outer loop to attempt
// to acquire a fresh cursor and try again.
status = check_and_trim();
if (status == ZX_OK) {
if (status != ZX_OK) {
return status;
const paddr_t pa = result->page->paddr();
const size_t page_offset = src_offset % PAGE_SIZE;
// Copy up to the end of this page, or up to the end of the requested range if that is sooner.
const size_t tocopy = ktl::min(PAGE_SIZE - page_offset, end_offset - src_offset);
// Compute the kernel mapping of this page.
char* page_ptr = reinterpret_cast<char*>(paddr_to_physmap(pa));
// Call the copy routine. If the copy was successful then ZX_OK is returned, otherwise
// ZX_ERR_SHOULD_WAIT may be returned to indicate the copy failed but we can retry it.
status = copyfunc(page_ptr + page_offset, dest_offset, tocopy, guard);
if (status == ZX_ERR_SHOULD_WAIT) {
// Although we can retry, as the lock was dropped we must re-check any properties, and then
// if all is good go back to the top of the outer loop to attempt to acquire a fresh cursor
// and try again.
status = check_and_trim();
if (status == ZX_OK) {
return status;
if (status != ZX_OK) {
return status;
// Advance the copy location.
src_offset += tocopy;
dest_offset += tocopy;
// Periodically yield the lock in order to allow other read or write
// operations to advance sooner than they otherwise would.
constexpr size_t kPagesBetweenUnlocks = 16;
if (unlikely(++pages_since_last_unlock == kPagesBetweenUnlocks)) {
pages_since_last_unlock = 0;
if (guard->lock()->IsContested()) {
// Just drop the lock and re-acquire it. There is no need to yield.
// Since the lock is contested, the empty |CallUnlocked| will:
// 1. Immediately grant the lock to another thread. This thread may
// continue running until #3, or it may be descheduled.
// 2. Run the empty lambda.
// 3. Attempt to re-acquire the lock. There are 3 possibilities:
// 3a. Mutex is owned by the other thread, and is contested (there
// are more waiters besides the other thread). This thread will
// immediately block on the Mutex.
// 3b. Mutex is owned by the other thread, and uncontested. This
// thread will spin on the Mutex, and block after some time.
// 3c. Mutex is un-owned. This thread will immediately own the
// Mutex again and continue running.
// Thus, there is no danger of thrashing here. The other thread will
// always get the Mutex, even without an explicit yield.
guard->CallUnlocked([]() {});
// The lock was dropped, so the object properties must be validated again.
status = check_and_trim();
if (status == ZX_OK) {
return status;
return ZX_OK;
// Copies |len| bytes starting at |offset| in the VMO into the kernel buffer |_ptr|.
// The copy is driven by ReadWriteInternalLocked() (write = false, no options) with a plain memcpy
// as the per-page copy routine; its status is returned.
zx_status_t VmObjectPaged::Read(void* _ptr, uint64_t offset, size_t len) {
// test to make sure this is a kernel pointer
if (!is_kernel_address(reinterpret_cast<vaddr_t>(_ptr))) {
DEBUG_ASSERT_MSG(0, "non kernel pointer passed\n");
// read routine that just uses a memcpy
// |offset| here is the position within the destination buffer, |src| the page bytes to copy from.
char* ptr = reinterpret_cast<char*>(_ptr);
auto read_routine = [ptr](const void* src, size_t offset, size_t len,
Guard<CriticalMutex>* guard) -> zx_status_t {
memcpy(ptr + offset, src, len);
return ZX_OK;
if (can_block_on_page_requests()) {
Guard<CriticalMutex> guard{lock()};
return ReadWriteInternalLocked(offset, len, false, VmObjectReadWriteOptions::None, read_routine,
// Copies |len| bytes from the kernel buffer |_ptr| into the VMO starting at |offset|.
// The copy is driven by ReadWriteInternalLocked() (write = true, no options) with a plain memcpy
// as the per-page copy routine; its status is returned.
zx_status_t VmObjectPaged::Write(const void* _ptr, uint64_t offset, size_t len) {
// test to make sure this is a kernel pointer
if (!is_kernel_address(reinterpret_cast<vaddr_t>(_ptr))) {
DEBUG_ASSERT_MSG(0, "non kernel pointer passed\n");
// write routine that just uses a memcpy
// |offset| here is the position within the source buffer, |dst| the page bytes to copy into.
const char* ptr = reinterpret_cast<const char*>(_ptr);
auto write_routine = [ptr](void* dst, size_t offset, size_t len,
Guard<CriticalMutex>* guard) -> zx_status_t {
memcpy(dst, ptr + offset, len);
return ZX_OK;
if (can_block_on_page_requests()) {
Guard<CriticalMutex> guard{lock()};
return ReadWriteInternalLocked(offset, len, true, VmObjectReadWriteOptions::None, write_routine,
// Performs the cache operation |type| on the byte range [offset, offset + len) of the VMO.
// Each page reported by VmCowPages::LookupReadableLocked() has CacheOpPhys() applied to the part of
// it that intersects the requested range. Returns the status of LookupReadableLocked().
zx_status_t VmObjectPaged::CacheOp(uint64_t offset, uint64_t len, CacheOpType type) {
// A zero length request is special-cased before taking the lock.
if (unlikely(len == 0)) {
Guard<CriticalMutex> guard{lock()};
// verify that the range is within the object
if (unlikely(!InRange(offset, len, size_locked()))) {
// This cannot overflow as we already checked the range.
const uint64_t end_offset = offset + len;
// For syncing instruction caches there may be work that is more efficient to batch together, and
// so we use an abstract consistency manager to optimize it for the given architecture.
ArchVmICacheConsistencyManager sync_cm;
return cow_pages_locked()->LookupReadableLocked(
offset, len, [&sync_cm, offset, end_offset, type](uint64_t page_offset, paddr_t pa) {
// This cannot overflow due to the maximum possible size of a VMO.
const uint64_t page_end = page_offset + PAGE_SIZE;
// Determine our start and end in terms of vmo offset
// This clamps the page to the requested range, which matters for unaligned first/last pages.
const uint64_t start = ktl::max(page_offset, offset);
const uint64_t end = ktl::min(end_offset, page_end);
// Translate to inter-page offset
DEBUG_ASSERT(start >= page_offset);
const uint64_t op_start_offset = start - page_offset;
DEBUG_ASSERT(op_start_offset < PAGE_SIZE);
DEBUG_ASSERT(end > start);
const uint64_t op_len = end - start;
CacheOpPhys(pa + op_start_offset, op_len, type, sync_cm);
// Ask the lookup to continue with the next page.
return ZX_ERR_NEXT;
// Runs |lookup_fn| over the range [offset, offset + len) by forwarding to
// VmCowPages::LookupLocked() with the VMO lock held, and returns its status.
zx_status_t VmObjectPaged::Lookup(uint64_t offset, uint64_t len,
VmObject::LookupFunction lookup_fn) {
// A zero length request is special-cased before taking the lock.
if (unlikely(len == 0)) {
Guard<CriticalMutex> guard{lock()};
return cow_pages_locked()->LookupLocked(offset, len, ktl::move(lookup_fn));
// Looks up the physical address backing [offset, offset + len), requiring the backing pages to be
// physically contiguous.
// |offset| must be page aligned and |len| non-zero, and the range must lie within the VMO. Lengths
// other than PAGE_SIZE are only accepted for contiguous VMOs.
// On success returns ZX_OK and, if |out_paddr| is non-null, stores the physical address of the
// first page seen in the range.
zx_status_t VmObjectPaged::LookupContiguous(uint64_t offset, uint64_t len, paddr_t* out_paddr) {
if (unlikely(len == 0 || !IS_PAGE_ALIGNED(offset))) {
Guard<CriticalMutex> guard{lock()};
if (unlikely(!InRange(offset, len, size_locked()))) {
if (unlikely(!is_contiguous() && (len != PAGE_SIZE))) {
// Multi-page lookup only supported for contiguous VMOs.
// Verify that all pages are present, and assert that the present pages are contiguous since we
// only support len > PAGE_SIZE for contiguous VMOs.
bool page_seen = false;
uint64_t first_offset = 0;
paddr_t first_paddr = 0;
uint64_t count = 0;
// This has to work for child slices with non-zero parent_offset_ also, which means even if all
// pages are present, the first cur_offset can be offset + parent_offset_.
zx_status_t status = cow_pages_locked()->LookupLocked(
offset, len,
[&page_seen, &first_offset, &first_paddr, &count](uint64_t cur_offset, paddr_t pa) mutable {
// Remember the first page reported so that later pages can be checked relative to it.
if (!page_seen) {
first_offset = cur_offset;
first_paddr = pa;
page_seen = true;
// Every page must sit at the same distance from the first page physically as it does by offset.
ASSERT(first_paddr + (cur_offset - first_offset) == pa);
return ZX_ERR_NEXT;
ASSERT(status == ZX_OK);
// NOTE(review): no increment of |count| is visible in the callback in this view; confirm it is
// bumped once per page, since it is compared against the number of pages in the range here.
if (count != len / PAGE_SIZE) {
if (out_paddr) {
*out_paddr = first_paddr;
return ZX_OK;
// Copies |len| bytes starting at |offset| in the VMO out to the user buffer |ptr|.
// |options| is forwarded to ReadWriteInternalLocked(). If |out_actual| is non-null it is reset to
// zero up front and then incremented by the size of every chunk that is copied successfully, so it
// reports partial progress even when an error is returned.
// Returns the status of ReadWriteInternalLocked().
zx_status_t VmObjectPaged::ReadUser(user_out_ptr<char> ptr, uint64_t offset, size_t len,
VmObjectReadWriteOptions options, size_t* out_actual) {
if (out_actual != nullptr) {
*out_actual = 0;
// read routine that uses copy_to_user
// |offset| here is the position within the user buffer, |src| the page bytes to copy from.
auto read_routine = [ptr, out_actual](const char* src, size_t offset, size_t len,
Guard<CriticalMutex>* guard) -> zx_status_t {
__UNINITIALIZED auto copy_result =
ptr.byte_offset(offset).copy_array_to_user_capture_faults(src, len);
// If a fault has actually occurred, then we will have captured fault info that we can use to
// handle the fault.
if (copy_result.fault_info.has_value()) {
zx_status_t result;
// The fault is resolved with the VMO lock released.
guard->CallUnlocked([&info = *copy_result.fault_info, &result] {
result = Thread::Current::SoftFault(info.pf_va, info.pf_flags);
// If we handled the fault, tell the upper level to try again.
return result == ZX_OK ? ZX_ERR_SHOULD_WAIT : result;
// If we encounter _any_ unrecoverable error from the copy operation which
// produced no fault address, squash the error down to just "NOT_FOUND".
// This is what the SoftFault error would have told us if we did try to
// handle the fault and could not.
if (copy_result.status != ZX_OK) {
// The chunk was copied; account for it in the caller visible byte count.
if (out_actual != nullptr) {
*out_actual += len;
return ZX_OK;
if (can_block_on_page_requests()) {
Guard<CriticalMutex> guard{lock()};
return ReadWriteInternalLocked(offset, len, false, options, read_routine, &guard);
// Copies |len| bytes from the user buffer |ptr| into the VMO starting at |offset|.
// |options| is forwarded to ReadWriteInternalLocked(). If |out_actual| is non-null it is reset to
// zero up front and then incremented by the size of every chunk that is copied successfully.
// If |on_bytes_transferred| is set it is invoked after each successful chunk with the VMO offset
// of the chunk and its length.
// Returns the status of ReadWriteInternalLocked().
zx_status_t VmObjectPaged::WriteUser(user_in_ptr<const char> ptr, uint64_t offset, size_t len,
VmObjectReadWriteOptions options, size_t* out_actual,
const OnWriteBytesTransferredCallback& on_bytes_transferred) {
if (out_actual != nullptr) {
*out_actual = 0;
// write routine that uses copy_from_user
// |offset| here is the position within the user buffer; the starting VMO offset is captured
// separately as base_vmo_offset so the callback can be given VMO relative offsets.
auto write_routine = [ptr, base_vmo_offset = offset, out_actual, &on_bytes_transferred](
char* dst, size_t offset, size_t len,
Guard<CriticalMutex>* guard) -> zx_status_t {
__UNINITIALIZED auto copy_result =
ptr.byte_offset(offset).copy_array_from_user_capture_faults(dst, len);
// If a fault has actually occurred, then we will have captured fault info that we can use to
// handle the fault.
if (copy_result.fault_info.has_value()) {
zx_status_t result;
// The fault is resolved with the VMO lock released.
guard->CallUnlocked([&info = *copy_result.fault_info, &result] {
result = Thread::Current::SoftFault(info.pf_va, info.pf_flags);
// If we handled the fault, tell the upper level to try again.
return result == ZX_OK ? ZX_ERR_SHOULD_WAIT : result;
// If we encounter _any_ unrecoverable error from the copy operation which
// produced no fault address, squash the error down to just "NOT_FOUND".
// This is what the SoftFault error would have told us if we did try to
// handle the fault and could not.
if (copy_result.status != ZX_OK) {
// The chunk was copied; account for it and notify the caller supplied callback.
if (out_actual != nullptr) {
*out_actual += len;
if (on_bytes_transferred) {
on_bytes_transferred(base_vmo_offset + offset, len);
return ZX_OK;
if (can_block_on_page_requests()) {
Guard<CriticalMutex> guard{lock()};
return ReadWriteInternalLocked(offset, len, true, options, write_routine, &guard);
// Takes the pages in [offset, offset + len) out of the VMO and places them in |pages|.
// |pages| is overwritten with a fresh splice list for the range before any pages are taken. The
// range is processed in pieces by VmCowPages::TakePagesLocked(); whenever it reports
// ZX_ERR_SHOULD_WAIT the page request is waited on with the lock dropped and the remainder retried.
// Returns ZX_OK when the full range has been taken, otherwise the first failing status.
zx_status_t VmObjectPaged::TakePages(uint64_t offset, uint64_t len, VmPageSpliceList* pages) {
// TODO: Check that the region is locked once locking is implemented
// Contiguous VMOs are special-cased up front.
if (is_contiguous()) {
// Initialize the splice list to the right size.
*pages = VmPageSpliceList(offset, len, 0);
__UNINITIALIZED LazyPageRequest page_request;
while (len > 0) {
// The lock is taken afresh on every iteration of the loop.
Guard<CriticalMutex> guard{lock()};
uint64_t taken_len = 0;
zx_status_t status =
cow_pages_locked()->TakePagesLocked(offset, len, pages, &taken_len, &page_request);
if (status != ZX_ERR_SHOULD_WAIT && status != ZX_OK) {
return status;
// We would only have failed to take anything if status was not ZX_OK, which in this case
// would be ZX_ERR_SHOULD_WAIT as that is the only non-OK status we can reach here with.
DEBUG_ASSERT(taken_len > 0 || status == ZX_ERR_SHOULD_WAIT);
// We should have taken the entire range requested if the status was ZX_OK.
DEBUG_ASSERT(status != ZX_OK || taken_len == len);
// We should not have taken any more than the requested range.
DEBUG_ASSERT(taken_len <= len);
// Record the completed portion.
len -= taken_len;
offset += taken_len;
if (status == ZX_ERR_SHOULD_WAIT) {
// Wait for the page request with the lock released; |status| receives the wait result.
guard.CallUnlocked([&page_request, &status] { status = page_request->Wait(); });
if (status != ZX_OK) {
return status;
return ZX_OK;
// Supplies the pages held in |pages| to the range [offset, offset + len) of the VMO.
// The range is processed in pieces by VmCowPages::SupplyPagesLocked(), which also receives
// |options|; whenever it reports ZX_ERR_SHOULD_WAIT the page request is waited on with the lock
// dropped and the remainder retried.
// Returns ZX_OK when the full range has been supplied, otherwise the first failing status.
zx_status_t VmObjectPaged::SupplyPages(uint64_t offset, uint64_t len, VmPageSpliceList* pages,
SupplyOptions options) {
// We need this check here instead of in SupplyPagesLocked, as we do use that
// function to provide pages to contiguous VMOs as well.
if (is_contiguous()) {
__UNINITIALIZED LazyPageRequest page_request;
while (len > 0) {
// The lock is taken afresh on every iteration of the loop.
Guard<CriticalMutex> guard{lock()};
uint64_t supply_len = 0;
zx_status_t status = cow_pages_locked()->SupplyPagesLocked(offset, len, pages, options,
&supply_len, &page_request);
if (status != ZX_ERR_SHOULD_WAIT && status != ZX_OK) {
return status;
// We would only have failed to supply anything if status was not ZX_OK, which in this case
// would be ZX_ERR_SHOULD_WAIT as that is the only non-OK status we can reach here with.
DEBUG_ASSERT(supply_len > 0 || status == ZX_ERR_SHOULD_WAIT);
// We should have supplied the entire range requested if the status was ZX_OK.
DEBUG_ASSERT(status != ZX_OK || supply_len == len);
// We should not have supplied any more than the requested range.
DEBUG_ASSERT(supply_len <= len);
// Record the completed portion.
offset += supply_len;
len -= supply_len;
if (status == ZX_ERR_SHOULD_WAIT) {
// Wait for the page request with the lock released; |status| receives the wait result.
guard.CallUnlocked([&page_request, &status] { status = page_request->Wait(); });
if (status != ZX_OK) {
return status;
return ZX_OK;
// Marks the pages in [offset, offset + len) dirty by calling VmCowPages::DirtyPagesLocked(),
// repeating the call for as long as it reports ZX_ERR_SHOULD_WAIT and the intervening wait on the
// page request succeeds.
// Returns the failing wait status if a wait fails, otherwise the final status of
// DirtyPagesLocked().
zx_status_t VmObjectPaged::DirtyPages(uint64_t offset, uint64_t len) {
zx_status_t status;
// It is possible to encounter delayed PMM allocations, which requires waiting on the
// page_request.
__UNINITIALIZED LazyPageRequest page_request;
Guard<CriticalMutex> guard{lock()};
// Initialize a list of allocated pages that DirtyPagesLocked will allocate any new pages into
// before inserting them in the VMO. Allocated pages can therefore be shared across multiple calls
// to DirtyPagesLocked. Instead of having to allocate and free pages in case DirtyPagesLocked
// cannot successfully dirty the entire range atomically, we can just hold on to the allocated
// pages and use them for the next call. This ensures that we are making forward progress with
// each successive call to DirtyPagesLocked.
list_node alloc_list;
// Any pages still on the list when the function exits are handed back via FreePagesLocked().
auto alloc_list_cleanup = fit::defer([&alloc_list, this]() -> void {
if (!list_is_empty(&alloc_list)) {
cow_pages_locked()->FreePagesLocked(&alloc_list, true);
do {
status = cow_pages_locked()->DirtyPagesLocked(offset, len, &alloc_list, &page_request);
if (status == ZX_ERR_SHOULD_WAIT) {
// A separate variable holds the wait result so that |status| still drives the loop condition.
zx_status_t wait_status;
guard.CallUnlocked([&page_request, &wait_status]() { wait_status = page_request->Wait(); });
if (wait_status != ZX_OK) {
return wait_status;
// If the wait was successful, loop around and try the call again, which will re-validate any
// state that might have changed when the lock was dropped.
} while (status == ZX_ERR_SHOULD_WAIT);
return status;
// Sets the cache policy used for mappings of this VMO to |cache_policy|.
// The value is checked against ZX_CACHE_POLICY_MASK and the change is subject to the conditions
// listed below. When moving away from ARCH_MMU_FLAG_CACHED, every page reported by
// VmCowPages::LookupLocked() is cleaned and invalidated through its physmap address before the new
// policy is stored.
// Returns ZX_OK once the policy has been stored, or the failing status of the page lookup.
// NOTE(review): the statuses returned when one of the precondition checks trips are not visible in
// this view.
zx_status_t VmObjectPaged::SetMappingCachePolicy(const uint32_t cache_policy) {
// Is it a valid cache flag?
if (cache_policy & ~ZX_CACHE_POLICY_MASK) {
Guard<CriticalMutex> guard{lock()};
// conditions for allowing the cache policy to be set:
// 1) vmo either has no pages committed currently or is transitioning from being cached
// 2) vmo has no pinned pages
// 3) vmo has no mappings
// 4) vmo has no children
// 5) vmo is not a child
// Counting attributed memory does a sufficient job of checking for committed pages since we also
// require no children and no parent, so attribution == precisely our pages.
if (cow_pages_locked()->GetAttributedMemoryInRangeLocked(0, size_locked()) !=
AttributionCounts{} &&
cache_policy_ != ARCH_MMU_FLAG_CACHED) {
// We forbid transitioning committed pages from any kind of uncached->cached policy as we do
// not currently have a story for dealing with the speculative loads that may have happened
// against the cached physmap. That is, whilst a page was uncached the cached physmap version
// may have been loaded and sitting in cache. If we switch to cached mappings we may then use
// stale data out of the cache.
// This isn't a problem if going *from* a cached state, as we can safely clean+invalidate.
// Similarly it's not a problem if there aren't actually any committed pages.
// Conditions 2) through 5) from the list above.
if (cow_pages_locked()->pinned_page_count_locked() > 0) {
if (!mapping_list_.is_empty()) {
if (!children_list_.is_empty()) {
if (parent_) {
// Forbid if there are references, or if this object is a reference itself. We do not want cache
// policies to diverge across references. Note that this check is required in addition to the
// children_list_ and parent_ check, because it is possible for a non-reference parent to go away,
// which will trigger the election of a reference as the new owner for the remaining
// reference_list_, and also reset the parent_.
if (!reference_list_.is_empty()) {
if (is_reference()) {
// If transitioning from a cached policy we must clean/invalidate all the pages as the kernel may
// have written to them on behalf of the user.
if (cache_policy_ == ARCH_MMU_FLAG_CACHED && cache_policy != ARCH_MMU_FLAG_CACHED) {
// No need to perform clean/invalidate if size is zero because there can be no pages.
if (size_locked() > 0) {
zx_status_t status = cow_pages_locked()->LookupLocked(
0, size_locked(), [](uint64_t offset, paddr_t pa) mutable {
arch_clean_invalidate_cache_range((vaddr_t)paddr_to_physmap(pa), PAGE_SIZE);
return ZX_ERR_NEXT;
if (status != ZX_OK) {
return status;
cache_policy_ = cache_policy;
return ZX_OK;
// Applies the range change |op| for [offset, offset + len) to every mapping of this VMO and then
// to every VMO in reference_list_.
// Mappings receive the range expanded outward to page boundaries; references receive the original
// |offset| and |len| and perform their own alignment on recursion.
// An unrecognized |op| encountered while walking the mappings panics.
void VmObjectPaged::RangeChangeUpdateLocked(uint64_t offset, uint64_t len, RangeChangeOp op) {
// offsets for vmos needn't be aligned, but vmars use aligned offsets
const uint64_t aligned_offset = ROUNDDOWN(offset, PAGE_SIZE);
const uint64_t aligned_len = ROUNDUP(offset + len, PAGE_SIZE) - aligned_offset;
for (auto& m : mapping_list_) {
if (op == RangeChangeOp::Unmap) {
m.AspaceUnmapLockedObject(aligned_offset, aligned_len);
} else if (op == RangeChangeOp::RemoveWrite) {
m.AspaceRemoveWriteLockedObject(aligned_offset, aligned_len);
} else if (op == RangeChangeOp::DebugUnpin) {
m.AspaceDebugUnpinLockedObject(aligned_offset, aligned_len);
} else {
panic("Unknown RangeChangeOp %d\n", static_cast<int>(op));
// Propagate the change to reference children as well.
for (auto& ref : reference_list_) {
// Use the same offset and len. References span the entirety of the parent VMO and hence share
// all offsets.
ref.RangeChangeUpdateLocked(offset, len, op);
// Locks the range [offset, offset + len) by forwarding to VmCowPages::LockRangeLocked() with the
// VMO lock held; |lock_state_out| is passed through for it to fill in. Returns its status.
// Non-discardable VMOs are special-cased before the lock is taken.
zx_status_t VmObjectPaged::LockRange(uint64_t offset, uint64_t len,
zx_vmo_lock_state_t* lock_state_out) {
if (!is_discardable()) {
Guard<CriticalMutex> guard{lock()};
return cow_pages_locked()->LockRangeLocked(offset, len, lock_state_out);
// Attempts to lock the range [offset, offset + len) by forwarding to
// VmCowPages::TryLockRangeLocked() with the VMO lock held. Returns its status.
// Non-discardable VMOs are special-cased before the lock is taken.
zx_status_t VmObjectPaged::TryLockRange(uint64_t offset, uint64_t len) {
if (!is_discardable()) {
Guard<CriticalMutex> guard{lock()};
return cow_pages_locked()->TryLockRangeLocked(offset, len);
// Unlocks the range [offset, offset + len) by forwarding to VmCowPages::UnlockRangeLocked() with
// the VMO lock held. Returns its status.
// Non-discardable VMOs are special-cased before the lock is taken.
zx_status_t VmObjectPaged::UnlockRange(uint64_t offset, uint64_t len) {
if (!is_discardable()) {
Guard<CriticalMutex> guard{lock()};
return cow_pages_locked()->UnlockRangeLocked(offset, len);
// Looks up the single page at |offset| using a lookup cursor spanning one page.
// |pf_flags| controls the lookup: VMM_PF_FLAG_WRITE selects a lookup for writing, and when none of
// the VMM_PF_FLAG_FAULT_MASK bits are set the page is only obtained through MaybePage(); otherwise
// RequirePage() is used and |page_request| is handed to it.
// On success returns ZX_OK and stores the page in |page| and its physical address in |pa|; either
// output pointer may be null. Cursor and lookup errors are returned as is.
// NOTE(review): |alloc_list| is not referenced in the part of the body visible here - confirm it is
// intentionally unused.
zx_status_t VmObjectPaged::GetPage(uint64_t offset, uint pf_flags, list_node* alloc_list,
LazyPageRequest* page_request, vm_page_t** page, paddr_t* pa) {
Guard<CriticalMutex> guard{lock()};
const bool write = pf_flags & VMM_PF_FLAG_WRITE;
zx::result<VmCowPages::LookupCursor> cursor = GetLookupCursorLocked(offset, PAGE_SIZE);
if (cursor.is_error()) {
return cursor.error_value();
// Hardware faults are considered to update access times separately, all other lookup reasons
// should do the default update of access time.
if (pf_flags & VMM_PF_FLAG_HW_FAULT) {
// No fault flags: only report a page obtainable through MaybePage().
if (!(pf_flags & VMM_PF_FLAG_FAULT_MASK)) {
vm_page_t* p = cursor->MaybePage(write);
if (!p) {
if (page) {
*page = p;
if (pa) {
*pa = p->paddr();
return ZX_OK;
// NOTE(review): ReadWriteInternalLocked() passes a page count as the second argument of
// RequirePage(); PAGE_SIZE is passed here - confirm the intended unit.
auto result = cursor->RequirePage(write, PAGE_SIZE, page_request);
if (result.is_error()) {
return result.error_value();
if (page) {
*page = result->page;
if (pa) {
*pa = result->page->paddr();
return ZX_OK;