[kernel][vm] Improve efficiency of bidirectional clones

This change reduces the number of page allocations done by bidirectional
clones so that they are never less efficient than simply memcpying into
a new VMO. This is done by moving pages out of hidden VMOs when writing
to one of their children if the page is not accessible to the other
child.
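
For context, the bidirectional clones in question are the copy-on-write
children created via zx_vmo_create_child(). A minimal sketch of the sequence
this change optimizes (illustrative only; it assumes the
ZX_VMO_CHILD_COPY_ON_WRITE option and elides error checking):

    zx_handle_t vmo, clone;
    uint8_t byte = 1;
    zx_vmo_create(ZX_PAGE_SIZE, 0, &vmo);
    zx_vmo_write(vmo, &byte, 0, sizeof(byte));   // commit a page
    zx_vmo_create_child(vmo, ZX_VMO_CHILD_COPY_ON_WRITE, 0, ZX_PAGE_SIZE, &clone);
    zx_vmo_write(vmo, &byte, 0, sizeof(byte));   // forks the hidden parent's page
    zx_vmo_write(clone, &byte, 0, sizeof(byte)); // page now uni-accessible: moved, not copied

With this change the third write moves the hidden parent's page instead of
allocating a third one, so the clone pair never holds more pages than two
independently allocated VMOs would.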

Whether a page in a hidden VMO is 'accessible' to a particular child is
determined by a combination of two factors. First, a page is not
accessible if it lies outside the range of the hidden VMO which the
child can see. This is tracked by the existing parent offset and limit
values in the child. Second, if a page in the hidden VMO has already
been copied into a child, that page in the hidden VMO is no longer
accessible to that child. This is tracked by new 'split' bits in the
vm_page_t structure.
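
Concretely, and mirroring the new VmObjectPaged::IsUniAccessibleLocked helper
in the diff below (with |left| and |right| standing in for the hidden vmo's
two children), the page at |offset| is accessible to at most one child when:

    bool uni_accessible =
        page->object.cow_left_split || page->object.cow_right_split ||
        offset < left.parent_offset_ ||
        offset >= left.parent_offset_ + left.parent_limit_ ||
        offset < right.parent_offset_ ||
        offset >= right.parent_offset_ + right.parent_limit_;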

Test: the existing vmo-clone2 core tests cover this code, and kstress
includes a test that hammers COW clones
ZX-1268 #comment [kernel][vm] Improve efficiency of bidirectional clones

Change-Id: Idbfcab1b684c12faa29499e3c9c78e478989a1ef
diff --git a/zircon/kernel/vm/include/vm/page.h b/zircon/kernel/vm/include/vm/page.h
index 326ebf5..8388944 100644
--- a/zircon/kernel/vm/include/vm/page.h
+++ b/zircon/kernel/vm/include/vm/page.h
@@ -32,6 +32,19 @@
 #define VM_PAGE_OBJECT_MAX_PIN_COUNT ((1ul << VM_PAGE_OBJECT_PIN_COUNT_BITS) - 1)
 
             uint8_t pin_count : VM_PAGE_OBJECT_PIN_COUNT_BITS;
+
+            // Bits used by VmObjectPaged implementation of COW clones.
+            //
+            // Pages of VmObjectPaged have two "split" bits. These bits are used to track which
+            // pages in children of hidden VMOs have diverged from their parent. There are two
+            // bits, left and right, one for each child. In a hidden parent, a set split bit
+            // means that the corresponding child's copy of the page has diverged from the
+            // parent and that the parent's page is no longer accessible to that child.
+            //
+            // It should never be the case that both split bits are set, as the page should
+            // be moved into the child instead of setting the second bit.
+            uint8_t cow_left_split : 1;
+            uint8_t cow_right_split : 1;
         } object; // attached to a vm object
     };
 
diff --git a/zircon/kernel/vm/include/vm/vm_object.h b/zircon/kernel/vm/include/vm/vm_object.h
index a2fa0f0..25f4bc9 100644
--- a/zircon/kernel/vm/include/vm/vm_object.h
+++ b/zircon/kernel/vm/include/vm/vm_object.h
@@ -76,6 +76,9 @@
     // Returns true if the VMO was created via CreatePagerVmo().
     virtual bool is_pager_backed() const { return false; }
 
+    // Returns true if the vmo is a hidden paged vmo.
+    virtual bool is_hidden() const { return false; }
+
     // Returns the number of physical pages currently allocated to the
     // object where (offset <= page_offset < offset+len).
     // |offset| and |len| are in bytes.
diff --git a/zircon/kernel/vm/include/vm/vm_object_paged.h b/zircon/kernel/vm/include/vm/vm_object_paged.h
index bc3c7ef1..cbdf6b0 100644
--- a/zircon/kernel/vm/include/vm/vm_object_paged.h
+++ b/zircon/kernel/vm/include/vm/vm_object_paged.h
@@ -81,7 +81,7 @@
         Guard<fbl::Mutex> guard{&lock_};
         return GetRootPageSourceLocked() != nullptr;
     }
-    bool is_hidden() const { return (options_ & kHidden); }
+    bool is_hidden() const override { return (options_ & kHidden); }
     ChildType child_type() const override {
         Guard<fbl::Mutex> guard{&lock_};
         return (original_parent_user_id_ != 0) ? ChildType::kCowClone : ChildType::kNotChild;
@@ -176,9 +176,12 @@
 
     DISALLOW_COPY_ASSIGN_AND_MOVE(VmObjectPaged);
 
-    // add a page to the object
+    // Add a page to the object. This operation unmaps the corresponding
+    // offset from any existing mappings.
     zx_status_t AddPage(vm_page_t* p, uint64_t offset);
-    zx_status_t AddPageLocked(vm_page_t* p, uint64_t offset) TA_REQ(lock_);
+    // If |do_range_update| is false, this function will skip updating mappings.
+    zx_status_t AddPageLocked(vm_page_t* p, uint64_t offset,
+                              bool do_range_update = true) TA_REQ(lock_);
 
     // internal page list routine
     void AddPageToArray(size_t index, vm_page_t* p);
@@ -217,6 +220,85 @@
             // Walks the child chain, which confuses analysis.
             TA_NO_THREAD_SAFETY_ANALYSIS;
 
+    // GetPageLocked helper function that 'forks' the page at |offset| of the current vmo. If
+    // this function successfully inserts a page into |offset| of the current vmo, it returns
+    // a pointer to the corresponding vm_page_t struct. The only failure condition is memory
+    // allocation failure, in which case this function returns null.
+    //
+    // The source page that is being forked has already been calculated - it is |page|, which
+    // is currently in |page_owner| at offset |owner_offset|.
+    //
+    // This function is responsible for ensuring that COW clones never result in worse memory
+    // consumption than simply creating a new vmo and memcpying the content. It does this by
+    // migrating a page from a hidden vmo into one child if that page is not 'accessible' to the
+    // other child (instead of allocating a new page into the child and making the hidden vmo's
+    // page inaccessible).
+    //
+    // Whether a particular page in a hidden vmo is 'accessible' to a particular child is
+    // determined by a combination of two factors. First, if the page lies outside of the range
+    // in the hidden vmo the child can see (specified by parent_offset_ and parent_limit_), then
+    // the page is not accessible. Second, if the page has already been copied into the child,
+    // then the page in the hidden vmo is not accessible to that child. This is tracked by the
+    // cow_X_split bits in the vm_page_t structure.
+    //
+    // To handle memory allocation failure, this function performs the fork operation from the
+    // root vmo towards the leaf vmo. This allows the COW invariants to always be preserved.
+    //
+    // |page| must not be the zero-page, as there is no need to do the complex page
+    // fork logic to reduce memory consumption in that case.
+    vm_page_t* CloneCowPageLocked(uint64_t offset, list_node_t* free_list,
+                                  VmObjectPaged* page_owner, vm_page_t* page,
+                                  uint64_t owner_offset)
+            // Walking through the ancestors confuses analysis.
+            TA_NO_THREAD_SAFETY_ANALYSIS;
+
+    // Returns true if |page| (located at |offset| in this vmo) is only accessible by one
+    // child, where 'accessible' is defined by ::CloneCowPageLocked.
+    bool IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const
+            // Reaching into the children confuses analysis.
+            TA_NO_THREAD_SAFETY_ANALYSIS;
+
+    // Releases this vmo's reference to any ancestor vmo's COW pages, for the range [start, end)
+    // in this vmo. This is done by either setting the pages' split bits (if something else
+    // can access the pages) or by freeing the pages onto |free_list| (if nothing else can
+    // access the pages).
+    //
+    // This function recursively invokes itself for regions of the parent vmo which are
+    // not accessible by the sibling vmo.
+    // TODO(ZX-757): Carefully constructed clone chains can blow the stack.
+    //
+    // If |update_limit| is set, then this function will recursively update parent_limit_ of
+    // this and ancestor vmos to account for the fact that this vmo will never access pages
+    // in its parent above |start|.
+    void ReleaseCowParentPagesLocked(uint64_t start, uint64_t end, bool update_limit,
+                                     list_node_t* free_list)
+            // Calling into the parents confuses analysis.
+            TA_NO_THREAD_SAFETY_ANALYSIS;
+
+    // Outside of initialization/destruction, hidden vmos always have two children. For
+    // clarity, whichever child is first in the list is the 'left' child, and whichever
+    // child is second is the 'right' child.
+    VmObjectPaged& left_child_locked() TA_REQ(lock_) {
+        DEBUG_ASSERT(is_hidden());
+        DEBUG_ASSERT(children_list_len_ == 2);
+        return children_list_.front();
+    }
+    VmObjectPaged& right_child_locked() TA_REQ(lock_) {
+        DEBUG_ASSERT(is_hidden());
+        DEBUG_ASSERT(children_list_len_ == 2);
+        return children_list_.back();
+    }
+    const VmObjectPaged& left_child_locked() const TA_REQ(lock_) {
+        DEBUG_ASSERT(is_hidden());
+        DEBUG_ASSERT(children_list_len_ == 2);
+        return children_list_.front();
+    }
+    const VmObjectPaged& right_child_locked() const TA_REQ(lock_) {
+        DEBUG_ASSERT(is_hidden());
+        DEBUG_ASSERT(children_list_len_ == 2);
+        return children_list_.back();
+    }
+
     // members
     const uint32_t options_;
     uint64_t size_ TA_GUARDED(lock_) = 0;
@@ -233,6 +315,12 @@
     // a bidirectional clone and end up changing parent_.
     uint64_t original_parent_user_id_ TA_GUARDED(lock_) = 0;
 
+    // Flag used for walking back down the clone tree without recursion. See ::CloneCowPageLocked.
+    enum class StackDir {
+        Left, Right,
+    };
+    StackDir page_stack_flag_ TA_GUARDED(lock_);
+
     // The page source, if any.
     const fbl::RefPtr<PageSource> page_source_;
 
diff --git a/zircon/kernel/vm/page.cpp b/zircon/kernel/vm/page.cpp
index cedcf66..cc74d5e 100644
--- a/zircon/kernel/vm/page.cpp
+++ b/zircon/kernel/vm/page.cpp
@@ -42,8 +42,14 @@
 }
 
 void vm_page::dump() const {
-    printf("page %p: address %#" PRIxPTR " state %s flags %#x\n", this, paddr(),
+    printf("page %p: address %#" PRIxPTR " state %s flags %#x", this, paddr(),
            page_state_to_string(state_priv), flags);
+    if (state_priv == VM_PAGE_STATE_OBJECT) {
+        printf(" pin_count %d split_bits %d%d\n",
+               object.pin_count, object.cow_left_split, object.cow_right_split);
+    } else {
+        printf("\n");
+    }
 }
 
 void vm_page::set_state(vm_page_state new_state) {
diff --git a/zircon/kernel/vm/vm_object_paged.cpp b/zircon/kernel/vm/vm_object_paged.cpp
index 9d46800..b37ec56 100644
--- a/zircon/kernel/vm/vm_object_paged.cpp
+++ b/zircon/kernel/vm/vm_object_paged.cpp
@@ -47,6 +47,48 @@
     DEBUG_ASSERT(p->state() == VM_PAGE_STATE_ALLOC);
     p->set_state(VM_PAGE_STATE_OBJECT);
     p->object.pin_count = 0;
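+    // Fresh object pages have not diverged in any clone, so both split bits start clear.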
+    p->object.cow_left_split = 0;
+    p->object.cow_right_split = 0;
+}
+
+// Allocates a new page and populates it with the data at |parent_paddr|.
+bool AllocateCopyPage(uint32_t pmm_alloc_flags, paddr_t parent_paddr,
+                      list_node_t* free_list, vm_page_t** clone) {
+    paddr_t pa_clone;
+    vm_page_t* p_clone = nullptr;
+    if (free_list) {
+        p_clone = list_remove_head_type(free_list, vm_page, queue_node);
+        if (p_clone) {
+            pa_clone = p_clone->paddr();
+        }
+    }
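+    // Fall back to the pmm if the caller didn't provide a free list or it was empty.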
+    if (!p_clone) {
+        zx_status_t status = pmm_alloc_page(pmm_alloc_flags, &p_clone, &pa_clone);
+        if (!p_clone) {
+            DEBUG_ASSERT(status == ZX_ERR_NO_MEMORY);
+            return false;
+        }
+        DEBUG_ASSERT(status == ZX_OK);
+    }
+
+    InitializeVmPage(p_clone);
+
+    void* dst = paddr_to_physmap(pa_clone);
+    DEBUG_ASSERT(dst);
+
+    if (parent_paddr != vm_get_zero_page_paddr()) {
+        // do a direct copy of the two pages
+        const void* src = paddr_to_physmap(parent_paddr);
+        DEBUG_ASSERT(src);
+        memcpy(dst, src, PAGE_SIZE);
+    } else {
+        // avoid pointless fetches by directly zeroing dst
+        arch_zero_page(dst);
+    }
+
+    *clone = p_clone;
+
+    return true;
 }
 
 // round up the size to the next page size boundary and make sure we dont wrap
@@ -104,6 +146,9 @@
 
     RemoveFromGlobalList();
 
+    list_node_t list;
+    list_initialize(&list);
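+    // Collects pages to be freed, both from ancestor vmos and from our own page list.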
+
     if (!is_hidden()) {
         // If we're not a hidden vmo, then we need to remove ourself from our parent. This needs
         // to be done before emptying the page list so that a hidden parent can't merge into this
@@ -113,6 +158,9 @@
         // to hold the lock over the parent_ check and into the subsequent removal call.
         Guard<fbl::Mutex> guard{&lock_};
         if (parent_) {
+            // Release any COW pages in ancestors retained by this vmo.
+            ReleaseCowParentPagesLocked(0, parent_limit_, true, &list);
+
             LTRACEF("removing ourself from our parent %p\n", parent_.get());
             parent_->RemoveChild(this, guard.take());
         }
@@ -133,8 +181,6 @@
         });
 
     // free all of the pages attached to us
-    list_node_t list;
-    list_initialize(&list);
     page_list_.RemoveAllPages(&list);
 
     if (page_source_) {
@@ -525,14 +571,10 @@
     const uint64_t merge_start_offset = child.parent_offset_;
     const uint64_t merge_end_offset = child.parent_offset_ + child.parent_limit_;
 
-    // Update child's parent limit so that it won't be able to see more of parent_ than
-    // this can see once it gets reparented and has its offset adjusted.
-    if (child.parent_offset_ + child.parent_limit_ < parent_limit_) {
-        // No need to update the limit since child's view is a subset of this's view.
-        // TODO(stevensd): Release ancestor pages which this can see but child can't.
-    } else {
-        // Set the limit so the child won't be able to see more of its new parent
-        // than this hidden vmo was able to see.
+    // Compare the endpoint of the child's view of this vmo with this hidden vmo's own
+    // parent limit, and clamp the child's limit so it won't be able to see more of its
+    // new parent than this hidden vmo was able to see.
+    if (child.parent_offset_ + child.parent_limit_ > parent_limit_) {
         if (parent_limit_ < child.parent_offset_) {
             child.parent_limit_ = 0;
         } else {
@@ -540,8 +582,8 @@
         }
     }
 
-    // TODO(stevensd): Release ancestor pages below child.parent_offset_.
 
+    // Adjust the child's offset so it will still see the correct range.
     bool overflow = add_overflow(parent_offset_, child.parent_offset_, &child.parent_offset_);
     // Overflow here means that something went wrong when setting up parent limits.
     DEBUG_ASSERT(!overflow);
@@ -557,8 +599,14 @@
             [](vm_page* page, uint64_t offset) {
                 // TODO(stevensd): Update per-page metadata here when available
             },
-            [](vm_page* page, uint64_t offset) {
-                // TODO(stevensd): Update per-page metadata here when available
+            [merge_start_offset, merge_end_offset](vm_page* page, uint64_t offset) {
+                // Only migrate pages in the child's range.
+                DEBUG_ASSERT(merge_start_offset <= offset && offset < merge_end_offset);
+
+                // Since we recursively fork on write, if the child doesn't have the
+                // page, then neither of its children do.
+                page->object.cow_left_split = 0;
+                page->object.cow_right_split = 0;
             }, &covered_pages);
 
     if (!list_is_empty(&covered_pages)) {
@@ -601,8 +649,8 @@
         printf("  ");
     }
     printf("vmo %p/k%" PRIu64 " size %#" PRIx64 " offset %#" PRIx64
-           " pages %zu ref %d parent %p/k%" PRIu64 "\n",
-           this, user_id_, size_, parent_offset_, count,
+           " limit %#" PRIx64 " pages %zu ref %d parent %p/k%" PRIu64 "\n",
+           this, user_id_, size_, parent_offset_, parent_limit_, count,
            ref_count_debug(), parent_.get(), parent_id);
 
     if (verbose) {
@@ -647,7 +695,7 @@
     return AddPageLocked(p, offset);
 }
 
-zx_status_t VmObjectPaged::AddPageLocked(vm_page_t* p, uint64_t offset) {
+zx_status_t VmObjectPaged::AddPageLocked(vm_page_t* p, uint64_t offset, bool do_range_update) {
     canary_.Assert();
     DEBUG_ASSERT(lock_.lock().IsHeld());
 
@@ -664,12 +712,163 @@
         return err;
     }
 
-    // other mappings may have covered this offset into the vmo, so unmap those ranges
-    RangeChangeUpdateLocked(offset, PAGE_SIZE);
+    if (do_range_update) {
+        // other mappings may have covered this offset into the vmo, so unmap those ranges
+        RangeChangeUpdateLocked(offset, PAGE_SIZE);
+    }
 
     return ZX_OK;
 }
 
+bool VmObjectPaged::IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const {
+    DEBUG_ASSERT(lock_.lock().IsHeld());
+    DEBUG_ASSERT(page_list_.GetPage(offset) == page);
+
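+    // If either split bit is set, one child has already forked this page away, so only
+    // the other child can still access it.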
+    if (page->object.cow_right_split || page->object.cow_left_split) {
+        return true;
+    }
+
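+    // A page outside a child's parent_offset_/parent_limit_ window is not visible to
+    // that child, so only the other child can access it.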
+    if (offset < left_child_locked().parent_offset_
+            || offset >= left_child_locked().parent_offset_ + left_child_locked().parent_limit_) {
+        return true;
+    }
+
+    if (offset < right_child_locked().parent_offset_
+            || offset >= right_child_locked().parent_offset_ + right_child_locked().parent_limit_) {
+        return true;
+    }
+
+    return false;
+}
+
+vm_page_t* VmObjectPaged::CloneCowPageLocked(uint64_t offset, list_node_t* free_list,
+                                             VmObjectPaged* page_owner, vm_page_t* page,
+                                             uint64_t owner_offset) {
+    DEBUG_ASSERT(page != vm_get_zero_page());
+    DEBUG_ASSERT(parent_);
+
+    // To avoid the need for rollback logic on allocation failure, we start the forking
+    // process from the root-most vmo and work our way towards the leaf vmo. This allows
+    // us to maintain the hidden vmo invariants through the whole operation, so that we
+    // can stop at any point.
+    //
+    // To set this up, walk from the leaf to |page_owner|, and keep track of the
+    // path via |page_stack_flag_|.
+    VmObjectPaged* cur = this;
+    do {
+        VmObjectPaged* next = VmObjectPaged::AsVmObjectPaged(cur->parent_);
+        // We can't make COW clones of physical vmos, so this can only happen if we
+        // somehow don't find |page_owner| in the ancestor chain.
+        DEBUG_ASSERT(next);
+
+        next->page_stack_flag_ =
+                &next->left_child_locked() == cur ? StackDir::Left : StackDir::Right;
+        if (next->page_stack_flag_ == StackDir::Right) {
+            DEBUG_ASSERT(&next->right_child_locked() == cur);
+        }
+        cur = next;
+    } while (cur != page_owner);
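+    // Step back down one level from |page_owner| along the recorded path and translate
+    // the owner's offset into that child's address space.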
+    cur = cur->page_stack_flag_ == StackDir::Left ?
+            &cur->left_child_locked() : &cur->right_child_locked();
+    uint64_t cur_offset = owner_offset - cur->parent_offset_;
+
+    // target_page is the page we're considering for migration.
+    vm_page_t* target_page = page;
+    VmObjectPaged* target_page_owner = page_owner;
+    uint64_t target_page_offset = owner_offset;
+
+    bool alloc_failure = false;
+
+    // As long as we're simply migrating |page|, there's no need to update any vmo mappings, since
+    // that means the other side of the clone tree has already covered |page| and the current side
+    // of the clone tree will still see |page|. As soon as we insert a new page, we'll need to
+    // update all mappings at or below that level.
+    bool skip_range_update = true;
+    while (cur) {
+        if (target_page_owner->IsUniAccessibleLocked(target_page, target_page_offset)) {
+            // If the page we're covering in the parent is uni-accessible, then we
+            // can directly move the page.
+
+            // Assert that we're not trying to split the page the same direction two times. Either
+            // some tracking state got corrupted or a page in the subtree we're trying to
+            // migrate to got improperly migrated/freed. If we did this migration, then the
+            // opposite subtree would lose access to this page.
+            DEBUG_ASSERT(!(target_page_owner->page_stack_flag_ == StackDir::Left
+                    && target_page->object.cow_left_split));
+            DEBUG_ASSERT(!(target_page_owner->page_stack_flag_ == StackDir::Right
+                    && target_page->object.cow_right_split));
+
+            target_page->object.cow_left_split = 0;
+            target_page->object.cow_right_split = 0;
+            vm_page_t* expected_page = target_page;
+            bool success =
+                    target_page_owner->page_list_.RemovePage(target_page_offset, &target_page);
+            DEBUG_ASSERT(success);
+            DEBUG_ASSERT(target_page == expected_page);
+        } else {
+            // Otherwise we need to fork the page.
+            vm_page_t* cover_page;
+            alloc_failure = !AllocateCopyPage(pmm_alloc_flags_, page->paddr(),
+                                              free_list, &cover_page);
+            if (alloc_failure) {
+                // TODO: plumb through PageRequest once anonymous page source is implemented.
+                break;
+            }
+
+            // We're going to cover target_page with cover_page, so set appropriate split bit.
+            if (target_page_owner->page_stack_flag_ == StackDir::Left) {
+                target_page->object.cow_left_split = 1;
+                DEBUG_ASSERT(target_page->object.cow_right_split == 0);
+            } else {
+                target_page->object.cow_right_split = 1;
+                DEBUG_ASSERT(target_page->object.cow_left_split == 0);
+            }
+            target_page = cover_page;
+
+            skip_range_update = false;
+        }
+
+        // Skip the automatic range update so we can do it ourselves more efficiently.
+        zx_status_t status = cur->AddPageLocked(target_page, cur_offset, false);
+        DEBUG_ASSERT(status == ZX_OK);
+
+        if (!skip_range_update) {
+            if (cur != this) {
+                // In this case, cur is a hidden vmo and has no direct mappings. Also, its
+                // descendents along the page stack will be dealt with by subsequent iterations
+                // of this loop. That means that any mappings that need to be touched now are
+                // owned by the children on the opposite side of page_stack_flag_.
+                DEBUG_ASSERT(cur->mapping_list_len_ == 0);
+                VmObjectPaged& other = cur->page_stack_flag_ == StackDir::Left
+                        ? cur->right_child_locked() : cur->left_child_locked();
+                other.RangeChangeUpdateFromParentLocked(cur_offset, PAGE_SIZE);
+            } else {
+                // In this case, cur is the last vmo being changed, so update its whole subtree.
+                DEBUG_ASSERT(offset == cur_offset);
+                RangeChangeUpdateLocked(offset, PAGE_SIZE);
+            }
+        }
+
+        target_page_owner = cur;
+        target_page_offset = cur_offset;
+
+        if (cur == this) {
+            break;
+        } else {
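+            // Descend one level along the recorded path and translate the offset into the child.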
+            cur = cur->page_stack_flag_ == StackDir::Left
+                    ? &cur->left_child_locked() : &cur->right_child_locked();
+            cur_offset -= cur->parent_offset_;
+        }
+    }
+    DEBUG_ASSERT(alloc_failure || cur_offset == offset);
+
+    if (alloc_failure) {
+        return nullptr;
+    } else {
+        return target_page;
+    }
+}
+
 vm_page_t* VmObjectPaged::FindInitialPageContentLocked(uint64_t offset, uint pf_flags,
                                                        VmObject** owner_out,
                                                        uint64_t* owner_offset_out) {
@@ -731,11 +930,14 @@
                                          vm_page_t** const page_out, paddr_t* const pa_out) {
     canary_.Assert();
     DEBUG_ASSERT(lock_.lock().IsHeld());
+    DEBUG_ASSERT(!is_hidden());
 
     if (offset >= size_) {
         return ZX_ERR_OUT_OF_RANGE;
     }
 
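+    // All page list bookkeeping is page-granular, so align the fault offset down.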
+    offset = ROUNDDOWN(offset, PAGE_SIZE);
+
     vm_page_t* p;
 
     // see if we already have a page at that offset
@@ -809,58 +1011,40 @@
         return ZX_OK;
     }
 
-    // If we're write faulting, we need to allocate a wriable page into this VMO.
-    vm_page_t* new_p = nullptr;
-    paddr_t new_pa;
-    if (free_list) {
-        new_p = list_remove_head_type(free_list, vm_page, queue_node);
-        if (new_p) {
-            new_pa = new_p->paddr();
+    vm_page_t* res_page;
+    if (!page_owner->is_hidden() || p == vm_get_zero_page()) {
+        // If the vmo isn't hidden, we can't move the page. If the page is the zero
+        // page, there's no need to try to move the page. In either case, we need to
+        // allocate a writable page for this vmo.
+        if (!AllocateCopyPage(pmm_alloc_flags_, p->paddr(), free_list, &res_page)) {
+            return ZX_ERR_NO_MEMORY;
         }
-    }
-    if (!new_p) {
-        pmm_alloc_page(pmm_alloc_flags_, &new_p, &new_pa);
-        if (!new_p) {
+        zx_status_t status = AddPageLocked(res_page, offset);
+        DEBUG_ASSERT(status == ZX_OK);
+    } else {
+        // We need a writable page; let ::CloneCowPageLocked handle inserting one.
+        res_page = CloneCowPageLocked(offset, free_list,
+                                      static_cast<VmObjectPaged*>(page_owner), p, owner_offset);
+        if (res_page == nullptr) {
             return ZX_ERR_NO_MEMORY;
         }
     }
 
-    InitializeVmPage(new_p);
+    LTRACEF("faulted in page %p, pa %#" PRIxPTR "\n", res_page, res_page->paddr());
 
-    void* dst = paddr_to_physmap(new_pa);
-    DEBUG_ASSERT(dst);
-
-    if (likely(p == vm_get_zero_page())) {
-        // avoid pointless fetches by directly zeroing dst
-        arch_zero_page(dst);
-
-        // If ARM and not fully cached, clean/invalidate the page after zeroing it.
-        // check doesn't need to be done in the other branch, since that branch is
-        // only hit for clones and clones are always cached.
+    // If ARM and not fully cached, clean/invalidate the page after zeroing it. This
+    // can be done here instead of with the clone logic because clones must be cached.
 #if ARCH_ARM64
-        if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
-            arch_clean_invalidate_cache_range((addr_t) dst, PAGE_SIZE);
-        }
-#endif
-    } else {
-        // do a direct copy of the two pages
-        const void* src = paddr_to_physmap(p->paddr());
-        DEBUG_ASSERT(src);
-        memcpy(dst, src, PAGE_SIZE);
+    if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
+        arch_clean_invalidate_cache_range((addr_t)paddr_to_physmap(res_page->paddr()), PAGE_SIZE);
     }
-
-    // Add the new page and return it. This also is responsible for
-    // unmapping this offset in any children.
-    zx_status_t status = AddPageLocked(new_p, offset);
-    DEBUG_ASSERT(status == ZX_OK);
-
-    LTRACEF("faulted in page %p, pa %#" PRIxPTR " copied from %p\n", new_p, new_pa, p);
+#endif
 
     if (page_out) {
-        *page_out = new_p;
+        *page_out = res_page;
     }
     if (pa_out) {
-        *pa_out = new_pa;
+        *pa_out = res_page->paddr();
     }
 
     return ZX_OK;
@@ -1161,6 +1345,92 @@
     return found_pinned;
 }
 
+void VmObjectPaged::ReleaseCowParentPagesLocked(uint64_t start, uint64_t end, bool update_limit,
+                                                list_node_t* free_list) {
+    if (start == end) {
+        return;
+    }
+
+    // This vmo only retains references to ancestor pages below parent_limit_, so we
+    // shouldn't be trying to release any pages at or above that limit.
+    DEBUG_ASSERT(start < parent_limit_);
+    DEBUG_ASSERT(end <= parent_limit_);
+
+    if (!parent_ || !parent_->is_hidden()) {
+        return;
+    }
+    auto parent = VmObjectPaged::AsVmObjectPaged(parent_);
+    bool left = this == &parent->left_child_locked();
+    auto& other = left ? parent->right_child_locked() : parent->left_child_locked();
+
+    // Compute the range in the parent that this vmo will no longer be able to see.
+    uint64_t parent_range_start, parent_range_end;
+    bool overflow = add_overflow(start, parent_offset_, &parent_range_start);
+    bool overflow2 = add_overflow(end, parent_offset_, &parent_range_end);
+    DEBUG_ASSERT(!overflow && !overflow2); // vmo creation should have failed.
+
+    // Drop any pages in the parent which are outside of the other child's accessibility, and
+    // recursively release COW pages in ancestor vmos in those inaccessible regions.
+    //
+    // There are two newly inaccessible regions to consider here - the region before what the
+    // sibling can access and the region after what it can access. We first free the 'head'
+    // region and calculate |tail_start| to determine where the 'tail' region starts. Then
+    // we can free the 'tail' region.
+    //
+    // Note that |update_limit| can only be recursively passed into the second call since we
+    // don't want the parent's parent_limit_ to be updated if the sibling can still access
+    // the pages.
+    uint64_t tail_start;
+    if (other.parent_limit_ != 0) {
+        if (parent_range_start < other.parent_offset_) {
+            uint64_t head_end = fbl::min(other.parent_offset_, parent_range_end);
+            parent->page_list_.RemovePages(parent_range_start, head_end, free_list);
+            parent->ReleaseCowParentPagesLocked(fbl::min(parent->parent_limit_, parent_range_start),
+                                                fbl::min(parent->parent_limit_, head_end),
+                                                false, free_list);
+        }
+        tail_start = fbl::max(other.parent_offset_ + other.parent_limit_, parent_range_start);
+    } else {
+        // If the sibling can't access anything in the parent, the whole region
+        // we're operating on is the 'tail' region.
+        tail_start = parent_range_start;
+    }
+    if (tail_start < parent_range_end) {
+        parent->page_list_.RemovePages(tail_start, parent_range_end, free_list);
+        parent->ReleaseCowParentPagesLocked(
+                fbl::min(parent->parent_limit_, tail_start),
+                fbl::min(parent->parent_limit_, parent_range_end),
+                update_limit, free_list);
+    }
+
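+    // This vmo will never look at its parent at or above |start| again, so clamp our
+    // view of the parent accordingly.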
+    if (update_limit) {
+        parent_limit_ = start;
+    }
+
+    // Any pages left were accessible by both children. Free any pages that were already split
+    // into the other child, and mark any unsplit pages. We don't need to recurse on this
+    // range because the visibility of ancestor pages in this range isn't changing.
+    parent->page_list_.RemovePages([update_limit, left](vm_page_t*& page, auto offset) -> bool {
+        // Simply checking if the page is resident in |this|->page_list_ is insufficient, as the
+        // page split into this vmo could have been migrated anywhere into its children. To avoid
+        // having to search its entire child subtree, we need to track into which subtree
+        // a page is split (i.e. have two directional split bits instead of a single split bit).
+        if (left ? page->object.cow_right_split : page->object.cow_left_split) {
+            return true;
+        }
+        if (!update_limit) {
+            // Only bother setting the split bits if we didn't make the page
+            // inaccessible through parent_limit_.
+            if (left) {
+                page->object.cow_left_split = 1;
+            } else {
+                page->object.cow_right_split = 1;
+            }
+        }
+        return false;
+    }, parent_range_start, parent_range_end, free_list);
+}
+
 zx_status_t VmObjectPaged::Resize(uint64_t s) {
     canary_.Assert();
 
@@ -1215,7 +1485,14 @@
             DEBUG_ASSERT(status == ZX_OK);
         }
 
-        parent_limit_ = fbl::min(parent_limit_, s);
+        if (parent_ && parent_->is_hidden()) {
+            // Release any COW pages that are no longer necessary. This will also
+            // update the parent limit.
+            ReleaseCowParentPagesLocked(fbl::min(start, parent_limit_),
+                                        fbl::min(end, parent_limit_), true, &free_list);
+        } else {
+            parent_limit_ = fbl::min(parent_limit_, s);
+        }
 
         page_list_.RemovePages(start, end, &free_list);
     } else if (s > size_) {