// Copyright 2016 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
#include <assert.h>
#include <lib/user_copy/user_ptr.h>
#include <lib/zircon-internal/thread_annotations.h>
#include <list.h>
#include <stdint.h>
#include <zircon/types.h>
#include <fbl/array.h>
#include <fbl/canary.h>
#include <fbl/intrusive_double_list.h>
#include <fbl/macros.h>
#include <fbl/ref_counted.h>
#include <fbl/ref_ptr.h>
#include <kernel/mutex.h>
#include <vm/page_source.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
#include <vm/vm_object.h>
#include <vm/vm_page_list.h>
// the main VM object type, holding a list of pages
class VmObjectPaged final : public VmObject {
// |options_| is a bitmask of:
// Bit 0: tested by is_resizable().
static constexpr uint32_t kResizable = (1u << 0);
// Bit 1: tested by is_contiguous().
static constexpr uint32_t kContiguous = (1u << 1);
// Bit 2: tested by is_hidden().
static constexpr uint32_t kHidden = (1u << 2);
// Bit 3: tested by is_slice().
static constexpr uint32_t kSlice = (1u << 3);
static zx_status_t Create(uint32_t pmm_alloc_flags, uint32_t options, uint64_t size,
fbl::RefPtr<VmObject>* vmo);
// Gets the raw VmObjectPaged pointer, or null if the VmObject is not paged.
// |vmo| must be non-null: it is dereferenced unconditionally to call is_paged().
// The result is the raw pointer obtained from vmo.get().
// NOTE(review): the caller presumably has to keep |vmo| (or another reference) alive for as
// long as the returned pointer is used — confirm.
static VmObjectPaged* AsVmObjectPaged(const fbl::RefPtr<VmObject>& vmo) {
if (vmo->is_paged()) {
// The downcast is unchecked; it relies on is_paged() being true only for VmObjectPaged.
return static_cast<VmObjectPaged*>(vmo.get());
} else {
return nullptr;
// Create a VMO backed by a contiguous range of physical memory. The
// returned vmo has all of its pages committed, and does not allow
// decommitting them.
static zx_status_t CreateContiguous(uint32_t pmm_alloc_flags, uint64_t size,
uint8_t alignment_log2, fbl::RefPtr<VmObject>* vmo);
// Creates a VMO from wired pages.
// Creating a VMO using this method is destructive. Once the VMO is released, its
// pages will be released into the general purpose page pool, so it is not possible
// to create multiple VMOs for the same region using this method.
// |exclusive| indicates whether or not the created vmo should have exclusive access to
// the pages. If exclusive is true, then [data, data + size) will be unmapped from the
// kernel address space (unless they lie in the physmap).
static zx_status_t CreateFromWiredPages(const void* data, size_t size, bool exclusive,
fbl::RefPtr<VmObject>* vmo);
static zx_status_t CreateExternal(fbl::RefPtr<PageSource> src, uint32_t options, uint64_t size,
fbl::RefPtr<VmObject>* vmo);
zx_status_t Resize(uint64_t size) override;
// Returns the option bitmask stored in |options_| (see the k* flag constants of this class).
uint32_t create_options() const override {
  return options_;
}
// Returns the current value of size_.
// NOTE(review): size_ is declared TA_GUARDED(lock_) but is read here without taking lock_
// (see the TODO below); confirm that an unlocked read is acceptable for all callers.
uint64_t size() const override
// TODO: Figure out whether it's safe to lock here without causing
// any deadlocks.
return size_;
// This class is the paged VmObject implementation, so the answer is always true.
bool is_paged() const override {
  return true;
}
// True when the kContiguous bit is present in |options_|.
bool is_contiguous() const override {
  return (options_ & kContiguous) != 0;
}
// True when the kResizable bit is present in |options_|.
bool is_resizable() const override {
  return (options_ & kResizable) != 0;
}
// Reports whether this VMO has a root page source: takes lock_ and checks that
// GetRootPageSourceLocked() returns a non-null pointer.
bool is_pager_backed() const override {
Guard<fbl::Mutex> guard{&lock_};
return GetRootPageSourceLocked() != nullptr;
// True when the kHidden bit is present in |options_|.
bool is_hidden() const override {
  return (options_ & kHidden) != 0;
}
// Classifies this VMO. Under lock_, a non-zero original_parent_user_id_ yields
// ChildType::kCowClone; otherwise the result is ChildType::kNotChild.
// NOTE(review): the kSlice option is not consulted here, so slices are not reported as a
// distinct kind by this method — confirm that is intended.
ChildType child_type() const override {
Guard<fbl::Mutex> guard{&lock_};
return (original_parent_user_id_ != 0) ? ChildType::kCowClone : ChildType::kNotChild;
// True when the kSlice bit is present in |options_|.
bool is_slice() const {
  return (options_ & kSlice) != 0;
}
// Returns original_parent_user_id_, read under lock_. A value of 0 is what child_type()
// treats as "not a child".
uint64_t parent_user_id() const override {
Guard<fbl::Mutex> guard{&lock_};
return original_parent_user_id_;
// Under lock_, records |user_id| as this VMO's page_attribution_user_id_.
// NOTE(review): only page_attribution_user_id_ is written in the lines visible here. The
// member comment on page_attribution_user_id_ says it equals user_id_ for non-hidden VMOs, so
// confirm that the base-class user_id_ is also updated somewhere on this path.
void set_user_id(uint64_t user_id) override {
Guard<fbl::Mutex> guard{&lock_};
page_attribution_user_id_ = user_id;
// The overrides below are declared here and implemented elsewhere; the notes are limited to
// what the signatures show.
// NOTE(review): |offset| and |len| are presumably byte quantities within the VMO — confirm
// against the implementation.

// Page accounting for a range (see AttributedPagesInRangeLocked for the locked variant).
size_t AttributedPagesInRange(uint64_t offset, uint64_t len) const override;
// Commit / decommit of a range.
zx_status_t CommitRange(uint64_t offset, uint64_t len) override;
zx_status_t DecommitRange(uint64_t offset, uint64_t len) override;
// Pin / unpin of a range (locked variants: PinLocked / UnpinLocked).
zx_status_t Pin(uint64_t offset, uint64_t len) override;
void Unpin(uint64_t offset, uint64_t len) override;
// Copies through plain pointers.
zx_status_t Read(void* ptr, uint64_t offset, size_t len) override;
zx_status_t Write(const void* ptr, uint64_t offset, size_t len) override;
// Invokes |lookup_fn| with the caller-supplied |context| for the given range.
zx_status_t Lookup(uint64_t offset, uint64_t len, vmo_lookup_fn_t lookup_fn,
void* context) override;
// Copies through user_out_ptr / user_in_ptr wrappers; |current_aspace| is supplied by the
// caller.
zx_status_t ReadUser(VmAspace* current_aspace, user_out_ptr<void> ptr, uint64_t offset,
size_t len) override;
zx_status_t WriteUser(VmAspace* current_aspace, user_in_ptr<const void> ptr, uint64_t offset,
size_t len) override;
// Page transfer out of / into this VMO via a VmPageSpliceList.
zx_status_t TakePages(uint64_t offset, uint64_t len, VmPageSpliceList* pages) override;
zx_status_t SupplyPages(uint64_t offset, uint64_t len, VmPageSpliceList* pages) override;
// Debug dump; |depth| and |verbose| are interpreted by the implementation.
void Dump(uint depth, bool verbose) override;
zx_status_t GetPageLocked(uint64_t offset, uint pf_flags, list_node* free_list,
PageRequest* page_request, vm_page_t**, paddr_t*) override
// Calls a Locked method of the parent, which confuses analysis.
zx_status_t CreateClone(Resizability resizable, CloneType type, uint64_t offset, uint64_t size,
bool copy_name, fbl::RefPtr<VmObject>* child_vmo) override
// This function reaches into the created child, which confuses analysis.
// Inserts |hidden_parent| as a hidden parent of |this|. This vmo and |hidden_parent|
// must have the same lock.
void InsertHiddenParentLocked(fbl::RefPtr<VmObjectPaged>&& hidden_parent)
// This accesses both |this| and |hidden_parent|, which confuses analysis.
void RangeChangeUpdateFromParentLocked(uint64_t offset, uint64_t len, RangeChangeList* list)
// Called under the parent's lock, which confuses analysis.
uint32_t GetMappingCachePolicy() const override;
zx_status_t SetMappingCachePolicy(const uint32_t cache_policy) override;
void RemoveChild(VmObject* child, Guard<Mutex>&& guard) override
// Analysis doesn't know that the guard passed to this function is the vmo's lock.
bool OnChildAddedLocked() override TA_REQ(lock_);
void DetachSource() override {
zx_status_t CreateChildSlice(uint64_t offset, uint64_t size, bool copy_name,
fbl::RefPtr<VmObject>* child_vmo) override
// This function reaches into the created child, which confuses analysis.
// Size bound for this class, mirrored from VmPageList::MAX_SIZE.
static constexpr uint64_t MAX_SIZE = VmPageList::MAX_SIZE;
// Ensure that MAX_SIZE + PAGE_SIZE doesn't overflow so VmObjectPaged doesn't
// need to worry about overflow for loop bounds.
// NOTE(review): the assertion below only verifies that MAX_SIZE is a multiple of PAGE_SIZE;
// no overflow bound is asserted in the lines visible here — confirm the bound is checked.
static_assert(MAX_SIZE % PAGE_SIZE == 0);
// private constructor (use Create())
VmObjectPaged(uint32_t options, uint32_t pmm_alloc_flags, uint64_t size,
fbl::RefPtr<vm_lock_t> root_lock, fbl::RefPtr<PageSource> page_source);
// Initializes the original parent state of the vmo. |offset| is the offset of
// this vmo in |parent|.
// This function should be called at most once, even if the parent changes
// after initialization.
void InitializeOriginalParentLocked(fbl::RefPtr<VmObject> parent, uint64_t offset)
// Accesses both parent and child, which confuses analysis.
static zx_status_t CreateCommon(uint32_t pmm_alloc_flags, uint32_t options, uint64_t size,
fbl::RefPtr<VmObject>* vmo);
// private destructor, only called from refptr
~VmObjectPaged() override;
friend fbl::RefPtr<VmObjectPaged>;
// Add a page to the object. This operation unmaps the corresponding
// offset from any existing mappings.
zx_status_t AddPage(vm_page_t* p, uint64_t offset);
// If |do_range_update| is false, this function will skip updating mappings.
zx_status_t AddPageLocked(vm_page_t* p, uint64_t offset, bool do_range_update = true)
// internal page list routine
void AddPageToArray(size_t index, vm_page_t* p);
zx_status_t PinLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
void UnpinLocked(uint64_t offset, uint64_t len) TA_REQ(lock_);
// Internal decommit range helper that expects the lock to be held. On success it will populate
// the passed-in page list with any pages that should be freed.
zx_status_t DecommitRangeLocked(uint64_t offset, uint64_t len, list_node_t& free_list)
fbl::RefPtr<PageSource> GetRootPageSourceLocked() const
// Walks the parent chain to get the root page source, which confuses analysis.
bool IsCowClonable() const
// Walks the parent chain since the root determines clonability.
// internal check if any pages in a range are pinned
bool AnyPagesPinnedLocked(uint64_t offset, size_t len) TA_REQ(lock_);
// see AttributedPagesInRange
size_t AttributedPagesInRangeLocked(uint64_t offset, uint64_t len) const TA_REQ(lock_);
// Helper function for ::AttributedPagesInRangeLocked. Counts the number of pages in ancestor
// vmos that should be attributed to this vmo for the specified range. It is an error to pass in a
// range that does not need attributing (i.e. offset must be < parent_limit_), although |len| is
// permitted to be sized such that the range exceeds parent_limit_.
// The return value is the length of the processed region, which will be <= |size| and is
// guaranteed to be > 0. The |count| is the number of pages in this region that should be
// attributed to this vmo, versus some other vmo.
uint64_t CountAttributedAncestorPagesLocked(uint64_t offset, uint64_t size, uint64_t* count) const
// internal read/write routine that takes a templated copy function to help share some code
template <typename T>
zx_status_t ReadWriteInternal(uint64_t offset, size_t len, bool write, T copyfunc);
// Searches for info for initialization of a page being committed into |this| at |offset|.
// If an ancestor has a committed page which corresponds to |offset|, returns that page
// as well as the VmObject and offset which own the page. If no ancestor has a committed
// page for the offset, returns null as well as the VmObject/offset which need to be queried
// to populate the page.
// It is an error to call this when |this| has a committed page at |offset|.
vm_page_t* FindInitialPageContentLocked(uint64_t offset, uint pf_flags, VmObject** owner_out,
uint64_t* owner_offset_out)
// Walks the child chain, which confuses analysis.
// GetPageLocked helper function that 'forks' the page at |offset| of the current vmo. If
// this function successfully inserts a page into |offset| of the current vmo, it returns
// a pointer to the corresponding vm_page_t struct. The only failure condition is memory
// allocation failure, in which case this function returns null.
// The source page that is being forked has already been calculated - it is |page|, which
// is currently in |page_owner| at offset |owner_offset|.
// This function is responsible for ensuring that COW clones never result in worse memory
// consumption than simply creating a new vmo and memcpying the content. It does this by
// migrating a page from a hidden vmo into one child if that page is not 'accessible' to the
// other child (instead of allocating a new page into the child and making the hidden vmo's
// page inaccessible).
// Whether a particular page in a hidden vmo is 'accessible' to a particular child is
// determined by a combination of two factors. First, if the page lies outside of the range
// in the hidden vmo the child can see (specified by parent_offset_ and parent_limit_), then
// the page is not accessible. Second, if the page has already been copied into the child,
// then the page in the hidden vmo is not accessible to that child. This is tracked by the
// cow_X_split bits in the vm_page_t structure.
// To handle memory allocation failure, this function performs the fork operation from the
// root vmo towards the leaf vmo. This allows the COW invariants to always be preserved.
// |page| must not be the zero-page, as there is no need to do the complex page
// fork logic to reduce memory consumption in that case.
vm_page_t* CloneCowPageLocked(uint64_t offset, list_node_t* free_list, VmObjectPaged* page_owner,
vm_page_t* page, uint64_t owner_offset)
// Walking through the ancestors confuses analysis.
// Returns true if |page| (located at |offset| in this vmo) is only accessible by one
// child, where 'accessible' is defined by ::CloneCowPageLocked.
bool IsUniAccessibleLocked(vm_page_t* page, uint64_t offset) const
// Reaching into the children confuses analysis
// ::CloneCowPageLocked helper function that ensures contiguous vmos remain contiguous.
// In general, it does not matter which vmo gets which physical page when forking pages in
// hidden vmos. However, if there are COW clones of a contiguous vmo, the original vmo
// must always see the original physical pages, so that it always looks contiguous to
// userspace. This function is responsible for fixing up any violations to this property
// introduced by the primary page forking logic in ::CloneCowPageLocked.
// |page_owner| is the original owner of the page being forked, and |page_owner_offset|
// is the offset of the original page. |last_contig| is the last contiguous vmo between
// |this| and |page_owner| that can also no longer see the desired page.
void ContiguousCowFixupLocked(VmObjectPaged* page_owner, uint64_t page_owner_offset,
VmObjectPaged* last_contig, uint64_t last_contig_offset)
// Walking through the ancestors confuses analysis.
// Releases this vmo's reference to any ancestor vmo's COW pages, for the range [start, end)
// in this vmo. This is done by either setting the pages' split bits (if something else
// can access the pages) or by freeing the pages onto |free_list| (if nothing else can
// access the pages).
// This function recursively invokes itself for regions of the parent vmo which are
// not accessible by the sibling vmo.
void ReleaseCowParentPagesLocked(uint64_t start, uint64_t end, list_node_t* free_list)
// Walking the clone tree confuses analysis
// Helper function for ReleaseCowParentPagesLocked that processes pages which are visible
// to both children as well as updates parent_(offset_)limit_.
void ReleaseCowParentPagesLockedHelper(uint64_t start, uint64_t end, list_node_t* free_list)
// Calling into the parents confuses analysis
// Updates the parent limits of all children so that they will never be able to
// see above |new_size| in this vmo, even if the vmo is enlarged in the future.
void UpdateChildParentLimitsLocked(uint64_t new_size)
// Calling into the children confuses analysis
// When cleaning up a hidden vmo, merges the hidden vmo's content (e.g. page list, view
// of the parent) into the remaining child.
void MergeContentWithChildLocked(VmObjectPaged* removed, bool removed_left)
// Accesses into the child confuse analysis
// Only valid to be called when is_slice() is true and returns the first parent of this
// hierarchy that is not a slice. The offset of this slice within that VmObjectPaged is set as
// the output.
VmObjectPaged* PagedParentOfSliceLocked(uint64_t* offset)
// Calling into the parent confuses analysis
// Outside of initialization/destruction, hidden vmos always have two children. For
// clarity, whichever child is first in the list is the 'left' child, and whichever
// child is second is the 'right' child. Children of a paged vmo will always be paged
// vmos themselves.
// Mutable accessor for the 'left' child: the front element of children_list_, downcast to
// VmObjectPaged. DEBUG_ASSERTs that children_list_len_ is exactly 2. Requires lock_.
VmObjectPaged& left_child_locked() TA_REQ(lock_) {
DEBUG_ASSERT(children_list_len_ == 2);
return static_cast<VmObjectPaged&>(children_list_.front());
// Mutable accessor for the 'right' child: the back element of children_list_, downcast to
// VmObjectPaged. DEBUG_ASSERTs that children_list_len_ is exactly 2. Requires lock_.
VmObjectPaged& right_child_locked() TA_REQ(lock_) {
DEBUG_ASSERT(children_list_len_ == 2);
return static_cast<VmObjectPaged&>(children_list_.back());
// Const overload of left_child_locked(): the front element of children_list_ as a const
// VmObjectPaged reference. DEBUG_ASSERTs that children_list_len_ is exactly 2. Requires lock_.
const VmObjectPaged& left_child_locked() const TA_REQ(lock_) {
DEBUG_ASSERT(children_list_len_ == 2);
return static_cast<const VmObjectPaged&>(children_list_.front());
// Const overload of right_child_locked(): the back element of children_list_ as a const
// VmObjectPaged reference. DEBUG_ASSERTs that children_list_len_ is exactly 2. Requires lock_.
const VmObjectPaged& right_child_locked() const TA_REQ(lock_) {
DEBUG_ASSERT(children_list_len_ == 2);
return static_cast<const VmObjectPaged&>(children_list_.back());
// members
const uint32_t options_;
uint64_t size_ TA_GUARDED(lock_) = 0;
// Offset in the *parent* where this object starts.
uint64_t parent_offset_ TA_GUARDED(lock_) = 0;
// Offset in *this object* above which accesses will no longer access the parent.
uint64_t parent_limit_ TA_GUARDED(lock_) = 0;
// Offset in *this object* below which this vmo stops referring to its parent. This field
// is only useful for hidden vmos, where it is used by ::ReleaseCowParentPagesLocked
// together with parent_limit_ to reduce how often page split bits need to be set. It is
// effectively a summary of the parent_offset_ values of all descendants - unlike
// parent_limit_, this value does not directly impact page lookup.
uint64_t parent_start_limit_ TA_GUARDED(lock_) = 0;
const uint32_t pmm_alloc_flags_ = PMM_ALLOC_FLAG_ANY;
uint32_t cache_policy_ TA_GUARDED(lock_) = ARCH_MMU_FLAG_CACHED;
// Flag which is true if there was a call to ::ReleaseCowParentPagesLocked which was
// not able to update the parent limits. When this is not set, it is sometimes
// possible for ::MergeContentWithChildLocked to do significantly less work.
bool partial_cow_release_ TA_GUARDED(lock_) = false;
// parent pointer (may be null)
fbl::RefPtr<VmObject> parent_ TA_GUARDED(lock_);
// Record the user_id_ of the original parent, in case we make
// a bidirectional clone and end up changing parent_.
uint64_t original_parent_user_id_ TA_GUARDED(lock_) = 0;
// Flag used for walking back up clone tree without recursion. See ::CloneCowPageLocked.
// NOTE(review): the enumerators of StackDir are not visible in this chunk.
enum class StackDir : bool {
// Anonymous struct instance |stack_|, guarded by lock_: a 63-bit |scratch| bitfield plus a
// 1-bit |dir_flag| bitfield of type StackDir (64 bits of bitfield in total).
struct {
uint64_t scratch : 63;
StackDir dir_flag : 1;
} stack_ TA_GUARDED(lock_);
// This value is used when determining against which user-visible vmo a hidden vmo's
// pages should be attributed. It serves as a tie-breaker for pages that are accessible by
// multiple user-visible vmos. See ::CountAttributedAncestorPagesLocked for more details.
// For non-hidden vmobjects, this always equals user_id_. For hidden vmobjects, this
// is the page_attribution_user_id_ of one of their children (i.e. the user_id_ of one
// of their non-hidden descendants).
uint64_t page_attribution_user_id_ TA_GUARDED(lock_) = 0;
// Counts the total number of pages pinned by ::Pin. If one page is pinned n times, it
// contributes n to this count. However, this does not include pages pinned when creating
// a contiguous vmo.
uint64_t pinned_page_count_ TA_GUARDED(lock_) = 0;
// The page source, if any.
const fbl::RefPtr<PageSource> page_source_;
// a tree of pages
VmPageList page_list_ TA_GUARDED(lock_);