blob: b0553691bdf8e0bd5a816eecc2f87087d1f9c3eb [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2014 Travis Geiselbrecht
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <lib/zircon-internal/macros.h>
#include <stdint.h>
#include <sys/types.h>
#include <zircon/compiler.h>
#include <zircon/listnode.h>
#include <kernel/percpu.h>
#include <ktl/atomic.h>
#include <ktl/optional.h>
#include <ktl/type_traits.h>
#include <vm/page_state.h>
#include <vm/stack_owned_loaned_pages_interval.h>
// core per page structure allocated at pmm arena creation time
struct vm_page {
// offset 0x0
// List linkage node for this page.
// NOTE(review): presumably links the page into whichever list or queue currently holds it --
// confirm against the users of queue_node.
struct list_node queue_node;
// offset 0x10
// read-only after being set up
paddr_t paddr_priv; // use paddr() accessor
// offset 0x18
union {
struct {
// This field is used for two different purposes, depending on whether the low order bit is
// set or not. This same field exists in states OBJECT, ALLOC, and FREE.
// When all bits are 0:
// There is no object and no StackOwnedLoanedPagesInterval.
// When kObjectOrStackOwnerIsStackOwnerFlag is set:
// The rest of the bits are a pointer to a StackOwnedLoanedPagesInterval. This allows a
// thread reclaiming a loaned page to apply priority inheritance onto the thread whose stack
// is transiently owning a loaned page. The StackOwnedLoanedPagesInterval has an
// OwnedWaitQueue that's used to avoid priority inversion while the reclaiming thread is
// waiting for the loaned page to no longer be stack owned. This brief waiting is part of
// chasing down and replacing loaned pages that are being borrowed, so that the loaned page
// can be returned to its contiguous VmCowPages.
// When kObjectOrStackOwnerIsStackOwnerFlag bit is 0 but any other bits are 1:
// This is a back pointer to the vm object this page is currently contained in. It is
// implicitly valid when the page is in a VmCowPages (which is a superset of intervals
// during which the page is in a page queue), and nullptr (or logically nullptr) otherwise.
// This should not be modified (except under the page queue lock) whilst a page is in a
// VmCowPages.
// More details:
// This field is accessed via an atomic_ref<>. Using ktl::atomic<> here seems to make GCC
// unhappy, despite the offset and alignment being fine (verified by static asserts and
// DEBUG_ASSERT()s). So instead we use atomic_ref<>.
// If a page is loaned, installation of StackOwnedLoanedPagesInterval on a page must occur
// before any stack ownership of the page has begun, and removal of
// StackOwnedLoanedPagesInterval must occur after stack ownership of the page has already
// ended.
// This field is a struct to enforce that all access is at least atomic memory_order_relaxed.
// Field should be modified by the setters and getters to allow for future encoding changes.
// Any changes to this field need to be made to "alloc" and "free" union variants also.
// Wrapper around the raw word. Callers obtain an atomic_ref<> from get() and perform every load,
// store and compare-exchange through it.
struct {
// Returns a mutable atomic_ref<> view of the raw object_or_stack_owner word.
ktl::atomic_ref<uintptr_t> get() {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
// Taking the address of this packed member is fine, because vm_page_t are 8 byte aligned, and
// object_or_stack_owner is 8 byte aligned within the vm_page_t. The static_assert checks the
// offset within vm_page and the DEBUG_ASSERT checks the actual address of the member.
static_assert(offsetof(vm_page, object.object_or_stack_owner.object_or_stack_owner) %
sizeof(decltype(object_or_stack_owner)) ==
DEBUG_ASSERT(reinterpret_cast<uintptr_t>(&object_or_stack_owner) %
sizeof(decltype(object_or_stack_owner)) ==
return ktl::atomic_ref<uintptr_t>(*&object_or_stack_owner);
#pragma GCC diagnostic pop
// Const overload of get(): returns a read-only atomic_ref<> view of the same word.
ktl::atomic_ref<const uintptr_t> get() const {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
// Taking the address of this packed member is fine, because vm_page_t are 8 byte aligned, and
// object_or_stack_owner is 8 byte aligned within the vm_page_t. The static_assert checks the
// offset within vm_page and the DEBUG_ASSERT checks the actual address of the member.
static_assert(offsetof(vm_page, object.object_or_stack_owner.object_or_stack_owner) %
sizeof(decltype(object_or_stack_owner)) ==
DEBUG_ASSERT(reinterpret_cast<uintptr_t>(&object_or_stack_owner) %
sizeof(decltype(object_or_stack_owner)) ==
return ktl::atomic_ref<const uintptr_t>(*&object_or_stack_owner);
#pragma GCC diagnostic pop
// Raw storage. Holds 0, an object back pointer, or a StackOwnedLoanedPagesInterval pointer
// combined with low-order flag bits. Access it only through get().
uintptr_t object_or_stack_owner;
// Only for a static_assert() below. Logically private.
using InternalType = decltype(object_or_stack_owner);
} __PACKED object_or_stack_owner;
// Names the otherwise anonymous wrapper type so that code outside the struct can refer to it.
using object_or_stack_owner_t = decltype(object_or_stack_owner);
// Returns the object back pointer stored in object_or_stack_owner.
// The word is read once with a relaxed atomic load. If kObjectOrStackOwnerIsStackOwnerFlag is
// set, the word encodes a stack owner rather than an object, so nullptr is returned. An all-zero
// word also yields nullptr.
void* get_object() const {
uintptr_t value = object_or_stack_owner.get().load(ktl::memory_order_relaxed);
// The stack-owner encoding is the uncommon case.
if (unlikely(value & kObjectOrStackOwnerIsStackOwnerFlag)) {
return nullptr;
return reinterpret_cast<void*>(value);
// offset 0x20
// Sets the object back pointer to |obj|.
// This also logically does clear_stack_owner() atomically.
void set_object(void* obj) {
// If the caller wants to clear the object, use clear_object() instead.
// Calling set_object() on a loaned page requires a StackOwnedLoanedPagesInterval on the
// current stack. If the object is already set, the stack ownership interval is essentially
// quite short and all under a single VmCowPages hierarchy lock hold interval. But we still
// require the StackOwnedLoanedPagesInterval for consistency, since the page can be moving
// between different VmCowPages, so in a sense it is still stack owned.
// For longer stack ownership intervals (those not entirely under a single VmCowPages
// hierarchy lock hold interval), the object won't be set on entry to this method, and we
// can verify that a StackOwnedLoanedPagesInterval was set on the page, and is still the
// current active interval.
if (containerof(this, vm_page, object)->is_loaned()) {
Thread* current_thread = Thread::Current::Get();
if (!get_object()) {
// No object yet: the interval recorded on the page must be the current thread's interval.
DEBUG_ASSERT(current_thread->stack_owned_loaned_pages_interval() == &stack_owner());
} else if (obj != get_object()) {
// Ensure set_object() is visible after set_page_offset().
if (is_stack_owned()) {
// Clears the object back pointer.
// In addition to clearing object, this does set_stack_owner() atomically, if needed.
// NOTE(review): the only store visible in this copy of the body is the relaxed store of 0
// below; the loaned-page path that would install a stack owner is not visible -- confirm.
void clear_object() {
if (containerof(this, vm_page, object)->is_loaned()) {
Thread* current_thread = Thread::Current::Get();
// To clear the object backlink of a loaned page, a StackOwnedLoanedPagesInterval on the
// current stack is required.
object_or_stack_owner.get().store(0, ktl::memory_order_relaxed);
// Returns the StackOwnedLoanedPagesInterval encoded in object_or_stack_owner, or nullptr when
// kObjectOrStackOwnerIsStackOwnerFlag is not set. The word is read with one relaxed atomic load
// and the flag bits (kObjectOrStackOwnerFlags) are masked off before the pointer is returned.
StackOwnedLoanedPagesInterval* maybe_stack_owner() const {
uintptr_t value = object_or_stack_owner.get().load(ktl::memory_order_relaxed);
if (!(value & kObjectOrStackOwnerIsStackOwnerFlag)) {
return nullptr;
return reinterpret_cast<StackOwnedLoanedPagesInterval*>(value & ~kObjectOrStackOwnerFlags);
// Returns a reference to the StackOwnedLoanedPagesInterval encoded in object_or_stack_owner.
// Unlike maybe_stack_owner(), the caller must already know a stack owner is present: the
// DEBUG_ASSERT checks that kObjectOrStackOwnerIsStackOwnerFlag is set in the loaded word. The
// flag bits are masked off before the pointer is dereferenced.
StackOwnedLoanedPagesInterval& stack_owner() const {
uintptr_t value = object_or_stack_owner.get().load(ktl::memory_order_relaxed);
DEBUG_ASSERT(value & kObjectOrStackOwnerIsStackOwnerFlag);
return *reinterpret_cast<StackOwnedLoanedPagesInterval*>(value & ~kObjectOrStackOwnerFlags);
// Records |stack_owner| in object_or_stack_owner. The stored value is the pointer OR-ed with
// kObjectOrStackOwnerIsStackOwnerFlag.
void set_stack_owner(StackOwnedLoanedPagesInterval* stack_owner) {
// The stack owner shouldn't be set by the caller in situations where the/a stack owner is
// already set. It is expected that the field may currently be set to a VmCowPages*, but
// that won't have the kObjectOrStackOwnerIsStackOwnerFlag bit set due to pointer alignment.
DEBUG_ASSERT(!(object_or_stack_owner.get().load(ktl::memory_order_relaxed) &
// We use relaxed here because we're only relying on atomicity. For ordering, the PmmNode
// lock and PageQueues locks are relevant. For ordering of a thread joining the owned wait
// queue vs. deletion of the owned wait queue, the thread lock is relevant.
reinterpret_cast<uintptr_t>(stack_owner) | kObjectOrStackOwnerIsStackOwnerFlag,
// Removes the stack owner from this page without installing an object in its place: forwards to
// clear_stack_owner_internal() with a null replacement pointer.
void clear_stack_owner() {
  clear_stack_owner_internal(nullptr);
}
// Replaces the stack-owner encoding in object_or_stack_owner with |new_obj| (nullptr to leave
// the field empty). The replacement is done with a compare-exchange inside a retry loop, using
// the value loaded at the top of each iteration as the expected value.
void clear_stack_owner_internal(void* new_obj) {
// If this fires, it likely means there's an extra clear somewhere, possibly by the current
// thread, or possibly by a different thread. This call could be the "extra" clear if the
// caller didn't check whether there's a stack owner before calling.
while (true) {
uintptr_t old_value = object_or_stack_owner.get().load(ktl::memory_order_relaxed);
// If this fires, it likely means that some other thread did a clear (so either this
// thread or the other thread shouldn't have cleared). If this thread had already done a
// previous clear, the assert near the top would have fired instead.
DEBUG_ASSERT(old_value & kObjectOrStackOwnerIsStackOwnerFlag);
// We don't want to be acquiring SOLPI::lock here every time we free a loaned page, so we
// only acquire the lock if the page's StackOwnedLoanedPagesInterval has a waiter,
// which is much more rare. In that case we must acquire the SOLPI::lock to avoid letting
// this thread continue and signal and delete the StackOwnedLoanedPagesInterval until
// after the waiter has committed to blocking on the OwnedWaitQueue, so that the waiter
// can be woken and removed from the OwnedWaitQueue before the OwnedWaitQueue is deleted.
ktl::optional<Guard<SpinLock, IrqSave>> maybe_sollock_guard;
if (old_value & kObjectOrStackOwnerHasWaiter) {
// Go ahead and actually acquire the lock.
// Swap in the new value only if the word still equals old_value.
if (object_or_stack_owner.get().compare_exchange_strong(
old_value, reinterpret_cast<uintptr_t>(new_obj), ktl::memory_order_relaxed)) {
// ~maybe_sollock_guard will release the lock if it was acquired
// Reports whether object_or_stack_owner currently encodes a stack owner. The word is read with a
// relaxed atomic load and tested against a flag mask.
// NOTE(review): the mask operand is not visible in this copy; presumably it is
// kObjectOrStackOwnerIsStackOwnerFlag -- confirm.
bool is_stack_owned() const {
// This can return true for a page that was loaned fairly recently but is no longer loaned.
return !!(object_or_stack_owner.get().load(ktl::memory_order_relaxed) &
// Result payload of try_set_has_waiter().
struct TrySetHasWaiterResult {
// True iff this call to try_set_has_waiter() was the first thread to set that there's a
// waiter.
bool first_setter;
// The stack_owner may own the page. The stack_owner can be waited on safely now that the
// waiter bit is set. The wait on stack_owner must occur while the calling thread is still
// holding the SOLPI::lock.
StackOwnedLoanedPagesInterval* stack_owner;
// Attempts to mark the page's stack owner as having a waiter.
// ktl::is_ok() iff the page has a stack_owner and the waiter bit is set.
// !ktl::is_ok() iff the page no longer has a stack_owner.
// NOTE(review): the return type is ktl::optional<>, so "is_ok" above presumably means the
// optional holds a value -- confirm against the definition.
ktl::optional<TrySetHasWaiterResult> try_set_has_waiter()
// offset 0x20
// When object_or_event_priv is pointing to a VmCowPages, this is the offset in the VmCowPages
// that contains this page.
// Else this field is 0.
// Field should be modified by the setters and getters to allow for future encoding changes.
uint64_t page_offset_priv;
uint64_t get_page_offset() const { return page_offset_priv; }
void set_page_offset(uint64_t page_offset) { page_offset_priv = page_offset; }
// offset 0x28
// Identifies which queue this page is in.
uint8_t page_queue_priv;
// Returns a mutable atomic_ref<> over page_queue_priv so callers can access it atomically.
ktl::atomic_ref<uint8_t> get_page_queue_ref() {
return ktl::atomic_ref<uint8_t>(page_queue_priv);
// Const overload: returns a read-only atomic_ref<> over page_queue_priv.
ktl::atomic_ref<const uint8_t> get_page_queue_ref() const {
return ktl::atomic_ref<const uint8_t>(page_queue_priv);
// offset 0x29
// Bit-fields follow. Their declaration order determines the layout, so do not reorder them.
// Pin count of the page; its width is VM_PAGE_OBJECT_PIN_COUNT_BITS bits.
uint8_t pin_count : VM_PAGE_OBJECT_PIN_COUNT_BITS;
// Bits used by VmObjectPaged implementation of COW clones.
// Pages of VmObjectPaged have two "split" bits. These bits are used to track which
// pages in children of hidden VMOs have diverged from their parent. There are two
// bits, left and right, one for each child. In a hidden parent, a 1 split bit means
// that page in the child has diverged from the parent and the parent's page is
// no longer accessible to that child.
// It should never be the case that both split bits are set, as the page should
// be moved into the child instead of setting the second bit.
uint8_t cow_left_split : 1;
uint8_t cow_right_split : 1;
// Hint for whether the page is always needed and should not be considered for reclamation
// under memory pressure (unless the kernel decides to override hints for some reason).
uint8_t always_need : 1;
// Tracks state used to determine whether the page is dirty and its contents need to be written
// back to the page source at some point, and when it has been cleaned. Used for pages backed
// by a user pager. The three states supported are Clean, Dirty, and AwaitingClean (more
// details in VmCowPages::DirtyState).
uint8_t dirty_state : VM_PAGE_OBJECT_DIRTY_STATE_BITS;
// NOTE(review): presumably unused filler bits -- confirm before repurposing.
uint8_t padding : 6;
// This struct has no type name and exists inside an unpacked parent and so it really doesn't
// need to have any padding. By making it packed we allow the next outer variables to use
// space we would have otherwise wasted in padding, without breaking alignment rules.
} __PACKED object; // attached to a vm object
// Union variant for pages in the FREE state. Deliberately empty.
struct {
// No fields may be added for these variants due to UB until we improve the stack ownership
// system, or otherwise address the current usage of object.object_or_stack_owner outside
// of OBJECT state.
} free; // free - typically in free_list_ or free_loaned_list_, unless loan_cancelled
// Union variant for pages in the ALLOC state. Deliberately empty.
struct {
// No fields may be added for these variants due to UB until we improve the stack ownership
// system, or otherwise address the current usage of object.object_or_stack_owner outside
// of OBJECT state.
} alloc; // allocated, but not yet put to any specific use
// Union variant holding three 16-bit compressed-item sizes, one per bucket (left, mid, right).
struct {
// Used by the VmTriPageStorage allocator to record the size of the item in each of the
// possible buckets. See it for more details.
uint16_t left_compress_size;
uint16_t mid_compress_size;
uint16_t right_compress_size;
} __PACKED zram;
// Alias for the otherwise unnamed type of the |object| union variant.
using object_t = decltype(object);
// offset 0x2b
// logically private; use |state()| and |set_state()|
// Both accessors go through a relaxed atomic_ref<> on this field.
vm_page_state state_priv;
// offset 0x2c
// Flag bits for the loaned state. Each is a distinct single bit so that it can be tested with a
// mask against loaned_state_priv.
static constexpr uint8_t kLoanedStateIsLoaned = 1u << 0;
static constexpr uint8_t kLoanedStateIsLoanCancelled = 1u << 1;
// logically private, use loaned getters and setters below.
uint8_t loaned_state_priv;
// This padding is inserted here to make sizeof(vm_page) a multiple of 8 and help validate that
// all commented offsets were indeed correct.
// The array sits at offset 0x2d, and its 3 bytes bring sizeof(vm_page) to 0x30. Both values are
// checked by static_asserts after the struct.
char padding_bytes[3];
// helper routines
// Returns whether this page is in the FREE state. When in the FREE state the page is assumed to
// be owned by the relevant PmmNode, and hence unless its lock is held this query must be assumed
// to be racy.
bool is_free() const { return state() == vm_page_state::FREE; }
// If true, this page is "loaned" in the sense of being loaned from a contiguous VMO (via
// decommit) to Zircon. If the original contiguous VMO is deleted, this page will no longer be
// loaned. A loaned page cannot be pinned. Instead a different physical page (non-loaned) is
// used for the pin. A loaned page can be (re-)committed back into its original contiguous VMO,
// which causes the data in the loaned page to be moved into a different physical page (which
// itself can be non-loaned or loaned). A loaned page cannot be used to allocate a new contiguous
// VMO.
// Implementation: a relaxed atomic load of loaned_state_priv tested against a flag mask.
// NOTE(review): the mask operand is not visible in this copy; presumably kLoanedStateIsLoaned.
bool is_loaned() const {
return !!(ktl::atomic_ref<const uint8_t>(loaned_state_priv).load(ktl::memory_order_relaxed) &
// If true, the original contiguous VMO wants the page back. Such pages won't be re-used until
// the page is no longer loaned, either via commit of the page back into the contiguous VMO that
// loaned the page, or via deletion of the contiguous VMO that loaned the page. Such pages are
// not in the free_loaned_list_ in pmm, which is how re-use is prevented.
// Implementation: a relaxed atomic load of loaned_state_priv tested against a flag mask.
// NOTE(review): the mask operand is not visible in this copy; presumably
// kLoanedStateIsLoanCancelled.
bool is_loan_cancelled() const {
return !!(ktl::atomic_ref<const uint8_t>(loaned_state_priv).load(ktl::memory_order_relaxed) &
// Manipulation of 'loaned' should only be done by the PmmNode under its lock whilst it is the
// owner of the page.
// NOTE(review): the bodies of set_is_loaned() and clear_is_loaned() are not visible in this copy
// of the file; presumably they set and clear kLoanedStateIsLoaned in loaned_state_priv --
// confirm.
void set_is_loaned() {
void clear_is_loaned() {
// Manipulation of 'loan_cancelled' should only be done by the PmmNode under its lock, but may be
// done when the PmmNode is not the owner of the page.
// NOTE(review): the bodies of set_is_loan_cancelled() and clear_is_loan_cancelled() are not
// visible in this copy of the file; presumably they set and clear kLoanedStateIsLoanCancelled in
// loaned_state_priv -- confirm.
void set_is_loan_cancelled() {
void clear_is_loan_cancelled() {
// Declared here and defined out of line. Does not modify the page (const).
// NOTE(review): presumably prints a debug description of the page; the output format and
// destination are not visible here -- confirm at the definition.
void dump() const;
// Physical address of this page, as stored in paddr_priv.
// The plan is to eventually keep the address in a compressed form, so callers should use this
// accessor instead of reading the field.
paddr_t paddr() const {
  return this->paddr_priv;
}
// Returns the current page state, read from state_priv with a relaxed atomic load.
vm_page_state state() const {
return ktl::atomic_ref<const vm_page_state>(state_priv).load(ktl::memory_order_relaxed);
// Moves the page to |new_state| and updates the per-state page counters.
// The previous state is read first, then the new state is written with a relaxed atomic store.
// The counters belong to the percpu instance returned by percpu::GetCurrent(): the old state's
// counter is decremented and the new state's counter is incremented.
void set_state(vm_page_state new_state) {
const vm_page_state old_state = state();
ktl::atomic_ref<vm_page_state>(state_priv).store(new_state, ktl::memory_order_relaxed);
// See comment at percpu::vm_page_counts
auto& p = percpu::GetCurrent();
p.vm_page_counts.by_state[VmPageStateIndex(old_state)] -= 1;
p.vm_page_counts.by_state[VmPageStateIndex(new_state)] += 1;
// Return the approximate number of pages in state |state|.
// When called concurrently with |set_state|, the count may be off by a small amount.
// Declared here and defined out of line.
static uint64_t get_count(vm_page_state state);
// Add |n| to the count of pages in state |state|.
// Should be used when first constructing pages.
// Declared here and defined out of line.
static void add_to_initial_count(vm_page_state state, uint64_t n);
// Low-order tag bits carried in object_or_stack_owner.
// Set when the word holds a StackOwnedLoanedPagesInterval pointer rather than an object pointer.
static constexpr uintptr_t kObjectOrStackOwnerIsStackOwnerFlag = uintptr_t{1} << 0;
// Set when the page's StackOwnedLoanedPagesInterval has a waiter.
static constexpr uintptr_t kObjectOrStackOwnerHasWaiter = uintptr_t{1} << 1;
// Mask covering every tag bit; strip it to recover the pointer.
static constexpr uintptr_t kObjectOrStackOwnerFlags =
    kObjectOrStackOwnerIsStackOwnerFlag | kObjectOrStackOwnerHasWaiter;
// Make sure the address of a StackOwnedLoanedPagesInterval will always have room for at least 2
// low order bit flags.
// An alignment of at least kObjectOrStackOwnerFlags + 1 (that is, 4) guarantees that the two
// low-order bits of any such pointer are zero, leaving them free to carry the flags.
static_assert(alignof(StackOwnedLoanedPagesInterval) >= kObjectOrStackOwnerFlags + 1);
// Provide a type alias using modern syntax to avoid clang-tidy warnings.
// vm_page_t and vm_page name the same type.
using vm_page_t = vm_page;
// assert expected offsets (the offsets in comments above) and natural alignments
// Each field gets up to three checks: its exact offset, alignment against the declared member
// type, and alignment against the expected underlying type.
// NOTE(review): several of the multi-line asserts below are missing their final "0);" line in
// this copy of the file.
// queue_node
static_assert(offsetof(vm_page_t, queue_node) == 0x0);
static_assert(offsetof(vm_page_t, queue_node) % alignof(decltype(vm_page_t::queue_node)) == 0);
static_assert(offsetof(vm_page_t, queue_node) % alignof(list_node) == 0);
// paddr_priv
static_assert(offsetof(vm_page_t, paddr_priv) == 0x10);
static_assert(offsetof(vm_page_t, paddr_priv) % alignof(decltype(vm_page_t::paddr_priv)) == 0);
static_assert(offsetof(vm_page_t, paddr_priv) % alignof(paddr_t) == 0);
// object.object_or_stack_owner
static_assert(offsetof(vm_page_t, object.object_or_stack_owner) == 0x18);
static_assert(offsetof(vm_page_t, object.object_or_stack_owner) %
alignof(vm_page_t::object_t::object_or_stack_owner_t::InternalType) ==
static_assert(offsetof(vm_page_t, object.object_or_stack_owner) % alignof(uintptr_t) == 0);
// object.page_offset_priv
static_assert(offsetof(vm_page_t, object.page_offset_priv) == 0x20);
static_assert(offsetof(vm_page_t, object.page_offset_priv) %
alignof(decltype(vm_page_t::object_t::page_offset_priv)) ==
static_assert(offsetof(vm_page_t, object.page_offset_priv) % alignof(uint64_t) == 0);
// object.page_queue_priv
static_assert(offsetof(vm_page_t, object.page_queue_priv) == 0x28);
static_assert(offsetof(vm_page_t, object.page_queue_priv) %
alignof(decltype(vm_page_t::object_t::page_queue_priv)) ==
static_assert(offsetof(vm_page_t, object.page_queue_priv) % alignof(uint8_t) == 0);
// state_priv
static_assert(offsetof(vm_page_t, state_priv) == 0x2b);
static_assert(offsetof(vm_page_t, state_priv) % alignof(decltype(vm_page_t::state_priv)) == 0);
static_assert(offsetof(vm_page_t, state_priv) % alignof(vm_page_state) == 0);
// padding_bytes
static_assert(offsetof(vm_page_t, padding_bytes) == 0x2d);
// assert that the page structure isn't growing uncontrollably
static_assert(sizeof(vm_page) == 0x30);
// assert that |vm_page| is a POD
static_assert(ktl::is_trivial_v<vm_page> && ktl::is_standard_layout_v<vm_page>);