| // Copyright 2016 The Fuchsia Authors |
| // Copyright (c) 2014 Travis Geiselbrecht |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #ifndef ZIRCON_KERNEL_VM_INCLUDE_VM_PAGE_H_ |
| #define ZIRCON_KERNEL_VM_INCLUDE_VM_PAGE_H_ |
| |
| #include <lib/zircon-internal/macros.h> |
| #include <stdint.h> |
| #include <sys/types.h> |
| #include <zircon/compiler.h> |
| #include <zircon/listnode.h> |
| |
| #include <kernel/percpu.h> |
| #include <ktl/atomic.h> |
| #include <ktl/optional.h> |
| #include <ktl/type_traits.h> |
| #include <vm/page_state.h> |
| #include <vm/stack_owned_loaned_pages_interval.h> |
| |
| // Core per-page structure, allocated at pmm arena creation time. |
| struct vm_page { |
| struct list_node queue_node; |
| |
| // Physical address of this page; read-only after being set up. |
| paddr_t paddr_priv; // use paddr() accessor |
| |
| // offset 0x18 |
| |
| union { |
| struct { |
| // This field is used for two different purposes, depending on whether the low order bit |
| // (kObjectOrStackOwnerIsStackOwnerFlag) is set. This same field exists in states OBJECT, ALLOC, and FREE. |
| // |
| // When all bits are 0: |
| // |
| // There is no object and no StackOwnedLoanedPagesInterval. |
| // |
| // When kObjectOrStackOwnerIsStackOwnerFlag is set: |
| // |
| // The remaining bits, other than the kObjectOrStackOwnerHasWaiter flag bit, are a pointer to a |
| // StackOwnedLoanedPagesInterval. This allows a thread reclaiming a loaned page to apply priority |
| // inheritance onto the thread whose stack is transiently owning a loaned page. The |
| // StackOwnedLoanedPagesInterval has an OwnedWaitQueue that's used to avoid priority inversion |
| // while the reclaiming thread is waiting for the loaned page to no longer be stack owned. This |
| // brief waiting is part of chasing down and replacing loaned pages that are being borrowed, so |
| // that the loaned page can be returned to its contiguous VmCowPages. |
| // |
| // When kObjectOrStackOwnerIsStackOwnerFlag bit is 0 but any other bits are 1: |
| // |
| // This is a back pointer to the vm object this page is currently contained in. It is |
| // implicitly valid when the page is in a VmCowPages (which is a superset of intervals |
| // during which the page is in a page queue), and nullptr (or logically nullptr) otherwise. |
| // This should not be modified (except under the page queue lock) whilst a page is in a |
| // VmCowPages. |
| // |
| // More details: |
| // |
| // This field is accessed via an atomic_ref<>. Using ktl::atomic<> here seems to make GCC |
| // unhappy, despite the offset and alignment being fine (verified by static asserts and |
| // DEBUG_ASSERT()s). So instead we use atomic_ref<>. |
| // |
| // If a page is loaned, installation of StackOwnedLoanedPagesInterval on a page must occur |
| // before any stack ownership of the page has begun, and removal of |
| // StackOwnedLoanedPagesInterval must occur after stack ownership of the page has already |
| // ended. |
| // |
| // This field is a struct to enforce that all access is at least atomic memory_order_relaxed. |
| // |
| // Field should be accessed through the setters and getters to allow for future encoding changes. |
| // |
| // Any changes to this field need to be made to "alloc" and "free" union variants also. |
| struct { |
| public: |
| ktl::atomic_ref<uintptr_t> get() { |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Waddress-of-packed-member" |
| // Taking the address of the packed member is fine, because vm_page_t is 8 byte aligned, and |
| // object_or_stack_owner is 8 byte aligned within the vm_page_t (asserted just below). |
| static_assert(offsetof(vm_page, object.object_or_stack_owner.object_or_stack_owner) % |
| sizeof(decltype(object_or_stack_owner)) == |
| 0); |
| DEBUG_ASSERT(reinterpret_cast<uintptr_t>(&object_or_stack_owner) % |
| sizeof(decltype(object_or_stack_owner)) == |
| 0); |
| return ktl::atomic_ref<uintptr_t>(*&object_or_stack_owner); |
| #pragma GCC diagnostic pop |
| } |
| |
| ktl::atomic_ref<const uintptr_t> get() const { |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Waddress-of-packed-member" |
| // Taking the address of the packed member is fine, because vm_page_t is 8 byte aligned, and |
| // object_or_stack_owner is 8 byte aligned within the vm_page_t (asserted just below). |
| static_assert(offsetof(vm_page, object.object_or_stack_owner.object_or_stack_owner) % |
| sizeof(decltype(object_or_stack_owner)) == |
| 0); |
| DEBUG_ASSERT(reinterpret_cast<uintptr_t>(&object_or_stack_owner) % |
| sizeof(decltype(object_or_stack_owner)) == |
| 0); |
| return ktl::atomic_ref<const uintptr_t>(*&object_or_stack_owner); |
| #pragma GCC diagnostic pop |
| } |
| |
| private: |
| uintptr_t object_or_stack_owner; |
| |
| public: |
| // Only for a static_assert() below. Logically private. |
| using InternalType = decltype(object_or_stack_owner); |
| } __PACKED object_or_stack_owner; |
| using object_or_stack_owner_t = decltype(object_or_stack_owner); |
| |
| void* get_object() const { |
| uintptr_t value = object_or_stack_owner.get().load(ktl::memory_order_relaxed); |
| if (unlikely(value & kObjectOrStackOwnerIsStackOwnerFlag)) { |
| return nullptr; |
| } |
| return reinterpret_cast<void*>(value); |
| } |
| |
| // (Methods only here; the next data member, page_offset_priv at offset 0x20, is declared further below.) |
| |
| // Sets the object back pointer to obj. This also logically does clear_stack_owner() atomically. |
| void set_object(void* obj) { |
| // If the caller wants to clear the object, use clear_object() instead. |
| DEBUG_ASSERT(obj); |
| // Calling set_object() on a loaned page requires a StackOwnedLoanedPagesInterval on the |
| // current stack. If the object is already set, the stack ownership interval is essentially |
| // quite short and all under a single VmCowPages hierarchy lock hold interval. But we still |
| // require the StackOwnedLoanedPagesInterval for consistency, since the page can be moving |
| // between different VmCowPages, so in a sense it is still stack owned. |
| // |
| // For longer stack ownership intervals (those not entirely under a single VmCowPages |
| // hierarchy lock hold interval), the object won't be set on entry to this method, and we |
| // can verify that a StackOwnedLoanedPagesInterval was set on the page, and is still the |
| // current active interval. |
| #if DEBUG_ASSERT_IMPLEMENTED |
| if (containerof(this, vm_page, object)->is_loaned()) { |
| Thread* current_thread = Thread::Current::Get(); |
| if (!get_object()) { |
| DEBUG_ASSERT(is_stack_owned()); |
| DEBUG_ASSERT(current_thread->stack_owned_loaned_pages_interval() == &stack_owner()); |
| } else if (obj != get_object()) { |
| DEBUG_ASSERT(current_thread->stack_owned_loaned_pages_interval()); |
| } |
| } |
| #endif |
| // Release fence: ensure the object store below is not visible before a preceding set_page_offset(). |
| ktl::atomic_thread_fence(ktl::memory_order_release); |
| if (is_stack_owned()) { |
| clear_stack_owner_internal(obj); |
| return; |
| } |
| object_or_stack_owner.get().store(reinterpret_cast<uintptr_t>(obj), |
| ktl::memory_order_relaxed); |
| } |
| |
| // In addition to clearing the object, this does set_stack_owner() atomically, if needed. |
| void clear_object() { |
| DEBUG_ASSERT(!is_stack_owned()); |
| if (containerof(this, vm_page, object)->is_loaned()) { |
| Thread* current_thread = Thread::Current::Get(); |
| // To clear the object backlink of a loaned page, a StackOwnedLoanedPagesInterval on the |
| // current stack is required. |
| DEBUG_ASSERT(current_thread->stack_owned_loaned_pages_interval()); |
| set_stack_owner(current_thread->stack_owned_loaned_pages_interval()); |
| return; |
| } |
| object_or_stack_owner.get().store(0, ktl::memory_order_relaxed); |
| } |
| |
| StackOwnedLoanedPagesInterval* maybe_stack_owner() const { |
| uintptr_t value = object_or_stack_owner.get().load(ktl::memory_order_relaxed); |
| if (!(value & kObjectOrStackOwnerIsStackOwnerFlag)) { |
| return nullptr; |
| } |
| return reinterpret_cast<StackOwnedLoanedPagesInterval*>(value & ~kObjectOrStackOwnerFlags); |
| } |
| |
| StackOwnedLoanedPagesInterval& stack_owner() const { |
| uintptr_t value = object_or_stack_owner.get().load(ktl::memory_order_relaxed); |
| DEBUG_ASSERT(value & kObjectOrStackOwnerIsStackOwnerFlag); |
| return *reinterpret_cast<StackOwnedLoanedPagesInterval*>(value & ~kObjectOrStackOwnerFlags); |
| } |
| |
| void set_stack_owner(StackOwnedLoanedPagesInterval* stack_owner) { |
| DEBUG_ASSERT(stack_owner); |
| // The stack owner shouldn't be set by the caller in situations where the/a stack owner is |
| // already set. It is expected that the field may currently be set to a VmCowPages*, but |
| // that won't have the kObjectOrStackOwnerIsStackOwnerFlag bit set due to pointer alignment. |
| DEBUG_ASSERT(!(object_or_stack_owner.get().load(ktl::memory_order_relaxed) & |
| kObjectOrStackOwnerIsStackOwnerFlag)); |
| // We use relaxed here because we're only relying on atomicity. For ordering, the PmmNode |
| // lock and PageQueues locks are relevant. For ordering of a thread joining the owned wait |
| // queue vs. deletion of the owned wait queue, the thread lock is relevant. |
| object_or_stack_owner.get().store( |
| reinterpret_cast<uintptr_t>(stack_owner) | kObjectOrStackOwnerIsStackOwnerFlag, |
| ktl::memory_order_relaxed); |
| } |
| |
| void clear_stack_owner() { clear_stack_owner_internal(nullptr); } |
| |
| void clear_stack_owner_internal(void* new_obj) { |
| // If this fires, it likely means there's an extra clear somewhere, possibly by the current |
| // thread, or possibly by a different thread. This call could be the "extra" clear if the |
| // caller didn't check whether there's a stack owner before calling. |
| DEBUG_ASSERT(is_stack_owned()); |
| while (true) { |
| uintptr_t old_value = object_or_stack_owner.get().load(ktl::memory_order_relaxed); |
| // If this fires, it likely means that some other thread did a clear (so either this |
| // thread or the other thread shouldn't have cleared). If this thread had already done a |
| // previous clear, the assert near the top would have fired instead. |
| DEBUG_ASSERT(old_value & kObjectOrStackOwnerIsStackOwnerFlag); |
| // We don't want to be acquiring SOLPI::lock here every time we free a loaned page, so we |
| // only acquire the lock if the page's StackOwnedLoanedPagesInterval has a waiter, |
| // which is much more rare. In that case we must acquire the SOLPI::lock to avoid letting |
| // this thread continue and signal and delete the StackOwnedLoanedPagesInterval until |
| // after the waiter has committed to blocking on the OwnedWaitQueue, so that the waiter |
| // can be woken and removed from the OwnedWaitQueue before the OwnedWaitQueue is deleted. |
| ktl::optional<Guard<SpinLock, IrqSave>> maybe_sollock_guard; |
| if (old_value & kObjectOrStackOwnerHasWaiter) { |
| // Go ahead and actually acquire the lock. |
| maybe_sollock_guard.emplace(&StackOwnedLoanedPagesInterval::get_lock()); |
| } |
| |
| if (object_or_stack_owner.get().compare_exchange_strong( |
| old_value, reinterpret_cast<uintptr_t>(new_obj), ktl::memory_order_relaxed)) { |
| break; |
| } |
| // ~maybe_sollock_guard will release the lock if it was acquired, then the loop retries |
| } |
| } |
| |
| bool is_stack_owned() const { |
| // This can return true for a page that was loaned fairly recently but is no longer loaned. |
| return !!(object_or_stack_owner.get().load(ktl::memory_order_relaxed) & |
| kObjectOrStackOwnerIsStackOwnerFlag); |
| } |
| |
| struct TrySetHasWaiterResult { |
| // True iff this call to try_set_has_waiter() was the first thread to set that there's a |
| // waiter. |
| bool first_setter; |
| // The stack_owner may own the page. The stack_owner can be waited on safely now that the |
| // waiter bit is set. The wait on stack_owner must occur while the calling thread is still |
| // holding the SOLPI::lock. |
| StackOwnedLoanedPagesInterval* stack_owner; |
| }; |
| // Returns a value (has_value()) iff the page has a stack_owner and the waiter bit is set. |
| // Returns an empty optional iff the page no longer has a stack_owner. |
| ktl::optional<TrySetHasWaiterResult> try_set_has_waiter() |
| TA_REQ(StackOwnedLoanedPagesInterval::get_lock()); |
| |
| // offset 0x20 |
| |
| // When object_or_stack_owner is pointing to a VmCowPages, this is the offset in the VmCowPages |
| // that contains this page. |
| // |
| // Else this field is 0. |
| // |
| // Use get_page_offset() / set_page_offset() to access it, to allow for future encoding changes. |
| uint64_t page_offset_priv; |
| |
| uint64_t get_page_offset() const { return page_offset_priv; } |
| |
| void set_page_offset(uint64_t page_offset) { page_offset_priv = page_offset; } |
| |
| // offset 0x28 |
| |
| // Identifies which queue this page is in. Accessed atomically via get_page_queue_ref(). |
| uint8_t page_queue_priv; |
| |
| ktl::atomic_ref<uint8_t> get_page_queue_ref() { |
| return ktl::atomic_ref<uint8_t>(page_queue_priv); |
| } |
| ktl::atomic_ref<const uint8_t> get_page_queue_ref() const { |
| return ktl::atomic_ref<const uint8_t>(page_queue_priv); |
| } |
| |
| // offset 0x29 |
| |
| #define VM_PAGE_OBJECT_PIN_COUNT_BITS 5 |
| #define VM_PAGE_OBJECT_MAX_PIN_COUNT ((1ul << VM_PAGE_OBJECT_PIN_COUNT_BITS) - 1) |
| uint8_t pin_count : VM_PAGE_OBJECT_PIN_COUNT_BITS; |
| |
| // Bits used by VmObjectPaged implementation of COW clones. |
| // |
| // Pages of VmObjectPaged have two "split" bits. These bits are used to track which |
| // pages in children of hidden VMOs have diverged from their parent. There are two |
| // bits, left and right, one for each child. In a hidden parent, a 1 split bit means |
| // that page in the child has diverged from the parent and the parent's page is |
| // no longer accessible to that child. |
| // |
| // It should never be the case that both split bits are set, as the page should |
| // be moved into the child instead of setting the second bit. |
| uint8_t cow_left_split : 1; |
| uint8_t cow_right_split : 1; |
| |
| // Hint for whether the page is always needed and should not be considered for reclamation |
| // under memory pressure (unless the kernel decides to override hints for some reason). |
| uint8_t always_need : 1; |
| |
| #define VM_PAGE_OBJECT_DIRTY_STATE_BITS 2 |
| #define VM_PAGE_OBJECT_MAX_DIRTY_STATES ((1u << VM_PAGE_OBJECT_DIRTY_STATE_BITS)) |
| #define VM_PAGE_OBJECT_DIRTY_STATES_MASK (VM_PAGE_OBJECT_MAX_DIRTY_STATES - 1) |
| // Tracks state used to determine whether the page is dirty and its contents need to be written |
| // back to the page source at some point, and when it has been cleaned. Used for pages backed |
| // by a user pager. The three states supported are Clean, Dirty, and AwaitingClean (more |
| // details in VmCowPages::DirtyState). |
| uint8_t dirty_state : VM_PAGE_OBJECT_DIRTY_STATE_BITS; |
| |
| uint8_t padding : 6; |
| // This struct has no type name and exists inside an unpacked parent and so it really doesn't |
| // need to have any padding. By making it packed we allow the next outer variables to use |
| // space we would have otherwise wasted in padding, without breaking alignment rules. |
| } __PACKED object; // attached to a vm object |
| struct { |
| // No fields may be added for these variants due to UB until we improve the stack ownership |
| // system, or otherwise address the current usage of object.object_or_stack_owner outside |
| // of OBJECT state. |
| } free; // free - typically in free_list_ or free_loaned_list_, unless loan_cancelled |
| struct { |
| // No fields may be added for these variants due to UB until we improve the stack ownership |
| // system, or otherwise address the current usage of object.object_or_stack_owner outside |
| // of OBJECT state. |
| } alloc; // allocated, but not yet put to any specific use |
| struct { |
| // Used by the VmTriPageStorage allocator to record the size of the item in each of the |
| // possible buckets. See it for more details. |
| uint16_t left_compress_size; |
| uint16_t mid_compress_size; |
| uint16_t right_compress_size; |
| } __PACKED zram; |
| }; |
| using object_t = decltype(object); |
| |
| // offset 0x2b |
| |
| // logically private; use |state()| and |set_state()| |
| vm_page_state state_priv; |
| |
| // offset 0x2c |
| |
| // logically private, use loaned getters and setters below. |
| static constexpr uint8_t kLoanedStateIsLoaned = 1; |
| static constexpr uint8_t kLoanedStateIsLoanCancelled = 2; |
| uint8_t loaned_state_priv; |
| |
| // This padding is inserted here to make sizeof(vm_page) a multiple of 8 and help validate that |
| // all commented offsets were indeed correct. |
| char padding_bytes[3]; |
| |
| // helper routines |
| |
| // Returns whether this page is in the FREE state. When in the FREE state the page is assumed to |
| // be owned by the relevant PmmNode, and hence unless its lock is held this query must be assumed |
| // to be racy. |
| bool is_free() const { return state() == vm_page_state::FREE; } |
| |
| // If true, this page is "loaned" in the sense of being loaned from a contiguous VMO (via |
| // decommit) to Zircon. If the original contiguous VMO is deleted, this page will no longer be |
| // loaned. A loaned page cannot be pinned. Instead a different physical page (non-loaned) is |
| // used for the pin. A loaned page can be (re-)committed back into its original contiguous VMO, |
| // which causes the data in the loaned page to be moved into a different physical page (which |
| // itself can be non-loaned or loaned). A loaned page cannot be used to allocate a new contiguous |
| // VMO. |
| bool is_loaned() const { |
| return !!(ktl::atomic_ref<const uint8_t>(loaned_state_priv).load(ktl::memory_order_relaxed) & |
| kLoanedStateIsLoaned); |
| } |
| // If true, the original contiguous VMO wants the page back. Such pages won't be re-used until |
| // the page is no longer loaned, either via commit of the page back into the contiguous VMO that |
| // loaned the page, or via deletion of the contiguous VMO that loaned the page. Such pages are |
| // not in the free_loaned_list_ in pmm, which is how re-use is prevented. |
| bool is_loan_cancelled() const { |
| return !!(ktl::atomic_ref<const uint8_t>(loaned_state_priv).load(ktl::memory_order_relaxed) & |
| kLoanedStateIsLoanCancelled); |
| } |
| // Manipulation of 'loaned' should only be done by the PmmNode under its lock whilst it is the |
| // owner of the page. |
| void set_is_loaned() { |
| ktl::atomic_ref<uint8_t>(loaned_state_priv).fetch_or(kLoanedStateIsLoaned); |
| } |
| void clear_is_loaned() { |
| ktl::atomic_ref<uint8_t>(loaned_state_priv) |
| .fetch_and(static_cast<uint8_t>(~kLoanedStateIsLoaned)); |
| } |
| |
| // Manipulation of 'loan_cancelled' should only be done by the PmmNode under its lock, but may be |
| // done when the PmmNode is not the owner of the page. |
| void set_is_loan_cancelled() { |
| ktl::atomic_ref<uint8_t>(loaned_state_priv).fetch_or(kLoanedStateIsLoanCancelled); |
| } |
| void clear_is_loan_cancelled() { |
| ktl::atomic_ref<uint8_t>(loaned_state_priv) |
| .fetch_and(static_cast<uint8_t>(~kLoanedStateIsLoanCancelled)); |
| } |
| |
| void dump() const; |
| |
| // Returns the physical address of this page. |
| // (Future plan: store it in a compressed form.) |
| paddr_t paddr() const { return paddr_priv; } |
| |
| vm_page_state state() const { |
| return ktl::atomic_ref<const vm_page_state>(state_priv).load(ktl::memory_order_relaxed); |
| } |
| |
| void set_state(vm_page_state new_state) { |
| const vm_page_state old_state = state(); |
| ktl::atomic_ref<vm_page_state>(state_priv).store(new_state, ktl::memory_order_relaxed); |
| |
| // Move one count from the old state to the new state. See comment at percpu::vm_page_counts |
| auto& p = percpu::GetCurrent(); |
| p.vm_page_counts.by_state[VmPageStateIndex(old_state)] -= 1; |
| p.vm_page_counts.by_state[VmPageStateIndex(new_state)] += 1; |
| } |
| |
| // Return the approximate number of pages in state |state|. |
| // |
| // When called concurrently with |set_state|, the count may be off by a small amount. |
| static uint64_t get_count(vm_page_state state); |
| |
| // Add |n| to the count of pages in state |state|. |
| // |
| // Should be used when first constructing pages. |
| static void add_to_initial_count(vm_page_state state, uint64_t n); |
| |
| private: |
| static constexpr uintptr_t kObjectOrStackOwnerIsStackOwnerFlag = 0x1; |
| static constexpr uintptr_t kObjectOrStackOwnerHasWaiter = 0x2; |
| static constexpr uintptr_t kObjectOrStackOwnerFlags = 0x3; |
| // Make sure the address of a StackOwnedLoanedPagesInterval will always have room for at least 2 |
| // low order bit flags. |
| static_assert(alignof(StackOwnedLoanedPagesInterval) >= kObjectOrStackOwnerFlags + 1); |
| }; |
| |
| // Provide a type alias using modern syntax to avoid clang-tidy warnings. |
| using vm_page_t = vm_page; |
| |
| // Compile-time layout checks. For each field with a commented offset in the struct, assert the |
| // expected offset and that the offset satisfies the field's natural alignment. These guard the |
| // atomic_ref<> accesses (which need naturally aligned storage) and the packed layout. |
| static_assert(offsetof(vm_page_t, queue_node) == 0x0); |
| static_assert(offsetof(vm_page_t, queue_node) % alignof(decltype(vm_page_t::queue_node)) == 0); |
| static_assert(offsetof(vm_page_t, queue_node) % alignof(list_node) == 0); |
| |
| static_assert(offsetof(vm_page_t, paddr_priv) == 0x10); |
| static_assert(offsetof(vm_page_t, paddr_priv) % alignof(decltype(vm_page_t::paddr_priv)) == 0); |
| static_assert(offsetof(vm_page_t, paddr_priv) % alignof(paddr_t) == 0); |
| |
| // InternalType is the underlying uintptr_t of the wrapper struct; the wrapper itself is packed. |
| static_assert(offsetof(vm_page_t, object.object_or_stack_owner) == 0x18); |
| static_assert(offsetof(vm_page_t, object.object_or_stack_owner) % |
| alignof(vm_page_t::object_t::object_or_stack_owner_t::InternalType) == |
| 0); |
| static_assert(offsetof(vm_page_t, object.object_or_stack_owner) % alignof(uintptr_t) == 0); |
| |
| static_assert(offsetof(vm_page_t, object.page_offset_priv) == 0x20); |
| static_assert(offsetof(vm_page_t, object.page_offset_priv) % |
| alignof(decltype(vm_page_t::object_t::page_offset_priv)) == |
| 0); |
| static_assert(offsetof(vm_page_t, object.page_offset_priv) % alignof(uint64_t) == 0); |
| |
| static_assert(offsetof(vm_page_t, object.page_queue_priv) == 0x28); |
| static_assert(offsetof(vm_page_t, object.page_queue_priv) % |
| alignof(decltype(vm_page_t::object_t::page_queue_priv)) == |
| 0); |
| static_assert(offsetof(vm_page_t, object.page_queue_priv) % alignof(uint8_t) == 0); |
| |
| // The bit-fields commented as starting at offset 0x29 cannot be checked directly, because |
| // offsetof cannot be applied to a bit-field. They are bracketed by the checks at 0x28 and 0x2b. |
| |
| static_assert(offsetof(vm_page_t, state_priv) == 0x2b); |
| static_assert(offsetof(vm_page_t, state_priv) % alignof(decltype(vm_page_t::state_priv)) == 0); |
| static_assert(offsetof(vm_page_t, state_priv) % alignof(vm_page_state) == 0); |
| |
| // loaned_state_priv is accessed through atomic_ref<uint8_t>; check its commented offset directly |
| // rather than relying only on the neighbouring fields. |
| static_assert(offsetof(vm_page_t, loaned_state_priv) == 0x2c); |
| static_assert(offsetof(vm_page_t, loaned_state_priv) % |
| alignof(decltype(vm_page_t::loaned_state_priv)) == |
| 0); |
| static_assert(offsetof(vm_page_t, loaned_state_priv) % alignof(uint8_t) == 0); |
| |
| static_assert(offsetof(vm_page_t, padding_bytes) == 0x2d); |
| |
| // assert that the page structure isn't growing uncontrollably |
| static_assert(sizeof(vm_page) == 0x30); |
| |
| // assert that |vm_page| is a POD |
| static_assert(ktl::is_trivial_v<vm_page> && ktl::is_standard_layout_v<vm_page>); |
| |
| #endif // ZIRCON_KERNEL_VM_INCLUDE_VM_PAGE_H_ |