// Copyright 2018 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_VM_PMM_NODE_H_
#define ZIRCON_KERNEL_VM_PMM_NODE_H_
#include <fbl/canary.h>
#include <fbl/intrusive_double_list.h>
#include <kernel/event.h>
#include <kernel/lockdep.h>
#include <kernel/mutex.h>
#include <ktl/span.h>
#include <vm/compression.h>
#include <vm/loan_sweeper.h>
#include <vm/physical_page_borrowing_config.h>
#include <vm/pmm.h>
#include <vm/pmm_checker.h>
#include "pmm_arena.h"
// Per-NUMA-node collection of pmm arenas and worker threads.
class PmmNode {
public:
// This constructor may be called early in the boot sequence, so make sure it does not do any
// "real work" or depend on any globals.
PmmNode() : evictor_(this) {}
~PmmNode() = default;
DISALLOW_COPY_ASSIGN_AND_MOVE(PmmNode);
paddr_t PageToPaddr(const vm_page_t* page) TA_NO_THREAD_SAFETY_ANALYSIS;
vm_page_t* PaddrToPage(paddr_t addr) TA_NO_THREAD_SAFETY_ANALYSIS;
// main allocator routines
zx_status_t AllocPage(uint alloc_flags, vm_page_t** page, paddr_t* pa);
zx_status_t AllocPages(size_t count, uint alloc_flags, list_node* list);
zx_status_t AllocRange(paddr_t address, size_t count, list_node* list);
zx_status_t AllocContiguous(size_t count, uint alloc_flags, uint8_t alignment_log2, paddr_t* pa,
list_node* list);
void FreePage(vm_page* page);
// The list can be a combination of loaned and non-loaned pages.
void FreeList(list_node* list);
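// Example (illustrative sketch, not part of this interface): a caller might allocate a single
// page and later return it. |node| and the alloc_flags value of 0 are assumptions here, used
// only to show the shape of the calls declared above.
//
//   vm_page_t* page = nullptr;
//   paddr_t pa = 0;
//   if (node.AllocPage(0, &page, &pa) == ZX_OK) {
//     // ... use the page ...
//     node.FreePage(page);
//   }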
// Contiguous page loaning routines
void BeginLoan(list_node* page_list);
void CancelLoan(paddr_t address, size_t count);
void EndLoan(paddr_t address, size_t count, list_node* page_list);
void DeleteLender(paddr_t address, size_t count);
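// Example (illustrative sketch of the expected lifecycle, inferred only from the declarations
// above; the exact ordering requirements live with the callers): a lender would typically call
// BeginLoan() with the pages it is lending, CancelLoan() over the same physical range to stop
// further allocation from those pages, EndLoan() to take the now-free pages back, and
// DeleteLender() once the range is no longer loaned at all.
//
//   node.BeginLoan(&pages_to_lend);                    // pages become loaned and free
//   node.CancelLoan(base_paddr, page_count);           // stop handing these pages out
//   node.EndLoan(base_paddr, page_count, &reclaimed);  // collect the pages back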
// See |pmm_set_free_memory_signal|
bool SetFreeMemorySignal(uint64_t free_lower_bound, uint64_t free_upper_bound,
uint64_t delay_allocations_pages, Event* event);
zx_status_t WaitTillShouldRetrySingleAlloc(const Deadline& deadline) {
return free_pages_evt_.Wait(deadline);
}
void StopReturningShouldWait();
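// Example (illustrative sketch): a caller passing PMM_ALLOC_FLAG_CAN_WAIT might block and retry
// when this node is delaying allocations. The ZX_ERR_SHOULD_WAIT return value and the infinite
// deadline are assumptions for illustration.
//
//   vm_page_t* page = nullptr;
//   paddr_t pa = 0;
//   zx_status_t status;
//   while ((status = node.AllocPage(PMM_ALLOC_FLAG_CAN_WAIT, &page, &pa)) == ZX_ERR_SHOULD_WAIT) {
//     node.WaitTillShouldRetrySingleAlloc(Deadline::infinite());
//   }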
uint64_t CountFreePages() const;
uint64_t CountLoanedFreePages() const;
uint64_t CountLoanCancelledPages() const;
// Not actually used. Note that a loan-cancelled page is still not free, since the page can't be
// allocated in that state.
uint64_t CountLoanedNotFreePages() const;
uint64_t CountLoanedPages() const;
uint64_t CountTotalBytes() const;
// Printf the free and overall state of the internal arenas.
// NOTE: both functions skip mutexes and can be called inside a timer or crash context, though
// the data they return may be questionable.
void DumpFree() const TA_NO_THREAD_SAFETY_ANALYSIS;
void Dump(bool is_panic) const TA_NO_THREAD_SAFETY_ANALYSIS;
zx_status_t AddArena(const pmm_arena_info_t* info);
// Returns the number of active arenas.
size_t NumArenas() const {
Guard<Mutex> guard{&lock_};
return active_arenas().size();
}
// Copies |count| pmm_arena_info_t objects into |buffer| starting with the |i|-th arena ordered by
// base address. For example, passing an |i| of 1 would skip the 1st arena.
//
// The objects will be sorted in ascending order by arena base address.
//
// Returns ZX_ERR_OUT_OF_RANGE if |count| is 0, or if |i| and |count| specify an invalid range.
//
// Returns ZX_ERR_BUFFER_TOO_SMALL if the buffer is too small.
zx_status_t GetArenaInfo(size_t count, uint64_t i, pmm_arena_info_t* buffer, size_t buffer_size);
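// Example (illustrative sketch): copying info for the first two arenas, ordered by base address,
// into a caller-provided buffer.
//
//   pmm_arena_info_t info[2];
//   zx_status_t status = node.GetArenaInfo(/*count=*/2, /*i=*/0, info, sizeof(info));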
// Add new pages to the free queue. Used when bootstrapping a PmmArena.
void AddFreePages(list_node* list);
PageQueues* GetPageQueues() { return &page_queues_; }
// See |pmm_get_page_compression|
VmCompression* GetPageCompression() {
Guard<Mutex> guard{&compression_lock_};
return page_compression_.get();
}
// See |pmm_set_page_compression|.
zx_status_t SetPageCompression(fbl::RefPtr<VmCompression> compression);
// Fill all free pages (both non-loaned and loaned) with a pattern and arm the checker. See
// |PmmChecker|.
//
// This is a no-op if the checker is not enabled. See |EnableFreePageFilling|
void FillFreePagesAndArm();
// Synchronously walk the PMM's free list (and free loaned list) and validate each page. This is
// an incredibly expensive operation and should only be used for debugging purposes.
void CheckAllFreePages();
#if __has_feature(address_sanitizer)
// Synchronously walk the PMM's free list (and free loaned list) and poison each page.
void PoisonAllFreePages();
#endif
// Enable the free fill checker with the specified fill size and action, and begin filling freed
// pages (including freed loaned pages) going forward. See |PmmChecker| for definition of fill
// size.
//
// Note, pages freed prior to calling this method will remain unfilled. To fill them, call
// |FillFreePagesAndArm|.
//
// Returns |true| if the checker was enabled with the requested fill_size, or |false| otherwise.
bool EnableFreePageFilling(size_t fill_size, CheckFailAction action);
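// Example (illustrative sketch): enabling the checker and then patterning the pages that were
// already free. The fill size and the CheckFailAction value shown here are placeholder
// assumptions, not values defined by this header.
//
//   if (node.EnableFreePageFilling(/*fill_size=*/PAGE_SIZE, CheckFailAction::kPanic)) {
//     node.FillFreePagesAndArm();
//   }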
// Return a pointer to this object's free fill checker.
//
// For test and diagnostic purposes.
PmmChecker* Checker() { return &checker_; }
static int64_t get_alloc_failed_count();
// See |pmm_has_alloc_failed_no_mem|.
static bool has_alloc_failed_no_mem();
Evictor* GetEvictor() { return &evictor_; }
// If randomly waiting on allocations is enabled, this re-seeds from the global PRNG; otherwise
// it does nothing.
void SeedRandomShouldWait();
// Tell this PmmNode that we've failed a user-visible allocation. Calling this method will
// (optionally) trigger an asynchronous OOM response.
void ReportAllocFailure() TA_EXCL(lock_);
private:
void FreePageHelperLocked(vm_page* page, bool already_filled) TA_REQ(lock_);
void FreeListLocked(list_node* list, bool already_filled) TA_REQ(lock_);
void SignalFreeMemoryChangeLocked() TA_REQ(lock_);
void TripFreePagesLevelLocked() TA_REQ(lock_);
void UpdateMemAvailStateLocked() TA_REQ(lock_);
void SetMemAvailStateLocked(uint8_t mem_avail_state) TA_REQ(lock_);
void IncrementFreeCountLocked(uint64_t amount) TA_REQ(lock_) {
free_count_.fetch_add(amount, ktl::memory_order_relaxed);
if (mem_signal_ && free_count_.load(ktl::memory_order_relaxed) > mem_signal_upper_bound_) {
SignalFreeMemoryChangeLocked();
}
}
void DecrementFreeCountLocked(uint64_t amount) TA_REQ(lock_) {
[[maybe_unused]] uint64_t count = free_count_.fetch_sub(amount, ktl::memory_order_relaxed);
DEBUG_ASSERT(count >= amount);
if (should_wait_ == ShouldWaitState::OnceLevelTripped &&
free_count_.load(ktl::memory_order_relaxed) < should_wait_free_pages_level_) {
TripFreePagesLevelLocked();
}
if (mem_signal_ && free_count_.load(ktl::memory_order_relaxed) < mem_signal_lower_bound_) {
SignalFreeMemoryChangeLocked();
}
}
void IncrementFreeLoanedCountLocked(uint64_t amount) TA_REQ(lock_) {
free_loaned_count_.fetch_add(amount, ktl::memory_order_relaxed);
}
void DecrementFreeLoanedCountLocked(uint64_t amount) TA_REQ(lock_) {
DEBUG_ASSERT(free_loaned_count_.load(ktl::memory_order_relaxed) >= amount);
free_loaned_count_.fetch_sub(amount, ktl::memory_order_relaxed);
}
void IncrementLoanedCountLocked(uint64_t amount) TA_REQ(lock_) {
loaned_count_.fetch_add(amount, ktl::memory_order_relaxed);
}
void DecrementLoanedCountLocked(uint64_t amount) TA_REQ(lock_) {
DEBUG_ASSERT(loaned_count_.load(ktl::memory_order_relaxed) >= amount);
loaned_count_.fetch_sub(amount, ktl::memory_order_relaxed);
}
void IncrementLoanCancelledCountLocked(uint64_t amount) TA_REQ(lock_) {
loan_cancelled_count_.fetch_add(amount, ktl::memory_order_relaxed);
}
void DecrementLoanCancelledCountLocked(uint64_t amount) TA_REQ(lock_) {
DEBUG_ASSERT(loan_cancelled_count_.load(ktl::memory_order_relaxed) >= amount);
loan_cancelled_count_.fetch_sub(amount, ktl::memory_order_relaxed);
}
bool ShouldDelayAllocationLocked() TA_REQ(lock_);
void AllocPageHelperLocked(vm_page_t* page) TA_REQ(lock_);
template <typename F>
void ForPagesInPhysRangeLocked(paddr_t start, size_t count, F func) TA_REQ(lock_);
// This method should be called when the PMM fails to allocate in a user-visible way and will
// (optionally) trigger an asynchronous OOM response.
void ReportAllocFailureLocked() TA_REQ(lock_);
fbl::Canary<fbl::magic("PNOD")> canary_;
mutable DECLARE_MUTEX(PmmNode) lock_;
uint64_t arena_cumulative_size_ TA_GUARDED(lock_) = 0;
// This is both an atomic and guarded by lock_: modifications must take the lock, since logic in
// the system relies on free_count_ not changing whilst the lock is held, but it is also an atomic
// so that it can be correctly read without the lock.
ktl::atomic<uint64_t> free_count_ TA_GUARDED(lock_) = 0;
ktl::atomic<uint64_t> free_loaned_count_ TA_GUARDED(lock_) = 0;
ktl::atomic<uint64_t> loaned_count_ TA_GUARDED(lock_) = 0;
ktl::atomic<uint64_t> loan_cancelled_count_ TA_GUARDED(lock_) = 0;
// Free pages where !loaned.
list_node free_list_ TA_GUARDED(lock_) = LIST_INITIAL_VALUE(free_list_);
// Free pages where loaned && !loan_cancelled.
list_node free_loaned_list_ TA_GUARDED(lock_) = LIST_INITIAL_VALUE(free_loaned_list_);
// Controls the behavior of requests that have the PMM_ALLOC_FLAG_CAN_WAIT flag set.
enum class ShouldWaitState {
// PMM_ALLOC_FLAG_CAN_WAIT is never honored; we always attempt to perform the allocation
// immediately, or fail with ZX_ERR_NO_MEMORY. This state is permanent and cannot be left.
Never,
// Allocations do not currently need to be delayed, but should_wait_free_pages_level_ should be
// monitored; once the level is tripped, allocations should be delayed.
OnceLevelTripped,
// This state indicates that the level has been tripped, and we should delay any allocations
// until the level is reset.
UntilReset,
};
ShouldWaitState should_wait_ TA_GUARDED(lock_) = ShouldWaitState::OnceLevelTripped;
// Below this number of free pages the PMM will transition into delaying allocations.
uint64_t should_wait_free_pages_level_ TA_GUARDED(lock_) = 0;
// This event is signaled whenever allocations are allowed to happen based on the |should_wait_|
// state. Whenever in the |UntilReset| state, this event will be Unsignaled, causing waiters to
// block.
Event free_pages_evt_;
// If mem_signal_ is not null, then once the available free memory falls outside of the defined
// lower and upper bounds, the signal is raised. This is a one-shot signal and is cleared after
// firing.
Event* mem_signal_ TA_GUARDED(lock_) = nullptr;
uint64_t mem_signal_lower_bound_ TA_GUARDED(lock_) = 0;
uint64_t mem_signal_upper_bound_ TA_GUARDED(lock_) = 0;
PageQueues page_queues_;
Evictor evictor_;
// page_compression_ is a lazily initialized RefPtr to keep the PmmNode constructor simple, at
// the cost of needing to hold a lock to read the RefPtr. To avoid unnecessarily contending on
// the main pmm lock_, a separate lock is used.
DECLARE_MUTEX(PmmNode) compression_lock_;
fbl::RefPtr<VmCompression> page_compression_ TA_GUARDED(compression_lock_);
// Indicates whether pages should have a pattern filled into them when they are freed. This value
// can only transition from false->true, and never back to false again. Once this value is set,
// the fill size in checker_ may no longer be changed, and it becomes safe to call FillPattern
// even without the lock held.
// This is an atomic to allow for reading this outside of the lock, but modifications only happen
// with the lock held.
ktl::atomic<bool> free_fill_enabled_ TA_GUARDED(lock_) = false;
// Indicates whether it is known that all pages in the free list have had a pattern filled into
// them. This value can only transition from false->true, and never back to false again. Once this
// value is set the action and armed state in checker_ may no longer be changed, and it becomes
// safe to call AssertPattern even without the lock held.
bool all_free_pages_filled_ TA_GUARDED(lock_) = false;
PmmChecker checker_;
// This method is racy as it allows us to read free_fill_enabled_ without holding the lock. If we
// receive a value of 'true', then as there is no mechanism to re-set it to false, we know it is
// still true. If we receive the value of 'false', then it could still become 'true' later.
// The intent of this method is to allow for filling the free pattern outside of the lock in most
// cases, and in the unlikely event of a race during the checker being armed, the pattern can
// resort to being filled inside the lock.
bool IsFreeFillEnabledRacy() const TA_NO_THREAD_SAFETY_ANALYSIS {
// Read with acquire semantics to ensure that any modifications to checker_ are visible before
// changes to free_fill_enabled_. See EnableFreePageFilling for where the release is performed.
return free_fill_enabled_.load(ktl::memory_order_acquire);
}
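// Example (illustrative sketch of the intended pattern described above; the exact free path is
// an assumption): fill the pattern outside the lock when the racy read says filling is enabled,
// and let the locked free path handle the fill otherwise.
//
//   bool filled = false;
//   if (IsFreeFillEnabledRacy()) {
//     checker_.FillPattern(page);  // safe without the lock once free_fill_enabled_ is set
//     filled = true;
//   }
//   Guard<Mutex> guard{&lock_};
//   FreePageHelperLocked(page, filled);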
// The rng state for random waiting on allocations. This allows us to use rand_r, which requires
// no further thread synchronization, unlike rand().
uintptr_t random_should_wait_seed_ TA_GUARDED(lock_) = 0;
// Arenas are allocated from the node itself to avoid any boot allocations. Walking linearly
// through them at run time should also be fairly efficient.
static const size_t kArenaCount = 16;
size_t used_arena_count_ TA_GUARDED(lock_) = 0;
PmmArena arenas_[kArenaCount] TA_GUARDED(lock_);
// Return the span of arenas from the built-in array that are known to be active. Used in loops
// that iterate across all arenas.
ktl::span<PmmArena> active_arenas() TA_REQ(lock_) {
return ktl::span<PmmArena>(arenas_, used_arena_count_);
}
ktl::span<const PmmArena> active_arenas() const TA_REQ(lock_) {
return ktl::span<const PmmArena>(arenas_, used_arena_count_);
}
};
// We don't need to hold the arena lock while executing this, since it only
// accesses values that are set once during system initialization.
inline vm_page_t* PmmNode::PaddrToPage(paddr_t addr) TA_NO_THREAD_SAFETY_ANALYSIS {
for (auto& a : active_arenas()) {
if (a.address_in_arena(addr)) {
size_t index = (addr - a.base()) / PAGE_SIZE;
return a.get_page(index);
}
}
return nullptr;
}
#endif // ZIRCON_KERNEL_VM_PMM_NODE_H_