blob: 2a448f2195c0a535b918ba65e8ddbecc13445e07 [file] [log] [blame]
// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <lib/boot-options/boot-options.h>
#include <lib/debuglog.h>
#include <lib/zircon-internal/macros.h>
#include <object/executor.h>
#include <object/memory_watchdog.h>
#include <platform/halt_helper.h>
#include <platform/halt_token.h>
#include <vm/scanner.h>
namespace {
// Maps a memory pressure level to a human-readable name for log messages. Any value that is not
// one of the five named levels yields "Unknown".
const char* PressureLevelToString(MemoryWatchdog::PressureLevel level) {
  const char* name = "Unknown";
  switch (level) {
    case MemoryWatchdog::PressureLevel::kOutOfMemory:
      name = "OutOfMemory";
      break;
    case MemoryWatchdog::PressureLevel::kImminentOutOfMemory:
      name = "ImminentOutOfMemory";
      break;
    case MemoryWatchdog::PressureLevel::kCritical:
      name = "Critical";
      break;
    case MemoryWatchdog::PressureLevel::kWarning:
      name = "Warning";
      break;
    case MemoryWatchdog::PressureLevel::kNormal:
      name = "Normal";
      break;
    default:
      break;
  }
  return name;
}
void HandleOnOomReboot() {
// Notify the pmm that although we are out of memory, we would like to never wait for memory.
// This ensures that if userspace needs to allocate to do a graceful shutdown it is able to.
pmm_stop_returning_should_wait();
if (!HaltToken::Get().Take()) {
// We failed to acquire the token. Someone else must have it. That's OK. We'll rely on them
// to halt/reboot. Nothing left for us to do but wait.
printf("memory-pressure: halt/reboot already in progress; sleeping forever\n");
Thread::Current::Sleep(ZX_TIME_INFINITE);
}
// We now have the halt token so we're committed. To ensure we record the true cause of the
// reboot, we must ensure nothing (aside from a panic) prevents us from halting with reason OOM.
// We are out of or nearly out of memory so future attempts to allocate may fail. From this
// point on, avoid performing any allocation. Establish a "no allocation allowed" scope to
// detect (assert) if we attempt to allocate.
ScopedMemoryAllocationDisabled allocation_disabled;
printf("memory-pressure: pausing for %ums after OOM mem signal\n", gBootOptions->oom_timeout_ms);
zx_status_t status =
HaltToken::Get().WaitForAck(Deadline::after(ZX_MSEC(gBootOptions->oom_timeout_ms)));
switch (status) {
case ZX_OK:
printf("memory-pressure: rebooting due to OOM. received user-mode acknowledgement.\n");
break;
case ZX_ERR_TIMED_OUT:
// TODO(fxb/91704): Update this comment once the referenced bug/work-item
// is fixed.
//
// Note: While in general, a timeout while waiting for acknowledgement
// from user mode would be an indication of something bad going on (such
// as a bug preventing the ack, or simply not enough time to allow user
// mode to shut down), it is currently expected behavior as the change to
// cause user-mode to acknowledge a kernel initiated OOM reboot has not
// landed yet.
//
// See http://fxb/91704 for details.
//
printf(
"memory-pressure: rebooting due to OOM. timed out after waiting %ums for user-mode "
"ack.\n",
gBootOptions->oom_timeout_ms);
break;
default:
printf(
"memory-pressure: rebooting due to OOM. unexpected error while waiting for user-mode "
"acknowledgement (status %d).\n",
status);
break;
}
// Tell the oom_tests host test that we are about to generate an OOM
// crashlog to keep it happy. Without these messages present in a
// specific order in the log, the test will fail.
printf("memory-pressure: stowing crashlog\nZIRCON REBOOT REASON (OOM)\n");
// The debuglog could contain diagnostic messages that would assist in debugging the cause of
// the OOM. Shutdown debuglog before rebooting in order to flush any queued messages.
//
// It is important that we don't hang during this process so set a deadline for the debuglog
// to shutdown.
//
// How long should we wait? Shutting down the debuglog includes flushing any buffered
// messages to the serial port (if present). Writing to a serial port can be slow. Assuming
// we have a full debuglog buffer of 128KB, at 115200 bps, with 8-N-1, it will take roughly
// 11.4 seconds to drain the buffer. The timeout should be long enough to allow a full DLOG
// buffer to be drained.
zx_time_t deadline = current_time() + ZX_SEC(20);
status = dlog_shutdown(deadline);
if (status != ZX_OK) {
// If `dlog_shutdown` failed, there's not much we can do besides print an error (which
// probably won't make it out anyway since we've already called `dlog_shutdown`) and
// continue on to `platform_halt`.
printf("ERROR: dlog_shutdown failed: %d\n", status);
}
platform_halt(HALT_ACTION_REBOOT, ZirconCrashReason::Oom);
}
} // namespace
// Returns the event that corresponds to the given ZX_SYSTEM_EVENT_* |kind|, or nullptr when
// |kind| is not one of the memory pressure event kinds handled here.
fbl::RefPtr<EventDispatcher> MemoryWatchdog::GetMemPressureEvent(uint32_t kind) {
  // Translate the event kind into a pressure level first; the lookup happens once at the end.
  PressureLevel level = PressureLevel::kNormal;
  switch (kind) {
    case ZX_SYSTEM_EVENT_OUT_OF_MEMORY:
      level = PressureLevel::kOutOfMemory;
      break;
    case ZX_SYSTEM_EVENT_IMMINENT_OUT_OF_MEMORY:
      level = PressureLevel::kImminentOutOfMemory;
      break;
    case ZX_SYSTEM_EVENT_MEMORY_PRESSURE_CRITICAL:
      level = PressureLevel::kCritical;
      break;
    case ZX_SYSTEM_EVENT_MEMORY_PRESSURE_WARNING:
      level = PressureLevel::kWarning;
      break;
    case ZX_SYSTEM_EVENT_MEMORY_PRESSURE_NORMAL:
      level = PressureLevel::kNormal;
      break;
    default:
      return nullptr;
  }
  return mem_pressure_events_[level];
}
// Callback used with |pmm_init_reclamation|.
// This is a very minimal save idx and signal an event as we are called under the pmm lock and must
// avoid causing any additional allocations.
void MemoryWatchdog::AvailableStateUpdatedCallback(void* context, uint8_t idx) {
MemoryWatchdog* watchdog = reinterpret_cast<MemoryWatchdog*>(context);
watchdog->AvailableStateUpdate(idx);
}
// Records the new memory availability level and wakes whoever is waiting on |mem_state_signal_|.
void MemoryWatchdog::AvailableStateUpdate(uint8_t idx) {
  const PressureLevel new_level = static_cast<PressureLevel>(idx);
  mem_event_idx_ = new_level;
  mem_state_signal_.Signal();
}
// Timer callback trampoline: |arg| is the MemoryWatchdog that armed the timer. |timer| and |now|
// are not used.
void MemoryWatchdog::EvictionTriggerCallback(Timer* timer, zx_time_t now, void* arg) {
  static_cast<MemoryWatchdog*>(arg)->EvictionTrigger();
}
// Kicks off eviction towards |min_free_target_| / |free_mem_target_| using the configured
// eviction strategy.
void MemoryWatchdog::EvictionTrigger() {
  // We are running in timer interrupt context, so synchronous eviction (which would block some
  // random thread) is not an option. Both paths below only ask the eviction thread to do the
  // actual work asynchronously.
  constexpr auto kLevel = Evictor::EvictionLevel::OnlyOldest;
  constexpr auto kOutput = Evictor::Output::Print;

  if (eviction_strategy_ != EvictionStrategy::Continuous) {
    pmm_evictor()->EvictOneShotAsynchronous(min_free_target_, free_mem_target_, kLevel, kOutput);
    return;
  }
  pmm_evictor()->EnableContinuousEviction(min_free_target_, free_mem_target_, kLevel, kOutput);
}
// Invoked by the memory pressure thread once the OOM state has been entered. Applies the policy
// selected by the |oom_behavior| boot option.
void MemoryWatchdog::OnOom() {
  switch (gBootOptions->oom_behavior) {
    case OomBehavior::kReboot:
      HandleOnOomReboot();
      break;

    case OomBehavior::kJobKill:
      if (!executor_->GetRootJobDispatcher()->KillJobWithKillOnOOM()) {
        printf("memory-pressure: no alive job has a kill bit\n");
      }
      // Killing is asynchronous, so give the system a short while to quiesce; otherwise we could
      // rapidly kill more jobs than necessary. The pause also keeps us from spinning when no
      // killable job was found, since the next attempt is unlikely to find one either.
      Thread::Current::SleepRelative(ZX_MSEC(500));
      break;
  }
}
// Decides whether a memory state change to |idx| observed at |time_now| should be signaled now.
bool MemoryWatchdog::IsSignalDue(PressureLevel idx, zx_time_t time_now) const {
  // A lower index than the one last signaled means available memory has dropped; signal right
  // away so that clients can react quickly.
  if (idx < prev_mem_event_idx_) {
    return true;
  }
  // Otherwise only signal once |hysteresis_seconds_| have passed since the state was last
  // examined.
  const auto since_last_eval = zx_time_sub_time(time_now, prev_mem_state_eval_time_);
  return since_last_eval >= hysteresis_seconds_;
}
// Decides whether asynchronous eviction should be triggered for a transition to level |idx|.
bool MemoryWatchdog::IsEvictionRequired(PressureLevel idx) const {
  // Never trigger asynchronous eviction at the OOM level: synchronous eviction has already been
  // attempted for a quick recovery before we get here. We are about to signal filesystems to shut
  // down on OOM, after which there is no pager-backed memory left and eviction would be a no-op.
  if (idx == PressureLevel::kOutOfMemory) {
    return false;
  }
  // Otherwise evict only when the state is more critical than the previously signaled one AND we
  // are configured to evict at this level.
  const bool more_critical_than_before = idx < prev_mem_event_idx_;
  const bool eviction_enabled_at_level = idx <= max_eviction_level_;
  return more_critical_than_before && eviction_enabled_at_level;
}
// Main loop of the memory pressure thread. The loop has no exit, so this function never returns.
//
// Each iteration:
//   1) If the current level is kOutOfMemory, repeatedly calls EvictOneShotSynchronous(MB * 10, ...)
//      until the level changes or a call evicts nothing, then prints diagnostics.
//   2) Takes a local snapshot of |mem_event_idx_|, escalating it to kOutOfMemory when the
//      |oom_trigger_on_alloc_failure| boot option is set and the PMM reports a failed allocation.
//   3) If IsSignalDue() says so: arms (or cancels) eviction, moves the signaled state from the
//      previous level's event to the new level's event, calls OnOom() at kOutOfMemory, and then
//      blocks until |mem_state_signal_| is signaled again. Otherwise it waits for at most
//      |hysteresis_seconds_| before re-evaluating.
void MemoryWatchdog::WorkerThread() {
  while (true) {
    // If we've hit OOM level perform some immediate synchronous eviction to attempt to avoid OOM.
    if (mem_event_idx_ == PressureLevel::kOutOfMemory) {
      printf("memory-pressure: free memory is %zuMB, evicting pages to prevent OOM...\n",
             pmm_count_free_pages() * PAGE_SIZE / MB);
      pmm_page_queues()->Dump();
      // Keep trying to perform eviction for as long as we are evicting non-zero pages and we remain
      // in the out of memory state.
      while (mem_event_idx_ == PressureLevel::kOutOfMemory) {
        uint64_t evicted_pages = pmm_evictor()->EvictOneShotSynchronous(
            MB * 10, Evictor::EvictionLevel::IncludeNewest, Evictor::Output::Print);
        if (evicted_pages == 0) {
          printf("memory-pressure: found no pages to evict\n");
          break;
        }
      }
      // Log the state after the eviction attempt to help diagnose the OOM.
      printf("memory-pressure: free memory after OOM eviction is %zuMB\n",
             pmm_count_free_pages() * PAGE_SIZE / MB);
      pmm_page_queues()->Dump();
      PageQueues::PagerCounts pager_counts = pmm_page_queues()->GetPagerQueueCounts();
      printf(
          "memory-pressure: pager-backed working set immediately after OOM eviction is: "
          "total: %zu MiB newest: %zu MiB oldest: %zu MiB\n",
          pager_counts.total * PAGE_SIZE / MB, pager_counts.newest * PAGE_SIZE / MB,
          pager_counts.oldest * PAGE_SIZE / MB);
      pmm_print_physical_page_borrowing_stats();
    }
    // Get a local copy of the atomic. It's possible by the time we read this that we've already
    // exited the last observed state, but that's fine as we don't necessarily need to signal every
    // transient state.
    PressureLevel idx = mem_event_idx_;
    // Check to see if the PMM has failed any allocations. If the PMM has ever failed to allocate
    // because it was out of memory, then escalate the pressure level to trigger an OOM response
    // immediately. The idea here is that usermode processes may not be able to handle allocation
    // failure and therefore could have become wedged in some way.
    if (gBootOptions->oom_trigger_on_alloc_failure && pmm_has_alloc_failed_no_mem()) {
      printf("memory-pressure: pmm failed one or more alloc calls, escalating to oom...\n");
      idx = PressureLevel::kOutOfMemory;
    }
    auto time_now = current_time();
    if (IsSignalDue(idx, time_now)) {
      printf("memory-pressure: memory availability state - %s\n", PressureLevelToString(idx));
      pmm_page_queues()->Dump();
      if (IsEvictionRequired(idx)) {
        // Clear any previous eviction trigger. Once Cancel completes we know that we will not race
        // with the callback and are free to update the targets. Cancel will return true if the
        // timer was canceled before it was scheduled on a cpu, i.e. an eviction was outstanding.
        bool eviction_was_outstanding = eviction_trigger_.Cancel();
        const uint64_t free_mem = pmm_count_free_pages() * PAGE_SIZE;
        // Set the minimum amount to free as half the amount required to reach our desired free
        // memory level. This minimum ensures that even if the user reduces memory in reaction to
        // this signal we will always attempt to free a bit.
        // TODO: measure and fine tune this over time as user space evolves.
        min_free_target_ = free_mem < free_mem_target_ ? (free_mem_target_ - free_mem) / 2 : 0;
        // If eviction was outstanding when we canceled the eviction trigger, trigger eviction
        // immediately without any delay. We are here because of a rapid allocation spike which
        // caused the memory pressure to become more critical in a very short interval, so it might
        // be better to evict pages as soon as possible to try and counter the allocation spike.
        // Otherwise if eviction was not outstanding, trigger the eviction for slightly in the
        // future. Half the hysteresis time here is a balance between giving user space time to
        // release memory and the eviction running before the end of the hysteresis period.
        eviction_trigger_.SetOneshot(
            (eviction_was_outstanding ? time_now
                                      : zx_time_add_duration(time_now, hysteresis_seconds_ / 2)),
            EvictionTriggerCallback, this);
        printf("memory-pressure: set target memory to evict %zuMB (free memory is %zuMB)\n",
               min_free_target_ / MB, free_mem / MB);
      } else if (eviction_strategy_ == EvictionStrategy::Continuous && idx > max_eviction_level_) {
        // If we're out of the max configured eviction-eligible memory pressure level, disable
        // continuous eviction.
        // Cancel any outstanding eviction trigger, so that eviction is not accidentally enabled
        // *after* we disable it here.
        eviction_trigger_.Cancel();
        // Disable continuous eviction.
        pmm_evictor()->DisableContinuousEviction();
      }
      // Unsignal the last event that was signaled.
      zx_status_t status =
          mem_pressure_events_[prev_mem_event_idx_]->user_signal_self(ZX_EVENT_SIGNALED, 0);
      if (status != ZX_OK) {
        panic("memory-pressure: unsignal memory event %s failed: %d\n",
              PressureLevelToString(prev_mem_event_idx_), status);
      }
      // Signal event corresponding to the new memory state.
      status = mem_pressure_events_[idx]->user_signal_self(0, ZX_EVENT_SIGNALED);
      if (status != ZX_OK) {
        panic("memory-pressure: signal memory event %s failed: %d\n", PressureLevelToString(idx),
              status);
      }
      // Remember what was signaled and when; IsSignalDue() and IsEvictionRequired() compare
      // against these on later iterations.
      prev_mem_event_idx_ = idx;
      prev_mem_state_eval_time_ = time_now;
      // If we're below the out-of-memory watermark, trigger OOM behavior.
      if (idx == PressureLevel::kOutOfMemory) {
        pmm_page_queues()->Dump();
        OnOom();
      }
      // Wait for the memory state to change again.
      mem_state_signal_.Wait(Deadline::infinite());
    } else {
      prev_mem_state_eval_time_ = time_now;
      // We are ignoring this memory state transition. Wait for only |hysteresis_seconds_|, and then
      // re-evaluate the memory state. Otherwise we could remain stuck at the lower memory state if
      // AvailableStateUpdatedCallback() is not invoked again.
      mem_state_signal_.Wait(
          Deadline::no_slack(zx_time_add_duration(time_now, hysteresis_seconds_)));
    }
  }
}
// One-time initialization. Stores |executor| and creates one event per pressure level. When the
// |oom_enabled| boot option is set it also derives the memory watermarks from the boot options,
// registers them with the pmm, and starts the memory pressure worker thread.
void MemoryWatchdog::Init(Executor* executor) {
  DEBUG_ASSERT(executor_ == nullptr);
  executor_ = executor;

  // The events are created even when OOM handling is disabled.
  for (uint8_t level_index = 0; level_index < PressureLevel::kNumLevels; ++level_index) {
    KernelHandle<EventDispatcher> event_handle;
    zx_rights_t unused_rights;
    const zx_status_t create_status = EventDispatcher::Create(0, &event_handle, &unused_rights);
    if (create_status != ZX_OK) {
      panic("memory-pressure: create memory event %s failed: %d\n",
            PressureLevelToString(static_cast<PressureLevel>(level_index)), create_status);
    }
    mem_pressure_events_[level_index] = event_handle.release();
  }

  if (!gBootOptions->oom_enabled) {
    return;
  }

  // There is one watermark fewer than there are levels.
  constexpr auto kNumWatermarks = PressureLevel::kNumLevels - 1;
  ktl::array<uint64_t, kNumWatermarks> watermarks;

  // TODO(rashaeqbal): The watermarks chosen below are arbitrary. Tune them based on memory usage
  // patterns. Consider moving to percentages of total memory instead of absolute numbers - will
  // be easier to maintain across platforms.
  watermarks[PressureLevel::kOutOfMemory] = gBootOptions->oom_out_of_memory_threshold_mb * MB;
  watermarks[PressureLevel::kImminentOutOfMemory] =
      watermarks[PressureLevel::kOutOfMemory] + gBootOptions->oom_imminent_oom_delta_mb * MB;
  watermarks[PressureLevel::kCritical] = gBootOptions->oom_critical_threshold_mb * MB;
  watermarks[PressureLevel::kWarning] = gBootOptions->oom_warning_threshold_mb * MB;
  uint64_t debounce_bytes = gBootOptions->oom_debounce_mb * MB;

  if (gBootOptions->oom_evict_at_warning) {
    max_eviction_level_ = PressureLevel::kWarning;
  }

  // Aim eviction at getting completely out of the max eviction level, debounce included.
  free_mem_target_ = watermarks[max_eviction_level_] + debounce_bytes;
  hysteresis_seconds_ = ZX_SEC(gBootOptions->oom_hysteresis_seconds);

  const zx_status_t reclaim_status =
      pmm_init_reclamation(&watermarks[PressureLevel::kOutOfMemory], kNumWatermarks,
                           debounce_bytes, this, &AvailableStateUpdatedCallback);
  if (reclaim_status != ZX_OK) {
    panic("memory-pressure: failed to initialize pmm reclamation: %d\n", reclaim_status);
  }

  // Log the resulting configuration.
  printf(
      "memory-pressure: memory watermarks - OutOfMemory: %zuMB, Critical: %zuMB, Warning: %zuMB, "
      "Debounce: %zuMB\n",
      watermarks[PressureLevel::kOutOfMemory] / MB, watermarks[PressureLevel::kCritical] / MB,
      watermarks[PressureLevel::kWarning] / MB, debounce_bytes / MB);
  printf("memory-pressure: eviction trigger level - %s\n",
         PressureLevelToString(max_eviction_level_));

  if (gBootOptions->oom_evict_continuous) {
    eviction_strategy_ = EvictionStrategy::Continuous;
    printf("memory-pressure: eviction strategy - continuous\n");
  } else {
    eviction_strategy_ = EvictionStrategy::OneShot;
    printf("memory-pressure: eviction strategy - one-shot\n");
  }

  printf("memory-pressure: hysteresis interval - %ld seconds\n", hysteresis_seconds_ / ZX_SEC(1));
  printf("memory-pressure: ImminentOutOfMemory watermark - %zuMB\n",
         watermarks[PressureLevel::kImminentOutOfMemory] / MB);

  // Thread entry point: |arg| is this MemoryWatchdog.
  auto worker_entry = [](void* arg) -> int {
    static_cast<MemoryWatchdog*>(arg)->WorkerThread();
  };
  auto worker = Thread::Create("memory-pressure-thread", worker_entry, this, HIGHEST_PRIORITY);
  DEBUG_ASSERT(worker);
  worker->Detach();
  worker->Resume();
}