// blob: 791022aabe2bf7c6215a9484cea75a2115aa437d
// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_LIB_LOCKUP_DETECTOR_INCLUDE_LIB_LOCKUP_DETECTOR_STATE_H_
#define ZIRCON_KERNEL_LIB_LOCKUP_DETECTOR_INCLUDE_LIB_LOCKUP_DETECTOR_STATE_H_
#include <kernel/align.h>
#include <kernel/cpu.h>
#include <kernel/event_limiter.h>
#include <ktl/atomic.h>
// Lockup detector bookkeeping, one instance per CPU.
struct __CPU_ALIGN LockupDetectorState {
  /////////////////////////////////////////////////////////////////////////////
  //
  // State common to all of the per-cpu lockup detector conditions
  //
  /////////////////////////////////////////////////////////////////////////////

  // ID of the CPU which is currently checking this CPU's conditions, or
  // INVALID_CPU when no CPU is doing so.  It exists so that several CPUs do
  // not recognize and report the same condition on the same CPU concurrently.
  //
  // TODO(johngro): This really should just be a spinlock which is only ever
  // try-locked during the check.  If spinlocks had a trylock operation which
  // was compatible with clang static analysis, we could make use of it here
  // along with static annotations to catch mistakes.
  ktl::atomic<cpu_num_t> current_checker_id{INVALID_CPU};

  /////////////////////////////////////////////////////////////////////////////
  //
  // State used to detect the "heartbeat" condition.
  //
  /////////////////////////////////////////////////////////////////////////////
  struct {
    // Set while this CPU takes part in the heartbeat mechanism.  In other
    // words, the CPU is periodically recording in |last_heartbeat| that it is
    // still running and, having done so, is checking on its peer CPUs.
    //
    // TODO(johngro): Should we merge this field with |last_heartbeat|?  Seems
    // like we should be able to use a sentinel value, such as 0, or
    // ZX_TIME_INFINITE to indicate that the mechanism is disabled.
    ktl::atomic<bool> active{false};

    // Time of this CPU's most recent check-in.
    ktl::atomic<zx_time_t> last_heartbeat{0};

    // Largest gap between |last_heartbeat| and "now" that any checker has ever
    // observed.  Writes to this field are "protected" by the exclusive role of
    // "checker".
    ktl::atomic<zx_duration_t> max_gap{0};

    // Limits the rate at which heartbeat failures are reported.  "Protected"
    // by the exclusive role of "checker".
    EventLimiter<ZX_SEC(1)> alert_limiter;
  } heartbeat;

  /////////////////////////////////////////////////////////////////////////////
  //
  // State used to detect the "critical section" condition.
  //
  /////////////////////////////////////////////////////////////////////////////
  struct {
    // Nesting depth of critical sections.  Critical sections may nest, so
    // lockup_timed_begin and lockup_timed_end (called as code enters and exits
    // critical sections) have to track how deep they are.  The only code which
    // touches this variable is the code entering and exiting the CS, and it
    // always does so on the same CPU, so the variable does not need to be
    // atomic.  However, an interrupt may fire while a thread is entering a
    // critical section, and the interrupt handler may itself enter a critical
    // section.  Accesses must therefore use compiler fences to ensure that
    // compiler reordering does not lead to problems.
    //
    // Accessed only by this CPU.
    uint32_t depth{0};

    // Name of the critical section which is currently active, if any.  May be
    // nullptr.
    //
    // Accessed by both this CPU and observers.
    ktl::atomic<const char*> name{nullptr};

    // Worst case CS time ever observed by the critical section thread as it
    // exits the critical section.  Only the CPU exiting the critical section
    // writes this variable, but other CPUs read it during the heartbeat sanity
    // checks.  Although the thread exiting the critical section reports the
    // worst case time via this variable, only the threads performing heartbeat
    // sanity checks will ever report issues (via an OOPS) as a result of a new
    // worst case value.
    //
    // Accessed by both this CPU and observers.
    ktl::atomic<zx_ticks_t> worst_case_ticks{0};

    // Time (tick count) at which the CPU entered the critical section.
    //
    // This field is what establishes Release-Acquire ordering of the changes
    // made by critical section threads and observed by observers.
    //
    // Accessed by both this CPU and observers.
    ktl::atomic<zx_ticks_t> begin_ticks{0};

    // De-duplication state for critical section lockup events, used for the
    // purposes of updating kcounters.
    //
    // Accessed only by observers.
    zx_ticks_t last_counted_begin_ticks{0};

    // Largest worst case value ever _reported_ by a heartbeat checker.  Only
    // the current checker ever uses this variable, and the acquire/release
    // semantics of the current_checker_id variable should ensure that it is
    // coherent on architectures with weak memory ordering.
    //
    // Accessed only by observers.
    zx_ticks_t reported_worst_case_ticks{0};

    // Rate limiter for the warnings printed about ongoing critical section
    // times (e.g. CPUs which enter critical sections, but don't exit them for
    // so long that a heartbeat checker notices them).
    //
    // Accessed only by observers.
    EventLimiter<ZX_SEC(1)> ongoing_call_alert_limiter;

    // Rate limiter for the warnings printed when the heartbeat monitor notices
    // new, unreported, worst case values.
    //
    // Accessed only by observers.
    EventLimiter<ZX_SEC(1)> worst_case_alert_limiter;
  } critical_section;
};
// Global table of lockup detector state holding SMP_MAX_CPUS entries, each a
// LockupDetectorState.  This is only a declaration; the definition lives
// elsewhere.
// NOTE(review): presumably indexed by cpu_num_t -- confirm against users.
extern LockupDetectorState gLockupDetectorPerCpuState[SMP_MAX_CPUS];
#endif // ZIRCON_KERNEL_LIB_LOCKUP_DETECTOR_INCLUDE_LIB_LOCKUP_DETECTOR_STATE_H_