blob: a98ab7664dcefda51a66df5beb72888d9eb88e43 [file] [log] [blame]
// Copyright 2021 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <assert.h>
#include <lib/fit/function.h>
#include <lib/zircon-internal/ktrace.h>
#include <stdint.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/user_copy.h>
#include <kernel/lockdep.h>
#include <kernel/mutex.h>
#include <kernel/spinlock.h>
#include <ktl/atomic.h>
#include <ktl/forward.h>
// Forward declaration of the test fixture so that KTraceState can befriend it.
// The namespace is closed immediately so that |internal| (below) is a sibling
// of |ktrace_tests| rather than being nested inside of it.
namespace ktrace_tests {
class TestKTraceState;
}  // namespace ktrace_tests
namespace internal {
class KTraceState {
// Notes on KTrace operating modes.
// KTrace can currently operate in one of two different modes, either
// "Saturate" or "Circular".
// During saturating operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then the
// buffer has become "saturated". The record is dropped, and the group mask
// is cleared, preventing new writes from occurring until the trace is
// restarted.
// During circular operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then old
// records are discarded from the trace buffer in order to make room for new
// records.
// After a rewind operation, but before starting, the buffer is effectively
// operating in saturating mode for the purposes of recording static data such
// as the names of probes and threads in the system at the start of tracing.
// Afterwards, if the trace is then started in circular mode, the KTraceState
// instance remembers the point in the buffer where the static records ended,
// and the circular portion of the buffer starts. Records from the static
// region of the trace will never be purged from the trace to make room for
// new records recorded while in circular mode.
// A trace may be started, stopped, and started again in Saturate mode any
// number of times without rewinding. Additionally, a trace which has
// previously been started in Saturate mode may subsequently be started in
// Circular mode without rewinding. All records recorded while in saturate
// mode will be part of the static region of the buffer. It is, however, not
// legal to start a trace in Circular mode, then stop it, and then attempt to
// start it again in Saturate mode.
// Selects how the trace buffer behaves when there is no room for a new record.
enum class StartMode {
  // The record is dropped and the group mask is cleared; nothing more is
  // written until the trace is restarted.
  Saturate,
  // Old records are discarded from the circular region of the buffer in order
  // to make room for new ones.
  Circular,
};
// Default construction is constexpr; the data members visible in this class
// all carry in-class initializers.
constexpr KTraceState() = default;
// The destructor is virtual because the class exposes virtual hooks
// (ReportStaticNames, ReportThreadProcessNames, CopyToUser) which test code
// overrides. Only the declaration is visible here.
virtual ~KTraceState();
// Initialize the KTraceState instance, may only be called once. Any methods
// called on a KTraceState instance after construction, but before Init,
// should behave as no-ops.
// |target_bufsize| : The target size (in bytes) of the ktrace buffer to be
// allocated. Must be a multiple of 8 bytes.
// |initial_groups| : The initial set of enabled trace groups (see
// zircon-internal/ktrace.h). If non-zero, causes Init to attempt to allocate
// the trace buffer immediately. If the allocation fails, or the initial
// group mask is zero, allocation is delayed until the first time that start
// is called.
// The caller must hold neither |lock_| nor |write_lock_| (TA_EXCL).
void Init(uint32_t target_bufsize, uint32_t initial_groups) TA_EXCL(lock_, write_lock_);
// Start tracing with the group mask |groups| in the given |mode|. See "Notes
// on KTrace operating modes" for which mode sequences are legal without a
// rewind. If the buffer has not been allocated yet, allocation is attempted
// the first time start is called. The returned status must be checked.
[[nodiscard]] zx_status_t Start(uint32_t groups, StartMode mode) TA_EXCL(lock_, write_lock_);
// Stop tracing. Per the notes on |grpmask_and_inflight_writes_|, stopping
// clears the group mask and then waits for the in-flight write count to reach
// zero. The returned status must be checked.
[[nodiscard]] zx_status_t Stop() TA_EXCL(lock_, write_lock_);
// Rewind the trace. Acquires |lock_| for the duration of the operation and
// delegates the actual work to RewindLocked(); the caller must hold neither
// |lock_| nor |write_lock_|. Returns whatever status RewindLocked() reports.
[[nodiscard]] zx_status_t Rewind() TA_EXCL(lock_, write_lock_) {
  Guard<Mutex> guard(&lock_);
  return RewindLocked();
}
// Read trace data out to a user buffer; the data is copied via the virtual
// CopyToUser() hook so that tests can redirect it to kernel memory. The
// caller must hold neither |lock_| nor |write_lock_|.
// NOTE(review): presumably |off| is a byte offset into the trace data, |len|
// the number of bytes requested, and the result a byte count or a negative
// error; confirm against the implementation, which is not visible here.
ssize_t ReadUser(void* ptr, uint32_t off, size_t len) TA_EXCL(lock_, write_lock_);
// Write a record to the tracelog.
// |args| (the record payload) must consist of all uint32_t or all uint64_t types.
// |effective_tag| : the tag to record for this record.
// |explicit_ts|   : a caller-supplied timestamp value.
// |args|          : the payload words, passed by value.
// NOTE(review): how |explicit_ts| is interpreted (e.g. whether a sentinel
// value requests "now") is not visible here; confirm against the definition.
template <typename... Args>
void WriteRecord(uint32_t effective_tag, uint64_t explicit_ts, Args... args);
// Write a record carrying the single 32-bit argument |arg|. The caller must
// not hold |write_lock_|.
// NOTE(review): presumably a header-only record; confirm against the
// definition.
void WriteRecordTiny(uint32_t tag, uint32_t arg) TA_EXCL(write_lock_);
void WriteNameEtc(uint32_t tag, uint32_t id, uint32_t arg, const char* name, bool always)
// Returns the currently active group mask, i.e. the lower 32 bits of
// |grpmask_and_inflight_writes_|, observed with acquire semantics. The
// narrowing cast discards the in-flight write count held in the upper 32 bits.
inline uint32_t grpmask() const {
  return static_cast<uint32_t>(grpmask_and_inflight_writes_.load(ktl::memory_order_acquire));
}
// Check to see if a tag is currently enabled. The one-argument overload makes
// a new observation of the group mask; this two-argument overload tests
// against a previous observation supplied in |mask|. A tag counts as enabled
// when it shares at least one set bit with the mask.
static inline bool tag_enabled(uint32_t tag, uint32_t mask) {
  const uint32_t shared_bits = tag & mask;
  return shared_bits != 0;
}
inline bool tag_enabled(uint32_t tag) const { return tag_enabled(tag, grpmask()); }
// A small RAII helper which makes sure that we don't mess up our
// in_flight_writes bookkeeping.
class AutoWriteInFlight {
explicit AutoWriteInFlight(KTraceState& ks)
: ks_(ks),
kInflightWritesInc, ktl::memory_order_acq_rel) &
~kInflightWritesMask)) {}
~AutoWriteInFlight() {
[[maybe_unused]] uint64_t prev;
prev =
ks_.grpmask_and_inflight_writes_.fetch_sub(kInflightWritesInc, ktl::memory_order_release);
DEBUG_ASSERT((prev & kInflightWritesMask) > 0);
uint32_t observed_grpmask() const { return observed_grpmask_; }
KTraceState& ks_;
const uint32_t observed_grpmask_;
// A small helper class which should make it impossible to forget to commit a
// record after a successful reservation.
class PendingCommit {
// There are only two ways to make an instance of a PendingCommit. Either
// via implicit conversion from nullptr (a failed reservation), or from a
// pointer to the start of the record, and a value for the tag which
// eventually must be committed.
PendingCommit(nullptr_t) {}
PendingCommit(void* ptr, uint32_t tag) : ptr_(ptr), tag_(tag) {}
// No copy.
PendingCommit(const PendingCommit&) = delete;
PendingCommit& operator=(const PendingCommit&) = delete;
// Yes move.
PendingCommit(PendingCommit&& other) noexcept : ptr_(other.ptr_), tag_(other.tag_) {
other.ptr_ = nullptr;
PendingCommit& operator=(PendingCommit&& other) noexcept {
ptr_ = other.ptr_;
tag_ = other.tag_;
other.ptr_ = nullptr;
return *this;
// Going out of scope is what triggers the commit.
~PendingCommit() {
if (ptr_ != nullptr) {
ktl::atomic_ref(*static_cast<uint32_t*>(ptr_)).store(tag_, ktl::memory_order_release);
// Users need access to the reserved pointer in order to fill out their
// record payload.
ktrace_header_t* hdr() const { return reinterpret_cast<ktrace_header_t*>(ptr_); }
bool is_valid() const { return (ptr_ != nullptr); }
void* ptr_{nullptr};
uint32_t tag_{0};
// The test fixture (forward declared at the top of the file) is granted
// access to internals.
friend class ktrace_tests::TestKTraceState;
// AutoWriteInFlight manipulates |grpmask_and_inflight_writes_| directly.
friend class AutoWriteInFlight;
// Produces a 32-bit value derived from |tag|: either the current CPU number
// (arch_curr_cpu_num()) or the current thread's tid narrowed to 32 bits.
// NOTE(review): the `return <condition>` line which selects between the two
// arms, and the closing brace, are missing from this copy of the file. The
// condition cannot be determined from what is visible here; restore it from
// upstream before building.
static inline uint32_t MakeTidField(uint32_t tag) {
? arch_curr_cpu_num()
: static_cast<uint32_t>(Thread::Current::Get()->tid());
// The body of Rewind(), for callers which already hold |lock_|. Rewind()
// itself simply acquires |lock_| and forwards to this method.
[[nodiscard]] zx_status_t RewindLocked() TA_REQ(lock_);
// Add static names (eg syscalls and probes) to the trace buffer. Called
// during a rewind operation immediately after resetting the trace buffer.
// Declared as virtual to facilitate testing. The caller must hold |lock_|.
virtual void ReportStaticNames() TA_REQ(lock_);
// Add the names of current live threads and processes to the trace buffer.
// Called during start operations just before setting the group mask. Declared
// as virtual to facilitate testing. The caller must hold |lock_|.
virtual void ReportThreadProcessNames() TA_REQ(lock_);
// Copy |len| bytes from kernel memory at |src| to user memory at |dst| using
// arch_copy_to_user, and return its status. Used by ReadUser, and overridden
// by test code (which needs to copy to kernel memory, not user memory).
virtual zx_status_t CopyToUser(void* dst, const void* src, size_t len) {
  return arch_copy_to_user(dst, src, len);
}
// A small printf stand-in which gives tests the ability to disable diagnostic
// printing during testing.
//
// |level| : the debug level of the message; nothing is printed unless
//           DPRINTF_ENABLED_FOR_LEVEL(level) holds.
// |fmt|   : a printf-style format string, followed by its arguments.
//
// Returns the result of vprintf when the message is printed, or 0 when
// printing is suppressed (either by |disable_diags_printfs_| or by the level).
int DiagsPrintf(int level, const char* fmt, ...) __PRINTFLIKE(3, 4) {
  if (!disable_diags_printfs_ && DPRINTF_ENABLED_FOR_LEVEL(level)) {
    va_list args;
    va_start(args, fmt);
    int result = vprintf(fmt, args);
    // Every va_start must be paired with a va_end before the function returns.
    va_end(args);
    return result;
  }
  return 0;
}
// Attempt to allocate our buffer, if we have not already done so. The caller
// must hold |lock_|. The outcome is reported as a zx_status_t.
zx_status_t AllocBuffer() TA_REQ(lock_);
// Reserve KTRACE_LEN(tag) bytes of contiguous space in the buffer, if
// possible. The result is a PendingCommit: a failed reservation is represented
// by one built from nullptr (is_valid() is false), while a successful one
// carries the record pointer and the tag to be committed later.
PendingCommit Reserve(uint32_t tag);
// Clear the group mask (the lower 32 bits of the state) with release
// semantics. AND-ing with |kInflightWritesMask| leaves the in-flight write
// count in the upper 32 bits untouched.
inline void DisableGroupMask() {
  grpmask_and_inflight_writes_.fetch_and(kInflightWritesMask, ktl::memory_order_release);
}
// Replace the group mask with |new_mask| without disturbing the in-flight
// write count. This takes two atomic steps: the old mask bits are cleared
// first (relaxed), then the new bits are OR-ed in with release semantics.
// Between the two steps an observer sees a group mask of zero.
inline void SetGroupMask(uint32_t new_mask) {
  grpmask_and_inflight_writes_.fetch_and(kInflightWritesMask, ktl::memory_order_relaxed);
  grpmask_and_inflight_writes_.fetch_or(new_mask, ktl::memory_order_release);
}
// Convert an absolute read or write pointer into an offset into the circular
// region of the buffer, computed as (ptr % circular_size_) + wrap_offset_.
// Note that it is illegal to call this if we are not operating in circular
// mode: |circular_size_| must be non-zero, which is asserted in debug builds.
// The caller must hold |write_lock_|.
uint32_t PtrToCircularOffset(uint64_t ptr) const TA_REQ(write_lock_) {
  DEBUG_ASSERT(circular_size_ > 0);
  return static_cast<uint32_t>((ptr % circular_size_) + wrap_offset_);
}
// Returns the current in-flight write count, i.e. the upper 32 bits of
// |grpmask_and_inflight_writes_|, observed with acquire semantics.
inline uint32_t inflight_writes() const {
  return static_cast<uint32_t>(
      (grpmask_and_inflight_writes_.load(ktl::memory_order_acquire) & kInflightWritesMask) >> 32);
}
// Allow diagnostic dprintf'ing or not. Overridden by test code.
bool disable_diags_printfs_{false};
// An atomic state variable which tracks the currently active group mask (in
// its lower 32 bits) and the current in-flight write count (in its upper 32
// bits).
// Write operations consist of:
// 1) Observing the group mask with acquire semantics to determine if the
// write should proceed.
// 2) Incrementing the in-flight-write count portion of the state with acq/rel
// semantics to indicate that a write operation has begun.
// 3) Completing the operation, or aborting it if the group mask has been
// disabled for this write since step #1.
// 4) Decrementing the in-flight-write count portion of the state with release
// semantics to indicate that the write is finished.
// This allows Stop operations to synchronize with any in-flight writes by:
// 1) Clearing the grpmask portion of the state with release semantics.
// 2) Spinning on the in-flight-writes portion of the mask with acquire
// semantics until an in-flight count of zero is observed.
// Adding or subtracting |kInflightWritesInc| changes the in-flight write count
// (upper 32 bits) by one without touching the group mask (lower 32 bits).
static constexpr uint64_t kInflightWritesInc = uint64_t{1} << 32;
// Selects the in-flight write count; its complement selects the group mask.
static constexpr uint64_t kInflightWritesMask = ~(kInflightWritesInc - 1);
ktl::atomic<uint64_t> grpmask_and_inflight_writes_{0};
// The target buffer size (in bytes) we would like to use, when we eventually
// call AllocBuffer. Set during the call to Init.
uint32_t target_bufsize_{0};
// A lock used to serialize all non-write operations. IOW - this lock ensures
// that only a single thread at a time may be involved in operations such as
// Start, Stop, Rewind, and ReadUser.
DECLARE_MUTEX(KTraceState) lock_;
// Whether the trace is currently started; guarded by |lock_|.
// NOTE(review): presumably set by Start and cleared by Stop; confirm against
// the implementation, which is not visible here.
bool is_started_ TA_GUARDED(lock_){false};
// The core allocation state of the trace buffer, protected by the write
// spinlock. See "Notes on KTrace operating modes" (above) for details on
// saturate vs. circular mode. This comment describes how the bookkeeping is
// maintained in each of the two modes, how wrapping is handled in circular
// mode, and how space for records in the buffer is reserved and subsequently
// committed.
// --== Saturate mode ==--
// While operating in saturate mode, the value of |circular_size_| and |rd_|
// will always be 0, and the value of |wrap_offset_| is not defined. The only
// important piece of bookkeeping maintained is the value of |wr_|. |wr_|
// always points to the offset in the buffer where the next record will be
// stored, and it should always be <= |bufsize_|. When reading back records,
// the first record will always be located at offset 0.
// --== Circular mode ==--
// When operating in circular mode, the buffer is partitioned into two
// regions; a "static" region which contains the records recorded before
// entering circular mode, and a circular region which contain records written
// after beginning circular operation. |circular_size_| must be non-zero, and
// contains the size (in bytes) of the circular region of the buffer. The
// region of the buffer from [0, wrap_offset_) is the static region of the
// buffer, while the region from [wrap_offset_, bufsize_) is the circular
// region. |wrap_offset_| must always be < |bufsize_|.
// The |rd_| and |wr_| pointers are absolute offsets into the circular region
// of the buffer, modulo |circular_size_|. When space in the buffer is
// reserved for a record, |wr_| is incremented by the size of the record.
// When a record is purged to make room for new records, |rd_| is incremented.
// At all times, |rd_| <= |wr_|, and both pointers are monotonically
// increasing. The function which maps from one of these pointers to an
// offset in the buffer (on the range [0, bufsize_)) is given by
// f(ptr) = (ptr % circular_size_) + wrap_offset_
// --== Reserving records and memory ordering ==--
// In order to write a record to the trace buffer, the writer must first
// reserve the space to do so. During this period of time, the |write_lock_|
// is held while the bookkeeping is handled in order to reserve space.
// Holding the write lock during reservation guarantees coherent observations
// of the bookkeeping state by the writers.
// If the reservation succeeds, the tag field of the reserved record is stored
// as 0 with release semantics, then the write lock is dropped in order to
// allow other reservations to take place concurrently while the payload of
// the record is populated. Once the writer has finished recording the
// payload, it must write the final tag value for the record with release
// semantics. This finalizes the record, and after this operation, the
// payload may no longer change.
// If, while operating in circular mode, an old record needs to be purged in
// order to make space for a new record, the |rd_| pointer will simply be
// incremented by the size of the record located at the |rd_| pointer. The
// tag of this record must first be read with memory order acquire semantics
// in order to compute its length so that the |rd_| pointer may be adjusted
// appropriately. If, during this observation, the value of the tag is
// observed to be 0, it means that a writer is attempting to advance the read
// pointer past a record which has not been fully committed yet. If this ever
// happens, the reservation operation fails, and the group mask will be
// cleared, just like if a reservation had failed in saturating mode.
// --== Circular mode padding ==--
// If a record of size X is to be reserved in the trace buffer while operating
// in circular mode, and the distance between the write pointer and the end of
// the buffer is too small for the record to be contained contiguously, a
// "padding" record will be inserted instead. This is a record with a group
// ID of 0 which contains no payload. Its only purpose is to pad the buffer
// out so that the record to be written may exist contiguously in the trace
// buffer.
// Spinlock held while reserving space in the buffer; guards the bookkeeping
// below.
DECLARE_SPINLOCK(KTraceState) write_lock_;
// Read pointer. Always 0 in saturate mode; in circular mode, advanced when a
// record is purged to make room for new ones.
uint64_t rd_ TA_GUARDED(write_lock_){0};
// Write pointer. In saturate mode, the buffer offset where the next record
// will be stored; in circular mode, advanced by the size of each reserved
// record.
uint64_t wr_ TA_GUARDED(write_lock_){0};
// Size (in bytes) of the circular region; 0 while in saturate mode.
uint32_t circular_size_ TA_GUARDED(write_lock_){0};
// Offset at which the circular region begins; [0, wrap_offset_) is the static
// region. Not meaningful in saturate mode.
uint32_t wrap_offset_ TA_GUARDED(write_lock_){0};
// Note: these don't _actually_ have to be protected by the write lock.
// Memory ordering consistency for mutators of these variables is protected
// via lock_, while observations from trace writers are actually protected by
// a complicated set of arguments based on the stopped/started state of the
// system, and the acq/rel semantics of the grpmask_and_inflight_writes_
// variable.
// Instead of relying on these complicated and difficult to
// communicate/enforce invariants, however, we just toss these variables into
// the write lock and leave it at that. Trace writers already needed to be
// inside of the write lock to manipulate the read/write pointers while
// reserving space. Mutation of these variables can only happen during
// start/init when the system is stopped (and there are no writers), so
// obtaining the write lock to allocate the buffer is basically free since it
// will never be contested.
// The trace buffer storage; nullptr until the buffer has been allocated.
uint8_t* buffer_ TA_GUARDED(write_lock_){nullptr};
// The size of the trace buffer; valid buffer offsets lie on [0, bufsize_).
uint32_t bufsize_ TA_GUARDED(write_lock_){0};
} // namespace internal