// Copyright 2021 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_
#define ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_
#include <assert.h>
#include <lib/fit/function.h>
#include <lib/fxt/interned_category.h>
#include <lib/fxt/serializer.h>
#include <lib/user_copy/user_ptr.h>
#include <lib/zircon-internal/ktrace.h>
#include <lib/zx/result.h>
#include <stdint.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/user_copy.h>
#include <kernel/lockdep.h>
#include <kernel/mutex.h>
#include <kernel/spinlock.h>
#include <ktl/atomic.h>
#include <ktl/forward.h>
#include <ktl/move.h>
// Fwd decl of tests to allow friendship.
namespace ktrace_tests {
class TestKTraceState;
}
namespace internal {
class KTraceState {
public:
////////////////////////////////////////////////////////////////
//
// Notes on KTrace operating modes.
//
// KTrace can currently operate in one of two different modes, either
// "Saturate" or "Circular".
//
// During saturating operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then the
// buffer has become "saturated". The record is dropped, and the group mask
// is cleared, preventing new writes from occurring until the trace is
// restarted.
//
// During circular operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then old
// records are discarded from the trace buffer in order to make room for new
// records.
//
// After a rewind operation, but before starting, the buffer is effectively
// operating in saturating mode for the purposes of recording static data such
// as the names of probes and threads in the system at the start of tracing.
// Afterwards, if the trace is then started in circular mode, the KTraceState
// instance remembers the point in the buffer where the static records ended,
// and the circular portion of the buffer starts. Records from the static
// region of the trace will never be purged from the trace to make room for
// new records recorded while in circular mode.
//
// A trace may be started, stopped, and started again in Saturate mode any
// number of times without rewinding. Additionally, a trace which has
// previously been started in Saturate mode may subsequently be started in
// Circular mode without rewinding. All records recorded while in saturate
// mode will be part of the static region of the buffer. It is, however, not
// legal to start a trace in Circular mode, then stop it, and then attempt to
// start it again in Saturate mode.
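//
// A sketch of a legal lifecycle (illustrative only; |ks| is a hypothetical
// instance, the group masks and buffer size are arbitrary, and error
// handling is omitted):
//
//   ks.Init(64u * 1024, 0);                    // allocate lazily; no groups yet
//   st = ks.Start(0x7, StartMode::Saturate);   // OK
//   st = ks.Stop();                            // OK
//   st = ks.Start(0x7, StartMode::Circular);   // OK: Saturate -> Circular
//   st = ks.Stop();                            // OK
//   st = ks.Start(0x7, StartMode::Saturate);   // Illegal without a Rewind first.
//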
enum class StartMode { Saturate, Circular };
constexpr KTraceState() = default;
virtual ~KTraceState();
// Initialize the KTraceState instance; may only be called once. Any methods
// called on a KTraceState instance after construction, but before Init,
// should behave as no-ops.
//
// |target_bufsize| : The target size (in bytes) of the ktrace buffer to be
// allocated. Must be a multiple of 8 bytes.
//
// |initial_groups| : The initial set of enabled trace groups (see
// zircon-internal/ktrace.h). If non-zero, causes Init to attempt to allocate
// the trace buffer immediately. If the allocation fails, or the initial
// group mask is zero, allocation is delayed until the first time that start
// is called.
//
void Init(uint32_t target_bufsize, uint32_t initial_groups) TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Start(uint32_t groups, StartMode mode) TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Stop() TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Rewind() TA_EXCL(lock_, write_lock_) {
Guard<Mutex> guard(&lock_);
return RewindLocked();
}
ssize_t ReadUser(user_out_ptr<void> ptr, uint32_t off, size_t len) TA_EXCL(lock_, write_lock_);
uint32_t grpmask() const {
return static_cast<uint32_t>(grpmask_.load(ktl::memory_order_acquire));
}
bool IsCategoryEnabled(const fxt::InternedCategory& category) const {
const uint32_t bit_number = category.GetBit();
const uint32_t bitmask =
bit_number != fxt::InternedCategory::kInvalidBitNumber ? 1u << bit_number : 0;
return (bitmask & grpmask()) != 0;
}
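// For example (illustrative values): a category interned with bit number 3
// yields a bitmask of (1u << 3) == 0x8, so IsCategoryEnabled returns true iff
// bit 3 is currently set in grpmask(); a category whose bit number is
// kInvalidBitNumber is never enabled.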
// Atomically increments the in-flight-writes count and returns true iff
// writes are enabled.
//
// Returns false if the count was not incremented because writes are not
// enabled.
[[nodiscard]] bool IncPendingWrite() {
uint64_t desired;
uint64_t expected = write_state_.load(ktl::memory_order_relaxed);
do {
// Are writes enabled?
if ((expected & kWritesEnabledMask) == 0) {
return false;
}
desired = expected + 1;
} while (!write_state_.compare_exchange_weak(expected, desired, ktl::memory_order_acq_rel,
ktl::memory_order_relaxed));
// Assert that the in-flight count did not overflow into the writes-enabled bit.
DEBUG_ASSERT((desired & kWritesInFlightMask) > 0);
return true;
}
void DecPendingWrite() {
[[maybe_unused]] uint64_t previous_value = write_state_.fetch_sub(1, ktl::memory_order_release);
DEBUG_ASSERT((previous_value & kWritesInFlightMask) > 0);
}
// An RAII type that implements the FXT Writer protocol and automatically
// commits the record after a successful reservation.
class PendingCommit {
public:
PendingCommit(uint64_t* ptr, uint64_t header, KTraceState* ks)
: ptr_{ptr}, header_{header}, ks_{ks} {}
// No copy.
PendingCommit(const PendingCommit&) = delete;
PendingCommit& operator=(const PendingCommit&) = delete;
// Yes move.
PendingCommit(PendingCommit&& other) noexcept { *this = ktl::move(other); }
PendingCommit& operator=(PendingCommit&& other) noexcept {
ptr_ = other.ptr_;
header_ = other.header_;
ks_ = other.ks_;
// Also transfer the write cursor so that a partially written record remains
// consistent after a move.
word_offset_ = other.word_offset_;
other.ptr_ = nullptr;
other.ks_ = nullptr;
return *this;
}
// Going out of scope is what triggers the commit.
~PendingCommit() {
if (ptr_ != nullptr) {
ktl::atomic_ref(*ptr_).store(header_, ktl::memory_order_release);
ks_->DecPendingWrite();
}
}
void WriteWord(uint64_t word) {
ptr_[word_offset_] = word;
word_offset_++;
}
void WriteBytes(const void* bytes, size_t num_bytes) {
const size_t num_words = (num_bytes + 7) / 8;
// Write 0 to the last word to cover any padding bytes.
ptr_[word_offset_ + num_words - 1] = 0;
memcpy(&ptr_[word_offset_], bytes, num_bytes);
word_offset_ += num_words;
}
// No-op: the actual commit (publishing the header word) happens in the
// destructor.
void Commit() {}
private:
// Starts at 1 because word 0 is reserved for the record header, which is
// written when the record is committed.
size_t word_offset_{1};
uint64_t* ptr_{nullptr};
uint64_t header_{0};
KTraceState* ks_{nullptr};
};
// Reserve enough bytes of contiguous space in the buffer to fit the FXT Record described by
// `header`, if possible.
zx::result<PendingCommit> Reserve(uint64_t header) {
if (!IncPendingWrite()) {
return zx::error(ZX_ERR_BAD_STATE);
}
uint64_t* const ptr = ReserveRaw(fxt::RecordFields::RecordSize::Get<uint32_t>(header));
if (ptr == nullptr) {
ClearMaskDisableWrites();
DecPendingWrite();
return zx::error(ZX_ERR_NO_MEMORY);
}
return zx::ok(PendingCommit(ptr, header, this));
}
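// A sketch of the intended write path (illustrative only; construction of
// |header| via the fxt library is elided, and |ks| is a hypothetical
// instance):
//
//   if (zx::result<PendingCommit> res = ks.Reserve(header); res.is_ok()) {
//     res->WriteWord(payload_word);
//     res->WriteBytes(name, name_len);
//   }  // |res| goes out of scope here, publishing the header with release
//      // semantics and decrementing the in-flight write count.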
private:
friend class ktrace_tests::TestKTraceState;
[[nodiscard]] zx_status_t RewindLocked() TA_REQ(lock_);
// Add static names (e.g., syscalls and probes) to the trace buffer. Called
// during a rewind operation immediately after resetting the trace buffer.
// Declared as virtual to facilitate testing.
virtual void ReportStaticNames() TA_REQ(lock_);
// Add the names of current live threads and processes to the trace buffer.
// Called during start operations just before setting the group mask. Declared
// as virtual to facilitate testing.
virtual void ReportThreadProcessNames() TA_REQ(lock_);
// Copy data from kernel memory to user memory. Used by ReadUser, and
// overridden by test code (which needs to copy to kernel memory, not user
// memory).
virtual zx_status_t CopyToUser(user_out_ptr<uint8_t> dst, const uint8_t* src, size_t len) {
return dst.copy_array_to_user(src, len);
}
// A small printf stand-in which gives tests the ability to disable
// diagnostic printing.
int DiagsPrintf(int level, const char* fmt, ...) const __PRINTFLIKE(3, 4) {
if (!disable_diags_printfs_ && DPRINTF_ENABLED_FOR_LEVEL(level)) {
va_list args;
va_start(args, fmt);
int result = vprintf(fmt, args);
va_end(args);
return result;
}
return 0;
}
// Attempt to allocate our buffer, if we have not already done so.
zx_status_t AllocBuffer() TA_REQ(lock_);
// Reserve the given number of words in the trace buffer. Returns nullptr if the reservation
// fails.
uint64_t* ReserveRaw(uint32_t num_words);
// Set the group mask, but don't modify the writes-enable state.
void SetGroupMask(uint32_t new_mask) { grpmask_.store(new_mask, ktl::memory_order_release); }
// Enable writes, but don't modify the group mask.
void EnableWrites() { write_state_.fetch_or(kWritesEnabledMask, ktl::memory_order_release); }
// Clear the group mask and disable writes.
void ClearMaskDisableWrites() {
grpmask_.store(0, ktl::memory_order_release);
write_state_.fetch_and(~kWritesEnabledMask, ktl::memory_order_release);
}
// Convert an absolute read or write pointer into an offset into the circular
// region of the buffer. Note that it is illegal to call this if we are not
// operating in circular mode.
uint32_t PtrToCircularOffset(uint64_t ptr) const TA_REQ(write_lock_) {
DEBUG_ASSERT(circular_size_ > 0);
return static_cast<uint32_t>((ptr % circular_size_) + wrap_offset_);
}
uint32_t inflight_writes() const {
return static_cast<uint32_t>(write_state_.load(ktl::memory_order_acquire) &
kWritesInFlightMask);
}
// Allow diagnostic dprintf'ing or not. Overridden by test code.
bool disable_diags_printfs_{false};
// An atomic state variable which tracks whether writes are enabled (bit 63)
// and the number of writes in-flight (bits 0-62).
//
// Write operations consist of:
//
// 1) Optionally observing the group mask with acquire semantics to determine
// if the category is enabled for this write.
// 2) Atomically checking whether writes are enabled and incrementing the
// in-flight count with acq/rel semantics.
// 3) Completing or aborting the write operation.
// 4) Decrementing the in-flight count portion of the state with release
// semantics to indicate that the write is finished.
//
// This allows Stop operations to synchronize with any in-flight writes by:
//
// 1) Clearing the writes-enabled bit with release semantics.
// 2) Spinning on the in-flight-writes with acquire semantics until an
// in-flight count of zero is observed.
//
// Notes:
//
// * Once a writer has incremented the in-flight count, they must also
// decrement in a reasonable amount of time to ensure a trace can be stopped.
//
// * The algorithm above has a race (the ABA problem) where a writer may end
// up writing a record of a category that's not enabled. Consumers of the
// data are expected to handle finding unexpected records in the trace buffer.
// The race goes like this: Category A is enabled and a writer is attempting
// to emit a record for category A. The writer checks the group mask (step
// 1), sees that A is enabled and proceeds to step 2. Prior to executing step
// 2, a different thread stops the trace, disabling writes and clearing the
// group mask. A new trace is started with only category B enabled. The
// writer resumes step 2 and atomically checks that writes are enabled,
// increments the in-flight count and proceeds to write a category A record.
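//
// A sketch of the Stop-side synchronization described above (illustrative
// only; the real logic lives in the implementation, and the spin-relax
// primitive is an assumption):
//
//   ClearMaskDisableWrites();          // 1) clear writes-enabled (release)
//   while (inflight_writes() != 0) {   // 2) wait for writers to drain (acquire)
//     arch::Yield();                   //    hypothetical CPU-relax primitive
//   }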
// Bit-wise AND with |write_state_| to read writes-enabled.
static constexpr uint64_t kWritesEnabledMask = 1ul << 63;
// Bit-wise AND with |write_state_| to read in-flight-writes count.
static constexpr uint64_t kWritesInFlightMask = ~kWritesEnabledMask;
ktl::atomic<uint64_t> write_state_{0};
ktl::atomic<uint32_t> grpmask_{0};
// The target buffer size (in bytes) we would like to use, when we eventually
// call AllocBuffer. Set during the call to Init.
uint32_t target_bufsize_{0};
// A lock used to serialize all non-write operations. In other words, this
// lock ensures that only a single thread at a time may be involved in
// operations such as Start, Stop, Rewind, and ReadUser.
DECLARE_MUTEX(KTraceState) lock_;
bool is_started_ TA_GUARDED(lock_){false};
// The core allocation state of the trace buffer, protected by the write
// spinlock. See "Notes on KTrace operating modes" (above) for details on
// saturate vs. circular mode. This comment will describe how the bookkeeping
// maintained in each of the two modes, how wrapping is handled in circular
// mode, and how space for records in the buffer is reserved and subsequently
// committed.
//
// --== Saturate mode ==--
//
// While operating in saturate mode, the value of |circular_size_| and |rd_|
// will always be 0, and the value of |wrap_offset_| is not defined. The only
// important piece of bookkeeping maintained is the value of |wr_|. |wr_|
// always points to the offset in the buffer where the next record will be
// stored, and it should always be <= |bufsize_|. When reading back records,
// the first record will always be located at offset 0.
//
// --== Circular mode ==--
//
// When operating in circular mode, the buffer is partitioned into two
// regions: a "static" region which contains the records recorded before
// entering circular mode, and a circular region which contains records written
// after beginning circular operation. |circular_size_| must be non-zero, and
// contains the size (in bytes) of the circular region of the buffer. The
// region of the buffer from [0, wrap_offset_) is the static region of the
// buffer, while the region from [wrap_offset_, bufsize_) is the circular
// region. |wrap_offset_| must always be < |bufsize_|.
//
// The |rd_| and |wr_| pointers are absolute offsets into the circular region
// of the buffer, modulo |circular_size_|. When space in the buffer is
// reserved for a record, |wr_| is incremented by the size of the record.
// When a record is purged to make room for new records, |rd_| is incremented.
// At all times, |rd_| <= |wr_|, and both pointers are monotonically
// increasing. The function which maps from one of these pointers to an
// offset in the buffer (on the range [0, bufsize_)) is given by
//
// f(ptr) = (ptr % circular_size_) + wrap_offset_
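//
// For example (illustrative values): with wrap_offset_ = 1024 and
// circular_size_ = 4096 (so bufsize_ = 5120), a write pointer of wr_ = 5000
// maps to offset (5000 % 4096) + 1024 = 1928, which lies within the circular
// region [1024, 5120).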
//
// --== Reserving records and memory ordering ==--
//
// In order to write a record to the trace buffer, the writer must first
// reserve the space to do so. During this period of time, the |write_lock_|
// is held while the bookkeeping is handled in order to reserve space.
// Holding the write lock during reservation guarantees coherent observations
// of the bookkeeping state by the writers.
//
// If the reservation succeeds, the tag field of the reserved record is stored
// as 0 with release semantics, then the write lock is dropped in order to
// allow other reservations to take place concurrently while the payload of
// the record is populated. Once the writer has finished recording the
// payload, it must write the final tag value for the record with release
// semantics. This finalizes the record, and after this operation, the
// payload may no longer change.
//
// If, while operating in circular mode, an old record needs to be purged in
// order to make space for a new record, the |rd_| pointer will simply be
// incremented by the size of the record located at the |rd_| pointer. The
// tag of this record must first be read with memory order acquire semantics
// in order to compute its length so that the |rd_| pointer may be adjusted
// appropriately. If, during this observation, the value of the tag is
// observed to be 0, it means that a writer is attempting to advance the read
// pointer past a record which has not been fully committed yet. If this ever
// happens, the reservation operation fails, and the group mask will be
// cleared, just as if a reservation had failed in saturating mode.
//
// --== Circular mode padding ==--
//
// If a record of size X is to be reserved in the trace buffer while operating
// in circular mode, and the distance between the write pointer and the end of
// the buffer is too small for the record to be contained contiguously, a
// "padding" record will be inserted instead. This is a record with a record type
// of 0 which contains no payload. Its only purpose is to pad the buffer
// out so that the record to be written may exist contiguously in the trace
// buffer.
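// For example (illustrative values): if the write pointer currently maps to
// 16 bytes before the end of the buffer and a 32-byte record must be
// reserved, a 16-byte padding record (record type 0) is written at the
// current position first, after which the 32-byte record is placed at the
// start of the circular region.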
//
DECLARE_SPINLOCK_WITH_TYPE(KTraceState, TraceDisabledSpinLock) write_lock_;
uint64_t rd_ TA_GUARDED(write_lock_){0};
uint64_t wr_ TA_GUARDED(write_lock_){0};
uint32_t circular_size_ TA_GUARDED(write_lock_){0};
uint32_t wrap_offset_ TA_GUARDED(write_lock_){0};
// Note: these don't _actually_ have to be protected by the write lock.
// Memory ordering consistency for mutators of these variables is provided
// via lock_, while observations from trace writers are actually protected by
// a complicated set of arguments based on the stopped/started state of the
// system, and the acq/rel semantics of the grpmask_ variable.
//
// Instead of relying on these complicated and difficult to
// communicate/enforce invariants, however, we just toss these variables into
// the write lock and leave it at that. Trace writers already needed to be
// inside of the write lock to manipulate the read/write pointers while
// reserving space. Mutation of these variables can only happen during
// start/init when the system is stopped (and there are no writers), so
// obtaining the write lock to allocate the buffer is basically free since it
// will never be contested.
//
uint8_t* buffer_ TA_GUARDED(write_lock_){nullptr};
uint32_t bufsize_ TA_GUARDED(write_lock_){0};
};
} // namespace internal
#endif // ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_