// Copyright 2021 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_
#define ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_
#include <assert.h>
#include <lib/fit/function.h>
#include <lib/fxt/interned_category.h>
#include <lib/fxt/serializer.h>
#include <lib/user_copy/user_ptr.h>
#include <lib/zircon-internal/ktrace.h>
#include <lib/zx/result.h>
#include <stdint.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/user_copy.h>
#include <kernel/lockdep.h>
#include <kernel/mutex.h>
#include <kernel/spinlock.h>
#include <ktl/atomic.h>
#include <ktl/forward.h>
#include <ktl/move.h>
// Fwd decl of tests to allow friendship.
namespace ktrace_tests {
class TestKTraceState;
}
namespace internal {
class KTraceState {
public:
////////////////////////////////////////////////////////////////
//
// Notes on KTrace operating modes.
//
// KTrace can currently operate in one of two different modes, either
// "Saturate" or "Circular".
//
// During saturating operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then the
// buffer has become "saturated". The record is dropped, and the group mask
// is cleared, preventing new writes from occurring until the trace is
// restarted.
//
// During circular operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then old
// records are discarded from the trace buffer in order to make room for new
// records.
//
// After a rewind operation, but before starting, the buffer is effectively
// operating in saturating mode for the purposes of recording static data such
// as the names of probes and threads in the system at the start of tracing.
// Afterwards, if the trace is then started in circular mode, the KTraceState
// instance remembers the point in the buffer where the static records ended,
// and the circular portion of the buffer starts. Records from the static
// region of the trace will never be purged from the trace to make room for
// new records recorded while in circular mode.
//
// A trace may be started, stopped, and started again in Saturate mode any
// number of times without rewinding. Additionally, a trace which has
// previously been started in Saturate mode may subsequently be started in
// Circular mode without rewinding. All records recorded while in saturate
// mode will be part of the static region of the buffer. It is, however, not
// legal to start a trace in Circular mode, then stop it, and then attempt to
// start it again in Saturate mode.
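//
// A sketch of a legal lifecycle (illustrative only; |ks| is a hypothetical
// instance, the group masks and buffer size are arbitrary, and error
// handling is omitted):
//
//   ks.Init(64u * 1024, 0);                    // allocate lazily; no groups yet
//   st = ks.Start(0x7, StartMode::Saturate);   // OK
//   st = ks.Stop();                            // OK
//   st = ks.Start(0x7, StartMode::Circular);   // OK: Saturate -> Circular
//   st = ks.Stop();                            // OK
//   st = ks.Start(0x7, StartMode::Saturate);   // Illegal without a Rewind first.
//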
enum class StartMode { Saturate, Circular };
constexpr KTraceState() = default;
virtual ~KTraceState();
// Initialize the KTraceState instance; may only be called once. Any methods
// called on a KTraceState instance after construction, but before Init,
// should behave as no-ops.
//
// |target_bufsize| : The target size (in bytes) of the ktrace buffer to be
// allocated. Must be a multiple of 8 bytes.
//
// |initial_groups| : The initial set of enabled trace groups (see
// zircon-internal/ktrace.h). If non-zero, causes Init to attempt to allocate
// the trace buffer immediately. If the allocation fails, or the initial
// group mask is zero, allocation is delayed until the first time that start
// is called.
//
void Init(uint32_t target_bufsize, uint32_t initial_groups) TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Start(uint32_t groups, StartMode mode) TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Stop() TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Rewind() TA_EXCL(lock_, write_lock_) {
Guard<Mutex> guard(&lock_);
return RewindLocked();
}
ssize_t ReadUser(user_out_ptr<void> ptr, uint32_t off, size_t len) TA_EXCL(lock_, write_lock_);
uint32_t grpmask() const {
return static_cast<uint32_t>(grpmask_.load(ktl::memory_order_acquire));
}
bool IsCategoryEnabled(const fxt::InternedCategory& category) const {
const uint32_t bit_number = category.GetBit();
const uint32_t bitmask =
bit_number != fxt::InternedCategory::kInvalidBitNumber ? 1u << bit_number : 0;
return (bitmask & grpmask()) != 0;
}
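// For example (illustrative values): a category interned with bit number 3
// yields a bitmask of (1u << 3) == 0x8, so IsCategoryEnabled returns true iff
// bit 3 is currently set in grpmask(); a category whose bit number is
// kInvalidBitNumber is never enabled.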
// Atomically increments the in-flight-writes count and returns true iff
// writes are enabled.
//
// Returns false if the count was not incremented because writes are not
// enabled.
[[nodiscard]] bool IncPendingWrite() {
uint64_t desired;
uint64_t expected = write_state_.load(ktl::memory_order_relaxed);
do {
// Are writes enabled?
if ((expected & kWritesEnabledMask) == 0) {
return false;
}
desired = expected + 1;
} while (!write_state_.compare_exchange_weak(expected, desired, ktl::memory_order_acq_rel,
ktl::memory_order_relaxed));
// Assert that the in-flight count did not overflow into the writes-enabled bit.
DEBUG_ASSERT((desired & kWritesInFlightMask) > 0);
return true;
}
void DecPendingWrite() {
[[maybe_unused]] uint64_t previous_value = write_state_.fetch_sub(1, ktl::memory_order_release);
DEBUG_ASSERT((previous_value & kWritesInFlightMask) > 0);
}
// An RAII type that implements the FXT Writer protocol and automatically
// commits the record after a successful reservation.
class PendingCommit {
public:
PendingCommit(uint64_t* ptr, uint64_t header, KTraceState* ks)
: ptr_{ptr}, header_{header}, ks_{ks} {}
// No copy.
PendingCommit(const PendingCommit&) = delete;
PendingCommit& operator=(const PendingCommit&) = delete;
// Yes move.
PendingCommit(PendingCommit&& other) noexcept { *this = ktl::move(other); }
PendingCommit& operator=(PendingCommit&& other) noexcept {
ptr_ = other.ptr_;
header_ = other.header_;
ks_ = other.ks_;
// Also transfer the write cursor so that a partially written record remains
// consistent after a move.
word_offset_ = other.word_offset_;
other.ptr_ = nullptr;
other.ks_ = nullptr;
return *this;
}
// Going out of scope is what triggers the commit.
~PendingCommit() {
if (ptr_ != nullptr) {
ktl::atomic_ref(*ptr_).store(header_, ktl::memory_order_release);
ks_->DecPendingWrite();
}
}
void WriteWord(uint64_t word) {
ptr_[word_offset_] = word;
word_offset_++;
}
void WriteBytes(const void* bytes, size_t num_bytes) {
const size_t num_words = (num_bytes + 7) / 8;
// Write 0 to the last word to cover any padding bytes.
ptr_[word_offset_ + num_words - 1] = 0;
memcpy(&ptr_[word_offset_], bytes, num_bytes);
word_offset_ += num_words;
}
// No-op: the actual commit (publishing the header word) happens in the
// destructor.
void Commit() {}
private:
// Starts at 1 because word 0 is reserved for the record header, which is
// written when the record is committed.
size_t word_offset_{1};
uint64_t* ptr_{nullptr};
uint64_t header_{0};
KTraceState* ks_{nullptr};
};
// Reserve enough bytes of contiguous space in the buffer to fit the FXT Record described by
// `header`, if possible.
zx::result<PendingCommit> Reserve(uint64_t header) {
if (!IncPendingWrite()) {
return zx::error(ZX_ERR_BAD_STATE);
}
uint64_t* const ptr = ReserveRaw(fxt::RecordFields::RecordSize::Get<uint32_t>(header));
if (ptr == nullptr) {
ClearMaskDisableWrites();
DecPendingWrite();
return zx::error(ZX_ERR_NO_MEMORY);
}
return zx::ok(PendingCommit(ptr, header, this));
}
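// A sketch of the intended write path (illustrative only; construction of
// |header| via the fxt library is elided, and |ks| is a hypothetical
// instance):
//
//   if (zx::result<PendingCommit> res = ks.Reserve(header); res.is_ok()) {
//     res->WriteWord(payload_word);
//     res->WriteBytes(name, name_len);
//   }  // |res| goes out of scope here, publishing the header with release
//      // semantics and decrementing the in-flight write count.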
private:
friend class ktrace_tests::TestKTraceState;
[[nodiscard]] zx_status_t RewindLocked() TA_REQ(lock_);
// Add static names (e.g., syscalls and probes) to the trace buffer. Called
// during a rewind operation immediately after resetting the trace buffer.
// Declared as virtual to facilitate testing.
virtual void ReportStaticNames() TA_REQ(lock_);
// Add the names of current live threads and processes to the trace buffer.
// Called during start operations just before setting the group mask. Declared
// as virtual to facilitate testing.
virtual void ReportThreadProcessNames() TA_REQ(lock_);
// Copy data from kernel memory to user memory. Used by ReadUser, and
// overridden by test code (which needs to copy to kernel memory, not user
// memory).
virtual zx_status_t CopyToUser(user_out_ptr<uint8_t> dst, const uint8_t* src, size_t len) {
return dst.copy_array_to_user(src, len);
}
// A small printf stand-in which gives tests the ability to disable
// diagnostic printing.
int DiagsPrintf(int level, const char* fmt, ...) const __PRINTFLIKE(3, 4) {
if (!disable_diags_printfs_ && DPRINTF_ENABLED_FOR_LEVEL(level)) {
va_list args;
va_start(args, fmt);
int result = vprintf(fmt, args);
va_end(args);
return result;
}
return 0;
}
// Attempt to allocate our buffer, if we have not already done so.
zx_status_t AllocBuffer() TA_REQ(lock_);
// Reserve the given number of words in the trace buffer. Returns nullptr if the reservation
// fails.
uint64_t* ReserveRaw(uint32_t num_words);
// Set the group mask, but don't modify the writes-enable state.
void SetGroupMask(uint32_t new_mask) { grpmask_.store(new_mask, ktl::memory_order_release); }
// Enable writes, but don't modify the group mask.
void EnableWrites() { write_state_.fetch_or(kWritesEnabledMask, ktl::memory_order_release); }
// Clear the group mask and disable writes.
void ClearMaskDisableWrites() {
grpmask_.store(0, ktl::memory_order_release);
write_state_.fetch_and(~kWritesEnabledMask, ktl::memory_order_release);
}
// Convert an absolute read or write pointer into an offset into the circular
// region of the buffer. Note that it is illegal to call this if we are not
// operating in circular mode.
uint32_t PtrToCircularOffset(uint64_t ptr) const TA_REQ(write_lock_) {
DEBUG_ASSERT(circular_size_ > 0);
return static_cast<uint32_t>((ptr % circular_size_) + wrap_offset_);
}
uint32_t inflight_writes() const {
return static_cast<uint32_t>(write_state_.load(ktl::memory_order_acquire) &
kWritesInFlightMask);
}
// Allow diagnostic dprintf'ing or not. Overridden by test code.
bool disable_diags_printfs_{false};
// An atomic state variable which tracks whether writes are enabled (bit 63)
// and the number of writes in-flight (bits 0-62).
//
// Write operations consist of:
//
// 1) Optionally observing the group mask with acquire semantics to determine
// if the category is enabled for this write.
// 2) Atomically checking whether writes are enabled and incrementing the
// in-flight count with acq/rel semantics.
// 3) Completing or aborting the write operation.
// 4) Decrementing the in-flight count portion of the state with release
// semantics to indicate that the write is finished.
//
// This allows Stop operations to synchronize with any in-flight writes by:
//
// 1) Clearing the writes-enabled bit with release semantics.
// 2) Spinning on the in-flight-writes with acquire semantics until an
// in-flight count of zero is observed.
//
// Notes:
//
// * Once a writer has incremented the in-flight count, they must also
// decrement in a reasonable amount of time to ensure a trace can be stopped.
//
// * The algorithm above has a race (the ABA problem) where a writer may end
// up writing a record of a category that's not enabled. Consumers of the
// data are expected to handle finding unexpected records in the trace buffer.
// The race goes like this: Category A is enabled and a writer is attempting
// to emit a record for category A. The writer checks the group mask (step
// 1), sees that A is enabled and proceeds to step 2. Prior to executing step
// 2, a different thread stops the trace, disabling writes and clearing the
// group mask. A new trace is started with only category B enabled. The
// writer resumes step 2 and atomically checks that writes are enabled,
// increments the in-flight count and proceeds to write a category A record.
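//
// A sketch of the Stop-side synchronization described above (illustrative
// only; the real logic lives in the implementation, and the spin-relax
// primitive is an assumption):
//
//   ClearMaskDisableWrites();          // 1) clear writes-enabled (release)
//   while (inflight_writes() != 0) {   // 2) wait for writers to drain (acquire)
//     arch::Yield();                   //    hypothetical CPU-relax primitive
//   }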
// Bit-wise AND with |write_state_| to read writes-enabled.
static constexpr uint64_t kWritesEnabledMask = 1ul << 63;
// Bit-wise AND with |write_state_| to read in-flight-writes count.
static constexpr uint64_t kWritesInFlightMask = ~kWritesEnabledMask;
ktl::atomic<uint64_t> write_state_{0};
ktl::atomic<uint32_t> grpmask_{0};
// The target buffer size (in bytes) we would like to use, when we eventually
// call AllocBuffer. Set during the call to Init.
uint32_t target_bufsize_{0};
// A lock used to serialize all non-write operations. In other words, this
// lock ensures that only a single thread at a time may be involved in
// operations such as Start, Stop, Rewind, and ReadUser.
DECLARE_MUTEX(KTraceState) lock_;
bool is_started_ TA_GUARDED(lock_){false};
// The core allocation state of the trace buffer, protected by the write
// spinlock. See "Notes on KTrace operating modes" (above) for details on
// saturate vs. circular mode. This comment will describe how the bookkeeping
// maintained in each of the two modes, how wrapping is handled in circular
// mode, and how space for records in the buffer is reserved and subsequently
// committed.
//
// --== Saturate mode ==--
//
// While operating in saturate mode, the value of |circular_size_| and |rd_|
// will always be 0, and the value of |wrap_offset_| is not defined. The only
// important piece of bookkeeping maintained is the value of |wr_|. |wr_|
// always points to the offset in the buffer where the next record will be
// stored, and it should always be <= |bufsize_|. When reading back records,
// the first record will always be located at offset 0.
//
// --== Circular mode ==--
//
// When operating in circular mode, the buffer is partitioned into two
// regions: a "static" region which contains the records recorded before
// entering circular mode, and a circular region which contains records written
// after beginning circular operation. |circular_size_| must be non-zero, and
// contains the size (in bytes) of the circular region of the buffer. The
// region of the buffer from [0, wrap_offset_) is the static region of the
// buffer, while the region from [wrap_offset_, bufsize_) is the circular
// region. |wrap_offset_| must always be < |bufsize_|.
//
// The |rd_| and |wr_| pointers are absolute offsets into the circular region
// of the buffer, modulo |circular_size_|. When space in the buffer is
// reserved for a record, |wr_| is incremented by the size of the record.
// When a record is purged to make room for new records, |rd_| is incremented.
// At all times, |rd_| <= |wr_|, and both pointers are monotonically
// increasing. The function which maps from one of these pointers to an
// offset in the buffer (on the range [0, bufsize_)) is given by
//
// f(ptr) = (ptr % circular_size_) + wrap_offset_
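//
// For example (illustrative values): with wrap_offset_ = 1024 and
// circular_size_ = 4096 (so bufsize_ = 5120), a write pointer of wr_ = 5000
// maps to offset (5000 % 4096) + 1024 = 1928, which lies within the circular
// region [1024, 5120).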
//
// --== Reserving records and memory ordering ==--
//
// In order to write a record to the trace buffer, the writer must first
// reserve the space to do so. During this period of time, the |write_lock_|
// is held while the bookkeeping is handled in order to reserve space.
// Holding the write lock during reservation guarantees coherent observations
// of the bookkeeping state by the writers.
//
// If the reservation succeeds, the tag field of the reserved record is stored
// as 0 with release semantics, then the write lock is dropped in order to
// allow other reservations to take place concurrently while the payload of
// the record is populated. Once the writer has finished recording the
// payload, it must write the final tag value for the record with release
// semantics. This finalizes the record, and after this operation, the
// payload may no longer change.
//
// If, while operating in circular mode, an old record needs to be purged in
// order to make space for a new record, the |rd_| pointer will simply be
// incremented by the size of the record located at the |rd_| pointer. The
// tag of this record must first be read with memory order acquire semantics
// in order to compute its length so that the |rd_| pointer may be adjusted
// appropriately. If, during this observation, the value of the tag is
// observed to be 0, it means that a writer is attempting to advance the read
// pointer past a record which has not been fully committed yet. If this ever
// happens, the reservation operation fails, and the group mask will be
// cleared, just as if a reservation had failed in saturating mode.
//
// --== Circular mode padding ==--
//
// If a record of size X is to be reserved in the trace buffer while operating
// in circular mode, and the distance between the write pointer and the end of
// the buffer is too small for the record to be contained contiguously, a
// "padding" record will be inserted instead. This is a record with a record type
// of 0 which contains no payload. Its only purpose is to pad the buffer
// out so that the record to be written may exist contiguously in the trace
// buffer.
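// For example (illustrative values): if the write pointer currently maps to
// 16 bytes before the end of the buffer and a 32-byte record must be
// reserved, a 16-byte padding record (record type 0) is written at the
// current position first, after which the 32-byte record is placed at the
// start of the circular region.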
//
DECLARE_SPINLOCK_WITH_TYPE(KTraceState, TraceDisabledSpinLock) write_lock_;
uint64_t rd_ TA_GUARDED(write_lock_){0};
uint64_t wr_ TA_GUARDED(write_lock_){0};
uint32_t circular_size_ TA_GUARDED(write_lock_){0};
uint32_t wrap_offset_ TA_GUARDED(write_lock_){0};
// Note: these don't _actually_ have to be protected by the write lock.
// Memory ordering consistency for mutators of these variables is provided
// via lock_, while observations from trace writers are actually protected by
// a complicated set of arguments based on the stopped/started state of the
// system, and the acq/rel semantics of the grpmask_ variable.
//
// Instead of relying on these complicated and difficult to
// communicate/enforce invariants, however, we just toss these variables into
// the write lock and leave it at that. Trace writers already needed to be
// inside of the write lock to manipulate the read/write pointers while
// reserving space. Mutation of these variables can only happen during
// start/init when the system is stopped (and there are no writers), so
// obtaining the write lock to allocate the buffer is basically free since it
// will never be contested.
//
uint8_t* buffer_ TA_GUARDED(write_lock_){nullptr};
uint32_t bufsize_ TA_GUARDED(write_lock_){0};
};
} // namespace internal
#endif // ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_