// Copyright 2021 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_
#define ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_
#include <assert.h>
#include <lib/fit/function.h>
#include <lib/zircon-internal/ktrace.h>
#include <stdint.h>
#include <zircon/errors.h>
#include <zircon/types.h>
#include <arch/user_copy.h>
#include <kernel/lockdep.h>
#include <kernel/mutex.h>
#include <kernel/spinlock.h>
#include <ktl/atomic.h>
#include <ktl/forward.h>
// Fwd decl of tests to allow friendship.
namespace ktrace_tests {
class TestKTraceState;
}
namespace internal {
class KTraceState {
public:
////////////////////////////////////////////////////////////////
//
// Notes on KTrace operating modes.
//
// KTrace can currently operate in one of two different modes, either
// "Saturate" or "Circular".
//
// During saturating operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then the
// buffer has become "saturated". The record is dropped, and the group mask
// is cleared, preventing new writes from occurring until the trace is
// restarted.
//
// During circular operation, if an attempt is made to write a record to the
// ktrace buffer, but there is not enough room to write the record, then old
// records are discarded from the trace buffer in order to make room for new
// records.
//
// After a rewind operation, but before starting, the buffer is effectively
// operating in saturating mode for the purposes of recording static data such
// as the names of probes and threads in the system at the start of tracing.
// Afterwards, if the trace is then started in circular mode, the KTraceState
// instance remembers the point in the buffer where the static records ended,
// and the circular portion of the buffer starts. Records from the static
// region of the trace will never be purged from the trace to make room for
// new records recorded while in circular mode.
//
// A trace may be started, stopped, and started again in Saturate mode any
// number of times without rewinding. Additionally, a trace which has
// previously been started in Saturate mode may subsequently be started in
// Circular mode without rewinding. All records recorded while in saturate
// mode will be part of the static region of the buffer. It is, however, not
// legal to start a trace in Circular mode, then stop it, and then attempt to
// start it again in Saturate mode.
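//
// As a purely illustrative sketch of these rules (|state| and |grp| are
// hypothetical names, and error handling is omitted):
//
//   zx_status_t s;
//   s = state.Start(grp, KTraceState::StartMode::Saturate);  // OK
//   s = state.Stop();
//   s = state.Start(grp, KTraceState::StartMode::Saturate);  // OK; Saturate -> Saturate
//   s = state.Stop();
//   s = state.Start(grp, KTraceState::StartMode::Circular);  // OK; Saturate -> Circular
//   s = state.Stop();
//   s = state.Start(grp, KTraceState::StartMode::Saturate);  // Not legal without an
//                                                            // intervening Rewind().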
enum class StartMode { Saturate, Circular };
constexpr KTraceState() = default;
virtual ~KTraceState();
// Initialize the KTraceState instance, may only be called once. Any methods
// called on a KTraceState instance after construction, but before Init,
// should behave as no-ops.
//
// |target_bufsize| : The target size (in bytes) of the ktrace buffer to be
// allocated. Must be a multiple of 8 bytes.
//
// |initial_groups| : The initial set of enabled trace groups (see
// zircon-internal/ktrace.h). If non-zero, causes Init to attempt to allocate
// the trace buffer immediately. If the allocation fails, or the initial
// group mask is zero, allocation is delayed until the first time that start
// is called.
//
void Init(uint32_t target_bufsize, uint32_t initial_groups) TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Start(uint32_t groups, StartMode mode) TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Stop() TA_EXCL(lock_, write_lock_);
[[nodiscard]] zx_status_t Rewind() TA_EXCL(lock_, write_lock_) {
Guard<Mutex> guard(&lock_);
return RewindLocked();
}
ssize_t ReadUser(void* ptr, uint32_t off, size_t len) TA_EXCL(lock_, write_lock_);
// Write a record to the tracelog.
//
// |payload| must consist of all uint32_t or all uint64_t types.
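//
// A hypothetical example (the tag, timestamp, and argument names below are
// illustrative, not taken from real call sites): a record whose payload is
// two uint64_t arguments might be written as
//
//   ktrace.WriteRecord(some_tag, some_timestamp, first_u64, second_u64);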
template <typename... Args>
void WriteRecord(uint32_t effective_tag, uint64_t explicit_ts, Args... args);
void WriteRecordTiny(uint32_t tag, uint32_t arg) TA_EXCL(write_lock_);
void WriteNameEtc(uint32_t tag, uint32_t id, uint32_t arg, const char* name, bool always)
TA_EXCL(write_lock_);
inline uint32_t grpmask() const {
return static_cast<uint32_t>(grpmask_and_inflight_writes_.load(ktl::memory_order_acquire));
}
// Check to see if a tag is currently enabled using either a new observation
// of the group mask (default), or a previous observation.
static inline bool tag_enabled(uint32_t tag, uint32_t mask) { return (mask & tag) != 0; }
inline bool tag_enabled(uint32_t tag) const { return tag_enabled(tag, grpmask()); }
private:
// A small RAII helper which makes sure that we don't mess up our
// in_flight_writes bookkeeping.
class AutoWriteInFlight {
public:
explicit AutoWriteInFlight(KTraceState& ks)
: ks_(ks),
observed_grpmask_(
static_cast<uint32_t>(ks_.grpmask_and_inflight_writes_.fetch_add(
kInflightWritesInc, ktl::memory_order_acq_rel) &
~kInflightWritesMask)) {}
~AutoWriteInFlight() {
[[maybe_unused]] uint64_t prev;
prev =
ks_.grpmask_and_inflight_writes_.fetch_sub(kInflightWritesInc, ktl::memory_order_release);
DEBUG_ASSERT((prev & kInflightWritesMask) > 0);
}
uint32_t observed_grpmask() const { return observed_grpmask_; }
private:
KTraceState& ks_;
const uint32_t observed_grpmask_;
};
// A small helper class which should make it impossible to forget to commit a
// record after a successful reservation.
class PendingCommit {
public:
// There are only two ways to make an instance of a PendingCommit. Either
// via implicit conversion from nullptr (a failed reservation), or from a
// pointer to the start of the record, and a value for the tag which
// eventually must be committed.
PendingCommit(nullptr_t) {}
PendingCommit(void* ptr, uint32_t tag) : ptr_(ptr), tag_(tag) {}
// No copy.
PendingCommit(const PendingCommit&) = delete;
PendingCommit& operator=(const PendingCommit&) = delete;
// Yes move.
PendingCommit(PendingCommit&& other) noexcept : ptr_(other.ptr_), tag_(other.tag_) {
other.ptr_ = nullptr;
}
PendingCommit& operator=(PendingCommit&& other) noexcept {
ptr_ = other.ptr_;
tag_ = other.tag_;
other.ptr_ = nullptr;
return *this;
}
// Going out of scope is what triggers the commit.
~PendingCommit() {
if (ptr_ != nullptr) {
ktl::atomic_ref(*static_cast<uint32_t*>(ptr_)).store(tag_, ktl::memory_order_release);
}
}
// Users need access to the reserved pointer in order to fill out their
// record payload.
ktrace_header_t* hdr() const { return reinterpret_cast<ktrace_header_t*>(ptr_); }
bool is_valid() const { return (ptr_ != nullptr); }
private:
void* ptr_{nullptr};
uint32_t tag_{0};
};
friend class ktrace_tests::TestKTraceState;
friend class AutoWriteInFlight;
static inline uint32_t MakeTidField(uint32_t tag) {
return KTRACE_FLAGS(tag) & KTRACE_FLAGS_CPU
? arch_curr_cpu_num()
: static_cast<uint32_t>(Thread::Current::Get()->tid());
}
[[nodiscard]] zx_status_t RewindLocked() TA_REQ(lock_);
// Add static names (e.g. syscalls and probes) to the trace buffer. Called
// during a rewind operation immediately after resetting the trace buffer.
// Declared as virtual to facilitate testing.
virtual void ReportStaticNames() TA_REQ(lock_);
// Add the names of current live threads and processes to the trace buffer.
// Called during start operations just before setting the group mask. Declared
// as virtual to facilitate testing.
virtual void ReportThreadProcessNames() TA_REQ(lock_);
// Copy data from kernel memory to user memory. Used by ReadUser, and
// overridden by test code (which needs to copy to kernel memory, not user
// memory).
virtual zx_status_t CopyToUser(void* dst, const void* src, size_t len) {
return arch_copy_to_user(dst, src, len);
}
// A small printf stand-in which gives tests the ability to disable diagnostic
// printing during testing.
int DiagsPrintf(int level, const char* fmt, ...) __PRINTFLIKE(3, 4) {
if (!disable_diags_printfs_ && DPRINTF_ENABLED_FOR_LEVEL(level)) {
va_list args;
va_start(args, fmt);
int result = vprintf(fmt, args);
va_end(args);
return result;
}
return 0;
}
// Attempt to allocate our buffer, if we have not already done so.
zx_status_t AllocBuffer() TA_REQ(lock_);
// Reserve KTRACE_LEN(tag) bytes of contiguous space in the buffer, if
// possible.
PendingCommit Reserve(uint32_t tag);
inline void DisableGroupMask() {
grpmask_and_inflight_writes_.fetch_and(kInflightWritesMask, ktl::memory_order_release);
}
inline void SetGroupMask(uint32_t new_mask) {
grpmask_and_inflight_writes_.fetch_and(kInflightWritesMask, ktl::memory_order_relaxed);
grpmask_and_inflight_writes_.fetch_or(new_mask, ktl::memory_order_release);
}
// Convert an absolute read or write pointer into an offset into the circular
// region of the buffer. Note that it is illegal to call this if we are not
// operating in circular mode.
uint32_t PtrToCircularOffset(uint64_t ptr) const TA_REQ(write_lock_) {
DEBUG_ASSERT(circular_size_ > 0);
return static_cast<uint32_t>((ptr % circular_size_) + wrap_offset_);
}
inline uint32_t inflight_writes() const {
return static_cast<uint32_t>(
(grpmask_and_inflight_writes_.load(ktl::memory_order_acquire) & kInflightWritesMask) >> 32);
}
// Allow diagnostic dprintf'ing or not. Overridden by test code.
bool disable_diags_printfs_{false};
// An atomic state variable which tracks the currently active group mask (in
// its lower 32 bits) and the current in-flight write count (in its upper 32
// bits).
//
// Write operations consist of:
//
// 1) Observing the group mask with acquire semantics to determine if the
// write should proceed.
// 2) Incrementing the in-flight-write count portion of the state with acq/rel
// semantics to indicate that a write operation has begun.
// 3) Completing the operation, or aborting it if the group mask has been
// disabled for this write since step #1.
// 4) Decrementing the in-flight-write count portion of the state with release
// semantics to indicate that the write is finished.
//
// This allows Stop operations to synchronize with any in-flight writes by:
//
// 1) Clearing the grpmask portion of the state with release semantics.
// 2) Spinning on the in-flight-writes portion of the mask with acquire
// semantics until an in-flight count of zero is observed.
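//
// As a rough, illustrative sketch (the real Stop path lives in the .cc file
// and may differ), the stop side of this protocol could look like:
//
//   DisableGroupMask();                // 1) clear the mask with release.
//   while (inflight_writes() != 0) {   // 2) acquire loads of the state.
//     // spin until all in-flight writers have drained
//   }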
//
static constexpr uint64_t kInflightWritesMask = 0xFFFFFFFF00000000;
static constexpr uint64_t kInflightWritesInc = 0x0000000100000000;
ktl::atomic<uint64_t> grpmask_and_inflight_writes_{0};
// The target buffer size (in bytes) we would like to use, when we eventually
// call AllocBuffer. Set during the call to Init.
uint32_t target_bufsize_{0};
// A lock used to serialize all non-write operations. IOW - this lock ensures
// that only a single thread at a time may be involved in operations such as
// Start, Stop, Rewind, and ReadUser.
DECLARE_MUTEX(KTraceState) lock_;
bool is_started_ TA_GUARDED(lock_){false};
// The core allocation state of the trace buffer, protected by the write
// spinlock. See "Notes on KTrace operating modes" (above) for details on
// saturate vs. circular mode. This comment describes how the bookkeeping is
// maintained in each of the two modes, how wrapping is handled in circular
// mode, and how space for records in the buffer is reserved and subsequently
// committed.
//
// --== Saturate mode ==--
//
// While operating in saturate mode, the value of |circular_size_| and |rd_|
// will always be 0, and the value of |wrap_offset_| is not defined. The only
// important piece of bookkeeping maintained is the value of |wr_|. |wr_|
// always points to the offset in the buffer where the next record will be
// stored, and it should always be <= |bufsize_|. When reading back records,
// the first record will always be located at offset 0.
//
// --== Circular mode ==--
//
// When operating in circular mode, the buffer is partitioned into two
// regions: a "static" region which contains the records recorded before
// entering circular mode, and a circular region which contains records written
// after beginning circular operation. |circular_size_| must be non-zero, and
// contains the size (in bytes) of the circular region of the buffer. The
// region of the buffer from [0, wrap_offset_) is the static region of the
// buffer, while the region from [wrap_offset_, bufsize_) is the circular
// region. |wrap_offset_| must always be < |bufsize_|.
//
// The |rd_| and |wr_| pointers are absolute offsets into the circular region
// of the buffer, modulo |circular_size_|. When space in the buffer is
// reserved for a record, |wr_| is incremented by the size of the record.
// When a record is purged to make room for new records, |rd_| is incremented.
// At all times, |rd_| <= |wr_|, and both pointers are monotonically
// increasing. The function which maps from one of these pointers to an
// offset in the buffer (on the range [0, bufsize_)) is given by
//
// f(ptr) = (ptr % circular_size_) + wrap_offset_
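//
// As a worked example (the numbers are hypothetical): with wrap_offset_ = 64
// and circular_size_ = 256, a pointer value of 600 maps to buffer offset
// f(600) = (600 % 256) + 64 = 88 + 64 = 152.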
//
// --== Reserving records and memory ordering ==--
//
// In order to write a record to the trace buffer, the writer must first
// reserve the space to do so. During this period of time, the |write_lock_|
// is held while the bookkeeping is handled in order to reserve space.
// Holding the write lock during reservation guarantees coherent observations
// of the bookkeeping state by the writers.
//
// If the reservation succeeds, the tag field of the reserved record is stored
// as 0 with release semantics, then the write lock is dropped in order to
// allow other reservations to take place concurrently while the payload of
// the record is populated. Once the writer has finished recording the
// payload, it must write the final tag value for the record with release
// semantics. This finalizes the record, and after this operation, the
// payload may no longer change.
//
// If, while operating in circular mode, an old record needs to be purged in
// order to make space for a new record, the |rd_| pointer will simply be
// incremented by the size of the record located at the |rd_| pointer. The
// tag of this record must first be read with memory order acquire semantics
// in order to compute its length so that the |rd_| pointer may be adjusted
// appropriately. If, during this observation, the value of the tag is
// observed to be 0, it means that a writer is attempting to advance the read
// pointer past a record which has not been fully committed yet. If this ever
// happens, the reservation operation fails, and the group mask will be
// cleared, just like if a reservation had failed in saturating mode.
//
// --== Circular mode padding ==--
//
// If a record of size X is to be reserved in the trace buffer while operating
// in circular mode, and the distance between the write pointer and the end of
// the buffer is too small for the record to be contained contiguously, a
// "padding" record will be inserted instead. This is a record with a group
// ID of 0 which contains no payload. Its only purpose is to pad the buffer
// out so that the record to be written may exist contiguously in the trace
// buffer.
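//
// For example (hypothetical numbers): if the circular region ends at offset
// 4096, the write pointer currently maps to offset 4080, and the next record
// needs 32 bytes, then a 16 byte padding record is reserved at offset 4080
// and the new record is placed at the start of the circular region
// (wrap_offset_) instead.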
//
DECLARE_SPINLOCK(KTraceState) write_lock_;
uint64_t rd_ TA_GUARDED(write_lock_){0};
uint64_t wr_ TA_GUARDED(write_lock_){0};
uint32_t circular_size_ TA_GUARDED(write_lock_){0};
uint32_t wrap_offset_ TA_GUARDED(write_lock_){0};
// Note: these don't _actually_ have to be protected by the write lock.
// Memory ordering consistency for mutators of these variables is protected
// via lock_, while observations from trace writers are actually protected by
// a complicated set of arguments based on the stopped/started state of the
// system, and the acq/rel semantics of the grpmask_and_inflight_writes_
// variable.
//
// Instead of relying on these complicated and difficult to
// communicate/enforce invariants, however, we just toss these variables into
// the write lock and leave it at that. Trace writers already needed to be
// inside of the write lock to manipulate the read/write pointers while
// reserving space. Mutation of these variables can only happen during
// start/init when the system is stopped (and there are no writers), so
// obtaining the write lock to allocate the buffer is basically free since it
// will never be contended.
//
uint8_t* buffer_ TA_GUARDED(write_lock_){nullptr};
uint32_t bufsize_ TA_GUARDED(write_lock_){0};
};
} // namespace internal
#endif // ZIRCON_KERNEL_LIB_KTRACE_INCLUDE_LIB_KTRACE_KTRACE_INTERNAL_H_