zircon/kernel/include/kernel/thread.h - fuchsia - Git at Google

 // Copyright 2016 The Fuchsia Authors
 // Copyright (c) 2008-2015 Travis Geiselbrecht
 //
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file or at
 // https://opensource.org/licenses/MIT

 #ifndef ZIRCON_KERNEL_INCLUDE_KERNEL_THREAD_H_
 #define ZIRCON_KERNEL_INCLUDE_KERNEL_THREAD_H_

 #include <debug.h>
 #include <lib/backtrace.h>
 #include <lib/fit/function.h>
 #include <lib/relaxed_atomic.h>
 #include <lib/zircon-internal/thread_annotations.h>
 #include <platform.h>
 #include <sys/types.h>
 #include <zircon/compiler.h>
 #include <zircon/listnode.h>
 #include <zircon/syscalls/object.h>
 #include <zircon/syscalls/scheduler.h>
 #include <zircon/types.h>

 #include <arch/defines.h>
 #include <arch/exception.h>
 #include <arch/ops.h>
 #include <arch/thread.h>
 #include <fbl/canary.h>
 #include <fbl/intrusive_double_list.h>
 #include <fbl/macros.h>
 #include <fbl/wavl_tree_best_node_observer.h>
 #include <kernel/cpu.h>
 #include <kernel/deadline.h>
 #include <kernel/koid.h>
 #include <kernel/scheduler_state.h>
 #include <kernel/spinlock.h>
 #include <kernel/task_runtime_stats.h>
 #include <kernel/thread_lock.h>
 #include <kernel/timer.h>
 #include <ktl/array.h>
 #include <ktl/atomic.h>
 #include <ktl/string_view.h>
 #include <lockdep/thread_lock_state.h>
 #include <vm/kstack.h>

 class Dpc;
 struct Thread;
 class OwnedWaitQueue;
 class PreemptionState;
 class StackOwnedLoanedPagesInterval;
 class ThreadDispatcher;
 class WaitQueue;
 class VmAspace;

 // Forward declarations of the arch_* accessors for the current thread pointer.
 // They are declared here, ahead of their definitions, so that Thread can name
 // them as friends.
 static inline Thread* arch_get_current_thread();
 static inline void arch_set_current_thread(Thread*);

 // Describes, for a thread that is about to block, the kind of resource
 // ownership it is waiting to obtain.
 enum class ResourceOwnership {
   // Either the block is not tied to any particular resource, or the thread is
   // waiting for exclusive access to a resource.
   Normal = 0,
   // The thread is waiting for shared (read) access to a resource.
   Reader = 1,
 };

 // PreemptDisabledToken is a clang static analysis capability ("token").  Its
 // global singleton instance, |preempt_disabled_token|, is named in thread
 // annotations to mark methods which require that local preemption be disabled
 // in order to operate properly.  See the AnnotatedAutoPreemptDisabler helper in
 // kernel/auto_preempt_disabler.h for more details.
 struct TA_CAP("token") PreemptDisabledToken {
   PreemptDisabledToken() = default;

   // The token can be neither copied nor moved.
   PreemptDisabledToken(const PreemptDisabledToken&) = delete;
   PreemptDisabledToken& operator=(const PreemptDisabledToken&) = delete;
   PreemptDisabledToken(PreemptDisabledToken&&) = delete;
   PreemptDisabledToken& operator=(PreemptDisabledToken&&) = delete;

   // Annotated with TA_ASSERT so that, after the call, the static analyzer
   // treats the token as held.  Defined out of line.
   void AssertHeld() TA_ASSERT();

  private:
   // Only PreemptionState is permitted to acquire and release the token.  Both
   // operations are empty at runtime; they exist solely for the analyzer.
   friend class PreemptionState;
   void Acquire() TA_ACQ() {}
   void Release() TA_REL() {}
 };

 // The global singleton PreemptDisabledToken instance.  It is the capability
 // named in annotations such as TA_REQ(thread_lock, preempt_disabled_token).
 extern PreemptDisabledToken preempt_disabled_token;

 // Selects whether a block or a sleep is allowed to be interrupted.
 enum class Interruptible : bool { No = false, Yes = true };

 // Passed when telling a wait queue that the priority of one of its blocked
 // threads has changed.  It indicates whether the priority change should be
 // propagated down the PI chain (if any).
 enum class PropagatePI : bool { No = false, Yes = true };

 // A WaitQueueCollection is the data structure which holds a collection of
 // threads which are currently blocked in a wait queue.  The data structure
 // imposes a total ordering on the threads meant to represent the order in which
 // the threads should be woken, from "most important" to "least important".
 //
 // One unusual property of the ordering implemented by a WaitQueueCollection is
 // that, unlike an ordering determined by completely static properties such as
 // thread priority or weight, it is dynamic with respect to time.  That is,
 // while at any instant in time there is always a specific order to the threads,
 // as time advances, this order can change.  The ordering itself is determined
 // by the nature of the various dynamic scheduling disciplines implemented by
 // the Zircon scheduler.
 //
 // At any specific time |now|, the order of the collection is considered to
 // be:
 //
 // 1) The deadline threads in the collection whose absolute deadlines are in the
 //    future, sorted by ascending absolute deadline.  These are the threads who
 //    still have a chance of meeting their absolute deadline, with the nearest
 //    absolute deadline considered to be the most important.
 // 2) The deadline threads in the collection whose absolute deadlines are in the
 //    past, sorted by ascending relative deadline.  These are the threads who
 //    have been blocked until after their last cycle's absolute deadline.  If
 //    all threads were to be woken |now|, the thread with the minimum relative
 //    deadline would be the thread which has the earliest new absolute deadline
 //    across the set.
 // 3) The fair threads in the collection, sorted by their "virtual finish time".
 //    This is equal to the start time of the thread plus the scheduler's maximum
 //    target latency divided by the thread's weight (normalized to the range
 //    (0.0, 1.0]).  This is the same ordering imposed by the Scheduler's RunQueue
 //    for fair threads, and is intended to prioritize higher weight threads,
 //    while still ensuring some level of fairness over time.  The start time
 //    represents the last time that a thread entered a run queue, and while high
 //    weight threads will be chosen before low weight threads who arrived at
 //    similar times, threads who arrived earlier (and have been waiting for
 //    longer) will eventually end up being chosen, no matter how much weight
 //    other threads in the collection have compared to it.
 //    TODO(johngro): Instead of using the start time for the last time a
 //    thread entered a RunQueue, should we use the time at which the thread
 //    joined the wait queue instead?
 //
 // In an attempt to make the selection of the "best" thread in a wait queue as
 // efficient as we can, in light of the dynamic nature of the total ordering, we
 // use an "augmented" WAVL tree as our data structure, much like the scheduler's
 // RunQueue.  The tree keeps all of its threads sorted according to a primary
 // key representing the minimum absolute deadline or a modified version of its
 // virtual finish time (depending on the thread's scheduling discipline).
 //
 // The virtual finish time of threads is modified so that the MSB of the time is
 // always set. This guarantees that fair threads _always_ come after deadline
 // threads in the sort order.  Note that we could have also achieved this
 // partitioning by tracking fair threads separately from deadline threads in a
 // separate tree instance.  We keep things in a single tree (for now) in order
 // to help to minimize the size of WaitQueueCollections to help control the size
 // of objects in the kernel (such as the Mutex object).
 //
 // There should be no serious issue with using the MSB of the sort key in this
 // fashion.  Absolute timestamps in zircon use signed 64 bit integers, and the
 // monotonic clock is set at startup to start from zero, meaning that there is
 // no real world case where we would be searching for a deadline thread to
 // wake using a timestamp with the MSB set.
 //
 // Finally, we also maintain an additional augmented invariant such that: For
 // every node (X) in the tree, the pointer to the thread with the minimum
 // relative deadline in the subtree headed by X is maintained as nodes are
 // inserted and removed.
 //
 // With these invariants in place, finding the best thread to run can be
 // computed as follows.
 //
 // 1) If the left-most member of the tree has the MSB of its sorting key set,
 //    then the thread is a fair thread, and there are _no_ deadline threads in
 //    the tree.  Additionally, this thread has the minimum virtual finish time
 //    across all of the fair threads in the tree, and therefore is the "best"
 //    thread to unblock.  When the tree is in this state, selection is O(1).
 // 2) Otherwise, there are deadline threads in the tree.  The tree is searched
 //    to find the first thread whose absolute deadline is in the future,
 //    relative to |now|.  If such a thread exists, then it is the "best" thread
 //    to run right now and it is selected.  When the tree is in this state,
 //    selection is O(log).
 // 3) If there are no threads whose deadlines are in the future, the pointer to
 //    the thread with the minimum relative deadline in the tree is chosen,
 //    simply by fetching the best-in-subtree pointer maintained in |root()|.
 //    While this operation is O(1), when the tree is in this state, the overall
 //    achieved order was O(log) because of the search which needed to happen
 //    during step 2.
 //
 // Insert and remove order for the tree should be:
 // 1) Insertions into the tree are always O(log).
 // 2) Unlike a typical WAVL tree, removals of a specific thread from the tree
 //    are O(log) instead of being amortized constant.  This is because of the
 //    cost of restoring the augmented invariant after removal, which involves
 //    walking from the point of removal up to the root of the tree.
 //
 // Finally:
 // Please note that it is possible for the dynamic ordering defined above to
 // choose a deadline thread which is not currently eligible to run as the choice
 // for "best thread".  This is because the scheduler does not currently demand
 // that the absolute deadline of a thread be equal to when its period ends and
 // its timeslice is eligible for refresh.
 //
 // While it is possible to account for this behavior as well, doing so is not
 // without cost (both in WaitQueue object size and code complexity). This
 // behavior is no different from the previous priority-based-ordering's
 // behavior, where ineligible deadline threads could also be chosen.  The
 // ability to specify a period different from a relative deadline is currently
 // rarely used in the system, and we are moving in a direction of removing it
 // entirely.  If the concept needs to be re-introduced at a later date, this
 // data structure could be adjusted later on to order threads in phase 2 based
 // on the earliest absolute deadline they could possibly have, based on the
 // earliest time that their period could be refreshed and their relative
 // deadline parameter.
 class WaitQueueCollection {
  private:
   // fwd decls
   struct BlockedThreadTreeTraits;
   struct MinRelativeDeadlineTraits;

  public:
   // The key type used to order threads in the blocked-thread tree.  The first
   // element is a 64-bit sort key (see |blocked_threads_tree_sort_key_| and
   // |kFairThreadSortKeyBit|).
   // NOTE(review): the uintptr_t second element is presumably the thread's
   // address, used as a tie-breaker — confirm against GetKey()'s definition.
   using Key = ktl::pair<uint64_t, uintptr_t>;

   // Encapsulation of all the per-thread state for the WaitQueueCollection data structure.
   class ThreadState {
    public:
     ThreadState() = default;

     // Defined out of line.
     ~ThreadState();

     // Disallow copying.
     ThreadState(const ThreadState&) = delete;
     ThreadState& operator=(const ThreadState&) = delete;

     // Returns true if this thread's tree node is currently a member of a
     // container (i.e. the thread is in some collection's blocked-thread tree).
     bool InWaitQueue() const { return blocked_threads_tree_node_.InContainer(); }

     // Returns the stored |blocked_status_|.
     zx_status_t BlockedStatus() const TA_REQ(thread_lock) { return blocked_status_; }

     // The following operations are declared here and defined out of line; all
     // require the thread_lock to be held.
     void Block(Interruptible interruptible, zx_status_t status) TA_REQ(thread_lock);

     // Additionally requires that preemption be disabled (preempt_disabled_token).
     void UnblockIfInterruptible(Thread* thread, zx_status_t status)
         TA_REQ(thread_lock, preempt_disabled_token);

     void Unsleep(Thread* thread, zx_status_t status) TA_REQ(thread_lock);
     void UnsleepIfInterruptible(Thread* thread, zx_status_t status) TA_REQ(thread_lock);

     // Additionally requires that preemption be disabled (preempt_disabled_token).
     void UpdatePriorityIfBlocked(Thread* thread, int priority, PropagatePI propagate)
         TA_REQ(thread_lock, preempt_disabled_token);

     // Debug-asserts that this thread currently owns no wait queues.
     void AssertNoOwnedWaitQueues() const TA_REQ(thread_lock) {
       DEBUG_ASSERT(owned_wait_queues_.is_empty());
     }

     // Debug-asserts that this thread has no blocking wait queue recorded and
     // is not a member of any blocked-thread tree.
     void AssertNotBlocked() const TA_REQ(thread_lock) {
       DEBUG_ASSERT(blocking_wait_queue_ == nullptr);
       DEBUG_ASSERT(!InWaitQueue());
     }

    private:
     // WaitQueues, WaitQueueCollections, and their List types, can
     // directly manipulate the contents of the per-thread state, for now.
     friend class OwnedWaitQueue;
     friend class WaitQueue;
     friend class WaitQueueCollection;
     friend struct WaitQueueCollection::BlockedThreadTreeTraits;
     friend struct WaitQueueCollection::MinRelativeDeadlineTraits;

     // If blocked, a pointer to the WaitQueue the Thread is on.
     WaitQueue* blocking_wait_queue_ TA_GUARDED(thread_lock) = nullptr;

     // A list of the WaitQueues currently owned by this Thread.
     fbl::DoublyLinkedList<OwnedWaitQueue*> owned_wait_queues_ TA_GUARDED(thread_lock);

     // Node state for existing in WaitQueueCollection::threads_
     fbl::WAVLTreeNodeState<Thread*> blocked_threads_tree_node_;

     // Primary key used for determining our position in the collection of
     // blocked threads. Pre-computed during insert in order to save time
     // during insert, rebalance, and search operations.
     uint64_t blocked_threads_tree_sort_key_{0};

     // State variable holding the pointer to the thread in our subtree with the
     // minimum relative deadline (if any).
     Thread* subtree_min_rel_deadline_thread_{nullptr};

     // Return code if woken up abnormally from suspend, sleep, or block.
     zx_status_t blocked_status_ = ZX_OK;

     // Dumping routines are allowed to see inside us.
     friend void dump_thread_locked(Thread* t, bool full_dump);

     // Are we allowed to be interrupted on the current thing we're blocked/sleeping on?
     Interruptible interruptible_ = Interruptible::No;
   };

   constexpr WaitQueueCollection() {}

   // The number of threads currently in the collection.  The tree's size is
   // narrowed to 32 bits.
   uint32_t Count() const TA_REQ(thread_lock) { return static_cast<uint32_t>(threads_.size()); }

   // Peek at the first Thread in the collection.  The const overload forwards
   // to the non-const implementation via const_cast and returns the result as a
   // pointer-to-const.
   Thread* Peek(zx_time_t now) TA_REQ(thread_lock);
   const Thread* Peek(zx_time_t now) const TA_REQ(thread_lock) {
     return const_cast<WaitQueueCollection*>(this)->Peek(now);
   }

   // Add the Thread into its sorted location in the collection.
   void Insert(Thread* thread) TA_REQ(thread_lock);

   // Remove the Thread from the collection.
   void Remove(Thread* thread) TA_REQ(thread_lock);

   // Disallow copying.
   WaitQueueCollection(const WaitQueueCollection&) = delete;
   WaitQueueCollection& operator=(const WaitQueueCollection&) = delete;

  private:
   friend class WaitQueue;  // TODO(johngro): remove this when WaitQueue::BlockedPriority goes away.

   // The most significant bit of the 64-bit sort key.
   static constexpr uint64_t kFairThreadSortKeyBit = uint64_t{1} << 63;

   // Key and node-state traits handed to the WAVL tree.  Keys are compared
   // with the ordinary < and == operators of |Key|.
   struct BlockedThreadTreeTraits {
     static Key GetKey(const Thread& thread);
     static bool LessThan(Key a, Key b) { return a < b; }
     static bool EqualTo(Key a, Key b) { return a == b; }
     static fbl::WAVLTreeNodeState<Thread*>& node_state(Thread& thread);
   };

   // Traits handed to fbl::WAVLTreeBestNodeObserver.  All members are defined
   // out of line.
   struct MinRelativeDeadlineTraits {
     // WAVLTreeBestNodeObserver template API
     using ValueType = Thread*;
     static ValueType GetValue(const Thread& node);
     static ValueType GetSubtreeBest(const Thread& node);
     static bool Compare(ValueType a, ValueType b);
     static void AssignBest(Thread& node, ValueType val);
     static void ResetBest(Thread& target);
   };

   // The tree type which stores the blocked threads: keyed by |Key|, holding
   // unmanaged Thread pointers, and observed by a best-node observer which is
   // parameterized on MinRelativeDeadlineTraits.
   using BlockedThreadTree = fbl::WAVLTree<Key, Thread*, BlockedThreadTreeTraits,
                                           fbl::DefaultObjectTag, BlockedThreadTreeTraits,
                                           fbl::WAVLTreeBestNodeObserver<MinRelativeDeadlineTraits>>;

   // The blocked threads themselves.
   BlockedThreadTree threads_;
 };

 // A WaitQueue is a queue of blocked threads, stored in a WaitQueueCollection.
 //
 // NOTE: must be inside critical section when using these.  Nearly every method
 // is annotated as requiring the thread_lock to be held.
 class WaitQueue {
  public:
   constexpr WaitQueue() : WaitQueue(kMagic) {}
   ~WaitQueue();

   // WaitQueues can be neither copied nor moved.
   WaitQueue(const WaitQueue&) = delete;
   WaitQueue(WaitQueue&&) = delete;
   WaitQueue& operator=(const WaitQueue&) = delete;
   WaitQueue& operator=(WaitQueue&&) = delete;

   // Remove a specific thread out of a wait queue it's blocked on.
   static zx_status_t UnblockThread(Thread* t, zx_status_t wait_queue_error)
       TA_REQ(thread_lock, preempt_disabled_token);

   // Block on a wait queue.
   // The returned status is whatever the caller of WaitQueue::WakeOne() or
   // WaitQueue::WakeAll() specifies.
   // A deadline other than Deadline::infinite() will abort at the specified time
   // and return ZX_ERR_TIMED_OUT. A deadline in the past will immediately return.
   //
   // Forwards to BlockEtc() with an empty signal mask and
   // ResourceOwnership::Normal.
   zx_status_t Block(const Deadline& deadline, Interruptible interruptible) TA_REQ(thread_lock) {
     return BlockEtc(deadline, 0, ResourceOwnership::Normal, interruptible);
   }

   // Block on a wait queue with a zx_time_t-typed deadline.  The deadline is
   // converted using Deadline::no_slack().
   zx_status_t Block(zx_time_t deadline, Interruptible interruptible) TA_REQ(thread_lock) {
     return BlockEtc(Deadline::no_slack(deadline), 0, ResourceOwnership::Normal, interruptible);
   }

   // Block on a wait queue, ignoring existing signals in |signal_mask|.
   // The returned status is whatever the caller of WaitQueue::WakeOne() or
   // WaitQueue::WakeAll() specifies, or ZX_ERR_TIMED_OUT if the deadline has
   // elapsed or is in the past.
   // This will never timeout when called with a deadline of Deadline::infinite().
   zx_status_t BlockEtc(const Deadline& deadline, uint signal_mask, ResourceOwnership reason,
                        Interruptible interruptible) TA_REQ(thread_lock);

   // Returns the current highest priority blocked thread on this wait queue, or
   // nullptr if no threads are blocked.
   Thread* Peek(zx_time_t now) TA_REQ(thread_lock) { return collection_.Peek(now); }
   const Thread* Peek(zx_time_t now) const TA_REQ(thread_lock) { return collection_.Peek(now); }

   // Release one or more threads from the wait queue.
   // wait_queue_error = what WaitQueue::Block() should return for the blocking thread.
   //
   // Returns true if a thread was woken, and false otherwise.
   bool WakeOne(zx_status_t wait_queue_error) TA_REQ(thread_lock);

   void WakeAll(zx_status_t wait_queue_error) TA_REQ(thread_lock);

   // Whether the wait queue is currently empty.
   bool IsEmpty() const TA_REQ(thread_lock);

   // The number of threads currently blocked on this wait queue.
   uint32_t Count() const TA_REQ(thread_lock) { return collection_.Count(); }

   // Returns the highest priority of all the blocked threads on this WaitQueue.
   // Returns -1 if no threads are blocked.
   int BlockedPriority() const TA_REQ(thread_lock);

   // Used by WaitQueue and OwnedWaitQueue to manage changes to the maximum
   // priority of a wait queue due to external effects (thread priority change,
   // thread timeout, thread killed).
   void UpdatePriority(int old_prio) TA_REQ(thread_lock);

   // A thread's priority has changed.  Update the wait queue bookkeeping to
   // properly reflect this change.
   //
   // |t| must be blocked on this WaitQueue.
   //
   // If |propagate| is PropagatePI::Yes, call into the wait queue code to
   // propagate the priority change down the PI chain (if any).
   //
   // If |propagate| is PropagatePI::No, do not attempt to propagate the PI change.
   // This is the mode used by OwnedWaitQueue during a batch update of a PI chain.
   //
   // This method does not return a value.
   void PriorityChanged(Thread* t, int old_prio, PropagatePI propagate)
       TA_REQ(thread_lock, preempt_disabled_token);

   // OwnedWaitQueue needs to be able to call this on WaitQueues to
   // determine if they are base WaitQueues or the OwnedWaitQueue
   // subclass.
   uint32_t magic() const { return magic_; }

  protected:
   // Constructs a WaitQueue which carries the given magic number.  The public
   // default constructor delegates here with kMagic.
   explicit constexpr WaitQueue(uint32_t magic) : magic_(magic) {}

   // Inline helpers (defined in wait_queue_internal.h) for
   // WaitQueue::BlockEtc and OwnedWaitQueue::BlockAndAssignOwner to
   // share.
   inline zx_status_t BlockEtcPreamble(const Deadline& deadline, uint signal_mask,
                                       ResourceOwnership reason, Interruptible interruptible)
       TA_REQ(thread_lock);
   inline zx_status_t BlockEtcPostamble(const Deadline& deadline) TA_REQ(thread_lock);

   // Dequeue the specified thread and set its blocked_status.  Do not actually
   // schedule the thread to run.
   void DequeueThread(Thread* t, zx_status_t wait_queue_error) TA_REQ(thread_lock);

   // Move the specified thread from the source wait queue to the dest wait queue.
   static void MoveThread(WaitQueue* source, WaitQueue* dest, Thread* t) TA_REQ(thread_lock);

  private:
   // Timer callback.  Unlike the other methods, it carries no thread_lock
   // annotation.
   // NOTE(review): presumably invoked when a blocking deadline expires —
   // confirm against the definition.
   static void TimeoutHandler(Timer* timer, zx_time_t now, void* arg);

   // Internal helper for dequeueing a single Thread.
   void Dequeue(Thread* t, zx_status_t wait_queue_error) TA_REQ(thread_lock);

   // Validate that the queue of a given WaitQueue is valid.
   void ValidateQueue() TA_REQ(thread_lock);

   // Note: Wait queues come in 2 flavors (traditional and owned) which are
   // distinguished using the magic number.  The point here is that, unlike
   // most other magic numbers in the system, the WaitQueue magic number serves
   // a functional purpose beyond checking for corruption in debug builds.
   static constexpr uint32_t kMagic = fbl::magic("wait");
   uint32_t magic_;

   // The OwnedWaitQueue subclass also manipulates the collection.
  protected:
   WaitQueueCollection collection_;
 };

 // Returns a string constant for the given thread state.  Declared here and
 // defined out of line.
 const char* ToString(enum thread_state state);

 // Signature of a thread's entry point: it is called with a single opaque
 // argument and returns an int.
 typedef int (*thread_start_routine)(void* arg);
 // Signature of a thread trampoline routine: it takes no arguments and is
 // marked __NO_RETURN.
 typedef void (*thread_trampoline_routine)() __NO_RETURN;

 // clang-format off
 // Thread flag bits.  Each flag occupies a distinct bit so that flags may be
 // combined with bitwise OR.
 // NOTE(review): presumably these are the values stored in Thread::flags_ —
 // confirm against the Thread definition.
 #define THREAD_FLAG_DETACHED                 (1 << 0)
 #define THREAD_FLAG_FREE_STRUCT              (1 << 1)
 #define THREAD_FLAG_IDLE                     (1 << 2)
 #define THREAD_FLAG_VCPU                     (1 << 3)

 // Thread signal bits.  As with the flags, each signal occupies a distinct bit.
 // NOTE(review): presumably these are OR'd into a per-thread pending-signal
 // mask — confirm against the Thread definition.
 #define THREAD_SIGNAL_KILL                   (1 << 0)
 #define THREAD_SIGNAL_SUSPEND                (1 << 1)
 #define THREAD_SIGNAL_POLICY_EXCEPTION       (1 << 2)
 // clang-format on

 // Thread priority levels.  Priorities are integers in the range
 // [LOWEST_PRIORITY, HIGHEST_PRIORITY]; the named levels below are all derived
 // from NUM_PRIORITIES.
 #define NUM_PRIORITIES (32)
 #define LOWEST_PRIORITY (0)
 #define HIGHEST_PRIORITY (NUM_PRIORITIES - 1)
 // One step below the highest priority.
 #define DPC_PRIORITY (HIGHEST_PRIORITY - 1)
 #define IDLE_PRIORITY (LOWEST_PRIORITY)
 // The quarter, half, and three-quarter points of the priority range.
 #define LOW_PRIORITY (NUM_PRIORITIES / 4)
 #define DEFAULT_PRIORITY (NUM_PRIORITIES / 2)
 #define HIGH_PRIORITY (3 * (NUM_PRIORITIES / 4))

 // Default stack size.  The architecture's default is used unless the build
 // overrides it by defining CUSTOM_DEFAULT_STACK_SIZE.
 #ifndef CUSTOM_DEFAULT_STACK_SIZE
 #define DEFAULT_STACK_SIZE ARCH_DEFAULT_STACK_SIZE
 #else
 #define DEFAULT_STACK_SIZE CUSTOM_DEFAULT_STACK_SIZE
 #endif

 // Thread dumping routines.
 //
 // The *_locked variants are annotated TA_REQ(thread_lock): the caller must
 // already hold the thread_lock.  The variants without the suffix are annotated
 // TA_EXCL(thread_lock): the caller must not be holding it.  arch_dump_thread()
 // carries no lock annotation.
 void dump_thread_locked(Thread* t, bool full) TA_REQ(thread_lock);
 void dump_thread(Thread* t, bool full) TA_EXCL(thread_lock);
 void arch_dump_thread(Thread* t);
 void dump_all_threads_locked(bool full) TA_REQ(thread_lock);
 void dump_all_threads(bool full) TA_EXCL(thread_lock);
 void dump_thread_tid(zx_koid_t tid, bool full) TA_EXCL(thread_lock);
 void dump_thread_tid_locked(zx_koid_t tid, bool full) TA_REQ(thread_lock);

 // Panic-time variant of thread dumping: calls dump_thread_locked() directly,
 // without acquiring the thread_lock.  Thread safety analysis is disabled for
 // this function so that the unlocked call is accepted.
 static inline void dump_thread_during_panic(Thread* t, bool full) TA_NO_THREAD_SAFETY_ANALYSIS {
   // Skip grabbing the lock if we are panic'ing
   dump_thread_locked(t, full);
 }

 // Panic-time variant of dumping all threads: calls dump_all_threads_locked()
 // directly, without acquiring the thread_lock.  Thread safety analysis is
 // disabled for this function so that the unlocked call is accepted.
 static inline void dump_all_threads_during_panic(bool full) TA_NO_THREAD_SAFETY_ANALYSIS {
   // Skip grabbing the lock if we are panic'ing
   dump_all_threads_locked(full);
 }

 // Panic-time variant of dumping a thread by koid: calls
 // dump_thread_tid_locked() directly, without acquiring the thread_lock.  Thread
 // safety analysis is disabled for this function so that the unlocked call is
 // accepted.
 static inline void dump_thread_tid_during_panic(zx_koid_t tid,
                                                 bool full) TA_NO_THREAD_SAFETY_ANALYSIS {
   // Skip grabbing the lock if we are panic'ing
   dump_thread_tid_locked(tid, full);
 }

 // PreemptionState holds a thread's preempt disable and eager resched disable
 // counters, together with the mask of CPUs which have a reschedule pending as a
 // result of activity in the context of that thread.
 class PreemptionState {
  public:
   // Both counters share a single 32-bit word: the preempt disable count lives
   // in the low 16 bits and the eager resched disable count in the high 16 bits.
   static constexpr uint32_t kMaxFieldCount = 0xffff;
   static constexpr uint32_t kPreemptDisableMask = kMaxFieldCount;
   static constexpr uint32_t kEagerReschedDisableShift = 16;
   static constexpr uint32_t kEagerReschedDisableMask = kMaxFieldCount << kEagerReschedDisableShift;

   // The mask of CPUs which currently have a reschedule pending.
   cpu_mask_t preempts_pending() const { return preempts_pending_; }

   // True only when both of the disable counters are zero.
   bool PreemptIsEnabled() const { return disable_counts_ == 0; }

   // The current value of each of the individual counters.
   uint32_t PreemptDisableCount() const { return disable_counts_ & kPreemptDisableMask; }
   uint32_t EagerReschedDisableCount() const { return disable_counts_ >> kEagerReschedDisableShift; }

   // Selects which pending preemptions FlushPending() should act upon.
   enum Flush { FlushLocal = 0x1, FlushRemote = 0x2, FlushAll = FlushLocal | FlushRemote };

   // Flushes local, remote, or all pending preemptions.
   void FlushPending(Flush flush);

   // PreemptDisable() bumps the current thread's preempt disable counter.  For
   // as long as the counter is non-zero the thread cannot be preempted, not
   // even from an interrupt handler; calls to Reschedule() made in the meantime
   // merely record that a reschedule is pending instead of context switching.
   //
   // Blocking operations (e.g. mutex.Acquire()) are still permitted: disabling
   // preemption does not stop the thread from being switched away from if it
   // blocks.
   //
   // Every call to PreemptDisable() must be paired with a later call to
   // PreemptReenable(), which decrements the counter again.
   void PreemptDisable() {
     const uint32_t prev = disable_counts_.fetch_add(1);
     // The 16-bit field must not overflow.
     ASSERT((prev & kPreemptDisableMask) < kMaxFieldCount);
   }

   // PreemptReenable() drops the preempt disable counter by one and flushes any
   // pending local preemption.  Because this may preempt the calling thread
   // immediately, callers must be in a context where blocking is allowed.
   void PreemptReenable() {
     const uint32_t prev = disable_counts_.fetch_sub(1);
     ASSERT((prev & kPreemptDisableMask) > 0);

     // A previous value of exactly 1 means that both counters are now zero.
     // Anything else leaves nothing to flush here.
     if (prev != 1) {
       return;
     }

     DEBUG_ASSERT(!arch_blocking_disallowed());
     FlushPending(FlushLocal);
   }

   // As PreemptDisable(), but also acquires |preempt_disabled_token| for the
   // benefit of static analysis.
   void PreemptDisableAnnotated() TA_ACQ(preempt_disabled_token) {
     preempt_disabled_token.Acquire();
     PreemptDisable();
   }

   // As PreemptReenable(), but also releases |preempt_disabled_token| for the
   // benefit of static analysis.
   void PreemptReenableAnnotated() TA_REL(preempt_disabled_token) {
     preempt_disabled_token.Release();
     PreemptReenable();
   }

   // PreemptReenableDelayFlush() drops the preempt disable counter by one but
   // intentionally does _not_ flush a pending local preemption.  Instead, when
   // the decrement leaves both counters at zero, the local CPU's pending bit is
   // cleared, and the method returns true if that bit had been set.  In every
   // other case it returns false.
   //
   // This method may only be called when interrupts are disabled and blocking is
   // not allowed.
   //
   // By calling this method, the caller takes on the responsibility of making
   // sure that preemption happens on the local CPU in the near future whenever
   // true is returned.
   //
   // Use of this method is strongly discouraged outside of top-level interrupt
   // glue and early threading setup.
   //
   // TODO(johngro): Consider replacing the bool return type with a move-only
   // RAII type which wraps the bool, and ensures that preemption event _must_
   // happen, either by having the user call a method on the object to manually
   // force the preemption event, or when the object destructs.
   [[nodiscard]] bool PreemptReenableDelayFlush() {
     DEBUG_ASSERT(arch_ints_disabled());
     DEBUG_ASSERT(arch_blocking_disallowed());

     const uint32_t prev = disable_counts_.fetch_sub(1);
     ASSERT((prev & kPreemptDisableMask) > 0);

     // At least one of the counters is still non-zero; nothing to hand back.
     if (prev != 1) {
       return false;
     }

     // Clear this CPU's pending bit and report whether it had been set.
     const cpu_mask_t this_cpu = cpu_num_to_mask(arch_curr_cpu_num());
     const cpu_mask_t pending_before = preempts_pending_.fetch_and(~this_cpu);
     return (pending_before & this_cpu) != 0;
   }

   // EagerReschedDisable() bumps the current thread's eager resched disable
   // counter.  For as long as the counter is non-zero, issuing local and remote
   // preemptions is disabled, including from interrupt handlers.  Calls to
   // Reschedule(), or to other scheduler entry points which imply a reschedule,
   // merely record the pending reschedule for the affected CPU; no reschedule
   // IPIs are sent and no local context switch is performed.
   //
   // As with PreemptDisable, blocking operations are still allowed while
   // eager resched disable is non-zero.
   //
   // Every call to EagerReschedDisable() must be paired with a later call to
   // EagerReschedReenable(), which decrements the counter again.
   void EagerReschedDisable() {
     const uint32_t prev = disable_counts_.fetch_add(1 << kEagerReschedDisableShift);
     // The 16-bit field must not overflow.
     ASSERT((prev >> kEagerReschedDisableShift) < kMaxFieldCount);
   }

   // EagerReschedReenable() drops the eager resched disable counter by one and,
   // when that counter reaches zero, flushes pending preemptions: both local
   // and remote if the preempt disable counter is zero as well, otherwise
   // remote only.
   //
   // It is the responsibility of the caller to correctly handle flushing when
   // passing flush_pending=false. Disabling automatic flushing is strongly
   // discouraged outside of top-level interrupt glue and early threading setup.
   void EagerReschedReenable(bool flush_pending = true) {
     const uint32_t prev = disable_counts_.fetch_sub(1 << kEagerReschedDisableShift);
     ASSERT((prev >> kEagerReschedDisableShift) > 0);

     // Only the transition of the eager resched counter from 1 to 0 triggers a
     // flush, and only if the caller asked for one.
     const bool eager_count_hit_zero =
         (prev & kEagerReschedDisableMask) == 1 << kEagerReschedDisableShift;
     if (!eager_count_hit_zero || !flush_pending) {
       return;
     }

     // Local preemptions are flushed only if the whole word is now zero, in
     // which case the calling thread may be preempted right away.
     DEBUG_ASSERT(prev != 1 << kEagerReschedDisableShift || !arch_blocking_disallowed());
     FlushPending(prev == 1 << kEagerReschedDisableShift ? FlushAll : FlushRemote);
   }

   // As EagerReschedDisable(), but also acquires |preempt_disabled_token| for
   // the benefit of static analysis.
   void EagerReschedDisableAnnotated() TA_ACQ(preempt_disabled_token) {
     preempt_disabled_token.Acquire();
     EagerReschedDisable();
   }

   // As EagerReschedReenable(), but also releases |preempt_disabled_token| for
   // the benefit of static analysis.
   void EagerReschedReenableAnnotated(bool flush_pending = true) TA_REL(preempt_disabled_token) {
     preempt_disabled_token.Release();
     EagerReschedReenable(flush_pending);
   }

   // PreemptSetPending() marks a pending preemption for the given CPUs (by
   // default, only the current CPU).
   //
   // This is similar to Reschedule(), except that it may only be used inside an
   // interrupt handler while interrupts and preemption are disabled, between
   // PreemptDisable() and PreemptReenable(), and that it does not need to be
   // called with thread_lock held.
   void PreemptSetPending(cpu_mask_t reschedule_mask = cpu_num_to_mask(arch_curr_cpu_num())) {
     DEBUG_ASSERT(arch_ints_disabled());
     DEBUG_ASSERT(arch_blocking_disallowed());
     DEBUG_ASSERT(!PreemptIsEnabled());

     preempts_pending_ |= reschedule_mask;
   }

  private:
   friend class Scheduler;
   friend class PreemptDisableTestAccess;

   // disable_counts_ packs two fields into one word:
   //
   //  * Bottom 16 bits: the preempt disable counter.
   //  * Top 16 bits: the eager resched disable counter.
   //
   // Keeping them in a single field allows both counters to be compared
   // against zero with a single memory access and comparison.
   //
   // Interrupt handlers do modify disable_counts_, but they always put the
   // original value back before returning, so the interrupted thread never
   // observes their modifications.
   RelaxedAtomic<uint32_t> disable_counts_;

   // preempts_pending_ tracks pending reschedules to both local and remote CPUs
   // due to activity in the context of the current thread.
   //
   // This value can be changed asynchronously by an interrupt handler.
   //
   // preempts_pending_ should only be non-zero:
   //  * if PreemptDisableCount() or EagerReschedDisableCount() are non-zero, or
   //  * after PreemptDisableCount() or EagerReschedDisableCount() have been
   //    decremented, while preempts_pending_ is being checked.
   RelaxedAtomic<cpu_mask_t> preempts_pending_;
 };

 // TaskState is responsible for running the task defined by
 // |entry(arg)|, and reporting its value to any joining threads.
 //
 // TODO: the detached state in Thread::flags_ probably belongs here.
 class TaskState {
  public:
   TaskState() = default;

   // Sets up the task with its entry point and argument.  Defined out of line.
   void Init(thread_start_routine entry, void* arg);

   // Joins the task, giving up at |deadline|.  Defined out of line.
   // NOTE(review): presumably blocks on |retcode_wait_queue_| until
   // WakeJoiners() is called — confirm against the definition.
   zx_status_t Join(zx_time_t deadline) TA_REQ(thread_lock);

   // Wakes the threads waiting to join, handing them |status|.  Defined out of
   // line.
   void WakeJoiners(zx_status_t status) TA_REQ(thread_lock);

   // Read-only accessors.  They are const-qualified so that they may also be
   // called through a const TaskState.
   thread_start_routine entry() const { return entry_; }
   void* arg() const { return arg_; }

   int retcode() const { return retcode_; }
   void set_retcode(int retcode) { retcode_ = retcode; }

  private:
   // Dumping routines are allowed to see inside us.
   friend void dump_thread_locked(Thread* t, bool full_dump);

   // The Thread's entry point, and its argument.
   thread_start_routine entry_ = nullptr;
   void* arg_ = nullptr;

   // Storage for the return code.
   int retcode_ = 0;

   // Other threads waiting to join this Thread.
   WaitQueue retcode_wait_queue_;
 };

 // Keeps track of whether a thread is allowed to allocate memory.
 //
 // A thread's |MemoryAllocationState| should only be accessed by that thread itself or interrupt
 // handlers running in the thread's context.
 //
 // The state is a counter, so Disable()/Enable() calls nest: allocation is allowed again only
 // once every Disable() has been matched by an Enable().
 class MemoryAllocationState {
  public:
   // Disallows memory allocation by incrementing the disable count.
   void Disable() {
     ktl::atomic_signal_fence(ktl::memory_order_seq_cst);
     // Written as an explicit load, add and store of the volatile field.
     disable_count_ = disable_count_ + 1;
     ktl::atomic_signal_fence(ktl::memory_order_seq_cst);
   }

   // Undoes one prior Disable() by decrementing the disable count. Calling this without a
   // matching Disable() trips the DEBUG_ASSERT below.
   void Enable() {
     ktl::atomic_signal_fence(ktl::memory_order_seq_cst);
     DEBUG_ASSERT(disable_count_ > 0);
     disable_count_ = disable_count_ - 1;
     ktl::atomic_signal_fence(ktl::memory_order_seq_cst);
   }

   // Returns true if memory allocation is allowed, i.e. the disable count is zero.
   bool IsEnabled() {
     ktl::atomic_signal_fence(ktl::memory_order_seq_cst);
     return disable_count_ == 0;
   }

  private:
   // Notice that we aren't using atomic operations to access the field.  We don't need atomic
   // operations here as long as...
   //
   // 1. We use atomic_signal_fence to prevent compiler reordering.
   //
   // 2. We use volatile to ensure the compiler actually generates loads and stores for the value (so
   // the interrupt handler can see what the thread sees, and vice versa).
   //
   // 3. Upon completion, an interrupt handler that modified the field restores it to the value it
   // held at the start of the interrupt.
   volatile uint32_t disable_count_ = 0;
 };

 struct Thread {
   // TODO(kulakowski) Are these needed?
   // Default constructor/destructor declared to be not-inline in order to
   // avoid circular include dependencies involving Thread, WaitQueue, and
   // OwnedWaitQueue.
   Thread();
   ~Thread();

   static Thread* CreateIdleThread(cpu_num_t cpu_num);
   // Creates a thread with |name| that will execute |entry| at |priority|. |arg|
   // will be passed to |entry| when executed, the return value of |entry| will be
   // passed to Exit().
   // This call allocates a thread and places it in the global thread list. This
   // memory will be freed by either Join() or Detach(), one of these
   // MUST be called.
   // The thread will not be scheduled until Resume() is called.
   static Thread* Create(const char* name, thread_start_routine entry, void* arg, int priority);
   static Thread* CreateEtc(Thread* t, const char* name, thread_start_routine entry, void* arg,
                            int priority, thread_trampoline_routine alt_trampoline);

   // Internal initialization routines. Eventually, these should be private.
   void SecondaryCpuInitEarly();

   // Associate this Thread to the given ThreadDispatcher.
   void SetUsermodeThread(fbl::RefPtr<ThreadDispatcher> user_thread);

   // Get the associated ThreadDispatcher.
   ThreadDispatcher* user_thread() { return user_thread_.get(); }
   const ThreadDispatcher* user_thread() const { return user_thread_.get(); }

   // Returns the koid of the associated ProcessDispatcher for user threads or
   // ZX_KOID_INVALID for kernel threads.
   zx_koid_t pid() const { return pid_; }

   // Returns the koid of the associated ThreadDispatcher for user threads or an
   // independent koid for kernel threads.
   zx_koid_t tid() const { return tid_; }

   // Called to mark a thread as schedulable.
   void Resume();
   zx_status_t Suspend();
   void Forget();
   // Marks a thread as detached, in this state its memory will be released once
   // execution is done.
   zx_status_t Detach();
   zx_status_t DetachAndResume();
   // Waits |deadline| time for a thread to complete execution then releases its memory.
   zx_status_t Join(int* retcode, zx_time_t deadline);
   // Deliver a kill signal to a thread.
   void Kill();

   // Checks whether the kill or suspend signal has been raised. If kill has been
   // raised, then `ZX_ERR_INTERNAL_INTR_KILLED` will be returned. If suspend has
   // been raised, then `ZX_ERR_INTERNAL_INTR_RETRY` will be returned. Otherwise,
   // `ZX_OK` will be returned.
   zx_status_t CheckKillOrSuspendSignal() const;

   // Erase this thread from all global lists, where applicable.
   void EraseFromListsLocked() TA_REQ(thread_lock);

   void SetPriority(int priority);
   void SetDeadline(const zx_sched_deadline_params_t& params);

   void* recursive_object_deletion_list() { return recursive_object_deletion_list_; }
   void set_recursive_object_deletion_list(void* ptr) { recursive_object_deletion_list_ = ptr; }

   // Get/set the mask of valid CPUs that thread may run on. If a new mask
   // is set, the thread will be migrated to satisfy the new constraint.
   //
   // Affinity comes in two flavours:
   //
   //   * "hard affinity", which will always be respected by the scheduler.
   //     The scheduler will panic if it can't satisfy this affinity.
   //
   //   * "soft affinity" indicating where the thread should ideally be scheduled.
   //     The scheduler will respect the mask unless there are no other
   //     options (e.g., the soft affinity and hard affinity don't contain
   //     any common CPUs).
   //
   // If the two masks conflict, the hard affinity wins.
   void SetCpuAffinity(cpu_mask_t affinity) TA_EXCL(thread_lock);
   cpu_mask_t GetCpuAffinity() const TA_EXCL(thread_lock);
   void SetSoftCpuAffinity(cpu_mask_t affinity) TA_EXCL(thread_lock);
   cpu_mask_t GetSoftCpuAffinity() const TA_EXCL(thread_lock);

   enum class MigrateStage {
     // The stage before the thread has migrated. Called from the old CPU.
     Before,
     // The stage after the thread has migrated. Called from the new CPU.
     After,
     // The Thread is exiting. Can be called from any CPU.
     Exiting,
   };
   // The migrate function will be invoked twice when a thread is migrated between
   // CPUs. Firstly when the thread is removed from the old CPU's scheduler,
   // secondly when the thread is rescheduled on the new CPU. When the migrate
   // function is called, |thread_lock| is held.
   using MigrateFn = fit::inline_function<void(Thread* thread, MigrateStage stage), sizeof(void*)>
       TA_REQ(thread_lock);

   void SetMigrateFn(MigrateFn migrate_fn) TA_EXCL(thread_lock);
   void SetMigrateFnLocked(MigrateFn migrate_fn) TA_REQ(thread_lock);
   void CallMigrateFnLocked(MigrateStage stage) TA_REQ(thread_lock);

   // Call |migrate_fn| for each thread that was last run on the current CPU.
   static void CallMigrateFnForCpuLocked(cpu_num_t cpu) TA_REQ(thread_lock);

   void OwnerName(char (&out_name)[ZX_MAX_NAME_LEN]);
   // Return the number of nanoseconds a thread has been running for.
   zx_duration_t Runtime() const;

   // Last cpu this thread was running on, or INVALID_CPU if it has never run.
   cpu_num_t LastCpu() const TA_EXCL(thread_lock);
   cpu_num_t LastCpuLocked() const;

   // Return true if thread has been signaled.
   bool IsSignaled() { return signals() != 0; }
   bool IsIdle() const { return !!(flags_ & THREAD_FLAG_IDLE); }

   // Returns true if this Thread's user state has been saved.
   //
   // Caller must hold the thread lock.
   bool IsUserStateSavedLocked() const TA_REQ(thread_lock) {
     thread_lock.AssertHeld();
     return user_state_saved_;
   }

   // Callback for the Timer used for SleepEtc.
   static void SleepHandler(Timer* timer, zx_time_t now, void* arg);
   void HandleSleep(Timer* timer, zx_time_t now);

   // All of these operations implicitly operate on the current thread.
   struct Current {
     // This is defined below, just after the Thread declaration.
     static inline Thread* Get();

     // Scheduler routines to be used by regular kernel code.
     static void Yield();
     static void Preempt();
     static void Reschedule();
     static void Exit(int retcode) __NO_RETURN;
     static void ExitLocked(int retcode) TA_REQ(thread_lock) __NO_RETURN;
     static void Kill();
     static void BecomeIdle() __NO_RETURN;

     // Wait until the deadline has occurred.
     //
     // If interruptible, may return early with ZX_ERR_INTERNAL_INTR_KILLED if
     // thread is signaled for kill.
     static zx_status_t SleepEtc(const Deadline& deadline, Interruptible interruptible,
                                 zx_time_t now);
     // Non-interruptible version of SleepEtc.
     static zx_status_t Sleep(zx_time_t deadline);
     // Non-interruptible relative delay version of Sleep.
     static zx_status_t SleepRelative(zx_duration_t delay);
     // Interruptible version of Sleep.
     static zx_status_t SleepInterruptible(zx_time_t deadline);

     // Transition the current thread to the THREAD_SUSPENDED state.
     static void DoSuspend();

     // |policy_exception_code| should be a ZX_EXCP_POLICY_CODE_* value.
     static void SignalPolicyException(uint32_t policy_exception_code,
                                       uint32_t policy_exception_data);

     // Process pending signals, may never return because of kill signal.
     static void ProcessPendingSignals(GeneralRegsSource source, void* gregs);

     // Migrates the current thread to the CPU identified by |target_cpuid|.
     static void MigrateToCpu(cpu_num_t target_cpuid);

     static void SetName(const char* name);

     static PreemptionState& preemption_state() {
       return Thread::Current::Get()->preemption_state();
     }

     static MemoryAllocationState& memory_allocation_state() {
       return Thread::Current::Get()->memory_allocation_state_;
     }

     // Generate a backtrace for the calling thread.
     //
     // |out_bt| will be reset() prior to being filled in and if a backtrace cannot
     // be obtained, it will be left empty.
     static void GetBacktrace(Backtrace& out_bt);

     // Generate a backtrace for the calling thread starting at frame pointer |fp|.
     //
     // |out_bt| will be reset() prior to being filled in and if a backtrace cannot
     // be obtained, it will be left empty.
     static void GetBacktrace(vaddr_t fp, Backtrace& out_bt);

     static void DumpLocked(bool full) TA_REQ(thread_lock);
     static void Dump(bool full) TA_EXCL(thread_lock);
     static void DumpAllThreadsLocked(bool full) TA_REQ(thread_lock);
     static void DumpAllThreads(bool full) TA_EXCL(thread_lock);
     static void DumpUserTid(zx_koid_t tid, bool full) TA_EXCL(thread_lock);
     static void DumpUserTidLocked(zx_koid_t tid, bool full) TA_REQ(thread_lock);
     static void DumpAllDuringPanic(bool full) TA_NO_THREAD_SAFETY_ANALYSIS {
       dump_all_threads_during_panic(full);
     }
     static void DumpUserTidDuringPanic(zx_koid_t tid, bool full) TA_NO_THREAD_SAFETY_ANALYSIS {
       dump_thread_tid_during_panic(tid, full);
     }
   };

   // Trait for the global Thread list.
   struct ThreadListTrait {
     static fbl::DoublyLinkedListNodeState<Thread*>& node_state(Thread& thread) {
       return thread.thread_list_node_;
     }
   };
   using List = fbl::DoublyLinkedListCustomTraits<Thread*, ThreadListTrait>;

   // Traits for the temporary unblock list, used to batch-unblock threads.
   //
   // TODO(johngro): look into options for optimizing this.  It should be
   // possible to share node storage with that used for wait queues (since a
   // thread needs to have been removed from a wait queue before being sent to
   // Scheduler::Unblock).
   struct UnblockListTrait {
     static fbl::DoublyLinkedListNodeState<Thread*>& node_state(Thread& thread) {
       return thread.unblock_list_node_;
     }
   };
   using UnblockList = fbl::DoublyLinkedListCustomTraits<Thread*, UnblockListTrait>;

   // Stats for a thread's runtime.
   class RuntimeStats {
    public:
     struct SchedulerStats {
       thread_state state = thread_state::THREAD_INITIAL;  // last state
       zx_time_t state_time = 0;                           // when the thread entered state
       zx_duration_t cpu_time = 0;                         // time spent on CPU
       zx_duration_t queue_time = 0;                       // time spent ready to start running
     };

     const SchedulerStats& GetSchedulerStats() const { return sched_; }

     // Update scheduler stats with newer content.
     //
     // Adds to CPU and queue time, but sets the given state directly.
     void UpdateSchedulerStats(const SchedulerStats& other) {
       sched_.cpu_time = zx_duration_add_duration(sched_.cpu_time, other.cpu_time);
       sched_.queue_time = zx_duration_add_duration(sched_.queue_time, other.queue_time);
       sched_.state = other.state;
       sched_.state_time = other.state_time;
     }

     // Add time spent handling page faults.
     // Safe for concurrent use.
     void AddPageFaultTicks(zx_ticks_t ticks) {
       // Ignore overflow: it will take hundreds of years to overflow, and even if it
       // does overflow, this is primarily used to compute relative (rather than absolute)
       // values, which still works after overflow.
       page_fault_ticks_.fetch_add(ticks);
     }

     // Add time spent contented on locks.
     // Safe for concurrent use.
     void AddLockContentionTicks(zx_ticks_t ticks) {
       // Ignore overflow: it will take hundreds of years to overflow, and even if it
       // does overflow, this is primarily used to compute relative (rather than absolute)
       // values, which still works after overflow.
       lock_contention_ticks_.fetch_add(ticks);
     }

     // Get the current TaskRuntimeStats, including the current scheduler state.
     TaskRuntimeStats TotalRuntime() const {
       TaskRuntimeStats ret = {
           .cpu_time = sched_.cpu_time,
           .queue_time = sched_.queue_time,
           .page_fault_ticks = page_fault_ticks_.load(),
           .lock_contention_ticks = lock_contention_ticks_.load(),
       };
       if (sched_.state == thread_state::THREAD_RUNNING) {
         ret.cpu_time = zx_duration_add_duration(
             ret.cpu_time, zx_duration_sub_duration(current_time(), sched_.state_time));
       } else if (sched_.state == thread_state::THREAD_READY) {
         ret.queue_time = zx_duration_add_duration(
             ret.queue_time, zx_duration_sub_duration(current_time(), sched_.state_time));
       }
       return ret;
     }

     // Adds the local stats to the given output for userspace.
     //
     // This method uses the current state of the thread to include partial runtime and queue time
     // between reschedules.
     void AccumulateRuntimeTo(zx_info_task_runtime_t* info) const {
       TaskRuntimeStats runtime = TotalRuntime();
       runtime.AccumulateRuntimeTo(info);
     }

    private:
     SchedulerStats sched_;
     RelaxedAtomic<zx_ticks_t> page_fault_ticks_{0};
     RelaxedAtomic<zx_ticks_t> lock_contention_ticks_{0};
   };

   struct Linebuffer {
     size_t pos = 0;
     ktl::array<char, 128> buffer;
   };

   void UpdateSchedulerStats(const RuntimeStats::SchedulerStats& stats) TA_REQ(thread_lock);

   void DumpDuringPanic(bool full) TA_NO_THREAD_SAFETY_ANALYSIS {
     dump_thread_during_panic(this, full);
   }

   // Accessors into Thread state. When the conversion to all-private
   // members is complete (bug 54383), we can revisit the overall
   // Thread API.

   thread_state state() const { return scheduler_state_.state(); }

   // The scheduler can set threads to be running, or to be ready to run.
   void set_running() { scheduler_state_.set_state(THREAD_RUNNING); }
   void set_ready() { scheduler_state_.set_state(THREAD_READY); }
   // While wait queues can set threads to be blocked.
   void set_blocked() { scheduler_state_.set_state(THREAD_BLOCKED); }
   void set_blocked_read_lock() { scheduler_state_.set_state(THREAD_BLOCKED_READ_LOCK); }
   // The thread can set itself to be sleeping.
   void set_sleeping() { scheduler_state_.set_state(THREAD_SLEEPING); }
   void set_death() { scheduler_state_.set_state(THREAD_DEATH); }
   void set_suspended() { scheduler_state_.set_state(THREAD_SUSPENDED); }

   // Accessors for specific flags_ bits.
   bool detatched() const { return (flags_ & THREAD_FLAG_DETACHED) != 0; }
   // Sets THREAD_FLAG_DETACHED in flags_ when |value| is true and clears it
   // otherwise; all other flag bits are left untouched.
   void set_detached(bool value) {
     flags_ = value ? (flags_ | THREAD_FLAG_DETACHED) : (flags_ & ~THREAD_FLAG_DETACHED);
   }
   bool free_struct() const { return (flags_ & THREAD_FLAG_FREE_STRUCT) != 0; }
   // Sets THREAD_FLAG_FREE_STRUCT in flags_ when |value| is true and clears it
   // otherwise; all other flag bits are left untouched.
   void set_free_struct(bool value) {
     flags_ = value ? (flags_ | THREAD_FLAG_FREE_STRUCT) : (flags_ & ~THREAD_FLAG_FREE_STRUCT);
   }
   bool idle() const { return (flags_ & THREAD_FLAG_IDLE) != 0; }
   // Sets THREAD_FLAG_IDLE in flags_ when |value| is true and clears it
   // otherwise; all other flag bits are left untouched.
   void set_idle(bool value) {
     flags_ = value ? (flags_ | THREAD_FLAG_IDLE) : (flags_ & ~THREAD_FLAG_IDLE);
   }
   bool vcpu() const { return (flags_ & THREAD_FLAG_VCPU) != 0; }
   // Sets THREAD_FLAG_VCPU in flags_ when |value| is true and clears it
   // otherwise; all other flag bits are left untouched.
   void set_vcpu(bool value) {
     flags_ = value ? (flags_ | THREAD_FLAG_VCPU) : (flags_ & ~THREAD_FLAG_VCPU);
   }

   // Access to the entire flags_ value, for diagnostics.
   unsigned int flags() const { return flags_; }

   unsigned int signals() const { return signals_.load(ktl::memory_order_relaxed); }

   bool has_migrate_fn() const { return migrate_fn_ != nullptr; }
   bool migrate_pending() const { return migrate_pending_; }

   TaskState& task_state() { return task_state_; }
   const TaskState& task_state() const { return task_state_; }

   PreemptionState& preemption_state() { return preemption_state_; }
   const PreemptionState& preemption_state() const { return preemption_state_; }

   SchedulerState& scheduler_state() { return scheduler_state_; }
   const SchedulerState& scheduler_state() const { return scheduler_state_; }

   WaitQueueCollection::ThreadState& wait_queue_state() { return wait_queue_state_; }
   const WaitQueueCollection::ThreadState& wait_queue_state() const { return wait_queue_state_; }

 #if WITH_LOCK_DEP
   lockdep::ThreadLockState& lock_state() { return lock_state_; }
   const lockdep::ThreadLockState& lock_state() const { return lock_state_; }
 #endif

   arch_thread& arch() { return arch_; }
   const arch_thread& arch() const { return arch_; }

   KernelStack& stack() { return stack_; }
   const KernelStack& stack() const { return stack_; }

   VmAspace* aspace() { return aspace_; }
   const VmAspace* aspace() const { return aspace_; }
   // Replaces this thread's address space pointer with |aspace| and returns the
   // pointer that was stored before the call.
   VmAspace* switch_aspace(VmAspace* aspace) {
     VmAspace* const previous = aspace_;
     aspace_ = aspace;
     return previous;
   }

   const char* name() const { return name_; }
   // This may truncate |name|, so that it (including a trailing NUL
   // byte) fits in ZX_MAX_NAME_LEN bytes.
   void set_name(ktl::string_view name);

   Linebuffer& linebuffer() { return linebuffer_; }

   using Canary = fbl::Canary<fbl::magic("thrd")>;
   const Canary& canary() const { return canary_; }

   // Generate a backtrace for |this| thread.
   //
   // |this| must be blocked, sleeping or suspended (i.e. not running).
   //
   // |out_bt| will be reset() prior to being filled in and if a backtrace cannot be
   // obtained, it will be left empty.
   void GetBacktrace(Backtrace& out_bt) TA_EXCL(thread_lock);

   StackOwnedLoanedPagesInterval* stack_owned_loaned_pages_interval() {
     return stack_owned_loaned_pages_interval_;
   }

  private:
   // The architecture-specific methods for getting and setting the
   // current thread may need to see Thread's arch_ member via offsetof.
   friend inline Thread* arch_get_current_thread();
   friend inline void arch_set_current_thread(Thread*);

   // OwnedWaitQueues manipulate wait queue state.
   friend class OwnedWaitQueue;

   // ScopedThreadExceptionContext is the only public way to call
   // SaveUserStateLocked and RestoreUserStateLocked.
   friend class ScopedThreadExceptionContext;

   // StackOwnedLoanedPagesInterval is the only public way to set/clear the
   // stack_owned_loaned_pages_interval().
   friend class StackOwnedLoanedPagesInterval;

   // Dumping routines are allowed to see inside us.
   friend void dump_thread_locked(Thread* t, bool full_dump);

   // The default trampoline used when running the Thread. This can be
   // replaced by the |alt_trampoline| parameter to CreateEtc().
   static void Trampoline() TA_REQ(thread_lock) __NO_RETURN;

   // Dpc callback used for cleaning up a detached Thread's resources.
   static void FreeDpc(Dpc* dpc);

   // Save the arch-specific user state.
   //
   // Returns true when the user state will later need to be restored.
   [[nodiscard]] bool SaveUserStateLocked() TA_REQ(thread_lock);

   // Restore the arch-specific user state.
   void RestoreUserStateLocked() TA_REQ(thread_lock);

   // Returns true if it decides to kill the thread, which must be the
   // current thread. The thread_lock must be held when calling this
   // function.
   //
   // TODO: move this to CurrentThread, once that becomes a subclass of
   // Thread.
   bool CheckKillSignal() TA_REQ(thread_lock);

   __NO_RETURN void ExitLocked(int retcode) TA_REQ(thread_lock);

  private:
   struct MigrateListTrait {
     static fbl::DoublyLinkedListNodeState<Thread*>& node_state(Thread& thread) {
       return thread.migrate_list_node_;
     }
   };
   using MigrateList = fbl::DoublyLinkedListCustomTraits<Thread*, MigrateListTrait>;

   // The global list of threads with migrate functions.
   static MigrateList migrate_list_ TA_GUARDED(thread_lock);

   Canary canary_;

   // These fields are among the most active in the thread. They are grouped
   // together near the front to improve cache locality.
   unsigned int flags_;
   ktl::atomic<unsigned int> signals_;
   SchedulerState scheduler_state_;
   WaitQueueCollection::ThreadState wait_queue_state_;
   TaskState task_state_;
   PreemptionState preemption_state_;
   MemoryAllocationState memory_allocation_state_;
   // This is part of ensuring that all stack ownership of loaned pages can be boosted in priority
   // via priority inheritance if a higher priority thread is trying to reclaim the loaned pages.
   StackOwnedLoanedPagesInterval* stack_owned_loaned_pages_interval_ = nullptr;

 #if WITH_LOCK_DEP
   // state for runtime lock validation when in thread context
   lockdep::ThreadLockState lock_state_;
 #endif

   // pointer to the kernel address space this thread is associated with
   VmAspace* aspace_;

   // Saved by SignalPolicyException() to store the type of policy error, and
   // passed to exception dispatch in ProcessPendingSignals().
   uint32_t extra_policy_exception_code_ TA_GUARDED(thread_lock) = 0;
   uint32_t extra_policy_exception_data_ TA_GUARDED(thread_lock) = 0;

   // Strong reference to user thread if one exists for this thread.
   // In the common case freeing Thread will also free ThreadDispatcher when this
   // reference is dropped.
   fbl::RefPtr<ThreadDispatcher> user_thread_;

   // When user_thread_ is set, these values are copied from ThreadDispatcher and
   // its parent ProcessDispatcher. Kernel threads maintain an independent tid.
   zx_koid_t tid_ = KernelObjectId::Generate();
   zx_koid_t pid_ = ZX_KOID_INVALID;

   // Architecture-specific stuff.
   struct arch_thread arch_;

   KernelStack stack_;

   // This is used by dispatcher.cc:SafeDeleter.
   void* recursive_object_deletion_list_ = nullptr;

   // This always includes the trailing NUL.
   char name_[ZX_MAX_NAME_LEN];

   // Buffering for Debuglog output.
   Linebuffer linebuffer_;

   // Indicates whether user register state (debug, vector, fp regs, etc.) has been saved to the
   // arch_thread_t as part of thread suspension / exception handling.
   //
   // When a user thread is suspended or generates an exception (synthetic or architectural) that
   // might be observed by another process, we save user register state to the thread's arch_thread_t
   // so that it may be accessed by a debugger.  Upon leaving a suspended or exception state, we
   // restore user register state.
   //
   // See also |IsUserStateSavedLocked()| and |ScopedThreadExceptionContext|.
   bool user_state_saved_;

   // Provides a way to execute custom logic when a thread must be migrated between CPUs.
   MigrateFn migrate_fn_;

   // For threads with migration functions, indicates whether a migration is in progress. When true,
   // the migrate function has been called with Before but not yet with After.
   bool migrate_pending_;

   // Used to track threads that have set |migrate_fn_|. This is used to migrate
   // threads before a CPU is taken offline.
   fbl::DoublyLinkedListNodeState<Thread*> migrate_list_node_ TA_GUARDED(thread_lock);

   // Node storage for existing on the global thread list.
   fbl::DoublyLinkedListNodeState<Thread*> thread_list_node_ TA_GUARDED(thread_lock);

   // Node storage for existing on the temporary batch unblock list.
   fbl::DoublyLinkedListNodeState<Thread*> unblock_list_node_ TA_GUARDED(thread_lock);
 };

 // For the moment, the arch-specific current thread implementations need to come here, after the
 // Thread definition. One of the arches needs to know the structure of Thread to compute the offset
 // that the hardware pointer holds into Thread.
 #include <arch/current_thread.h>
 Thread* Thread::Current::Get() { return arch_get_current_thread(); }

 // TODO(johngro): Remove this when we have addressed fxbug.dev/33473.  Right now, this
 // is used in only one place (x86_bringup_aps in arch/x86/smp.cpp) outside of
 // thread.cpp.
 //
 // Normal users should only ever need to call either Thread::Create, or
 // Thread::CreateEtc.
 void init_thread_struct(Thread* t, const char* name);

 // Other thread-system bringup functions.
 void thread_init_early();
 void thread_secondary_cpu_entry() __NO_RETURN;
 void thread_construct_first(Thread* t, const char* name);

 // Call the arch-specific signal handler.
 extern "C" void arch_iframe_process_pending_signals(iframe_t* iframe);

 // Find a thread based on the thread id.
 // NOTE: used only for debugging; it's a slow linear search through the
 // global thread list.
 Thread* thread_id_to_thread_slow(zx_koid_t tid) TA_EXCL(thread_lock);

 // RAII helper that installs/removes an exception context and saves/restores user register state.
 // The class operates on the current thread.
 //
 // When a thread takes an exception, this class is used to make user register state available to
 // debuggers and exception handlers.
 //
 // Example Usage:
 //
 // {
 //   ScopedThreadExceptionContext context(...);
 //   HandleException();
 // }
 //
 // Note, ScopedThreadExceptionContext keeps track of whether the state has already been saved so
 // it's safe to nest them:
 //
 // void Foo() {
 //   ScopedThreadExceptionContext context(...);
 //   Bar();
 // }
 //
 // void Bar() {
 //   ScopedThreadExceptionContext context(...);
 //   Baz();
 // }
 //
 class ScopedThreadExceptionContext {
  public:
   explicit ScopedThreadExceptionContext(const arch_exception_context_t* context);
   ~ScopedThreadExceptionContext();
   DISALLOW_COPY_ASSIGN_AND_MOVE(ScopedThreadExceptionContext);

  private:
   Thread* thread_;
   const arch_exception_context_t* context_;
   bool need_to_remove_;
   bool need_to_restore_;
 };

 // RAII helper to enforce that a block of code does not allocate memory.
 //
 // See |Thread::Current::memory_allocation_state()|.
 class ScopedMemoryAllocationDisabled {
  public:
   ScopedMemoryAllocationDisabled() { Thread::Current::memory_allocation_state().Disable(); }
   ~ScopedMemoryAllocationDisabled() { Thread::Current::memory_allocation_state().Enable(); }
   DISALLOW_COPY_ASSIGN_AND_MOVE(ScopedMemoryAllocationDisabled);
 };

 // WaitQueue collection trait implementations.  While typically these would be
 // implemented in-band in the trait class itself, these definitions must come
 // last, after the definition Thread.  This is because the traits (defined in
 // WaitQueueCollection) need to understand the layout of Thread in order to be
 // able to access both scheduler state and wait queue state variables.
 // Builds the blocked-thread tree key for |thread|: its stored sort key paired
 // with the thread's own address.
 inline WaitQueueCollection::Key WaitQueueCollection::BlockedThreadTreeTraits::GetKey(
     const Thread& thread) {
   // TODO(johngro): consider extending FBL to support a "MultiWAVLTree"
   // implementation which would allow for nodes with identical keys, breaking
   // ties (under the hood) using pointer value.  This way, we would not need to
   // manifest our own pointer in GetKey or in our key type.
   const auto& wq_state = thread.wait_queue_state();
   const uintptr_t thread_address = reinterpret_cast<uintptr_t>(&thread);
   return {wq_state.blocked_threads_tree_sort_key_, thread_address};
 }

 // Returns the WAVL tree node storage embedded in |thread|'s wait queue state.
 inline fbl::WAVLTreeNodeState<Thread*>& WaitQueueCollection::BlockedThreadTreeTraits::node_state(
     Thread& thread) {
   auto& wq_state = thread.wait_queue_state();
   return wq_state.blocked_threads_tree_node_;
 }

 // Returns the value |thread| contributes to the minimum-relative-deadline
 // search: nullptr when the thread uses the Fair discipline, otherwise a
 // pointer to |thread| itself.
 inline Thread* WaitQueueCollection::MinRelativeDeadlineTraits::GetValue(const Thread& thread) {
   // TODO(johngro), consider pre-computing this value so it is just a fetch
   // instead of a branch.
   if (thread.scheduler_state().discipline() == SchedDiscipline::Fair) {
     return nullptr;
   }
   return const_cast<Thread*>(&thread);
 }

 // Returns the subtree-best thread pointer currently stored in |thread|'s wait
 // queue state.
 inline Thread* WaitQueueCollection::MinRelativeDeadlineTraits::GetSubtreeBest(
     const Thread& thread) {
   const auto& wq_state = thread.wait_queue_state();
   return wq_state.subtree_min_rel_deadline_thread_;
 }

 // Returns true if |a| is a strictly better choice than |b| when looking for the
 // thread with the minimum relative deadline.
 //
 // The thread pointer value of a non-deadline thread is null, and non-deadline
 // threads are always the worst choice: null never beats anything, and any
 // non-null thread beats null. Between two deadline threads the smaller
 // deadline wins, with ties broken by thread address.
 inline bool WaitQueueCollection::MinRelativeDeadlineTraits::Compare(Thread* a, Thread* b) {
   // clang-format off
   if (a == nullptr) { return false; }
   if (b == nullptr) { return true; }
   const SchedDuration a_deadline = a->scheduler_state().deadline().deadline_ns;
   const SchedDuration b_deadline = b->scheduler_state().deadline().deadline_ns;
   // Break ties on the integer value of the addresses rather than with `<` on
   // the raw pointers: relational comparison of pointers to unrelated objects
   // has an unspecified result, and BlockedThreadTreeTraits::GetKey() already
   // orders threads by uintptr_t.
   return (a_deadline < b_deadline) ||
          ((a_deadline == b_deadline) &&
           (reinterpret_cast<uintptr_t>(a) < reinterpret_cast<uintptr_t>(b)));
   // clang-format on
 }

 // Stores |val| as the subtree-best thread pointer in |thread|'s wait queue
 // state.
 inline void WaitQueueCollection::MinRelativeDeadlineTraits::AssignBest(Thread& thread,
                                                                        Thread* val) {
   auto& wq_state = thread.wait_queue_state();
   wq_state.subtree_min_rel_deadline_thread_ = val;
 }

 // Clears |thread|'s stored subtree-best pointer when the guarding macro below
 // is defined; otherwise this function does nothing.
 inline void WaitQueueCollection::MinRelativeDeadlineTraits::ResetBest(Thread& thread) {
   // In a debug build, zero out the subtree best as we leave the collection.
   // This can help to find bugs by allowing us to assert that the value is zero
   // during insertion, however it is not strictly needed in a production build
   // and can be skipped.
   //
   // NOTE(review): this is an #ifdef, so the store is compiled in whenever
   // DEBUG_ASSERT_IMPLEMENTED is defined at all, regardless of its value. If
   // that macro is defined as a 0/1 expression, `#if` may have been intended --
   // confirm against the macro's definition.
 #ifdef DEBUG_ASSERT_IMPLEMENTED
   thread.wait_queue_state().subtree_min_rel_deadline_thread_ = nullptr;
 #endif
 }

 // Debug-asserts that preemption is not enabled for the current thread, as
 // reported by its PreemptionState.
 inline void PreemptDisabledToken::AssertHeld() {
   DEBUG_ASSERT(Thread::Current::preemption_state().PreemptIsEnabled() == false);
 }

 #endif  // ZIRCON_KERNEL_INCLUDE_KERNEL_THREAD_H_