| // Copyright 2018 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include "kernel/scheduler.h" |
| |
| #include <assert.h> |
| #include <debug.h> |
| #include <err.h> |
| #include <inttypes.h> |
| #include <lib/counters.h> |
| #include <lib/ktrace.h> |
| #include <platform.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <target.h> |
| #include <trace.h> |
| #include <zircon/listnode.h> |
| #include <zircon/types.h> |
| |
| #include <algorithm> |
| #include <new> |
| |
| #include <kernel/lockdep.h> |
| #include <kernel/mp.h> |
| #include <kernel/percpu.h> |
| #include <kernel/sched.h> |
| #include <kernel/scheduler_internal.h> |
| #include <kernel/scheduler_state.h> |
| #include <kernel/thread.h> |
| #include <kernel/thread_lock.h> |
| #include <ktl/algorithm.h> |
| #include <ktl/move.h> |
| #include <vm/vm.h> |
| |
| using ffl::FromRatio; |
| using ffl::Round; |
| |
// Determines which subset of tracers is enabled when detailed tracing is
| // enabled. |
| #define LOCAL_KTRACE_LEVEL SCHEDULER_TRACING_LEVEL |
| |
| // The tracing levels used in this compilation unit. |
| #define KTRACE_COMMON 1 |
| #define KTRACE_FLOW 2 |
| #define KTRACE_DETAILED 3 |
| |
| // Evaluates to true if tracing is enabled for the given level. |
| #define LOCAL_KTRACE_LEVEL_ENABLED(level) ((LOCAL_KTRACE_LEVEL) >= (level)) |
| |
| #define LOCAL_KTRACE(level, string, args...) \ |
| ktrace_probe(LocalTrace<LOCAL_KTRACE_LEVEL_ENABLED(level)>, TraceContext::Cpu, \ |
| KTRACE_STRING_REF(string), ##args) |
| |
| #define LOCAL_KTRACE_FLOW_BEGIN(level, string, flow_id, args...) \ |
| ktrace_flow_begin(LocalTrace<LOCAL_KTRACE_LEVEL_ENABLED(level)>, TraceContext::Cpu, \ |
| KTRACE_GRP_SCHEDULER, KTRACE_STRING_REF(string), flow_id, ##args) |
| |
| #define LOCAL_KTRACE_FLOW_END(level, string, flow_id, args...) \ |
| ktrace_flow_end(LocalTrace<LOCAL_KTRACE_LEVEL_ENABLED(level)>, TraceContext::Cpu, \ |
| KTRACE_GRP_SCHEDULER, KTRACE_STRING_REF(string), flow_id, ##args) |
| |
| template <size_t level> |
| using LocalTraceDuration = TraceDuration<TraceEnabled<LOCAL_KTRACE_LEVEL_ENABLED(level)>, |
| KTRACE_GRP_SCHEDULER, TraceContext::Cpu>; |
| |
| // Enable/disable console traces local to this file. |
| #define LOCAL_TRACE 0 |
| |
| #define SCHED_LTRACEF(str, args...) LTRACEF("[%u] " str, arch_curr_cpu_num(), ##args) |
| #define SCHED_TRACEF(str, args...) TRACEF("[%u] " str, arch_curr_cpu_num(), ##args) |
| |
| // Counters to track system load metrics. |
| KCOUNTER(demand_counter, "thread.demand_accum") |
| KCOUNTER(latency_counter, "thread.latency_accum") |
| KCOUNTER(runnable_counter, "thread.runnable_accum") |
| KCOUNTER(samples_counter, "thread.samples_accum") |
| |
| namespace { |
| |
| // Conversion table entry. Scales the integer argument to a fixed-point weight |
| // in the interval (0.0, 1.0]. |
| struct WeightTableEntry { |
| constexpr WeightTableEntry(int64_t value) |
| : value{FromRatio<int64_t>(value, SchedWeight::Format::Power)} {} |
| constexpr operator SchedWeight() const { return value; } |
| const SchedWeight value; |
| }; |
| |
| // Table of fixed-point constants converting from kernel priority to fair |
| // scheduler weight. |
| constexpr WeightTableEntry kPriorityToWeightTable[] = { |
| 121, 149, 182, 223, 273, 335, 410, 503, 616, 754, 924, |
| 1132, 1386, 1698, 2080, 2549, 3122, 3825, 4685, 5739, 7030, 8612, |
| 10550, 12924, 15832, 19394, 23757, 29103, 35651, 43672, 53499, 65536}; |
| |
| // Converts from kernel priority value in the interval [0, 31] to weight in the |
| // interval (0.0, 1.0]. See the definition of SchedWeight for an explanation of |
| // the weight distribution. |
| constexpr SchedWeight PriorityToWeight(int priority) { return kPriorityToWeightTable[priority]; } |
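// For reference, the table forms an approximately geometric series with a
// ratio of ~1.225 between adjacent priorities: priority 31 maps to 1.0
// (65536 in fixed point) and priority 0 to roughly 0.0018. Since fair time
// slices are proportional to weight, each priority step changes a thread's
// relative CPU share by roughly 1.225x.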
| |
| // The minimum possible weight and its reciprocal. |
| constexpr SchedWeight kMinWeight = PriorityToWeight(LOWEST_PRIORITY); |
| constexpr SchedWeight kReciprocalMinWeight = 1 / kMinWeight; |
| |
// Utility operator to make expressions that update thread times and durations
// of basic types with their fixed-point counterparts more succinct.
| constexpr zx_time_t& operator+=(zx_time_t& value, SchedDuration delta) { |
| value += delta.raw_value(); |
| return value; |
| } |
| |
| // On ARM64 with safe-stack, it's no longer possible to use the unsafe-sp |
| // after arch_set_current_thread (we'd now see newthread's unsafe-sp instead!). |
// Hence this function and everything it calls between this point and the
// low-level context switch must be marked with __NO_SAFESTACK.
| __NO_SAFESTACK void FinalContextSwitch(Thread* oldthread, Thread* newthread) { |
| arch_set_current_thread(newthread); |
| arch_context_switch(oldthread, newthread); |
| } |
| |
| // Writes a context switch record to the ktrace buffer. This is always enabled |
| // so that user mode tracing can track which threads are running. |
| inline void TraceContextSwitch(const Thread* current_thread, const Thread* next_thread, |
| cpu_num_t current_cpu) { |
| const auto raw_current = reinterpret_cast<uintptr_t>(current_thread); |
| const auto raw_next = reinterpret_cast<uintptr_t>(next_thread); |
| const auto current = static_cast<uint32_t>(raw_current); |
| const auto next = static_cast<uint32_t>(raw_next); |
| const auto user_tid = static_cast<uint32_t>(next_thread->user_tid_); |
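  // The context argument packs several fields into a single 32-bit word:
  //   bits [7:0]   current CPU number
  //   bits [15:8]  current thread state
  //   bits [23:16] current thread base priority
  //   bits [31:24] next thread base priority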
| const uint32_t context = current_cpu | (current_thread->state_ << 8) | |
| (current_thread->base_priority_ << 16) | |
| (next_thread->base_priority_ << 24); |
| |
| ktrace(TAG_CONTEXT_SWITCH, user_tid, context, current, next); |
| } |
| |
| // Returns a sufficiently unique flow id for a thread based on the thread id and |
| // queue generation count. This flow id cannot be used across enqueues because |
| // the generation count changes during enqueue. |
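// For a tid that fits in 32 bits, e.g. user_tid_ = 0x12345678 with
// generation = 7 (illustrative values), the rotation yields
// 0x1234567800000000 and the flow id is 0x1234567800000007: the varying
// generation bits land in the low half, clear of the tid bits.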
| inline uint64_t FlowIdFromThreadGeneration(const Thread* thread) { |
| const int kRotationBits = 32; |
| const uint64_t rotated_tid = |
| (thread->user_tid_ << kRotationBits) | (thread->user_tid_ >> kRotationBits); |
| return rotated_tid ^ thread->scheduler_state_.generation(); |
| } |
| |
| // Returns true if the given thread is fair scheduled. |
| inline bool IsFairThread(const Thread* thread) { |
| return thread->scheduler_state_.discipline() == SchedDiscipline::Fair; |
| } |
| |
| // Returns true if the given thread is deadline scheduled. |
| inline bool IsDeadlineThread(const Thread* thread) { |
| return thread->scheduler_state_.discipline() == SchedDiscipline::Deadline; |
| } |
| |
| // Returns true if the given thread's time slice is adjustable under changes to |
| // the fair scheduler demand on the CPU. |
| inline bool IsThreadAdjustable(const Thread* thread) { |
| // Checking the thread state avoids unnecessary adjustments on a thread that |
| // is no longer competing. |
| return !thread->IsIdle() && IsFairThread(thread) && thread->state_ == THREAD_READY; |
| } |
| |
// Returns the effective mask of CPUs a thread may run on, based on the
| // thread's affinity masks and available CPUs. |
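// For example, a thread with hard affinity 0b1111 and soft affinity 0b0010
// runs on CPU 1 while that CPU is active; if CPU 1 leaves the active set, the
// soft affinity is ignored and any active CPU in 0b1111 may be used.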
| cpu_mask_t GetEffectiveCpuMask(cpu_mask_t active_mask, const Thread* thread) { |
| // The thread may run on any active CPU allowed by both its hard and |
| // soft CPU affinity. |
| const cpu_mask_t soft_affinity = thread->soft_affinity_; |
| const cpu_mask_t hard_affinity = thread->hard_affinity_; |
| const cpu_mask_t available_mask = active_mask & soft_affinity & hard_affinity; |
| |
| // Return the mask honoring soft affinity if it is viable, otherwise ignore |
| // soft affinity and honor only hard affinity. |
| if (likely(available_mask != 0)) { |
| return available_mask; |
| } |
| |
| return active_mask & hard_affinity; |
| } |
| |
| } // anonymous namespace |
| |
| void Scheduler::Dump() { |
| printf("\tweight_total=%#x fair_tasks=%d deadline_tasks=%d vtime=%" PRId64 " period=%" PRId64 |
| " ema=%" PRId64 " deadline_utilization=%" PRId64 "\n", |
| static_cast<uint32_t>(weight_total_.raw_value()), runnable_fair_task_count_, |
| runnable_deadline_task_count_, virtual_time_.raw_value(), |
| scheduling_period_grans_.raw_value(), total_expected_runtime_ns_.raw_value(), |
| total_deadline_utilization_.raw_value()); |
| |
| if (active_thread_ != nullptr) { |
| const SchedulerState& state = active_thread_->scheduler_state_; |
| if (IsFairThread(active_thread_)) { |
| printf("\t-> name=%s weight=%#x start=%" PRId64 " finish=%" PRId64 " ts=%" PRId64 |
| " ema=%" PRId64 "\n", |
| active_thread_->name_, static_cast<uint32_t>(state.fair_.weight.raw_value()), |
| state.start_time_.raw_value(), state.finish_time_.raw_value(), |
| state.time_slice_ns_.raw_value(), state.expected_runtime_ns_.raw_value()); |
| } else { |
| printf("\t-> name=%s deadline=(%" PRId64 ", %" PRId64 ", %" PRId64 ") start=%" PRId64 |
| " finish=%" PRId64 " ts=%" PRId64 " ema=%" PRId64 "\n", |
| active_thread_->name_, state.deadline_.capacity_ns.raw_value(), |
| state.deadline_.deadline_ns.raw_value(), state.deadline_.period_ns.raw_value(), |
| state.start_time_.raw_value(), state.finish_time_.raw_value(), |
| state.time_slice_ns_.raw_value(), state.expected_runtime_ns_.raw_value()); |
| } |
| } |
| |
| for (const Thread& thread : deadline_run_queue_) { |
| const SchedulerState& state = thread.scheduler_state_; |
| printf("\t name=%s deadline=(%" PRId64 ", %" PRId64 ", %" PRId64 ") start=%" PRId64 |
| " finish=%" PRId64 " ts=%" PRId64 " ema=%" PRId64 "\n", |
| thread.name_, state.deadline_.capacity_ns.raw_value(), |
| state.deadline_.deadline_ns.raw_value(), state.deadline_.period_ns.raw_value(), |
| state.start_time_.raw_value(), state.finish_time_.raw_value(), |
| state.time_slice_ns_.raw_value(), state.expected_runtime_ns_.raw_value()); |
| } |
| |
| for (const Thread& thread : fair_run_queue_) { |
| const SchedulerState& state = thread.scheduler_state_; |
| printf("\t name=%s weight=%#x start=%" PRId64 " finish=%" PRId64 " ts=%" PRId64 |
| " ema=%" PRId64 "\n", |
| thread.name_, static_cast<uint32_t>(state.fair_.weight.raw_value()), |
| state.start_time_.raw_value(), state.finish_time_.raw_value(), |
| state.time_slice_ns_.raw_value(), state.expected_runtime_ns_.raw_value()); |
| } |
| } |
| |
| SchedWeight Scheduler::GetTotalWeight() const { |
| Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()}; |
| return weight_total_; |
| } |
| |
| size_t Scheduler::GetRunnableTasks() const { |
| Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()}; |
| const int64_t total_runnable_tasks = runnable_fair_task_count_ + runnable_deadline_task_count_; |
| return static_cast<size_t>(total_runnable_tasks); |
| } |
| |
| // Performs an augmented binary search for the task with the earliest finish |
| // time that is also equal to or later than the given eligible time. |
| // |
| // The tree is ordered by start time and is augmented by maintaining an |
| // additional invariant: each task node in the tree stores the minimum finish |
// time of its descendants, including itself, in addition to its own start and
// finish time. The combination of these three values permits traversing the
| // tree along a perfect partition of minimum finish times with eligible start |
| // times. |
| // |
| // See kernel/scheduler_internal.h for an explanation of how the augmented |
| // invariant is maintained. |
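//
// For example (illustrative): with tasks A{start=1, finish=9},
// B{start=3, finish=5}, and C{start=6, finish=7} in the queue and
// eligible_time=4, tasks A and B are eligible (start <= 4) and the search
// returns B, the eligible task with the earliest finish time.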
| Thread* Scheduler::FindEarliestEligibleThread(RunQueue* run_queue, SchedTime eligible_time) { |
| // Early out if there is no eligible thread. |
| if (run_queue->is_empty() || run_queue->front().scheduler_state_.start_time_ > eligible_time) { |
| return nullptr; |
| } |
| |
| auto node = run_queue->root(); |
| auto subtree = run_queue->end(); |
| auto path = run_queue->end(); |
| |
| // Descend the tree, with |node| following the path from the root to a leaf, |
| // such that the path partitions the tree into two parts: the nodes on the |
| // left represent eligible tasks, while the nodes on the right represent tasks |
| // that are not eligible. Eligible tasks are both in the left partition and |
| // along the search path, tracked by |path|. |
| while (node) { |
| if (node->scheduler_state_.start_time_ <= eligible_time) { |
| if (!path || path->scheduler_state_.finish_time_ > node->scheduler_state_.finish_time_) { |
| path = node; |
| } |
| |
| if (auto left = node.left(); |
| !subtree || (left && subtree->scheduler_state_.min_finish_time_ > |
| left->scheduler_state_.min_finish_time_)) { |
| subtree = left; |
| } |
| |
| node = node.right(); |
| } else { |
| node = node.left(); |
| } |
| } |
| |
| if (!subtree || |
| subtree->scheduler_state_.min_finish_time_ >= path->scheduler_state_.finish_time_) { |
| return path.CopyPointer(); |
| } |
| |
  // Find the node with the earliest finish time among the descendants of the
| // subtree with the smallest minimum finish time. |
| node = subtree; |
| do { |
| if (subtree->scheduler_state_.min_finish_time_ == node->scheduler_state_.finish_time_) { |
| return node.CopyPointer(); |
| } |
| |
| if (auto left = node.left(); left && node->scheduler_state_.min_finish_time_ == |
| left->scheduler_state_.min_finish_time_) { |
| node = left; |
| } else { |
| node = node.right(); |
| } |
| } while (node); |
| |
| return nullptr; |
| } |
| |
| Scheduler* Scheduler::Get() { return Get(arch_curr_cpu_num()); } |
| |
| Scheduler* Scheduler::Get(cpu_num_t cpu) { return &percpu::Get(cpu).scheduler; } |
| |
| void Scheduler::InitializeThread(Thread* thread, int priority) { |
| new (&thread->scheduler_state_) SchedulerState{PriorityToWeight(priority)}; |
| thread->base_priority_ = priority; |
| thread->effec_priority_ = priority; |
| thread->inherited_priority_ = -1; |
| thread->scheduler_state_.expected_runtime_ns_ = kDefaultTargetLatency; |
| } |
| |
| void Scheduler::InitializeThread(Thread* thread, const zx_sched_deadline_params_t& params) { |
| new (&thread->scheduler_state_) SchedulerState{params}; |
| // Set the numeric priority of the deadline task to the highest as a temporary |
| // workaround for the rest of the kernel not knowing about deadlines. This |
| // will cause deadline tasks to exert maximum fair scheduler pressure on fair |
| // tasks during PI interactions. |
| // TODO(eieio): Fix this with an abstraction that the higher layers can use |
| // to express priority / deadline more abstractly for PI and etc... |
| thread->base_priority_ = HIGHEST_PRIORITY; |
| thread->effec_priority_ = HIGHEST_PRIORITY; |
| thread->inherited_priority_ = -1; |
| thread->scheduler_state_.expected_runtime_ns_ = SchedDuration{params.capacity}; |
| } |
| |
| // Removes the thread at the head of the first eligible run queue. If there is |
| // an eligible deadline thread, it takes precedence over available fair |
| // threads. |
| Thread* Scheduler::DequeueThread(SchedTime now) { |
| if (IsDeadlineThreadEligible(now)) { |
| return DequeueDeadlineThread(now); |
| } else if (likely(!fair_run_queue_.is_empty())) { |
| return DequeueFairThread(); |
| } else { |
| return &percpu::Get(this_cpu()).idle_thread; |
| } |
| } |
| |
| // Dequeues the eligible thread with the earliest virtual finish time. The |
| // caller must ensure that there is at least one thread in the queue. |
| Thread* Scheduler::DequeueFairThread() { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"dequeue_fair_thread"_stringref}; |
| |
| // Snap the virtual clock to the earliest start time. |
| const auto& earliest_thread = fair_run_queue_.front(); |
| const auto earliest_start = earliest_thread.scheduler_state_.start_time_; |
| const SchedTime eligible_time = ktl::max(virtual_time_, earliest_start); |
| |
| // Find the eligible thread with the earliest virtual finish time. |
| // Note: Currently, fair tasks are always eligible when added to the run |
| // queue, such that this search is equivalent to taking the front element of |
| // a tree sorted by finish time, instead of start time. However, when moving |
| // to the WF2Q algorithm, eligibility becomes a factor. Using the eligibility |
| // query now prepares for migrating the algorithm and also avoids having two |
| // different template instantiations of fbl::WAVLTree to support the fair and |
| // deadline disciplines. |
| Thread* const eligible_thread = FindEarliestEligibleThread(&fair_run_queue_, eligible_time); |
| DEBUG_ASSERT_MSG(eligible_thread != nullptr, |
| "virtual_time=%" PRId64 ", eligible_time=%" PRId64 " , start_time=%" PRId64 |
| ", finish_time=%" PRId64 ", min_finish_time=%" PRId64 "!", |
| virtual_time_.raw_value(), eligible_time.raw_value(), |
| earliest_thread.scheduler_state_.start_time_.raw_value(), |
| earliest_thread.scheduler_state_.finish_time_.raw_value(), |
| earliest_thread.scheduler_state_.min_finish_time_.raw_value()); |
| |
| virtual_time_ = eligible_time; |
| return fair_run_queue_.erase(*eligible_thread); |
| } |
| |
| // Dequeues the eligible thread with the earliest deadline. The caller must |
| // ensure that there is at least one eligible thread in the queue. |
| Thread* Scheduler::DequeueDeadlineThread(SchedTime eligible_time) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"dequeue_deadline_thread"_stringref}; |
| |
| Thread* const eligible_thread = FindEarliestEligibleThread(&deadline_run_queue_, eligible_time); |
| DEBUG_ASSERT_MSG(eligible_thread != nullptr, |
| "eligible_time=%" PRId64 ", start_time=%" PRId64 ", finish_time=%" PRId64 |
| ", min_finish_time=%" PRId64 "!", |
| eligible_time.raw_value(), |
| eligible_thread->scheduler_state_.start_time_.raw_value(), |
| eligible_thread->scheduler_state_.finish_time_.raw_value(), |
| eligible_thread->scheduler_state_.min_finish_time_.raw_value()); |
| |
| deadline_run_queue_.erase(*eligible_thread); |
| |
| const SchedulerState& state = eligible_thread->scheduler_state_; |
| trace.End(Round<uint64_t>(state.start_time_), Round<uint64_t>(state.finish_time_)); |
| return eligible_thread; |
| } |
| |
| // Returns the eligible thread with the earliest deadline that is also earlier |
// than the given deadline. Returns nullptr if no thread meets these criteria or
| // the run queue is empty. |
| Thread* Scheduler::FindEarlierDeadlineThread(SchedTime eligible_time, SchedTime finish_time) { |
| Thread* const eligible_thread = FindEarliestEligibleThread(&deadline_run_queue_, eligible_time); |
| const bool found_earlier_deadline = |
| eligible_thread && eligible_thread->scheduler_state_.finish_time_ < finish_time; |
| return found_earlier_deadline ? eligible_thread : nullptr; |
| } |
| |
// Returns the time that the next deadline task will become eligible, or infinite
| // if there are no ready deadline tasks. |
| SchedTime Scheduler::GetNextEligibleTime() { |
| return deadline_run_queue_.is_empty() ? SchedTime{ZX_TIME_INFINITE} |
| : deadline_run_queue_.front().scheduler_state_.start_time_; |
| } |
| |
| // Dequeues the eligible thread with the earliest deadline that is also earlier |
// than the given deadline. Returns nullptr if no thread meets the criteria or
| // the run queue is empty. |
| Thread* Scheduler::DequeueEarlierDeadlineThread(SchedTime eligible_time, SchedTime finish_time) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"dequeue_earlier_deadline_thread"_stringref}; |
| Thread* const eligible_thread = FindEarlierDeadlineThread(eligible_time, finish_time); |
| return eligible_thread ? deadline_run_queue_.erase(*eligible_thread) : nullptr; |
| } |
| |
| // Updates the system load metrics. Updates happen only when the active thread |
| // changes or the time slice expires. |
| void Scheduler::UpdateCounters(SchedDuration queue_time_ns) { |
| demand_counter.Add(weight_total_.raw_value()); |
| runnable_counter.Add(runnable_fair_task_count_ + runnable_deadline_task_count_); |
| latency_counter.Add(queue_time_ns.raw_value()); |
| samples_counter.Add(1); |
| } |
| |
// Selects a thread to run. Performs any necessary maintenance if the current
| // thread is changing, depending on the reason for the change. |
| Thread* Scheduler::EvaluateNextThread(SchedTime now, Thread* current_thread, bool timeslice_expired, |
| SchedDuration total_runtime_ns) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"find_thread"_stringref}; |
| |
| const bool is_idle = current_thread->IsIdle(); |
| const bool is_active = current_thread->state_ == THREAD_READY; |
| const bool is_deadline = IsDeadlineThread(current_thread); |
| const bool is_new_deadline_eligible = IsDeadlineThreadEligible(now); |
| |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(current_cpu); |
| const cpu_mask_t active_mask = mp_get_active_mask(); |
| |
| Thread* next_thread = nullptr; |
| if (is_active && likely(!is_idle)) { |
| if (timeslice_expired) { |
| // If the timeslice expired insert the current thread into the run queue. |
| QueueThread(current_thread, Placement::Insertion, now, total_runtime_ns); |
| } else if (is_new_deadline_eligible && is_deadline) { |
| // The current thread is deadline scheduled and there is at least one |
| // eligible deadline thread in the run queue: select the eligible thread |
| // with the earliest deadline, which may still be the current thread. |
| const SchedTime deadline_ns = current_thread->scheduler_state_.finish_time_; |
| if (Thread* const earlier_thread = DequeueEarlierDeadlineThread(now, deadline_ns); |
| earlier_thread != nullptr) { |
| QueueThread(current_thread, Placement::Preemption, now, total_runtime_ns); |
| next_thread = earlier_thread; |
| } else { |
| // The current thread still has the earliest deadline. |
| next_thread = current_thread; |
| } |
| } else if (is_new_deadline_eligible && !is_deadline) { |
| // The current thread is fair scheduled and there is at least one eligible |
| // deadline thread in the run queue: return this thread to the run queue. |
| QueueThread(current_thread, Placement::Preemption, now, total_runtime_ns); |
| } else { |
| // The current thread has remaining time and no eligible contender. |
| next_thread = current_thread; |
| } |
| } else if (!is_active && likely(!is_idle)) { |
| // The current thread is no longer ready, remove its accounting. |
| Remove(current_thread); |
| } |
| |
| // The current thread is no longer running or has returned to the run queue, |
| // select another thread to run. |
| if (next_thread == nullptr) { |
| next_thread = DequeueThread(now); |
| } |
| |
| // Returns true when the given thread requires active migration. |
| const auto needs_migration = [active_mask, current_cpu_mask](Thread* const thread) { |
| return (GetEffectiveCpuMask(active_mask, thread) & current_cpu_mask) == 0 || |
| thread->next_cpu_ != INVALID_CPU; |
| }; |
| |
| // If the next thread needs *active* migration, call the migration function, |
| // migrate the thread, and select another thread to run. |
| // |
| // Most migrations are passive. Passive migration happens whenever a thread |
| // becomes READY and a different CPU is selected than the last CPU the thread |
| // ran on. |
| // |
| // Active migration happens under the following conditions: |
| // 1. The CPU affinity of a thread that is READY or RUNNING is changed to |
| // exclude the CPU it is currently active on. |
| // 2. Passive migration, or active migration due to #1, selects a different |
| // CPU for a thread with a migration function. Migration to the next CPU |
| // is delayed until the migration function is called on the last CPU. |
| // 3. A thread that is READY or RUNNING is relocated by the periodic load |
| // balancer. NOT YET IMPLEMENTED. |
| // |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| for (; needs_migration(next_thread); next_thread = DequeueThread(now)) { |
    // If the thread is not scheduled to migrate to a specific CPU, find a
| // suitable target CPU. If the thread has a migration function, the search |
| // will schedule the thread to migrate to a specific CPU and return the |
| // current CPU. |
| cpu_num_t target_cpu = INVALID_CPU; |
| if (next_thread->next_cpu_ == INVALID_CPU) { |
| target_cpu = FindTargetCpu(next_thread); |
| DEBUG_ASSERT(target_cpu != this_cpu() || next_thread->next_cpu_ != INVALID_CPU); |
| } |
| |
| // If the thread is scheduled to migrate to a specific CPU, set the target |
| // to that CPU and call the migration function. |
| if (next_thread->next_cpu_ != INVALID_CPU) { |
| DEBUG_ASSERT(next_thread->last_cpu_ == this_cpu()); |
| target_cpu = next_thread->next_cpu_; |
| next_thread->CallMigrateFnLocked(Thread::MigrateStage::Before); |
| next_thread->next_cpu_ = INVALID_CPU; |
| } |
| |
| // The target CPU must always be different than the current CPU. |
| DEBUG_ASSERT(target_cpu != this_cpu()); |
| |
| // Remove accounting from this run queue and insert in the target run queue. |
| Remove(next_thread); |
| Scheduler* const target = Get(target_cpu); |
| target->Insert(now, next_thread); |
| |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| |
| // Issue reschedule IPIs to CPUs with migrated threads. |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| |
| return next_thread; |
| } |
| |
| cpu_num_t Scheduler::FindTargetCpu(Thread* thread) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"find_target: cpu,avail"_stringref}; |
| |
| const cpu_num_t last_cpu = thread->last_cpu_; |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| const cpu_mask_t last_cpu_mask = cpu_num_to_mask(last_cpu); |
| const cpu_mask_t active_mask = mp_get_active_mask(); |
| const cpu_mask_t idle_mask = mp_get_idle_mask(); |
| |
| // Determine the set of CPUs the thread is allowed to run on. |
| // |
| // Threads may be created and resumed before the thread init level. Work around |
  // an empty active mask by assuming the current CPU is schedulable.
| const cpu_mask_t available_mask = |
| active_mask != 0 ? GetEffectiveCpuMask(active_mask, thread) : current_cpu_mask; |
| DEBUG_ASSERT_MSG(available_mask != 0, |
| "thread=%s affinity=%#x soft_affinity=%#x active=%#x " |
| "idle=%#x arch_ints_disabled=%d", |
| thread->name_, thread->hard_affinity_, thread->soft_affinity_, active_mask, |
| mp_get_idle_mask(), arch_ints_disabled()); |
| |
| LOCAL_KTRACE(KTRACE_DETAILED, "target_mask: online,active", mp_get_online_mask(), active_mask); |
| |
| cpu_num_t target_cpu; |
| Scheduler* target_queue; |
| |
| // Select an initial target. |
| if (last_cpu_mask & available_mask && (!idle_mask || last_cpu_mask & idle_mask)) { |
| target_cpu = last_cpu; |
| } else if (current_cpu_mask & available_mask) { |
| target_cpu = arch_curr_cpu_num(); |
| } else { |
| target_cpu = lowest_cpu_set(available_mask); |
| } |
| |
| target_queue = Get(target_cpu); |
| |
| // See if there is a better target in the set of available CPUs. |
| // TODO(eieio): Replace this with a search in order of increasing cache |
| // distance from the initial target cpu when topology information is available. |
| // TODO(eieio): Add some sort of threshold to terminate search when a |
| // sufficiently unloaded target is found. |
| |
| const auto compare_fair = [](Scheduler* const queue_a, |
| Scheduler* const queue_b) TA_REQ(thread_lock) { |
| if (queue_a->total_expected_runtime_ns_ == queue_b->total_expected_runtime_ns_) { |
| return queue_a->total_deadline_utilization_ < queue_b->total_deadline_utilization_; |
| } |
| return queue_a->total_expected_runtime_ns_ < queue_b->total_expected_runtime_ns_; |
| }; |
| const auto is_idle_fair = [](Scheduler* const queue) TA_REQ(thread_lock) { |
| return queue->total_expected_runtime_ns_ == SchedDuration{0}; |
| }; |
| |
| const auto compare_deadline = [](Scheduler* const queue_a, |
| Scheduler* const queue_b) TA_REQ(thread_lock) { |
| if (queue_a->total_deadline_utilization_ == queue_b->total_deadline_utilization_) { |
| return queue_a->total_expected_runtime_ns_ < queue_b->total_expected_runtime_ns_; |
| } |
| return queue_a->total_deadline_utilization_ < queue_b->total_deadline_utilization_; |
| }; |
| const auto is_idle_deadline = [](Scheduler* const queue) TA_REQ(thread_lock) { |
| return queue->total_deadline_utilization_ == SchedUtilization{0} && |
| queue->total_expected_runtime_ns_ == SchedDuration{0}; |
| }; |
| |
| const auto compare = IsFairThread(thread) ? compare_fair : compare_deadline; |
| const auto is_idle = IsFairThread(thread) ? is_idle_fair : is_idle_deadline; |
| |
| cpu_mask_t remaining_mask = available_mask & ~cpu_num_to_mask(target_cpu); |
| while (remaining_mask != 0 && !is_idle(target_queue)) { |
| const cpu_num_t candidate_cpu = lowest_cpu_set(remaining_mask); |
| Scheduler* const candidate_queue = Get(candidate_cpu); |
| |
| if (compare(candidate_queue, target_queue)) { |
| target_cpu = candidate_cpu; |
| target_queue = candidate_queue; |
| } |
| |
| remaining_mask &= ~cpu_num_to_mask(candidate_cpu); |
| } |
| |
| SCHED_LTRACEF("thread=%s target_cpu=%u\n", thread->name_, target_cpu); |
| trace.End(target_cpu, remaining_mask); |
| |
| bool delay_migration = last_cpu != target_cpu && last_cpu != INVALID_CPU && thread->migrate_fn_ && |
| (active_mask & last_cpu_mask) != 0; |
| if (unlikely(delay_migration)) { |
| thread->next_cpu_ = target_cpu; |
| return last_cpu; |
| } else { |
| return target_cpu; |
| } |
| } |
| |
| void Scheduler::UpdateTimeline(SchedTime now) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"update_vtime"_stringref}; |
| |
| const auto runtime_ns = now - last_update_time_ns_; |
| last_update_time_ns_ = now; |
| |
| if (weight_total_ > SchedWeight{0}) { |
| virtual_time_ += runtime_ns; |
| } |
| |
| trace.End(Round<uint64_t>(runtime_ns), Round<uint64_t>(virtual_time_)); |
| } |
| |
| void Scheduler::RescheduleCommon(SchedTime now, EndTraceCallback end_outer_trace) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"reschedule_common"_stringref}; |
| |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| Thread* const current_thread = Thread::Current::Get(); |
| SchedulerState* const current_state = ¤t_thread->scheduler_state_; |
| |
| DEBUG_ASSERT(arch_ints_disabled()); |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| // Aside from the thread_lock, spinlocks should never be held over a reschedule. |
| DEBUG_ASSERT(arch_num_spinlocks_held() == 1); |
| DEBUG_ASSERT_MSG(current_thread->state_ != THREAD_RUNNING, "state %d\n", current_thread->state_); |
| DEBUG_ASSERT(!arch_blocking_disallowed()); |
| DEBUG_ASSERT_MSG(current_cpu == this_cpu(), "current_cpu=%u this_cpu=%u", current_cpu, |
| this_cpu()); |
| |
| CPU_STATS_INC(reschedules); |
| |
| UpdateTimeline(now); |
| |
| const SchedDuration total_runtime_ns = now - start_of_current_time_slice_ns_; |
| const SchedDuration actual_runtime_ns = now - current_thread->last_started_running_; |
| current_thread->last_started_running_ = now.raw_value(); |
| |
| // Update the runtime accounting for the thread that just ran. |
| current_thread->runtime_ns_ += actual_runtime_ns; |
| |
| // Adjust the rate of the current thread when demand changes. Changes in |
| // demand could be due to threads entering or leaving the run queue, or due |
| // to weights changing in the current or enqueued threads. |
| if (IsThreadAdjustable(current_thread) && weight_total_ != scheduled_weight_total_ && |
| total_runtime_ns < current_state->time_slice_ns_) { |
| LocalTraceDuration<KTRACE_DETAILED> trace_adjust_rate{"adjust_rate"_stringref}; |
| scheduled_weight_total_ = weight_total_; |
| |
| const SchedDuration time_slice_ns = CalculateTimeslice(current_thread); |
| const SchedDuration remaining_time_slice_ns = |
| time_slice_ns * current_state->fair_.normalized_timeslice_remainder; |
| |
| const bool timeslice_changed = time_slice_ns != current_state->fair_.initial_time_slice_ns; |
| const bool timeslice_remaining = total_runtime_ns < remaining_time_slice_ns; |
| |
| // Update the preemption timer if necessary. |
| if (timeslice_changed && timeslice_remaining) { |
| const SchedTime slice_deadline_ns = start_of_current_time_slice_ns_ + remaining_time_slice_ns; |
| absolute_deadline_ns_ = ClampToDeadline(slice_deadline_ns); |
| TimerQueue::PreemptReset(absolute_deadline_ns_.raw_value()); |
| } |
| |
| current_state->fair_.initial_time_slice_ns = time_slice_ns; |
| current_state->time_slice_ns_ = remaining_time_slice_ns; |
| trace_adjust_rate.End(Round<uint64_t>(remaining_time_slice_ns), |
| Round<uint64_t>(total_runtime_ns)); |
| } |
| |
| const bool timeslice_expired = total_runtime_ns >= current_state->time_slice_ns_; |
| |
| // Select a thread to run. |
| Thread* const next_thread = |
| EvaluateNextThread(now, current_thread, timeslice_expired, total_runtime_ns); |
| DEBUG_ASSERT(next_thread != nullptr); |
| |
| SCHED_LTRACEF("current={%s, %s} next={%s, %s} expired=%d total_runtime_ns=%" PRId64 |
| " fair_front=%s deadline_front=%s\n", |
| current_thread->name_, ToString(current_thread->state_), next_thread->name_, |
| ToString(next_thread->state_), timeslice_expired, total_runtime_ns.raw_value(), |
| fair_run_queue_.is_empty() ? "[none]" : fair_run_queue_.front().name_, |
| deadline_run_queue_.is_empty() ? "[none]" : deadline_run_queue_.front().name_); |
| |
| // Call the migrate function if the thread has moved between CPUs. |
| if (next_thread->last_cpu_ != INVALID_CPU && next_thread->last_cpu_ != next_thread->curr_cpu_) { |
| next_thread->CallMigrateFnLocked(Thread::MigrateStage::After); |
| } |
| |
| // Update the state of the current and next thread. |
| current_thread->preempt_pending_ = false; |
| next_thread->state_ = THREAD_RUNNING; |
| next_thread->last_cpu_ = current_cpu; |
| next_thread->curr_cpu_ = current_cpu; |
| |
| active_thread_ = next_thread; |
| |
| // Update the expected runtime of the current thread and the per-CPU total. |
| // Only update the thread and aggregate values if the current thread is still |
| // associated with this CPU. |
| const bool current_is_active = |
| current_state->active() && current_thread->curr_cpu_ == current_cpu; |
| if (!current_thread->IsIdle() && current_is_active && |
| (timeslice_expired || current_thread != next_thread)) { |
| LocalTraceDuration<KTRACE_DETAILED> update_ema_trace{ |
| "update_expected_runtime: rt, drt"_stringref}; |
| |
| // The expected runtime is an exponential moving average updated as follows: |
| // |
| // a = 1 / 2**d |
| // Sn = Sn-1 + a * (Yn - Sn-1) |
| // = Sn-1 + (Yn - Sn-1) >> d |
| // |
| const SchedDuration delta_ns = total_runtime_ns - current_state->expected_runtime_ns_; |
| const SchedDuration scaled_ns = delta_ns / (1 << kExpectedRuntimeAdjustmentRateShift); |
| const SchedDuration clamped_ns = |
| ktl::max<SchedDuration>(scaled_ns, -current_state->expected_runtime_ns_); |
| current_state->expected_runtime_ns_ += clamped_ns; |
| |
| // Adjust the aggregate value by the same amount. |
| total_expected_runtime_ns_ += clamped_ns; |
| |
| update_ema_trace.End(Round<uint64_t>(total_expected_runtime_ns_), |
| Round<uint64_t>(total_deadline_utilization_)); |
| } |
| |
| // Always call to handle races between reschedule IPIs and changes to the run |
| // queue. |
| mp_prepare_current_cpu_idle_state(next_thread->IsIdle()); |
| |
| if (next_thread->IsIdle()) { |
| mp_set_cpu_idle(current_cpu); |
| } else { |
| mp_set_cpu_busy(current_cpu); |
| } |
| |
| // The task is always non-realtime when managed by this scheduler. |
| // TODO(eieio): Revisit this when deadline scheduling is addressed. |
| mp_set_cpu_non_realtime(current_cpu); |
| |
| if (current_thread->IsIdle()) { |
| percpu::Get(current_cpu).stats.idle_time += actual_runtime_ns; |
| } |
| |
| if (next_thread->IsIdle()) { |
| LocalTraceDuration<KTRACE_DETAILED> trace_stop_preemption{"idle"_stringref}; |
| SCHED_LTRACEF("Idle: current=%s next=%s\n", current_thread->name_, next_thread->name_); |
| UpdateCounters(SchedDuration{0}); |
| next_thread->last_started_running_ = now.raw_value(); |
| |
| // If there are no tasks to run in the future, disable the preemption timer. |
| // Otherwise, set the preemption time to the earliest eligible time. |
| if (deadline_run_queue_.is_empty()) { |
| TimerQueue::PreemptCancel(); |
| } else { |
| const auto& earliest_thread = deadline_run_queue_.front(); |
| absolute_deadline_ns_ = earliest_thread.scheduler_state_.start_time_; |
| TimerQueue::PreemptReset(absolute_deadline_ns_.raw_value()); |
| } |
| } else if (timeslice_expired || next_thread != current_thread) { |
| LocalTraceDuration<KTRACE_DETAILED> trace_start_preemption{ |
| "next_slice: now,deadline"_stringref}; |
| |
| // Re-compute the time slice and deadline for the new thread based on the |
| // latest state. |
| absolute_deadline_ns_ = NextThreadTimeslice(next_thread, now); |
| |
| // Compute the time the next thread spent in the run queue. The value of |
| // last_started_running for the current thread is updated at the top of |
| // this method: when the current and next thread are the same, the queue |
| // time is zero. Otherwise, last_started_running is the time the next thread |
| // entered the run queue. |
| const SchedDuration queue_time_ns = now - next_thread->last_started_running_; |
| UpdateCounters(queue_time_ns); |
| |
| next_thread->last_started_running_ = now.raw_value(); |
| start_of_current_time_slice_ns_ = now; |
| scheduled_weight_total_ = weight_total_; |
| |
| SCHED_LTRACEF("Start preempt timer: current=%s next=%s now=%" PRId64 " deadline=%" PRId64 "\n", |
| current_thread->name_, next_thread->name_, now.raw_value(), |
| absolute_deadline_ns_.raw_value()); |
| TimerQueue::PreemptReset(absolute_deadline_ns_.raw_value()); |
| |
| trace_start_preemption.End(Round<uint64_t>(now), Round<uint64_t>(absolute_deadline_ns_)); |
| |
| // Emit a flow end event to match the flow begin event emitted when the |
    // thread was enqueued. Emitting in this scope ensures that the thread just
| // came from the run queue (and is not the idle thread). |
| LOCAL_KTRACE_FLOW_END(KTRACE_FLOW, "sched_latency", FlowIdFromThreadGeneration(next_thread), |
| next_thread->user_tid_); |
| } else if (const SchedTime eligible_time_ns = GetNextEligibleTime(); |
| eligible_time_ns < absolute_deadline_ns_) { |
| absolute_deadline_ns_ = eligible_time_ns; |
| TimerQueue::PreemptReset(absolute_deadline_ns_.raw_value()); |
| } |
| |
| if (next_thread != current_thread) { |
| LOCAL_KTRACE(KTRACE_DETAILED, "reschedule current: count,slice", |
| runnable_fair_task_count_ + runnable_deadline_task_count_, |
| Round<uint64_t>(current_thread->scheduler_state_.time_slice_ns_)); |
| LOCAL_KTRACE(KTRACE_DETAILED, "reschedule next: wsum,slice", weight_total_.raw_value(), |
| Round<uint64_t>(next_thread->scheduler_state_.time_slice_ns_)); |
| |
| TraceContextSwitch(current_thread, next_thread, current_cpu); |
| |
| // Blink the optional debug LEDs on the target. |
| target_set_debug_led(0, !next_thread->IsIdle()); |
| |
| SCHED_LTRACEF("current=(%s, flags 0x%#x) next=(%s, flags 0x%#x)\n", current_thread->name_, |
| current_thread->flags_, next_thread->name_, next_thread->flags_); |
| |
| if (current_thread->aspace_ != next_thread->aspace_) { |
| vmm_context_switch(current_thread->aspace_, next_thread->aspace_); |
| } |
| |
| CPU_STATS_INC(context_switches); |
| |
| // Prevent the scheduler durations from spanning the context switch. |
| // Some context switches do not resume within this method on the other |
| // thread, which results in unterminated durations. All of the callers |
| // with durations tail-call this method, so terminating the duration |
| // here should not cause significant inaccuracy of the outer duration. |
| trace.End(); |
| if (end_outer_trace) { |
| end_outer_trace(); |
| } |
| FinalContextSwitch(current_thread, next_thread); |
| } |
| } |
| |
| void Scheduler::UpdatePeriod() { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"update_period"_stringref}; |
| |
| DEBUG_ASSERT(runnable_fair_task_count_ >= 0); |
| DEBUG_ASSERT(minimum_granularity_ns_ > 0); |
| DEBUG_ASSERT(peak_latency_grans_ > 0); |
| DEBUG_ASSERT(target_latency_grans_ > 0); |
| |
| const int64_t num_tasks = runnable_fair_task_count_; |
| const int64_t peak_tasks = Round<int64_t>(peak_latency_grans_); |
| const int64_t normal_tasks = Round<int64_t>(target_latency_grans_); |
| |
| // The scheduling period stretches when there are too many tasks to fit |
| // within the target latency. |
| scheduling_period_grans_ = SchedDuration{num_tasks > normal_tasks ? num_tasks : normal_tasks}; |
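  // For example (illustrative): with a target latency of 16 granules and 20
  // runnable fair tasks, the period stretches to 20 granules so that, with
  // equal weights, each task still receives about one minimum granule per
  // period.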
| |
| SCHED_LTRACEF("num_tasks=%" PRId64 " peak_tasks=%" PRId64 " normal_tasks=%" PRId64 |
| " period_grans=%" PRId64 "\n", |
| num_tasks, peak_tasks, normal_tasks, scheduling_period_grans_.raw_value()); |
| |
| trace.End(Round<uint64_t>(scheduling_period_grans_), num_tasks); |
| } |
| |
| SchedDuration Scheduler::CalculateTimeslice(Thread* thread) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"calculate_timeslice: w,wt"_stringref}; |
| SchedulerState* const state = &thread->scheduler_state_; |
| |
| // Calculate the relative portion of the scheduling period. |
| const SchedWeight proportional_time_slice_grans = |
| scheduling_period_grans_ * state->fair_.weight / weight_total_; |
| |
| // Ensure that the time slice is at least the minimum granularity. |
| const int64_t time_slice_grans = Round<int64_t>(proportional_time_slice_grans); |
| const int64_t minimum_time_slice_grans = time_slice_grans > 0 ? time_slice_grans : 1; |
| |
  // Calculate the time slice in nanoseconds.
| const SchedDuration time_slice_ns = minimum_time_slice_grans * minimum_granularity_ns_; |
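  // For example (illustrative): a thread holding weight 0.3 out of a total
  // weight of 1.2 over a 12-granule period receives 12 * 0.3 / 1.2 = 3
  // granules, i.e. 3 * minimum_granularity_ns_ of time slice.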
| |
| trace.End(state->fair_.weight.raw_value(), weight_total_.raw_value()); |
| return time_slice_ns; |
| } |
| |
| SchedTime Scheduler::ClampToDeadline(SchedTime completion_time) { |
| return ktl::min(completion_time, GetNextEligibleTime()); |
| } |
| |
| SchedTime Scheduler::ClampToEarlierDeadline(SchedTime completion_time, SchedTime finish_time) { |
| Thread* const thread = FindEarlierDeadlineThread(completion_time, finish_time); |
| return thread ? ktl::min(completion_time, thread->scheduler_state_.start_time_) : completion_time; |
| } |
| |
| SchedTime Scheduler::NextThreadTimeslice(Thread* thread, SchedTime now) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"next_timeslice: t,abs"_stringref}; |
| |
| SchedulerState* const state = &thread->scheduler_state_; |
| SchedTime absolute_deadline_ns; |
| |
| if (IsFairThread(thread)) { |
| // Calculate the next time slice and the deadline when the time slice is |
| // completed. |
| const SchedDuration time_slice_ns = CalculateTimeslice(thread); |
| const SchedDuration remaining_time_slice_ns = |
| time_slice_ns * state->fair_.normalized_timeslice_remainder; |
| |
| DEBUG_ASSERT(time_slice_ns > SchedDuration{0}); |
| DEBUG_ASSERT(remaining_time_slice_ns > SchedDuration{0}); |
| |
| state->fair_.initial_time_slice_ns = time_slice_ns; |
| state->time_slice_ns_ = remaining_time_slice_ns; |
| |
| const SchedTime slice_deadline_ns = now + remaining_time_slice_ns; |
| absolute_deadline_ns = ClampToDeadline(slice_deadline_ns); |
| |
| DEBUG_ASSERT_MSG(state->time_slice_ns_ > SchedDuration{0} && absolute_deadline_ns > now, |
| "time_slice_ns=%" PRId64 " now=%" PRId64 " absolute_deadline_ns=%" PRId64, |
| state->time_slice_ns_.raw_value(), now.raw_value(), |
| absolute_deadline_ns.raw_value()); |
| |
| SCHED_LTRACEF("name=%s weight_total=%#x weight=%#x time_slice_ns=%" PRId64 "\n", thread->name_, |
| static_cast<uint32_t>(weight_total_.raw_value()), |
| static_cast<uint32_t>(state->fair_.weight.raw_value()), |
| state->time_slice_ns_.raw_value()); |
| } else { |
| // Calculate the deadline when the remaining time slice is completed. The |
    // time slice is maintained by the deadline queuing logic; there is no need
    // to update it here.
| const SchedTime slice_deadline_ns = now + state->time_slice_ns_; |
| absolute_deadline_ns = ClampToEarlierDeadline(slice_deadline_ns, state->finish_time_); |
| |
| SCHED_LTRACEF("name=%s capacity=%" PRId64 " deadline=%" PRId64 " period=%" PRId64 |
| " time_slice_ns=%" PRId64 "\n", |
| thread->name_, state->deadline_.capacity_ns.raw_value(), |
| state->deadline_.deadline_ns.raw_value(), state->deadline_.period_ns.raw_value(), |
| state->time_slice_ns_.raw_value()); |
| } |
| |
| trace.End(Round<uint64_t>(state->time_slice_ns_), Round<uint64_t>(absolute_deadline_ns)); |
| return absolute_deadline_ns; |
| } |
| |
| void Scheduler::QueueThread(Thread* thread, Placement placement, SchedTime now, |
| SchedDuration total_runtime_ns) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"queue_thread: s,f"_stringref}; |
| |
| DEBUG_ASSERT(thread->state_ == THREAD_READY); |
| DEBUG_ASSERT(!thread->IsIdle()); |
| SCHED_LTRACEF("QueueThread: thread=%s\n", thread->name_); |
| |
| SchedulerState* const state = &thread->scheduler_state_; |
| |
| // Account for the consumed time slice. The consumed time is zero when the |
| // thread is unblocking, migrating, or adjusting queue position. The |
| // remaining time slice may become negative due to scheduler overhead. |
| state->time_slice_ns_ -= total_runtime_ns; |
| |
| if (IsFairThread(thread)) { |
| // Compute the ratio of remaining time slice to ideal time slice. This may |
    // be less than 1.0 due to time slice already consumed, previous preemption
    // by a deadline task, or both.
| const SchedRemainder normalized_timeslice_remainder = |
| state->time_slice_ns_ / ktl::max(state->fair_.initial_time_slice_ns, SchedDuration{1}); |
| |
| DEBUG_ASSERT_MSG( |
| normalized_timeslice_remainder <= SchedRemainder{1}, |
| "time_slice_ns=%" PRId64 " initial_time_slice_ns=%" PRId64 " remainder=%" PRId64 "\n", |
| state->time_slice_ns_.raw_value(), state->fair_.initial_time_slice_ns.raw_value(), |
| normalized_timeslice_remainder.raw_value()); |
| |
| if (placement == Placement::Insertion || normalized_timeslice_remainder <= SchedRemainder{0}) { |
| state->start_time_ = ktl::max(state->finish_time_, virtual_time_); |
| state->fair_.normalized_timeslice_remainder = SchedRemainder{1}; |
| } else if (placement == Placement::Preemption) { |
| DEBUG_ASSERT(state->time_slice_ns_ > SchedDuration{0}); |
| state->fair_.normalized_timeslice_remainder = normalized_timeslice_remainder; |
| } |
| |
| const SchedDuration scheduling_period_ns = scheduling_period_grans_ * minimum_granularity_ns_; |
| const SchedWeight rate = kReciprocalMinWeight * state->fair_.weight; |
| const SchedDuration delta_norm = scheduling_period_ns / rate; |
| state->finish_time_ = state->start_time_ + delta_norm; |
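    // Note that delta_norm shrinks as the thread's weight grows relative to
    // the minimum weight, so heavier threads are assigned earlier virtual
    // finish times and tend to be selected sooner by the eligibility search.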
| |
| DEBUG_ASSERT_MSG(state->start_time_ < state->finish_time_, |
| "start=%" PRId64 " finish=%" PRId64 " delta_norm=%" PRId64 "\n", |
| state->start_time_.raw_value(), state->finish_time_.raw_value(), |
| delta_norm.raw_value()); |
| } else { |
| if (placement == Placement::Insertion) { |
| LocalTraceDuration<KTRACE_DETAILED> insert_trace{"insert_deadline: r,c"_stringref}; |
| |
| // Determine how much time is left before the deadline. This might be less |
| // than the remaining time slice or negative if the thread blocked. |
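      // For example (illustrative): a thread with capacity 2ms, relative
      // deadline 5ms, and period 10ms that wakes after its previous period has
      // elapsed restarts its activation at |now|, with finish_time = now + 5ms
      // and a fresh 2ms time slice.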
| const SchedDuration time_until_deadline_ns = state->finish_time_ - now; |
| if (time_until_deadline_ns <= SchedDuration{0} || state->time_slice_ns_ <= SchedDuration{0}) { |
| const SchedTime period_finish_ns = state->start_time_ + state->deadline_.period_ns; |
| |
| state->start_time_ = now >= period_finish_ns ? now : period_finish_ns; |
| state->finish_time_ = state->start_time_ + state->deadline_.deadline_ns; |
| state->time_slice_ns_ = state->deadline_.capacity_ns; |
| } else if (state->time_slice_ns_ >= time_until_deadline_ns) { |
| state->time_slice_ns_ = time_until_deadline_ns; |
| } |
| DEBUG_ASSERT(state->time_slice_ns_ >= SchedDuration{0}); |
| |
| insert_trace.End(Round<uint64_t>(time_until_deadline_ns), |
| Round<uint64_t>(state->time_slice_ns_)); |
| } else if (placement == Placement::Preemption) { |
| LocalTraceDuration<KTRACE_DETAILED> preemption_trace{"preemption_deadline: r,c"_stringref}; |
| const SchedDuration time_until_deadline_ns = state->finish_time_ - now; |
| preemption_trace.End(Round<uint64_t>(time_until_deadline_ns), |
| Round<uint64_t>(state->time_slice_ns_)); |
| } |
| |
| DEBUG_ASSERT_MSG(state->start_time_ < state->finish_time_, |
| "start=%" PRId64 " finish=%" PRId64 " capacity=%" PRId64 "\n", |
| state->start_time_.raw_value(), state->finish_time_.raw_value(), |
| state->time_slice_ns_.raw_value()); |
| } |
| |
| // Only update the generation, enqueue time, and emit a flow event if this |
  // is an insertion or preemption. In contrast, an adjustment only changes the
| // queue position due to a parameter change and should not perform these |
| // actions. |
| if (placement != Placement::Adjustment) { |
| // Reuse this member to track the time the thread enters the run queue. |
| // It is not read outside of the scheduler unless the thread state is |
| // THREAD_RUNNING. |
| thread->last_started_running_ = now.raw_value(); |
| thread->scheduler_state_.generation_ = ++generation_count_; |
| LOCAL_KTRACE_FLOW_BEGIN(KTRACE_FLOW, "sched_latency", FlowIdFromThreadGeneration(thread), |
| thread->user_tid_); |
| } |
| |
| // Insert the thread into the appropriate run queue after the generation count |
| // is potentially updated above. |
| if (IsFairThread(thread)) { |
| fair_run_queue_.insert(thread); |
| } else { |
| deadline_run_queue_.insert(thread); |
| } |
| LOCAL_KTRACE(KTRACE_DETAILED, "queue_thread"); |
| |
| trace.End(Round<uint64_t>(state->start_time_), Round<uint64_t>(state->finish_time_)); |
| } |
| |
| void Scheduler::Insert(SchedTime now, Thread* thread) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"insert"_stringref}; |
| |
| DEBUG_ASSERT(thread->state_ == THREAD_READY); |
| DEBUG_ASSERT(!thread->IsIdle()); |
| |
| SchedulerState* const state = &thread->scheduler_state_; |
| |
| // Ensure insertion happens only once, even if Unblock is called multiple times. |
| if (state->OnInsert()) { |
| // Insertion can happen from a different CPU. Set the thread's current |
| // CPU to the one this scheduler instance services. |
| thread->curr_cpu_ = this_cpu(); |
| |
| total_expected_runtime_ns_ += state->expected_runtime_ns_; |
| DEBUG_ASSERT(total_expected_runtime_ns_ >= SchedDuration{0}); |
| |
| if (IsFairThread(thread)) { |
| runnable_fair_task_count_++; |
| DEBUG_ASSERT(runnable_fair_task_count_ > 0); |
| |
| UpdateTimeline(now); |
| UpdatePeriod(); |
| |
| weight_total_ += state->fair_.weight; |
| DEBUG_ASSERT(weight_total_ > SchedWeight{0}); |
| } else { |
| total_deadline_utilization_ += state->deadline_.utilization; |
| DEBUG_ASSERT(total_deadline_utilization_ > SchedUtilization{0}); |
| |
| runnable_deadline_task_count_++; |
| DEBUG_ASSERT(runnable_deadline_task_count_ != 0); |
| } |
| |
| QueueThread(thread, Placement::Insertion, now); |
| } |
| } |
| |
| void Scheduler::Remove(Thread* thread) { |
| LocalTraceDuration<KTRACE_DETAILED> trace{"remove"_stringref}; |
| |
| DEBUG_ASSERT(!thread->IsIdle()); |
| |
| SchedulerState* const state = &thread->scheduler_state_; |
| DEBUG_ASSERT(!state->InQueue()); |
| |
| // Ensure that removal happens only once, even if Block() is called multiple times. |
| if (state->OnRemove()) { |
| thread->curr_cpu_ = INVALID_CPU; |
| |
| total_expected_runtime_ns_ -= state->expected_runtime_ns_; |
| DEBUG_ASSERT(total_expected_runtime_ns_ >= SchedDuration{0}); |
| |
| if (IsFairThread(thread)) { |
| DEBUG_ASSERT(runnable_fair_task_count_ > 0); |
| runnable_fair_task_count_--; |
| |
| UpdatePeriod(); |
| |
| state->start_time_ = SchedNs(0); |
| state->finish_time_ = SchedNs(0); |
| |
| weight_total_ -= state->fair_.weight; |
| DEBUG_ASSERT(weight_total_ >= SchedWeight{0}); |
| |
| SCHED_LTRACEF("name=%s weight_total=%#x weight=%#x\n", thread->name_, |
| static_cast<uint32_t>(weight_total_.raw_value()), |
| static_cast<uint32_t>(state->fair_.weight.raw_value())); |
| } else { |
| DEBUG_ASSERT(total_deadline_utilization_ > SchedUtilization{0}); |
| total_deadline_utilization_ -= state->deadline_.utilization; |
| |
| DEBUG_ASSERT(runnable_deadline_task_count_ > 0); |
| runnable_deadline_task_count_--; |
| } |
| } |
| } |
| |
| void Scheduler::Block() { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_block"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| Thread* const current_thread = Thread::Current::Get(); |
| |
| DEBUG_ASSERT(current_thread->magic_ == THREAD_MAGIC); |
| DEBUG_ASSERT(current_thread->state_ != THREAD_RUNNING); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%" PRId64 "\n", current_thread->name_, now.raw_value()); |
| |
| Scheduler::Get()->RescheduleCommon(now, trace.Completer()); |
| } |
| |
| bool Scheduler::Unblock(Thread* thread) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_unblock"_stringref}; |
| |
| DEBUG_ASSERT(thread->magic_ == THREAD_MAGIC); |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("thread=%s now=%" PRId64 "\n", thread->name_, now.raw_value()); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| Scheduler* const target = Get(target_cpu); |
| |
| thread->state_ = THREAD_READY; |
| target->Insert(now, thread); |
| |
| if (target_cpu == arch_curr_cpu_num()) { |
| return true; |
| } else { |
| mp_reschedule(cpu_num_to_mask(target_cpu), 0); |
| return false; |
| } |
| } |
| |
| bool Scheduler::Unblock(list_node* list) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_unblock_list"_stringref}; |
| |
| DEBUG_ASSERT(list); |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| const SchedTime now = CurrentTime(); |
| |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| Thread* thread; |
| while ((thread = list_remove_tail_type(list, Thread, queue_node_)) != nullptr) { |
| DEBUG_ASSERT(thread->magic_ == THREAD_MAGIC); |
| DEBUG_ASSERT(!thread->IsIdle()); |
| |
| SCHED_LTRACEF("thread=%s now=%" PRId64 "\n", thread->name_, now.raw_value()); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| Scheduler* const target = Get(target_cpu); |
| |
| thread->state_ = THREAD_READY; |
| target->Insert(now, thread); |
| |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| |
| // Issue reschedule IPIs to other CPUs. |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| |
| // Return true if the current CPU is in the mask. |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| return cpus_to_reschedule_mask & current_cpu_mask; |
| } |
| |
| void Scheduler::UnblockIdle(Thread* thread) { |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| DEBUG_ASSERT(thread->IsIdle()); |
| DEBUG_ASSERT(thread->hard_affinity_ && |
| (thread->hard_affinity_ & (thread->hard_affinity_ - 1)) == 0); |
| |
| SCHED_LTRACEF("thread=%s now=%" PRId64 "\n", thread->name_, current_time()); |
| |
| thread->state_ = THREAD_READY; |
| thread->curr_cpu_ = lowest_cpu_set(thread->hard_affinity_); |
| } |
| |
| void Scheduler::Yield() { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_yield"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| Thread* const current_thread = Thread::Current::Get(); |
| SchedulerState* const current_state = ¤t_thread->scheduler_state_; |
| DEBUG_ASSERT(!current_thread->IsIdle()); |
| |
| Scheduler* const current = Get(); |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%" PRId64 "\n", current_thread->name_, now.raw_value()); |
| |
| // Set the time slice to expire now. |
| current_thread->state_ = THREAD_READY; |
| current_state->time_slice_ns_ = now - current->start_of_current_time_slice_ns_; |
| DEBUG_ASSERT(current_state->time_slice_ns_ >= 0); |
| |
| if (IsFairThread(current_thread)) { |
| // Update the virtual timeline in preparation for snapping the thread's |
| // virtual finish time to the current virtual time. |
| current->UpdateTimeline(now); |
| |
| // The thread is re-evaluated with zero lag against other competing threads |
| // and may skip lower priority threads with similar arrival times. |
| current_state->finish_time_ = current->virtual_time_; |
| current_state->fair_.initial_time_slice_ns = current_state->time_slice_ns_; |
| current_state->fair_.normalized_timeslice_remainder = SchedRemainder{1}; |
| } |
| |
| current->RescheduleCommon(now, trace.Completer()); |
| } |
| |
| void Scheduler::Preempt() { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_preempt"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| Thread* current_thread = Thread::Current::Get(); |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| |
| DEBUG_ASSERT(current_thread->curr_cpu_ == current_cpu); |
| DEBUG_ASSERT(current_thread->last_cpu_ == current_thread->curr_cpu_); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%" PRId64 "\n", current_thread->name_, now.raw_value()); |
| |
| current_thread->state_ = THREAD_READY; |
| Get()->RescheduleCommon(now, trace.Completer()); |
| } |
| |
| void Scheduler::Reschedule() { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_reschedule"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| Thread* current_thread = Thread::Current::Get(); |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| |
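| // If preemption is disabled for the current thread, just record that a |
| // reschedule is pending and return. |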
| if (current_thread->disable_counts_ != 0) { |
| current_thread->preempt_pending_ = true; |
| return; |
| } |
| |
| DEBUG_ASSERT(current_thread->curr_cpu_ == current_cpu); |
| DEBUG_ASSERT(current_thread->last_cpu_ == current_thread->curr_cpu_); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%" PRId64 "\n", current_thread->name_, now.raw_value()); |
| |
| current_thread->state_ = THREAD_READY; |
| Get()->RescheduleCommon(now, trace.Completer()); |
| } |
| |
| void Scheduler::RescheduleInternal() { Get()->RescheduleCommon(CurrentTime()); } |
| |
| void Scheduler::Migrate(Thread* thread) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_migrate"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| |
| if (thread->state_ == THREAD_RUNNING) { |
| const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu_); |
| if (!(GetEffectiveCpuMask(mp_get_active_mask(), thread) & thread_cpu_mask)) { |
| // Mark the CPU the thread is running on for reschedule. The |
| // scheduler on that CPU will take care of the actual migration. |
| cpus_to_reschedule_mask |= thread_cpu_mask; |
| } |
| } else if (thread->state_ == THREAD_READY) { |
| const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu_); |
| if (!(GetEffectiveCpuMask(mp_get_active_mask(), thread) & thread_cpu_mask)) { |
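| // The thread is queued on a CPU that is no longer in its effective CPU |
| // mask: remove it from that CPU's run queue and re-insert it on a valid |
| // target CPU. |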
| Scheduler* current = Get(thread->curr_cpu_); |
| |
| DEBUG_ASSERT(thread->scheduler_state_.InQueue()); |
| current->GetRunQueue(thread).erase(*thread); |
| current->Remove(thread); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| Scheduler* const target = Get(target_cpu); |
| target->Insert(CurrentTime(), thread); |
| |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| } |
| |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (cpus_to_reschedule_mask & current_cpu_mask) { |
| trace.End(); |
| Reschedule(); |
| } |
| } |
| |
| void Scheduler::MigrateUnpinnedThreads() { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_migrate_unpinned"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(current_cpu); |
| |
| // Prevent this CPU from being selected as a target for scheduling threads. |
| mp_set_curr_cpu_active(false); |
| |
| const SchedTime now = CurrentTime(); |
| Scheduler* const current = Get(current_cpu); |
| |
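| // Walk both run queues, keeping threads hard-pinned to this CPU and moving |
| // every other thread to another active CPU. |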
| RunQueue pinned_threads; |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| while (!current->fair_run_queue_.is_empty()) { |
| Thread* const thread = current->fair_run_queue_.pop_front(); |
| |
| if (thread->hard_affinity_ == current_cpu_mask) { |
| // Keep track of threads pinned to this CPU. |
| pinned_threads.insert(thread); |
| } else { |
| // Move unpinned threads to another available CPU. |
| current->Remove(thread); |
| thread->CallMigrateFnLocked(Thread::MigrateStage::Before); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| Scheduler* const target = Get(target_cpu); |
| DEBUG_ASSERT(target != current); |
| |
| target->Insert(now, thread); |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| } |
| |
| // Return the pinned threads to the fair run queue. |
| current->fair_run_queue_ = ktl::move(pinned_threads); |
| |
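| // The move above leaves pinned_threads empty, so the same container is |
| // reused to collect the pinned deadline threads. |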
| while (!current->deadline_run_queue_.is_empty()) { |
| Thread* const thread = current->deadline_run_queue_.pop_front(); |
| |
| if (thread->hard_affinity_ == current_cpu_mask) { |
| // Keep track of threads pinned to this CPU. |
| pinned_threads.insert(thread); |
| } else { |
| // Move unpinned threads to another available CPU. |
| current->Remove(thread); |
| thread->CallMigrateFnLocked(Thread::MigrateStage::Before); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| Scheduler* const target = Get(target_cpu); |
| DEBUG_ASSERT(target != current); |
| |
| target->Insert(now, thread); |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| } |
| |
| // Return the pinned threads to the deadline run queue. |
| current->deadline_run_queue_ = ktl::move(pinned_threads); |
| |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| } |
| |
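| // Updates the thread to the fair discipline with the given weight, adjusting |
| // the owning Scheduler's run queue and bookkeeping according to the thread's |
| // current state. |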
| void Scheduler::UpdateWeightCommon(Thread* thread, int original_priority_, SchedWeight weight, |
| cpu_mask_t* cpus_to_reschedule_mask, PropagatePI propagate) { |
| SchedulerState* const state = &thread->scheduler_state_; |
| |
| switch (thread->state_) { |
| case THREAD_INITIAL: |
| case THREAD_SLEEPING: |
| case THREAD_SUSPENDED: |
| // Adjust the weight of the thread so that the correct value is |
| // available when the thread enters the run queue. |
| state->discipline_ = SchedDiscipline::Fair; |
| state->fair_.weight = weight; |
| break; |
| |
| case THREAD_RUNNING: |
| case THREAD_READY: { |
| DEBUG_ASSERT(is_valid_cpu_num(thread->curr_cpu_)); |
| Scheduler* const current = Get(thread->curr_cpu_); |
| |
| // If the thread is in a run queue, remove it before making subsequent |
| // changes to the properties of the thread. Erasing and enqueuing depend |
| // on having the correct discipline set beforehand. |
| if (thread->state_ == THREAD_READY) { |
| DEBUG_ASSERT(state->InQueue()); |
| DEBUG_ASSERT(state->active()); |
| current->GetRunQueue(thread).erase(*thread); |
| } |
| |
| if (IsDeadlineThread(thread)) { |
| // Change to the fair discipline and update the task counts. Changing |
| // from deadline to fair behaves similarly to a yield. |
| current->total_deadline_utilization_ -= state->deadline_.utilization; |
| state->discipline_ = SchedDiscipline::Fair; |
| state->start_time_ = current->virtual_time_; |
| state->finish_time_ = current->virtual_time_; |
| state->time_slice_ns_ = SchedDuration{0}; |
| state->fair_.initial_time_slice_ns = SchedDuration{0}; |
| state->fair_.normalized_timeslice_remainder = SchedRemainder{1}; |
| current->runnable_deadline_task_count_--; |
| current->runnable_fair_task_count_++; |
| } else { |
| // Remove the old weight from the run queue. |
| current->weight_total_ -= state->fair_.weight; |
| } |
| |
| // Update the weight of the thread and the run queue. The time slice |
| // of a running thread will be adjusted during reschedule due to the |
| // change in demand on the run queue. |
| current->weight_total_ += weight; |
| state->fair_.weight = weight; |
| |
| // Adjust the position of the thread in the run queue based on the new |
| // weight. |
| if (thread->state_ == THREAD_READY) { |
| current->QueueThread(thread, Placement::Adjustment); |
| } |
| |
| *cpus_to_reschedule_mask |= cpu_num_to_mask(thread->curr_cpu_); |
| break; |
| } |
| |
| case THREAD_BLOCKED: |
| case THREAD_BLOCKED_READ_LOCK: |
| // Update the weight of the thread blocked in a wait queue. Also |
| // handle the race where the thread is no longer in the wait queue |
| // but has not yet transitioned to ready. |
| state->discipline_ = SchedDiscipline::Fair; |
| state->fair_.weight = weight; |
| if (thread->blocking_wait_queue_) { |
| wait_queue_priority_changed(thread, original_priority_, propagate); |
| } |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
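| // Updates the thread to the deadline discipline with the given parameters, |
| // adjusting the owning Scheduler's run queue and bookkeeping according to |
| // the thread's current state. |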
| void Scheduler::UpdateDeadlineCommon(Thread* thread, int original_priority_, |
| const SchedDeadlineParams& params, |
| cpu_mask_t* cpus_to_reschedule_mask, PropagatePI propagate) { |
| SchedulerState* const state = &thread->scheduler_state_; |
| |
| switch (thread->state_) { |
| case THREAD_INITIAL: |
| case THREAD_SLEEPING: |
| case THREAD_SUSPENDED: |
| // Adjust the deadline of the thread so that the correct value is |
| // available when the thread enters the run queue. |
| state->discipline_ = SchedDiscipline::Deadline; |
| state->deadline_ = params; |
| break; |
| |
| case THREAD_RUNNING: |
| case THREAD_READY: { |
| DEBUG_ASSERT(is_valid_cpu_num(thread->curr_cpu_)); |
| Scheduler* const current = Get(thread->curr_cpu_); |
| |
| // If the thread is running or is already a deadline task, keep the |
| // original arrival time. Otherwise, when moving a ready task from the |
| // fair run queue to the deadline run queue, use the current time as the |
| // arrival time. |
| SchedTime effective_start_time; |
| if (IsDeadlineThread(thread)) { |
| effective_start_time = state->start_time_; |
| } else if (thread->state_ == THREAD_RUNNING) { |
| effective_start_time = current->start_of_current_time_slice_ns_; |
| } else { |
| effective_start_time = CurrentTime(); |
| } |
| |
| // If the thread is in a run queue, remove it before making subsequent |
| // changes to the properties of the thread. Erasing and enqueuing depend |
| // on having the correct discipline set beforehand. |
| if (thread->state_ == THREAD_READY) { |
| DEBUG_ASSERT(state->InQueue()); |
| DEBUG_ASSERT(state->active()); |
| current->GetRunQueue(thread).erase(*thread); |
| } |
| |
| if (IsFairThread(thread)) { |
| // Change to the deadline discipline and update the task counts and |
| // queue weight. |
| current->weight_total_ -= state->fair_.weight; |
| state->discipline_ = SchedDiscipline::Deadline; |
| current->runnable_fair_task_count_--; |
| current->runnable_deadline_task_count_++; |
| } else { |
| // Remove the old utilization from the run queue. |
| current->total_deadline_utilization_ -= state->deadline_.utilization; |
| } |
| |
| // Update the deadline params and the run queue. |
| state->deadline_ = params; |
| state->start_time_ = effective_start_time; |
| state->finish_time_ = state->start_time_ + params.deadline_ns; |
| state->time_slice_ns_ = ktl::min(state->time_slice_ns_, params.capacity_ns); |
| current->total_deadline_utilization_ += state->deadline_.utilization; |
| |
| // Adjust the position of the thread in the run queue based on the new |
| // deadline. |
| if (thread->state_ == THREAD_READY) { |
| current->QueueThread(thread, Placement::Adjustment); |
| } |
| |
| *cpus_to_reschedule_mask |= cpu_num_to_mask(thread->curr_cpu_); |
| break; |
| } |
| |
| case THREAD_BLOCKED: |
| case THREAD_BLOCKED_READ_LOCK: |
| // Update the deadline parameters of the thread blocked in a wait queue. |
| // Also handle the race where the thread is no longer in the wait queue |
| // but has not yet transitioned to ready. |
| state->discipline_ = SchedDiscipline::Deadline; |
| state->deadline_ = params; |
| if (thread->blocking_wait_queue_) { |
| wait_queue_priority_changed(thread, original_priority_, propagate); |
| } |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| void Scheduler::ChangeWeight(Thread* thread, int priority, cpu_mask_t* cpus_to_reschedule_mask) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_change_weight"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| SCHED_LTRACEF("thread={%s, %s} base=%d effective=%d inherited=%d\n", thread->name_, |
| ToString(thread->state_), thread->base_priority_, thread->effec_priority_, |
| thread->inherited_priority_); |
| |
| if (thread->IsIdle() || thread->state_ == THREAD_DEATH) { |
| return; |
| } |
| |
| // TODO(eieio): The rest of the kernel still uses priority so we have to |
| // operate in those terms here. Abstract the notion of priority once the |
| // deadline scheduler is available and remove this conversion once the |
| // kernel uses the abstraction throughout. |
| const int original_priority_ = thread->effec_priority_; |
| thread->base_priority_ = priority; |
| thread->effec_priority_ = ktl::max(thread->base_priority_, thread->inherited_priority_); |
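| // For example, a base priority of 16 combined with an inherited priority of |
| // 20 yields an effective priority of 20. |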
| |
| // Perform the state-specific updates if the discipline or effective priority |
| // changed. |
| if (IsDeadlineThread(thread) || thread->effec_priority_ != original_priority_) { |
| UpdateWeightCommon(thread, original_priority_, PriorityToWeight(thread->effec_priority_), |
| cpus_to_reschedule_mask, PropagatePI::Yes); |
| } |
| |
| trace.End(original_priority_, thread->effec_priority_); |
| } |
| |
| void Scheduler::ChangeDeadline(Thread* thread, const SchedDeadlineParams& params, |
| cpu_mask_t* cpus_to_reschedule_mask) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_change_deadline"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| SCHED_LTRACEF("thread={%s, %s} base=%d effective=%d inherited=%d\n", thread->name_, |
| ToString(thread->state_), thread->base_priority_, thread->effec_priority_, |
| thread->inherited_priority_); |
| |
| if (thread->IsIdle() || thread->state_ == THREAD_DEATH) { |
| return; |
| } |
| |
| SchedulerState* const state = &thread->scheduler_state_; |
| const bool changed = IsFairThread(thread) || state->deadline_ != params; |
| |
| // Always set deadline threads to the highest fair priority. This is a |
| // workaround until deadline priority inheritance is worked out. |
| // TODO(eieio): Replace this with actual deadline PI. |
| const int original_priority_ = thread->effec_priority_; |
| thread->base_priority_ = HIGHEST_PRIORITY; |
| thread->inherited_priority_ = -1; |
| thread->effec_priority_ = thread->base_priority_; |
| |
| // Perform the state-specific updates if the discipline or deadline params changed. |
| if (changed) { |
| UpdateDeadlineCommon(thread, original_priority_, params, cpus_to_reschedule_mask, |
| PropagatePI::Yes); |
| } |
| |
| trace.End(original_priority_, thread->effec_priority_); |
| } |
| |
| void Scheduler::InheritWeight(Thread* thread, int priority, cpu_mask_t* cpus_to_reschedule_mask) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_inherit_weight"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| SCHED_LTRACEF("thread={%s, %s} base=%d effective=%d inherited=%d\n", thread->name_, |
| ToString(thread->state_), thread->base_priority_, thread->effec_priority_, |
| thread->inherited_priority_); |
| |
| // For now, deadline threads are logically max weight for the purposes of |
| // priority inheritance. |
| if (IsDeadlineThread(thread)) { |
| return; |
| } |
| |
| const int original_priority_ = thread->effec_priority_; |
| thread->inherited_priority_ = priority; |
| thread->effec_priority_ = ktl::max(thread->base_priority_, thread->inherited_priority_); |
| |
| // Perform the state-specific updates if the effective priority changed. |
| if (thread->effec_priority_ != original_priority_) { |
| UpdateWeightCommon(thread, original_priority_, PriorityToWeight(thread->effec_priority_), |
| cpus_to_reschedule_mask, PropagatePI::No); |
| } |
| |
| trace.End(original_priority_, thread->effec_priority_); |
| } |
| |
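| // Handles the preemption timer tick by marking a preemption pending on the |
| // current CPU rather than rescheduling directly. |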
| void Scheduler::TimerTick(SchedTime now) { |
| LocalTraceDuration<KTRACE_COMMON> trace{"sched_timer_tick"_stringref}; |
| Thread::Current::PreemptSetPending(); |
| } |
| |
| // Temporary compatibility with the thread layer. |
| |
| void sched_init_thread(Thread* thread, int priority) { |
| Scheduler::InitializeThread(thread, priority); |
| } |
| |
| void sched_block() { Scheduler::Block(); } |
| |
| bool sched_unblock(Thread* thread) { return Scheduler::Unblock(thread); } |
| |
| bool sched_unblock_list(list_node* list) { return Scheduler::Unblock(list); } |
| |
| void sched_unblock_idle(Thread* thread) { Scheduler::UnblockIdle(thread); } |
| |
| void sched_yield() { Scheduler::Yield(); } |
| |
| void sched_preempt() { Scheduler::Preempt(); } |
| |
| void sched_reschedule() { Scheduler::Reschedule(); } |
| |
| void sched_resched_internal() { Scheduler::RescheduleInternal(); } |
| |
| void sched_transition_off_cpu() { Scheduler::MigrateUnpinnedThreads(); } |
| |
| void sched_migrate(Thread* thread) { Scheduler::Migrate(thread); } |
| |
| void sched_inherit_priority(Thread* thread, int priority, bool* local_reschedule, |
| cpu_mask_t* cpus_to_reschedule_mask) { |
| Scheduler::InheritWeight(thread, priority, cpus_to_reschedule_mask); |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (*cpus_to_reschedule_mask & current_cpu_mask) { |
| *local_reschedule = true; |
| } |
| } |
| |
| void sched_change_priority(Thread* thread, int priority) { |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| Scheduler::ChangeWeight(thread, priority, &cpus_to_reschedule_mask); |
| |
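| // Reschedule the local CPU directly if it is in the mask and send IPIs to |
| // any other CPUs that need to reschedule. |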
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (cpus_to_reschedule_mask & current_cpu_mask) { |
| Scheduler::Reschedule(); |
| } |
| if (cpus_to_reschedule_mask & ~current_cpu_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| } |
| |
| void sched_change_deadline(Thread* thread, const zx_sched_deadline_params_t& params) { |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| Scheduler::ChangeDeadline(thread, params, &cpus_to_reschedule_mask); |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (cpus_to_reschedule_mask & current_cpu_mask) { |
| Scheduler::Reschedule(); |
| } |
| if (cpus_to_reschedule_mask & ~current_cpu_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| } |
| |
| void sched_preempt_timer_tick(zx_time_t now) { Scheduler::TimerTick(SchedTime{now}); } |