| // Copyright 2018 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include "kernel/fair_scheduler.h" |
| |
| #include <assert.h> |
| #include <debug.h> |
| #include <err.h> |
| #include <inttypes.h> |
| #include <lib/counters.h> |
| #include <lib/ktrace.h> |
| #include <list.h> |
| #include <platform.h> |
| #include <printf.h> |
| #include <string.h> |
| #include <target.h> |
| #include <trace.h> |
| #include <zircon/types.h> |
| |
| #include <algorithm> |
| #include <new> |
| |
| #include <kernel/lockdep.h> |
| #include <kernel/mp.h> |
| #include <kernel/percpu.h> |
| #include <kernel/sched.h> |
| #include <kernel/thread.h> |
| #include <kernel/thread_lock.h> |
| #include <ktl/move.h> |
| #include <vm/vm.h> |
| |
| using ffl::Expression; |
| using ffl::Fixed; |
| using ffl::FromRatio; |
| using ffl::Round; |
| |
| // Enable/disable ktraces local to this file. |
| #define LOCAL_KTRACE_ENABLE 0 || WITH_DETAILED_SCHEDULER_TRACING |
| |
| #define LOCAL_KTRACE(string, args...) \ |
| ktrace_probe(LocalTrace<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu, KTRACE_STRING_REF(string), \ |
| ##args) |
| |
| #define LOCAL_KTRACE_FLOW_BEGIN(string, flow_id) \ |
| ktrace_flow_begin(LocalTrace<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu, KTRACE_GRP_SCHEDULER, \ |
| KTRACE_STRING_REF(string), flow_id) |
| |
| #define LOCAL_KTRACE_FLOW_END(string, flow_id) \ |
| ktrace_flow_end(LocalTrace<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu, KTRACE_GRP_SCHEDULER, \ |
| KTRACE_STRING_REF(string), flow_id) |
| |
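| // A LocalTraceDuration emits a trace duration spanning the lifetime of the |
| // object unless it is ended early with End(), as RescheduleCommon does just |
| // before the low-level context switch. |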
| using LocalTraceDuration = |
| TraceDuration<TraceEnabled<LOCAL_KTRACE_ENABLE>, KTRACE_GRP_SCHEDULER, TraceContext::Cpu>; |
| |
| // Enable/disable console traces local to this file. |
| #define LOCAL_TRACE 0 |
| |
| #define SCHED_LTRACEF(str, args...) LTRACEF("[%u] " str, arch_curr_cpu_num(), ##args) |
| #define SCHED_TRACEF(str, args...) TRACEF("[%u] " str, arch_curr_cpu_num(), ##args) |
| |
| // Counters to track system load metrics. |
| KCOUNTER(demand_counter, "thread.demand_accum") |
| KCOUNTER(latency_counter, "thread.latency_accum") |
| KCOUNTER(runnable_counter, "thread.runnable_accum") |
| KCOUNTER(samples_counter, "thread.samples_accum") |
| |
| namespace { |
| |
| // Conversion table entry. Scales the integer argument to a fixed-point weight |
| // in the interval (0.0, 1.0]. |
| struct WeightTableEntry { |
| constexpr WeightTableEntry(int64_t value) |
| : value{FromRatio<int64_t>(value, SchedWeight::Format::Power)} {} |
| constexpr operator SchedWeight() const { return value; } |
| const SchedWeight value; |
| }; |
| |
| // Table of fixed-point constants converting from kernel priority to fair |
| // scheduler weight. |
| constexpr WeightTableEntry kPriorityToWeightTable[] = { |
| 121, 149, 182, 223, 273, 335, 410, 503, 616, 754, 924, |
| 1132, 1386, 1698, 2080, 2549, 3122, 3825, 4685, 5739, 7030, 8612, |
| 10550, 12924, 15832, 19394, 23757, 29103, 35651, 43672, 53499, 65536}; |
| |
| // Converts from kernel priority value in the interval [0, 31] to weight in the |
| // interval (0.0, 1.0]. See the definition of SchedWeight for an explanation of |
| // the weight distribution. |
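| // The table entries above form a roughly geometric progression, each entry |
| // about 1.23x the previous, so each unit of priority scales a thread's share |
| // of the CPU by approximately the same ratio; the highest priority receives |
| // roughly 540x the weight of the lowest. |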
| constexpr SchedWeight PriorityToWeight(int priority) { return kPriorityToWeightTable[priority]; } |
| |
| // The minimum possible weight and its reciprocal. |
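| // The reciprocal is precomputed so that UpdateThreadTimeline can express a |
| // thread's rate relative to the minimum weight using a multiplication rather |
| // than a division. |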
| constexpr SchedWeight kMinWeight = PriorityToWeight(LOWEST_PRIORITY); |
| constexpr SchedWeight kReciprocalMinWeight = 1 / kMinWeight; |
| |
| // Utility operator to succinctly update thread times and durations of basic |
| // types using their fixed-point counterparts. |
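| // This permits, for example, `current_thread->runtime_ns += actual_runtime_ns` |
| // in RescheduleCommon below without an explicit raw_value() conversion. |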
| constexpr zx_time_t& operator+=(zx_time_t& value, SchedDuration delta) { |
| value += delta.raw_value(); |
| return value; |
| } |
| |
| // On ARM64 with safe-stack, it's no longer possible to use the unsafe-sp |
| // after set_current_thread (we'd now see newthread's unsafe-sp instead!). |
| // Hence this function and everything it calls between this point and the |
| // low-level context switch must be marked with __NO_SAFESTACK. |
| __NO_SAFESTACK void FinalContextSwitch(thread_t* oldthread, thread_t* newthread) { |
| set_current_thread(newthread); |
| arch_context_switch(oldthread, newthread); |
| } |
| |
| inline void TraceContextSwitch(const thread_t* current_thread, const thread_t* next_thread, |
| cpu_num_t current_cpu) { |
| const uintptr_t raw_current = reinterpret_cast<uintptr_t>(current_thread); |
| const uintptr_t raw_next = reinterpret_cast<uintptr_t>(next_thread); |
| const uint32_t current = static_cast<uint32_t>(raw_current); |
| const uint32_t next = static_cast<uint32_t>(raw_next); |
| const uint32_t user_tid = static_cast<uint32_t>(next_thread->user_tid); |
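| // Pack the context word: the CPU number in bits [0, 7], the outgoing thread's |
| // state in bits [8, 15], and the outgoing and incoming base priorities in bits |
| // [16, 23] and [24, 31], respectively. |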
| const uint32_t context = current_cpu | (current_thread->state << 8) | |
| (current_thread->base_priority << 16) | |
| (next_thread->base_priority << 24); |
| |
| ktrace(TAG_CONTEXT_SWITCH, user_tid, context, current, next); |
| } |
| |
| // Returns a sufficiently unique flow id for a thread based on the thread id and |
| // queue generation count. This flow id cannot be used across enqueues because |
| // the generation count changes during enqueue. |
| inline uint64_t FlowIdFromThreadGeneration(const thread_t* thread) { |
| const int kRotationBits = 32; |
| const uint64_t rotated_tid = |
| (thread->user_tid << kRotationBits) | (thread->user_tid >> kRotationBits); |
| return rotated_tid ^ thread->fair_task_state.generation(); |
| } |
| |
| // Calculates the mask of CPUs a thread is allowed to run on, based on the |
| // thread's affinity masks and the set of currently active CPUs. |
| cpu_mask_t GetAllowedCpusMask(cpu_mask_t active_mask, const thread_t* thread) { |
| // The thread may run on any active CPU allowed by both its hard and |
| // soft CPU affinity. |
| const cpu_mask_t soft_affinity = thread->soft_affinity; |
| const cpu_mask_t hard_affinity = thread->hard_affinity; |
| const cpu_mask_t available_mask = active_mask & soft_affinity & hard_affinity; |
| if (likely(available_mask != 0)) { |
| return available_mask; |
| } |
| |
| // There is no CPU allowed by the intersection of active CPUs, the |
| // hard affinity mask, and the soft affinity mask. Ignore the soft |
| // affinity. |
| return active_mask & hard_affinity; |
| } |
| |
| } // anonymous namespace |
| |
| void FairScheduler::Dump() { |
| printf("\tweight_total=%#x runnable_tasks=%d vtime=%ld period=%ld\n", |
| static_cast<uint32_t>(weight_total_.raw_value()), runnable_task_count_, |
| virtual_time_.raw_value(), scheduling_period_grans_.raw_value()); |
| |
| if (active_thread_ != nullptr) { |
| const FairTaskState* const state = &active_thread_->fair_task_state; |
| printf("\t-> name=%s weight=%#x vstart=%ld vfinish=%ld time_slice_ns=%ld\n", |
| active_thread_->name, static_cast<uint32_t>(state->weight_.raw_value()), |
| state->virtual_start_time_.raw_value(), state->virtual_finish_time_.raw_value(), |
| state->time_slice_ns_.raw_value()); |
| } |
| |
| for (const thread_t& thread : run_queue_) { |
| const FairTaskState* const state = &thread.fair_task_state; |
| printf("\t name=%s weight=%#x vstart=%ld vfinish=%ld time_slice_ns=%ld\n", thread.name, |
| static_cast<uint32_t>(state->weight_.raw_value()), |
| state->virtual_start_time_.raw_value(), state->virtual_finish_time_.raw_value(), |
| state->time_slice_ns_.raw_value()); |
| } |
| } |
| |
| SchedWeight FairScheduler::GetTotalWeight() const { |
| Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()}; |
| return weight_total_; |
| } |
| |
| size_t FairScheduler::GetRunnableTasks() const { |
| Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()}; |
| return static_cast<size_t>(runnable_task_count_); |
| } |
| |
| FairScheduler* FairScheduler::Get() { return Get(arch_curr_cpu_num()); } |
| |
| FairScheduler* FairScheduler::Get(cpu_num_t cpu) { return &percpu::Get(cpu).fair_runqueue; } |
| |
| void FairScheduler::InitializeThread(thread_t* thread, int priority) { |
| new (&thread->fair_task_state) FairTaskState{PriorityToWeight(priority)}; |
| thread->base_priority = priority; |
| thread->effec_priority = priority; |
| thread->inherited_priority = -1; |
| thread->priority_boost = 0; |
| } |
| |
| thread_t* FairScheduler::DequeueThread() { return run_queue_.pop_front(); } |
| |
| // Updates the system load metrics. Updates happen only when the active thread |
| // changes or the time slice expires. |
| void FairScheduler::UpdateCounters(SchedDuration queue_time_ns) { |
| demand_counter.Add(weight_total_.raw_value()); |
| runnable_counter.Add(runnable_task_count_); |
| latency_counter.Add(queue_time_ns.raw_value()); |
| samples_counter.Add(1); |
| } |
| |
| // Selects a thread to run. Performs any necessary maintenance if the current |
| // thread is changing, depending on the reason for the change. |
| thread_t* FairScheduler::EvaluateNextThread(SchedTime now, thread_t* current_thread, |
| bool timeslice_expired) { |
| const bool is_idle = thread_is_idle(current_thread); |
| const bool is_active = current_thread->state == THREAD_READY; |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(current_cpu); |
| const cpu_mask_t active_mask = mp_get_active_mask(); |
| const bool needs_migration = |
| (GetAllowedCpusMask(active_mask, current_thread) & current_cpu_mask) == 0; |
| |
| if (is_active && unlikely(needs_migration)) { |
| // The current CPU is not in the thread's affinity mask. Find a new CPU |
| // and move the thread to that queue. |
| current_thread->state = THREAD_READY; |
| Remove(current_thread); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(current_thread); |
| FairScheduler* const target = Get(target_cpu); |
| DEBUG_ASSERT(target != this); |
| |
| target->Insert(now, current_thread); |
| mp_reschedule(cpu_num_to_mask(target_cpu), 0); |
| } else if (is_active && likely(!is_idle)) { |
| // If the timeslice expired put the current thread back in the runqueue, |
| // otherwise continue to run it. |
| if (timeslice_expired) { |
| UpdateThreadTimeline(current_thread, Placement::Insertion); |
| QueueThread(current_thread, Placement::Insertion, now); |
| } else { |
| return current_thread; |
| } |
| } else if (!is_active && likely(!is_idle)) { |
| // The current thread is no longer ready, remove its accounting. |
| Remove(current_thread); |
| } |
| |
| // The current thread is no longer running or has returned to the runqueue. |
| // Select another thread to run. |
| if (likely(!run_queue_.is_empty())) { |
| return DequeueThread(); |
| } else { |
| return &percpu::Get(current_cpu).idle_thread; |
| } |
| } |
| |
| cpu_num_t FairScheduler::FindTargetCpu(thread_t* thread) { |
| LocalTraceDuration trace{"find_target: cpu,avail"_stringref}; |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| const cpu_mask_t last_cpu_mask = cpu_num_to_mask(thread->last_cpu); |
| const cpu_mask_t active_mask = mp_get_active_mask(); |
| |
| // Determine the set of CPUs the thread is allowed to run on. |
| // |
| // Threads may be created and resumed before the thread init level. Work around |
| // an empty active mask by assuming the current CPU is schedulable. |
| const cpu_mask_t available_mask = |
| active_mask != 0 ? GetAllowedCpusMask(active_mask, thread) : current_cpu_mask; |
| DEBUG_ASSERT_MSG(available_mask != 0, |
| "thread=%s affinity=%#x soft_affinity=%#x active=%#x " |
| "idle=%#x arch_ints_disabled=%d", |
| thread->name, thread->hard_affinity, thread->soft_affinity, active_mask, |
| mp_get_idle_mask(), arch_ints_disabled()); |
| |
| LOCAL_KTRACE("target_mask: online,active", mp_get_online_mask(), active_mask); |
| |
| cpu_num_t target_cpu; |
| FairScheduler* target_queue; |
| |
| // Select an initial target: prefer the CPU the thread last ran on (for cache |
| // locality), then the current CPU, then the lowest-numbered allowed CPU. |
| if (last_cpu_mask & available_mask) { |
| target_cpu = thread->last_cpu; |
| } else if (current_cpu_mask & available_mask) { |
| target_cpu = arch_curr_cpu_num(); |
| } else { |
| target_cpu = lowest_cpu_set(available_mask); |
| } |
| |
| target_queue = Get(target_cpu); |
| |
| // See if there is a better target in the set of available CPUs. |
| // TODO(eieio): Replace this with a search in order of increasing cache |
| // distance from the initial target cpu when topology information is available. |
| // TODO(eieio): Add some sort of threshold to terminate search when a sufficiently |
| // unloaded target is found. |
| cpu_mask_t remaining_mask = available_mask & ~cpu_num_to_mask(target_cpu); |
| while (remaining_mask != 0 && target_queue->weight_total_ > SchedWeight{0}) { |
| const cpu_num_t candidate_cpu = lowest_cpu_set(remaining_mask); |
| FairScheduler* const candidate_queue = Get(candidate_cpu); |
| |
| if (candidate_queue->weight_total_ < target_queue->weight_total_) { |
| target_cpu = candidate_cpu; |
| target_queue = candidate_queue; |
| } |
| |
| remaining_mask &= ~cpu_num_to_mask(candidate_cpu); |
| } |
| |
| SCHED_LTRACEF("thread=%s target_cpu=%u\n", thread->name, target_cpu); |
| trace.End(target_cpu, remaining_mask); |
| return target_cpu; |
| } |
| |
| void FairScheduler::UpdateTimeline(SchedTime now) { |
| LocalTraceDuration trace{"update_vtime"_stringref}; |
| |
| const Expression runtime_ns = now - last_update_time_ns_; |
| last_update_time_ns_ = now; |
| |
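| // Advance the virtual timeline only while the run queue carries weight; when |
| // no fair tasks are runnable on this CPU, virtual time stands still. |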
| if (weight_total_ > SchedWeight{0}) { |
| virtual_time_ += runtime_ns; |
| } |
| |
| trace.End(Round<uint64_t>(runtime_ns), Round<uint64_t>(virtual_time_)); |
| } |
| |
| void FairScheduler::RescheduleCommon(SchedTime now, void* outer_trace) { |
| LocalTraceDuration trace{"reschedule_common"_stringref}; |
| |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| thread_t* const current_thread = get_current_thread(); |
| FairTaskState* const current_state = ¤t_thread->fair_task_state; |
| |
| DEBUG_ASSERT(arch_ints_disabled()); |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| // Aside from the thread_lock, spinlocks should never be held over a reschedule. |
| DEBUG_ASSERT(arch_num_spinlocks_held() == 1); |
| DEBUG_ASSERT_MSG(current_thread->state != THREAD_RUNNING, "state %d\n", current_thread->state); |
| DEBUG_ASSERT(!arch_blocking_disallowed()); |
| DEBUG_ASSERT_MSG(current_cpu == this_cpu(), "current_cpu=%u this_cpu=%u", current_cpu, |
| this_cpu()); |
| |
| CPU_STATS_INC(reschedules); |
| |
| UpdateTimeline(now); |
| |
| const SchedDuration total_runtime_ns = now - start_of_current_time_slice_ns_; |
| const SchedDuration actual_runtime_ns = now - current_thread->last_started_running; |
| current_thread->last_started_running = now.raw_value(); |
| |
| // Update the runtime accounting for the thread that just ran. |
| current_thread->runtime_ns += actual_runtime_ns; |
| |
| // Adjust the rate of the current thread when demand changes. Changes in |
| // demand could be due to threads entering or leaving the run queue, or due |
| // to weights changing in the current or enqueued threads. |
| if (!thread_is_idle(current_thread) && weight_total_ != scheduled_weight_total_) { |
| LocalTraceDuration trace_adjust_rate{"adjust_rate"_stringref}; |
| scheduled_weight_total_ = weight_total_; |
| |
| const SchedDuration time_slice_ns = CalculateTimeslice(current_thread); |
| const bool timeslice_changed = time_slice_ns != current_state->time_slice_ns_; |
| const bool timeslice_remaining = total_runtime_ns < time_slice_ns; |
| |
| // Update the preemption timer if necessary. |
| if (timeslice_changed && timeslice_remaining) { |
| const SchedTime absolute_deadline_ns = start_of_current_time_slice_ns_ + time_slice_ns; |
| timer_preempt_reset(absolute_deadline_ns.raw_value()); |
| } |
| |
| current_state->time_slice_ns_ = time_slice_ns; |
| trace_adjust_rate.End(Round<uint64_t>(time_slice_ns), Round<uint64_t>(total_runtime_ns)); |
| } |
| |
| const bool timeslice_expired = total_runtime_ns >= current_state->time_slice_ns_; |
| |
| // Select a thread to run. |
| thread_t* const next_thread = EvaluateNextThread(now, current_thread, timeslice_expired); |
| DEBUG_ASSERT(next_thread != nullptr); |
| |
| SCHED_LTRACEF("current={%s, %s} next={%s, %s} expired=%d is_empty=%d front=%s\n", |
| current_thread->name, ToString(current_thread->state), next_thread->name, |
| ToString(next_thread->state), timeslice_expired, run_queue_.is_empty(), |
| run_queue_.is_empty() ? "[none]" : run_queue_.front().name); |
| |
| // Update the state of the current and next thread. |
| current_thread->preempt_pending = false; |
| next_thread->state = THREAD_RUNNING; |
| next_thread->last_cpu = current_cpu; |
| next_thread->curr_cpu = current_cpu; |
| |
| active_thread_ = next_thread; |
| |
| // Always call to handle races between reschedule IPIs and changes to the run queue. |
| mp_prepare_current_cpu_idle_state(thread_is_idle(next_thread)); |
| |
| if (thread_is_idle(next_thread)) { |
| mp_set_cpu_idle(current_cpu); |
| } else { |
| mp_set_cpu_busy(current_cpu); |
| } |
| |
| // The task is always non-realtime when managed by this scheduler. |
| // TODO(eieio): Revisit this when deadline scheduling is addressed. |
| mp_set_cpu_non_realtime(current_cpu); |
| |
| if (thread_is_idle(current_thread)) { |
| percpu::Get(current_cpu).stats.idle_time += actual_runtime_ns; |
| } |
| |
| if (thread_is_idle(next_thread)) { |
| LocalTraceDuration trace_stop_preemption{"stop_preemption"_stringref}; |
| SCHED_LTRACEF("Stop preemption timer: current=%s next=%s\n", current_thread->name, |
| next_thread->name); |
| UpdateCounters(SchedDuration{0}); |
| next_thread->last_started_running = now.raw_value(); |
| timer_preempt_cancel(); |
| } else if (timeslice_expired || next_thread != current_thread) { |
| LocalTraceDuration trace_start_preemption{"start_preemption: now,deadline"_stringref}; |
| |
| // Re-compute the time slice for the new thread based on the latest state. |
| NextThreadTimeslice(next_thread); |
| |
| // Update the preemption time based on the time slice. |
| FairTaskState* const next_state = &next_thread->fair_task_state; |
| const SchedTime absolute_deadline_ns = now + next_state->time_slice_ns_; |
| |
| // Compute the time the next thread spent in the run queue. The value of |
| // last_started_running for the current thread is updated at the top of |
| // this method: when the current and next thread are the same, the queue |
| // time is zero. Otherwise, last_started_running is the time the next thread |
| // entered the run queue. |
| const SchedDuration queue_time_ns = now - next_thread->last_started_running; |
| UpdateCounters(queue_time_ns); |
| |
| next_thread->last_started_running = now.raw_value(); |
| start_of_current_time_slice_ns_ = now; |
| scheduled_weight_total_ = weight_total_; |
| |
| SCHED_LTRACEF("Start preemption timer: current=%s next=%s now=%ld deadline=%ld\n", |
| current_thread->name, next_thread->name, now.raw_value(), |
| absolute_deadline_ns.raw_value()); |
| timer_preempt_reset(absolute_deadline_ns.raw_value()); |
| |
| trace_start_preemption.End(Round<uint64_t>(now), Round<uint64_t>(absolute_deadline_ns)); |
| |
| // Emit a flow end event to match the flow begin event emitted when the |
| // thread was enqueued. Emitting in this scope ensures that the thread just |
| // came from the run queue (and is not the idle thread). |
| LOCAL_KTRACE_FLOW_END("sched_latency", FlowIdFromThreadGeneration(next_thread)); |
| } |
| |
| if (next_thread != current_thread) { |
| LOCAL_KTRACE("reschedule current: count,slice", runnable_task_count_, |
| Round<uint64_t>(current_thread->fair_task_state.time_slice_ns_)); |
| LOCAL_KTRACE("reschedule next: wsum,slice", weight_total_.raw_value(), |
| Round<uint64_t>(next_thread->fair_task_state.time_slice_ns_)); |
| |
| TraceContextSwitch(current_thread, next_thread, current_cpu); |
| |
| // Blink the optional debug LEDs on the target. |
| target_set_debug_led(0, !thread_is_idle(next_thread)); |
| |
| SCHED_LTRACEF("current=(%s, flags 0x%#x) next=(%s, flags 0x%#x)\n", current_thread->name, |
| current_thread->flags, next_thread->name, next_thread->flags); |
| |
| if (current_thread->aspace != next_thread->aspace) { |
| vmm_context_switch(current_thread->aspace, next_thread->aspace); |
| } |
| |
| CPU_STATS_INC(context_switches); |
| |
| // Prevent the scheduler durations from spanning the context switch. |
| // Some context switches do not resume within this method on the other |
| // thread, which results in unterminated durations. All of the callers |
| // with durations tail-call this method, so terminating the duration |
| // here should not cause significant inaccuracy of the outer duration. |
| trace.End(); |
| if (outer_trace) { |
| static_cast<LocalTraceDuration*>(outer_trace)->End(); |
| } |
| FinalContextSwitch(current_thread, next_thread); |
| } |
| } |
| |
| void FairScheduler::UpdatePeriod() { |
| LocalTraceDuration trace{"update_period"_stringref}; |
| |
| DEBUG_ASSERT(runnable_task_count_ >= 0); |
| DEBUG_ASSERT(minimum_granularity_ns_ > 0); |
| DEBUG_ASSERT(peak_latency_ns_ > 0); |
| DEBUG_ASSERT(target_latency_ns_ > 0); |
| |
| const int64_t num_tasks = runnable_task_count_; |
| const int64_t peak_tasks = Round<int64_t>(peak_latency_ns_ / minimum_granularity_ns_); |
| const int64_t normal_tasks = Round<int64_t>(target_latency_ns_ / minimum_granularity_ns_); |
| |
| // The scheduling period stretches when there are too many tasks to fit |
| // within the target latency. |
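| // For example (illustrative values only): with a 1ms minimum granularity and |
| // an 8ms target latency, normal_tasks is 8. Up to 8 runnable tasks share an |
| // 8-granule period, while 12 runnable tasks stretch the period to 12 granules, |
| // keeping the nominal slice per task at or above one granule. |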
| scheduling_period_grans_ = SchedDuration{num_tasks > normal_tasks ? num_tasks : normal_tasks}; |
| |
| SCHED_LTRACEF("num_tasks=%ld peak_tasks=%ld normal_tasks=%ld period_grans=%ld\n", num_tasks, |
| peak_tasks, normal_tasks, scheduling_period_grans_.raw_value()); |
| |
| trace.End(Round<uint64_t>(scheduling_period_grans_), num_tasks); |
| } |
| |
| SchedDuration FairScheduler::CalculateTimeslice(thread_t* thread) { |
| LocalTraceDuration trace{"calculate_timeslice: w,wt"_stringref}; |
| FairTaskState* const state = &thread->fair_task_state; |
| |
| // Calculate the relative portion of the scheduling period. |
| const SchedWeight proportional_time_slice_grans = |
| scheduling_period_grans_ * state->weight_ / weight_total_; |
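| // For example (illustrative values only), a thread holding 25% of the total |
| // weight in a 12-granule period receives 3 granules; a share that rounds to |
| // zero is clamped to the one-granule minimum below. |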
| |
| // Ensure that the time slice is at least the minimum granularity. |
| const int64_t time_slice_grans = Round<int64_t>(proportional_time_slice_grans); |
| const int64_t minimum_time_slice_grans = time_slice_grans > 0 ? time_slice_grans : 1; |
| |
| // Calculate the time slice in nanoseconds. |
| const SchedDuration time_slice_ns = minimum_time_slice_grans * minimum_granularity_ns_; |
| |
| trace.End(state->weight_.raw_value(), weight_total_.raw_value()); |
| return time_slice_ns; |
| } |
| |
| void FairScheduler::NextThreadTimeslice(thread_t* thread) { |
| LocalTraceDuration trace{"next_timeslice: s,w"_stringref}; |
| |
| if (thread_is_idle(thread) || thread->state == THREAD_DEATH) { |
| return; |
| } |
| |
| FairTaskState* const state = &thread->fair_task_state; |
| state->time_slice_ns_ = CalculateTimeslice(thread); |
| |
| SCHED_LTRACEF("name=%s weight_total=%#x weight=%#x time_slice_ns=%ld\n", thread->name, |
| static_cast<uint32_t>(weight_total_.raw_value()), |
| static_cast<uint32_t>(state->weight_.raw_value()), |
| state->time_slice_ns_.raw_value()); |
| |
| trace.End(Round<uint64_t>(state->time_slice_ns_), state->weight_.raw_value()); |
| } |
| |
| void FairScheduler::UpdateThreadTimeline(thread_t* thread, Placement placement) { |
| LocalTraceDuration trace{"update_timeline: vs,vf"_stringref}; |
| |
| if (thread_is_idle(thread) || thread->state == THREAD_DEATH) { |
| return; |
| } |
| |
| FairTaskState* const state = &thread->fair_task_state; |
| |
| // Update virtual timeline. |
| if (placement == Placement::Insertion) { |
| state->virtual_start_time_ = std::max(state->virtual_finish_time_, virtual_time_); |
| } |
| |
| const SchedDuration scheduling_period_ns = scheduling_period_grans_ * minimum_granularity_ns_; |
| const SchedWeight rate = kReciprocalMinWeight * state->weight_; |
| const SchedDuration delta_norm = scheduling_period_ns / rate; |
| state->virtual_finish_time_ = state->virtual_start_time_ + delta_norm; |
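| // Since rate is the thread's weight relative to the minimum weight, a |
| // minimum-weight thread pushes its virtual finish time out by a full |
| // scheduling period, while heavier threads advance by proportionally less and |
| // therefore return to the front of the run queue sooner. |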
| |
| DEBUG_ASSERT_MSG(state->virtual_start_time_ < state->virtual_finish_time_, |
| "vstart=%ld vfinish=%ld delta_norm=%ld\n", |
| state->virtual_start_time_.raw_value(), state->virtual_finish_time_.raw_value(), |
| delta_norm.raw_value()); |
| |
| SCHED_LTRACEF("name=%s vstart=%ld vfinish=%ld lag=%ld vtime=%ld\n", thread->name, |
| state->virtual_start_time_.raw_value(), state->virtual_finish_time_.raw_value(), |
| state->lag_time_ns_.raw_value(), virtual_time_.raw_value()); |
| |
| trace.End(Round<uint64_t>(state->virtual_start_time_), |
| Round<uint64_t>(state->virtual_finish_time_)); |
| } |
| |
| void FairScheduler::QueueThread(thread_t* thread, Placement placement, SchedTime now) { |
| LocalTraceDuration trace{"queue_thread"_stringref}; |
| |
| DEBUG_ASSERT(thread->state == THREAD_READY); |
| DEBUG_ASSERT(!thread_is_idle(thread)); |
| DEBUG_ASSERT(placement == Placement::Adjustment || now != SchedTime{0}); |
| SCHED_LTRACEF("QueueThread: thread=%s\n", thread->name); |
| |
| // Only update the generation, enqueue time, and emit a flow event if this |
| // is an insertion. In contrast, an adjustment only changes the queue |
| // position due to a weight change and should not perform these actions. |
| if (placement == Placement::Insertion) { |
| thread->fair_task_state.generation_ = ++generation_count_; |
| |
| // Reuse this member to track the time the thread enters the run queue. |
| // It is not read outside of the scheduler unless the thread state is |
| // THREAD_RUNNING. |
| thread->last_started_running = now.raw_value(); |
| } |
| |
| run_queue_.insert(thread); |
| LOCAL_KTRACE("queue_thread"); |
| |
| if (placement == Placement::Insertion) { |
| LOCAL_KTRACE_FLOW_BEGIN("sched_latency", FlowIdFromThreadGeneration(thread)); |
| } |
| } |
| |
| void FairScheduler::Insert(SchedTime now, thread_t* thread) { |
| LocalTraceDuration trace{"insert"_stringref}; |
| |
| DEBUG_ASSERT(thread->state == THREAD_READY); |
| DEBUG_ASSERT(!thread_is_idle(thread)); |
| |
| FairTaskState* const state = &thread->fair_task_state; |
| |
| // Ensure insertion happens only once, even if Unblock is called multiple times. |
| if (state->OnInsert()) { |
| runnable_task_count_++; |
| DEBUG_ASSERT(runnable_task_count_ != 0); |
| |
| UpdateTimeline(now); |
| UpdatePeriod(); |
| |
| // Insertion can happen from a different CPU. Set the thread's current |
| // CPU to the one this scheduler instance services. |
| thread->curr_cpu = this_cpu(); |
| |
| // Factor this task into the run queue. |
| weight_total_ += state->weight_; |
| DEBUG_ASSERT(weight_total_ > SchedWeight{0}); |
| |
| UpdateThreadTimeline(thread, Placement::Insertion); |
| QueueThread(thread, Placement::Insertion, now); |
| } |
| } |
| |
| void FairScheduler::Remove(thread_t* thread) { |
| LocalTraceDuration trace{"remove"_stringref}; |
| |
| DEBUG_ASSERT(!thread_is_idle(thread)); |
| |
| FairTaskState* const state = &thread->fair_task_state; |
| DEBUG_ASSERT(!state->InQueue()); |
| |
| // Ensure that removal happens only once, even if Block() is called multiple times. |
| if (state->OnRemove()) { |
| DEBUG_ASSERT(runnable_task_count_ > 0); |
| runnable_task_count_--; |
| |
| UpdatePeriod(); |
| |
| thread->curr_cpu = INVALID_CPU; |
| |
| state->virtual_start_time_ = SchedNs(0); |
| state->virtual_finish_time_ = SchedNs(0); |
| |
| // Factor this task out of the run queue. |
| weight_total_ -= state->weight_; |
| DEBUG_ASSERT(weight_total_ >= SchedWeight{0}); |
| |
| SCHED_LTRACEF("name=%s weight_total=%#x weight=%#x lag_time_ns=%ld\n", thread->name, |
| static_cast<uint32_t>(weight_total_.raw_value()), |
| static_cast<uint32_t>(state->weight_.raw_value()), |
| state->lag_time_ns_.raw_value()); |
| } |
| } |
| |
| void FairScheduler::Block() { |
| LocalTraceDuration trace{"sched_block"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| thread_t* const current_thread = get_current_thread(); |
| |
| DEBUG_ASSERT(current_thread->magic == THREAD_MAGIC); |
| DEBUG_ASSERT(current_thread->state != THREAD_RUNNING); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value()); |
| |
| FairScheduler::Get()->RescheduleCommon(now, &trace); |
| } |
| |
| bool FairScheduler::Unblock(thread_t* thread) { |
| LocalTraceDuration trace{"sched_unblock"_stringref}; |
| |
| DEBUG_ASSERT(thread->magic == THREAD_MAGIC); |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, now.raw_value()); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| FairScheduler* const target = Get(target_cpu); |
| |
| thread->state = THREAD_READY; |
| target->Insert(now, thread); |
| |
| if (target_cpu == arch_curr_cpu_num()) { |
| return true; |
| } else { |
| mp_reschedule(cpu_num_to_mask(target_cpu), 0); |
| return false; |
| } |
| } |
| |
| bool FairScheduler::Unblock(list_node* list) { |
| LocalTraceDuration trace{"sched_unblock_list"_stringref}; |
| |
| DEBUG_ASSERT(list); |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| const SchedTime now = CurrentTime(); |
| |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| thread_t* thread; |
| while ((thread = list_remove_tail_type(list, thread_t, queue_node)) != nullptr) { |
| DEBUG_ASSERT(thread->magic == THREAD_MAGIC); |
| DEBUG_ASSERT(!thread_is_idle(thread)); |
| |
| SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, now.raw_value()); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| FairScheduler* const target = Get(target_cpu); |
| |
| thread->state = THREAD_READY; |
| target->Insert(now, thread); |
| |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| |
| // Issue reschedule IPIs to other CPUs. |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| |
| // Return true if the current CPU is in the mask. |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| return cpus_to_reschedule_mask & current_cpu_mask; |
| } |
| |
| void FairScheduler::UnblockIdle(thread_t* thread) { |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| DEBUG_ASSERT(thread_is_idle(thread)); |
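| // The idle thread must have a hard affinity with exactly one CPU bit set. |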
| DEBUG_ASSERT(thread->hard_affinity && (thread->hard_affinity & (thread->hard_affinity - 1)) == 0); |
| |
| SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, current_time()); |
| |
| thread->state = THREAD_READY; |
| thread->curr_cpu = lowest_cpu_set(thread->hard_affinity); |
| } |
| |
| void FairScheduler::Yield() { |
| LocalTraceDuration trace{"sched_yield"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| thread_t* const current_thread = get_current_thread(); |
| FairTaskState* const current_state = ¤t_thread->fair_task_state; |
| DEBUG_ASSERT(!thread_is_idle(current_thread)); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value()); |
| |
| // Update the virtual timeline in preparation for snapping the thread's |
| // virtual finish time to the current virtual time. |
| FairScheduler* const current = Get(); |
| current->UpdateTimeline(now); |
| |
| // Set the time slice to expire now. The thread is re-evaluated with |
| // zero lag against other competing threads and may skip lower priority |
| // threads with similar arrival times. |
| current_thread->state = THREAD_READY; |
| current_state->virtual_finish_time_ = current->virtual_time_; |
| current_state->time_slice_ns_ = now - current->start_of_current_time_slice_ns_; |
| DEBUG_ASSERT(current_state->time_slice_ns_ >= 0); |
| |
| current->RescheduleCommon(now, &trace); |
| } |
| |
| void FairScheduler::Preempt() { |
| LocalTraceDuration trace{"sched_preempt"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| thread_t* current_thread = get_current_thread(); |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| |
| DEBUG_ASSERT(current_thread->curr_cpu == current_cpu); |
| DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value()); |
| |
| current_thread->state = THREAD_READY; |
| Get()->RescheduleCommon(now, &trace); |
| } |
| |
| void FairScheduler::Reschedule() { |
| LocalTraceDuration trace{"sched_reschedule"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| |
| thread_t* current_thread = get_current_thread(); |
| const cpu_num_t current_cpu = arch_curr_cpu_num(); |
| |
| if (current_thread->disable_counts != 0) { |
| current_thread->preempt_pending = true; |
| return; |
| } |
| |
| DEBUG_ASSERT(current_thread->curr_cpu == current_cpu); |
| DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu); |
| |
| const SchedTime now = CurrentTime(); |
| SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value()); |
| |
| current_thread->state = THREAD_READY; |
| Get()->RescheduleCommon(now, &trace); |
| } |
| |
| void FairScheduler::RescheduleInternal() { Get()->RescheduleCommon(CurrentTime()); } |
| |
| void FairScheduler::Migrate(thread_t* thread) { |
| LocalTraceDuration trace{"sched_migrate"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| |
| if (thread->state == THREAD_RUNNING) { |
| const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu); |
| if (!(GetAllowedCpusMask(mp_get_active_mask(), thread) & thread_cpu_mask)) { |
| // Mark the CPU the thread is running on for reschedule. The |
| // scheduler on that CPU will take care of the actual migration. |
| cpus_to_reschedule_mask |= thread_cpu_mask; |
| } |
| } else if (thread->state == THREAD_READY) { |
| const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu); |
| if (!(GetAllowedCpusMask(mp_get_active_mask(), thread) & thread_cpu_mask)) { |
| FairScheduler* current = Get(thread->curr_cpu); |
| |
| DEBUG_ASSERT(thread->fair_task_state.InQueue()); |
| current->run_queue_.erase(*thread); |
| current->Remove(thread); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| FairScheduler* const target = Get(target_cpu); |
| target->Insert(CurrentTime(), thread); |
| |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| } |
| |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (cpus_to_reschedule_mask & current_cpu_mask) { |
| trace.End(); |
| Reschedule(); |
| } |
| } |
| |
| void FairScheduler::MigrateUnpinnedThreads(cpu_num_t current_cpu) { |
| LocalTraceDuration trace{"sched_migrate_unpinned"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| DEBUG_ASSERT(current_cpu == arch_curr_cpu_num()); |
| |
| // Prevent this CPU from being selected as a target for scheduling threads. |
| mp_set_curr_cpu_active(false); |
| |
| const SchedTime now = CurrentTime(); |
| FairScheduler* const current = Get(current_cpu); |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(current_cpu); |
| |
| RunQueue pinned_threads; |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| while (!current->run_queue_.is_empty()) { |
| thread_t* const thread = current->DequeueThread(); |
| |
| if (thread->hard_affinity == current_cpu_mask) { |
| // Keep track of threads pinned to this CPU. |
| pinned_threads.insert(thread); |
| } else { |
| // Move unpinned threads to another available CPU. |
| current->Remove(thread); |
| |
| const cpu_num_t target_cpu = FindTargetCpu(thread); |
| FairScheduler* const target = Get(target_cpu); |
| DEBUG_ASSERT(target != current); |
| |
| target->Insert(now, thread); |
| cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu); |
| } |
| } |
| |
| // Return the pinned threads to the run queue. |
| current->run_queue_ = ktl::move(pinned_threads); |
| |
| if (cpus_to_reschedule_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| } |
| |
| void FairScheduler::UpdateWeightCommon(thread_t* thread, int original_priority, SchedWeight weight, |
| cpu_mask_t* cpus_to_reschedule_mask, PropagatePI propagate) { |
| FairTaskState* const state = &thread->fair_task_state; |
| |
| switch (thread->state) { |
| case THREAD_INITIAL: |
| case THREAD_SLEEPING: |
| case THREAD_SUSPENDED: |
| // Adjust the weight of the thread so that the correct value is |
| // available when the thread enters the run queue. |
| state->weight_ = weight; |
| break; |
| |
| case THREAD_RUNNING: |
| case THREAD_READY: { |
| DEBUG_ASSERT(is_valid_cpu_num(thread->curr_cpu)); |
| FairScheduler* const current = Get(thread->curr_cpu); |
| |
| // Adjust the weight of the thread and the run queue. The time slice |
| // of the running thread will be adjusted during reschedule due to |
| // the change in demand on the run queue. |
| current->weight_total_ -= state->weight_; |
| current->weight_total_ += weight; |
| state->weight_ = weight; |
| |
| if (thread->state == THREAD_READY) { |
| DEBUG_ASSERT(state->InQueue()); |
| DEBUG_ASSERT(state->active()); |
| |
| // Adjust the position of the thread in the run queue based on |
| // the new weight. |
| current->run_queue_.erase(*thread); |
| current->UpdateThreadTimeline(thread, Placement::Adjustment); |
| current->QueueThread(thread, Placement::Adjustment); |
| } |
| |
| *cpus_to_reschedule_mask |= cpu_num_to_mask(thread->curr_cpu); |
| break; |
| } |
| |
| case THREAD_BLOCKED: |
| case THREAD_BLOCKED_READ_LOCK: |
| // Update the weight of the thread blocked in a wait queue. Also |
| // handle the race where the thread is no longer in the wait queue |
| // but has not yet transitioned to ready. |
| state->weight_ = weight; |
| if (thread->blocking_wait_queue) { |
| wait_queue_priority_changed(thread, original_priority, propagate); |
| } |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| void FairScheduler::ChangeWeight(thread_t* thread, int priority, |
| cpu_mask_t* cpus_to_reschedule_mask) { |
| LocalTraceDuration trace{"sched_change_weight"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| SCHED_LTRACEF("thread={%s, %s} base=%d effective=%d inherited=%d\n", thread->name, |
| ToString(thread->state), thread->base_priority, thread->effec_priority, |
| thread->inherited_priority); |
| |
| if (thread_is_idle(thread) || thread->state == THREAD_DEATH) { |
| return; |
| } |
| |
| // TODO(eieio): The rest of the kernel still uses priority so we have to |
| // operate in those terms here. Abstract the notion of priority once the |
| // deadline scheduler is available and remove this conversion once the |
| // kernel uses the abstraction throughout. |
| const int original_priority = thread->effec_priority; |
| thread->base_priority = priority; |
| thread->priority_boost = 0; |
| |
| // Adjust the effective priority for inheritance, if necessary. |
| if (thread->inherited_priority > thread->base_priority) { |
| thread->effec_priority = thread->inherited_priority; |
| } else { |
| thread->effec_priority = thread->base_priority; |
| } |
| |
| // Perform the state-specific updates if the effective priority changed. |
| if (thread->effec_priority != original_priority) { |
| UpdateWeightCommon(thread, original_priority, PriorityToWeight(thread->effec_priority), |
| cpus_to_reschedule_mask, PropagatePI::Yes); |
| } |
| |
| trace.End(original_priority, thread->effec_priority); |
| } |
| |
| void FairScheduler::InheritWeight(thread_t* thread, int priority, |
| cpu_mask_t* cpus_to_reschedule_mask) { |
| LocalTraceDuration trace{"sched_inherit_weight"_stringref}; |
| |
| DEBUG_ASSERT(spin_lock_held(&thread_lock)); |
| SCHED_LTRACEF("thread={%s, %s} base=%d effective=%d inherited=%d\n", thread->name, |
| ToString(thread->state), thread->base_priority, thread->effec_priority, |
| thread->inherited_priority); |
| |
| const int original_priority = thread->effec_priority; |
| thread->inherited_priority = priority; |
| thread->priority_boost = 0; |
| |
| if (thread->inherited_priority > thread->base_priority) { |
| thread->effec_priority = thread->inherited_priority; |
| } else { |
| thread->effec_priority = thread->base_priority; |
| } |
| |
| // Perform the state-specific updates if the effective priority changed. |
| if (thread->effec_priority != original_priority) { |
| UpdateWeightCommon(thread, original_priority, PriorityToWeight(thread->effec_priority), |
| cpus_to_reschedule_mask, PropagatePI::No); |
| } |
| |
| trace.End(original_priority, thread->effec_priority); |
| } |
| |
| void FairScheduler::TimerTick(SchedTime now) { |
| LocalTraceDuration trace{"sched_timer_tick"_stringref}; |
| thread_preempt_set_pending(); |
| } |
| |
| // Temporary compatibility with the thread layer. |
| |
| void sched_init_thread(thread_t* thread, int priority) { |
| FairScheduler::InitializeThread(thread, priority); |
| } |
| |
| void sched_block() { FairScheduler::Block(); } |
| |
| bool sched_unblock(thread_t* thread) { return FairScheduler::Unblock(thread); } |
| |
| bool sched_unblock_list(list_node* list) { return FairScheduler::Unblock(list); } |
| |
| void sched_unblock_idle(thread_t* thread) { FairScheduler::UnblockIdle(thread); } |
| |
| void sched_yield() { FairScheduler::Yield(); } |
| |
| void sched_preempt() { FairScheduler::Preempt(); } |
| |
| void sched_reschedule() { FairScheduler::Reschedule(); } |
| |
| void sched_resched_internal() { FairScheduler::RescheduleInternal(); } |
| |
| void sched_transition_off_cpu(cpu_num_t current_cpu) { |
| FairScheduler::MigrateUnpinnedThreads(current_cpu); |
| } |
| |
| void sched_migrate(thread_t* thread) { FairScheduler::Migrate(thread); } |
| |
| void sched_inherit_priority(thread_t* thread, int priority, bool* local_reschedule, |
| cpu_mask_t* cpus_to_reschedule_mask) { |
| FairScheduler::InheritWeight(thread, priority, cpus_to_reschedule_mask); |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (*cpus_to_reschedule_mask & current_cpu_mask) { |
| *local_reschedule = true; |
| } |
| } |
| |
| void sched_change_priority(thread_t* thread, int priority) { |
| cpu_mask_t cpus_to_reschedule_mask = 0; |
| FairScheduler::ChangeWeight(thread, priority, &cpus_to_reschedule_mask); |
| |
| const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num()); |
| if (cpus_to_reschedule_mask & current_cpu_mask) { |
| FairScheduler::Reschedule(); |
| } |
| if (cpus_to_reschedule_mask & ~current_cpu_mask) { |
| mp_reschedule(cpus_to_reschedule_mask, 0); |
| } |
| } |
| |
| void sched_preempt_timer_tick(zx_time_t now) { FairScheduler::TimerTick(SchedTime{now}); } |