[kernel] Initial implementation of fair scheduler.
Add a fair scheduler option that is API compatible with the current
kernel/sched API. The algorithm is similar to WFQ/EEVDF/CFS and
provides weight-based bandwidth allocation with bounded delay for
all competing tasks.
Bug: ZX-2885
Test: zircon_benchmarks, k spinners, and extensive tracing. More TBD.
Change-Id: Ife693073a076aa0e7cac8bbd834823035a4950a4
diff --git a/kernel/include/kernel/fair_scheduler.h b/kernel/include/kernel/fair_scheduler.h
new file mode 100644
index 0000000..2b07d61
--- /dev/null
+++ b/kernel/include/kernel/fair_scheduler.h
@@ -0,0 +1,145 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+#pragma once
+
+#include <stdint.h>
+#include <zircon/types.h>
+
+#include <kernel/fair_task_state.h>
+#include <kernel/thread.h>
+#include <kernel/sched.h>
+
+#include <platform.h>
+
+#include <fbl/intrusive_pointer_traits.h>
+#include <fbl/intrusive_wavl_tree.h>
+#include <fbl/type_support.h>
+#include <ffl/fixed.h>
+
+class FairScheduler {
+public:
+ // Default minimum granularity of time slices.
+ static constexpr SchedDuration kDefaultMinimumGranularity = SchedUs(750);
+
+ // Default target latency for a scheduling period.
+ static constexpr SchedDuration kDefaultTargetLatency = SchedMs(6);
+
+ // Default peak latency for a scheduling period.
+ static constexpr SchedDuration kDefaultPeakLatency = SchedMs(10);
+
+ FairScheduler() = default;
+ ~FairScheduler() = default;
+
+ FairScheduler(const FairScheduler&) = delete;
+ FairScheduler& operator=(const FairScheduler&) = delete;
+
+ SchedWeight GetTotalWeight();
+ size_t GetRunnableTasks();
+
+ void Dump() TA_REQ(thread_lock);
+
+private:
+ friend void sched_init_early(void);
+ friend void sched_init_thread(thread_t* t, int priority);
+ friend void sched_block(void);
+ friend void sched_yield(void);
+ friend void sched_preempt(void);
+ friend void sched_reschedule(void);
+ friend void sched_resched_internal(void);
+ friend void sched_unblock_idle(thread_t* t);
+ friend void sched_migrate(thread_t* t);
+ friend void sched_inherit_priority(thread_t* t, int pri, bool* local_resched);
+ friend void sched_change_priority(thread_t* t, int pri);
+ friend bool sched_unblock(thread_t* t);
+ friend bool sched_unblock_list(struct list_node* list);
+ friend void sched_transition_off_cpu(cpu_num_t old_cpu);
+ friend void sched_preempt_timer_tick(zx_time_t now);
+
+ static void InitializeThread(thread_t* thread, SchedWeight weight);
+ static void Block() TA_REQ(thread_lock);
+ static void Yield() TA_REQ(thread_lock);
+ static void Preempt() TA_REQ(thread_lock);
+ static void Reschedule() TA_REQ(thread_lock);
+ static void RescheduleInternal() TA_REQ(thread_lock);
+
+ static bool Unblock(thread_t* thread) __WARN_UNUSED_RESULT TA_REQ(thread_lock);
+ static bool Unblock(list_node* thread_list) __WARN_UNUSED_RESULT TA_REQ(thread_lock);
+ static void UnblockIdle(thread_t* idle_thread) TA_REQ(thread_lock);
+ static void Migrate(thread_t* thread) TA_REQ(thread_lock);
+
+ static void TimerTick(SchedTime now);
+
+ // Returns the current system time as a SchedTime value.
+ static SchedTime CurrentTime() {
+ return ffl::FromInteger(current_time());
+ }
+
+ // Returns the FairScheduler instance for the current CPU.
+ static FairScheduler* Get();
+
+ // Returns the FairScheduler instance for the given CPU.
+ static FairScheduler* Get(cpu_num_t cpu);
+
+ static cpu_num_t FindTargetCpu(thread_t* thread) TA_REQ(thread_lock);
+
+ void RescheduleCommon(SchedTime now) TA_REQ(thread_lock);
+
+ // Returns the next thread to execute.
+ thread_t* NextThread(thread_t* current_thread, bool timeslice_expired) TA_REQ(thread_lock);
+
+ void QueueThread(thread_t* thread) TA_REQ(thread_lock);
+
+ void UpdateActiveThread(thread_t* thread, SchedDuration actual_runtime_ns) TA_REQ(thread_lock);
+ void NextThreadTimeslice(thread_t* thread) TA_REQ(thread_lock);
+ void UpdateThreadTimeline(thread_t* thread) TA_REQ(thread_lock);
+
+ void UpdatePeriod() TA_REQ(thread_lock);
+ void UpdateTimeline(SchedTime now) TA_REQ(thread_lock);
+
+ void Insert(SchedTime now, thread_t* thread) TA_REQ(thread_lock);
+ void Remove(thread_t* thread) TA_REQ(thread_lock);
+
+ struct TaskTraits {
+ using KeyType = FairTaskState::KeyType;
+ static KeyType GetKey(const thread_t& thread) { return thread.fair_task_state.key(); }
+ static bool LessThan(KeyType a, KeyType b) { return a < b; }
+ static bool EqualTo(KeyType a, KeyType b) { return a == b; }
+ static auto& node_state(thread_t& thread) { return thread.fair_task_state.run_queue_node_; }
+ };
+
+ using RunQueue = fbl::WAVLTree<SchedTime, thread_t*, TaskTraits, TaskTraits>;
+ TA_GUARDED(thread_lock) RunQueue run_queue_;
+
+ TA_GUARDED(thread_lock) thread_t* active_thread_{nullptr};
+
+ // Monotonically increasing counter to break ties when queuing tasks with
+ // the same virtual finish time. This has the effect of placing newly
+ // queued tasks behind already queued tasks with the same virtual finish
+ // time. This is also necessary to guarantee uniqueness of the key as
+ // required by the WAVLTree container.
+ TA_GUARDED(thread_lock) uint64_t generation_count_{0};
+
+ // Count of the threads running on this CPU, including threads in the run
+ // queue and the currently running thread. Does not include the idle thread.
+ TA_GUARDED(thread_lock) int32_t runnable_task_count_{0};
+
+ // Total weights of threads running on this CPU, including threads in the
+ // run queue and the currently running thread. Does not include the idle
+ // thread.
+ TA_GUARDED(thread_lock) SchedWeight weight_total_{ffl::FromInteger(0)};
+
+ TA_GUARDED(thread_lock) SchedTime virtual_time_{SchedNs(0)};
+ TA_GUARDED(thread_lock) SchedTime last_update_time_ns_{SchedNs(0)};
+ TA_GUARDED(thread_lock) SchedTime absolute_deadline_ns_{SchedNs(0)};
+ TA_GUARDED(thread_lock) SchedTime last_reschedule_time_ns_{SchedNs(0)};
+
+ // Scheduling period in which every runnable task executes once.
+ TA_GUARDED(thread_lock) SchedDuration scheduling_period_ns_{kDefaultTargetLatency};
+
+ TA_GUARDED(thread_lock) SchedDuration minimum_granularity_ns_{kDefaultMinimumGranularity};
+ TA_GUARDED(thread_lock) SchedDuration target_latency_ns_{kDefaultTargetLatency};
+ TA_GUARDED(thread_lock) SchedDuration peak_latency_ns_{kDefaultPeakLatency};
+};
diff --git a/kernel/include/kernel/fair_task_state.h b/kernel/include/kernel/fair_task_state.h
new file mode 100644
index 0000000..a6d995f
--- /dev/null
+++ b/kernel/include/kernel/fair_task_state.h
@@ -0,0 +1,112 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <zircon/types.h>
+
+#include <fbl/intrusive_wavl_tree.h>
+#include <ffl/fixed.h>
+
+#include <utility>
+
+// Forward declaration.
+typedef struct thread thread_t;
+
+// Fixed-point task weight/priority. The 5bit fractional component supports 32
+// priority levels (1/32 through 32/32), while the 27bit integer component
+// supports sums of ~134M threads with weight 1.0.
+using SchedWeight = ffl::Fixed<int32_t, 5>;
+
+// Fixed-point types wrapping time and duration types to make time expressions
+// cleaner in the scheduler code.
+using SchedDuration = ffl::Fixed<zx_duration_t, 0>;
+using SchedTime = ffl::Fixed<zx_time_t, 0>;
+
+// Utilities that return fixed-point Expression representing the given integer
+// time units in terms of system time units (nanoseconds).
+template <typename T>
+constexpr auto SchedNs(T nanoseconds) {
+ return ffl::FromInteger(ZX_NSEC(nanoseconds));
+}
+template <typename T>
+constexpr auto SchedUs(T microseconds) {
+ return ffl::FromInteger(ZX_USEC(microseconds));
+}
+template <typename T>
+constexpr auto SchedMs(T milliseconds) {
+ return ffl::FromInteger(ZX_MSEC(milliseconds));
+}
+
+class FairTaskState {
+public:
+ using KeyType = std::pair<SchedTime, uint64_t>;
+
+ static constexpr SchedWeight kDefaultWeight = ffl::FromInteger(1);
+
+ FairTaskState() = default;
+ explicit FairTaskState(SchedWeight weight)
+ : base_weight_{weight} {}
+
+ FairTaskState(const FairTaskState&) = delete;
+ FairTaskState& operator=(const FairTaskState&) = delete;
+
+ // TODO(eieio): Implement inheritance.
+ SchedWeight base_weight() const { return base_weight_; }
+ SchedWeight effective_weight() const { return base_weight_; }
+
+ // Returns the key used to order the run queue.
+ KeyType key() const { return {virtual_finish_time_, generation_}; }
+
+ bool operator<(const FairTaskState& other) const {
+ return key() < other.key();
+ }
+
+ // Returns true of the task state is currently enqueued in the runnable tree.
+ bool InQueue() const {
+ return run_queue_node_.InContainer();
+ }
+
+ bool active() const { return active_; }
+
+ // Sets the task state to active (on a run queue). Returns true if the task
+ // was not previously active.
+ bool OnInsert() {
+ const bool was_active = active_;
+ active_ = true;
+ return !was_active;
+ }
+
+ // Sets the task state to inactive (not on a run queue). Returns true if the
+ // task was previously active.
+ bool OnRemove() {
+ const bool was_active = active_;
+ active_ = false;
+ return was_active;
+ }
+
+private:
+ friend class FairScheduler;
+
+ fbl::WAVLTreeNodeState<thread_t*> run_queue_node_;
+
+ // Takes the value of FairScheduler::generation_count_ + 1 at the time this
+ // node is added to the run queue.
+ uint64_t generation_{0};
+
+ SchedWeight base_weight_{kDefaultWeight};
+
+ // TODO(eieio): Some of the values below are only relevant when running,
+ // while others only while ready. Consider using a union to save space.
+ SchedTime virtual_start_time_{SchedNs(0)};
+ SchedTime virtual_finish_time_{SchedNs(0)};
+
+ SchedDuration time_slice_ns_{SchedNs(0)};
+ SchedDuration lag_time_ns_{SchedNs(0)};
+
+ bool active_{false};
+};
diff --git a/kernel/include/kernel/percpu.h b/kernel/include/kernel/percpu.h
index 9bbb309..6ad86c6 100644
--- a/kernel/include/kernel/percpu.h
+++ b/kernel/include/kernel/percpu.h
@@ -8,6 +8,7 @@
#include <arch/ops.h>
#include <kernel/align.h>
#include <kernel/event.h>
+#include <kernel/fair_scheduler.h>
#include <kernel/stats.h>
#include <kernel/thread.h>
#include <kernel/timer.h>
@@ -31,6 +32,10 @@
struct list_node run_queue[NUM_PRIORITIES];
uint32_t run_queue_bitmap;
+#if WITH_FAIR_SCHEDULER
+ FairScheduler fair_runqueue;
+#endif
+
#if WITH_LOCK_DEP
// state for runtime lock validation when in irq context
lockdep_state_t lock_state;
diff --git a/kernel/include/kernel/thread.h b/kernel/include/kernel/thread.h
index 86cae3a..802083e 100644
--- a/kernel/include/kernel/thread.h
+++ b/kernel/include/kernel/thread.h
@@ -12,6 +12,7 @@
#include <arch/thread.h>
#include <debug.h>
#include <kernel/cpu.h>
+#include <kernel/fair_task_state.h>
#include <kernel/spinlock.h>
#include <kernel/wait.h>
#include <list.h>
@@ -115,6 +116,10 @@
cpu_num_t last_cpu; // last cpu the thread ran on, INVALID_CPU if it's never run
cpu_mask_t cpu_affinity; // mask of cpus that this thread can run on
+#if 1 || WITH_FAIR_SCHEDULER
+ FairTaskState fair_task_state;
+#endif
+
// if blocked, a pointer to the wait queue
struct wait_queue* blocking_wait_queue;
diff --git a/kernel/kernel/debug.cpp b/kernel/kernel/debug.cpp
index fa79650..8c2c46f 100644
--- a/kernel/kernel/debug.cpp
+++ b/kernel/kernel/debug.cpp
@@ -135,6 +135,10 @@
printf("\tyields: %lu\n", percpu[i].stats.yields);
printf("\ttimer interrupts: %lu\n", percpu[i].stats.timer_ints);
printf("\ttimers: %lu\n", percpu[i].stats.timers);
+#if WITH_FAIR_SCHEDULER
+ printf("\ttotal weight: %d\n", percpu[i].fair_runqueue.GetTotalWeight().raw_value());
+ printf("\trunnable tasks: %zu\n", percpu[i].fair_runqueue.GetRunnableTasks());
+#endif
}
return 0;
@@ -262,6 +266,28 @@
return 0;
}
+#if WITH_FAIR_SCHEDULER
+static int cmd_threadq(int argc, const cmd_args* argv, uint32_t flags) {
+ static RecurringCallback callback([]() {
+ printf("----------------------------------------------------\n");
+ for (uint i = 0; i < SMP_MAX_CPUS; i++) {
+ Guard<spin_lock_t, NoIrqSave> thread_lock_guard{ThreadLock::Get()};
+
+ if (!mp_is_cpu_active(i)) {
+ continue;
+ }
+
+ printf("thread queue cpu %2u:\n", i);
+ percpu[i].fair_runqueue.Dump();
+ }
+ printf("\n");
+ });
+
+ callback.Toggle();
+
+ return 0;
+}
+#else
static int cmd_threadq(int argc, const cmd_args* argv, uint32_t flags) {
static RecurringCallback cb([]() {
for (uint i = 0; i < SMP_MAX_CPUS; i++) {
@@ -286,3 +312,4 @@
return 0;
}
+#endif
diff --git a/kernel/kernel/fair_scheduler.cpp b/kernel/kernel/fair_scheduler.cpp
new file mode 100644
index 0000000..f218e2c
--- /dev/null
+++ b/kernel/kernel/fair_scheduler.cpp
@@ -0,0 +1,792 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <kernel/fair_scheduler.h>
+
+#include <assert.h>
+#include <debug.h>
+#include <err.h>
+#include <inttypes.h>
+#include <kernel/lockdep.h>
+#include <kernel/mp.h>
+#include <kernel/percpu.h>
+#include <kernel/sched.h>
+#include <kernel/thread.h>
+#include <kernel/thread_lock.h>
+#include <lib/ktrace.h>
+#include <list.h>
+#include <platform.h>
+#include <printf.h>
+#include <string.h>
+#include <target.h>
+#include <trace.h>
+#include <vm/vm.h>
+#include <zircon/types.h>
+
+#include <new>
+
+using ffl::Expression;
+using ffl::FromInteger;
+using ffl::FromRatio;
+using ffl::Max;
+using ffl::Round;
+using ffl::ToPrecision;
+
+// Enable/disable ktraces local to this file.
+#define LOCAL_KTRACE_ENABLE 0
+
+#define LOCAL_KTRACE(string, args...) \
+ ktrace_probe(LocalTrace<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu, \
+ KTRACE_STRING_REF(string), ##args)
+
+#define LOCAL_KTRACE_DURATION \
+ TraceDuration<TraceEnabled<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu>
+
+// Enable/disable console traces local to this file.
+#define LOCAL_TRACE 0
+
+#define SCHED_LTRACEF(str, args...) LTRACEF("[%d] " str, arch_curr_cpu_num(), ##args)
+#define SCHED_TRACEF(str, args...) TRACEF("[%d] " str, arch_curr_cpu_num(), ##args)
+
+namespace {
+
+constexpr SchedWeight kMinWeight = FromRatio(LOWEST_PRIORITY + 1, NUM_PRIORITIES);
+constexpr SchedWeight kReciprocalMinWeight = 1 / kMinWeight;
+
+// On ARM64 with safe-stack, it's no longer possible to use the unsafe-sp
+// after set_current_thread (we'd now see newthread's unsafe-sp instead!).
+// Hence this function and everything it calls between this point and the
+// the low-level context switch must be marked with __NO_SAFESTACK.
+__NO_SAFESTACK static void FinalContextSwitch(thread_t* oldthread,
+ thread_t* newthread) {
+ set_current_thread(newthread);
+ arch_context_switch(oldthread, newthread);
+}
+
+inline const char* ToString(enum thread_state state) {
+ switch (state) {
+ case THREAD_INITIAL:
+ return "initial";
+ case THREAD_SUSPENDED:
+ return "suspended";
+ case THREAD_READY:
+ return "ready";
+ case THREAD_RUNNING:
+ return "running";
+ case THREAD_BLOCKED:
+ return "blocked";
+ case THREAD_SLEEPING:
+ return "sleeping";
+ case THREAD_DEATH:
+ return "death";
+ default:
+ return "[unknown]";
+ }
+}
+
+inline void TraceContextSwitch(const thread_t* current_thread,
+ const thread_t* next_thread, cpu_num_t current_cpu) {
+ const uintptr_t raw_current = reinterpret_cast<uintptr_t>(current_thread);
+ const uintptr_t raw_next = reinterpret_cast<uintptr_t>(next_thread);
+ const uint32_t current = static_cast<uint32_t>(raw_current);
+ const uint32_t next = static_cast<uint32_t>(raw_next);
+ const uint32_t user_tid = static_cast<uint32_t>(next_thread->user_tid);
+ const uint32_t context = current_cpu |
+ (current_thread->state << 8) |
+ (current_thread->base_priority << 16) |
+ (next_thread->base_priority << 24);
+
+ ktrace(TAG_CONTEXT_SWITCH, user_tid, context, current, next);
+}
+
+} // anonymous namespace
+
+void FairScheduler::Dump() {
+ printf("\tweight_total=%x runnable_tasks=%d vtime=%ld period=%ld\n",
+ weight_total_.raw_value(), runnable_task_count_,
+ virtual_time_.raw_value(), scheduling_period_ns_.raw_value());
+
+ if (active_thread_ != nullptr) {
+ const FairTaskState* const state = &active_thread_->fair_task_state;
+ printf("\t-> name=%s weight=%x vstart=%ld vfinish=%ld time_slice_ns=%ld\n",
+ active_thread_->name,
+ state->effective_weight().raw_value(),
+ state->virtual_start_time_.raw_value(),
+ state->virtual_finish_time_.raw_value(),
+ state->time_slice_ns_.raw_value());
+ }
+
+ for (const thread_t& thread : run_queue_) {
+ const FairTaskState* const state = &thread.fair_task_state;
+ printf("\t name=%s weight=%x vstart=%ld vfinish=%ld time_slice_ns=%ld\n",
+ thread.name,
+ state->effective_weight().raw_value(),
+ state->virtual_start_time_.raw_value(),
+ state->virtual_finish_time_.raw_value(),
+ state->time_slice_ns_.raw_value());
+ }
+}
+
+SchedWeight FairScheduler::GetTotalWeight() {
+ Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()};
+ return weight_total_;
+}
+
+size_t FairScheduler::GetRunnableTasks() {
+ Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()};
+ return static_cast<size_t>(runnable_task_count_);
+}
+
+FairScheduler* FairScheduler::Get() {
+ return Get(arch_curr_cpu_num());
+}
+
+FairScheduler* FairScheduler::Get(cpu_num_t cpu) {
+ return &percpu[cpu].fair_runqueue;
+}
+
+void FairScheduler::InitializeThread(thread_t* thread, SchedWeight weight) {
+ new (&thread->fair_task_state) FairTaskState{weight};
+}
+
+// Returns the next thread to execute.
+thread_t* FairScheduler::NextThread(thread_t* current_thread, bool timeslice_expired) {
+ const bool is_idle = thread_is_idle(current_thread);
+ const bool is_active = current_thread->state == THREAD_READY ||
+ current_thread->state == THREAD_RUNNING;
+ const bool should_migrate = !(current_thread->cpu_affinity &
+ cpu_num_to_mask(arch_curr_cpu_num()));
+
+ if (should_migrate) {
+ current_thread->state = THREAD_READY;
+ Remove(current_thread);
+
+ const cpu_num_t target_cpu = FindTargetCpu(current_thread);
+ FairScheduler* target = Get(target_cpu);
+ target->Insert(CurrentTime(), current_thread);
+
+ mp_reschedule(cpu_num_to_mask(target_cpu), 0);
+ } else if (is_active && !is_idle) {
+ if (timeslice_expired) {
+ NextThreadTimeslice(current_thread);
+ UpdateThreadTimeline(current_thread);
+ QueueThread(current_thread);
+ } else {
+ return current_thread;
+ }
+ } else if (!is_active && !is_idle) {
+ Remove(current_thread);
+ }
+
+ if (!run_queue_.is_empty()) {
+ return run_queue_.pop_front();
+ } else {
+ const cpu_num_t current_cpu = arch_curr_cpu_num();
+ return &percpu[current_cpu].idle_thread;
+ }
+}
+
+cpu_num_t FairScheduler::FindTargetCpu(thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"find_target: cpu,avail"_stringref};
+
+ const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());
+ const cpu_mask_t last_cpu_mask = cpu_num_to_mask(thread->last_cpu);
+ const cpu_mask_t affinity_mask = thread->cpu_affinity;
+ const cpu_mask_t active_mask = mp_get_active_mask();
+ const cpu_mask_t idle_mask = mp_get_idle_mask();
+
+ // Threads may be created and resumed before the thread init level. Work around
+ // an empty active mask by assuming the current cpu is scheduleable.
+ cpu_mask_t available_mask = active_mask != 0 ? affinity_mask & active_mask
+ : current_cpu_mask;
+ DEBUG_ASSERT_MSG(available_mask != 0,
+ "thread=%s affinity=%x active=%x idle=%x arch_ints_disabled=%d",
+ thread->name, affinity_mask, active_mask, idle_mask, arch_ints_disabled());
+
+ LOCAL_KTRACE("target_mask: online,active", mp_get_online_mask(), active_mask);
+
+ cpu_num_t target_cpu;
+ cpu_num_t least_loaded_cpu;
+ FairScheduler* target_queue;
+ FairScheduler* least_loaded_queue;
+
+ // Select an initial target.
+ if (last_cpu_mask & available_mask) {
+ target_cpu = thread->last_cpu;
+ } else if (current_cpu_mask & available_mask) {
+ target_cpu = arch_curr_cpu_num();
+ } else {
+ target_cpu = lowest_cpu_set(available_mask);
+ }
+
+ target_queue = Get(target_cpu);
+ least_loaded_cpu = target_cpu;
+ least_loaded_queue = target_queue;
+
+ // See if there is a better target in the set of available CPUs.
+ // TODO(eieio): Replace this with a search in order of increasing cache
+ // distance when topology information is available.
+ while (available_mask != 0) {
+ if (target_queue->weight_total_ < least_loaded_queue->weight_total_) {
+ least_loaded_cpu = target_cpu;
+ least_loaded_queue = target_queue;
+ }
+
+ available_mask &= ~cpu_num_to_mask(target_cpu);
+ if (available_mask) {
+ target_cpu = lowest_cpu_set(available_mask);
+ target_queue = Get(target_cpu);
+ }
+ }
+
+ SCHED_LTRACEF("thread=%s target_cpu=%d\n", thread->name, least_loaded_cpu);
+ trace.End(least_loaded_cpu, available_mask);
+ return least_loaded_cpu;
+}
+
+void FairScheduler::UpdateTimeline(SchedTime now) {
+ LOCAL_KTRACE_DURATION trace{"update_vtime"_stringref};
+
+ const Expression runtime_ns = now - last_update_time_ns_;
+ last_update_time_ns_ = now;
+
+ if (weight_total_ > SchedWeight{FromInteger(0)}) {
+ virtual_time_ += runtime_ns;
+ }
+
+ trace.End(Round<uint64_t>(runtime_ns), Round<uint64_t>(virtual_time_));
+}
+
+void FairScheduler::RescheduleCommon(SchedTime now) {
+ LOCAL_KTRACE_DURATION trace{"reschedule_common"_stringref};
+
+ const cpu_num_t current_cpu = arch_curr_cpu_num();
+ thread_t* const current_thread = get_current_thread();
+
+ DEBUG_ASSERT(arch_ints_disabled());
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+ DEBUG_ASSERT_MSG(current_thread->state != THREAD_RUNNING, "state %d\n", current_thread->state);
+ DEBUG_ASSERT(!arch_blocking_disallowed());
+
+ CPU_STATS_INC(reschedules);
+
+ UpdateTimeline(now);
+
+ const SchedDuration actual_runtime_ns = now - last_reschedule_time_ns_;
+ last_reschedule_time_ns_ = now;
+
+ // Update the accounting for the thread that just ran.
+ current_thread->runtime_ns += actual_runtime_ns.raw_value();
+ const bool timeslice_expired = now >= absolute_deadline_ns_;
+
+ // Select a thread to run.
+ thread_t* const next_thread = NextThread(current_thread, timeslice_expired);
+ DEBUG_ASSERT(next_thread != nullptr);
+
+ const char* const front_name = run_queue_.is_empty() ? "[none]" : run_queue_.front().name;
+ SCHED_LTRACEF("current={%s, %s} next={%s, %s} expired=%d is_empty=%d front=%s\n",
+ current_thread->name, ToString(current_thread->state),
+ next_thread->name, ToString(next_thread->state),
+ timeslice_expired, run_queue_.is_empty(), front_name);
+
+ // Update the state of the current and next thread.
+ current_thread->preempt_pending = false;
+ next_thread->state = THREAD_RUNNING;
+ next_thread->last_cpu = current_cpu;
+ next_thread->curr_cpu = current_cpu;
+
+ active_thread_ = next_thread;
+
+ if (next_thread != current_thread) {
+ // Re-compute the timeslice for the new thread based on the latest state.
+ NextThreadTimeslice(next_thread);
+ }
+
+ // Always call to handle races between reschedule IPIs and changes to the run queue.
+ mp_prepare_current_cpu_idle_state(thread_is_idle(next_thread));
+
+ if (thread_is_idle(next_thread)) {
+ mp_set_cpu_idle(current_cpu);
+ } else {
+ mp_set_cpu_busy(current_cpu);
+ }
+
+ // The task is always non-realtime when managed by this scheduler.
+ // TODO(eieio): Revisit this when deadline scheduling is addressed.
+ mp_set_cpu_non_realtime(current_cpu);
+
+ if (thread_is_idle(current_thread)) {
+ percpu[current_cpu].stats.idle_time += actual_runtime_ns.raw_value();
+ }
+
+ if (thread_is_idle(next_thread) /*|| runnable_task_count_ == 1*/) {
+ LOCAL_KTRACE_DURATION trace{"stop_preemption"_stringref};
+ SCHED_LTRACEF("Stop preemption timer: current=%s next=%s\n",
+ current_thread->name, next_thread->name);
+ timer_preempt_cancel();
+ } else if (timeslice_expired || next_thread != current_thread) {
+ LOCAL_KTRACE_DURATION trace{"start_preemption: now,deadline"_stringref};
+
+ // Update the preemption time based on the time slice.
+ FairTaskState* const next_state = &next_thread->fair_task_state;
+ absolute_deadline_ns_ = now + next_state->time_slice_ns_;
+
+ SCHED_LTRACEF("Start preemption timer: current=%s next=%s now=%ld deadline=%ld\n",
+ current_thread->name, next_thread->name, now.raw_value(),
+ absolute_deadline_ns_.raw_value());
+ timer_preempt_reset(absolute_deadline_ns_.raw_value());
+
+ trace.End(Round<uint64_t>(now), Round<uint64_t>(absolute_deadline_ns_));
+ }
+
+ if (next_thread != current_thread) {
+ LOCAL_KTRACE("reschedule current: count,slice",
+ runnable_task_count_,
+ Round<uint64_t>(current_thread->fair_task_state.time_slice_ns_));
+ LOCAL_KTRACE("reschedule next: wsum,slice",
+ weight_total_.raw_value(),
+ Round<uint64_t>(next_thread->fair_task_state.time_slice_ns_));
+
+ TraceContextSwitch(current_thread, next_thread, current_cpu);
+
+ // Blink the optional debug LEDs on the target.
+ target_set_debug_led(0, !thread_is_idle(next_thread));
+
+ SCHED_LTRACEF("current=(%s, flags 0x%x) next=(%s, flags 0x%x)\n",
+ current_thread->name, current_thread->flags,
+ next_thread->name, next_thread->flags);
+
+ if (current_thread->aspace != next_thread->aspace) {
+ vmm_context_switch(current_thread->aspace, next_thread->aspace);
+ }
+
+ CPU_STATS_INC(context_switches);
+ FinalContextSwitch(current_thread, next_thread);
+ }
+}
+
+void FairScheduler::UpdatePeriod() {
+ LOCAL_KTRACE_DURATION trace{"update_period"_stringref};
+
+ DEBUG_ASSERT(runnable_task_count_ >= 0);
+ DEBUG_ASSERT(minimum_granularity_ns_ > 0);
+ DEBUG_ASSERT(peak_latency_ns_ > 0);
+ DEBUG_ASSERT(target_latency_ns_ > 0);
+
+ const int64_t num_tasks = runnable_task_count_;
+ const int64_t peak_tasks = Round<int64_t>(peak_latency_ns_ / minimum_granularity_ns_);
+ const int64_t normal_tasks = Round<int64_t>(target_latency_ns_ / minimum_granularity_ns_);
+
+ // The scheduling period stretches when there are too many tasks to fit
+ // within the target latency.
+ const int64_t period_grans = num_tasks > normal_tasks ? num_tasks : normal_tasks;
+ scheduling_period_ns_ = period_grans * minimum_granularity_ns_;
+
+ SCHED_LTRACEF("num_tasks=%ld peak_tasks=%ld normal_tasks=%ld period_ns=%ld\n",
+ num_tasks, peak_tasks, normal_tasks, scheduling_period_ns_.raw_value());
+
+ trace.End(Round<uint64_t>(scheduling_period_ns_), num_tasks);
+}
+
+void FairScheduler::NextThreadTimeslice(thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"next_timeslice: s,w"_stringref};
+
+ if (thread_is_idle(thread) || thread->state == THREAD_DEATH) {
+ return;
+ }
+
+ FairTaskState* const state = &thread->fair_task_state;
+
+ // Calculate the relative portion of the scheduling period.
+ state->time_slice_ns_ = scheduling_period_ns_ * state->effective_weight() / weight_total_;
+ DEBUG_ASSERT(state->time_slice_ns_ > 0);
+
+ SCHED_LTRACEF("name=%s weight_total=%x weight=%x time_slice_ns=%ld\n",
+ thread->name,
+ weight_total_.raw_value(),
+ state->effective_weight().raw_value(),
+ state->time_slice_ns_.raw_value());
+
+ trace.End(Round<uint64_t>(state->time_slice_ns_), state->effective_weight().raw_value());
+}
+
+void FairScheduler::UpdateThreadTimeline(thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"update_timeline: vs,vf"_stringref};
+
+ if (thread_is_idle(thread) || thread->state == THREAD_DEATH) {
+ return;
+ }
+
+ FairTaskState* const state = &thread->fair_task_state;
+
+ // Update virtual timeline.
+ state->virtual_start_time_ = Max(state->virtual_finish_time_, virtual_time_);
+
+ const SchedDuration delta_norm = state->time_slice_ns_ / (kReciprocalMinWeight * state->effective_weight());
+ state->virtual_finish_time_ = state->virtual_start_time_ + delta_norm;
+
+ DEBUG_ASSERT_MSG(state->virtual_start_time_ < state->virtual_finish_time_,
+ "vstart=%ld vfinish=%ld delta_norm=%ld\n",
+ state->virtual_start_time_.raw_value(),
+ state->virtual_finish_time_.raw_value(),
+ delta_norm.raw_value());
+
+ SCHED_LTRACEF("name=%s vstart=%ld vfinish=%ld lag=%ld vtime=%ld\n",
+ thread->name,
+ state->virtual_start_time_.raw_value(),
+ state->virtual_finish_time_.raw_value(),
+ state->lag_time_ns_.raw_value(),
+ virtual_time_.raw_value());
+
+ trace.End(Round<uint64_t>(state->virtual_start_time_),
+ Round<uint64_t>(state->virtual_finish_time_));
+}
+
+void FairScheduler::QueueThread(thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"queue_thread"_stringref};
+
+ DEBUG_ASSERT(thread->state == THREAD_READY);
+ SCHED_LTRACEF("QueueThread: thread=%s\n", thread->name);
+
+ if (!thread_is_idle(thread)) {
+ thread->fair_task_state.generation_ = ++generation_count_;
+ run_queue_.insert(thread);
+ LOCAL_KTRACE("queue_thread");
+ }
+}
+
+void FairScheduler::Insert(SchedTime now, thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"insert"_stringref};
+
+ DEBUG_ASSERT(thread->state == THREAD_READY);
+ DEBUG_ASSERT(!thread_is_idle(thread));
+
+ FairTaskState* const state = &thread->fair_task_state;
+
+ // Ensure insertion happens only once, even if Unblock is called multiple times.
+ if (state->OnInsert()) {
+ runnable_task_count_++;
+ DEBUG_ASSERT(runnable_task_count_ != 0);
+
+ UpdateTimeline(now);
+ UpdatePeriod();
+
+ thread->curr_cpu = arch_curr_cpu_num();
+
+ // Factor this task into the run queue.
+ weight_total_ += state->effective_weight();
+ DEBUG_ASSERT(weight_total_ > SchedWeight{FromInteger(0)});
+ //virtual_time_ -= state->lag_time_ns_ / weight_total_;
+
+ NextThreadTimeslice(thread);
+ UpdateThreadTimeline(thread);
+ QueueThread(thread);
+ }
+}
+
+void FairScheduler::Remove(thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"remove"_stringref};
+
+ DEBUG_ASSERT(!thread_is_idle(thread));
+
+ FairTaskState* const state = &thread->fair_task_state;
+ DEBUG_ASSERT(!state->InQueue());
+
+ // Ensure that removal happens only once, even if Block() is called multiple times.
+ if (state->OnRemove()) {
+ DEBUG_ASSERT(runnable_task_count_ > 0);
+ runnable_task_count_--;
+
+ UpdatePeriod();
+
+ thread->curr_cpu = INVALID_CPU;
+
+ state->virtual_start_time_ = SchedNs(0);
+ state->virtual_finish_time_ = SchedNs(0);
+
+ // Factor this task out of the run queue.
+ //virtual_time_ += state->lag_time_ns_ / weight_total_;
+ weight_total_ -= state->effective_weight();
+ DEBUG_ASSERT(weight_total_ >= SchedWeight{FromInteger(0)});
+
+ SCHED_LTRACEF("name=%s weight_total=%x weight=%x lag_time_ns=%ld\n",
+ thread->name,
+ weight_total_.raw_value(),
+ state->effective_weight().raw_value(),
+ state->lag_time_ns_.raw_value());
+ }
+}
+
+void FairScheduler::Block() {
+ LOCAL_KTRACE_DURATION trace{"sched_block"_stringref};
+
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ thread_t* const current_thread = get_current_thread();
+
+ DEBUG_ASSERT(current_thread->magic == THREAD_MAGIC);
+ DEBUG_ASSERT(current_thread->state != THREAD_RUNNING);
+
+ const SchedTime now = CurrentTime();
+ SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+ FairScheduler::Get()->RescheduleCommon(now);
+}
+
+bool FairScheduler::Unblock(thread_t* thread) {
+ LOCAL_KTRACE_DURATION trace{"sched_unblock"_stringref};
+
+ DEBUG_ASSERT(thread->magic == THREAD_MAGIC);
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ const SchedTime now = CurrentTime();
+ SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, now.raw_value());
+
+ const cpu_num_t target_cpu = FindTargetCpu(thread);
+ FairScheduler* const target = Get(target_cpu);
+
+ thread->state = THREAD_READY;
+ target->Insert(now, thread);
+
+ if (target_cpu == arch_curr_cpu_num()) {
+ return true;
+ } else {
+ mp_reschedule(cpu_num_to_mask(target_cpu), 0);
+ return false;
+ }
+}
+
+bool FairScheduler::Unblock(list_node* list) {
+ LOCAL_KTRACE_DURATION trace{"sched_unblock_list"_stringref};
+
+ DEBUG_ASSERT(list);
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ const SchedTime now = CurrentTime();
+
+ cpu_mask_t cpus_to_reschedule_mask = 0;
+ thread_t* thread;
+ while ((thread = list_remove_tail_type(list, thread_t, queue_node)) != nullptr) {
+ DEBUG_ASSERT(thread->magic == THREAD_MAGIC);
+ DEBUG_ASSERT(!thread_is_idle(thread));
+
+ SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, now.raw_value());
+
+ const cpu_num_t target_cpu = FindTargetCpu(thread);
+ FairScheduler* const target = Get(target_cpu);
+
+ thread->state = THREAD_READY;
+ target->Insert(now, thread);
+
+ cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu);
+ }
+
+ // Issue reschedule IPIs to other CPUs.
+ if (cpus_to_reschedule_mask) {
+ mp_reschedule(cpus_to_reschedule_mask, 0);
+ }
+
+ // Return true if the current CPU is in the mask.
+ const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());
+ return cpus_to_reschedule_mask & current_cpu_mask;
+}
+
+void FairScheduler::UnblockIdle(thread_t* thread) {
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ DEBUG_ASSERT(thread_is_idle(thread));
+ DEBUG_ASSERT(thread->cpu_affinity && (thread->cpu_affinity & (thread->cpu_affinity - 1)) == 0);
+
+ SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, current_time());
+
+ thread->state = THREAD_READY;
+ thread->curr_cpu = lowest_cpu_set(thread->cpu_affinity);
+}
+
+void FairScheduler::Yield() {
+ LOCAL_KTRACE_DURATION trace{"sched_yield"_stringref};
+
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ thread_t* current_thread = get_current_thread();
+ DEBUG_ASSERT(!thread_is_idle(current_thread));
+
+ const SchedTime now = CurrentTime();
+ SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+ current_thread->state = THREAD_READY;
+ Get()->RescheduleCommon(now);
+}
+
+void FairScheduler::Preempt() {
+ LOCAL_KTRACE_DURATION trace{"sched_preempt"_stringref};
+
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ thread_t* current_thread = get_current_thread();
+ const cpu_num_t current_cpu = arch_curr_cpu_num();
+
+ DEBUG_ASSERT(current_thread->curr_cpu == current_cpu);
+ DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu);
+
+ const SchedTime now = CurrentTime();
+ SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+ current_thread->state = THREAD_READY;
+ Get()->RescheduleCommon(now);
+}
+
+void FairScheduler::Reschedule() {
+ LOCAL_KTRACE_DURATION trace{"sched_reschedule"_stringref};
+
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+ thread_t* current_thread = get_current_thread();
+ const cpu_num_t current_cpu = arch_curr_cpu_num();
+
+ if (current_thread->disable_counts != 0) {
+ current_thread->preempt_pending = true;
+ return;
+ }
+
+ DEBUG_ASSERT(current_thread->curr_cpu == current_cpu);
+ DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu);
+
+ const SchedTime now = CurrentTime();
+ SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+ current_thread->state = THREAD_READY;
+ Get()->RescheduleCommon(now);
+}
+
+void FairScheduler::RescheduleInternal() {
+ Get()->RescheduleCommon(CurrentTime());
+}
+
+void FairScheduler::Migrate(thread_t* thread) {
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+ cpu_mask_t cpus_to_reschedule_mask = 0;
+
+ if (thread->state == THREAD_RUNNING) {
+ const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu);
+ if (!(thread->cpu_affinity & thread_cpu_mask)) {
+ // Mark the CPU the thread is running on for reschedule.
+ cpus_to_reschedule_mask |= thread_cpu_mask;
+ }
+ } else if (thread->state == THREAD_READY) {
+ const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu);
+ if (!(thread->cpu_affinity & thread_cpu_mask)) {
+ FairScheduler* current = Get(thread->curr_cpu);
+
+ DEBUG_ASSERT(thread->fair_task_state.InQueue());
+ current->run_queue_.erase(*thread);
+ current->Remove(thread);
+
+ const cpu_num_t target_cpu = FindTargetCpu(thread);
+ FairScheduler* const target = Get(target_cpu);
+ target->Insert(CurrentTime(), thread);
+
+ cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu);
+ }
+ }
+
+ if (cpus_to_reschedule_mask) {
+ mp_reschedule(cpus_to_reschedule_mask, 0);
+ }
+
+ const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());
+ if (cpus_to_reschedule_mask & current_cpu_mask) {
+ FairScheduler::Reschedule();
+ }
+}
+
+void FairScheduler::TimerTick(SchedTime now) {
+ LOCAL_KTRACE_DURATION trace{"sched_timer_tick"_stringref};
+
+ SchedTime absolute_deadline_ns;
+
+ {
+ FairScheduler* const queue = Get();
+ Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()};
+ absolute_deadline_ns = queue->absolute_deadline_ns_;
+ }
+
+ SCHED_LTRACEF("now=%ld deadline=%ld\n", now.raw_value(), absolute_deadline_ns.raw_value());
+
+ thread_preempt_set_pending();
+
+ trace.End(Round<uint64_t>(now), Round<uint64_t>(absolute_deadline_ns));
+}
+
+// Temporary compatibility with the thread layer.
+
+void sched_init_thread(thread_t* thread, int priority) {
+ FairScheduler::InitializeThread(thread, FromRatio(priority, NUM_PRIORITIES));
+ thread->base_priority = priority;
+}
+
+void sched_block() {
+ FairScheduler::Block();
+}
+
+bool sched_unblock(thread_t* thread) {
+ return FairScheduler::Unblock(thread);
+}
+
+bool sched_unblock_list(list_node* list) {
+ return FairScheduler::Unblock(list);
+}
+
+void sched_unblock_idle(thread_t* thread) {
+ FairScheduler::UnblockIdle(thread);
+}
+
+void sched_yield() {
+ FairScheduler::Yield();
+}
+
+void sched_preempt() {
+ FairScheduler::Preempt();
+}
+
+void sched_reschedule() {
+ FairScheduler::Reschedule();
+}
+
+void sched_resched_internal() {
+ FairScheduler::RescheduleInternal();
+}
+
+void sched_transition_off_cpu(cpu_num_t old_cpu) {
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+ DEBUG_ASSERT(old_cpu == arch_curr_cpu_num());
+
+ (void)old_cpu;
+}
+
+void sched_migrate(thread_t* thread) {
+ FairScheduler::Migrate(thread);
+}
+
+void sched_inherit_priority(thread_t* t, int pri, bool* local_resched) {
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+ (void)t;
+ (void)pri;
+ (void)local_resched;
+}
+
+void sched_change_priority(thread_t* t, int pri) {
+ DEBUG_ASSERT(spin_lock_held(&thread_lock));
+ (void)t;
+ (void)pri;
+}
+
+void sched_preempt_timer_tick(zx_time_t now) {
+ FairScheduler::TimerTick(FromInteger(now));
+}
+
+void sched_init_early() {
+}
diff --git a/kernel/kernel/rules.mk b/kernel/kernel/rules.mk
index 3311371..5dc517d 100644
--- a/kernel/kernel/rules.mk
+++ b/kernel/kernel/rules.mk
@@ -16,6 +16,7 @@
kernel/lib/heap \
kernel/lib/libc \
kernel/lib/fbl \
+ kernel/lib/ffl \
kernel/lib/zircon-internal \
kernel/vm
@@ -28,9 +29,14 @@
$(LOCAL_DIR)/mp.cpp \
$(LOCAL_DIR)/mutex.cpp \
$(LOCAL_DIR)/percpu.cpp \
- $(LOCAL_DIR)/sched.cpp \
$(LOCAL_DIR)/thread.cpp \
$(LOCAL_DIR)/timer.cpp \
$(LOCAL_DIR)/wait.cpp
+ifeq ($(call TOBOOL,$(ENABLE_FAIR_SCHEDULER)),true)
+MODULE_SRCS += $(LOCAL_DIR)/fair_scheduler.cpp
+else
+MODULE_SRCS += $(LOCAL_DIR)/sched.cpp
+endif
+
include make/module.mk
diff --git a/make/engine.mk b/make/engine.mk
index 46394bb..b55f518 100644
--- a/make/engine.mk
+++ b/make/engine.mk
@@ -27,6 +27,7 @@
ENABLE_NEW_BOOTDATA := true
ENABLE_LOCK_DEP ?= false
ENABLE_LOCK_DEP_TESTS ?= $(ENABLE_LOCK_DEP)
+ENABLE_FAIR_SCHEDULER ?= false
DISABLE_UTEST ?= false
ENABLE_ULIB_ONLY ?= false
USE_ASAN ?= false
@@ -428,6 +429,11 @@
KERNEL_DEFINES += WITH_LOCK_DEP_TESTS=1
endif
+# Scheduler experiment.
+ifeq ($(call TOBOOL,$(ENABLE_FAIR_SCHEDULER)),true)
+KERNEL_DEFINES += WITH_FAIR_SCHEDULER=1
+endif
+
# additional bootdata items to be included to bootdata.bin
ADDITIONAL_BOOTDATA_ITEMS :=