[kernel] Initial implementation of fair scheduler.

Add a fair scheduler option that is compatible with the existing
kernel/sched API. The algorithm is similar to WFQ/EEVDF/CFS and
provides weight-based bandwidth allocation with bounded delay for
all competing tasks.
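
For illustration, using the defaults in this change: with a 6ms target
latency and three runnable threads of weights 1.0, 0.5, and 0.5, the
scheduling period is divided in proportion to weight, yielding time
slices of 3ms, 1.5ms, and 1.5ms. Each thread's virtual finish time then
advances by its slice normalized by its weight, and the per-CPU run
queue is ordered by virtual finish time so that every runnable thread
runs once per period.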

Bug: ZX-2885
Test: zircon_benchmarks, k spinners, and extensive tracing. More TBD.
Change-Id: Ife693073a076aa0e7cac8bbd834823035a4950a4
diff --git a/kernel/include/kernel/fair_scheduler.h b/kernel/include/kernel/fair_scheduler.h
new file mode 100644
index 0000000..2b07d61
--- /dev/null
+++ b/kernel/include/kernel/fair_scheduler.h
@@ -0,0 +1,145 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+#pragma once
+
+#include <stdint.h>
+#include <zircon/types.h>
+
+#include <kernel/fair_task_state.h>
+#include <kernel/thread.h>
+#include <kernel/sched.h>
+
+#include <platform.h>
+
+#include <fbl/intrusive_pointer_traits.h>
+#include <fbl/intrusive_wavl_tree.h>
+#include <fbl/type_support.h>
+#include <ffl/fixed.h>
+
+class FairScheduler {
+public:
+    // Default minimum granularity of time slices.
+    static constexpr SchedDuration kDefaultMinimumGranularity = SchedUs(750);
+
+    // Default target latency for a scheduling period.
+    static constexpr SchedDuration kDefaultTargetLatency = SchedMs(6);
+
+    // Default peak latency for a scheduling period.
+    static constexpr SchedDuration kDefaultPeakLatency = SchedMs(10);
+
+    FairScheduler() = default;
+    ~FairScheduler() = default;
+
+    FairScheduler(const FairScheduler&) = delete;
+    FairScheduler& operator=(const FairScheduler&) = delete;
+
+    SchedWeight GetTotalWeight();
+    size_t GetRunnableTasks();
+
+    void Dump() TA_REQ(thread_lock);
+
+private:
+    friend void sched_init_early(void);
+    friend void sched_init_thread(thread_t* t, int priority);
+    friend void sched_block(void);
+    friend void sched_yield(void);
+    friend void sched_preempt(void);
+    friend void sched_reschedule(void);
+    friend void sched_resched_internal(void);
+    friend void sched_unblock_idle(thread_t* t);
+    friend void sched_migrate(thread_t* t);
+    friend void sched_inherit_priority(thread_t* t, int pri, bool* local_resched);
+    friend void sched_change_priority(thread_t* t, int pri);
+    friend bool sched_unblock(thread_t* t);
+    friend bool sched_unblock_list(struct list_node* list);
+    friend void sched_transition_off_cpu(cpu_num_t old_cpu);
+    friend void sched_preempt_timer_tick(zx_time_t now);
+
+    static void InitializeThread(thread_t* thread, SchedWeight weight);
+    static void Block() TA_REQ(thread_lock);
+    static void Yield() TA_REQ(thread_lock);
+    static void Preempt() TA_REQ(thread_lock);
+    static void Reschedule() TA_REQ(thread_lock);
+    static void RescheduleInternal() TA_REQ(thread_lock);
+
+    static bool Unblock(thread_t* thread) __WARN_UNUSED_RESULT TA_REQ(thread_lock);
+    static bool Unblock(list_node* thread_list) __WARN_UNUSED_RESULT TA_REQ(thread_lock);
+    static void UnblockIdle(thread_t* idle_thread) TA_REQ(thread_lock);
+    static void Migrate(thread_t* thread) TA_REQ(thread_lock);
+
+    static void TimerTick(SchedTime now);
+
+    // Returns the current system time as a SchedTime value.
+    static SchedTime CurrentTime() {
+        return ffl::FromInteger(current_time());
+    }
+
+    // Returns the FairScheduler instance for the current CPU.
+    static FairScheduler* Get();
+
+    // Returns the FairScheduler instance for the given CPU.
+    static FairScheduler* Get(cpu_num_t cpu);
+
+    static cpu_num_t FindTargetCpu(thread_t* thread) TA_REQ(thread_lock);
+
+    void RescheduleCommon(SchedTime now) TA_REQ(thread_lock);
+
+    // Returns the next thread to execute.
+    thread_t* NextThread(thread_t* current_thread, bool timeslice_expired) TA_REQ(thread_lock);
+
+    void QueueThread(thread_t* thread) TA_REQ(thread_lock);
+
+    void UpdateActiveThread(thread_t* thread, SchedDuration actual_runtime_ns) TA_REQ(thread_lock);
+    void NextThreadTimeslice(thread_t* thread) TA_REQ(thread_lock);
+    void UpdateThreadTimeline(thread_t* thread) TA_REQ(thread_lock);
+
+    void UpdatePeriod() TA_REQ(thread_lock);
+    void UpdateTimeline(SchedTime now) TA_REQ(thread_lock);
+
+    void Insert(SchedTime now, thread_t* thread) TA_REQ(thread_lock);
+    void Remove(thread_t* thread) TA_REQ(thread_lock);
+
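+    // WAVLTree key and node traits for the run queue: orders threads by the
+    // (virtual finish time, generation) key and exposes the intrusive node
+    // state embedded in each thread's FairTaskState.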
+    struct TaskTraits {
+        using KeyType = FairTaskState::KeyType;
+        static KeyType GetKey(const thread_t& thread) { return thread.fair_task_state.key(); }
+        static bool LessThan(KeyType a, KeyType b) { return a < b; }
+        static bool EqualTo(KeyType a, KeyType b) { return a == b; }
+        static auto& node_state(thread_t& thread) { return thread.fair_task_state.run_queue_node_; }
+    };
+
+    using RunQueue = fbl::WAVLTree<SchedTime, thread_t*, TaskTraits, TaskTraits>;
+    TA_GUARDED(thread_lock) RunQueue run_queue_;
+
+    TA_GUARDED(thread_lock) thread_t* active_thread_{nullptr};
+
+    // Monotonically increasing counter to break ties when queuing tasks with
+    // the same virtual finish time. This has the effect of placing newly
+    // queued tasks behind already queued tasks with the same virtual finish
+    // time. This is also necessary to guarantee uniqueness of the key as
+    // required by the WAVLTree container.
+    TA_GUARDED(thread_lock) uint64_t generation_count_{0};
+
+    // Count of the runnable threads assigned to this CPU, including both
+    // the threads in the run queue and the currently running thread. Does
+    // not include the idle thread.
+    TA_GUARDED(thread_lock) int32_t runnable_task_count_{0};
+
+    // Total weight of the runnable threads assigned to this CPU, including
+    // threads in the run queue and the currently running thread. Does not
+    // include the idle thread.
+    TA_GUARDED(thread_lock) SchedWeight weight_total_{ffl::FromInteger(0)};
+
+    TA_GUARDED(thread_lock) SchedTime virtual_time_{SchedNs(0)};
+    TA_GUARDED(thread_lock) SchedTime last_update_time_ns_{SchedNs(0)};
+    TA_GUARDED(thread_lock) SchedTime absolute_deadline_ns_{SchedNs(0)};
+    TA_GUARDED(thread_lock) SchedTime last_reschedule_time_ns_{SchedNs(0)};
+
+    // Scheduling period in which every runnable task executes once.
+    TA_GUARDED(thread_lock) SchedDuration scheduling_period_ns_{kDefaultTargetLatency};
+
+    TA_GUARDED(thread_lock) SchedDuration minimum_granularity_ns_{kDefaultMinimumGranularity};
+    TA_GUARDED(thread_lock) SchedDuration target_latency_ns_{kDefaultTargetLatency};
+    TA_GUARDED(thread_lock) SchedDuration peak_latency_ns_{kDefaultPeakLatency};
+};
diff --git a/kernel/include/kernel/fair_task_state.h b/kernel/include/kernel/fair_task_state.h
new file mode 100644
index 0000000..a6d995f
--- /dev/null
+++ b/kernel/include/kernel/fair_task_state.h
@@ -0,0 +1,112 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <zircon/types.h>
+
+#include <fbl/intrusive_wavl_tree.h>
+#include <ffl/fixed.h>
+
+#include <utility>
+
+// Forward declaration.
+typedef struct thread thread_t;
+
+// Fixed-point task weight/priority. The 5-bit fractional component supports
+// 32 priority levels (1/32 through 32/32), while the 27-bit integer component
+// supports sums of ~134M threads with weight 1.0.
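+// For example, a weight of 16/32 (0.5) is stored with raw value 16 and a
+// weight of 32/32 (1.0) with raw value 32.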
+using SchedWeight = ffl::Fixed<int32_t, 5>;
+
+// Fixed-point types wrapping time and duration types to make time expressions
+// cleaner in the scheduler code.
+using SchedDuration = ffl::Fixed<zx_duration_t, 0>;
+using SchedTime = ffl::Fixed<zx_time_t, 0>;
+
+// Utilities that return a fixed-point Expression representing the given
+// integer number of time units in terms of the system time unit (nanoseconds).
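+// For example, SchedUs(750) and SchedMs(6) yield fixed-point values of
+// 750,000 and 6,000,000 nanoseconds, respectively.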
+template <typename T>
+constexpr auto SchedNs(T nanoseconds) {
+    return ffl::FromInteger(ZX_NSEC(nanoseconds));
+}
+template <typename T>
+constexpr auto SchedUs(T microseconds) {
+    return ffl::FromInteger(ZX_USEC(microseconds));
+}
+template <typename T>
+constexpr auto SchedMs(T milliseconds) {
+    return ffl::FromInteger(ZX_MSEC(milliseconds));
+}
+
+class FairTaskState {
+public:
+    using KeyType = std::pair<SchedTime, uint64_t>;
+
+    static constexpr SchedWeight kDefaultWeight = ffl::FromInteger(1);
+
+    FairTaskState() = default;
+    explicit FairTaskState(SchedWeight weight)
+        : base_weight_{weight} {}
+
+    FairTaskState(const FairTaskState&) = delete;
+    FairTaskState& operator=(const FairTaskState&) = delete;
+
+    // TODO(eieio): Implement inheritance.
+    SchedWeight base_weight() const { return base_weight_; }
+    SchedWeight effective_weight() const { return base_weight_; }
+
+    // Returns the key used to order the run queue.
+    KeyType key() const { return {virtual_finish_time_, generation_}; }
+
+    bool operator<(const FairTaskState& other) const {
+        return key() < other.key();
+    }
+
+    // Returns true if the task state is currently enqueued in the run queue.
+    bool InQueue() const {
+        return run_queue_node_.InContainer();
+    }
+
+    bool active() const { return active_; }
+
+    // Sets the task state to active (on a run queue). Returns true if the task
+    // was not previously active.
+    bool OnInsert() {
+        const bool was_active = active_;
+        active_ = true;
+        return !was_active;
+    }
+
+    // Sets the task state to inactive (not on a run queue). Returns true if the
+    // task was previously active.
+    bool OnRemove() {
+        const bool was_active = active_;
+        active_ = false;
+        return was_active;
+    }
+
+private:
+    friend class FairScheduler;
+
+    fbl::WAVLTreeNodeState<thread_t*> run_queue_node_;
+
+    // Takes the value of FairScheduler::generation_count_ + 1 at the time this
+    // node is added to the run queue.
+    uint64_t generation_{0};
+
+    SchedWeight base_weight_{kDefaultWeight};
+
+    // TODO(eieio): Some of the values below are only relevant when running,
+    // while others only while ready. Consider using a union to save space.
+    SchedTime virtual_start_time_{SchedNs(0)};
+    SchedTime virtual_finish_time_{SchedNs(0)};
+
+    SchedDuration time_slice_ns_{SchedNs(0)};
+    SchedDuration lag_time_ns_{SchedNs(0)};
+
+    bool active_{false};
+};
diff --git a/kernel/include/kernel/percpu.h b/kernel/include/kernel/percpu.h
index 9bbb309..6ad86c6 100644
--- a/kernel/include/kernel/percpu.h
+++ b/kernel/include/kernel/percpu.h
@@ -8,6 +8,7 @@
 #include <arch/ops.h>
 #include <kernel/align.h>
 #include <kernel/event.h>
+#include <kernel/fair_scheduler.h>
 #include <kernel/stats.h>
 #include <kernel/thread.h>
 #include <kernel/timer.h>
@@ -31,6 +32,10 @@
     struct list_node run_queue[NUM_PRIORITIES];
     uint32_t run_queue_bitmap;
 
+#if WITH_FAIR_SCHEDULER
+    FairScheduler fair_runqueue;
+#endif
+
 #if WITH_LOCK_DEP
     // state for runtime lock validation when in irq context
     lockdep_state_t lock_state;
diff --git a/kernel/include/kernel/thread.h b/kernel/include/kernel/thread.h
index 86cae3a..802083e 100644
--- a/kernel/include/kernel/thread.h
+++ b/kernel/include/kernel/thread.h
@@ -12,6 +12,7 @@
 #include <arch/thread.h>
 #include <debug.h>
 #include <kernel/cpu.h>
+#include <kernel/fair_task_state.h>
 #include <kernel/spinlock.h>
 #include <kernel/wait.h>
 #include <list.h>
@@ -115,6 +116,10 @@
     cpu_num_t last_cpu;      // last cpu the thread ran on, INVALID_CPU if it's never run
     cpu_mask_t cpu_affinity; // mask of cpus that this thread can run on
 
+#if 1 || WITH_FAIR_SCHEDULER
+    FairTaskState fair_task_state;
+#endif
+
     // if blocked, a pointer to the wait queue
     struct wait_queue* blocking_wait_queue;
 
diff --git a/kernel/kernel/debug.cpp b/kernel/kernel/debug.cpp
index fa79650..8c2c46f 100644
--- a/kernel/kernel/debug.cpp
+++ b/kernel/kernel/debug.cpp
@@ -135,6 +135,10 @@
         printf("\tyields: %lu\n", percpu[i].stats.yields);
         printf("\ttimer interrupts: %lu\n", percpu[i].stats.timer_ints);
         printf("\ttimers: %lu\n", percpu[i].stats.timers);
+#if WITH_FAIR_SCHEDULER
+        printf("\ttotal weight: %d\n", percpu[i].fair_runqueue.GetTotalWeight().raw_value());
+        printf("\trunnable tasks: %zu\n", percpu[i].fair_runqueue.GetRunnableTasks());
+#endif
     }
 
     return 0;
@@ -262,6 +266,28 @@
     return 0;
 }
 
+#if WITH_FAIR_SCHEDULER
+static int cmd_threadq(int argc, const cmd_args* argv, uint32_t flags) {
+    static RecurringCallback callback([]() {
+        printf("----------------------------------------------------\n");
+        for (uint i = 0; i < SMP_MAX_CPUS; i++) {
+            Guard<spin_lock_t, NoIrqSave> thread_lock_guard{ThreadLock::Get()};
+
+            if (!mp_is_cpu_active(i)) {
+                continue;
+            }
+
+            printf("thread queue cpu %2u:\n", i);
+            percpu[i].fair_runqueue.Dump();
+        }
+        printf("\n");
+    });
+
+    callback.Toggle();
+
+    return 0;
+}
+#else
 static int cmd_threadq(int argc, const cmd_args* argv, uint32_t flags) {
     static RecurringCallback cb([]() {
         for (uint i = 0; i < SMP_MAX_CPUS; i++) {
@@ -286,3 +312,4 @@
 
     return 0;
 }
+#endif
diff --git a/kernel/kernel/fair_scheduler.cpp b/kernel/kernel/fair_scheduler.cpp
new file mode 100644
index 0000000..f218e2c
--- /dev/null
+++ b/kernel/kernel/fair_scheduler.cpp
@@ -0,0 +1,792 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <kernel/fair_scheduler.h>
+
+#include <assert.h>
+#include <debug.h>
+#include <err.h>
+#include <inttypes.h>
+#include <kernel/lockdep.h>
+#include <kernel/mp.h>
+#include <kernel/percpu.h>
+#include <kernel/sched.h>
+#include <kernel/thread.h>
+#include <kernel/thread_lock.h>
+#include <lib/ktrace.h>
+#include <list.h>
+#include <platform.h>
+#include <printf.h>
+#include <string.h>
+#include <target.h>
+#include <trace.h>
+#include <vm/vm.h>
+#include <zircon/types.h>
+
+#include <new>
+
+using ffl::Expression;
+using ffl::FromInteger;
+using ffl::FromRatio;
+using ffl::Max;
+using ffl::Round;
+using ffl::ToPrecision;
+
+// Enable/disable ktraces local to this file.
+#define LOCAL_KTRACE_ENABLE 0
+
+#define LOCAL_KTRACE(string, args...)                                \
+    ktrace_probe(LocalTrace<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu, \
+                 KTRACE_STRING_REF(string), ##args)
+
+#define LOCAL_KTRACE_DURATION \
+    TraceDuration<TraceEnabled<LOCAL_KTRACE_ENABLE>, TraceContext::Cpu>
+
+// Enable/disable console traces local to this file.
+#define LOCAL_TRACE 0
+
+#define SCHED_LTRACEF(str, args...) LTRACEF("[%d] " str, arch_curr_cpu_num(), ##args)
+#define SCHED_TRACEF(str, args...) TRACEF("[%d] " str, arch_curr_cpu_num(), ##args)
+
+namespace {
+
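+// Assuming LOWEST_PRIORITY == 0 and NUM_PRIORITIES == 32, kMinWeight is 1/32
+// and kReciprocalMinWeight is 32; the reciprocal normalizes virtual time
+// advances in UpdateThreadTimeline.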
+constexpr SchedWeight kMinWeight = FromRatio(LOWEST_PRIORITY + 1, NUM_PRIORITIES);
+constexpr SchedWeight kReciprocalMinWeight = 1 / kMinWeight;
+
+// On ARM64 with safe-stack, it's no longer possible to use the unsafe-sp
+// after set_current_thread (we'd now see newthread's unsafe-sp instead!).
+// Hence this function and everything it calls between this point and the
+// low-level context switch must be marked with __NO_SAFESTACK.
+__NO_SAFESTACK static void FinalContextSwitch(thread_t* oldthread,
+                                              thread_t* newthread) {
+    set_current_thread(newthread);
+    arch_context_switch(oldthread, newthread);
+}
+
+inline const char* ToString(enum thread_state state) {
+    switch (state) {
+    case THREAD_INITIAL:
+        return "initial";
+    case THREAD_SUSPENDED:
+        return "suspended";
+    case THREAD_READY:
+        return "ready";
+    case THREAD_RUNNING:
+        return "running";
+    case THREAD_BLOCKED:
+        return "blocked";
+    case THREAD_SLEEPING:
+        return "sleeping";
+    case THREAD_DEATH:
+        return "death";
+    default:
+        return "[unknown]";
+    }
+}
+
+inline void TraceContextSwitch(const thread_t* current_thread,
+                               const thread_t* next_thread, cpu_num_t current_cpu) {
+    const uintptr_t raw_current = reinterpret_cast<uintptr_t>(current_thread);
+    const uintptr_t raw_next = reinterpret_cast<uintptr_t>(next_thread);
+    const uint32_t current = static_cast<uint32_t>(raw_current);
+    const uint32_t next = static_cast<uint32_t>(raw_next);
+    const uint32_t user_tid = static_cast<uint32_t>(next_thread->user_tid);
+    const uint32_t context = current_cpu |
+                             (current_thread->state << 8) |
+                             (current_thread->base_priority << 16) |
+                             (next_thread->base_priority << 24);
+
+    ktrace(TAG_CONTEXT_SWITCH, user_tid, context, current, next);
+}
+
+} // anonymous namespace
+
+void FairScheduler::Dump() {
+    printf("\tweight_total=%x runnable_tasks=%d vtime=%ld period=%ld\n",
+           weight_total_.raw_value(), runnable_task_count_,
+           virtual_time_.raw_value(), scheduling_period_ns_.raw_value());
+
+    if (active_thread_ != nullptr) {
+        const FairTaskState* const state = &active_thread_->fair_task_state;
+        printf("\t-> name=%s weight=%x vstart=%ld vfinish=%ld time_slice_ns=%ld\n",
+               active_thread_->name,
+               state->effective_weight().raw_value(),
+               state->virtual_start_time_.raw_value(),
+               state->virtual_finish_time_.raw_value(),
+               state->time_slice_ns_.raw_value());
+    }
+
+    for (const thread_t& thread : run_queue_) {
+        const FairTaskState* const state = &thread.fair_task_state;
+        printf("\t   name=%s weight=%x vstart=%ld vfinish=%ld time_slice_ns=%ld\n",
+               thread.name,
+               state->effective_weight().raw_value(),
+               state->virtual_start_time_.raw_value(),
+               state->virtual_finish_time_.raw_value(),
+               state->time_slice_ns_.raw_value());
+    }
+}
+
+SchedWeight FairScheduler::GetTotalWeight() {
+    Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()};
+    return weight_total_;
+}
+
+size_t FairScheduler::GetRunnableTasks() {
+    Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()};
+    return static_cast<size_t>(runnable_task_count_);
+}
+
+FairScheduler* FairScheduler::Get() {
+    return Get(arch_curr_cpu_num());
+}
+
+FairScheduler* FairScheduler::Get(cpu_num_t cpu) {
+    return &percpu[cpu].fair_runqueue;
+}
+
+void FairScheduler::InitializeThread(thread_t* thread, SchedWeight weight) {
+    new (&thread->fair_task_state) FairTaskState{weight};
+}
+
+// Returns the next thread to execute.
+thread_t* FairScheduler::NextThread(thread_t* current_thread, bool timeslice_expired) {
+    const bool is_idle = thread_is_idle(current_thread);
+    const bool is_active = current_thread->state == THREAD_READY ||
+                           current_thread->state == THREAD_RUNNING;
+    const bool should_migrate = !(current_thread->cpu_affinity &
+                                  cpu_num_to_mask(arch_curr_cpu_num()));
+
+    if (should_migrate) {
+        current_thread->state = THREAD_READY;
+        Remove(current_thread);
+
+        const cpu_num_t target_cpu = FindTargetCpu(current_thread);
+        FairScheduler* target = Get(target_cpu);
+        target->Insert(CurrentTime(), current_thread);
+
+        mp_reschedule(cpu_num_to_mask(target_cpu), 0);
+    } else if (is_active && !is_idle) {
+        if (timeslice_expired) {
+            NextThreadTimeslice(current_thread);
+            UpdateThreadTimeline(current_thread);
+            QueueThread(current_thread);
+        } else {
+            return current_thread;
+        }
+    } else if (!is_active && !is_idle) {
+        Remove(current_thread);
+    }
+
+    if (!run_queue_.is_empty()) {
+        return run_queue_.pop_front();
+    } else {
+        const cpu_num_t current_cpu = arch_curr_cpu_num();
+        return &percpu[current_cpu].idle_thread;
+    }
+}
+
+cpu_num_t FairScheduler::FindTargetCpu(thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"find_target: cpu,avail"_stringref};
+
+    const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());
+    const cpu_mask_t last_cpu_mask = cpu_num_to_mask(thread->last_cpu);
+    const cpu_mask_t affinity_mask = thread->cpu_affinity;
+    const cpu_mask_t active_mask = mp_get_active_mask();
+    const cpu_mask_t idle_mask = mp_get_idle_mask();
+
+    // Threads may be created and resumed before the thread init level. Work around
+    // an empty active mask by assuming the current cpu is schedulable.
+    cpu_mask_t available_mask = active_mask != 0 ? affinity_mask & active_mask
+                                                 : current_cpu_mask;
+    DEBUG_ASSERT_MSG(available_mask != 0,
+                     "thread=%s affinity=%x active=%x idle=%x arch_ints_disabled=%d",
+                     thread->name, affinity_mask, active_mask, idle_mask, arch_ints_disabled());
+
+    LOCAL_KTRACE("target_mask: online,active", mp_get_online_mask(), active_mask);
+
+    cpu_num_t target_cpu;
+    cpu_num_t least_loaded_cpu;
+    FairScheduler* target_queue;
+    FairScheduler* least_loaded_queue;
+
+    // Select an initial target.
+    if (last_cpu_mask & available_mask) {
+        target_cpu = thread->last_cpu;
+    } else if (current_cpu_mask & available_mask) {
+        target_cpu = arch_curr_cpu_num();
+    } else {
+        target_cpu = lowest_cpu_set(available_mask);
+    }
+
+    target_queue = Get(target_cpu);
+    least_loaded_cpu = target_cpu;
+    least_loaded_queue = target_queue;
+
+    // See if there is a better target in the set of available CPUs.
+    // TODO(eieio): Replace this with a search in order of increasing cache
+    // distance when topology information is available.
+    while (available_mask != 0) {
+        if (target_queue->weight_total_ < least_loaded_queue->weight_total_) {
+            least_loaded_cpu = target_cpu;
+            least_loaded_queue = target_queue;
+        }
+
+        available_mask &= ~cpu_num_to_mask(target_cpu);
+        if (available_mask) {
+            target_cpu = lowest_cpu_set(available_mask);
+            target_queue = Get(target_cpu);
+        }
+    }
+
+    SCHED_LTRACEF("thread=%s target_cpu=%d\n", thread->name, least_loaded_cpu);
+    trace.End(least_loaded_cpu, available_mask);
+    return least_loaded_cpu;
+}
+
+void FairScheduler::UpdateTimeline(SchedTime now) {
+    LOCAL_KTRACE_DURATION trace{"update_vtime"_stringref};
+
+    const Expression runtime_ns = now - last_update_time_ns_;
+    last_update_time_ns_ = now;
+
+    if (weight_total_ > SchedWeight{FromInteger(0)}) {
+        virtual_time_ += runtime_ns;
+    }
+
+    trace.End(Round<uint64_t>(runtime_ns), Round<uint64_t>(virtual_time_));
+}
+
+void FairScheduler::RescheduleCommon(SchedTime now) {
+    LOCAL_KTRACE_DURATION trace{"reschedule_common"_stringref};
+
+    const cpu_num_t current_cpu = arch_curr_cpu_num();
+    thread_t* const current_thread = get_current_thread();
+
+    DEBUG_ASSERT(arch_ints_disabled());
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+    DEBUG_ASSERT_MSG(current_thread->state != THREAD_RUNNING, "state %d\n", current_thread->state);
+    DEBUG_ASSERT(!arch_blocking_disallowed());
+
+    CPU_STATS_INC(reschedules);
+
+    UpdateTimeline(now);
+
+    const SchedDuration actual_runtime_ns = now - last_reschedule_time_ns_;
+    last_reschedule_time_ns_ = now;
+
+    // Update the accounting for the thread that just ran.
+    current_thread->runtime_ns += actual_runtime_ns.raw_value();
+    const bool timeslice_expired = now >= absolute_deadline_ns_;
+
+    // Select a thread to run.
+    thread_t* const next_thread = NextThread(current_thread, timeslice_expired);
+    DEBUG_ASSERT(next_thread != nullptr);
+
+    const char* const front_name = run_queue_.is_empty() ? "[none]" : run_queue_.front().name;
+    SCHED_LTRACEF("current={%s, %s} next={%s, %s} expired=%d is_empty=%d front=%s\n",
+                  current_thread->name, ToString(current_thread->state),
+                  next_thread->name, ToString(next_thread->state),
+                  timeslice_expired, run_queue_.is_empty(), front_name);
+
+    // Update the state of the current and next thread.
+    current_thread->preempt_pending = false;
+    next_thread->state = THREAD_RUNNING;
+    next_thread->last_cpu = current_cpu;
+    next_thread->curr_cpu = current_cpu;
+
+    active_thread_ = next_thread;
+
+    if (next_thread != current_thread) {
+        // Re-compute the timeslice for the new thread based on the latest state.
+        NextThreadTimeslice(next_thread);
+    }
+
+    // Always call this, even if the thread does not change, to handle races
+    // between reschedule IPIs and changes to the run queue.
+    mp_prepare_current_cpu_idle_state(thread_is_idle(next_thread));
+
+    if (thread_is_idle(next_thread)) {
+        mp_set_cpu_idle(current_cpu);
+    } else {
+        mp_set_cpu_busy(current_cpu);
+    }
+
+    // The task is always non-realtime when managed by this scheduler.
+    // TODO(eieio): Revisit this when deadline scheduling is addressed.
+    mp_set_cpu_non_realtime(current_cpu);
+
+    if (thread_is_idle(current_thread)) {
+        percpu[current_cpu].stats.idle_time += actual_runtime_ns.raw_value();
+    }
+
+    if (thread_is_idle(next_thread) /*|| runnable_task_count_ == 1*/) {
+        LOCAL_KTRACE_DURATION trace{"stop_preemption"_stringref};
+        SCHED_LTRACEF("Stop preemption timer: current=%s next=%s\n",
+                      current_thread->name, next_thread->name);
+        timer_preempt_cancel();
+    } else if (timeslice_expired || next_thread != current_thread) {
+        LOCAL_KTRACE_DURATION trace{"start_preemption: now,deadline"_stringref};
+
+        // Update the preemption time based on the time slice.
+        FairTaskState* const next_state = &next_thread->fair_task_state;
+        absolute_deadline_ns_ = now + next_state->time_slice_ns_;
+
+        SCHED_LTRACEF("Start preemption timer: current=%s next=%s now=%ld deadline=%ld\n",
+                      current_thread->name, next_thread->name, now.raw_value(),
+                      absolute_deadline_ns_.raw_value());
+        timer_preempt_reset(absolute_deadline_ns_.raw_value());
+
+        trace.End(Round<uint64_t>(now), Round<uint64_t>(absolute_deadline_ns_));
+    }
+
+    if (next_thread != current_thread) {
+        LOCAL_KTRACE("reschedule current: count,slice",
+                     runnable_task_count_,
+                     Round<uint64_t>(current_thread->fair_task_state.time_slice_ns_));
+        LOCAL_KTRACE("reschedule next: wsum,slice",
+                     weight_total_.raw_value(),
+                     Round<uint64_t>(next_thread->fair_task_state.time_slice_ns_));
+
+        TraceContextSwitch(current_thread, next_thread, current_cpu);
+
+        // Blink the optional debug LEDs on the target.
+        target_set_debug_led(0, !thread_is_idle(next_thread));
+
+        SCHED_LTRACEF("current=(%s, flags 0x%x) next=(%s, flags 0x%x)\n",
+                      current_thread->name, current_thread->flags,
+                      next_thread->name, next_thread->flags);
+
+        if (current_thread->aspace != next_thread->aspace) {
+            vmm_context_switch(current_thread->aspace, next_thread->aspace);
+        }
+
+        CPU_STATS_INC(context_switches);
+        FinalContextSwitch(current_thread, next_thread);
+    }
+}
+
+void FairScheduler::UpdatePeriod() {
+    LOCAL_KTRACE_DURATION trace{"update_period"_stringref};
+
+    DEBUG_ASSERT(runnable_task_count_ >= 0);
+    DEBUG_ASSERT(minimum_granularity_ns_ > 0);
+    DEBUG_ASSERT(peak_latency_ns_ > 0);
+    DEBUG_ASSERT(target_latency_ns_ > 0);
+
+    const int64_t num_tasks = runnable_task_count_;
+    const int64_t peak_tasks = Round<int64_t>(peak_latency_ns_ / minimum_granularity_ns_);
+    const int64_t normal_tasks = Round<int64_t>(target_latency_ns_ / minimum_granularity_ns_);
+
+    // The scheduling period stretches when there are too many tasks to fit
+    // within the target latency.
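+    // For example (illustrative): with the default 750us minimum granularity
+    // and 6ms target latency, normal_tasks is 8; with 12 runnable tasks the
+    // period stretches to 12 * 750us = 9ms.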
+    const int64_t period_grans = num_tasks > normal_tasks ? num_tasks : normal_tasks;
+    scheduling_period_ns_ = period_grans * minimum_granularity_ns_;
+
+    SCHED_LTRACEF("num_tasks=%ld peak_tasks=%ld normal_tasks=%ld period_ns=%ld\n",
+                  num_tasks, peak_tasks, normal_tasks, scheduling_period_ns_.raw_value());
+
+    trace.End(Round<uint64_t>(scheduling_period_ns_), num_tasks);
+}
+
+void FairScheduler::NextThreadTimeslice(thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"next_timeslice: s,w"_stringref};
+
+    if (thread_is_idle(thread) || thread->state == THREAD_DEATH) {
+        return;
+    }
+
+    FairTaskState* const state = &thread->fair_task_state;
+
+    // Calculate the relative portion of the scheduling period.
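+    // For example (illustrative): with a 6ms scheduling period, a thread with
+    // effective weight 0.5 out of a total weight of 2.0 receives a 1.5ms
+    // time slice.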
+    state->time_slice_ns_ = scheduling_period_ns_ * state->effective_weight() / weight_total_;
+    DEBUG_ASSERT(state->time_slice_ns_ > 0);
+
+    SCHED_LTRACEF("name=%s weight_total=%x weight=%x time_slice_ns=%ld\n",
+                  thread->name,
+                  weight_total_.raw_value(),
+                  state->effective_weight().raw_value(),
+                  state->time_slice_ns_.raw_value());
+
+    trace.End(Round<uint64_t>(state->time_slice_ns_), state->effective_weight().raw_value());
+}
+
+void FairScheduler::UpdateThreadTimeline(thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"update_timeline: vs,vf"_stringref};
+
+    if (thread_is_idle(thread) || thread->state == THREAD_DEATH) {
+        return;
+    }
+
+    FairTaskState* const state = &thread->fair_task_state;
+
+    // Update virtual timeline.
+    state->virtual_start_time_ = Max(state->virtual_finish_time_, virtual_time_);
+
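+    // Advance the finish time by the time slice normalized by weight. For
+    // example (illustrative, assuming kReciprocalMinWeight == 32): a 1.5ms
+    // slice at weight 0.5 advances the finish time by
+    // 1.5ms / (32 * 0.5) = 93.75us.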
+    const SchedDuration delta_norm = state->time_slice_ns_ / (kReciprocalMinWeight * state->effective_weight());
+    state->virtual_finish_time_ = state->virtual_start_time_ + delta_norm;
+
+    DEBUG_ASSERT_MSG(state->virtual_start_time_ < state->virtual_finish_time_,
+                     "vstart=%ld vfinish=%ld delta_norm=%ld\n",
+                     state->virtual_start_time_.raw_value(),
+                     state->virtual_finish_time_.raw_value(),
+                     delta_norm.raw_value());
+
+    SCHED_LTRACEF("name=%s vstart=%ld vfinish=%ld lag=%ld vtime=%ld\n",
+                  thread->name,
+                  state->virtual_start_time_.raw_value(),
+                  state->virtual_finish_time_.raw_value(),
+                  state->lag_time_ns_.raw_value(),
+                  virtual_time_.raw_value());
+
+    trace.End(Round<uint64_t>(state->virtual_start_time_),
+              Round<uint64_t>(state->virtual_finish_time_));
+}
+
+void FairScheduler::QueueThread(thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"queue_thread"_stringref};
+
+    DEBUG_ASSERT(thread->state == THREAD_READY);
+    SCHED_LTRACEF("QueueThread: thread=%s\n", thread->name);
+
+    if (!thread_is_idle(thread)) {
+        thread->fair_task_state.generation_ = ++generation_count_;
+        run_queue_.insert(thread);
+        LOCAL_KTRACE("queue_thread");
+    }
+}
+
+void FairScheduler::Insert(SchedTime now, thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"insert"_stringref};
+
+    DEBUG_ASSERT(thread->state == THREAD_READY);
+    DEBUG_ASSERT(!thread_is_idle(thread));
+
+    FairTaskState* const state = &thread->fair_task_state;
+
+    // Ensure insertion happens only once, even if Unblock is called multiple times.
+    if (state->OnInsert()) {
+        runnable_task_count_++;
+        DEBUG_ASSERT(runnable_task_count_ != 0);
+
+        UpdateTimeline(now);
+        UpdatePeriod();
+
+        thread->curr_cpu = arch_curr_cpu_num();
+
+        // Factor this task into the run queue.
+        weight_total_ += state->effective_weight();
+        DEBUG_ASSERT(weight_total_ > SchedWeight{FromInteger(0)});
+        //virtual_time_ -= state->lag_time_ns_ / weight_total_;
+
+        NextThreadTimeslice(thread);
+        UpdateThreadTimeline(thread);
+        QueueThread(thread);
+    }
+}
+
+void FairScheduler::Remove(thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"remove"_stringref};
+
+    DEBUG_ASSERT(!thread_is_idle(thread));
+
+    FairTaskState* const state = &thread->fair_task_state;
+    DEBUG_ASSERT(!state->InQueue());
+
+    // Ensure that removal happens only once, even if Block() is called multiple times.
+    if (state->OnRemove()) {
+        DEBUG_ASSERT(runnable_task_count_ > 0);
+        runnable_task_count_--;
+
+        UpdatePeriod();
+
+        thread->curr_cpu = INVALID_CPU;
+
+        state->virtual_start_time_ = SchedNs(0);
+        state->virtual_finish_time_ = SchedNs(0);
+
+        // Factor this task out of the run queue.
+        //virtual_time_ += state->lag_time_ns_ / weight_total_;
+        weight_total_ -= state->effective_weight();
+        DEBUG_ASSERT(weight_total_ >= SchedWeight{FromInteger(0)});
+
+        SCHED_LTRACEF("name=%s weight_total=%x weight=%x lag_time_ns=%ld\n",
+                      thread->name,
+                      weight_total_.raw_value(),
+                      state->effective_weight().raw_value(),
+                      state->lag_time_ns_.raw_value());
+    }
+}
+
+void FairScheduler::Block() {
+    LOCAL_KTRACE_DURATION trace{"sched_block"_stringref};
+
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    thread_t* const current_thread = get_current_thread();
+
+    DEBUG_ASSERT(current_thread->magic == THREAD_MAGIC);
+    DEBUG_ASSERT(current_thread->state != THREAD_RUNNING);
+
+    const SchedTime now = CurrentTime();
+    SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+    FairScheduler::Get()->RescheduleCommon(now);
+}
+
+bool FairScheduler::Unblock(thread_t* thread) {
+    LOCAL_KTRACE_DURATION trace{"sched_unblock"_stringref};
+
+    DEBUG_ASSERT(thread->magic == THREAD_MAGIC);
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    const SchedTime now = CurrentTime();
+    SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, now.raw_value());
+
+    const cpu_num_t target_cpu = FindTargetCpu(thread);
+    FairScheduler* const target = Get(target_cpu);
+
+    thread->state = THREAD_READY;
+    target->Insert(now, thread);
+
+    if (target_cpu == arch_curr_cpu_num()) {
+        return true;
+    } else {
+        mp_reschedule(cpu_num_to_mask(target_cpu), 0);
+        return false;
+    }
+}
+
+bool FairScheduler::Unblock(list_node* list) {
+    LOCAL_KTRACE_DURATION trace{"sched_unblock_list"_stringref};
+
+    DEBUG_ASSERT(list);
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    const SchedTime now = CurrentTime();
+
+    cpu_mask_t cpus_to_reschedule_mask = 0;
+    thread_t* thread;
+    while ((thread = list_remove_tail_type(list, thread_t, queue_node)) != nullptr) {
+        DEBUG_ASSERT(thread->magic == THREAD_MAGIC);
+        DEBUG_ASSERT(!thread_is_idle(thread));
+
+        SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, now.raw_value());
+
+        const cpu_num_t target_cpu = FindTargetCpu(thread);
+        FairScheduler* const target = Get(target_cpu);
+
+        thread->state = THREAD_READY;
+        target->Insert(now, thread);
+
+        cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu);
+    }
+
+    // Issue reschedule IPIs to other CPUs.
+    if (cpus_to_reschedule_mask) {
+        mp_reschedule(cpus_to_reschedule_mask, 0);
+    }
+
+    // Return true if the current CPU is in the mask.
+    const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());
+    return cpus_to_reschedule_mask & current_cpu_mask;
+}
+
+void FairScheduler::UnblockIdle(thread_t* thread) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    DEBUG_ASSERT(thread_is_idle(thread));
+    DEBUG_ASSERT(thread->cpu_affinity && (thread->cpu_affinity & (thread->cpu_affinity - 1)) == 0);
+
+    SCHED_LTRACEF("thread=%s now=%ld\n", thread->name, current_time());
+
+    thread->state = THREAD_READY;
+    thread->curr_cpu = lowest_cpu_set(thread->cpu_affinity);
+}
+
+void FairScheduler::Yield() {
+    LOCAL_KTRACE_DURATION trace{"sched_yield"_stringref};
+
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    thread_t* current_thread = get_current_thread();
+    DEBUG_ASSERT(!thread_is_idle(current_thread));
+
+    const SchedTime now = CurrentTime();
+    SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+    current_thread->state = THREAD_READY;
+    Get()->RescheduleCommon(now);
+}
+
+void FairScheduler::Preempt() {
+    LOCAL_KTRACE_DURATION trace{"sched_preempt"_stringref};
+
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    thread_t* current_thread = get_current_thread();
+    const cpu_num_t current_cpu = arch_curr_cpu_num();
+
+    DEBUG_ASSERT(current_thread->curr_cpu == current_cpu);
+    DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu);
+
+    const SchedTime now = CurrentTime();
+    SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+    current_thread->state = THREAD_READY;
+    Get()->RescheduleCommon(now);
+}
+
+void FairScheduler::Reschedule() {
+    LOCAL_KTRACE_DURATION trace{"sched_reschedule"_stringref};
+
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    thread_t* current_thread = get_current_thread();
+    const cpu_num_t current_cpu = arch_curr_cpu_num();
+
+    if (current_thread->disable_counts != 0) {
+        current_thread->preempt_pending = true;
+        return;
+    }
+
+    DEBUG_ASSERT(current_thread->curr_cpu == current_cpu);
+    DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu);
+
+    const SchedTime now = CurrentTime();
+    SCHED_LTRACEF("current=%s now=%ld\n", current_thread->name, now.raw_value());
+
+    current_thread->state = THREAD_READY;
+    Get()->RescheduleCommon(now);
+}
+
+void FairScheduler::RescheduleInternal() {
+    Get()->RescheduleCommon(CurrentTime());
+}
+
+void FairScheduler::Migrate(thread_t* thread) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+    cpu_mask_t cpus_to_reschedule_mask = 0;
+
+    if (thread->state == THREAD_RUNNING) {
+        const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu);
+        if (!(thread->cpu_affinity & thread_cpu_mask)) {
+            // Mark the CPU the thread is running on for reschedule.
+            cpus_to_reschedule_mask |= thread_cpu_mask;
+        }
+    } else if (thread->state == THREAD_READY) {
+        const cpu_mask_t thread_cpu_mask = cpu_num_to_mask(thread->curr_cpu);
+        if (!(thread->cpu_affinity & thread_cpu_mask)) {
+            FairScheduler* current = Get(thread->curr_cpu);
+
+            DEBUG_ASSERT(thread->fair_task_state.InQueue());
+            current->run_queue_.erase(*thread);
+            current->Remove(thread);
+
+            const cpu_num_t target_cpu = FindTargetCpu(thread);
+            FairScheduler* const target = Get(target_cpu);
+            target->Insert(CurrentTime(), thread);
+
+            cpus_to_reschedule_mask |= cpu_num_to_mask(target_cpu);
+        }
+    }
+
+    if (cpus_to_reschedule_mask) {
+        mp_reschedule(cpus_to_reschedule_mask, 0);
+    }
+
+    const cpu_mask_t current_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());
+    if (cpus_to_reschedule_mask & current_cpu_mask) {
+        FairScheduler::Reschedule();
+    }
+}
+
+void FairScheduler::TimerTick(SchedTime now) {
+    LOCAL_KTRACE_DURATION trace{"sched_timer_tick"_stringref};
+
+    SchedTime absolute_deadline_ns;
+
+    {
+        FairScheduler* const queue = Get();
+        Guard<spin_lock_t, IrqSave> guard{ThreadLock::Get()};
+        absolute_deadline_ns = queue->absolute_deadline_ns_;
+    }
+
+    SCHED_LTRACEF("now=%ld deadline=%ld\n", now.raw_value(), absolute_deadline_ns.raw_value());
+
+    thread_preempt_set_pending();
+
+    trace.End(Round<uint64_t>(now), Round<uint64_t>(absolute_deadline_ns));
+}
+
+// Temporary compatibility with the thread layer.
+
+void sched_init_thread(thread_t* thread, int priority) {
+    FairScheduler::InitializeThread(thread, FromRatio(priority, NUM_PRIORITIES));
+    thread->base_priority = priority;
+}
+
+void sched_block() {
+    FairScheduler::Block();
+}
+
+bool sched_unblock(thread_t* thread) {
+    return FairScheduler::Unblock(thread);
+}
+
+bool sched_unblock_list(list_node* list) {
+    return FairScheduler::Unblock(list);
+}
+
+void sched_unblock_idle(thread_t* thread) {
+    FairScheduler::UnblockIdle(thread);
+}
+
+void sched_yield() {
+    FairScheduler::Yield();
+}
+
+void sched_preempt() {
+    FairScheduler::Preempt();
+}
+
+void sched_reschedule() {
+    FairScheduler::Reschedule();
+}
+
+void sched_resched_internal() {
+    FairScheduler::RescheduleInternal();
+}
+
+void sched_transition_off_cpu(cpu_num_t old_cpu) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+    DEBUG_ASSERT(old_cpu == arch_curr_cpu_num());
+
+    (void)old_cpu;
+}
+
+void sched_migrate(thread_t* thread) {
+    FairScheduler::Migrate(thread);
+}
+
+void sched_inherit_priority(thread_t* t, int pri, bool* local_resched) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+    (void)t;
+    (void)pri;
+    (void)local_resched;
+}
+
+void sched_change_priority(thread_t* t, int pri) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+    (void)t;
+    (void)pri;
+}
+
+void sched_preempt_timer_tick(zx_time_t now) {
+    FairScheduler::TimerTick(FromInteger(now));
+}
+
+void sched_init_early() {
+}
diff --git a/kernel/kernel/rules.mk b/kernel/kernel/rules.mk
index 3311371..5dc517d 100644
--- a/kernel/kernel/rules.mk
+++ b/kernel/kernel/rules.mk
@@ -16,6 +16,7 @@
 	kernel/lib/heap \
 	kernel/lib/libc \
 	kernel/lib/fbl \
+	kernel/lib/ffl \
 	kernel/lib/zircon-internal \
 	kernel/vm
 
@@ -28,9 +29,14 @@
 	$(LOCAL_DIR)/mp.cpp \
 	$(LOCAL_DIR)/mutex.cpp \
 	$(LOCAL_DIR)/percpu.cpp \
-	$(LOCAL_DIR)/sched.cpp \
 	$(LOCAL_DIR)/thread.cpp \
 	$(LOCAL_DIR)/timer.cpp \
 	$(LOCAL_DIR)/wait.cpp
 
+ifeq ($(call TOBOOL,$(ENABLE_FAIR_SCHEDULER)),true)
+MODULE_SRCS += $(LOCAL_DIR)/fair_scheduler.cpp
+else
+MODULE_SRCS += $(LOCAL_DIR)/sched.cpp
+endif
+
 include make/module.mk
diff --git a/make/engine.mk b/make/engine.mk
index 46394bb..b55f518 100644
--- a/make/engine.mk
+++ b/make/engine.mk
@@ -27,6 +27,7 @@
 ENABLE_NEW_BOOTDATA := true
 ENABLE_LOCK_DEP ?= false
 ENABLE_LOCK_DEP_TESTS ?= $(ENABLE_LOCK_DEP)
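+# Opt-in flag for the experimental fair scheduler; set to true (e.g. on the
+# make command line) to define WITH_FAIR_SCHEDULER and build it.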
+ENABLE_FAIR_SCHEDULER ?= false
 DISABLE_UTEST ?= false
 ENABLE_ULIB_ONLY ?= false
 USE_ASAN ?= false
@@ -428,6 +429,11 @@
 KERNEL_DEFINES += WITH_LOCK_DEP_TESTS=1
 endif
 
+# Scheduler experiment.
+ifeq ($(call TOBOOL,$(ENABLE_FAIR_SCHEDULER)),true)
+KERNEL_DEFINES += WITH_FAIR_SCHEDULER=1
+endif
+
 # additional bootdata items to be included to bootdata.bin
 ADDITIONAL_BOOTDATA_ITEMS :=