// Copyright 2018 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#ifndef ZIRCON_KERNEL_INCLUDE_KERNEL_SCHEDULER_H_
#define ZIRCON_KERNEL_INCLUDE_KERNEL_SCHEDULER_H_
#include <lib/ktrace/string_ref.h>
#include <lib/relaxed_atomic.h>
#include <platform.h>
#include <stdint.h>
#include <zircon/syscalls/scheduler.h>
#include <zircon/types.h>
#include <fbl/function.h>
#include <fbl/intrusive_pointer_traits.h>
#include <fbl/intrusive_wavl_tree.h>
#include <ffl/fixed.h>
#include <kernel/scheduler_state.h>
#include <kernel/thread.h>
#include <kernel/wait.h>
// Forward declaration.
struct percpu;
// Ensure this define has a value when not defined globally by the build system.
#ifndef SCHEDULER_TRACING_LEVEL
#define SCHEDULER_TRACING_LEVEL 0
#endif
// Ensure this define has a value when not defined globally by the build system.
#ifndef SCHEDULER_QUEUE_TRACING_ENABLED
#define SCHEDULER_QUEUE_TRACING_ENABLED false
#endif
// Performance scale of a CPU relative to the highest performance CPU in the
// system. The precision accommodates the 8-bit performance values available for
// ARM and x86.
using SchedPerformanceScale = ffl::Fixed<int32_t, 8>;
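// For example, a topology-reported performance value of 128 out of a maximum
// of 255 might be converted to a scale of roughly 0.5 along these lines
// (illustrative only; the actual conversion happens during topology/percpu
// initialization):
//
//   const SchedPerformanceScale scale = ffl::FromRatio(128, 255);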
// Implements fair and deadline scheduling algorithms and manages the associated
// per-CPU state.
class Scheduler {
public:
// Default minimum granularity of time slices.
static constexpr SchedDuration kDefaultMinimumGranularity = SchedMs(1);
// Default target latency for a scheduling period.
static constexpr SchedDuration kDefaultTargetLatency = SchedMs(8);
// The threshold for cross-cluster work stealing. Queues with an estimated
// runtime below this value are not stolen from if the target and destination
// CPUs are in different logical clusters. This tunable value approximates the
// cost of cross-cluster migration due to cache misses, assuming a task has
// high cache affinity in its current cluster.
static constexpr SchedDuration kInterClusterThreshold = SchedUs(75);
// The threshold for early termination when searching for a CPU to place a
// task. Queues with an estimated runtime below this value are sufficiently
// unloaded. This tunable value approximates the cost of intra-cluster
// migration due to cache misses, assuming a task has high cache affinity with
// the last CPU it ran on.
static constexpr SchedDuration kIntraClusterThreshold = SchedUs(25);
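// Illustrative use of the two thresholds above (a sketch with hypothetical
// locals, not the exact logic in FindTargetCpu or StealWork):
//
//   const SchedDuration queue_time_ns = candidate_predicted_queue_time_ns;
//   if (queue_time_ns < kIntraClusterThreshold) {
//     // Candidate CPU is unloaded enough; stop searching for a better one.
//   }
//   if (different_cluster && queue_time_ns < kInterClusterThreshold) {
//     // Too little work to justify stealing across clusters.
//   }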
// The per-CPU deadline utilization limit to attempt to honor when selecting a
// CPU to place a task. It is up to userspace to ensure that the total set of
// deadline tasks can honor this limit. Even when the total deadline
// utilization fits within the total available processor resources, a
// sufficiently high utilization may make it impossible to honor this per-CPU
// limit due to the bin packing problem. For example, three tasks with
// utilization 0.6 each cannot be packed onto two CPUs without one CPU
// exceeding a limit of 1, even though the total of 1.8 fits within 2.0.
static constexpr SchedUtilization kCpuUtilizationLimit{1};
// The maximum deadline utilization permitted for a single thread. This limit
// is applied when scaling the utilization of a deadline task to the relative
// performance of a candidate target processor -- placing a task on a
// processor that would cause the scaled thread utilization to exceed this
// value is avoided if possible.
static constexpr SchedUtilization kThreadUtilizationMax{1};
// The adjustment rates of the exponential moving averages tracking the
// expected runtimes of each thread.
static constexpr ffl::Fixed<int32_t, 2> kExpectedRuntimeAlpha = ffl::FromRatio(1, 4);
static constexpr ffl::Fixed<int32_t, 0> kExpectedRuntimeBeta = ffl::FromRatio(1, 1);
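// These coefficients nudge the estimate toward each new runtime sample, using
// one rate when the sample is above the estimate and the other when it is
// below. A generic sketch of such an update (which coefficient applies to
// which direction is the scheduler's choice and is not shown here):
//
//   const SchedDuration error_ns = sample_runtime_ns - expected_runtime_ns;
//   expected_runtime_ns += coefficient * error_ns;  // coefficient is alpha or beta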
Scheduler() = default;
~Scheduler() = default;
Scheduler(const Scheduler&) = delete;
Scheduler& operator=(const Scheduler&) = delete;
// Accessors for total weight and number of runnable tasks.
SchedWeight GetTotalWeight() const TA_EXCL(thread_lock);
size_t GetRunnableTasks() const TA_EXCL(thread_lock);
// Dumps the state of the run queue to the specified output target.
void Dump(FILE* output_target = stdout) TA_REQ(thread_lock);
// Returns the number of the CPU this scheduler instance is associated with.
cpu_num_t this_cpu() const { return this_cpu_; }
// Returns the index of the logical cluster of the CPU this scheduler instance
// is associated with.
size_t cluster() const { return cluster_; }
// Returns the lock-free value of the predicted queue time for the CPU this
// scheduler instance is associated with.
SchedDuration predicted_queue_time_ns() const {
return exported_total_expected_runtime_ns_.load();
}
// Returns the lock-free value of the predicted deadline utilization for the
// CPU this scheduler instance is associated with.
SchedUtilization predicted_deadline_utilization() const {
return exported_total_deadline_utilization_.load();
}
// Returns the performance scale of the CPU this scheduler instance is
// associated with.
SchedPerformanceScale performance_scale() const { return performance_scale_; }
// Returns the reciprocal performance scale of the CPU this scheduler instance
// is associated with.
SchedPerformanceScale performance_scale_reciprocal() const {
return performance_scale_reciprocal_;
}
// Returns a pointer to the currently running thread, if any.
Thread* active_thread() const TA_REQ(thread_lock) { return active_thread_; }
// Public entry points.
static void InitializeThread(Thread* thread, int priority);
static void InitializeThread(Thread* thread, const zx_sched_deadline_params_t& params);
static void Block() TA_REQ(thread_lock);
static void Yield() TA_REQ(thread_lock);
static void Preempt() TA_REQ(thread_lock);
static void Reschedule() TA_REQ(thread_lock);
static void RescheduleInternal() TA_REQ(thread_lock);
// Return true if the thread was placed on the current CPU's run queue.
// This usually means the caller should locally reschedule soon.
static bool Unblock(Thread* thread) __WARN_UNUSED_RESULT TA_REQ(thread_lock);
static bool Unblock(WaitQueueSublist thread_list) __WARN_UNUSED_RESULT TA_REQ(thread_lock);
static void UnblockIdle(Thread* idle_thread) TA_REQ(thread_lock);
static void Migrate(Thread* thread) TA_REQ(thread_lock);
static void MigrateUnpinnedThreads() TA_REQ(thread_lock);
// TimerTick is called when the preemption timer for a CPU has fired.
//
// This function is logically private and should only be called by timer.cc.
static void TimerTick(SchedTime now);
// Set the inherited priority of a thread.
//
// Updates a mask of affected CPUs along with a flag indicating whether a
// local reschedule is needed. After the caller has finished any batch update
// operations, it is their responsibility to trigger reschedule operations on
// the local CPU (if needed) as well as any other affected CPUs. This allows
// callers to batch update the state of several threads in a priority
// inheritance chain before finally rescheduling.
static void InheritPriority(Thread* t, int pri, bool* local_resched, cpu_mask_t* accum_cpu_mask)
TA_REQ(thread_lock);
// Set the priority of a thread and reset the boost value. This function might reschedule.
// pri must satisfy 0 <= pri <= MAX_PRIORITY.
static void ChangePriority(Thread* t, int pri) TA_REQ(thread_lock);
// Set the deadline of a thread. This function might reschedule.
// This requires: 0 < capacity <= relative_deadline <= period.
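// For example, a thread that needs up to 2ms of CPU out of every 10ms period,
// with its deadline equal to the period, might use (illustrative values only):
//
//   const zx_sched_deadline_params_t params = {
//       .capacity = ZX_MSEC(2), .relative_deadline = ZX_MSEC(10), .period = ZX_MSEC(10)};
//   Scheduler::ChangeDeadline(thread, params);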
static void ChangeDeadline(Thread* t, const zx_sched_deadline_params_t& params)
TA_REQ(thread_lock);
// Return the time at which the current thread should be preempted.
//
// May only be called with preemption disabled.
static zx_time_t GetTargetPreemptionTime() TA_EXCL(thread_lock);
private:
// Allow percpu to init our cpu number and performance scale.
friend struct percpu;
// Load balancer test.
friend struct LoadBalancerTestAccess;
// Allow tests to modify our state.
friend class LoadBalancerTest;
static void ChangeWeight(Thread* thread, int priority, cpu_mask_t* cpus_to_reschedule_mask)
TA_REQ(thread_lock);
static void ChangeDeadline(Thread* thread, const SchedDeadlineParams& params,
cpu_mask_t* cpus_to_reschedule_mask) TA_REQ(thread_lock);
static void InheritWeight(Thread* thread, int priority, cpu_mask_t* cpus_to_reschedule_mask)
TA_REQ(thread_lock);
// Specifies how to associate a thread with a Scheduler instance, update
// metadata, and whether/where to place the thread in a run queue.
enum class Placement {
// Selects a place in the queue based on the current insertion time and
// thread weight or deadline.
Insertion,
// Selects a place in the queue based on the original insertion time and
// the updated (inherited or changed) weight or deadline on the same CPU.
Adjustment,
// Selects a place in the queue based on the original insertion time and
// the updated time slice due to being preempted by another thread.
Preemption,
// Selects a place in the queue based on the insertion time in the original
// queue adjusted for the new queue.
Migration,
// Updates the metadata to account for a stolen thread that was just taken
// from a different queue. This is distinct from Migration in that the
// thread is not also enqueued; it is run immediately by the new CPU.
Association,
};
// Returns the current system time as a SchedTime value.
static SchedTime CurrentTime() { return SchedTime{current_time()}; }
// Returns the Scheduler instance for the current CPU.
static Scheduler* Get();
// Returns the Scheduler instance for the given CPU.
static Scheduler* Get(cpu_num_t cpu);
// Returns a CPU to run the given thread on.
static cpu_num_t FindTargetCpu(Thread* thread) TA_REQ(thread_lock);
// Updates the system load metrics.
void UpdateCounters(SchedDuration queue_time_ns) TA_REQ(thread_lock);
// Updates the thread's weight and updates state-dependent bookkeeping.
static void UpdateWeightCommon(Thread* thread, int original_priority, SchedWeight weight,
cpu_mask_t* cpus_to_reschedule_mask, PropagatePI propagate)
TA_REQ(thread_lock);
// Updates the thread's deadline and updates state-dependent bookkeeping.
static void UpdateDeadlineCommon(Thread* thread, int original_priority,
const SchedDeadlineParams& params,
cpu_mask_t* cpus_to_reschedule_mask, PropagatePI propagate)
TA_REQ(thread_lock);
using EndTraceCallback = fbl::InlineFunction<void(), sizeof(void*)>;
// Common logic for reschedule API.
void RescheduleCommon(SchedTime now, EndTraceCallback end_outer_trace = nullptr)
TA_REQ(thread_lock);
// Evaluates the schedule and returns the thread that should execute,
// updating the run queue as necessary.
Thread* EvaluateNextThread(SchedTime now, Thread* current_thread, bool timeslice_expired,
SchedDuration scaled_total_runtime_ns) TA_REQ(thread_lock);
// Adds a thread to the run queue tree. The thread must be active on this
// CPU.
void QueueThread(Thread* thread, Placement placement, SchedTime now = SchedTime{0},
SchedDuration scaled_total_runtime_ns = SchedDuration{0}) TA_REQ(thread_lock);
// Removes the thread at the head of the first eligible run queue.
Thread* DequeueThread(SchedTime now) TA_REQ(thread_lock);
// Removes the thread at the head of the fair run queue and returns it.
Thread* DequeueFairThread() TA_REQ(thread_lock);
// Removes the eligible thread with the earliest deadline in the deadline run
// queue and returns it.
Thread* DequeueDeadlineThread(SchedTime eligible_time) TA_REQ(thread_lock);
// Returns the eligible thread in the run queue with a deadline earlier than
// the given deadline, or nullptr if one does not exist.
Thread* FindEarlierDeadlineThread(SchedTime eligible_time, SchedTime finish_time)
TA_REQ(thread_lock);
// Removes the eligible thread with a deadline earlier than the given deadline
// and returns it, or nullptr if one does not exist.
Thread* DequeueEarlierDeadlineThread(SchedTime eligible_time, SchedTime finish_time)
TA_REQ(thread_lock);
// Attempts to steal work from other busy CPUs. Returns nullptr if no work was
// stolen, otherwise returns a pointer to the stolen thread that is now
// associated with the local Scheduler instance.
Thread* StealWork(SchedTime now) TA_REQ(thread_lock);
// Returns the time at which the next deadline task will become eligible, or an
// infinite time if there are no ready deadline tasks.
SchedTime GetNextEligibleTime() TA_REQ(thread_lock);
// Calculates the timeslice of the thread based on the current run queue
// state.
SchedDuration CalculateTimeslice(Thread* thread) TA_REQ(thread_lock);
// Returns the completion time clamped to the start of the earliest deadline
// thread that will become eligible in that time frame.
SchedTime ClampToDeadline(SchedTime completion_time) TA_REQ(thread_lock);
// Returns the completion time clamped to the start of the earliest deadline
// thread that will become eligible in that time frame and also has an earlier
// deadline than the given finish time.
SchedTime ClampToEarlierDeadline(SchedTime completion_time, SchedTime finish_time)
TA_REQ(thread_lock);
// Updates the timeslice of the thread based on the current run queue state.
// Returns the absolute deadline for the next time slice, which may be earlier
// than the completion of the time slice if other threads could preempt the
// given thread before the time slice is exhausted.
SchedTime NextThreadTimeslice(Thread* thread, SchedTime now) TA_REQ(thread_lock);
// Updates the scheduling period based on the number of active threads.
void UpdatePeriod() TA_REQ(thread_lock);
// Updates the global virtual timeline.
void UpdateTimeline(SchedTime now) TA_REQ(thread_lock);
// Makes a thread active on this CPU's scheduler and inserts it into the
// run queue tree.
void Insert(SchedTime now, Thread* thread, Placement placement = Placement::Insertion)
TA_REQ(thread_lock);
// Removes the thread from this CPU's scheduler. The thread must not be in
// the run queue tree.
void Remove(Thread* thread) TA_REQ(thread_lock);
// Returns true if there is at least one eligible deadline thread in the
// run queue.
inline bool IsDeadlineThreadEligible(SchedTime eligible_time) TA_REQ(thread_lock) {
return !deadline_run_queue_.is_empty() &&
deadline_run_queue_.front().scheduler_state().start_time_ <= eligible_time;
}
// Updates the total expected runtime estimator and exports the atomic shadow
// variable for cross-CPU readers.
inline void UpdateTotalExpectedRuntime(SchedDuration delta_ns) TA_REQ(thread_lock);
// Updates the total deadline utilization estimator and exports the atomic
// shadow variable for cross-CPU readers.
inline void UpdateTotalDeadlineUtilization(SchedUtilization delta_ns) TA_REQ(thread_lock);
// Utilities to scale up or down the given value by the performance scale of the CPU.
template <typename T>
inline T ScaleUp(T value) const;
template <typename T>
inline T ScaleDown(T value) const;
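// Worked example (hypothetical values): with performance_scale_ == 0.5, one
// millisecond of work measured on the reference (fastest) CPU corresponds to
// roughly two milliseconds of wall time on this CPU; ScaleUp and ScaleDown
// wrap the two directions of that conversion using performance_scale_ and
// performance_scale_reciprocal_.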
// Updates trace counters that track the total number of runnable threads for a CPU.
inline void TraceTotalRunnableThreads() const TA_REQ(thread_lock);
// Returns a new flow id when flow tracing is enabled, zero otherwise.
inline static uint64_t NextFlowId();
// Traits type to adapt the WAVLTree to Thread with node state in the
// scheduler_state member.
struct TaskTraits {
using KeyType = SchedulerState::KeyType;
static KeyType GetKey(const Thread& thread) { return thread.scheduler_state().key(); }
static bool LessThan(KeyType a, KeyType b) { return a < b; }
static bool EqualTo(KeyType a, KeyType b) { return a == b; }
static auto& node_state(Thread& thread) { return thread.scheduler_state().run_queue_node_; }
};
// Observer that maintains the subtree invariant min_finish_time as nodes are
// added to and removed from the run queue.
struct SubtreeMinObserver {
template <typename Iter>
static void RecordInsert(Iter node);
template <typename Iter>
static void RecordInsertCollision(Thread* node, Iter collision);
template <typename Iter>
static void RecordInsertReplace(Iter node, Thread* replacement);
template <typename Iter>
static void RecordInsertTraverse(Thread* node, Iter ancestor);
template <typename Iter>
static void RecordRotation(Iter pivot, Iter lr_child, Iter rl_child, Iter parent, Iter sibling);
template <typename Iter>
static void RecordErase(Thread* node, Iter invalidated);
static void RecordInsertPromote() {}
static void RecordInsertRotation() {}
static void RecordInsertDoubleRotation() {}
static void RecordEraseDemote() {}
static void RecordEraseRotation() {}
static void RecordEraseDoubleRotation() {}
};
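// Informally, the invariant maintained for each node n in the run queue is:
//
//   n.min_finish_time == min(n.finish_time,
//                            left(n).min_finish_time,
//                            right(n).min_finish_time)
//
// which lets eligible-thread searches prune whole subtrees by their minimum
// finish time instead of visiting every node. (A sketch of the intent; the
// exact bookkeeping lives in the observer hooks and SchedulerState.)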
// Alias of the WAVLTree type for the run queue.
using RunQueue = fbl::WAVLTree<SchedTime, Thread*, TaskTraits, fbl::DefaultObjectTag, TaskTraits,
SubtreeMinObserver>;
// Finds the next eligible thread in the given run queue.
static Thread* FindEarliestEligibleThread(RunQueue* run_queue, SchedTime eligible_time)
TA_REQ(thread_lock);
// Finds the next eligible thread in the given run queue that also passes the
// given predicate.
template <typename Predicate>
static Thread* FindEarliestEligibleThread(RunQueue* run_queue, SchedTime eligible_time,
Predicate&& predicate) TA_REQ(thread_lock);
// Returns the run queue for the given thread's scheduling discipline.
inline RunQueue& GetRunQueue(Thread* thread) {
return thread->scheduler_state().discipline() == SchedDiscipline::Fair ? fair_run_queue_
: deadline_run_queue_;
}
// Emits queue event tracers for trace-based scheduler performance analysis.
inline void TraceThreadQueueEvent(StringRef* name, Thread* thread) TA_REQ(thread_lock);
// The run queue of fair scheduled threads ready to run, but not currently running.
TA_GUARDED(thread_lock)
RunQueue fair_run_queue_;
// The run queue of deadline scheduled threads ready to run, but not currently running.
TA_GUARDED(thread_lock)
RunQueue deadline_run_queue_;
// Pointer to the thread actively running on this CPU.
TA_GUARDED(thread_lock)
Thread* active_thread_{nullptr};
// Monotonically increasing counter to break ties when queuing tasks with
// the same key. This has the effect of placing newly queued tasks behind
// already queued tasks with the same key. This is also necessary to
// guarantee uniqueness of the key as required by the WAVLTree container.
TA_GUARDED(thread_lock)
uint64_t generation_count_{0};
// Count of the runnable fair threads on this CPU, including threads in the run
// queue and the currently running thread. Does not include the idle thread.
TA_GUARDED(thread_lock)
int32_t runnable_fair_task_count_{0};
// Count of the runnable deadline threads on this CPU, including threads in the
// run queue and the currently running thread. Does not include the idle
// thread.
TA_GUARDED(thread_lock)
int32_t runnable_deadline_task_count_{0};
// Total weight of the runnable threads on this CPU, including threads in the
// run queue and the currently running thread. Does not include the idle
// thread.
TA_GUARDED(thread_lock)
SchedWeight weight_total_{0};
// The value of |weight_total_| when the current thread was scheduled.
// Provides a reference for determining whether the total weights changed
// since the last reschedule.
TA_GUARDED(thread_lock)
SchedWeight scheduled_weight_total_{0};
// The global virtual time of this run queue.
TA_GUARDED(thread_lock)
SchedTime virtual_time_{0};
// The system time at which the global virtual time was last updated.
TA_GUARDED(thread_lock)
SchedTime last_update_time_ns_{0};
// The system time that the current time slice started.
TA_GUARDED(thread_lock)
SchedTime start_of_current_time_slice_ns_{0};
// The system time that the current thread should be preempted. Initialized to
// ZX_TIME_INFINITE to pass the assertion now < target_preemption_time_ns_ (or
// else the current time slice is expired) on the first entry into the
// scheduler.
TA_GUARDED(thread_lock)
SchedTime target_preemption_time_ns_{ZX_TIME_INFINITE};
// The sum of the expected runtimes of all active threads on this CPU. This
// value is an estimate of the average queuing time for this CPU, given the
// current set of active threads.
TA_GUARDED(thread_lock)
SchedDuration total_expected_runtime_ns_{0};
// The sum of the worst case utilization of all active deadline threads on
// this CPU.
TA_GUARDED(thread_lock)
SchedUtilization total_deadline_utilization_{0};
// Scheduling period, in units of minimum granularity, in which every runnable
// task executes once.
TA_GUARDED(thread_lock)
SchedDuration scheduling_period_grans_{kDefaultTargetLatency / kDefaultMinimumGranularity};
// The smallest timeslice a thread is allocated in a single round.
TA_GUARDED(thread_lock)
SchedDuration minimum_granularity_ns_{kDefaultMinimumGranularity};
// The target scheduling period. The scheduling period is set to this value
// when the number of tasks is low enough for the sum of all timeslices to
// fit within this duration. This has the effect of increasing the size of
// the timeslices under nominal load to reduce scheduling overhead.
TA_GUARDED(thread_lock)
SchedDuration target_latency_grans_{kDefaultTargetLatency / kDefaultMinimumGranularity};
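// For example, with the defaults above the target latency is 8 granularities
// (8ms / 1ms): while 8 or fewer fair threads are runnable they share an 8ms
// period, and beyond that the period grows so each thread still receives at
// least the minimum granularity, roughly:
//
//   scheduling_period_grans ~= max(runnable_fair_task_count, target_latency_grans)
//
// (A description of the intent; see UpdatePeriod for the actual computation.)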
// Performance scale of this CPU relative to the highest performance CPU. This
// value is determined from the system topology, when available.
SchedPerformanceScale performance_scale_{1};
SchedPerformanceScale performance_scale_reciprocal_{1};
// The CPU this scheduler instance is associated with.
// NOTE: This member is not initialized to prevent clobbering the value set
// by sched_early_init(), which is called before the global ctors that
// initialize the rest of the members of this class.
// TODO(eieio): Figure out a better long-term solution to determine which
// CPU is associated with each instance of this class. This is needed by
// non-static methods that are called from arbitrary CPUs, namely Insert().
cpu_num_t this_cpu_;
// The index of the logical cluster this CPU belongs to. CPUs with the same
// logical cluster index have the best chance of good cache affinity with
// respect to load distribution decisions.
size_t cluster_{0};
// Values exported for lock-free access across CPUs. These are mirrors of the
// members of the same name without the exported_ prefix. This avoids
// unnecessary atomic loads when updating the values using arithmetic
// operations on the local CPU. Other CPUs read these values atomically but
// never write them.
// TODO(eieio): Look at cache line alignment for these members to optimize
// cache performance.
RelaxedAtomic<SchedDuration> exported_total_expected_runtime_ns_{SchedNs(0)};
RelaxedAtomic<SchedUtilization> exported_total_deadline_utilization_{SchedUtilization{0}};
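// Illustrative update pattern for these mirrors (a sketch; the real logic is
// in UpdateTotalExpectedRuntime and UpdateTotalDeadlineUtilization):
//
//   total_expected_runtime_ns_ += delta_ns;  // plain arithmetic under the thread_lock
//   exported_total_expected_runtime_ns_.store(total_expected_runtime_ns_);  // lock-free mirror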
// Flow id counter for sched_latency flow events.
inline static RelaxedAtomic<uint64_t> next_flow_id_{1};
};
#endif // ZIRCON_KERNEL_INCLUDE_KERNEL_SCHEDULER_H_