| // Copyright 2018 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| #ifndef ZIRCON_KERNEL_INCLUDE_KERNEL_SCHEDULER_H_ |
| #define ZIRCON_KERNEL_INCLUDE_KERNEL_SCHEDULER_H_ |
| |
| #include <lib/fit/function.h> |
| #include <lib/fxt/interned_string.h> |
| #include <lib/relaxed_atomic.h> |
| #include <lib/zircon-internal/macros.h> |
| #include <platform.h> |
| #include <stdint.h> |
| #include <zircon/syscalls/scheduler.h> |
| #include <zircon/syscalls/system.h> |
| #include <zircon/types.h> |
| |
| #include <fbl/intrusive_pointer_traits.h> |
| #include <fbl/intrusive_wavl_tree.h> |
| #include <fbl/wavl_tree_best_node_observer.h> |
| #include <ffl/fixed.h> |
| #include <kernel/auto_lock.h> |
| #include <kernel/owned_wait_queue.h> |
| #include <kernel/scheduler_state.h> |
| #include <kernel/spinlock.h> |
| #include <kernel/thread.h> |
| #include <kernel/wait.h> |
| |
| // Forward declarations. |
| struct percpu; |
| |
| namespace kconcurrent { |
| struct RescheduleContext; |
| struct SchedulerUtils; |
| } // namespace kconcurrent |
| |
| // Ensure this define has a value when not defined globally by the build system. |
| #ifndef SCHEDULER_TRACING_LEVEL |
| #define SCHEDULER_TRACING_LEVEL 0 |
| #endif |
| |
| // Ensure this define has a value when not defined globally by the build system. |
| #ifndef SCHEDULER_QUEUE_TRACING_ENABLED |
| #define SCHEDULER_QUEUE_TRACING_ENABLED false |
| #endif |
| |
| // Performance scale of a CPU relative to the highest performance CPU in the |
| // system. |
| using SchedPerformanceScale = ffl::Fixed<int64_t, 31>; |
| |
| // Converts a userspace CPU performance scale to a SchedPerformanceScale value. |
| constexpr SchedPerformanceScale ToSchedPerformanceScale(zx_cpu_performance_scale_t value) { |
| const size_t FractionalBits = sizeof(value.fractional_part) * 8; |
| return ffl::FromRaw<FractionalBits>(uint64_t{value.integral_part} << FractionalBits | |
| value.fractional_part); |
| } |
| |
| // Converts a SchedPerformanceScale value to a userspace CPU performance scale. |
| constexpr zx_cpu_performance_scale_t ToUserPerformanceScale(SchedPerformanceScale value) { |
| using UserScale = ffl::Fixed<uint64_t, 32>; |
| const UserScale user_scale{value}; |
| const uint64_t integral = user_scale.Integral().raw_value() >> UserScale::Format::FractionalBits; |
| const uint64_t fractional = user_scale.Fraction().raw_value(); |
| return {.integral_part = uint32_t(integral), .fractional_part = uint32_t(fractional)}; |
| } |
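| |
| // For example, a userspace scale of 1.5 is encoded as |
| // {.integral_part = 1, .fractional_part = 0x8000'0000}; the sketch below is |
| // an illustrative (not compiled) round trip of that value through the two |
| // helpers above: |
| // |
| //   constexpr zx_cpu_performance_scale_t kUser{.integral_part = 1, |
| //                                              .fractional_part = 0x8000'0000}; |
| //   constexpr SchedPerformanceScale kSched = ToSchedPerformanceScale(kUser);  // 1.5 |
| //   // ToUserPerformanceScale(kSched) yields {1, 0x8000'0000} again. |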
| |
| // Implements fair and deadline scheduling algorithms and manages the associated |
| // per-CPU state. |
| class Scheduler { |
| public: |
| // Default minimum granularity of time slices. |
| static constexpr SchedDuration kDefaultMinimumGranularity = SchedMs(1); |
| |
| // Default target latency for a scheduling period. |
| static constexpr SchedDuration kDefaultTargetLatency = SchedMs(8); |
| |
| // The threshold for cross-cluster work stealing. Queues with an estimated |
| // runtime below this value are not stolen from if the target and destination |
| // CPUs are in different logical clusters. In a performance-balanced system, |
| // this tunable value approximates the cost of cross-cluster migration due to |
| // cache misses, assuming a task has high cache affinity in its current |
| // cluster. This tunable may be increased to limit cross-cluster spillover. |
| static constexpr SchedDuration kInterClusterThreshold = SchedMs(2); |
| |
| // The threshold for early termination when searching for a CPU to place a |
| // task. Queues with an estimated runtime below this value are sufficiently |
| // unloaded. In a performance-balanced system, this tunable value approximates |
| // the cost of intra-cluster migration due to cache misses, assuming a task |
| // has high cache affinity with the last CPU it ran on. This tunable may be |
| // increased to limit intra-cluster spillover. |
| static constexpr SchedDuration kIntraClusterThreshold = SchedUs(25); |
| |
| // The per-CPU deadline utilization limit to attempt to honor when selecting a |
| // CPU to place a task. It is up to userspace to ensure that the total set of |
| // deadline tasks can honor this limit. Even if userspace ensures the total |
| // set of deadline utilizations is within the total available processor |
| // resources, when total utilization is high enough it may not be possible to |
| // honor this limit due to the bin packing problem. |
| static constexpr SchedUtilization kCpuUtilizationLimit{1}; |
| |
| // The maximum deadline utilization permitted for a single thread. This limit |
| // is applied when scaling the utilization of a deadline task to the relative |
| // performance of a candidate target processor -- placing a task on a |
| // processor that would cause the scaled thread utilization to exceed this |
| // value is avoided if possible. |
| static constexpr SchedUtilization kThreadUtilizationMax{1}; |
| |
| // The adjustment rates of the exponential moving averages tracking the |
| // expected runtimes of each thread. |
| static constexpr ffl::Fixed<int32_t, 2> kExpectedRuntimeAlpha = ffl::FromRatio(1, 4); |
| static constexpr ffl::Fixed<int32_t, 0> kExpectedRuntimeBeta = ffl::FromRatio(1, 1); |
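| // |
| // These coefficients feed a standard exponential moving average update of |
| // the form |
| // |
| //   estimate += rate * (sample - estimate) |
| // |
| // A plausible reading of the two rates (an assumption for illustration, not |
| // a statement of the implementation): beta (1/1) is applied when a sample |
| // exceeds the current estimate, so the estimate tracks load spikes |
| // immediately, while alpha (1/4) is applied when a sample falls below it, so |
| // the estimate decays gradually. |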
| |
| Scheduler() = default; |
| ~Scheduler() = default; |
| |
| Scheduler(const Scheduler&) = delete; |
| Scheduler& operator=(const Scheduler&) = delete; |
| |
| // Accessors for total weight and number of runnable tasks. |
| SchedWeight GetTotalWeight() const TA_EXCL(queue_lock_); |
| size_t GetRunnableTasks() const TA_EXCL(queue_lock_); |
| |
| // Dumps the state of the run queue to the specified output target. Note that |
| // interrupts must be disabled in order to call this method. |
| void Dump(FILE* output_target = stdout) TA_EXCL(queue_lock_); |
| |
| // Dumps info about the currently active thread (if any). Note that |
| // interrupts must be disabled in order to call this method. |
| void DumpActiveThread(FILE* output_target = stdout) TA_EXCL(queue_lock_); |
| |
| // Returns the number of the CPU this scheduler instance is associated with. |
| cpu_num_t this_cpu() const { return this_cpu_; } |
| |
| // Returns the index of the logical cluster of the CPU this scheduler instance |
| // is associated with. |
| size_t cluster() const { return cluster_; } |
| |
| // Returns the lock-free value of the predicted queue time for the CPU this |
| // scheduler instance is associated with. |
| SchedDuration predicted_queue_time_ns() const { |
| return exported_total_expected_runtime_ns_.load(); |
| } |
| |
| // Returns the lock-free value of the predicted deadline utilization for the |
| // CPU this scheduler instance is associated with. |
| SchedUtilization predicted_deadline_utilization() const { |
| return exported_total_deadline_utilization_.load(); |
| } |
| |
| // Returns the performance scale of the CPU this scheduler instance is |
| // associated with. |
| SchedPerformanceScale performance_scale() const TA_REQ(queue_lock_) { return performance_scale_; } |
| |
| // Returns the reciprocal performance scale of the CPU this scheduler instance |
| // is associated with. |
| SchedPerformanceScale performance_scale_reciprocal() const TA_REQ(queue_lock_) { |
| return performance_scale_reciprocal_; |
| } |
| |
| // Returns a pointer to the currently running thread, if any. |
| // |
| // TODO(johngro): This is not particularly safe. As soon as we return this |
| // thread's pointer, it could (in theory) exit. We probably need to require |
| // that the scheduler's queue lock be held in order to peek at the active |
| // thread member. |
| Thread* active_thread() const TA_EXCL(queue_lock_) { |
| Guard<MonitoredSpinLock, IrqSave> guard{&queue_lock_, SOURCE_TAG}; |
| return active_thread_; |
| } |
| |
| // Public entry points. |
| |
| // See the comment in Thread::CreateEtc for why InitializeThread does not take |
| // a stance on whether or not we should be in an active chain-lock |
| // transaction. On the surface, demanding that we hold a thread's lock without |
| // also demanding that we be involved in an active chain-lock transaction |
| // is an apparent contradiction of some fundamental invariants (how can a |
| // chain lock be held without being in a transaction?), but when used |
| // exclusively from Thread::CreateEtc, it should be OK. |
| static void InitializeThread(Thread* thread, const SchedulerState::BaseProfile& profile) |
| TA_REQ(thread->get_lock()); |
| static void InitializeFirstThread(Thread* thread) |
| TA_REQ(chainlock_transaction_token, thread->get_lock()); |
| static void RemoveFirstThread(Thread* thread) |
| TA_REQ(chainlock_transaction_token, thread->get_lock()); |
| static void Block(Thread* const current_thread) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| static void Yield(Thread* const current_thread) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| |
| // Note: no locks should be held when calling Preempt. The current thread's lock will be obtained |
| // unconditionally in the process. |
| static void Preempt() TA_EXCL(chainlock_transaction_token); |
| |
| static void Reschedule(Thread* const current_thread) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| static void RescheduleInternal(Thread* const current_thread) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| |
| // Finish unblocking a thread. This function expects the thread to be locked |
| // when called. It will: |
| // |
| // 1) Select a scheduler compatible with the thread. |
| // 2) Place the thread into that scheduler's run queue. |
| // 3) Drop the thread's lock. |
| // 4) Reschedule if needed/permitted. |
| // |
| static void Unblock(Thread* thread) TA_REQ(chainlock_transaction_token) |
| TA_REL(thread->get_lock()); |
| |
| // This overload of Unblock expects to receive a list of threads, all of whose |
| // locks are currently held. It will drop each thread's lock after |
| // successfully assigning it to a scheduler. |
| static void Unblock(Thread::UnblockList thread_list) TA_REQ(chainlock_transaction_token); |
| |
| // UnblockIdle is used in the process of creating the idle thread. It |
| // simply asserts that the thread is (in fact) flagged as the idle thread, and |
| // that it has hard affinity for exactly one CPU (its CPU). It then sets the |
| // thread to be ready, and ensures that its curr_cpu_ state is correct. |
| static void UnblockIdle(Thread* idle_thread) |
| TA_REQ(chainlock_transaction_token, idle_thread->get_lock()); |
| |
| // Called when something has changed about a thread which may result in it |
| // needing to be migrated to a different CPU. In particular, a change to |
| // either its soft or hard affinity mask. |
| static void Migrate(Thread* thread) TA_REQ(chainlock_transaction_token) |
| TA_REL(thread->get_lock()); |
| static void MigrateUnpinnedThreads(); |
| |
| // TimerTick is called when the preemption timer for a CPU has fired. |
| // |
| // This function is logically private and should only be called by timer.cc. |
| static void TimerTick(SchedTime now); |
| |
| // Releases the lock held by the previous and current threads after a context |
| // switch. This must be called by trampoline routines at some point before |
| // jumping to the current thread's entry point. |
| static void TrampolineLockHandoff(); |
| |
| // Called when the base profile of |thread| has been changed by a program. The |
| // thread must be either running or runnable when this method is called. If |
| // the thread had been blocked instead, the change of the base profile would |
| // have been managed at the WaitQueue level, perhaps eventually propagating to |
| // the scheduler via UpstreamThreadBaseProfileChanged. |
| static void ThreadBaseProfileChanged(Thread& thread) |
| TA_REQ(chainlock_transaction_token, thread.get_lock(), preempt_disabled_token); |
| |
| // Called when the base profile of a thread (|upstream|) which exists upstream |
| // of |target| has changed. The rules regarding scheduling penalties for base |
| // profile changes may be slightly different when a thread's effective profile |
| // changes as a result of its own base profile changing vs. a change to the |
| // base profile of a thread which exists upstream of it in a PI graph. Because |
| // of this, the two scenarios are handled separately using two different |
| // methods. |
| template <typename TargetType> |
| static void UpstreamThreadBaseProfileChanged(Thread& upstream, TargetType& target) |
| TA_REQ(chainlock_transaction_token, upstream.get_lock(), internal::GetPiNodeLock(target), |
| preempt_disabled_token); |
| |
| // Called when a thread (|upstream|) blocks in an OwnedWaitQueue, and the |
| // target of that OWQ (|target|) is running or runnable. At the point that |
| // this method is called, the static profile pressure parameters (weight, |
| // deadline utilization, and minimum deadline) should have already been |
| // propagated to |target|, and its EffectiveProfile should be |
| // dirty. This method is called in order to update the dynamic scheduling |
| // parameters (start time, finish time, time slice remaining) of |target| as |
| // needed because of the join operation. |
| template <typename UpstreamType, typename TargetType> |
| static void JoinNodeToPiGraph(UpstreamType& upstream, TargetType& target) |
| TA_REQ(chainlock_transaction_token, internal::GetPiNodeLock(upstream), |
| internal::GetPiNodeLock(target), preempt_disabled_token); |
| |
| // Called when a thread (|upstream|) unblocks from an OwnedWaitQueue and |
| // leaves the PI graph whose running or runnable target is |target|. |
| // |
| // At the point that this method is called, the static profile pressure |
| // parameters (weight, deadline utilization, and minimum deadline) should have |
| // already been propagated to |target|, and its EffectiveProfile |
| // should be dirty. This method is called in order to update the dynamic |
| // scheduling parameters (start time, finish time, time slice remaining) in |
| // both |target| and |upstream| as needed because of the split operation. |
| template <typename UpstreamType, typename TargetType> |
| static void SplitNodeFromPiGraph(UpstreamType& upstream, TargetType& target) |
| TA_REQ(chainlock_transaction_token, internal::GetPiNodeLock(upstream), |
| internal::GetPiNodeLock(target), preempt_disabled_token); |
| |
| // Updates the performance scales of the requested CPUs and returns the |
| // effective values in place, which may be different from the requested values |
| // if they are below the minimum safe values for the respective CPUs. |
| // |
| // Requires |count| <= num CPUs. |
| static void UpdatePerformanceScales(zx_cpu_performance_info_t* info, size_t count) |
| TA_EXCL(queue_lock_); |
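| // |
| // Hypothetical usage sketch (field names per zx_cpu_performance_info_t from |
| // <zircon/syscalls/system.h>; illustrative only): |
| // |
| //   zx_cpu_performance_info_t info[] = { |
| //       {.logical_cpu_number = 1, |
| //        .performance_scale = {.integral_part = 0, .fractional_part = 0x8000'0000}}}; |
| //   Scheduler::UpdatePerformanceScales(info, 1);  // Request a 0.5 scale on CPU 1. |
| //   // On return, info[0].performance_scale holds the effective (possibly clamped) value. |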
| |
| // Gets the performance scales of up to count CPUs. Returns the last values |
| // requested by userspace, even if they have not yet taken effect. |
| // |
| // Requires |count| <= num CPUs. |
| static void GetPerformanceScales(zx_cpu_performance_info_t* info, size_t count) |
| TA_EXCL(queue_lock_); |
| |
| // Gets the default performance scales of up to count CPUs. Returns the |
| // initial values determined by the system topology, or 1.0 when no topology |
| // is available. |
| // |
| // Requires |count| <= num CPUs. |
| static void GetDefaultPerformanceScales(zx_cpu_performance_info_t* info, size_t count) |
| TA_EXCL(queue_lock_); |
| |
| // Get the mask of valid CPUs that thread may run on. If a new mask |
| // is set, the thread will be migrated to satisfy the new constraint. |
| template <Affinity AffinityType> |
| static cpu_mask_t SetCpuAffinity(Thread& thread, cpu_mask_t affinity) |
| TA_EXCL(chainlock_transaction_token, thread.get_lock()); |
| |
| // Run the passed callable while holding the specified scheduler's lock. |
| // Currently used only by the threadload debug function in kernel/debug.cc |
| template <typename Callable> |
| static auto RunInLockedScheduler(cpu_num_t which_cpu, Callable callable) { |
| // The call to Get will eventually DEBUG_ASSERT if our CPU ID is out of range. |
| Scheduler& scheduler = *Get(which_cpu); |
| Guard<MonitoredSpinLock, IrqSave> queue_guard{&scheduler.queue_lock_, SOURCE_TAG}; |
| return callable(); |
| } |
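| |
| // Hypothetical usage sketch of RunInLockedScheduler (illustrative only): |
| // |
| //   Scheduler::RunInLockedScheduler(cpu_num, []() { |
| //     // The target CPU's queue_lock_ is held here with interrupts disabled, |
| //     // so it is safe to sample that scheduler's queue state for diagnostics. |
| //   }); |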
| |
| // Run the passed callable while holding the current CPU's scheduler's lock. |
| // Used by routines in thread.cc that need to change the current thread's |
| // state, but need to be holding the current scheduler's lock in order to do |
| // so. |
| template <typename Callable> |
| static void RunInLockedCurrentScheduler(Callable callable) { |
| DEBUG_ASSERT(arch_ints_disabled()); |
| Scheduler& scheduler = *Get(); |
| Guard<MonitoredSpinLock, NoIrqSave> queue_guard{&scheduler.queue_lock_, SOURCE_TAG}; |
| callable(); |
| } |
| |
| // Run the passed callable while holding the specified thread's scheduler's |
| // lock, if any. Used by SetMigrateFn in thread.cc. |
| template <typename Callable> |
| static void RunInThreadsSchedulerLocked(Thread* thread, Callable callable) |
| TA_REQ(thread->get_lock()) { |
| const cpu_num_t thread_cpu = thread->scheduler_state().curr_cpu(); |
| if (thread_cpu != INVALID_CPU) { |
| Scheduler& scheduler = *Get(thread_cpu); |
| Guard<MonitoredSpinLock, NoIrqSave> queue_guard{&scheduler.queue_lock_, SOURCE_TAG}; |
| callable(); |
| } else { |
| callable(); |
| } |
| } |
| |
| private: |
| // fwd decl of a helper class used for PI Join/Split operations |
| template <typename T> |
| class PiNodeAdapter; |
| |
| friend struct Thread; |
| friend struct ::kconcurrent::SchedulerUtils; |
| |
| // Allow percpu to init our cpu number and performance scale. |
| friend struct percpu; |
| // Load balancer test. |
| friend struct LoadBalancerTestAccess; |
| // Allow tests to modify our state. |
| friend class LoadBalancerTest; |
| // A helper used in Join/Split operations |
| template <typename T> |
| friend class PiNodeAdapter; |
| // Diagnostics used by LockupDetector |
| friend void DumpSchedulerActiveThreadDetails(Scheduler& scheduler); |
| |
| // EffectiveProfile and BaseProfile are both inner classes of SchedulerState which |
| // are also frequently used by the methods in Scheduler. Make a couple of handy |
| // private aliases to keep us from needing to say |
| // SchedulerState::(Effective|Base)Profile in all of the Scheduler methods. |
| using EffectiveProfile = SchedulerState::EffectiveProfile; |
| using BaseProfile = SchedulerState::BaseProfile; |
| |
| static void LockHandoffInternal(const kconcurrent::RescheduleContext& ctx, |
| Thread* const current_thread) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| |
| // Sets the initial values of the CPU performance scales for this Scheduler |
| // instance. |
| void InitializePerformanceScale(SchedPerformanceScale scale); |
| |
| static inline void RescheduleMask(cpu_mask_t cpus_to_reschedule_mask); |
| static void PreemptLocked(Thread* current_thread) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| |
| // Unconditionally perform an expensive check of as many scheduler invariants |
| // as we can. Usually, this is not called directly; the ValidateInvariants |
| // version (which is gated on a default-off build option) is preferred. |
| void ValidateInvariantsUnconditional() const TA_REQ(queue_lock_, chainlock_transaction_token); |
| void ValidateInvariants() const TA_REQ(queue_lock_, chainlock_transaction_token) { |
| if constexpr (kSchedulerExtraInvariantValidation) { |
| ValidateInvariantsUnconditional(); |
| } |
| } |
| |
| // Lockup detector needs to be able to hold a scheduler's lock in order to |
| // print diagnostics |
| auto& queue_lock() TA_RET_CAP(queue_lock_) { return queue_lock_; } |
| |
| // Specifies how to associate a thread with a Scheduler instance, update |
| // metadata, and whether/where to place the thread in a run queue. |
| enum class Placement { |
| // Selects a place in the queue based on the current insertion time and |
| // thread weight or deadline. |
| Insertion, |
| |
| // Selects a place in the queue based on the original insertion time and |
| // the updated (inherited or changed) weight or deadline on the same CPU. |
| Adjustment, |
| |
| // Selects a place in the queue based on the original insertion time and |
| // the updated time slice due to being preempted by another thread. |
| Preemption, |
| |
| // Selects a place in the queue based on the insertion time in the original |
| // queue adjusted for the new queue. |
| Migration, |
| |
| // Updates the metadata to account for a stolen thread that was just taken |
| // from a different queue. This is distinct from Migration in that the |
| // thread is not also enqueued, it is run immediately by the new CPU. |
| Association, |
| }; |
| |
| // When we need to find a target CPU for a thread to run on, the behavior of |
| // FindTargetCpu depends on the reason why the thread needs to find a target |
| // CPU. This enumeration encodes the reason, allowing FindTargetCpu to |
| // behave properly. |
| enum class FindTargetCpuReason { |
| // If the thread is unblocking (or coming out of suspend/sleep), then it |
| // might need to choose a CPU which it typically wouldn't (because of its |
| // currently configured soft affinity). This happens when the thread last |
| // ran on CPU X before blocking, has a migration function, and now needs to |
| // run on a different CPU because of its affinity mask. The thread will be |
| // sent to CPU X as it unblocks, but as soon as CPU X picks the thread to |
| // run, the thread's migration function will be called (on CPU X) before the |
| // scheduler sends it to a different CPU queue (where it will complete the |
| // migration once it is finally scheduled to run). |
| Unblocking, |
| |
| // If the thread needs to select a CPU because it is migrating, then the |
| // special logic used for the Unblocking reason is skipped. It is assumed |
| // that the code has already managed to call the thread's migration function |
| // while running on its last CPU, and now the CPU chosen must be as compatible |
| // with the thread's soft affinity mask as possible. |
| Migrating, |
| }; |
| |
| // Returns the current system time as a SchedTime value. |
| static SchedTime CurrentTime() { return SchedTime{current_time()}; } |
| |
| // Returns the Scheduler instance for the current CPU. |
| static Scheduler* Get(); |
| |
| // Returns the Scheduler instance for the given CPU. |
| static Scheduler* Get(cpu_num_t cpu); |
| |
| // Returns a CPU to run the given thread on. The thread's lock must be held |
| // exclusively during this operation. This is almost always called when |
| // threads are transitioning from a wait queue to a scheduler; therefore, the |
| // thread's state is only protected by the thread's lock, not by any scheduler |
| // or wait queue lock. |
| static cpu_num_t FindTargetCpu(Thread* thread, FindTargetCpuReason reason) |
| TA_REQ_SHARED(thread->get_lock()); |
| |
| // Handle all of the common tasks associated with each of the possible PI |
| // interactions. The outline of this is: |
| // |
| // 1) If the target is an active thread (meaning either running or runnable), |
| // we need to: |
| // 1.1) Enter the scheduler's queue lock. |
| // 1.2) If the thread is active, but not actually running, remove the target |
| // thread from its scheduler's run queue. |
| // 1.3) Now update the thread's effective profile. |
| // 1.4) Apply any changes in the thread's effective profile to its scheduler's |
| // bookkeeping. |
| // 1.5) Update the dynamic parameters of the thread. |
| // 1.6) Either re-insert the thread into its scheduler's run queue (if it was |
| // READY) or adjust its scheduler's preemption time (if it was RUNNING). |
| // 1.7) Trigger a reschedule on the thread's scheduler. |
| // 2) If the target is either an OwnedWaitQueue, or a thread which is not |
| // active: |
| // 2.1) Recompute the target's effective profile, and adjust the target's |
| // position in its wait queue if the target is a thread which is currently |
| // blocked in a wait queue. |
| // 2.2) Recompute the target's dynamic scheduler parameters. |
| template <typename TargetType, typename Callable> |
| static inline void HandlePiInteractionCommon(SchedTime now, PiNodeAdapter<TargetType>& target, |
| Callable UpdateDynamicParams) |
| TA_REQ(chainlock_transaction_token, target.get_lock()); |
| |
| using EndTraceCallback = fit::inline_function<void(), sizeof(void*)>; |
| |
| // Common logic for reschedule API. |
| void RescheduleCommon(Thread* const current_thread, SchedTime now, |
| EndTraceCallback end_outer_trace = nullptr) TA_EXCL(queue_lock_) |
| TA_REQ(chainlock_transaction_token, current_thread->get_lock()); |
| |
| // Evaluates the schedule and returns the thread that should execute, |
| // updating the run queue as necessary. |
| Thread* EvaluateNextThread(SchedTime now, Thread* current_thread, bool timeslice_expired, |
| SchedDuration total_runtime_ns, |
| Guard<MonitoredSpinLock, NoIrqSave>& queue_guard) |
| TA_REQ(chainlock_transaction_token, queue_lock_, current_thread->get_lock()); |
| |
| // Adds a thread to the run queue tree. The thread must be active on this |
| // CPU. |
| void QueueThread(Thread* thread, Placement placement, SchedTime now = SchedTime{0}, |
| SchedDuration total_runtime_ns = SchedDuration{0}) |
| TA_REQ(queue_lock_, thread->get_lock()); |
| |
| // Removes the thread at the head of the first eligible run queue, or attempts |
| // to steal a thread from another scheduler if we have no eligible thread in |
| // our current run queues. If none of these attempts succeed, we will fall |
| // back on returning this scheduler's Idle thread. |
| // |
| // In all cases, the thread being returned will not be locked, and will not be |
| // a member of any current run queue. One of the following *must* happen to |
| // the thread after this. |
| // |
| // 1) The thread is added back to one of this scheduler's run queues (this |
| // is common during a PI interaction involving the thread). |
| // 2) The thread is flagged as "reschedule_in_progress" in its scheduler-owned |
| // variables, before becoming the active thread in its scheduler (after |
| // dropping the scheduler lock and re-obtaining the thread's lock |
| // exclusively). This happens in RescheduleCommon when EvaluateNextThread |
| // chooses a new thread to run. |
| // 3) The thread is flagged as "steal_in_progress" after being removed from |
| // another scheduler's run queue, but before being locked exclusively and |
| // re-inserted into its new scheduler's run queue. |
| // |
| // Note that these rules apply to *all* the versions of Dequeue listed below. |
| Thread* DequeueThread(SchedTime now, Guard<MonitoredSpinLock, NoIrqSave>& queue_guard) |
| TA_REQ(chainlock_transaction_token, queue_lock_); |
| |
| // Removes the thread at the head of the fair run queue and returns it. |
| Thread* DequeueFairThread() TA_REQ(queue_lock_); |
| |
| // Removes the eligible thread with the earliest deadline in the deadline run |
| // queue and returns it. |
| Thread* DequeueDeadlineThread(SchedTime eligible_time) TA_REQ(queue_lock_); |
| |
| // Removes the eligible thread with a deadline earlier than the given deadline |
| // and returns it or nullptr if one does not exist. |
| Thread* DequeueEarlierDeadlineThread(SchedTime eligible_time, SchedTime finish_time) |
| TA_REQ(queue_lock_); |
| |
| // Returns the eligible thread in the run queue with a deadline earlier than |
| // the given deadline, or nullptr if one does not exist. |
| // |
| // See the note in |FindEarliestEligibleThread| for lifecycle rules for the |
| // returned pointer (if any) |
| Thread* FindEarlierDeadlineThread(SchedTime eligible_time, SchedTime finish_time) |
| TA_REQ(queue_lock_); |
| |
| // Attempts to steal work from other busy CPUs. Returns nullptr if no work was |
| // stolen, otherwise returns a pointer to the stolen thread that is now |
| // associated with the local Scheduler instance. |
| Thread* StealWork(SchedTime now, SchedPerformanceScale scale_up_factor) |
| TA_REQ(chainlock_transaction_token) TA_EXCL(queue_lock_); |
| |
| // Returns the time that the next deadline task will become eligible, or an |
| // infinite time if there are no ready deadline tasks. |
| SchedTime GetNextEligibleTime() TA_REQ(queue_lock_); |
| |
| // Calculates the timeslice of the thread based on the current run queue |
| // state. |
| SchedDuration CalculateTimeslice(const Thread* thread) TA_REQ(queue_lock_) |
| TA_REQ_SHARED(thread->get_lock()); |
| |
| // Returns the completion time clamped to the start of the earliest deadline |
| // thread that will become eligible in that time frame. |
| SchedTime ClampToDeadline(SchedTime completion_time) TA_REQ(queue_lock_); |
| |
| // Returns the completion time clamped to the start of the earliest deadline |
| // thread that will become eligible in that time frame and also has an earlier |
| // deadline than the given finish time. |
| SchedTime ClampToEarlierDeadline(SchedTime completion_time, SchedTime finish_time) |
| TA_REQ(queue_lock_); |
| |
| // Updates the timeslice of the thread based on the current run queue state. |
| // Returns the absolute deadline for the next time slice, which may be earlier |
| // than the completion of the time slice if other threads could preempt the |
| // given thread before the time slice is exhausted. |
| SchedTime NextThreadTimeslice(Thread* thread, SchedTime now) |
| TA_REQ(queue_lock_, thread->get_lock()); |
| |
| // Updates the scheduling period based on the number of active threads. |
| void UpdatePeriod() TA_REQ(queue_lock_); |
| |
| // Updates the global virtual timeline. |
| void UpdateTimeline(SchedTime now) TA_REQ(queue_lock_); |
| |
| static void SetThreadMigrateFnInternal(Thread* thread, Thread::MigrateFn migrate_fn) |
| TA_REQ(thread->get_lock(), Thread::get_list_lock()); |
| |
| // Makes a thread active on this CPU's scheduler and inserts it into the |
| // run queue tree. |
| void Insert(SchedTime now, Thread* thread, Placement placement = Placement::Insertion) |
| TA_REQ(queue_lock_, thread->get_lock(), thread->get_scheduler_variable_lock()); |
| |
| // Removes the thread from this CPU's scheduler as it transitions to another |
| // CPU (either because it is being stolen, or migrated). The thread's lock |
| // cannot be held exclusively at this point, so some of the thread's state |
| // will remain in a transient state until the thread can be re-inserted in its |
| // new home. Specifically, this includes: |
| // |
| // 1) The thread's current CPU id. |
| // 2) The thread's start time (if it is a fair thread). |
| // 3) The thread's finish time (if it is a fair thread). |
| // |
| // This state will be cleaned up by FinishTransition once the thread has been |
| // locked exclusively. |
| void RemoveForTransition(Thread* thread, SchedulerQueueState::TransientState target_state) |
| TA_REQ(queue_lock_, thread->get_scheduler_variable_lock()) TA_REQ_SHARED(thread->get_lock()); |
| |
| void FinishTransition(SchedTime now, Thread* thread) |
| TA_REQ(queue_lock_, thread->get_lock(), thread->get_scheduler_variable_lock()); |
| |
| // Removes the thread from this CPU's scheduler. The thread must not be in |
| // the run queue tree. This is the same operation as RemoveForTransition, but it |
| // requires that the thread's lock is held exclusively, which allows the CPU |
| // id, start time, and finish time to be updated. |
| void Remove(Thread* thread, SchedulerQueueState::TransientState reason = |
| SchedulerQueueState::TransientState::None) |
| TA_REQ(queue_lock_, thread->get_lock(), thread->get_scheduler_variable_lock()) { |
| SchedulerState& state = thread->scheduler_state(); |
| |
| RemoveForTransition(thread, reason); |
| state.curr_cpu_ = INVALID_CPU; |
| if (IsFairThread(thread)) { |
| state.start_time_ = SchedNs(0); |
| state.finish_time_ = SchedNs(0); |
| } |
| } |
| |
| // Removes a specific thread from its current RunQueue. Note that the thread |
| // must currently exist in its queue; it is an error otherwise. |
| void EraseFromQueue(Thread* thread) TA_REQ(queue_lock_, thread->get_scheduler_variable_lock()) |
| TA_REQ_SHARED(thread->get_lock()) { |
| DEBUG_ASSERT(thread->scheduler_queue_state().InQueue()); |
| const SchedulerState& state = const_cast<const Thread*>(thread)->scheduler_state(); |
| RunQueue& queue = |
| state.discipline() == SchedDiscipline::Fair ? fair_run_queue_ : deadline_run_queue_; |
| queue.erase(*thread); |
| } |
| |
| // Returns true if there is at least one eligible deadline thread in the |
| // run queue. |
| inline bool IsDeadlineThreadEligible(SchedTime eligible_time) TA_REQ(queue_lock_) { |
| if (deadline_run_queue_.is_empty()) { |
| return false; |
| } |
| |
| const Thread& front = deadline_run_queue_.front(); |
| AssertInRunQueue(front); |
| return front.scheduler_state().start_time_ <= eligible_time; |
| } |
| |
| // Updates the total expected runtime estimator and exports the atomic shadow |
| // variable for cross-CPU readers. |
| inline void UpdateTotalExpectedRuntime(SchedDuration delta_ns) TA_REQ(queue_lock_); |
| |
| // Updates the total deadline utilization estimator and exports the atomic |
| // shadow variable for cross-CPU readers. |
| inline void UpdateTotalDeadlineUtilization(SchedUtilization delta_ns) TA_REQ(queue_lock_); |
| |
| // Utilities to scale up or down the given value by the performance scale of the CPU. |
| template <typename T> |
| inline T ScaleUp(T value) const TA_REQ(queue_lock_); |
| template <typename T> |
| inline T ScaleDown(T value) const TA_REQ(queue_lock_); |
| |
| // Update trace counters which track the total number of runnable threads for a CPU |
| inline void TraceTotalRunnableThreads() const TA_REQ(queue_lock_); |
| |
| // Returns a new flow id when flow tracing is enabled, zero otherwise. |
| inline static uint64_t NextFlowId(); |
| |
| // Traits type to adapt the WAVLTree to Thread with node state in the |
| // scheduler_state member. |
| struct TaskTraits { |
| using KeyType = SchedulerState::KeyType; |
| static KeyType GetKey(const Thread& thread) TA_NO_THREAD_SAFETY_ANALYSIS { |
| return thread.scheduler_state().key(); |
| } |
| static bool LessThan(KeyType a, KeyType b) { return a < b; } |
| static bool EqualTo(KeyType a, KeyType b) { return a == b; } |
| static auto& node_state(Thread& thread) TA_NO_THREAD_SAFETY_ANALYSIS { |
| return thread.scheduler_queue_state().run_queue_node; |
| } |
| }; |
| |
| // Observer that maintains the subtree invariant min_finish_time as nodes are |
| // added to and removed from the run queue. |
| struct SubtreeMinTraits { |
| static SchedTime GetValue(const Thread& node) TA_NO_THREAD_SAFETY_ANALYSIS { |
| return node.scheduler_state().finish_time_; |
| } |
| |
| static SchedTime GetSubtreeBest(const Thread& node) TA_NO_THREAD_SAFETY_ANALYSIS { |
| return node.scheduler_state().min_finish_time_; |
| } |
| |
| static bool Compare(SchedTime a, SchedTime b) { return a < b; } |
| |
| static void AssignBest(Thread& node, SchedTime val) TA_NO_THREAD_SAFETY_ANALYSIS { |
| node.scheduler_state().min_finish_time_ = val; |
| } |
| |
| static void ResetBest(Thread& target) {} |
| }; |
| |
| using SubtreeMinObserver = fbl::WAVLTreeBestNodeObserver<SubtreeMinTraits>; |
| |
| // Alias of the WAVLTree type for the run queue. |
| using RunQueue = fbl::WAVLTree<TaskTraits::KeyType, Thread*, TaskTraits, fbl::DefaultObjectTag, |
| TaskTraits, SubtreeMinObserver>; |
| |
| // Finds the next eligible thread in the given run queue. |
| // |
| // Note: Any Thread* returned by this method may only be used as long as the |
| // queue lock is held. Dropping the queue lock allows the thread to be removed |
| // from the scheduler, and potentially exit, destroying the Thread and |
| // invalidating the thread pointer in the process. |
| Thread* FindEarliestEligibleThread(RunQueue* run_queue, SchedTime eligible_time) |
| TA_REQ(queue_lock_); |
| |
| // Finds the next eligible thread in the given run queue that also passes the |
| // given predicate. |
| // |
| // See the note in |FindEarliestEligibleThread| for lifecycle rules for the |
| // returned pointer (if any) |
| template <typename Predicate> |
| Thread* FindEarliestEligibleThread(RunQueue* run_queue, SchedTime eligible_time, |
| Predicate&& predicate) TA_REQ(queue_lock_); |
| |
| // Emits queue event tracers for trace-based scheduler performance analysis. |
| inline void TraceThreadQueueEvent(const fxt::InternedString& name, const Thread* thread) const |
| TA_REQ(queue_lock_) TA_REQ_SHARED(thread->get_lock()); |
| |
| // Returns true if the given thread is fair scheduled. |
| static bool IsFairThread(const Thread* thread) TA_REQ_SHARED(thread->get_lock()) { |
| return thread->scheduler_state().effective_profile_.IsFair(); |
| } |
| |
| // Returns true if the given thread is deadline scheduled. |
| static bool IsDeadlineThread(const Thread* thread) TA_REQ_SHARED(thread->get_lock()) { |
| return thread->scheduler_state().effective_profile_.IsDeadline(); |
| } |
| |
| // Returns true if the given thread's time slice is adjustable under changes to |
| // the fair scheduler demand on the CPU. |
| static bool IsThreadAdjustable(const Thread* thread) TA_REQ_SHARED(thread->get_lock()) { |
| // Checking the thread state avoids unnecessary adjustments on a thread that |
| // is no longer competing. |
| return !thread->IsIdle() && IsFairThread(thread) && thread->state() == THREAD_READY; |
| } |
| |
| // Protects run queues and associated metadata for this Scheduler instance. |
| // The queue lock is the bottom-most lock in the system for the CPU it is |
| // associated with: no other locks may be nested within a queue lock. A queue |
| // lock may be acquired across CPUs; however, the no-nesting rule still |
| // applies. |
| // |
| // Note: while the queue_lock_ is the bottom-most lock in the system from the |
| // scheduler's perspective, it is not (formally) a leaf-lock right now. |
| // Currently, there is one place in the system (DumpThreadLocked) where the |
| // queue_lock_ needs to be held, while fprintf is called. |
| // |
| // This happens when the lockup-detector is unhappy and asks the Scheduler |
| // to print information about its run queue state. Most of the printf targets |
| // which could be passed to the scheduler also involve holding locks. If the |
| // queue_lock_ is flagged as a leaf lock, and lockdep is enabled, and the |
| // lockup-detector attempts to report a hang, it will cause lockdep to |
| // complain about obtaining a lock while holding a leaf lock, which will cause |
| // more printing, which causes more complaints, and so on. |
| // |
| // See https://fxbug.dev/42070437 for more details. |
| mutable DECLARE_SPINLOCK_WITH_TYPE(Scheduler, MonitoredSpinLock) queue_lock_ |
| TA_ACQ_AFTER(Thread::get_list_lock()); |
| |
| // Alias of the queue lock wrapper type. |
| using QueueLock = decltype(queue_lock_); |
| |
| // Simplified static assertion on the wrapped queue lock type to avoid the more verbose template |
| // parameters required by lockdep::AssertHeld. |
| static void AssertHeld(const QueueLock& lock) TA_ASSERT(lock) TA_ASSERT(lock.lock()) { |
| lock.lock().AssertHeld(); |
| } |
| |
| // Asserts that a given thread belongs to |this| Scheduler instance. |
| // Requires holding the scheduler's queue_lock_. |
| // |
| void AssertInScheduler(const Thread& t) const TA_REQ(queue_lock_) TA_ASSERT_SHARED(t.get_lock()) |
| TA_ASSERT(t.get_scheduler_variable_lock()) { |
| [&]() TA_NO_THREAD_SAFETY_ANALYSIS { |
| DEBUG_ASSERT_MSG((t.scheduler_state().curr_cpu() == this_cpu_), |
| "Thread %lu expected to be a member of scheduler %u, but curr_cpu says %u", |
| t.tid(), this_cpu_, t.scheduler_state().curr_cpu()); |
| }(); |
| } |
| |
| // It would be nice to be able to make a stronger assertion here (that the |
| // thread was not just in a run queue, but the specific run queue we expect it |
| // to be in), but unfortunately, there is no inexpensive way of doing that. |
| // |
| // Thankfully, most of the time that we need to invoke an assertion like this, |
| // it is pretty obvious from the context that the assertion is true. It might |
| // be reasonable to (someday) consider removing even this weak runtime check, |
| // and just leaving this as a static analysis workaround, and nothing more. |
| inline void AssertInRunQueue(const Thread& t) const TA_REQ(queue_lock_) |
| TA_ASSERT_SHARED(t.get_lock()) TA_ASSERT(t.get_scheduler_variable_lock()) { |
| [&]() TA_NO_THREAD_SAFETY_ANALYSIS { |
| DEBUG_ASSERT((t.scheduler_state().curr_cpu_ == this_cpu_) && |
| (t.scheduler_queue_state().InQueue())); |
| }(); |
| } |
| |
| // Add a couple of small no-op helpers which inform the static analyzer of |
| // some properties which are very difficult to apply to the lambdas used in |
| // the functional programming patterns below. Neither of these methods |
| // actually _check_ anything, so they must be used with *extreme caution*. |
| // |
| // They should only ever be used when the locking invariants are trivially |
| // verifiable from the immediate context. |
| // |
| // MarkHasSchedulerAccess tells the static analyzer that we are exclusively |
| // holding a scheduler's queue lock. Its primary use case is in lambdas |
| // which are declared inside the scope of a queue_lock_ guard, and are used |
| // for some sort of functional programming pattern. |
| // |
| // MarkHasOwnedThreadAccess tells the static analyzer that we have exclusive |
| // access to a thread's scheduler-variables, and shared (read-only) access to |
| // variables protected by a thread's lock (e.g., the scheduler_state). The |
| // conditions for this operation are that we are holding the queue_lock_ for |
| // the scheduler that the thread is currently a member of (see |
| // AssertInScheduler, above). |
| // |
| // See StealWork for a practical example of where these methods are useful. |
| // |
| static inline void MarkHasSchedulerAccess(const Scheduler& s) TA_ASSERT(s.queue_lock_) {} |
| static inline void MarkHasOwnedThreadAccess(const Thread& t) |
| TA_ASSERT(t.get_scheduler_variable_lock()) TA_ASSERT_SHARED(t.get_lock()) {} |
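| // |
| // Hypothetical sketch of the intended pattern (the lambda and the |
| // expected_runtime_ns() accessor are assumptions for illustration only): |
| // |
| //   Guard<MonitoredSpinLock, IrqSave> guard{&queue_lock_, SOURCE_TAG}; |
| //   auto is_lightweight = [this](const Thread& t) { |
| //     MarkHasSchedulerAccess(*this);  // queue_lock_ is held by the enclosing guard. |
| //     MarkHasOwnedThreadAccess(t);    // t is known to be a member of this scheduler. |
| //     return t.scheduler_state().expected_runtime_ns() < kIntraClusterThreshold; |
| //   }; |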
| |
| // The run queue of fair scheduled threads ready to run, but not currently running. |
| TA_GUARDED(queue_lock_) |
| RunQueue fair_run_queue_; |
| |
| // The run queue of deadline scheduled threads ready to run, but not currently running. |
| TA_GUARDED(queue_lock_) |
| RunQueue deadline_run_queue_; |
| |
| // Pointer to the thread actively running on this CPU. |
| TA_GUARDED(queue_lock_) |
| Thread* active_thread_{nullptr}; |
| |
| // Monotonically increasing counter to break ties when queuing tasks with |
| // the same key. This has the effect of placing newly queued tasks behind |
| // already queued tasks with the same key. This is also necessary to |
| // guarantee uniqueness of the key as required by the WAVLTree container. |
| TA_GUARDED(queue_lock_) |
| uint64_t generation_count_{0}; |
| |
| // Count of the fair threads running on this CPU, including threads in the run |
| // queue and the currently running thread. Does not include the idle thread. |
| TA_GUARDED(queue_lock_) |
| int32_t runnable_fair_task_count_{0}; |
| |
| // Count of the deadline threads running on this CPU, including threads in the |
| // run queue and the currently running thread. Does not include the idle |
| // thread. |
| TA_GUARDED(queue_lock_) |
| int32_t runnable_deadline_task_count_{0}; |
| |
| // Total weights of threads running on this CPU, including threads in the |
| // run queue and the currently running thread. Does not include the idle |
| // thread. |
| TA_GUARDED(queue_lock_) |
| SchedWeight weight_total_{0}; |
| |
| // The value of |weight_total_| when the current thread was scheduled. |
| // Provides a reference for determining whether the total weights changed |
| // since the last reschedule. |
| SchedWeight scheduled_weight_total_{0}; |
| |
| // The global virtual time of this run queue. |
| TA_GUARDED(queue_lock_) |
| SchedTime virtual_time_{0}; |
| |
| // The system time since the last update to the global virtual time. |
| TA_GUARDED(queue_lock_) |
| SchedTime last_update_time_ns_{0}; |
| |
| // The system time that the current time slice started. |
| SchedTime start_of_current_time_slice_ns_{0}; |
| |
| // The system time that the current thread should be preempted. Initialized to |
| // ZX_TIME_INFINITE to pass the assertion now < target_preemption_time_ns_ (or |
| // else the current time slice is expired) on the first entry into the |
| // scheduler. |
| SchedTime target_preemption_time_ns_{ZX_TIME_INFINITE}; |
| |
| // The sum of the expected runtimes of all active threads on this CPU. This |
| // value is an estimate of the average queuing time for this CPU, given the |
| // current set of active threads. |
| TA_GUARDED(queue_lock_) |
| SchedDuration total_expected_runtime_ns_{0}; |
| |
| // The sum of the worst case utilization of all active deadline threads on |
| // this CPU. |
| TA_GUARDED(queue_lock_) |
| SchedUtilization total_deadline_utilization_{0}; |
| |
| // Scheduling period in which every runnable task executes once in units of |
| // minimum granularity. |
| TA_GUARDED(queue_lock_) |
| SchedDuration scheduling_period_grans_{kDefaultTargetLatency / kDefaultMinimumGranularity}; |
| |
| // The smallest timeslice a thread is allocated in a single round. |
| TA_GUARDED(queue_lock_) |
| SchedDuration minimum_granularity_ns_{kDefaultMinimumGranularity}; |
| |
| // The target scheduling period. The scheduling period is set to this value |
| // when the number of tasks is low enough for the sum of all timeslices to |
| // fit within this duration. This has the effect of increasing the size of |
| // the timeslices under nominal load to reduce scheduling overhead. |
| TA_GUARDED(queue_lock_) |
| SchedDuration target_latency_grans_{kDefaultTargetLatency / kDefaultMinimumGranularity}; |
| |
| // Performance scale of this CPU relative to the highest performance CPU. This |
| // value is initially determined from the system topology, when available, and |
| // may be adjusted by userspace performance/thermal management at runtime. |
| TA_GUARDED(queue_lock_) SchedPerformanceScale performance_scale_{1}; |
| TA_GUARDED(queue_lock_) |
| RelaxedAtomic<SchedPerformanceScale> performance_scale_reciprocal_{SchedPerformanceScale{1}}; |
| |
| // Performance scale requested by userspace. The operational performance scale |
| // is updated to this value (possibly adjusted for the minimum allowed value) |
| // on the next reschedule, after the current thread's accounting is updated. |
| TA_GUARDED(queue_lock_) SchedPerformanceScale pending_user_performance_scale_{1}; |
| |
| // Default performance scale, determined from the system topology, when |
| // available. |
| TA_GUARDED(queue_lock_) SchedPerformanceScale default_performance_scale_{1}; |
| |
| // The CPU this scheduler instance is associated with. |
| // NOTE: This member is not initialized to prevent clobbering the value set |
| // by sched_early_init(), which is called before the global ctors that |
| // initialize the rest of the members of this class. |
| // TODO(eieio): Figure out a better long-term solution to determine which |
| // CPU is associated with each instance of this class. This is needed by |
| // non-static methods that are called from arbitrary CPUs, namely Insert(). |
| cpu_num_t this_cpu_; |
| |
| // The index of the logical cluster this CPU belongs to. CPUs with the same |
| // logical cluster index have the best chance of good cache affinity with |
| // respect to load distribution decisions. |
| size_t cluster_{0}; |
| |
| // Flag indicating that this Scheduler's CPU is in the process of becoming |
| // inactive. When true, threads that have a migration function which must be |
| // run on this CPU are permitted to join this CPU's ready queue, even if the |
| // CPU has already been flagged as inactive. The thread that is in charge of |
| // deactivating the CPU will make sure to run the migration function of the |
| // thread on this CPU before finishing the deactivation process. |
| ktl::atomic<bool> cpu_deactivating_{false}; |
| |
| // Values exported for lock-free access across CPUs. These are mirrors of the |
| // members of the same name without the exported_ prefix. This avoids |
| // unnecessary atomic loads when updating the values using arithmetic |
| // operations on the local CPU. These values are atomically readonly to other |
| // CPUs. |
| // TODO(eieio): Look at cache line alignment for these members to optimize |
| // cache performance. |
| RelaxedAtomic<SchedDuration> exported_total_expected_runtime_ns_{SchedNs(0)}; |
| RelaxedAtomic<SchedUtilization> exported_total_deadline_utilization_{SchedUtilization{0}}; |
| |
| // Flow id counter for sched_latency flow events. |
| inline static RelaxedAtomic<uint64_t> next_flow_id_{1}; |
| }; |
| |
| #endif // ZIRCON_KERNEL_INCLUDE_KERNEL_SCHEDULER_H_ |