| // Copyright 2024 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| #ifndef ZIRCON_KERNEL_LIB_WAKE_VECTOR_INCLUDE_LIB_WAKE_VECTOR_H_ |
| #define ZIRCON_KERNEL_LIB_WAKE_VECTOR_INCLUDE_LIB_WAKE_VECTOR_H_ |
| |
| #include <lib/relaxed_atomic.h> |
| #include <lib/user_copy/user_ptr.h> |
| #include <stdarg.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <zircon/syscalls/system.h> |
| #include <zircon/types.h> |
| |
| #include <fbl/intrusive_double_list.h> |
| #include <kernel/auto_preempt_disabler.h> |
| #include <kernel/mutex.h> |
| #include <ktl/array.h> |
| #include <ktl/forward.h> |
| #include <ktl/type_traits.h> |
| |
| namespace wake_vector { |
| |
namespace internal {

// Tag type selecting the WakeEvent node state used for membership in the global list of all
// WakeEvent instances.
struct GlobalListTag {};

// Tag type selecting the WakeEvent node state used for membership in the pending list.
struct PendingListTag {};

}  // namespace internal
| |
| // Forward declaration. |
| class WakeEvent; |
| |
| // WakeVector is an interface implemented by objects that will generate system wake events using the |
| // WakeEvent type. This interface provides diagnostic information about the wake vector to the |
| // suspend subsystem. |
| class WakeVector { |
| public: |
| // This constructor verifies that the derived class has a WakeEvent member at compile time to help |
| // avoid misuse. A derived class must pass a pointer-to-member to its WakeEvent member. This |
| // constructor does not touch the contents of the WakeEvent instance, which most likely is |
| // uninitialized at this point. |
| // |
| // Example: |
| // |
| // MyWakeVector::MyWakeVector() : WakeVector{&MyWakeVector::wake_event_}, wake_event_{*this} {} |
| // |
| template <typename Class> |
| explicit WakeVector(WakeEvent Class::* wake_event_member) { |
| static_assert(ktl::is_base_of_v<WakeVector, Class>); |
| } |
| virtual ~WakeVector() = default; |
| |
| // Diagnostic information about the wake vector managed by the implementor of this interface. |
| struct Diagnostics { |
| // Indicates that the given wake vector is enabled and can generate wake events. Disabled wake |
| // vectors are not listed in diagnostic logs. |
| bool enabled = false; |
| |
| // The koid of the object implementing this interface, if any. |
| zx_koid_t koid = ZX_KOID_INVALID; |
| |
| // Extra information specific to the wake vector that can aid in determining the source of the |
| // wake event and potentially its state. |
| ktl::array<char, ZX_MAX_NAME_LEN> extra{}; |
| |
| // Utility to write into the extra field printf style. |
| int PrintExtra(const char* format, ...) __PRINTFLIKE(2, 3) { |
| va_list ap; |
| va_start(ap, format); |
| const int err = vsnprintf(extra.data(), extra.size(), format, ap); |
| va_end(ap); |
| return err; |
| } |
| }; |
| |
| // Provides diagnostic information about the wake vector object implementing this interface. |
| virtual void GetDiagnostics(Diagnostics& diagnostics_out) const = 0; |
| }; |
| |
// Outcome of a request to wake up the system.
enum class WakeResult {
  // The wake event arrived while the system was not suspended.
  Active,

  // The wake event arrived while the system was suspended.
  Resumed,

  // The wake event arrived while the system was still in the process of suspending.
  SuspendAborted,

  // A wake event that was already pending was triggered again.
  BadState,
};
| |
| // WakeEvent manages the lifecycle of wake events triggered by wake vectors. |
| // |
| // A system wake event may be triggered in response to an appropriately configured interrupt, |
| // exception, timer, or other future wake source that should resume the system from a suspended |
| // state. When a wake event is triggered, it enters the pending state and will prevent the system |
| // from entering suspend until it has been acknowledged. A pending wake event is automatically |
| // acknowledged when the WakeEvent instance is destroyed to prevent missing an acknowledgement that |
| // would render the system unable to suspend. |
| // |
| // WakeEvent maintains a global list of all instances for diagnostic purposes (i.e. logging wake |
| // events that pending before, during, and after suspend). WakeEvents are added to and removed from |
| // the global list using WakeEvent::Initialize and WakeEvent::Destroy, respectively. Because |
| // diagnostics access each WakeEvent, and its containing WakeVector, from the global list, care must |
| // be taken to avoid potential use-after-free hazards. |
| // |
| // Users of WakeEvents MUST adhere to the following rules: |
| // 1. A WakeEvent object MUST be instantiated as a member of a type that implements the WakeVector |
| // interface, such that the lifetime of the containing type encoloses the lifetime of the |
| // WakeEvent. DO NOT heap allocate WakeEvent instances separately from the referenced WakeVector. |
| // 2. The container of a WakeEvent instance SHOULD call WakeEvent::Initialize during construction / |
| // initialization to register the WakeEvent on the global list. The container MAY skip the call |
| // to WakeEvent::Initialize if initialization of the containing type fails. |
| // 3. The container of a WakeEvent MUST call WakeEvent::Destroy IFF WakeEvent::Initialize has been |
| // called previously on the same instance of WakeEvent AND WakeEvent::Destroy MUST be called |
| // BEFORE the containing type itself is destructed. |
| // |
| // WakeEvent::Destroy MAY be called in the destructor of the containing type, which will ensure that |
| // the WakeEvent is removed from the global list before any state in the containing type that |
| // diagnostics may access becomes invalid. However, extra care must be taken if a WakeEvent is a |
| // member of a base class and there are subclasses that override WakeVector::GetDiagnostics -- in |
| // these cases the subclass is responsible for calling WakeEvent::Destroy before its destructor |
| // completes INSTEAD of the base class to prevent use-after-free during races between diagnostics |
| // and the destruction of subclass state that diagnostics might access. |
| // |
| // AS A GENERAL RULE, it is safe to call WakeEvent::Initialize in a constructor and |
| // WakeEvent::Destroy in a destructor IF the destructor OR the implementation/override of |
| // WakeVector::GetDiagnostics can be marked final in the class making the calls. |
| // |
| // Calls to WakeEvent::Initialize and WakeEvent::Destroy must always be balanced. However, a |
| // WakeEvent MAY be initialized and destroyed more than once, as long as it is destroyed before its |
| // destructor is invoked. |
| // |
| class WakeEvent : public fbl::ContainableBaseClasses< |
| fbl::TaggedDoublyLinkedListable<WakeEvent*, internal::GlobalListTag>, |
| fbl::TaggedDoublyLinkedListable<WakeEvent*, internal::PendingListTag>> { |
| public: |
| enum class AckBehavior { ClearSignaled, RemainSignaled }; |
| |
| static bool has_pending_wake_events() TA_EXCL(PendingListLock::Get()) { |
| Guard<SpinLock, IrqSave> guard{PendingListLock::Get()}; |
| return !pending_list_.is_empty(); |
| } |
| |
| // Construct a WakeEvent referencing the given wake_vector. |
| explicit WakeEvent(const WakeVector& wake_vector) : wake_vector_(wake_vector) {} |
| |
| ~WakeEvent() { |
| // By the time that we destruct, we should have been Destroyed, meaning that |
| // we are no longer in any lists. |
| DEBUG_ASSERT(!in_global_list()); |
| DEBUG_ASSERT(!in_pending_list()); |
| } |
| |
| void Initialize() TA_EXCL(GlobalListLock::Get(), PendingListLock::Get()); |
| void Destroy() TA_EXCL(GlobalListLock::Get(), PendingListLock::Get()); |
| |
| // Triggers a wakeup that resumes the system, or aborts an incomplete suspend sequence, and |
| // prevents the system from starting a new suspend sequence. |
| // |
| // Must be called with interrupts and preempt disabled. |
| // |
| // Returns: |
| // - WakeResult::Active if this wake trigger occurred when the system was active. |
| // - WakeResult::Resumed if this or another wake trigger resumed the system. |
| // - WakeResult::SuspendAborted if this wake trigger occurred before suspend completed. |
| // - WakeResult::BadState if this wake event is already pending. |
| // |
| // Calls to |Trigger| and |Acknowledge| must be synchronized by the caller to guarantee that |
| // updates are performed by a single actor at a time. |
| // |
| WakeResult Trigger(zx_instant_boot_t trigger_time) TA_EXCL(PendingListLock::Get()) { |
| AnnotatedAutoPreemptDisabler preempt_disabler; |
| Guard<SpinLock, IrqSave> pending_guard{PendingListLock::Get()}; |
| return TriggerLocked(trigger_time); |
| } |
| |
| // Acknowledges a pending wake event, allowing the system to enter suspend when all other |
| // suspend conditions are met. |
| // |
| // Calls to |Trigger| and |Acknowledge| must be synchronized by the caller to guarantee that |
| // updates are performed by a single actor at a time. |
| // |
| void Acknowledge(AckBehavior ack_behavior) TA_EXCL(PendingListLock::Get()) { |
| AnnotatedAutoPreemptDisabler preempt_disabler; |
| Guard<SpinLock, IrqSave> pending_guard{PendingListLock::Get()}; |
| AcknowledgeLocked(current_boot_time(), ack_behavior); |
| } |
| |
| // WARNING : This is not the method you are looking for <jedimindtrick/> |
| // |
| // Strobe is an operation used only in a very specific situation; when a suspend operation times |
| // out and the ResumeTimerWakeVector becomes signaled as a result. This object is (currently) the |
| // only non-interrupt wake source/vector defined in the system, and it is not directly exposed to |
| // user-mode as a object which becomes acknowledged by user-mode actions. Instead, it is the |
| // synthetic wake source used to report suspend-operation timeouts, and is (logically speaking) |
| // _always_ immediately acked after being signaled. |
| // |
| // Strobe handles this operation, without needing to expose any locks to make it possible to |
| // atomically Trigger/Acknowledge the object. For all other wake source objects in the system, |
| // explicit calls to Trigger and Acknowledge are what should be used. |
| WakeResult Strobe(zx_instant_boot_t trigger_time = current_boot_time()) |
| TA_EXCL(PendingListLock::Get()) { |
| AnnotatedAutoPreemptDisabler preempt_disabler; |
| Guard<SpinLock, IrqSave> pending_guard{PendingListLock::Get()}; |
| const WakeResult result = TriggerLocked(trigger_time); |
| AcknowledgeLocked(trigger_time, AckBehavior::ClearSignaled); |
| return result; |
| } |
| |
| // Walk the global list of all instances and dump diagnostic information to |f|. All events that |
| // are currently pending OR that were triggered after the optional time value are logged. |
| // |
| // Safe to call concurrently with any and all methods, including ctors and dtors. |
| static void Dump(FILE* f, zx_instant_boot_t log_triggered_after_boot_time = ZX_TIME_INFINITE) |
| TA_EXCL(GlobalListLock::Get(), PendingListLock::Get()); |
| |
| static zx_status_t GenerateWakeEventReport( |
| zx_instant_boot_t suspend_start_time, user_out_ptr<zx_wake_source_report_header_t> out_header, |
| user_out_ptr<zx_wake_source_report_entry_t> out_entries, uint32_t num_entries, |
| user_out_ptr<uint32_t> actual_entries) TA_EXCL(GlobalListLock::Get(), PendingListLock::Get()); |
| |
| static void DiscardWakeEventReport() TA_EXCL(GlobalListLock::Get(), PendingListLock::Get()); |
| |
| private: |
| using GlobalListTag = internal::GlobalListTag; |
| using PendingListTag = internal::PendingListTag; |
| using GlobalList = fbl::DoublyLinkedList<WakeEvent*, GlobalListTag, fbl::SizeOrder::Constant>; |
| using PendingList = fbl::DoublyLinkedList<WakeEvent*, PendingListTag, fbl::SizeOrder::Constant>; |
| |
| bool in_global_list() const TA_REQ(GlobalListLock::Get()); |
| bool in_pending_list() const TA_REQ(PendingListLock::Get()); |
| |
| bool is_signaled() const TA_REQ(PendingListLock::Get()) { |
| return (report_info_.flags & ZX_SYSTEM_WAKE_REPORT_ENTRY_FLAG_SIGNALED) != 0; |
| } |
| |
| bool has_been_reported() const TA_REQ(PendingListLock::Get()) { |
| return (report_info_.flags & ZX_SYSTEM_WAKE_REPORT_ENTRY_FLAG_PREVIOUSLY_REPORTED) != 0; |
| } |
| |
| WakeResult TriggerLocked(zx_instant_boot_t trigger_time) |
| TA_REQ(PendingListLock::Get(), preempt_disabled_token); |
| void AcknowledgeLocked(zx_instant_boot_t trigger_time, AckBehavior ack_behavior) |
| TA_REQ(PendingListLock::Get(), preempt_disabled_token); |
| |
| void AssignFlag(bool value, uint32_t flag) TA_REQ(PendingListLock::Get()) { |
| if (value) { |
| report_info_.flags |= flag; |
| } else { |
| report_info_.flags &= ~flag; |
| } |
| } |
| |
| void AssignSignaled(bool value) TA_REQ(PendingListLock::Get()) { |
| AssignFlag(value, ZX_SYSTEM_WAKE_REPORT_ENTRY_FLAG_SIGNALED); |
| } |
| |
| void AssignHasBeenReported(bool value) TA_REQ(PendingListLock::Get()) { |
| AssignFlag(value, ZX_SYSTEM_WAKE_REPORT_ENTRY_FLAG_PREVIOUSLY_REPORTED); |
| } |
| |
| // -- Important -- |
| // |
| // Notes on the pending list, locks, and concurrency. You definitely want to |
| // read this if you are reading the wake source reporting generation code. It |
| // will provide an explanation about how this all works, why it is structured |
| // the way it is, and why it is all safe. |
| // |
| // It is a requirement that every wake source in the system which has become |
| // signaled since last being reported be present in any wake source report |
| // generated for a caller of `zx_system_suspend_enter`. They will continue to |
| // be reported to users until they have been *both* acknowledged, and reported |
| // at least once. |
| // |
| // The `pending_list_` holds the current list of wake events waiting to be |
| // reported. Members of the list should remain on the list provided that: |
| // |
| // 1) They have been signaled at some point in the past, at least once. |
| // 2) They are either not-yet-acknowledged, or not-yet-reported, or both. |
| // |
| // The PendingListLock is a spinlock used to protect the integrity of the |
| // `pending_list_`, however due to another requirement, it alone is not |
| // sufficient. Specifically, it is a requirement that generating a report can |
| // never hold off interrupt processing for O(n) time. This requirement would |
| // be violated if we had to hold the PendingListLock for the duration of a |
| // report-generation operation. To avoid violating this requirement, we drop |
| // the PendingListLock each time through the loop while iterating through the |
| // pending list during report generation. |
| // |
| // The need to drop the PendingListLock during iteration while generating a |
| // report leads to two other potential bad behaviors which we need to protect |
| // against: |
| // |
| // 1) During report generation, we are holding an iterator to an element in |
| // the list. This iterator cannot become invalidated during the period |
| // where we don't hold the lock. |
| // 2) We must never "double report" a wake source in a single report. IOW - |
| // if KOID X shows up once in the report generated for the user, it must |
| // not show up any more times _in that specific report_. |
| // |
| // There are a total of 4 operations which can affect report generation. They |
| // are: |
| // |
| // 1) Triggering. This will update the bookkeeping for a wake event, and add |
| // that event to the pending list if it was not already on the list. This |
| // operation is the only operation which takes place at hard IRQ time, and |
| // takes O(1) time. |
| // 2) Ack'ing. This will update the bookkeeping for a wake event, but will |
| // never remove the event from the list, even if it is now both reported |
| // and acknowledged. This operation always takes place in the context of a |
| // syscall made by user mode and is O(1). |
| // 3) Construction/Registration. Construction of a new wake event does |
| // not directly affect the pending_list_, but it does affect the total |
| // count of wake sources in the system which is a number which is also |
| // included in the wake source report. This is an O(1) operation. |
| // 4) Destruction/De-registration. Destruction of a wake event always |
| // unconditionally removes the event from the pending list, if it is on the |
| // list at the time of destruction. This is an O(1)) operation. |
| // |
| // -- Avoiding iterator invalidation -- |
| // |
| // Report generation holds the GlobalListLock (a mutex) for the duration of a |
| // report generation operation. Construction/Destruction (#3-4) operations |
| // must also hold the GlobalListLock meaning that they cannot invalidate a |
| // report operation's iterator as they cannot run concurrently with the report |
| // generation. Trigger operations (#1) only add items, and therefore cannot |
| // invalidate an intrusive list iterator. |
| // |
| // Ack operations (#2) could theoretically cause trouble if they were to |
| // remove an element from the list as soon as it became both acked and |
| // reported. While it would not result in UAF, it could cause the report to |
| // stop iteration early if the next element to report was acked and |
| // immediately removed from the list. To avoid this, ack operations will |
| // never remove an element from the list. Instead, they merely mark the |
| // element as ack'ed and depend on report generation to handle the removal for |
| // them, avoiding the invalidation issue in the process. |
| // |
| // Adopting this approach of having the report generation operation remove the |
| // ack'ed wake source, instead of using the GlobalListLock to synchronize, |
| // does two things for us. |
| // |
| // 1) It means that user mode ack operations will never need to obtain the |
| // GlobalListLock, and potentially block behind an O(n) report generation |
| // operation. |
| // 2) It means that it is possible for kernel code it ack kernel-owned |
| // interrupts which also happen to be wake sources at hard IRQ time. |
| // |
| // -- Avoiding double reports -- |
| // |
| // Construction/Destruction (#3-4) operations have no potential to produce a |
| // double report in the first place, but they also cannot run concurrently |
| // with report generation, so they have no potential to produce a double |
| // report. Likewise, ack'ing (#3) cannot produce a double report as it will |
| // never remove an element from the list, only update the bookkeeping. Even |
| // if it did actually remove elements from the list, it couldn't produce a |
| // double report. |
| // |
| // This means that only triggering (op #2) has the potential to produce a |
| // double report. A sequence which would produce this behavior would go like |
| // this. |
| // |
| // 1) While a report is being generated, event X is encountered. X has |
| // already been acknowledged, and now has certainly been reported, so X is |
| // removed from the pending list as the iterator is advanced to the next |
| // event, Y. The reporting thread then it drops the lock and starts to copy |
| // information into the user's buffer. |
| // 2) X is now triggered again. The IRQ handler grabs the pending lock, marks |
| // X as triggered, and adds it back to the end of the list, then drops the |
| // lock again. |
| // 3) The reporting thread locks the list again, and processes Y. It will |
| // advance down the list until it encounters X again, eventually adding |
| // it to the report a second time. |
| // |
| // Avoiding this situation is easy if we follow one simple rule. When an event |
| // becomes triggered, it should be added to the *front* of the list instead of |
| // the back. This ensures that once report generation has started and the |
| // initial iterator has been computed, no newly triggered events can show up |
| // in this report. They will have to wait to ride the next report-train. |
| // |
| DECLARE_SINGLETON_MUTEX(GlobalListLock); |
| DECLARE_SINGLETON_SPINLOCK(PendingListLock); |
| static GlobalList global_list_ TA_GUARDED(GlobalListLock::Get()); |
| static PendingList pending_list_ TA_GUARDED(PendingListLock::Get()); |
| |
| // Our parent wake_vector reference. This is only safe to access when the object has been |
| // instantiated and init'ed, but not yet destroyed. IOW - only when `active_` is true. |
| const WakeVector& wake_vector_; |
| TA_GUARDED(PendingListLock::Get()) zx_wake_source_report_entry_t report_info_ { 0 }; |
| }; |
| |
| inline bool WakeEvent::in_global_list() const { return fbl::InContainer<GlobalListTag>(*this); } |
| inline bool WakeEvent::in_pending_list() const { return fbl::InContainer<PendingListTag>(*this); } |
| |
| } // namespace wake_vector |
| |
| #endif // ZIRCON_KERNEL_LIB_WAKE_VECTOR_INCLUDE_LIB_WAKE_VECTOR_H_ |