| // Copyright 2020 The Fuchsia Authors |
| // |
| // Use of this source code is governed by a MIT-style |
| // license that can be found in the LICENSE file or at |
| // https://opensource.org/licenses/MIT |
| |
| #include "kernel/cpu_search_set.h" |
| |
| #include <assert.h> |
| #include <debug.h> |
| #include <inttypes.h> |
| #include <stddef.h> |
| |
| #include <fbl/alloc_checker.h> |
| #include <kernel/cpu_distance_map.h> |
| #include <ktl/algorithm.h> |
| #include <ktl/span.h> |
| #include <ktl/unique_ptr.h> |
| |
| #include <ktl/enforce.h> |
| |
| CpuSearchSet::ClusterSet CpuSearchSet::cluster_set_; |
| |
| namespace { |
| |
| // Utility type compute CPU clusters using a disjoint set structure. |
| class ClusterMap { |
| public: |
| ClusterMap() = default; |
| |
| ClusterMap(const ClusterMap&) = delete; |
| ClusterMap& operator=(const ClusterMap&) = delete; |
| ClusterMap(ClusterMap&&) = default; |
| ClusterMap& operator=(ClusterMap&&) = default; |
| |
| explicit operator bool() const { return !elements_.is_empty(); } |
| |
| // Creates a ClusterMap with the given number of CPUs, with each CPU initially |
| // in its own cluster. |
| static ClusterMap Create(size_t element_count) { |
| fbl::AllocChecker checker; |
| fbl::Vector<cpu_num_t> elements; |
| elements.resize(element_count, &checker); |
| ASSERT(checker.check()); |
| |
| for (cpu_num_t i = 0; i < element_count; i++) { |
| elements[i] = i; |
| } |
| |
| return ClusterMap{ktl::move(elements)}; |
| } |
| |
| // Returns an iterator over the elements of the disjoint set structure. |
| auto iterator() { return ktl::span{elements_.begin(), elements_.size()}; } |
| auto begin() { return iterator().begin(); } |
| auto end() { return iterator().end(); } |
| |
| cpu_num_t operator[](size_t index) const { return elements_[index]; } |
| |
| cpu_num_t FindSet(cpu_num_t node) { |
| while (true) { |
| cpu_num_t parent = elements_[node]; |
| cpu_num_t grandparent = elements_[parent]; |
| if (parent == grandparent) { |
| return parent; |
| } |
| elements_[node] = grandparent; |
| node = parent; |
| } |
| } |
| |
| void UnionSets(cpu_num_t a, cpu_num_t b) { |
| while (true) { |
| cpu_num_t root_a = FindSet(a); |
| cpu_num_t root_b = FindSet(b); |
| |
| if (root_a < root_b) { |
| elements_[root_b] = root_a; |
| } else if (root_a > root_b) { |
| elements_[root_a] = root_b; |
| } else { |
| return; |
| } |
| } |
| } |
| |
| size_t ClusterCount() const { |
| size_t count = 0; |
| for (size_t i = 0; i < elements_.size(); i++) { |
| if (elements_[i] == i) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| size_t MemberCount(cpu_num_t root) { |
| size_t count = 0; |
| for (cpu_num_t i = 0; i < elements_.size(); i++) { |
| if (FindSet(i) == root) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| private: |
| explicit ClusterMap(fbl::Vector<cpu_num_t> elements) : elements_{ktl::move(elements)} {} |
| |
| fbl::Vector<cpu_num_t> elements_{}; |
| }; |
| |
| } // anonymous namespace |
| |
| CpuSearchSet::ClusterSet CpuSearchSet::DoAutoCluster(size_t cpu_count, const CpuDistanceMap& map) { |
| ClusterMap cluster_map = ClusterMap::Create(cpu_count); |
| |
| // Perform a single pass of agglomerative clustering, joining CPUs with |
| // distances below the significant distance threshold into the same cluster. |
| for (cpu_num_t i = 0; i < cpu_count; i++) { |
| for (cpu_num_t j = i + 1; j < cpu_count; j++) { |
| if (map[{i, j}] < map.distance_threshold()) { |
| cluster_map.UnionSets(i, j); |
| } |
| } |
| } |
| |
| // Allocate vector of Cluster structures with an entry for each computed |
| // cluster. |
| fbl::AllocChecker checker; |
| fbl::Vector<Cluster> clusters; |
| clusters.resize(cluster_map.ClusterCount(), &checker); |
| ASSERT(checker.check()); |
| |
| // Alllocate a vector of MapEntry structures with an entry for each CPU. |
| fbl::Vector<MapEntry> cpu_to_cluster_map; |
| cpu_to_cluster_map.resize(cpu_count, &checker); |
| ASSERT(checker.check()); |
| |
| // Fill in the Cluster structures and CPU-to-cluster map. |
| size_t cluster_index = 0; |
| for (cpu_num_t i = 0; i < cpu_count; i++) { |
| const size_t member_count = cluster_map.MemberCount(i); |
| if (member_count != 0) { |
| Cluster& cluster = clusters[cluster_index]; |
| cluster.id = cluster_index; |
| cluster.members.resize(member_count, &checker); |
| ASSERT(checker.check()); |
| |
| size_t member_index = 0; |
| for (cpu_num_t j = 0; j < cpu_count; j++) { |
| if (cluster_map[j] == i) { |
| cluster.members[member_index] = j; |
| cpu_to_cluster_map[j] = {&cluster, member_index}; |
| member_index++; |
| } |
| } |
| cluster_index++; |
| } |
| } |
| |
| return {ktl::move(clusters), ktl::move(cpu_to_cluster_map)}; |
| } |
| |
| void CpuSearchSet::DumpClusters() { |
| dprintf(INFO, "CPU clusters:\n"); |
| for (const Cluster& cluster : cluster_set_.iterator()) { |
| dprintf(INFO, "Cluster %2zu: ", cluster.id); |
| for (size_t j = 0; j < cluster.members.size(); j++) { |
| dprintf(INFO, "%" PRIu32 "%s", cluster.members[j], |
| j < cluster.members.size() - 1 ? ", " : ""); |
| } |
| dprintf(INFO, "\n"); |
| } |
| } |
| |
| // Initializes the search set with a unique CPU order that minimizes cache level |
| // crossings while attempting to maximize distribution across CPUs. |
| void CpuSearchSet::DoInitialize(cpu_num_t this_cpu, size_t cpu_count, const ClusterSet& cluster_set, |
| const CpuDistanceMap& map) { |
| this_cpu_ = this_cpu; |
| |
| // Initialize the search set in increasing ordinal order. |
| cpu_count_ = cpu_count; |
| for (cpu_num_t i = 0; i < cpu_count; i++) { |
| const size_t cluster = cluster_set.cpu_to_cluster_map[i].cluster->id; |
| ordered_cpus_[i] = {i, cluster}; |
| } |
| |
| // Sort the search set by these criteria in priority order: |
| // 1. Cache distance from this CPU. |
| // 2. Modular cluster order, offset by this CPU's cluster id. |
| // 3. Modular cluster member order, offset by this CPU's cluster member index. |
| // |
| // These criteria result in a relaxed Latin Square with the following |
| // properties: |
| // * A CPU is always at the front of its own search list (distance is 0). |
| // * The search list is ordered by increasing cache distance. |
| // * The search order is reasonably unique compared to other CPUs (a CPU is |
| // found as few times as possible at a given offset in all search lists). |
| // |
| const auto comparator = [this_cpu, &cluster_set, &map](const Entry& a, const Entry& b) { |
| // TODO(eieio): Temporary hack to put little cores at the beginning of the search sets for |
| // better idle power consumption. This can be removed or revised when a power model is |
| // introduced to better handle power vs. performance tradeoffs. |
| if (perf_scales_[a.cpu] != perf_scales_[b.cpu]) { |
| return perf_scales_[a.cpu] < perf_scales_[b.cpu]; |
| } |
| |
| const auto distance_a = map[{this_cpu, a.cpu}]; |
| const auto distance_b = map[{this_cpu, b.cpu}]; |
| if (distance_a != distance_b) { |
| return distance_a < distance_b; |
| } |
| |
| const auto [this_cluster, this_index] = cluster_set.cpu_to_cluster_map[this_cpu]; |
| const auto [a_cluster, a_index] = cluster_set.cpu_to_cluster_map[a.cpu]; |
| const auto [b_cluster, b_index] = cluster_set.cpu_to_cluster_map[b.cpu]; |
| |
| const size_t cluster_count = cluster_set.clusters.size(); |
| const auto a_cluster_prime = (this_cluster->id + cluster_count - a_cluster->id) % cluster_count; |
| const auto b_cluster_prime = (this_cluster->id + cluster_count - b_cluster->id) % cluster_count; |
| if (a_cluster_prime != b_cluster_prime) { |
| return a_cluster_prime < b_cluster_prime; |
| } |
| |
| const auto a_count = a_cluster->members.size(); |
| const auto b_count = b_cluster->members.size(); |
| const size_t a_index_prime = a_cluster->members[(this_index + a_count - a_index) % a_count]; |
| const size_t b_index_prime = b_cluster->members[(this_index + b_count - b_index) % b_count]; |
| return a_index_prime < b_index_prime; |
| }; |
| |
| ktl::stable_sort(iterator().begin(), iterator().end(), comparator); |
| } |
| |
| void CpuSearchSet::Dump() const { |
| dprintf(INFO, "CPU %2" PRIu32 ": ", this_cpu_); |
| for (cpu_num_t i = 0; i < cpu_count_; i++) { |
| dprintf(INFO, "%2" PRIu32 "%s", ordered_cpus_[i].cpu, i < cpu_count_ - 1 ? ", " : ""); |
| } |
| dprintf(INFO, "\n"); |
| } |