// Copyright 2020 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#ifndef ZIRCON_KERNEL_LIB_ARCH_INCLUDE_LIB_ARCH_X86_CACHE_H_
#define ZIRCON_KERNEL_LIB_ARCH_INCLUDE_LIB_ARCH_X86_CACHE_H_

#include <lib/arch/x86/cpuid.h>
#include <zircon/assert.h>
#include <zircon/compiler.h>

#include <cstddef>
#include <cstdint>
#include <optional>
#include <type_traits>

namespace arch {

// Represents a single cache.
struct CpuCacheLevelInfo {
  size_t level;
  X86CacheType type;

  // The size, in KiB, of the cache available to each processor. In the case of
  // the last-level cache, however, this field might report the aggregate size
  // of all such caches on the package.
  size_t size_kb;

  // The number of sets in the cache available to each processor. In the case
  // of the last-level cache, however, this field might report the aggregate
  // number of sets across all such caches in the package.
  size_t number_of_sets;                  // Indeterminate if zero.
  size_t ways_of_associativity;           // Indeterminate if zero.
  std::optional<bool> fully_associative;  // Indeterminate if std::nullopt.
  // The number of bits to shift an APIC ID to get the associated "share ID":
  // processors with coinciding share IDs share this cache. If std::nullopt,
  // the shift is indeterminate.
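  //
  // For example (hypothetical values), with a share_id_shift of 2, logical
  // processors with APIC IDs 4 through 7 all have a share ID of 1
  // (4 >> 2 == 7 >> 2 == 1) and so would share this cache.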
  std::optional<size_t> share_id_shift;
};

// Gives information on the set of caches in a package.
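//
// A minimal usage sketch, given some CPUID I/O provider `io` (e.g.,
// arch::BootCpuidIo, where available):
//
//   arch::CpuCacheInfo caches(io);
//   for (const arch::CpuCacheLevelInfo& cache : caches) {
//     printf("L%zu cache: %zu KiB\n", cache.level, cache.size_kb);
//   }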
class CpuCacheInfo {
 public:
  using iterator = const CpuCacheLevelInfo*;
  using const_iterator = iterator;

  template <typename CpuidIoProvider,
            // So that this constructor does not take precedence over the copy
            // and move constructors.
            typename = std::enable_if_t<!std::is_same_v<CpuidIoProvider, CpuCacheInfo>>>
  explicit CpuCacheInfo(CpuidIoProvider&& io) {
    // We first try the Intel v2 leaves, and then the AMD v2 leaves.
    // Hypervisors on AMD hosts might lay out CPUID values in the Intel style,
    // and there is no harm in trying the Intel leaves first in general, as
    // AMD hardware tends to reserve them as zero.

    if (TryV2Topology<CpuidIntelCacheTopologyA, CpuidIntelCacheTopologyB, CpuidIntelCacheTopologyC,
                      CpuidMaximumLeaf>(io)) {
      return;
    }

    // [amd/vol3]: E.4.15 Function 8000_001Dh—Cache Topology Information.
    //
    // CpuidAmdCacheTopologyA's leaf (0x8000'001d) is reserved if the
    // topology extension feature is not implemented.
    if (io.template Read<CpuidAmdFeatureFlagsC>().topology_extensions() &&
        TryV2Topology<CpuidAmdCacheTopologyA, CpuidAmdCacheTopologyB,  //
                      CpuidAmdCacheTopologyC, CpuidMaximumExtendedLeaf>(io)) {
      return;
    }

    // The extended leaves explicitly enumerate information about the L1d,
    // L1i, L2, and L3 caches; this was the original means of determining
    // cache topology on AMD.
    if (io.template Read<CpuidMaximumExtendedLeaf>().leaf() >= CpuidL3CacheInformation::kLeaf) {
      const auto l1d = io.template Read<CpuidL1DataCacheInformation>();
      const auto l1i = io.template Read<CpuidL1InstructionCacheInformation>();
      const auto l2 = io.template Read<CpuidL2CacheInformation>();
      const auto l3 = io.template Read<CpuidL3CacheInformation>();

      caches_[0] = {
          .level = 1,
          .type = X86CacheType::kData,
          .size_kb = l1d.size_kb(),
          .ways_of_associativity = l1d.ways_of_associativity(),
          .fully_associative = l1d.fully_associative(),
      };
      caches_[1] = {
          .level = 1,
          .type = X86CacheType::kInstruction,
          .size_kb = l1i.size_kb(),
          .ways_of_associativity = l1i.ways_of_associativity(),
          .fully_associative = l1i.fully_associative(),
      };
      caches_[2] = {
          .level = 2,
          .type = X86CacheType::kUnified,
          .size_kb = l2.size_kb(),
          .ways_of_associativity = l2.ways_of_associativity(),
          .fully_associative = l2.fully_associative(),
      };
      size_ = 3;

      if (l3.size()) {
        caches_[3] = {
            .level = 3,
            .type = X86CacheType::kUnified,
            // [amd/vol3]: E.4.5 Function 8000_0006h—L2 Cache and TLB and L3 Cache Information.
            //
            // `l3.size()` actually provides bounds on the total size of the
            // L3 cache across the package, in terms of 512 KiB blocks:
            //   l3.size() * 512 ≤ total size in KiB < (l3.size() + 1) * 512
            // In practice, the total size is a multiple of 512 KiB, so this
            // reports the actual total size.
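            //
            // For example (hypothetical value), an `l3.size()` of 32 would
            // correspond to a 16 MiB package-wide L3.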
            .size_kb = 512 * l3.size(),
            .ways_of_associativity = l3.ways_of_associativity(),
            .fully_associative = l3.fully_associative(),
        };
        size_ = 4;
      }
    }
  }

  CpuCacheInfo() = delete;

  iterator begin() const { return caches_; }

  iterator end() const { return caches_ + size_; }

  size_t size() const { return size_; }

  bool empty() const { return size_ == 0; }

  // Returns information on the last-level cache.
  const CpuCacheLevelInfo& back() const {
    ZX_DEBUG_ASSERT(size_ > 0);
    return caches_[size_ - 1];
  }

 private:
  // A split L1 (data and instruction caches) plus unified L2, L3, and L4
  // caches makes five in total.
  static constexpr size_t kMaxNumCaches = 5;

  // We templatize this so that we can supply either the Intel or AMD v2 cache
  // topology leaves (0x4 and 0x8000'001d, respectively), which are
  // identically laid out.
  template <template <uint32_t> class CacheTopologyA,  //
            template <uint32_t> class CacheTopologyB,  //
            template <uint32_t> class CacheTopologyC,  //
            typename MaximumLeaf,                      //
            typename CpuidIoProvider>
  bool TryV2Topology(CpuidIoProvider&& io) {
    if (io.template Read<MaximumLeaf>().leaf() < CacheTopologyA<0>::kLeaf) {
      return false;
    }

    // Clear any state left over from a previous, unsuccessful enumeration
    // attempt, so that `size_` counts only the caches recorded here.
    size_ = 0;
    for (size_t i = 0; i < kMaxNumCaches; ++i, ++size_) {
      const auto eax = Read<CacheTopologyA>(io, i);
      const auto ebx = Read<CacheTopologyB>(io, i);
      const auto ecx = Read<CacheTopologyC>(io, i);

      if (eax.cache_type() == X86CacheType::kNull) {
        break;
      }
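      // Each of the following register fields encodes its quantity minus one;
      // the cache size in bytes is the product of the decoded values.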
      const size_t size_bytes = (ebx.ways() + 1) * (ebx.physical_line_partitions() + 1) *
                                (ebx.system_coherency_line_size() + 1) * (ecx.sets() + 1);
      caches_[i] = {
          .level = eax.cache_level(),
          .type = eax.cache_type(),
          .size_kb = size_bytes / 1024,
          .number_of_sets = ecx.sets() + 1,
          .ways_of_associativity = ebx.ways() + 1,
          .fully_associative = eax.fully_associative() != 0,
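          // The shift is the ceiling of log2 of the maximum number of logical
          // processors that might share the cache: for example, a cache shared
          // by at most 12 logical processors would get a shift of 4.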
          .share_id_shift = CeilLog2(eax.max_sharing_logical_processors() + 1),
      };
    }
    // We expect at least split L1 caches and an L2 cache. If, for whatever
    // reason, fewer than expected were encoded, fall back to other means of
    // populating `caches_`.
    return size_ >= 3;
  }

  // A shim to dynamically look up statically parametrized values.
  template <template <uint32_t> class CpuidValueType,  //
            typename CpuidIoProvider>
  static auto Read(CpuidIoProvider&& io, size_t n) {
    switch (n) {
      case 0:
        return io.template Read<CpuidValueType<0>>();
      case 1:
        return io.template Read<CpuidValueType<1>>();
      case 2:
        return io.template Read<CpuidValueType<2>>();
      case 3:
        return io.template Read<CpuidValueType<3>>();
      case 4:
        return io.template Read<CpuidValueType<4>>();
      default:
        static_assert(kMaxNumCaches == 5);
        ZX_DEBUG_ASSERT(n < kMaxNumCaches);
        __UNREACHABLE;
    }
  }

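  // Returns ceil(log2(n)); for example, CeilLog2(12) == 4, since
  // 2^3 < 12 <= 2^4.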
  // TODO(C++20): use `cpp20::bit_width(n - 1)` instead.
  static size_t CeilLog2(size_t n) {
    ZX_DEBUG_ASSERT(n > 0);
    size_t exp = 0;
    while ((size_t{1} << exp) < n) {
      exp++;
    }
    return exp;
  }

  CpuCacheLevelInfo caches_[kMaxNumCaches];

  // The number of `caches_` entries on which we have information.
  size_t size_ = 0;
};

}  // namespace arch

#endif  // ZIRCON_KERNEL_LIB_ARCH_INCLUDE_LIB_ARCH_X86_CACHE_H_