// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#ifndef ZIRCON_KERNEL_ARCH_ARM64_INCLUDE_ARCH_ARM64_MP_H_
#define ZIRCON_KERNEL_ARCH_ARM64_INCLUDE_ARCH_ARM64_MP_H_

#define PERCPU_IN_RESTRICTED_MODE 32

#ifndef __ASSEMBLER__

#include <zircon/compiler.h>

#include <arch/arm64.h>
#include <arch/arm64/feature.h>
#include <kernel/align.h>
#include <kernel/cpu.h>
#include <ktl/bit.h>
#include <ktl/type_traits.h>

// bits for mpidr register
#define MPIDR_AFF0_MASK 0xFFULL
#define MPIDR_AFF0_SHIFT 0
#define MPIDR_AFF1_MASK (0xFFULL << 8)
#define MPIDR_AFF1_SHIFT 8
#define MPIDR_AFF2_MASK (0xFFULL << 16)
#define MPIDR_AFF2_SHIFT 16
#define MPIDR_AFF3_MASK (0xFFULL << 32)
#define MPIDR_AFF3_SHIFT 32

// construct an ARM MPID from cluster (AFF1) and cpu number (AFF0)
#define ARM64_MPID(cluster, cpu)                          \
  ((((cluster) << MPIDR_AFF1_SHIFT) & MPIDR_AFF1_MASK) |  \
   (((cpu) << MPIDR_AFF0_SHIFT) & MPIDR_AFF0_MASK))

#define ARM64_MPIDR_MASK (MPIDR_AFF3_MASK | MPIDR_AFF2_MASK | MPIDR_AFF1_MASK | MPIDR_AFF0_MASK)
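
// A worked example of the macro above (illustrative only, not used by the
// kernel): a cpu in cluster 1 with per-cluster cpu number 2 packs AFF1=1 and
// AFF0=2, so
//
//   ARM64_MPID(1, 2) == (1u << MPIDR_AFF1_SHIFT) | (2u << MPIDR_AFF0_SHIFT)
//                    == 0x102
//
// Each affinity field is masked to 8 bits by the corresponding *_MASK value.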

// TODO: add support for AFF2 and AFF3

struct percpu;

// Per-cpu structure, pointed to by a fixed register while in kernel mode.
// Aligned on the maximum architectural cache line to avoid cache
// line sharing between cpus.
struct arm64_percpu {
  // cpu number
  cpu_num_t cpu_num;

  // Whether blocking is disallowed. See arch_blocking_disallowed().
  uint32_t blocking_disallowed;

  // Number of spinlocks currently held.
  uint32_t num_spinlocks;

  // Microarchitecture of this cpu (e.g. Cortex-A53)
  arm64_microarch microarch;

  // True if the branch predictor should be invalidated during context switch
  // to mitigate Spectre V2 attacks.
  bool should_invalidate_bp_on_context_switch;

  // True if the branch predictor should be invalidated on suspicious entries
  // to EL1 from EL0 to mitigate Spectre V2 attacks.
  bool should_invalidate_bp_on_el0_exception;

  // A pointer providing fast access to the high-level arch-agnostic per-cpu struct.
  percpu* high_level_percpu;

  // Flag to track that we're in restricted mode.
  uint32_t in_restricted_mode;
} __CPU_ALIGN;
static_assert(offsetof(struct arm64_percpu, in_restricted_mode) == PERCPU_IN_RESTRICTED_MODE,
              "in_restricted_mode is at the wrong offset");
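
// Illustrative note (an inference from the layout above, not a new
// interface): PERCPU_IN_RESTRICTED_MODE is a bare numeric offset defined
// outside the !__ASSEMBLER__ guard so that assembly code can reach the flag
// straight off the percpu register, along the lines of
//
//   ldr w9, [x20, #PERCPU_IN_RESTRICTED_MODE]  // hypothetical asm snippet
//
// The static_assert above keeps the hand-maintained constant in sync with
// the actual struct layout.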

void arch_init_cpu_map(uint cluster_count, const uint* cluster_cpus);
void arch_register_mpid(uint cpu_id, uint64_t mpid);
void arm64_init_percpu_early();

extern uint arm_num_cpus;
extern uint arm64_cpu_cluster_ids[SMP_MAX_CPUS];
extern uint arm64_cpu_cpu_ids[SMP_MAX_CPUS];

// Use the x20 register to always point at the local cpu structure for fast access.
// x20 is the first available callee-saved register that clang will allow to be marked
// as fixed (via the -ffixed-x20 command-line flag). Since it is callee-saved, the
// register is naturally saved and restored across firmware calls to PSCI or SMCCC.
inline void arm64_write_percpu_ptr(struct arm64_percpu* percpu) {
  __asm__ volatile("mov x20, %0" ::"r"(percpu));
}

inline struct arm64_percpu* arm64_read_percpu_ptr() {
  struct arm64_percpu* p;
  __asm__ volatile("mov %0, x20" : "=r"(p));
  return p;
}
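
// Usage sketch (illustrative only; the array name and boot path are
// hypothetical, not part of this interface): during early bring-up each cpu
// would install its own percpu entry before any of the accessors below are
// used, e.g.
//
//   static arm64_percpu percpus[SMP_MAX_CPUS];
//   arm64_write_percpu_ptr(&percpus[cpu_num]);
//   DEBUG_ASSERT(arm64_read_percpu_ptr() == &percpus[cpu_num]);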

// The asm below is marked volatile to force a read of the field each time it
// is asked for, so the compiler always emits a read and never caches a copy
// between accesses. For the same reason, this can't be done through a pointer
// obtained from arm64_read_percpu_ptr(): the compiler could copy x20 into
// another register and keep using the stale copy after a reschedule.
template <typename T, size_t Offset>
[[gnu::always_inline]] inline T arm64_read_percpu_field() {
  static_assert((Offset & (alignof(T) - 1)) == 0, "Bad offset alignment");

  // The `ldr` instruction is 64 bits or 32 bits depending on the register name
  // used. All the load instructions for sizes smaller than 64 bits use the
  // 32-bit register form, but sizes smaller than 32 bits have their own load
  // instructions: `ldrh` for 16 bits, `ldrb` for 8 bits. These still write a
  // full 32-bit register, so the output operand is uint32_t. Narrow the
  // result to the actual value type.
  constexpr auto narrow = [](uint32_t value) -> T {
    if constexpr (sizeof(T) > sizeof(uint32_t)) {
      PANIC("unreachable");
      return {};
    } else if constexpr (ktl::is_signed_v<T>) {
      using U = ktl::make_unsigned_t<T>;
      return ktl::bit_cast<T>(static_cast<U>(value));
    } else {
      return static_cast<T>(value);
    }
  };

  if constexpr (sizeof(T) == sizeof(uint32_t)) {
    T value;
    __asm__ volatile("ldr %w[val], [x20, %[offset]]" : [val] "=r"(value) : [offset] "Ir"(Offset));
    return value;
  } else if constexpr (sizeof(T) == sizeof(uint16_t)) {
    uint32_t value;
    __asm__ volatile("ldrh %w[val], [x20, %[offset]]" : [val] "=r"(value) : [offset] "Ir"(Offset));
    return narrow(value);
  } else if constexpr (sizeof(T) == sizeof(uint8_t)) {
    uint32_t value;
    __asm__ volatile("ldrb %w[val], [x20, %[offset]]" : [val] "=r"(value) : [offset] "Ir"(Offset));
    return narrow(value);
  } else {
    static_assert(sizeof(T) == sizeof(uint64_t));
    T value;
    __asm__ volatile("ldr %[val], [x20, %[offset]]" : [val] "=r"(value) : [offset] "Ir"(Offset));
    return value;
  }
}
#define READ_PERCPU_FIELD(field) \
  (arm64_read_percpu_field<decltype(arm64_percpu::field), offsetof(arm64_percpu, field)>())
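
// Example use of the macro (illustrative only): the field type and offset are
// derived from the struct definition, so
//
//   uint32_t held = READ_PERCPU_FIELD(num_spinlocks);
//
// instantiates arm64_read_percpu_field<uint32_t,
// offsetof(arm64_percpu, num_spinlocks)>() and compiles down to a single
// 32-bit ldr from [x20, #offset].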

template <typename T, size_t Offset>
[[gnu::always_inline]] inline void arm64_write_percpu_field(T value) {
  static_assert((Offset & (alignof(T) - 1)) == 0, "Bad offset alignment");

  // The `str` instruction is 64 bits or 32 bits depending on the register name
  // used. All the store instructions for sizes smaller than 64 bits use the
  // 32-bit register form, but sizes smaller than 32 bits have their own store
  // instructions: `strh` for 16 bits, `strb` for 8 bits. In all cases the
  // "unused" bits of the register are ignored. But just in case, zero-extend
  // the value from its actual bit width up to uint32_t since the asm operand
  // uses uint32_t to reflect that the 32-bit operand register appears in asm.
  constexpr auto widen = [](T value) -> uint32_t {
    if constexpr (sizeof(T) > sizeof(uint32_t)) {
      PANIC("unreachable");
      return 0;
    } else if constexpr (ktl::is_signed_v<T>) {
      using U = ktl::make_unsigned_t<T>;
      return ktl::bit_cast<U>(value);
    } else {
      return value;
    }
  };

  if constexpr (sizeof(T) == sizeof(uint32_t)) {
    __asm__ volatile("str %w[val], [x20, %[offset]]"
                     :
                     : [val] "r"(value), [offset] "Ir"(Offset)
                     : "memory");
  } else if constexpr (sizeof(T) == sizeof(uint16_t)) {
    __asm__ volatile("strh %w[val], [x20, %[offset]]"
                     :
                     : [val] "r"(widen(value)), [offset] "Ir"(Offset)
                     : "memory");
  } else if constexpr (sizeof(T) == sizeof(uint8_t)) {
    __asm__ volatile("strb %w[val], [x20, %[offset]]"
                     :
                     : [val] "r"(widen(value)), [offset] "Ir"(Offset)
                     : "memory");
  } else {
    static_assert(sizeof(T) == sizeof(uint64_t));
    __asm__ volatile("str %[val], [x20, %[offset]]"
                     :
                     : [val] "r"(value), [offset] "Ir"(Offset)
                     : "memory");
  }
}
#define WRITE_PERCPU_FIELD(field, value) \
  (arm64_write_percpu_field<decltype(arm64_percpu::field), offsetof(arm64_percpu, field)>(value))
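
// Example use of the macro (illustrative only): the matching store side,
//
//   WRITE_PERCPU_FIELD(blocking_disallowed, 1u);
//
// instantiates arm64_write_percpu_field<uint32_t,
// offsetof(arm64_percpu, blocking_disallowed)>(1u) and emits a single 32-bit
// str to [x20, #offset], with a "memory" clobber so the compiler does not
// reorder it against surrounding memory accesses.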

// Return a pointer to the high-level percpu struct for the calling CPU.
inline struct percpu* arch_get_curr_percpu() { return READ_PERCPU_FIELD(high_level_percpu); }

inline cpu_num_t arch_curr_cpu_num() { return READ_PERCPU_FIELD(cpu_num); }

// TODO(https://fxbug.dev/42108040) get num_cpus from topology.
// This needs to be set very early (before arch_init).
inline void arch_set_num_cpus(uint cpu_count) { arm_num_cpus = cpu_count; }

inline uint arch_max_num_cpus() { return arm_num_cpus; }

// translate a cpu number back to the cluster ID (AFF1)
inline uint arch_cpu_num_to_cluster_id(cpu_num_t cpu) {
  DEBUG_ASSERT(cpu < SMP_MAX_CPUS);

  return arm64_cpu_cluster_ids[cpu];
}

// translate a cpu number back to the MP cpu number within a cluster (AFF0)
inline uint arch_cpu_num_to_cpu_id(cpu_num_t cpu) {
  DEBUG_ASSERT(cpu < SMP_MAX_CPUS);

  return arm64_cpu_cpu_ids[cpu];
}

// Translate a CPU number back to the MPIDR of the CPU.
uint64_t arch_cpu_num_to_mpidr(cpu_num_t cpu_num);

// translate mpidr to cpu number
cpu_num_t arm64_mpidr_to_cpu_num(uint64_t mpidr);
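
// Sketch of how these translations relate (illustrative only, and assuming
// only AFF0/AFF1 are populated, per the TODO above): for a registered cpu
// number, the cluster/cpu pair should recompose into an MPID that maps back
// to the same cpu number, e.g.
//
//   uint64_t mpid = ARM64_MPID(arch_cpu_num_to_cluster_id(cpu),
//                              arch_cpu_num_to_cpu_id(cpu));
//   DEBUG_ASSERT(arm64_mpidr_to_cpu_num(mpid) == cpu);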

// Setup the high-level percpu struct pointer for |cpu_num|.
void arch_setup_percpu(cpu_num_t cpu_num, struct percpu* percpu);

inline void arch_set_restricted_flag(bool restricted) {
  WRITE_PERCPU_FIELD(in_restricted_mode, restricted ? 1 : 0);
}

inline bool arch_get_restricted_flag() { return READ_PERCPU_FIELD(in_restricted_mode); }

#endif  // !__ASSEMBLER__

#endif  // ZIRCON_KERNEL_ARCH_ARM64_INCLUDE_ARCH_ARM64_MP_H_