// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "thread-storage.h"
#include <lib/elfldltl/machine.h>
#include <zircon/assert.h>
#include <cassert>
#include <concepts>
#include <utility>
#include "shadow-call-stack.h"
#include "threads_impl.h"
namespace LIBC_NAMESPACE_DECL {
namespace {
using TlsLayout = elfldltl::TlsLayout<>;
// If the thread's ZX_PROP_NAME is empty, the VMO will use this name instead.
constexpr std::string_view kVmoName = "thread-stacks+TLS";
// Each of the blocks owned by ThreadStorage is represented by a Block
// specialization for some &ThreadStorage::member_. Each class below meets
// this same API contract.
template <typename T>
concept BlockType = requires(T& t, ThreadStorage& storage, PageRoundedSize tls_size,
zx::unowned_vmar vmar, AllocationVmo& vmo) {
// Returns the size needed in the AllocationVmo.
{ std::as_const(t).VmoSize(std::as_const(storage), tls_size) } -> std::same_as<PageRoundedSize>;
// Maps the block into the VMAR from the AllocationVmo.
{ t.Map(std::as_const(storage), tls_size, vmar->borrow(), vmo).is_ok() } -> std::same_as<bool>;
// Commits the block to the successfully-allocated ThreadStorage object.
{ t.Commit(storage) };
};
template <BlockType T>
using BlockTypeCheck = T;
template <auto Member>
class Block;
template <auto Member>
using BlockFor = BlockTypeCheck<Block<Member>>;
// This handles each of the stack blocks.
template <uintptr_t ThreadStorage::* Member>
class Block<Member> {
public:
PageRoundedSize VmoSize(const ThreadStorage& storage, PageRoundedSize tls_size) const {
return storage.stack_size();
}
auto Map(const ThreadStorage& storage, PageRoundedSize tls_size, zx::unowned_vmar vmar,
AllocationVmo& vmo) {
PageRoundedSize guard_below, guard_above;
if constexpr (ThreadStorage::StackGrowsUp(Member)) {
guard_above = storage.guard_size();
} else {
guard_below = storage.guard_size();
}
return block_.Allocate<uint64_t>(vmar->borrow(), vmo, storage.stack_size(), guard_below,
guard_above);
}
void Commit(ThreadStorage& storage) { storage.*Member = block_.release(); }
private:
GuardedPageBlock block_;
};
// This handles shadow_call_stack_ when it's a no-op.
template <NoShadowCallStack ThreadStorage::* Member>
class Block<Member> {
public:
PageRoundedSize VmoSize(const ThreadStorage& storage, PageRoundedSize tls_size) const {
return {};
}
zx::result<std::span<uint64_t>> Map(const ThreadStorage& storage, PageRoundedSize tls_size,
zx::unowned_vmar vmar, AllocationVmo& vmo) {
return zx::ok(std::span<uint64_t>{});
}
void Commit(ThreadStorage& storage) {}
};
// This handles thread_block_, which includes both the TCB and the
// (runtime-dynamic) static TLS segments. It always gets one-page guards both
// above and below, regardless of the configured guard size for the stacks.
template <GuardedPageBlock ThreadStorage::* Member>
class Block<Member> {
public:
PageRoundedSize VmoSize(const ThreadStorage& storage, PageRoundedSize tls_size) const {
return tls_size;
}
auto Map(const ThreadStorage& storage, PageRoundedSize tls_size, zx::unowned_vmar vmar,
AllocationVmo& vmo) {
const PageRoundedSize page_size = PageRoundedSize::Page();
return block_.Allocate(vmar->borrow(), vmo, tls_size, page_size, page_size);
}
void Commit(ThreadStorage& storage) { storage.*Member = std::move(block_); }
private:
GuardedPageBlock block_;
};
// This is the result of computations for allocating the thread block. The
// PT_TLS p_align fields affect the computations within, but regardless the
// whole block is always page-aligned (so any p_align larger than a page is
// effectively treated as only a page). This is allocated via GuardedPageBlock
// with guards both above and below to minimize chances of overruns out of TLS
// into something else or out of something else into TLS or the TCB. (In the
// x86 kTlsNegative layout, overruns from TLS into the TCB are unguarded,
// while elsewhere only underruns from TLS back into the TCB are unguarded.)
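// Allocate() maps a block of exactly the resulting .size bytes and then
// places $tp at .tp_offset bytes from the start of that block.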
struct ThreadBlockSize {
PageRoundedSize size; // Total size to allocate for TLS + TCB.
size_t tp_offset; // Point $tp this far into the allocated block.
};
// These abbreviations are used in the layout descriptions below:
// * $tp: the thread pointer, as __builtin_thread_pointer() returns
// * TLSn: the PT_TLS segment for static TLS module with ID n (n > 0)
// - When the main executable has a PT_TLS of its own, then it has ID 1
// and its $tp offset is fixed by an ABI calculation at static link time.
// * Sn: the runtime address in a given thread where the TLSn block starts
// - This must lie at an address 0 mod p_align, occupying p_memsz bytes.
// - (Sn - $tp) is the value that appears in GOT slots for IE accesses.
// - (S1 - $tp) for LE accesses is fixed by the ABI given TLS1.p_align.
// * DTV: the Dynamic Thread Vector of traditional dynamic TLS implementations
// - Not part of any public ABI contract, but sometimes described as if so.
// * TCB: the Thread object
// - This is a private implementation detail of libc (mostly).
// - The <zircon/tls.h> Fuchsia Compiler ABI slots lie within this object,
// though they are expressed as byte offsets from $tp.
// - When kTlsLocalExecOffset is nonzero (ARM), it describes the final bytes
// of the Thread object (psABI), so the Thread object in memory will
// straddle $tp. The ABI specifies that this much space above $tp is
// reserved for the implementation and says nothing about what goes there.
// - When kTpSelfPointer is nonzero (x86), the void* directly at $tp must be
// set to the $tp address itself. Only x86 has this and also only x86 has
// kTlsNegative, so this first word above $tp is not part of the layout
// calculations imposed by the ABI for TLS; instead, it's just a runtime
// requirement and forms the first part of the Thread object.
// - In traditional implementations, the second word above $tp (either in
// the private TCB or in the ABI-specified reserved area) holds the DTV.
// This has never been part of any ABI contract, except one private to
// some particular dynamic linker, its __tls_get_addr implementation, and
// its TLSDESC implementation hooks.
// * T: the runtime address of the Thread object
// * FC: the <zircon/tls.h> Fuchsia Compiler ABI slots
// - <zircon/tls.h> defines two per-machine offsets from $tp for ABI use
// - These are used by --target=*-fuchsia compilers by default, but are not
// part of the basic machine ABI or the Fuchsia System ABI. They are not
// used by the startup dynamic linker or vDSO, nor provided by the system
// program loader. They are only provided here in libc.
// - This implementation includes these in the TCB.
// * psABI: ELF processor-specific ABI-mandated kTlsLocalExecOffset space
// - The psABI for ARM and AArch64 specifies this, but not its use.
// - Traditional implementations have used it just like the first two words
// past $tp on x86: a $tp self-pointer (though nothing uses that); and the
// DTV (a private implementation detail).
// TCB is used to refer to the whole Thread object and also sometimes to
// distinguish FC and psABI from the rest of the Thread object. A future
// implementation might have clearer distinctions in the data structures.
template <class TlsTraits = elfldltl::TlsTraits<>>
requires(TlsTraits::kTlsNegative)
ThreadBlockSize ComputeThreadBlockSize(TlsLayout static_tls_layout) {
// This layout is used only on x86 (both EM_386 and EM_X86_64). To be
// pedantic, the ABI requirement that kTpSelfPointer indicates is orthogonal;
// but it's related, and also unique to x86. kTlsLocalExecOffset is also
// zero on some machines other than x86, but it's especially helpful to
// ignore a possible nonzero value in explaining the kTlsNegative layout.
static_assert(TlsTraits::kTpSelfPointer);
static_assert(TlsTraits::kTlsLocalExecOffset == 0);
// *----------------------------------------------------------------------*
// | unused | TLSn | ... | TLS1 | $tp . (DTV) . FC . TCB | (align) |
// *---------------^------^-----^------^-----^-------^----^-----*---------*
// | Sn S2 S1 $tp=T +8 +16 +32 |
// *----------------------------------------------------------------------*
// Note: T == $tp
//
// TLS offsets are negative, but the layout size is computed ascending from
// zero with PT_TLS p_memsz and p_align requirements; then each segment's
// offset is just negated at the end. So the layout size is how many bytes
// below $tp will be used. The start address of each TLSn (Sn) must meet
// TLSn's p_align requirement (capped at one OS page, as in PT_LOAD), but
// TLS1 is always assigned first (closest to $tp). The whole block must be
// aligned to the maximum of the alignment requirements of each TLSn and the
// TCB. Then the offsets will be "aligned up" before being negated, such
// that each address Sn is correctly aligned (the first TLSn block in memory,
// for the largest n, will have the same alignment as $tp--the maximum of any
// block). Once the offsets have been assigned in this way, each TLSn block
// starts at $tp + (negative) offset. That may be followed by unused padding
// space as required to make S(n+1) be 0 mod the TLS(n+1) p_align. When TLS1
// is the LE segment from the executable, that padding will be included in
// its (negative offset) such that $tp itself will have at least the same
// alignment as TLS1. Layout mechanics require that $tp be thus aligned to
// the maximum needed by any TLSn or the TCB. If this is larger than the
// TCB's own size, then there can be some unused space wasted at the very end
// of the allocation: `alignof($tp) - sizeof(Thread)` bytes.
//
// The Fuchsia Compiler ABI <zircon/tls.h> slots are early in the TCB,
// appearing in Thread just after the $tp slot and the second slot
// traditionally used for the DTV (but formally just reserved for private
// implementation use).
//
// Since the allocation will be in whole pages, there will often be some
// unused space beyond any (usual small) amount wasted for alignment. That
// (usually larger) unused portion is placed at the beginning of the first
// page, such that the end of the TCB is at (or close to) the end of the
// whole allocation and the space "off the end" of TLSn (in the negative
// direction from $tp) is available. Reusing that space opportunistically
// for PT_TLS segments of modules loaded later (e.g. dlopen) is fairly
// straightforward, though notably a new PT_TLS segment with p_align greater
// than static_tls_layout.alignment() can never be placed there (even if it
// fits), as each thread's separate allocation can only be presumed to be
// aligned to the original layout's requirement.
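//
// Hypothetical worked example (illustrative numbers only, assuming 4 KiB
// pages): a 0x50-byte static TLS layout aligned to 16 and a 0xb0-byte
// Thread aligned to 16 give tls_size = 0x50 and aligned_thread_size = 0xb0;
// the 0x100-byte total rounds up to a single 0x1000-byte page, and $tp sits
// at tp_offset = 0x1000 - 0xb0 = 0xf50, with the page's unused slack at the
// start of the block as described above.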
const size_t tls_size = static_tls_layout.Align( //
static_tls_layout.size_bytes(), alignof(Thread));
const size_t aligned_thread_size = static_tls_layout.Align(sizeof(Thread), alignof(Thread));
const PageRoundedSize allocation_size{tls_size + aligned_thread_size};
return {
.size = allocation_size,
.tp_offset = allocation_size.get() - aligned_thread_size,
};
}
template <class TlsTraits = elfldltl::TlsTraits<>>
requires(!TlsTraits::kTlsNegative)
ThreadBlockSize ComputeThreadBlockSize(TlsLayout static_tls_layout) {
// This style of layout is used on all machines other than x86.
// The kTlsLocalExecOffset value differs by machine.
//
// *----------------------------------------------------------------------*
// | (align) | TCB, FC | [psABI] | TLS1 | ... | TLSn | unused |
// *----------^---------^---------^------^-----^------^-------------------*
// | T $tp S1 S2 Sn (Sn+p_memsz) |
// *----------------------------------------------------------------------*
// Note: T == $tp + kTlsLocalExecOffset - sizeof(Thread)
//
// TLS offsets are positive, so the TCB (Thread) will sit just below $tp.
// Any ABI-specified reserved area (psABI) is the tail of the layout of
// Thread. This means that Thread straddles $tp, which points to the ABI
// reserved area that forms the last kTlsLocalExecOffset bytes of the
// Thread object. When kTlsLocalExecOffset is zero, $tp points exactly
// just past Thread, which is also exactly the S1 address where TLS1 starts
// (the LE segment assigned per ABI at static link time, if there is one).
//
// **Note:** It is always $tp _itself_ that must be aligned to the maximum
// TLSn p_align! When kTlsLocalExecOffset is nonzero, the static linker
// starts its LE offset assignments there and then rounds up if the p_align
// in the LE (executable) TLS1 is larger than that ABI-specified offset.
//
// kTpSelfPointer is not required for any non-x86 machine yet, but if it
// were then the ABI's kTlsLocalExecOffset value would account for it.
//
// Note also that the 16 bytes just _below_ $tp are reserved by the Fuchsia
// Compiler ABI for the two <zircon/tls.h> slots.
//
// If the TLS layout's required alignment is less than alignof(Thread), the
// allocation will be aligned for Thread. Otherwise, the allocation will
// be aligned for the TLS layout and may include unused padding bytes at
// the start so the TCB sits at exactly `$tp - sizeof(Thread)` while still
// meeting the TLS alignment requirement for $tp.
//
// Since the allocation will be in whole pages, there will often be some
// unused space still available off the end of the last TLSn. This layout
// makes it easy to just iteratively do another static_tls_layout.Assign to
// see if a correctly-placed new block fits in the space left over. There
// may also be space at the beginning of the block if the TLS alignment is
// greater than alignof(Thread), which will not be recovered for reuse.
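//
// Hypothetical worked example (illustrative numbers only, assuming 4 KiB
// pages and kTlsLocalExecOffset == 16, as on AArch64): with sizeof(Thread)
// == 0xb0 and a 0x60-byte static TLS layout aligned to 16 (a size that
// already counts the 16 reserved bytes), kThreadToTp = 0xa0,
// aligned_thread_size = 0xa0, and tls_layout_size() = 0x50; the 0xf0-byte
// total rounds up to a single page and $tp sits tp_offset = 0xa0 bytes into
// the block.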
constexpr size_t kThreadToTp = // Size from Thread* (T) to $tp.
sizeof(Thread) - TlsTraits::kTlsLocalExecOffset;
const size_t aligned_thread_size = static_tls_layout.Align( //
kThreadToTp, alignof(Thread));
auto tls_layout_size = [static_tls_layout]() -> size_t {
if (static_tls_layout.size_bytes() == 0) {
return 0;
}
// The TLS layout size includes the reserved area, already part of Thread.
assert(static_tls_layout.size_bytes() > TlsTraits::kTlsLocalExecOffset);
return static_tls_layout.size_bytes() - TlsTraits::kTlsLocalExecOffset;
};
return {
.size{aligned_thread_size + tls_layout_size()},
.tp_offset = aligned_thread_size,
};
}
} // namespace
void ThreadStorage::FreeStacks() {
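// Each stack was mapped as one region covering the stack and its guard, and
// the stored base is that region's base, so a single unmap removes both
// together.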
auto unmap = [this, block_size = stack_size_ + guard_size_](uintptr_t base) {
if (base != 0) {
assert(thread_block_.vmar());
zx::result result = zx::make_result(thread_block_.vmar().unmap(base, block_size.get()));
ZX_ASSERT_MSG(result.is_ok(), "zx_vmar_unmap: %s", result.status_string());
}
};
unmap(machine_stack_);
unmap(unsafe_stack_);
OnShadowCallStack(shadow_call_stack_, unmap);
}
// Translate ownership from the legacy C struct representation.
ThreadStorage ThreadStorage::FromThread(Thread& thread, zx::unowned_vmar vmar) {
using Sizes = std::array<size_t, 2>; // Stack size, guard size.
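// infer_sizes recovers those two sizes from a legacy (stack, region) iovec
// pair. For example (hypothetical sizes), a downward-growing stack with a
// one-page guard has the guard at the bottom of the region, so
// stack.iov_base == region.iov_base + guard size; a stack that grows up
// keeps stack.iov_base == region.iov_base with its guard above.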
constexpr auto infer_sizes = [](iovec stack, iovec region, bool grows_up = false) -> Sizes {
assert(PageRoundedSize{stack.iov_len}.get() == stack.iov_len);
assert(PageRoundedSize{region.iov_len}.get() == region.iov_len);
assert(stack.iov_len <= region.iov_len);
assert(stack.iov_base >= region.iov_base);
if (stack.iov_base == region.iov_base) {
assert(grows_up || stack.iov_len == region.iov_len);
} else {
assert(!grows_up);
assert(reinterpret_cast<uintptr_t>(stack.iov_base) -
reinterpret_cast<uintptr_t>(region.iov_base) ==
region.iov_len - stack.iov_len);
}
return {stack.iov_len, region.iov_len - stack.iov_len};
};
constexpr auto take_stack = [](iovec& stack, iovec& region) -> uintptr_t {
stack = {};
return reinterpret_cast<uintptr_t>(std::exchange(region, {}).iov_base);
};
Sizes stack_sizes = infer_sizes(thread.safe_stack, thread.safe_stack_region);
assert(infer_sizes(thread.unsafe_stack, thread.unsafe_stack_region) == stack_sizes);
#if HAVE_SHADOW_CALL_STACK
assert(infer_sizes(thread.shadow_call_stack, thread.shadow_call_stack_region) == stack_sizes);
#endif
assert(*vmar);
ThreadStorage result;
result.thread_block_ = {std::exchange(thread.tcb_region, {}), vmar->borrow()};
std::tie(result.stack_size_.rounded_size_, result.guard_size_.rounded_size_) = stack_sizes;
result.machine_stack_ = take_stack(thread.safe_stack, thread.safe_stack_region);
result.unsafe_stack_ = take_stack(thread.unsafe_stack, thread.unsafe_stack_region);
#if HAVE_SHADOW_CALL_STACK
result.shadow_call_stack_ = take_stack(thread.shadow_call_stack, thread.shadow_call_stack_region);
#endif
return result;
}
void ThreadStorage::ToThread(Thread& thread) && {
auto take_stack = [this](iovec& stack, iovec& region, uintptr_t& base, bool grows_up = false) {
assert(!stack.iov_base);
assert(stack.iov_len == 0);
assert(!region.iov_base);
assert(region.iov_len == 0);
region = {
.iov_base = reinterpret_cast<void*>(base),
.iov_len = (stack_size_ + guard_size_).get(),
};
stack = {
.iov_base = reinterpret_cast<void*>(base + (grows_up ? 0 : guard_size_.get())),
.iov_len = stack_size_.get(),
};
base = 0;
};
thread.tcb_region = std::move(thread_block_).TakeIovec();
take_stack(thread.safe_stack, thread.safe_stack_region, machine_stack_);
take_stack(thread.unsafe_stack, thread.unsafe_stack_region, unsafe_stack_);
#if HAVE_SHADOW_CALL_STACK
take_stack(thread.shadow_call_stack, thread.shadow_call_stack_region, shadow_call_stack_, true);
#endif
}
zx::result<Thread*> ThreadStorage::Allocate(zx::unowned_vmar allocate_from,
std::string_view thread_name, PageRoundedSize stack,
PageRoundedSize guard) {
ZX_DEBUG_ASSERT(*allocate_from);
ZX_DEBUG_ASSERT(stack);
if (thread_name.empty()) {
thread_name = kVmoName;
}
// The thread block size is a complex calculation, while the others depend
// only on the stack and guard sizes.
auto [thread_block_size, tp_offset] = ComputeThreadBlockSize(GetTlsLayout());
stack_size_ = stack;
guard_size_ = guard;
std::span<std::byte> thread_block;
// The VMO space and mapping are handled the same way for each block.
auto allocate_blocks = [&](Block<&ThreadStorage::thread_block_> tcb,
BlockType auto... stacks) -> zx::result<> {
// Allocate a single VMO for all the blocks.
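// Note that guard pages consume only VMAR address space, not VMO pages:
// each block's VmoSize() covers just its stack or TLS+TCB data, so the VMO
// is smaller than the total mapped range by the combined guard sizes.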
const PageRoundedSize vmo_size =
tcb.VmoSize(*this, thread_block_size) + (stacks.VmoSize(*this, thread_block_size) + ...);
zx::result vmo = AllocationVmo::New(vmo_size);
if (vmo.is_error()) [[unlikely]] {
return vmo.take_error();
}
auto map_one_block = [&]<BlockType B>(B& block) -> zx::result<> {
zx::result result = block.Map(*this, thread_block_size, allocate_from->borrow(), *vmo);
if (result.is_error()) [[unlikely]] {
return result.take_error();
}
if constexpr (std::is_same_v<B, Block<&ThreadStorage::thread_block_>>) {
thread_block = result.value();
}
return fit::ok();
};
// Map in each block's portion of that VMO. After this, the mappings
// (including guards) cannot be modified, only unmapped whole from above.
auto map_blocks = [&](BlockType auto&&... blocks) {
zx::result<> result = zx::ok();
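// The fold short-circuits on the first error, leaving `result` holding
// that block's error (or zx::ok() if every Map succeeds).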
((result = map_one_block(blocks)).is_ok() && ...);
return result;
};
// Allocate the largest blocks first to minimize fragmentation in the VMAR.
// The stacks all have the same size.
if (zx::result<> result = tcb.VmoSize(*this, thread_block_size) >= stack
? map_blocks(tcb, stacks...)
: map_blocks(stacks..., tcb);
result.is_error()) [[unlikely]] {
return result;
}
// Now that everything is mapped in, the ownership can move into this
// ThreadStorage object. Everything will be cleaned up on destruction.
tcb.Commit(*this);
(stacks.Commit(*this), ...);
// Name the VMO to match the thread.
return zx::make_result(
vmo->vmo.set_property(ZX_PROP_NAME, thread_name.data(), thread_name.size()));
};
// Allocate all the blocks together in a single VMO and map each separately.
if (zx::result<> result = allocate_blocks( //
Block<&ThreadStorage::thread_block_>{}, //
Block<&ThreadStorage::machine_stack_>{}, //
Block<&ThreadStorage::unsafe_stack_>{}, //
Block<&ThreadStorage::shadow_call_stack_>{});
result.is_error()) [[unlikely]] {
return result.take_error();
}
// Initialize the static TLS data from PT_TLS segments.
InitializeTls(thread_block, tp_offset);
// The location of the Thread object inside the thread block is part of the
// complex sizing calculation, while all the stack pointers are just at one
// end of their block or the other.
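// (With kTlsNegative the Thread object sits exactly at $tp; elsewhere it
// ends kTlsLocalExecOffset bytes past $tp, per the layout notes above.)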
Thread* thread = tp_to_pthread(thread_block.data() + tp_offset);
if constexpr (elfldltl::TlsTraits<>::kTpSelfPointer) {
void* tp = pthread_to_tp(thread);
assert(&thread->head.tp == tp);
thread->head.tp = reinterpret_cast<uintptr_t>(tp);
}
// The unsafe stack pointer is always stored in the Thread rather than in a
// machine register, so it can be initialized right here.
thread->abi.unsafe_sp = reinterpret_cast<uintptr_t>(unsafe_sp());
return zx::ok(thread);
}
} // namespace LIBC_NAMESPACE_DECL