// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "thread-storage.h"
#include <lib/elfldltl/machine.h>
#include <zircon/assert.h>
#include <cassert>
#include <concepts>
#include <utility>
#include "shadow-call-stack.h"
#include "threads_impl.h"
namespace LIBC_NAMESPACE_DECL {
namespace {
using TlsLayout = elfldltl::TlsLayout<>;
// If the thread's ZX_PROP_NAME is empty, the VMO will use this name instead.
constexpr std::string_view kVmoName = "thread-stacks+TLS";
// Each of the blocks owned by ThreadStorage is represented by a Block
// specialization for some &ThreadStorage::member_. Each class below meets
// this same API contract.
template <typename T>
concept BlockType = requires(T& t, ThreadStorage& storage, PageRoundedSize tls_size,
zx::unowned_vmar vmar, AllocationVmo& vmo) {
// Returns the size needed in the AllocationVmo.
{ std::as_const(t).VmoSize(std::as_const(storage), tls_size) } -> std::same_as<PageRoundedSize>;
// Maps the block into the VMAR from the AllocationVmo.
{ t.Map(std::as_const(storage), tls_size, vmar->borrow(), vmo).is_ok() } -> std::same_as<bool>;
// Commits the block to the successfully-allocated ThreadStorage object.
{ t.Commit(storage) };
};
template <BlockType T>
using BlockTypeCheck = T;
template <auto Member>
class Block;
template <auto Member>
using BlockFor = BlockTypeCheck<Block<Member>>;
// This handles each of the stack blocks.
template <uintptr_t ThreadStorage::* Member>
class Block<Member> {
public:
PageRoundedSize VmoSize(const ThreadStorage& storage, PageRoundedSize tls_size) const {
return storage.stack_size();
}
auto Map(const ThreadStorage& storage, PageRoundedSize tls_size, zx::unowned_vmar vmar,
AllocationVmo& vmo) {
PageRoundedSize guard_below, guard_above;
if constexpr (ThreadStorage::StackGrowsUp(Member)) {
guard_above = storage.guard_size();
} else {
guard_below = storage.guard_size();
}
return block_.Allocate<uint64_t>(vmar->borrow(), vmo, storage.stack_size(), guard_below,
guard_above);
}
void Commit(ThreadStorage& storage) { storage.*Member = block_.release(); }
private:
GuardedPageBlock block_;
};
// This handles shadow_call_stack_ when it's a no-op.
template <NoShadowCallStack ThreadStorage::* Member>
class Block<Member> {
public:
PageRoundedSize VmoSize(const ThreadStorage& storage, PageRoundedSize tls_size) const {
return {};
}
zx::result<std::span<uint64_t>> Map(const ThreadStorage& storage, PageRoundedSize tls_size,
zx::unowned_vmar vmar, AllocationVmo& vmo) {
return zx::ok(std::span<uint64_t>{});
}
void Commit(ThreadStorage& storage) {}
};
// This handles thread_block_, which includes both the TCB and the
// (runtime-dynamic) static TLS segments. It always gets one-page guards both
// above and below, regardless of the configured guard size for the stacks.
template <GuardedPageBlock ThreadStorage::* Member>
class Block<Member> {
public:
PageRoundedSize VmoSize(const ThreadStorage& storage, PageRoundedSize tls_size) const {
return tls_size;
}
auto Map(const ThreadStorage& storage, PageRoundedSize tls_size, zx::unowned_vmar vmar,
AllocationVmo& vmo) {
const PageRoundedSize page_size = PageRoundedSize::Page();
return block_.Allocate(vmar->borrow(), vmo, tls_size, page_size, page_size);
}
void Commit(ThreadStorage& storage) { storage.*Member = std::move(block_); }
private:
GuardedPageBlock block_;
};
// This is the result of computations for allocating the thread block. The
// PT_TLS p_align fields affect the computations within, but regardless the
// whole block is always page-aligned (so any p_align larger than a page is
// effectively treated as only a page). This is allocated via GuardedPageBlock
// with guards both above and below to minimize chances of overruns out of TLS
// into something else or out of something else into TLS or the TCB. (In the
// x86 kTlsNegative layout, overruns from TLS into the TCB are unguarded,
// while elsewhere only underruns from TLS back into the TCB are unguarded.)
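// Allocate() maps a block of exactly the resulting .size bytes and then
// places $tp at .tp_offset bytes from the start of that block.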
struct ThreadBlockSize {
PageRoundedSize size; // Total size to allocate for TLS + TCB.
size_t tp_offset; // Point $tp this far into the allocated block.
};
// These abbreviations are used in the layout descriptions below:
// * $tp: the thread pointer, as __builtin_thread_pointer() returns
// * TLSn: the PT_TLS segment for static TLS module with ID n (n > 0)
// - When the main executable has a PT_TLS of its own, then it has ID 1
// and its $tp offset is fixed by an ABI calculation at static link time.
// * Sn: the runtime address in a given thread where the TLSn block starts
// - This must lie at an address 0 mod p_align, occupying p_memsz bytes.
// - (Sn - $tp) is the value that appears in GOT slots for IE accesses.
// - (S1 - $tp) for LE accesses is fixed by the ABI given TLS1.p_align.
// * DTV: the Dynamic Thread Vector of traditional dynamic TLS implementations
// - Not part of any public ABI contract, but sometimes described as if so.
// * TCB: the Thread object
// - This is a private implementation detail of libc (mostly).
// - The <zircon/tls.h> Fuchsia Compiler ABI slots lie within this object,
// though they are expressed as byte offsets from $tp.
// - When kTlsLocalExecOffset is nonzero (ARM), it describes the final bytes
// of the Thread object (psABI), so the Thread object in memory will
// straddle $tp. The ABI specifies that this much space above $tp is
// reserved for the implementation and says nothing about what goes there.
// - When kTpSelfPointer is nonzero (x86), the void* directly at $tp must be
// set to the $tp address itself. Only x86 has this and also only x86 has
// kTlsNegative, so this first word above $tp is not part of the layout
// calculations imposed by the ABI for TLS; instead, it's just a runtime
// requirement and forms the first part of the Thread object.
// - In traditional implementations, the second word above $tp (either in
// the private TCB or in the ABI-specified reserved area) holds the DTV.
// This has never been part of any ABI contract, except one private to
// some particular dynamic linker, its __tls_get_addr implementation, and
// its TLSDESC implementation hooks.
// * T: the runtime address of the Thread object
// * FC: the <zircon/tls.h> Fuchsia Compiler ABI slots
// - <zircon/tls.h> defines two per-machine offsets from $tp for ABI use
// - These are used by --target=*-fuchsia compilers by default, but are not
// part of the basic machine ABI or the Fuchsia System ABI. They are not
// used by the startup dynamic linker or vDSO, nor provided by the system
// program loader. They are only provided here in libc.
// - This implementation includes these in the TCB.
// * psABI: ELF processor-specific ABI-mandated kTlsLocalExecOffset space
// - The psABI for ARM and AArch64 specifies this, but not its use.
// - Traditional implementations have used it just like the first two words
// past $tp on x86: a $tp self-pointer (though nothing uses that); and the
// DTV (a private implementation detail).
// TCB is used to refer to the whole Thread object and also sometimes to
// distinguish FC and psABI from the rest of the Thread object. A future
// implementation might have clearer distinctions in the data structures.
template <class TlsTraits = elfldltl::TlsTraits<>>
requires(TlsTraits::kTlsNegative)
ThreadBlockSize ComputeThreadBlockSize(TlsLayout static_tls_layout) {
// This layout is used only on x86 (both EM_386 and EM_X86_64). To be
// pedantic, the ABI requirement that kTpSelfPointer indicates is orthogonal;
// but it's related, and also unique to x86. kTlsLocalExecOffset is also
// zero on some machines other than x86, but it's especially helpful to
// ignore a possible nonzero value in explaining the kTlsNegative layout.
static_assert(TlsTraits::kTpSelfPointer);
static_assert(TlsTraits::kTlsLocalExecOffset == 0);
// *----------------------------------------------------------------------*
// | unused | TLSn | ... | TLS1 | $tp . (DTV) . FC . TCB | (align) |
// *---------------^------^-----^------^-----^-------^----^-----*---------*
// | Sn S2 S1 $tp=T +8 +16 +32 |
// *----------------------------------------------------------------------*
// Note: T == $tp
//
// TLS offsets are negative, but the layout size is computed ascending from
// zero with PT_TLS p_memsz and p_align requirements; then each segment's
// offset is just negated at the end. So the layout size is how many bytes
// below $tp will be used. The start address of each TLSn (Sn) must meet
// TLSn's p_align requirement (capped at one OS page, as in PT_LOAD), but
// TLS1 is always assigned first (closest to $tp). The whole block must be
// aligned to the maximum of the alignment requirements of each TLSn and the
// TCB. Then the offsets will be "aligned up" before being negated, such
// that each address Sn is correctly aligned (the first TLSn block in memory,
// for the largest n, will have the same alignment as $tp--the maximum of any
// block). Once the offsets have been assigned in this way, each TLSn block
// starts at $tp + (negative) offset. That may be followed by unused padding
// space as required to make S(n+1) be 0 mod the TLS(n+1) p_align. When TLS1
// is the LE segment from the executable, that padding will be included in
// its (negative offset) such that $tp itself will have at least the same
// alignment as TLS1. Layout mechanics require that $tp be thus aligned to
// the maximum needed by any TLSn or the TCB. If this is larger than the
// TCB's own size, then there can be some unused space wasted at the very end
// of the allocation: `alignof($tp) - sizeof(Thread)` bytes.
//
// The Fuchsia Compiler ABI <zircon/tls.h> slots are early in the TCB,
// appearing in Thread just after the $tp slot and the second slot
// traditionally used for the DTV (but formally just reserved for private
// implementation use).
//
// Since the allocation will be in whole pages, there will often be some
// unused space beyond any (usual small) amount wasted for alignment. That
// (usually larger) unused portion is placed at the beginning of the first
// page, such that the end of the TCB is at (or close to) the end of the
// whole allocation and the space "off the end" of TLSn (in the negative
// direction from $tp) is available. Reusing that space opportunistically
// for PT_TLS segments of modules loaded later (e.g. dlopen) is fairly
// straightforward, though notably a new PT_TLS segment with p_align greater
// than static_tls_layout.alignment() can never be placed there (even if it
// fits), as each thread's separate allocation can only be presumed to be
// aligned to the original layout's requirement.
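//
// Hypothetical worked example (illustrative numbers only, assuming 4 KiB
// pages): a 0x50-byte static TLS layout aligned to 16 and a 0xb0-byte
// Thread aligned to 16 give tls_size = 0x50 and aligned_thread_size = 0xb0;
// the 0x100-byte total rounds up to a single 0x1000-byte page, and $tp sits
// at tp_offset = 0x1000 - 0xb0 = 0xf50, with the page's unused slack at the
// start of the block as described above.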
const size_t tls_size = static_tls_layout.Align( //
static_tls_layout.size_bytes(), alignof(Thread));
const size_t aligned_thread_size = static_tls_layout.Align(sizeof(Thread), alignof(Thread));
const PageRoundedSize allocation_size{tls_size + aligned_thread_size};
return {
.size = allocation_size,
.tp_offset = allocation_size.get() - aligned_thread_size,
};
}
template <class TlsTraits = elfldltl::TlsTraits<>>
requires(!TlsTraits::kTlsNegative)
ThreadBlockSize ComputeThreadBlockSize(TlsLayout static_tls_layout) {
// This style of layout is used on all machines other than x86.
// The kTlsLocalExecOffset value differs by machine.
//
// *----------------------------------------------------------------------*
// | (align) | TCB, FC | [psABI] | TLS1 | ... | TLSn | unused |
// *----------^---------^---------^------^-----^------^-------------------*
// | T $tp S1 S2 Sn (Sn+p_memsz) |
// *----------------------------------------------------------------------*
// Note: T == $tp + kTlsLocalExecOffset - sizeof(Thread)
//
// TLS offsets are positive, so the TCB (Thread) will sit just below $tp.
// Any ABI-specified reserved area (psABI) is the tail of the layout of
// Thread. This means that Thread straddles $tp, which points to the ABI
// reserved area that forms the last kTlsLocalExecOffset bytes of the
// Thread object. When kTlsLocalExecOffset is zero, $tp points exactly
// just past Thread, which is also exactly the S1 address where TLS1 starts
// (the LE segment assigned per ABI at static link time, if there is one).
//
// **Note:** It is always $tp _itself_ that must be aligned to the maximum
// TLSn p_align! When kTlsLocalExecOffset is nonzero, the static linker
// starts its LE offset assignments there and then rounds up if the p_align
// in the LE (executable) TLS1 is larger than that ABI-specified offset.
//
// kTpSelfPointer is not required for any non-x86 machine yet, but if it
// were then the ABI's kTlsLocalExecOffset value would account for it.
//
// Note also that the 16 bytes just _below_ $tp are reserved by the Fuchsia
// Compiler ABI for the two <zircon/tls.h> slots.
//
// If the TLS layout's required alignment is less than alignof(Thread), the
// allocation will be aligned for Thread. Otherwise, the allocation will
// be aligned for the TLS layout and may include unused padding bytes at
// the start so the TCB sits at exactly `$tp - sizeof(Thread)` while still
// meeting the TLS alignment requirement for $tp.
//
// Since the allocation will be in whole pages, there will often be some
// unused space still available off the end of the last TLSn. This layout
// makes it easy to just iteratively do another static_tls_layout.Assign to
// see if a correctly-placed new block fits in the space left over. There
// may also be space at the beginning of the block if the TLS alignment is
// greater than alignof(Thread), which will not be recovered for reuse.
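//
// Hypothetical worked example (illustrative numbers only, assuming 4 KiB
// pages and kTlsLocalExecOffset == 16, as on AArch64): with sizeof(Thread)
// == 0xb0 and a 0x60-byte static TLS layout aligned to 16 (a size that
// already counts the 16 reserved bytes), kThreadToTp = 0xa0,
// aligned_thread_size = 0xa0, and tls_layout_size() = 0x50; the 0xf0-byte
// total rounds up to a single page and $tp sits tp_offset = 0xa0 bytes into
// the block.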
constexpr size_t kThreadToTp = // Size from Thread* (T) to $tp.
sizeof(Thread) - TlsTraits::kTlsLocalExecOffset;
const size_t aligned_thread_size = static_tls_layout.Align( //
kThreadToTp, alignof(Thread));
auto tls_layout_size = [static_tls_layout]() -> size_t {
if (static_tls_layout.size_bytes() == 0) {
return 0;
}
// The TLS layout size includes the reserved area, already part of Thread.
assert(static_tls_layout.size_bytes() > TlsTraits::kTlsLocalExecOffset);
return static_tls_layout.size_bytes() - TlsTraits::kTlsLocalExecOffset;
};
return {
.size{aligned_thread_size + tls_layout_size()},
.tp_offset = aligned_thread_size,
};
}
} // namespace
void ThreadStorage::FreeStacks() {
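// Each stack was mapped as one region covering the stack and its guard, and
// the stored base is that region's base, so a single unmap removes both
// together.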
auto unmap = [this, block_size = stack_size_ + guard_size_](uintptr_t base) {
if (base != 0) {
assert(thread_block_.vmar());
zx::result result = zx::make_result(thread_block_.vmar().unmap(base, block_size.get()));
ZX_ASSERT_MSG(result.is_ok(), "zx_vmar_unmap: %s", result.status_string());
}
};
unmap(machine_stack_);
unmap(unsafe_stack_);
OnShadowCallStack(shadow_call_stack_, unmap);
}
// Translate ownership from the legacy C struct representation.
ThreadStorage ThreadStorage::FromThread(Thread& thread, zx::unowned_vmar vmar) {
using Sizes = std::array<size_t, 2>; // Stack size, guard size.
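// infer_sizes recovers those two sizes from a legacy (stack, region) iovec
// pair. For example (hypothetical sizes), a downward-growing stack with a
// one-page guard has the guard at the bottom of the region, so
// stack.iov_base == region.iov_base + guard size; a stack that grows up
// keeps stack.iov_base == region.iov_base with its guard above.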
constexpr auto infer_sizes = [](iovec stack, iovec region, bool grows_up = false) -> Sizes {
assert(PageRoundedSize{stack.iov_len}.get() == stack.iov_len);
assert(PageRoundedSize{region.iov_len}.get() == region.iov_len);
assert(stack.iov_len <= region.iov_len);
assert(stack.iov_base >= region.iov_base);
if (stack.iov_base == region.iov_base) {
assert(grows_up || stack.iov_len == region.iov_len);
} else {
assert(!grows_up);
assert(reinterpret_cast<uintptr_t>(stack.iov_base) -
reinterpret_cast<uintptr_t>(region.iov_base) ==
region.iov_len - stack.iov_len);
}
return {stack.iov_len, region.iov_len - stack.iov_len};
};
constexpr auto take_stack = [](iovec& stack, iovec& region) -> uintptr_t {
stack = {};
return reinterpret_cast<uintptr_t>(std::exchange(region, {}).iov_base);
};
Sizes stack_sizes = infer_sizes(thread.safe_stack, thread.safe_stack_region);
assert(infer_sizes(thread.unsafe_stack, thread.unsafe_stack_region) == stack_sizes);
#if HAVE_SHADOW_CALL_STACK
assert(infer_sizes(thread.shadow_call_stack, thread.shadow_call_stack_region) == stack_sizes);
#endif
assert(*vmar);
ThreadStorage result;
result.thread_block_ = {std::exchange(thread.tcb_region, {}), vmar->borrow()};
std::tie(result.stack_size_.rounded_size_, result.guard_size_.rounded_size_) = stack_sizes;
result.machine_stack_ = take_stack(thread.safe_stack, thread.safe_stack_region);
result.unsafe_stack_ = take_stack(thread.unsafe_stack, thread.unsafe_stack_region);
#if HAVE_SHADOW_CALL_STACK
result.shadow_call_stack_ = take_stack(thread.shadow_call_stack, thread.shadow_call_stack_region);
#endif
return result;
}
void ThreadStorage::ToThread(Thread& thread) && {
auto take_stack = [this](iovec& stack, iovec& region, uintptr_t& base, bool grows_up = false) {
assert(!stack.iov_base);
assert(stack.iov_len == 0);
assert(!region.iov_base);
assert(region.iov_len == 0);
region = {
.iov_base = reinterpret_cast<void*>(base),
.iov_len = (stack_size_ + guard_size_).get(),
};
stack = {
.iov_base = reinterpret_cast<void*>(base + (grows_up ? 0 : guard_size_.get())),
.iov_len = stack_size_.get(),
};
base = 0;
};
thread.tcb_region = std::move(thread_block_).TakeIovec();
take_stack(thread.safe_stack, thread.safe_stack_region, machine_stack_);
take_stack(thread.unsafe_stack, thread.unsafe_stack_region, unsafe_stack_);
#if HAVE_SHADOW_CALL_STACK
take_stack(thread.shadow_call_stack, thread.shadow_call_stack_region, shadow_call_stack_, true);
#endif
}
zx::result<Thread*> ThreadStorage::Allocate(zx::unowned_vmar allocate_from,
std::string_view thread_name, PageRoundedSize stack,
PageRoundedSize guard) {
ZX_DEBUG_ASSERT(*allocate_from);
ZX_DEBUG_ASSERT(stack);
if (thread_name.empty()) {
thread_name = kVmoName;
}
// The thread block size is a complex calculation, while the others depend
// only on the stack and guard sizes.
auto [thread_block_size, tp_offset] = ComputeThreadBlockSize(GetTlsLayout());
stack_size_ = stack;
guard_size_ = guard;
std::span<std::byte> thread_block;
// The VMO space and mapping are handled the same way for each block.
auto allocate_blocks = [&](Block<&ThreadStorage::thread_block_> tcb,
BlockType auto... stacks) -> zx::result<> {
// Allocate a single VMO for all the blocks.
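// Note that guard pages consume only VMAR address space, not VMO pages:
// each block's VmoSize() covers just its stack or TLS+TCB data, so the VMO
// is smaller than the total mapped range by the combined guard sizes.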
const PageRoundedSize vmo_size =
tcb.VmoSize(*this, thread_block_size) + (stacks.VmoSize(*this, thread_block_size) + ...);
zx::result vmo = AllocationVmo::New(vmo_size);
if (vmo.is_error()) [[unlikely]] {
return vmo.take_error();
}
auto map_one_block = [&]<BlockType B>(B& block) -> zx::result<> {
zx::result result = block.Map(*this, thread_block_size, allocate_from->borrow(), *vmo);
if (result.is_error()) [[unlikely]] {
return result.take_error();
}
if constexpr (std::is_same_v<B, Block<&ThreadStorage::thread_block_>>) {
thread_block = result.value();
}
return fit::ok();
};
// Map in each block's portion of that VMO. After this, the mappings
// (including guards) cannot be modified, only unmapped whole from above.
auto map_blocks = [&](BlockType auto&&... blocks) {
zx::result<> result = zx::ok();
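// The fold short-circuits on the first error, leaving `result` holding
// that block's error (or zx::ok() if every Map succeeds).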
((result = map_one_block(blocks)).is_ok() && ...);
return result;
};
// Allocate the largest blocks first to minimize fragmentation in the VMAR.
// The stacks all have the same size.
if (zx::result<> result = tcb.VmoSize(*this, thread_block_size) >= stack
? map_blocks(tcb, stacks...)
: map_blocks(stacks..., tcb);
result.is_error()) [[unlikely]] {
return result;
}
// Now that everything is mapped in, the ownership can move into this
// ThreadStorage object. Everything will be cleaned up on destruction.
tcb.Commit(*this);
(stacks.Commit(*this), ...);
// Name the VMO to match the thread.
return zx::make_result(
vmo->vmo.set_property(ZX_PROP_NAME, thread_name.data(), thread_name.size()));
};
// Allocate all the blocks together in a single VMO and map each separately.
if (zx::result<> result = allocate_blocks( //
Block<&ThreadStorage::thread_block_>{}, //
Block<&ThreadStorage::machine_stack_>{}, //
Block<&ThreadStorage::unsafe_stack_>{}, //
Block<&ThreadStorage::shadow_call_stack_>{});
result.is_error()) [[unlikely]] {
return result.take_error();
}
// Initialize the static TLS data from PT_TLS segments.
InitializeTls(thread_block, tp_offset);
// The location of the Thread object inside the thread block is part of the
// complex sizing calculation, while all the stack pointers are just at one
// end of their block or the other.
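// (With kTlsNegative the Thread object sits exactly at $tp; elsewhere it
// ends kTlsLocalExecOffset bytes past $tp, per the layout notes above.)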
Thread* thread = tp_to_pthread(thread_block.data() + tp_offset);
if constexpr (elfldltl::TlsTraits<>::kTpSelfPointer) {
void* tp = pthread_to_tp(thread);
assert(&thread->head.tp == tp);
thread->head.tp = reinterpret_cast<uintptr_t>(tp);
}
// The unsafe stack pointer is always stored in the Thread rather than in a
// machine register, so it can be initialized right here.
thread->abi.unsafe_sp = reinterpret_cast<uintptr_t>(unsafe_sp());
return zx::ok(thread);
}
} // namespace LIBC_NAMESPACE_DECL