blob: 8fd4c917adb7c27cb3cbad8dde45cf21693dfee1 [file] [log] [blame]
// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <lib/arch/asm.h>
#include <lib/ld/tlsdesc.h>
// See tlsdesc-runtime-dynamic.h comments for the background.
// This file implements the TLSDESC entry points described there.
//
// Each machine has a bespoke calling convention for TLSDESC entry points
// different from the standard C function calling convention on that same
// machine. But they're all about the same:
// * The return value register is used as the argument register. (On most
// machines, they're the same register in the normal convention anyway.)
// * The argument register gets a pointer to two (read-only) GOT slots
// (address-width on most machines, but not on x86-64 ILP32), the second
// of which encodes the true arguments for the function.
// * The SP follows normal ABI conventions and can be pushed onto.
// * The FP is not expected to be touched by a leaf function.
// * All other registers must be preserved on return.
// * No other registers have known values except the return address
// register (for branch-and-link machines). Machines able to use an
// alternate return address register do so (RISC-V: t0 instead of ra).
//
// The two entry points here share most of their code for each machine.
// The "split" version (most commonly used) extracts the two bit fields
// from the GOT value; the "indirect" version (rarely used) uses the GOT
// value as a pointer to two uintptr_t fields: index and offset. Then it
// does a TLS access in IE model to read _dl_tlsdesc_runtime_dynamic_blocks
// and compute `blocks[index] + offset - $tp` into the return value register.
//
// The implementation pairs for each machine are quite similar as well. A
// prologue common to both versions spills additional registers to be used
// as temporaries and does the TLS access to get the blocks vector pointer
// into a register. Each version has different code to load the second GOT
// slot via the argument register and extract an index and offset from it
// into registers. Finally, a common tail computes the return value of
// `blocks[index] + offset - $tp`; reloads spills; and returns.
//
// Each machine defines a prologue macro used by both entry points. The
// common tail is implemented directly in the "split" version at the label
// `.Lload_block`; the less-used "indirect" version just jumps there.
//
// The code is written to work for ILP32 as well as LP64 (and for AArch32),
// though that has not been thoroughly tested yet. Note that GOT slot size
// matches pointer size on most machines, but not on x86-64 (ILP32).
// SPLIT_SHIFT is half the pointer width in bits: where it is used below, the
// "split" GOT value keeps the offset in its low SPLIT_SHIFT bits and the
// block index in the bits above that.
#ifndef __UINTPTR_WIDTH__
#error "Compiler should predefine __UINTPTR_WIDTH__"
#else
#define SPLIT_SHIFT (__UINTPTR_WIDTH__ / 2)
#endif
// PTR_SHIFT is log2 of the pointer size in bytes, used to scale an index
// into the blocks vector.
#if __UINTPTR_WIDTH__ == 32
#define PTR_SHIFT 2
#elif __UINTPTR_WIDTH__ == 64
#define PTR_SHIFT 3
#else
#error "Unexpected __UINTPTR_WIDTH__ value!"
#endif
#if defined(__aarch64__)
// This is the shared beginning portion of both the split and indirect
// versions. It extracts the GOT's value slot into x1 and the blocks
// vector pointer into x2, with x0 then available for scratch. Note that
// some uses are literal x[012] and some instead use tlsdesc_r[012] macros,
// depending on whether it's a base register (always 64 bits) or it's a
// pointer-sized operand.
//
// On entry: x0 = pointer to the TLSDESC GOT slot pair.
// On exit:  x1 = contents of the value slot, x2 = blocks vector pointer,
//           x0 = scratch (the caller's value is gone; CFI marks it undefined).
// The caller's x1 and x2 are spilled here, so every path that uses this
// prologue must reload them before returning.
.macro prologue
.tlsdesc.cfi
// Spill the two temporaries; TLSDESC callers expect them preserved.
stp.spill x1, x2
// Fetch the blocks vector pointer using a standard IE model TLS read:
// load the $tp offset from the GOT; load the value from $tp + offset.
#ifdef __AARCH64_CMODEL_TINY__
// Tiny code model: the GOT entry is reachable with a single literal load.
ldr tlsdesc_r1, :gottprel:_dl_tlsdesc_runtime_dynamic_blocks
#else
// Otherwise form the GOT entry's address as page (adrp) + low 12 bits.
adrp x1, :gottprel:_dl_tlsdesc_runtime_dynamic_blocks
ldr tlsdesc_r1, [x1, #:gottprel_lo12:_dl_tlsdesc_runtime_dynamic_blocks]
#endif
mrs x2, TPIDR_EL0 // System registers are 64 bits.
// x2 = the word at $tp (x2) + offset (x1): the blocks vector pointer.
ldr tlsdesc_r2, [x2, tlsdesc_uxtw(tlsdesc_r1, #0)]
// Fetch the value word from the GOT slot (argument pointer).
ldr tlsdesc_r1, [x0, #tlsdesc.value_offset]
// The caller's value (the pointer into the GOT) will be clobbered next.
.cfi_undefined x0
.endm
// TLSDESC entry point for the "split" encoding: the GOT value slot packs the
// offset into its low SPLIT_SHIFT bits and the block index into the
// SPLIT_SHIFT bits above that.
//
// On entry: x0 = pointer to the TLSDESC GOT slot pair.
// Returns:  x0 = blocks[index] + offset - $tp.
// x1 and x2 are spilled by the prologue and reloaded before returning.
//
// The tail starting at .Lload_block is shared with the indirect entry point,
// which branches there with x0 = offset, x1 = index, x2 = blocks vector.
.function _dl_tlsdesc_runtime_dynamic_split, global
// The first portion of the function is the same here and below.
prologue
// x0 gets the low bits: the offset.
ubfx tlsdesc_r0, tlsdesc_r1, #0, #SPLIT_SHIFT
// x1 gets the high bits: the block index.
ubfx tlsdesc_r1, tlsdesc_r1, #SPLIT_SHIFT, #SPLIT_SHIFT
.Lload_block:
// Fetch the module's block (x1) from the blocks vector.
// The index is scaled by the pointer size (1 << PTR_SHIFT).
ldr tlsdesc_r1, [x2, tlsdesc_uxtw(tlsdesc_r1, #PTR_SHIFT)]
// x2 is no longer needed; get $tp back there so it can be subtracted out.
mrs x2, TPIDR_EL0
// Finally, return value (x0) = block (x1) + offset (x0) - $tp (x2).
add tlsdesc_r0, tlsdesc_r1, tlsdesc_r0
sub tlsdesc_r0, tlsdesc_r0, tlsdesc_r2
// Epilogue: restore the registers spilled by the prologue.
ldp.reload x1, x2
ret
.end_function
// TLSDESC entry point for the "indirect" encoding: the GOT value slot is a
// pointer to a pair of pointer-sized words, the block index followed by the
// offset.
//
// On entry: x0 = pointer to the TLSDESC GOT slot pair.
// Returns:  x0 = blocks[index] + offset - $tp, computed by the shared tail
//           at .Lload_block in the split entry point, which also reloads the
//           registers spilled by the prologue and returns to the caller.
.function _dl_tlsdesc_runtime_dynamic_indirect, global
// The first portion of the function is the same here and above.
prologue
// x1 gets the first word: the block index.
// x0 gets the second word: the offset.
ldp tlsdesc_r1, tlsdesc_r0, [x1]
// The tail is the same as for the (hotter) split case, so share its code.
b .Lload_block
.end_function
#elif defined(__arm__)
// This is the shared beginning portion of both the split and indirect
// versions. It extracts the GOT's value slot into r0 and the blocks
// vector pointer into r2, with r1 then available for scratch.
//
// On entry: r0 = pointer to the TLSDESC GOT slot pair.
// On exit:  r0 = contents of the value slot, r2 = blocks vector pointer,
//           r1 = scratch.
// The caller's r1 and r2 are spilled here, so every path that uses this
// prologue must reload them before returning.
.macro prologue
.tlsdesc.cfi
// Spill the two temporaries; TLSDESC callers expect them preserved.
push.spill r1, r2
// Fetch the value word: the second word of the GOT slot pair.
ldr r0, [r0, #4]
.cfi_undefined r0 // Caller's value (argument pointer) no longer available.
// Load the $tp offset from the GOT for IE model TLS access.
// First r1 gets the PC-relative distance to the GOT entry; the expression
// is relative to local label 0 below, adjusted by pcrel.bias.
ldr r1, =_dl_tlsdesc_runtime_dynamic_blocks(GOTTPOFF) - (0f + pcrel.bias)
read_tp r2 // Fetch $tp in between.
// r1 = address of the GOT entry, then r1 = the $tp offset stored there.
0:add r1, pc
ldr r1, [r1]
// Load the blocks vector pointer (r2) from $tp (r2) + offset (r1).
ldr r2, [r2, r1]
.endm
// TLSDESC entry point for the "split" encoding: the GOT value slot packs the
// offset into its low 16 bits and the block index into the bits above
// SPLIT_SHIFT.
//
// On entry: r0 = pointer to the TLSDESC GOT slot pair.
// Returns:  r0 = blocks[index] + offset - $tp.
// r1 and r2 are spilled by the prologue and reloaded before returning.
//
// The tail starting at .Lload_block is shared with the indirect entry point,
// which branches there with r0 = index, r1 = offset, r2 = blocks vector.
.function _dl_tlsdesc_runtime_dynamic_split, global
// The first portion of the function is the same here and below.
prologue
// r1 gets the low bits: the offset.
uxth r1, r0
// r0 gets the high bits: the block index.
lsr r0, r0, #SPLIT_SHIFT
.Lload_block:
// Fetch the module's block from the blocks vector.
// The index is scaled by 4 (lsl #2), the size of each vector element.
ldr r0, [r2, r0, lsl #2]
// Recover $tp so we can subtract it out.
read_tp r2
// Finally, add in the offset and subtract out $tp.
add r0, r0, r1
sub r0, r0, r2
// Epilogue.
// Restore the registers spilled by the prologue and return via lr.
pop.reload r1, r2
bx lr
.end_function
// TLSDESC entry point for the "indirect" encoding: the GOT value slot is a
// pointer to a pair of words, the block index followed by the offset.
//
// On entry: r0 = pointer to the TLSDESC GOT slot pair.
// Returns:  r0 = blocks[index] + offset - $tp, computed by the shared tail
//           at .Lload_block in the split entry point, which also reloads the
//           registers spilled by the prologue and returns to the caller.
.function _dl_tlsdesc_runtime_dynamic_indirect, global
// The first portion of the function is the same here and above.
prologue
// r0 gets the first word: the block index.
// r1 gets the second word: the offset.
// (r0 is both the base and a destination; there is no writeback.)
ldm r0, {r0, r1}
// The tail is the same as for the split case, so share its code.
b .Lload_block
.end_function
#elif defined(__riscv)
// This is the shared beginning portion of both the split and indirect
// versions. It extracts the GOT's value slot into a1 and the blocks
// vector pointer into a2, with a0 then available for scratch.
//
// On entry: a0 = pointer to the TLSDESC GOT slot pair.
// On exit:  a1 = contents of the value slot, a2 = blocks vector pointer,
//           a0 = scratch (CFI marks the caller's value undefined).
// The caller's a1 and a2 are saved in a 16-byte stack frame allocated here,
// so every path that uses this prologue must reload them and pop the frame
// before returning.
.macro prologue
.tlsdesc.cfi
// Allocate 16 bytes of stack and spill a1 at 0(sp) and a2 at 8(sp), keeping
// the CFI in step with each instruction.
add sp, sp, -16
.cfi_adjust_cfa_offset 16
sd a1, 0(sp)
.cfi_rel_offset a1, 0
sd a2, 8(sp)
.cfi_rel_offset a2, 8
// Fetch the value word from the GOT slot (argument pointer).
tlsdesc.load a1, tlsdesc.value_offset(a0)
// Fetch the tp offset from the GOT for IE model TLS access.
la.tls.ie a2, _dl_tlsdesc_runtime_dynamic_blocks
// Load the blocks vector pointer (a2) from tp + offset (a2).
tlsdesc.add a2, tp, a2
tlsdesc.load a2, (a2)
// The caller's value (the pointer into the GOT) will be clobbered next.
.cfi_undefined a0
.endm
// TLSDESC entry point for the "split" encoding: the GOT value slot packs the
// offset into its low SPLIT_SHIFT bits and the block index into the
// SPLIT_SHIFT bits above that.
//
// On entry: a0 = pointer to the TLSDESC GOT slot pair; t0 = return address.
// Returns:  a0 = blocks[index] + offset - tp.
// a1 and a2 are spilled by the prologue and reloaded before returning.
//
// The tail starting at .Lload_block is shared with the indirect entry point,
// which jumps there with a0 = offset, a1 = index, a2 = blocks vector.
.function _dl_tlsdesc_runtime_dynamic_split, global
// The first portion of the function is the same here and below.
prologue
// a0 gets the low bits: the offset.
// a1 gets the high bits: the block index.
#ifdef _LP64
zext.w a0, a1
srl a1, a1, SPLIT_SHIFT
#else
// zext.h is a two-operand pseudo-instruction (rd, rs) that always extracts
// the low 16 bits, which is exactly SPLIT_SHIFT bits for 32-bit pointers.
zext.h a0, a1
// NOTE(review): srlw exists only on RV64, so this path presumes an ILP32 ABI
// on a 64-bit machine (consistent with the sd/ld spills) -- confirm.
srlw a1, a1, SPLIT_SHIFT
#endif
.Lload_block:
// The blocks vector element contains a pointer, and we're adding an offset
// to that. But the return value is not that final pointer! Instead, it's
// that pointer's distance from tp, which the caller will add back in (this
// makes most sense when the static TLS case is considered). So adjust the
// offset down by the tp value here.
tlsdesc.sub a0, a0, tp
// Fetch the module's block from the blocks vector.
sll a1, a1, PTR_SHIFT // Scale up by pointer size.
add a1, a2, a1 // Add the blocks vector pointer.
tlsdesc.load a1, (a1) // a1 = blocks[index]
// Add in the offset.
tlsdesc.add a0, a1, a0
// Epilogue: reload the spilled registers and pop the prologue's frame,
// keeping the CFI in step with each instruction.
ld a1, 0(sp)
.cfi_same_value a1
ld a2, 8(sp)
.cfi_same_value a2
add sp, sp, 16
.cfi_adjust_cfa_offset -16
// The caller's return address is in t0, with ra preserved.
jr t0
.end_function
// TLSDESC entry point for the "indirect" encoding: the GOT value slot is a
// pointer to a pair of pointer-sized words, the block index followed by the
// offset.
//
// On entry: a0 = pointer to the TLSDESC GOT slot pair.
// Returns:  a0 = blocks[index] + offset - tp, computed by the shared tail
//           at .Lload_block in the split entry point, which also reloads the
//           registers spilled by the prologue and returns to the caller.
.function _dl_tlsdesc_runtime_dynamic_indirect, global
// The first portion of the function is the same here and above.
prologue
// The offset is loaded first because the index load below overwrites a1,
// the base pointer for both loads.
// a0 gets the second word: the offset.
tlsdesc.load a0, __SIZEOF_POINTER__(a1)
// a1 gets the first word: the block index.
tlsdesc.load a1, 0(a1)
// The tail is the same as for the split case, so share its code.
j .Lload_block
.end_function
#elif defined(__x86_64__)
// This is the shared beginning portion of both the split and indirect
// versions. It extracts the GOT's value slot into %rax and the blocks
// vector pointer into %rdx, with %rcx then available for scratch.
//
// On entry: %rax = pointer to the TLSDESC GOT slot pair.
// On exit:  %rax = contents of the value slot, %rdx = blocks vector pointer,
//           %rcx = scratch.
// The caller's %rcx and %rdx are pushed here (in that order), so every path
// that uses this prologue must pop them in reverse order before returning.
.macro prologue
.tlsdesc.cfi
// Spill the two temporaries; TLSDESC callers expect them preserved.
push.spill %rcx
push.spill %rdx
// On entry %rax contains the argument: the address of the GOT slot pair.
// The first word holds our own PC, the second is the value slot.
mov 8(%rax), %rax
// The caller's value (the pointer into the GOT) is gone from here on.
.cfi_undefined %rax
// Fetch the $tp offset from the GOT for IE model TLS access.
mov _dl_tlsdesc_runtime_dynamic_blocks@GOTTPOFF(%rip), %rdx
// Load the blocks vector pointer (%rdx) from $tp + offset (%rdx).
mov %fs:(%rdx), %tlsdesc_dx
.endm
// Note that on x86-64 ILP32, GOT entries are still 8 bytes, to facilitate
// use of the indirect addressing modes. This means that even ILP32 can
// make use of a full 32 bits for each of index and offset.
//
// TLSDESC entry point for the "split" encoding: the 8-byte GOT value slot
// packs the offset into its low 32 bits and the block index into its high
// 32 bits.
//
// On entry: %rax = pointer to the TLSDESC GOT slot pair.
// Returns:  %rax = blocks[index] + offset - $tp.
// %rcx and %rdx are spilled by the prologue and reloaded before returning.
//
// The tail starting at .Lload_block is shared with the indirect entry point,
// which jumps there with %rax = index, %rcx = offset, %rdx = blocks vector.
.function _dl_tlsdesc_runtime_dynamic_split, global
// The first portion of the function is the same here and below.
prologue
// %rcx gets the low bits: the offset.
// (Writing %ecx zero-extends into the full %rcx.)
mov %eax, %ecx
// %rax gets the high bits: the block index.
shr $32, %rax
.Lload_block:
// Fetch the module's block (index %rax) from the blocks vector (%rdx).
// The index is scaled by the pointer size.
mov (%rdx, %rax, __SIZEOF_POINTER__), %tlsdesc_ax
// Finally, add in the offset and subtract back out $tp that will be added.
// The $tp value is read from %fs:0.
add %tlsdesc_cx, %tlsdesc_ax
sub %fs:0, %tlsdesc_ax
// Epilogue.
// Pop in the reverse of the prologue's push order.
pop.reload %rdx
pop.reload %rcx
ret
.end_function
// TLSDESC entry point for the "indirect" encoding: the GOT value slot is a
// pointer to a pair of pointer-sized words, the block index followed by the
// offset.
//
// On entry: %rax = pointer to the TLSDESC GOT slot pair.
// Returns:  %rax = blocks[index] + offset - $tp, computed by the shared tail
//           at .Lload_block in the split entry point, which also reloads the
//           registers spilled by the prologue and returns to the caller.
.function _dl_tlsdesc_runtime_dynamic_indirect, global
// The first portion of the function is the same here and above.
prologue
// The offset is loaded first because the index load below overwrites %rax,
// the base pointer for both loads.
// %rcx gets the second word: the offset.
mov __SIZEOF_POINTER__(%rax), %tlsdesc_cx
// %rax gets the first word: the block index.
mov (%rax), %tlsdesc_ax
// The tail is the same as for the split case, so share its code.
jmp .Lload_block
.end_function
#else
// Not all machines have TLSDESC support specified in the psABI.
#endif