// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <lib/arch/asm.h>
// This is doing the equivalent of:
// ```
// [[noreturn]] void _start(...) { __libc_start_main(..., &main); }
// ```
// Currently the `...` is three arguments: the zx_handle_t and vDSO address
// passed to zx_process_start, plus another zx_handle_t possibly added by
// the startup dynamic linker.
//
// That is, _start just calls __libc_start_main with the arguments it got, plus
// an additional argument that's the address of the main function. (Note we
// want to ensure it's not a tail call so that there will be an outermost
// backtrace frame showing _start as the caller of __libc_start_main.)
//
// Three cases need to be considered:
// * static PIE: _start from Scrt1.o, __libc_start_main from libc.a, and main
// from the user's program (perhaps in a static library) are all statically
// linked together into the executable itself.
// * dynamic linking, static main: _start from Scrt1.o and main from the
// user's program (perhaps via some other static library) are statically
// linked into the executable itself and have hidden visibility;
// __libc_start_main is in libc.so.
// * dynamically linked main: _start from Scrt1.o is statically linked into
// the executable itself; __libc_start_main is in libc.so; main is in some
// other shared library specified at static link time.
//
// Most often main will be defined in the executable itself at static link
// time, so a direct PC-relative reference would work. That's what the
// compiler would generate if main were declared with STV_HIDDEN visibility.
//
// If main is found in a shared library at static link time then it will
// require a runtime dynamic relocation. When using a dynamic linker, dynamic
// relocation has already been done by now, so a normal GOT relocation here
// could load the address from main's GOT slot. Likewise, that's what the
// compiler would normally generate if main weren't declared as STV_HIDDEN.
//
// In either case the __libc_start_main function will usually be found in the
// shared libc.so, so it needs to be called via PLT or GOT. The compiler will
// normally use the `@PLT` form to request the PLT-call relocation, which tells
// the linker to generate a PLT entry as needed. The PLT entry really just
// does a load from the GOT and a jump, so it's a bit superfluous in the usual
// case and adds a second jump. The compiler under `-fno-plt` would instead
// generate a GOT load and indirect call here, in effect inlining that PLT
// entry. A compiler doing LTO where it knows the call is resolved into a
// shared library might very well also elide the PLT and use the direct GOT
// access, an optimization that is a very good one in the general case.
//
// However, this one Scrt1.o is meant to support both the dynamic linking case
// and the static PIE case. In the static PIE case, this is the real entry
// point directly from the program loader and the GOT (and other initialized
// data) is still in its link-time state. The self-relocation work will be
// done by __libc_start_main as linked in from the static libc.a; before that
// the GOT cannot be used. With a direct PC-relative reference to main here,
// that would be fine--but that's not compatible with the case of main in a
// shared library. The relocation will always have been done by the time
// __libc_start_main actually needs to call main, it just won't always have
// been done by the time _start needs to pass the main function pointer to
// __libc_start_main. So what's needed is a different, fixed jump target
// (i.e. function pointer) that will, when called, tail call the main function
// as resolved by that dynamic relocation--just like a PLT entry! But, in PIC
// and PIE modes the compiler won't try to use a PLT entry to materialize a
// function pointer--only for an actual direct call. The compiler will instead
// just use a GOT load, in essence doing the first half of the PLT's work
// without doing the second half of making the function call.
//
// This can be solved in C++ source by passing the address of an internal
// linkage wrapper function that calls main. Since that function has internal
// linkage, the compiler's reference to it will use a direct PC-relative
// relocation to pass the function pointer to __libc_start_main. Then the need
// for PLT or GOT resolution of main is deferred until that function is called.
// However, there is no way to force the compiler to guarantee a total tail
// call, i.e. a simple jump to main, such that a backtrace will never show this
// local function as the caller of main instead of __libc_start_main directly.
//
// The simplest solution all around is to just write _start entirely in
// assembly where it's possible to precisely control what kinds of relocations
// are used for both main and __libc_start_main.
.function _start, global
  // Process entry point.  The three argument registers that _start receives
  // are handed on unchanged as the first three arguments of
  // __libc_start_main; the only value computed here is the fourth argument,
  // which is the address of main or of a stand-in that jumps to main.
  .prologue.fp

#if defined(__aarch64__)

  // Only two relocation kinds can make the linker create a PLT entry:
  //  * jump / call relocations, which are valid only on `b` or `bl`;
  //  * PLT32, which is valid only on a flat 32-bit data field.
  // Neither can be applied to an `adrp` + `add` pair, so `main@PLT` cannot be
  // materialized directly.  Going through an RODATA word carrying a PLT32
  // relocation would mean forming that word's address in one register,
  // loading the word into another, and adding the two.  It is cheaper to
  // pass the address of a local trampoline that does a pure tail call to
  // main: the trampoline is just below, so a single `adr` always reaches it,
  // whereas a PLT entry or an RODATA word would always need `adrp` + `add`.
  adr x3, .Lcall_main
  bl __libc_start_main

  // Never reached: __libc_start_main is declared [[noreturn]].  This address
  // only shows up as the return address in backtraces; the explicit trap
  // makes it obvious if control ever does arrive here.
  udf #0

  // __libc_start_main bounces through here to call main.  In the common case
  // main is defined in the executable and the static linker turns this into
  // a direct jump: one extra jump, no load.  When it resolves to a PLT entry
  // instead, that entry does a GOT load and jumps again: two extra jumps and
  // one load.  Doing the GOT load and indirect jump right here would bring
  // that down to one extra jump and one load, but in the common case it
  // would keep the same number of jumps and add a load from an extra
  // relocated GOT slot.
  // NOTE(review): this label is reached through a function pointer and has
  // no `bti c` landing pad; confirm whether BTI is ever enforced for this
  // code.
.Lcall_main:
  b main

#elif defined(__riscv)

  // The relocation constraints are the same as described for AArch64.  The
  // PC-relative sequences take even more instructions here, though they are
  // hidden behind assembler pseudo-instructions.  `lla` produces the
  // multi-instruction form needed for an arbitrary distance to the label,
  // and it should be relaxed at link time down to only two instructions.
  lla a3, .Lcall_main
  call __libc_start_main
  unimp
.Lcall_main:
  tail main

#elif defined(__x86_64__)

  // x86-64 instructions use flat 32-bit displacements and immediates, so the
  // PLT32 relocation type works in many of them, including this `lea`.
  //
  // The alternative is a load from `main@GOTPCREL(%rip)`.  That requests a
  // GOT slot load, which cannot work in the static PIE case because the GOT
  // has not been relocated yet when _start runs.  With modern assemblers and
  // linkers that reference is a GOTPCRELX relocation, which is guaranteed to
  // be relaxed at link time into a direct PC-relative `lea` when main is
  // defined in the executable.  It would not support a static PIE that does
  // its own dynamic linking to resolve main in a shared library, which does
  // work on the other machines because they use PLT calls.  That case does
  // not really need to be supported: it means linking in the static libc.a
  // and then dynamically linking against shared libraries other than the
  // vDSO.
  //
  // In the common case both forms relax into the same `lea` anyway.  When
  // main is in a shared library and dynamic linking has already been done,
  // the extra bounce through the PLT costs little (less than the double
  // bounce on the other machines, where PLT32 is harder to use).
  lea main@PLT(%rip), %rcx
  call __libc_start_main@PLT
  ud2

#else
#error "unsupported machine"
#endif

  // Control never reaches the end of the function, so the .epilogue.fp that
  // would normally pair with .prologue.fp is left out.
.end_function