#include <stdatomic.h>
#include <stddef.h>
#include <string.h>
#include <zircon/process.h>
#include <zircon/syscalls.h>

#include "asan_impl.h"
#include "libc.h"
#include "threads_impl.h"
#include "zircon_impl.h"

// See dynlink.c for the full explanation. The compiler generates calls to
// these implicitly. They are PLT calls into the ASan runtime, which is fine
// in and of itself at this point (unlike in dynlink.c). But they might also
// use ShadowCallStack, which is not set up yet. So make sure references here
// only use the libc-internal symbols, which don't have any setup requirements.
__asan_weak_ref("memcpy")
__asan_weak_ref("memset")

enum lock_state {
  LOCK_UNLOCKED,
  LOCK_LOCKED,
  LOCK_CONTENDED,
};
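
// This is the classic three-state futex mutex (see Ulrich Drepper's
// "Futexes Are Tricky"): LOCK_LOCKED means the lock is held with no
// waiters, so release can skip the futex_wake syscall; LOCK_CONTENDED
// means there may be waiters that release must wake.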
static struct pthread* all_threads;
static zx_futex_t all_threads_lock = LOCK_UNLOCKED;

LIBC_NO_SAFESTACK struct pthread** __thread_list_acquire(void) {
  // Fast path: LOCK_UNLOCKED -> LOCK_LOCKED.
  int expected = LOCK_UNLOCKED;
  if (atomic_compare_exchange_strong_explicit(&all_threads_lock, &expected, LOCK_LOCKED,
                                              memory_order_acquire, memory_order_relaxed)) {
    return &all_threads;
  }

  // Slow path: force the state to LOCK_CONTENDED; we have acquired the lock
  // if the previous state was LOCK_UNLOCKED.
  while (true) {
    int observed =
        atomic_exchange_explicit(&all_threads_lock, LOCK_CONTENDED, memory_order_acquire);
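    // Note: when the exchange observes LOCK_UNLOCKED we have taken the lock
    // but also over-marked it as contended; the only cost is one possible
    // spurious futex_wake at release time, which is benign.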
    if (observed == LOCK_UNLOCKED) {
      break;
    }
    if (observed != LOCK_LOCKED && observed != LOCK_CONTENDED) {
      // Lock memory was corrupted.
      __builtin_trap();
    }
    _zx_futex_wait(&all_threads_lock, LOCK_CONTENDED, ZX_HANDLE_INVALID, ZX_TIME_INFINITE);
  }

  return &all_threads;
}

LIBC_NO_SAFESTACK void __thread_list_release(void) {
  int old = atomic_exchange_explicit(&all_threads_lock, LOCK_UNLOCKED, memory_order_release);
  if (old == LOCK_CONTENDED) {
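    // Waking a single waiter is enough: whichever thread acquires the lock
    // next resets the state to LOCK_CONTENDED, so its own release wakes the
    // rest in turn.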
    _zx_futex_wake(&all_threads_lock, 1);
  }
}

// A detached thread has to remove itself from the list.
// Joinable threads get removed only in pthread_join.
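// The list uses the pointer-to-pointer idiom: t->prevp points at whichever
// 'next' slot (or all_threads itself) currently points to t, so unlinking
// needs no special case for the list head.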
LIBC_NO_SAFESTACK void __thread_list_erase(void* arg) {
  struct pthread* t = arg;
  __thread_list_acquire();
  *t->prevp = t->next;
  if (t->next != NULL) {
    t->next->prevp = t->prevp;
  }
  __thread_list_release();
}

static pthread_rwlock_t allocation_lock = PTHREAD_RWLOCK_INITIALIZER;

// Many threads could be reading the TLS state.
static void thread_allocation_acquire(void) { pthread_rwlock_rdlock(&allocation_lock); }

// dlopen calls this under another lock. Only one dlopen call can be
// modifying state at a time.
void __thread_allocation_inhibit(void) { pthread_rwlock_wrlock(&allocation_lock); }

void __thread_allocation_release(void) { pthread_rwlock_unlock(&allocation_lock); }

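// Rounds sz up to the next multiple of PAGE_SIZE, which must be a power of
// two for the mask trick below to work. E.g. with 4 KiB pages:
// round_up_to_page(1) == 4096, round_up_to_page(4096) == 4096, and
// round_up_to_page(4097) == 8192.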
LIBC_NO_SAFESTACK static inline size_t round_up_to_page(size_t sz) {
  return (sz + PAGE_SIZE - 1) & -PAGE_SIZE;
}

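// Per the ELF TLS ABI: in variant I (TLS_ABOVE_TP) a module's TLS block
// lives at a positive offset above the thread pointer, while in variant II
// the stored offset is a distance below it, hence the negation.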
LIBC_NO_SAFESTACK static ptrdiff_t offset_for_module(const struct tls_module* module) {
#ifdef TLS_ABOVE_TP
  return module->offset;
#else
  return -module->offset;
#endif
}

LIBC_NO_SAFESTACK static thrd_t copy_tls(unsigned char* mem, size_t alloc) {
  thrd_t td;
  struct tls_module* p;
  size_t i;
  void** dtv;

#ifdef TLS_ABOVE_TP
  // *------------------------------------------------------------------------*
  // | pthread | tcb | X | tls_1 | ... | tls_n | ... | tls_cnt | dtv[1] | ... |
  // *------------------------------------------------------------------------*
  // ^         ^         ^             ^             ^
  // td        tp        dtv[1]        dtv[n+1]      dtv
  //
  // Note: The TCB is actually the last member of pthread.
  // See: "Addenda to, and Errata in, the ABI for the ARM Architecture"

  dtv = (void**)(mem + libc.tls_size) - (libc.tls_cnt + 1);
  // We need to make sure that the thread pointer is maximally aligned so
  // that tp + dtv[N] is aligned to align_N no matter what N is. So we need
  // 'mem' to be such that if mem == td then td->head is maximally aligned.
  // To do this we take &td->head (i.e. mem + the offset of 'head'), align
  // it up, and then subtract the offset of 'head' back out so that
  // &td->head itself ends up aligned.
  uintptr_t tp = (uintptr_t)mem + PTHREAD_TP_OFFSET;
  tp = (tp + libc.tls_align - 1) & -libc.tls_align;
  td = (thrd_t)(tp - PTHREAD_TP_OFFSET);
  // Now make mem the new thread pointer.
  mem = (unsigned char*)tp;
#else
  // *-----------------------------------------------------------------------*
  // | tls_cnt | dtv[1] | ... | tls_n | ... | tls_1 | tcb | pthread | unused |
  // *-----------------------------------------------------------------------*
  // ^                        ^             ^       ^
  // dtv                      dtv[n+1]      dtv[1]  tp/td
  //
  // Note: The TCB is actually the first member of pthread.
  dtv = (void**)mem;

  mem += alloc - sizeof(struct pthread);
  mem -= (uintptr_t)mem & (libc.tls_align - 1);
  td = (thrd_t)mem;
#endif

  for (i = 1, p = libc.tls_head; p; i++, p = p->next) {
    dtv[i] = mem + offset_for_module(p);
    memcpy(dtv[i], p->image, p->len);
  }

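  // dtv[0] records the number of TLS modules, i.e. the index of the last
  // valid dtv entry.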
  dtv[0] = (void*)libc.tls_cnt;
  td->head.dtv = dtv;
  return td;
}

#if __has_feature(hwaddress_sanitizer)
// Define stubs here for hwasan functions that call into the runtime. We want
// to intercept runtime calls here because the hwasan runtime is instrumented
// with shadow call stack, but x18 may not be set up yet, so accessing it can
// result in a page fault. To avoid calling into the runtime, we define local
// stubs, which can be empty, to be called instead.
#include "hwasan-stubs.h"
#include "sanitizer-stubs.h"
#define HWASAN_STUB(name) HWASAN_STUB_ASM("__hwasan_" #name)
#define HWASAN_STUB_ASM(name) SANITIZER_STUB_ASM(name, SANITIZER_STUB_ASM_BODY(name))
HWASAN_STUBS
#endif  // __has_feature(hwaddress_sanitizer)

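// Maps a size-byte block of |vmo| into a fresh sub-VMAR, with |before| and
// |after| bytes of unmapped guard space around it. Returns true on failure.
// Closing the sub-VMAR handle afterwards keeps the guard ranges reserved:
// with no handle to the region, nothing else can be mapped into them.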
LIBC_NO_SAFESTACK static bool map_block(zx_handle_t parent_vmar, zx_handle_t vmo, size_t vmo_offset,
                                        size_t size, size_t before, size_t after,
                                        struct iovec* mapping, struct iovec* region) {
  region->iov_len = before + size + after;
  zx_handle_t vmar;
  uintptr_t addr;
  zx_status_t status = _zx_vmar_allocate(
      parent_vmar, ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_CAN_MAP_SPECIFIC, 0,
      region->iov_len, &vmar, &addr);
  if (status != ZX_OK)
    return true;
  region->iov_base = (void*)addr;
  status = _zx_vmar_map(vmar, ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_SPECIFIC, before, vmo,
                        vmo_offset, size, &addr);
  if (status != ZX_OK)
    _zx_vmar_destroy(vmar);
  _zx_handle_close(vmar);
  mapping->iov_base = (void*)addr;
  mapping->iov_len = size;
  return status != ZX_OK;
}

// This allocates all the per-thread memory for a new thread about to
// be created, or for the initial thread at startup. It's called
// either at startup or under thread_allocation_acquire. Hence,
// it's serialized with any dynamic linker changes to the TLS
// bookkeeping.
//
// This conceptually allocates five things, but concretely allocates
// four separate blocks.
// 1. The safe stack (where the thread's SP will point).
// 2. The unsafe stack (where __builtin___get_unsafe_stack_ptr() will point).
// 3. The shadow call stack (where the thread's SCSP will point).
//    (This only exists #if HAVE_SHADOW_CALL_STACK.)
// 4. The thread descriptor (struct pthread). The thread pointer points
//    into this (where into it depends on the machine ABI).
// 5. The static TLS area. The ELF TLS ABI for the Initial Exec model
//    mandates a fixed distance from the thread pointer to the TLS area
//    across all threads. So effectively this must always be allocated
//    as part of the same block with the thread descriptor.
// This function also copies in the TLS initializer data.
// It initializes the basic thread descriptor fields.
// Everything else is zero-initialized.
//
// The region for the TCB and TLS area has a precise required size that's
// computed here. The sizes of the stacks and the guard regions around them
// are speculative parameters to be tuned. Note that there are only two tuning
// knobs provided due to API legacy: the "stack size" and the "guard size".
//
// Nowadays with both safe-stack and shadow-call-stack available in the ABI
// there are three different stacks to choose sizes for. Different kinds of
// program behavior consume each of the different stacks at different rates, so
// it's hard to predict generically: buffers and other address-taken stack
// variables grow the unsafe stack; pure call depth (e.g. deep recursion) grows
// the shadow call stack; certain kinds of large functions, and aggregate call
// depth of those, grow the safe stack.
//
// The legacy presumption is that all consumption is on a single stack (the
// machine stack, aka the "safe" stack under safe-stack). Thus the single
// tuned size provided by the legacy API is meant to represent total
// consumption across all types of stack use, but we don't know how best to
// allot that among the three stacks so that the actual overall consumption
// pattern that works in the traditional single-stack ABI with a given total
// consumption limit still works with the new stack ABIs.
//
// To support whatever consumption patterns may arise, we give each of the
// three stacks the full size requested via the legacy API for a unitary stack.
// This seems very wasteful: 3x the stack allocation! But in theory it should
// only waste 3x *address space*, not 3x *memory*. The worst-case total
// "wasted" space in each of the three should be one page minus one word,
// i.e. around three pages total (plus some amortized page table overhead
// proportional to the address space use). Since all stack pages are actually
// lazily allocated on demand, the excess unused pages of each stack that's
// larger than it needs to be will never be allocated. The only alternative
// that works in the general case is to come up with new tuning APIs that can
// express the different kinds of stack consumption required to tune the three
// sizes separately (or proportionally to each other or whatever).

// In the function below, the compiler may generate calls to memcpy
// intrinsics for copying structs. With ASan enabled, calls to these memcpy
// intrinsics are converted to calls to __asan_memcpy. Calls to the ASan runtime
// in these cases may not be safe because of ABI requirements like
// ShadowCallStack that aren't ready yet. So redirect this symbol to libc's own
// memcpy implementation, which is always a leaf function that doesn't require
// the ShadowCallStack ABI.
__asan_weak_ref("memcpy")

LIBC_NO_SAFESTACK thrd_t
__allocate_thread(size_t requested_guard_size, size_t requested_stack_size,
                  const char* thread_name, char vmo_name[ZX_MAX_NAME_LEN]) {
  // In the initial thread, we're allocating the stacks and TCB for the running
  // thread itself. So we can't make calls that rely on safe-stack or
  // shadow-call-stack setup. Rather than annotating everything in the call
  // path here, we just avoid the problematic calls. Locking is not required
  // since this is the sole thread.
  const bool initial_thread = vmo_name == NULL;

  if (!initial_thread) {
    thread_allocation_acquire();
  }

  const size_t guard_size = requested_guard_size == 0 ? 0 : round_up_to_page(requested_guard_size);
  const size_t stack_size = round_up_to_page(requested_stack_size);

  const size_t tls_size = libc.tls_size;
  const size_t tcb_size = round_up_to_page(tls_size);

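  // A single VMO backs everything: the TCB/TLS block first, then one
  // stack-sized region apiece for the safe stack, the unsafe stack, and
  // (when enabled) the shadow call stack, each mapped separately below
  // with its own guards.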
  const size_t vmo_size = tcb_size + stack_size * (2 + HAVE_SHADOW_CALL_STACK);
  zx_handle_t vmo;
  zx_status_t status = _zx_vmo_create(vmo_size, 0, &vmo);
  if (status != ZX_OK) {
    if (!initial_thread) {
      __thread_allocation_release();
    }
    return NULL;
  }
  struct iovec tcb, tcb_region;
  if (map_block(_zx_vmar_root_self(), vmo, 0, tcb_size, PAGE_SIZE, PAGE_SIZE, &tcb, &tcb_region)) {
    if (!initial_thread) {
      __thread_allocation_release();
    }
    _zx_handle_close(vmo);
    return NULL;
  }

  thrd_t td = copy_tls(tcb.iov_base, tcb.iov_len);
  if (initial_thread) {
    td->process_handle = _zx_process_self();
  } else {
    td->process_handle = __pthread_self()->process_handle;
  }

  // At this point all our access to global TLS state is done, so we
  // can allow dlopen again.
  if (!initial_thread) {
    __thread_allocation_release();
  }

  // For the initial thread, it's too early to call snprintf because
  // it's not LIBC_NO_SAFESTACK.
  if (!initial_thread) {
    // For other threads, try to give the VMO a name that includes
    // the thrd_t value (and the TLS size if that fits too), but
    // don't use a truncated value since that would be confusing to
    // interpret.
    if (snprintf(vmo_name, ZX_MAX_NAME_LEN, "%s:%p/TLS=%#zx", thread_name, td, tls_size) <
            ZX_MAX_NAME_LEN ||
        snprintf(vmo_name, ZX_MAX_NAME_LEN, "%s:%p", thread_name, td) < ZX_MAX_NAME_LEN)
      thread_name = vmo_name;
  }
  _zx_object_set_property(vmo, ZX_PROP_NAME, thread_name, strlen(thread_name));

  if (map_block(_zx_vmar_root_self(), vmo, tcb_size, stack_size, guard_size, 0, &td->safe_stack,
                &td->safe_stack_region)) {
    _zx_vmar_unmap(_zx_vmar_root_self(), (uintptr_t)tcb_region.iov_base, tcb_region.iov_len);
    _zx_handle_close(vmo);
    return NULL;
  }

  if (map_block(_zx_vmar_root_self(), vmo, tcb_size + stack_size, stack_size, guard_size, 0,
                &td->unsafe_stack, &td->unsafe_stack_region)) {
    _zx_vmar_unmap(_zx_vmar_root_self(), (uintptr_t)td->safe_stack_region.iov_base,
                   td->safe_stack_region.iov_len);
    _zx_vmar_unmap(_zx_vmar_root_self(), (uintptr_t)tcb_region.iov_base, tcb_region.iov_len);
    _zx_handle_close(vmo);
    return NULL;
  }

#if HAVE_SHADOW_CALL_STACK
  if (map_block(_zx_vmar_root_self(), vmo, tcb_size + stack_size * 2,
                // Shadow call stack grows up, so a guard after is probably
                // enough. But be extra careful with guards on both sides.
                stack_size, guard_size, guard_size,
                //
                &td->shadow_call_stack, &td->shadow_call_stack_region)) {
    _zx_vmar_unmap(_zx_vmar_root_self(), (uintptr_t)td->unsafe_stack_region.iov_base,
                   td->unsafe_stack_region.iov_len);
    _zx_vmar_unmap(_zx_vmar_root_self(), (uintptr_t)td->safe_stack_region.iov_base,
                   td->safe_stack_region.iov_len);
    _zx_vmar_unmap(_zx_vmar_root_self(), (uintptr_t)tcb_region.iov_base, tcb_region.iov_len);
    _zx_handle_close(vmo);
    return NULL;
  }
#endif

  _zx_handle_close(vmo);
  td->tcb_region = tcb_region;
  td->locale = &libc.global_locale;
  td->head.tp = (uintptr_t)pthread_to_tp(td);
  td->abi.stack_guard = __stack_chk_guard;
  td->abi.unsafe_sp = (uintptr_t)td->unsafe_stack.iov_base + td->unsafe_stack.iov_len;

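  // Link the new thread in at the head of the global thread list.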
  struct pthread** prevp = __thread_list_acquire();
  td->prevp = prevp;
  td->next = *prevp;
  if (td->next != NULL) {
    td->next->prevp = &td->next;
  }
  *prevp = td;
  __thread_list_release();

  return td;
}