Rewrite cxa guard implementation.

This patch does three main things:
  (1) It rewrites the cxa guard implementation to make it testable.
  (2) It adds support for recursive init detection on non-Apple platforms.
  (3) It adds a futex-based implementation.

The futex based implementation locks and notifies on a per-object basis, unlike the
current implementation which uses a global lock for all objects. Once this patch settles
I'll turn it on by default when supported.

git-svn-id: https://llvm.org/svn/llvm-project/libcxxabi/trunk@359060 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/src/cxa_guard.cpp b/src/cxa_guard.cpp
index 3d66826..5d1cf23 100644
--- a/src/cxa_guard.cpp
+++ b/src/cxa_guard.cpp
@@ -7,12 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "__cxxabi_config.h"
+#include "cxxabi.h"
 
-#include "abort_message.h"
-#include <__threading_support>
-
-#include <stdint.h>
-#include <string.h>
+// Tell the implementation that we're building the actual implementation
+// (and not testing it)
+#define BUILDING_CXA_GUARD
+#include "cxa_guard_impl.h"
 
 /*
     This implementation must be careful to not call code external to this file
@@ -24,278 +24,30 @@
     to not be a problem.
 */
 
-namespace __cxxabiv1
-{
-
-namespace
-{
-
-enum InitializationResult {
-  INIT_COMPLETE,
-  INIT_NOT_COMPLETE,
-};
+namespace __cxxabiv1 {
 
 #if defined(_LIBCXXABI_GUARD_ABI_ARM)
-// A 32-bit, 4-byte-aligned static data value. The least significant 2 bits must
-// be statically initialized to 0.
-typedef uint32_t guard_type;
+using guard_type = uint32_t;
 #else
-typedef uint64_t guard_type;
+using guard_type = uint64_t;
 #endif
 
-#if !defined(_LIBCXXABI_HAS_NO_THREADS) && defined(__APPLE__) &&               \
-    !defined(_LIBCXXABI_GUARD_ABI_ARM)
-// This is a special-case pthread dependency for Mac. We can't pull this
-// out into libcxx's threading API (__threading_support) because not all
-// supported Mac environments provide this function (in pthread.h). To
-// make it possible to build/use libcxx in those environments, we have to
-// keep this pthread dependency local to libcxxabi. If there is some
-// convenient way to detect precisely when pthread_mach_thread_np is
-// available in a given Mac environment, it might still be possible to
-// bury this dependency in __threading_support.
-#ifndef _LIBCPP_HAS_THREAD_API_PTHREAD
-#error "How do I pthread_mach_thread_np()?"
-#endif
-#define LIBCXXABI_HAS_DEADLOCK_DETECTION
-#define LOCK_ID_FOR_THREAD() pthread_mach_thread_np(std::__libcpp_thread_get_current_id())
-typedef uint32_t lock_type;
-#else
-#define LOCK_ID_FOR_THREAD() true
-typedef bool lock_type;
-#endif
-
-enum class OnRelease : char { UNLOCK, UNLOCK_AND_BROADCAST };
-
-struct GlobalMutexGuard {
-  explicit GlobalMutexGuard(const char* calling_func, OnRelease on_release)
-      : calling_func(calling_func), on_release(on_release) {
-#ifndef _LIBCXXABI_HAS_NO_THREADS
-    if (std::__libcpp_mutex_lock(&guard_mut))
-      abort_message("%s failed to acquire mutex", calling_func);
-#endif
-  }
-
-  ~GlobalMutexGuard() {
-#ifndef _LIBCXXABI_HAS_NO_THREADS
-    if (std::__libcpp_mutex_unlock(&guard_mut))
-      abort_message("%s failed to release mutex", calling_func);
-    if (on_release == OnRelease::UNLOCK_AND_BROADCAST) {
-      if (std::__libcpp_condvar_broadcast(&guard_cv))
-        abort_message("%s failed to broadcast condition variable",
-                      calling_func);
-    }
-#endif
-  }
-
-  void wait_for_signal() {
-#ifndef _LIBCXXABI_HAS_NO_THREADS
-    if (std::__libcpp_condvar_wait(&guard_cv, &guard_mut))
-      abort_message("%s condition variable wait failed", calling_func);
-#endif
-  }
-
-private:
-  GlobalMutexGuard(GlobalMutexGuard const&) = delete;
-  GlobalMutexGuard& operator=(GlobalMutexGuard const&) = delete;
-
-  const char* const calling_func;
-  OnRelease on_release;
-
-#ifndef _LIBCXXABI_HAS_NO_THREADS
-  static std::__libcpp_mutex_t guard_mut;
-  static std::__libcpp_condvar_t guard_cv;
-#endif
-};
-
-#ifndef _LIBCXXABI_HAS_NO_THREADS
-std::__libcpp_mutex_t GlobalMutexGuard::guard_mut = _LIBCPP_MUTEX_INITIALIZER;
-std::__libcpp_condvar_t GlobalMutexGuard::guard_cv =
-    _LIBCPP_CONDVAR_INITIALIZER;
-#endif
-
-struct GuardObject;
-
-/// GuardValue - An abstraction for accessing the various fields and bits of
-///   the guard object.
-struct GuardValue {
-private:
-  explicit GuardValue(guard_type v) : value(v) {}
-  friend struct GuardObject;
-
-public:
-  /// Functions returning the values used to represent the uninitialized,
-  /// initialized, and initialization pending states.
-  static GuardValue ZERO();
-  static GuardValue INIT_COMPLETE();
-  static GuardValue INIT_PENDING();
-
-  /// Returns true if the guard value represents that the initialization is
-  /// complete.
-  bool is_initialization_complete() const;
-
-  /// Returns true if the guard value represents that the initialization is
-  /// currently pending.
-  bool is_initialization_pending() const;
-
-  /// Returns the lock value for the current guard value.
-  lock_type get_lock_value() const;
-
-private:
-  // Returns a guard object corresponding to the specified lock value.
-  static guard_type guard_value_from_lock(lock_type l);
-
-  // Returns the lock value represented by the specified guard object.
-  static lock_type lock_value_from_guard(guard_type g);
-
-private:
-  guard_type value;
-};
-
-/// GuardObject - Manages correctly reading and writing to the guard object.
-struct GuardObject {
-  explicit GuardObject(guard_type *g) : guard(g) {}
-
-  // Read the current value of the guard object.
-  // TODO: Make this read atomic.
-  GuardValue read() const;
-
-  // Write the specified value to the guard object.
-  // TODO: Make this atomic
-  void write(GuardValue new_val);
-
-private:
-  GuardObject(const GuardObject&) = delete;
-  GuardObject& operator=(const GuardObject&) = delete;
-
-  guard_type *guard;
-};
-
-}  // unnamed namespace
-
 extern "C"
 {
-
 _LIBCXXABI_FUNC_VIS int __cxa_guard_acquire(guard_type* raw_guard_object) {
-  GlobalMutexGuard gmutex("__cxa_guard_acquire", OnRelease::UNLOCK);
-  GuardObject guard(raw_guard_object);
-  GuardValue current_value = guard.read();
-
-  if (current_value.is_initialization_complete())
-    return INIT_COMPLETE;
-
-  const GuardValue LOCK_ID = GuardValue::INIT_PENDING();
-#ifdef LIBCXXABI_HAS_DEADLOCK_DETECTION
-   if (current_value.is_initialization_pending() &&
-       current_value.get_lock_value() == LOCK_ID.get_lock_value()) {
-    abort_message("__cxa_guard_acquire detected deadlock");
-  }
-#endif
-  while (current_value.is_initialization_pending()) {
-      gmutex.wait_for_signal();
-      current_value = guard.read();
-  }
-  if (current_value.is_initialization_complete())
-    return INIT_COMPLETE;
-
-  guard.write(LOCK_ID);
-  return INIT_NOT_COMPLETE;
+  SelectedImplementation imp(raw_guard_object);
+  return static_cast<int>(imp.cxa_guard_acquire());
 }
 
 _LIBCXXABI_FUNC_VIS void __cxa_guard_release(guard_type *raw_guard_object) {
-  GlobalMutexGuard gmutex("__cxa_guard_release",
-                          OnRelease::UNLOCK_AND_BROADCAST);
-  GuardObject guard(raw_guard_object);
-  guard.write(GuardValue::ZERO());
-  guard.write(GuardValue::INIT_COMPLETE());
+  SelectedImplementation imp(raw_guard_object);
+  imp.cxa_guard_release();
 }
 
 _LIBCXXABI_FUNC_VIS void __cxa_guard_abort(guard_type *raw_guard_object) {
-  GlobalMutexGuard gmutex("__cxa_guard_abort", OnRelease::UNLOCK_AND_BROADCAST);
-  GuardObject guard(raw_guard_object);
-  guard.write(GuardValue::ZERO());
+  SelectedImplementation imp(raw_guard_object);
+  imp.cxa_guard_abort();
 }
 }  // extern "C"
 
-//===----------------------------------------------------------------------===//
-//                        GuardObject Definitions
-//===----------------------------------------------------------------------===//
-
-GuardValue GuardObject::read() const {
-  // FIXME: Make this atomic
-  guard_type val = *guard;
-  return GuardValue(val);
-}
-
-void GuardObject::write(GuardValue new_val) {
-  // FIXME: make this atomic
-  *guard = new_val.value;
-}
-
-//===----------------------------------------------------------------------===//
-//                        GuardValue Definitions
-//===----------------------------------------------------------------------===//
-
-GuardValue GuardValue::ZERO() { return GuardValue(0); }
-
-GuardValue GuardValue::INIT_COMPLETE() {
-  guard_type value = {0};
-#if defined(_LIBCXXABI_GUARD_ABI_ARM)
-  value |= 1;
-#else
-  char* init_bit = (char*)&value;
-  *init_bit = 1;
-#endif
-  return GuardValue(value);
-}
-
-GuardValue GuardValue::INIT_PENDING() {
-  return GuardValue(guard_value_from_lock(LOCK_ID_FOR_THREAD()));
-}
-
-bool GuardValue::is_initialization_complete() const {
-#if defined(_LIBCXXABI_GUARD_ABI_ARM)
-  return value & 1;
-#else
-  const char* init_bit = (const char*)&value;
-  return *init_bit;
-#endif
-}
-
-bool GuardValue::is_initialization_pending() const {
-  return lock_value_from_guard(value) != 0;
-}
-
-lock_type GuardValue::get_lock_value() const {
-  return lock_value_from_guard(value);
-}
-
-// Create a guard object with the lock set to the specified value.
-guard_type GuardValue::guard_value_from_lock(lock_type l) {
-#if defined(__APPLE__) && !defined(_LIBCXXABI_GUARD_ABI_ARM)
-#if __LITTLE_ENDIAN__
-  return static_cast<guard_type>(l) << 32;
-#else
-  return static_cast<guard_type>(l);
-#endif
-#else  // defined(__APPLE__) && !defined(_LIBCXXABI_GUARD_ABI_ARM)
-  guard_type f = {0};
-  memcpy(static_cast<char*>(static_cast<void*>(&f)) + 1, &l, sizeof(lock_type));
-  return f;
-#endif // defined(__APPLE__) && !defined(_LIBCXXABI_GUARD_ABI_ARM)
-}
-
-lock_type GuardValue::lock_value_from_guard(guard_type g) {
-#if defined(__APPLE__) && !defined(_LIBCXXABI_GUARD_ABI_ARM)
-#if __LITTLE_ENDIAN__
-  return static_cast<lock_type>(g >> 32);
-#else
-  return static_cast<lock_type>(g);
-#endif
-#else  // defined(__APPLE__) && !defined(_LIBCXXABI_GUARD_ABI_ARM)
-  uint8_t guard_bytes[sizeof(guard_type)];
-  memcpy(&guard_bytes, &g, sizeof(guard_type));
-  return guard_bytes[1] != 0;
-#endif // defined(__APPLE__) && !defined(_LIBCXXABI_GUARD_ABI_ARM)
-}
-
 }  // __cxxabiv1
diff --git a/src/cxa_guard_impl.h b/src/cxa_guard_impl.h
new file mode 100644
index 0000000..8c31848
--- /dev/null
+++ b/src/cxa_guard_impl.h
@@ -0,0 +1,550 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LIBCXXABI_SRC_INCLUDE_CXA_GUARD_IMPL_H
+#define LIBCXXABI_SRC_INCLUDE_CXA_GUARD_IMPL_H
+
+/* cxa_guard_impl.h - Implements the C++ runtime support for function local
+ * static guards.
+ * The layout of the guard object is the same across ARM and Itanium.
+ *
+ * The first "guard byte" (which is checked by the compiler) is set only upon
+ * the completion of cxa release.
+ *
+ * The second "init byte" does the rest of the bookkeeping. It tracks if
+ * initialization is complete or pending, and if there are waiting threads.
+ *
+ * If the guard variable is 64 bits and the platform supplies a 32-bit thread
+ * identifier, it is used to detect recursive initialization. The thread ID of
+ * the thread currently performing initialization is stored in the second word.
+ *
+ *  Guard Object Layout:
+ * -------------------------------------------------------------------------
+ * |a: guard byte | a+1: init byte | a+2 : unused ... | a+4: thread-id ... |
+ * ------------------------------------------------------------------------
+ *
+ *  Access Protocol:
+ *    For each implementation the guard byte is checked and set before accessing
+ *    the init byte.
+ *
+ *  Overall Design:
+ *    The implementation was designed to allow each implementation to be tested
+ *    independent of the C++ runtime or platform support.
+ *
+ */
+
+#include "__cxxabi_config.h"
+#include "include/atomic_support.h"
+#include <unistd.h>
+#include <sys/types.h>
+#if defined(__has_include)
+# if __has_include(<sys/syscall.h>)
+#   include <sys/syscall.h>
+# endif
+#endif
+
+#include <stdlib.h>
+#include <__threading_support>
+
+// To make testing possible, this header is included from both cxa_guard.cpp
+// and a number of tests.
+//
+// For this reason we place everything in an anonymous namespace -- even though
+// we're in a header. We want the actual implementation and the tests to have
+// unique definitions of the types in this header (since the tests may depend
+// on function local statics).
+//
+// To enforce this either `BUILDING_CXA_GUARD` or `TESTING_CXA_GUARD` must be
+// defined when including this file. Only `src/cxa_guard.cpp` should define
+// the former.
+#ifdef BUILDING_CXA_GUARD
+# include "abort_message.h"
+# define ABORT_WITH_MESSAGE(...) ::abort_message(__VA_ARGS__)
+#elif defined(TESTING_CXA_GUARD)
+# define ABORT_WITH_MESSAGE(...) ::abort()
+#else
+# error "Either BUILDING_CXA_GUARD or TESTING_CXA_GUARD must be defined"
+#endif
+
+
+namespace __cxxabiv1 {
+// Use an anonymous namespace to ensure that the tests and actual implementation
+// have unique definitions of these symbols.
+namespace {
+
+//===----------------------------------------------------------------------===//
+//                          Misc Utilities
+//===----------------------------------------------------------------------===//
+
+template <class T, T(*Init)()>  // Caches the result of Init() after the first get().
+struct LazyValue {
+  LazyValue() : is_init(false) {}
+  // Returns the cached value, calling Init() only the first time through.
+  T& get() {
+    if (!is_init) {
+      value = Init();
+      is_init = true;
+    }
+    return value;
+  }
+ private:
+  T value;  // Only assigned by get(); not meaningful while is_init is false.
+  bool is_init = false;
+};
+
+//===----------------------------------------------------------------------===//
+//                         PlatformThreadID
+//===----------------------------------------------------------------------===//
+
+#if defined(__APPLE__) && defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
+uint32_t PlatformThreadID() {
+  static_assert(sizeof(mach_port_t) == sizeof(uint32_t), "");
+  return static_cast<uint32_t>(
+      pthread_mach_thread_np(std::__libcpp_thread_get_current_id()));
+}
+#elif defined(SYS_gettid) && defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
+uint32_t PlatformThreadID() {
+  static_assert(sizeof(pid_t) == sizeof(uint32_t), "");
+  return static_cast<uint32_t>(syscall(SYS_gettid));
+}
+#else
+constexpr uint32_t (*PlatformThreadID)() = nullptr;
+#endif
+
+
+constexpr bool DoesPlatformSupportThreadID() {
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-pointer-compare"
+#endif
+  return +PlatformThreadID != nullptr;
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                          GuardObject
+//===----------------------------------------------------------------------===//
+
+enum class AcquireResult {
+  INIT_IS_DONE,
+  INIT_IS_PENDING,
+};
+constexpr AcquireResult INIT_IS_DONE = AcquireResult::INIT_IS_DONE;
+constexpr AcquireResult INIT_IS_PENDING = AcquireResult::INIT_IS_PENDING;
+
+static constexpr uint8_t UNSET = 0;
+static constexpr uint8_t COMPLETE_BIT = (1 << 0);
+static constexpr uint8_t PENDING_BIT = (1 << 1);
+static constexpr uint8_t WAITING_BIT = (1 << 2);
+
+template <class Derived>
+struct GuardObject {
+  GuardObject() = delete;
+  GuardObject(GuardObject const&) = delete;
+  GuardObject& operator=(GuardObject const&) = delete;
+
+  explicit GuardObject(uint32_t* g)
+      : base_address(g), guard_byte_address(reinterpret_cast<uint8_t*>(g)),
+        init_byte_address(reinterpret_cast<uint8_t*>(g) + 1),
+        thread_id_address(nullptr) {}
+
+  explicit GuardObject(uint64_t* g)
+      : base_address(g), guard_byte_address(reinterpret_cast<uint8_t*>(g)),
+        init_byte_address(reinterpret_cast<uint8_t*>(g) + 1),
+        thread_id_address(reinterpret_cast<uint32_t*>(g) + 1) {}
+
+public:
+  /// Implements __cxa_guard_acquire
+  AcquireResult cxa_guard_acquire() {
+    AtomicInt<uint8_t> guard_byte(guard_byte_address);
+    if (guard_byte.load(std::_AO_Acquire) == COMPLETE_BIT)
+      return INIT_IS_DONE;
+    return derived()->acquire_init_byte();
+  }
+
+  /// Implements __cxa_guard_release
+  void cxa_guard_release() {
+    AtomicInt<uint8_t> guard_byte(guard_byte_address);
+    // Store complete first, so that when release wakes other folks, they see
+    // it as having been completed.
+    guard_byte.store(COMPLETE_BIT, std::_AO_Release);
+    derived()->release_init_byte();
+  }
+
+  /// Implements __cxa_guard_abort
+  void cxa_guard_abort() { derived()->abort_init_byte(); }
+
+public:
+  /// base_address - the address of the original guard object.
+  void* const base_address;
+  /// The address of the guard byte at offset 0.
+  uint8_t* const guard_byte_address;
+  /// The address of the byte used by the implementation during initialization.
+  uint8_t* const init_byte_address;
+  /// An optional address storing an identifier for the thread performing initialization.
+  /// It's used to detect recursive initialization.
+  uint32_t* const thread_id_address;
+
+private:
+  Derived* derived() { return static_cast<Derived*>(this); }
+};
+
+//===----------------------------------------------------------------------===//
+//                    Single Threaded Implementation
+//===----------------------------------------------------------------------===//
+
+struct InitByteNoThreads : GuardObject<InitByteNoThreads> {
+  using GuardObject::GuardObject;
+  // The init byte is read and written directly; no lock or atomic is used here.
+  AcquireResult acquire_init_byte() {
+    if (*init_byte_address == COMPLETE_BIT)
+      return INIT_IS_DONE;
+    if (*init_byte_address & PENDING_BIT)  // pending bit already set on entry
+      ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization");
+    *init_byte_address = PENDING_BIT;
+    return INIT_IS_PENDING;
+  }
+  // Release marks the init byte complete; abort resets it to UNSET.
+  void release_init_byte() { *init_byte_address = COMPLETE_BIT; }
+  void abort_init_byte() { *init_byte_address = UNSET; }
+};
+
+
+//===----------------------------------------------------------------------===//
+//                     Global Mutex Implementation
+//===----------------------------------------------------------------------===//
+
+struct LibcppMutex;
+struct LibcppCondVar;
+
+#ifndef _LIBCXXABI_HAS_NO_THREADS
+struct LibcppMutex {
+  LibcppMutex() = default;
+  LibcppMutex(LibcppMutex const&) = delete;
+  LibcppMutex& operator=(LibcppMutex const&) = delete;
+
+  bool lock() { return std::__libcpp_mutex_lock(&mutex); }
+  bool unlock() { return std::__libcpp_mutex_unlock(&mutex); }
+
+private:
+  friend struct LibcppCondVar;
+  std::__libcpp_mutex_t mutex = _LIBCPP_MUTEX_INITIALIZER;
+};
+
+struct LibcppCondVar {
+  LibcppCondVar() = default;
+  LibcppCondVar(LibcppCondVar const&) = delete;
+  LibcppCondVar& operator=(LibcppCondVar const&) = delete;
+
+  bool wait(LibcppMutex& mut) {
+    return std::__libcpp_condvar_wait(&cond, &mut.mutex);
+  }
+  bool broadcast() { return std::__libcpp_condvar_broadcast(&cond); }
+
+private:
+  std::__libcpp_condvar_t cond = _LIBCPP_CONDVAR_INITIALIZER;
+};
+#endif // !defined(_LIBCXXABI_HAS_NO_THREADS)
+
+
+template <class Mutex, class CondVar, Mutex& global_mutex, CondVar& global_cond,
+          uint32_t (*GetThreadID)() = PlatformThreadID>
+struct InitByteGlobalMutex
+    : GuardObject<InitByteGlobalMutex<Mutex, CondVar, global_mutex, global_cond,
+                                    GetThreadID>> {
+
+  using BaseT = typename InitByteGlobalMutex::GuardObject;
+  using BaseT::BaseT;
+
+  explicit InitByteGlobalMutex(uint32_t *g)
+    : BaseT(g), has_thread_id_support(false) {}
+  explicit InitByteGlobalMutex(uint64_t *g)
+    : BaseT(g), has_thread_id_support(DoesPlatformSupportThreadID()) {}
+
+public:
+  AcquireResult acquire_init_byte() {
+    LockGuard g("__cxa_guard_acquire");
+    // Check for possible recursive initialization.
+    if (has_thread_id_support && (*init_byte_address & PENDING_BIT)) {
+      if (*thread_id_address == current_thread_id.get())
+       ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization");
+    }
+
+    // Wait until the pending bit is not set.
+    while (*init_byte_address & PENDING_BIT) {
+      *init_byte_address |= WAITING_BIT;
+      global_cond.wait(global_mutex);
+    }
+
+    if (*init_byte_address == COMPLETE_BIT)
+      return INIT_IS_DONE;
+
+    if (has_thread_id_support)
+      *thread_id_address = current_thread_id.get();
+
+    *init_byte_address = PENDING_BIT;
+    return INIT_IS_PENDING;
+  }
+
+  void release_init_byte() {
+    bool has_waiting;
+    {
+      LockGuard g("__cxa_guard_release");
+      has_waiting = *init_byte_address & WAITING_BIT;
+      *init_byte_address = COMPLETE_BIT;
+    }
+    if (has_waiting) {
+      if (global_cond.broadcast()) {
+        ABORT_WITH_MESSAGE("%s failed to broadcast", "__cxa_guard_release");
+      }
+    }
+  }
+
+  void abort_init_byte() {
+    bool has_waiting;
+    {
+      LockGuard g("__cxa_guard_abort");
+      if (has_thread_id_support)
+        *thread_id_address = 0;
+      has_waiting = *init_byte_address & WAITING_BIT;
+      *init_byte_address = UNSET;
+    }
+    if (has_waiting) {
+      if (global_cond.broadcast()) {
+        ABORT_WITH_MESSAGE("%s failed to broadcast", "__cxa_guard_abort");
+      }
+    }
+  }
+
+private:
+  using BaseT::init_byte_address;
+  using BaseT::thread_id_address;
+  const bool has_thread_id_support;
+  LazyValue<uint32_t, GetThreadID> current_thread_id;
+
+private:
+  struct LockGuard {
+    LockGuard() = delete;
+    LockGuard(LockGuard const&) = delete;
+    LockGuard& operator=(LockGuard const&) = delete;
+
+    explicit LockGuard(const char* calling_func)
+        : calling_func(calling_func)  {
+      if (global_mutex.lock())
+        ABORT_WITH_MESSAGE("%s failed to acquire mutex", calling_func);
+    }
+
+    ~LockGuard() {
+      if (global_mutex.unlock())
+        ABORT_WITH_MESSAGE("%s failed to release mutex", calling_func);
+    }
+
+  private:
+    const char* const calling_func;
+  };
+};
+
+//===----------------------------------------------------------------------===//
+//                         Futex Implementation
+//===----------------------------------------------------------------------===//
+
+#if defined(SYS_futex)
+void PlatformFutexWait(int* addr, int expect) {  // Sleeps while *addr == expect.
+  constexpr int WAIT = 0;  // FUTEX_WAIT
+  syscall(SYS_futex, addr, WAIT, expect, nullptr);  // null timeout pointer, not int 0
+}
+void PlatformFutexWake(int* addr) {  // Wakes the threads waiting on addr.
+  constexpr int WAKE = 1;  // FUTEX_WAKE
+  syscall(SYS_futex, addr, WAKE, INT_MAX);  // INT_MAX: no cap on waiters woken
+}
+#else
+constexpr void (*PlatformFutexWait)(int*, int) = nullptr;
+constexpr void (*PlatformFutexWake)(int*) = nullptr;
+#endif
+
+constexpr bool DoesPlatformSupportFutex() {
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-pointer-compare"
+#endif
+  return +PlatformFutexWait != nullptr;
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}
+
+/// InitByteFutex - Manages initialization using atomics and the futex syscall
+/// for waiting and waking.
+template <void (*Wait)(int*, int) = PlatformFutexWait,
+          void (*Wake)(int*) = PlatformFutexWake,
+          uint32_t (*GetThreadIDArg)() = PlatformThreadID>
+struct InitByteFutex : GuardObject<InitByteFutex<Wait, Wake, GetThreadIDArg>> {
+  using BaseT = typename InitByteFutex::GuardObject;
+
+  /// ARM Constructor
+  explicit InitByteFutex(uint32_t *g) : BaseT(g),
+    init_byte(this->init_byte_address),
+    has_thread_id_support(this->thread_id_address && GetThreadIDArg),
+    thread_id(this->thread_id_address) {}
+
+  /// Itanium Constructor
+  explicit InitByteFutex(uint64_t *g) : BaseT(g),
+    init_byte(this->init_byte_address),
+    has_thread_id_support(this->thread_id_address && GetThreadIDArg),
+    thread_id(this->thread_id_address) {}
+
+public:
+  AcquireResult acquire_init_byte() {  // Claim the init byte, or wait on its current holder.
+    while (true) {
+      uint8_t last_val = UNSET;
+      if (init_byte.compare_exchange(&last_val, PENDING_BIT, std::_AO_Acq_Rel,
+                                     std::_AO_Acquire)) {
+        if (has_thread_id_support) {
+          thread_id.store(current_thread_id.get(), std::_AO_Relaxed);
+        }
+        return INIT_IS_PENDING;
+      }
+
+      if (last_val == COMPLETE_BIT)
+        return INIT_IS_DONE;
+
+      if (last_val & PENDING_BIT) {
+        // An initialization is already in flight.
+        // Check for recursive initialization
+        if (has_thread_id_support && thread_id.load(std::_AO_Relaxed) == current_thread_id.get()) {
+            ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization");
+        }
+        // Set WAITING_BIT before sleeping: release/abort only wake waiters if it is set.
+        if ((last_val & WAITING_BIT) == 0) {
+          // On failure (acquire ordering; release is not a valid failure order):
+          // (1) another thread finished the whole thing before we got here
+          // (2) another thread set the waiting bit we were trying to set
+          // (3) another thread had an exception and failed to finish
+          if (!init_byte.compare_exchange(&last_val, PENDING_BIT | WAITING_BIT,
+                                          std::_AO_Acq_Rel, std::_AO_Acquire)) {
+            // (1) success, via someone else's work!
+            if (last_val == COMPLETE_BIT)
+              return INIT_IS_DONE;
+
+            // (3) someone else, bailed on doing the work, retry from the start!
+            if (last_val == UNSET)
+              continue;
+
+            // (2) the waiting bit got set, so we are happy to keep waiting
+          }
+        }
+        wait_on_initialization();
+      }
+    }
+  }
+
+  void release_init_byte() {
+    uint8_t old = init_byte.exchange(COMPLETE_BIT, std::_AO_Acq_Rel);
+    if (old & WAITING_BIT)
+      wake_all();
+  }
+
+  void abort_init_byte() {
+    if (has_thread_id_support)
+      thread_id.store(0, std::_AO_Relaxed);
+
+    uint8_t old = init_byte.exchange(0, std::_AO_Acq_Rel);
+    if (old & WAITING_BIT)
+      wake_all();
+  }
+
+private:
+  /// Use the futex to wait on the current guard variable. Futex expects a
+  /// 32-bit 4-byte aligned address as the first argument, so we have to use
+  /// the base address of the guard variable (not the init byte).
+  void wait_on_initialization() {
+    Wait(static_cast<int*>(this->base_address),
+         expected_value_for_futex(PENDING_BIT | WAITING_BIT));
+  }
+  void wake_all() { Wake(static_cast<int*>(this->base_address)); }
+
+private:
+  AtomicInt<uint8_t> init_byte;
+
+  const bool has_thread_id_support;
+  // Unsafe to use unless has_thread_id_support
+  AtomicInt<uint32_t> thread_id;
+  LazyValue<uint32_t, GetThreadIDArg> current_thread_id;
+
+  /// Create the expected integer value for futex `wait(int* addr, int expected)`.
+  /// We pass the base address as the first argument, so this function creates
+  /// a zero-initialized integer with `b` copied at the correct offset.
+  static int expected_value_for_futex(uint8_t b) {
+    int dest_val = 0;
+    std::memcpy(reinterpret_cast<char*>(&dest_val) + 1, &b, 1);
+    return dest_val;
+  }
+
+  static_assert(Wait != nullptr && Wake != nullptr, "");
+};
+
+//===----------------------------------------------------------------------===//
+//                     Implementation Selection
+//===----------------------------------------------------------------------===//
+
+template <class T>
+struct GlobalStatic {
+  static T instance;
+};
+template <class T>
+_LIBCPP_SAFE_STATIC T GlobalStatic<T>::instance = {};
+
+enum class Implementation {
+  NoThreads,
+  GlobalLock,
+  Futex
+};
+
+template <Implementation Impl>
+struct SelectImplementation;
+
+template <>
+struct SelectImplementation<Implementation::NoThreads> {
+  using type = InitByteNoThreads;
+};
+
+template <>
+struct SelectImplementation<Implementation::GlobalLock> {
+  using type = InitByteGlobalMutex<
+      LibcppMutex, LibcppCondVar, GlobalStatic<LibcppMutex>::instance,
+      GlobalStatic<LibcppCondVar>::instance, PlatformThreadID>;
+};
+
+template <>
+struct SelectImplementation<Implementation::Futex> {
+  using type =
+      InitByteFutex<PlatformFutexWait, PlatformFutexWake, PlatformThreadID>;
+};
+
+// TODO(EricWF): We should prefer the futex implementation when available. But
+// it should be done in a separate step from adding the implementation.
+constexpr Implementation CurrentImplementation =
+#if defined(_LIBCXXABI_HAS_NO_THREADS)
+    Implementation::NoThreads;
+#elif defined(_LIBCXXABI_USE_FUTEX)
+    Implementation::Futex;
+#else
+   Implementation::GlobalLock;
+#endif
+
+static_assert(CurrentImplementation != Implementation::Futex
+           || DoesPlatformSupportFutex(), "Futex selected but not supported");
+
+using SelectedImplementation =
+    SelectImplementation<CurrentImplementation>::type;
+
+} // end namespace
+} // end namespace __cxxabiv1
+
+#endif // LIBCXXABI_SRC_INCLUDE_CXA_GUARD_IMPL_H
diff --git a/src/include/atomic_support.h b/src/include/atomic_support.h
index fca6659..4ff45eb 100644
--- a/src/include/atomic_support.h
+++ b/src/include/atomic_support.h
@@ -150,7 +150,7 @@
 template <class _ValueType>
 inline _LIBCPP_INLINE_VISIBILITY
 _ValueType __libcpp_atomic_exchange(_ValueType* __target,
-                                    _ValueType __value, int __order = _AO_Seq)
+                                    _ValueType __value, int  = _AO_Seq)
 {
     _ValueType old = *__target;
     *__target = __value;
@@ -177,4 +177,34 @@
 
 _LIBCPP_END_NAMESPACE_STD
 
+namespace {
+
+template <class IntType>
+class AtomicInt {
+public:
+  using MemoryOrder = std::__libcpp_atomic_order;
+
+  explicit AtomicInt(IntType *b) : b(b) {}
+  AtomicInt(AtomicInt const&) = delete;
+  AtomicInt& operator=(AtomicInt const&) = delete;
+
+  IntType load(MemoryOrder ord) {
+    return std::__libcpp_atomic_load(b, ord);
+  }
+  void store(IntType val, MemoryOrder ord) {
+    std::__libcpp_atomic_store(b, val, ord);
+  }
+  IntType exchange(IntType new_val, MemoryOrder ord) {
+    return std::__libcpp_atomic_exchange(b, new_val, ord);
+  }
+  bool compare_exchange(IntType *expected, IntType desired, MemoryOrder ord_success, MemoryOrder ord_failure) {
+    return std::__libcpp_atomic_compare_exchange(b, expected, desired, ord_success, ord_failure);
+  }
+
+private:
+  IntType *b;
+};
+
+} // end namespace
+
 #endif // ATOMIC_SUPPORT_H
diff --git a/test/guard_test_basic.pass.cpp b/test/guard_test_basic.pass.cpp
new file mode 100644
index 0000000..5f1576d
--- /dev/null
+++ b/test/guard_test_basic.pass.cpp
@@ -0,0 +1,154 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: c++98, c++03
+
+#define TESTING_CXA_GUARD
+#include "../src/cxa_guard_impl.h"
+
+using namespace __cxxabiv1;
+
+// Single-threaded post-condition checks for a guard implementation `Impl`
+// that operates on a guard object of type `GuardType`.
+template <class GuardType, class Impl>
+struct Tests {
+private:
+  Tests() : g{}, impl(&g) {}
+  GuardType g;
+  Impl impl;
+
+  // Copies out the first byte of the guard object, which the checks inspect.
+  uint8_t first_byte() {
+    uint8_t byte;
+    std::memcpy(&byte, &g, 1);
+    return byte;
+  }
+
+  void reset() { g = {}; }
+
+public:
+  // Test the post conditions on cxa_guard_acquire, cxa_guard_abort, and
+  // cxa_guard_release. Specifically, that they leave the first byte with
+  // the value 0 or 1 as specified by the ARM or Itanium specification.
+  static void test() {
+    Tests tests;
+    tests.test_acquire();
+    tests.test_abort();
+    tests.test_release();
+  }
+
+  void test_acquire() {
+    // Acquiring a fresh guard reports a pending init and leaves the byte 0.
+    reset();
+    assert(first_byte() == 0);
+    assert(impl.cxa_guard_acquire() == INIT_IS_PENDING);
+    assert(first_byte() == 0);
+
+    // After release the byte is 1 and a later acquire reports completion.
+    reset();
+    assert(first_byte() == 0);
+    assert(impl.cxa_guard_acquire() == INIT_IS_PENDING);
+    impl.cxa_guard_release();
+    assert(first_byte() == 1);
+    assert(impl.cxa_guard_acquire() == INIT_IS_DONE);
+  }
+
+  void test_release() {
+    // The byte stays 0 through acquire and becomes 1 only on release.
+    reset();
+    assert(first_byte() == 0);
+    assert(impl.cxa_guard_acquire() == INIT_IS_PENDING);
+    assert(first_byte() == 0);
+    impl.cxa_guard_release();
+    assert(first_byte() == 1);
+  }
+
+  void test_abort() {
+    // Abort leaves the byte 0 and makes the guard acquirable again.
+    reset();
+    assert(first_byte() == 0);
+    assert(impl.cxa_guard_acquire() == INIT_IS_PENDING);
+    assert(first_byte() == 0);
+    impl.cxa_guard_abort();
+    assert(first_byte() == 0);
+    assert(impl.cxa_guard_acquire() == INIT_IS_PENDING);
+    assert(first_byte() == 0);
+  }
+};
+
+// Mutex stand-in: takes no real lock, only asserts that lock/unlock alternate.
+struct NopMutex {
+  bool lock() {
+    assert(!held);
+    held = true;
+    return false;
+  }
+  bool unlock() {
+    assert(held);
+    held = false;
+    return false;
+  }
+private:
+  bool held = false;
+};
+static NopMutex global_nop_mutex = {};
+
+struct NopCondVar {  // condition-variable stand-in: both operations do nothing and return false
+  bool broadcast() { return false; }
+  bool wait(NopMutex&) { return false; }
+};
+static NopCondVar global_nop_cond = {};
+// Futex stand-ins assert when called; MockGetThreadID always reports thread id 0.
+void NopFutexWait(int*, int) { assert(false); }
+void NopFutexWake(int*) { assert(false); }
+uint32_t MockGetThreadID() { return 0; }
+
+int main() {
+  // The default selection must match the threading configuration of the
+  // build.
+#if defined(_LIBCXXABI_HAS_NO_THREADS)
+  static_assert(CurrentImplementation == Implementation::NoThreads, "");
+  static_assert(
+      std::is_same<SelectedImplementation, InitByteNoThreads>::value, "");
+#else
+  static_assert(CurrentImplementation == Implementation::GlobalLock, "");
+  static_assert(
+      std::is_same<
+          SelectedImplementation,
+          InitByteGlobalMutex<LibcppMutex, LibcppCondVar,
+                              GlobalStatic<LibcppMutex>::instance,
+                              GlobalStatic<LibcppCondVar>::instance>>::value,
+      "");
+#endif
+
+  // When PlatformThreadID is available it must give a stable, non-zero id.
+#if defined(__APPLE__) || defined(__linux__)
+  assert(PlatformThreadID);
+#endif
+  if (+PlatformThreadID) {
+    assert(PlatformThreadID() != 0);
+    assert(PlatformThreadID() == PlatformThreadID());
+  }
+
+  // Run the post-condition checks on each implementation, both guard widths.
+  Tests<uint32_t, InitByteNoThreads>::test();
+  Tests<uint64_t, InitByteNoThreads>::test();
+
+  // Mutex-based implementation wired to the no-op mutex and condition variable.
+  using MutexImpl =
+      InitByteGlobalMutex<NopMutex, NopCondVar, global_nop_mutex,
+                          global_nop_cond, MockGetThreadID>;
+  Tests<uint32_t, MutexImpl>::test();
+  Tests<uint64_t, MutexImpl>::test();
+
+  // Futex-based implementation; the wait/wake stubs assert if ever reached.
+  using FutexImpl =
+      InitByteFutex<&NopFutexWait, &NopFutexWake, &MockGetThreadID>;
+  Tests<uint32_t, FutexImpl>::test();
+  Tests<uint64_t, FutexImpl>::test();
+}
diff --git a/test/guard_threaded_test.pass.cpp b/test/guard_threaded_test.pass.cpp
new file mode 100644
index 0000000..e38e132
--- /dev/null
+++ b/test/guard_threaded_test.pass.cpp
@@ -0,0 +1,378 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+// UNSUPPORTED: libcxxabi-no-threads, libcxxabi-no-exceptions
+
+#define TESTING_CXA_GUARD
+#include "../src/cxa_guard_impl.h"
+#include <unordered_map>
+#include <thread>
+#include <atomic>
+#include <array>
+#include <cassert>
+#include <memory>
+#include <vector>
+
+
+using namespace __cxxabiv1;
+
+enum class InitResult {  // how a single check_guard() call ended
+  COMPLETE,   // first guard byte was already set; acquire was never attempted
+  PERFORMED,  // this caller ran the initializer and released the guard
+  WAITED,     // acquire returned without reporting a pending initialization
+  ABORTED     // the initializer threw and the guard was aborted
+};
+constexpr InitResult COMPLETE = InitResult::COMPLETE;  // namespace-scope shorthands for the enumerators
+constexpr InitResult PERFORMED = InitResult::PERFORMED;
+constexpr InitResult WAITED = InitResult::WAITED;
+constexpr InitResult ABORTED = InitResult::ABORTED;
+
+// Mimics guarded initialization of a static: init() runs only if this caller acquires the guard at g.
+template <class Impl, class GuardType, class Init>
+InitResult check_guard(GuardType *g, Init init) {
+  uint8_t *first_byte = reinterpret_cast<uint8_t*>(g);
+  if (std::__libcpp_atomic_load(first_byte, std::_AO_Acquire) == 0) {  // a non-zero first byte skips the guard entirely
+    Impl impl(g);
+    if (impl.cxa_guard_acquire() == INIT_IS_PENDING) {
+#ifndef LIBCXXABI_HAS_NO_EXCEPTIONS  // NOTE(review): other config macros used here start with '_'; confirm this spelling is ever defined
+      try {
+#endif
+        init();
+        impl.cxa_guard_release();
+        return PERFORMED;
+#ifndef LIBCXXABI_HAS_NO_EXCEPTIONS
+      } catch (...) {
+        impl.cxa_guard_abort();  // a throwing initializer aborts the guard instead of releasing it
+        return ABORTED;
+      }
+#endif
+    }
+    return WAITED;  // acquire did not report INIT_IS_PENDING
+  }
+  return COMPLETE;
+}
+
+// Stand-in for a function-local static: a guard object plus counters of how each access ended.
+template <class GuardType, class Impl>
+struct FunctionLocalStatic {
+  FunctionLocalStatic() { reset(); }
+  FunctionLocalStatic(FunctionLocalStatic const&) = delete;
+  // Runs check_guard<Impl> on the guard with `init`, tallies the outcome and returns it.
+  template <class InitFunc>
+  InitResult access(InitFunc&& init) {
+    ++waiting_threads;  // NOTE(review): counted before the guard is inspected, so the caller is not necessarily blocked yet
+    auto res = check_guard<Impl>(&guard_object, init);
+    --waiting_threads;
+    ++result_counts[static_cast<int>(res)];
+    return res;
+  }
+  // Callable handle that forwards its argument to access() on the referenced object.
+  struct Accessor {
+    explicit Accessor(FunctionLocalStatic& obj) : this_obj(&obj) {}
+
+    template <class InitFn>
+    void operator()(InitFn && fn) const {
+      this_obj->access(std::forward<InitFn>(fn));  // the InitResult is dropped; only the counters record it
+    }
+  private:
+    FunctionLocalStatic *this_obj;
+  };
+
+  Accessor get_access() {
+    return Accessor(*this);
+  }
+
+  void reset() {  // zeroes the guard object and every counter
+    guard_object = 0;
+    waiting_threads.store(0);
+    for (auto& counter : result_counts) {
+      counter.store(0);
+    }
+  }
+
+  int get_count(InitResult I) const {  // number of accesses that ended with result I
+    return result_counts[static_cast<int>(I)].load();
+  }
+  int num_completed() const {  // accesses that ended with any result other than ABORTED
+    return get_count(COMPLETE) + get_count(PERFORMED) + get_count(WAITED);
+  }
+  int num_waiting() const {  // callers that have entered access() and not yet returned from check_guard
+    return waiting_threads.load();
+  }
+
+private:
+  GuardType guard_object;
+  std::atomic<int> waiting_threads;
+  std::array<std::atomic<int>, 4> result_counts;  // indexed by static_cast<int>(InitResult)
+  static_assert(static_cast<int>(ABORTED) == 3, "only 4 result kinds expected");
+};
+
+// Collects std::thread objects so a test can launch many and join them all.
+struct ThreadGroup {
+  ThreadGroup() = default;
+  ThreadGroup(ThreadGroup const&) = delete;
+
+  // Starts a thread, constructing it in place from the forwarded arguments.
+  template <class ...Args>
+  void Create(Args&& ...args) {
+    workers.emplace_back(std::forward<Args>(args)...);
+  }
+
+  void JoinAll() {
+    for (std::thread& worker : workers)
+      worker.join();
+  }
+private:
+  std::vector<std::thread> workers;
+};
+
+struct Barrier {  // spin barrier: wait() returns once n threads have entered it
+  explicit Barrier(int n) : m_wait_for(n) { reset(); }
+  Barrier(Barrier const&) = delete;
+  // Registers the caller, then yields in a loop until m_wait_for callers have registered.
+  void wait() {
+    ++m_entered;
+    while (m_entered.load() < m_wait_for) {
+      std::this_thread::yield();
+    }
+    assert(m_entered.load() == m_wait_for);  // fires if more than m_wait_for callers have entered
+    ++m_exited;
+  }
+  // Callers that have entered wait() but not yet left it.
+  int num_waiting() const {
+    return m_entered.load() - m_exited.load();
+  }
+
+  void reset() {
+    m_entered.store(0);
+    m_exited.store(0);
+  }
+private:
+  const int m_wait_for;  // number of wait() calls needed to open the barrier
+  std::atomic<int> m_entered;
+  std::atomic<int> m_exited;
+};
+
+struct Notification {  // one-way flag that threads can spin on until it is set
+  Notification() { reset(); }
+  Notification(Notification const&) = delete;
+  // Number of threads currently spinning inside wait().
+  int num_waiting() const {
+    return m_waiting.load();
+  }
+  // Returns at once if the flag is already set; otherwise yields until it becomes set.
+  void wait() {
+    if (m_cond.load())
+      return;
+    ++m_waiting;
+    while (!m_cond.load()) {
+      std::this_thread::yield();
+    }
+    --m_waiting;
+  }
+
+  void notify() {
+    m_cond.store(true);
+  }
+  // Yields until c() returns true, then sets the flag; does nothing if it is already set.
+  template <class Cond>
+  void notify_when(Cond &&c) {
+    if (m_cond.load())
+      return;
+    while (!c()) {
+      std::this_thread::yield();
+    }
+    m_cond.store(true);
+  }
+
+  void reset() {
+    m_cond.store(0);  // 0 converts to false, i.e. the flag is cleared
+    m_waiting.store(0);
+  }
+private:
+  std::atomic<bool> m_cond;
+  std::atomic<int> m_waiting;
+};
+
+// Releases a group of threads at once and checks that exactly one runs the initializer.
+template <class GuardType, class Impl>
+void test_free_for_all() {
+  const int num_threads = 10; // all race for the guard; one of them initializes.
+  FunctionLocalStatic<GuardType, Impl> test_obj;
+
+  Barrier start_init_barrier(num_threads);
+  bool already_init = false;
+  ThreadGroup threads;
+  for (int i=0; i < num_threads; ++i) {
+    threads.Create([&]() {
+      start_init_barrier.wait();
+      test_obj.access([&]() {
+        assert(!already_init);
+        already_init = true;
+      });
+    });
+  }
+
+  // wait for the other threads to finish initialization.
+  threads.JoinAll();
+
+  assert(test_obj.get_count(PERFORMED) == 1);
+  assert(test_obj.get_count(COMPLETE) + test_obj.get_count(WAITED) ==
+         num_threads - 1);
+}
+
+// One thread blocks inside the initializer while the others pile up; each of them must report WAITED.
+template <class GuardType, class Impl>
+void test_waiting_for_init() {
+  const int num_waiting_threads = 10; // one initializing thread, 10 waiters.
+
+  Notification init_pending;
+  Notification init_barrier;
+  FunctionLocalStatic<GuardType, Impl> test_obj;
+  auto access_fn = test_obj.get_access();
+
+  ThreadGroup threads;
+  threads.Create(access_fn,
+    [&]() {
+      init_pending.notify();
+      init_barrier.wait();
+    }
+  );
+  init_pending.wait();
+  assert(test_obj.num_waiting() == 1);
+
+  for (int i=0; i < num_waiting_threads; ++i) {
+    threads.Create(access_fn, []() { assert(false); });  // a waiter's initializer must never run
+  }
+  // unblock the initializing thread
+  init_barrier.notify_when([&]() {
+    return test_obj.num_waiting() == num_waiting_threads + 1;
+  });
+
+  // wait for the other threads to finish initialization.
+  threads.JoinAll();
+
+  assert(test_obj.get_count(PERFORMED) == 1);
+  assert(test_obj.get_count(WAITED) == num_waiting_threads);
+  assert(test_obj.get_count(COMPLETE) == 0);
+}
+
+// The first initializer throws once the waiters have arrived; one waiter must then redo the initialization.
+template <class GuardType, class Impl>
+void test_aborted_init() {
+  const int num_waiting_threads = 10; // one initializing thread, 10 waiters.
+
+  Notification init_pending;
+  Notification init_barrier;
+  FunctionLocalStatic<GuardType, Impl> test_obj;
+  auto access_fn = test_obj.get_access();
+
+  ThreadGroup threads;
+  threads.Create(access_fn,
+                 [&]() {
+                   init_pending.notify();
+                   init_barrier.wait();
+                   throw 42;  // makes check_guard abort the guard
+                 }
+  );
+  init_pending.wait();
+
+  assert(test_obj.num_waiting() == 1);
+
+  bool already_init = false;
+  for (int i=0; i < num_waiting_threads; ++i) {
+    threads.Create(access_fn, [&]() {
+      assert(!already_init);  // the retry must happen exactly once
+      already_init = true;
+    });
+  }
+  // unblock the initializing thread
+  init_barrier.notify_when([&]() {
+    return test_obj.num_waiting() == num_waiting_threads + 1;
+  });
+
+  // wait for the other threads to finish initialization.
+  threads.JoinAll();
+
+  assert(test_obj.get_count(ABORTED) == 1);
+  assert(test_obj.get_count(PERFORMED) == 1);
+  assert(test_obj.get_count(WAITED) == num_waiting_threads - 1);
+  assert(test_obj.get_count(COMPLETE) == 0);
+}
+
+// Initializes on the calling thread first, then checks that every later access reports COMPLETE.
+template <class GuardType, class Impl>
+void test_completed_init() {
+  const int num_late_threads = 10; // threads that arrive after init is done.
+
+  FunctionLocalStatic<GuardType, Impl> test_obj;
+
+  // Perform the one and only initialization before any thread is started.
+  test_obj.access([]() {});
+  assert(test_obj.num_waiting() == 0);
+  assert(test_obj.num_completed() == 1);
+  assert(test_obj.get_count(PERFORMED) == 1);
+
+  auto access_fn = test_obj.get_access();
+  ThreadGroup threads;
+  for (int i=0; i < num_late_threads; ++i) {
+    threads.Create(access_fn, []() {
+      assert(false);
+    });
+  }
+
+  // wait for the other threads to finish initialization.
+  threads.JoinAll();
+
+  assert(test_obj.get_count(ABORTED) == 0);
+  assert(test_obj.get_count(PERFORMED) == 1);
+  assert(test_obj.get_count(WAITED) == 0);
+  assert(test_obj.get_count(COMPLETE) == num_late_threads);
+}
+
+// Runs every scenario against Impl with both the 32-bit and the 64-bit guard type.
+template <class Impl>
+void test_impl() {
+  // Racing threads: exactly one may run the initializer.
+  test_free_for_all<uint32_t, Impl>();
+  test_free_for_all<uint64_t, Impl>();
+
+  // Waiters block until the pending initialization completes.
+  test_waiting_for_init<uint32_t, Impl>();
+  test_waiting_for_init<uint64_t, Impl>();
+
+  // A throwing initializer hands the initialization to one of the waiters.
+  test_aborted_init<uint32_t, Impl>();
+  test_aborted_init<uint64_t, Impl>();
+
+  // After initialization no later access runs the initializer.
+  test_completed_init<uint32_t, Impl>();
+  test_completed_init<uint64_t, Impl>();
+}
+
+int main() {
+  // The global-lock implementation is tested unconditionally.
+  using MutexImpl = SelectImplementation<Implementation::GlobalLock>::type;
+
+  // The futex implementation is tested only where the platform supports it;
+  // elsewhere FutexImpl aliases MutexImpl so the code below still compiles.
+  using RealFutexImpl = SelectImplementation<Implementation::Futex>::type;
+  constexpr bool has_futex = DoesPlatformSupportFutex();
+  using FutexImpl =
+      std::conditional<has_futex, RealFutexImpl, MutexImpl>::type;
+
+  // Run each test 5 times to help TSAN catch bugs.
+  constexpr int num_runs = 5;
+  for (int run = 0; run != num_runs; ++run) {
+    test_impl<MutexImpl>();
+    if (has_futex)
+      test_impl<FutexImpl>();
+  }
+  return 0;
+}