Remove dependency on sys_membarrier on linux

This is achieved by not inlining the fastpath of dispatch_once() when the
hardware cannot give us the right semantics.

Signed-off-by: Daniel A. Steffen <dsteffen@apple.com>
diff --git a/dispatch/once.h b/dispatch/once.h
index 68acfe8..37a4950 100644
--- a/dispatch/once.h
+++ b/dispatch/once.h
@@ -40,6 +40,14 @@
 DISPATCH_SWIFT3_UNAVAILABLE("Use lazily initialized globals instead")
 typedef long dispatch_once_t;
 
+#if defined(__x86_64__) || defined(__i386__) || defined(__s390x__)
+#define DISPATCH_ONCE_INLINE_FASTPATH 1
+#elif defined(__APPLE__)
+#define DISPATCH_ONCE_INLINE_FASTPATH 1
+#else
+#define DISPATCH_ONCE_INLINE_FASTPATH 0
+#endif
+
 /*!
  * @function dispatch_once
  *
@@ -65,6 +73,7 @@
 dispatch_once(dispatch_once_t *predicate,
 		DISPATCH_NOESCAPE dispatch_block_t block);
 
+#if DISPATCH_ONCE_INLINE_FASTPATH
 DISPATCH_INLINE DISPATCH_ALWAYS_INLINE DISPATCH_NONNULL_ALL DISPATCH_NOTHROW
 DISPATCH_SWIFT3_UNAVAILABLE("Use lazily initialized globals instead")
 void
@@ -81,6 +90,7 @@
 #undef dispatch_once
 #define dispatch_once _dispatch_once
 #endif
+#endif // DISPATCH_ONCE_INLINE_FASTPATH
 
 API_AVAILABLE(macos(10.6), ios(4.0))
 DISPATCH_EXPORT DISPATCH_NONNULL1 DISPATCH_NONNULL3 DISPATCH_NOTHROW
@@ -89,6 +99,7 @@
 dispatch_once_f(dispatch_once_t *predicate, void *_Nullable context,
 		dispatch_function_t function);
 
+#if DISPATCH_ONCE_INLINE_FASTPATH
 DISPATCH_INLINE DISPATCH_ALWAYS_INLINE DISPATCH_NONNULL1 DISPATCH_NONNULL3
 DISPATCH_NOTHROW
 DISPATCH_SWIFT3_UNAVAILABLE("Use lazily initialized globals instead")
@@ -105,6 +116,7 @@
 }
 #undef dispatch_once_f
 #define dispatch_once_f _dispatch_once_f
+#endif // DISPATCH_ONCE_INLINE_FASTPATH
 
 __END_DECLS
 
diff --git a/src/once.c b/src/once.c
index 75d7a39..c01538c 100644
--- a/src/once.c
+++ b/src/once.c
@@ -40,9 +40,15 @@
 }
 #endif
 
-DISPATCH_NOINLINE
-void
-dispatch_once_f(dispatch_once_t *val, void *ctxt, dispatch_function_t func)
+#if DISPATCH_ONCE_INLINE_FASTPATH
+#define DISPATCH_ONCE_SLOW_INLINE inline DISPATCH_ALWAYS_INLINE
+#else
+#define DISPATCH_ONCE_SLOW_INLINE DISPATCH_NOINLINE
+#endif // DISPATCH_ONCE_INLINE_FASTPATH
+
+DISPATCH_ONCE_SLOW_INLINE
+static void
+dispatch_once_f_slow(dispatch_once_t *val, void *ctxt, dispatch_function_t func)
 {
 #if DISPATCH_GATE_USE_FOR_DISPATCH_ONCE
 	dispatch_once_gate_t l = (dispatch_once_gate_t)val;
@@ -95,3 +101,22 @@
 	}
 #endif
 }
+
+/*
+ * Out-of-line entry point for dispatch_once_f().
+ *
+ * When DISPATCH_ONCE_INLINE_FASTPATH is 0, once.h provides no inlined
+ * fastpath, so the "already done" check is made here with a load-acquire
+ * before handing off to dispatch_once_f_slow().
+ */
+DISPATCH_NOINLINE
+void
+dispatch_once_f(dispatch_once_t *val, void *ctxt, dispatch_function_t func)
+{
+#if !DISPATCH_ONCE_INLINE_FASTPATH
+	if (likely(os_atomic_load(val, acquire) == DLOCK_ONCE_DONE)) {
+		return;
+	}
+#endif // !DISPATCH_ONCE_INLINE_FASTPATH
+	dispatch_once_f_slow(val, ctxt, func);
+}
diff --git a/src/shims/lock.h b/src/shims/lock.h
index 99c5563..0c089aa 100644
--- a/src/shims/lock.h
+++ b/src/shims/lock.h
@@ -59,9 +59,6 @@
 #elif defined(__linux__)
 
 #include <linux/futex.h>
-#if !defined(__x86_64__) && !defined(__i386__) && !defined(__s390x__)
-#include <linux/membarrier.h>
-#endif
 #include <unistd.h>
 #include <sys/syscall.h>   /* For SYS_xxx definitions */
 
@@ -473,28 +470,7 @@
 static inline dispatch_once_t
 _dispatch_once_xchg_done(dispatch_once_t *pred)
 {
-#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__)
-	// On Intel, any load is a load-acquire, so we don't need to be fancy
-	// same for s390x
 	return os_atomic_xchg(pred, DLOCK_ONCE_DONE, release);
-#elif defined(__linux__)
-	if (unlikely(syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0) < 0)) {
-		/*
-		 * sys_membarrier not supported
-		 *
-		 * Ideally we would call DISPATCH_INTERNAL_CRASH() here, but
-		 * due to ordering constraints in internal.h required by Darwin
-		 * the macro is undefined when this header is included.
-		 * Instead, open-code what would be a call to
-		 * _dispatch_hardware_crash() inside DISPATCH_INTERNAL_CRASH().
-		 */
-		__asm__("");
-		__builtin_trap();
-	}
-	return os_atomic_xchg(pred, DLOCK_ONCE_DONE, relaxed);
-#else
-#  error dispatch_once algorithm not available for this port
-#endif
 }
 
 DISPATCH_ALWAYS_INLINE