cpu yield on global contention (#216)

Mainly for CPU power saving and for leaving the core more available to background tasks.
Noticed a small but consistent performance increase on ARM.
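
For reference, a minimal standalone sketch of the pattern this change introduces, using C11 <stdatomic.h> instead of rpmalloc's own atomic wrappers; spin_hint, demo_lock_acquire and demo_lock_release are illustrative names, not identifiers from the patch:

#include <stdatomic.h>
#include <time.h>

// Architecture-specific "I am busy-waiting" hint, same idea as _rpmalloc_spin below
static void
spin_hint(void) {
#if defined(__x86_64__) || defined(__i386__)
	__asm__ volatile("pause" ::: "memory");   // x86 PAUSE: saves power, eases SMT contention
#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
	__asm__ volatile("yield" ::: "memory");   // ARM YIELD hint
#else
	struct timespec ts = {0};
	nanosleep(&ts, NULL);                     // portable fallback: briefly yield to the scheduler
#endif
}

static atomic_int demo_lock;   // 0 = free, 1 = held

static void
demo_lock_acquire(void) {
	int expected = 0;
	// Same shape as the loops in the diff: CAS with acquire semantics,
	// and a CPU yield hint on every failed attempt instead of a hot spin.
	while (!atomic_compare_exchange_weak_explicit(&demo_lock, &expected, 1,
	                                              memory_order_acquire, memory_order_relaxed)) {
		expected = 0;   // CAS overwrites expected on failure; reset it before retrying
		spin_hint();
	}
}

static void
demo_lock_release(void) {
	atomic_store_explicit(&demo_lock, 0, memory_order_release);
}

The hint is purely a power/performance measure: it tells the core (or, in the fallback, the scheduler) that the thread is only waiting. It does not change the locking semantics, which is where the power saving and the small ARM gain come from.
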
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index bfe6bc8..c99f18d 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -128,6 +128,7 @@
 #  include <unistd.h>
 #  include <stdio.h>
 #  include <stdlib.h>
+#  include <time.h>
 #  if defined(__APPLE__)
 #    include <TargetConditionals.h>
 #    if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
@@ -743,6 +744,21 @@
 	_rpmalloc_main_thread_id = get_thread_id();
 }
 
+static void
+_rpmalloc_spin(void) {
+#if defined(__x86_64__) || defined(__i386__)
+	__asm__ volatile("pause" ::: "memory");
+#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
+	__asm__ volatile("yield" ::: "memory");
+#elif defined(__powerpc__) || defined(__powerpc64__)
+	// "or 27,27,27" is the PowerPC spin-loop yield hint; probably never built for these archs, but added as a precaution
+	__asm__ volatile("or 27,27,27");
+#else
+	struct timespec ts = {0};
+	nanosleep(&ts, NULL);
+#endif
+}
+
 #if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
 static void NTAPI
 _rpmalloc_thread_destructor(void* value) {
@@ -1068,9 +1084,8 @@
 	span_t* span = 0;
 	if (_memory_page_size > _memory_span_size) {
 		// If huge pages, make sure only one thread maps more memory to avoid bloat
-		while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) {
-			/* Spin */
-		}
+		while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+			_rpmalloc_spin();
 		if (_memory_global_reserve_count >= span_count) {
 			size_t reserve_count = (!heap->spans_reserved ? DEFAULT_SPAN_MAP_COUNT : span_count);
 			if (_memory_global_reserve_count < reserve_count)
@@ -1284,7 +1299,7 @@
 static void
 _rpmalloc_global_cache_finalize(global_cache_t* cache) {
 	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-		/* Spin */;
+		_rpmalloc_spin();
 
 	for (size_t ispan = 0; ispan < cache->count; ++ispan)
 		_rpmalloc_span_unmap(cache->span[ispan]);
@@ -1309,7 +1324,7 @@
 
 	size_t insert_count = count;
 	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-		/* Spin */;
+		_rpmalloc_spin();
 
 	if ((cache->count + insert_count) > cache_limit)
 		insert_count = cache_limit - cache->count;
@@ -1340,7 +1355,7 @@
 
 	size_t extract_count = count;
 	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-		/* Spin */;
+		_rpmalloc_spin();
 
 	if (extract_count > cache->count)
 		extract_count = cache->count;
@@ -1757,7 +1772,7 @@
 _rpmalloc_heap_allocate(int first_class) {
 	heap_t* heap = 0;
 	while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
-		/* Spin */;
+		_rpmalloc_spin();
 	if (first_class == 0)
 		heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps);
 #if RPMALLOC_FIRST_CLASS_HEAPS
@@ -1815,7 +1830,7 @@
 	// lock atomic is unknown and it's best to just go ahead and exit
 	if (get_thread_id() != _rpmalloc_main_thread_id) {
 		while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
-			/* Spin */;
+			_rpmalloc_spin();
 	}
 	_rpmalloc_heap_orphan(heap, first_class);
 	atomic_store32_release(&_memory_global_lock, 0);
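
To see the pattern under real contention, here is a small self-contained harness in the same spirit (again with illustrative names and C11 atomics rather than rpmalloc's wrappers, and not part of the patch) that bangs on one lock from four threads; every loser goes through the yield hint between retries:

// Illustrative contention demo; build with: cc -O2 -pthread demo.c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int demo_lock;   // 0 = free, 1 = held
static long counter;           // protected by demo_lock

static void
spin_hint(void) {
#if defined(__x86_64__) || defined(__i386__)
	__asm__ volatile("pause" ::: "memory");
#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
	__asm__ volatile("yield" ::: "memory");
#else
	struct timespec ts = {0};
	nanosleep(&ts, NULL);
#endif
}

static void*
worker(void* arg) {
	(void)arg;
	for (int i = 0; i < 100000; ++i) {
		int expected = 0;
		while (!atomic_compare_exchange_weak_explicit(&demo_lock, &expected, 1,
		                                              memory_order_acquire, memory_order_relaxed)) {
			expected = 0;
			spin_hint();   // waiters yield here instead of burning the core
		}
		++counter;             // critical section
		atomic_store_explicit(&demo_lock, 0, memory_order_release);
	}
	return NULL;
}

int
main(void) {
	pthread_t threads[4];
	for (int i = 0; i < 4; ++i)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (int i = 0; i < 4; ++i)
		pthread_join(threads[i], NULL);
	printf("counter = %ld (expected 400000)\n", counter);
	return 0;
}

The final count is always 400000; the hint only changes how politely the waiters behave, not the result.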