simplify orphan and size cache handling

Reduce the per-size-class early span cache from two entries to a single
span, flushing any displaced span to the regular thread span cache.
Guard the orphan heap lists with the global orphan lock, taken around
heap allocation and release, instead of manipulating them with lock-free
CAS loops, and turn the _memory_heaps ID map into a plain pointer array.
The HEAP_ORPHAN_ABA_SIZE constant is no longer needed; the minimum span
size becomes a plain 256 bytes. _rpmalloc_heap_initialize no longer
clears the heap structure up front.
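For illustration, a standalone sketch of the locking pattern this change
adopts for the orphan lists: a spin lock serializing push/pop on a plain
intrusive list, removing the need for ABA-protected pointers. The names
node_t, list_push and list_pop are made up for the sketch and are not
rpmalloc code:

    #include <stdatomic.h>

    typedef struct node { struct node* next; } node_t;

    static atomic_int list_lock; /* 0 = unlocked, 1 = locked */
    static node_t*    list_head;

    static void
    list_lock_acquire(void) {
        int expected = 0;
        /* Spin until the lock is taken; weak CAS may fail spuriously */
        while (!atomic_compare_exchange_weak_explicit(&list_lock, &expected, 1,
                                                      memory_order_acquire, memory_order_relaxed))
            expected = 0;
    }

    static void
    list_push(node_t* node) {
        list_lock_acquire();
        /* Plain list manipulation under the lock, no ABA counter needed */
        node->next = list_head;
        list_head = node;
        atomic_store_explicit(&list_lock, 0, memory_order_release);
    }

    static node_t*
    list_pop(void) {
        list_lock_acquire();
        node_t* node = list_head;
        if (node)
            list_head = node->next;
        atomic_store_explicit(&list_lock, 0, memory_order_release);
        return node;
    }

rpmalloc wraps the same idea in its own atomic_cas32_acquire and
atomic_store32_release helpers, as seen in the _rpmalloc_heap_allocate
and _rpmalloc_heap_release hunks below.
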
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index b190c1c..cf7310b 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -302,8 +302,6 @@
 #define MEDIUM_SIZE_LIMIT         (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
 //! Maximum size of a large block
 #define LARGE_SIZE_LIMIT          ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
-//! ABA protection size in orhpan heap list (also becomes limit of smallest page size)
-#define HEAP_ORPHAN_ABA_SIZE      512
 //! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two)
 #define SPAN_HEADER_SIZE          128
 //! Number of spans in thread cache
@@ -476,7 +474,7 @@
 	//  Previous span pointer in head points to tail span of list.
 	span_t*      partial_span;
 	//! Early level cache of fully free spans
-	span_t*      cache[2];
+	span_t*      cache;
 };
 typedef struct heap_size_class_t heap_size_class_t;
 
@@ -616,7 +614,7 @@
 static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT];
 #endif
 //! All heaps
-static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE];
+static heap_t* _memory_heaps[HEAP_ARRAY_SIZE];
 //! Orphan lock
 static atomic32_t _memory_orphan_lock;
 //! Orphaned heaps
@@ -1041,12 +1039,9 @@
 	if (!heap->finalize) {
 		_rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache);
 		_rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache);
-		if (heap->size_class[span->size_class].cache[0]) {
-			if (heap->size_class[span->size_class].cache[1])
-				_rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache[1]);
-			heap->size_class[span->size_class].cache[1] = heap->size_class[span->size_class].cache[0];
-		}
-		heap->size_class[span->size_class].cache[0] = span;
+		if (heap->size_class[span->size_class].cache)
+			_rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache);
+		heap->size_class[span->size_class].cache = span;
 	} else {
 		_rpmalloc_span_unmap(span);
 	}
@@ -1376,9 +1371,9 @@
 	}
 	//Heap is now completely free, unmap and remove from heap list
 	size_t list_idx = heap->id % HEAP_ARRAY_SIZE;
-	heap_t* list_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]);
+	heap_t* list_heap = _memory_heaps[list_idx];
 	if (list_heap == heap) {
-		atomic_store_ptr(&_memory_heaps[list_idx], heap->next_heap);
+		_memory_heaps[list_idx] = heap->next_heap;
 	} else {
 		while (list_heap->next_heap != heap)
 			list_heap = list_heap->next_heap;
@@ -1518,17 +1513,12 @@
 #endif
 #if ENABLE_THREAD_CACHE
 	if (class_idx < SIZE_CLASS_COUNT) {
-		if (heap->size_class[class_idx].cache[0]) {
-			span = heap->size_class[class_idx].cache[0];
+		if (heap->size_class[class_idx].cache) {
+			span = heap->size_class[class_idx].cache;
 			span_t* new_cache = 0;
 			if (heap->span_cache.count)
 				new_cache = heap->span_cache.span[--heap->span_cache.count];
-			if (heap->size_class[class_idx].cache[1]) {
-				heap->size_class[class_idx].cache[0] = heap->size_class[class_idx].cache[1];
-				heap->size_class[class_idx].cache[1] = new_cache;
-			} else {
-				heap->size_class[class_idx].cache[0] = new_cache;
-			}
+			heap->size_class[class_idx].cache = new_cache;
 			return span;
 		}
 	}
@@ -1558,18 +1548,13 @@
 
 static void
 _rpmalloc_heap_initialize(heap_t* heap) {
-	memset(heap, 0, sizeof(heap_t));
-
 	//Get a new heap ID
 	heap->id = 1 + atomic_incr32(&_memory_heap_id);
 
 	//Link in heap in heap ID map
-	heap_t* next_heap;
 	size_t list_idx = heap->id % HEAP_ARRAY_SIZE;
-	do {
-		next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]);
-		heap->next_heap = next_heap;
-	} while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap));
+	heap->next_heap = _memory_heaps[list_idx];
+	_memory_heaps[list_idx] = heap;
 }
 
 static void
@@ -1581,11 +1566,8 @@
 	(void)sizeof(first_class);
 	heap_t** heap_list = &_memory_orphan_heaps;
 #endif
-	while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0))
-		/* Spin */;
 	heap->next_orphan = *heap_list;
 	*heap_list = heap;
-	atomic_store32_release(&_memory_orphan_lock, 0);
 }
 
 //! Allocate a new heap from newly mapped memory pages
@@ -1619,11 +1601,8 @@
 
 static heap_t*
 _rpmalloc_heap_extract_orphan(heap_t** heap_list) {
-	while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0))
-		/* Spin */;
 	heap_t* heap = *heap_list;
 	*heap_list = (heap ? heap->next_orphan : 0);
-	atomic_store32_release(&_memory_orphan_lock, 0);
 	return heap;
 }
 
@@ -1631,6 +1610,8 @@
 static heap_t*
 _rpmalloc_heap_allocate(int first_class) {
 	heap_t* heap = 0;
+	while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0))
+		/* Spin */;
 	if (first_class == 0)
 		heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps);
 #if RPMALLOC_FIRST_CLASS_HEAPS
@@ -1639,6 +1620,7 @@
 #endif
 	if (!heap)
 		heap = _rpmalloc_heap_allocate_new();
+	atomic_store32_release(&_memory_orphan_lock, 0);
 	return heap;
 }
 
@@ -1683,7 +1665,10 @@
 	assert(atomic_load32(&_memory_active_heaps) >= 0);
 #endif
 
+	while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0))
+		/* Spin */;
 	_rpmalloc_heap_orphan(heap, first_class);
+	atomic_store32_release(&_memory_orphan_lock, 0);
 }
 
 static void
@@ -1702,12 +1687,9 @@
 	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
 
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-		if (heap->size_class[iclass].cache[0])
-			_rpmalloc_span_unmap(heap->size_class[iclass].cache[0]);
-		if (heap->size_class[iclass].cache[1])
-			_rpmalloc_span_unmap(heap->size_class[iclass].cache[1]);
-		heap->size_class[iclass].cache[0] = 0;
-		heap->size_class[iclass].cache[1] = 0;
+		if (heap->size_class[iclass].cache)
+			_rpmalloc_span_unmap(heap->size_class[iclass].cache);
+		heap->size_class[iclass].cache = 0;
 		span_t* span = heap->size_class[iclass].partial_span;
 		while (span) {
 			span_t* next = span->next;
@@ -2464,8 +2446,7 @@
 	}
 #endif
 
-	//The ABA counter in heap orphan list is tied to using HEAP_ORPHAN_ABA_SIZE
-	size_t min_span_size = HEAP_ORPHAN_ABA_SIZE;
+	size_t min_span_size = 256;
 	size_t max_page_size;
 #if UINTPTR_MAX > 0xFFFFFFFF
 	max_page_size = 4096ULL * 1024ULL * 1024ULL;
@@ -2550,8 +2531,8 @@
 #if RPMALLOC_FIRST_CLASS_HEAPS
 	_memory_first_class_orphan_heaps = 0;
 #endif
-	for (size_t ilist = 0, lsize = (sizeof(_memory_heaps) / sizeof(_memory_heaps[0])); ilist < lsize; ++ilist)
-		atomic_store_ptr(&_memory_heaps[ilist], 0);
+	memset(_memory_heaps, 0, sizeof(_memory_heaps));
+	atomic_store32_release(&_memory_orphan_lock, 0);
 
 	//Initialize this thread
 	rpmalloc_thread_initialize();
@@ -2566,7 +2547,7 @@
 
 	//Free all thread caches and fully free spans
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-		heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]);
+		heap_t* heap = _memory_heaps[list_idx];
 		while (heap) {
 			heap_t* next_heap = heap->next_heap;
 			heap->finalize = 1;
@@ -2902,7 +2883,7 @@
 	//If you hit this assert, you still have active threads or forgot to finalize some thread(s)
 	assert(atomic_load32(&_memory_active_heaps) == 0);
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-		heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
+		heap_t* heap = _memory_heaps[list_idx];
 		while (heap) {
 			int need_dump = 0;
 			for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) {