Array based caches (#177)

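Rework the thread and global span caches from linked lists (with ABA
counters and per-span list bookkeeping) into fixed-size arrays. Spans
are now pushed into a per-thread array and flushed to the global cache
in bulk when the array fills up; the global cache becomes a plain array
guarded by a spinlock. This removes the span list push/pop/split
helpers, gives O(1) cache operations with a single memcpy per bulk
transfer, and makes cache sizes explicit compile-time constants. A
small two-slot per-size-class cache is also added in front of the
thread cache for the hottest release/allocate path.

The following is a condensed sketch of the new scheme, not the code in
this diff: the standalone types, the bare spin loop and the omitted
unmap/overflow handling are simplifications of span_cache_t,
global_cache_t and _rpmalloc_global_cache_insert_spans below.

    #include <stdatomic.h>
    #include <stddef.h>
    #include <string.h>

    #define MAX_THREAD_SPAN_CACHE      256  /* per-thread capacity (single spans) */
    #define THREAD_SPAN_CACHE_TRANSFER  64  /* spans moved per bulk transfer */
    #define GLOBAL_CACHE_MULTIPLIER      8  /* global capacity multiplier */

    typedef struct span_t span_t;           /* opaque stand-in for a memory span */

    typedef struct {
        size_t  count;
        span_t* span[MAX_THREAD_SPAN_CACHE];
    } span_cache_t;

    typedef struct {
        atomic_int lock;                    /* spinlock replaces the ABA-tagged CAS list */
        size_t     count;
        span_t*    span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE];
    } global_cache_t;

    static global_cache_t global_cache;

    /* Move a batch of spans into the global cache. Spans that do not fit
       would be unmapped (or put on an overflow list) in the real code. */
    static void
    global_cache_insert(span_t** span, size_t count) {
        int expected = 0;
        while (!atomic_compare_exchange_weak_explicit(&global_cache.lock,
                &expected, 1, memory_order_acquire, memory_order_relaxed))
            expected = 0;  /* spin until the lock is acquired */
        size_t capacity = GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE;
        size_t insert_count = count;
        if ((global_cache.count + insert_count) > capacity)
            insert_count = capacity - global_cache.count;
        memcpy(global_cache.span + global_cache.count, span,
               sizeof(span_t*) * insert_count);
        global_cache.count += insert_count;
        atomic_store_explicit(&global_cache.lock, 0, memory_order_release);
    }

    /* Thread-local release path: O(1) array push, bulk flush when full. */
    static void
    thread_cache_push(span_cache_t* cache, span_t* span) {
        cache->span[cache->count++] = span;
        if (cache->count == MAX_THREAD_SPAN_CACHE) {
            const size_t remain = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER;
            global_cache_insert(cache->span + remain, THREAD_SPAN_CACHE_TRANSFER);
            cache->count = remain;
        }
    }

Compared to the list-based scheme there is no per-span list_size
bookkeeping and no list traversal on transfer; the trade-off is fixed
per-heap array storage and a short critical section on the global
cache lock.
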
diff --git a/CHANGELOG b/CHANGELOG
index 2e1a037..c11f613 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,7 @@
 1.4.1
 
+Dual license: released to the public domain or, at your option, under the MIT license
+
 Allow up to 4GiB page sizes
 
 Fix an issue where large page sizes in conjunction with many threads waste a lot of memory (previously
@@ -21,6 +23,11 @@
 Refactor finalization to be compatible with global scope data causing dynamic allocations and frees, like
 C++ objects with custom ctors/dtors.
 
+Refactor thread and global cache to be array based instead of list based for improved performance
+and cache size control.
+
+Add missing C++ operator overloads with ENABLE_OVERRIDE when using Microsoft C++ runtimes
+
 
 1.4.0
 
diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c
index b281e1f..2e1f02a 100644
--- a/rpmalloc/malloc.c
+++ b/rpmalloc/malloc.c
@@ -222,6 +222,11 @@
 
 #endif
 
+static inline size_t
+_rpmalloc_page_size(void) {
+	return _memory_page_size;
+}
+
 extern inline void* RPMALLOC_CDECL
 reallocarray(void* ptr, size_t count, size_t size) {
 	size_t total;
@@ -248,9 +253,10 @@
 extern inline void* RPMALLOC_CDECL
 valloc(size_t size) {
 	get_thread_heap();
+	const size_t page_size = _rpmalloc_page_size();
 	if (!size)
-		size = _memory_page_size;
-	size_t total_size = size + _memory_page_size;
+		size = page_size;
+	size_t total_size = size + page_size;
 #if ENABLE_VALIDATE_ARGS
 	if (total_size < size) {
 		errno = EINVAL;
@@ -258,8 +264,8 @@
 	}
 #endif
 	void* buffer = rpmalloc(total_size);
-	if ((uintptr_t)buffer & (_memory_page_size - 1))
-		return (void*)(((uintptr_t)buffer & ~(_memory_page_size - 1)) + _memory_page_size);
+	if ((uintptr_t)buffer & (page_size - 1))
+		return (void*)(((uintptr_t)buffer & ~(page_size - 1)) + page_size);
 	return buffer;
 }
 
@@ -267,8 +273,9 @@
 pvalloc(size_t size) {
 	get_thread_heap();
 	size_t aligned_size = size;
-	if (aligned_size % _memory_page_size)
-		aligned_size = (1 + (aligned_size / _memory_page_size)) * _memory_page_size;
+	const size_t page_size = _rpmalloc_page_size();
+	if (aligned_size % page_size)
+		aligned_size = (1 + (aligned_size / page_size)) * page_size;
 #if ENABLE_VALIDATE_ARGS
 	if (aligned_size < size) {
 		errno = EINVAL;
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index d011bcc..4b83a0e 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -1,4 +1,4 @@
-/* rpmalloc.c  -  Memory allocator  -  Public Domain  -  2016 Mattias Jansson
+/* rpmalloc.c  -  Memory allocator  -  Public Domain  -  2016-2020 Mattias Jansson
  *
  * This library provides a cross-platform lock free thread caching malloc implementation in C11.
  * The latest source code is always available at
@@ -50,60 +50,43 @@
 #define ENABLE_PRELOAD            0
 #endif
 #ifndef DISABLE_UNMAP
-//! Disable unmapping memory pages
+//! Disable unmapping memory pages (also enables unlimited cache)
 #define DISABLE_UNMAP             0
 #endif
+#ifndef ENABLE_UNLIMITED_CACHE
+//! Enable unlimited global cache (no unmapping until finalization)
+#define ENABLE_UNLIMITED_CACHE    0
+#endif
+#ifndef ENABLE_ADAPTIVE_THREAD_CACHE
+//! Enable adaptive thread cache size based on use heuristics
+#define ENABLE_ADAPTIVE_THREAD_CACHE 0
+#endif
 #ifndef DEFAULT_SPAN_MAP_COUNT
 //! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here)
 #define DEFAULT_SPAN_MAP_COUNT    64
 #endif
-
-#if ENABLE_THREAD_CACHE
-#ifndef ENABLE_UNLIMITED_CACHE
-//! Unlimited thread and global cache
-#define ENABLE_UNLIMITED_CACHE    0
-#endif
-#ifndef ENABLE_UNLIMITED_THREAD_CACHE
-//! Unlimited cache disables any thread cache limitations
-#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE
-#endif
-#if !ENABLE_UNLIMITED_THREAD_CACHE
-#ifndef THREAD_CACHE_MULTIPLIER
-//! Multiplier for thread cache (cache limit will be span release count multiplied by this value)
-#define THREAD_CACHE_MULTIPLIER 16
-#endif
-#ifndef ENABLE_ADAPTIVE_THREAD_CACHE
-//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit)
-#define ENABLE_ADAPTIVE_THREAD_CACHE  0
-#endif
-#endif
-#endif
-
-#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE
-#if DISABLE_UNMAP
-#undef ENABLE_UNLIMITED_GLOBAL_CACHE
-#define ENABLE_UNLIMITED_GLOBAL_CACHE 1
-#endif
-#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE
-//! Unlimited cache disables any global cache limitations
-#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE
-#endif
-#if !ENABLE_UNLIMITED_GLOBAL_CACHE
-//! Multiplier for global cache (cache limit will be span release count multiplied by this value)
-#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6)
-#endif
-#else
-#  undef ENABLE_GLOBAL_CACHE
-#  define ENABLE_GLOBAL_CACHE 0
-#endif
-
-#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE
-#  undef ENABLE_ADAPTIVE_THREAD_CACHE
-#  define ENABLE_ADAPTIVE_THREAD_CACHE 0
+#ifndef GLOBAL_CACHE_MULTIPLIER
+//! Multiplier for global cache
+#define GLOBAL_CACHE_MULTIPLIER   8
 #endif
 
 #if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
-#  error Must use global cache if unmap is disabled
+#error Must use global cache if unmap is disabled
+#endif
+
+#if DISABLE_UNMAP
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 1
+#endif
+
+#if !ENABLE_GLOBAL_CACHE
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 0
+#endif
+
+#if !ENABLE_THREAD_CACHE
+#undef ENABLE_ADAPTIVE_THREAD_CACHE
+#define ENABLE_ADAPTIVE_THREAD_CACHE 0
 #endif
 
 #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
@@ -115,7 +98,7 @@
 #endif
 
 /// Platform and arch specifics
-#if defined(_MSC_VER) && !defined(__clang__)
+#if defined(_MSC_VER)
 #  ifndef FORCEINLINE
 #    define FORCEINLINE inline __forceinline
 #  endif
@@ -206,13 +189,15 @@
 
 static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; }
 static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; }
+static FORCEINLINE void    atomic_store32_release(atomic32_t* dst, int32_t val) { *dst = val; }
 static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)InterlockedIncrement(val); }
 static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)InterlockedDecrement(val); }
+static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; }
+static FORCEINLINE int     atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; }
 #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE
 static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; }
 static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; }
 #endif
-static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; }
 static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return (void*)*src; }
 static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; }
 static FORCEINLINE void    atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; }
@@ -232,13 +217,15 @@
 
 static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
+static FORCEINLINE void    atomic_store32_release(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_release); }
 static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; }
 static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; }
+static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
+static FORCEINLINE int     atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); }
 #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE
 static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); }
 static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
 #endif
-static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
 static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); }
@@ -314,6 +301,14 @@
 #define HEAP_ORPHAN_ABA_SIZE      512
 //! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two)
 #define SPAN_HEADER_SIZE          128
+//! Number of spans in thread cache
+#define MAX_THREAD_SPAN_CACHE     256
+//! Number of spans to transfer between thread and global cache
+#define THREAD_SPAN_CACHE_TRANSFER 64
+//! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2)
+#define MAX_THREAD_SPAN_LARGE_CACHE 64
+//! Number of spans to transfer between thread and global cache for large spans
+#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
 
 _Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two");
 _Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two");
@@ -456,30 +451,42 @@
 };
 _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
 
+struct span_cache_t {
+	size_t       count;
+	span_t*      span[MAX_THREAD_SPAN_CACHE];
+};
+typedef struct span_cache_t span_cache_t;
+
+struct span_large_cache_t {
+	size_t       count;
+	span_t*      span[MAX_THREAD_SPAN_LARGE_CACHE];
+};
+typedef struct span_large_cache_t span_large_cache_t;
+
+struct heap_size_class_t {
+	//! Free list of active span
+	void*        free_list;
+	//! Double linked list of partially used spans with free blocks.
+	//  Previous span pointer in head points to tail span of list.
+	span_t*      partial_span;
+	//! Early level cache of fully free spans
+	span_t*      cache[2];
+};
+typedef struct heap_size_class_t heap_size_class_t;
+
 // Control structure for a heap, either a thread heap or a first class heap if enabled
 struct heap_t {
 	//! Owning thread ID
 	uintptr_t    owner_thread;
-	//! Free list of active span
-	void*        free_list[SIZE_CLASS_COUNT];
-	//! Double linked list of partially used spans with free blocks for each size class.
-	//  Previous span pointer in head points to tail span of list.
-	span_t*      partial_span[SIZE_CLASS_COUNT];
+	//! Free lists for each size class
+	heap_size_class_t size_class[SIZE_CLASS_COUNT];
 #if RPMALLOC_FIRST_CLASS_HEAPS
 	//! Double linked list of fully utilized spans with free blocks for each size class.
 	//  Previous span pointer in head points to tail span of list.
 	span_t*      full_span[SIZE_CLASS_COUNT];
 #endif
-#if ENABLE_THREAD_CACHE
-	//! List of free spans (single linked list)
-	span_t*      span_cache[LARGE_CLASS_COUNT];
-#endif
 	//! List of deferred free spans (single linked list)
 	atomicptr_t  span_free_deferred;
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-	//! Current and high water mark of spans used per span count
-	span_use_t   span_use[LARGE_CLASS_COUNT];
-#endif
 #if RPMALLOC_FIRST_CLASS_HEAPS
 	//! Double linked list of large and huge spans allocated by this heap
 	span_t*      large_huge_span;
@@ -506,6 +513,16 @@
 	heap_t*      master_heap;
 	//! Child count
 	atomic32_t   child_count;
+#if ENABLE_THREAD_CACHE
+	//! Arrays of fully freed spans, single span
+	span_cache_t span_cache;
+	//! Arrays of fully freed spans, large spans with > 1 span count
+	span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1];
+#endif
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+	//! Current and high water mark of spans used per span count
+	span_use_t   span_use[LARGE_CLASS_COUNT];
+#endif
 #if ENABLE_STATISTICS
 	//! Number of bytes transitioned thread -> global
 	atomic64_t   thread_to_global;
@@ -528,12 +545,16 @@
 _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch");
 
 struct global_cache_t {
-	//! Cache list pointer
-	atomicptr_t cache;
-	//! Cache size
-	atomic32_t size;
-	//! ABA counter
-	atomic32_t counter;
+	//! Cache lock
+	atomic32_t lock;
+	//! Cache count
+	size_t count;
+	//! Cached spans
+	span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE];
+#if ENABLE_UNLIMITED_CACHE
+	//! Unlimited cache overflow
+	span_t* overflow;
+#endif
 };
 
 ////////////
@@ -542,6 +563,11 @@
 ///
 //////
 
+//! Default span size (64KiB)
+#define _memory_default_span_size (64 * 1024)
+#define _memory_default_span_size_shift 16
+#define _memory_default_span_mask (~((uintptr_t)(_memory_default_span_size - 1)))
+
 //! Initialized flag
 static int _rpmalloc_initialized;
 //! Configuration
@@ -560,10 +586,10 @@
 //! Mask to get to start of a memory span
 static uintptr_t _memory_span_mask;
 #else
-//! Hardwired span size (64KiB)
-#define _memory_span_size (64 * 1024)
-#define _memory_span_size_shift 16
-#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1)))
+//! Hardwired span size
+#define _memory_span_size _memory_default_span_size
+#define _memory_span_size_shift _memory_default_span_size_shift
+#define _memory_span_mask _memory_default_span_mask
 #endif
 //! Number of spans to map in each map call
 static size_t _memory_span_map_count;
@@ -827,76 +853,6 @@
 ///
 //////
 
-#if ENABLE_THREAD_CACHE
-
-static void
-_rpmalloc_span_unmap(span_t* span);
-
-//! Unmap a single linked list of spans
-static void
-_rpmalloc_span_list_unmap_all(span_t* span) {
-	size_t list_size = span->list_size;
-	for (size_t ispan = 0; ispan < list_size; ++ispan) {
-		span_t* next_span = span->next;
-		_rpmalloc_span_unmap(span);
-		span = next_span;
-	}
-	assert(!span);
-}
-
-//! Add span to head of single linked span list
-static size_t
-_rpmalloc_span_list_push(span_t** head, span_t* span) {
-	span->next = *head;
-	if (*head)
-		span->list_size = (*head)->list_size + 1;
-	else
-		span->list_size = 1;
-	*head = span;
-	return span->list_size;
-}
-
-//! Remove span from head of single linked span list, returns the new list head
-static span_t*
-_rpmalloc_span_list_pop(span_t** head) {
-	span_t* span = *head;
-	span_t* next_span = 0;
-	if (span->list_size > 1) {
-		assert(span->next);
-		next_span = span->next;
-		assert(next_span);
-		next_span->list_size = span->list_size - 1;
-	}
-	*head = next_span;
-	return span;
-}
-
-//! Split a single linked span list
-static span_t*
-_rpmalloc_span_list_split(span_t* span, size_t limit) {
-	span_t* next = 0;
-	if (limit < 2)
-		limit = 2;
-	if (span->list_size > limit) {
-		uint32_t list_size = 1;
-		span_t* last = span;
-		next = span->next;
-		while (list_size < limit) {
-			last = next;
-			next = next->next;
-			++list_size;
-		}
-		last->next = 0;
-		assert(next);
-		next->list_size = span->list_size - list_size;
-		span->list_size = list_size;
-		span->prev = 0;
-	}
-	return next;
-}
-
-#endif
-
 //! Add a span to double linked list at the head
 static void
 _rpmalloc_span_double_link_list_add(span_t** head, span_t* span) {
@@ -1075,10 +1031,19 @@
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
 	atomic_decr32(&heap->span_use[0].current);
 #endif
-	_rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache);
-	_rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache);
 	_rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current);
-	_rpmalloc_heap_cache_insert(heap, span);
+	if (!heap->finalize) {
+		_rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache);
+		_rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache);
+		if (heap->size_class[span->size_class].cache[0]) {
+			if (heap->size_class[span->size_class].cache[1])
+				_rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache[1]);
+			heap->size_class[span->size_class].cache[1] = heap->size_class[span->size_class].cache[0];
+		}
+		heap->size_class[span->size_class].cache[0] = span;
+	} else {
+		_rpmalloc_span_unmap(span);
+	}
 }
 
 //! Initialize a (partial) free list up to next system memory page, while reserving the first block
@@ -1129,11 +1094,11 @@
 
 	//Setup free list. Only initialize one system page worth of free blocks in list
 	void* block;
-	span->free_list_limit = free_list_partial_init(&heap->free_list[class_idx], &block, 
+	span->free_list_limit = free_list_partial_init(&heap->size_class[class_idx].free_list, &block, 
 		span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size);
 	//Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized
 	if (span->free_list_limit < span->block_count) {
-		_rpmalloc_span_double_link_list_add(&heap->partial_span[class_idx], span);
+		_rpmalloc_span_double_link_list_add(&heap->size_class[class_idx].partial_span, span);
 		span->used_count = span->free_list_limit;
 	} else {
 #if RPMALLOC_FIRST_CLASS_HEAPS
@@ -1165,7 +1130,8 @@
 
 static int
 _rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) {
-	span_t* class_span = (span_t*)((uintptr_t)heap->free_list[iclass] & _memory_span_mask);
+	void* free_list = heap->size_class[iclass].free_list;
+	span_t* class_span = (span_t*)((uintptr_t)free_list & _memory_span_mask);
 	if (span == class_span) {
 		// Adopt the heap class free list back into the span free list
 		void* block = span->free_list;
@@ -1175,17 +1141,17 @@
 			block = *((void**)block);
 		}
 		uint32_t free_count = 0;
-		block = heap->free_list[iclass];
+		block = free_list;
 		while (block) {
 			++free_count;
 			block = *((void**)block);
 		}
 		if (last_block) {
-			*((void**)last_block) = heap->free_list[iclass];
+			*((void**)last_block) = free_list;
 		} else {
-			span->free_list = heap->free_list[iclass];
+			span->free_list = free_list;
 		}
-		heap->free_list[iclass] = 0;
+		heap->size_class[iclass].free_list = 0;
 		span->used_count -= free_count;
 	}
 	//If this assert triggers you have memory leaks
@@ -1211,89 +1177,83 @@
 
 #if ENABLE_GLOBAL_CACHE
 
-//! Insert the given list of memory page spans in the global cache
-static void
-_rpmalloc_global_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) {
-	assert((span->list_size == 1) || (span->next != 0));
-	int32_t list_size = (int32_t)span->list_size;
-	//Unmap if cache has reached the limit. Does not need stronger synchronization, the worst
-	//case is that the span list is unmapped when it could have been cached (no real dependency
-	//between the two variables)
-	if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) {
-#if !ENABLE_UNLIMITED_GLOBAL_CACHE
-		_rpmalloc_span_list_unmap_all(span);
-		atomic_add32(&cache->size, -list_size);
-		return;
-#endif
-	}
-	void* current_cache, *new_cache;
-	do {
-		current_cache = atomic_load_ptr(&cache->cache);
-		span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask);
-		new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
-	} while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache));
-}
-
-//! Extract a number of memory page spans from the global cache
-static span_t*
-_rpmalloc_global_cache_extract(global_cache_t* cache) {
-	uintptr_t span_ptr;
-	do {
-		void* global_span = atomic_load_ptr(&cache->cache);
-		span_ptr = (uintptr_t)global_span & _memory_span_mask;
-		if (span_ptr) {
-			span_t* span = (span_t*)span_ptr;
-			//By accessing the span ptr before it is swapped out of list we assume that a contending thread
-			//does not manage to traverse the span to being unmapped before we access it
-			void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
-			if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) {
-				atomic_add32(&cache->size, -(int32_t)span->list_size);
-				return span;
-			}
-		}
-	} while (span_ptr);
-	return 0;
-}
-
 //! Finalize a global cache, only valid from allocator finalization (not thread safe)
 static void
 _rpmalloc_global_cache_finalize(global_cache_t* cache) {
-	void* current_cache = atomic_load_ptr(&cache->cache);
-	span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask);
-	while (span) {
-		span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask);
-		atomic_add32(&cache->size, -(int32_t)span->list_size);
-		_rpmalloc_span_list_unmap_all(span);
-		span = skip_span;
+	for (size_t ispan = 0; ispan < cache->count; ++ispan)
+		_rpmalloc_span_unmap(cache->span[ispan]);
+	cache->count = 0;
+
+#if ENABLE_UNLIMITED_CACHE
+	while (cache->overflow) {
+		span_t* span = cache->overflow;
+		cache->overflow = span->next;
+		_rpmalloc_span_unmap(span);
 	}
-	assert(!atomic_load32(&cache->size));
-	atomic_store_ptr(&cache->cache, 0);
-	atomic_store32(&cache->size, 0);
+#endif
+
+	atomic_store32_release(&cache->lock, 0);
 }
 
-//! Insert the given list of memory page spans in the global cache
 static void
-_rpmalloc_global_cache_insert_span_list(span_t* span) {
-	size_t span_count = span->span_count;
-#if ENABLE_UNLIMITED_GLOBAL_CACHE
-	_rpmalloc_global_cache_insert(&_memory_span_cache[span_count - 1], span, 0);
+_rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t count) {
+	const size_t cache_limit = (span_count == 1) ? 
+		GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE :
+		GLOBAL_CACHE_MULTIPLIER * (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+
+	global_cache_t* cache = &_memory_span_cache[span_count - 1];
+
+	size_t insert_count = count;
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		/* Spin */;
+
+	if ((cache->count + insert_count) > cache_limit)
+		insert_count = cache_limit - cache->count;
+
+	memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count);
+	cache->count += insert_count;
+
+#if ENABLE_UNLIMITED_CACHE
+	while (insert_count < count) {
+		span_t* current_span = span[insert_count++];
+		current_span->next = cache->overflow;
+		cache->overflow = current_span;
+	}
+	atomic_store32_release(&cache->lock, 0);
 #else
-	const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large));
-	_rpmalloc_global_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit);
+	atomic_store32_release(&cache->lock, 0);
+	for (size_t ispan = insert_count; ispan < count; ++ispan)
+		_rpmalloc_span_unmap(span[ispan]);
 #endif
 }
 
-//! Extract a number of memory page spans from the global cache for large blocks
-static span_t*
-_rpmalloc_global_cache_extract_span_list(size_t span_count) {
-	span_t* span = _rpmalloc_global_cache_extract(&_memory_span_cache[span_count - 1]);
-	assert(!span || (span->span_count == span_count));
-	return span;
+static size_t
+_rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) {
+	global_cache_t* cache = &_memory_span_cache[span_count - 1];
+
+	size_t extract_count = count;
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		/* Spin */;
+
+	if (extract_count > cache->count)
+		extract_count = cache->count;
+
+	memcpy(span, cache->span + (cache->count - extract_count), sizeof(span_t*) * extract_count);
+	cache->count -= extract_count;
+#if ENABLE_UNLIMITED_CACHE
+	while ((extract_count < count) && cache->overflow) {
+		span_t* current_span = cache->overflow;
+		span[extract_count++] = current_span;
+		cache->overflow = current_span->next;
+	}
+#endif
+	atomic_store32_release(&cache->lock, 0);
+
+	return extract_count;
 }
 
 #endif
 
-
 ////////////
 ///
 /// Heap control
@@ -1383,10 +1343,14 @@
 
 #if ENABLE_THREAD_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		span_t* span = heap->span_cache[iclass];
-		heap->span_cache[iclass] = 0;
-		if (span)
-			_rpmalloc_span_list_unmap_all(span);
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+		span_cache->count = 0;
 	}
 #endif
 
@@ -1396,7 +1360,7 @@
 	}
 
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-		if (heap->free_list[iclass] || heap->partial_span[iclass]) {
+		if (heap->size_class[iclass].free_list || heap->size_class[iclass].partial_span) {
 			--heap->finalize;
 			return;
 		}
@@ -1412,7 +1376,7 @@
 		list_heap->next_heap = heap->next_heap;
 	}
 
-	_rpmalloc_heap_unmap( heap );
+	_rpmalloc_heap_unmap(heap);
 }
 
 //! Insert a single span into thread heap cache, releasing to global cache if overflow
@@ -1425,37 +1389,42 @@
 	}
 #if ENABLE_THREAD_CACHE
 	size_t span_count = span->span_count;
-	size_t idx = span_count - 1;
-	_rpmalloc_stat_inc(&heap->span_use[idx].spans_to_cache);
-#if ENABLE_UNLIMITED_THREAD_CACHE
-	_rpmalloc_span_list_push(&heap->span_cache[idx], span);
-#else
-	const size_t release_count = (!idx ? _memory_span_release_count : _memory_span_release_count_large);
-	size_t current_cache_size = _rpmalloc_span_list_push(&heap->span_cache[idx], span);
-	if (current_cache_size <= release_count)
-		return;
-	const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER;
-	if (current_cache_size <= hard_limit) {
-#if ENABLE_ADAPTIVE_THREAD_CACHE
-		//Require 25% of high water mark to remain in cache (and at least 1, if use is 0)
-		const size_t high_mark = heap->span_use[idx].high;
-		const size_t min_limit = (high_mark >> 2) + release_count + 1;
-		if (current_cache_size < min_limit)
-			return;
-#else
-		return;
-#endif
-	}
-	heap->span_cache[idx] = _rpmalloc_span_list_split(span, release_count);
-	assert(span->list_size == release_count);
+	_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache);
+	if (span_count == 1) {
+		span_cache_t* span_cache = &heap->span_cache;
+		span_cache->span[span_cache->count++] = span;
+		if (span_cache->count == MAX_THREAD_SPAN_CACHE) {
+			const size_t remain_count = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER;
 #if ENABLE_GLOBAL_CACHE
-	_rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span_count * _memory_span_size);
-	_rpmalloc_stat_add(&heap->span_use[idx].spans_to_global, span->list_size);
-	_rpmalloc_global_cache_insert_span_list(span);
+			_rpmalloc_stat_add64(&heap->thread_to_global, THREAD_SPAN_CACHE_TRANSFER * _memory_span_size);
+			_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, THREAD_SPAN_CACHE_TRANSFER);
+			_rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, THREAD_SPAN_CACHE_TRANSFER);
 #else
-	_rpmalloc_span_list_unmap_all(span);
+			for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[remain_count + ispan]);
 #endif
+			span_cache->count = remain_count;
+		}
+	} else {
+		size_t cache_idx = span_count - 2;
+		span_large_cache_t* span_cache = heap->span_large_cache + cache_idx;
+		span_cache->span[span_cache->count++] = span;
+		const size_t cache_limit = (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+		if (span_cache->count == cache_limit) {
+			const size_t transfer_limit = 2 + (cache_limit >> 2);
+			const size_t transfer_count = (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit ? THREAD_SPAN_LARGE_CACHE_TRANSFER : transfer_limit);
+			const size_t remain_count = cache_limit - transfer_count;
+#if ENABLE_GLOBAL_CACHE
+			_rpmalloc_stat_add64(&heap->thread_to_global, transfer_count * span_count * _memory_span_size);
+			_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, transfer_count);
+			_rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, transfer_count);
+#else
+			for (size_t ispan = 0; ispan < transfer_count; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[remain_count + ispan]);
 #endif
+			span_cache->count = remain_count;
+		}
+	}
 #else
 	(void)sizeof(heap);
 	_rpmalloc_span_unmap(span);
@@ -1466,13 +1435,20 @@
 static span_t*
 _rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) {
 	span_t* span = 0;
-	size_t idx = span_count - 1;
-	if (!idx)
+	if (span_count == 1) {
 		_rpmalloc_heap_cache_adopt_deferred(heap, &span);
+		if (span)
+			return span;
+	}
 #if ENABLE_THREAD_CACHE
-	if (!span && heap->span_cache[idx]) {
-		_rpmalloc_stat_inc(&heap->span_use[idx].spans_from_cache);
-		span = _rpmalloc_span_list_pop(&heap->span_cache[idx]);
+	span_cache_t* span_cache;
+	if (span_count == 1)
+		span_cache = &heap->span_cache;
+	else
+		span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2));
+	if (span_cache->count) {
+		_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache);
+		return span_cache->span[--span_cache->count];
 	}
 #endif
 	return span;
@@ -1489,13 +1465,31 @@
 static span_t*
 _rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) {
 #if ENABLE_GLOBAL_CACHE
-	size_t idx = span_count - 1;
-	heap->span_cache[idx] = _rpmalloc_global_cache_extract_span_list(span_count);
-	if (heap->span_cache[idx]) {
-		_rpmalloc_stat_add64(&heap->global_to_thread, (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size);
-		_rpmalloc_stat_add(&heap->span_use[idx].spans_from_global, heap->span_cache[idx]->list_size);
-		return _rpmalloc_span_list_pop(&heap->span_cache[idx]);
+#if ENABLE_THREAD_CACHE
+	span_cache_t* span_cache;
+	size_t wanted_count;
+	if (span_count == 1) {
+		span_cache = &heap->span_cache;
+		wanted_count = THREAD_SPAN_CACHE_TRANSFER;
+	} else {
+		span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2));
+		wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER;
 	}
+	span_cache->count = _rpmalloc_global_cache_extract_spans(span_cache->span, span_count, wanted_count);
+	if (span_cache->count) {
+		_rpmalloc_stat_add64(&heap->global_to_thread, span_count * span_cache->count * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, span_cache->count);
+		return span_cache->span[--span_cache->count];
+	}
+#else
+	span_t* span = 0;
+	size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1);
+	if (count) {
+		_rpmalloc_stat_add64(&heap->global_to_thread, span_count * count * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, count);
+		return span;
+	}
+#endif
 #endif
 	(void)sizeof(heap);
 	(void)sizeof(span_count);
@@ -1505,7 +1499,7 @@
 //! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory
 static span_t*
 _rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) {
-	(void)sizeof(class_idx);
+	span_t* span;
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
 	uint32_t idx = (uint32_t)span_count - 1;
 	uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current);
@@ -1513,7 +1507,26 @@
 		atomic_store32(&heap->span_use[idx].high, (int32_t)current_count);
 	_rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak);
 #endif
-	span_t* span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
+#if ENABLE_THREAD_CACHE
+	if (class_idx < SIZE_CLASS_COUNT) {
+		if (heap->size_class[class_idx].cache[0]) {
+			span = heap->size_class[class_idx].cache[0];
+			span_t* new_cache = 0;
+			if (heap->span_cache.count)
+				new_cache = heap->span_cache.span[--heap->span_cache.count];
+			if (heap->size_class[class_idx].cache[1]) {
+				heap->size_class[class_idx].cache[0] = heap->size_class[class_idx].cache[1];
+				heap->size_class[class_idx].cache[1] = new_cache;
+			} else {
+				heap->size_class[class_idx].cache[0] = new_cache;
+			}
+			return span;
+		}
+	}
+#else
+	(void)sizeof(class_idx);
+#endif
+	span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
 	if (EXPECTED(span != 0)) {
 		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
 		return span;
@@ -1642,26 +1655,27 @@
 	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
 #if ENABLE_THREAD_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		span_t* span = heap->span_cache[iclass];
-		heap->span_cache[iclass] = 0;
-		if (span && heap->finalize) {
-			_rpmalloc_span_list_unmap_all(span);
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		if (!span_cache->count)
 			continue;
-		}
+		if (heap->finalize) {
+			for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[ispan]);
+		} else {
 #if ENABLE_GLOBAL_CACHE
-		while (span) {
-			assert(span->span_count == (iclass + 1));
-			size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large);
-			span_t* next = _rpmalloc_span_list_split(span, (uint32_t)release_count);
-			_rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size);
-			_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span->list_size);
-			_rpmalloc_global_cache_insert_span_list(span);
-			span = next;
-		}
+			_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
+			_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
+			_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
 #else
-		if (span)
-			_rpmalloc_span_list_unmap_all(span);
+			for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+				_rpmalloc_span_unmap(span_cache->span[ispan]);
 #endif
+		}
+		span_cache->count = 0;
 	}
 #endif
 
@@ -1692,15 +1706,21 @@
 	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
 
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-		span_t* span = heap->partial_span[iclass];
+		if (heap->size_class[iclass].cache[0])
+			_rpmalloc_span_unmap(heap->size_class[iclass].cache[0]);
+		if (heap->size_class[iclass].cache[1])
+			_rpmalloc_span_unmap(heap->size_class[iclass].cache[1]);
+		heap->size_class[iclass].cache[0] = 0;
+		heap->size_class[iclass].cache[1] = 0;
+		span_t* span = heap->size_class[iclass].partial_span;
 		while (span) {
 			span_t* next = span->next;
-			_rpmalloc_span_finalize(heap, iclass, span, &heap->partial_span[iclass]);
+			_rpmalloc_span_finalize(heap, iclass, span, &heap->size_class[iclass].partial_span);
 			span = next;
 		}
 		// If class still has a free list it must be a full span
-		if (heap->free_list[iclass]) {
-			span_t* class_span = (span_t*)((uintptr_t)heap->free_list[iclass] & _memory_span_mask);
+		if (heap->size_class[iclass].free_list) {
+			span_t* class_span = (span_t*)((uintptr_t)heap->size_class[iclass].free_list & _memory_span_mask);
 			span_t** list = 0;
 #if RPMALLOC_FIRST_CLASS_HEAPS
 			list = &heap->full_span[iclass];
@@ -1709,17 +1729,21 @@
 			if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) {
 				if (list)
 					_rpmalloc_span_double_link_list_remove(list, class_span);
-				_rpmalloc_span_double_link_list_add(&heap->partial_span[iclass], class_span);
+				_rpmalloc_span_double_link_list_add(&heap->size_class[iclass].partial_span, class_span);
 			}
 		}
 	}
 
 #if ENABLE_THREAD_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		if (heap->span_cache[iclass]) {
-			_rpmalloc_span_list_unmap_all(heap->span_cache[iclass]);
-			heap->span_cache[iclass] = 0;
-		}
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+		span_cache->count = 0;
 	}
 #endif
 	assert(!atomic_load_ptr(&heap->span_free_deferred));
@@ -1743,20 +1767,20 @@
 //! Allocate a small/medium sized memory block from the given heap
 static void*
 _rpmalloc_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
-	span_t* span = heap->partial_span[class_idx];
+	span_t* span = heap->size_class[class_idx].partial_span;
 	if (EXPECTED(span != 0)) {
 		assert(span->block_count == _memory_size_class[span->size_class].block_count);
 		assert(!_rpmalloc_span_is_fully_utilized(span));
 		void* block;
 		if (span->free_list) {
 			//Swap in free list if not empty
-			heap->free_list[class_idx] = span->free_list;
+			heap->size_class[class_idx].free_list = span->free_list;
 			span->free_list = 0;
-			block = free_list_pop(&heap->free_list[class_idx]);
+			block = free_list_pop(&heap->size_class[class_idx].free_list);
 		} else {
 			//If the span did not fully initialize free list, link up another page worth of blocks			
 			void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size));
-			span->free_list_limit += free_list_partial_init(&heap->free_list[class_idx], &block,
+			span->free_list_limit += free_list_partial_init(&heap->size_class[class_idx].free_list, &block,
 				(void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start,
 				span->block_count - span->free_list_limit, span->block_size);
 		}
@@ -1772,7 +1796,7 @@
 			return block;
 
 		//The span is fully utilized, unlink from partial list and add to fully utilized list
-		_rpmalloc_span_double_link_list_pop_head(&heap->partial_span[class_idx], span);
+		_rpmalloc_span_double_link_list_pop_head(&heap->size_class[class_idx].partial_span, span);
 #if RPMALLOC_FIRST_CLASS_HEAPS
 		_rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
 #endif
@@ -1797,8 +1821,8 @@
 	//Small sizes have unique size classes
 	const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT);
 	_rpmalloc_stat_inc_alloc(heap, class_idx);
-	if (EXPECTED(heap->free_list[class_idx] != 0))
-		return free_list_pop(&heap->free_list[class_idx]);
+	if (EXPECTED(heap->size_class[class_idx].free_list != 0))
+		return free_list_pop(&heap->size_class[class_idx].free_list);
 	return _rpmalloc_allocate_from_heap_fallback(heap, class_idx);
 }
 
@@ -1810,8 +1834,8 @@
 	const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT));
 	const uint32_t class_idx = _memory_size_class[base_idx].class_idx;
 	_rpmalloc_stat_inc_alloc(heap, class_idx);
-	if (EXPECTED(heap->free_list[class_idx] != 0))
-		return free_list_pop(&heap->free_list[class_idx]);
+	if (EXPECTED(heap->size_class[class_idx].free_list != 0))
+		return free_list_pop(&heap->size_class[class_idx].free_list);
 	return _rpmalloc_allocate_from_heap_fallback(heap, class_idx);
 }
 
@@ -2019,14 +2043,14 @@
 #if RPMALLOC_FIRST_CLASS_HEAPS
 		_rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span);
 #endif
-		_rpmalloc_span_double_link_list_add(&heap->partial_span[span->size_class], span);
+		_rpmalloc_span_double_link_list_add(&heap->size_class[span->size_class].partial_span, span);
 		--heap->full_span_count;
 	}
 	--span->used_count;
 	*((void**)block) = span->free_list;
 	span->free_list = block;
 	if (UNEXPECTED(span->used_count == span->list_size)) {
-		_rpmalloc_span_double_link_list_remove(&heap->partial_span[span->size_class], span);
+		_rpmalloc_span_double_link_list_remove(&heap->size_class[span->size_class].partial_span, span);
 		_rpmalloc_span_release_to_cache(heap, span);
 	}
 }
@@ -2465,18 +2489,22 @@
 	_memory_page_size = ((size_t)1 << _memory_page_size_shift);
 
 #if RPMALLOC_CONFIGURABLE
-	size_t span_size = _memory_config.span_size;
-	if (!span_size)
-		span_size = (64 * 1024);
-	if (span_size > (256 * 1024))
-		span_size = (256 * 1024);
-	_memory_span_size = 4096;
-	_memory_span_size_shift = 12;
-	while (_memory_span_size < span_size) {
-		_memory_span_size <<= 1;
-		++_memory_span_size_shift;
+	if (!_memory_config.span_size) {
+		_memory_span_size = _memory_default_span_size;
+		_memory_span_size_shift = _memory_default_span_size_shift;
+		_memory_span_mask = _memory_default_span_mask;
+	} else {
+		size_t span_size = _memory_config.span_size;
+		if (span_size > (256 * 1024))
+			span_size = (256 * 1024);
+		_memory_span_size = 4096;
+		_memory_span_size_shift = 12;
+		while (_memory_span_size < span_size) {
+			_memory_span_size <<= 1;
+			++_memory_span_size_shift;
+		}
+		_memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
 	}
-	_memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
 #endif
 
 	_memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT);
@@ -2746,7 +2774,7 @@
 
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
 		size_class_t* size_class = _memory_size_class + iclass;
-		span_t* span = heap->partial_span[iclass];
+		span_t* span = heap->size_class[iclass].partial_span;
 		while (span) {
 			size_t free_count = span->list_size;
 			size_t block_count = size_class->block_count;
@@ -2760,8 +2788,12 @@
 
 #if ENABLE_THREAD_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		if (heap->span_cache[iclass])
-			stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size;
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size;
 	}
 #endif
 
@@ -2812,9 +2844,8 @@
 	stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size;
 #endif
 #if ENABLE_GLOBAL_CACHE
-	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size;
-	}
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
+		stats->cached += _memory_span_cache[iclass].count * (iclass + 1) * _memory_span_size;
 #endif
 }
 
@@ -2851,7 +2882,7 @@
 			atomic_load32(&heap->span_use[iclass].high),
 			((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
 #if ENABLE_THREAD_CACHE
-			heap->span_cache[iclass] ? heap->span_cache[iclass]->list_size : 0,
+			(unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count),
 			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
 			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
 #else
@@ -3032,13 +3063,13 @@
 	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
 
 	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-		span = heap->partial_span[iclass];
+		span = heap->size_class[iclass].partial_span;
 		while (span) {
 			next_span = span->next;
 			_rpmalloc_heap_cache_insert(heap, span);
 			span = next_span;
 		}
-		heap->partial_span[iclass] = 0;
+		heap->size_class[iclass].partial_span = 0;
 		span = heap->full_span[iclass];
 		while (span) {
 			next_span = span->next;
@@ -3046,8 +3077,7 @@
 			span = next_span;
 		}
 	}
-	memset(heap->free_list, 0, sizeof(heap->free_list));
-	memset(heap->partial_span, 0, sizeof(heap->partial_span));
+	memset(heap->size_class, 0, sizeof(heap->size_class));
 	memset(heap->full_span, 0, sizeof(heap->full_span));
 
 	span = heap->large_huge_span;
@@ -3064,22 +3094,22 @@
 
 #if ENABLE_THREAD_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		span = heap->span_cache[iclass];
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		if (!span_cache->count)
+			continue;
 #if ENABLE_GLOBAL_CACHE
-		while (span) {
-			assert(span->span_count == (iclass + 1));
-			size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large);
-			next_span = _rpmalloc_span_list_split(span, (uint32_t)release_count);
-			_rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size);
-			_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span->list_size);
-			_rpmalloc_global_cache_insert_span_list(span);
-			span = next_span;
-		}
+		_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
+		_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
 #else
-		if (span)
-			_rpmalloc_span_list_unmap_all(span);
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
 #endif
-		heap->span_cache[iclass] = 0;
+		span_cache->count = 0;
 	}
 #endif