Improve cache use
diff --git a/CHANGELOG b/CHANGELOG
index 1a93962..b7681db 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,17 @@
 Fixed an issue where calling _exit might hang the main thread cleanup in rpmalloc if another
 worker thread was terminated while holding exclusive access to the global cache.
 
+Improved caches to prioritize master spans in a chunk to avoid leaving master spans mapped due
+to remaining subspans in caches.
+
+Improved cache reuse by allowing large blocks to use caches from slightly larger cache classes.
+
+Fixed an issue where thread heap statistics would go out of sync when a free span was deferred
+to another thread heap.
+
+API breaking change - added a flag to rpmalloc_thread_finalize to avoid releasing thread caches.
+Pass a nonzero value to retain the old behaviour of releasing thread caches to the global cache.
+
 
 1.4.1
 
diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c
index ef549c6..005b34d 100644
--- a/rpmalloc/malloc.c
+++ b/rpmalloc/malloc.c
@@ -292,7 +292,7 @@
 	else if (reason == DLL_THREAD_ATTACH)
 		rpmalloc_thread_initialize();
 	else if (reason == DLL_THREAD_DETACH)
-		rpmalloc_thread_finalize();
+		rpmalloc_thread_finalize(1);
 	return TRUE;
 }
 
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index 2745e2b..8be3c80 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -159,7 +159,7 @@
 #    define MAP_HUGETLB MAP_ALIGNED_SUPER
 #  endif
 #  ifdef __sun
-   extern int madvise(caddr_t, size_t, int);
+extern int madvise(caddr_t, size_t, int);
 #  endif
 #  ifndef MAP_UNINITIALIZED
 #    define MAP_UNINITIALIZED 0
@@ -353,6 +353,8 @@
 #define SPAN_FLAG_SUBSPAN 2U
 //! Flag indicating span has blocks with increased alignment
 #define SPAN_FLAG_ALIGNED_BLOCKS 4U
+//! Flag indicating an unmapped master span
+#define SPAN_FLAG_UNMAPPED_MASTER 8U
 
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
 struct span_use_t {
@@ -361,6 +363,8 @@
 	//! High water mark of spans used
 	atomic32_t high;
 #if ENABLE_STATISTICS
+	//! Number of spans in deferred list
+	atomic32_t spans_deferred;
 	//! Number of spans transitioned to global cache
 	atomic32_t spans_to_global;
 	//! Number of spans transitioned from global cache
@@ -626,6 +630,10 @@
 static heap_t* _memory_first_class_orphan_heaps;
 #endif
 #if ENABLE_STATISTICS
+//! Allocations counter
+static atomic64_t _allocation_counter;
+//! Deallocations counter
+static atomic64_t _deallocation_counter;
 //! Active heap count
 static atomic32_t _memory_active_heaps;
 //! Number of currently mapped memory pages
@@ -634,6 +642,8 @@
 static int32_t _mapped_pages_peak;
 //! Number of mapped master spans
 static atomic32_t _master_spans;
+//! Number of unmapped dangling master spans
+static atomic32_t _unmapped_master_spans;
 //! Number of currently unused spans
 static atomic32_t _reserved_spans;
 //! Running counter of total number of mapped memory pages since start
@@ -780,7 +790,7 @@
 		return;
 #endif
 	if (value)
-		rpmalloc_thread_finalize();
+		rpmalloc_thread_finalize(1);
 }
 #endif
 
@@ -1142,7 +1152,8 @@
 	} else {
 		//Special double flag to denote an unmapped master
 		//It must be kept in memory since span header must be used
-		span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN;
+		span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER;
+		_rpmalloc_stat_add(&_unmapped_master_spans, 1);
 	}
 
 	if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) {
@@ -1153,6 +1164,7 @@
 			unmap_count = master->total_spans;
 		_rpmalloc_stat_sub(&_reserved_spans, unmap_count);
 		_rpmalloc_stat_sub(&_master_spans, 1);
+		_rpmalloc_stat_sub(&_unmapped_master_spans, 1);
 		_rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size);
 	}
 }
@@ -1357,29 +1369,72 @@
 	}
 	atomic_store32_release(&cache->lock, 0);
 
-	for (size_t ispan = insert_count; ispan < count; ++ispan)
-		_rpmalloc_span_unmap(span[ispan]);
+	span_t* keep = 0;
+	for (size_t ispan = insert_count; ispan < count; ++ispan) {
+		span_t* current_span = span[ispan];
+		// Keep master spans that have remaining subspans to avoid dangling them
+		if ((current_span->flags & SPAN_FLAG_MASTER) &&
+		    (atomic_load32(&current_span->remaining_spans) > (int32_t)current_span->span_count)) {
+			current_span->next = keep;
+			keep = current_span;
+		} else {
+			_rpmalloc_span_unmap(current_span);
+		}
+	}
+
+	if (keep) {
+		while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+			_rpmalloc_spin();
+
+		size_t islot = 0;
+		while (keep) {
+			for (; islot < cache->count; ++islot) {
+				span_t* current_span = cache->span[islot];
+				if (!(current_span->flags & SPAN_FLAG_MASTER) || ((current_span->flags & SPAN_FLAG_MASTER) &&
+				    (atomic_load32(&current_span->remaining_spans) <= (int32_t)current_span->span_count))) {
+					_rpmalloc_span_unmap(current_span);
+					cache->span[islot] = keep;
+					break;
+				}
+			}
+			if (islot == cache->count)
+				break;
+			keep = keep->next;
+		}
+
+		while (keep) {
+			span_t* next_span = keep->next;
+			keep->next = cache->overflow;
+			cache->overflow = keep;
+			keep = next_span;
+		}
+
+		atomic_store32_release(&cache->lock, 0);
+	}
 }
 
 static size_t
 _rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) {
 	global_cache_t* cache = &_memory_span_cache[span_count - 1];
 
-	size_t extract_count = count;
+	size_t extract_count = 0;
 	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
 		_rpmalloc_spin();
 
-	if (extract_count > cache->count)
-		extract_count = cache->count;
+	size_t want = count - extract_count;
+	if (want > cache->count)
+		want = cache->count;
 
-	memcpy(span, cache->span + (cache->count - extract_count), sizeof(span_t*) * extract_count);
-	cache->count -= (uint32_t)extract_count;
+	memcpy(span + extract_count, cache->span + (cache->count - want), sizeof(span_t*) * want);
+	cache->count -= (uint32_t)want;
+	extract_count += want;
 
 	while ((extract_count < count) && cache->overflow) {
 		span_t* current_span = cache->overflow;
 		span[extract_count++] = current_span;
 		cache->overflow = current_span->next;
 	}
+
 	atomic_store32_release(&cache->lock, 0);
 
 	return extract_count;
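
The retention rule added to the global cache insert path above reduces to a single predicate;
below is a simplified sketch of it, using an illustrative struct in place of the real span_t
(remaining_spans is shown as a plain int32_t rather than the atomic type rpmalloc uses):

#include <stdint.h>

#define SKETCH_FLAG_MASTER 1U

/* Illustrative stand-in for span_t. */
typedef struct sketch_span {
	uint32_t flags;           /* SKETCH_FLAG_MASTER and friends */
	uint32_t span_count;      /* number of spans owned by this span */
	int32_t  remaining_spans; /* master only: spans not yet returned */
} sketch_span_t;

/* A cached span may be unmapped if it is not a master span, or if it is a
   master whose subspans have all been returned. A master with outstanding
   subspans is kept in the cache (or its overflow list) instead, so the
   mapping can be reused rather than left dangling until the last subspan
   is freed. */
static int
sketch_span_safe_to_unmap(const sketch_span_t* span) {
	if (!(span->flags & SKETCH_FLAG_MASTER))
		return 1;
	return span->remaining_spans <= (int32_t)span->span_count;
}
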
@@ -1413,16 +1468,16 @@
 		if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
 			assert(heap->full_span_count);
 			--heap->full_span_count;
+			_rpmalloc_stat_dec(&heap->span_use[0].spans_deferred);
 #if RPMALLOC_FIRST_CLASS_HEAPS
 			_rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span);
 #endif
-			if (single_span && !*single_span) {
+			_rpmalloc_stat_dec(&heap->span_use[0].current);
+			_rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current);
+			if (single_span && !*single_span)
 				*single_span = span;
-			} else {
-				_rpmalloc_stat_dec(&heap->span_use[0].current);
-				_rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current);
+			else
 				_rpmalloc_heap_cache_insert(heap, span);
-			}
 		} else {
 			if (span->size_class == SIZE_CLASS_HUGE) {
 				_rpmalloc_deallocate_huge(span);
@@ -1434,12 +1489,12 @@
 				_rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span);
 #endif
 				uint32_t idx = span->span_count - 1;
-				if (!idx && single_span && !*single_span) {
+				_rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred);
+				_rpmalloc_stat_dec(&heap->span_use[idx].current);
+				if (!idx && single_span && !*single_span)
 					*single_span = span;
-				} else {
-					_rpmalloc_stat_dec(&heap->span_use[idx].current);
+				else
 					_rpmalloc_heap_cache_insert(heap, span);
-				}
 			}
 		}
 		span = next_span;
@@ -1631,10 +1686,11 @@
 	return 0;
 }
 
-//! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory
-static span_t*
-_rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) {
-	span_t* span;
+static void
+_rpmalloc_inc_span_statistics(heap_t* heap, size_t span_count, uint32_t class_idx) {
+	(void)sizeof(heap);
+	(void)sizeof(span_count);
+	(void)sizeof(class_idx);
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
 	uint32_t idx = (uint32_t)span_count - 1;
 	uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current);
@@ -1642,6 +1698,12 @@
 		atomic_store32(&heap->span_use[idx].high, (int32_t)current_count);
 	_rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak);
 #endif
+}
+
+//! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory
+static span_t*
+_rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) {
+	span_t* span;
 #if ENABLE_THREAD_CACHE
 	if (class_idx < SIZE_CLASS_COUNT) {
 		if (heap->size_class[class_idx].cache) {
@@ -1650,34 +1712,48 @@
 			if (heap->span_cache.count)
 				new_cache = heap->span_cache.span[--heap->span_cache.count];
 			heap->size_class[class_idx].cache = new_cache;
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
 			return span;
 		}
 	}
 #else
 	(void)sizeof(class_idx);
 #endif
-	span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
-	if (EXPECTED(span != 0)) {
-		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
-		return span;
-	}
-	span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
-	if (EXPECTED(span != 0)) {
-		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
-		return span;
-	}
-	span = _rpmalloc_heap_reserved_extract(heap, span_count);
-	if (EXPECTED(span != 0)) {
-		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
-		return span;
-	}
-	span = _rpmalloc_heap_global_cache_extract(heap, span_count);
-	if (EXPECTED(span != 0)) {
-		_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
-		return span;
-	}
+	// Allow 50% overhead to increase cache hits
+	size_t base_span_count = span_count;
+	size_t limit_span_count = (span_count > 2) ? (span_count + (span_count >> 1)) : span_count;
+	if (limit_span_count > LARGE_CLASS_COUNT)
+		limit_span_count = LARGE_CLASS_COUNT;
+	do {
+		span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		span = _rpmalloc_heap_reserved_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		span = _rpmalloc_heap_global_cache_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		++span_count;
+	} while (span_count <= limit_span_count);
 	//Final fallback, map in more virtual memory
-	span = _rpmalloc_span_map(heap, span_count);
+	span = _rpmalloc_span_map(heap, base_span_count);
+	_rpmalloc_inc_span_statistics(heap, base_span_count, class_idx);
 	_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
 	return span;
 }
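
The widened lookup above only probes slightly larger cache classes before giving up and mapping
fresh memory. A small sketch of the 50% window, with an illustrative value standing in for the
real LARGE_CLASS_COUNT constant:

#include <stddef.h>

/* Illustrative cap; the real constant is defined in rpmalloc.c. */
#define SKETCH_LARGE_CLASS_COUNT 63

/* For a request of span_count spans, cache classes up to the returned value
   are probed before new memory is mapped. Requests of 1 or 2 spans are not
   widened at all. */
static size_t
sketch_limit_span_count(size_t span_count) {
	size_t limit = (span_count > 2) ? (span_count + (span_count >> 1)) : span_count;
	if (limit > SKETCH_LARGE_CLASS_COUNT)
		limit = SKETCH_LARGE_CLASS_COUNT;
	return limit;
}

/* Example: a 6-span request also probes the 7-, 8- and 9-span caches; if all
   of them miss, the fallback still maps exactly 6 fresh spans. */
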
@@ -1806,41 +1882,44 @@
 	if (!heap)
 		heap = _rpmalloc_heap_allocate_new();
 	atomic_store32_release(&_memory_global_lock, 0);
+	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
 	return heap;
 }
 
 static void
-_rpmalloc_heap_release(void* heapptr, int first_class) {
+_rpmalloc_heap_release(void* heapptr, int first_class, int release_cache) {
 	heap_t* heap = (heap_t*)heapptr;
 	if (!heap)
 		return;
 	//Release thread cache spans back to global cache
 	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
+	if (release_cache || heap->finalize) {
 #if ENABLE_THREAD_CACHE
-	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		span_cache_t* span_cache;
-		if (!iclass)
-			span_cache = &heap->span_cache;
-		else
-			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
-		if (!span_cache->count)
-			continue;
+		for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+			span_cache_t* span_cache;
+			if (!iclass)
+				span_cache = &heap->span_cache;
+			else
+				span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+			if (!span_cache->count)
+				continue;
 #if ENABLE_GLOBAL_CACHE
-		if (heap->finalize) {
+			if (heap->finalize) {
+				for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+					_rpmalloc_span_unmap(span_cache->span[ispan]);
+			} else {
+				_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
+				_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
+				_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
+			}
+#else
 			for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
 				_rpmalloc_span_unmap(span_cache->span[ispan]);
-		} else {
-			_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
-			_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
-			_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
+#endif
+			span_cache->count = 0;
 		}
-#else
-		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
-			_rpmalloc_span_unmap(span_cache->span[ispan]);
 #endif
-		span_cache->count = 0;
 	}
-#endif
 
 	if (get_thread_heap_raw() == heap)
 		set_thread_heap(0);
@@ -1861,8 +1940,8 @@
 }
 
 static void
-_rpmalloc_heap_release_raw(void* heapptr) {
-	_rpmalloc_heap_release(heapptr, 0);
+_rpmalloc_heap_release_raw(void* heapptr, int release_cache) {
+	_rpmalloc_heap_release(heapptr, 0, release_cache);
 }
 
 static void
@@ -2024,7 +2103,7 @@
 		return span;
 
 	//Mark span as owned by this heap and set base data
-	assert(span->span_count == span_count);
+	assert(span->span_count >= span_count);
 	span->size_class = SIZE_CLASS_LARGE;
 	span->heap = heap;
 
@@ -2040,7 +2119,6 @@
 static void*
 _rpmalloc_allocate_huge(heap_t* heap, size_t size) {
 	assert(heap);
-	_rpmalloc_heap_cache_adopt_deferred(heap, 0);
 	size += SPAN_HEADER_SIZE;
 	size_t num_pages = size >> _memory_page_size_shift;
 	if (size & (_memory_page_size - 1))
@@ -2068,6 +2146,9 @@
 //! Allocate a block of the given size
 static void*
 _rpmalloc_allocate(heap_t* heap, size_t size) {
+#if ENABLE_STATISTICS
+	atomic_add64(&_allocation_counter, 1);
+#endif
 	if (EXPECTED(size <= SMALL_SIZE_LIMIT))
 		return _rpmalloc_allocate_small(heap, size);
 	else if (size <= _memory_medium_size_limit)
@@ -2190,6 +2271,10 @@
 #endif
 	++heap->full_span_count;
 
+#if ENABLE_STATISTICS
+	atomic_add64(&_allocation_counter, 1);
+#endif
+
 	return ptr;
 }
 
@@ -2225,6 +2310,8 @@
 
 static void
 _rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) {
+	if (span->size_class != SIZE_CLASS_HUGE)
+		_rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred);
 	//This list does not need ABA protection, no mutable side state
 	do {
 		span->free_list = (void*)atomic_load_ptr(&heap->span_free_deferred);
@@ -2348,6 +2435,9 @@
 //! Deallocate the given block
 static void
 _rpmalloc_deallocate(void* p) {
+#if ENABLE_STATISTICS
+	atomic_add64(&_deallocation_counter, 1);
+#endif
 	//Grab the span (always at start of span, using span alignment)
 	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
 	if (UNEXPECTED(!span))
@@ -2360,7 +2450,6 @@
 		_rpmalloc_deallocate_huge(span);
 }
 
-
 ////////////
 ///
 /// Reallocation entry points
@@ -2731,7 +2820,7 @@
 //! Finalize the allocator
 void
 rpmalloc_finalize(void) {
-	rpmalloc_thread_finalize();
+	rpmalloc_thread_finalize(1);
 	//rpmalloc_dump_statistics(stdout);
 
 	if (_memory_global_reserve) {
@@ -2793,10 +2882,10 @@
 
 //! Finalize thread, orphan heap
 void
-rpmalloc_thread_finalize(void) {
+rpmalloc_thread_finalize(int release_caches) {
 	heap_t* heap = get_thread_heap_raw();
 	if (heap)
-		_rpmalloc_heap_release_raw(heap);
+		_rpmalloc_heap_release_raw(heap, release_caches);
 	set_thread_heap(0);
 #if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
 	FlsSetValue(fls_key, 0);
@@ -3047,13 +3136,14 @@
 			((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved) * _memory_span_size) / (size_t)(1024 * 1024),
 			atomic_load32(&heap->size_class_use[iclass].spans_map_calls));
 	}
-	fprintf(file, "Spans  Current     Peak  PeakMiB  Cached  ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB  MmapCalls\n");
+	fprintf(file, "Spans  Current     Peak Deferred  PeakMiB  Cached  ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB  MmapCalls\n");
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
 		if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls))
 			continue;
-		fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1),
+		fprintf(file, "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1),
 			atomic_load32(&heap->span_use[iclass].current),
 			atomic_load32(&heap->span_use[iclass].high),
+			atomic_load32(&heap->span_use[iclass].spans_deferred),
 			((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
 #if ENABLE_THREAD_CACHE
 			(unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count),
@@ -3068,6 +3158,7 @@
 			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
 			atomic_load32(&heap->span_use[iclass].spans_map_calls));
 	}
+	fprintf(file, "Full spans: %zu\n", heap->full_span_count);
 	fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n");
 	fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024));
 }
@@ -3078,7 +3169,7 @@
 rpmalloc_dump_statistics(void* file) {
 #if ENABLE_STATISTICS
 	//If you hit this assert, you still have active threads or forgot to finalize some thread(s)
-	assert(atomic_load32(&_memory_active_heaps) == 0);
+	//assert(atomic_load32(&_memory_active_heaps) == 0);
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
 		heap_t* heap = _memory_heaps[list_idx];
 		while (heap) {
@@ -3107,6 +3198,20 @@
 	fprintf(file, "HugeCurrentMiB HugePeakMiB\n");
 	fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024));
 
+	size_t global_cache = 0;
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		global_cache_t* cache = _memory_span_cache + iclass;
+		global_cache += (size_t)cache->count * (iclass + 1) * _memory_span_size;
+
+		span_t* span = cache->overflow;
+		while (span) {
+			global_cache += (iclass + 1) * _memory_span_size;
+			span = span->next;
+		}
+	}
+	fprintf(file, "GlobalCacheMiB\n");
+	fprintf(file, "%14zu\n", global_cache / (size_t)(1024 * 1024));
+
 	size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size;
 	size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size;
 	size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size;
@@ -3123,9 +3228,17 @@
 		reserved_total / (size_t)(1024 * 1024));
 
 	fprintf(file, "\n");
-#else
-	(void)sizeof(file);
+#if 0
+	int64_t allocated = atomic_load64(&_allocation_counter);
+	int64_t deallocated = atomic_load64(&_deallocation_counter);
+	fprintf(file, "Allocation count: %lli\n", allocated);
+	fprintf(file, "Deallocation count: %lli\n", deallocated);
+	fprintf(file, "Current allocations: %lli\n", (allocated - deallocated));
+	fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans));
+	fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans));
 #endif
+#endif
+	(void)sizeof(file);
 }
 
 #if RPMALLOC_FIRST_CLASS_HEAPS
@@ -3145,7 +3258,7 @@
 extern inline void
 rpmalloc_heap_release(rpmalloc_heap_t* heap) {
 	if (heap)
-		_rpmalloc_heap_release(heap, 1);
+		_rpmalloc_heap_release(heap, 1, 1);
 }
 
 extern inline RPMALLOC_ALLOCATOR void*
diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h
index 40d17ac..c74bc90 100644
--- a/rpmalloc/rpmalloc.h
+++ b/rpmalloc/rpmalloc.h
@@ -200,7 +200,7 @@
 
 //! Finalize allocator for calling thread
 RPMALLOC_EXPORT void
-rpmalloc_thread_finalize(void);
+rpmalloc_thread_finalize(int release_caches);
 
 //! Perform deferred deallocations pending for the calling thread heap
 RPMALLOC_EXPORT void
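
For callers updating across the API break above, the only required change is the new argument
to rpmalloc_thread_finalize. A minimal caller-side sketch (the worker function is hypothetical;
the calls and the meaning of the flag come from this patch):

#include "rpmalloc.h"

/* Hypothetical worker entry point showing the updated call, assuming
   rpmalloc_initialize() was already called by the main thread. Passing a
   nonzero flag keeps the old behaviour of releasing thread caches to the
   global cache on finalize; passing 0 skips that release. */
static void
worker_thread(void* arg) {
	(void)arg;
	rpmalloc_thread_initialize();

	void* block = rpmalloc(128);
	rpfree(block);

	/* Previously: rpmalloc_thread_finalize(); */
	rpmalloc_thread_finalize(1);
}
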
diff --git a/test/main.c b/test/main.c
index f8db4c7..012e80b 100644
--- a/test/main.c
+++ b/test/main.c
@@ -454,7 +454,7 @@
 	thread_sleep(1);
 
 	if (arg.init_fini_each_loop)
-		rpmalloc_thread_finalize();
+		rpmalloc_thread_finalize(1);
 
 	for (iloop = 0; iloop < arg.loops; ++iloop) {
 		if (arg.init_fini_each_loop)
@@ -504,7 +504,7 @@
 		}
 
 		if (arg.init_fini_each_loop)
-			rpmalloc_thread_finalize();
+			rpmalloc_thread_finalize(1);
 	}
 
 	if (arg.init_fini_each_loop)
@@ -513,7 +513,7 @@
 	rpfree(data);
 	rpfree(addr);
 
-	rpmalloc_thread_finalize();
+	rpmalloc_thread_finalize(1);
 
 end:
 	thread_exit((uintptr_t)ret);
@@ -676,7 +676,7 @@
 	}
 
 end:
-	rpmalloc_thread_finalize();
+	rpmalloc_thread_finalize(1);
 
 	thread_exit((uintptr_t)ret);
 }
@@ -777,12 +777,12 @@
 			rpfree(addr[ipass]);
 		}
 
-		rpmalloc_thread_finalize();
+		rpmalloc_thread_finalize(1);
 		thread_yield();
 	}
 
 end:
-	rpmalloc_thread_finalize();
+	rpmalloc_thread_finalize(1);
 	thread_exit((uintptr_t)ret);
 }