Merge develop for 1.4.3 release

diff --git a/CHANGELOG b/CHANGELOG
index cfc9e73..58d505c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,15 @@
+1.4.3
+
+Fixed an issue where certain combinations of memory page size and span map counts could cause
+a deadlock in the mapping of new memory pages.
+
+Tweaked cache levels and avoided setting spans as reserved in a heap when the heap already has
+spans in the thread cache, improving cache usage.
+
+Prefer madvise flags that more actively evict physical pages when partially unmapping
+span ranges on POSIX systems.
+
+
 1.4.2
 
 Fixed an issue where calling _exit might hang the main thread cleanup in rpmalloc if another
@@ -204,7 +216,7 @@
 
 Improve documentation and additional code comments
 
-Move benchmarks to separate repo, https://github.com/rampantpixels/rpmalloc-benchmark
+Move benchmarks to separate repo, https://github.com/mjansson/rpmalloc-benchmark
 
 
 1.0
diff --git a/README.md b/README.md
index fcda26b..ec8a843 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # rpmalloc - General Purpose Memory Allocator
 This library provides a public domain cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/mjansson/rpmalloc
 
-Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder))
+Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) - Discord server for discussions at https://discord.gg/M8BwTQrt6c
 
 Platforms currently supported:
 
diff --git a/build/ninja/msvc.py b/build/ninja/msvc.py
index 8288d94..9f9d13a 100644
--- a/build/ninja/msvc.py
+++ b/build/ninja/msvc.py
@@ -31,8 +31,8 @@
     self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags /DEBUG /NOLOGO /SUBSYSTEM:CONSOLE /DYNAMICBASE /NXCOMPAT /MANIFEST /MANIFESTUAC:\"level=\'asInvoker\' uiAccess=\'false\'\" /TLBID:1 /PDB:$pdbpath /OUT:$out $in $libs $archlibs $oslibs'
     self.dllcmd = self.linkcmd + ' /DLL'
 
-    self.cflags = ['/D', '"' + project.upper() + '_COMPILE=1"', '/D', '"_UNICODE"',  '/D', '"UNICODE"', '/Zi', '/Oi', '/Oy-', '/GS-', '/Gy-', '/Qpar-', '/fp:fast', '/fp:except-', '/Zc:forScope', '/Zc:wchar_t', '/GR-', '/openmp-']
-    self.cwarnflags = ['/W4', '/WX']
+    self.cflags = ['/D', '"' + project.upper() + '_COMPILE=1"', '/D', '"_UNICODE"',  '/D', '"UNICODE"', '/std:c17', '/Zi', '/Oi', '/Oy-', '/GS-', '/Gy-', '/Qpar-', '/fp:fast', '/fp:except-', '/Zc:forScope', '/Zc:wchar_t', '/GR-', '/openmp-']
+    self.cwarnflags = ['/W4', '/WX', '/wd4201'] #Ignore nameless union/struct which is allowed in C11
     self.cmoreflags = []
     self.arflags = ['/ignore:4221'] #Ignore empty object file warning]
     self.linkflags = ['/DEBUG']
@@ -138,10 +138,11 @@
           tools_list.sort(key=StrictVersion)
           self.toolchain = os.path.join(tools_basepath, tools_list[-1])
           self.toolchain_version = major_version + ".0"
+          break
 
       if self.toolchain == '':
         toolchain = ''
-        versions = ['16.0', '15.0', '14.0', '13.0', '12.0', '11.0', '10.0']
+        versions = ['17.0', '16.0', '15.0']
         keys = [
           'HKLM\\SOFTWARE\\Microsoft\\VisualStudio\\SxS\\VC7',
           'HKCU\\SOFTWARE\\Microsoft\\VisualStudio\\SxS\\VC7',
@@ -161,17 +162,18 @@
             except:
               continue
             if not toolchain == '':
-              if version == '15.0' or version == '16.0':
-                tools_basepath = os.path.join(toolchain, 'VC', 'Tools', 'MSVC')
-                tools_list = [item for item in os.listdir(tools_basepath) if os.path.isdir(os.path.join(tools_basepath, item))]
-                from distutils.version import StrictVersion
-                tools_list.sort(key=StrictVersion)
-                toolchain = os.path.join(tools_basepath, tools_list[-1])
+              tools_basepath = os.path.join(toolchain, 'VC', 'Tools', 'MSVC')
+              tools_list = [item for item in os.listdir(tools_basepath) if os.path.isdir(os.path.join(tools_basepath, item))]
+              from distutils.version import StrictVersion
+              tools_list.sort(key=StrictVersion)
+              toolchain = os.path.join(tools_basepath, tools_list[-1])
               self.toolchain = toolchain
               self.toolchain_version = version
               break
           if not self.toolchain == '':
             break
+    if self.toolchain == '':
+      raise Exception("Unable to locate any installed Visual Studio toolchain")
     self.includepaths += [os.path.join(self.toolchain, 'include')]
     if self.sdkpath == '':
       versions = ['v10.0', 'v8.1']
@@ -237,13 +239,10 @@
     return []
 
   def make_arch_toolchain_path(self, arch):
-    if self.toolchain_version == '15.0' or self.toolchain_version == '16.0':
-      if arch == 'x86-64':
-        return os.path.join(self.toolchain, 'bin', 'HostX64', 'x64\\')
-      elif arch == 'x86':
-        return os.path.join(self.toolchain, 'bin', 'HostX64', 'x86\\')
     if arch == 'x86-64':
-      return os.path.join(self.toolchain, 'bin', 'amd64\\')
+      return os.path.join(self.toolchain, 'bin', 'HostX64', 'x64\\')
+    elif arch == 'x86':
+      return os.path.join(self.toolchain, 'bin', 'HostX64', 'x86\\')
     return os.path.join(self.toolchain, 'bin\\')
 
   def make_carchflags(self, arch, targettype):
@@ -321,20 +320,14 @@
       libpaths += [os.path.join(libpath, self.libpath, config, arch) for libpath in extralibpaths]
     if self.sdkpath != '':
       if arch == 'x86':
-        if self.toolchain_version == '15.0' or self.toolchain_version == '16.0':
-          libpaths += [os.path.join(self.toolchain, 'lib', 'x86')]
-        else:
-          libpaths += [os.path.join(self.toolchain, 'lib')]
+        libpaths += [os.path.join(self.toolchain, 'lib', 'x86')]
         if self.sdkversion == 'v8.1':
           libpaths += [os.path.join( self.sdkpath, 'lib', 'winv6.3', 'um', 'x86')]
         if self.sdkversion == 'v10.0':
           libpaths += [os.path.join(self.sdkpath, 'lib', self.sdkversionpath, 'um', 'x86')]
           libpaths += [os.path.join(self.sdkpath, 'lib', self.sdkversionpath, 'ucrt', 'x86')]
       else:
-        if self.toolchain_version == '15.0' or self.toolchain_version == '16.0':
-          libpaths += [os.path.join( self.toolchain, 'lib', 'x64')]
-        else:
-          libpaths += [os.path.join( self.toolchain, 'lib', 'amd64')]
+        libpaths += [os.path.join( self.toolchain, 'lib', 'x64')]
         if self.sdkversion == 'v8.1':
           libpaths += [os.path.join( self.sdkpath, 'lib', 'winv6.3', 'um', 'x64')]
         if self.sdkversion == 'v10.0':
diff --git a/build/ninja/version.py b/build/ninja/version.py
index 1bf086a..78be895 100644
--- a/build/ninja/version.py
+++ b/build/ninja/version.py
@@ -15,7 +15,7 @@
   if sys.platform.startswith('win'):
     gitcmd = 'git.exe'
   try:
-    git_version = subprocess.check_output( [ gitcmd, 'describe', '--long' ], stderr = subprocess.STDOUT ).strip()
+    git_version = subprocess.check_output( [ gitcmd, 'describe', '--tags', '--long' ], stderr = subprocess.STDOUT ).strip()
     tokens = git_version.decode().split( '-' )
     version_numbers = tokens[0].split( '.' )
   except Exception:
diff --git a/build/ninja/vslocate.py b/build/ninja/vslocate.py
index 4ec7fcf..afa171a 100644
--- a/build/ninja/vslocate.py
+++ b/build/ninja/vslocate.py
@@ -88,8 +88,15 @@
         ctypes.POINTER(ctypes.POINTER(ISetupConfiguration)),
         ctypes.c_void_p)
 
+    installations = []
+    dll = None
+
     dll_path = os.path.expandvars("$ProgramData\\Microsoft\\VisualStudio\\Setup\\x64\\Microsoft.VisualStudio.Setup.Configuration.Native.dll")
-    dll = ctypes.WinDLL(dll_path)
+    try:
+        dll = ctypes.WinDLL(dll_path)
+    except OSError as e:
+        #print("Failed to load Visual Studio setup configuration DLL: " + str(e))
+        return installations
 
     params_get_setup_configuration = (1, "configuration", 0), (1, "reserved", 0),
 
@@ -98,8 +105,6 @@
     configuration = ctypes.POINTER(ISetupConfiguration)()
     reserved = ctypes.c_void_p(0)
 
-    installations = []
-
     result = get_setup_configuration(ctypes.byref(configuration), reserved)
     if result != 0:
         #print("Failed to get setup configuration: " + str(result))
@@ -110,7 +115,7 @@
     enum_setup_instances = ctypes.POINTER(IEnumSetupInstances)()
     result = enum_instances(configuration, ctypes.byref(enum_setup_instances))
     if result != 0:
-    	#print("Failed to enum setup instances: " + str(result))
+        #print("Failed to enum setup instances: " + str(result))
         return installations
 
 
diff --git a/configure.py b/configure.py
index 514189e..dc1b78e 100755
--- a/configure.py
+++ b/configure.py
@@ -9,7 +9,7 @@
 
 import generator
 
-generator = generator.Generator(project = 'rpmalloc', variables = [('bundleidentifier', 'com.rampantpixels.rpmalloc.$(binname)')])
+generator = generator.Generator(project = 'rpmalloc', variables = [('bundleidentifier', 'com.maniccoder.rpmalloc.$(binname)')])
 
 rpmalloc_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c'])
 rpmalloc_test_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc-test', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1', 'RPMALLOC_FIRST_CLASS_HEAPS=1', 'RPMALLOC_CONFIGURABLE=1']})
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index 5186f61..f061cb4 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -20,6 +20,9 @@
 #if defined(__clang__)
 #pragma clang diagnostic ignored "-Wunused-macros"
 #pragma clang diagnostic ignored "-Wunused-function"
+#if __has_warning("-Wreserved-identifier")
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
 #elif defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wunused-macros"
 #pragma GCC diagnostic ignored "-Wunused-function"
@@ -107,6 +110,7 @@
 
 /// Platform and arch specifics
 #if defined(_MSC_VER) && !defined(__clang__)
+#  pragma warning (disable: 5105)
 #  ifndef FORCEINLINE
 #    define FORCEINLINE inline __forceinline
 #  endif
@@ -320,11 +324,11 @@
 //! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two)
 #define SPAN_HEADER_SIZE          128
 //! Number of spans in thread cache
-#define MAX_THREAD_SPAN_CACHE     256
+#define MAX_THREAD_SPAN_CACHE     400
 //! Number of spans to transfer between thread and global cache
 #define THREAD_SPAN_CACHE_TRANSFER 64
 //! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2)
-#define MAX_THREAD_SPAN_LARGE_CACHE 64
+#define MAX_THREAD_SPAN_LARGE_CACHE 100
 //! Number of spans to transfer between thread and global cache for large spans
 #define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
 
@@ -570,6 +574,12 @@
 	atomic32_t lock;
 	//! Cache count
 	uint32_t count;
+#if ENABLE_STATISTICS
+	//! Insert count
+	size_t insert_count;
+	//! Extract count
+	size_t extract_count;
+#endif
 	//! Cached spans
 	span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE];
 	//! Unlimited cache overflow
@@ -614,10 +624,8 @@
 #endif
 //! Number of spans to map in each map call
 static size_t _memory_span_map_count;
-//! Number of spans to release from thread cache to global cache (single spans)
-static size_t _memory_span_release_count;
-//! Number of spans to release from thread cache to global cache (large multiple spans)
-static size_t _memory_span_release_count_large;
+//! Number of spans to keep reserved in each heap
+static size_t _memory_heap_reserve_count;
 //! Global size classes
 static size_class_t _memory_size_class[SIZE_CLASS_COUNT];
 //! Run-time size limit of medium blocks
@@ -661,8 +669,6 @@
 static atomic32_t _master_spans;
 //! Number of unmapped dangling master spans
 static atomic32_t _unmapped_master_spans;
-//! Number of currently unused spans
-static atomic32_t _reserved_spans;
 //! Running counter of total number of mapped memory pages since start
 static atomic32_t _mapped_total;
 //! Running counter of total number of unmapped memory pages since start
@@ -858,7 +864,12 @@
 	//Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed"
 	void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
 	if (!ptr) {
-		rpmalloc_assert(ptr, "Failed to map virtual memory block");
+		if (_memory_config.map_fail_callback) {
+			if (_memory_config.map_fail_callback(size + padding))
+				return _rpmalloc_mmap_os(size, offset);
+		} else {
+			rpmalloc_assert(ptr, "Failed to map virtual memory block");
+		}
 		return 0;
 	}
 #else
@@ -880,8 +891,12 @@
 	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
 #  endif
 	if ((ptr == MAP_FAILED) || !ptr) {
-		if (errno != ENOMEM)
+		if (_memory_config.map_fail_callback) {
+			if (_memory_config.map_fail_callback(size + padding))
+				return _rpmalloc_mmap_os(size, offset);
+		} else if (errno != ENOMEM) {
 			rpmalloc_assert((ptr != MAP_FAILED) && ptr, "Failed to map virtual memory block");
+		}
 		return 0;
 	}
 #endif
@@ -927,12 +942,13 @@
 		int ret;
 		while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && (errno == EAGAIN))
 			errno = 0;
-		if ((ret == -1) && (errno != 0))
-#elif defined(MADV_FREE)
-		if (madvise(address, size, MADV_FREE))
-#endif
-#if defined(MADV_DONTNEED)
+		if ((ret == -1) && (errno != 0)) {
+#elif defined(MADV_DONTNEED)
 		if (madvise(address, size, MADV_DONTNEED)) {
+#elif defined(MADV_PAGEOUT)
+		if (madvise(address, size, MADV_PAGEOUT)) {
+#elif defined(MADV_FREE)
+		if (madvise(address, size, MADV_FREE)) {
 #else
 		if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) {
 #endif
@@ -1084,7 +1100,6 @@
 	if (!span)
 		return 0;
 	_rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset);
-	_rpmalloc_stat_add(&_reserved_spans, aligned_span_count);
 	_rpmalloc_stat_inc(&_master_spans);
 	if (span_count <= LARGE_CLASS_COUNT)
 		_rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
@@ -1095,19 +1110,17 @@
 			_rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
 			_rpmalloc_heap_cache_insert(heap, heap->span_reserve);
 		}
-		if (reserved_count > DEFAULT_SPAN_MAP_COUNT) {
-			// If huge pages, make sure only one thread maps more memory to avoid bloat
-			while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
-				_rpmalloc_spin();
-			size_t remain_count = reserved_count - DEFAULT_SPAN_MAP_COUNT;
-			reserved_count = DEFAULT_SPAN_MAP_COUNT;
+		if (reserved_count > _memory_heap_reserve_count) {
+			// If huge pages or eager span map count, the global reserve spin lock is held by the caller, _rpmalloc_span_map
+			rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1, "Global spin lock not held as expected");
+			size_t remain_count = reserved_count - _memory_heap_reserve_count;
+			reserved_count = _memory_heap_reserve_count;
 			span_t* remain_span = (span_t*)pointer_offset(reserved_spans, reserved_count * _memory_span_size);
 			if (_memory_global_reserve) {
 				_rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, _memory_global_reserve, _memory_global_reserve_count);
 				_rpmalloc_span_unmap(_memory_global_reserve);
 			}
 			_rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
-			atomic_store32_release(&_memory_global_lock, 0);
 		}
 		_rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count);
 	}
@@ -1120,12 +1133,13 @@
 	if (span_count <= heap->spans_reserved)
 		return _rpmalloc_span_map_from_reserve(heap, span_count);
 	span_t* span = 0;
-	if (_memory_page_size > _memory_span_size) {
+	int use_global_reserve = (_memory_page_size > _memory_span_size) || (_memory_span_map_count > _memory_heap_reserve_count);
+	if (use_global_reserve) {
 		// If huge pages, make sure only one thread maps more memory to avoid bloat
 		while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
 			_rpmalloc_spin();
 		if (_memory_global_reserve_count >= span_count) {
-			size_t reserve_count = (!heap->spans_reserved ? DEFAULT_SPAN_MAP_COUNT : span_count);
+			size_t reserve_count = (!heap->spans_reserved ? _memory_heap_reserve_count : span_count);
 			if (_memory_global_reserve_count < reserve_count)
 				reserve_count = _memory_global_reserve_count;
 			span = _rpmalloc_global_get_reserved_spans(reserve_count);
@@ -1141,7 +1155,7 @@
 	}
 	if (!span)
 		span = _rpmalloc_span_map_aligned_count(heap, span_count);
-	if (_memory_page_size > _memory_span_size)
+	if (use_global_reserve)
 		atomic_store32_release(&_memory_global_lock, 0);
 	return span;
 }
@@ -1161,10 +1175,8 @@
 	if (!is_master) {
 		//Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master)
 		rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted");
-		if (_memory_span_size >= _memory_page_size) {
+		if (_memory_span_size >= _memory_page_size)
 			_rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0);
-			_rpmalloc_stat_sub(&_reserved_spans, span_count);
-		}
 	} else {
 		//Special double flag to denote an unmapped master
 		//It must be kept in memory since span header must be used
@@ -1178,7 +1190,6 @@
 		size_t unmap_count = master->span_count;
 		if (_memory_span_size < _memory_page_size)
 			unmap_count = master->total_spans;
-		_rpmalloc_stat_sub(&_reserved_spans, unmap_count);
 		_rpmalloc_stat_sub(&_master_spans, 1);
 		_rpmalloc_stat_sub(&_unmapped_master_spans, 1);
 		_rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size);
@@ -1190,6 +1201,7 @@
 _rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) {
 	rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted");
 	rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, "Invalid span size class");
+	rpmalloc_assert(span->span_count == 1, "Invalid span count");
 #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
 	atomic_decr32(&heap->span_use[0].current);
 #endif
@@ -1366,6 +1378,9 @@
 	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
 		_rpmalloc_spin();
 
+#if ENABLE_STATISTICS
+	cache->insert_count += count;
+#endif
 	if ((cache->count + insert_count) > cache_limit)
 		insert_count = cache_limit - cache->count;
 
@@ -1438,6 +1453,9 @@
 	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
 		_rpmalloc_spin();
 
+#if ENABLE_STATISTICS
+	cache->extract_count += count;
+#endif
 	size_t want = count - extract_count;
 	if (want > cache->count)
 		want = cache->count;
@@ -1452,6 +1470,12 @@
 		cache->overflow = current_span->next;
 	}
 
+#if ENABLE_ASSERTS
+	for (size_t ispan = 0; ispan < extract_count; ++ispan) {
+		assert(span[ispan]->span_count == span_count);
+	}
+#endif
+
 	atomic_store32_release(&cache->lock, 0);
 
 	return extract_count;
@@ -1830,7 +1854,6 @@
 			return 0;
 
 		// Master span will contain the heaps
-		_rpmalloc_stat_add(&_reserved_spans, span_count);
 		_rpmalloc_stat_inc(&_master_spans);
 		_rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset);
 	}
@@ -1856,7 +1879,7 @@
 	if (span_count > heap_span_count) {
 		// Cap reserved spans
 		size_t remain_count = span_count - heap_span_count;
-		size_t reserve_count = (remain_count > DEFAULT_SPAN_MAP_COUNT ? DEFAULT_SPAN_MAP_COUNT : remain_count);
+		size_t reserve_count = (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count : remain_count);
 		span_t* remain_span = (span_t*)pointer_offset(span, heap_span_count * _memory_span_size);
 		_rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count);
 
@@ -2392,7 +2415,7 @@
 		_rpmalloc_deallocate_defer_free_span(span->heap, span);
 		return;
 	}
-	rpmalloc_assert(span->heap->full_span_count, "Heap spanc counter corrupted");
+	rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
 	--span->heap->full_span_count;
 #if RPMALLOC_FIRST_CLASS_HEAPS
 	_rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
@@ -2404,7 +2427,12 @@
 #endif
 	heap_t* heap = span->heap;
 	rpmalloc_assert(heap, "No thread heap");
-	if ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved) {
+#if ENABLE_THREAD_CACHE
+	const int set_as_reserved = ((span->span_count > 1) && (heap->span_cache.count == 0) && !heap->finalize && !heap->spans_reserved);
+#else
+	const int set_as_reserved = ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved);
+#endif
+	if (set_as_reserved) {
 		heap->span_reserve = span;
 		heap->spans_reserved = span->span_count;
 		if (span->flags & SPAN_FLAG_MASTER) {
@@ -2651,23 +2679,26 @@
 		_memory_config.memory_unmap = _rpmalloc_unmap_os;
 	}
 
+#if PLATFORM_WINDOWS
+	SYSTEM_INFO system_info;
+	memset(&system_info, 0, sizeof(system_info));
+	GetSystemInfo(&system_info);
+	_memory_map_granularity = system_info.dwAllocationGranularity;
+#else
+	_memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE);
+#endif
+
 #if RPMALLOC_CONFIGURABLE
 	_memory_page_size = _memory_config.page_size;
 #else
 	_memory_page_size = 0;
 #endif
 	_memory_huge_pages = 0;
-	_memory_map_granularity = _memory_page_size;
 	if (!_memory_page_size) {
 #if PLATFORM_WINDOWS
-		SYSTEM_INFO system_info;
-		memset(&system_info, 0, sizeof(system_info));
-		GetSystemInfo(&system_info);
 		_memory_page_size = system_info.dwPageSize;
-		_memory_map_granularity = system_info.dwAllocationGranularity;
 #else
-		_memory_page_size = (size_t)sysconf(_SC_PAGESIZE);
-		_memory_map_granularity = _memory_page_size;
+		_memory_page_size = _memory_map_granularity;
 		if (_memory_config.enable_huge_pages) {
 #if defined(__linux__)
 			size_t huge_page_size = 0;
@@ -2722,18 +2753,18 @@
 				token_privileges.Privileges[0].Luid = luid;
 				token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
 				if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
-					DWORD err = GetLastError();
-					if (err == ERROR_SUCCESS) {
+					if (GetLastError() == ERROR_SUCCESS)
 						_memory_huge_pages = 1;
-						if (large_page_minimum > _memory_page_size)
-						 	_memory_page_size = large_page_minimum;
-						if (large_page_minimum > _memory_map_granularity)
-							_memory_map_granularity = large_page_minimum;
-					}
 				}
 			}
 			CloseHandle(token);
 		}
+		if (_memory_huge_pages) {
+			if (large_page_minimum > _memory_page_size)
+				_memory_page_size = large_page_minimum;
+			if (large_page_minimum > _memory_map_granularity)
+				_memory_map_granularity = large_page_minimum;
+		}
 	}
 #endif
 
@@ -2780,15 +2811,13 @@
 		_memory_span_map_count = (_memory_page_size / _memory_span_size);
 	if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size))
 		_memory_span_map_count = (_memory_page_size / _memory_span_size);
+	_memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) ? DEFAULT_SPAN_MAP_COUNT : _memory_span_map_count;
 
 	_memory_config.page_size = _memory_page_size;
 	_memory_config.span_size = _memory_span_size;
 	_memory_config.span_map_count = _memory_span_map_count;
 	_memory_config.enable_huge_pages = _memory_huge_pages;
 
-	_memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4);
-	_memory_span_release_count_large = (_memory_span_release_count > 8 ? (_memory_span_release_count / 4) : 2);
-
 #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
 	if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc))
 		return -1;
@@ -2827,7 +2856,6 @@
 	atomic_store32(&_mapped_pages, 0);
 	_mapped_pages_peak = 0;
 	atomic_store32(&_master_spans, 0);
-	atomic_store32(&_reserved_spans, 0);
 	atomic_store32(&_mapped_total, 0);
 	atomic_store32(&_unmapped_total, 0);
 	atomic_store32(&_mapped_pages_os, 0);
@@ -2883,7 +2911,6 @@
 #if ENABLE_STATISTICS
 	//If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code
 	rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected");
-	rpmalloc_assert(atomic_load32(&_reserved_spans) == 0, "Memory leak detected");
 	rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, "Memory leak detected");
 #endif
 
@@ -3221,34 +3248,33 @@
 	fprintf(file, "HugeCurrentMiB HugePeakMiB\n");
 	fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024));
 
-	size_t global_cache = 0;
+	fprintf(file, "GlobalCacheMiB\n");
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
 		global_cache_t* cache = _memory_span_cache + iclass;
-		global_cache += (size_t)cache->count * iclass * _memory_span_size;
+		size_t global_cache = (size_t)cache->count * iclass * _memory_span_size;
 
+		size_t global_overflow_cache = 0;
 		span_t* span = cache->overflow;
 		while (span) {
-			global_cache += iclass * _memory_span_size;
+			global_overflow_cache += iclass * _memory_span_size;
 			span = span->next;
 		}
+		if (global_cache || global_overflow_cache || cache->insert_count || cache->extract_count)
+			fprintf(file, "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", iclass + 1, global_cache / (size_t)(1024 * 1024), global_overflow_cache / (size_t)(1024 * 1024), cache->insert_count, cache->extract_count);
 	}
-	fprintf(file, "GlobalCacheMiB\n");
-	fprintf(file, "%14zu\n", global_cache / (size_t)(1024 * 1024));
 
 	size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size;
 	size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size;
 	size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size;
 	size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size;
 	size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size;
-	size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size;
-	fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n");
-	fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n",
+	fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n");
+	fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n",
 		mapped / (size_t)(1024 * 1024),
 		mapped_os / (size_t)(1024 * 1024),
 		mapped_peak / (size_t)(1024 * 1024),
 		mapped_total / (size_t)(1024 * 1024),
-		unmapped_total / (size_t)(1024 * 1024),
-		reserved_total / (size_t)(1024 * 1024));
+		unmapped_total / (size_t)(1024 * 1024));
 
 	fprintf(file, "\n");
 #if 0
diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h
index b1fa757..f3363f2 100644
--- a/rpmalloc/rpmalloc.h
+++ b/rpmalloc/rpmalloc.h
@@ -156,6 +156,12 @@
 	//! Called when an assert fails, if asserts are enabled. Will use the standard assert()
 	//  if this is not set.
 	void (*error_callback)(const char* message);
+	//! Called when a call to map memory pages fails (out of memory). If this callback is
+	//  not set or returns zero the library will return a null pointer in the allocation
+	//  call. If this callback returns non-zero the map call will be retried. The argument
+	//  passed is the number of bytes that was requested in the map call. Only used if
+	//  the default system memory map function is used (memory_map callback is not set).
+	int (*map_fail_callback)(size_t size);
 	//! Size of memory pages. The page size MUST be a power of two. All memory mapping
 	//  requests to memory_map will be made with size set to a multiple of the page size.
 	//  Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used.
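For reference, a minimal usage sketch of the new map_fail_callback hook, based only on the documented behavior above; the handler name, the retry-once policy, and the include path are illustrative assumptions, not part of this change:

#include <stdio.h>
#include <stdlib.h>
#include "rpmalloc.h"  // assumes rpmalloc/ is on the include path

// Hypothetical handler: log the failed request and allow a single retry.
// Returning non-zero asks rpmalloc to retry the map call; returning zero
// makes the allocation call return a null pointer, as documented above.
static int
map_fail_handler(size_t size) {
	static int retries = 0;
	fprintf(stderr, "rpmalloc: failed to map %zu bytes\n", size);
	// A real handler would release application-level caches here before retrying.
	return (retries++ == 0) ? 1 : 0;
}

int
main(void) {
	rpmalloc_config_t config = {0};
	config.map_fail_callback = map_fail_handler;
	if (rpmalloc_initialize_config(&config))
		return EXIT_FAILURE;

	void* block = rpmalloc(64 * 1024);
	rpfree(block);

	rpmalloc_finalize();
	return EXIT_SUCCESS;
}
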
diff --git a/test/main.c b/test/main.c
index 0a92ac7..a01e09c 100644
--- a/test/main.c
+++ b/test/main.c
@@ -2,6 +2,11 @@
 #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS)
 #  define _CRT_SECURE_NO_WARNINGS
 #endif
+#ifdef _MSC_VER
+#  if !defined(__clang__)
+#    pragma warning (disable: 5105)
+#  endif
+#endif
 #if defined(__clang__)
 #pragma clang diagnostic ignored "-Wnonportable-system-include-path"
 #endif
@@ -362,7 +367,7 @@
 	for (size_t iloop = 0; iloop < 8000; ++iloop) {
 		for (size_t iptr = 0; iptr < pointer_count; ++iptr) {
 			if (iloop)
-				rpfree(rprealloc(pointers[iptr], rand() % 4096));
+				rpfree(rprealloc(pointers[iptr], (size_t)rand() % 4096));
 			pointers[iptr] = rpaligned_alloc(alignments[(iptr + iloop) % 5], iloop + iptr);
 		}
 	}
@@ -787,15 +792,13 @@
 }
 
 static int
-test_threaded(void) {
+test_thread_implementation(void) {
 	uintptr_t thread[32];
 	uintptr_t threadres[32];
 	unsigned int i;
 	size_t num_alloc_threads;
 	allocator_thread_arg_t arg;
 
-	rpmalloc_initialize();
-
 	num_alloc_threads = _hardware_threads;
 	if (num_alloc_threads < 2)
 		num_alloc_threads = 2;
@@ -846,11 +849,23 @@
 			return -1;
 	}
 
-	printf("Memory threaded tests passed\n");
-
 	return 0;
 }
 
+static int
+test_threaded(void) {
+	rpmalloc_initialize();
+
+	int ret = test_thread_implementation();
+
+	rpmalloc_finalize();
+
+	if (ret == 0)
+		printf("Memory threaded tests passed\n");
+
+	return ret;
+}
+
 static int 
 test_crossthread(void) {
 	uintptr_t thread[32];
@@ -917,10 +932,10 @@
 	for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread)
 		rpfree(arg[ithread].pointers);
 
-	rpmalloc_finalize();
-
 	printf("Memory cross thread free tests passed\n");
 
+	rpmalloc_finalize();
+
 	return 0;
 }
 
@@ -1091,6 +1106,24 @@
 	return 0;
 }
 
+static int
+test_large_pages(void) {
+	rpmalloc_config_t config = {0};
+	config.page_size = 16 * 1024 * 1024;
+	config.span_map_count = 16;
+
+	rpmalloc_initialize_config(&config);
+
+	int ret = test_thread_implementation();
+
+	rpmalloc_finalize();
+
+	if (ret == 0)
+		printf("Large page config test passed\n");
+
+	return ret;
+}
+
 int
 test_run(int argc, char** argv) {
 	(void)sizeof(argc);
@@ -1110,6 +1143,8 @@
 		return -1;
 	if (test_first_class_heaps())
 		return -1;
+	if (test_large_pages())
+		return -1;
 	if (test_error())
 		return -1;
 	printf("All tests passed\n");
diff --git a/test/thread.c b/test/thread.c
index ff4758b..9d047e9 100644
--- a/test/thread.c
+++ b/test/thread.c
@@ -3,6 +3,9 @@
 #include <errno.h>
 
 #ifdef _MSC_VER
+#  if !defined(__clang__)
+#    pragma warning (disable: 5105)
+#  endif
 #  define ATTRIBUTE_NORETURN
 #else
 #  define ATTRIBUTE_NORETURN __attribute__((noreturn))