Minor optimizations and fix map padding on Windows (#174)

Mark the overridden malloc/calloc/realloc with __declspec(restrict) on MSVC,
replace the acquire CAS spin on span->free_list_deferred with a single atomic
exchange, raise LARGE_CLASS_COUNT from 32 to 63, decide in-place shrinking of
large blocks by byte size instead of span count, and only merge medium size
classes. The unmap path now adds the one-span padding to the release size
whenever the mapping was actually padded (span size larger than map
granularity) instead of only on POSIX, which fixes the padded case on Windows.
The usable-size tests query the small block granularity at runtime instead of
hard-coding 16.

diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c
index 1c95fbe..def584f 100644
--- a/rpmalloc/malloc.c
+++ b/rpmalloc/malloc.c
@@ -62,15 +62,18 @@
 #undef malloc
 #undef free
 #undef calloc
+#define RPMALLOC_RESTRICT __declspec(restrict)
+#else
+#define RPMALLOC_RESTRICT
 #endif
 
 #if ENABLE_OVERRIDE
 
 #if USE_IMPLEMENT
 
-extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
-extern inline void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
-extern inline void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
 extern inline void* RPMALLOC_CDECL reallocf(void* ptr, size_t size) { return rprealloc(ptr, size); }
 extern inline void* RPMALLOC_CDECL aligned_alloc(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); }
 extern inline void* RPMALLOC_CDECL memalign(size_t alignment, size_t size) { return rpmemalign(alignment, size); }
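
As context for the new RPMALLOC_RESTRICT macro: MSVC's __declspec(restrict) promises that the returned pointer does not alias any other pointer visible at the call site, which lets the optimizer keep loads and stores around the call. A minimal sketch of the same guarded-macro pattern outside rpmalloc (the my_alloc name and the GCC/Clang branch are illustrative, not part of this patch):

#include <stddef.h>

#if defined(_MSC_VER)
#  define ALLOC_RESTRICT __declspec(restrict)     /* MSVC: result does not alias */
#elif defined(__GNUC__)
#  define ALLOC_RESTRICT __attribute__((malloc))  /* roughly comparable hint on GCC/Clang */
#else
#  define ALLOC_RESTRICT
#endif

ALLOC_RESTRICT void* my_alloc(size_t size);       /* hypothetical allocator entry point */
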
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index f6f172e..15e666c 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -216,8 +216,8 @@
 static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return (void*)*src; }
 static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; }
 static FORCEINLINE void    atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; }
+static FORCEINLINE void*   atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return (void*)InterlockedExchangePointer((void* volatile*)dst, val); }
 static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; }
-static FORCEINLINE int     atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_cas_ptr(dst, val, ref); }
 
 #define EXPECTED(x) (x)
 #define UNEXPECTED(x) (x)
@@ -242,8 +242,8 @@
 static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
 static FORCEINLINE void    atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); }
+static FORCEINLINE void*   atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return atomic_exchange_explicit(dst, val, memory_order_acquire); }
 static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_relaxed, memory_order_relaxed); }
-static FORCEINLINE int     atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); }
 
 #define EXPECTED(x) __builtin_expect((x), 1)
 #define UNEXPECTED(x) __builtin_expect((x), 0)
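
A standalone illustration of the new exchange-acquire primitive in plain C11 (the names here are illustrative, not rpmalloc's): the exchange returns whatever pointer another thread published with a release store and installs a new value in the same atomic step.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void*) slot;

int main(void) {
	int payload = 42;
	/* producer side: publish with release so the payload write is visible */
	atomic_store_explicit(&slot, &payload, memory_order_release);
	/* consumer side: take ownership of the published pointer in one step */
	void* taken = atomic_exchange_explicit(&slot, (void*)0, memory_order_acquire);
	printf("%d\n", *(int*)taken);
	return 0;
}
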
@@ -305,7 +305,7 @@
 //! Total number of small + medium size classes
 #define SIZE_CLASS_COUNT          (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
 //! Number of large block size classes
-#define LARGE_CLASS_COUNT         32
+#define LARGE_CLASS_COUNT         63
 //! Maximum size of a medium block
 #define MEDIUM_SIZE_LIMIT         (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
 //! Maximum size of a large block
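
Rough effect of the new count, assuming the default 64 KiB span size and that the large block limit is LARGE_CLASS_COUNT spans minus the span header (the header size below is only a placeholder): requests up to roughly 4 MiB can now be served as large blocks instead of falling through to a directly mapped huge block at roughly 2 MiB.

#include <stdio.h>

int main(void) {
	const unsigned long span_size = 64 * 1024;   /* default span size, assumed */
	const unsigned long span_header = 128;       /* placeholder header size */
	printf("old large limit: %lu bytes\n", 32UL * span_size - span_header);  /* ~2 MiB */
	printf("new large limit: %lu bytes\n", 63UL * span_size - span_header);  /* ~4 MiB */
	return 0;
}
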
@@ -787,10 +787,10 @@
 	if (release && offset) {
 		offset <<= 3;
 		address = pointer_offset(address, -(int32_t)offset);
-#if PLATFORM_POSIX
-		//Padding is always one span size
-		release += _memory_span_size;
-#endif
+		if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) {
+			//Padding is always one span size
+			release += _memory_span_size;
+		}
 	}
 #if !DISABLE_UNMAP
 #if PLATFORM_WINDOWS
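
The release-size fix above mirrors the padding added on the map side. A hypothetical sketch of that map-side arithmetic (function and variable names are illustrative, not rpmalloc's): when spans must start on a span-size boundary but the OS mapping is only granularity-aligned, one extra span is mapped so the start can be rounded up, and that extra span has to be counted again in the release size on every platform, not just POSIX.

#include <stdint.h>
#include <stddef.h>

void* os_map(size_t size);   /* hypothetical raw mapper */

static void* map_span_aligned(size_t size, size_t span_size, size_t granularity,
                              size_t* offset, size_t* mapped_size) {
	size_t padding = (span_size > granularity) ? span_size : 0;
	char* raw = (char*)os_map(size + padding);
	/* round the start up to a span boundary (span_size assumed to be a power of two) */
	char* aligned = (char*)(((uintptr_t)raw + (span_size - 1)) & ~((uintptr_t)span_size - 1));
	*offset = (size_t)(aligned - raw);   /* stored so the unmap can rebuild the raw address */
	*mapped_size = size + padding;       /* the size the unmap must eventually release */
	return aligned;
}
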
@@ -1147,8 +1147,8 @@
 	// We need acquire semantics on the CAS operation since we are interested in the list size
 	// Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency
 	do {
-		span->free_list = atomic_load_ptr(&span->free_list_deferred);
-	} while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, span->free_list));
+		span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+	} while (span->free_list == INVALID_POINTER);
 	span->used_count -= span->list_size;
 	span->list_size = 0;
 	atomic_store_ptr_release(&span->free_list_deferred, 0);
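
A standalone sketch of the pattern this loop switches to (names are illustrative, not rpmalloc's): a single exchange with acquire ordering both claims the whole deferred list and marks the slot busy, so the previous load plus compare-and-swap retry is no longer needed; the loop only repeats while another thread currently holds the slot.

#include <stdatomic.h>
#include <stdint.h>

#define SLOT_BUSY ((void*)((uintptr_t)-1))   /* stand-in for INVALID_POINTER */

static void* adopt_deferred_list(_Atomic(void*)* deferred) {
	void* list;
	do {
		/* acquire pairs with the release store that published the list */
		list = atomic_exchange_explicit(deferred, SLOT_BUSY, memory_order_acquire);
	} while (list == SLOT_BUSY);   /* retry only while another thread owns the slot */
	/* ... consume the adopted list, then reopen the slot ... */
	atomic_store_explicit(deferred, (void*)0, memory_order_release);
	return list;
}
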
@@ -2045,9 +2045,9 @@
 	// guarantee the list_size variable validity + release semantics on pointer store
 	void* free_list;
 	do {
-		free_list = atomic_load_ptr(&span->free_list_deferred);
-		*((void**)block) = free_list;
-	} while ((free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, free_list));
+		free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+	} while (free_list == INVALID_POINTER);
+	*((void**)block) = free_list;
 	uint32_t free_count = ++span->list_size;
 	atomic_store_ptr_release(&span->free_list_deferred, block);
 	if (free_count == span->block_count) {
@@ -2207,7 +2207,7 @@
 			void* block = pointer_offset(span, SPAN_HEADER_SIZE);
 			if (!oldsize)
 				oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
-			if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) {
+			if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
 				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
 				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
 					memmove(block, p, oldsize);
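
A worked instance of the new shrink test, with illustrative numbers (64 KiB spans and a placeholder span header, not values from this file): shrinking a 3-span large block to roughly 70 KiB still passes the old span-count check, leaving a surplus span in place, while the new byte-size check sees the request drop below half the old usable size and reallocates so that span can be reclaimed.

#include <stddef.h>

static void shrink_check_example(void) {
	size_t span_size = 64 * 1024, span_header = 128;                  /* illustrative only */
	size_t current_spans = 3;
	size_t oldsize = current_spans * span_size - span_header;         /* ~192 KiB usable */
	size_t total_size = 70 * 1024;                                    /* new size incl. header */
	size_t num_spans = (total_size + span_size - 1) / span_size;      /* 2 spans needed */
	int keep_old = (current_spans >= num_spans) && (num_spans >= (current_spans / 2)); /* 1: kept in place */
	int keep_new = (current_spans >= num_spans) && (total_size >= (oldsize / 2));      /* 0: reallocated */
	(void)keep_old; (void)keep_new;
}
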
@@ -2314,14 +2314,16 @@
 	_memory_size_class[iclass].class_idx = (uint16_t)iclass;
 
 	//Check if previous size classes can be merged
-	size_t prevclass = iclass;
-	while (prevclass > 0) {
-		--prevclass;
-		//A class can be merged if number of pages and number of blocks are equal
-		if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
-			memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
-		else
-			break;
+	if (iclass >= SMALL_CLASS_COUNT) {
+		size_t prevclass = iclass;
+		while (prevclass > 0) {
+			--prevclass;
+			//A class can be merged if number of pages and number of blocks are equal
+			if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
+				memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
+			else
+				break;
+		}
 	}
 }
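
Why the new guard matters, with hypothetical per-span numbers (the payload and class sizes below are placeholders, not values from this file): two adjacent classes merge when they yield the same block count per span, so the smaller class simply reuses the larger block size. That is harmless for medium classes, but applied to small classes it would break the exact granularity-sized usable-size steps that the updated test below now checks against.

enum {
	SPAN_PAYLOAD = 65408,                /* hypothetical usable bytes per span */
	BLOCKS_2016  = SPAN_PAYLOAD / 2016,  /* 32 blocks per span */
	BLOCKS_2032  = SPAN_PAYLOAD / 2032   /* also 32 -> same count, so the classes merge */
};
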
 
@@ -2533,7 +2535,7 @@
 void
 rpmalloc_finalize(void) {
 	rpmalloc_thread_finalize();
-	//rpmalloc_dump_statistics(stderr);
+	//rpmalloc_dump_statistics(stdout);
 
 	//Free all thread caches and fully free spans
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
diff --git a/test/main.c b/test/main.c
index 426a48a..7e6fe64 100644
--- a/test/main.c
+++ b/test/main.c
@@ -47,9 +47,15 @@
 	void* addr[8142];
 	char data[20000];
 	unsigned int datasize[7] = { 473, 39, 195, 24, 73, 376, 245 };
+	size_t wanted_usable_size;
 
 	rpmalloc_initialize();
 
+	//Query the small granularity
+	void* zero_alloc = rpmalloc(0);
+	size_t small_granularity = rpmalloc_usable_size(zero_alloc);
+	rpfree(zero_alloc);
+
 	for (id = 0; id < 20000; ++id)
 		data[id] = (char)(id % 139 + id % 17);
 
@@ -68,16 +74,18 @@
 	rpfree(testptr);
 	for (iloop = 0; iloop <= 1024; ++iloop) {
 		testptr = rpmalloc(iloop);
-		size_t wanted_usable_size = 16 * ((iloop / 16) + ((!iloop || (iloop % 16)) ? 1 : 0));
-		if (rpmalloc_usable_size(testptr) != wanted_usable_size)
+		wanted_usable_size = iloop ? small_granularity * ((iloop + (small_granularity - 1)) / small_granularity) : small_granularity;
+		if (rpmalloc_usable_size(testptr) != wanted_usable_size) {
+			printf("For %u wanted %zu got %zu\n", iloop, wanted_usable_size, rpmalloc_usable_size(testptr));
 			return test_fail("Bad base alloc usable size");
+		}
 		rpfree(testptr);
 	}
 
 	//Verify medium block sizes (until class merging kicks in)
 	for (iloop = 1025; iloop <= 6000; ++iloop) {
 		testptr = rpmalloc(iloop);
-		size_t wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0));
+		wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0));
 		if (rpmalloc_usable_size(testptr) != wanted_usable_size)
 			return test_fail("Bad medium alloc usable size");
 		rpfree(testptr);
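
The replacement formula rounds the request up to the next multiple of the probed granularity, with a zero-byte request still counted as one block. A small helper expressing the same arithmetic (a 16-byte granularity is assumed only for the example values in the comment):

#include <stddef.h>

static size_t wanted_usable(size_t n, size_t g) {
	/* round n up to a multiple of the small granularity g; 0 still takes one block */
	return n ? g * ((n + (g - 1)) / g) : g;
}
/* wanted_usable(0, 16) == 16, wanted_usable(17, 16) == 32, wanted_usable(1024, 16) == 1024 */
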
@@ -86,9 +94,10 @@
 	//Large reallocation test
 	testptr = rpmalloc(253000);
 	testptr = rprealloc(testptr, 151);
-	if (rpmalloc_usable_size(testptr) != 160)
+	wanted_usable_size = (small_granularity * ((151 + (small_granularity - 1)) / small_granularity));
+	if (rpmalloc_usable_size(testptr) != wanted_usable_size)
 		return test_fail("Bad usable size");
-	if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != 144)
+	if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != (wanted_usable_size - 16))
 		return test_fail("Bad offset usable size");
 	rpfree(testptr);
 
@@ -97,7 +106,7 @@
 		size_t size = 37 * iloop;
 		testptr = rpmalloc(size);
 		*((uintptr_t*)testptr) = 0x12345678;
-		size_t wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0));
+		wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0));
 		if (rpmalloc_usable_size(testptr) != wanted_usable_size)
 			return test_fail("Bad usable size (alloc)");
 		testptr = rprealloc(testptr, size + 16);
@@ -109,7 +118,7 @@
 
 		testptr = rpaligned_alloc(128, size);
 		*((uintptr_t*)testptr) = 0x12345678;
-		wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0));
+		wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0));
 		if (rpmalloc_usable_size(testptr) < wanted_usable_size)
 			return test_fail("Bad usable size (aligned alloc)");
 		if (rpmalloc_usable_size(testptr) > (wanted_usable_size + 128))