Minor optimizations and fix map padding on Windows (#174)
diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c
index 1c95fbe..def584f 100644
--- a/rpmalloc/malloc.c
+++ b/rpmalloc/malloc.c
@@ -62,15 +62,18 @@
#undef malloc
#undef free
#undef calloc
+#define RPMALLOC_RESTRICT __declspec(restrict)
+#else
+#define RPMALLOC_RESTRICT
#endif
#if ENABLE_OVERRIDE
#if USE_IMPLEMENT
-extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
-extern inline void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
-extern inline void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
+extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
extern inline void* RPMALLOC_CDECL reallocf(void* ptr, size_t size) { return rprealloc(ptr, size); }
extern inline void* RPMALLOC_CDECL aligned_alloc(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); }
extern inline void* RPMALLOC_CDECL memalign(size_t alignment, size_t size) { return rpmemalign(alignment, size); }
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index f6f172e..15e666c 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -216,8 +216,8 @@
static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; }
static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; }
static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; }
+static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return (void*)InterlockedExchangePointer((void* volatile*)dst, val); }
static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; }
-static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_cas_ptr(dst, val, ref); }
#define EXPECTED(x) (x)
#define UNEXPECTED(x) (x)
@@ -242,8 +242,8 @@
static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); }
+static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return atomic_exchange_explicit(dst, val, memory_order_acquire); }
static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_relaxed, memory_order_relaxed); }
-static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); }
#define EXPECTED(x) __builtin_expect((x), 1)
#define UNEXPECTED(x) __builtin_expect((x), 0)
@@ -305,7 +305,7 @@
//! Total number of small + medium size classes
#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
//! Number of large block size classes
-#define LARGE_CLASS_COUNT 32
+#define LARGE_CLASS_COUNT 63
//! Maximum size of a medium block
#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
//! Maximum size of a large block
@@ -787,10 +787,10 @@
if (release && offset) {
offset <<= 3;
address = pointer_offset(address, -(int32_t)offset);
-#if PLATFORM_POSIX
- //Padding is always one span size
- release += _memory_span_size;
-#endif
+ if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) {
+ //Padding is always one span size
+ release += _memory_span_size;
+ }
}
#if !DISABLE_UNMAP
#if PLATFORM_WINDOWS
@@ -1147,8 +1147,8 @@
// We need acquire semantics on the CAS operation since we are interested in the list size
// Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency
do {
- span->free_list = atomic_load_ptr(&span->free_list_deferred);
- } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, span->free_list));
+ span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+ } while (span->free_list == INVALID_POINTER);
span->used_count -= span->list_size;
span->list_size = 0;
atomic_store_ptr_release(&span->free_list_deferred, 0);
@@ -2045,9 +2045,9 @@
// guarantee the list_size variable validity + release semantics on pointer store
void* free_list;
do {
- free_list = atomic_load_ptr(&span->free_list_deferred);
- *((void**)block) = free_list;
- } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, free_list));
+ free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+ } while (free_list == INVALID_POINTER);
+ *((void**)block) = free_list;
uint32_t free_count = ++span->list_size;
atomic_store_ptr_release(&span->free_list_deferred, block);
if (free_count == span->block_count) {
@@ -2207,7 +2207,7 @@
void* block = pointer_offset(span, SPAN_HEADER_SIZE);
if (!oldsize)
oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
- if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) {
+ if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
memmove(block, p, oldsize);
@@ -2314,14 +2314,16 @@
_memory_size_class[iclass].class_idx = (uint16_t)iclass;
//Check if previous size classes can be merged
- size_t prevclass = iclass;
- while (prevclass > 0) {
- --prevclass;
- //A class can be merged if number of pages and number of blocks are equal
- if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
- memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
- else
- break;
+ if (iclass >= SMALL_CLASS_COUNT) {
+ size_t prevclass = iclass;
+ while (prevclass > 0) {
+ --prevclass;
+ //A class can be merged if number of pages and number of blocks are equal
+ if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
+ memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
+ else
+ break;
+ }
}
}
@@ -2533,7 +2535,7 @@
void
rpmalloc_finalize(void) {
rpmalloc_thread_finalize();
- //rpmalloc_dump_statistics(stderr);
+ //rpmalloc_dump_statistics(stdout);
//Free all thread caches and fully free spans
for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
diff --git a/test/main.c b/test/main.c
index 426a48a..7e6fe64 100644
--- a/test/main.c
+++ b/test/main.c
@@ -47,9 +47,15 @@
void* addr[8142];
char data[20000];
unsigned int datasize[7] = { 473, 39, 195, 24, 73, 376, 245 };
+ size_t wanted_usable_size;
rpmalloc_initialize();
+ //Query the small granularity
+ void* zero_alloc = rpmalloc(0);
+ size_t small_granularity = rpmalloc_usable_size(zero_alloc);
+ rpfree(zero_alloc);
+
for (id = 0; id < 20000; ++id)
data[id] = (char)(id % 139 + id % 17);
@@ -68,16 +74,18 @@
rpfree(testptr);
for (iloop = 0; iloop <= 1024; ++iloop) {
testptr = rpmalloc(iloop);
- size_t wanted_usable_size = 16 * ((iloop / 16) + ((!iloop || (iloop % 16)) ? 1 : 0));
- if (rpmalloc_usable_size(testptr) != wanted_usable_size)
+ wanted_usable_size = iloop ? small_granularity * ((iloop + (small_granularity - 1)) / small_granularity) : small_granularity;
+ if (rpmalloc_usable_size(testptr) != wanted_usable_size) {
+ printf("For %u wanted %zu got %zu\n", iloop, wanted_usable_size, rpmalloc_usable_size(testptr));
return test_fail("Bad base alloc usable size");
+ }
rpfree(testptr);
}
//Verify medium block sizes (until class merging kicks in)
for (iloop = 1025; iloop <= 6000; ++iloop) {
testptr = rpmalloc(iloop);
- size_t wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0));
+ wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0));
if (rpmalloc_usable_size(testptr) != wanted_usable_size)
return test_fail("Bad medium alloc usable size");
rpfree(testptr);
@@ -86,9 +94,10 @@
//Large reallocation test
testptr = rpmalloc(253000);
testptr = rprealloc(testptr, 151);
- if (rpmalloc_usable_size(testptr) != 160)
+ wanted_usable_size = (small_granularity * ((151 + (small_granularity - 1)) / small_granularity));
+ if (rpmalloc_usable_size(testptr) != wanted_usable_size)
return test_fail("Bad usable size");
- if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != 144)
+ if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != (wanted_usable_size - 16))
return test_fail("Bad offset usable size");
rpfree(testptr);
@@ -97,7 +106,7 @@
size_t size = 37 * iloop;
testptr = rpmalloc(size);
*((uintptr_t*)testptr) = 0x12345678;
- size_t wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0));
+ wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0));
if (rpmalloc_usable_size(testptr) != wanted_usable_size)
return test_fail("Bad usable size (alloc)");
testptr = rprealloc(testptr, size + 16);
@@ -109,7 +118,7 @@
testptr = rpaligned_alloc(128, size);
*((uintptr_t*)testptr) = 0x12345678;
- wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0));
+ wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0));
if (rpmalloc_usable_size(testptr) < wanted_usable_size)
return test_fail("Bad usable size (aligned alloc)");
if (rpmalloc_usable_size(testptr) > (wanted_usable_size + 128))