| /* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels |
| * |
 * This library provides a cross-platform, lock-free, thread-caching malloc implementation in C11.
| * The latest source code is always available at |
| * |
| * https://github.com/rampantpixels/rpmalloc |
| * |
| * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. |
| * |
| */ |
| |
| #include "rpmalloc.h" |
| |
| // Build time configurable limits |
| |
// Presets; if none is defined, the default is performance priority
| //#define ENABLE_UNLIMITED_CACHE |
| //#define DISABLE_CACHE |
| //#define ENABLE_SPACE_PRIORITY_CACHE |
| |
| // Presets for cache limits |
| #if defined(ENABLE_UNLIMITED_CACHE) |
| // Unlimited caches |
| #define MIN_SPAN_CACHE_RELEASE 16 |
| #define MAX_SPAN_CACHE_DIVISOR 1 |
| #elif defined(DISABLE_CACHE) |
// Disable cache
| #define MIN_SPAN_CACHE_RELEASE 1 |
| #define MAX_SPAN_CACHE_DIVISOR 0 |
| #elif defined(ENABLE_SPACE_PRIORITY_CACHE) |
| // Space priority cache limits |
| #define MIN_SPAN_CACHE_SIZE 8 |
| #define MIN_SPAN_CACHE_RELEASE 8 |
| #define MAX_SPAN_CACHE_DIVISOR 16 |
| #define GLOBAL_SPAN_CACHE_MULTIPLIER 1 |
| #else |
| // Default - performance priority cache limits |
//! Minimum cache size to remain after a release to global cache
#define MIN_SPAN_CACHE_SIZE 8
//! Minimum number of spans to transfer between thread and global cache
#define MIN_SPAN_CACHE_RELEASE 16
//! Maximum cache size divisor, limiting the thread cache in number of spans for each page
//  count class (max cache size is the max allocation count divided by this divisor; set to
//  1 for an unlimited cache - i.e. never release spans to global cache unless thread finishes)
#define MAX_SPAN_CACHE_DIVISOR 8
| //! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) |
| #define GLOBAL_SPAN_CACHE_MULTIPLIER 4 |
| #endif |
| |
| //! Size of heap hashmap |
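//  (79 is prime, keeping the modulo mapping of sequential heap IDs evenly distributed)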
| #define HEAP_ARRAY_SIZE 79 |
| |
| #ifndef ENABLE_VALIDATE_ARGS |
| //! Enable validation of args to public entry points |
| #define ENABLE_VALIDATE_ARGS 0 |
| #endif |
| |
| #ifndef ENABLE_STATISTICS |
| //! Enable statistics collection |
| #define ENABLE_STATISTICS 0 |
| #endif |
| |
| #ifndef ENABLE_ASSERTS |
| //! Enable asserts |
| #define ENABLE_ASSERTS 0 |
| #endif |
| |
| // Platform and arch specifics |
| |
| #ifdef _MSC_VER |
| # define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name |
| # define FORCEINLINE __forceinline |
| # define TLS_MODEL |
| # define _Static_assert static_assert |
| # define _Thread_local __declspec(thread) |
| # define atomic_thread_fence_acquire() //_ReadWriteBarrier() |
| # define atomic_thread_fence_release() //_ReadWriteBarrier() |
| # if ENABLE_VALIDATE_ARGS |
| # include <Intsafe.h> |
| # endif |
| #else |
| # define ALIGNED_STRUCT(name, alignment) struct __attribute__((__aligned__(alignment))) name |
| # define FORCEINLINE inline __attribute__((__always_inline__)) |
| # define TLS_MODEL __attribute__((tls_model("initial-exec"))) |
| # if !defined(__clang__) && defined(__GNUC__) |
| # define _Thread_local __thread |
| # endif |
| # ifdef __arm__ |
| # define atomic_thread_fence_acquire() __asm volatile("dmb sy" ::: "memory") |
| # define atomic_thread_fence_release() __asm volatile("dmb st" ::: "memory") |
| # else |
| # define atomic_thread_fence_acquire() //__asm volatile("" ::: "memory") |
| # define atomic_thread_fence_release() //__asm volatile("" ::: "memory") |
| # endif |
| #endif |
| |
| #if defined( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( _AMD64_ ) || defined( __arm64__ ) || defined( __aarch64__ ) |
| # define ARCH_64BIT 1 |
| #else |
| # define ARCH_64BIT 0 |
| #endif |
| |
| #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) |
| # define PLATFORM_WINDOWS 1 |
| #else |
| # define PLATFORM_POSIX 1 |
| #endif |
| |
| #include <stdint.h> |
| #include <string.h> |
| |
| #if ENABLE_ASSERTS |
| # include <assert.h> |
| #else |
| # define assert(x) |
| #endif |
| |
| // Atomic access abstraction |
| ALIGNED_STRUCT(atomic32_t, 4) { |
| int32_t nonatomic; |
| }; |
| typedef struct atomic32_t atomic32_t; |
| |
| ALIGNED_STRUCT(atomic64_t, 8) { |
| int64_t nonatomic; |
| }; |
| typedef struct atomic64_t atomic64_t; |
| |
| ALIGNED_STRUCT(atomicptr_t, 8) { |
| void* nonatomic; |
| }; |
| typedef struct atomicptr_t atomicptr_t; |
| |
| static FORCEINLINE int32_t |
| atomic_load32(atomic32_t* src) { |
| return src->nonatomic; |
| } |
| |
| static FORCEINLINE void |
| atomic_store32(atomic32_t* dst, int32_t val) { |
| dst->nonatomic = val; |
| } |
| |
| #if PLATFORM_POSIX |
| |
| static FORCEINLINE void |
| atomic_store64(atomic64_t* dst, int64_t val) { |
| dst->nonatomic = val; |
| } |
| |
| static FORCEINLINE int64_t |
| atomic_exchange_and_add64(atomic64_t* dst, int64_t add) { |
| return __sync_fetch_and_add(&dst->nonatomic, add); |
| } |
| |
| #endif |
| |
| static FORCEINLINE int32_t |
| atomic_incr32(atomic32_t* val) { |
| #ifdef _MSC_VER |
| int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, 1); |
| return (old + 1); |
| #else |
| return __sync_add_and_fetch(&val->nonatomic, 1); |
| #endif |
| } |
| |
| static FORCEINLINE int32_t |
| atomic_add32(atomic32_t* val, int32_t add) { |
| #ifdef _MSC_VER |
| int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, add); |
| return (old + add); |
| #else |
| return __sync_add_and_fetch(&val->nonatomic, add); |
| #endif |
| } |
| |
| static FORCEINLINE void* |
| atomic_load_ptr(atomicptr_t* src) { |
| return src->nonatomic; |
| } |
| |
| static FORCEINLINE void |
| atomic_store_ptr(atomicptr_t* dst, void* val) { |
| dst->nonatomic = val; |
| } |
| |
| static FORCEINLINE int |
| atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref); |
| |
| static void |
| thread_yield(void); |
| |
| // Preconfigured limits and sizes |
| |
| //! Memory page size |
| #define PAGE_SIZE 4096 |
| |
| //! Granularity of all memory page spans for small & medium block allocations |
| #define SPAN_ADDRESS_GRANULARITY 65536 |
| //! Maximum size of a span of memory pages |
| #define SPAN_MAX_SIZE (SPAN_ADDRESS_GRANULARITY) |
| //! Mask for getting the start of a span of memory pages |
| #define SPAN_MASK (~((uintptr_t)SPAN_MAX_SIZE - 1)) |
| //! Maximum number of memory pages in a span |
| #define SPAN_MAX_PAGE_COUNT (SPAN_MAX_SIZE / PAGE_SIZE) |
| //! Span size class granularity |
| #define SPAN_CLASS_GRANULARITY 4 |
| //! Number of size classes for spans |
| #define SPAN_CLASS_COUNT (SPAN_MAX_PAGE_COUNT / SPAN_CLASS_GRANULARITY) |
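//  With the defaults above (PAGE_SIZE 4096, SPAN_ADDRESS_GRANULARITY 65536) this works out
//  to SPAN_MAX_PAGE_COUNT 16 and SPAN_CLASS_COUNT 4, i.e. spans are allocated as 4, 8, 12 or 16 pages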
| |
| //! Granularity of a small allocation block |
| #define SMALL_GRANULARITY 16 |
| //! Small granularity shift count |
| #define SMALL_GRANULARITY_SHIFT 4 |
| //! Number of small block size classes |
| #define SMALL_CLASS_COUNT (((PAGE_SIZE - SPAN_HEADER_SIZE) >> 1) >> SMALL_GRANULARITY_SHIFT) |
| //! Maximum size of a small block |
| #define SMALL_SIZE_LIMIT (SMALL_CLASS_COUNT * SMALL_GRANULARITY) |
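//  With the defaults this gives SMALL_CLASS_COUNT 127 and SMALL_SIZE_LIMIT 2032,
//  i.e. small classes of 16, 32, ..., 2032 bytes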
| |
| //! Granularity of a medium allocation block |
| #define MEDIUM_GRANULARITY 512 |
//! Medium granularity shift count
| #define MEDIUM_GRANULARITY_SHIFT 9 |
| //! Number of medium block size classes |
| #define MEDIUM_CLASS_COUNT 60 |
| //! Maximum size of a medium block |
| #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) |
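//  With the defaults this gives MEDIUM_SIZE_LIMIT 2032 + (512 * 60) - 32 = 32720 bytes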
| |
| //! Total number of small + medium size classes |
| #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) |
| |
| //! Number of large block size classes |
| #define LARGE_CLASS_COUNT 32 |
| //! Maximum number of memory pages in a large block |
| #define LARGE_MAX_PAGES (SPAN_MAX_PAGE_COUNT * LARGE_CLASS_COUNT) |
| //! Maximum size of a large block |
| #define LARGE_SIZE_LIMIT ((LARGE_MAX_PAGES * PAGE_SIZE) - SPAN_HEADER_SIZE) |
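//  With the defaults this gives LARGE_MAX_PAGES 512 and LARGE_SIZE_LIMIT 2097120 bytes (just below 2MiB)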
| |
| #define SPAN_LIST_LOCK_TOKEN ((void*)1) |
| |
| #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) |
| #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) |
| |
| //! Size of a span header |
| #define SPAN_HEADER_SIZE 32 |
| |
| #if ARCH_64BIT |
| typedef int64_t offset_t; |
| #else |
| typedef int32_t offset_t; |
| #endif |
| typedef uint32_t count_t; |
| |
| #if ENABLE_VALIDATE_ARGS |
| //! Maximum allocation size to avoid integer overflow |
| #define MAX_ALLOC_SIZE (((size_t)-1) - PAGE_SIZE) |
| #endif |
| |
| // Data types |
| |
| //! A memory heap, per thread |
| typedef struct heap_t heap_t; |
| //! Span of memory pages |
| typedef struct span_t span_t; |
| //! Size class definition |
| typedef struct size_class_t size_class_t; |
| //! Span block bookkeeping |
| typedef struct span_block_t span_block_t; |
| //! Span data union, usage depending on span state |
| typedef union span_data_t span_data_t; |
| //! Cache data |
| typedef struct span_counter_t span_counter_t; |
| |
| struct span_block_t { |
| //! Free list |
| uint16_t free_list; |
| //! First autolinked block |
| uint16_t first_autolink; |
| //! Free count |
| uint16_t free_count; |
| //! Padding |
| uint16_t padding; |
| }; |
| |
| union span_data_t { |
| //! Span data |
| span_block_t block; |
| //! List size (used when span is part of a list) |
| uint32_t list_size; |
| }; |
| |
| struct span_t { |
| //! Heap ID |
| atomic32_t heap_id; |
| //! Size class |
| count_t size_class; |
| //! Span data |
| span_data_t data; |
| //! Next span |
| span_t* next_span; |
| //! Previous span |
| span_t* prev_span; |
| }; |
| _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); |
| |
| struct span_counter_t { |
| //! Allocation high water mark |
| uint32_t max_allocations; |
| //! Current number of allocations |
| uint32_t current_allocations; |
| //! Cache limit |
| uint32_t cache_limit; |
| }; |
| |
| struct heap_t { |
| //! Heap ID |
| int32_t id; |
| //! Deferred deallocation |
| atomicptr_t defer_deallocate; |
| //! Free count for each size class active span |
| span_block_t active_block[SIZE_CLASS_COUNT]; |
| //! Active span for each size class |
| span_t* active_span[SIZE_CLASS_COUNT]; |
	//! List of semi-used spans with free blocks for each size class (doubly linked list)
| span_t* size_cache[SIZE_CLASS_COUNT]; |
	//! List of free spans for each page count (singly linked list)
| span_t* span_cache[SPAN_CLASS_COUNT]; |
| //! Allocation counters |
| span_counter_t span_counter[SPAN_CLASS_COUNT]; |
	//! List of free spans for each large class count (singly linked list)
| span_t* large_cache[LARGE_CLASS_COUNT]; |
| //! Allocation counters for large blocks |
| span_counter_t large_counter[LARGE_CLASS_COUNT]; |
| //! Next heap in id list |
| heap_t* next_heap; |
| //! Next heap in orphan list |
| heap_t* next_orphan; |
| #if ENABLE_STATISTICS |
	//! Number of bytes currently requested in allocations
	size_t requested;
	//! Number of bytes currently allocated
	size_t allocated;
| //! Number of bytes transitioned thread -> global |
| size_t thread_to_global; |
| //! Number of bytes transitioned global -> thread |
| size_t global_to_thread; |
| #endif |
| }; |
| _Static_assert(sizeof(heap_t) <= PAGE_SIZE*2, "heap size mismatch"); |
| |
| struct size_class_t { |
| //! Size of blocks in this class |
| uint16_t size; |
| //! Number of pages to allocate for a chunk |
| uint16_t page_count; |
| //! Number of blocks in each chunk |
| uint16_t block_count; |
| //! Class index this class is merged with |
| uint16_t class_idx; |
| }; |
| _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); |
| |
| //! Global size classes |
| static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; |
| |
| //! Heap ID counter |
| static atomic32_t _memory_heap_id; |
| |
| #ifdef PLATFORM_POSIX |
| //! Virtual memory address counter |
| static atomic64_t _memory_addr; |
| #endif |
| |
| //! Global span cache |
| static atomicptr_t _memory_span_cache[SPAN_CLASS_COUNT]; |
| |
| //! Global large cache |
| static atomicptr_t _memory_large_cache[LARGE_CLASS_COUNT]; |
| |
| //! Current thread heap |
| static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; |
| |
| //! All heaps |
| static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; |
| |
| //! Orphaned heaps |
| static atomicptr_t _memory_orphan_heaps; |
| |
| //! Active heap count |
| static atomic32_t _memory_active_heaps; |
| |
| //! Adaptive cache max allocation count |
| static uint32_t _memory_max_allocation[SPAN_CLASS_COUNT]; |
| |
| //! Adaptive cache max allocation count |
| static uint32_t _memory_max_allocation_large[LARGE_CLASS_COUNT]; |
| |
| #if ENABLE_STATISTICS |
| //! Total number of mapped memory pages |
| static atomic32_t _mapped_pages; |
| //! Running counter of total number of mapped memory pages since start |
| static atomic32_t _mapped_total; |
| //! Running counter of total number of unmapped memory pages since start |
| static atomic32_t _unmapped_total; |
| #endif |
| |
| static void* |
| _memory_map(size_t page_count); |
| |
| static void |
| _memory_unmap(void* ptr, size_t page_count); |
| |
| static int |
| _memory_deallocate_deferred(heap_t* heap, size_t size_class); |
| |
| //! Lookup a memory heap from heap ID |
| static heap_t* |
| _memory_heap_lookup(int32_t id) { |
| uint32_t list_idx = id % HEAP_ARRAY_SIZE; |
| heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); |
| while (heap && (heap->id != id)) |
| heap = heap->next_heap; |
| return heap; |
| } |
| |
| //! Get the span size class from page count |
| static size_t |
| _span_class_from_page_count(size_t page_count) { |
| assert((page_count > 0) && (page_count <= 16)); |
| return ((page_count + SPAN_CLASS_GRANULARITY - 1) / SPAN_CLASS_GRANULARITY) - 1; |
| } |
| |
| //! Increase an allocation counter |
| static void |
| _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { |
| if (++counter->current_allocations > counter->max_allocations) { |
| counter->max_allocations = counter->current_allocations; |
| #if MAX_SPAN_CACHE_DIVISOR > 0 |
| counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; |
| #endif |
| if (counter->max_allocations > *global_counter) |
| *global_counter = counter->max_allocations; |
| } |
| } |
| |
| //! Insert the given list of memory page spans in the global cache for small/medium blocks |
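//  The global cache is a lock-free singly linked list of sublists. The list head pointer
//  doubles as bookkeeping: since spans are 64KiB aligned, the low 16 bits of the head store
//  the total number of cached spans, and each sublist head uses prev_span as a skip pointer
//  to the next sublist. SPAN_LIST_LOCK_TOKEN marks the list as locked during extraction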
| static void |
| _memory_global_cache_insert(span_t* first_span, size_t list_size, size_t page_count) { |
| assert((list_size == 1) || (first_span->next_span != 0)); |
| #if MAX_SPAN_CACHE_DIVISOR > 0 |
| while (1) { |
| size_t span_class_idx = _span_class_from_page_count(page_count); |
| void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[span_class_idx]); |
| if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { |
| uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; |
| span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); |
| |
| #ifdef GLOBAL_SPAN_CACHE_MULTIPLIER |
| size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation[span_class_idx] / MAX_SPAN_CACHE_DIVISOR); |
| if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) |
| break; |
| #endif |
| //We only have 16 bits for size of list, avoid overflow |
| if ((global_list_size + list_size) > 0xFFFF) |
| break; |
| |
| //Use prev_span as skip pointer over this sublist range of spans |
| first_span->data.list_size = (uint32_t)list_size; |
| first_span->prev_span = global_span; |
| |
| //Insert sublist into global cache |
| global_list_size += list_size; |
| void* first_span_ptr = (void*)((uintptr_t)first_span | global_list_size); |
| if (atomic_cas_ptr(&_memory_span_cache[span_class_idx], first_span_ptr, global_span_ptr)) |
| return; |
| } |
| else { |
| //Atomic operation failed, yield timeslice and retry |
| thread_yield(); |
| atomic_thread_fence_acquire(); |
| } |
| } |
| #endif |
| //Global cache full, release pages |
| for (size_t ispan = 0; ispan < list_size; ++ispan) { |
| assert(first_span); |
| span_t* next_span = first_span->next_span; |
| _memory_unmap(first_span, page_count); |
| first_span = next_span; |
| } |
| } |
| |
| //! Extract a number of memory page spans from the global cache for small/medium blocks |
| static span_t* |
| _memory_global_cache_extract(size_t page_count) { |
| span_t* span = 0; |
| size_t span_class_idx = _span_class_from_page_count(page_count); |
| atomicptr_t* cache = &_memory_span_cache[span_class_idx]; |
| atomic_thread_fence_acquire(); |
| void* global_span_ptr = atomic_load_ptr(cache); |
| while (global_span_ptr) { |
| if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && |
| atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { |
| //Grab a number of thread cache spans, using the skip span pointer |
| //stored in prev_span to quickly skip ahead in the list to get the new head |
| uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; |
| span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); |
| assert((span->data.list_size == 1) || (span->next_span != 0)); |
| |
| span_t* new_global_span = span->prev_span; |
| global_span_count -= span->data.list_size; |
| |
| //Set new head of global cache list |
| void* new_cache_head = global_span_count ? |
| ((void*)((uintptr_t)new_global_span | global_span_count)) : |
| 0; |
| atomic_store_ptr(cache, new_cache_head); |
| atomic_thread_fence_release(); |
| break; |
| } |
| |
| //List busy, yield timeslice and retry |
| thread_yield(); |
| atomic_thread_fence_acquire(); |
| global_span_ptr = atomic_load_ptr(cache); |
| } |
| |
| return span; |
| } |
| |
| /*! Insert the given list of memory page spans in the global cache for large blocks, |
| similar to _memory_global_cache_insert */ |
| static void |
| _memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { |
| assert((list_size == 1) || (span_list->next_span != 0)); |
| assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); |
| #if MAX_SPAN_CACHE_DIVISOR > 0 |
| atomicptr_t* cache = &_memory_large_cache[span_count - 1]; |
| while (1) { |
| void* global_span_ptr = atomic_load_ptr(cache); |
| if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { |
| uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; |
| span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); |
| |
| #ifdef GLOBAL_SPAN_CACHE_MULTIPLIER |
| size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); |
| if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) |
| break; |
| #endif |
| if ((global_list_size + list_size) > 0xFFFF) |
| break; |
| |
| span_list->data.list_size = (uint32_t)list_size; |
| span_list->prev_span = global_span; |
| |
| global_list_size += list_size; |
| void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); |
| if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) |
| return; |
| } |
| else { |
| thread_yield(); |
| atomic_thread_fence_acquire(); |
| } |
| } |
| #endif |
| //Global cache full, release spans |
| for (size_t ispan = 0; ispan < list_size; ++ispan) { |
| assert(span_list); |
| span_t* next_span = span_list->next_span; |
| _memory_unmap(span_list, span_count * SPAN_MAX_PAGE_COUNT); |
| span_list = next_span; |
| } |
| } |
| |
| /*! Extract a number of memory page spans from the global cache for large blocks, |
| similar to _memory_global_cache_extract */ |
| static span_t* |
| _memory_global_cache_large_extract(size_t span_count) { |
| span_t* span = 0; |
| atomicptr_t* cache = &_memory_large_cache[span_count - 1]; |
| atomic_thread_fence_acquire(); |
| void* global_span_ptr = atomic_load_ptr(cache); |
| while (global_span_ptr) { |
| if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && |
| atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { |
| uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; |
| span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); |
| assert((span->data.list_size == 1) || (span->next_span != 0)); |
| assert(span->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); |
| |
| span_t* new_global_span = span->prev_span; |
| global_list_size -= span->data.list_size; |
| |
| void* new_global_span_ptr = global_list_size ? |
| ((void*)((uintptr_t)new_global_span | global_list_size)) : |
| 0; |
| atomic_store_ptr(cache, new_global_span_ptr); |
| atomic_thread_fence_release(); |
| break; |
| } |
| |
| thread_yield(); |
| atomic_thread_fence_acquire(); |
| global_span_ptr = atomic_load_ptr(cache); |
| } |
| return span; |
| } |
| |
| //! Allocate a small/medium sized memory block from the given heap |
| static void* |
| _memory_allocate_from_heap(heap_t* heap, size_t size) { |
| #if ENABLE_STATISTICS |
| //For statistics we need to store the requested size in the memory block |
| size += sizeof(size_t); |
| #endif |
| |
| //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) |
| const size_t class_idx = _memory_size_class[(size <= SMALL_SIZE_LIMIT) ? |
| ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) - 1 : |
| SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT) - 1].class_idx; |
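	//e.g. a 24 byte request maps to ((24 + 15) >> 4) - 1 = small class 1, served by 32 byte blocks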
| |
| span_block_t* active_block = heap->active_block + class_idx; |
| size_class_t* size_class = _memory_size_class + class_idx; |
| const count_t class_size = size_class->size; |
| |
| #if ENABLE_STATISTICS |
| heap->allocated += class_size; |
| heap->requested += size; |
| #endif |
| |
| //Step 1: Try to get a block from the currently active span. The span block bookkeeping |
| // data for the active span is stored in the heap for faster access |
| use_active: |
| if (active_block->free_count) { |
| //Happy path, we have a span with at least one free block |
| span_t* span = heap->active_span[class_idx]; |
| count_t offset = class_size * active_block->free_list; |
| uint32_t* block = pointer_offset(span, SPAN_HEADER_SIZE + offset); |
| assert(span); |
| |
| --active_block->free_count; |
| if (!active_block->free_count) { |
| //Span is now completely allocated, set the bookkeeping data in the |
| //span itself and reset the active span pointer in the heap |
| span->data.block.free_count = 0; |
| span->data.block.first_autolink = (uint16_t)size_class->block_count; |
| heap->active_span[class_idx] = 0; |
| } |
| else { |
| //Get the next free block, either from linked list or from auto link |
| if (active_block->free_list < active_block->first_autolink) { |
| active_block->free_list = (uint16_t)(*block); |
| } |
| else { |
| ++active_block->free_list; |
| ++active_block->first_autolink; |
| } |
| assert(active_block->free_list < size_class->block_count); |
| } |
| |
| #if ENABLE_STATISTICS |
| //Store the requested size for statistics |
| *(size_t*)pointer_offset(block, class_size - sizeof(size_t)) = size; |
| #endif |
| |
| return block; |
| } |
| |
	//Step 2: No active span, try executing deferred deallocations and try again if there
	//        was at least one of the requested size class
| if (_memory_deallocate_deferred(heap, class_idx)) { |
| if (active_block->free_count) |
| goto use_active; |
| } |
| |
| //Step 3: Check if there is a semi-used span of the requested size class available |
| if (heap->size_cache[class_idx]) { |
| //Promote a pending semi-used span to be active, storing bookkeeping data in |
| //the heap structure for faster access |
| span_t* span = heap->size_cache[class_idx]; |
| *active_block = span->data.block; |
| assert(active_block->free_count > 0); |
| span_t* next_span = span->next_span; |
| heap->size_cache[class_idx] = next_span; |
| heap->active_span[class_idx] = span; |
| goto use_active; |
| } |
| |
	//Step 4: No semi-used span available, try to grab a span from the thread cache
| size_t span_class_idx = _span_class_from_page_count(size_class->page_count); |
| span_t* span = heap->span_cache[span_class_idx]; |
| if (!span) { |
		//Step 5: No span available in the thread cache, try to grab a list of spans from the global cache
| span = _memory_global_cache_extract(size_class->page_count); |
| #if ENABLE_STATISTICS |
| if (span) |
| heap->global_to_thread += (size_t)span->data.list_size * size_class->page_count * PAGE_SIZE; |
| #endif |
| } |
| if (span) { |
| if (span->data.list_size > 1) { |
			//We got a list of spans; use the first as active and keep the remainder in the thread cache
| span_t* next_span = span->next_span; |
| assert(next_span); |
| next_span->data.list_size = span->data.list_size - 1; |
| heap->span_cache[span_class_idx] = next_span; |
| } |
| else { |
| heap->span_cache[span_class_idx] = 0; |
| } |
| } |
| else { |
| //Step 6: All caches empty, map in new memory pages |
| span = _memory_map(size_class->page_count); |
| } |
| |
| //Mark span as owned by this heap and set base data |
| atomic_store32(&span->heap_id, heap->id); |
| atomic_thread_fence_release(); |
| |
| span->size_class = (count_t)class_idx; |
| |
	//If there is only one block, grab it now; otherwise make this span
	//the active span for subsequent allocations
| if (size_class->block_count > 1) { |
| //Reset block order to sequential auto linked order |
| active_block->free_count = (uint16_t)(size_class->block_count - 1); |
| active_block->free_list = 1; |
| active_block->first_autolink = 1; |
| heap->active_span[class_idx] = span; |
| } |
| else { |
| span->data.block.free_count = 0; |
| span->data.block.first_autolink = (uint16_t)size_class->block_count; |
| } |
| |
| //Track counters |
| _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); |
| |
| #if ENABLE_STATISTICS |
| //Store the requested size for statistics |
| *(size_t*)pointer_offset(span, SPAN_HEADER_SIZE + class_size - sizeof(size_t)) = size; |
| #endif |
| |
	//Return the first block of the memory page span
| return pointer_offset(span, SPAN_HEADER_SIZE); |
| } |
| |
| //! Allocate a large sized memory block from the given heap |
| static void* |
| _memory_allocate_large_from_heap(heap_t* heap, size_t size) { |
| //Calculate number of needed max sized spans (including header) |
| size += SPAN_HEADER_SIZE; |
| size_t num_spans = size / SPAN_MAX_SIZE; |
| if (size % SPAN_MAX_SIZE) |
| ++num_spans; |
| size_t idx = num_spans - 1; |
| |
| if (!idx) { |
| size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); |
| span_t* span = heap->span_cache[span_class_idx]; |
| if (!span) { |
| _memory_deallocate_deferred(heap, 0); |
| span = heap->span_cache[span_class_idx]; |
| } |
| if (!span) { |
			//Step 5: No span available in the thread cache, try to grab a list of spans from the global cache
| span = _memory_global_cache_extract(SPAN_MAX_PAGE_COUNT); |
| #if ENABLE_STATISTICS |
| if (span) |
| heap->global_to_thread += (size_t)span->data.list_size * SPAN_MAX_PAGE_COUNT * PAGE_SIZE; |
| #endif |
| } |
| if (span) { |
| if (span->data.list_size > 1) { |
				//We got a list of spans; use the first as active and keep the remainder in the thread cache
| span_t* next_span = span->next_span; |
| assert(next_span); |
| next_span->data.list_size = span->data.list_size - 1; |
| heap->span_cache[span_class_idx] = next_span; |
| } |
| else { |
| heap->span_cache[span_class_idx] = 0; |
| } |
| } |
| else { |
| //Step 6: All caches empty, map in new memory pages |
| span = _memory_map(SPAN_MAX_PAGE_COUNT); |
| } |
| |
| //Mark span as owned by this heap and set base data |
| atomic_store32(&span->heap_id, heap->id); |
| atomic_thread_fence_release(); |
| |
| span->size_class = SIZE_CLASS_COUNT; |
| |
| //Track counters |
| _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); |
| |
| return pointer_offset(span, SPAN_HEADER_SIZE); |
| } |
| |
| use_cache: |
	//Step 1: Check if cache for this large size class (or the following, unless first class) has a span
	while ((idx < LARGE_CLASS_COUNT) && (idx < num_spans + 1) && !heap->large_cache[idx])
		++idx;
	span_t* span = (idx < LARGE_CLASS_COUNT) ? heap->large_cache[idx] : 0;
| if (span) { |
| //Happy path, use from cache |
| if (span->data.list_size > 1) { |
| span_t* new_head = span->next_span; |
| assert(new_head); |
| new_head->data.list_size = span->data.list_size - 1; |
| heap->large_cache[idx] = new_head; |
| } |
| else { |
| heap->large_cache[idx] = 0; |
| } |
| |
| span->size_class = SIZE_CLASS_COUNT + (count_t)idx; |
| |
| //Increase counter |
| _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); |
| |
| return pointer_offset(span, SPAN_HEADER_SIZE); |
| } |
| |
| //Restore index, we're back to smallest fitting span count |
| idx = num_spans - 1; |
| |
| //Step 2: Process deferred deallocation |
| if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) |
| goto use_cache; |
| assert(!heap->large_cache[idx]); |
| |
| //Step 3: Extract a list of spans from global cache |
| span = _memory_global_cache_large_extract(num_spans); |
| if (span) { |
| #if ENABLE_STATISTICS |
| heap->global_to_thread += (size_t)span->data.list_size * num_spans * SPAN_MAX_SIZE; |
| #endif |
| //We got a list from global cache, store remainder in thread cache |
| if (span->data.list_size > 1) { |
| span_t* new_head = span->next_span; |
| assert(new_head); |
| new_head->prev_span = 0; |
| new_head->data.list_size = span->data.list_size - 1; |
| heap->large_cache[idx] = new_head; |
| } |
| } |
| else { |
| //Step 4: Map in more memory pages |
| span = _memory_map(num_spans * SPAN_MAX_PAGE_COUNT); |
| } |
| //Mark span as owned by this heap |
| atomic_store32(&span->heap_id, heap->id); |
| atomic_thread_fence_release(); |
| |
| span->size_class = SIZE_CLASS_COUNT + (count_t)idx; |
| |
| //Increase counter |
| _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); |
| |
| return pointer_offset(span, SPAN_HEADER_SIZE); |
| } |
| |
| //! Allocate a new heap |
| static heap_t* |
| _memory_allocate_heap(void) { |
| heap_t* heap; |
| heap_t* next_heap; |
| //Try getting an orphaned heap |
| atomic_thread_fence_acquire(); |
| do { |
| heap = atomic_load_ptr(&_memory_orphan_heaps); |
| if (!heap) |
| break; |
| next_heap = heap->next_orphan; |
| } |
| while (!atomic_cas_ptr(&_memory_orphan_heaps, next_heap, heap)); |
| |
| if (heap) { |
| heap->next_orphan = 0; |
| return heap; |
| } |
| |
| //Map in pages for a new heap |
| heap = _memory_map(2); |
| memset(heap, 0, sizeof(heap_t)); |
| |
| //Get a new heap ID |
| do { |
| heap->id = atomic_incr32(&_memory_heap_id); |
| if (_memory_heap_lookup(heap->id)) |
| heap->id = 0; |
| } |
| while (!heap->id); |
| |
| //Link in heap in heap ID map |
| size_t list_idx = heap->id % HEAP_ARRAY_SIZE; |
| do { |
| next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); |
| heap->next_heap = next_heap; |
| } |
| while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); |
| |
| return heap; |
| } |
| |
//! Add a span to a doubly linked list
| static void |
| _memory_list_add(span_t** head, span_t* span) { |
| if (*head) { |
| (*head)->prev_span = span; |
| span->next_span = *head; |
| } |
| else { |
| span->next_span = 0; |
| } |
| *head = span; |
| } |
| |
//! Remove a span from a doubly linked list
| static void |
| _memory_list_remove(span_t** head, span_t* span) { |
| if (*head == span) { |
| *head = span->next_span; |
| } |
| else { |
| if (span->next_span) |
| span->next_span->prev_span = span->prev_span; |
| span->prev_span->next_span = span->next_span; |
| } |
| } |
| |
| //! Insert span into thread cache, releasing to global cache if overflow |
| static void |
| _memory_heap_cache_insert(heap_t* heap, span_t* span, size_t page_count) { |
| #if MAX_SPAN_CACHE_DIVISOR == 0 |
| (void)sizeof(heap); |
| _memory_global_cache_insert(span, 1, page_count); |
| #else |
| size_t span_class_idx = _span_class_from_page_count(page_count); |
| span_t** cache = &heap->span_cache[span_class_idx]; |
| span->next_span = *cache; |
| if (*cache) |
| span->data.list_size = (*cache)->data.list_size + 1; |
| else |
| span->data.list_size = 1; |
| *cache = span; |
| #if MAX_SPAN_CACHE_DIVISOR > 1 |
| //Check if cache exceeds limit |
| if ((span->data.list_size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && |
| (span->data.list_size > heap->span_counter[span_class_idx].cache_limit)) { |
| //Release to global cache |
| count_t list_size = 1; |
| span_t* next = span->next_span; |
| span_t* last = span; |
| while (list_size < MIN_SPAN_CACHE_RELEASE) { |
| last = next; |
| next = next->next_span; |
| ++list_size; |
| } |
| next->data.list_size = span->data.list_size - list_size; |
| last->next_span = 0; //Terminate list |
| *cache = next; |
| _memory_global_cache_insert(span, list_size, page_count); |
| #if ENABLE_STATISTICS |
| heap->thread_to_global += list_size * page_count * PAGE_SIZE; |
| #endif |
| } |
| #endif |
| #endif |
| } |
| |
| //! Deallocate the given small/medium memory block from the given heap |
| static void |
| _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { |
| //Check if span is the currently active span in order to operate |
| //on the correct bookkeeping data |
| const count_t class_idx = span->size_class; |
| size_class_t* size_class = _memory_size_class + class_idx; |
| int is_active = (heap->active_span[class_idx] == span); |
| span_block_t* block_data = is_active ? |
| heap->active_block + class_idx : |
| &span->data.block; |
| |
| #if ENABLE_STATISTICS |
| heap->allocated -= size_class->size; |
| heap->requested -= *(size_t*)pointer_offset(p, size_class->size - sizeof(size_t)); |
| #endif |
| |
| //Check if the span will become completely free |
| if (block_data->free_count == ((count_t)size_class->block_count - 1)) { |
| //Track counters |
| size_t span_class_idx = _span_class_from_page_count(size_class->page_count); |
| assert(heap->span_counter[span_class_idx].current_allocations > 0); |
| --heap->span_counter[span_class_idx].current_allocations; |
| |
		//If the span was active, reset the counter. Otherwise, remove it from the
		//partial free list if it had a previous free block (guard for classes with only 1 block)
| if (is_active) |
| block_data->free_count = 0; |
| else if (block_data->free_count > 0) |
| _memory_list_remove(&heap->size_cache[class_idx], span); |
| |
| //Add to span cache |
| _memory_heap_cache_insert(heap, span, size_class->page_count); |
| return; |
| } |
| |
| //Check if first free block for this span (previously fully allocated) |
| if (block_data->free_count == 0) { |
		//Add to free list and disable autolink
| _memory_list_add(&heap->size_cache[class_idx], span); |
| block_data->first_autolink = (uint16_t)size_class->block_count; |
| } |
| ++block_data->free_count; |
| //Span is not yet completely free, so add block to the linked list of free blocks |
| void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); |
| count_t block_offset = (count_t)pointer_diff(p, blocks_start); |
| count_t block_idx = block_offset / (count_t)size_class->size; |
| uint32_t* block = pointer_offset(blocks_start, block_idx * size_class->size); |
| *block = block_data->free_list; |
| block_data->free_list = (uint16_t)block_idx; |
| } |
| |
| //! Deallocate the given large memory block from the given heap |
| static void |
| _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { |
	//Check if this is a single span, which aliases the 64KiB small/medium span caches
| if (span->size_class == SIZE_CLASS_COUNT) { |
| //Track counters |
| size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); |
| --heap->span_counter[span_class_idx].current_allocations; |
| //Add to span cache |
| _memory_heap_cache_insert(heap, span, SPAN_MAX_PAGE_COUNT); |
| return; |
| } |
| |
| //Decrease counter |
| size_t idx = span->size_class - SIZE_CLASS_COUNT; |
| span_counter_t* counter = heap->large_counter + idx; |
| assert(counter->current_allocations > 0); |
| --counter->current_allocations; |
| |
| #if MAX_SPAN_CACHE_DIVISOR == 0 |
| _memory_global_cache_large_insert(span, 1, idx + 1); |
| #else |
| //Insert into cache list |
| span_t** cache = heap->large_cache + idx; |
| span->next_span = *cache; |
| if (*cache) |
| span->data.list_size = (*cache)->data.list_size + 1; |
| else |
| span->data.list_size = 1; |
| *cache = span; |
| #if MAX_SPAN_CACHE_DIVISOR > 1 |
| //Check if cache exceeds limit |
| if ((span->data.list_size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && |
| (span->data.list_size > counter->cache_limit)) { |
| //Release to global cache |
| count_t list_size = 1; |
| span_t* next = span->next_span; |
| span_t* last = span; |
| while (list_size < MIN_SPAN_CACHE_RELEASE) { |
| last = next; |
| next = next->next_span; |
| ++list_size; |
| } |
| assert(next->next_span); |
| next->data.list_size = span->data.list_size - list_size; |
| last->next_span = 0; //Terminate list |
| *cache = next; |
| _memory_global_cache_large_insert(span, list_size, idx + 1); |
| #if ENABLE_STATISTICS |
| heap->thread_to_global += list_size * (idx + 1) * SPAN_MAX_SIZE; |
| #endif |
| } |
| #endif |
| #endif |
| } |
| |
| //! Process pending deferred cross-thread deallocations |
| static int |
| _memory_deallocate_deferred(heap_t* heap, size_t size_class) { |
| //Grab the current list of deferred deallocations |
| atomic_thread_fence_acquire(); |
| void* p = atomic_load_ptr(&heap->defer_deallocate); |
| if (!p) |
| return 0; |
| if (!atomic_cas_ptr(&heap->defer_deallocate, 0, p)) |
| return 0; |
	//Keep track of whether we deallocate anything of the given size class
| int got_class = 0; |
| do { |
| void* next = *(void**)p; |
| //Get span and check which type of block |
| span_t* span = (void*)((uintptr_t)p & SPAN_MASK); |
| if (span->size_class < SIZE_CLASS_COUNT) { |
| //Small/medium block |
| got_class |= (span->size_class == size_class); |
| _memory_deallocate_to_heap(heap, span, p); |
| } |
| else { |
| //Large block |
| got_class |= ((span->size_class >= size_class) && (span->size_class <= (size_class + 2))); |
| _memory_deallocate_large_to_heap(heap, span); |
| } |
| //Loop until all pending operations in list are processed |
| p = next; |
| } while (p); |
| return got_class; |
| } |
| |
| //! Defer deallocation of the given block to the given heap |
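//  The deferred list is a lock-free intrusive stack: the block's first bytes hold the
//  next pointer, and the owning heap drains the whole list with a single CAS to null
//  in _memory_deallocate_deferred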
| static void |
| _memory_deallocate_defer(int32_t heap_id, void* p) { |
	//Get the heap and link the pointer into the list of deferred operations
| heap_t* heap = _memory_heap_lookup(heap_id); |
| void* last_ptr; |
| do { |
| last_ptr = atomic_load_ptr(&heap->defer_deallocate); |
| *(void**)p = last_ptr; //Safe to use block, it's being deallocated |
| } while (!atomic_cas_ptr(&heap->defer_deallocate, p, last_ptr)); |
| } |
| |
| //! Allocate a block of the given size |
| static void* |
| _memory_allocate(size_t size) { |
| if (size <= MEDIUM_SIZE_LIMIT) |
| return _memory_allocate_from_heap(_memory_thread_heap, size); |
| else if (size <= LARGE_SIZE_LIMIT) |
| return _memory_allocate_large_from_heap(_memory_thread_heap, size); |
| |
| //Oversized, allocate pages directly |
| size += SPAN_HEADER_SIZE; |
| size_t num_pages = size / PAGE_SIZE; |
| if (size % PAGE_SIZE) |
| ++num_pages; |
| span_t* span = _memory_map(num_pages); |
| atomic_store32(&span->heap_id, 0); |
| //Store page count in next_span |
| span->next_span = (span_t*)((uintptr_t)num_pages); |
| |
| return pointer_offset(span, SPAN_HEADER_SIZE); |
| } |
| |
| //! Deallocate the given block |
| static void |
| _memory_deallocate(void* p) { |
| if (!p) |
| return; |
| |
| //Grab the span (always at start of span, using 64KiB alignment) |
| span_t* span = (void*)((uintptr_t)p & SPAN_MASK); |
| int32_t heap_id = atomic_load32(&span->heap_id); |
| heap_t* heap = _memory_thread_heap; |
| //Check if block belongs to this heap or if deallocation should be deferred |
| if (heap_id == heap->id) { |
| if (span->size_class < SIZE_CLASS_COUNT) |
| _memory_deallocate_to_heap(heap, span, p); |
| else |
| _memory_deallocate_large_to_heap(heap, span); |
| } |
| else if (heap_id > 0) { |
| _memory_deallocate_defer(heap_id, p); |
| } |
| else { |
| //Oversized allocation, page count is stored in next_span |
| size_t num_pages = (size_t)span->next_span; |
| _memory_unmap(span, num_pages); |
| } |
| } |
| |
| //! Reallocate the given block to the given size |
| static void* |
| _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { |
| if (p) { |
| //Grab the span (always at start of span, using 64KiB alignment) |
| span_t* span = (void*)((uintptr_t)p & SPAN_MASK); |
| int32_t heap_id = atomic_load32(&span->heap_id); |
| if (heap_id) { |
| if (span->size_class < SIZE_CLASS_COUNT) { |
| //Small/medium sized block |
| size_class_t* size_class = _memory_size_class + span->size_class; |
| if ((size_t)size_class->size >= size) |
| return p; //Still fits in block, never mind trying to save memory |
| if (!oldsize) |
| oldsize = size_class->size; |
| } |
| else { |
| //Large block |
| size_t total_size = size + SPAN_HEADER_SIZE; |
| size_t num_spans = total_size / SPAN_MAX_SIZE; |
| if (total_size % SPAN_MAX_SIZE) |
| ++num_spans; |
| size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; |
| if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) |
| return p; //Still fits and less than half of memory would be freed |
| if (!oldsize) |
| oldsize = (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE; |
| } |
| } |
| else { |
| //Oversized block |
| size_t total_size = size + SPAN_HEADER_SIZE; |
| size_t num_pages = total_size / PAGE_SIZE; |
| if (total_size % PAGE_SIZE) |
| ++num_pages; |
| //Page count is stored in next_span |
| size_t current_pages = (size_t)span->next_span; |
| if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) |
| return p; //Still fits and less than half of memory would be freed |
| if (!oldsize) |
| oldsize = (current_pages * (size_t)PAGE_SIZE) - SPAN_HEADER_SIZE; |
| } |
| } |
| |
| //Size is greater than block size, need to allocate a new block and deallocate the old |
	//Avoid hysteresis by overallocating if increase is small (below 37.5%)
| size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); |
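	//e.g. an oldsize of 1000 bytes gives a lower bound of 1000 + 250 + 125 = 1375 bytes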
| void* block = _memory_allocate(size > lower_bound ? size : lower_bound); |
| if (p) { |
| if (!(flags & RPMALLOC_NO_PRESERVE)) |
| memcpy(block, p, oldsize < size ? oldsize : size); |
| _memory_deallocate(p); |
| } |
| |
| return block; |
| } |
| |
| //! Get the usable size of the given block |
| static size_t |
| _memory_usable_size(void* p) { |
| //Grab the span (always at start of span, using 64KiB alignment) |
| span_t* span = (void*)((uintptr_t)p & SPAN_MASK); |
| int32_t heap_id = atomic_load32(&span->heap_id); |
| if (heap_id) { |
| if (span->size_class < SIZE_CLASS_COUNT) { |
| //Small/medium block |
| size_class_t* size_class = _memory_size_class + span->size_class; |
| return size_class->size; |
| } |
| |
| //Large block |
| size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; |
| return (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE; |
| } |
| |
| //Oversized block, page count is stored in next_span |
| size_t current_pages = (size_t)span->next_span; |
| return (current_pages * (size_t)PAGE_SIZE) - SPAN_HEADER_SIZE; |
| } |
| |
| //! Adjust and optimize the size class properties for the given class |
| static void |
| _memory_adjust_size_class(size_t iclass) { |
| //Calculate how many pages are needed for 255 blocks |
| size_t block_size = _memory_size_class[iclass].size; |
| size_t page_count = (block_size * 255) / PAGE_SIZE; |
| //Cap to 16 pages (64KiB span granularity) |
| page_count = (page_count == 0) ? 1 : ((page_count > 16) ? 16 : page_count); |
| //Merge page counts to span size class granularity |
| page_count = ((page_count + (SPAN_CLASS_GRANULARITY - 1)) / SPAN_CLASS_GRANULARITY) * SPAN_CLASS_GRANULARITY; |
| if (page_count > 16) |
| page_count = 16; |
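	//e.g. a 512 byte block class needs (512 * 255) / 4096 = 31 pages, capped to 16,
	//giving ((16 * 4096) - 32) / 512 = 127 blocks per span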
| size_t block_count = ((page_count * PAGE_SIZE) - SPAN_HEADER_SIZE) / block_size; |
| //Store the final configuration |
| _memory_size_class[iclass].page_count = (uint16_t)page_count; |
| _memory_size_class[iclass].block_count = (uint16_t)block_count; |
| _memory_size_class[iclass].class_idx = (uint16_t)iclass; |
| |
| //Check if previous size classes can be merged |
| size_t prevclass = iclass; |
| while (prevclass > 0) { |
| --prevclass; |
| //A class can be merged if number of pages and number of blocks are equal |
| if ((_memory_size_class[prevclass].page_count == _memory_size_class[iclass].page_count) && |
| (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)) { |
| memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); |
| } |
| else { |
| break; |
| } |
| } |
| } |
| |
| #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) |
| # include <windows.h> |
| #else |
| # include <sys/mman.h> |
| # include <sched.h> |
| # include <errno.h> |
| # ifndef MAP_UNINITIALIZED |
| # define MAP_UNINITIALIZED 0 |
| # endif |
| #endif |
| |
| //! Initialize the allocator and setup global data |
| int |
| rpmalloc_initialize(void) { |
| #ifdef PLATFORM_WINDOWS |
| SYSTEM_INFO system_info; |
| memset(&system_info, 0, sizeof(system_info)); |
| GetSystemInfo(&system_info); |
| if (system_info.dwAllocationGranularity < SPAN_ADDRESS_GRANULARITY) |
| return -1; |
| #else |
| #if ARCH_64BIT |
| atomic_store64(&_memory_addr, 0x1000000000ULL); |
| #else |
| atomic_store64(&_memory_addr, 0x1000000ULL); |
| #endif |
| #endif |
| |
| atomic_store32(&_memory_heap_id, 0); |
| |
| //Setup all small and medium size classes |
| size_t iclass; |
| for (iclass = 0; iclass < SMALL_CLASS_COUNT; ++iclass) { |
| size_t size = (iclass + 1) * SMALL_GRANULARITY; |
| _memory_size_class[iclass].size = (uint16_t)size; |
| _memory_adjust_size_class(iclass); |
| } |
| for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { |
| size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); |
| if (size > MEDIUM_SIZE_LIMIT) |
| size = MEDIUM_SIZE_LIMIT; |
| _memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size; |
| _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); |
| } |
| |
| //Initialize this thread |
| rpmalloc_thread_initialize(); |
| return 0; |
| } |
| |
| //! Finalize the allocator |
| void |
| rpmalloc_finalize(void) { |
| atomic_thread_fence_acquire(); |
| |
| //Free all thread caches |
| for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { |
| heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); |
| while (heap) { |
| _memory_deallocate_deferred(heap, 0); |
| |
| for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { |
| const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; |
| span_t* span = heap->span_cache[iclass]; |
| unsigned int span_count = span ? span->data.list_size : 0; |
| for (unsigned int ispan = 0; ispan < span_count; ++ispan) { |
| span_t* next_span = span->next_span; |
| _memory_unmap(span, page_count); |
| span = next_span; |
| } |
| } |
| |
| //Free large spans |
| for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { |
| const size_t span_count = iclass + 1; |
| span_t* span = heap->large_cache[iclass]; |
| while (span) { |
| span_t* next_span = span->next_span; |
| _memory_unmap(span, span_count * SPAN_MAX_PAGE_COUNT); |
| span = next_span; |
| } |
| } |
| |
| heap_t* next_heap = heap->next_heap; |
| _memory_unmap(heap, 2); |
| heap = next_heap; |
| } |
| |
| atomic_store_ptr(&_memory_heaps[list_idx], 0); |
| } |
| atomic_store_ptr(&_memory_orphan_heaps, 0); |
| |
| //Free global caches |
| for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { |
| void* span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); |
| size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; |
| span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); |
| while (cache_count) { |
| span_t* skip_span = span->prev_span; |
| unsigned int span_count = span->data.list_size; |
| for (unsigned int ispan = 0; ispan < span_count; ++ispan) { |
| span_t* next_span = span->next_span; |
| _memory_unmap(span, (iclass + 1) * SPAN_CLASS_GRANULARITY); |
| span = next_span; |
| } |
| span = skip_span; |
| cache_count -= span_count; |
| } |
| atomic_store_ptr(&_memory_span_cache[iclass], 0); |
| } |
| |
| for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { |
| void* span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); |
| size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; |
| span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); |
| while (cache_count) { |
| span_t* skip_span = span->prev_span; |
| unsigned int span_count = span->data.list_size; |
| for (unsigned int ispan = 0; ispan < span_count; ++ispan) { |
| span_t* next_span = span->next_span; |
| _memory_unmap(span, (iclass + 1) * SPAN_MAX_PAGE_COUNT); |
| span = next_span; |
| } |
| span = skip_span; |
| cache_count -= span_count; |
| } |
| atomic_store_ptr(&_memory_large_cache[iclass], 0); |
| } |
| |
| atomic_thread_fence_release(); |
| } |
| |
| //! Initialize thread, assign heap |
| void |
| rpmalloc_thread_initialize(void) { |
| if (!_memory_thread_heap) { |
| heap_t* heap = _memory_allocate_heap(); |
| #if ENABLE_STATISTICS |
| heap->thread_to_global = 0; |
| heap->global_to_thread = 0; |
| #endif |
| _memory_thread_heap = heap; |
| atomic_incr32(&_memory_active_heaps); |
| } |
| } |
| |
| //! Finalize thread, orphan heap |
| void |
| rpmalloc_thread_finalize(void) { |
| heap_t* heap = _memory_thread_heap; |
| if (!heap) |
| return; |
| |
| atomic_add32(&_memory_active_heaps, -1); |
| |
| _memory_deallocate_deferred(heap, 0); |
| |
| //Release thread cache spans back to global cache |
| for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { |
| const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; |
| span_t* span = heap->span_cache[iclass]; |
| while (span) { |
| if (span->data.list_size > MIN_SPAN_CACHE_RELEASE) { |
| count_t list_size = 1; |
| span_t* next = span->next_span; |
| span_t* last = span; |
| while (list_size < MIN_SPAN_CACHE_RELEASE) { |
| last = next; |
| next = next->next_span; |
| ++list_size; |
| } |
| last->next_span = 0; //Terminate list |
| next->data.list_size = span->data.list_size - list_size; |
| _memory_global_cache_insert(span, list_size, page_count); |
| span = next; |
| } |
| else { |
| _memory_global_cache_insert(span, span->data.list_size, page_count); |
| span = 0; |
| } |
| } |
| heap->span_cache[iclass] = 0; |
| } |
| |
| for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { |
| const size_t span_count = iclass + 1; |
| span_t* span = heap->large_cache[iclass]; |
| while (span) { |
| if (span->data.list_size > MIN_SPAN_CACHE_RELEASE) { |
| count_t list_size = 1; |
| span_t* next = span->next_span; |
| span_t* last = span; |
| while (list_size < MIN_SPAN_CACHE_RELEASE) { |
| last = next; |
| next = next->next_span; |
| ++list_size; |
| } |
| last->next_span = 0; //Terminate list |
| next->data.list_size = span->data.list_size - list_size; |
| _memory_global_cache_large_insert(span, list_size, span_count); |
| span = next; |
| } |
| else { |
| _memory_global_cache_large_insert(span, span->data.list_size, span_count); |
| span = 0; |
| } |
| } |
| heap->large_cache[iclass] = 0; |
| } |
| |
| //Reset allocation counters |
| memset(heap->span_counter, 0, sizeof(heap->span_counter)); |
| memset(heap->large_counter, 0, sizeof(heap->large_counter)); |
| #if ENABLE_STATISTICS |
| heap->requested = 0; |
| heap->allocated = 0; |
| heap->thread_to_global = 0; |
| heap->global_to_thread = 0; |
| #endif |
| |
| //Orphan the heap |
| heap_t* last_heap; |
| do { |
| last_heap = atomic_load_ptr(&_memory_orphan_heaps); |
| heap->next_orphan = last_heap; |
| } |
| while (!atomic_cas_ptr(&_memory_orphan_heaps, heap, last_heap)); |
| |
| _memory_thread_heap = 0; |
| } |
| |
| int |
| rpmalloc_is_thread_initialized(void) { |
| return (_memory_thread_heap != 0) ? 1 : 0; |
| } |
| |
| //! Map new pages to virtual memory |
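//  Spans must start on SPAN_ADDRESS_GRANULARITY (64KiB) boundaries so a block pointer can
//  be mapped back to its span with a simple mask. VirtualAlloc guarantees 64KiB allocation
//  granularity; the POSIX path emulates it with a linear address hint counter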
| static void* |
| _memory_map(size_t page_count) { |
| size_t total_size = page_count * PAGE_SIZE; |
| void* pages_ptr = 0; |
| |
| #if ENABLE_STATISTICS |
| atomic_add32(&_mapped_pages, (int32_t)page_count); |
| atomic_add32(&_mapped_total, (int32_t)page_count); |
| #endif |
| |
| #ifdef PLATFORM_WINDOWS |
| pages_ptr = VirtualAlloc(0, total_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); |
| #else |
| //mmap lacks a way to set 64KiB address granularity, implement it locally |
| intptr_t incr = (intptr_t)total_size / (intptr_t)SPAN_ADDRESS_GRANULARITY; |
| if (total_size % SPAN_ADDRESS_GRANULARITY) |
| ++incr; |
| do { |
| void* base_addr = (void*)(uintptr_t)atomic_exchange_and_add64(&_memory_addr, |
| (incr * (intptr_t)SPAN_ADDRESS_GRANULARITY)); |
| pages_ptr = mmap(base_addr, total_size, PROT_READ | PROT_WRITE, |
| MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); |
| if (pages_ptr != MAP_FAILED) { |
| if (pages_ptr != base_addr) { |
| void* new_base = (void*)((uintptr_t)pages_ptr & SPAN_MASK); |
| atomic_store64(&_memory_addr, (int64_t)((uintptr_t)new_base) + |
| ((incr + 1) * (intptr_t)SPAN_ADDRESS_GRANULARITY)); |
| atomic_thread_fence_release(); |
| } |
| if (!((uintptr_t)pages_ptr & ~SPAN_MASK)) |
| break; |
| munmap(pages_ptr, total_size); |
| } |
| } |
| while (1); |
| #endif |
| |
| return pages_ptr; |
| } |
| |
| //! Unmap pages from virtual memory |
| static void |
| _memory_unmap(void* ptr, size_t page_count) { |
| #if ENABLE_STATISTICS |
| atomic_add32(&_mapped_pages, -(int32_t)page_count); |
| atomic_add32(&_unmapped_total, (int32_t)page_count); |
| #endif |
| |
| #ifdef PLATFORM_WINDOWS |
| VirtualFree(ptr, 0, MEM_RELEASE); |
| #else |
| munmap(ptr, PAGE_SIZE * page_count); |
| #endif |
| } |
| |
| static FORCEINLINE int |
| atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { |
| #ifdef _MSC_VER |
| # if ARCH_64BIT |
| return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic, |
| (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; |
| # else |
| return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, |
| (long)val, (long)ref) == (long)ref) ? 1 : 0; |
| # endif |
| #else |
| return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); |
| #endif |
| } |
| |
//! Yield the remainder of the thread's timeslice
| static void |
| thread_yield(void) { |
| #ifdef PLATFORM_WINDOWS |
| YieldProcessor(); |
| #else |
| sched_yield(); |
| #endif |
| } |
| |
| // Extern interface |
| |
| void* |
| rpmalloc(size_t size) { |
| #if ENABLE_VALIDATE_ARGS |
| if (size >= MAX_ALLOC_SIZE) { |
| errno = EINVAL; |
| return 0; |
| } |
| #endif |
| return _memory_allocate(size); |
| } |
| |
| void |
| rpfree(void* ptr) { |
| _memory_deallocate(ptr); |
| } |
| |
| void* |
| rpcalloc(size_t num, size_t size) { |
| size_t total; |
| #if ENABLE_VALIDATE_ARGS |
| #ifdef PLATFORM_WINDOWS |
| int err = SizeTMult(num, size, &total); |
| if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { |
| errno = EINVAL; |
| return 0; |
| } |
| #else |
#if ARCH_64BIT
	//size_t matches unsigned long on LP64 targets
	int err = __builtin_umull_overflow(num, size, &total);
#else
	//size_t matches unsigned int on ILP32 targets
	int err = __builtin_umul_overflow(num, size, &total);
#endif
| if (err || (total >= MAX_ALLOC_SIZE)) { |
| errno = EINVAL; |
| return 0; |
| } |
| #endif |
| #else |
| total = num * size; |
| #endif |
| void* ptr = _memory_allocate(total); |
| memset(ptr, 0, total); |
| return ptr; |
| } |
| |
| void* |
| rprealloc(void* ptr, size_t size) { |
| #if ENABLE_VALIDATE_ARGS |
| if (size >= MAX_ALLOC_SIZE) { |
| errno = EINVAL; |
| return ptr; |
| } |
| #endif |
| return _memory_reallocate(ptr, size, 0, 0); |
| } |
| |
| void* |
| rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, |
| unsigned int flags) { |
| #if ENABLE_VALIDATE_ARGS |
| if (size + alignment < size) { |
| errno = EINVAL; |
| return 0; |
| } |
| #endif |
| //TODO: If alignment > 16, we need to copy to new aligned position |
| (void)sizeof(alignment); |
| return _memory_reallocate(ptr, size, oldsize, flags); |
| } |
| |
| void* |
| rpaligned_alloc(size_t alignment, size_t size) { |
| if (alignment <= 16) |
| return rpmalloc(size); |
| |
| #if ENABLE_VALIDATE_ARGS |
| if (size + alignment < size) { |
| errno = EINVAL; |
| return 0; |
| } |
| #endif |
| |
| void* ptr = rpmalloc(size + alignment); |
| if ((uintptr_t)ptr & (alignment - 1)) |
| ptr = (void*)(((uintptr_t)ptr & ~((uintptr_t)alignment - 1)) + alignment); |
| return ptr; |
| } |
| |
| void* |
| rpmemalign(size_t alignment, size_t size) { |
| return rpaligned_alloc(alignment, size); |
| } |
| |
| int |
| rpposix_memalign(void **memptr, size_t alignment, size_t size) { |
| if (memptr) |
| *memptr = rpaligned_alloc(alignment, size); |
| else |
| return EINVAL; |
| return *memptr ? 0 : ENOMEM; |
| } |
| |
| size_t |
| rpmalloc_usable_size(void* ptr) { |
| return ptr ? _memory_usable_size(ptr) : 0; |
| } |
| |
| void |
| rpmalloc_thread_collect(void) { |
| _memory_deallocate_deferred(_memory_thread_heap, 0); |
| } |
| |
| void |
| rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { |
| memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); |
| heap_t* heap = _memory_thread_heap; |
| #if ENABLE_STATISTICS |
| stats->allocated = heap->allocated; |
| stats->requested = heap->requested; |
| #endif |
| void* p = atomic_load_ptr(&heap->defer_deallocate); |
	while (p) {
		void* next = *(void**)p;
		span_t* span = (void*)((uintptr_t)p & SPAN_MASK);
		//Avoid indexing the size class table with large block classes
		if (span->size_class < SIZE_CLASS_COUNT)
			stats->deferred += _memory_size_class[span->size_class].size;
		else
			stats->deferred += ((size_t)(span->size_class - SIZE_CLASS_COUNT) + 1) * SPAN_MAX_SIZE;
		p = next;
	}
| |
| for (size_t isize = 0; isize < SIZE_CLASS_COUNT; ++isize) { |
| if (heap->active_block[isize].free_count) |
| stats->active += heap->active_block[isize].free_count * _memory_size_class[heap->active_span[isize]->size_class].size; |
| |
| span_t* cache = heap->size_cache[isize]; |
| while (cache) { |
			stats->sizecache += cache->data.block.free_count * _memory_size_class[cache->size_class].size;
| cache = cache->next_span; |
| } |
| } |
| |
| for (size_t isize = 0; isize < SPAN_CLASS_COUNT; ++isize) { |
| if (heap->span_cache[isize]) |
			stats->spancache += (size_t)heap->span_cache[isize]->data.list_size * (isize + 1) * SPAN_CLASS_GRANULARITY * PAGE_SIZE;
| } |
| } |
| |
| void |
| rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { |
| memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); |
| #if ENABLE_STATISTICS |
| stats->mapped = (size_t)atomic_load32(&_mapped_pages) * PAGE_SIZE; |
| stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * PAGE_SIZE; |
| stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * PAGE_SIZE; |
| #endif |
| for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { |
| void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); |
| while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { |
| thread_yield(); |
| global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); |
| } |
| uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; |
| size_t list_bytes = global_span_count * (iclass + 1) * SPAN_CLASS_GRANULARITY * PAGE_SIZE; |
| stats->cached += list_bytes; |
| } |
| for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { |
| void* global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); |
| while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { |
| thread_yield(); |
| global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); |
| } |
| uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; |
| size_t list_bytes = global_span_count * (iclass + 1) * SPAN_MAX_PAGE_COUNT * PAGE_SIZE; |
| stats->cached_large += list_bytes; |
| } |
| } |