Improve exit handling under forced _exit (#186)

diff --git a/CHANGELOG b/CHANGELOG
index 2820daf..1a93962 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+1.4.2
+
+Fixed an issue where calling _exit might hang the main thread cleanup in rpmalloc if another
+worker thread was terminated while holding exclusive access to the global cache.
+
+
 1.4.1
 
 Dual license as both released to public domain or under MIT license
diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c
index 12acbe0..a10d443 100644
--- a/rpmalloc/malloc.c
+++ b/rpmalloc/malloc.c
@@ -296,8 +296,33 @@
 	return TRUE;
 }
 
+//end BUILD_DYNAMIC_LINK
+#else
+
+extern void
+_global_rpmalloc_init(void) {
+	rpmalloc_set_main_thread();
+	rpmalloc_initialize();
+}
+
+#if defined(__clang__) || defined(__GNUC__)
+
+static void __attribute__((constructor))
+initializer(void) {
+	_global_rpmalloc_init();
+}
+
+#elif defined(_MSC_VER)
+
+#pragma section(".CRT$XIB",read)
+__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) = _global_rpmalloc_init;
+#pragma comment(linker, "/include:_rpmalloc_module_init")
+
 #endif
 
+//end !BUILD_DYNAMIC_LINK
+#endif 
+
 #else
 
 #include <pthread.h>
@@ -305,6 +330,9 @@
 #include <stdint.h>
 #include <unistd.h>
 
+extern void
+rpmalloc_set_main_thread(void);
+
 static pthread_key_t destructor_key;
 
 static void
@@ -312,6 +340,7 @@
 
 static void __attribute__((constructor))
 initializer(void) {
+	rpmalloc_set_main_thread();
 	rpmalloc_initialize();
 	pthread_key_create(&destructor_key, thread_destructor);
 }
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c
index a23d62a..ed00a56 100644
--- a/rpmalloc/rpmalloc.c
+++ b/rpmalloc/rpmalloc.c
@@ -149,11 +149,6 @@
 #if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
 #include <fibersapi.h>
 static DWORD fls_key;
-static void NTAPI
-_rpmalloc_thread_destructor(void* value) {
-	if (value)
-		rpmalloc_thread_finalize();
-}
 #endif
 
 #if PLATFORM_POSIX
@@ -570,6 +565,8 @@
 
 //! Initialized flag
 static int _rpmalloc_initialized;
+//! Main thread ID
+static uintptr_t _rpmalloc_main_thread_id;
 //! Configuration
 static rpmalloc_config_t _memory_config;
 //! Memory page size
@@ -731,6 +728,30 @@
 		heap->owner_thread = get_thread_id();
 }
 
+//! Set main thread ID
+extern void
+rpmalloc_set_main_thread(void);
+
+void
+rpmalloc_set_main_thread(void) {
+	_rpmalloc_main_thread_id = get_thread_id();
+}
+
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+static void NTAPI
+_rpmalloc_thread_destructor(void* value) {
+#if ENABLE_OVERRIDE
+	// If this is called on the main thread it means rpmalloc_finalize
+	// has not been called and shutdown is forced (through _exit) or unclean
+	if (get_thread_id() == _rpmalloc_main_thread_id)
+		return;
+#endif
+	if (value)
+		rpmalloc_thread_finalize();
+}
+#endif
+
+
 ////////////
 ///
 /// Low level memory map/unmap
@@ -1771,8 +1792,12 @@
 	assert(atomic_load32(&_memory_active_heaps) >= 0);
 #endif
 
-	while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
-		/* Spin */;
+	// The main thread may be finalizing during a forced _exit, where the state of
+	// the lock atomic is unknown; skip acquiring it rather than risk hanging in the spin
+	if (get_thread_id() != _rpmalloc_main_thread_id) {
+		while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+			/* Spin */;
+	}
 	_rpmalloc_heap_orphan(heap, first_class);
 	atomic_store32_release(&_memory_global_lock, 0);
 }