Optimize away the tsd_fast() check on free fastpath.

To ensure that the free fastpath can tolerate an uninitialized tsd, improve the
static initializer for rtree_ctx in tsd.
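
For illustration, a minimal standalone sketch of the two ideas in this patch:
a compile-time initializer that invalidates every cache slot, and folding the
tsd_fast() check into the existing deallocation-threshold comparison.  The
names here (cache_elm_t, free_fast_sketch, etc.) are invented for the sketch
and are not jemalloc identifiers:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical cache entry mirroring the rtree_ctx cache slots. */
    typedef struct {
        uintptr_t leafkey;
        void *leaf;
    } cache_elm_t;

    /*
     * A leafkey that no valid lookup can produce; the doubling macros
     * expand to N comma-separated invalid entries, giving a static
     * initializer that marks every slot empty before tsd init runs.
     */
    #define CACHE_LEAFKEY_INVALID ((uintptr_t)1)
    #define CACHE_ELM_INVALID {CACHE_LEAFKEY_INVALID, NULL}
    #define CACHE_INIT_ELM_1 CACHE_ELM_INVALID
    #define CACHE_INIT_ELM_2 CACHE_INIT_ELM_1, CACHE_INIT_ELM_1
    #define CACHE_INIT_ELM_4 CACHE_INIT_ELM_2, CACHE_INIT_ELM_2

    static cache_elm_t cache[4] = {CACHE_INIT_ELM_4};

    /*
     * Folding a "thread is fast?" check into an existing bound test: a
     * non-fast thread keeps threshold == 0, so deallocated + usize >=
     * threshold is always true (unsigned) and the caller takes the slow
     * path with no separate tsd_fast() branch.
     */
    static bool
    free_fast_sketch(uint64_t deallocated, uint64_t threshold,
        size_t usize) {
        uint64_t deallocated_after = deallocated + (uint64_t)usize;
        return !(deallocated_after >= threshold);
    }

    int
    main(void) {
        assert(cache[3].leafkey == CACHE_LEAFKEY_INVALID);
        /* threshold == 0 forces the fallback regardless of counters. */
        assert(!free_fast_sketch(0, 0, 8));
        return 0;
    }

The actual patch extends the doubling chain to RTREE_CTX_INIT_ELM_16 and picks
the initializer by token pasting on RTREE_CTX_NCACHE / RTREE_CTX_NCACHE_L2, as
shown in the rtree_tsd.h hunk below.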
diff --git a/include/jemalloc/internal/emap.h b/include/jemalloc/internal/emap.h
index a40b504..87ece63 100644
--- a/include/jemalloc/internal/emap.h
+++ b/include/jemalloc/internal/emap.h
@@ -276,12 +276,14 @@
 }
 
 /*
- * Returns true on error.
+ * Only used on the free fastpath.  Returns true when the lookup cannot be
+ * fulfilled by the fast path, e.g. when the metadata key is not cached.
  */
 JEMALLOC_ALWAYS_INLINE bool
 emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr,
     emap_alloc_ctx_t *alloc_ctx) {
-	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
+	/* Use the unsafe getter since this may get called during exit. */
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd);
 
 	rtree_metadata_t metadata;
 	bool err = rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree,
diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h
index c5f0d8c..b4f4484 100644
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@@ -35,9 +35,6 @@
 #  define RTREE_LEAF_COMPACT
 #endif
 
-/* Needed for initialization only. */
-#define RTREE_LEAFKEY_INVALID ((uintptr_t)1)
-
 typedef struct rtree_node_elm_s rtree_node_elm_t;
 struct rtree_node_elm_s {
 	atomic_p_t	child; /* (rtree_{node,leaf}_elm_t *) */
diff --git a/include/jemalloc/internal/rtree_tsd.h b/include/jemalloc/internal/rtree_tsd.h
index 562e292..e45525c 100644
--- a/include/jemalloc/internal/rtree_tsd.h
+++ b/include/jemalloc/internal/rtree_tsd.h
@@ -18,16 +18,28 @@
  * cache misses if made overly large, plus the cost of linear search in the LRU
  * cache.
  */
-#define RTREE_CTX_LG_NCACHE 4
-#define RTREE_CTX_NCACHE (1 << RTREE_CTX_LG_NCACHE)
+#define RTREE_CTX_NCACHE 16
 #define RTREE_CTX_NCACHE_L2 8
 
-/*
- * Zero initializer required for tsd initialization only.  Proper initialization
- * done via rtree_ctx_data_init().
- */
-#define RTREE_CTX_ZERO_INITIALIZER {{{0, 0}}, {{0, 0}}}
+/* Needed for initialization only. */
+#define RTREE_LEAFKEY_INVALID ((uintptr_t)1)
+#define RTREE_CTX_CACHE_ELM_INVALID {RTREE_LEAFKEY_INVALID, NULL}
 
+#define RTREE_CTX_INIT_ELM_1 RTREE_CTX_CACHE_ELM_INVALID
+#define RTREE_CTX_INIT_ELM_2 RTREE_CTX_INIT_ELM_1, RTREE_CTX_INIT_ELM_1
+#define RTREE_CTX_INIT_ELM_4 RTREE_CTX_INIT_ELM_2, RTREE_CTX_INIT_ELM_2
+#define RTREE_CTX_INIT_ELM_8 RTREE_CTX_INIT_ELM_4, RTREE_CTX_INIT_ELM_4
+#define RTREE_CTX_INIT_ELM_16 RTREE_CTX_INIT_ELM_8, RTREE_CTX_INIT_ELM_8
+
+#define _RTREE_CTX_INIT_ELM_DATA(n) RTREE_CTX_INIT_ELM_##n
+#define RTREE_CTX_INIT_ELM_DATA(n) _RTREE_CTX_INIT_ELM_DATA(n)
+
+/*
+ * Static initializer (to invalidate the cache entries) is required because the
+ * free fastpath may access the rtree cache before a full tsd initialization.
+ */
+#define RTREE_CTX_INITIALIZER {{RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE)}, \
+			       {RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE_L2)}}
 
 typedef struct rtree_leaf_elm_s rtree_leaf_elm_t;
 
diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h
index 525019b..2f4e1b3 100644
--- a/include/jemalloc/internal/thread_event.h
+++ b/include/jemalloc/internal/thread_event.h
@@ -118,17 +118,10 @@
 }
 
 JEMALLOC_ALWAYS_INLINE void
-te_free_fastpath_ctx(tsd_t *tsd, uint64_t *deallocated, uint64_t *threshold,
-    bool size_hint) {
-	if (!size_hint) {
-		*deallocated = tsd_thread_deallocated_get(tsd);
-		*threshold = tsd_thread_deallocated_next_event_fast_get(tsd);
-	} else {
-		/* Unsafe getters since this may happen before tsd_init. */
-		*deallocated = *tsd_thread_deallocatedp_get_unsafe(tsd);
-		*threshold =
-		    *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd);
-	}
+te_free_fastpath_ctx(tsd_t *tsd, uint64_t *deallocated, uint64_t *threshold) {
+	/* Unsafe getters since this may happen before tsd_init. */
+	*deallocated = *tsd_thread_deallocatedp_get_unsafe(tsd);
+	*threshold = *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd);
 	assert(*threshold <= TE_NEXT_EVENT_FAST_MAX);
 }
 
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 86d5277..0a46d44 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -119,7 +119,7 @@
     /* activity_callback_thunk */					\
 	ACTIVITY_CALLBACK_THUNK_INITIALIZER,				\
     /* tcache_slow */		TCACHE_SLOW_ZERO_INITIALIZER,		\
-    /* rtree_ctx */		RTREE_CTX_ZERO_INITIALIZER,
+    /* rtree_ctx */		RTREE_CTX_INITIALIZER,
 
 /*  O(name,			type,			nullable type) */
 #define TSD_DATA_FAST							\
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 469a491..0c798c8 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -2910,12 +2910,20 @@
 JEMALLOC_ALWAYS_INLINE
 bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	tsd_t *tsd = tsd_get(false);
+	/* The branch gets optimized away unless tsd_get_allocates(). */
+	if (unlikely(tsd == NULL)) {
+		return false;
+	}
+	/*
+	 * The tsd_fast() / initialized checks are folded into the branch
+	 * testing (deallocated_after >= threshold) later in this function.
+	 * The threshold will be set to 0 when !tsd_fast.
+	 */
+	assert(tsd_fast(tsd) ||
+	    *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0);
 
 	emap_alloc_ctx_t alloc_ctx;
 	if (!size_hint) {
-		if (unlikely(tsd == NULL || !tsd_fast(tsd))) {
-			return false;
-		}
 		bool err = emap_alloc_ctx_try_lookup_fast(tsd,
 		    &arena_emap_global, ptr, &alloc_ctx);
 
@@ -2926,15 +2934,6 @@
 		assert(alloc_ctx.szind != SC_NSIZES);
 	} else {
 		/*
-		 * The size hinted fastpath does not involve rtree lookup, thus
-		 * can tolerate an uninitialized tsd.  This allows the tsd_fast
-		 * check to be folded into the branch testing fast_threshold
-		 * (set to 0 when !tsd_fast).
-		 */
-		if (unlikely(tsd == NULL)) {
-			return false;
-		}
-		/*
 		 * Check for both sizes that are too large, and for sampled
 		 * objects.  Sampled objects are always page-aligned.  The
 		 * sampled object check will also check for null ptr.
@@ -2949,7 +2948,7 @@
 	}
 
 	uint64_t deallocated, threshold;
-	te_free_fastpath_ctx(tsd, &deallocated, &threshold, size_hint);
+	te_free_fastpath_ctx(tsd, &deallocated, &threshold);
 
 	size_t usize = sz_index2size(alloc_ctx.szind);
 	uint64_t deallocated_after = deallocated + usize;
@@ -2963,7 +2962,7 @@
 	if (unlikely(deallocated_after >= threshold)) {
 		return false;
 	}
-
+	assert(tsd_fast(tsd));
 	bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx);
 	if (fail) {
 		/* See the comment in isfree. */
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 3f3ca73..205d870 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -48,6 +48,13 @@
 	int d = (int)(uintptr_t)arg;
 	void *p;
 
+	/*
+	 * Test free before tsd init -- the free fast path (which does not
+	 * explicitly check for NULL) has to tolerate this case and fall back
+	 * to free_default.
+	 */
+	free(NULL);
+
 	tsd_t *tsd = tsd_fetch();
 	expect_x_eq(tsd_test_data_get(tsd), MALLOC_TSD_TEST_DATA_INIT,
 	    "Initial tsd get should return initialization value");