Make dss operations lockless.

Rather than protecting dss operations with a mutex, use atomic
operations.  This has negligible impact on synchronization overhead
during typical dss allocation, but is a substantial improvement for
extent_in_dss() and the newly added extent_dss_mergeable(), which can be
called multiple times during extent deallocations.  Because the dss
mutex no longer exists, the extent/dss prefork and postfork hooks are
removed as well.

This change also has the advantage of avoiding tsd access (the
tsdn_fetch() calls in the default extent dalloc and merge hooks) in
deallocation paths associated with purging, which resolves potential
deadlocks during thread exit due to attempted tsd resurrection.

This resolves #425.
diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h
index 528759b..08d3036 100644
--- a/include/jemalloc/internal/extent.h
+++ b/include/jemalloc/internal/extent.h
@@ -127,9 +127,6 @@
     size_t usize_a, size_t size_b, size_t usize_b);
 bool	extent_merge_wrapper(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *a, extent_t *b);
-void	extent_prefork(tsdn_t *tsdn);
-void	extent_postfork_parent(tsdn_t *tsdn);
-void	extent_postfork_child(tsdn_t *tsdn);
 
 bool	extent_boot(void);
 
diff --git a/include/jemalloc/internal/extent_dss.h b/include/jemalloc/internal/extent_dss.h
index 0aabc2e..f2dac52 100644
--- a/include/jemalloc/internal/extent_dss.h
+++ b/include/jemalloc/internal/extent_dss.h
@@ -23,15 +23,13 @@
 
 extern const char	*opt_dss;
 
-dss_prec_t	extent_dss_prec_get(tsdn_t *tsdn);
-bool	extent_dss_prec_set(tsdn_t *tsdn, dss_prec_t dss_prec);
+dss_prec_t	extent_dss_prec_get(void);
+bool	extent_dss_prec_set(dss_prec_t dss_prec);
 void	*extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr,
     size_t size, size_t alignment, bool *zero, bool *commit);
-bool	extent_in_dss(tsdn_t *tsdn, void *addr);
-bool	extent_dss_boot(void);
-void	extent_dss_prefork(tsdn_t *tsdn);
-void	extent_dss_postfork_parent(tsdn_t *tsdn);
-void	extent_dss_postfork_child(tsdn_t *tsdn);
+bool	extent_in_dss(void *addr);
+bool	extent_dss_mergeable(void *addr_a, void *addr_b);
+void	extent_dss_boot(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
diff --git a/include/jemalloc/internal/large.h b/include/jemalloc/internal/large.h
index 8345f89..f3d382b 100644
--- a/include/jemalloc/internal/large.h
+++ b/include/jemalloc/internal/large.h
@@ -19,11 +19,11 @@
 #ifdef JEMALLOC_JET
 typedef void (large_dalloc_junk_t)(void *, size_t);
 extern large_dalloc_junk_t *large_dalloc_junk;
-typedef void (large_dalloc_maybe_junk_t)(tsdn_t *, void *, size_t);
+typedef void (large_dalloc_maybe_junk_t)(void *, size_t);
 extern large_dalloc_maybe_junk_t *large_dalloc_maybe_junk;
 #else
 void	large_dalloc_junk(void *ptr, size_t usize);
-void	large_dalloc_maybe_junk(tsdn_t *tsdn, void *ptr, size_t usize);
+void	large_dalloc_maybe_junk(void *ptr, size_t usize);
 #endif
 void	large_dalloc_junked_locked(tsdn_t *tsdn, extent_t *extent);
 void	large_dalloc(tsdn_t *tsdn, extent_t *extent);
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index d1f39cf..8d573b7 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -156,11 +156,9 @@
 extent_dalloc_wrapper
 extent_decommit_wrapper
 extent_dss_boot
-extent_dss_postfork_child
-extent_dss_postfork_parent
+extent_dss_mergeable
 extent_dss_prec_get
 extent_dss_prec_set
-extent_dss_prefork
 extent_heap_empty
 extent_heap_first
 extent_heap_insert
@@ -176,9 +174,6 @@
 extent_lookup
 extent_merge_wrapper
 extent_past_get
-extent_postfork_child
-extent_postfork_parent
-extent_prefork
 extent_prof_tctx_get
 extent_prof_tctx_set
 extent_purge_wrapper
diff --git a/src/arena.c b/src/arena.c
index 2b8aead..ce28959 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -1682,7 +1682,7 @@
 		    (uint64_t)(uintptr_t)arena;
 	}
 
-	arena->dss_prec = extent_dss_prec_get(tsdn);
+	arena->dss_prec = extent_dss_prec_get();
 
 	arena->purging = false;
 	arena->nactive = 0;
diff --git a/src/ctl.c b/src/ctl.c
index b4e2208..067b677 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -1527,11 +1527,11 @@
 		dss_prec_old = arena_dss_prec_get(tsd_tsdn(tsd), arena);
 	} else {
 		if (dss_prec != dss_prec_limit &&
-		    extent_dss_prec_set(tsd_tsdn(tsd), dss_prec)) {
+		    extent_dss_prec_set(dss_prec)) {
 			ret = EFAULT;
 			goto label_return;
 		}
-		dss_prec_old = extent_dss_prec_get(tsd_tsdn(tsd));
+		dss_prec_old = extent_dss_prec_get();
 	}
 
 	dss = dss_prec_names[dss_prec_old];
diff --git a/src/extent.c b/src/extent.c
index e4d3ccd..e4ceb8f 100644
--- a/src/extent.c
+++ b/src/extent.c
@@ -856,10 +856,10 @@
 }
 
 static bool
-extent_dalloc_default_impl(tsdn_t *tsdn, void *addr, size_t size)
+extent_dalloc_default_impl(void *addr, size_t size)
 {
 
-	if (!have_dss || !extent_in_dss(tsdn, addr))
+	if (!have_dss || !extent_in_dss(addr))
 		return (extent_dalloc_mmap(addr, size));
 	return (true);
 }
@@ -869,13 +869,10 @@
 extent_dalloc_default(extent_hooks_t *extent_hooks, void *addr, size_t size,
     bool committed, unsigned arena_ind)
 {
-	tsdn_t *tsdn;
 
 	assert(extent_hooks == &extent_hooks_default);
 
-	tsdn = tsdn_fetch();
-
-	return (extent_dalloc_default_impl(tsdn, addr, size));
+	return (extent_dalloc_default_impl(addr, size));
 }
 
 void
@@ -897,7 +894,7 @@
 	extent_deregister(tsdn, extent);
 	if (*r_extent_hooks == &extent_hooks_default) {
 		/* Call directly to propagate tsdn. */
-		err = extent_dalloc_default_impl(tsdn, extent_base_get(extent),
+		err = extent_dalloc_default_impl(extent_base_get(extent),
 		    extent_size_get(extent));
 	} else {
 		err = (*r_extent_hooks)->dalloc(*r_extent_hooks,
@@ -1083,13 +1080,12 @@
 }
 
 static bool
-extent_merge_default_impl(tsdn_t *tsdn, void *addr_a, void *addr_b)
+extent_merge_default_impl(void *addr_a, void *addr_b)
 {
 
 	if (!maps_coalesce)
 		return (true);
-	if (have_dss && extent_in_dss(tsdn, addr_a) != extent_in_dss(tsdn,
-	    addr_b))
+	if (have_dss && !extent_dss_mergeable(addr_a, addr_b))
 		return (true);
 
 	return (false);
@@ -1099,13 +1095,10 @@
 extent_merge_default(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a,
     void *addr_b, size_t size_b, bool committed, unsigned arena_ind)
 {
-	tsdn_t *tsdn;
 
 	assert(extent_hooks == &extent_hooks_default);
 
-	tsdn = tsdn_fetch();
-
-	return (extent_merge_default_impl(tsdn, addr_a, addr_b));
+	return (extent_merge_default_impl(addr_a, addr_b));
 }
 
 bool
@@ -1120,7 +1113,7 @@
 	extent_hooks_assure_initialized(arena, r_extent_hooks);
 	if (*r_extent_hooks == &extent_hooks_default) {
 		/* Call directly to propagate tsdn. */
-		err = extent_merge_default_impl(tsdn, extent_base_get(a),
+		err = extent_merge_default_impl(extent_base_get(a),
 		    extent_base_get(b));
 	} else {
 		err = (*r_extent_hooks)->merge(*r_extent_hooks,
@@ -1171,29 +1164,8 @@
 	    LG_PAGE)))
 		return (true);
 
-	if (have_dss && extent_dss_boot())
-		return (true);
+	if (have_dss)
+		extent_dss_boot();
 
 	return (false);
 }
-
-void
-extent_prefork(tsdn_t *tsdn)
-{
-
-	extent_dss_prefork(tsdn);
-}
-
-void
-extent_postfork_parent(tsdn_t *tsdn)
-{
-
-	extent_dss_postfork_parent(tsdn);
-}
-
-void
-extent_postfork_child(tsdn_t *tsdn)
-{
-
-	extent_dss_postfork_child(tsdn);
-}
diff --git a/src/extent_dss.c b/src/extent_dss.c
index e0e6635..31fe8fe 100644
--- a/src/extent_dss.c
+++ b/src/extent_dss.c
@@ -12,20 +12,19 @@
 	"N/A"
 };
 
-/* Current dss precedence default, used when creating new arenas. */
-static dss_prec_t	dss_prec_default = DSS_PREC_DEFAULT;
-
 /*
- * Protects sbrk() calls.  This avoids malloc races among threads, though it
- * does not protect against races with threads that call sbrk() directly.
+ * Current dss precedence default, used when creating new arenas.  NB: This is
+ * stored as unsigned rather than dss_prec_t because in principle there's no
+ * guarantee that sizeof(dss_prec_t) is the same as sizeof(unsigned), and we use
+ * atomic operations to synchronize the setting.
  */
-static malloc_mutex_t	dss_mtx;
+static unsigned		dss_prec_default = (unsigned)DSS_PREC_DEFAULT;
 
 /* Base address of the DSS. */
 static void		*dss_base;
-/* Current end of the DSS, or ((void *)-1) if the DSS is exhausted. */
-static void		*dss_prev;
-/* Current upper limit on DSS addresses. */
+/* Atomic boolean indicating whether the DSS is exhausted. */
+static unsigned		dss_exhausted;
+/* Atomic current upper limit on DSS addresses. */
 static void		*dss_max;
 
 /******************************************************************************/
@@ -43,35 +42,63 @@
 }
 
 dss_prec_t
-extent_dss_prec_get(tsdn_t *tsdn)
+extent_dss_prec_get(void)
 {
 	dss_prec_t ret;
 
 	if (!have_dss)
 		return (dss_prec_disabled);
-	malloc_mutex_lock(tsdn, &dss_mtx);
-	ret = dss_prec_default;
-	malloc_mutex_unlock(tsdn, &dss_mtx);
+	ret = (dss_prec_t)atomic_read_u(&dss_prec_default);
 	return (ret);
 }
 
 bool
-extent_dss_prec_set(tsdn_t *tsdn, dss_prec_t dss_prec)
+extent_dss_prec_set(dss_prec_t dss_prec)
 {
 
 	if (!have_dss)
 		return (dss_prec != dss_prec_disabled);
-	malloc_mutex_lock(tsdn, &dss_mtx);
-	dss_prec_default = dss_prec;
-	malloc_mutex_unlock(tsdn, &dss_mtx);
+	atomic_write_u(&dss_prec_default, (unsigned)dss_prec);
 	return (false);
 }
 
+static void *
+extent_dss_max_update(void *new_addr)
+{
+	void *max_cur;
+	spin_t spinner;
+
+	/*
+	 * Get the current end of the DSS as max_cur and assure that dss_max is
+	 * up to date.
+	 */
+	spin_init(&spinner);
+	while (true) {
+		void *max_prev = atomic_read_p(&dss_max);
+
+		max_cur = extent_dss_sbrk(0);
+		if ((uintptr_t)max_prev > (uintptr_t)max_cur) {
+			/*
+			 * Another thread optimistically updated dss_max.  Wait
+			 * for it to finish.
+			 */
+			spin_adaptive(&spinner);
+			continue;
+		}
+		if (!atomic_cas_p(&dss_max, max_prev, max_cur))
+			break;
+	}
+	/* Fixed new_addr can only be supported if it is at the edge of DSS. */
+	if (new_addr != NULL && max_cur != new_addr)
+		return (NULL);
+
+	return (max_cur);
+}
+
 void *
 extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
     size_t alignment, bool *zero, bool *commit)
 {
-	void *ret;
 	extent_t *gap;
 
 	cassert(have_dss);
@@ -89,35 +116,27 @@
 	if (gap == NULL)
 		return (NULL);
 
-	malloc_mutex_lock(tsdn, &dss_mtx);
-	if (dss_prev != (void *)-1) {
+	if (!atomic_read_u(&dss_exhausted)) {
 		/*
 		 * The loop is necessary to recover from races with other
 		 * threads that are using the DSS for something other than
 		 * malloc.
 		 */
 		while (true) {
-			void *gap_addr, *dss_next;
+			void *ret, *max_cur, *gap_addr, *dss_next, *dss_prev;
 			size_t gap_size;
 			intptr_t incr;
 
-			/* Avoid an unnecessary system call. */
-			if (new_addr != NULL && dss_max != new_addr)
-				break;
-
-			/* Get the current end of the DSS. */
-			dss_max = extent_dss_sbrk(0);
-
-			/* Make sure the earlier condition still holds. */
-			if (new_addr != NULL && dss_max != new_addr)
-				break;
+			max_cur = extent_dss_max_update(new_addr);
+			if (max_cur == NULL)
+				goto label_oom;
 
 			/*
 			 * Compute how much gap space (if any) is necessary to
 			 * satisfy alignment.  This space can be recycled for
 			 * later use.
 			 */
-			gap_addr = (void *)(PAGE_CEILING((uintptr_t)dss_max));
+			gap_addr = (void *)(PAGE_CEILING((uintptr_t)max_cur));
 			ret = (void *)ALIGNMENT_CEILING((uintptr_t)gap_addr,
 			    PAGE_CEILING(alignment));
 			gap_size = (uintptr_t)ret - (uintptr_t)gap_addr;
@@ -126,17 +145,24 @@
 				    gap_size, false, false, true, false);
 			}
 			dss_next = (void *)((uintptr_t)ret + size);
-			if ((uintptr_t)ret < (uintptr_t)dss_max ||
-			    (uintptr_t)dss_next < (uintptr_t)dss_max)
-				break; /* Wrap-around. */
+			if ((uintptr_t)ret < (uintptr_t)max_cur ||
+			    (uintptr_t)dss_next < (uintptr_t)max_cur)
+				goto label_oom; /* Wrap-around. */
 			incr = gap_size + size;
+
+			/*
+			 * Optimistically update dss_max, and roll back below if
+			 * sbrk() fails.  No other thread will try to extend the
+			 * DSS while dss_max is greater than the current DSS
+			 * max reported by sbrk(0).
+			 */
+			if (atomic_cas_p(&dss_max, max_cur, dss_next))
+				continue;
+
+			/* Try to allocate. */
 			dss_prev = extent_dss_sbrk(incr);
-			if (dss_prev == (void *)-1)
-				break;
-			if (dss_prev == dss_max) {
+			if (dss_prev == max_cur) {
 				/* Success. */
-				dss_max = dss_next;
-				malloc_mutex_unlock(tsdn, &dss_mtx);
 				if (gap_size != 0)
 					extent_dalloc_gap(tsdn, arena, gap);
 				else
@@ -147,69 +173,69 @@
 					*commit = pages_decommit(ret, size);
 				return (ret);
 			}
+			/*
+			 * Failure, whether due to OOM or a race with a raw
+			 * sbrk() call from outside the allocator.  Try to roll
+			 * back optimistic dss_max update; if rollback fails,
+			 * it's due to another caller of this function having
+			 * succeeded since this invocation started, in which
+			 * case rollback is not necessary.
+			 */
+			atomic_cas_p(&dss_max, dss_next, max_cur);
+			if (dss_prev == (void *)-1) {
+				/* OOM. */
+				atomic_write_u(&dss_exhausted, (unsigned)true);
+				goto label_oom;
+			}
 		}
 	}
-	/* OOM. */
-	malloc_mutex_unlock(tsdn, &dss_mtx);
+label_oom:
 	extent_dalloc(tsdn, arena, gap);
 	return (NULL);
 }
 
-bool
-extent_in_dss(tsdn_t *tsdn, void *addr)
+static bool
+extent_in_dss_helper(void *addr, void *max)
 {
-	bool ret;
 
-	cassert(have_dss);
-
-	malloc_mutex_lock(tsdn, &dss_mtx);
-	if ((uintptr_t)addr >= (uintptr_t)dss_base
-	    && (uintptr_t)addr < (uintptr_t)dss_max)
-		ret = true;
-	else
-		ret = false;
-	malloc_mutex_unlock(tsdn, &dss_mtx);
-
-	return (ret);
+	return ((uintptr_t)addr >= (uintptr_t)dss_base && (uintptr_t)addr <
+	    (uintptr_t)max);
 }
 
 bool
+extent_in_dss(void *addr)
+{
+
+	cassert(have_dss);
+
+	return (extent_in_dss_helper(addr, atomic_read_p(&dss_max)));
+}
+
+bool
+extent_dss_mergeable(void *addr_a, void *addr_b)
+{
+	void *max;
+
+	cassert(have_dss);
+
+	if ((uintptr_t)addr_a < (uintptr_t)dss_base && (uintptr_t)addr_b <
+	    (uintptr_t)dss_base)
+		return (true);
+
+	max = atomic_read_p(&dss_max);
+	return (extent_in_dss_helper(addr_a, max) ==
+	    extent_in_dss_helper(addr_b, max));
+}
+
+void
 extent_dss_boot(void)
 {
 
 	cassert(have_dss);
 
-	if (malloc_mutex_init(&dss_mtx, "dss", WITNESS_RANK_DSS))
-		return (true);
 	dss_base = extent_dss_sbrk(0);
-	dss_prev = dss_base;
+	dss_exhausted = (unsigned)(dss_base == (void *)-1);
 	dss_max = dss_base;
-
-	return (false);
-}
-
-void
-extent_dss_prefork(tsdn_t *tsdn)
-{
-
-	if (have_dss)
-		malloc_mutex_prefork(tsdn, &dss_mtx);
-}
-
-void
-extent_dss_postfork_parent(tsdn_t *tsdn)
-{
-
-	if (have_dss)
-		malloc_mutex_postfork_parent(tsdn, &dss_mtx);
-}
-
-void
-extent_dss_postfork_child(tsdn_t *tsdn)
-{
-
-	if (have_dss)
-		malloc_mutex_postfork_child(tsdn, &dss_mtx);
 }
 
 /******************************************************************************/
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 0348b8a..5108d15 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1030,8 +1030,7 @@
 				for (i = 0; i < dss_prec_limit; i++) {
 					if (strncmp(dss_prec_names[i], v, vlen)
 					    == 0) {
-						if (extent_dss_prec_set(NULL,
-						   i)) {
+						if (extent_dss_prec_set(i)) {
 							malloc_conf_error(
 							    "Error setting dss",
 							    k, klen, v, vlen);
@@ -2631,7 +2630,6 @@
 		}
 	}
 	base_prefork(tsd_tsdn(tsd));
-	extent_prefork(tsd_tsdn(tsd));
 	for (i = 0; i < narenas; i++) {
 		if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL)
 			arena_prefork3(tsd_tsdn(tsd), arena);
@@ -2660,7 +2658,6 @@
 
 	witness_postfork_parent(tsd);
 	/* Release all mutexes, now that fork() has completed. */
-	extent_postfork_parent(tsd_tsdn(tsd));
 	base_postfork_parent(tsd_tsdn(tsd));
 	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
 		arena_t *arena;
@@ -2685,7 +2682,6 @@
 
 	witness_postfork_child(tsd);
 	/* Release all mutexes, now that fork() has completed. */
-	extent_postfork_child(tsd_tsdn(tsd));
 	base_postfork_child(tsd_tsdn(tsd));
 	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
 		arena_t *arena;
diff --git a/src/large.c b/src/large.c
index 34b3bdb..23af183 100644
--- a/src/large.c
+++ b/src/large.c
@@ -81,7 +81,7 @@
 #define	large_dalloc_maybe_junk JEMALLOC_N(n_large_dalloc_maybe_junk)
 #endif
 void
-large_dalloc_maybe_junk(tsdn_t *tsdn, void *ptr, size_t usize)
+large_dalloc_maybe_junk(void *ptr, size_t usize)
 {
 
 	if (config_fill && have_dss && unlikely(opt_junk_free)) {
@@ -89,7 +89,7 @@
 		 * Only bother junk filling if the extent isn't about to be
 		 * unmapped.
 		 */
-		if (!config_munmap || (have_dss && extent_in_dss(tsdn, ptr)))
+		if (!config_munmap || (have_dss && extent_in_dss(ptr)))
 			large_dalloc_junk(ptr, usize);
 	}
 }
@@ -119,7 +119,7 @@
 			return (true);
 
 		if (config_fill && unlikely(opt_junk_free)) {
-			large_dalloc_maybe_junk(tsdn, extent_addr_get(trail),
+			large_dalloc_maybe_junk(extent_addr_get(trail),
 			    extent_usize_get(trail));
 		}
 
@@ -296,7 +296,7 @@
 	ql_remove(&arena->large, extent, ql_link);
 	malloc_mutex_unlock(tsdn, &arena->large_mtx);
 	if (!junked_locked) {
-		large_dalloc_maybe_junk(tsdn, extent_addr_get(extent),
+		large_dalloc_maybe_junk(extent_addr_get(extent),
 		    extent_usize_get(extent));
 	}
 	arena_extent_dalloc_large(tsdn, arena, extent, junked_locked);
diff --git a/test/unit/junk.c b/test/unit/junk.c
index fe453b6..680f0d2 100644
--- a/test/unit/junk.c
+++ b/test/unit/junk.c
@@ -53,10 +53,10 @@
 }
 
 static void
-large_dalloc_maybe_junk_intercept(tsdn_t *tsdn, void *ptr, size_t usize)
+large_dalloc_maybe_junk_intercept(void *ptr, size_t usize)
 {
 
-	large_dalloc_maybe_junk_orig(tsdn, ptr, usize);
+	large_dalloc_maybe_junk_orig(ptr, usize);
 	if (ptr == watch_for_junking)
 		saw_junking = true;
 }