Merge branch with 4.2.1 release bug fixes.
diff --git a/ChangeLog b/ChangeLog
index 926209e..ed62e0e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,17 @@
 
     https://github.com/jemalloc/jemalloc
 
+* 4.2.1 (June 8, 2016)
+
+  Bug fixes:
+  - Fix bootstrapping issues for configurations that require allocation during
+    tsd initialization (e.g. --disable-tls).  (@cferris1000, @jasone)
+  - Fix gettimeofday() version of nstime_update().  (@ronawho)
+  - Fix Valgrind regressions in calloc() and chunk_alloc_wrapper().  (@ronawho)
+  - Fix potential VM map fragmentation regression.  (@jasone)
+  - Fix opt_zero-triggered in-place huge reallocation zeroing.  (@jasone)
+  - Fix heap profiling context leaks in reallocation edge cases.  (@jasone)
+
 * 4.2.0 (May 12, 2016)
 
   New features:
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 51bf897..8f82edd 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -754,7 +754,7 @@
 		 * Calculate the size of the over-size run that arena_palloc()
 		 * would need to allocate in order to guarantee the alignment.
 		 */
-		if (usize + large_pad + alignment <= arena_maxrun)
+		if (usize + large_pad + alignment - PAGE <= arena_maxrun)
 			return (usize);
 	}
 
@@ -784,7 +784,7 @@
 	 * Calculate the multi-chunk mapping that huge_palloc() would need in
 	 * order to guarantee the alignment.
 	 */
-	if (usize + alignment < usize) {
+	if (usize + alignment - PAGE < usize) {
 		/* size_t overflow. */
 		return (0);
 	}
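
These two hunks, together with the matching changes in src/arena.c and src/chunk_mmap.c below, shrink the padding used to guarantee alignment and address the "potential VM map fragmentation regression" noted in the ChangeLog. Since runs and chunks are placed at page granularity, a page-aligned base address is at most alignment - PAGE bytes below the next alignment boundary, so usize + alignment - PAGE bytes always contain an aligned usize-byte region; the old usize + alignment over-allocated by a full page. A minimal standalone check of that bound, assuming a 4 KiB page size for illustration:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE ((uintptr_t)4096)    /* assumed page size, for illustration */

    int
    main(void)
    {
        uintptr_t alignment = 16 * PAGE;
        uintptr_t base, worst = 0;

        /* Scan every page-aligned base within one alignment period. */
        for (base = alignment; base < 2 * alignment; base += PAGE) {
            uintptr_t aligned = (base + alignment - 1) & ~(alignment - 1);

            if (aligned - base > worst)
                worst = aligned - base;
        }
        /* Padding of alignment - PAGE always suffices. */
        assert(worst == alignment - PAGE);
        printf("worst-case padding: %zu bytes\n", (size_t)worst);
        return (0);
    }

The size_t overflow guard in the second hunk shrinks by the same PAGE so that both computations describe the same request size.
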
diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h
index 691e153..21dff5f 100644
--- a/include/jemalloc/internal/prof.h
+++ b/include/jemalloc/internal/prof.h
@@ -513,6 +513,7 @@
 			 * though its actual usize was insufficient to cross the
 			 * sample threshold.
 			 */
+			prof_alloc_rollback(tsd, tctx, true);
 			tctx = (prof_tctx_t *)(uintptr_t)1U;
 		}
 	}
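
This one-line addition is part of the "heap profiling context leaks in reallocation edge cases" fix: when the sampled tctx is demoted to the non-sampled sentinel (prof_tctx_t *)1U, the accounting performed when the context was prepared must be undone first, or the prepared context is never released. A generic sketch of the prepare/commit/rollback discipline involved; the names here are hypothetical stand-ins, not jemalloc's API:

    #include <assert.h>
    #include <stdbool.h>

    static int live_ctxs;    /* contexts prepared but not yet released */

    static int *
    ctx_prep(void)
    {
        live_ctxs++;
        return (&live_ctxs);    /* stand-in for a real profiling context */
    }

    static void
    ctx_rollback(int *ctx)
    {
        (void)ctx;
        live_ctxs--;
    }

    /* Keep a prepared context only if the allocation crossed the threshold. */
    static bool
    maybe_sample(bool crossed_threshold)
    {
        int *ctx = ctx_prep();

        if (!crossed_threshold) {
            ctx_rollback(ctx);    /* without this, ctx leaks */
            return (false);
        }
        /* ... commit ctx to the allocation's metadata ... */
        return (true);
    }

    int
    main(void)
    {
        maybe_sample(false);
        assert(live_ctxs == 0);    /* balanced thanks to the rollback */
        return (0);
    }
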
diff --git a/src/arena.c b/src/arena.c
index c605bcd..ce62590 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -2687,7 +2687,7 @@
 		return (NULL);
 
 	alignment = PAGE_CEILING(alignment);
-	alloc_size = usize + large_pad + alignment;
+	alloc_size = usize + large_pad + alignment - PAGE;
 
 	malloc_mutex_lock(tsdn, &arena->lock);
 	run = arena_run_alloc_large(tsdn, arena, alloc_size, false);
diff --git a/src/chunk.c b/src/chunk.c
index adc666f..f292c98 100644
--- a/src/chunk.c
+++ b/src/chunk.c
@@ -421,15 +421,11 @@
 }
 
 static void *
-chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero,
-    bool *commit, unsigned arena_ind)
+chunk_alloc_default_impl(tsdn_t *tsdn, arena_t *arena, void *new_addr,
+    size_t size, size_t alignment, bool *zero, bool *commit)
 {
 	void *ret;
-	tsdn_t *tsdn;
-	arena_t *arena;
 
-	tsdn = tsdn_fetch();
-	arena = chunk_arena_get(tsdn, arena_ind);
 	ret = chunk_alloc_core(tsdn, arena, new_addr, size, alignment, zero,
 	    commit, arena->dss_prec);
 	if (ret == NULL)
@@ -441,6 +437,20 @@
 }
 
 static void *
+chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero,
+    bool *commit, unsigned arena_ind)
+{
+	tsdn_t *tsdn;
+	arena_t *arena;
+
+	tsdn = tsdn_fetch();
+	arena = chunk_arena_get(tsdn, arena_ind);
+
+	return (chunk_alloc_default_impl(tsdn, arena, new_addr, size, alignment,
+	    zero, commit));
+}
+
+static void *
 chunk_alloc_retained(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks,
     void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit)
 {
@@ -472,14 +482,23 @@
 	ret = chunk_alloc_retained(tsdn, arena, chunk_hooks, new_addr, size,
 	    alignment, zero, commit);
 	if (ret == NULL) {
-		ret = chunk_hooks->alloc(new_addr, size, alignment, zero,
-		    commit, arena->ind);
+		if (chunk_hooks->alloc == chunk_alloc_default) {
+			/* Call directly to propagate tsdn. */
+			ret = chunk_alloc_default_impl(tsdn, arena, new_addr,
+			    size, alignment, zero, commit);
+		} else {
+			ret = chunk_hooks->alloc(new_addr, size, alignment,
+			    zero, commit, arena->ind);
+		}
+
 		if (ret == NULL)
 			return (NULL);
+
+		if (config_valgrind && chunk_hooks->alloc !=
+		    chunk_alloc_default)
+			JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, chunksize);
 	}
 
-	if (config_valgrind && chunk_hooks->alloc != chunk_alloc_default)
-		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, chunksize);
 	return (ret);
 }
 
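
This hunk is the chunk_alloc_wrapper() half of the "Valgrind regressions" ChangeLog item. The annotation used to run on every successful return, including chunks recycled from the retained cache, whose definedness state Valgrind already tracks; re-marking those as undefined produced spurious reports. It now runs only for memory freshly obtained from a custom (non-default) alloc hook. A sketch of the corrected ordering, assuming the Valgrind client headers are installed (the macro compiles to a no-op when not running under Valgrind); retained_alloc and the hook table are hypothetical stand-ins:

    #include <stddef.h>
    #include <stdlib.h>
    #include <valgrind/memcheck.h>

    typedef void *(chunk_alloc_t)(size_t size);

    struct hooks { chunk_alloc_t *alloc; };

    static void *default_alloc(size_t size) { return (malloc(size)); }
    static void *retained_alloc(size_t size) { (void)size; return (NULL); }

    static void *
    alloc_wrapper(struct hooks *hooks, size_t size)
    {
        void *ret = retained_alloc(size);    /* Valgrind state already known */

        if (ret == NULL) {
            ret = hooks->alloc(size);
            if (ret == NULL)
                return (NULL);
            /*
             * Only memory that just came from a custom hook has contents
             * Valgrind knows nothing about; annotating retained chunks
             * here would erase valid definedness state.
             */
            if (hooks->alloc != default_alloc)
                VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
        }
        return (ret);
    }

    int
    main(void)
    {
        struct hooks hooks = { default_alloc };

        free(alloc_wrapper(&hooks, 4096));
        return (0);
    }
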
@@ -591,19 +610,30 @@
 }
 
 static bool
+chunk_dalloc_default_impl(tsdn_t *tsdn, void *chunk, size_t size)
+{
+
+	if (!have_dss || !chunk_in_dss(tsdn, chunk))
+		return (chunk_dalloc_mmap(chunk, size));
+	return (true);
+}
+
+static bool
 chunk_dalloc_default(void *chunk, size_t size, bool committed,
     unsigned arena_ind)
 {
+	tsdn_t *tsdn;
 
-	if (!have_dss || !chunk_in_dss(tsdn_fetch(), chunk))
-		return (chunk_dalloc_mmap(chunk, size));
-	return (true);
+	tsdn = tsdn_fetch();
+
+	return (chunk_dalloc_default_impl(tsdn, chunk, size));
 }
 
 void
 chunk_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks,
     void *chunk, size_t size, bool zeroed, bool committed)
 {
+	bool err;
 
 	assert(chunk != NULL);
 	assert(CHUNK_ADDR2BASE(chunk) == chunk);
@@ -612,7 +642,13 @@
 
 	chunk_hooks_assure_initialized(tsdn, arena, chunk_hooks);
 	/* Try to deallocate. */
-	if (!chunk_hooks->dalloc(chunk, size, committed, arena->ind))
+	if (chunk_hooks->dalloc == chunk_dalloc_default) {
+		/* Call directly to propagate tsdn. */
+		err = chunk_dalloc_default_impl(tsdn, chunk, size);
+	} else
+		err = chunk_hooks->dalloc(chunk, size, committed, arena->ind);
+
+	if (!err)
 		return;
 	/* Try to decommit; purge if that fails. */
 	if (committed) {
@@ -681,26 +717,34 @@
 }
 
 static bool
-chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b,
-    bool committed, unsigned arena_ind)
+chunk_merge_default_impl(tsdn_t *tsdn, void *chunk_a, void *chunk_b)
 {
 
 	if (!maps_coalesce)
 		return (true);
-	if (have_dss) {
-		tsdn_t *tsdn = tsdn_fetch();
-		if (chunk_in_dss(tsdn, chunk_a) != chunk_in_dss(tsdn, chunk_b))
-			return (true);
-	}
+	if (have_dss && chunk_in_dss(tsdn, chunk_a) != chunk_in_dss(tsdn,
+	    chunk_b))
+		return (true);
 
 	return (false);
 }
 
+static bool
+chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b,
+    bool committed, unsigned arena_ind)
+{
+	tsdn_t *tsdn;
+
+	tsdn = tsdn_fetch();
+
+	return (chunk_merge_default_impl(tsdn, chunk_a, chunk_b));
+}
+
 static rtree_node_elm_t *
 chunks_rtree_node_alloc(size_t nelms)
 {
 
-	return ((rtree_node_elm_t *)base_alloc(tsdn_fetch(), nelms *
+	return ((rtree_node_elm_t *)base_alloc(TSDN_NULL, nelms *
 	    sizeof(rtree_node_elm_t)));
 }
 
diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c
index f95ae75..73fc497 100644
--- a/src/chunk_mmap.c
+++ b/src/chunk_mmap.c
@@ -9,7 +9,7 @@
 	void *ret;
 	size_t alloc_size;
 
-	alloc_size = size + alignment;
+	alloc_size = size + alignment - PAGE;
 	/* Beware size_t wrap-around. */
 	if (alloc_size < size)
 		return (NULL);
diff --git a/src/huge.c b/src/huge.c
index 1aa02a0..3a2877c 100644
--- a/src/huge.c
+++ b/src/huge.c
@@ -262,19 +262,19 @@
 	malloc_mutex_unlock(tsdn, &arena->huge_mtx);
 
 	/*
-	 * Copy zero into is_zeroed_chunk and pass the copy to chunk_alloc(), so
-	 * that it is possible to make correct junk/zero fill decisions below.
+	 * Use is_zeroed_chunk to detect whether the trailing memory is zeroed,
+	 * update extent's zeroed field, and zero as necessary.
 	 */
-	is_zeroed_chunk = zero;
-
+	is_zeroed_chunk = false;
 	if (arena_chunk_ralloc_huge_expand(tsdn, arena, ptr, oldsize, usize,
 	     &is_zeroed_chunk))
 		return (true);
 
 	malloc_mutex_lock(tsdn, &arena->huge_mtx);
-	/* Update the size of the huge allocation. */
 	huge_node_unset(ptr, node);
 	extent_node_size_set(node, usize);
+	extent_node_zeroed_set(node, extent_node_zeroed_get(node) &&
+	    is_zeroed_chunk);
 	huge_node_reset(tsdn, ptr, node);
 	malloc_mutex_unlock(tsdn, &arena->huge_mtx);
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 40eb2ea..5d1f493 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1739,7 +1739,7 @@
 		ret = ialloc_body(num_size, true, &tsdn, &usize, true);
 		ialloc_post_check(ret, tsdn, usize, "calloc", true, true);
 		UTRACE(0, num_size, ret);
-		JEMALLOC_VALGRIND_MALLOC(ret != NULL, tsdn, ret, usize, false);
+		JEMALLOC_VALGRIND_MALLOC(ret != NULL, tsdn, ret, usize, true);
 	}
 
 	return (ret);
@@ -2222,7 +2222,7 @@
 
 	prof_active = prof_active_get_unlocked();
 	old_tctx = prof_tctx_get(tsd_tsdn(tsd), old_ptr);
-	tctx = prof_alloc_prep(tsd, *usize, prof_active, true);
+	tctx = prof_alloc_prep(tsd, *usize, prof_active, false);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
 		p = irallocx_prof_sample(tsd, old_ptr, old_usize, *usize,
 		    alignment, zero, tcache, arena, tctx);
@@ -2231,7 +2231,7 @@
 		    tcache, arena);
 	}
 	if (unlikely(p == NULL)) {
-		prof_alloc_rollback(tsd, tctx, true);
+		prof_alloc_rollback(tsd, tctx, false);
 		return (NULL);
 	}
 
@@ -2246,7 +2246,7 @@
 		 */
 		*usize = isalloc(tsd_tsdn(tsd), p, config_prof);
 	}
-	prof_realloc(tsd, p, *usize, tctx, prof_active, true, old_ptr,
+	prof_realloc(tsd, p, *usize, tctx, prof_active, false, old_ptr,
 	    old_usize, old_tctx);
 
 	return (p);
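
The remaining hunks flip the update flag passed to prof_alloc_prep(), prof_alloc_rollback(), and prof_realloc() from true to false so that all three agree; a mismatched flag leaves the sampling accounting unbalanced, which is the other half of the "context leaks in reallocation edge cases" item. A deliberately simplified sketch of why prepare and rollback must mirror each other's flag, using a hypothetical bytes-until-sample counter as a stand-in for jemalloc's internal state:

    #include <assert.h>
    #include <stdbool.h>

    static long bytes_until_sample = 4096;    /* illustrative threshold */

    static void
    sample_prep(long usize, bool update)
    {
        if (update)
            bytes_until_sample -= usize;
    }

    static void
    sample_rollback(long usize, bool update)
    {
        /* Must undo exactly what prep did, so the flags must match. */
        if (update)
            bytes_until_sample += usize;
    }

    int
    main(void)
    {
        long before = bytes_until_sample;

        sample_prep(128, false);
        sample_rollback(128, false);    /* mismatched flags would drift */
        assert(bytes_until_sample == before);
        return (0);
    }
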
diff --git a/src/nstime.c b/src/nstime.c
index 26e49dc..aad2c26 100644
--- a/src/nstime.c
+++ b/src/nstime.c
@@ -128,9 +128,11 @@
 		time->ns = ts.tv_sec * BILLION + ts.tv_nsec;
 	}
 #else
-	struct timeval tv;
-	gettimeofday(&tv, NULL);
-	time->ns = tv.tv_sec * BILLION + tv.tv_usec * 1000;
+	{
+		struct timeval tv;
+		gettimeofday(&tv, NULL);
+		time->ns = tv.tv_sec * BILLION + tv.tv_usec * 1000;
+	}
 #endif
 
 	/* Handle non-monotonic clocks. */
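
For reference, the gettimeofday() fallback converts seconds and microseconds into a single nanosecond count; the braces added above also give the branch its own scope, matching the other #if branches. A standalone version of the conversion, assuming BILLION is 10^9 as in the surrounding code:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/time.h>

    #define BILLION UINT64_C(1000000000)

    int
    main(void)
    {
        struct timeval tv;
        uint64_t ns;

        gettimeofday(&tv, NULL);
        /* tv_usec holds microseconds, hence the factor of 1000. */
        ns = (uint64_t)tv.tv_sec * BILLION + (uint64_t)tv.tv_usec * 1000;
        printf("%llu\n", (unsigned long long)ns);
        return (0);
    }
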