Merge branch 'dev'
diff --git a/ChangeLog b/ChangeLog
index 58e4462..e3b0a51 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,20 @@
 
     https://github.com/jemalloc/jemalloc
 
+* 4.0.3 (September 24, 2015)
+
+  This bugfix release continues the trend of xallocx() and heap profiling fixes.
+
+  Bug fixes:
+  - Fix xallocx(..., MALLOCX_ZERO) to zero all trailing bytes of large
+    allocations when the --enable-cache-oblivious configure option is enabled.
+  - Fix xallocx(..., MALLOCX_ZERO) to zero trailing bytes of huge allocations
+    when resizing from/to a size class that is not a multiple of the chunk size.
+  - Fix prof_tctx_dump_iter() to filter out nodes that were created after heap
+    profile dumping started.
+  - Work around a potentially bad thread-specific data initialization
+    interaction with NPTL (glibc's pthreads implementation).
+
 * 4.0.2 (September 21, 2015)
 
   This bugfix release addresses a few bugs specific to heap profiling.
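For context (not part of the patch): the first two bug fixes in the 4.0.3 entry above concern callers that grow an allocation in place with xallocx(..., MALLOCX_ZERO) and rely on the newly exposed bytes being zero. A minimal standalone sketch of that usage pattern; the sizes are arbitrary illustrative values chosen to land in the large size classes on common configurations:

#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	size_t oldsize = 128 * 1024, newsize = 256 * 1024, i;
	void *p = mallocx(oldsize, MALLOCX_ZERO);

	assert(p != NULL);
	memset(p, 0x7a, oldsize);

	/*
	 * If xallocx() manages to grow the allocation in place, every byte
	 * past the old usable size must be zero.  The bugs fixed here left
	 * some trailing bytes of the last page (large) or of the resized
	 * region (huge) unzeroed.
	 */
	if (xallocx(p, newsize, 0, MALLOCX_ZERO) >= newsize) {
		for (i = oldsize; i < newsize; i++)
			assert(((unsigned char *)p)[i] == 0);
	}
	dallocx(p, 0);
	return (0);
}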
diff --git a/src/arena.c b/src/arena.c
index 7f4a6ca..3081519 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -2679,6 +2679,16 @@
 		if (arena_run_split_large(arena, run, splitsize, zero))
 			goto label_fail;
 
+		if (config_cache_oblivious && zero) {
+			/*
+			 * Zero the trailing bytes of the original allocation's
+			 * last page, since they are in an indeterminate state.
+			 */
+			assert(PAGE_CEILING(oldsize) == oldsize);
+			memset((void *)((uintptr_t)ptr + oldsize), 0,
+			    PAGE_CEILING((uintptr_t)ptr) - (uintptr_t)ptr);
+		}
+
 		size = oldsize + splitsize;
 		npages = (size + large_pad) >> LG_PAGE;
 
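A worked sketch (not part of the patch) of why the memset length above is PAGE_CEILING(ptr) - ptr: with --enable-cache-oblivious the pointer returned for a large allocation is cacheline-aligned but generally not page-aligned, and oldsize is a page multiple (per the assert), so ptr + oldsize has the same in-page offset as ptr. PAGE, CACHELINE, and PAGE_CEILING() are redefined locally here with assumed values; jemalloc has its own definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	PAGE		((uintptr_t)4096)	/* assumes 4 KiB pages */
#define	CACHELINE	((uintptr_t)64)
#define	PAGE_CEILING(a)	(((a) + PAGE - 1) & ~(PAGE - 1))

int
main(void)
{
	uintptr_t run = 0x100000;		/* page-aligned run base */
	uintptr_t ptr = run + 3 * CACHELINE;	/* randomized in-page offset */
	uintptr_t oldsize = 2 * PAGE;		/* a page multiple */
	uintptr_t trailing = PAGE_CEILING(ptr) - ptr;

	/*
	 * The bytes to zero run from ptr + oldsize up to the next page
	 * boundary; because oldsize is a page multiple, that length equals
	 * PAGE_CEILING(ptr) - ptr.
	 */
	assert(trailing == PAGE_CEILING(ptr + oldsize) - (ptr + oldsize));
	printf("trailing bytes to zero: %ju\n", (uintmax_t)trailing);
	return (0);
}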
diff --git a/src/huge.c b/src/huge.c
index f8778db..1e9a665 100644
--- a/src/huge.c
+++ b/src/huge.c
@@ -133,7 +133,7 @@
 	extent_node_t *node;
 	arena_t *arena;
 	chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER;
-	bool zeroed;
+	bool pre_zeroed, post_zeroed;
 
 	/* Increase usize to incorporate extra. */
 	for (usize = usize_min; usize < usize_max && (usize_next = s2u(usize+1))
@@ -145,26 +145,27 @@
 
 	node = huge_node_get(ptr);
 	arena = extent_node_arena_get(node);
+	pre_zeroed = extent_node_zeroed_get(node);
 
 	/* Fill if necessary (shrinking). */
 	if (oldsize > usize) {
 		size_t sdiff = oldsize - usize;
 		if (config_fill && unlikely(opt_junk_free)) {
 			memset((void *)((uintptr_t)ptr + usize), 0x5a, sdiff);
-			zeroed = false;
+			post_zeroed = false;
 		} else {
-			zeroed = !chunk_purge_wrapper(arena, &chunk_hooks, ptr,
-			    CHUNK_CEILING(oldsize), usize, sdiff);
+			post_zeroed = !chunk_purge_wrapper(arena, &chunk_hooks,
+			    ptr, CHUNK_CEILING(oldsize), usize, sdiff);
 		}
 	} else
-		zeroed = true;
+		post_zeroed = pre_zeroed;
 
 	malloc_mutex_lock(&arena->huge_mtx);
 	/* Update the size of the huge allocation. */
 	assert(extent_node_size_get(node) != usize);
 	extent_node_size_set(node, usize);
-	/* Clear node's zeroed field if zeroing failed above. */
-	extent_node_zeroed_set(node, extent_node_zeroed_get(node) && zeroed);
+	/* Update zeroed. */
+	extent_node_zeroed_set(node, post_zeroed);
 	malloc_mutex_unlock(&arena->huge_mtx);
 
 	arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize);
@@ -172,7 +173,7 @@
 	/* Fill if necessary (growing). */
 	if (oldsize < usize) {
 		if (zero || (config_fill && unlikely(opt_zero))) {
-			if (!zeroed) {
+			if (!pre_zeroed) {
 				memset((void *)((uintptr_t)ptr + oldsize), 0,
 				    usize - oldsize);
 			}
@@ -190,10 +191,11 @@
 	arena_t *arena;
 	chunk_hooks_t chunk_hooks;
 	size_t cdiff;
-	bool zeroed;
+	bool pre_zeroed, post_zeroed;
 
 	node = huge_node_get(ptr);
 	arena = extent_node_arena_get(node);
+	pre_zeroed = extent_node_zeroed_get(node);
 	chunk_hooks = chunk_hooks_get(arena);
 
 	assert(oldsize > usize);
@@ -209,21 +211,21 @@
 		if (config_fill && unlikely(opt_junk_free)) {
 			huge_dalloc_junk((void *)((uintptr_t)ptr + usize),
 			    sdiff);
-			zeroed = false;
+			post_zeroed = false;
 		} else {
-			zeroed = !chunk_purge_wrapper(arena, &chunk_hooks,
+			post_zeroed = !chunk_purge_wrapper(arena, &chunk_hooks,
 			    CHUNK_ADDR2BASE((uintptr_t)ptr + usize),
 			    CHUNK_CEILING(oldsize),
 			    CHUNK_ADDR2OFFSET((uintptr_t)ptr + usize), sdiff);
 		}
 	} else
-		zeroed = true;
+		post_zeroed = pre_zeroed;
 
 	malloc_mutex_lock(&arena->huge_mtx);
 	/* Update the size of the huge allocation. */
 	extent_node_size_set(node, usize);
-	/* Clear node's zeroed field if zeroing failed above. */
-	extent_node_zeroed_set(node, extent_node_zeroed_get(node) && zeroed);
+	/* Update zeroed. */
+	extent_node_zeroed_set(node, post_zeroed);
 	malloc_mutex_unlock(&arena->huge_mtx);
 
 	/* Zap the excess chunks. */
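An illustrative model (not jemalloc code) of the distinction the pre_zeroed/post_zeroed split above draws: post_zeroed records what purging (or junk filling) did to the trimmed tail and is what gets written back to the extent node, while the grow path's decision to zero newly exposed bytes must consult the flag as it was before the resize.

#include <stdbool.h>

/*
 * pre_zeroed: the extent node's zeroed flag sampled before the resize.
 * want_zero:  the caller passed MALLOCX_ZERO (or opt.zero is enabled).
 *
 * Returns whether bytes newly exposed by growth need an explicit memset to 0.
 * The pre-fix code consulted a local flag that was unconditionally true in
 * the grow path, so the memset never ran and xallocx(..., MALLOCX_ZERO) could
 * hand back dirty trailing bytes.
 */
bool
must_zero_new_bytes(bool pre_zeroed, bool want_zero)
{

	return (want_zero && !pre_zeroed);
}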
diff --git a/src/prof.c b/src/prof.c
index 0a08062..5d2b959 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -1102,11 +1102,23 @@
 {
 	bool propagate_err = *(bool *)arg;
 
-	if (prof_dump_printf(propagate_err,
-	    "  t%"FMTu64": %"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]\n",
-	    tctx->thr_uid, tctx->dump_cnts.curobjs, tctx->dump_cnts.curbytes,
-	    tctx->dump_cnts.accumobjs, tctx->dump_cnts.accumbytes))
-		return (tctx);
+	switch (tctx->state) {
+	case prof_tctx_state_initializing:
+	case prof_tctx_state_nominal:
+		/* Not captured by this dump. */
+		break;
+	case prof_tctx_state_dumping:
+	case prof_tctx_state_purgatory:
+		if (prof_dump_printf(propagate_err,
+		    "  t%"FMTu64": %"FMTu64": %"FMTu64" [%"FMTu64": "
+		    "%"FMTu64"]\n", tctx->thr_uid, tctx->dump_cnts.curobjs,
+		    tctx->dump_cnts.curbytes, tctx->dump_cnts.accumobjs,
+		    tctx->dump_cnts.accumbytes))
+			return (tctx);
+		break;
+	default:
+		not_reached();
+	}
 	return (NULL);
 }
 
diff --git a/src/tsd.c b/src/tsd.c
index 2100833..9ffe9af 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -73,6 +73,9 @@
 	tsd_t *tsd = (tsd_t *)arg;
 
 	switch (tsd->state) {
+	case tsd_state_uninitialized:
+		/* Do nothing. */
+		break;
 	case tsd_state_nominal:
 #define O(n, t)								\
 		n##_cleanup(tsd);
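The tsd.c change corresponds to the NPTL workaround in the ChangeLog: the cleanup callback may now be invoked for a tsd that never left the uninitialized state, and it simply does nothing. A generic sketch of the same defensive pattern for a pthreads TSD destructor; the names here are hypothetical, not jemalloc's:

#include <pthread.h>
#include <stdlib.h>

typedef enum {
	STATE_UNINITIALIZED,	/* slot registered, never fully set up */
	STATE_NOMINAL		/* initialized and owning resources */
} slot_state_t;

typedef struct {
	slot_state_t	state;
	void		*resources;
} slot_t;

static void
slot_cleanup(void *arg)
{
	slot_t *slot = (slot_t *)arg;

	switch (slot->state) {
	case STATE_UNINITIALIZED:
		/* Never fully set up; there are no resources to release. */
		break;
	case STATE_NOMINAL:
		free(slot->resources);
		break;
	}
	free(slot);
}

/* Registered once at startup, e.g. pthread_key_create(&key, slot_cleanup). */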
diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c
index 3973938..6253175 100644
--- a/test/integration/mallocx.c
+++ b/test/integration/mallocx.c
@@ -52,9 +52,20 @@
 
 	hugemax = get_huge_size(get_nhuge()-1);
 
-	/* In practice hugemax is too large to be allocated. */
-	assert_ptr_null(mallocx(hugemax, 0),
-	    "Expected OOM for mallocx(size=%#zx, 0)", hugemax);
+	/*
+	 * It should be impossible to allocate two objects that each consume
+	 * more than half the virtual address space.
+	 */
+	{
+		void *p;
+
+		p = mallocx(hugemax, 0);
+		if (p != NULL) {
+			assert_ptr_null(mallocx(hugemax, 0),
+			    "Expected OOM for mallocx(size=%#zx, 0)", hugemax);
+			dallocx(p, 0);
+		}
+	}
 
 #if LG_SIZEOF_PTR == 3
 	size      = ZU(0x8000000000000000);
diff --git a/test/integration/xallocx.c b/test/integration/xallocx.c
index 058e27c..3736252 100644
--- a/test/integration/xallocx.c
+++ b/test/integration/xallocx.c
@@ -347,6 +347,112 @@
 }
 TEST_END
 
+static void
+print_filled_extents(const void *p, uint8_t c, size_t len)
+{
+	const uint8_t *pc = (const uint8_t *)p;
+	size_t i, range0;
+	uint8_t c0;
+
+	malloc_printf("  p=%p, c=%#x, len=%zu:", p, c, len);
+	range0 = 0;
+	c0 = pc[0];
+	for (i = 0; i < len; i++) {
+		if (pc[i] != c0) {
+			malloc_printf(" %#x[%zu..%zu)", c0, range0, i);
+			range0 = i;
+			c0 = pc[i];
+		}
+	}
+	malloc_printf(" %#x[%zu..%zu)\n", c0, range0, i);
+}
+
+static bool
+validate_fill(const void *p, uint8_t c, size_t offset, size_t len)
+{
+	const uint8_t *pc = (const uint8_t *)p;
+	bool err;
+	size_t i;
+
+	for (i = offset, err = false; i < offset+len; i++) {
+		if (pc[i] != c)
+			err = true;
+	}
+
+	if (err)
+		print_filled_extents(p, c, offset + len);
+
+	return (err);
+}
+
+static void
+test_zero(size_t szmin, size_t szmax)
+{
+	size_t sz, nsz;
+	void *p;
+#define	FILL_BYTE 0x7aU
+
+	sz = szmax;
+	p = mallocx(sz, MALLOCX_ZERO);
+	assert_ptr_not_null(p, "Unexpected mallocx() error");
+	assert_false(validate_fill(p, 0x00, 0, sz), "Memory not filled: sz=%zu",
+	    sz);
+
+	/*
+	 * Fill with non-zero so that non-debug builds are more likely to detect
+	 * errors.
+	 */
+	memset(p, FILL_BYTE, sz);
+	assert_false(validate_fill(p, FILL_BYTE, 0, sz),
+	    "Memory not filled: sz=%zu", sz);
+
+	/* Shrink in place so that we can expect growing in place to succeed. */
+	sz = szmin;
+	assert_zu_eq(xallocx(p, sz, 0, MALLOCX_ZERO), sz,
+	    "Unexpected xallocx() error");
+	assert_false(validate_fill(p, FILL_BYTE, 0, sz),
+	    "Memory not filled: sz=%zu", sz);
+
+	for (sz = szmin; sz < szmax; sz = nsz) {
+		nsz = nallocx(sz+1, MALLOCX_ZERO);
+		assert_zu_eq(xallocx(p, sz+1, 0, MALLOCX_ZERO), nsz,
+		    "Unexpected xallocx() failure");
+		assert_false(validate_fill(p, FILL_BYTE, 0, sz),
+		    "Memory not filled: sz=%zu", sz);
+		assert_false(validate_fill(p, 0x00, sz, nsz-sz),
+		    "Memory not filled: sz=%zu, nsz-sz=%zu", sz, nsz-sz);
+		memset((void *)((uintptr_t)p + sz), FILL_BYTE, nsz-sz);
+		assert_false(validate_fill(p, FILL_BYTE, 0, nsz),
+		    "Memory not filled: nsz=%zu", nsz);
+	}
+
+	dallocx(p, 0);
+}
+
+TEST_BEGIN(test_zero_large)
+{
+	size_t large0, largemax;
+
+	/* Get size classes. */
+	large0 = get_large_size(0);
+	largemax = get_large_size(get_nlarge()-1);
+
+	test_zero(large0, largemax);
+}
+TEST_END
+
+TEST_BEGIN(test_zero_huge)
+{
+	size_t huge0, huge1;
+
+	/* Get size classes. */
+	huge0 = get_huge_size(0);
+	huge1 = get_huge_size(1);
+
+	test_zero(huge1, huge0 * 2);
+}
+TEST_END
+
 int
 main(void)
 {
@@ -359,5 +465,7 @@
 	    test_size_extra_overflow,
 	    test_extra_small,
 	    test_extra_large,
-	    test_extra_huge));
+	    test_extra_huge,
+	    test_zero_large,
+	    test_zero_huge));
 }