Merge branch 'dev'
diff --git a/ChangeLog b/ChangeLog
index 4498683..58e4462 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,19 @@
 
     https://github.com/jemalloc/jemalloc
 
+* 4.0.2 (September 21, 2015)
+
+  This bugfix release addresses a few bugs specific to heap profiling.
+
+  Bug fixes:
+  - Fix ixallocx_prof_sample() to never modify nor create sampled small
+    allocations.  xallocx() is in general incapable of moving small allocations,
+    so this fix removes buggy code without loss of generality.
+  - Fix irallocx_prof_sample() to always allocate large regions, even when
+    alignment is non-zero.
+  - Fix prof_alloc_rollback() to read tdata from thread-specific data rather
+    than dereferencing a potentially invalid tctx.
+
 * 4.0.1 (September 15, 2015)
 
   This is a bugfix release that is somewhat high risk due to the amount of
diff --git a/Makefile.in b/Makefile.in
index 01285af..1ac6f29 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -353,6 +353,7 @@
 check_integration_prof: tests_integration check_integration_dir
 ifeq ($(enable_prof), 1)
 	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
+	$(MALLOC_CONF)="prof:true,prof_active:false" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
 endif
 check_integration: tests_integration check_integration_dir
 	$(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 62a887e..eed7aa0 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -190,7 +190,7 @@
 	return (false);							\
 }									\
 a_attr void								\
-a_name##tsd_boot1()							\
+a_name##tsd_boot1(void)							\
 {									\
 									\
 	/* Do nothing. */						\
@@ -235,7 +235,7 @@
 	return (false);							\
 }									\
 a_attr void								\
-a_name##tsd_boot1()							\
+a_name##tsd_boot1(void)							\
 {									\
 									\
 	/* Do nothing. */						\
@@ -345,7 +345,7 @@
 	return (false);							\
 }									\
 a_attr void								\
-a_name##tsd_boot1()							\
+a_name##tsd_boot1(void)							\
 {									\
 	a_name##tsd_wrapper_t *wrapper;					\
 	wrapper = (a_name##tsd_wrapper_t *)				\
@@ -467,7 +467,7 @@
 	return (false);							\
 }									\
 a_attr void								\
-a_name##tsd_boot1()							\
+a_name##tsd_boot1(void)							\
 {									\
 	a_name##tsd_wrapper_t *wrapper;					\
 	wrapper = (a_name##tsd_wrapper_t *)				\
diff --git a/src/arena.c b/src/arena.c
index 2e888ea..7f4a6ca 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -2560,7 +2560,7 @@
     JEMALLOC_N(arena_dalloc_junk_large_impl);
 #endif
 
-void
+static void
 arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk,
     void *ptr, bool junked)
 {
diff --git a/src/jemalloc.c b/src/jemalloc.c
index ab7cf02..5a2d324 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1923,6 +1923,7 @@
 		*alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags);
 		*usize = sa2u(size, *alignment);
 	}
+	assert(*usize != 0);
 	*zero = MALLOCX_ZERO_GET(flags);
 	if ((flags & MALLOCX_TCACHE_MASK) != 0) {
 		if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE)
@@ -1965,41 +1966,29 @@
     tcache_t *tcache, arena_t *arena)
 {
 
-	if (alignment != 0)
+	if (unlikely(alignment != 0))
 		return (ipalloct(tsd, usize, alignment, zero, tcache, arena));
-	if (zero)
+	if (unlikely(zero))
 		return (icalloct(tsd, usize, tcache, arena));
 	return (imalloct(tsd, usize, tcache, arena));
 }
 
-JEMALLOC_ALWAYS_INLINE_C void *
-imallocx_maybe_flags(tsd_t *tsd, size_t size, int flags, size_t usize,
-    size_t alignment, bool zero, tcache_t *tcache, arena_t *arena)
-{
-
-	if (likely(flags == 0))
-		return (imalloc(tsd, size));
-	return (imallocx_flags(tsd, usize, alignment, zero, tcache, arena));
-}
-
 static void *
-imallocx_prof_sample(tsd_t *tsd, size_t size, int flags, size_t usize,
-    size_t alignment, bool zero, tcache_t *tcache, arena_t *arena)
+imallocx_prof_sample(tsd_t *tsd, size_t usize, size_t alignment, bool zero,
+    tcache_t *tcache, arena_t *arena)
 {
 	void *p;
 
 	if (usize <= SMALL_MAXCLASS) {
 		assert(((alignment == 0) ? s2u(LARGE_MINCLASS) :
 		    sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS);
-		p = imallocx_maybe_flags(tsd, LARGE_MINCLASS, flags,
-		    LARGE_MINCLASS, alignment, zero, tcache, arena);
+		p = imallocx_flags(tsd, LARGE_MINCLASS, alignment, zero, tcache,
+		    arena);
 		if (p == NULL)
 			return (NULL);
 		arena_prof_promoted(p, usize);
-	} else {
-		p = imallocx_maybe_flags(tsd, size, flags, usize, alignment,
-		    zero, tcache, arena);
-	}
+	} else
+		p = imallocx_flags(tsd, usize, alignment, zero, tcache, arena);
 
 	return (p);
 }
@@ -2018,12 +2007,11 @@
 	    &zero, &tcache, &arena)))
 		return (NULL);
 	tctx = prof_alloc_prep(tsd, *usize, prof_active_get_unlocked(), true);
-	if (likely((uintptr_t)tctx == (uintptr_t)1U)) {
-		p = imallocx_maybe_flags(tsd, size, flags, *usize, alignment,
-		    zero, tcache, arena);
-	} else if ((uintptr_t)tctx > (uintptr_t)1U) {
-		p = imallocx_prof_sample(tsd, size, flags, *usize, alignment,
-		    zero, tcache, arena);
+	if (likely((uintptr_t)tctx == (uintptr_t)1U))
+		p = imallocx_flags(tsd, *usize, alignment, zero, tcache, arena);
+	else if ((uintptr_t)tctx > (uintptr_t)1U) {
+		p = imallocx_prof_sample(tsd, *usize, alignment, zero, tcache,
+		    arena);
 	} else
 		p = NULL;
 	if (unlikely(p == NULL)) {
@@ -2098,8 +2086,8 @@
 }
 
 static void *
-irallocx_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size,
-    size_t alignment, size_t usize, bool zero, tcache_t *tcache, arena_t *arena,
+irallocx_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize,
+    size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
     prof_tctx_t *tctx)
 {
 	void *p;
@@ -2113,7 +2101,7 @@
 			return (NULL);
 		arena_prof_promoted(p, usize);
 	} else {
-		p = iralloct(tsd, old_ptr, old_usize, size, alignment, zero,
+		p = iralloct(tsd, old_ptr, old_usize, usize, alignment, zero,
 		    tcache, arena);
 	}
 
@@ -2133,8 +2121,8 @@
 	old_tctx = prof_tctx_get(old_ptr);
 	tctx = prof_alloc_prep(tsd, *usize, prof_active, true);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
-		p = irallocx_prof_sample(tsd, old_ptr, old_usize, size,
-		    alignment, *usize, zero, tcache, arena, tctx);
+		p = irallocx_prof_sample(tsd, old_ptr, old_usize, *usize,
+		    alignment, zero, tcache, arena, tctx);
 	} else {
 		p = iralloct(tsd, old_ptr, old_usize, size, alignment, zero,
 		    tcache, arena);
@@ -2251,26 +2239,13 @@
 
 static size_t
 ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra,
-    size_t alignment, size_t usize_max, bool zero, prof_tctx_t *tctx)
+    size_t alignment, bool zero, prof_tctx_t *tctx)
 {
 	size_t usize;
 
 	if (tctx == NULL)
 		return (old_usize);
-	/* Use minimum usize to determine whether promotion may happen. */
-	if (((alignment == 0) ? s2u(size) : sa2u(size, alignment)) <=
-	    SMALL_MAXCLASS) {
-		if (ixalloc(ptr, old_usize, SMALL_MAXCLASS+1,
-		    (SMALL_MAXCLASS+1 >= size+extra) ? 0 : size+extra -
-		    (SMALL_MAXCLASS+1), alignment, zero))
-			return (old_usize);
-		usize = isalloc(ptr, config_prof);
-		if (usize_max < LARGE_MINCLASS)
-			arena_prof_promoted(ptr, usize);
-	} else {
-		usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
-		    zero);
-	}
+	usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero);
 
 	return (usize);
 }
@@ -2293,15 +2268,16 @@
 	 */
 	usize_max = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra,
 	    alignment);
+	assert(usize_max != 0);
 	tctx = prof_alloc_prep(tsd, usize_max, prof_active, false);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
 		usize = ixallocx_prof_sample(ptr, old_usize, size, extra,
-		    alignment, usize_max, zero, tctx);
+		    alignment, zero, tctx);
 	} else {
 		usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
 		    zero);
 	}
-	if (unlikely(usize == old_usize)) {
+	if (usize == old_usize) {
 		prof_alloc_rollback(tsd, tctx, false);
 		return (usize);
 	}
diff --git a/src/prof.c b/src/prof.c
index d68478f..0a08062 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -209,7 +209,7 @@
 		 */
 		tdata = prof_tdata_get(tsd, true);
 		if (tdata != NULL)
-			prof_sample_threshold_update(tctx->tdata);
+			prof_sample_threshold_update(tdata);
 	}
 
 	if ((uintptr_t)tctx > (uintptr_t)1U) {
diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c
index 4b0e33f..3973938 100644
--- a/test/integration/mallocx.c
+++ b/test/integration/mallocx.c
@@ -1,5 +1,74 @@
 #include "test/jemalloc_test.h"
 
+static unsigned
+get_nsizes_impl(const char *cmd)
+{
+	unsigned nsizes;
+	size_t len = sizeof(nsizes);
+
+	/* Fetch the unsigned value stored at the mallctl node named cmd. */
+	assert_d_eq(mallctl(cmd, &nsizes, &len, NULL, 0), 0,
+	    "Unexpected mallctl(\"%s\", ...) failure", cmd);
+
+	return (nsizes);
+}
+
+static unsigned
+get_nhuge(void)
+{
+	/* Reads "arenas.nhchunks"; presumably the huge size class count. */
+	return (get_nsizes_impl("arenas.nhchunks"));
+}
+
+/* Read the size_t at mallctl node cmd, with MIB component 2 set to ind. */
+static size_t
+get_size_impl(const char *cmd, size_t ind)
+{
+	size_t ret, z;
+	size_t mib[4];
+	size_t miblen = sizeof(mib) / sizeof(mib[0]);
+
+	assert_d_eq(mallctlnametomib(cmd, mib, &miblen),
+	    0, "Unexpected mallctlnametomib(\"%s\", ...) failure", cmd);
+	/* Replace the index placeholder in the translated MIB. */
+	mib[2] = ind;
+	z = sizeof(size_t);
+	assert_d_eq(mallctlbymib(mib, miblen, &ret, &z, NULL, 0),
+	    0, "Unexpected mallctlbymib([\"%s\", %zu], ...) failure", cmd, ind);
+
+	return (ret);
+}
+
+static size_t
+get_huge_size(size_t ind)
+{
+	/* Size reported for index ind under "arenas.hchunk.<i>.size". */
+	return (get_size_impl("arenas.hchunk.0.size", ind));
+}
+
+TEST_BEGIN(test_oom)
+{
+	size_t hugemax, size, alignment;
+
+	hugemax = get_huge_size(get_nhuge()-1);
+
+	/* In practice hugemax is too large to be allocated. */
+	assert_ptr_null(mallocx(hugemax, 0),
+	    "Expected OOM for mallocx(size=%#zx, 0)", hugemax);
+	/* Size and alignment are both a single high bit; expect failure too. */
+#if LG_SIZEOF_PTR == 3
+	size      = ZU(0x8000000000000000);
+	alignment = ZU(0x8000000000000000);
+#else
+	size      = ZU(0x80000000);
+	alignment = ZU(0x80000000);
+#endif
+	assert_ptr_null(mallocx(size, MALLOCX_ALIGN(alignment)),
+	    "Expected OOM for mallocx(size=%#zx, MALLOCX_ALIGN(%#zx))", size,
+	    alignment);
+}
+TEST_END
+
 TEST_BEGIN(test_basic)
 {
 #define	MAXSZ (((size_t)1) << 26)
@@ -96,6 +165,7 @@
 {
 
 	return (test(
+	    test_oom,
 	    test_basic,
 	    test_alignment_and_size));
 }