Merge branch 'dev'
diff --git a/.gitignore b/.gitignore
index d9e49f8..b468186 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,10 +6,11 @@
 /jemalloc/doc/jemalloc.3
 /jemalloc/lib/
 /jemalloc/Makefile
-/jemalloc/src/internal/jemalloc_internal\.h
-/jemalloc/src/internal/mtrgraph_defs\.h
-/jemalloc/src/internal/mtrplay_defs\.h
-/jemalloc/src/jemalloc\.h
-/jemalloc/src/jemalloc_defs\.h
+/jemalloc/include/jemalloc/internal/jemalloc_internal\.h
+/jemalloc/include/jemalloc/jemalloc\.h
+/jemalloc/include/jemalloc/jemalloc_defs\.h
+/jemalloc/test/jemalloc_test\.h
 /jemalloc/src/*.[od]
+/jemalloc/test/*.[od]
+/jemalloc/test/*.out
 /jemalloc/VERSION
diff --git a/jemalloc/COPYING b/jemalloc/COPYING
index 1baaf50..10ade12 100644
--- a/jemalloc/COPYING
+++ b/jemalloc/COPYING
@@ -3,6 +3,7 @@
 --------------------------------------------------------------------------------
 Copyright (C) 2002-2010 Jason Evans <jasone@canonware.com>.
 All rights reserved.
+Copyright (C) 2007-2010 Mozilla Foundation.  All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/jemalloc/ChangeLog b/jemalloc/ChangeLog
new file mode 100644
index 0000000..290dea1
--- /dev/null
+++ b/jemalloc/ChangeLog
@@ -0,0 +1,130 @@
+Following are change highlights associated with official releases.  Important
+bug fixes are all mentioned, but internal enhancements are omitted here for
+brevity (even though they are more fun to write about).  Much more detail can be
+found in the git revision history:
+
+    http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git
+    git://canonware.com/jemalloc.git
+
+* 2.0.0
+
+  This version focuses on the experimental *allocm() API, and on improved
+  run-time configuration/introspection.  Nonetheless, numerous performance
+  improvements are also included.
+
+  New features:
+    - Implement the experimental {,r,s,d}allocm() API, which provides a superset
+      of the functionality available via malloc(), calloc(), posix_memalign(),
+      realloc(), malloc_usable_size(), and free().  These functions can be used
+      to allocate/reallocate aligned zeroed memory, ask for optional extra
+      memory during reallocation, prevent object movement during reallocation,
+      etc.
+    - Replace JEMALLOC_OPTIONS/JEMALLOC_PROF_PREFIX with MALLOC_CONF, which is
+      more human-readable, and more flexible.  For example:
+        JEMALLOC_OPTIONS=AJP
+      is now:
+        MALLOC_CONF=abort:true,fill:true,stats_print:true
+    - Port to Apple OS X.  Sponsored by Mozilla.
+    - Make it possible for the application to control thread-->arena mappings
+      via the "thread.arena" mallctl.
+    - Add compile-time support for all TLS-related functionality via pthreads
+      TSD.  This is mainly of interest for OS X, which does not support TLS, but
+      has a TSD implementation with similar performance.
+    - Override memalign() and valloc() if they are provided by the system.
+    - Add the "arenas.purge" mallctl, which can be used to synchronously purge
+      all dirty unused pages.
+    - Make cumulative heap profiling data optional, so that it is possible to
+      limit the amount of memory consumed by heap profiling data structures.
+    - Add per thread allocation counters that can be accessed via the
+      "thread.allocated" and "thread.deallocated" mallctls.
+
+  Incompatible changes:
+    - Remove JEMALLOC_OPTIONS and malloc_options (see MALLOC_CONF above).
+    - Increase default backtrace depth from 4 to 128 for heap profiling.
+    - Disable interval-based profile dumps by default.
+
+  Bug fixes:
+  - Remove bad assertions in fork handler functions.  These assertions could
+    cause aborts for some combinations of configure settings.
+  - Fix strerror_r() usage to deal with non-standard semantics in GNU libc.
+  - Fix leak context reporting.  This bug tended to cause the number of contexts
+    to be underreported (though the reported number of objects and bytes were
+    correct).
+  - Fix a realloc() bug for large in-place growing reallocation.  This bug could
+    cause memory corruption, but it was hard to trigger.
+  - Fix an allocation bug for small allocations that could be triggered if
+    multiple threads raced to create a new run of backing pages.
+  - Enhance the heap profiler to trigger samples based on usable size, rather
+    than request size.
+  - Fix a heap profiling bug due to sometimes losing track of requested object
+    size for sampled objects.
+
+* 1.0.3
+
+  Bug fixes:
+  - Fix the libunwind-based implementation of stack backtracing (used for heap
+    profiling).  This bug could cause zero-length backtraces to be reported.
+  - Add a missing mutex unlock in library initialization code.  If multiple
+    threads raced to initialize malloc, some of them could end up permanently
+    blocked.
+
+* 1.0.2
+
+  Bug fixes:
+  - Fix junk filling of large objects, which could cause memory corruption.
+  - Add MAP_NORESERVE support for chunk mapping, because otherwise virtual
+    memory limits could cause swap file configuration to fail.  Contributed by
+    Jordan DeLong.
+
+* 1.0.1
+
+  Bug fixes:
+  - Fix compilation when --enable-fill is specified.
+  - Fix threads-related profiling bugs that affected accuracy and caused memory
+    to be leaked during thread exit.
+  - Fix dirty page purging race conditions that could cause crashes.
+  - Fix crash in tcache flushing code during thread destruction.
+
+* 1.0.0
+
+  This release focuses on speed and run-time introspection.  Numerous
+  algorithmic improvements make this release substantially faster than its
+  predecessors.
+
+  New features:
+  - Implement autoconf-based configuration system.
+  - Add mallctl*(), for the purposes of introspection and run-time
+    configuration.
+  - Make it possible for the application to manually flush a thread's cache, via
+    the "tcache.flush" mallctl.
+  - Base maximum dirty page count on proportion of active memory.
+  - Compute various additional run-time statistics, including per size class
+    statistics for large objects.
+  - Expose malloc_stats_print(), which can be called repeatedly by the
+    application.
+  - Simplify the malloc_message() signature to only take one string argument,
+    and incorporate an opaque data pointer argument for use by the application
+    in combination with malloc_stats_print().
+  - Add support for allocation backed by one or more swap files, and allow the
+    application to disable over-commit if swap files are in use.
+  - Implement allocation profiling and leak checking.
+
+  Removed features:
+  - Remove the dynamic arena rebalancing code, since thread-specific caching
+    reduces its utility.
+
+  Bug fixes:
+  - Modify chunk allocation to work when address space layout randomization
+    (ASLR) is in use.
+  - Fix thread cleanup bugs related to TLS destruction.
+  - Handle 0-size allocation requests in posix_memalign().
+  - Fix a chunk leak.  The leaked chunks were never touched, so this impacted
+    virtual memory usage, but not physical memory usage.
+
+* linux_20080828a, linux_20080827a
+
+  These snapshot releases are the simple result of incorporating Linux-specific
+  support into the FreeBSD malloc sources.
+
+--------------------------------------------------------------------------------
+vim:filetype=text:textwidth=80
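
The 2.0.0 entry above introduces the experimental {,r,s,d}allocm() API.  As a
rough illustration (a minimal C sketch, not part of this commit, assuming an
unprefixed build, the installed <jemalloc/jemalloc.h> header, and the ALLOCM_*
flag/return macros from the accompanying manual page), the calls combine as
follows:

    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        void *p;
        size_t rsize;

        /* Allocate at least 4096 zeroed bytes, 64-byte aligned. */
        if (allocm(&p, &rsize, 4096, ALLOCM_ALIGN(64) | ALLOCM_ZERO)
            != ALLOCM_SUCCESS)
            return (1);

        /* Ask for in-place growth; ALLOCM_NO_MOVE forbids relocation. */
        if (rallocm(&p, &rsize, 8192, 0, ALLOCM_NO_MOVE) != ALLOCM_SUCCESS) {
            /* Could not grow in place; handle as appropriate. */
        }

        dallocm(p, 0);
        return (0);
    }

With the OS X default prefix ("je_", see the INSTALL changes below), the same
entry points are spelled je_allocm(), je_rallocm(), and je_dallocm().
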
diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL
index eec3b37..fafd788 100644
--- a/jemalloc/INSTALL
+++ b/jemalloc/INSTALL
@@ -27,26 +27,42 @@
     it is linked to.  This works only on ELF-based systems.
 
 --with-jemalloc-prefix=<prefix>
-    Prefix all public APIs with <prefix>, so that, for example, malloc()
-    becomes <prefix>malloc().  This makes it possible to use jemalloc at the
-    same time as the system allocator.
+    Prefix all public APIs with <prefix>.  For example, if <prefix> is
+    "prefix_", the API changes like the following occur:
+
+      malloc()         --> prefix_malloc()
+      malloc_conf      --> prefix_malloc_conf
+      /etc/malloc.conf --> /etc/prefix_malloc.conf
+      MALLOC_CONF      --> PREFIX_MALLOC_CONF
+
+    This makes it possible to use jemalloc at the same time as the system
+    allocator, or even to use multiple copies of jemalloc simultaneously.
+
+    By default, the prefix is "", except on OS X, where it is "je_".  On OS X,
+    jemalloc overlays the default malloc zone, but makes no attempt to actually
+    replace the "malloc", "calloc", etc. symbols.
 
 --with-install-suffix=<suffix>
     Append <suffix> to the base name of all installed files, such that multiple
     versions of jemalloc can coexist in the same installation directory.  For
     example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0.
 
+--enable-cc-silence
+    Enable code that silences unuseful compiler warnings.  This is helpful when
+    trying to tell serious warnings from those due to compiler limitations, but
+    it potentially incurs a performance penalty.
+
 --enable-debug
     Enable assertions and validation code.  This incurs a substantial
     performance hit, but is very useful during application development.
 
 --enable-stats
-    Enable statistics gathering functionality.  Use the 'P' option to print
-    detailed allocation statistics at exit.
+    Enable statistics gathering functionality.  See the "opt.stats_print"
+    option documentation for usage details.
 
 --enable-prof
-    Enable heap profiling and leak detection functionality.  Use the 'B', 'E',
-    'F', 'I', 'L', and 'U' options to control these features.
+    Enable heap profiling and leak detection functionality.  See the "opt.prof"
+    option documentation for usage details.
 
 --disable-prof-libgcc
     Disable the use of libgcc's backtracing functionality.  Ordinarily, libgcc's
@@ -72,8 +88,8 @@
 
 --disable-tcache
     Disable thread-specific caches for small objects.  Objects are cached and
-    released in bulk, thus reducing the total number of mutex operations.  Use
-    the 'H', 'G', and 'M' options to control thread-specific caching.
+    released in bulk, thus reducing the total number of mutex operations.  See
+    the "opt.tcache" option for suage details.
 
 --enable-swap
     Enable mmap()ed swap file support.  When this feature is built in, it is
@@ -85,18 +101,18 @@
     mmap(2).
 
 --enable-fill
-    Enable support for junk/zero filling of memory.  Use the 'J' option to
-    control junk filling, or the 'Z' option to control zero filling.
+    Enable support for junk/zero filling of memory.  See the "opt.junk"/
+    "opt.zero" option documentation for usage details.
 
 --enable-xmalloc
     Enable support for optional immediate termination due to out-of-memory
     errors, as is commonly implemented by "xmalloc" wrapper function for malloc.
-    Use the 'X' option to control termination behavior.
+    See the "opt.xmalloc" option documentation for usage details.
 
 --enable-sysv
     Enable support for System V semantics, wherein malloc(0) returns NULL
-    rather than a minimal allocation.  Use the 'V' option to control System V
-    compatibility.
+    rather than a minimal allocation.  See the "opt.sysv" option documentation
+    for usage details.
 
 --enable-dynamic-page-shift
     Under most conditions, the system page size never changes (usually 4KiB or
@@ -213,3 +229,14 @@
     cd obj
     ../configure --enable-autogen
     make
+
+=== Documentation ==============================================================
+
+The manual page that the configure script generates can be manually formatted
+prior to installation via any of the following commands:
+
+    nroff -man -man-ext -t doc/jemalloc.3
+
+    groff -man -man-ext -t -Tps doc/jemalloc.3 | ps2pdf - doc/jemalloc.3.pdf
+
+    (cd doc; groff -man -man-ext -t -Thtml jemalloc.3 > jemalloc.3.html)
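
The revised INSTALL text consistently points at the "opt.*" mallctl namespace
instead of the old single-letter option characters.  As a rough illustration
(a minimal C sketch, not part of this commit, assuming an unprefixed build;
the exact mallctl()/malloc_stats_print() signatures are taken from the manual
page rather than from this diff), an option can be inspected at run time:

    #include <stdbool.h>
    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        bool tcache;
        size_t sz = sizeof(tcache);

        /* Report whether thread caching is enabled for this run. */
        if (mallctl("opt.tcache", &tcache, &sz, NULL, 0) == 0)
            printf("opt.tcache: %s\n", tcache ? "true" : "false");

        /* Print allocator statistics (most useful with --enable-stats). */
        malloc_stats_print(NULL, NULL, NULL);
        return (0);
    }

Link against the library built here, e.g. -ljemalloc (or the suffixed name
selected via --with-install-suffix).
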
diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in
index ac9b782..46eddf4 100644
--- a/jemalloc/Makefile.in
+++ b/jemalloc/Makefile.in
@@ -28,10 +28,17 @@
 RPATH_EXTRA := @RPATH_EXTRA@
 ifeq (macho, @abi@)
 SO := dylib
+WL_SONAME := dylib_install_name
 else
 SO := so
+WL_SONAME := soname
 endif
-REV := 0
+REV := 1
+ifeq (macho, @abi@)
+TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH=@objroot@lib
+else
+TEST_LIBRARY_PATH :=
+endif
 
 # Lists of files.
 BINS := @srcroot@bin/pprof
@@ -42,11 +49,18 @@
 	@srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \
 	@srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \
 	@srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \
-	@srcroot@src/prof.c @srcroot@src/stats.c @srcroot@src/tcache.c
-DSOS := @objroot@lib/libjemalloc@install_suffix@.so.$(REV) \
-	@objroot@lib/libjemalloc@install_suffix@.so \
+	@srcroot@src/prof.c @srcroot@src/rtree.c \
+	@srcroot@src/stats.c @srcroot@src/tcache.c
+ifeq (macho, @abi@)
+CSRCS += @srcroot@src/zone.c
+endif
+DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \
+	@objroot@lib/libjemalloc@install_suffix@.$(SO) \
 	@objroot@lib/libjemalloc@install_suffix@_pic.a
 MAN3 := @objroot@doc/jemalloc@install_suffix@.3
+CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \
+	@srcroot@test/posix_memalign.c \
+	@srcroot@test/rallocm.c @srcroot@test/thread_arena.c
 
 .PHONY: all dist install check clean distclean relclean
 
@@ -63,18 +77,32 @@
 	$(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $<
 	@$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)"
 
-%.so : %.so.$(REV)
+%.$(SO) : %.$(SO).$(REV)
 	@mkdir -p $(@D)
 	ln -sf $(<F) $@
 
-@objroot@lib/libjemalloc@install_suffix@.so.$(REV) : $(CSRCS:@srcroot@%.c=@objroot@%.o)
+@objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) : $(CSRCS:@srcroot@%.c=@objroot@%.o)
 	@mkdir -p $(@D)
-	$(CC) -shared -Wl,-soname,$(@F) -o $@ $+ $(LDFLAGS) $(LIBS)
+	$(CC) -shared -Wl,-$(WL_SONAME),$(@F) $(RPATH_EXTRA:%=@RPATH@%) -o $@ $+ $(LDFLAGS) $(LIBS)
 
 @objroot@lib/libjemalloc@install_suffix@_pic.a : $(CSRCS:@srcroot@%.c=@objroot@%.o)
 	@mkdir -p $(@D)
 	ar crus $@ $+
 
+@objroot@test/%.o: @srcroot@test/%.c
+	@mkdir -p $(@D)
+	$(CC) $(CFLAGS) -c $(CPPFLAGS) -I@objroot@test -o $@ $<
+	@$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) -I@objroot@test $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)"
+
+@objroot@test/%: @objroot@test/%.o \
+		 @objroot@lib/libjemalloc@install_suffix@.$(SO)
+	@mkdir -p $(@D)
+ifneq (@RPATH@, )
+	$(CC) -o $@ $< @RPATH@@objroot@lib -L@objroot@lib -ljemalloc@install_suffix@
+else
+	$(CC) -o $@ $< -L@objroot@lib -ljemalloc@install_suffix@
+endif
+
 install_bin:
 	install -d $(BINDIR)
 	@for b in $(BINS); do \
@@ -91,8 +119,8 @@
 
 install_lib: $(DSOS)
 	install -d $(LIBDIR)
-	install -m 755 @objroot@lib/libjemalloc@install_suffix@.so.$(REV) $(LIBDIR)
-	ln -sf libjemalloc@install_suffix@.so.$(REV) $(LIBDIR)/libjemalloc@install_suffix@.so
+	install -m 755 @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR)
+	ln -sf libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR)/libjemalloc@install_suffix@.$(SO)
 	install -m 755 @objroot@lib/libjemalloc@install_suffix@_pic.a $(LIBDIR)
 
 install_man:
@@ -104,19 +132,50 @@
 
 install: install_bin install_include install_lib install_man
 
-check:
+tests: $(CTESTS:@srcroot@%.c=@objroot@%)
+
+check: tests
+	@mkdir -p @objroot@test
+	@$(SHELL) -c 'total=0; \
+		failures=0; \
+		echo "========================================="; \
+		for t in $(CTESTS:@srcroot@%.c=@objroot@%); do \
+			total=`expr $$total + 1`; \
+			/bin/echo -n "$${t} ... "; \
+			$(TEST_LIBRARY_PATH) $${t} @abs_srcroot@ @abs_objroot@ \
+			  > @objroot@$${t}.out 2>&1; \
+			if test -e "@srcroot@$${t}.exp"; then \
+				diff -u @srcroot@$${t}.exp \
+				  @objroot@$${t}.out >/dev/null 2>&1; \
+				fail=$$?; \
+				if test "$${fail}" -eq "1" ; then \
+					failures=`expr $${failures} + 1`; \
+					echo "*** FAIL ***"; \
+				else \
+					echo "pass"; \
+				fi; \
+			else \
+				echo "*** FAIL *** (.exp file is missing)"; \
+				failures=`expr $${failures} + 1`; \
+			fi; \
+		done; \
+		echo "========================================="; \
+		echo "Failures: $${failures}/$${total}"'
 
 clean:
 	rm -f $(CSRCS:@srcroot@%.c=@objroot@%.o)
 	rm -f $(CSRCS:@srcroot@%.c=@objroot@%.d)
+	rm -f $(CTESTS:@srcroot@%.c=@objroot@%)
+	rm -f $(CTESTS:@srcroot@%.c=@objroot@%.o)
+	rm -f $(CTESTS:@srcroot@%.c=@objroot@%.d)
+	rm -f $(CTESTS:@srcroot@%.c=@objroot@%.out)
 	rm -f $(DSOS)
 
 distclean: clean
 	rm -rf @objroot@autom4te.cache
 	rm -f @objroot@config.log
 	rm -f @objroot@config.status
-	rm -f @objroot@cfghdrs.stamp
-	rm -f @objroot@cfgoutputs.stamp
+	rm -f @objroot@config.stamp
 	rm -f @cfghdrs_out@
 	rm -f @cfgoutputs_out@
 
diff --git a/jemalloc/README b/jemalloc/README
index 2ff36ef..4d7b552 100644
--- a/jemalloc/README
+++ b/jemalloc/README
@@ -1,9 +1,9 @@
 jemalloc is a general-purpose scalable concurrent malloc(3) implementation.
 This distribution is a stand-alone "portable" implementation that currently
-targets only Linux.  jemalloc is included as the default allocator in the
-FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox web
-browser on Microsoft Windows-related platforms.  Depending on your needs, one
-of the other divergent versions may suit your needs better than this
+targets Linux and Apple OS X.  jemalloc is included as the default allocator in
+the FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox
+web browser on Microsoft Windows-related platforms.  Depending on your needs,
+one of the other divergent versions may suit you better than this
 distribution.
 
 The COPYING file contains copyright and licensing information.
@@ -11,4 +11,6 @@
 The INSTALL file contains information on how to configure, build, and install
 jemalloc.
 
+The ChangeLog file contains a brief summary of changes for each release.
+
 URL: http://www.canonware.com/jemalloc/
diff --git a/jemalloc/bin/pprof b/jemalloc/bin/pprof
index 57c0600..1655f07 100755
--- a/jemalloc/bin/pprof
+++ b/jemalloc/bin/pprof
@@ -92,9 +92,7 @@
 my $KCACHEGRIND = "kcachegrind";
 my $PS2PDF = "ps2pdf";
 # These are used for dynamic profiles
-my $WGET = "wget";
-my $WGET_FLAGS = "--no-http-keep-alive";   # only supported by some wgets
-my $CURL = "curl";
+my $URL_FETCHER = "curl -s";
 
 # These are the web pages that servers need to support for dynamic profiles
 my $HEAP_PAGE = "/pprof/heap";
@@ -108,6 +106,12 @@
 my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
 my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
 
+# These are the web pages that can be named on the command line.
+# All the alternatives must begin with /.
+my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" .
+               "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
+               "$FILTEREDPROFILE_PAGE)";
+
 # default binary name
 my $UNKNOWN_BINARY = "(unknown)";
 
@@ -176,12 +180,14 @@
    --text              Generate text report
    --callgrind         Generate callgrind format to stdout
    --gv                Generate Postscript and display
+   --web               Generate SVG and display
    --list=<regexp>     Generate source listing of matching routines
    --disasm=<regexp>   Generate disassembly of matching routines
    --symbols           Print demangled symbol names found at given addresses
    --dot               Generate DOT file to stdout
   --ps                Generate Postscript to stdout
    --pdf               Generate PDF to stdout
+   --svg               Generate SVG to stdout
    --gif               Generate GIF to stdout
    --raw               Generate symbolized pprof data (useful with remote fetch)
 
@@ -209,7 +215,7 @@
                        (i.e. direct leak generators) more visible
 
 Miscellaneous:
-   --tools=<prefix>    Prefix for object tool pathnames
+   --tools=<prefix or binary:fullpath>[,...]   \$PATH for object tool pathnames
    --test              Run unit tests
    --help              This message
    --version           Version information
@@ -224,6 +230,8 @@
                        Enters "interactive" mode
 pprof --text /bin/ls ls.prof
                        Outputs one line per procedure
+pprof --web /bin/ls ls.prof
+                       Displays annotated call-graph in web browser
 pprof --gv /bin/ls ls.prof
                        Displays annotated call-graph via 'gv'
 pprof --gv --focus=Mutex /bin/ls ls.prof
@@ -234,6 +242,9 @@
                        (Per-line) annotated source listing for getdir()
 pprof --disasm=getdir /bin/ls ls.prof
                        (Per-PC) annotated disassembly for getdir()
+
+pprof http://localhost:1234/
+                       Enters "interactive" mode
 pprof --text localhost:1234
                        Outputs one line per procedure for localhost:1234
 pprof --raw localhost:1234 > ./local.raw
@@ -293,10 +304,12 @@
   $main::opt_disasm = "";
   $main::opt_symbols = 0;
   $main::opt_gv = 0;
+  $main::opt_web = 0;
   $main::opt_dot = 0;
   $main::opt_ps = 0;
   $main::opt_pdf = 0;
   $main::opt_gif = 0;
+  $main::opt_svg = 0;
   $main::opt_raw = 0;
 
   $main::opt_nodecount = 80;
@@ -331,6 +344,9 @@
   # Are we using $SYMBOL_PAGE?
   $main::use_symbol_page = 0;
 
+  # Files returned by TempName.
+  %main::tempnames = ();
+
   # Type of profile we are dealing with
   # Supported types:
   #     cpu
@@ -356,9 +372,11 @@
              "disasm=s"       => \$main::opt_disasm,
              "symbols!"       => \$main::opt_symbols,
              "gv!"            => \$main::opt_gv,
+             "web!"           => \$main::opt_web,
              "dot!"           => \$main::opt_dot,
              "ps!"            => \$main::opt_ps,
              "pdf!"           => \$main::opt_pdf,
+             "svg!"           => \$main::opt_svg,
              "gif!"           => \$main::opt_gif,
              "raw!"           => \$main::opt_raw,
              "interactive!"   => \$main::opt_interactive,
@@ -434,9 +452,11 @@
       ($main::opt_disasm eq '' ? 0 : 1) +
       ($main::opt_symbols == 0 ? 0 : 1) +
       $main::opt_gv +
+      $main::opt_web +
       $main::opt_dot +
       $main::opt_ps +
       $main::opt_pdf +
+      $main::opt_svg +
       $main::opt_gif +
       $main::opt_raw +
       $main::opt_interactive +
@@ -511,20 +531,6 @@
     ConfigureObjTools($main::prog)
   }
 
-  # Check what flags our commandline utilities support
-  if (open(TFILE, "$WGET $WGET_FLAGS -V 2>&1 |")) {
-    my @lines = <TFILE>;
-    if (grep(/unrecognized/, @lines) > 0) {
-      # grep found 'unrecognized' token from WGET, clear WGET flags
-      $WGET_FLAGS = "";
-    }
-    close(TFILE);
-  }
-  # TODO(csilvers): check all the other binaries and objtools to see
-  # if they are installed and what flags they support, and store that
-  # in a data structure here, rather than scattering these tests about.
-  # Then, ideally, rewrite code to use wget OR curl OR GET or ...
-
   # Break the opt_list_prefix into the prefix_list array
   @prefix_list = split (',', $main::opt_lib_prefix);
 
@@ -588,6 +594,10 @@
   } elsif ($main::use_symbol_page) {
     $symbols = FetchSymbols($pcs);
   } else {
+    # TODO(csilvers): $libs uses the /proc/self/maps data from profile1,
+    # which may differ from the data from subsequent profiles, especially
+    # if they were run on different machines.  Use appropriate libs for
+    # each pc somehow.
     $symbols = ExtractSymbols($libs, $pcs);
   }
 
@@ -635,9 +645,24 @@
     } else {
       if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
         if ($main::opt_gv) {
-          RunGV(PsTempName($main::next_tmpfile), "");
+          RunGV(TempName($main::next_tmpfile, "ps"), "");
+        } elsif ($main::opt_web) {
+          my $tmp = TempName($main::next_tmpfile, "svg");
+          RunWeb($tmp);
+          # The command we run might hand the file name off
+          # to an already running browser instance and then exit.
+          # Normally, we'd remove $tmp on exit (right now),
+          # but fork a child to remove $tmp a little later, so that the
+          # browser has time to load it first.
+          delete $main::tempnames{$tmp};
+          if (fork() == 0) {
+            sleep 5;
+            unlink($tmp);
+            exit(0);
+          }
         }
       } else {
+        cleanup();
         exit(1);
       }
     }
@@ -683,6 +708,34 @@
   }
 }
 
+sub RunWeb {
+  my $fname = shift;
+  print STDERR "Loading web page file:///$fname\n";
+
+  if (`uname` =~ /Darwin/) {
+    # OS X: open will use standard preference for SVG files.
+    system("/usr/bin/open", $fname);
+    return;
+  }
+
+  # Some kind of Unix; try generic symlinks, then specific browsers.
+  # (Stop once we find one.)
+  # Works best if the browser is already running.
+  my @alt = (
+    "/etc/alternatives/gnome-www-browser",
+    "/etc/alternatives/x-www-browser",
+    "google-chrome",
+    "firefox",
+  );
+  foreach my $b (@alt) {
+    if (system($b, $fname) == 0) {
+      return;
+    }
+  }
+
+  print STDERR "Could not load web browser.\n";
+}
+
 sub RunKcachegrind {
   my $fname = shift;
   my $bg = shift;       # "" or " &" if we should run in background
@@ -739,10 +792,10 @@
     print STDERR "\n";
     return 0;
   }
-  if (m/^ *quit/) {
+  if (m/^\s*quit/) {
     return 0;
   }
-  if (m/^ *help/) {
+  if (m/^\s*help/) {
     InteractiveHelpMessage();
     return 1;
   }
@@ -754,7 +807,7 @@
   $main::opt_gv = 0;
   $main::opt_cum = 0;
 
-  if (m/^ *(text|top)(\d*) *(.*)/) {
+  if (m/^\s*(text|top)(\d*)\s*(.*)/) {
     $main::opt_text = 1;
 
     my $line_limit = ($2 ne "") ? int($2) : 10;
@@ -773,14 +826,14 @@
     PrintText($symbols, $flat, $cumulative, $total, $line_limit);
     return 1;
   }
-  if (m/^ *callgrind *([^ \n]*)/) {
+  if (m/^\s*callgrind\s*([^ \n]*)/) {
     $main::opt_callgrind = 1;
 
     # Get derived profiles
     my $calls = ExtractCalls($symbols, $orig_profile);
     my $filename = $1;
     if ( $1 eq '' ) {
-      $filename = CallgrindTempName($main::next_tmpfile);
+      $filename = TempName($main::next_tmpfile, "callgrind");
     }
     PrintCallgrind($calls, $filename);
     if ( $1 eq '' ) {
@@ -790,7 +843,7 @@
 
     return 1;
   }
-  if (m/^ *list *(.+)/) {
+  if (m/^\s*list\s*(.+)/) {
     $main::opt_list = 1;
 
     my $routine;
@@ -807,7 +860,7 @@
     PrintListing($libs, $flat, $cumulative, $routine);
     return 1;
   }
-  if (m/^ *disasm *(.+)/) {
+  if (m/^\s*disasm\s*(.+)/) {
     $main::opt_disasm = 1;
 
     my $routine;
@@ -825,12 +878,18 @@
     PrintDisassembly($libs, $flat, $cumulative, $routine, $total);
     return 1;
   }
-  if (m/^ *gv *(.*)/) {
-    $main::opt_gv = 1;
+  if (m/^\s*(gv|web)\s*(.*)/) {
+    $main::opt_gv = 0;
+    $main::opt_web = 0;
+    if ($1 eq "gv") {
+      $main::opt_gv = 1;
+    } elsif ($1 eq "web") {
+      $main::opt_web = 1;
+    }
 
     my $focus;
     my $ignore;
-    ($focus, $ignore) = ParseInteractiveArgs($1);
+    ($focus, $ignore) = ParseInteractiveArgs($2);
 
     # Process current profile to account for various settings
     my $profile = ProcessProfile($orig_profile, $symbols, $focus, $ignore);
@@ -841,11 +900,19 @@
     my $cumulative = CumulativeProfile($reduced);
 
     if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
-      RunGV(PsTempName($main::next_tmpfile), " &");
+      if ($main::opt_gv) {
+        RunGV(TempName($main::next_tmpfile, "ps"), " &");
+      } elsif ($main::opt_web) {
+        RunWeb(TempName($main::next_tmpfile, "svg"));
+      }
       $main::next_tmpfile++;
     }
     return 1;
   }
+  if (m/^\s*$/) {
+    return 1;
+  }
+  print STDERR "Unknown command: try 'help'.\n";
   return 1;
 }
 
@@ -894,6 +961,14 @@
       the "focus" regular expression matches a routine name on the stack
       trace.
 
+  web
+  web [focus] [-ignore1] [-ignore2]
+      Like GV, but displays profile in your web browser instead of using
+      Ghostview. Works best if your web browser is already running.
+      To change the browser that gets used:
+      On Linux, set the /etc/alternatives/gnome-www-browser symlink.
+      On OS X, change the Finder association for SVG files.
+
   list [routine_regexp] [-ignore1] [-ignore2]
       Show source listing of routines whose names match "routine_regexp"
 
@@ -950,14 +1025,12 @@
 
 ##### Output code #####
 
-sub PsTempName {
+sub TempName {
   my $fnum = shift;
-  return "$main::tmpfile_ps" . "." . "$fnum" . ".ps";
-}
-
-sub CallgrindTempName {
-  my $fnum = shift;
-  return "$main::tmpfile_ps" . "." . "$fnum" . ".callgrind";
+  my $ext = shift;
+  my $file = "$main::tmpfile_ps.$fnum.$ext";
+  $main::tempnames{$file} = 1;
+  return $file;
 }
 
 # Print profile data in packed binary format (64-bit) to standard out
@@ -1599,7 +1672,6 @@
   }
   if ($last < 0) {
     print STDERR "No nodes to print\n";
-    cleanup();
     return 0;
   }
 
@@ -1612,11 +1684,14 @@
   # Open DOT output file
   my $output;
   if ($main::opt_gv) {
-    $output = "| $DOT -Tps2 >" . PsTempName($main::next_tmpfile);
+    $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps");
   } elsif ($main::opt_ps) {
     $output = "| $DOT -Tps2";
   } elsif ($main::opt_pdf) {
     $output = "| $DOT -Tps2 | $PS2PDF - -";
+  } elsif ($main::opt_web || $main::opt_svg) {
+    # We need to post-process the SVG, so write to a temporary file always.
+    $output = "| $DOT -Tsvg >" . TempName($main::next_tmpfile, "svg");
   } elsif ($main::opt_gif) {
     $output = "| $DOT -Tgif";
   } else {
@@ -1727,7 +1802,10 @@
       my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0);
       if ($fraction > 1) { $fraction = 1; }
       my $w = $fraction * 2;
-      #if ($w < 1) { $w = 1; }
+      if ($w < 1 && ($main::opt_web || $main::opt_svg)) {
+        # SVG output treats line widths < 1 poorly.
+        $w = 1;
+      }
 
       # Dot sometimes segfaults if given edge weights that are too large, so
       # we cap the weights at a large value
@@ -1751,11 +1829,312 @@
   }
 
   print DOT ("}\n");
-
   close(DOT);
+
+  if ($main::opt_web || $main::opt_svg) {
+    # Rewrite SVG to be more usable inside web browser.
+    RewriteSvg(TempName($main::next_tmpfile, "svg"));
+  }
+
   return 1;
 }
 
+sub RewriteSvg {
+  my $svgfile = shift;
+
+  open(SVG, $svgfile) || die "open temp svg: $!";
+  my @svg = <SVG>;
+  close(SVG);
+  unlink $svgfile;
+  my $svg = join('', @svg);
+
+  # Dot's SVG output is
+  #
+  #    <svg width="___" height="___"
+  #     viewBox="___" xmlns=...>
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </svg>
+  #
+  # Change it to
+  #
+  #    <svg width="100%" height="100%"
+  #     xmlns=...>
+  #    $svg_javascript
+  #    <g id="viewport" transform="translate(0,0)">
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </g>
+  #    </svg>
+
+  # Fix width, height; drop viewBox.
+  $svg =~ s/(?s)<svg width="[^"]+" height="[^"]+"(.*?)viewBox="[^"]+"/<svg width="100%" height="100%"$1/;
+
+  # Insert script, viewport <g> above first <g>
+  my $svg_javascript = SvgJavascript();
+  my $viewport = "<g id=\"viewport\" transform=\"translate(0,0)\">\n";
+  $svg =~ s/<g id="graph\d"/$svg_javascript$viewport$&/;
+
+  # Insert final </g> above </svg>.
+  $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/;
+  $svg =~ s/<g id="graph\d"(.*?)/<g id="viewport"$1/;
+
+  if ($main::opt_svg) {
+    # --svg: write to standard output.
+    print $svg;
+  } else {
+    # Write back to temporary file.
+    open(SVG, ">$svgfile") || die "open $svgfile: $!";
+    print SVG $svg;
+    close(SVG);
+  }
+}
+
+sub SvgJavascript {
+  return <<'EOF';
+<script type="text/ecmascript"><![CDATA[
+// SVGPan
+// http://www.cyberz.org/blog/2009/12/08/svgpan-a-javascript-svg-panzoomdrag-library/
+// Local modification: if(true || ...) below to force panning, never moving.
+
+/**
+ *  SVGPan library 1.2
+ * ====================
+ *
+ * Given a unique existing element with id "viewport", including the
+ * library into any SVG adds the following capabilities:
+ *
+ *  - Mouse panning
+ *  - Mouse zooming (using the wheel)
+ *  - Object dragging
+ *
+ * Known issues:
+ *
+ *  - Zooming (while panning) on Safari has still some issues
+ *
+ * Releases:
+ *
+ * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui
+ *	Fixed a bug with browser mouse handler interaction
+ *
+ * 1.1, Wed Feb  3 17:39:33 GMT 2010, Zeng Xiaohui
+ *	Updated the zoom code to support the mouse wheel on Safari/Chrome
+ *
+ * 1.0, Andrea Leofreddi
+ *	First release
+ *
+ * This code is licensed under the following BSD license:
+ *
+ * Copyright 2009-2010 Andrea Leofreddi <a.leofreddi@itcharm.com>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *       of conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Andrea Leofreddi OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation are those of the
+ * authors and should not be interpreted as representing official policies, either expressed
+ * or implied, of Andrea Leofreddi.
+ */
+
+var root = document.documentElement;
+
+var state = 'none', stateTarget, stateOrigin, stateTf;
+
+setupHandlers(root);
+
+/**
+ * Register handlers
+ */
+function setupHandlers(root){
+	setAttributes(root, {
+		"onmouseup" : "add(evt)",
+		"onmousedown" : "handleMouseDown(evt)",
+		"onmousemove" : "handleMouseMove(evt)",
+		"onmouseup" : "handleMouseUp(evt)",
+		//"onmouseout" : "handleMouseUp(evt)", // Decomment this to stop the pan functionality when dragging out of the SVG element
+	});
+
+	if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0)
+		window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari
+	else
+		window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others
+
+	var g = svgDoc.getElementById("svg");
+	g.width = "100%";
+	g.height = "100%";
+}
+
+/**
+ * Instance an SVGPoint object with given event coordinates.
+ */
+function getEventPoint(evt) {
+	var p = root.createSVGPoint();
+
+	p.x = evt.clientX;
+	p.y = evt.clientY;
+
+	return p;
+}
+
+/**
+ * Sets the current transform matrix of an element.
+ */
+function setCTM(element, matrix) {
+	var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")";
+
+	element.setAttribute("transform", s);
+}
+
+/**
+ * Dumps a matrix to a string (useful for debug).
+ */
+function dumpMatrix(matrix) {
+	var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\n  " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\n  0, 0, 1 ]";
+
+	return s;
+}
+
+/**
+ * Sets attributes of an element.
+ */
+function setAttributes(element, attributes){
+	for (i in attributes)
+		element.setAttributeNS(null, i, attributes[i]);
+}
+
+/**
+ * Handle mouse move event.
+ */
+function handleMouseWheel(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var delta;
+
+	if(evt.wheelDelta)
+		delta = evt.wheelDelta / 3600; // Chrome/Safari
+	else
+		delta = evt.detail / -90; // Mozilla
+
+	var z = 1 + delta; // Zoom factor: 0.9/1.1
+
+	var g = svgDoc.getElementById("viewport");
+
+	var p = getEventPoint(evt);
+
+	p = p.matrixTransform(g.getCTM().inverse());
+
+	// Compute new scale matrix in current mouse position
+	var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y);
+
+        setCTM(g, g.getCTM().multiply(k));
+
+	stateTf = stateTf.multiply(k.inverse());
+}
+
+/**
+ * Handle mouse move event.
+ */
+function handleMouseMove(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var g = svgDoc.getElementById("viewport");
+
+	if(state == 'pan') {
+		// Pan mode
+		var p = getEventPoint(evt).matrixTransform(stateTf);
+
+		setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y));
+	} else if(state == 'move') {
+		// Move mode
+		var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse());
+
+		setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM()));
+
+		stateOrigin = p;
+	}
+}
+
+/**
+ * Handle click event.
+ */
+function handleMouseDown(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var g = svgDoc.getElementById("viewport");
+
+	if(true || evt.target.tagName == "svg") {
+		// Pan mode
+		state = 'pan';
+
+		stateTf = g.getCTM().inverse();
+
+		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+	} else {
+		// Move mode
+		state = 'move';
+
+		stateTarget = evt.target;
+
+		stateTf = g.getCTM().inverse();
+
+		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+	}
+}
+
+/**
+ * Handle mouse button release event.
+ */
+function handleMouseUp(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	if(state == 'pan' || state == 'move') {
+		// Quit pan mode
+		state = '';
+	}
+}
+
+]]></script>
+EOF
+}
+
 # Translate a stack of addresses into a stack of symbols
 sub TranslateStack {
   my $symbols = shift;
@@ -2310,28 +2689,11 @@
   AddEntry($profile, (join "\n", @k), $count);
 }
 
-sub IsSymbolizedProfileFile {
-  my $file_name = shift;
-
-  if (!(-e $file_name) || !(-r $file_name)) {
-    return 0;
-  }
-
-  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $symbol_marker = $&;
-  # Check if the file contains a symbol-section marker.
-  open(TFILE, "<$file_name");
-  my @lines = <TFILE>;
-  my $result = grep(/^--- *$symbol_marker/, @lines);
-  close(TFILE);
-  return $result > 0;
-}
-
 ##### Code to profile a server dynamically #####
 
 sub CheckSymbolPage {
   my $url = SymbolPageURL();
-  open(SYMBOL, "$WGET $WGET_FLAGS -qO- '$url' |");
+  open(SYMBOL, "$URL_FETCHER '$url' |");
   my $line = <SYMBOL>;
   $line =~ s/\r//g;         # turn windows-looking lines into unix-looking lines
   close(SYMBOL);
@@ -2350,33 +2712,45 @@
 
 sub IsProfileURL {
   my $profile_name = shift;
-  my ($host, $port, $path) = ParseProfileURL($profile_name);
-  return defined($host) and defined($port) and defined($path);
+  if (-f $profile_name) {
+    printf STDERR "Using local file $profile_name.\n";
+    return 0;
+  }
+  return 1;
 }
 
 sub ParseProfileURL {
   my $profile_name = shift;
-  if (defined($profile_name) &&
-      $profile_name =~ m,^(http://|)([^/:]+):(\d+)(|\@\d+)(|/|.*($PROFILE_PAGE|$PMUPROFILE_PAGE|$HEAP_PAGE|$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|$FILTEREDPROFILE_PAGE))$,o) {
-    # $6 is $PROFILE_PAGE/$HEAP_PAGE/etc.  $5 is *everything* after
-    # the hostname, as long as that everything is the empty string,
-    # a slash, or something ending in $PROFILE_PAGE/$HEAP_PAGE/etc.
-    # So "$6 || $5" is $PROFILE_PAGE/etc if there, or else it's "/" or "".
-    return ($2, $3, $6 || $5);
+
+  if (!defined($profile_name) || $profile_name eq "") {
+    return ();
   }
-  return ();
+
+  # Split profile URL - matches all non-empty strings, so no test.
+  $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;
+
+  my $proto = $1 || "http://";
+  my $hostport = $2;
+  my $prefix = $3;
+  my $profile = $4 || "/";
+
+  my $host = $hostport;
+  $host =~ s/:.*//;
+
+  my $baseurl = "$proto$hostport$prefix";
+  return ($host, $baseurl, $profile);
 }
 
 # We fetch symbols from the first profile argument.
 sub SymbolPageURL {
-  my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]);
-  return "http://$host:$port$SYMBOL_PAGE";
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  return "$baseURL$SYMBOL_PAGE";
 }
 
 sub FetchProgramName() {
-  my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]);
-  my $url = "http://$host:$port$PROGRAM_NAME_PAGE";
-  my $command_line = "$WGET $WGET_FLAGS -qO- '$url'";
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  my $url = "$baseURL$PROGRAM_NAME_PAGE";
+  my $command_line = "$URL_FETCHER '$url'";
   open(CMDLINE, "$command_line |") or error($command_line);
   my $cmdline = <CMDLINE>;
   $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
@@ -2393,7 +2767,7 @@
 # curl.  Redirection happens on borg hosts.
 sub ResolveRedirectionForCurl {
   my $url = shift;
-  my $command_line = "$CURL -s --head '$url'";
+  my $command_line = "$URL_FETCHER --head '$url'";
   open(CMDLINE, "$command_line |") or error($command_line);
   while (<CMDLINE>) {
     s/\r//g;         # turn windows-looking lines into unix-looking lines
@@ -2405,6 +2779,20 @@
   return $url;
 }
 
+# Add a timeout flag to URL_FETCHER
+sub AddFetchTimeout {
+  my $fetcher = shift;
+  my $timeout = shift;
+  if (defined($timeout)) {
+    if ($fetcher =~ m/\bcurl -s/) {
+      $fetcher .= sprintf(" --max-time %d", $timeout);
+    } elsif ($fetcher =~ m/\brpcget\b/) {
+      $fetcher .= sprintf(" --deadline=%d", $timeout);
+    }
+  }
+  return $fetcher;
+}
+
 # Reads a symbol map from the file handle name given as $1, returning
 # the resulting symbol map.  Also processes variables relating to symbols.
 # Currently, the only variable processed is 'binary=<value>' which updates
@@ -2460,10 +2848,14 @@
     close(POSTFILE);
 
     my $url = SymbolPageURL();
-    # Here we use curl for sending data via POST since old
-    # wget doesn't have --post-file option.
-    $url = ResolveRedirectionForCurl($url);
-    my $command_line = "$CURL -sd '\@$main::tmpfile_sym' '$url'";
+
+    my $command_line;
+    if ($URL_FETCHER =~ m/\bcurl -s/) {
+      $url = ResolveRedirectionForCurl($url);
+      $command_line = "$URL_FETCHER -d '\@$main::tmpfile_sym' '$url'";
+    } else {
+      $command_line = "$URL_FETCHER --post '$url' < '$main::tmpfile_sym'";
+    }
     # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols.
     my $cppfilt = $obj_tool_map{"c++filt"};
     open(SYMBOL, "$command_line | $cppfilt |") or error($command_line);
@@ -2508,10 +2900,10 @@
 
 sub MakeProfileBaseName {
   my ($binary_name, $profile_name) = @_;
-  my ($host, $port, $path) = ParseProfileURL($profile_name);
+  my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
   my $binary_shortname = BaseName($binary_name);
-  return sprintf("%s.%s.%s-port%s",
-                 $binary_shortname, $main::op_time, $host, $port);
+  return sprintf("%s.%s.%s",
+                 $binary_shortname, $main::op_time, $host);
 }
 
 sub FetchDynamicProfile {
@@ -2523,7 +2915,7 @@
   if (!IsProfileURL($profile_name)) {
     return $profile_name;
   } else {
-    my ($host, $port, $path) = ParseProfileURL($profile_name);
+    my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
     if ($path eq "" || $path eq "/") {
       # Missing type specifier defaults to cpu-profile
       $path = $PROFILE_PAGE;
@@ -2531,35 +2923,26 @@
 
     my $profile_file = MakeProfileBaseName($binary_name, $profile_name);
 
-    my $url;
-    my $wget_timeout;
-    if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)) {
-      if ($path =~ m/$PROFILE_PAGE/) {
-        $url = sprintf("http://$host:$port$path?seconds=%d",
-            $main::opt_seconds);
+    my $url = "$baseURL$path";
+    my $fetch_timeout = undef;
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) {
+      if ($path =~ m/[?]/) {
+        $url .= "&";
       } else {
-        if ($profile_name =~ m/[?]/) {
-          $profile_name .= "&"
-        } else {
-          $profile_name .= "?"
-        }
-        $url = sprintf("http://$profile_name" . "seconds=%d",
-            $main::opt_seconds);
+        $url .= "?";
       }
-      $wget_timeout = sprintf("--timeout=%d",
-                              int($main::opt_seconds * 1.01 + 60));
+      $url .= sprintf("seconds=%d", $main::opt_seconds);
+      $fetch_timeout = $main::opt_seconds * 1.01 + 60;
     } else {
       # For non-CPU profiles, we add a type-extension to
       # the target profile file name.
       my $suffix = $path;
       $suffix =~ s,/,.,g;
-      $profile_file .= "$suffix";
-      $url = "http://$host:$port$path";
-      $wget_timeout = "";
+      $profile_file .= $suffix;
     }
 
     my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof");
-    if (!(-d $profile_dir)) {
+    if (! -d $profile_dir) {
       mkdir($profile_dir)
           || die("Unable to create profile directory $profile_dir: $!\n");
     }
@@ -2570,14 +2953,15 @@
       return $real_profile;
     }
 
-    my $cmd = "$WGET $WGET_FLAGS $wget_timeout -q -O $tmp_profile '$url'";
-    if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)){
+    my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout);
+    my $cmd = "$fetcher '$url' > '$tmp_profile'";
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){
       print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n  ${real_profile}\n";
       if ($encourage_patience) {
         print STDERR "Be patient...\n";
       }
     } else {
-      print STDERR "Fetching $path profile from $host:$port to\n  ${real_profile}\n";
+      print STDERR "Fetching $path profile from $url to\n  ${real_profile}\n";
     }
 
     (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n");
@@ -2624,6 +3008,7 @@
   } else {
     $position = 1 | ($position << 1);
     TryCollectProfile($maxlevel, $level, $position);
+    cleanup();
     exit(0);
   }
 }
@@ -2662,6 +3047,7 @@
                  stride      => 512 * 1024,   # must be a multiple of bitsize/8
                  slots       => [],
                  unpack_code => "",           # N for big-endian, V for little
+                 perl_is_64bit => 1,          # matters if profile is 64-bit
     };
     bless $self, $class;
     # Let unittests adjust the stride
@@ -2685,17 +3071,15 @@
       }
       @$slots = unpack($self->{unpack_code} . "*", $str);
     } else {
-      # If we're a 64-bit profile, make sure we're a 64-bit-capable
+      # If we're a 64-bit profile, check if we're a 64-bit-capable
       # perl.  Otherwise, each slot will be represented as a float
       # instead of an int64, losing precision and making all the
-      # 64-bit addresses right.  We *could* try to handle this with
-      # software emulation of 64-bit ints, but that's added complexity
-      # for no clear benefit (yet).  We use 'Q' to test for 64-bit-ness;
-      # perl docs say it's only available on 64-bit perl systems.
+      # 64-bit addresses wrong.  We won't complain yet, but will
+      # later if we ever see a value that doesn't fit in 32 bits.
       my $has_q = 0;
       eval { $has_q = pack("Q", "1") ? 1 : 1; };
       if (!$has_q) {
-        ::error("$fname: need a 64-bit perl to process this 64-bit profile.\n");
+	$self->{perl_is_64bit} = 0;
       }
       read($self->{file}, $str, 8);
       if (substr($str, 4, 4) eq chr(0)x4) {
@@ -2731,11 +3115,17 @@
         # TODO(csilvers): if this is a 32-bit perl, the math below
         #    could end up in a too-large int, which perl will promote
         #    to a double, losing necessary precision.  Deal with that.
-        if ($self->{unpack_code} eq 'V') {    # little-endian
-          push(@b64_values, $b32_values[$i] + $b32_values[$i+1] * (2**32));
-        } else {
-          push(@b64_values, $b32_values[$i] * (2**32) + $b32_values[$i+1]);
-        }
+	#    Right now, we just die.
+	my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]);
+        if ($self->{unpack_code} eq 'N') {    # big-endian
+	  ($lo, $hi) = ($hi, $lo);
+	}
+	my $value = $lo + $hi * (2**32);
+	if (!$self->{perl_is_64bit} &&   # check value is exactly represented
+	    (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) {
+	  ::error("Need a 64-bit perl to process this 64-bit profile.\n");
+	}
+	push(@b64_values, $value);
       }
       @$slots = @b64_values;
     }
@@ -2764,6 +3154,44 @@
   }
 }
 
+# Return the next line from the profile file, assuming it's a text
+# line (which in this case means, doesn't start with a NUL byte).  If
+# it's not a text line, return "".  At EOF, return undef, like perl does.
+# Input file should be in binmode.
+sub ReadProfileLine {
+  local *PROFILE = shift;
+  my $firstchar = "";
+  my $line = "";
+  read(PROFILE, $firstchar, 1);
+  seek(PROFILE, -1, 1);          # unread the firstchar
+  if ($firstchar eq "\0") {
+    return "";
+  }
+  $line = <PROFILE>;
+  if (defined($line)) {
+    $line =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+  }
+  return $line;
+}
+
+sub IsSymbolizedProfileFile {
+  my $file_name = shift;
+  if (!(-e $file_name) || !(-r $file_name)) {
+    return 0;
+  }
+  # Check if the file contains a symbol-section marker.
+  open(TFILE, "<$file_name");
+  binmode TFILE;
+  my $firstline = ReadProfileLine(*TFILE);
+  close(TFILE);
+  if (!$firstline) {
+    return 0;
+  }
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  return $firstline =~ /^--- *$symbol_marker/;
+}
+
 # Parse profile generated by common/profiler.cc and return a reference
 # to a map:
 #      $result->{version}     Version number of profile file
@@ -2798,28 +3226,17 @@
   # whole firstline, since it may be gigabytes(!) of data.
   open(PROFILE, "<$fname") || error("$fname: $!\n");
   binmode PROFILE;      # New perls do UTF-8 processing
-  my $firstchar = "";
-  my $header = "";
-  read(PROFILE, $firstchar, 1);
-  seek(PROFILE, -1, 1);          # unread the firstchar
-  if ($firstchar ne "\0") {
-    $header = <PROFILE>;
-    $header =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+  my $header = ReadProfileLine(*PROFILE);
+  if (!defined($header)) {   # means "at EOF"
+    error("Profile is empty.\n");
   }
 
   my $symbols;
   if ($header =~ m/^--- *$symbol_marker/o) {
-    # read the symbol section of the symbolized profile file
+    # Read the symbol section of the symbolized profile file.
     $symbols = ReadSymbols(*PROFILE{IO});
-
-    # read the next line to get the header for the remaining profile
-    $header = "";
-    read(PROFILE, $firstchar, 1);
-    seek(PROFILE, -1, 1);          # unread the firstchar
-    if ($firstchar ne "\0") {
-      $header = <PROFILE>;
-      $header =~ s/\r//g;
-    }
+    # Read the next line to get the header for the remaining profile.
+    $header = ReadProfileLine(*PROFILE) || "";
   }
 
   my $result;
@@ -3114,18 +3531,18 @@
           # The sampling frequency is the rate of a Poisson process.
           # This means that the probability of sampling an allocation of
           # size X with sampling rate Y is 1 - exp(-X/Y)
-          my $ratio;
-          my $scale_factor;
-          if ($n1 != 0) {
-            $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-            $scale_factor = 1/(1 - exp(-$ratio));
-            $n1 *= $scale_factor;
-            $s1 *= $scale_factor;
-          }
-          $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-          $scale_factor = 1/(1 - exp(-$ratio));
+	  if ($n1 != 0) {
+	    my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+	    my $scale_factor = 1/(1 - exp(-$ratio));
+          $n1 *= $scale_factor;
+          $s1 *= $scale_factor;
+	  }
+	  if ($n2 != 0) {
+	    my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+	    my $scale_factor = 1/(1 - exp(-$ratio));
           $n2 *= $scale_factor;
           $s2 *= $scale_factor;
+	  }
         } else {
           # Remote-heap version 1
           my $ratio;
@@ -3676,35 +4093,34 @@
 
   my $symbols = {};
 
-  # Map each PC value to the containing library
+  # Map each PC value to the containing library.  To make this faster,
+  # we sort libraries by their starting pc value (highest first), and
+  # advance through the libraries as we advance the pc.  Sometimes the
+  # addresses of libraries may overlap with the addresses of the main
+  # binary, so to make sure the libraries 'win', we iterate over the
+  # libraries in reverse order (binary will have the lowest start addr).
   my @pcs = (sort { $a cmp $b } keys(%{$pcset}));
-  foreach my $lib (reverse sort {$a->[1] cmp $b->[1]} @{$libs}) {
+  foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) {
     my $libname = $lib->[0];
     my $start = $lib->[1];
     my $finish = $lib->[2];
     my $offset = $lib->[3];
 
     # Get list of pcs that belong in this library.
-    my $pc = pop(@pcs);
-    my @pcs2 = ();
     my $contained = [];
-    while (defined $pc && $pc gt $finish) {
-      unshift(@pcs2, $pc);
-      $pc = pop(@pcs);
+    my ($start_pc_index, $finish_pc_index);
+    for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0;
+	 $finish_pc_index--) {
+      last if $pcs[$finish_pc_index - 1] le $finish;
     }
-    while (defined $pc && $pc ge $start) {
-      push(@{$contained}, $pc);
-      $pc = pop(@pcs);
+    for ($start_pc_index = $finish_pc_index; $start_pc_index > 0;
+	 $start_pc_index--) {
+      last if $pcs[$start_pc_index - 1] lt $start;
     }
-    if (defined $pc) {
-      push(@pcs, $pc);
-    }
-    @pcs = (@pcs, @pcs2);
+    @{$contained} = splice(@pcs, $start_pc_index,
+			   $finish_pc_index - $start_pc_index);
     # Map to symbols
     MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
-    if (scalar(@pcs) == 0) {
-      last;
-    }
   }
 
   return $symbols;
@@ -3732,8 +4148,7 @@
 
   # If "addr2line" isn't installed on the system at all, just use
   # nm to get what info we can (function names, but not line numbers).
-  if ($main::opt_lines == 0 || system("$addr2line --help >/dev/null 2>&1")
-      != 0) {
+  if (system("$addr2line --help >/dev/null 2>&1") != 0) {
     MapSymbolsWithNM($image, $offset, $pclist, $symbols);
     return;
   }
@@ -3919,6 +4334,8 @@
   if ($file_type =~ /Mach-O/) {
     # OS X uses otool to examine Mach-O files, rather than objdump.
     $obj_tool_map{"otool"} = "otool";
+    $obj_tool_map{"addr2line"} = "false";  # no addr2line
+    $obj_tool_map{"objdump"} = "false";  # no objdump
   }
 
   # Go fill in %obj_tool_map with the pathnames to use:
@@ -3935,18 +4352,27 @@
   my $tool = shift;
   my $path;
 
-  if ($main::opt_tools ne "") {
-    # Use a prefix specified by the --tools option...
-    $path = $main::opt_tools . $tool;
-    if (!-x $path) {
-      error("No '$tool' found with prefix specified by --tools $main::opt_tools\n");
+  # --tools (or $PPROF_TOOLS) is a comma separated list, where each
+  # item is either a) a pathname prefix, or b) a map of the form
+  # <tool>:<path>.  First we look for an entry of type (b) for our
+  # tool.  If one is found, we use it.  Otherwise, we consider all the
+  # pathname prefixes in turn, until one yields an existing file.  If
+  # none does, we use a default path.
+  my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || "";
+  if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) {
+    $path = $2;
+    # TODO(csilvers): sanity-check that $path exists?  Hard if it's relative.
+  } elsif ($tools ne '') {
+    foreach my $prefix (split(',', $tools)) {
+      next if ($prefix =~ /:/);    # ignore "tool:fullpath" entries in the list
+      if (-x $prefix . $tool) {
+        $path = $prefix . $tool;
+        last;
+      }
     }
-  } elsif (exists $ENV{"PPROF_TOOLS"} &&
-           $ENV{"PPROF_TOOLS"} ne "") {
-    #... or specified with the PPROF_TOOLS environment variable...
-    $path = $ENV{"PPROF_TOOLS"} . $tool;
-    if (!-x $path) {
-      error("No '$tool' found with prefix specified by PPROF_TOOLS=$ENV{PPROF_TOOLS}\n");
+    if (!$path) {
+      error("No '$tool' found with prefix specified by " .
+            "--tools (or \$PPROF_TOOLS) '$tools'\n");
     }
   } else {
     # ... otherwise use the version that exists in the same directory as
@@ -3965,9 +4391,8 @@
 
 sub cleanup {
   unlink($main::tmpfile_sym);
-  for (my $i = 0; $i < $main::next_tmpfile; $i++) {
-    unlink(PsTempName($i));
-  }
+  unlink(keys %main::tempnames);
+
   # We leave any collected profiles in $HOME/pprof in case the user wants
   # to look at them later.  We print a message informing them of this.
   if ((scalar(@main::profile_files) > 0) &&
@@ -4010,7 +4435,7 @@
   my $routine = "";
   while (<NM>) {
     s/\r//g;         # turn windows-looking lines into unix-looking lines
-    if (m/^([0-9a-f]+) (.) (..*)/) {
+    if (m/^\s*([0-9a-f]+) (.) (..*)/) {
       my $start_val = $1;
       my $type = $2;
       my $this_routine = $3;
@@ -4072,7 +4497,6 @@
     $symbol_table->{$routine} = [HexExtend($last_start),
                                  HexExtend($last_start)];
   }
-
   return $symbol_table;
 }
 
@@ -4120,7 +4544,11 @@
   my @nm_commands = ("$nm -n $flatten_flag $demangle_flag" .
                      " $image 2>/dev/null $cppfilt_flag",
                      "$nm -D -n $flatten_flag $demangle_flag" .
-                     " $image 2>/dev/null $cppfilt_flag");
+                     " $image 2>/dev/null $cppfilt_flag",
+                     # 6nm is for Go binaries
+		     "6nm $image 2>/dev/null | sort",
+                     );
+
   # If the executable is an MS Windows PDB-format executable, we'll
   # have set up obj_tool_map("nm_pdb").  In this case, we actually
   # want to use both unix nm and windows-specific nm_pdb, since
diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac
index ce6e679..0ed1373 100644
--- a/jemalloc/configure.ac
+++ b/jemalloc/configure.ac
@@ -150,7 +150,7 @@
               [attribute])
 if test "x${attribute}" = "xyes" ; then
   AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ])
-  if test "x$GCC" = "xyes" ; then
+  if test "x$GCC" = "xyes" -a "${abi}" = "xelf"; then
     JE_CFLAGS_APPEND([-fvisibility=internal])
   fi
 fi
@@ -166,17 +166,20 @@
   *-*-darwin*)
 	CFLAGS="$CFLAGS -fno-common -no-cpp-precomp"
 	abi="macho"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE])
 	RPATH=""
 	;;
   *-*-freebsd*)
 	CFLAGS="$CFLAGS"
 	abi="elf"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE])
 	RPATH="-Wl,-rpath,"
 	;;
   *-*-linux*)
 	CFLAGS="$CFLAGS"
 	CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE"
 	abi="elf"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED])
 	RPATH="-Wl,-rpath,"
 	;;
   *-*-netbsd*)
@@ -191,6 +194,7 @@
                           [CFLAGS="$CFLAGS"; abi="elf"],
                           [abi="aout"])
 	AC_MSG_RESULT([$abi])
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE])
 	RPATH="-Wl,-rpath,"
 	;;
   *-*-solaris2*)
@@ -245,12 +249,20 @@
 AC_ARG_WITH([jemalloc_prefix],
   [AS_HELP_STRING([--with-jemalloc-prefix=<prefix>], [Prefix to prepend to all public APIs])],
   [JEMALLOC_PREFIX="$with_jemalloc_prefix"],
-  [JEMALLOC_PREFIX=]
+  [if test "x$abi" != "xmacho" ; then
+  JEMALLOC_PREFIX=""
+else
+  JEMALLOC_PREFIX="je_"
+fi]
 )
 if test "x$JEMALLOC_PREFIX" != "x" ; then
-  AC_DEFINE([JEMALLOC_PREFIX], [ ])
+  JEMALLOC_CPREFIX=`echo ${JEMALLOC_PREFIX} | tr "a-z" "A-Z"`
+  AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"])
+  AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"])
   jemalloc_prefix="$JEMALLOC_PREFIX"
+  jemalloc_cprefix="$JEMALLOC_CPREFIX"
   AC_SUBST([jemalloc_prefix])
+  AC_SUBST([jemalloc_cprefix])
   AC_DEFINE_UNQUOTED([JEMALLOC_P(string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix)], [${JEMALLOC_PREFIX}##string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix])
 fi
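
(For orientation only; this note is not part of the patch.)  When a non-empty prefix is configured, every public entry point carries it, so a program built against a library configured with --with-jemalloc-prefix=je_ (the new default for the "macho" ABI above) calls the prefixed names, while an empty prefix leaves the plain malloc()/free() names in place.  A minimal hedged C sketch:

    #include <stdlib.h>
    #include <jemalloc/jemalloc.h>  /* installed header; its name varies with the configured install suffix */

    int
    main(void)
    {
        /* je_malloc()/je_free() are the prefixed equivalents of malloc()/free(). */
        void *p = je_malloc(4096);
        if (p == NULL)
            return (1);
        je_free(p);
        return (0);
    }
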
 
@@ -266,14 +278,17 @@
 cfgoutputs_in="${srcroot}Makefile.in ${srcroot}doc/jemalloc.3.in"
 cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc.h.in"
 cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in"
+cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/jemalloc_test.h.in"
 
 cfgoutputs_out="Makefile doc/jemalloc${install_suffix}.3"
 cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc${install_suffix}.h"
 cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h"
+cfgoutputs_out="${cfgoutputs_out} test/jemalloc_test.h"
 
 cfgoutputs_tup="Makefile doc/jemalloc${install_suffix}.3:doc/jemalloc.3.in"
 cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc${install_suffix}.h:include/jemalloc/jemalloc.h.in"
 cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h"
+cfgoutputs_tup="${cfgoutputs_tup} test/jemalloc_test.h:test/jemalloc_test.h.in"
 
 cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in"
 
@@ -281,6 +296,23 @@
 
 cfghdrs_tup="include/jemalloc/jemalloc_defs${install_suffix}.h:include/jemalloc/jemalloc_defs.h.in"
 
+dnl Do not silence irrelevant compiler warnings by default, since enabling this
+dnl option incurs a performance penalty.
+AC_ARG_ENABLE([cc-silence],
+  [AS_HELP_STRING([--enable-cc-silence],
+                  [Silence irrelevant compiler warnings])],
+[if test "x$enable_cc_silence" = "xno" ; then
+  enable_cc_silence="0"
+else
+  enable_cc_silence="1"
+fi
+],
+[enable_cc_silence="0"]
+)
+if test "x$enable_cc_silence" = "x1" ; then
+  AC_DEFINE([JEMALLOC_CC_SILENCE])
+fi
+
 dnl Do not compile with debugging by default.
 AC_ARG_ENABLE([debug],
   [AS_HELP_STRING([--enable-debug], [Build debugging code])],
@@ -294,8 +326,18 @@
 )
 if test "x$enable_debug" = "x1" ; then
   AC_DEFINE([JEMALLOC_DEBUG], [ ])
+  AC_DEFINE([JEMALLOC_IVSALLOC], [ ])
 fi
 AC_SUBST([enable_debug])
+if test "x$enable_debug" = "x0" ; then
+  roff_debug=".\\\" "
+  roff_no_debug=""
+else
+  roff_debug=""
+  roff_no_debug=".\\\" "
+fi
+AC_SUBST([roff_debug])
+AC_SUBST([roff_no_debug])
 
 dnl Only optimize if not debugging.
 if test "x$enable_debug" = "x0" -a "x$no_CFLAGS" = "xyes" ; then
@@ -379,7 +421,47 @@
 fi,
   LUNWIND="-lunwind"
 )
-dnl Finish prof-related definitions below, once TLS configuration is done.
+if test "x$enable_prof" = "x1" ; then
+  LIBS="$LIBS -lm"
+  AC_DEFINE([JEMALLOC_PROF], [ ])
+  if test "x$enable_prof_libunwind" = "x1" ; then
+    AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
+    if test "x$LUNWIND" = "x-lunwind" ; then
+      AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"],
+                   [enable_prof_libunwind="0"])
+    else
+      LIBS="$LIBS $LUNWIND"
+    fi
+    if test "x${enable_prof_libunwind}" = "x1" ; then
+      AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ])
+    fi
+  fi
+fi
+AC_SUBST([enable_prof])
+if test "x$enable_prof" = "x0" ; then
+  roff_prof=".\\\" "
+  roff_no_prof=""
+else
+  roff_prof=""
+  roff_no_prof=".\\\" "
+fi
+AC_SUBST([roff_prof])
+AC_SUBST([roff_no_prof])
+
+dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics
+dnl for backtracing.
+if test "x$enable_prof" = "x1" -a "x$enable_prof_libgcc" = "x1" ; then
+  if test "x$enable_prof_libunwind" = "x0" -a "x$GCC" = "xyes" ; then
+    enable_prof_libgcc="1"
+    AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"])
+    AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"])
+    if test "x${enable_prof_libgcc}" = "x1" ; then
+      AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ])
+    fi
+  else
+    enable_prof_libgcc="0"
+  fi
+fi
 
 dnl Enable tiny allocations by default.
 AC_ARG_ENABLE([tiny],
@@ -417,7 +499,19 @@
 ],
 [enable_tcache="1"]
 )
-dnl Finish tcache-related definitions below, once TLS configuration is done.
+if test "x$enable_tcache" = "x1" ; then
+  AC_DEFINE([JEMALLOC_TCACHE], [ ])
+fi
+AC_SUBST([enable_tcache])
+if test "x$enable_tcache" = "x0" ; then
+  roff_tcache=".\\\" "
+  roff_no_tcache=""
+else
+  roff_tcache=""
+  roff_no_tcache=".\\\" "
+fi
+AC_SUBST([roff_tcache])
+AC_SUBST([roff_no_tcache])
 
 dnl Do not enable mmap()ped swap files by default.
 AC_ARG_ENABLE([swap],
@@ -579,7 +673,7 @@
 
 dnl Set VERSION if source directory has an embedded git repository.
 if test -d "${srcroot}../.git" ; then
-  git describe --long > ${srcroot}VERSION
+  git describe --long --abbrev=40 > ${srcroot}VERSION
 fi
 jemalloc_version=`cat ${srcroot}VERSION`
 jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
@@ -647,69 +741,63 @@
               AC_MSG_RESULT([no])
               enable_tls="0")
 fi
+AC_SUBST([enable_tls])
 if test "x${enable_tls}" = "x0" ; then
   AC_DEFINE_UNQUOTED([NO_TLS], [ ])
 fi
 
-dnl Finish tcache-related definitions, now that TLS configuration is done.
-if test "x$enable_tls" = "x0" ; then
-  enable_tcache="0"
-fi
-if test "x$enable_tcache" = "x1" ; then
-  AC_DEFINE([JEMALLOC_TCACHE], [ ])
-fi
-AC_SUBST([enable_tcache])
-if test "x$enable_tcache" = "x0" ; then
-  roff_tcache=".\\\" "
-  roff_no_tcache=""
-else
-  roff_tcache=""
-  roff_no_tcache=".\\\" "
-fi
-AC_SUBST([roff_tcache])
-AC_SUBST([roff_no_tcache])
+dnl ============================================================================
+dnl Check for allocator-related functions that should be wrapped.
 
-dnl Finish prof-related definitions, now that TLS configuration is done.
-if test "x$enable_tls" = "x0" ; then
-  enable_prof="0"
-fi
-if test "x$enable_prof" = "x1" ; then
-  LIBS="$LIBS -lm"
-  AC_DEFINE([JEMALLOC_PROF], [ ])
-  if test "x$enable_prof_libunwind" = "x1" ; then
-    AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
-    if test "x$LUNWIND" = "x-lunwind" ; then
-      AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"],
-                   [enable_prof_libunwind="0"])
-    else
-      LIBS="$LIBS $LUNWIND"
-    fi
-    if test "x${enable_prof_libunwind}" = "x1" ; then
-      AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ])
-    fi
-  fi
-fi
-AC_SUBST([enable_prof])
-if test "x$enable_prof" = "x0" ; then
-  roff_prof=".\\\" "
-  roff_no_prof=""
-else
-  roff_prof=""
-  roff_no_prof=".\\\" "
-fi
-AC_SUBST([roff_prof])
-AC_SUBST([roff_no_prof])
+AC_CHECK_FUNC([memalign],
+	      [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN])])
+AC_CHECK_FUNC([valloc],
+	      [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC])])
 
-dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics
-dnl for backtracing.
-if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \
- -a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then
-  enable_prof_libgcc="1"
-  AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"])
-  AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"])
-  if test "x${enable_prof_libgcc}" = "x1" ; then
-    AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ])
-  fi
+dnl ============================================================================
+dnl Darwin-related configuration.
+
+if test "x${abi}" = "xmacho" ; then
+  AC_DEFINE([JEMALLOC_IVSALLOC])
+  AC_DEFINE([JEMALLOC_ZONE])
+
+  dnl The szone version jumped from 3 to 6 between the OS X 10.5.x and 10.6
+  dnl releases.  malloc_zone_t and malloc_introspection_t have new fields in
+  dnl 10.6, which is the only source-level indication of the change.
+  AC_MSG_CHECKING([malloc zone version])
+  AC_TRY_COMPILE([#include <stdlib.h>
+#include <malloc/malloc.h>], [
+	static malloc_zone_t zone;
+	static struct malloc_introspection_t zone_introspect;
+
+	zone.size = NULL;
+	zone.malloc = NULL;
+	zone.calloc = NULL;
+	zone.valloc = NULL;
+	zone.free = NULL;
+	zone.realloc = NULL;
+	zone.destroy = NULL;
+	zone.zone_name = "jemalloc_zone";
+	zone.batch_malloc = NULL;
+	zone.batch_free = NULL;
+	zone.introspect = &zone_introspect;
+	zone.version = 6;
+	zone.memalign = NULL;
+	zone.free_definite_size = NULL;
+
+	zone_introspect.enumerator = NULL;
+	zone_introspect.good_size = NULL;
+	zone_introspect.check = NULL;
+	zone_introspect.print = NULL;
+	zone_introspect.log = NULL;
+	zone_introspect.force_lock = NULL;
+	zone_introspect.force_unlock = NULL;
+	zone_introspect.statistics = NULL;
+	zone_introspect.zone_locked = NULL;
+], [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [6])
+    AC_MSG_RESULT([6])],
+   [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [3])
+   AC_MSG_RESULT([3])])
 fi
 
 dnl ============================================================================
@@ -755,9 +843,11 @@
 AC_MSG_RESULT([JEMALLOC_PREFIX    : ${JEMALLOC_PREFIX}])
 AC_MSG_RESULT([install_suffix     : ${install_suffix}])
 AC_MSG_RESULT([autogen            : ${enable_autogen}])
+AC_MSG_RESULT([cc-silence         : ${enable_cc_silence}])
 AC_MSG_RESULT([debug              : ${enable_debug}])
 AC_MSG_RESULT([stats              : ${enable_stats}])
 AC_MSG_RESULT([prof               : ${enable_prof}])
+AC_MSG_RESULT([prof-libgcc        : ${enable_prof_libgcc}])
 AC_MSG_RESULT([prof-libunwind     : ${enable_prof_libunwind}])
 AC_MSG_RESULT([tiny               : ${enable_tiny}])
 AC_MSG_RESULT([tcache             : ${enable_tcache}])
@@ -768,4 +858,5 @@
 AC_MSG_RESULT([dss                : ${enable_dss}])
 AC_MSG_RESULT([dynamic_page_shift : ${enable_dynamic_page_shift}])
 AC_MSG_RESULT([lazy_lock          : ${enable_lazy_lock}])
+AC_MSG_RESULT([tls                : ${enable_tls}])
 AC_MSG_RESULT([===============================================================================])
diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in
index cf5cb5e..6286664 100644
--- a/jemalloc/doc/jemalloc.3.in
+++ b/jemalloc/doc/jemalloc.3.in
@@ -38,8 +38,8 @@
 .\"     @(#)malloc.3	8.1 (Berkeley) 6/4/93
 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $
 .\"
-.Dd April 2, 2010
-.Dt JEMALLOC 3
+.Dd October 24, 2010
+.Dt jemalloc 3
 .Os
 .Sh NAME
 .Nm @jemalloc_prefix@malloc ,
@@ -51,13 +51,24 @@
 .Nm @jemalloc_prefix@malloc_stats_print ,
 .Nm @jemalloc_prefix@mallctl ,
 .Nm @jemalloc_prefix@mallctlnametomib ,
-.Nm @jemalloc_prefix@mallctlbymib
+.Nm @jemalloc_prefix@mallctlbymib ,
+.Nm @jemalloc_prefix@allocm ,
+.Nm @jemalloc_prefix@rallocm ,
+.Nm @jemalloc_prefix@sallocm ,
+.Nm @jemalloc_prefix@dallocm
 .Nd general purpose memory allocation functions
 .Sh LIBRARY
 .Sy libjemalloc@install_suffix@
+.Pp
+This manual describes jemalloc @jemalloc_version@.
+More information can be found at the
+.UR http://\:www.canonware.com/\:jemalloc/
+jemalloc website
+.UE .
 .Sh SYNOPSIS
 .In stdlib.h
 .In jemalloc/jemalloc@install_suffix@.h
+.Ss Standard API
 .Ft void *
 .Fn @jemalloc_prefix@malloc "size_t size"
 .Ft void *
@@ -68,6 +79,7 @@
 .Fn @jemalloc_prefix@realloc "void *ptr" "size_t size"
 .Ft void
 .Fn @jemalloc_prefix@free "void *ptr"
+.Ss Non-standard API
 .Ft size_t
 .Fn @jemalloc_prefix@malloc_usable_size "const void *ptr"
 .Ft void
@@ -79,10 +91,20 @@
 .Ft int
 .Fn @jemalloc_prefix@mallctlbymib "const size_t *mib" "size_t miblen" "void *oldp" "size_t *oldlenp" "void *newp" "size_t newlen"
 .Ft const char *
-.Va @jemalloc_prefix@malloc_options ;
+.Va @jemalloc_prefix@malloc_conf ;
 .Ft void
 .Fn \*(lp*@jemalloc_prefix@malloc_message\*(rp "void *cbopaque" "const char *s"
+.Ss Experimental API
+.Ft int
+.Fn @jemalloc_prefix@allocm "void **ptr" "size_t *rsize" "size_t size" "int flags"
+.Ft int
+.Fn @jemalloc_prefix@rallocm "void **ptr" "size_t *rsize" "size_t size" "size_t extra" "int flags"
+.Ft int
+.Fn @jemalloc_prefix@sallocm "const void *ptr" "size_t *rsize" "int flags"
+.Ft int
+.Fn @jemalloc_prefix@dallocm "void *ptr" "int flags"
 .Sh DESCRIPTION
+.Ss Standard API
 The
 .Fn @jemalloc_prefix@malloc
 function allocates
@@ -158,7 +180,7 @@
 is
 .Dv NULL ,
 no action occurs.
-.Pp
+.Ss Non-standard API
 The
 .Fn @jemalloc_prefix@malloc_usable_size
 function returns the usable size of the allocation pointed to by
@@ -289,255 +311,130 @@
 	/* Do something with bin_size... */
 }
 .Ed
-.Sh TUNING
-Once, when the first call is made to one of these memory allocation
-routines, various flags will be set or reset, which affects the
-workings of this allocator implementation.
+.Ss Experimental API
+The experimental API is subject to change or removal without regard for
+backward compatibility.
 .Pp
 The
-.Dq name
-of the file referenced by the symbolic link named
-.Pa /etc/jemalloc.conf ,
-the value of the environment variable
-.Ev JEMALLOC_OPTIONS ,
-and the string pointed to by the global variable
-.Va @jemalloc_prefix@malloc_options
-will be interpreted, in that order, from left to right as flags.
-.Pp
-Each flag is a single letter, optionally prefixed by a non-negative base 10
-integer repetition count.
-For example,
-.Dq 3N
-is equivalent to
-.Dq NNN .
-Some flags control parameter magnitudes, where uppercase increases the
-magnitude, and lowercase decreases the magnitude.
-Other flags control boolean parameters, where uppercase indicates that a
-behavior is set, or on, and lowercase means that a behavior is not set, or off.
-.Bl -tag -width indent
-.It A
-All warnings (except for the warning about unknown
-flags being set) become fatal.
-The process will call
-.Xr abort 3
-in these cases.
-@roff_prof@.It B
-@roff_prof@Double/halve the maximum backtrace depth when profiling memory
-@roff_prof@allocation activity.
-@roff_prof@The default is 4.
-.It C
-Double/halve the size of the maximum size class that is a multiple of the
-cacheline size (64).
-Above this size, subpage spacing (256 bytes) is used for size classes.
-The default value is 512 bytes.
-.It D
-Halve/double the per-arena minimum ratio of active to dirty pages.
-Some dirty unused pages may be allowed to accumulate, within the limit set by
-the ratio (or one chunk worth of dirty pages, whichever is greater), before
-informing the kernel about some of those pages via
-.Xr madvise 2 .
-This provides the kernel with sufficient information to recycle dirty pages if
-physical memory becomes scarce and the pages remain unused.
-The default minimum ratio is 32:1;
-.Ev JEMALLOC_OPTIONS=6D
-will disable dirty page purging.
-@roff_prof@.It E
-@roff_prof@Activate/deactivate profiling.
-@roff_prof@This is a secondary control mechanism that makes it possible to
-@roff_prof@start the application with profiling enabled (see the
-@roff_prof@.Dq F
-@roff_prof@option) but inactive, then toggle profiling at any time during
-@roff_prof@program execution with the
-@roff_prof@.Dq prof.active
-@roff_prof@mallctl.
-@roff_prof@This option is enabled by default.
-@roff_prof@.It F
-@roff_prof@Profile memory allocation activity, and use an
-@roff_prof@.Xr atexit 3
-@roff_prof@function to dump final memory usage to a file named according to
-@roff_prof@the pattern
-@roff_prof@.Pa <prefix>.<pid>.<seq>.f.heap ,
-@roff_prof@where
-@roff_prof@.Pa <prefix>
-@roff_prof@is controlled by the
-@roff_prof@JEMALLOC_PROF_PREFIX
-@roff_prof@environment variable.
-@roff_prof@See the
-@roff_prof@.Dq B
-@roff_prof@option for backtrace depth control.
-@roff_prof@See the
-@roff_prof@.Dq E
-@roff_prof@option for on-the-fly activation/deactivation.
-@roff_prof@See the
-@roff_prof@.Dq S
-@roff_prof@option for probabilistic sampling control.
-@roff_prof@See the
-@roff_prof@.Dq I
-@roff_prof@option for information on interval-triggered profile dumping, and the
-@roff_prof@.Dq U
-@roff_prof@option for information on high-water-triggered profile dumping.
-@roff_prof@Profile output is compatible with the included pprof Perl script,
-@roff_prof@which originates from the google-perftools package
-@roff_prof@(http://code.google.com/p/google-perftools/).
-@roff_tcache@.It G
-@roff_tcache@Double/halve the approximate interval (counted in terms of
-@roff_tcache@thread-specific cache allocation/deallocation events) between full
-@roff_tcache@thread-specific cache garbage collection sweeps.
-@roff_tcache@Garbage collection is actually performed incrementally, one size
-@roff_tcache@class at a time, in order to avoid large collection pauses.
-@roff_tcache@The default sweep interval is 8192;
-@roff_tcache@.Ev JEMALLOC_OPTIONS=14g
-@roff_tcache@will disable garbage collection.
-@roff_tcache@.It H
-@roff_tcache@Enable/disable thread-specific caching.
-@roff_tcache@When there are multiple threads, each thread uses a
-@roff_tcache@thread-specific cache for objects up to a certain size.
-@roff_tcache@Thread-specific caching allows many allocations to be satisfied
-@roff_tcache@without performing any thread synchronization, at the cost of
-@roff_tcache@increased memory use.
-@roff_tcache@See the
-@roff_tcache@.Dq G
-@roff_tcache@and
-@roff_tcache@.Dq M
-@roff_tcache@options for related tuning information.
-@roff_tcache@This option is enabled by default.
-@roff_prof@.It I
-@roff_prof@Double/halve the average interval between memory profile dumps, as
-@roff_prof@measured in bytes of allocation activity.
-@roff_prof@The actual interval between dumps may be sporadic because
-@roff_prof@decentralized allocation counters are used to avoid synchronization
-@roff_prof@bottlenecks.
-@roff_prof@Profiles are dumped to files named according to the pattern
-@roff_prof@.Pa <prefix>.<pid>.<seq>.i<iseq>.heap ,
-@roff_prof@where
-@roff_prof@.Pa <prefix>
-@roff_prof@is controlled by the
-@roff_prof@JEMALLOC_PROF_PREFIX
-@roff_prof@environment variable.
-@roff_prof@The default average interval is 1 GiB;
-@roff_prof@.Ev JEMALLOC_OPTIONS=31i
-@roff_prof@will disable interval-triggered profile dumping.
-@roff_fill@.It J
-@roff_fill@Each byte of new memory allocated by
-@roff_fill@.Fn @jemalloc_prefix@malloc
-@roff_fill@or
-@roff_fill@.Fn @jemalloc_prefix@realloc
-@roff_fill@will be initialized to 0xa5.
-@roff_fill@All memory returned by
-@roff_fill@.Fn @jemalloc_prefix@free
-@roff_fill@or
-@roff_fill@.Fn @jemalloc_prefix@realloc
-@roff_fill@will be initialized to 0x5a.
-@roff_fill@This is intended for debugging and will impact performance
-@roff_fill@negatively.
-.It K
-Double/halve the virtual memory chunk size.
-The default chunk size is 4 MiB.
-@roff_prof@.It L
-@roff_prof@Use an
-@roff_prof@.Xr atexit 3
-@roff_prof@function to report memory leaks.
-@roff_prof@See the
-@roff_prof@.Dq B
-@roff_prof@option for backtrace depth control.
-@roff_prof@See the
-@roff_prof@.Dq F option for information on analyzing heap profile output.
-@roff_prof@This option is disabled by default.
-@roff_tcache@.It M
-@roff_tcache@Double/halve the maximum size class to cache.
-@roff_tcache@At a minimum, all small size classes are cached, and at a maximum
-@roff_tcache@all large size classes are cached.
-@roff_tcache@The default maximum is 32 KiB.
-.It N
-Double/halve the number of arenas.
-The default number of arenas is four times the number of CPUs, or one if there
-is a single CPU.
-@roff_swap@.It O
-@roff_swap@Over-commit memory as a side effect of using anonymous
-@roff_swap@.Xr mmap 2
-@roff_swap@@roff_dss@ and
-@roff_swap@@roff_dss@.Xr sbrk 2
-@roff_swap@for virtual memory allocation.
-@roff_swap@In order for overcommit to be disabled, the
-@roff_swap@.Dq swap.fds
-@roff_swap@mallctl must have been successfully written to.
-@roff_swap@This option is enabled by default.
-.It P
-The
-.Fn malloc_stats_print
-function is called at program exit via an
-.Xr atexit 3
-function.
-@roff_stats@This has the potential to cause deadlock for a multi-threaded
-@roff_stats@process that exits while one or more threads are executing in the
-@roff_stats@memory allocation functions.
-@roff_stats@Therefore, this option should only be used with care; it is
-@roff_stats@primarily intended as a performance tuning aid during application
-@roff_stats@development.
-.It Q
-Double/halve the size of the maximum size class that is a multiple of the
-quantum (8 or 16 bytes, depending on architecture).
-Above this size, cacheline spacing is used for size classes.
-The default value is 128 bytes.
-@roff_prof@.It S
-@roff_prof@Double/halve the average interval between allocation samples, as
-@roff_prof@measured in bytes of allocation activity.
-@roff_prof@Increasing the sampling interval decreases profile fidelity, but
-@roff_prof@also decreases the computational overhead.
-@roff_prof@The default sample interval is one (i.e. all allocations are
-@roff_prof@sampled).
-@roff_prof@.It U
-@roff_prof@Trigger a memory profile dump every time the total virtual memory
-@roff_prof@exceeds the previous maximum.
-@roff_prof@Profiles are dumped to files named according to the pattern
-@roff_prof@.Pa <prefix>.<pid>.<seq>.u<useq>.heap ,
-@roff_prof@where
-@roff_prof@.Pa <prefix>
-@roff_prof@is controlled by the
-@roff_prof@JEMALLOC_PROF_PREFIX
-@roff_prof@environment variable.
-@roff_prof@This option is disabled by default.
-@roff_sysv@.It V
-@roff_sysv@Attempting to allocate zero bytes will return a
-@roff_sysv@.Dv NULL
-@roff_sysv@pointer instead of a valid pointer.
-@roff_sysv@(The default behavior is to make a minimal allocation and return a
-@roff_sysv@pointer to it.)
-@roff_sysv@This option is provided for System V compatibility.
-@roff_sysv@@roff_xmalloc@This option is incompatible with the
-@roff_sysv@@roff_xmalloc@.Dq X
-@roff_sysv@@roff_xmalloc@option.
-@roff_xmalloc@.It X
-@roff_xmalloc@Rather than return failure for any allocation function, display a
-@roff_xmalloc@diagnostic message on
-@roff_xmalloc@.Dv STDERR_FILENO
-@roff_xmalloc@and cause the program to drop core (using
-@roff_xmalloc@.Xr abort 3 ) .
-@roff_xmalloc@This option should be set at compile time by including the
-@roff_xmalloc@following in the source code:
-@roff_xmalloc@.Bd -literal -offset indent
-@roff_xmalloc@@jemalloc_prefix@malloc_options = "X";
-@roff_xmalloc@.Ed
-@roff_fill@.It Z
-@roff_fill@Each byte of new memory allocated by
-@roff_fill@.Fn @jemalloc_prefix@malloc
-@roff_fill@or
-@roff_fill@.Fn @jemalloc_prefix@realloc
-@roff_fill@will be initialized to 0.
-@roff_fill@Note that this initialization only happens once for each byte, so
-@roff_fill@.Fn @jemalloc_prefix@realloc
-@roff_fill@calls do not zero memory that was previously allocated.
-@roff_fill@This is intended for debugging and will impact performance
-@roff_fill@negatively.
+.Fn @jemalloc_prefix@allocm ,
+.Fn @jemalloc_prefix@rallocm ,
+.Fn @jemalloc_prefix@sallocm ,
+and
+.Fn @jemalloc_prefix@dallocm
+functions all have a
+.Fa flags
+argument that can be used to specify options.
+The functions only check the options that are contextually relevant.
+Use bitwise or (|) operations to specify one or more of the following:
+.Bl -tag -width ".Dv ALLOCM_LG_ALIGN(la)"
+.It ALLOCM_LG_ALIGN(la)
+Align the memory allocation to start at an address that is a multiple of
+(1 <<
+.Fa la ) .
+This macro does not validate that
+.Fa la
+is within the valid range.
+.It ALLOCM_ALIGN(a)
+Align the memory allocation to start at an address that is a multiple of
+.Fa a ,
+where
+.Fa a
+is a power of two.
+This macro does not validate that
+.Fa a
+is a power of 2.
+.It ALLOCM_ZERO
+Initialize newly allocated memory to contain zero bytes.
+In the growing reallocation case, the real size prior to reallocation defines
+the boundary between untouched bytes and those that are initialized to contain
+zero bytes.
+If this option is absent, newly allocated memory is uninitialized.
+.It ALLOCM_NO_MOVE
+For reallocation, fail rather than moving the object.
+This constraint can apply to both growth and shrinkage.
 .El
 .Pp
-@roff_fill@The
-@roff_fill@.Dq J
-@roff_fill@and
-@roff_fill@.Dq Z
-@roff_fill@options are intended for testing and debugging.
-@roff_fill@An application which changes its behavior when these options are used
-@roff_fill@is flawed.
+The
+.Fn @jemalloc_prefix@allocm
+function allocates at least
+.Fa size
+bytes of memory, sets
+.Fa *ptr
+to the base address of the allocation, and sets
+.Fa *rsize
+to the real size of the allocation if
+.Fa rsize
+is not
+.Dv NULL .
+.Pp
+The
+.Fn @jemalloc_prefix@rallocm
+function resizes the allocation at
+.Fa *ptr
+to be at least
+.Fa size
+bytes, sets
+.Fa *ptr
+to the base address of the allocation if it moved, and sets
+.Fa *rsize
+to the real size of the allocation if
+.Fa rsize
+is not
+.Dv NULL .
+If
+.Fa extra
+is non-zero, an attempt is made to resize the allocation to be at least
+.Fa ( size
++
+.Fa extra )
+bytes, though inability to allocate the extra byte(s) will not by itself result
+in failure.
+Behavior is undefined if
+.Fa ( size
++
+.Fa extra
+>
+.Dv SIZE_T_MAX ) .
+.Pp
+The
+.Fn @jemalloc_prefix@sallocm
+function sets
+.Fa *rsize
+to the real size of the allocation.
+.Pp
+The
+.Fn @jemalloc_prefix@dallocm
+function causes the memory referenced by
+.Fa ptr
+to be made available for future allocations.
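
The following sketch is not part of the manual; it ties the functions and flags above together, assuming an empty function prefix and that, as in the accompanying jemalloc headers, these functions return zero (ALLOCM_SUCCESS) on success:

    #include <stddef.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        void *p;
        size_t rsize;

        /* At least 1000 bytes, 64-byte aligned, zero-filled. */
        if (allocm(&p, &rsize, 1000, ALLOCM_ALIGN(64) | ALLOCM_ZERO) != 0)
            return (1);

        /* Try to grow in place to at least 2000 bytes (ideally 2000 + 1000). */
        if (rallocm(&p, &rsize, 2000, 1000, ALLOCM_NO_MOVE) != 0) {
            /* Could not resize without moving; allow the object to move. */
            if (rallocm(&p, &rsize, 2000, 0, 0) != 0) {
                dallocm(p, 0);
                return (1);
            }
        }

        /* rsize now matches what sallocm() reports for p. */
        sallocm(p, &rsize, 0);

        dallocm(p, 0);
        return (0);
    }

Note that ALLOCM_NO_MOVE makes the first rallocm() call fail rather than relocate the object, which is why the fallback call drops the flag.
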
+.Sh TUNING
+Once, when the first call is made to one of the memory allocation routines, the
+allocator initializes its internals based in part on various options that can
+be specified at compile- or run-time.
+.Pp
+The string pointed to by the global variable
+.Va @jemalloc_prefix@malloc_conf ,
+the
+.Dq name
+of the file referenced by the symbolic link named
+.Pa /etc/@jemalloc_prefix@malloc.conf ,
+and the value of the environment variable
+.Ev @jemalloc_cprefix@MALLOC_CONF ,
+will be interpreted, in that order, from left to right as options.
+.Pp
+An options string is a comma-separated list of option:value pairs.
+There is one key corresponding to each
+.Dq opt.*
+mallctl.
+For example,
+.Dq abort:true,narenas:1
+sets the
+.Dq opt.abort
+and
+.Dq opt.narenas
+options.
+Some options have boolean values (true/false), others have integer values (base
+8, 10, or 16, depending on prefix), and yet others have raw string values.
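
By way of illustration (not part of the original page), an application can bake in defaults through the documented global variable; the same option-string syntax is accepted from the symbolic link and environment variable sources listed above.  The sketch assumes an empty function prefix, so the variable is simply malloc_conf:

    #include <jemalloc/jemalloc.h>

    /*
     * Compile-time option defaults.  Each key corresponds to an "opt.*"
     * mallctl documented below, e.g. opt.abort, opt.narenas, and
     * opt.stats_print.
     */
    const char *malloc_conf = "abort:true,narenas:1,stats_print:true";
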
 .Sh IMPLEMENTATION NOTES
 @roff_dss@Traditionally, allocators have used
 @roff_dss@.Xr sbrk 2
@@ -564,8 +461,8 @@
 does not make much use of the allocation functions.
 .Pp
 @roff_tcache@In addition to multiple arenas, this allocator supports
-@roff_tcache@thread-specific caching for small objects, in order to make it
-@roff_tcache@possible to completely avoid synchronization for most small
+@roff_tcache@thread-specific caching for small and large objects, in order to
+@roff_tcache@make it possible to completely avoid synchronization for most
 @roff_tcache@allocation requests.
 @roff_tcache@Such caching allows very fast allocation in the common case, but it
 @roff_tcache@increases memory usage and fragmentation, since a bounded number of
@@ -594,27 +491,27 @@
 determine all metadata regarding small and large allocations in constant time.
 .Pp
 Small objects are managed in groups by page runs.
-Each run maintains a bitmap that tracks which regions are in use.
+Each run maintains a frontier and free list to track which regions are in use.
 @roff_tiny@Allocation requests that are no more than half the quantum (8 or 16,
 @roff_tiny@depending on architecture) are rounded up to the nearest power of
 @roff_tiny@two.
 Allocation requests that are
 @roff_tiny@more than half the quantum, but
 no more than the minimum cacheline-multiple size class (see the
-.Dq Q
+.Dq opt.lg_qspace_max
 option) are rounded up to the nearest multiple of the
 @roff_tiny@quantum.
 @roff_no_tiny@quantum (8 or 16, depending on architecture).
 Allocation requests that are more than the minimum cacheline-multiple size
 class, but no more than the minimum subpage-multiple size class (see the
-.Dq C
+.Dq opt.lg_cspace_max
 option) are rounded up to the nearest multiple of the cacheline size (64).
 Allocation requests that are more than the minimum subpage-multiple size class,
 but no more than the maximum subpage-multiple size class are rounded up to the
 nearest multiple of the subpage size (256).
 Allocation requests that are more than the maximum subpage-multiple size class,
 but small enough to fit in an arena-managed chunk (see the
-.Dq K
+.Dq opt.lg_chunk
 option), are rounded up to the nearest run size.
 Allocation requests that are too large to fit in an arena-managed chunk are
 rounded up to the nearest multiple of the chunk size.
@@ -623,10 +520,33 @@
 multi-threaded applications.
 If you need to assure that allocations do not suffer from cacheline sharing,
 round your allocation requests up to the nearest multiple of the cacheline
-size.
+size, or specify cacheline alignment when allocating.
+.Pp
+Assuming 4 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit system,
+the size classes in each category are as follows:
+.\"-----------------------------------------------------------------------------
+.TS
+allbox tab(;);
+LLL
+LLL
+^LL
+^LL
+^LL
+LsL
+LsL.
+Category;Subcategory;Size
+@roff_tiny@Small;Tiny;[8]
+@roff_no_tiny@Small;Tiny;[disabled]
+;Quantum-spaced;[16, 32, 48, ..., 128]
+;Cacheline-spaced;[192, 256, 320, ..., 512]
+;Sub-page-spaced;[768, 1024, 1280, ..., 3840]
+Large;[4 KiB, 8 KiB, 12 KiB, ..., 4072 KiB]
+Huge;[4 MiB, 8 MiB, 12 MiB, ...]
+.TE
+.\"-----------------------------------------------------------------------------
 .Sh MALLCTL NAMESPACE
 The following names are defined in the namespace accessible via the
-.Fn mallctl*
+.Fn @jemalloc_prefix@mallctl*
 functions.
 Value types are specified in parentheses, and their readable/writable statuses
 are encoded as rw, r-, -w, or --.
@@ -638,6 +558,10 @@
 @roff_stats@<i> equal to
 @roff_stats@.Dq arenas.narenas
 @roff_stats@can be used to access the summation of statistics from all arenas.
+.Pp
+Take special note of the
+.Dq epoch
+mallctl, which controls refreshing of cached dynamic statistics.
 .Bl -ohang
 .\"-----------------------------------------------------------------------------
 .It Sy "version (const char *) r-"
@@ -648,27 +572,12 @@
 .It Sy "epoch (uint64_t) rw"
 .Bd -ragged -offset indent -compact
 If a value is passed in, refresh the data from which the
-.Fn mallctl*
+.Fn @jemalloc_prefix@mallctl*
 functions report values, and increment the epoch.
 Return the current epoch.
 This is useful for detecting whether another thread caused a refresh.
 .Ed
 .\"-----------------------------------------------------------------------------
-@roff_tcache@.It Sy "tcache.flush (void) --"
-@roff_tcache@.Bd -ragged -offset indent -compact
-@roff_tcache@Flush calling thread's tcache.
-@roff_tcache@This interface releases all cached objects and internal data
-@roff_tcache@structures associated with the calling thread's thread-specific
-@roff_tcache@cache.
-@roff_tcache@Ordinarily, this interface need not be called, since automatic
-@roff_tcache@periodic incremental garbage collection occurs, and the thread
-@roff_tcache@cache is automatically discarded when a thread exits.
-@roff_tcache@However, garbage collection is triggered by allocation activity,
-@roff_tcache@so it is possible for a thread that stops allocating/deallocating
-@roff_tcache@to retain its cache indefinitely, in which case the developer may
-@roff_tcache@find manual flushing useful.
-.Ed
-.\"-----------------------------------------------------------------------------
 .It Sy "config.debug (bool) r-"
 .Bd -ragged -offset indent -compact
 --enable-debug was specified during build configuration.
@@ -746,129 +655,386 @@
 .\"-----------------------------------------------------------------------------
 .It Sy "opt.abort (bool) r-"
 .Bd -ragged -offset indent -compact
-See the
-.Dq A
-option.
+Abort-on-warning enabled/disabled.
+If true, most warnings are fatal.
+The process will call
+.Xr abort 3
+in these cases.
+This option is
+@roff_debug@enabled
+@roff_no_debug@disabled
+by default.
 .Ed
 .\"-----------------------------------------------------------------------------
-@roff_fill@.It Sy "opt.junk (bool) r-"
-@roff_fill@.Bd -ragged -offset indent -compact
-@roff_fill@See the
-@roff_fill@.Dq J
-@roff_fill@option.
-@roff_fill@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_fill@.It Sy "opt.zero (bool) r-"
-@roff_fill@.Bd -ragged -offset indent -compact
-@roff_fill@See the
-@roff_fill@.Dq Z
-@roff_fill@option.
-@roff_fill@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_xmalloc@.It Sy "opt.xmalloc (bool) r-"
-@roff_xmalloc@.Bd -ragged -offset indent -compact
-@roff_xmalloc@See the
-@roff_xmalloc@.Dq X
-@roff_xmalloc@option.
-@roff_xmalloc@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_tcache@.It Sy "opt.tcache (bool) r-"
-@roff_tcache@.Bd -ragged -offset indent -compact
-@roff_tcache@See the
-@roff_tcache@.Dq H
-@roff_tcache@option.
-@roff_tcache@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_tcache@.It Sy "opt.lg_tcache_gc_sweep (ssize_t) r-"
-@roff_tcache@.Bd -ragged -offset indent -compact
-@roff_tcache@See the
-@roff_tcache@.Dq G
-@roff_tcache@option.
-@roff_tcache@.Ed
-.\"-----------------------------------------------------------------------------
-.It Sy "opt.stats_print (bool) r-"
-.Bd -ragged -offset indent -compact
-See the
-.Dq P
-option.
-.Ed
-.\"-----------------------------------------------------------------------------
-@roff_prof@.It Sy "opt.prof (bool) r-"
-@roff_prof@.Bd -ragged -offset indent -compact
-@roff_prof@See the
-@roff_prof@.Dq F
-@roff_prof@option.
-@roff_prof@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_prof@.It Sy "opt.lg_prof_bt_max (size_t) r-"
-@roff_prof@.Bd -ragged -offset indent -compact
-@roff_prof@See the
-@roff_prof@.Dq B
-@roff_prof@option.
-@roff_prof@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-"
-@roff_prof@.Bd -ragged -offset indent -compact
-@roff_prof@See the
-@roff_prof@.Dq I
-@roff_prof@option.
-@roff_prof@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_prof@.It Sy "opt.prof_udump (bool) r-"
-@roff_prof@.Bd -ragged -offset indent -compact
-@roff_prof@See the
-@roff_prof@.Dq U
-@roff_prof@option.
-@roff_prof@.Ed
-.\"-----------------------------------------------------------------------------
-@roff_prof@.It Sy "opt.prof_leak (bool) r-"
-@roff_prof@.Bd -ragged -offset indent -compact
-@roff_prof@See the
-@roff_prof@.Dq L
-@roff_prof@option.
-@roff_prof@.Ed
-.\"-----------------------------------------------------------------------------
 .It Sy "opt.lg_qspace_max (size_t) r-"
 .Bd -ragged -offset indent -compact
-See the
-.Dq Q
-option.
+Size (log base 2) of the maximum size class that is a multiple of the quantum
+(8 or 16 bytes, depending on architecture).
+Above this size, cacheline spacing is used for size classes.
+The default value is 128 bytes (2^7).
 .Ed
 .\"-----------------------------------------------------------------------------
 .It Sy "opt.lg_cspace_max (size_t) r-"
 .Bd -ragged -offset indent -compact
-See the
-.Dq C
-option.
-.Ed
-.\"-----------------------------------------------------------------------------
-.It Sy "opt.lg_dirty_mult (ssize_t) r-"
-.Bd -ragged -offset indent -compact
-See the
-.Dq D
-option.
+Size (log base 2) of the maximum size class that is a multiple of the cacheline
+size (64).
+Above this size, subpage spacing (256 bytes) is used for size classes.
+The default value is 512 bytes (2^9).
 .Ed
 .\"-----------------------------------------------------------------------------
 .It Sy "opt.lg_chunk (size_t) r-"
 .Bd -ragged -offset indent -compact
-See the
-.Dq K
-option.
+Virtual memory chunk size (log base 2).
+The default chunk size is 4 MiB (2^22).
 .Ed
 .\"-----------------------------------------------------------------------------
+.It Sy "opt.narenas (size_t) r-"
+.Bd -ragged -offset indent -compact
+Maximum number of arenas to use.
+The default maximum number of arenas is four times the number of CPUs, or one
+if there is a single CPU.
+.Ed
+.\"-----------------------------------------------------------------------------
+.It Sy "opt.lg_dirty_mult (ssize_t) r-"
+.Bd -ragged -offset indent -compact
+Per-arena minimum ratio (log base 2) of active to dirty pages.
+Some dirty unused pages may be allowed to accumulate, within the limit set by
+the ratio (or one chunk worth of dirty pages, whichever is greater), before
+informing the kernel about some of those pages via
+.Xr madvise 2
+or a similar system call.
+This provides the kernel with sufficient information to recycle dirty pages if
+physical memory becomes scarce and the pages remain unused.
+The default minimum ratio is 32:1 (2^5:1); an option value of -1 will disable
+dirty page purging.
+.Ed
+.\"-----------------------------------------------------------------------------
+.It Sy "opt.stats_print (bool) r-"
+.Bd -ragged -offset indent -compact
+Enable/disable statistics printing at exit.
+If enabled, the
+.Fn @jemalloc_prefix@malloc_stats_print
+function is called at program exit via an
+.Xr atexit 3
+function.
+@roff_stats@This has the potential to cause deadlock for a multi-threaded
+@roff_stats@process that exits while one or more threads are executing in the
+@roff_stats@memory allocation functions.
+@roff_stats@Therefore, this option should only be used with care; it is
+@roff_stats@primarily intended as a performance tuning aid during application
+@roff_stats@development.
+This option is disabled by default.
+.Ed
+.\"-----------------------------------------------------------------------------
+@roff_fill@.It Sy "opt.junk (bool) r-"
+@roff_fill@.Bd -ragged -offset indent -compact
+@roff_fill@Junk filling enabled/disabled.
+@roff_fill@If enabled, each byte of uninitialized allocated memory will be
+@roff_fill@initialized to 0xa5.
+@roff_fill@All deallocated memory will be initialized to 0x5a.
+@roff_fill@This is intended for debugging and will impact performance
+@roff_fill@negatively.
+@roff_fill@This option is
+@roff_fill@@roff_debug@enabled
+@roff_fill@@roff_no_debug@disabled
+@roff_fill@by default.
+@roff_fill@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_fill@.It Sy "opt.zero (bool) r-"
+@roff_fill@.Bd -ragged -offset indent -compact
+@roff_fill@Zero filling enabled/disabled.
+@roff_fill@If enabled, each byte of uninitialized allocated memory will be
+@roff_fill@initialized to 0.
+@roff_fill@Note that this initialization only happens once for each byte, so
+@roff_fill@.Fn @jemalloc_prefix@realloc
+@roff_fill@calls do not zero memory that was previously allocated.
+@roff_fill@This is intended for debugging and will impact performance
+@roff_fill@negatively.
+@roff_fill@This option is disabled by default.
+@roff_fill@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_sysv@.It Sy "opt.sysv (bool) r-"
+@roff_sysv@.Bd -ragged -offset indent -compact
+@roff_sysv@If enabled, attempting to allocate zero bytes will return a
+@roff_sysv@.Dv NULL
+@roff_sysv@pointer instead of a valid pointer.
+@roff_sysv@(The default behavior is to make a minimal allocation and return a
+@roff_sysv@pointer to it.)
+@roff_sysv@This option is provided for System V compatibility.
+@roff_sysv@@roff_xmalloc@This option is incompatible with the
+@roff_sysv@@roff_xmalloc@.Dq opt.xmalloc
+@roff_sysv@@roff_xmalloc@option.
+@roff_sysv@This option is disabled by default.
+@roff_sysv@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_xmalloc@.It Sy "opt.xmalloc (bool) r-"
+@roff_xmalloc@.Bd -ragged -offset indent -compact
+@roff_xmalloc@Abort-on-out-of-memory enabled/disabled.
+@roff_xmalloc@If enabled, rather than returning failure for any allocation
+@roff_xmalloc@function, display a diagnostic message on
+@roff_xmalloc@.Dv STDERR_FILENO
+@roff_xmalloc@and cause the program to drop core (using
+@roff_xmalloc@.Xr abort 3 ) .
+@roff_xmalloc@If an application is designed to depend on this behavior, set the
+@roff_xmalloc@option at compile time by including the following in the source
+@roff_xmalloc@code:
+@roff_xmalloc@.Bd -literal -offset indent
+@roff_xmalloc@@jemalloc_prefix@malloc_conf = "xmalloc:true";
+@roff_xmalloc@.Ed
+@roff_xmalloc@.Pp
+@roff_xmalloc@This option is disabled by default.
+@roff_xmalloc@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_tcache@.It Sy "opt.tcache (bool) r-"
+@roff_tcache@.Bd -ragged -offset indent -compact
+@roff_tcache@Thread-specific caching enabled/disabled.
+@roff_tcache@When there are multiple threads, each thread uses a
+@roff_tcache@thread-specific cache for objects up to a certain size.
+@roff_tcache@Thread-specific caching allows many allocations to be satisfied
+@roff_tcache@without performing any thread synchronization, at the cost of
+@roff_tcache@increased memory use.
+@roff_tcache@See the
+@roff_tcache@.Dq opt.lg_tcache_gc_sweep
+@roff_tcache@and
+@roff_tcache@.Dq opt.lg_tcache_max
+@roff_tcache@options for related tuning information.
+@roff_tcache@This option is enabled by default.
+@roff_tcache@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_tcache@.It Sy "opt.lg_tcache_gc_sweep (ssize_t) r-"
+@roff_tcache@.Bd -ragged -offset indent -compact
+@roff_tcache@Approximate interval (log base 2) between full thread-specific
+@roff_tcache@cache garbage collection sweeps, counted in terms of
+@roff_tcache@thread-specific cache allocation/deallocation events.
+@roff_tcache@Garbage collection is actually performed incrementally, one size
+@roff_tcache@class at a time, in order to avoid large collection pauses.
+@roff_tcache@The default sweep interval is 8192 (2^13); setting this option to
+@roff_tcache@-1 will disable garbage collection.
+@roff_tcache@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_tcache@.It Sy "opt.lg_tcache_max (size_t) r-"
+@roff_tcache@.Bd -ragged -offset indent -compact
+@roff_tcache@Maximum size class (log base 2) to cache in the thread-specific
+@roff_tcache@cache.
+@roff_tcache@At a minimum, all small size classes are cached, and at a maximum
+@roff_tcache@all large size classes are cached.
+@roff_tcache@The default maximum is 32 KiB (2^15).
+@roff_tcache@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.prof (bool) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Memory profiling enabled/disabled.
+@roff_prof@If enabled, profile memory allocation activity, and use an
+@roff_prof@.Xr atexit 3
+@roff_prof@function to dump final memory usage to a file named according to
+@roff_prof@the pattern
+@roff_prof@.Pa <prefix>.<pid>.<seq>.f.heap ,
+@roff_prof@where
+@roff_prof@.Pa <prefix>
+@roff_prof@is controlled by the
+@roff_prof@.Dq opt.prof_prefix
+@roff_prof@option.
+@roff_prof@See the
+@roff_prof@.Dq opt.lg_prof_bt_max
+@roff_prof@option for backtrace depth control.
+@roff_prof@See the
+@roff_prof@.Dq opt.prof_active
+@roff_prof@option for on-the-fly activation/deactivation.
+@roff_prof@See the
+@roff_prof@.Dq opt.lg_prof_sample
+@roff_prof@option for probabilistic sampling control.
+@roff_prof@See the
+@roff_prof@.Dq opt.prof_accum
+@roff_prof@option for control of cumulative sample reporting.
+@roff_prof@See the
+@roff_prof@.Dq opt.lg_prof_tcmax
+@roff_prof@option for control of per thread backtrace caching.
+@roff_prof@See the
+@roff_prof@.Dq opt.lg_prof_interval
+@roff_prof@option for information on interval-triggered profile dumping, and the
+@roff_prof@.Dq opt.prof_gdump
+@roff_prof@option for information on high-water-triggered profile dumping.
+@roff_prof@Profile output is compatible with the included pprof Perl script,
+@roff_prof@which originates from the
+@roff_prof@.UR http://\:code.google.com/\:p/\:google-perftools/
+@roff_prof@google-perftools package
+@roff_prof@.UE .
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.prof_prefix (const char *) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Filename prefix for profile dumps.
+@roff_prof@If the prefix is set to the empty string, no automatic dumps will
+@roff_prof@occur; this is primarily useful for disabling the automatic final
+@roff_prof@heap dump (which also disables leak reporting, if enabled).
+@roff_prof@The default prefix is
+@roff_prof@.Pa jeprof .
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.lg_prof_bt_max (size_t) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Maximum backtrace depth (log base 2) when profiling memory
+@roff_prof@allocation activity.
+@roff_prof@The default is 128 (2^7).
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.prof_active (bool) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Profiling activated/deactivated.
+@roff_prof@This is a secondary control mechanism that makes it possible to
+@roff_prof@start the application with profiling enabled (see the
+@roff_prof@.Dq opt.prof
+@roff_prof@option) but inactive, then toggle profiling at any time during
+@roff_prof@program execution with the
+@roff_prof@.Dq prof.active
+@roff_prof@mallctl.
+@roff_prof@This option is enabled by default.
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Average interval (log base 2) between allocation samples, as
+@roff_prof@measured in bytes of allocation activity.
+@roff_prof@Increasing the sampling interval decreases profile fidelity, but
+@roff_prof@also decreases the computational overhead.
+@roff_prof@The default sample interval is 1 (2^0) (i.e. all allocations are
+@roff_prof@sampled).
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.prof_accum (bool) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Reporting of cumulative object/byte counts in profile dumps
+@roff_prof@enabled/disabled.
+@roff_prof@If this option is enabled, every unique backtrace must be stored for
+@roff_prof@the duration of execution.
+@roff_prof@Depending on the application, this can impose a large memory
+@roff_prof@overhead, and the cumulative counts are not always of interest.
+@roff_prof@See the
+@roff_prof@.Dq opt.lg_prof_tcmax
+@roff_prof@option for control of per thread backtrace caching, which has
+@roff_prof@important interactions.
+@roff_prof@This option is enabled by default.
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Maximum per thread backtrace cache (log base 2) used for heap
+@roff_prof@profiling.
+@roff_prof@A backtrace can only be discarded if the
+@roff_prof@.Dq opt.prof_accum
+@roff_prof@option is disabled, and no thread caches currently refer to the
+@roff_prof@backtrace.
+@roff_prof@Therefore, a backtrace cache limit should be imposed if the
+@roff_prof@intention is to limit how much memory is used by backtraces.
+@roff_prof@By default, no limit is imposed (encoded as -1).
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Average interval (log base 2) between memory profile dumps, as
+@roff_prof@measured in bytes of allocation activity.
+@roff_prof@The actual interval between dumps may be sporadic because
+@roff_prof@decentralized allocation counters are used to avoid synchronization
+@roff_prof@bottlenecks.
+@roff_prof@Profiles are dumped to files named according to the pattern
+@roff_prof@.Pa <prefix>.<pid>.<seq>.i<iseq>.heap ,
+@roff_prof@where
+@roff_prof@.Pa <prefix>
+@roff_prof@is controlled by the
+@roff_prof@.Dq opt.prof_prefix
+@roff_prof@option.
+@roff_prof@By default, interval-triggered profile dumping is disabled (encoded
+@roff_prof@as -1).
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.prof_gdump (bool) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Trigger a memory profile dump every time the total virtual memory
+@roff_prof@exceeds the previous maximum.
+@roff_prof@Profiles are dumped to files named according to the pattern
+@roff_prof@.Pa <prefix>.<pid>.<seq>.u<useq>.heap ,
+@roff_prof@where
+@roff_prof@.Pa <prefix>
+@roff_prof@is controlled by the
+@roff_prof@.Dq opt.prof_prefix
+@roff_prof@option.
+@roff_prof@This option is disabled by default.
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_prof@.It Sy "opt.prof_leak (bool) r-"
+@roff_prof@.Bd -ragged -offset indent -compact
+@roff_prof@Leak reporting enabled/disabled.
+@roff_prof@If enabled, use an
+@roff_prof@.Xr atexit 3
+@roff_prof@function to report memory leaks detected by allocation sampling.
+@roff_prof@See the
+@roff_prof@.Dq opt.lg_prof_bt_max
+@roff_prof@option for backtrace depth control.
+@roff_prof@See the
+@roff_prof@.Dq opt.prof
+@roff_prof@option for information on analyzing heap profile output.
+@roff_prof@This option is disabled by default.
+@roff_prof@.Ed
+.\"-----------------------------------------------------------------------------
 .It Sy "opt.overcommit (bool) r-"
 .Bd -ragged -offset indent -compact
-See the
-.Dq O
-option.
+@roff_swap@Over-commit enabled/disabled.
+@roff_swap@If enabled, over-commit memory as a side effect of using anonymous
+@roff_swap@.Xr mmap 2
+@roff_swap@@roff_dss@ and
+@roff_swap@@roff_dss@.Xr sbrk 2
+@roff_swap@for virtual memory allocation.
+@roff_swap@In order for overcommit to be disabled, the
+@roff_swap@.Dq swap.fds
+@roff_swap@mallctl must have been successfully written to.
+@roff_swap@This option is enabled by default.
 .Ed
 .\"-----------------------------------------------------------------------------
+@roff_tcache@.It Sy "tcache.flush (void) --"
+@roff_tcache@.Bd -ragged -offset indent -compact
+@roff_tcache@Flush calling thread's tcache.
+@roff_tcache@This interface releases all cached objects and internal data
+@roff_tcache@structures associated with the calling thread's thread-specific
+@roff_tcache@cache.
+@roff_tcache@Ordinarily, this interface need not be called, since automatic
+@roff_tcache@periodic incremental garbage collection occurs, and the thread
+@roff_tcache@cache is automatically discarded when a thread exits.
+@roff_tcache@However, garbage collection is triggered by allocation activity,
+@roff_tcache@so it is possible for a thread that stops allocating/deallocating
+@roff_tcache@to retain its cache indefinitely, in which case the developer may
+@roff_tcache@find manual flushing useful.
+.Ed
+.\"-----------------------------------------------------------------------------
+.It Sy "thread.arena (unsigned) rw"
+.Bd -ragged -offset indent -compact
+Get or set the arena associated with the calling thread.
+The arena index must be less than the maximum number of arenas (see the
+.Dq arenas.narenas
+mallctl).
+If the specified arena was not initialized beforehand (see the
+.Dq arenas.initialized
+mallctl), it will be automatically initialized as a side effect of calling this
+interface.
+.Ed
+.\"-----------------------------------------------------------------------------
+@roff_stats@.It Sy "thread.allocated (uint64_t) r-"
+@roff_stats@.Bd -ragged -offset indent -compact
+@roff_stats@Get the total number of bytes ever allocated by the calling thread.
+@roff_stats@This counter has the potential to wrap around; it is up to the
+@roff_stats@application to appropriately interpret the counter in such cases.
+@roff_stats@.Ed
+.\"-----------------------------------------------------------------------------
+@roff_stats@.It Sy "thread.deallocated (uint64_t) r-"
+@roff_stats@.Bd -ragged -offset indent -compact
+@roff_stats@Get the total number of bytes ever deallocated by the calling
+@roff_stats@thread.
+@roff_stats@This counter has the potential to wrap around; it is up to the
+@roff_stats@application to appropriately interpret the counter in such cases.
+@roff_stats@.Ed
+.\"-----------------------------------------------------------------------------
 .It Sy "arenas.narenas (unsigned) r-"
 .Bd -ragged -offset indent -compact
 Maximum number of arenas.
-See the
-.Dq N
-option.
 .Ed
 .\"-----------------------------------------------------------------------------
 .It Sy "arenas.initialized (bool *) r-"
@@ -1004,11 +1170,17 @@
 Maximum size supported by this large size class.
 .Ed
 .\"-----------------------------------------------------------------------------
+.It Sy "arenas.purge (unsigned) -w"
+.Bd -ragged -offset indent -compact
+Purge unused dirty pages for the specified arena, or for all arenas if none is
+specified.
+.Ed
+.\"-----------------------------------------------------------------------------
 @roff_prof@.It Sy "prof.active (bool) rw"
 @roff_prof@.Bd -ragged -offset indent -compact
 @roff_prof@Control whether sampling is currently active.
 @roff_prof@See the
-@roff_prof@.Dq E
+@roff_prof@.Dq opt.prof_active
 @roff_prof@option for additional information.
 @roff_prof@.Ed
 .\"-----------------------------------------------------------------------------
@@ -1020,8 +1192,8 @@
 @roff_prof@where
 @roff_prof@.Pa <prefix>
 @roff_prof@is controlled by the
-@roff_prof@JEMALLOC_PROF_PREFIX
-@roff_prof@environment variable.
+@roff_prof@.Dq opt.prof_prefix
+@roff_prof@option.
 @roff_prof@.Ed
 .\"-----------------------------------------------------------------------------
 @roff_prof@.It Sy "prof.interval (uint64_t) r-"
@@ -1029,7 +1201,7 @@
 @roff_prof@Average number of bytes allocated between interval-based profile
 @roff_prof@dumps.
 @roff_prof@See the
-@roff_prof@.Dq I
+@roff_prof@.Dq opt.lg_prof_interval
 @roff_prof@option for additional information.
 @roff_prof@.Ed
 .\"-----------------------------------------------------------------------------
@@ -1283,10 +1455,9 @@
 .\"-----------------------------------------------------------------------------
 .El
 .Sh DEBUGGING MALLOC PROBLEMS
-The first thing to do is to set the
-.Dq A
-option.
-This option forces a coredump (if possible) at the first sign of trouble,
+Start by setting the
+.Dq opt.abort
+option, which forces a coredump (if possible) at the first sign of trouble,
 rather than the normal policy of trying to continue if at all possible.
 .Pp
 It is probably also a good idea to recompile the program with suitable
@@ -1294,22 +1465,22 @@
 .Pp
 @roff_fill@If the program starts to give unusual results, coredump or generally
 @roff_fill@behave differently without emitting any of the messages mentioned in
-@roff_fill@the next section, it is likely because it depends on the storage
-@roff_fill@being filled with zero bytes.
+@roff_fill@the next section, it is likely because the program depends on the
+@roff_fill@storage being filled with zero bytes.
 @roff_fill@Try running it with the
-@roff_fill@.Dq Z
+@roff_fill@.Dq opt.zero
 @roff_fill@option set;
 @roff_fill@if that improves the situation, this diagnosis has been confirmed.
 @roff_fill@If the program still misbehaves,
 @roff_fill@the likely problem is accessing memory outside the allocated area.
 @roff_fill@.Pp
 @roff_fill@Alternatively, if the symptoms are not easy to reproduce, setting the
-@roff_fill@.Dq J
+@roff_fill@.Dq opt.junk
 @roff_fill@option may help provoke the problem.
 @roff_fill@.Pp
-Unfortunately this implementation does not provide much detail about
-the problems it detects; the performance impact for storing such information
-would be prohibitive.
+This implementation does not provide much detail about the problems it detects,
+because the performance impact for storing such information would be
+prohibitive.
 There are a number of allocator implementations available on the Internet
 which focus on detecting and pinpointing problems by trading performance for
 extra sanity checks and detailed diagnostics.
@@ -1319,8 +1490,8 @@
 .Dv STDERR_FILENO .
 Errors will result in the process dumping core.
 If the
-.Dq A
-option is set, all warnings are treated as errors.
+.Dq opt.abort
+option is set, most warnings are treated as errors.
 .Pp
 The
 .Va @jemalloc_prefix@malloc_message
@@ -1342,6 +1513,7 @@
 All messages are prefixed by
 .Dq <jemalloc>: .
 .Sh RETURN VALUES
+.Ss Standard API
 The
 .Fn @jemalloc_prefix@malloc
 and
@@ -1390,7 +1562,7 @@
 The
 .Fn @jemalloc_prefix@free
 function returns no value.
-.Pp
+.Ss Non-standard API
 The
 .Fn @jemalloc_prefix@malloc_usable_size
 function returns the usable size of the allocation pointed to by
@@ -1429,37 +1601,69 @@
 A memory allocation failure occurred.
 .It Bq Er EFAULT
 An interface with side effects failed in some way not directly related to
-.Fn mallctl*
+.Fn @jemalloc_prefix@mallctl*
 read/write processing.
 .El
+.Ss Experimental API
+The
+.Fn @jemalloc_prefix@allocm ,
+.Fn @jemalloc_prefix@rallocm ,
+.Fn @jemalloc_prefix@sallocm ,
+and
+.Fn @jemalloc_prefix@dallocm
+functions return
+.Dv ALLOCM_SUCCESS
+on success; otherwise they return an error value.
+The
+.Fn @jemalloc_prefix@allocm
+and
+.Fn @jemalloc_prefix@rallocm
+functions will fail if:
+.Bl -tag -width ".Bq Er ALLOCM_ERR_OOM"
+.It Bq Er ALLOCM_ERR_OOM
+Out of memory.
+Insufficient contiguous memory was available to service the allocation request.
+The
+.Fn @jemalloc_prefix@allocm
+function additionally sets
+.Fa *ptr
+to
+.Dv NULL ,
+whereas the
+.Fn @jemalloc_prefix@rallocm
+function leaves
+.Fa *ptr
+unmodified.
+.El
+.Pp
+The
+.Fn @jemalloc_prefix@rallocm
+function will also fail if:
+.Bl -tag -width ".Bq Er ALLOCM_ERR_NOT_MOVED"
+.It Bq Er ALLOCM_ERR_NOT_MOVED
+.Dv ALLOCM_NO_MOVE
+was specified, but the reallocation request could not be serviced without
+moving the object.
+.El
 .Sh ENVIRONMENT
-The following environment variables affect the execution of the allocation
+The following environment variable affects the execution of the allocation
 functions:
-@roff_prof@.Bl -tag -width ".Ev JEMALLOC_PROF_PREFIX"
-@roff_no_prof@.Bl -tag -width ".Ev JEMALLOC_OPTIONS"
-.It Ev JEMALLOC_OPTIONS
+.Bl -tag -width ".Ev @jemalloc_cprefix@MALLOC_CONF"
+.It Ev @jemalloc_cprefix@MALLOC_CONF
 If the environment variable
-.Ev JEMALLOC_OPTIONS
-is set, the characters it contains will be interpreted as flags to the
-allocation functions.
-@roff_prof@.It Ev JEMALLOC_PROF_PREFIX
-@roff_prof@If the environment variable
-@roff_prof@.Ev JEMALLOC_PROF_PREFIX
-@roff_prof@is set, use it as the filename prefix for profile dumps; otherwise
-@roff_prof@use
-@roff_prof@.Pa jeprof
-@roff_prof@as the prefix.
+.Ev @jemalloc_cprefix@MALLOC_CONF
+is set, the characters it contains will be interpreted as options.
 .El
 .Sh EXAMPLES
 To dump core whenever a problem occurs:
 .Pp
 .Bd -literal -offset indent
-ln -s 'A' /etc/jemalloc.conf
+ln -s 'abort:true' /etc/@jemalloc_prefix@malloc.conf
 .Ed
 .Pp
-To specify in the source a chunk size that is twice the default:
+To specify in the source a chunk size that is 16 MiB:
 .Bd -literal -offset indent
-@jemalloc_prefix@malloc_options = "K";
+@jemalloc_prefix@malloc_conf = "lg_chunk:24";
 .Ed
 .Sh SEE ALSO
 .Xr madvise 2 ,
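
The mallctl additions documented above (thread.arena, thread.allocated, arenas.purge) are easiest to see from a caller. A minimal sketch, assuming an empty function prefix and default install suffix so the public names match the man page; the header path and error handling are illustrative only:

    #include <inttypes.h>
    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
    	unsigned arena;
    	uint64_t allocated;
    	size_t sz;

    	/* Read the arena index currently serving this thread. */
    	sz = sizeof(arena);
    	if (mallctl("thread.arena", &arena, &sz, NULL, 0) == 0)
    		printf("thread.arena: %u\n", arena);

    	/* Per-thread allocation counter (available in stats-enabled builds). */
    	sz = sizeof(allocated);
    	if (mallctl("thread.allocated", &allocated, &sz, NULL, 0) == 0)
    		printf("thread.allocated: %" PRIu64 "\n", allocated);

    	/* Write-only control; omitting the arena index purges all arenas. */
    	mallctl("arenas.purge", NULL, NULL, NULL, 0);
    	return (0);
    }
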
diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h
index c1955f1..9556c2c 100644
--- a/jemalloc/include/jemalloc/internal/arena.h
+++ b/jemalloc/include/jemalloc/internal/arena.h
@@ -121,17 +121,17 @@
 	 *
 	 * p : run page offset
 	 * s : run size
-	 * c : size class (used only if prof_promote is true)
+	 * c : (binind+1) for size class (used only if prof_promote is true)
 	 * x : don't care
 	 * - : 0
 	 * + : 1
-	 * [DZLA] : bit set
-	 * [dzla] : bit unset
+	 * [DULA] : bit set
+	 * [dula] : bit unset
 	 *
 	 *   Unallocated (clean):
-	 *     ssssssss ssssssss ssss---- ----dz--
-	 *     xxxxxxxx xxxxxxxx xxxx---- -----Zxx
-	 *     ssssssss ssssssss ssss---- ----dZ--
+	 *     ssssssss ssssssss ssss---- ----du--
+	 *     xxxxxxxx xxxxxxxx xxxx---- -----Uxx
+	 *     ssssssss ssssssss ssss---- ----dU--
 	 *
 	 *   Unallocated (dirty):
 	 *     ssssssss ssssssss ssss---- ----D---
@@ -144,7 +144,7 @@
 	 *     pppppppp pppppppp pppp---- ----d--a
 	 *
 	 *   Large:
-	 *     ssssssss ssssssss ssss++++ ++++D-la
+	 *     ssssssss ssssssss ssss---- ----D-la
 	 *     xxxxxxxx xxxxxxxx xxxx---- ----xxxx
 	 *     -------- -------- -------- ----D-la
 	 *
@@ -152,7 +152,7 @@
 	 *     ssssssss ssssssss sssscccc ccccD-la
 	 *
 	 *   Large (not sampled, size == PAGE_SIZE):
-	 *     ssssssss ssssssss ssss++++ ++++D-la
+	 *     ssssssss ssssssss ssss---- ----D-la
 	 */
 	size_t				bits;
 #ifdef JEMALLOC_PROF
@@ -161,7 +161,7 @@
 #endif
 #define	CHUNK_MAP_FLAGS_MASK	((size_t)0xfU)
 #define	CHUNK_MAP_DIRTY		((size_t)0x8U)
-#define	CHUNK_MAP_ZEROED	((size_t)0x4U)
+#define	CHUNK_MAP_UNZEROED	((size_t)0x4U)
 #define	CHUNK_MAP_LARGE		((size_t)0x2U)
 #define	CHUNK_MAP_ALLOCATED	((size_t)0x1U)
 #define	CHUNK_MAP_KEY		CHUNK_MAP_ALLOCATED
@@ -187,7 +187,12 @@
 	/* Number of dirty pages. */
 	size_t		ndirty;
 
-	/* Map of pages within chunk that keeps track of free/large/small. */
+	/*
+	 * Map of pages within chunk that keeps track of free/large/small.  The
+	 * first map_bias entries are omitted, since the chunk header does not
+	 * need to be tracked in the map.  This omission saves a header page
+	 * for common chunk sizes (e.g. 4 MiB).
+	 */
 	arena_chunk_map_t map[1]; /* Dynamically sized. */
 };
 typedef rb_tree(arena_chunk_t) arena_chunk_tree_t;
@@ -416,8 +421,12 @@
 extern size_t		sspace_max;
 #define			small_maxclass	sspace_max
 
-#define			nlclasses (chunk_npages - arena_chunk_header_npages)
+#define			nlclasses (chunk_npages - map_bias)
 
+void	arena_purge_all(arena_t *arena);
+#ifdef JEMALLOC_PROF
+void	arena_prof_accum(arena_t *arena, uint64_t accumbytes);
+#endif
 #ifdef JEMALLOC_TCACHE
 void	arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin,
     size_t binind
@@ -426,20 +435,15 @@
 #  endif
     );
 #endif
-#ifdef JEMALLOC_PROF
-void	arena_prof_accum(arena_t *arena, uint64_t accumbytes);
-#endif
 void	*arena_malloc_small(arena_t *arena, size_t size, bool zero);
 void	*arena_malloc_large(arena_t *arena, size_t size, bool zero);
 void	*arena_malloc(size_t size, bool zero);
-void	*arena_palloc(arena_t *arena, size_t alignment, size_t size,
-    size_t alloc_size);
+void	*arena_palloc(arena_t *arena, size_t size, size_t alloc_size,
+    size_t alignment, bool zero);
 size_t	arena_salloc(const void *ptr);
 #ifdef JEMALLOC_PROF
 void	arena_prof_promoted(const void *ptr, size_t size);
 size_t	arena_salloc_demote(const void *ptr);
-prof_ctx_t	*arena_prof_ctx_get(const void *ptr);
-void	arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 #endif
 void	arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
     arena_chunk_map_t *mapelm);
@@ -449,7 +453,10 @@
     arena_stats_t *astats, malloc_bin_stats_t *bstats,
     malloc_large_stats_t *lstats);
 #endif
-void	*arena_ralloc(void *ptr, size_t size, size_t oldsize);
+void	*arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
+    size_t extra, bool zero);
+void	*arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
+    size_t alignment, bool zero);
 bool	arena_new(arena_t *arena, unsigned ind);
 bool	arena_boot(void);
 
@@ -458,10 +465,149 @@
 #ifdef JEMALLOC_H_INLINES
 
 #ifndef JEMALLOC_ENABLE_INLINE
+unsigned	arena_run_regind(arena_run_t *run, arena_bin_t *bin,
+    const void *ptr, size_t size);
+#  ifdef JEMALLOC_PROF
+prof_ctx_t	*arena_prof_ctx_get(const void *ptr);
+void	arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
+#  endif
 void	arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
+JEMALLOC_INLINE unsigned
+arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
+    size_t size)
+{
+	unsigned shift, diff, regind;
+
+	assert(run->magic == ARENA_RUN_MAGIC);
+
+	/*
+	 * Avoid doing division with a variable divisor if possible.  Using
+	 * actual division here can reduce allocator throughput by over 20%!
+	 */
+	diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
+
+	/* Rescale (factor powers of 2 out of the numerator and denominator). */
+	shift = ffs(size) - 1;
+	diff >>= shift;
+	size >>= shift;
+
+	if (size == 1) {
+		/* The divisor was a power of 2. */
+		regind = diff;
+	} else {
+		/*
+		 * To divide by a number D that is not a power of two we
+		 * multiply by (2^21 / D) and then right shift by 21 positions.
+		 *
+		 *   X / D
+		 *
+		 * becomes
+		 *
+		 *   (X * size_invs[D - 3]) >> SIZE_INV_SHIFT
+		 *
+		 * We can omit the first three elements, because we never
+		 * divide by 0, and 1 and 2 are both powers of two, which are
+		 * handled above.
+		 */
+#define	SIZE_INV_SHIFT 21
+#define	SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)
+		static const unsigned size_invs[] = {
+		    SIZE_INV(3),
+		    SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
+		    SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
+		    SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
+		    SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
+		    SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
+		    SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
+		    SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
+		};
+
+		if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2))
+			regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT;
+		else
+			regind = diff / size;
+#undef SIZE_INV
+#undef SIZE_INV_SHIFT
+	}
+	assert(diff == regind * size);
+	assert(regind < bin->nregs);
+
+	return (regind);
+}
+
+#ifdef JEMALLOC_PROF
+JEMALLOC_INLINE prof_ctx_t *
+arena_prof_ctx_get(const void *ptr)
+{
+	prof_ctx_t *ret;
+	arena_chunk_t *chunk;
+	size_t pageind, mapbits;
+
+	assert(ptr != NULL);
+	assert(CHUNK_ADDR2BASE(ptr) != ptr);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
+	mapbits = chunk->map[pageind-map_bias].bits;
+	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
+	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
+		if (prof_promote)
+			ret = (prof_ctx_t *)(uintptr_t)1U;
+		else {
+			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
+			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
+			    PAGE_SHIFT));
+			arena_bin_t *bin = run->bin;
+			unsigned regind;
+
+			assert(run->magic == ARENA_RUN_MAGIC);
+			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+			ret = *(prof_ctx_t **)((uintptr_t)run +
+			    bin->ctx0_offset + (regind *
+			    sizeof(prof_ctx_t *)));
+		}
+	} else
+		ret = chunk->map[pageind-map_bias].prof_ctx;
+
+	return (ret);
+}
+
+JEMALLOC_INLINE void
+arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
+{
+	arena_chunk_t *chunk;
+	size_t pageind, mapbits;
+
+	assert(ptr != NULL);
+	assert(CHUNK_ADDR2BASE(ptr) != ptr);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
+	mapbits = chunk->map[pageind-map_bias].bits;
+	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
+	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
+		if (prof_promote == false) {
+			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
+			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
+			    PAGE_SHIFT));
+			arena_bin_t *bin = run->bin;
+			unsigned regind;
+
+			assert(run->magic == ARENA_RUN_MAGIC);
+			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+
+			*((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset
+			    + (regind * sizeof(prof_ctx_t *)))) = ctx;
+		} else
+			assert((uintptr_t)ctx == (uintptr_t)1U);
+	} else
+		chunk->map[pageind-map_bias].prof_ctx = ctx;
+}
+#endif
+
 JEMALLOC_INLINE void
 arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 {
@@ -474,8 +620,8 @@
 	assert(ptr != NULL);
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
 
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapelm = &chunk->map[pageind];
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
+	mapelm = &chunk->map[pageind-map_bias];
 	assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0);
 	if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) {
 		/* Small allocation. */
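
The inlined arena_run_regind() above replaces division by the (non-power-of-two) region size with a fixed-point multiply: diff / size becomes (diff * ((2^21 / size) + 1)) >> 21. A standalone check of that identity for the divisor range the size_invs[] table covers, assuming only that diff stays well below 2^21, as it does for region offsets within a run; this is a sketch, not jemalloc code:

    #include <assert.h>
    #include <stdio.h>

    #define SIZE_INV_SHIFT	21
    #define SIZE_INV(s)	(((1U << SIZE_INV_SHIFT) / (s)) + 1)

    int
    main(void)
    {
    	unsigned d, diff;

    	/* Divisors 3..31 mirror size_invs[]; diff is always a multiple of d. */
    	for (d = 3; d <= 31; d++) {
    		for (diff = 0; diff < (1U << 14); diff += d) {
    			unsigned regind = (diff * SIZE_INV(d)) >> SIZE_INV_SHIFT;
    			assert(regind == diff / d);
    		}
    	}
    	printf("reciprocal multiply matches exact division\n");
    	return (0);
    }
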
diff --git a/jemalloc/include/jemalloc/internal/chunk.h b/jemalloc/include/jemalloc/internal/chunk.h
index 1f6abf7..a60f0ad 100644
--- a/jemalloc/include/jemalloc/internal/chunk.h
+++ b/jemalloc/include/jemalloc/internal/chunk.h
@@ -39,13 +39,17 @@
 extern chunk_stats_t	stats_chunks;
 #endif
 
+#ifdef JEMALLOC_IVSALLOC
+extern rtree_t		*chunks_rtree;
+#endif
+
 extern size_t		chunksize;
 extern size_t		chunksize_mask; /* (chunksize - 1). */
 extern size_t		chunk_npages;
-extern size_t		arena_chunk_header_npages;
+extern size_t		map_bias; /* Number of arena chunk header pages. */
 extern size_t		arena_maxclass; /* Max size class for arenas. */
 
-void	*chunk_alloc(size_t size, bool *zero);
+void	*chunk_alloc(size_t size, bool base, bool *zero);
 void	chunk_dealloc(void *chunk, size_t size);
 bool	chunk_boot(void);
 
diff --git a/jemalloc/include/jemalloc/internal/chunk_mmap.h b/jemalloc/include/jemalloc/internal/chunk_mmap.h
index dc52448..07b50a4 100644
--- a/jemalloc/include/jemalloc/internal/chunk_mmap.h
+++ b/jemalloc/include/jemalloc/internal/chunk_mmap.h
@@ -13,6 +13,8 @@
 void	*chunk_alloc_mmap_noreserve(size_t size);
 void	chunk_dealloc_mmap(void *chunk, size_t size);
 
+bool	chunk_mmap_boot(void);
+
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES
diff --git a/jemalloc/include/jemalloc/internal/ckh.h b/jemalloc/include/jemalloc/internal/ckh.h
index c39ea5c..d4e391b 100644
--- a/jemalloc/include/jemalloc/internal/ckh.h
+++ b/jemalloc/include/jemalloc/internal/ckh.h
@@ -45,7 +45,7 @@
 #endif
 
 	/* Used for pseudo-random number generation. */
-#define	CKH_A		12345
+#define	CKH_A		1103515241
 #define	CKH_C		12347
 	uint32_t	prn_state;
 
diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h
index 7bbf21e..8776ad1 100644
--- a/jemalloc/include/jemalloc/internal/ctl.h
+++ b/jemalloc/include/jemalloc/internal/ctl.h
@@ -82,9 +82,9 @@
 #define	xmallctl(name, oldp, oldlenp, newp, newlen) do {		\
 	if (JEMALLOC_P(mallctl)(name, oldp, oldlenp, newp, newlen)	\
 	    != 0) {							\
-		malloc_write("<jemalloc>: Invalid xmallctl(\"");	\
+		malloc_write("<jemalloc>: Failure in xmallctl(\"");	\
 		malloc_write(name);					\
-		malloc_write("\", ...) call\n");			\
+		malloc_write("\", ...)\n");				\
 		abort();						\
 	}								\
 } while (0)
@@ -92,9 +92,9 @@
 #define	xmallctlnametomib(name, mibp, miblenp) do {			\
 	if (JEMALLOC_P(mallctlnametomib)(name, mibp, miblenp) != 0) {	\
 		malloc_write(						\
-		    "<jemalloc>: Invalid xmallctlnametomib(\"");	\
+		    "<jemalloc>: Failure in xmallctlnametomib(\"");	\
 		malloc_write(name);					\
-		malloc_write("\", ...) call\n");			\
+		malloc_write("\", ...)\n");				\
 		abort();						\
 	}								\
 } while (0)
@@ -103,7 +103,7 @@
 	if (JEMALLOC_P(mallctlbymib)(mib, miblen, oldp, oldlenp, newp,	\
 	    newlen) != 0) {						\
 		malloc_write(						\
-		    "<jemalloc>: Invalid xmallctlbymib() call\n");	\
+		    "<jemalloc>: Failure in xmallctlbymib()\n");	\
 		abort();						\
 	}								\
 } while (0)
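
The reworded messages above belong to internal convenience macros that abort on any mallctl failure rather than returning an error. A hypothetical internal caller (fragment only; xmallctl() is not part of the public API):

    	bool active;
    	size_t sz = sizeof(active);

    	/* Prints "<jemalloc>: Failure in xmallctl(\"prof.active\", ...)" and aborts on error. */
    	xmallctl("prof.active", &active, &sz, NULL, 0);
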
diff --git a/jemalloc/include/jemalloc/internal/huge.h b/jemalloc/include/jemalloc/internal/huge.h
index 0c0582f..bf23127 100644
--- a/jemalloc/include/jemalloc/internal/huge.h
+++ b/jemalloc/include/jemalloc/internal/huge.h
@@ -20,8 +20,11 @@
 extern malloc_mutex_t	huge_mtx;
 
 void	*huge_malloc(size_t size, bool zero);
-void	*huge_palloc(size_t alignment, size_t size);
-void	*huge_ralloc(void *ptr, size_t size, size_t oldsize);
+void	*huge_palloc(size_t size, size_t alignment, bool zero);
+void	*huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
+    size_t extra);
+void	*huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
+    size_t alignment, bool zero);
 void	huge_dalloc(void *ptr);
 size_t	huge_salloc(const void *ptr);
 #ifdef JEMALLOC_PROF
diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
index 2c3f32f..3d25300 100644
--- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
@@ -17,16 +17,29 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <stddef.h>
+#ifndef offsetof
+#  define offsetof(type, member)	((size_t)&(((type *)NULL)->member))
+#endif
 #include <inttypes.h>
 #include <string.h>
 #include <strings.h>
+#include <ctype.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <pthread.h>
+#include <math.h>
 
 #define	JEMALLOC_MANGLE
 #include "../jemalloc@install_suffix@.h"
 
+#ifdef JEMALLOC_ZONE
+#include <mach/mach_error.h>
+#include <mach/mach_init.h>
+#include <mach/vm_map.h>
+#include <malloc/malloc.h>
+#endif
+
 #ifdef JEMALLOC_LAZY_LOCK
 #include <dlfcn.h>
 #endif
@@ -49,7 +62,7 @@
 		malloc_write("<jemalloc>: ");				\
 		malloc_write(__FILE__);					\
 		malloc_write(":");					\
-		malloc_write(umax2s(__LINE__, 10, line_buf));		\
+		malloc_write(u2s(__LINE__, 10, line_buf));		\
 		malloc_write(": Failed assertion: ");			\
 		malloc_write("\"");					\
 		malloc_write(#e);					\
@@ -77,6 +90,8 @@
 /******************************************************************************/
 #define JEMALLOC_H_TYPES
 
+#define	ALLOCM_LG_ALIGN_MASK	((int)0x3f)
+
 #define	ZU(z)	((size_t)z)
 
 #ifndef __DECONST
@@ -92,8 +107,8 @@
 #  define JEMALLOC_INLINE static inline
 #endif
 
-/* Size of stack-allocated buffer passed to strerror_r(). */
-#define	STRERROR_BUF		64
+/* Size of stack-allocated buffer passed to buferror(). */
+#define	BUFERROR_BUF		64
 
 /* Minimum alignment of allocations is 2^LG_QUANTUM bytes. */
 #ifdef __i386__
@@ -159,6 +174,16 @@
 #define	STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT))
 #define	STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1))
 
+#ifdef PAGE_SHIFT
+#  undef PAGE_SHIFT
+#endif
+#ifdef PAGE_SIZE
+#  undef PAGE_SIZE
+#endif
+#ifdef PAGE_MASK
+#  undef PAGE_MASK
+#endif
+
 #ifdef DYNAMIC_PAGE_SHIFT
 #  define PAGE_SHIFT	lg_pagesize
 #  define PAGE_SIZE	pagesize
@@ -184,8 +209,12 @@
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/chunk.h"
 #include "jemalloc/internal/huge.h"
+#include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/tcache.h"
 #include "jemalloc/internal/hash.h"
+#ifdef JEMALLOC_ZONE
+#include "jemalloc/internal/zone.h"
+#endif
 #include "jemalloc/internal/prof.h"
 
 #undef JEMALLOC_H_TYPES
@@ -203,8 +232,12 @@
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/chunk.h"
 #include "jemalloc/internal/huge.h"
+#include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/tcache.h"
 #include "jemalloc/internal/hash.h"
+#ifdef JEMALLOC_ZONE
+#include "jemalloc/internal/zone.h"
+#endif
 #include "jemalloc/internal/prof.h"
 
 #undef JEMALLOC_H_STRUCTS
@@ -224,6 +257,7 @@
 #ifdef JEMALLOC_FILL
 extern bool	opt_zero;
 #endif
+extern size_t	opt_narenas;
 
 #ifdef DYNAMIC_PAGE_SHIFT
 extern size_t		pagesize;
@@ -240,8 +274,19 @@
  * Map of pthread_self() --> arenas[???], used for selecting an arena to use
  * for allocations.
  */
-extern __thread arena_t	*arenas_map JEMALLOC_ATTR(tls_model("initial-exec"));
+extern __thread arena_t	*arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
+#  define ARENA_GET()	arenas_tls
+#  define ARENA_SET(v)	do {						\
+	arenas_tls = (v);						\
+} while (0)
+#else
+extern pthread_key_t	arenas_tsd;
+#  define ARENA_GET()	((arena_t *)pthread_getspecific(arenas_tsd))
+#  define ARENA_SET(v)	do {						\
+	pthread_setspecific(arenas_tsd, (void *)(v));			\
+} while (0)
 #endif
+
 /*
  * Arenas that are used to service external requests.  Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
@@ -249,11 +294,56 @@
 extern arena_t		**arenas;
 extern unsigned		narenas;
 
-arena_t	*arenas_extend(unsigned ind);
-#ifndef NO_TLS
-arena_t	*choose_arena_hard(void);
+#ifdef JEMALLOC_STATS
+typedef struct {
+	uint64_t	allocated;
+	uint64_t	deallocated;
+} thread_allocated_t;
+#  ifndef NO_TLS
+extern __thread thread_allocated_t	thread_allocated_tls;
+#    define ALLOCATED_GET() thread_allocated_tls.allocated
+#    define DEALLOCATED_GET() thread_allocated_tls.deallocated
+#    define ALLOCATED_ADD(a, d) do {					\
+	thread_allocated_tls.allocated += a;				\
+	thread_allocated_tls.deallocated += d;				\
+} while (0)
+#  else
+extern pthread_key_t	thread_allocated_tsd;
+#    define ALLOCATED_GET()						\
+	(uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL)	\
+	    ? ((thread_allocated_t *)					\
+	    pthread_getspecific(thread_allocated_tsd))->allocated : 0)
+#    define DEALLOCATED_GET()						\
+	(uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL)	\
+	    ? ((thread_allocated_t					\
+	    *)pthread_getspecific(thread_allocated_tsd))->deallocated :	\
+	    0)
+#    define ALLOCATED_ADD(a, d) do {					\
+	thread_allocated_t *thread_allocated = (thread_allocated_t *)	\
+	    pthread_getspecific(thread_allocated_tsd);			\
+	if (thread_allocated != NULL) {					\
+		thread_allocated->allocated += (a);			\
+		thread_allocated->deallocated += (d);			\
+	} else {							\
+		thread_allocated = (thread_allocated_t *)		\
+		    imalloc(sizeof(thread_allocated_t));		\
+		if (thread_allocated != NULL) {				\
+			pthread_setspecific(thread_allocated_tsd,	\
+			    thread_allocated);				\
+			thread_allocated->allocated = (a);		\
+			thread_allocated->deallocated = (d);		\
+		}							\
+	}								\
+} while (0)
+#  endif
 #endif
 
+arena_t	*arenas_extend(unsigned ind);
+arena_t	*choose_arena_hard(void);
+int	buferror(int errnum, char *buf, size_t buflen);
+void	jemalloc_prefork(void);
+void	jemalloc_postfork(void);
+
 #include "jemalloc/internal/prn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
@@ -265,8 +355,12 @@
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/chunk.h"
 #include "jemalloc/internal/huge.h"
+#include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/tcache.h"
 #include "jemalloc/internal/hash.h"
+#ifdef JEMALLOC_ZONE
+#include "jemalloc/internal/zone.h"
+#endif
 #include "jemalloc/internal/prof.h"
 
 #undef JEMALLOC_H_EXTERNS
@@ -285,11 +379,143 @@
 #include "jemalloc/internal/huge.h"
 
 #ifndef JEMALLOC_ENABLE_INLINE
+size_t	pow2_ceil(size_t x);
+size_t	s2u(size_t size);
+size_t	sa2u(size_t size, size_t alignment, size_t *run_size_p);
 void	malloc_write(const char *s);
 arena_t	*choose_arena(void);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
+/* Compute the smallest power of 2 that is >= x. */
+JEMALLOC_INLINE size_t
+pow2_ceil(size_t x)
+{
+
+	x--;
+	x |= x >> 1;
+	x |= x >> 2;
+	x |= x >> 4;
+	x |= x >> 8;
+	x |= x >> 16;
+#if (LG_SIZEOF_PTR == 3)
+	x |= x >> 32;
+#endif
+	x++;
+	return (x);
+}
+
+/*
+ * Compute usable size that would result from allocating an object with the
+ * specified size.
+ */
+JEMALLOC_INLINE size_t
+s2u(size_t size)
+{
+
+	if (size <= small_maxclass)
+		return arenas[0]->bins[small_size2bin[size]].reg_size;
+	if (size <= arena_maxclass)
+		return PAGE_CEILING(size);
+	return CHUNK_CEILING(size);
+}
+
+/*
+ * Compute usable size that would result from allocating an object with the
+ * specified size and alignment.
+ */
+JEMALLOC_INLINE size_t
+sa2u(size_t size, size_t alignment, size_t *run_size_p)
+{
+	size_t usize;
+
+	/*
+	 * Round size up to the nearest multiple of alignment.
+	 *
+	 * This done, we can take advantage of the fact that for each small
+	 * size class, every object is aligned at the smallest power of two
+	 * that is non-zero in the base two representation of the size.  For
+	 * example:
+	 *
+	 *   Size |   Base 2 | Minimum alignment
+	 *   -----+----------+------------------
+	 *     96 |  1100000 |  32
+	 *    144 | 10100000 |  32
+	 *    192 | 11000000 |  64
+	 *
+	 * Depending on runtime settings, it is possible that arena_malloc()
+	 * will further round up to a power of two, but that never causes
+	 * correctness issues.
+	 */
+	usize = (size + (alignment - 1)) & (-alignment);
+	/*
+	 * (usize < size) protects against the combination of maximal
+	 * alignment and size greater than maximal alignment.
+	 */
+	if (usize < size) {
+		/* size_t overflow. */
+		return (0);
+	}
+
+	if (usize <= arena_maxclass && alignment <= PAGE_SIZE) {
+		if (usize <= small_maxclass) {
+			return
+			    (arenas[0]->bins[small_size2bin[usize]].reg_size);
+		}
+		return (PAGE_CEILING(usize));
+	} else {
+		size_t run_size;
+
+		/*
+		 * We can't achieve subpage alignment, so round up alignment
+		 * permanently; it makes later calculations simpler.
+		 */
+		alignment = PAGE_CEILING(alignment);
+		usize = PAGE_CEILING(size);
+		/*
+		 * (usize < size) protects against very large sizes within
+		 * PAGE_SIZE of SIZE_T_MAX.
+		 *
+		 * (usize + alignment < usize) protects against the
+		 * combination of maximal alignment and usize large enough
+		 * to cause overflow.  This is similar to the first overflow
+		 * check above, but it needs to be repeated due to the new
+		 * usize value, which may now be *equal* to maximal
+		 * alignment, whereas before we only detected overflow if the
+		 * original size was *greater* than maximal alignment.
+		 */
+		if (usize < size || usize + alignment < usize) {
+			/* size_t overflow. */
+			return (0);
+		}
+
+		/*
+		 * Calculate the size of the over-size run that arena_palloc()
+		 * would need to allocate in order to guarantee the alignment.
+		 */
+		if (usize >= alignment)
+			run_size = usize + alignment - PAGE_SIZE;
+		else {
+			/*
+			 * It is possible that (alignment << 1) will cause
+			 * overflow, but it doesn't matter because we also
+			 * subtract PAGE_SIZE, which in the case of overflow
+			 * leaves us with a very large run_size.  That causes
+			 * the first conditional below to fail, which means
+			 * that the bogus run_size value never gets used for
+			 * anything important.
+			 */
+			run_size = (alignment << 1) - PAGE_SIZE;
+		}
+		if (run_size_p != NULL)
+			*run_size_p = run_size;
+
+		if (run_size <= arena_maxclass)
+			return (PAGE_CEILING(usize));
+		return (CHUNK_CEILING(usize));
+	}
+}
+
 /*
  * Wrapper around malloc_message() that avoids the need for
  * JEMALLOC_P(malloc_message)(...) throughout the code.
@@ -310,78 +536,35 @@
 {
 	arena_t *ret;
 
-	/*
-	 * We can only use TLS if this is a PIC library, since for the static
-	 * library version, libc's malloc is used by TLS allocation, which
-	 * introduces a bootstrapping issue.
-	 */
-#ifndef NO_TLS
-	ret = arenas_map;
+	ret = ARENA_GET();
 	if (ret == NULL) {
 		ret = choose_arena_hard();
 		assert(ret != NULL);
 	}
-#else
-	if (isthreaded && narenas > 1) {
-		unsigned long ind;
 
-		/*
-		 * Hash pthread_self() to one of the arenas.  There is a prime
-		 * number of arenas, so this has a reasonable chance of
-		 * working.  Even so, the hashing can be easily thwarted by
-		 * inconvenient pthread_self() values.  Without specific
-		 * knowledge of how pthread_self() calculates values, we can't
-		 * easily do much better than this.
-		 */
-		ind = (unsigned long) pthread_self() % narenas;
-
-		/*
-		 * Optimistially assume that arenas[ind] has been initialized.
-		 * At worst, we find out that some other thread has already
-		 * done so, after acquiring the lock in preparation.  Note that
-		 * this lazy locking also has the effect of lazily forcing
-		 * cache coherency; without the lock acquisition, there's no
-		 * guarantee that modification of arenas[ind] by another thread
-		 * would be seen on this CPU for an arbitrary amount of time.
-		 *
-		 * In general, this approach to modifying a synchronized value
-		 * isn't a good idea, but in this case we only ever modify the
-		 * value once, so things work out well.
-		 */
-		ret = arenas[ind];
-		if (ret == NULL) {
-			/*
-			 * Avoid races with another thread that may have already
-			 * initialized arenas[ind].
-			 */
-			malloc_mutex_lock(&arenas_lock);
-			if (arenas[ind] == NULL)
-				ret = arenas_extend((unsigned)ind);
-			else
-				ret = arenas[ind];
-			malloc_mutex_unlock(&arenas_lock);
-		}
-	} else
-		ret = arenas[0];
-#endif
-
-	assert(ret != NULL);
 	return (ret);
 }
 #endif
 
+#include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/tcache.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/hash.h"
-#include "jemalloc/internal/prof.h"
+#ifdef JEMALLOC_ZONE
+#include "jemalloc/internal/zone.h"
+#endif
 
 #ifndef JEMALLOC_ENABLE_INLINE
 void	*imalloc(size_t size);
 void	*icalloc(size_t size);
-void	*ipalloc(size_t alignment, size_t size);
+void	*ipalloc(size_t size, size_t alignment, bool zero);
 size_t	isalloc(const void *ptr);
-void	*iralloc(void *ptr, size_t size);
+#  ifdef JEMALLOC_IVSALLOC
+size_t	ivsalloc(const void *ptr);
+#  endif
 void	idalloc(void *ptr);
+void	*iralloc(void *ptr, size_t size, size_t extra, size_t alignment,
+    bool zero, bool no_move);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -408,95 +591,28 @@
 }
 
 JEMALLOC_INLINE void *
-ipalloc(size_t alignment, size_t size)
+ipalloc(size_t size, size_t alignment, bool zero)
 {
 	void *ret;
-	size_t ceil_size;
+	size_t usize;
+	size_t run_size
+#  ifdef JEMALLOC_CC_SILENCE
+	    = 0
+#  endif
+	    ;
 
-	/*
-	 * Round size up to the nearest multiple of alignment.
-	 *
-	 * This done, we can take advantage of the fact that for each small
-	 * size class, every object is aligned at the smallest power of two
-	 * that is non-zero in the base two representation of the size.  For
-	 * example:
-	 *
-	 *   Size |   Base 2 | Minimum alignment
-	 *   -----+----------+------------------
-	 *     96 |  1100000 |  32
-	 *    144 | 10100000 |  32
-	 *    192 | 11000000 |  64
-	 *
-	 * Depending on runtime settings, it is possible that arena_malloc()
-	 * will further round up to a power of two, but that never causes
-	 * correctness issues.
-	 */
-	ceil_size = (size + (alignment - 1)) & (-alignment);
-	/*
-	 * (ceil_size < size) protects against the combination of maximal
-	 * alignment and size greater than maximal alignment.
-	 */
-	if (ceil_size < size) {
-		/* size_t overflow. */
+	usize = sa2u(size, alignment, &run_size);
+	if (usize == 0)
 		return (NULL);
-	}
-
-	if (ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE
-	    && ceil_size <= arena_maxclass))
-		ret = arena_malloc(ceil_size, false);
-	else {
-		size_t run_size;
-
-		/*
-		 * We can't achieve subpage alignment, so round up alignment
-		 * permanently; it makes later calculations simpler.
-		 */
-		alignment = PAGE_CEILING(alignment);
-		ceil_size = PAGE_CEILING(size);
-		/*
-		 * (ceil_size < size) protects against very large sizes within
-		 * PAGE_SIZE of SIZE_T_MAX.
-		 *
-		 * (ceil_size + alignment < ceil_size) protects against the
-		 * combination of maximal alignment and ceil_size large enough
-		 * to cause overflow.  This is similar to the first overflow
-		 * check above, but it needs to be repeated due to the new
-		 * ceil_size value, which may now be *equal* to maximal
-		 * alignment, whereas before we only detected overflow if the
-		 * original size was *greater* than maximal alignment.
-		 */
-		if (ceil_size < size || ceil_size + alignment < ceil_size) {
-			/* size_t overflow. */
-			return (NULL);
-		}
-
-		/*
-		 * Calculate the size of the over-size run that arena_palloc()
-		 * would need to allocate in order to guarantee the alignment.
-		 */
-		if (ceil_size >= alignment)
-			run_size = ceil_size + alignment - PAGE_SIZE;
-		else {
-			/*
-			 * It is possible that (alignment << 1) will cause
-			 * overflow, but it doesn't matter because we also
-			 * subtract PAGE_SIZE, which in the case of overflow
-			 * leaves us with a very large run_size.  That causes
-			 * the first conditional below to fail, which means
-			 * that the bogus run_size value never gets used for
-			 * anything important.
-			 */
-			run_size = (alignment << 1) - PAGE_SIZE;
-		}
-
-		if (run_size <= arena_maxclass) {
-			ret = arena_palloc(choose_arena(), alignment, ceil_size,
-			    run_size);
-		} else if (alignment <= chunksize)
-			ret = huge_malloc(ceil_size, false);
-		else
-			ret = huge_palloc(alignment, ceil_size);
-	}
+	if (usize <= arena_maxclass && alignment <= PAGE_SIZE)
+		ret = arena_malloc(usize, zero);
+	else if (run_size <= arena_maxclass) {
+		ret = arena_palloc(choose_arena(), usize, run_size, alignment,
+		    zero);
+	} else if (alignment <= chunksize)
+		ret = huge_malloc(usize, zero);
+	else
+		ret = huge_palloc(usize, alignment, zero);
 
 	assert(((uintptr_t)ret & (alignment - 1)) == 0);
 	return (ret);
@@ -526,21 +642,18 @@
 	return (ret);
 }
 
-JEMALLOC_INLINE void *
-iralloc(void *ptr, size_t size)
+#ifdef JEMALLOC_IVSALLOC
+JEMALLOC_INLINE size_t
+ivsalloc(const void *ptr)
 {
-	size_t oldsize;
 
-	assert(ptr != NULL);
-	assert(size != 0);
+	/* Return 0 if ptr is not within a chunk managed by jemalloc. */
+	if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == NULL)
+		return (0);
 
-	oldsize = isalloc(ptr);
-
-	if (size <= arena_maxclass)
-		return (arena_ralloc(ptr, size, oldsize));
-	else
-		return (huge_ralloc(ptr, size, oldsize));
+	return (isalloc(ptr));
 }
+#endif
 
 JEMALLOC_INLINE void
 idalloc(void *ptr)
@@ -555,7 +668,70 @@
 	else
 		huge_dalloc(ptr);
 }
+
+JEMALLOC_INLINE void *
+iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
+    bool no_move)
+{
+	void *ret;
+	size_t oldsize;
+
+	assert(ptr != NULL);
+	assert(size != 0);
+
+	oldsize = isalloc(ptr);
+
+	if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
+	    != 0) {
+		size_t copysize;
+
+		/*
+		 * Existing object alignment is inadequate; allocate new space
+		 * and copy.
+		 */
+		if (no_move)
+			return (NULL);
+		ret = ipalloc(size + extra, alignment, zero);
+		if (ret == NULL) {
+			if (extra == 0)
+				return (NULL);
+			/* Try again, without extra this time. */
+			ret = ipalloc(size, alignment, zero);
+			if (ret == NULL)
+				return (NULL);
+		}
+		/*
+		 * Copy at most size bytes (not size+extra), since the caller
+		 * has no expectation that the extra bytes will be reliably
+		 * preserved.
+		 */
+		copysize = (size < oldsize) ? size : oldsize;
+		memcpy(ret, ptr, copysize);
+		idalloc(ptr);
+		return (ret);
+	}
+
+	if (no_move) {
+		if (size <= arena_maxclass) {
+			return (arena_ralloc_no_move(ptr, oldsize, size,
+			    extra, zero));
+		} else {
+			return (huge_ralloc_no_move(ptr, oldsize, size,
+			    extra));
+		}
+	} else {
+		if (size + extra <= arena_maxclass) {
+			return (arena_ralloc(ptr, oldsize, size, extra,
+			    alignment, zero));
+		} else {
+			return (huge_ralloc(ptr, oldsize, size, extra,
+			    alignment, zero));
+		}
+	}
+}
 #endif
 
+#include "jemalloc/internal/prof.h"
+
 #undef JEMALLOC_H_INLINES
 /******************************************************************************/
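
Both sa2u() and pow2_ceil() above rely on compact bit arithmetic: (size + (alignment - 1)) & -alignment rounds size up to a multiple of a power-of-two alignment, and the shift-or cascade rounds up to a power of two. A small standalone check of both identities; the helper below is a 64-bit copy of pow2_ceil(), written here only for the test:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 64-bit variant of pow2_ceil(): smallest power of 2 that is >= x. */
    static uint64_t
    ceil_pow2(uint64_t x)
    {
    	x--;
    	x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
    	x |= x >> 8;  x |= x >> 16; x |= x >> 32;
    	return (x + 1);
    }

    int
    main(void)
    {
    	uint64_t size, alignment;

    	for (alignment = 1; alignment <= 4096; alignment <<= 1) {
    		for (size = 1; size <= 10000; size++) {
    			uint64_t usize = (size + (alignment - 1)) & -alignment;

    			/* Smallest multiple of alignment that is >= size. */
    			assert(usize % alignment == 0);
    			assert(usize >= size && usize - size < alignment);
    		}
    	}
    	assert(ceil_pow2(1) == 1 && ceil_pow2(3) == 4 && ceil_pow2(4096) == 4096);
    	printf("rounding identities hold\n");
    	return (0);
    }
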
diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h
index 108bfa8..dcca01e 100644
--- a/jemalloc/include/jemalloc/internal/mutex.h
+++ b/jemalloc/include/jemalloc/internal/mutex.h
@@ -3,6 +3,12 @@
 
 typedef pthread_mutex_t malloc_mutex_t;
 
+#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+#  define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+#else
+#  define MALLOC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+#endif
+
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
@@ -18,6 +24,7 @@
 #endif
 
 bool	malloc_mutex_init(malloc_mutex_t *mutex);
+void	malloc_mutex_destroy(malloc_mutex_t *mutex);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
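
Defining MALLOC_MUTEX_INITIALIZER (adaptive on glibc, plain otherwise) allows locks that must be usable before malloc_mutex_init() can run to be initialized statically. A hypothetical internal declaration in that style (fragment; the names are illustrative):

    /* Usable even before any allocator bootstrap code has executed. */
    static malloc_mutex_t	init_lock = MALLOC_MUTEX_INITIALIZER;

    static void
    init_once(void)
    {
    	malloc_mutex_lock(&init_lock);
    	/* ... one-time setup ... */
    	malloc_mutex_unlock(&init_lock);
    }
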
diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h
index fb55fb9..7864000 100644
--- a/jemalloc/include/jemalloc/internal/prof.h
+++ b/jemalloc/include/jemalloc/internal/prof.h
@@ -6,20 +6,25 @@
 typedef struct prof_cnt_s prof_cnt_t;
 typedef struct prof_thr_cnt_s prof_thr_cnt_t;
 typedef struct prof_ctx_s prof_ctx_t;
-typedef struct prof_s prof_t;
+typedef struct prof_tdata_s prof_tdata_t;
 
 /* Option defaults. */
-#define	LG_PROF_BT_MAX_DEFAULT		2
+#define	PROF_PREFIX_DEFAULT		"jeprof"
+#define	LG_PROF_BT_MAX_DEFAULT		7
 #define	LG_PROF_SAMPLE_DEFAULT		0
-#define	LG_PROF_INTERVAL_DEFAULT	30
+#define	LG_PROF_INTERVAL_DEFAULT	-1
+#define	LG_PROF_TCMAX_DEFAULT		-1
 
 /*
  * Hard limit on stack backtrace depth.  Note that the version of
  * prof_backtrace() that is based on __builtin_return_address() necessarily has
- * a hard-coded number of backtrace frame handlers, so increasing
- * LG_PROF_BT_MAX requires changing prof_backtrace().
+ * a hard-coded number of backtrace frame handlers.
  */
-#define	LG_PROF_BT_MAX		7 /* >= LG_PROF_BT_MAX_DEFAULT */
+#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
+#  define LG_PROF_BT_MAX	((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
+#else
+#  define LG_PROF_BT_MAX	7 /* >= LG_PROF_BT_MAX_DEFAULT */
+#endif
 #define	PROF_BT_MAX		(1U << LG_PROF_BT_MAX)
 
 /* Initial hash table size. */
@@ -34,16 +39,16 @@
 
 struct prof_bt_s {
 	/* Backtrace, stored as len program counters. */
-	void			**vec;
-	unsigned		len;
+	void		**vec;
+	unsigned	len;
 };
 
 #ifdef JEMALLOC_PROF_LIBGCC
 /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
 typedef struct {
-	prof_bt_t *bt;
-	unsigned nignore;
-	unsigned max;
+	prof_bt_t	*bt;
+	unsigned	nignore;
+	unsigned	max;
 } prof_unwind_data_t;
 #endif
 
@@ -51,11 +56,11 @@
 	/*
 	 * Profiling counters.  An allocation/deallocation pair can operate on
 	 * different prof_thr_cnt_t objects that are linked into the same
-	 * prof_ctx_t sets_ql, so it is possible for the cur* counters to go
+	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
 	 * negative.  In principle it is possible for the *bytes counters to
-	 * overflow/underflow, but a general solution would require some form
-	 * of 128-bit counter solution; this implementation doesn't bother to
-	 * solve that problem.
+	 * overflow/underflow, but a general solution would require something
+	 * like 128-bit counters; this implementation doesn't bother to solve
+	 * that problem.
 	 */
 	int64_t		curobjs;
 	int64_t		curbytes;
@@ -64,15 +69,18 @@
 };
 
 struct prof_thr_cnt_s {
-	/* Linkage into prof_ctx_t's sets_ql. */
-	ql_elm(prof_thr_cnt_t)	link;
+	/* Linkage into prof_ctx_t's cnts_ql. */
+	ql_elm(prof_thr_cnt_t)	cnts_link;
+
+	/* Linkage into thread's LRU. */
+	ql_elm(prof_thr_cnt_t)	lru_link;
 
 	/*
 	 * Associated context.  If a thread frees an object that it did not
 	 * allocate, it is possible that the context is not cached in the
 	 * thread's hash table, in which case it must be able to look up the
 	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
-	 * and link it into the prof_ctx_t's sets_ql.
+	 * and link it into the prof_ctx_t's cnts_ql.
 	 */
 	prof_ctx_t		*ctx;
 
@@ -101,11 +109,11 @@
 	/* Associated backtrace. */
 	prof_bt_t		*bt;
 
-	/* Protects cnt_merged and sets_ql. */
+	/* Protects cnt_merged and cnts_ql. */
 	malloc_mutex_t		lock;
 
-	/* Temporary storage for aggregation during dump. */
-	prof_cnt_t		cnt_dump;
+	/* Temporary storage for summation during dump. */
+	prof_cnt_t		cnt_summed;
 
 	/* When threads exit, they merge their stats into cnt_merged. */
 	prof_cnt_t		cnt_merged;
@@ -117,6 +125,31 @@
 	ql_head(prof_thr_cnt_t)	cnts_ql;
 };
 
+struct prof_tdata_s {
+	/*
+	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
+	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
+	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
+	 * others will ever write them.
+	 *
+	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
+	 * counter data into the associated prof_ctx_t objects, and unlink/free
+	 * the prof_thr_cnt_t objects.
+	 */
+	ckh_t			bt2cnt;
+
+	/* LRU for contents of bt2cnt. */
+	ql_head(prof_thr_cnt_t)	lru_ql;
+
+	/* Backtrace vector, used for calls to prof_backtrace(). */
+	void			**vec;
+
+	/* Sampling state. */
+	uint64_t		prn_state;
+	uint64_t		threshold;
+	uint64_t		accum;
+};
+
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
@@ -129,11 +162,14 @@
  * to notice state changes.
  */
 extern bool	opt_prof_active;
-extern size_t	opt_lg_prof_bt_max; /* Maximum backtrace depth. */
-extern size_t	opt_lg_prof_sample; /* Mean bytes between samples. */
+extern size_t	opt_lg_prof_bt_max;   /* Maximum backtrace depth. */
+extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
 extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
-extern bool	opt_prof_udump; /* High-water memory dumping. */
-extern bool	opt_prof_leak; /* Dump leak summary at exit. */
+extern bool	opt_prof_gdump;       /* High-water memory dumping. */
+extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
+extern bool	opt_prof_accum;       /* Report cumulative bytes. */
+extern ssize_t	opt_lg_prof_tcmax;    /* lg(max per thread backtrace cache) */
+extern char	opt_prof_prefix[PATH_MAX + 1];
 
 /*
  * Profile dump interval, measured in bytes allocated.  Each arena triggers a
@@ -150,25 +186,362 @@
  */
 extern bool	prof_promote;
 
-bool	prof_init(prof_t *prof, bool master);
-void	prof_destroy(prof_t *prof);
+/* (1U << opt_lg_prof_bt_max). */
+extern unsigned	prof_bt_max;
 
-prof_thr_cnt_t	*prof_alloc_prep(size_t size);
-prof_ctx_t	*prof_ctx_get(const void *ptr);
-void	prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);
-void	prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
-    size_t old_size, prof_ctx_t *old_ctx);
-void	prof_free(const void *ptr);
+/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
+#ifndef NO_TLS
+extern __thread prof_tdata_t	*prof_tdata_tls
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+#  define PROF_TCACHE_GET()	prof_tdata_tls
+#  define PROF_TCACHE_SET(v)	do {					\
+	prof_tdata_tls = (v);						\
+	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
+} while (0)
+#else
+#  define PROF_TCACHE_GET()						\
+	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
+#  define PROF_TCACHE_SET(v)	do {					\
+	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
+} while (0)
+#endif
+/*
+ * Same contents as prof_tdata_tls, but initialized such that the TSD destructor is
+ * called when a thread exits, so that prof_tdata_tls contents can be merged,
+ * unlinked, and deallocated.
+ */
+extern pthread_key_t	prof_tdata_tsd;
+
+void	bt_init(prof_bt_t *bt, void **vec);
+void	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
+prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
 void	prof_idump(void);
 bool	prof_mdump(const char *filename);
-void	prof_udump(void);
+void	prof_gdump(void);
+prof_tdata_t	*prof_tdata_init(void);
 void	prof_boot0(void);
-bool	prof_boot1(void);
+void	prof_boot1(void);
+bool	prof_boot2(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES
 
+#ifndef JEMALLOC_ENABLE_INLINE
+void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
+prof_thr_cnt_t	*prof_alloc_prep(size_t size);
+prof_ctx_t	*prof_ctx_get(const void *ptr);
+void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
+bool	prof_sample_accum_update(size_t size);
+void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
+void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
+    size_t old_size, prof_ctx_t *old_ctx);
+void	prof_free(const void *ptr, size_t size);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
+JEMALLOC_INLINE void
+prof_sample_threshold_update(prof_tdata_t *prof_tdata)
+{
+	uint64_t r;
+	double u;
+
+	/*
+	 * Compute prof_sample_threshold as a geometrically distributed random
+	 * variable with mean (2^opt_lg_prof_sample).
+	 */
+	prn64(r, 53, prof_tdata->prn_state,
+	    (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
+	u = (double)r * (1.0/9007199254740992.0L);
+	prof_tdata->threshold = (uint64_t)(log(u) /
+	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+	    + (uint64_t)1U;
+}
+
+JEMALLOC_INLINE prof_thr_cnt_t *
+prof_alloc_prep(size_t size)
+{
+#ifdef JEMALLOC_ENABLE_INLINE
+   /* This function does not have its own stack frame, because it is inlined. */
+#  define NIGNORE 1
+#else
+#  define NIGNORE 2
+#endif
+	prof_thr_cnt_t *ret;
+	prof_tdata_t *prof_tdata;
+	prof_bt_t bt;
+
+	assert(size == s2u(size));
+
+	prof_tdata = PROF_TCACHE_GET();
+	if (prof_tdata == NULL) {
+		prof_tdata = prof_tdata_init();
+		if (prof_tdata == NULL)
+			return (NULL);
+	}
+
+	if (opt_prof_active == false) {
+		/* Sampling is currently inactive, so avoid sampling. */
+		ret = (prof_thr_cnt_t *)(uintptr_t)1U;
+	} else if (opt_lg_prof_sample == 0) {
+		/*
+		 * Don't bother with sampling logic, since sampling interval is
+		 * 1.
+		 */
+		bt_init(&bt, prof_tdata->vec);
+		prof_backtrace(&bt, NIGNORE, prof_bt_max);
+		ret = prof_lookup(&bt);
+	} else {
+		if (prof_tdata->threshold == 0) {
+			/*
+			 * Initialize.  Seed the prng differently for each
+			 * thread.
+			 */
+			prof_tdata->prn_state = (uint64_t)(uintptr_t)&size;
+			prof_sample_threshold_update(prof_tdata);
+		}
+
+		/*
+		 * Determine whether to capture a backtrace based on whether
+		 * size is enough for prof_tdata->accum to reach
+		 * prof_tdata->threshold.  However, delay updating these
+		 * variables until prof_{m,re}alloc(), because we don't know
+		 * for sure that the allocation will succeed.
+		 *
+		 * Use subtraction rather than addition to avoid potential
+		 * integer overflow.
+		 */
+		if (size >= prof_tdata->threshold - prof_tdata->accum) {
+			bt_init(&bt, prof_tdata->vec);
+			prof_backtrace(&bt, NIGNORE, prof_bt_max);
+			ret = prof_lookup(&bt);
+		} else
+			ret = (prof_thr_cnt_t *)(uintptr_t)1U;
+	}
+
+	return (ret);
+#undef NIGNORE
+}
+
+JEMALLOC_INLINE prof_ctx_t *
+prof_ctx_get(const void *ptr)
+{
+	prof_ctx_t *ret;
+	arena_chunk_t *chunk;
+
+	assert(ptr != NULL);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	if (chunk != ptr) {
+		/* Region. */
+		assert(chunk->arena->magic == ARENA_MAGIC);
+
+		ret = arena_prof_ctx_get(ptr);
+	} else
+		ret = huge_prof_ctx_get(ptr);
+
+	return (ret);
+}
+
+JEMALLOC_INLINE void
+prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
+{
+	arena_chunk_t *chunk;
+
+	assert(ptr != NULL);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	if (chunk != ptr) {
+		/* Region. */
+		assert(chunk->arena->magic == ARENA_MAGIC);
+
+		arena_prof_ctx_set(ptr, ctx);
+	} else
+		huge_prof_ctx_set(ptr, ctx);
+}
+
+JEMALLOC_INLINE bool
+prof_sample_accum_update(size_t size)
+{
+	prof_tdata_t *prof_tdata;
+
+	/* Sampling logic is unnecessary if the interval is 1. */
+	assert(opt_lg_prof_sample != 0);
+
+	prof_tdata = PROF_TCACHE_GET();
+	assert(prof_tdata != NULL);
+
+	/* Take care to avoid integer overflow. */
+	if (size >= prof_tdata->threshold - prof_tdata->accum) {
+		prof_tdata->accum -= (prof_tdata->threshold - size);
+		/* Compute new prof_sample_threshold. */
+		prof_sample_threshold_update(prof_tdata);
+		while (prof_tdata->accum >= prof_tdata->threshold) {
+			prof_tdata->accum -= prof_tdata->threshold;
+			prof_sample_threshold_update(prof_tdata);
+		}
+		return (false);
+	} else {
+		prof_tdata->accum += size;
+		return (true);
+	}
+}
+
+JEMALLOC_INLINE void
+prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
+{
+
+	assert(ptr != NULL);
+	assert(size == isalloc(ptr));
+
+	if (opt_lg_prof_sample != 0) {
+		if (prof_sample_accum_update(size)) {
+			/*
+			 * Don't sample.  For malloc()-like allocation, it is
+			 * always possible to tell in advance how large an
+			 * object's usable size will be, so there should never
+			 * be a difference between the size passed to
+			 * prof_alloc_prep() and prof_malloc().
+			 */
+			assert((uintptr_t)cnt == (uintptr_t)1U);
+		}
+	}
+
+	if ((uintptr_t)cnt > (uintptr_t)1U) {
+		prof_ctx_set(ptr, cnt->ctx);
+
+		cnt->epoch++;
+		/*********/
+		mb_write();
+		/*********/
+		cnt->cnts.curobjs++;
+		cnt->cnts.curbytes += size;
+		if (opt_prof_accum) {
+			cnt->cnts.accumobjs++;
+			cnt->cnts.accumbytes += size;
+		}
+		/*********/
+		mb_write();
+		/*********/
+		cnt->epoch++;
+		/*********/
+		mb_write();
+		/*********/
+	} else
+		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
+}
+
+JEMALLOC_INLINE void
+prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
+    size_t old_size, prof_ctx_t *old_ctx)
+{
+	prof_thr_cnt_t *told_cnt;
+
+	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
+
+	if (ptr != NULL) {
+		assert(size == isalloc(ptr));
+		if (opt_lg_prof_sample != 0) {
+			if (prof_sample_accum_update(size)) {
+				/*
+				 * Don't sample.  The size passed to
+				 * prof_alloc_prep() was larger than what
+				 * actually got allocated, so a backtrace was
+				 * captured for this allocation, even though
+				 * its actual size was insufficient to cross
+				 * the sample threshold.
+				 */
+				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+			}
+		}
+	}
+
+	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
+		told_cnt = prof_lookup(old_ctx->bt);
+		if (told_cnt == NULL) {
+			/*
+			 * It's too late to propagate OOM for this realloc(),
+			 * so operate directly on old_cnt->ctx->cnt_merged.
+			 * so operate directly on old_ctx->cnt_merged.
+			malloc_mutex_lock(&old_ctx->lock);
+			old_ctx->cnt_merged.curobjs--;
+			old_ctx->cnt_merged.curbytes -= old_size;
+			malloc_mutex_unlock(&old_ctx->lock);
+			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+		}
+	} else
+		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+
+	if ((uintptr_t)told_cnt > (uintptr_t)1U)
+		told_cnt->epoch++;
+	if ((uintptr_t)cnt > (uintptr_t)1U) {
+		prof_ctx_set(ptr, cnt->ctx);
+		cnt->epoch++;
+	} else
+		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
+	/*********/
+	mb_write();
+	/*********/
+	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
+		told_cnt->cnts.curobjs--;
+		told_cnt->cnts.curbytes -= old_size;
+	}
+	if ((uintptr_t)cnt > (uintptr_t)1U) {
+		cnt->cnts.curobjs++;
+		cnt->cnts.curbytes += size;
+		if (opt_prof_accum) {
+			cnt->cnts.accumobjs++;
+			cnt->cnts.accumbytes += size;
+		}
+	}
+	/*********/
+	mb_write();
+	/*********/
+	if ((uintptr_t)told_cnt > (uintptr_t)1U)
+		told_cnt->epoch++;
+	if ((uintptr_t)cnt > (uintptr_t)1U)
+		cnt->epoch++;
+	/*********/
+	mb_write(); /* Not strictly necessary. */
+}
+
+JEMALLOC_INLINE void
+prof_free(const void *ptr, size_t size)
+{
+	prof_ctx_t *ctx = prof_ctx_get(ptr);
+
+	if ((uintptr_t)ctx > (uintptr_t)1) {
+		assert(size == isalloc(ptr));
+		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
+
+		if (tcnt != NULL) {
+			tcnt->epoch++;
+			/*********/
+			mb_write();
+			/*********/
+			tcnt->cnts.curobjs--;
+			tcnt->cnts.curbytes -= size;
+			/*********/
+			mb_write();
+			/*********/
+			tcnt->epoch++;
+			/*********/
+			mb_write();
+			/*********/
+		} else {
+			/*
+			 * OOM during free() cannot be propagated, so operate
+			 * directly on ctx->cnt_merged.
+			 */
+			malloc_mutex_lock(&ctx->lock);
+			ctx->cnt_merged.curobjs--;
+			ctx->cnt_merged.curbytes -= size;
+			malloc_mutex_unlock(&ctx->lock);
+		}
+	}
+}
+#endif
+
 #endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/
 #endif /* JEMALLOC_PROF */
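
prof_sample_threshold_update() above draws the byte distance to the next sampled allocation from a geometric distribution with mean 2^lg_prof_sample, via the inverse CDF of a uniform variate. A standalone sketch of the same formula, using drand48() in place of the allocator's prn64() purely to check the mean (link with -lm):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
    	unsigned lg_sample = 19;	/* mean of 2^19 bytes between samples */
    	double mean = (double)((uint64_t)1 << lg_sample);
    	double total = 0.0;
    	int i, n = 100000;

    	srand48(12345);
    	for (i = 0; i < n; i++) {
    		double u = 1.0 - drand48();	/* uniform in (0, 1] */
    		/* Inverse CDF of the geometric distribution, as in prof.h. */
    		uint64_t threshold = (uint64_t)(log(u) /
    		    log(1.0 - (1.0 / mean))) + 1;
    		total += (double)threshold;
    	}
    	printf("empirical mean %.0f, expected ~%.0f\n", total / n, mean);
    	return (0);
    }
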
diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h
new file mode 100644
index 0000000..9d58eba
--- /dev/null
+++ b/jemalloc/include/jemalloc/internal/rtree.h
@@ -0,0 +1,161 @@
+/*
+ * This radix tree implementation is tailored to the singular purpose of
+ * tracking which chunks are currently owned by jemalloc.  This functionality
+ * is mandatory for OS X, where jemalloc must be able to respond to object
+ * ownership queries.
+ *
+ *******************************************************************************
+ */
+#ifdef JEMALLOC_H_TYPES
+
+typedef struct rtree_s rtree_t;
+
+/*
+ * Size of each radix tree node (must be a power of 2).  This impacts tree
+ * depth.
+ */
+#if (LG_SIZEOF_PTR == 2)
+#  define RTREE_NODESIZE (1U << 14)
+#else
+#  define RTREE_NODESIZE CACHELINE
+#endif
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+struct rtree_s {
+	malloc_mutex_t	mutex;
+	void		**root;
+	unsigned	height;
+	unsigned	level2bits[1]; /* Dynamically sized. */
+};
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+rtree_t	*rtree_new(unsigned bits);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#ifndef JEMALLOC_ENABLE_INLINE
+#ifndef JEMALLOC_DEBUG
+void	*rtree_get_locked(rtree_t *rtree, uintptr_t key);
+#endif
+void	*rtree_get(rtree_t *rtree, uintptr_t key);
+bool	rtree_set(rtree_t *rtree, uintptr_t key, void *val);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_))
+#define	RTREE_GET_GENERATE(f)						\
+/* The least significant bits of the key are ignored. */		\
+JEMALLOC_INLINE void *							\
+f(rtree_t *rtree, uintptr_t key)					\
+{									\
+	void *ret;							\
+	uintptr_t subkey;						\
+	unsigned i, lshift, height, bits;				\
+	void **node, **child;						\
+									\
+	RTREE_LOCK(&rtree->mutex);					\
+	for (i = lshift = 0, height = rtree->height, node = rtree->root;\
+	    i < height - 1;						\
+	    i++, lshift += bits, node = child) {			\
+		bits = rtree->level2bits[i];				\
+		subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \
+		    3)) - bits);					\
+		child = (void**)node[subkey];				\
+		if (child == NULL) {					\
+			RTREE_UNLOCK(&rtree->mutex);			\
+			return (NULL);					\
+		}							\
+	}								\
+									\
+	/*								\
+	 * node is a leaf, so it contains values rather than node	\
+	 * pointers.							\
+	 */								\
+	bits = rtree->level2bits[i];					\
+	subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -	\
+	    bits);							\
+	ret = node[subkey];						\
+	RTREE_UNLOCK(&rtree->mutex);					\
+									\
+	RTREE_GET_VALIDATE						\
+	return (ret);							\
+}
+
+#ifdef JEMALLOC_DEBUG
+#  define RTREE_LOCK(l)		malloc_mutex_lock(l)
+#  define RTREE_UNLOCK(l)	malloc_mutex_unlock(l)
+#  define RTREE_GET_VALIDATE
+RTREE_GET_GENERATE(rtree_get_locked)
+#  undef RTREE_LOCK
+#  undef RTREE_UNLOCK
+#  undef RTREE_GET_VALIDATE
+#endif
+
+#define	RTREE_LOCK(l)
+#define	RTREE_UNLOCK(l)
+#ifdef JEMALLOC_DEBUG
+   /*
+    * Suppose that it were possible for a jemalloc-allocated chunk to be
+    * munmap()ped, followed by a different allocator in another thread re-using
+    * overlapping virtual memory, all without invalidating the cached rtree
+    * value.  The result would be a false positive (the rtree would claim that
+    * jemalloc owns memory that it had actually discarded).  This scenario
+    * seems impossible, but the following assertion is a prudent sanity check.
+    */
+#  define RTREE_GET_VALIDATE						\
+	assert(rtree_get_locked(rtree, key) == ret);
+#else
+#  define RTREE_GET_VALIDATE
+#endif
+RTREE_GET_GENERATE(rtree_get)
+#undef RTREE_LOCK
+#undef RTREE_UNLOCK
+#undef RTREE_GET_VALIDATE
+
+JEMALLOC_INLINE bool
+rtree_set(rtree_t *rtree, uintptr_t key, void *val)
+{
+	uintptr_t subkey;
+	unsigned i, lshift, height, bits;
+	void **node, **child;
+
+	malloc_mutex_lock(&rtree->mutex);
+	for (i = lshift = 0, height = rtree->height, node = rtree->root;
+	    i < height - 1;
+	    i++, lshift += bits, node = child) {
+		bits = rtree->level2bits[i];
+		subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
+		    bits);
+		child = (void**)node[subkey];
+		if (child == NULL) {
+			child = (void**)base_alloc(sizeof(void *) <<
+			    rtree->level2bits[i+1]);
+			if (child == NULL) {
+				malloc_mutex_unlock(&rtree->mutex);
+				return (true);
+			}
+			memset(child, 0, sizeof(void *) <<
+			    rtree->level2bits[i+1]);
+			node[subkey] = child;
+		}
+	}
+
+	/* node is a leaf, so it contains values rather than node pointers. */
+	bits = rtree->level2bits[i];
+	subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits);
+	node[subkey] = val;
+	malloc_mutex_unlock(&rtree->mutex);
+
+	return (false);
+}
+#endif
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
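
For reference, the following is a minimal, self-contained sketch of the per-level subkey extraction performed by RTREE_GET_GENERATE and rtree_set above.  It assumes 64-bit pointers; the three 14-bit levels (which leave the low 22 key bits ignored, as with a hypothetical 4 MiB chunk size) and the sample address are illustrative values only, not jemalloc's actual rtree configuration.

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	/* Illustrative level widths: 3 levels x 14 bits = 42 significant bits. */
    	const unsigned level2bits[] = {14, 14, 14};
    	const unsigned height = sizeof(level2bits) / sizeof(unsigned);
    	uintptr_t key = (uintptr_t)0x00007f3a2c400000ULL; /* sample address */
    	unsigned i, lshift, bits;
    	uintptr_t subkey;

    	for (i = 0, lshift = 0; i < height; i++, lshift += bits) {
    		bits = level2bits[i];
    		/* Take the next `bits` most significant unconsumed key bits. */
    		subkey = (key << lshift) >> (64 - bits);
    		printf("level %u: subkey 0x%jx\n", i, (uintmax_t)subkey);
    	}
    	return (0);
    }

Each iteration consumes the next group of high-order key bits, exactly as the generated rtree_get()/rtree_set() loops do when indexing node[subkey] at successive tree levels.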
diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h
index cbf035f..3fc2080 100644
--- a/jemalloc/include/jemalloc/internal/stats.h
+++ b/jemalloc/include/jemalloc/internal/stats.h
@@ -154,7 +154,7 @@
 
 extern bool	opt_stats_print;
 
-char	*umax2s(uintmax_t x, unsigned base, char *s);
+char	*u2s(uint64_t x, unsigned base, char *s);
 #ifdef JEMALLOC_STATS
 void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque,
     const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4));
diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h
index a8be436..1ad91a9 100644
--- a/jemalloc/include/jemalloc/internal/tcache.h
+++ b/jemalloc/include/jemalloc/internal/tcache.h
@@ -17,7 +17,7 @@
 /* Number of cache slots for large size classes. */
 #define	TCACHE_NSLOTS_LARGE		20
 
-/* (1U << opt_lg_tcache_maxclass) is used to compute tcache_maxclass. */
+/* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. */
 #define	LG_TCACHE_MAXCLASS_DEFAULT	15
 
 /*
@@ -61,12 +61,25 @@
 #ifdef JEMALLOC_H_EXTERNS
 
 extern bool	opt_tcache;
-extern ssize_t	opt_lg_tcache_maxclass;
+extern ssize_t	opt_lg_tcache_max;
 extern ssize_t	opt_lg_tcache_gc_sweep;
 
 /* Map of thread-specific caches. */
+#ifndef NO_TLS
 extern __thread tcache_t	*tcache_tls
     JEMALLOC_ATTR(tls_model("initial-exec"));
+#  define TCACHE_GET()	tcache_tls
+#  define TCACHE_SET(v)	do {						\
+	tcache_tls = (tcache_t *)(v);					\
+	pthread_setspecific(tcache_tsd, (void *)(v));			\
+} while (0)
+#else
+#  define TCACHE_GET()	((tcache_t *)pthread_getspecific(tcache_tsd))
+#  define TCACHE_SET(v)	do {						\
+	pthread_setspecific(tcache_tsd, (void *)(v));			\
+} while (0)
+#endif
+extern pthread_key_t		tcache_tsd;
 
 /*
  * Number of tcache bins.  There are nbins small-object bins, plus 0 or more
@@ -122,14 +135,23 @@
 	if ((isthreaded & opt_tcache) == false)
 		return (NULL);
 
-	tcache = tcache_tls;
-	if ((uintptr_t)tcache <= (uintptr_t)1) {
+	tcache = TCACHE_GET();
+	if ((uintptr_t)tcache <= (uintptr_t)2) {
 		if (tcache == NULL) {
 			tcache = tcache_create(choose_arena());
 			if (tcache == NULL)
 				return (NULL);
-		} else
+		} else {
+			if (tcache == (void *)(uintptr_t)1) {
+				/*
+				 * Make a note that an allocator function was
+				 * called after tcache_thread_cleanup() was
+				 * called.
+				 */
+				TCACHE_SET((uintptr_t)2);
+			}
 			return (NULL);
+		}
 	}
 
 	return (tcache);
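
As a sketch of the sentinel protocol above (NULL means no cache yet, 1 means the thread's cleanup has already run, 2 records an allocation attempt after cleanup), here is a self-contained approximation; the cache_* names and the malloc(1) stand-in for tcache_create() are assumptions for illustration, not jemalloc code.

    #include <pthread.h>
    #include <stdint.h>
    #include <stdlib.h>

    static pthread_key_t	cache_tsd;
    static __thread void	*cache_tls;	/* Fast path; TSD exists for cleanup. */

    static void
    cache_thread_cleanup(void *arg)
    {

    	/* Leave sentinel 1 behind so late allocations can be detected. */
    	cache_tls = (void *)(uintptr_t)1;
    	(void)arg;
    }

    static void *
    cache_get(void)
    {
    	void *cache = cache_tls;

    	if ((uintptr_t)cache <= (uintptr_t)2) {
    		if (cache == NULL) {
    			cache = malloc(1);	/* Stand-in for tcache_create(). */
    			if (cache == NULL)
    				return (NULL);
    			cache_tls = cache;
    			pthread_setspecific(cache_tsd, cache);
    		} else {
    			if (cache == (void *)(uintptr_t)1) {
    				/* First allocation after cleanup; note it. */
    				cache_tls = (void *)(uintptr_t)2;
    				pthread_setspecific(cache_tsd,
    				    (void *)(uintptr_t)2);
    			}
    			return (NULL);
    		}
    	}
    	return (cache);
    }

    int
    main(void)
    {

    	pthread_key_create(&cache_tsd, cache_thread_cleanup);
    	return (cache_get() != NULL ? 0 : 1);
    }

Writing both the __thread variable and the TSD slot, as TCACHE_SET does, keeps the __thread read on the fast path while still registering a non-NULL TSD value, which is what causes pthreads to invoke the destructor at thread exit.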
@@ -258,9 +280,9 @@
 	} else {
 #ifdef JEMALLOC_PROF
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret);
-		size_t pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk)
-		    >> PAGE_SHIFT);
-		chunk->map[pageind].bits |= CHUNK_MAP_CLASS_MASK;
+		size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >>
+		    PAGE_SHIFT);
+		chunk->map[pageind-map_bias].bits &= ~CHUNK_MAP_CLASS_MASK;
 #endif
 		if (zero == false) {
 #ifdef JEMALLOC_FILL
@@ -299,8 +321,8 @@
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	arena = chunk->arena;
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapelm = &chunk->map[pageind];
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
+	mapelm = &chunk->map[pageind-map_bias];
 	run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
 	    (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT));
 	assert(run->magic == ARENA_RUN_MAGIC);
@@ -339,7 +361,6 @@
 	arena_chunk_t *chunk;
 	size_t pageind, binind;
 	tcache_bin_t *tbin;
-	arena_chunk_map_t *mapelm;
 
 	assert((size & PAGE_MASK) == 0);
 	assert(arena_salloc(ptr) > small_maxclass);
@@ -347,8 +368,7 @@
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	arena = chunk->arena;
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapelm = &chunk->map[pageind];
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
 	binind = nbins + (size >> PAGE_SHIFT) - 1;
 
 #ifdef JEMALLOC_FILL
diff --git a/jemalloc/include/jemalloc/internal/zone.h b/jemalloc/include/jemalloc/internal/zone.h
new file mode 100644
index 0000000..859b529
--- /dev/null
+++ b/jemalloc/include/jemalloc/internal/zone.h
@@ -0,0 +1,23 @@
+#ifndef JEMALLOC_ZONE
+#  error "This source file is for zones on Darwin (OS X)."
+#endif
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+malloc_zone_t *create_zone(void);
+void	szone2ozone(malloc_zone_t *zone);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in
index 8ef8183..4dd3981 100644
--- a/jemalloc/include/jemalloc/jemalloc.h.in
+++ b/jemalloc/include/jemalloc/jemalloc.h.in
@@ -4,6 +4,9 @@
 extern "C" {
 #endif
 
+#include <limits.h>
+#include <strings.h>
+
 #define	JEMALLOC_VERSION "@jemalloc_version@"
 #define	JEMALLOC_VERSION_MAJOR @jemalloc_version_major@
 #define	JEMALLOC_VERSION_MINOR @jemalloc_version_minor@
@@ -16,7 +19,20 @@
 #  define JEMALLOC_P(s) s
 #endif
 
-extern const char	*JEMALLOC_P(malloc_options);
+#define	ALLOCM_LG_ALIGN	((int)0x3f)
+#if LG_SIZEOF_PTR == 2
+#define	ALLOCM_ALIGN(a)	(ffs(a)-1)
+#else
+#define	ALLOCM_ALIGN(a)	((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
+#endif
+#define	ALLOCM_ZERO	((int)0x40)
+#define	ALLOCM_NO_MOVE	((int)0x80)
+
+#define	ALLOCM_SUCCESS		0
+#define	ALLOCM_ERR_OOM		1
+#define	ALLOCM_ERR_NOT_MOVED	2
+
+extern const char	*JEMALLOC_P(malloc_conf);
 extern void		(*JEMALLOC_P(malloc_message))(void *, const char *);
 
 void	*JEMALLOC_P(malloc)(size_t size) JEMALLOC_ATTR(malloc);
@@ -36,6 +52,14 @@
 int	JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp,
     size_t *oldlenp, void *newp, size_t newlen);
 
+int	JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags)
+    JEMALLOC_ATTR(nonnull(1));
+int	JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size,
+    size_t extra, int flags) JEMALLOC_ATTR(nonnull(1));
+int	JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags)
+    JEMALLOC_ATTR(nonnull(1));
+int	JEMALLOC_P(dallocm)(void *ptr, int flags) JEMALLOC_ATTR(nonnull(1));
+
 #ifdef __cplusplus
 };
 #endif
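
A hedged usage sketch of the experimental *allocm() declarations above: it assumes a build without JEMALLOC_PREFIX (so JEMALLOC_P(allocm) is exported as plain allocm()) and the installed <jemalloc/jemalloc.h> header, and the sizes and flags are arbitrary examples.

    #include <stdio.h>
    #include <stdlib.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
    	void *p;
    	size_t rsize;

    	/* 4096 bytes, 64-byte aligned (ALLOCM_ALIGN encodes lg(64) == 6), zeroed. */
    	if (allocm(&p, &rsize, 4096, ALLOCM_ALIGN(64) | ALLOCM_ZERO) !=
    	    ALLOCM_SUCCESS)
    		return (EXIT_FAILURE);
    	printf("usable size: %zu\n", rsize);

    	/*
    	 * Ask for 8192 bytes but forbid moving the object; if the allocation
    	 * cannot be resized in place, ALLOCM_ERR_NOT_MOVED is returned.
    	 */
    	if (rallocm(&p, &rsize, 8192, 0, ALLOCM_NO_MOVE) ==
    	    ALLOCM_ERR_NOT_MOVED)
    		printf("could not grow in place\n");

    	dallocm(p, 0);
    	return (EXIT_SUCCESS);
    }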
diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in
index 8b98d67..b8f3f36 100644
--- a/jemalloc/include/jemalloc/jemalloc_defs.h.in
+++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in
@@ -13,6 +13,7 @@
  * the API prefixing.
  */
 #undef JEMALLOC_PREFIX
+#undef JEMALLOC_CPREFIX
 #if (defined(JEMALLOC_PREFIX) && defined(JEMALLOC_MANGLE))
 #undef JEMALLOC_P
 #endif
@@ -31,6 +32,9 @@
 #  define JEMALLOC_ATTR(s)
 #endif
 
+/* JEMALLOC_CC_SILENCE enables code that silences spurious compiler warnings. */
+#undef JEMALLOC_CC_SILENCE
+
 /*
  * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
  * inline functions.
@@ -92,6 +96,38 @@
 /* TLS is used to map arenas and magazine caches to threads. */
 #undef NO_TLS
 
+/*
+ * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
+ * within jemalloc-owned chunks before dereferencing them.
+ */
+#undef JEMALLOC_IVSALLOC
+
+/*
+ * Define overrides for non-standard allocator-related functions if they
+ * are present on the system.
+ */
+#undef JEMALLOC_OVERRIDE_MEMALIGN
+#undef JEMALLOC_OVERRIDE_VALLOC
+
+/*
+ * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
+ */
+#undef JEMALLOC_ZONE
+#undef JEMALLOC_ZONE_VERSION
+
+/*
+ * Methods for purging unused pages differ between operating systems.
+ *
+ *   madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages,
+ *                                 such that new pages will be demand-zeroed if
+ *                                 the address region is later touched.
+ *   madvise(..., MADV_FREE) : On FreeBSD and Darwin, this marks pages as being
+ *                             unused, such that they will be discarded rather
+ *                             than swapped out.
+ */
+#undef JEMALLOC_PURGE_MADVISE_DONTNEED
+#undef JEMALLOC_PURGE_MADVISE_FREE
+
 /* sizeof(void *) == 2^LG_SIZEOF_PTR. */
 #undef LG_SIZEOF_PTR
 
diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c
index ee859fc..3d4f888 100644
--- a/jemalloc/src/arena.c
+++ b/jemalloc/src/arena.c
@@ -165,7 +165,7 @@
 static void	arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk);
 static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool large,
     bool zero);
-static void	arena_purge(arena_t *arena);
+static void	arena_purge(arena_t *arena, bool all);
 static void	arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty);
 static void	arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk,
     arena_run_t *run, size_t oldsize, size_t newsize);
@@ -174,16 +174,18 @@
 static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
 static void	*arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
 static size_t	arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
+static void	arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run,
+    arena_bin_t *bin);
 static void	arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk,
     arena_run_t *run, arena_bin_t *bin);
+static void	arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk,
+    arena_run_t *run, arena_bin_t *bin);
 static void	arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
-    void *ptr, size_t size, size_t oldsize);
+    void *ptr, size_t oldsize, size_t size);
 static bool	arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk,
-    void *ptr, size_t size, size_t oldsize);
-static bool	arena_ralloc_large(void *ptr, size_t size, size_t oldsize);
-#ifdef JEMALLOC_TINY
-static size_t	pow2_ceil(size_t x);
-#endif
+    void *ptr, size_t oldsize, size_t size, size_t extra, bool zero);
+static bool	arena_ralloc_large(void *ptr, size_t oldsize, size_t size,
+    size_t extra, bool zero);
 static bool	small_size2bin_init(void);
 #ifdef JEMALLOC_DEBUG
 static void	small_size2bin_validate(void);
@@ -281,12 +283,33 @@
 	assert(((uintptr_t)ptr - ((uintptr_t)run +
 	    (uintptr_t)run->bin->reg0_offset)) % (uintptr_t)run->bin->reg_size
 	    == 0);
+	/*
+	 * Freeing a pointer lower than region zero can cause assertion
+	 * failure.
+	 */
+	assert((uintptr_t)ptr >= (uintptr_t)run +
+	    (uintptr_t)run->bin->reg0_offset);
+	/*
+	 * Freeing a pointer past the run's frontier can cause assertion
+	 * failure.
+	 */
+	assert((uintptr_t)ptr < (uintptr_t)run->next);
 
 	*(void **)ptr = run->avail;
 	run->avail = ptr;
 	run->nfree++;
 }
 
+#ifdef JEMALLOC_DEBUG
+static inline void
+arena_chunk_validate_zeroed(arena_chunk_t *chunk, size_t run_ind)
+{
+	size_t *p = (size_t *)((uintptr_t)chunk + (run_ind << PAGE_SHIFT));
+	for (size_t i = 0; i < PAGE_SIZE / sizeof(size_t); i++)
+		assert(p[i] == 0);
+}
+#endif
+
 static void
 arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
     bool zero)
@@ -300,39 +323,40 @@
 	old_ndirty = chunk->ndirty;
 	run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk)
 	    >> PAGE_SHIFT);
-	flag_dirty = chunk->map[run_ind].bits & CHUNK_MAP_DIRTY;
+	flag_dirty = chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY;
 	runs_avail = (flag_dirty != 0) ? &arena->runs_avail_dirty :
 	    &arena->runs_avail_clean;
-	total_pages = (chunk->map[run_ind].bits & ~PAGE_MASK) >>
+	total_pages = (chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) >>
 	    PAGE_SHIFT;
-	assert((chunk->map[run_ind+total_pages-1].bits & CHUNK_MAP_DIRTY) ==
-	    flag_dirty);
+	assert((chunk->map[run_ind+total_pages-1-map_bias].bits &
+	    CHUNK_MAP_DIRTY) == flag_dirty);
 	need_pages = (size >> PAGE_SHIFT);
 	assert(need_pages > 0);
 	assert(need_pages <= total_pages);
 	rem_pages = total_pages - need_pages;
 
-	arena_avail_tree_remove(runs_avail, &chunk->map[run_ind]);
+	arena_avail_tree_remove(runs_avail, &chunk->map[run_ind-map_bias]);
 	arena->nactive += need_pages;
 
 	/* Keep track of trailing unused pages for later use. */
 	if (rem_pages > 0) {
 		if (flag_dirty != 0) {
-			chunk->map[run_ind+need_pages].bits = (rem_pages <<
-			    PAGE_SHIFT) | CHUNK_MAP_DIRTY;
-			chunk->map[run_ind+total_pages-1].bits = (rem_pages <<
-			    PAGE_SHIFT) | CHUNK_MAP_DIRTY;
+			chunk->map[run_ind+need_pages-map_bias].bits =
+			    (rem_pages << PAGE_SHIFT) | CHUNK_MAP_DIRTY;
+			chunk->map[run_ind+total_pages-1-map_bias].bits =
+			    (rem_pages << PAGE_SHIFT) | CHUNK_MAP_DIRTY;
 		} else {
-			chunk->map[run_ind+need_pages].bits = (rem_pages <<
-			    PAGE_SHIFT) | (chunk->map[run_ind+need_pages].bits &
-			    CHUNK_MAP_ZEROED);
-			chunk->map[run_ind+total_pages-1].bits = (rem_pages <<
-			    PAGE_SHIFT) |
-			    (chunk->map[run_ind+total_pages-1].bits &
-			    CHUNK_MAP_ZEROED);
+			chunk->map[run_ind+need_pages-map_bias].bits =
+			    (rem_pages << PAGE_SHIFT) |
+			    (chunk->map[run_ind+need_pages-map_bias].bits &
+			    CHUNK_MAP_UNZEROED);
+			chunk->map[run_ind+total_pages-1-map_bias].bits =
+			    (rem_pages << PAGE_SHIFT) |
+			    (chunk->map[run_ind+total_pages-1-map_bias].bits &
+			    CHUNK_MAP_UNZEROED);
 		}
 		arena_avail_tree_insert(runs_avail,
-		    &chunk->map[run_ind+need_pages]);
+		    &chunk->map[run_ind+need_pages-map_bias]);
 	}
 
 	/* Update dirty page accounting. */
@@ -353,13 +377,19 @@
 				 * zeroed (i.e. never before touched).
 				 */
 				for (i = 0; i < need_pages; i++) {
-					if ((chunk->map[run_ind + i].bits &
-					    CHUNK_MAP_ZEROED) == 0) {
+					if ((chunk->map[run_ind+i-map_bias].bits
+					    & CHUNK_MAP_UNZEROED) != 0) {
 						memset((void *)((uintptr_t)
-						    chunk + ((run_ind + i) <<
+						    chunk + ((run_ind+i) <<
 						    PAGE_SHIFT)), 0,
 						    PAGE_SIZE);
 					}
+#ifdef JEMALLOC_DEBUG
+					else {
+						arena_chunk_validate_zeroed(
+						    chunk, run_ind+i);
+					}
+#endif
 				}
 			} else {
 				/*
@@ -376,27 +406,54 @@
 		 * Set the last element first, in case the run only contains one
 		 * page (i.e. both statements set the same element).
 		 */
-		chunk->map[run_ind+need_pages-1].bits = CHUNK_MAP_LARGE |
-		    CHUNK_MAP_ALLOCATED | flag_dirty;
-		chunk->map[run_ind].bits = size | CHUNK_MAP_LARGE |
-#ifdef JEMALLOC_PROF
-		    CHUNK_MAP_CLASS_MASK |
-#endif
-		    CHUNK_MAP_ALLOCATED | flag_dirty;
+		chunk->map[run_ind+need_pages-1-map_bias].bits =
+		    CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | flag_dirty;
+		chunk->map[run_ind-map_bias].bits = size | flag_dirty |
+		    CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
 	} else {
 		assert(zero == false);
 		/*
-		 * Propagate the dirty flag to the allocated small run, so that
-		 * arena_dalloc_bin_run() has the ability to conditionally trim
-		 * clean pages.
+		 * Propagate the dirty and unzeroed flags to the allocated
+		 * small run, so that arena_dalloc_bin_run() has the ability to
+		 * conditionally trim clean pages.
 		 */
-		chunk->map[run_ind].bits = CHUNK_MAP_ALLOCATED | flag_dirty;
+		chunk->map[run_ind-map_bias].bits =
+		    (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) |
+		    CHUNK_MAP_ALLOCATED | flag_dirty;
+#ifdef JEMALLOC_DEBUG
+		/*
+		 * The first page will always be dirtied during small run
+		 * initialization, so a validation failure here would not
+		 * actually cause an observable failure.
+		 */
+		if (flag_dirty == 0 &&
+		    (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED)
+		    == 0)
+			arena_chunk_validate_zeroed(chunk, run_ind);
+#endif
 		for (i = 1; i < need_pages - 1; i++) {
-			chunk->map[run_ind + i].bits = (i << PAGE_SHIFT)
-			    | CHUNK_MAP_ALLOCATED;
+			chunk->map[run_ind+i-map_bias].bits = (i << PAGE_SHIFT)
+			    | (chunk->map[run_ind+i-map_bias].bits &
+			    CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED;
+#ifdef JEMALLOC_DEBUG
+			if (flag_dirty == 0 &&
+			    (chunk->map[run_ind+i-map_bias].bits &
+			    CHUNK_MAP_UNZEROED) == 0)
+				arena_chunk_validate_zeroed(chunk, run_ind+i);
+#endif
 		}
-		chunk->map[run_ind + need_pages - 1].bits = ((need_pages - 1) <<
-		    PAGE_SHIFT) | CHUNK_MAP_ALLOCATED | flag_dirty;
+		chunk->map[run_ind+need_pages-1-map_bias].bits = ((need_pages
+		    - 1) << PAGE_SHIFT) |
+		    (chunk->map[run_ind+need_pages-1-map_bias].bits &
+		    CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED | flag_dirty;
+#ifdef JEMALLOC_DEBUG
+		if (flag_dirty == 0 &&
+		    (chunk->map[run_ind+need_pages-1-map_bias].bits &
+		    CHUNK_MAP_UNZEROED) == 0) {
+			arena_chunk_validate_zeroed(chunk,
+			    run_ind+need_pages-1);
+		}
+#endif
 	}
 }
 
@@ -413,20 +470,24 @@
 		arena->spare = NULL;
 
 		/* Insert the run into the appropriate runs_avail_* tree. */
-		if ((chunk->map[arena_chunk_header_npages].bits &
-		    CHUNK_MAP_DIRTY) == 0)
+		if ((chunk->map[0].bits & CHUNK_MAP_DIRTY) == 0)
 			runs_avail = &arena->runs_avail_clean;
 		else
 			runs_avail = &arena->runs_avail_dirty;
-		arena_avail_tree_insert(runs_avail,
-		    &chunk->map[arena_chunk_header_npages]);
+		assert((chunk->map[0].bits & ~PAGE_MASK) == arena_maxclass);
+		assert((chunk->map[chunk_npages-1-map_bias].bits & ~PAGE_MASK)
+		    == arena_maxclass);
+		assert((chunk->map[0].bits & CHUNK_MAP_DIRTY) ==
+		    (chunk->map[chunk_npages-1-map_bias].bits &
+		    CHUNK_MAP_DIRTY));
+		arena_avail_tree_insert(runs_avail, &chunk->map[0]);
 	} else {
 		bool zero;
-		size_t zeroed;
+		size_t unzeroed;
 
 		zero = false;
 		malloc_mutex_unlock(&arena->lock);
-		chunk = (arena_chunk_t *)chunk_alloc(chunksize, &zero);
+		chunk = (arena_chunk_t *)chunk_alloc(chunksize, false, &zero);
 		malloc_mutex_lock(&arena->lock);
 		if (chunk == NULL)
 			return (NULL);
@@ -449,17 +510,28 @@
 		 * Mark the pages as zeroed iff chunk_alloc() returned a zeroed
 		 * chunk.
 		 */
-		zeroed = zero ? CHUNK_MAP_ZEROED : 0;
-		for (i = 0; i < arena_chunk_header_npages; i++)
-			chunk->map[i].bits = 0;
-		chunk->map[i].bits = arena_maxclass | zeroed;
-		for (i++; i < chunk_npages-1; i++)
-			chunk->map[i].bits = zeroed;
-		chunk->map[chunk_npages-1].bits = arena_maxclass | zeroed;
+		unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED;
+		chunk->map[0].bits = arena_maxclass | unzeroed;
+		/*
+		 * The internal page map entries only need to be initialized
+		 * when the chunk is not zeroed.
+		 */
+		if (zero == false) {
+			for (i = map_bias+1; i < chunk_npages-1; i++)
+				chunk->map[i-map_bias].bits = unzeroed;
+		}
+#ifdef JEMALLOC_DEBUG
+		else {
+			for (i = map_bias+1; i < chunk_npages-1; i++)
+				assert(chunk->map[i-map_bias].bits == unzeroed);
+		}
+#endif
+		chunk->map[chunk_npages-1-map_bias].bits = arena_maxclass |
+		    unzeroed;
 
 		/* Insert the run into the runs_avail_clean tree. */
 		arena_avail_tree_insert(&arena->runs_avail_clean,
-		    &chunk->map[arena_chunk_header_npages]);
+		    &chunk->map[0]);
 	}
 
 	return (chunk);
@@ -474,13 +546,11 @@
 	 * Remove run from the appropriate runs_avail_* tree, so that the arena
 	 * does not use it.
 	 */
-	if ((chunk->map[arena_chunk_header_npages].bits &
-	    CHUNK_MAP_DIRTY) == 0)
+	if ((chunk->map[0].bits & CHUNK_MAP_DIRTY) == 0)
 		runs_avail = &arena->runs_avail_clean;
 	else
 		runs_avail = &arena->runs_avail_dirty;
-	arena_avail_tree_remove(runs_avail,
-	    &chunk->map[arena_chunk_header_npages]);
+	arena_avail_tree_remove(runs_avail, &chunk->map[0]);
 
 	if (arena->spare != NULL) {
 		arena_chunk_t *spare = arena->spare;
@@ -516,8 +586,9 @@
 	mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key);
 	if (mapelm != NULL) {
 		arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
-		size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map)
-		    / sizeof(arena_chunk_map_t);
+		size_t pageind = (((uintptr_t)mapelm -
+		    (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
+		    + map_bias;
 
 		run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
 		    PAGE_SHIFT));
@@ -527,8 +598,9 @@
 	mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key);
 	if (mapelm != NULL) {
 		arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
-		size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map)
-		    / sizeof(arena_chunk_map_t);
+		size_t pageind = (((uintptr_t)mapelm -
+		    (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
+		    + map_bias;
 
 		run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
 		    PAGE_SHIFT));
@@ -541,8 +613,8 @@
 	 */
 	chunk = arena_chunk_alloc(arena);
 	if (chunk != NULL) {
-		run = (arena_run_t *)((uintptr_t)chunk +
-		    (arena_chunk_header_npages << PAGE_SHIFT));
+		run = (arena_run_t *)((uintptr_t)chunk + (map_bias <<
+		    PAGE_SHIFT));
 		arena_run_split(arena, run, size, large, zero);
 		return (run);
 	}
@@ -555,8 +627,9 @@
 	mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key);
 	if (mapelm != NULL) {
 		arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
-		size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map)
-		    / sizeof(arena_chunk_map_t);
+		size_t pageind = (((uintptr_t)mapelm -
+		    (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
+		    + map_bias;
 
 		run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
 		    PAGE_SHIFT));
@@ -566,8 +639,9 @@
 	mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key);
 	if (mapelm != NULL) {
 		arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
-		size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map)
-		    / sizeof(arena_chunk_map_t);
+		size_t pageind = (((uintptr_t)mapelm -
+		    (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
+		    + map_bias;
 
 		run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
 		    PAGE_SHIFT));
@@ -587,7 +661,7 @@
 	    (arena->ndirty - arena->npurgatory) > chunk_npages &&
 	    (arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty -
 	    arena->npurgatory))
-		arena_purge(arena);
+		arena_purge(arena, false);
 }
 
 static inline void
@@ -595,7 +669,7 @@
 {
 	ql_head(arena_chunk_map_t) mapelms;
 	arena_chunk_map_t *mapelm;
-	size_t pageind, flag_zeroed;
+	size_t pageind, flag_unzeroed;
 #ifdef JEMALLOC_DEBUG
 	size_t ndirty;
 #endif
@@ -605,11 +679,19 @@
 
 	ql_new(&mapelms);
 
-	flag_zeroed =
-#ifdef JEMALLOC_SWAP
-	    swap_enabled ? 0 :
+	flag_unzeroed =
+#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED
+   /*
+    * madvise(..., MADV_DONTNEED) results in zero-filled pages for anonymous
+    * mappings, but not for file-backed mappings.
+    */
+#  ifdef JEMALLOC_SWAP
+	    swap_enabled ? CHUNK_MAP_UNZEROED :
+#  endif
+	    0;
+#else
+	    CHUNK_MAP_UNZEROED;
 #endif
-	    CHUNK_MAP_ZEROED;
 
 	/*
 	 * If chunk is the spare, temporarily re-allocate it, 1) so that its
@@ -627,14 +709,13 @@
 	 * run.
 	 */
 	if (chunk == arena->spare) {
-		assert((chunk->map[arena_chunk_header_npages].bits &
-		    CHUNK_MAP_DIRTY) != 0);
+		assert((chunk->map[0].bits & CHUNK_MAP_DIRTY) != 0);
 		arena_chunk_alloc(arena);
 	}
 
 	/* Temporarily allocate all free dirty runs within chunk. */
-	for (pageind = arena_chunk_header_npages; pageind < chunk_npages;) {
-		mapelm = &chunk->map[pageind];
+	for (pageind = map_bias; pageind < chunk_npages;) {
+		mapelm = &chunk->map[pageind-map_bias];
 		if ((mapelm->bits & CHUNK_MAP_ALLOCATED) == 0) {
 			size_t npages;
 
@@ -646,25 +727,22 @@
 				arena_avail_tree_remove(
 				    &arena->runs_avail_dirty, mapelm);
 
+				mapelm->bits = (npages << PAGE_SHIFT) |
+				    flag_unzeroed | CHUNK_MAP_LARGE |
+				    CHUNK_MAP_ALLOCATED;
 				/*
 				 * Update internal elements in the page map, so
-				 * that CHUNK_MAP_ZEROED is properly set.
-				 * madvise(..., MADV_DONTNEED) results in
-				 * zero-filled pages for anonymous mappings,
-				 * but not for file-backed mappings.
+				 * that CHUNK_MAP_UNZEROED is properly set.
 				 */
-				mapelm->bits = (npages << PAGE_SHIFT) |
-				    CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED |
-				    flag_zeroed;
 				for (i = 1; i < npages - 1; i++) {
-					chunk->map[pageind + i].bits =
-					    flag_zeroed;
+					chunk->map[pageind+i-map_bias].bits =
+					    flag_unzeroed;
 				}
 				if (npages > 1) {
-					chunk->map[pageind + npages - 1].bits =
-					(npages << PAGE_SHIFT) |
-					CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED |
-					flag_zeroed;
+					chunk->map[
+					    pageind+npages-1-map_bias].bits =
+					    flag_unzeroed | CHUNK_MAP_LARGE |
+					    CHUNK_MAP_ALLOCATED;
 				}
 
 				arena->nactive += npages;
@@ -706,8 +784,8 @@
 	nmadvise = 0;
 #endif
 	ql_foreach(mapelm, &mapelms, u.ql_link) {
-		size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) /
-		    sizeof(arena_chunk_map_t);
+		size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
+		    sizeof(arena_chunk_map_t)) + map_bias;
 		size_t npages = mapelm->bits >> PAGE_SHIFT;
 
 		assert(pageind + npages <= chunk_npages);
@@ -715,8 +793,17 @@
 		assert(ndirty >= npages);
 		ndirty -= npages;
 #endif
+
+#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED
 		madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)),
 		    (npages << PAGE_SHIFT), MADV_DONTNEED);
+#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
+		madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)),
+		    (npages << PAGE_SHIFT), MADV_FREE);
+#else
+#  error "No method defined for purging unused dirty pages."
+#endif
+
 #ifdef JEMALLOC_STATS
 		nmadvise++;
 #endif
@@ -732,8 +819,8 @@
 	/* Deallocate runs. */
 	for (mapelm = ql_first(&mapelms); mapelm != NULL;
 	    mapelm = ql_first(&mapelms)) {
-		size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) /
-		    sizeof(arena_chunk_map_t);
+		size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
+		    sizeof(arena_chunk_map_t)) + map_bias;
 		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
 		    (uintptr_t)(pageind << PAGE_SHIFT));
 
@@ -743,7 +830,7 @@
 }
 
 static void
-arena_purge(arena_t *arena)
+arena_purge(arena_t *arena, bool all)
 {
 	arena_chunk_t *chunk;
 	size_t npurgatory;
@@ -757,8 +844,8 @@
 	assert(ndirty == arena->ndirty);
 #endif
 	assert(arena->ndirty > arena->npurgatory);
-	assert(arena->ndirty > chunk_npages);
-	assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty);
+	assert(arena->ndirty > chunk_npages || all);
+	assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty || all);
 
 #ifdef JEMALLOC_STATS
 	arena->stats.npurge++;
@@ -769,8 +856,9 @@
 	 * purge, and add the result to arena->npurgatory.  This will keep
 	 * multiple threads from racing to reduce ndirty below the threshold.
 	 */
-	npurgatory = (arena->ndirty - arena->npurgatory) - (arena->nactive >>
-	    opt_lg_dirty_mult);
+	npurgatory = arena->ndirty - arena->npurgatory;
+	if (all == false)
+		npurgatory -= arena->nactive >> opt_lg_dirty_mult;
 	arena->npurgatory += npurgatory;
 
 	while (npurgatory > 0) {
@@ -826,6 +914,15 @@
 	}
 }
 
+void
+arena_purge_all(arena_t *arena)
+{
+
+	malloc_mutex_lock(&arena->lock);
+	arena_purge(arena, true);
+	malloc_mutex_unlock(&arena->lock);
+}
+
 static void
 arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty)
 {
@@ -836,11 +933,18 @@
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
 	run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk)
 	    >> PAGE_SHIFT);
-	assert(run_ind >= arena_chunk_header_npages);
+	assert(run_ind >= map_bias);
 	assert(run_ind < chunk_npages);
-	if ((chunk->map[run_ind].bits & CHUNK_MAP_LARGE) != 0)
-		size = chunk->map[run_ind].bits & ~PAGE_MASK;
-	else
+	if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_LARGE) != 0) {
+		size = chunk->map[run_ind-map_bias].bits & ~PAGE_MASK;
+		assert(size == PAGE_SIZE ||
+		    (chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits &
+		    ~PAGE_MASK) == 0);
+		assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits &
+		    CHUNK_MAP_LARGE) != 0);
+		assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits &
+		    CHUNK_MAP_ALLOCATED) != 0);
+	} else
 		size = run->bin->run_size;
 	run_pages = (size >> PAGE_SHIFT);
 	arena->nactive -= run_pages;
@@ -849,7 +953,7 @@
 	 * The run is dirty if the caller claims to have dirtied it, as well as
 	 * if it was already dirty before being allocated.
 	 */
-	if ((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) != 0)
+	if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) != 0)
 		dirty = true;
 	flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0;
 	runs_avail = dirty ? &arena->runs_avail_dirty :
@@ -857,72 +961,91 @@
 
 	/* Mark pages as unallocated in the chunk map. */
 	if (dirty) {
-		chunk->map[run_ind].bits = size | flag_dirty;
-		chunk->map[run_ind+run_pages-1].bits = size | flag_dirty;
+		chunk->map[run_ind-map_bias].bits = size | CHUNK_MAP_DIRTY;
+		chunk->map[run_ind+run_pages-1-map_bias].bits = size |
+		    CHUNK_MAP_DIRTY;
 
 		chunk->ndirty += run_pages;
 		arena->ndirty += run_pages;
 	} else {
-		chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits &
-		    CHUNK_MAP_ZEROED);
-		chunk->map[run_ind+run_pages-1].bits = size |
-		    (chunk->map[run_ind+run_pages-1].bits & CHUNK_MAP_ZEROED);
+		chunk->map[run_ind-map_bias].bits = size |
+		    (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED);
+		chunk->map[run_ind+run_pages-1-map_bias].bits = size |
+		    (chunk->map[run_ind+run_pages-1-map_bias].bits &
+		    CHUNK_MAP_UNZEROED);
 	}
 
 	/* Try to coalesce forward. */
 	if (run_ind + run_pages < chunk_npages &&
-	    (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_ALLOCATED) == 0 &&
-	    (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_DIRTY) ==
-	    flag_dirty) {
-		size_t nrun_size = chunk->map[run_ind+run_pages].bits &
+	    (chunk->map[run_ind+run_pages-map_bias].bits & CHUNK_MAP_ALLOCATED)
+	    == 0 && (chunk->map[run_ind+run_pages-map_bias].bits &
+	    CHUNK_MAP_DIRTY) == flag_dirty) {
+		size_t nrun_size = chunk->map[run_ind+run_pages-map_bias].bits &
 		    ~PAGE_MASK;
+		size_t nrun_pages = nrun_size >> PAGE_SHIFT;
 
 		/*
 		 * Remove successor from runs_avail; the coalesced run is
 		 * inserted later.
 		 */
+		assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits
+		    & ~PAGE_MASK) == nrun_size);
+		assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits
+		    & CHUNK_MAP_ALLOCATED) == 0);
+		assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits
+		    & CHUNK_MAP_DIRTY) == flag_dirty);
 		arena_avail_tree_remove(runs_avail,
-		    &chunk->map[run_ind+run_pages]);
+		    &chunk->map[run_ind+run_pages-map_bias]);
 
 		size += nrun_size;
-		run_pages = size >> PAGE_SHIFT;
+		run_pages += nrun_pages;
 
-		assert((chunk->map[run_ind+run_pages-1].bits & ~PAGE_MASK)
-		    == nrun_size);
-		chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits &
-		    CHUNK_MAP_FLAGS_MASK);
-		chunk->map[run_ind+run_pages-1].bits = size |
-		    (chunk->map[run_ind+run_pages-1].bits &
+		chunk->map[run_ind-map_bias].bits = size |
+		    (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK);
+		chunk->map[run_ind+run_pages-1-map_bias].bits = size |
+		    (chunk->map[run_ind+run_pages-1-map_bias].bits &
 		    CHUNK_MAP_FLAGS_MASK);
 	}
 
 	/* Try to coalesce backward. */
-	if (run_ind > arena_chunk_header_npages && (chunk->map[run_ind-1].bits &
-	    CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[run_ind-1].bits &
+	if (run_ind > map_bias && (chunk->map[run_ind-1-map_bias].bits &
+	    CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[run_ind-1-map_bias].bits &
 	    CHUNK_MAP_DIRTY) == flag_dirty) {
-		size_t prun_size = chunk->map[run_ind-1].bits & ~PAGE_MASK;
+		size_t prun_size = chunk->map[run_ind-1-map_bias].bits &
+		    ~PAGE_MASK;
+		size_t prun_pages = prun_size >> PAGE_SHIFT;
 
-		run_ind -= prun_size >> PAGE_SHIFT;
+		run_ind -= prun_pages;
 
 		/*
 		 * Remove predecessor from runs_avail; the coalesced run is
 		 * inserted later.
 		 */
-		arena_avail_tree_remove(runs_avail, &chunk->map[run_ind]);
+		assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK)
+		    == prun_size);
+		assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_ALLOCATED)
+		    == 0);
+		assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY)
+		    == flag_dirty);
+		arena_avail_tree_remove(runs_avail,
+		    &chunk->map[run_ind-map_bias]);
 
 		size += prun_size;
-		run_pages = size >> PAGE_SHIFT;
+		run_pages += prun_pages;
 
-		assert((chunk->map[run_ind].bits & ~PAGE_MASK) == prun_size);
-		chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits &
-		    CHUNK_MAP_FLAGS_MASK);
-		chunk->map[run_ind+run_pages-1].bits = size |
-		    (chunk->map[run_ind+run_pages-1].bits &
+		chunk->map[run_ind-map_bias].bits = size |
+		    (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK);
+		chunk->map[run_ind+run_pages-1-map_bias].bits = size |
+		    (chunk->map[run_ind+run_pages-1-map_bias].bits &
 		    CHUNK_MAP_FLAGS_MASK);
 	}
 
 	/* Insert into runs_avail, now that coalescing is complete. */
-	arena_avail_tree_insert(runs_avail, &chunk->map[run_ind]);
+	assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) ==
+	    (chunk->map[run_ind+run_pages-1-map_bias].bits & ~PAGE_MASK));
+	assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) ==
+	    (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_DIRTY));
+	arena_avail_tree_insert(runs_avail, &chunk->map[run_ind-map_bias]);
 
 	if (dirty) {
 		/*
@@ -941,8 +1064,8 @@
 	 * manipulation checks whether the first run is unallocated and extends
 	 * to the end of the chunk.
 	 */
-	if ((chunk->map[arena_chunk_header_npages].bits & (~PAGE_MASK |
-	    CHUNK_MAP_ALLOCATED)) == arena_maxclass)
+	if ((chunk->map[0].bits & (~PAGE_MASK | CHUNK_MAP_ALLOCATED)) ==
+	    arena_maxclass)
 		arena_chunk_dealloc(arena, chunk);
 
 	/*
@@ -962,18 +1085,40 @@
 {
 	size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT;
 	size_t head_npages = (oldsize - newsize) >> PAGE_SHIFT;
-	size_t flags = chunk->map[pageind].bits & CHUNK_MAP_FLAGS_MASK;
+	size_t flag_dirty = chunk->map[pageind-map_bias].bits & CHUNK_MAP_DIRTY;
 
 	assert(oldsize > newsize);
 
 	/*
 	 * Update the chunk map so that arena_run_dalloc() can treat the
-	 * leading run as separately allocated.
+	 * leading run as separately allocated.  Set the last element of each
+	 * run first, in case of single-page runs.
 	 */
-	assert(chunk->map[pageind].bits & CHUNK_MAP_LARGE);
-	assert(chunk->map[pageind].bits & CHUNK_MAP_ALLOCATED);
-	chunk->map[pageind].bits = (oldsize - newsize) | flags;
-	chunk->map[pageind+head_npages].bits = newsize | flags;
+	assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE) != 0);
+	assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0);
+	chunk->map[pageind+head_npages-1-map_bias].bits = flag_dirty |
+	    (chunk->map[pageind+head_npages-1-map_bias].bits &
+	    CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+	chunk->map[pageind-map_bias].bits = (oldsize - newsize)
+	    | flag_dirty | (chunk->map[pageind-map_bias].bits &
+	    CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+
+#ifdef JEMALLOC_DEBUG
+	{
+		size_t tail_npages = newsize >> PAGE_SHIFT;
+		assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias]
+		    .bits & ~PAGE_MASK) == 0);
+		assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias]
+		    .bits & CHUNK_MAP_DIRTY) == flag_dirty);
+		assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias]
+		    .bits & CHUNK_MAP_LARGE) != 0);
+		assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias]
+		    .bits & CHUNK_MAP_ALLOCATED) != 0);
+	}
+#endif
+	chunk->map[pageind+head_npages-map_bias].bits = newsize | flag_dirty |
+	    (chunk->map[pageind+head_npages-map_bias].bits &
+	    CHUNK_MAP_FLAGS_MASK) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
 
 	arena_run_dalloc(arena, run, false);
 }
@@ -983,20 +1128,40 @@
     size_t oldsize, size_t newsize, bool dirty)
 {
 	size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT;
-	size_t npages = newsize >> PAGE_SHIFT;
-	size_t flags = chunk->map[pageind].bits & CHUNK_MAP_FLAGS_MASK;
+	size_t head_npages = newsize >> PAGE_SHIFT;
+	size_t tail_npages = (oldsize - newsize) >> PAGE_SHIFT;
+	size_t flag_dirty = chunk->map[pageind-map_bias].bits &
+	    CHUNK_MAP_DIRTY;
 
 	assert(oldsize > newsize);
 
 	/*
 	 * Update the chunk map so that arena_run_dalloc() can treat the
-	 * trailing run as separately allocated.
+	 * trailing run as separately allocated.  Set the last element of each
+	 * run first, in case of single-page runs.
 	 */
-	assert(chunk->map[pageind].bits & CHUNK_MAP_LARGE);
-	assert(chunk->map[pageind].bits & CHUNK_MAP_ALLOCATED);
-	chunk->map[pageind].bits = newsize | flags;
-	chunk->map[pageind+npages-1].bits = newsize | flags;
-	chunk->map[pageind+npages].bits = (oldsize - newsize) | flags;
+	assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE) != 0);
+	assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0);
+	chunk->map[pageind+head_npages-1-map_bias].bits = flag_dirty |
+	    (chunk->map[pageind+head_npages-1-map_bias].bits &
+	    CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+	chunk->map[pageind-map_bias].bits = newsize | flag_dirty |
+	    (chunk->map[pageind-map_bias].bits & CHUNK_MAP_UNZEROED) |
+	    CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+
+	assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits &
+	    ~PAGE_MASK) == 0);
+	assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits &
+	    CHUNK_MAP_LARGE) != 0);
+	assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits &
+	    CHUNK_MAP_ALLOCATED) != 0);
+	chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits =
+	    flag_dirty |
+	    (chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits &
+	    CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+	chunk->map[pageind+head_npages-map_bias].bits = (oldsize - newsize) |
+	    flag_dirty | (chunk->map[pageind+head_npages-map_bias].bits &
+	    CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
 
 	arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize),
 	    dirty);
@@ -1018,8 +1183,8 @@
 		arena_run_tree_remove(&bin->runs, mapelm);
 
 		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm);
-		pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
-		    sizeof(arena_chunk_map_t));
+		pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) /
+		    sizeof(arena_chunk_map_t))) + map_bias;
 		run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
 		    (mapelm->bits >> PAGE_SHIFT))
 		    << PAGE_SHIFT));
@@ -1039,7 +1204,7 @@
 		/* Initialize run internals. */
 		run->bin = bin;
 		run->avail = NULL;
-		run->next = (void *)(((uintptr_t)run) +
+		run->next = (void *)((uintptr_t)run +
 		    (uintptr_t)bin->reg0_offset);
 		run->nfree = bin->nregs;
 #ifdef JEMALLOC_DEBUG
@@ -1061,7 +1226,7 @@
 
 	/*
 	 * arena_run_alloc() failed, but another thread may have made
-	 * sufficient memory available while this one dopped bin->lock above,
+	 * sufficient memory available while this one dropped bin->lock above,
 	 * so search one more time.
 	 */
 	mapelm = arena_run_tree_first(&bin->runs);
@@ -1073,8 +1238,8 @@
 		arena_run_tree_remove(&bin->runs, mapelm);
 
 		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm);
-		pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
-		    sizeof(arena_chunk_map_t));
+		pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) /
+		    sizeof(arena_chunk_map_t))) + map_bias;
 		run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
 		    (mapelm->bits >> PAGE_SHIFT))
 		    << PAGE_SHIFT));
@@ -1105,11 +1270,21 @@
 		assert(bin->runcur->nfree > 0);
 		ret = arena_run_reg_alloc(bin->runcur, bin);
 		if (run != NULL) {
-			malloc_mutex_unlock(&bin->lock);
-			malloc_mutex_lock(&arena->lock);
-			arena_run_dalloc(arena, run, false);
-			malloc_mutex_unlock(&arena->lock);
-			malloc_mutex_lock(&bin->lock);
+			arena_chunk_t *chunk;
+
+			/*
+			 * arena_run_alloc() may have allocated run, or it may
+			 * have pulled it from the bin's run tree.  Therefore
+			 * it is unsafe to make any assumptions about how run
+			 * has previously been used, and arena_bin_lower_run()
+			 * must be called, as if a region were just deallocated
+			 * from the run.
+			 */
+			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
+			if (run->nfree == bin->nregs)
+				arena_dalloc_bin_run(arena, chunk, run, bin);
+			else
+				arena_bin_lower_run(arena, chunk, run, bin);
 		}
 		return (ret);
 	}
@@ -1424,17 +1599,19 @@
 
 /* Only handles large allocations that require more than page alignment. */
 void *
-arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)
+arena_palloc(arena_t *arena, size_t size, size_t alloc_size, size_t alignment,
+    bool zero)
 {
 	void *ret;
 	size_t offset;
 	arena_chunk_t *chunk;
 
 	assert((size & PAGE_MASK) == 0);
-	assert((alignment & PAGE_MASK) == 0);
+
+	alignment = PAGE_CEILING(alignment);
 
 	malloc_mutex_lock(&arena->lock);
-	ret = (void *)arena_run_alloc(arena, alloc_size, true, false);
+	ret = (void *)arena_run_alloc(arena, alloc_size, true, zero);
 	if (ret == NULL) {
 		malloc_mutex_unlock(&arena->lock);
 		return (NULL);
@@ -1482,10 +1659,12 @@
 	malloc_mutex_unlock(&arena->lock);
 
 #ifdef JEMALLOC_FILL
-	if (opt_junk)
-		memset(ret, 0xa5, size);
-	else if (opt_zero)
-		memset(ret, 0, size);
+	if (zero == false) {
+		if (opt_junk)
+			memset(ret, 0xa5, size);
+		else if (opt_zero)
+			memset(ret, 0, size);
+	}
 #endif
 	return (ret);
 }
@@ -1502,8 +1681,8 @@
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapbits = chunk->map[pageind].bits;
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
+	mapbits = chunk->map[pageind-map_bias].bits;
 	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
 	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
 		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
@@ -1535,11 +1714,11 @@
 	assert(isalloc(ptr) == PAGE_SIZE);
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
 	binind = small_size2bin[size];
 	assert(binind < nbins);
-	chunk->map[pageind].bits = (chunk->map[pageind].bits &
-	    ~CHUNK_MAP_CLASS_MASK) | (binind << CHUNK_MAP_CLASS_SHIFT);
+	chunk->map[pageind-map_bias].bits = (chunk->map[pageind-map_bias].bits &
+	    ~CHUNK_MAP_CLASS_MASK) | ((binind+1) << CHUNK_MAP_CLASS_SHIFT);
 }
 
 size_t
@@ -1553,8 +1732,8 @@
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapbits = chunk->map[pageind].bits;
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
+	mapbits = chunk->map[pageind-map_bias].bits;
 	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
 	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
 		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
@@ -1569,9 +1748,9 @@
 		assert(((uintptr_t)ptr & PAGE_MASK) == 0);
 		ret = mapbits & ~PAGE_MASK;
 		if (prof_promote && ret == PAGE_SIZE && (mapbits &
-		    CHUNK_MAP_CLASS_MASK) != CHUNK_MAP_CLASS_MASK) {
+		    CHUNK_MAP_CLASS_MASK) != 0) {
 			size_t binind = ((mapbits & CHUNK_MAP_CLASS_MASK) >>
-			    CHUNK_MAP_CLASS_SHIFT);
+			    CHUNK_MAP_CLASS_SHIFT) - 1;
 			assert(binind < nbins);
 			ret = chunk->arena->bins[binind].reg_size;
 		}
@@ -1580,144 +1759,12 @@
 
 	return (ret);
 }
-
-static inline unsigned
-arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
-    size_t size)
-{
-	unsigned shift, diff, regind;
-
-	assert(run->magic == ARENA_RUN_MAGIC);
-
-	/*
-	 * Avoid doing division with a variable divisor if possible.  Using
-	 * actual division here can reduce allocator throughput by over 20%!
-	 */
-	diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
-
-	/* Rescale (factor powers of 2 out of the numerator and denominator). */
-	shift = ffs(size) - 1;
-	diff >>= shift;
-	size >>= shift;
-
-	if (size == 1) {
-		/* The divisor was a power of 2. */
-		regind = diff;
-	} else {
-		/*
-		 * To divide by a number D that is not a power of two we
-		 * multiply by (2^21 / D) and then right shift by 21 positions.
-		 *
-		 *   X / D
-		 *
-		 * becomes
-		 *
-		 *   (X * size_invs[D - 3]) >> SIZE_INV_SHIFT
-		 *
-		 * We can omit the first three elements, because we never
-		 * divide by 0, and 1 and 2 are both powers of two, which are
-		 * handled above.
-		 */
-#define	SIZE_INV_SHIFT 21
-#define	SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)
-		static const unsigned size_invs[] = {
-		    SIZE_INV(3),
-		    SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
-		    SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
-		    SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
-		    SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
-		    SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
-		    SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
-		    SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
-		};
-
-		if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2))
-			regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT;
-		else
-			regind = diff / size;
-#undef SIZE_INV
-#undef SIZE_INV_SHIFT
-	}
-	assert(diff == regind * size);
-	assert(regind < bin->nregs);
-
-	return (regind);
-}
-
-prof_ctx_t *
-arena_prof_ctx_get(const void *ptr)
-{
-	prof_ctx_t *ret;
-	arena_chunk_t *chunk;
-	size_t pageind, mapbits;
-
-	assert(ptr != NULL);
-	assert(CHUNK_ADDR2BASE(ptr) != ptr);
-
-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapbits = chunk->map[pageind].bits;
-	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
-	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
-		if (prof_promote)
-			ret = (prof_ctx_t *)(uintptr_t)1U;
-		else {
-			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
-			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
-			    PAGE_SHIFT));
-			arena_bin_t *bin = run->bin;
-			unsigned regind;
-
-			assert(run->magic == ARENA_RUN_MAGIC);
-			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
-			ret = *(prof_ctx_t **)((uintptr_t)run +
-			    bin->ctx0_offset + (regind *
-			    sizeof(prof_ctx_t *)));
-		}
-	} else
-		ret = chunk->map[pageind].prof_ctx;
-
-	return (ret);
-}
-
-void
-arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
-{
-	arena_chunk_t *chunk;
-	size_t pageind, mapbits;
-
-	assert(ptr != NULL);
-	assert(CHUNK_ADDR2BASE(ptr) != ptr);
-
-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapbits = chunk->map[pageind].bits;
-	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
-	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
-		if (prof_promote == false) {
-			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
-			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
-			    PAGE_SHIFT));
-			arena_bin_t *bin = run->bin;
-			unsigned regind;
-
-			assert(run->magic == ARENA_RUN_MAGIC);
-			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
-
-			*((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset
-			    + (regind * sizeof(prof_ctx_t *)))) = ctx;
-		} else
-			assert((uintptr_t)ctx == (uintptr_t)1U);
-	} else
-		chunk->map[pageind].prof_ctx = ctx;
-}
 #endif
 
 static void
-arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
+arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run,
     arena_bin_t *bin)
 {
-	size_t npages, run_ind, past;
 
 	/* Dissociate run from bin. */
 	if (run == bin->runcur)
@@ -1725,7 +1772,8 @@
 	else if (bin->nregs != 1) {
 		size_t run_pageind = (((uintptr_t)run - (uintptr_t)chunk)) >>
 		    PAGE_SHIFT;
-		arena_chunk_map_t *run_mapelm = &chunk->map[run_pageind];
+		arena_chunk_map_t *run_mapelm =
+		    &chunk->map[run_pageind-map_bias];
 		/*
 		 * This block's conditional is necessary because if the run
 		 * only contains one region, then it never gets inserted into
@@ -1733,13 +1781,24 @@
 		 */
 		arena_run_tree_remove(&bin->runs, run_mapelm);
 	}
+}
+
+static void
+arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
+    arena_bin_t *bin)
+{
+	size_t npages, run_ind, past;
+
+	assert(run != bin->runcur);
+	assert(arena_run_tree_search(&bin->runs, &chunk->map[
+	    (((uintptr_t)run-(uintptr_t)chunk)>>PAGE_SHIFT)-map_bias]) == NULL);
 
 	malloc_mutex_unlock(&bin->lock);
 	/******************************/
 	npages = bin->run_size >> PAGE_SHIFT;
 	run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT);
-	past = (size_t)(((uintptr_t)run->next - (uintptr_t)1U -
-	    (uintptr_t)chunk) >> PAGE_SHIFT) + 1;
+	past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk)
+	    >> PAGE_SHIFT);
 	malloc_mutex_lock(&arena->lock);
 
 	/*
@@ -1747,19 +1806,21 @@
 	 * trim the clean pages before deallocating the dirty portion of the
 	 * run.
 	 */
-	if ((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) == 0 && past - run_ind
-	    < npages) {
+	if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) == 0 && past
+	    - run_ind < npages) {
 		/*
 		 * Trim clean pages.  Convert to large run beforehand.  Set the
 		 * last map element first, in case this is a one-page run.
 		 */
-		chunk->map[run_ind+npages-1].bits = CHUNK_MAP_LARGE |
-		    (chunk->map[run_ind].bits & CHUNK_MAP_FLAGS_MASK);
-		chunk->map[run_ind].bits = bin->run_size | CHUNK_MAP_LARGE |
-		    (chunk->map[run_ind].bits & CHUNK_MAP_FLAGS_MASK);
+		chunk->map[run_ind+npages-1-map_bias].bits = CHUNK_MAP_LARGE |
+		    (chunk->map[run_ind+npages-1-map_bias].bits &
+		    CHUNK_MAP_FLAGS_MASK);
+		chunk->map[run_ind-map_bias].bits = bin->run_size |
+		    CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits &
+		    CHUNK_MAP_FLAGS_MASK);
 		arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT),
-		    ((npages - (past - run_ind)) << PAGE_SHIFT), false);
-		npages = past - run_ind;
+		    ((past - run_ind) << PAGE_SHIFT), false);
+		/* npages = past - run_ind; */
 	}
 #ifdef JEMALLOC_DEBUG
 	run->magic = 0;
@@ -1773,6 +1834,42 @@
 #endif
 }
 
+static void
+arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
+    arena_bin_t *bin)
+{
+
+	/*
+	 * Make sure that bin->runcur always refers to the lowest non-full run,
+	 * if one exists.
+	 */
+	if (bin->runcur == NULL)
+		bin->runcur = run;
+	else if ((uintptr_t)run < (uintptr_t)bin->runcur) {
+		/* Switch runcur. */
+		if (bin->runcur->nfree > 0) {
+			arena_chunk_t *runcur_chunk =
+			    CHUNK_ADDR2BASE(bin->runcur);
+			size_t runcur_pageind = (((uintptr_t)bin->runcur -
+			    (uintptr_t)runcur_chunk)) >> PAGE_SHIFT;
+			arena_chunk_map_t *runcur_mapelm =
+			    &runcur_chunk->map[runcur_pageind-map_bias];
+
+			/* Insert runcur. */
+			arena_run_tree_insert(&bin->runs, runcur_mapelm);
+		}
+		bin->runcur = run;
+	} else {
+		size_t run_pageind = (((uintptr_t)run -
+		    (uintptr_t)chunk)) >> PAGE_SHIFT;
+		arena_chunk_map_t *run_mapelm =
+		    &chunk->map[run_pageind-map_bias];
+
+		assert(arena_run_tree_search(&bin->runs, run_mapelm) == NULL);
+		arena_run_tree_insert(&bin->runs, run_mapelm);
+	}
+}
+
 void
 arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
     arena_chunk_map_t *mapelm)
@@ -1784,7 +1881,7 @@
 	size_t size;
 #endif
 
-	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
+	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
 	run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
 	    (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT));
 	assert(run->magic == ARENA_RUN_MAGIC);
@@ -1799,43 +1896,11 @@
 #endif
 
 	arena_run_reg_dalloc(run, ptr);
-
-	if (run->nfree == bin->nregs)
+	if (run->nfree == bin->nregs) {
+		arena_dissociate_bin_run(chunk, run, bin);
 		arena_dalloc_bin_run(arena, chunk, run, bin);
-	else if (run->nfree == 1 && run != bin->runcur) {
-		/*
-		 * Make sure that bin->runcur always refers to the lowest
-		 * non-full run, if one exists.
-		 */
-		if (bin->runcur == NULL)
-			bin->runcur = run;
-		else if ((uintptr_t)run < (uintptr_t)bin->runcur) {
-			/* Switch runcur. */
-			if (bin->runcur->nfree > 0) {
-				arena_chunk_t *runcur_chunk =
-				    CHUNK_ADDR2BASE(bin->runcur);
-				size_t runcur_pageind =
-				    (((uintptr_t)bin->runcur -
-				    (uintptr_t)runcur_chunk)) >> PAGE_SHIFT;
-				arena_chunk_map_t *runcur_mapelm =
-				    &runcur_chunk->map[runcur_pageind];
-
-				/* Insert runcur. */
-				arena_run_tree_insert(&bin->runs,
-				    runcur_mapelm);
-			}
-			bin->runcur = run;
-		} else {
-			size_t run_pageind = (((uintptr_t)run -
-			    (uintptr_t)chunk)) >> PAGE_SHIFT;
-			arena_chunk_map_t *run_mapelm =
-			    &chunk->map[run_pageind];
-
-			assert(arena_run_tree_search(&bin->runs, run_mapelm) ==
-			    NULL);
-			arena_run_tree_insert(&bin->runs, run_mapelm);
-		}
-	}
+	} else if (run->nfree == 1 && run != bin->runcur)
+		arena_bin_lower_run(arena, chunk, run, bin);
 
 #ifdef JEMALLOC_STATS
 	bin->stats.allocated -= size;
@@ -1908,7 +1973,7 @@
 #if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS))
 		size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >>
 		    PAGE_SHIFT;
-		size_t size = chunk->map[pageind].bits & ~PAGE_MASK;
+		size_t size = chunk->map[pageind-map_bias].bits & ~PAGE_MASK;
 #endif
 
 #ifdef JEMALLOC_FILL
@@ -1930,7 +1995,7 @@
 
 static void
 arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr,
-    size_t size, size_t oldsize)
+    size_t oldsize, size_t size)
 {
 
 	assert(size < oldsize);
@@ -1965,50 +2030,71 @@
 
 static bool
 arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr,
-    size_t size, size_t oldsize)
+    size_t oldsize, size_t size, size_t extra, bool zero)
 {
 	size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
 	size_t npages = oldsize >> PAGE_SHIFT;
+	size_t followsize;
 
-	assert(oldsize == (chunk->map[pageind].bits & ~PAGE_MASK));
+	assert(oldsize == (chunk->map[pageind-map_bias].bits & ~PAGE_MASK));
 
 	/* Try to extend the run. */
-	assert(size > oldsize);
+	assert(size + extra > oldsize);
 	malloc_mutex_lock(&arena->lock);
-	if (pageind + npages < chunk_npages && (chunk->map[pageind+npages].bits
-	    & CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[pageind+npages].bits &
-	    ~PAGE_MASK) >= size - oldsize) {
+	if (pageind + npages < chunk_npages &&
+	    (chunk->map[pageind+npages-map_bias].bits
+	    & CHUNK_MAP_ALLOCATED) == 0 && (followsize =
+	    chunk->map[pageind+npages-map_bias].bits & ~PAGE_MASK) >= size -
+	    oldsize) {
 		/*
 		 * The next run is available and sufficiently large.  Split the
 		 * following run, then merge the first part with the existing
 		 * allocation.
 		 */
+		size_t flag_dirty;
+		size_t splitsize = (oldsize + followsize <= size + extra)
+		    ? followsize : size + extra - oldsize;
 		arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk +
-		    ((pageind+npages) << PAGE_SHIFT)), size - oldsize, true,
-		    false);
+		    ((pageind+npages) << PAGE_SHIFT)), splitsize, true, zero);
 
-		chunk->map[pageind].bits = size | CHUNK_MAP_LARGE |
-		    CHUNK_MAP_ALLOCATED;
-		chunk->map[pageind+npages].bits = CHUNK_MAP_LARGE |
-		    CHUNK_MAP_ALLOCATED;
+		size = oldsize + splitsize;
+		npages = size >> PAGE_SHIFT;
+
+		/*
+		 * Mark the extended run as dirty if either portion of the run
+		 * was dirty before allocation.  This is rather pedantic,
+		 * because there's not actually any sequence of events that
+		 * could cause the resulting run to be passed to
+		 * arena_run_dalloc() with the dirty argument set to false
+		 * (which is when dirty flag consistency would really matter).
+		 */
+		flag_dirty = (chunk->map[pageind-map_bias].bits &
+		    CHUNK_MAP_DIRTY) |
+		    (chunk->map[pageind+npages-1-map_bias].bits &
+		    CHUNK_MAP_DIRTY);
+		chunk->map[pageind-map_bias].bits = size | flag_dirty
+		    | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+		chunk->map[pageind+npages-1-map_bias].bits = flag_dirty |
+		    CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
 
 #ifdef JEMALLOC_STATS
-	arena->stats.ndalloc_large++;
-	arena->stats.allocated_large -= oldsize;
-	arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++;
-	arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--;
+		arena->stats.ndalloc_large++;
+		arena->stats.allocated_large -= oldsize;
+		arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++;
+		arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--;
 
-	arena->stats.nmalloc_large++;
-	arena->stats.nrequests_large++;
-	arena->stats.allocated_large += size;
-	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++;
-	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++;
-	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++;
-	if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns >
-	    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) {
-		arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns =
-		    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns;
-	}
+		arena->stats.nmalloc_large++;
+		arena->stats.nrequests_large++;
+		arena->stats.allocated_large += size;
+		arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++;
+		arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++;
+		arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++;
+		if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns >
+		    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) {
+			arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns =
+			    arena->stats.lstats[(size >> PAGE_SHIFT) -
+			    1].curruns;
+		}
 #endif
 		malloc_mutex_unlock(&arena->lock);
 		return (false);
@@ -2023,11 +2109,12 @@
  * always fail if growing an object, and the following run is already in use.
  */
 static bool
-arena_ralloc_large(void *ptr, size_t size, size_t oldsize)
+arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra,
+    bool zero)
 {
 	size_t psize;
 
-	psize = PAGE_CEILING(size);
+	psize = PAGE_CEILING(size + extra);
 	if (psize == oldsize) {
 		/* Same size class. */
 #ifdef JEMALLOC_FILL
@@ -2053,14 +2140,15 @@
 				    oldsize - size);
 			}
 #endif
-			arena_ralloc_large_shrink(arena, chunk, ptr, psize,
-			    oldsize);
+			arena_ralloc_large_shrink(arena, chunk, ptr, oldsize,
+			    psize);
 			return (false);
 		} else {
 			bool ret = arena_ralloc_large_grow(arena, chunk, ptr,
-			    psize, oldsize);
+			    oldsize, PAGE_CEILING(size),
+			    psize - PAGE_CEILING(size), zero);
 #ifdef JEMALLOC_FILL
-			if (ret == false && opt_zero) {
+			if (ret == false && zero == false && opt_zero) {
 				memset((void *)((uintptr_t)ptr + oldsize), 0,
 				    size - oldsize);
 			}
@@ -2071,49 +2159,89 @@
 }
 
 void *
-arena_ralloc(void *ptr, size_t size, size_t oldsize)
+arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
+    bool zero)
 {
-	void *ret;
-	size_t copysize;
 
-	/* Try to avoid moving the allocation. */
+	/*
+	 * Avoid moving the allocation if the size class can be left the same.
+	 */
 	if (oldsize <= arena_maxclass) {
 		if (oldsize <= small_maxclass) {
-			if (size <= small_maxclass && small_size2bin[size] ==
-			    small_size2bin[oldsize])
-				goto IN_PLACE;
+			assert(choose_arena()->bins[small_size2bin[
+			    oldsize]].reg_size == oldsize);
+			if ((size + extra <= small_maxclass &&
+			    small_size2bin[size + extra] ==
+			    small_size2bin[oldsize]) || (size <= oldsize &&
+			    size + extra >= oldsize)) {
+#ifdef JEMALLOC_FILL
+				if (opt_junk && size < oldsize) {
+					memset((void *)((uintptr_t)ptr + size),
+					    0x5a, oldsize - size);
+				}
+#endif
+				return (ptr);
+			}
 		} else {
 			assert(size <= arena_maxclass);
-			if (size > small_maxclass) {
-				if (arena_ralloc_large(ptr, size, oldsize) ==
-				    false)
+			if (size + extra > small_maxclass) {
+				if (arena_ralloc_large(ptr, oldsize, size,
+				    extra, zero) == false)
 					return (ptr);
 			}
 		}
 	}
 
-	/*
-	 * If we get here, then size and oldsize are different enough that we
-	 * need to move the object.  In that case, fall back to allocating new
-	 * space and copying.
-	 */
-	ret = arena_malloc(size, false);
-	if (ret == NULL)
-		return (NULL);
+	/* Reallocation would require a move. */
+	return (NULL);
+}
 
-	/* Junk/zero-filling were already done by arena_malloc(). */
+void *
+arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
+    size_t alignment, bool zero)
+{
+	void *ret;
+	size_t copysize;
+
+	/* Try to avoid moving the allocation. */
+	ret = arena_ralloc_no_move(ptr, oldsize, size, extra, zero);
+	if (ret != NULL)
+		return (ret);
+
+
+	/*
+	 * size and oldsize are different enough that we need to move the
+	 * object.  In that case, fall back to allocating new space and
+	 * copying.
+	 */
+	if (alignment != 0)
+		ret = ipalloc(size + extra, alignment, zero);
+	else
+		ret = arena_malloc(size + extra, zero);
+
+	if (ret == NULL) {
+		if (extra == 0)
+			return (NULL);
+		/* Try again, this time without extra. */
+		if (alignment != 0)
+			ret = ipalloc(size, alignment, zero);
+		else
+			ret = arena_malloc(size, zero);
+
+		if (ret == NULL)
+			return (NULL);
+	}
+
+	/* Junk/zero-filling were already done by ipalloc()/arena_malloc(). */
+
+	/*
+	 * Copy at most size bytes (not size+extra), since the caller has no
+	 * expectation that the extra bytes will be reliably preserved.
+	 */
 	copysize = (size < oldsize) ? size : oldsize;
 	memcpy(ret, ptr, copysize);
 	idalloc(ptr);
 	return (ret);
-IN_PLACE:
-#ifdef JEMALLOC_FILL
-	if (opt_junk && size < oldsize)
-		memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize - size);
-	else if (opt_zero && size > oldsize)
-		memset((void *)((uintptr_t)ptr + oldsize), 0, size - oldsize);
-#endif
-	return (ptr);
 }
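
The split into arena_ralloc_no_move() and arena_ralloc() is what lets a caller ask for an in-place resize, with extra bytes accepted opportunistically and a plain resize retried when size+extra cannot be satisfied. A hedged usage sketch through the experimental rallocm() entry point follows; it assumes the experimental API is compiled in and that the build uses no function prefix, so the symbol names below may differ in other configurations.

#include <stdio.h>
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	void *p = malloc(4096);
	size_t rsize;
	int err;

	if (p == NULL)
		return (1);

	/*
	 * Grow to at least 8192 bytes, accepting up to 4096 extra bytes if
	 * that is what it takes to avoid moving the object.
	 */
	err = rallocm(&p, &rsize, 8192, 4096, ALLOCM_NO_MOVE);
	if (err == ALLOCM_SUCCESS)
		printf("resized in place to %zu bytes\n", rsize);
	else
		printf("left in place at the original size (err %d)\n", err);

	free(p);
	return (0);
}
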
 
 bool
@@ -2239,26 +2367,6 @@
 	return (false);
 }
 
-#ifdef JEMALLOC_TINY
-/* Compute the smallest power of 2 that is >= x. */
-static size_t
-pow2_ceil(size_t x)
-{
-
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-#if (SIZEOF_PTR == 8)
-	x |= x >> 32;
-#endif
-	x++;
-	return (x);
-}
-#endif
-
 #ifdef JEMALLOC_DEBUG
 static void
 small_size2bin_validate(void)
@@ -2381,6 +2489,7 @@
 arena_boot(void)
 {
 	size_t header_size;
+	unsigned i;
 
 	/* Set variables according to the value of opt_lg_[qc]space_max. */
 	qspace_max = (1U << opt_lg_qspace_max);
@@ -2420,7 +2529,7 @@
 		if (nbins > 255) {
 		    char line_buf[UMAX2S_BUFSIZE];
 		    malloc_write("<jemalloc>: Too many small size classes (");
-		    malloc_write(umax2s(nbins, 10, line_buf));
+		    malloc_write(u2s(nbins, 10, line_buf));
 		    malloc_write(" > max 255)\n");
 		    abort();
 		}
@@ -2429,7 +2538,7 @@
 	if (nbins > 256) {
 	    char line_buf[UMAX2S_BUFSIZE];
 	    malloc_write("<jemalloc>: Too many small size classes (");
-	    malloc_write(umax2s(nbins, 10, line_buf));
+	    malloc_write(u2s(nbins, 10, line_buf));
 	    malloc_write(" > max 256)\n");
 	    abort();
 	}
@@ -2439,13 +2548,26 @@
 
 	/*
 	 * Compute the header size such that it is large enough to contain the
-	 * page map.
+	 * page map.  The page map is biased to omit entries for the header
+	 * itself, so some iteration is necessary to compute the map bias.
+	 *
+	 * 1) Compute safe header_size and map_bias values that include enough
+	 *    space for an unbiased page map.
+	 * 2) Refine map_bias based on (1) to omit the header pages in the page
+	 *    map.  The resulting map_bias may be one too small.
+	 * 3) Refine map_bias based on (2).  The result will be >= the result
+	 *    from (2), and will always be correct.
 	 */
-	header_size = sizeof(arena_chunk_t) +
-	    (sizeof(arena_chunk_map_t) * (chunk_npages - 1));
-	arena_chunk_header_npages = (header_size >> PAGE_SHIFT) +
-	    ((header_size & PAGE_MASK) != 0);
-	arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT);
+	map_bias = 0;
+	for (i = 0; i < 3; i++) {
+		header_size = offsetof(arena_chunk_t, map)
+			+ (sizeof(arena_chunk_map_t) * (chunk_npages-map_bias));
+		map_bias = (header_size >> PAGE_SHIFT) + ((header_size &
+		    PAGE_MASK) != 0);
+	}
+	assert(map_bias > 0);
+
+	arena_maxclass = chunksize - (map_bias << PAGE_SHIFT);
 
 	return (false);
 }
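
A self-contained sketch of the three-pass map_bias computation described above follows. The struct layout, page size, and chunk count are simplified stand-ins rather than the real jemalloc definitions; only the fixed-point iteration is the point.

#include <stddef.h>
#include <stdio.h>

#define	TOY_PAGE_SHIFT	12
#define	TOY_PAGE_SIZE	((size_t)1 << TOY_PAGE_SHIFT)
#define	TOY_PAGE_MASK	(TOY_PAGE_SIZE - 1)

typedef struct { size_t bits; } toy_map_t;
typedef struct {
	void		*arena;		/* Stand-in for the chunk header fields. */
	toy_map_t	map[1];		/* Really chunk_npages-map_bias elements. */
} toy_chunk_t;

int
main(void)
{
	size_t chunk_npages = 1024;	/* 4 MiB chunk with 4 KiB pages. */
	size_t map_bias = 0;
	size_t header_size;
	unsigned i;

	/*
	 * Pass 1 computes a conservative (unbiased) header size, pass 2 may
	 * undershoot by one page, and pass 3 settles on the fixed point.
	 */
	for (i = 0; i < 3; i++) {
		header_size = offsetof(toy_chunk_t, map) +
		    (sizeof(toy_map_t) * (chunk_npages - map_bias));
		map_bias = (header_size >> TOY_PAGE_SHIFT) +
		    ((header_size & TOY_PAGE_MASK) != 0);
		printf("pass %u: header_size=%zu map_bias=%zu\n", i,
		    header_size, map_bias);
	}
	return (0);
}
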
diff --git a/jemalloc/src/base.c b/jemalloc/src/base.c
index 605197e..cc85e84 100644
--- a/jemalloc/src/base.c
+++ b/jemalloc/src/base.c
@@ -32,7 +32,7 @@
 	assert(minsize != 0);
 	csize = CHUNK_CEILING(minsize);
 	zero = false;
-	base_pages = chunk_alloc(csize, &zero);
+	base_pages = chunk_alloc(csize, true, &zero);
 	if (base_pages == NULL)
 		return (true);
 	base_next_addr = base_pages;
diff --git a/jemalloc/src/chunk.c b/jemalloc/src/chunk.c
index e6e3bcd..00bf50a 100644
--- a/jemalloc/src/chunk.c
+++ b/jemalloc/src/chunk.c
@@ -14,11 +14,15 @@
 chunk_stats_t	stats_chunks;
 #endif
 
+#ifdef JEMALLOC_IVSALLOC
+rtree_t		*chunks_rtree;
+#endif
+
 /* Various chunk-related settings. */
 size_t		chunksize;
 size_t		chunksize_mask; /* (chunksize - 1). */
 size_t		chunk_npages;
-size_t		arena_chunk_header_npages;
+size_t		map_bias;
 size_t		arena_maxclass; /* Max size class for arenas. */
 
 /******************************************************************************/
@@ -30,7 +34,7 @@
  * advantage of them if they are returned.
  */
 void *
-chunk_alloc(size_t size, bool *zero)
+chunk_alloc(size_t size, bool base, bool *zero)
 {
 	void *ret;
 
@@ -63,10 +67,18 @@
 	/* All strategies for allocation failed. */
 	ret = NULL;
 RETURN:
+#ifdef JEMALLOC_IVSALLOC
+	if (base == false && ret != NULL) {
+		if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) {
+			chunk_dealloc(ret, size);
+			return (NULL);
+		}
+	}
+#endif
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 	if (ret != NULL) {
 #  ifdef JEMALLOC_PROF
-		bool udump;
+		bool gdump;
 #  endif
 		malloc_mutex_lock(&chunks_mtx);
 #  ifdef JEMALLOC_STATS
@@ -76,17 +88,17 @@
 		if (stats_chunks.curchunks > stats_chunks.highchunks) {
 			stats_chunks.highchunks = stats_chunks.curchunks;
 #  ifdef JEMALLOC_PROF
-			udump = true;
+			gdump = true;
 #  endif
 		}
 #  ifdef JEMALLOC_PROF
 		else
-			udump = false;
+			gdump = false;
 #  endif
 		malloc_mutex_unlock(&chunks_mtx);
 #  ifdef JEMALLOC_PROF
-		if (opt_prof && opt_prof_udump && udump)
-			prof_udump();
+		if (opt_prof && opt_prof_gdump && gdump)
+			prof_gdump();
 #  endif
 	}
 #endif
@@ -104,6 +116,9 @@
 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);
 
+#ifdef JEMALLOC_IVSALLOC
+	rtree_set(chunks_rtree, (uintptr_t)chunk, NULL);
+#endif
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 	malloc_mutex_lock(&chunks_mtx);
 	stats_chunks.curchunks -= (size / chunksize);
@@ -126,21 +141,27 @@
 {
 
 	/* Set variables according to the value of opt_lg_chunk. */
-	chunksize = (1LU << opt_lg_chunk);
+	chunksize = (ZU(1) << opt_lg_chunk);
 	assert(chunksize >= PAGE_SIZE);
 	chunksize_mask = chunksize - 1;
 	chunk_npages = (chunksize >> PAGE_SHIFT);
 
+#ifdef JEMALLOC_IVSALLOC
+	chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk);
+	if (chunks_rtree == NULL)
+		return (true);
+#endif
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 	if (malloc_mutex_init(&chunks_mtx))
 		return (true);
 	memset(&stats_chunks, 0, sizeof(chunk_stats_t));
 #endif
-
 #ifdef JEMALLOC_SWAP
 	if (chunk_swap_boot())
 		return (true);
 #endif
+	if (chunk_mmap_boot())
+		return (true);
 #ifdef JEMALLOC_DSS
 	if (chunk_dss_boot())
 		return (true);
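
With JEMALLOC_IVSALLOC, every non-base chunk is registered in chunks_rtree keyed by its chunk-aligned address, so a later lookup can tell whether an arbitrary pointer belongs to the allocator. The toy below mimics only the register/lookup/unregister flow, with a small direct-mapped table in place of the radix tree; TOY_LG_CHUNK and the table size are arbitrary stand-ins.

#include <stdint.h>
#include <stdio.h>

#define	TOY_LG_CHUNK	22		/* 4 MiB chunks. */
#define	TOY_NSLOTS	64

static uintptr_t toy_keys[TOY_NSLOTS];
static void *toy_vals[TOY_NSLOTS];

static unsigned
toy_slot(uintptr_t key)
{
	return ((unsigned)(key % TOY_NSLOTS));
}

static void
toy_set(void *chunk, void *val)
{
	uintptr_t key = (uintptr_t)chunk >> TOY_LG_CHUNK;

	toy_keys[toy_slot(key)] = key;
	toy_vals[toy_slot(key)] = val;
}

static void *
toy_get(const void *ptr)
{
	uintptr_t key = (uintptr_t)ptr >> TOY_LG_CHUNK;

	if (toy_keys[toy_slot(key)] == key)
		return (toy_vals[toy_slot(key)]);
	return (NULL);
}

int
main(void)
{
	void *chunk = (void *)((uintptr_t)7 << TOY_LG_CHUNK);
	void *ptr = (void *)((uintptr_t)chunk + 12345);

	toy_set(chunk, chunk);		/* chunk_alloc() registration. */
	printf("lookup of interior pointer: %p\n", toy_get(ptr));
	toy_set(chunk, NULL);		/* chunk_dealloc() removal. */
	printf("after removal: %p\n", toy_get(ptr));
	return (0);
}
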
diff --git a/jemalloc/src/chunk_mmap.c b/jemalloc/src/chunk_mmap.c
index d9f9e86..bc36755 100644
--- a/jemalloc/src/chunk_mmap.c
+++ b/jemalloc/src/chunk_mmap.c
@@ -6,26 +6,30 @@
 
 /*
  * Used by chunk_alloc_mmap() to decide whether to attempt the fast path and
- * potentially avoid some system calls.  We can get away without TLS here,
- * since the state of mmap_unaligned only affects performance, rather than
- * correct function.
+ * potentially avoid some system calls.
  */
-static
 #ifndef NO_TLS
-       __thread
+static __thread bool	mmap_unaligned_tls
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+#define	MMAP_UNALIGNED_GET()	mmap_unaligned_tls
+#define	MMAP_UNALIGNED_SET(v)	do {					\
+	mmap_unaligned_tls = (v);					\
+} while (0)
+#else
+static pthread_key_t	mmap_unaligned_tsd;
+#define	MMAP_UNALIGNED_GET()	((bool)pthread_getspecific(mmap_unaligned_tsd))
+#define	MMAP_UNALIGNED_SET(v)	do {					\
+	pthread_setspecific(mmap_unaligned_tsd, (void *)(v));		\
+} while (0)
 #endif
-                bool	mmap_unaligned
-#ifndef NO_TLS
-                                       JEMALLOC_ATTR(tls_model("initial-exec"))
-#endif
-                                                                               ;
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
 
 static void	*pages_map(void *addr, size_t size, bool noreserve);
 static void	pages_unmap(void *addr, size_t size);
-static void	*chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve);
+static void	*chunk_alloc_mmap_slow(size_t size, bool unaligned,
+    bool noreserve);
 static void	*chunk_alloc_mmap_internal(size_t size, bool noreserve);
 
 /******************************************************************************/
@@ -54,9 +58,9 @@
 		 * We succeeded in mapping memory, but not in the right place.
 		 */
 		if (munmap(ret, size) == -1) {
-			char buf[STRERROR_BUF];
+			char buf[BUFERROR_BUF];
 
-			strerror_r(errno, buf, sizeof(buf));
+			buferror(errno, buf, sizeof(buf));
 			malloc_write("<jemalloc>: Error in munmap(): ");
 			malloc_write(buf);
 			malloc_write("\n");
@@ -76,9 +80,9 @@
 {
 
 	if (munmap(addr, size) == -1) {
-		char buf[STRERROR_BUF];
+		char buf[BUFERROR_BUF];
 
-		strerror_r(errno, buf, sizeof(buf));
+		buferror(errno, buf, sizeof(buf));
 		malloc_write("<jemalloc>: Error in munmap(): ");
 		malloc_write(buf);
 		malloc_write("\n");
@@ -128,7 +132,7 @@
 	 * method.
 	 */
 	if (unaligned == false)
-		mmap_unaligned = false;
+		MMAP_UNALIGNED_SET(false);
 
 	return (ret);
 }
@@ -166,7 +170,7 @@
 	 * fast method next time.
 	 */
 
-	if (mmap_unaligned == false) {
+	if (MMAP_UNALIGNED_GET() == false) {
 		size_t offset;
 
 		ret = pages_map(NULL, size, noreserve);
@@ -175,7 +179,7 @@
 
 		offset = CHUNK_ADDR2OFFSET(ret);
 		if (offset != 0) {
-			mmap_unaligned = true;
+			MMAP_UNALIGNED_SET(true);
 			/* Try to extend chunk boundary. */
 			if (pages_map((void *)((uintptr_t)ret + size),
 			    chunksize - offset, noreserve) == NULL) {
@@ -184,7 +188,8 @@
 				 * the reliable-but-expensive method.
 				 */
 				pages_unmap(ret, size);
-				ret = chunk_alloc_mmap_slow(size, true, noreserve);
+				ret = chunk_alloc_mmap_slow(size, true,
+				    noreserve);
 			} else {
 				/* Clean up unneeded leading space. */
 				pages_unmap(ret, chunksize - offset);
@@ -216,3 +221,17 @@
 
 	pages_unmap(chunk, size);
 }
+
+bool
+chunk_mmap_boot(void)
+{
+
+#ifdef NO_TLS
+	if (pthread_key_create(&mmap_unaligned_tsd, NULL) != 0) {
+		malloc_write("<jemalloc>: Error in pthread_key_create()\n");
+		return (true);
+	}
+#endif
+
+	return (false);
+}
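
The MMAP_UNALIGNED_GET/SET macros hide whether the per-thread flag lives in a __thread variable or in pthreads TSD, which is what makes the NO_TLS build work. The standalone sketch below shows the same pattern with hypothetical TOY_* names; compile with -DTOY_NO_TLS to exercise the TSD branch.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#ifndef TOY_NO_TLS
static __thread bool	toy_flag_tls;
#define	TOY_FLAG_GET()	toy_flag_tls
#define	TOY_FLAG_SET(v)	do {						\
	toy_flag_tls = (v);						\
} while (0)
#else
static pthread_key_t	toy_flag_tsd;
#define	TOY_FLAG_GET()	((bool)(uintptr_t)pthread_getspecific(toy_flag_tsd))
#define	TOY_FLAG_SET(v)	do {						\
	pthread_setspecific(toy_flag_tsd, (void *)(uintptr_t)(v));	\
} while (0)
#endif

int
main(void)
{

#ifdef TOY_NO_TLS
	if (pthread_key_create(&toy_flag_tsd, NULL) != 0)
		return (1);
#endif
	TOY_FLAG_SET(true);
	printf("per-thread flag: %d\n", (int)TOY_FLAG_GET());
	return (0);
}
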
diff --git a/jemalloc/src/chunk_swap.c b/jemalloc/src/chunk_swap.c
index ed9e414..ee038ba 100644
--- a/jemalloc/src/chunk_swap.c
+++ b/jemalloc/src/chunk_swap.c
@@ -294,9 +294,10 @@
 		void *addr = mmap((void *)((uintptr_t)vaddr + voff), sizes[i],
 		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fds[i], 0);
 		if (addr == MAP_FAILED) {
-			char buf[STRERROR_BUF];
+			char buf[BUFERROR_BUF];
 
-			strerror_r(errno, buf, sizeof(buf));
+
+			buferror(errno, buf, sizeof(buf));
 			malloc_write(
 			    "<jemalloc>: Error in mmap(..., MAP_FIXED, ...): ");
 			malloc_write(buf);
@@ -304,7 +305,7 @@
 			if (opt_abort)
 				abort();
 			if (munmap(vaddr, voff) == -1) {
-				strerror_r(errno, buf, sizeof(buf));
+				buferror(errno, buf, sizeof(buf));
 				malloc_write("<jemalloc>: Error in munmap(): ");
 				malloc_write(buf);
 				malloc_write("\n");
diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c
index a0c4162..682a8db 100644
--- a/jemalloc/src/ckh.c
+++ b/jemalloc/src/ckh.c
@@ -263,13 +263,12 @@
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS;
 	while (true) {
 		lg_curcells++;
-		tab = (ckhc_t *) ipalloc((ZU(1) << LG_CACHELINE),
-		    sizeof(ckhc_t) << lg_curcells);
+		tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells,
+		    ZU(1) << LG_CACHELINE, true);
 		if (tab == NULL) {
 			ret = true;
 			goto RETURN;
 		}
-		memset(tab, 0, sizeof(ckhc_t) << lg_curcells);
 		/* Swap in new table. */
 		ttab = ckh->tab;
 		ckh->tab = tab;
@@ -305,8 +304,8 @@
 	 */
 	lg_prevbuckets = ckh->lg_curbuckets;
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
-	tab = (ckhc_t *)ipalloc((ZU(1) << LG_CACHELINE),
-	    sizeof(ckhc_t) << lg_curcells);
+	tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells,
+	    ZU(1) << LG_CACHELINE, true);
 	if (tab == NULL) {
 		/*
 		 * An OOM error isn't worth propagating, since it doesn't
@@ -314,7 +313,6 @@
 		 */
 		return;
 	}
-	memset(tab, 0, sizeof(ckhc_t) << lg_curcells);
 	/* Swap in new table. */
 	ttab = ckh->tab;
 	ckh->tab = tab;
@@ -377,13 +375,12 @@
 	ckh->hash = hash;
 	ckh->keycomp = keycomp;
 
-	ckh->tab = (ckhc_t *)ipalloc((ZU(1) << LG_CACHELINE),
-	    sizeof(ckhc_t) << lg_mincells);
+	ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells,
+	    (ZU(1) << LG_CACHELINE), true);
 	if (ckh->tab == NULL) {
 		ret = true;
 		goto RETURN;
 	}
-	memset(ckh->tab, 0, sizeof(ckhc_t) << lg_mincells);
 
 #ifdef JEMALLOC_DEBUG
 	ckh->magic = CKH_MAGIG;
@@ -570,12 +567,21 @@
 {
 	size_t ret1, ret2;
 	uint64_t h;
+	union {
+		const void	*v;
+		uint64_t	i;
+	} u;
 
 	assert(minbits <= 32 || (SIZEOF_PTR == 8 && minbits <= 64));
 	assert(hash1 != NULL);
 	assert(hash2 != NULL);
 
-	h = hash(&key, sizeof(void *), 0xd983396e68886082LLU);
+	assert(sizeof(u.v) == sizeof(u.i));
+#if (LG_SIZEOF_PTR != LG_SIZEOF_INT)
+	u.i = 0;
+#endif
+	u.v = key;
+	h = hash(&u.i, sizeof(u.i), 0xd983396e68886082LLU);
 	if (minbits <= 32) {
 		/*
 		 * Avoid doing multiple hashes, since a single hash provides
@@ -586,7 +592,7 @@
 	} else {
 		assert(SIZEOF_PTR == 8);
 		ret1 = h;
-		ret2 = hash(&key, sizeof(void *), 0x5e2be9aff8709a5dLLU);
+		ret2 = hash(&u.i, sizeof(u.i), 0x5e2be9aff8709a5dLLU);
 	}
 
 	*hash1 = ret1;
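
Hashing the pointer through a union guarantees that a fixed-width, zero-extended integer is fed to the hash function, regardless of how wide a pointer is on the platform. The standalone illustration below uses a splitmix64-style mixer as a stand-in for jemalloc's hash().

#include <stdint.h>
#include <stdio.h>

static uint64_t
toy_mix(uint64_t x)
{
	/* splitmix64-style finalizer; any decent 64-bit mixer would do. */
	x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
	x ^= x >> 27; x *= 0x94d049bb133111ebULL;
	x ^= x >> 31;
	return (x);
}

static uint64_t
toy_hash_ptr(const void *key)
{
	union {
		const void	*v;
		uint64_t	i;
	} u;

	u.i = 0;	/* Zero-fill in case pointers are narrower than 64 bits. */
	u.v = key;
	return (toy_mix(u.i));
}

int
main(void)
{
	int x;

	printf("hash(&x) = %llx\n", (unsigned long long)toy_hash_ptr(&x));
	return (0);
}
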
diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c
index ffb732d..c83ee4f 100644
--- a/jemalloc/src/ctl.c
+++ b/jemalloc/src/ctl.c
@@ -41,6 +41,11 @@
 #ifdef JEMALLOC_TCACHE
 CTL_PROTO(tcache_flush)
 #endif
+CTL_PROTO(thread_arena)
+#ifdef JEMALLOC_STATS
+CTL_PROTO(thread_allocated)
+CTL_PROTO(thread_deallocated)
+#endif
 CTL_PROTO(config_debug)
 CTL_PROTO(config_dss)
 CTL_PROTO(config_dynamic_page_shift)
@@ -57,8 +62,15 @@
 CTL_PROTO(config_tls)
 CTL_PROTO(config_xmalloc)
 CTL_PROTO(opt_abort)
+CTL_PROTO(opt_lg_qspace_max)
+CTL_PROTO(opt_lg_cspace_max)
+CTL_PROTO(opt_lg_chunk)
+CTL_PROTO(opt_narenas)
+CTL_PROTO(opt_lg_dirty_mult)
+CTL_PROTO(opt_stats_print)
 #ifdef JEMALLOC_FILL
 CTL_PROTO(opt_junk)
+CTL_PROTO(opt_zero)
 #endif
 #ifdef JEMALLOC_SYSV
 CTL_PROTO(opt_sysv)
@@ -66,27 +78,22 @@
 #ifdef JEMALLOC_XMALLOC
 CTL_PROTO(opt_xmalloc)
 #endif
-#ifdef JEMALLOC_ZERO
-CTL_PROTO(opt_zero)
-#endif
 #ifdef JEMALLOC_TCACHE
 CTL_PROTO(opt_tcache)
 CTL_PROTO(opt_lg_tcache_gc_sweep)
 #endif
 #ifdef JEMALLOC_PROF
 CTL_PROTO(opt_prof)
+CTL_PROTO(opt_prof_prefix)
 CTL_PROTO(opt_prof_active)
 CTL_PROTO(opt_lg_prof_bt_max)
 CTL_PROTO(opt_lg_prof_sample)
 CTL_PROTO(opt_lg_prof_interval)
-CTL_PROTO(opt_prof_udump)
+CTL_PROTO(opt_prof_gdump)
 CTL_PROTO(opt_prof_leak)
+CTL_PROTO(opt_prof_accum)
+CTL_PROTO(opt_lg_prof_tcmax)
 #endif
-CTL_PROTO(opt_stats_print)
-CTL_PROTO(opt_lg_qspace_max)
-CTL_PROTO(opt_lg_cspace_max)
-CTL_PROTO(opt_lg_dirty_mult)
-CTL_PROTO(opt_lg_chunk)
 #ifdef JEMALLOC_SWAP
 CTL_PROTO(opt_overcommit)
 #endif
@@ -125,6 +132,7 @@
 CTL_PROTO(arenas_nhbins)
 #endif
 CTL_PROTO(arenas_nlruns)
+CTL_PROTO(arenas_purge)
 #ifdef JEMALLOC_PROF
 CTL_PROTO(prof_active)
 CTL_PROTO(prof_dump)
@@ -210,6 +218,15 @@
 };
 #endif
 
+static const ctl_node_t	thread_node[] = {
+	{NAME("arena"),		CTL(thread_arena)}
+#ifdef JEMALLOC_STATS
+	,
+	{NAME("allocated"),	CTL(thread_allocated)},
+	{NAME("deallocated"),	CTL(thread_deallocated)}
+#endif
+};
+
 static const ctl_node_t	config_node[] = {
 	{NAME("debug"),			CTL(config_debug)},
 	{NAME("dss"),			CTL(config_dss)},
@@ -230,36 +247,43 @@
 
 static const ctl_node_t opt_node[] = {
 	{NAME("abort"),			CTL(opt_abort)},
+	{NAME("lg_qspace_max"),		CTL(opt_lg_qspace_max)},
+	{NAME("lg_cspace_max"),		CTL(opt_lg_cspace_max)},
+	{NAME("lg_chunk"),		CTL(opt_lg_chunk)},
+	{NAME("narenas"),		CTL(opt_narenas)},
+	{NAME("lg_dirty_mult"),		CTL(opt_lg_dirty_mult)},
+	{NAME("stats_print"),		CTL(opt_stats_print)}
 #ifdef JEMALLOC_FILL
+	,
 	{NAME("junk"),			CTL(opt_junk)},
+	{NAME("zero"),			CTL(opt_zero)}
 #endif
 #ifdef JEMALLOC_SYSV
-	{NAME("sysv"),			CTL(opt_sysv)},
+	,
+	{NAME("sysv"),			CTL(opt_sysv)}
 #endif
 #ifdef JEMALLOC_XMALLOC
-	{NAME("xmalloc"),		CTL(opt_xmalloc)},
-#endif
-#ifdef JEMALLOC_ZERO
-	{NAME("zero"),			CTL(opt_zero)},
+	,
+	{NAME("xmalloc"),		CTL(opt_xmalloc)}
 #endif
 #ifdef JEMALLOC_TCACHE
+	,
 	{NAME("tcache"),		CTL(opt_tcache)},
-	{NAME("lg_tcache_gc_sweep"),	CTL(opt_lg_tcache_gc_sweep)},
+	{NAME("lg_tcache_gc_sweep"),	CTL(opt_lg_tcache_gc_sweep)}
 #endif
 #ifdef JEMALLOC_PROF
+	,
 	{NAME("prof"),			CTL(opt_prof)},
+	{NAME("prof_prefix"),		CTL(opt_prof_prefix)},
 	{NAME("prof_active"),		CTL(opt_prof_active)},
 	{NAME("lg_prof_bt_max"),	CTL(opt_lg_prof_bt_max)},
 	{NAME("lg_prof_sample"),	CTL(opt_lg_prof_sample)},
 	{NAME("lg_prof_interval"),	CTL(opt_lg_prof_interval)},
-	{NAME("prof_udump"),		CTL(opt_prof_udump)},
+	{NAME("prof_gdump"),		CTL(opt_prof_gdump)},
 	{NAME("prof_leak"),		CTL(opt_prof_leak)},
+	{NAME("prof_accum"),		CTL(opt_prof_accum)},
+	{NAME("lg_prof_tcmax"),		CTL(opt_lg_prof_tcmax)}
 #endif
-	{NAME("stats_print"),		CTL(opt_stats_print)},
-	{NAME("lg_qspace_max"),		CTL(opt_lg_qspace_max)},
-	{NAME("lg_cspace_max"),		CTL(opt_lg_cspace_max)},
-	{NAME("lg_dirty_mult"),		CTL(opt_lg_dirty_mult)},
-	{NAME("lg_chunk"),		CTL(opt_lg_chunk)}
 #ifdef JEMALLOC_SWAP
 	,
 	{NAME("overcommit"),		CTL(opt_overcommit)}
@@ -321,7 +345,8 @@
 #endif
 	{NAME("bin"),			CHILD(arenas_bin)},
 	{NAME("nlruns"),		CTL(arenas_nlruns)},
-	{NAME("lrun"),			CHILD(arenas_lrun)}
+	{NAME("lrun"),			CHILD(arenas_lrun)},
+	{NAME("purge"),			CTL(arenas_purge)}
 };
 
 #ifdef JEMALLOC_PROF
@@ -448,6 +473,7 @@
 #ifdef JEMALLOC_TCACHE
 	{NAME("tcache"),	CHILD(tcache)},
 #endif
+	{NAME("thread"),	CHILD(thread)},
 	{NAME("config"),	CHILD(config)},
 	{NAME("opt"),		CHILD(opt)},
 	{NAME("arenas"),	CHILD(arenas)},
@@ -1028,13 +1054,13 @@
 
 	VOID();
 
-	tcache = tcache_tls;
+	tcache = TCACHE_GET();
 	if (tcache == NULL) {
 		ret = 0;
 		goto RETURN;
 	}
 	tcache_destroy(tcache);
-	tcache_tls = NULL;
+	TCACHE_SET(NULL);
 
 	ret = 0;
 RETURN:
@@ -1042,6 +1068,49 @@
 }
 #endif
 
+static int
+thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
+{
+	int ret;
+	unsigned newind, oldind;
+
+	newind = oldind = choose_arena()->ind;
+	WRITE(newind, unsigned);
+	READ(oldind, unsigned);
+	if (newind != oldind) {
+		arena_t *arena;
+
+		if (newind >= narenas) {
+			/* New arena index is out of range. */
+			ret = EFAULT;
+			goto RETURN;
+		}
+
+		/* Initialize arena if necessary. */
+		malloc_mutex_lock(&arenas_lock);
+		if ((arena = arenas[newind]) == NULL)
+			arena = arenas_extend(newind);
+		malloc_mutex_unlock(&arenas_lock);
+		if (arena == NULL) {
+			ret = EAGAIN;
+			goto RETURN;
+		}
+
+		/* Set new arena association. */
+		ARENA_SET(arena);
+	}
+
+	ret = 0;
+RETURN:
+	return (ret);
+}
+
+#ifdef JEMALLOC_STATS
+CTL_RO_GEN(thread_allocated, ALLOCATED_GET(), uint64_t);
+CTL_RO_GEN(thread_deallocated, DEALLOCATED_GET(), uint64_t);
+#endif
+
 /******************************************************************************/
 
 #ifdef JEMALLOC_DEBUG
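
From the application side, the handler above is reached through the "thread.arena" mallctl: reading returns the calling thread's current arena index, and writing an index rebinds the thread, initializing the arena on demand. A hedged usage sketch follows, assuming a build without a name prefix so the public entry point is mallctl().

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	unsigned old_arena, new_arena = 0;
	size_t sz = sizeof(old_arena);

	/* Read the current arena index and rebind this thread to arena 0. */
	if (mallctl("thread.arena", &old_arena, &sz, &new_arena,
	    sizeof(new_arena)) != 0) {
		fprintf(stderr, "thread.arena mallctl failed\n");
		return (1);
	}
	printf("moved from arena %u to arena %u\n", old_arena, new_arena);
	return (0);
}
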
@@ -1137,8 +1206,15 @@
 /******************************************************************************/
 
 CTL_RO_GEN(opt_abort, opt_abort, bool)
+CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t)
+CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t)
+CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
+CTL_RO_GEN(opt_narenas, opt_narenas, size_t)
+CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
+CTL_RO_GEN(opt_stats_print, opt_stats_print, bool)
 #ifdef JEMALLOC_FILL
 CTL_RO_GEN(opt_junk, opt_junk, bool)
+CTL_RO_GEN(opt_zero, opt_zero, bool)
 #endif
 #ifdef JEMALLOC_SYSV
 CTL_RO_GEN(opt_sysv, opt_sysv, bool)
@@ -1146,27 +1222,22 @@
 #ifdef JEMALLOC_XMALLOC
 CTL_RO_GEN(opt_xmalloc, opt_xmalloc, bool)
 #endif
-#ifdef JEMALLOC_ZERO
-CTL_RO_GEN(opt_zero, opt_zero, bool)
-#endif
 #ifdef JEMALLOC_TCACHE
 CTL_RO_GEN(opt_tcache, opt_tcache, bool)
 CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t)
 #endif
 #ifdef JEMALLOC_PROF
 CTL_RO_GEN(opt_prof, opt_prof, bool)
+CTL_RO_GEN(opt_prof_prefix, opt_prof_prefix, const char *)
 CTL_RO_GEN(opt_prof_active, opt_prof_active, bool)
 CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t)
 CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
 CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
-CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool)
+CTL_RO_GEN(opt_prof_gdump, opt_prof_gdump, bool)
 CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)
+CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool)
+CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t)
 #endif
-CTL_RO_GEN(opt_stats_print, opt_stats_print, bool)
-CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t)
-CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t)
-CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
-CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
 #ifdef JEMALLOC_SWAP
 CTL_RO_GEN(opt_overcommit, opt_overcommit, bool)
 #endif
@@ -1249,6 +1320,44 @@
 #endif
 CTL_RO_GEN(arenas_nlruns, nlclasses, size_t)
 
+static int
+arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
+{
+	int ret;
+	unsigned arena;
+
+	WRITEONLY();
+	arena = UINT_MAX;
+	WRITE(arena, unsigned);
+	if (newp != NULL && arena >= narenas) {
+		ret = EFAULT;
+		goto RETURN;
+	} else {
+		arena_t *tarenas[narenas];
+
+		malloc_mutex_lock(&arenas_lock);
+		memcpy(tarenas, arenas, sizeof(arena_t *) * narenas);
+		malloc_mutex_unlock(&arenas_lock);
+
+		if (arena == UINT_MAX) {
+			unsigned i;
+			for (i = 0; i < narenas; i++) {
+				if (tarenas[i] != NULL)
+					arena_purge_all(tarenas[i]);
+			}
+		} else {
+			assert(arena < narenas);
+			if (tarenas[arena] != NULL)
+				arena_purge_all(tarenas[arena]);
+		}
+	}
+
+	ret = 0;
+RETURN:
+	return (ret);
+}
+
 /******************************************************************************/
 
 #ifdef JEMALLOC_PROF
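
The "arenas.purge" mallctl is write-only: issuing the call with no new value purges every initialized arena, while writing an index purges just that arena. A hedged usage sketch follows, again assuming an unprefixed build.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	unsigned arena = 0;

	/* Purge every initialized arena: write with no new value. */
	if (mallctl("arenas.purge", NULL, NULL, NULL, 0) != 0)
		fprintf(stderr, "full purge failed\n");

	/* Purge a single arena by writing its index. */
	if (mallctl("arenas.purge", NULL, NULL, &arena, sizeof(arena)) != 0)
		fprintf(stderr, "purge of arena 0 failed\n");

	return (0);
}
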
diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c
index 49962ea..a035197 100644
--- a/jemalloc/src/huge.c
+++ b/jemalloc/src/huge.c
@@ -37,7 +37,7 @@
 	if (node == NULL)
 		return (NULL);
 
-	ret = chunk_alloc(csize, &zero);
+	ret = chunk_alloc(csize, false, &zero);
 	if (ret == NULL) {
 		base_node_dealloc(node);
 		return (NULL);
@@ -69,12 +69,11 @@
 
 /* Only handles large allocations that require more than chunk alignment. */
 void *
-huge_palloc(size_t alignment, size_t size)
+huge_palloc(size_t size, size_t alignment, bool zero)
 {
 	void *ret;
 	size_t alloc_size, chunk_size, offset;
 	extent_node_t *node;
-	bool zero;
 
 	/*
 	 * This allocation requires alignment that is even larger than chunk
@@ -98,8 +97,7 @@
 	if (node == NULL)
 		return (NULL);
 
-	zero = false;
-	ret = chunk_alloc(alloc_size, &zero);
+	ret = chunk_alloc(alloc_size, false, &zero);
 	if (ret == NULL) {
 		base_node_dealloc(node);
 		return (NULL);
@@ -142,45 +140,80 @@
 	malloc_mutex_unlock(&huge_mtx);
 
 #ifdef JEMALLOC_FILL
-	if (opt_junk)
-		memset(ret, 0xa5, chunk_size);
-	else if (opt_zero)
-		memset(ret, 0, chunk_size);
+	if (zero == false) {
+		if (opt_junk)
+			memset(ret, 0xa5, chunk_size);
+		else if (opt_zero)
+			memset(ret, 0, chunk_size);
+	}
 #endif
 
 	return (ret);
 }
 
 void *
-huge_ralloc(void *ptr, size_t size, size_t oldsize)
+huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra)
 {
-	void *ret;
-	size_t copysize;
 
-	/* Avoid moving the allocation if the size class would not change. */
-	if (oldsize > arena_maxclass &&
-	    CHUNK_CEILING(size) == CHUNK_CEILING(oldsize)) {
+	/*
+	 * Avoid moving the allocation if the size class can be left the same.
+	 */
+	if (oldsize > arena_maxclass
+	    && CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size)
+	    && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) {
+		assert(CHUNK_CEILING(oldsize) == oldsize);
 #ifdef JEMALLOC_FILL
 		if (opt_junk && size < oldsize) {
-			memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize
-			    - size);
-		} else if (opt_zero && size > oldsize) {
-			memset((void *)((uintptr_t)ptr + oldsize), 0, size
-			    - oldsize);
+			memset((void *)((uintptr_t)ptr + size), 0x5a,
+			    oldsize - size);
 		}
 #endif
 		return (ptr);
 	}
 
-	/*
-	 * If we get here, then size and oldsize are different enough that we
-	 * need to use a different size class.  In that case, fall back to
-	 * allocating new space and copying.
-	 */
-	ret = huge_malloc(size, false);
-	if (ret == NULL)
-		return (NULL);
+	/* Reallocation would require a move. */
+	return (NULL);
+}
 
+void *
+huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
+    size_t alignment, bool zero)
+{
+	void *ret;
+	size_t copysize;
+
+	/* Try to avoid moving the allocation. */
+	ret = huge_ralloc_no_move(ptr, oldsize, size, extra);
+	if (ret != NULL)
+		return (ret);
+
+	/*
+	 * size and oldsize are different enough that we need to use a
+	 * different size class.  In that case, fall back to allocating new
+	 * space and copying.
+	 */
+	if (alignment != 0)
+		ret = huge_palloc(size + extra, alignment, zero);
+	else
+		ret = huge_malloc(size + extra, zero);
+
+	if (ret == NULL) {
+		if (extra == 0)
+			return (NULL);
+		/* Try again, this time without extra. */
+		if (alignment != 0)
+			ret = huge_palloc(size, alignment, zero);
+		else
+			ret = huge_malloc(size, zero);
+
+		if (ret == NULL)
+			return (NULL);
+	}
+
+	/*
+	 * Copy at most size bytes (not size+extra), since the caller has no
+	 * expectation that the extra bytes will be reliably preserved.
+	 */
 	copysize = (size < oldsize) ? size : oldsize;
 	memcpy(ret, ptr, copysize);
 	idalloc(ptr);
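
For huge objects, the in-place test reduces to two CHUNK_CEILING comparisons: the chunks already backing the object must cover the requested size, and must not exceed what size+extra would round up to. The tiny demo below evaluates that predicate with a stand-in 4 MiB chunk size; the real path additionally requires that the old allocation was huge to begin with.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define	TOY_CHUNK		((size_t)1 << 22)	/* 4 MiB stand-in. */
#define	TOY_CHUNK_CEILING(s)	(((s) + TOY_CHUNK - 1) & ~(TOY_CHUNK - 1))

static bool
toy_huge_fits_in_place(size_t oldsize, size_t size, size_t extra)
{
	return (TOY_CHUNK_CEILING(oldsize) >= TOY_CHUNK_CEILING(size) &&
	    TOY_CHUNK_CEILING(oldsize) <= TOY_CHUNK_CEILING(size + extra));
}

int
main(void)
{
	size_t oldsize = 2 * TOY_CHUNK;		/* 8 MiB object. */

	printf("shrink to 5 MiB: %d\n",
	    toy_huge_fits_in_place(oldsize, (size_t)5 << 20, 0));	/* 1 */
	printf("grow to 9 MiB:   %d\n",
	    toy_huge_fits_in_place(oldsize, (size_t)9 << 20, 0));	/* 0 */
	return (0);
}
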
diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c
index b36590d..2aebc51 100644
--- a/jemalloc/src/jemalloc.c
+++ b/jemalloc/src/jemalloc.c
@@ -1,85 +1,3 @@
-/*-
- * This allocator implementation is designed to provide scalable performance
- * for multi-threaded programs on multi-processor systems.  The following
- * features are included for this purpose:
- *
- *   + Multiple arenas are used if there are multiple CPUs, which reduces lock
- *     contention and cache sloshing.
- *
- *   + Thread-specific caching is used if there are multiple threads, which
- *     reduces the amount of locking.
- *
- *   + Cache line sharing between arenas is avoided for internal data
- *     structures.
- *
- *   + Memory is managed in chunks and runs (chunks can be split into runs),
- *     rather than as individual pages.  This provides a constant-time
- *     mechanism for associating allocations with particular arenas.
- *
- * Allocation requests are rounded up to the nearest size class, and no record
- * of the original request size is maintained.  Allocations are broken into
- * categories according to size class.  Assuming 1 MiB chunks, 4 KiB pages and
- * a 16 byte quantum on a 32-bit system, the size classes in each category are
- * as follows:
- *
- *   |========================================|
- *   | Category | Subcategory      |     Size |
- *   |========================================|
- *   | Small    | Tiny             |        2 |
- *   |          |                  |        4 |
- *   |          |                  |        8 |
- *   |          |------------------+----------|
- *   |          | Quantum-spaced   |       16 |
- *   |          |                  |       32 |
- *   |          |                  |       48 |
- *   |          |                  |      ... |
- *   |          |                  |       96 |
- *   |          |                  |      112 |
- *   |          |                  |      128 |
- *   |          |------------------+----------|
- *   |          | Cacheline-spaced |      192 |
- *   |          |                  |      256 |
- *   |          |                  |      320 |
- *   |          |                  |      384 |
- *   |          |                  |      448 |
- *   |          |                  |      512 |
- *   |          |------------------+----------|
- *   |          | Sub-page         |      760 |
- *   |          |                  |     1024 |
- *   |          |                  |     1280 |
- *   |          |                  |      ... |
- *   |          |                  |     3328 |
- *   |          |                  |     3584 |
- *   |          |                  |     3840 |
- *   |========================================|
- *   | Large                       |    4 KiB |
- *   |                             |    8 KiB |
- *   |                             |   12 KiB |
- *   |                             |      ... |
- *   |                             | 1012 KiB |
- *   |                             | 1016 KiB |
- *   |                             | 1020 KiB |
- *   |========================================|
- *   | Huge                        |    1 MiB |
- *   |                             |    2 MiB |
- *   |                             |    3 MiB |
- *   |                             |      ... |
- *   |========================================|
- *
- * Different mechanisms are used accoding to category:
- *
- *   Small: Each size class is segregated into its own set of runs.  Each run
- *          maintains a bitmap of which regions are free/allocated.
- *
- *   Large : Each allocation is backed by a dedicated run.  Metadata are stored
- *           in the associated arena chunk header maps.
- *
- *   Huge : Each allocation is backed by a dedicated contiguous set of chunks.
- *          Metadata are stored in a separate red-black tree.
- *
- *******************************************************************************
- */
-
 #define	JEMALLOC_C_
 #include "jemalloc/internal/jemalloc_internal.h"
 
@@ -89,22 +7,30 @@
 malloc_mutex_t		arenas_lock;
 arena_t			**arenas;
 unsigned		narenas;
-#ifndef NO_TLS
 static unsigned		next_arena;
-#endif
 
 #ifndef NO_TLS
-__thread arena_t	*arenas_map JEMALLOC_ATTR(tls_model("initial-exec"));
+__thread arena_t	*arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
+#else
+pthread_key_t		arenas_tsd;
+#endif
+
+#ifdef JEMALLOC_STATS
+#  ifndef NO_TLS
+__thread thread_allocated_t	thread_allocated_tls;
+#  else
+pthread_key_t		thread_allocated_tsd;
+#  endif
 #endif
 
 /* Set to true once the allocator has been initialized. */
-static bool malloc_initialized = false;
+static bool		malloc_initialized = false;
 
 /* Used to let the initializing thread recursively allocate. */
-static pthread_t malloc_initializer = (unsigned long)0;
+static pthread_t	malloc_initializer = (unsigned long)0;
 
 /* Used to avoid initialization races. */
-static malloc_mutex_t init_lock = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP;
+static malloc_mutex_t	init_lock = MALLOC_MUTEX_INITIALIZER;
 
 #ifdef DYNAMIC_PAGE_SHIFT
 size_t		pagesize;
@@ -115,8 +41,7 @@
 unsigned	ncpus;
 
 /* Runtime configuration options. */
-const char	*JEMALLOC_P(malloc_options)
-    JEMALLOC_ATTR(visibility("default"));
+const char	*JEMALLOC_P(malloc_conf) JEMALLOC_ATTR(visibility("default"));
 #ifdef JEMALLOC_DEBUG
 bool	opt_abort = true;
 #  ifdef JEMALLOC_FILL
@@ -137,7 +62,7 @@
 #ifdef JEMALLOC_FILL
 bool	opt_zero = false;
 #endif
-static int	opt_narenas_lshift = 0;
+size_t	opt_narenas = 0;
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
@@ -145,9 +70,15 @@
 static void	wrtmessage(void *cbopaque, const char *s);
 static void	stats_print_atexit(void);
 static unsigned	malloc_ncpus(void);
+#if (defined(JEMALLOC_STATS) && defined(NO_TLS))
+static void	thread_allocated_cleanup(void *arg);
+#endif
+static bool	malloc_conf_next(char const **opts_p, char const **k_p,
+    size_t *klen_p, char const **v_p, size_t *vlen_p);
+static void	malloc_conf_error(const char *msg, const char *k, size_t klen,
+    const char *v, size_t vlen);
+static void	malloc_conf_init(void);
 static bool	malloc_init_hard(void);
-static void	jemalloc_prefork(void);
-static void	jemalloc_postfork(void);
 
 /******************************************************************************/
 /* malloc_message() setup. */
@@ -160,8 +91,14 @@
 void
 wrtmessage(void *cbopaque, const char *s)
 {
-
-	write(STDERR_FILENO, s, strlen(s));
+#ifdef JEMALLOC_CC_SILENCE
+	int result =
+#endif
+	    write(STDERR_FILENO, s, strlen(s));
+#ifdef JEMALLOC_CC_SILENCE
+	if (result < 0)
+		result = errno;
+#endif
 }
 
 void	(*JEMALLOC_P(malloc_message))(void *, const char *s)
@@ -179,8 +116,8 @@
 	arena_t *ret;
 
 	/* Allocate enough space for trailing bins. */
-	ret = (arena_t *)base_alloc(sizeof(arena_t)
-	    + (sizeof(arena_bin_t) * (nbins - 1)));
+	ret = (arena_t *)base_alloc(offsetof(arena_t, bins)
+	    + (sizeof(arena_bin_t) * nbins));
 	if (ret != NULL && arena_new(ret, ind) == false) {
 		arenas[ind] = ret;
 		return (ret);
@@ -200,7 +137,6 @@
 	return (arenas[0]);
 }
 
-#ifndef NO_TLS
 /*
  * Choose an arena based on a per-thread value (slow-path code only, called
  * only by choose_arena()).
@@ -219,11 +155,29 @@
 	} else
 		ret = arenas[0];
 
-	arenas_map = ret;
+	ARENA_SET(ret);
 
 	return (ret);
 }
+
+/*
+ * glibc provides a non-standard strerror_r() when _GNU_SOURCE is defined, so
+ * provide a wrapper.
+ */
+int
+buferror(int errnum, char *buf, size_t buflen)
+{
+#ifdef _GNU_SOURCE
+	char *b = strerror_r(errnum, buf, buflen);
+	if (b != buf) {
+		strncpy(buf, b, buflen);
+		buf[buflen-1] = '\0';
+	}
+	return (0);
+#else
+	return (strerror_r(errnum, buf, buflen));
 #endif
+}
 
 static void
 stats_print_atexit(void)
@@ -283,6 +237,17 @@
 	return (ret);
 }
 
+#if (defined(JEMALLOC_STATS) && defined(NO_TLS))
+static void
+thread_allocated_cleanup(void *arg)
+{
+	uint64_t *allocated = (uint64_t *)arg;
+
+	if (allocated != NULL)
+		idalloc(allocated);
+}
+#endif
+
 /*
  * FreeBSD's pthreads implementation calls malloc(3), so the malloc
  * implementation has to take pains to avoid infinite recursion during
@@ -299,12 +264,323 @@
 }
 
 static bool
-malloc_init_hard(void)
+malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p,
+    char const **v_p, size_t *vlen_p)
+{
+	bool accept;
+	const char *opts = *opts_p;
+
+	*k_p = opts;
+
+	for (accept = false; accept == false;) {
+		switch (*opts) {
+			case 'A': case 'B': case 'C': case 'D': case 'E':
+			case 'F': case 'G': case 'H': case 'I': case 'J':
+			case 'K': case 'L': case 'M': case 'N': case 'O':
+			case 'P': case 'Q': case 'R': case 'S': case 'T':
+			case 'U': case 'V': case 'W': case 'X': case 'Y':
+			case 'Z':
+			case 'a': case 'b': case 'c': case 'd': case 'e':
+			case 'f': case 'g': case 'h': case 'i': case 'j':
+			case 'k': case 'l': case 'm': case 'n': case 'o':
+			case 'p': case 'q': case 'r': case 's': case 't':
+			case 'u': case 'v': case 'w': case 'x': case 'y':
+			case 'z':
+			case '0': case '1': case '2': case '3': case '4':
+			case '5': case '6': case '7': case '8': case '9':
+			case '_':
+				opts++;
+				break;
+			case ':':
+				opts++;
+				*klen_p = (uintptr_t)opts - 1 - (uintptr_t)*k_p;
+				*v_p = opts;
+				accept = true;
+				break;
+			case '\0':
+				if (opts != *opts_p) {
+					malloc_write("<jemalloc>: Conf string "
+					    "ends with key\n");
+				}
+				return (true);
+			default:
+				malloc_write("<jemalloc>: Malformed conf "
+				    "string\n");
+				return (true);
+		}
+	}
+
+	for (accept = false; accept == false;) {
+		switch (*opts) {
+			case ',':
+				opts++;
+				/*
+				 * Look ahead one character here, because the
+				 * next time this function is called, it will
+				 * assume that end of input has been cleanly
+				 * reached if no input remains, but we have
+				 * optimistically already consumed the comma if
+				 * one exists.
+				 */
+				if (*opts == '\0') {
+					malloc_write("<jemalloc>: Conf string "
+					    "ends with comma\n");
+				}
+				*vlen_p = (uintptr_t)opts - 1 - (uintptr_t)*v_p;
+				accept = true;
+				break;
+			case '\0':
+				*vlen_p = (uintptr_t)opts - (uintptr_t)*v_p;
+				accept = true;
+				break;
+			default:
+				opts++;
+				break;
+		}
+	}
+
+	*opts_p = opts;
+	return (false);
+}
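
malloc_conf_next() walks the option string one key:value pair at a time, validating key characters and warning about trailing keys or commas. The standalone tokenizer below reproduces only the splitting rules, not the validation, so it is a simplified stand-in rather than the real parser.

#include <stdio.h>
#include <string.h>

static void
toy_conf_parse(const char *opts)
{
	const char *k = opts;

	while (*k != '\0') {
		const char *colon = strchr(k, ':');
		const char *v, *end;

		if (colon == NULL) {
			fprintf(stderr, "conf string ends with key\n");
			return;
		}
		v = colon + 1;
		end = strchr(v, ',');
		if (end == NULL)
			end = v + strlen(v);
		printf("key=%.*s value=%.*s\n", (int)(colon - k), k,
		    (int)(end - v), v);
		k = (*end == ',') ? end + 1 : end;
	}
}

int
main(void)
{

	toy_conf_parse("abort:true,lg_chunk:22,stats_print:true");
	return (0);
}
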
+
+static void
+malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v,
+    size_t vlen)
+{
+	char buf[PATH_MAX + 1];
+
+	malloc_write("<jemalloc>: ");
+	malloc_write(msg);
+	malloc_write(": ");
+	memcpy(buf, k, klen);
+	memcpy(&buf[klen], ":", 1);
+	memcpy(&buf[klen+1], v, vlen);
+	buf[klen+1+vlen] = '\0';
+	malloc_write(buf);
+	malloc_write("\n");
+}
+
+static void
+malloc_conf_init(void)
 {
 	unsigned i;
-	int linklen;
 	char buf[PATH_MAX + 1];
-	const char *opts;
+	const char *opts, *k, *v;
+	size_t klen, vlen;
+
+	for (i = 0; i < 3; i++) {
+		/* Get runtime configuration. */
+		switch (i) {
+		case 0:
+			if (JEMALLOC_P(malloc_conf) != NULL) {
+				/*
+				 * Use options that were compiled into the
+				 * program.
+				 */
+				opts = JEMALLOC_P(malloc_conf);
+			} else {
+				/* No configuration specified. */
+				buf[0] = '\0';
+				opts = buf;
+			}
+			break;
+		case 1: {
+			int linklen;
+			const char *linkname =
+#ifdef JEMALLOC_PREFIX
+			    "/etc/"JEMALLOC_PREFIX"malloc.conf"
+#else
+			    "/etc/malloc.conf"
+#endif
+			    ;
+
+			if ((linklen = readlink(linkname, buf,
+			    sizeof(buf) - 1)) != -1) {
+				/*
+				 * Use the contents of the "/etc/malloc.conf"
+				 * symbolic link's name.
+				 */
+				buf[linklen] = '\0';
+				opts = buf;
+			} else {
+				/* No configuration specified. */
+				buf[0] = '\0';
+				opts = buf;
+			}
+			break;
+		}
+		case 2: {
+			const char *envname =
+#ifdef JEMALLOC_PREFIX
+			    JEMALLOC_CPREFIX"MALLOC_CONF"
+#else
+			    "MALLOC_CONF"
+#endif
+			    ;
+
+			if ((opts = getenv(envname)) != NULL) {
+				/*
+				 * Do nothing; opts is already initialized to
+				 * the value of the MALLOC_CONF
+				 * environment variable.
+				 */
+			} else {
+				/* No configuration specified. */
+				buf[0] = '\0';
+				opts = buf;
+			}
+			break;
+		}
+		default:
+			/* NOTREACHED */
+			assert(false);
+			buf[0] = '\0';
+			opts = buf;
+		}
+
+		while (*opts != '\0' && malloc_conf_next(&opts, &k, &klen, &v,
+		    &vlen) == false) {
+#define	CONF_HANDLE_BOOL(n)						\
+			if (sizeof(#n)-1 == klen && strncmp(#n, k,	\
+			    klen) == 0) {				\
+				if (strncmp("true", v, vlen) == 0 &&	\
+				    vlen == sizeof("true")-1)		\
+					opt_##n = true;			\
+				else if (strncmp("false", v, vlen) ==	\
+				    0 && vlen == sizeof("false")-1)	\
+					opt_##n = false;		\
+				else {					\
+					malloc_conf_error(		\
+					    "Invalid conf value",	\
+					    k, klen, v, vlen);		\
+				}					\
+				continue;				\
+			}
+#define	CONF_HANDLE_SIZE_T(n, min, max)					\
+			if (sizeof(#n)-1 == klen && strncmp(#n, k,	\
+			    klen) == 0) {				\
+				unsigned long ul;			\
+				char *end;				\
+									\
+				errno = 0;				\
+				ul = strtoul(v, &end, 0);		\
+				if (errno != 0 || (uintptr_t)end -	\
+				    (uintptr_t)v != vlen) {		\
+					malloc_conf_error(		\
+					    "Invalid conf value",	\
+					    k, klen, v, vlen);		\
+				} else if (ul < min || ul > max) {	\
+					malloc_conf_error(		\
+					    "Out-of-range conf value",	\
+					    k, klen, v, vlen);		\
+				} else					\
+					opt_##n = ul;			\
+				continue;				\
+			}
+#define	CONF_HANDLE_SSIZE_T(n, min, max)				\
+			if (sizeof(#n)-1 == klen && strncmp(#n, k,	\
+			    klen) == 0) {				\
+				long l;					\
+				char *end;				\
+									\
+				errno = 0;				\
+				l = strtol(v, &end, 0);			\
+				if (errno != 0 || (uintptr_t)end -	\
+				    (uintptr_t)v != vlen) {		\
+					malloc_conf_error(		\
+					    "Invalid conf value",	\
+					    k, klen, v, vlen);		\
+				} else if (l < (ssize_t)min || l >	\
+				    (ssize_t)max) {			\
+					malloc_conf_error(		\
+					    "Out-of-range conf value",	\
+					    k, klen, v, vlen);		\
+				} else					\
+					opt_##n = l;			\
+				continue;				\
+			}
+#define	CONF_HANDLE_CHAR_P(n, d)					\
+			if (sizeof(#n)-1 == klen && strncmp(#n, k,	\
+			    klen) == 0) {				\
+				size_t cpylen = (vlen <=		\
+				    sizeof(opt_##n)-1) ? vlen :		\
+				    sizeof(opt_##n)-1;			\
+				strncpy(opt_##n, v, cpylen);		\
+				opt_##n[cpylen] = '\0';			\
+				continue;				\
+			}
+
+			CONF_HANDLE_BOOL(abort)
+			CONF_HANDLE_SIZE_T(lg_qspace_max, LG_QUANTUM,
+			    PAGE_SHIFT-1)
+			CONF_HANDLE_SIZE_T(lg_cspace_max, LG_QUANTUM,
+			    PAGE_SHIFT-1)
+			/*
+			 * Chunks always require at least one header page,
+			 * plus one data page.
+			 */
+			CONF_HANDLE_SIZE_T(lg_chunk, PAGE_SHIFT+1,
+			    (sizeof(size_t) << 3) - 1)
+			CONF_HANDLE_SIZE_T(narenas, 1, SIZE_T_MAX)
+			CONF_HANDLE_SSIZE_T(lg_dirty_mult, -1,
+			    (sizeof(size_t) << 3) - 1)
+			CONF_HANDLE_BOOL(stats_print)
+#ifdef JEMALLOC_FILL
+			CONF_HANDLE_BOOL(junk)
+			CONF_HANDLE_BOOL(zero)
+#endif
+#ifdef JEMALLOC_SYSV
+			CONF_HANDLE_BOOL(sysv)
+#endif
+#ifdef JEMALLOC_XMALLOC
+			CONF_HANDLE_BOOL(xmalloc)
+#endif
+#ifdef JEMALLOC_TCACHE
+			CONF_HANDLE_BOOL(tcache)
+			CONF_HANDLE_SSIZE_T(lg_tcache_gc_sweep, -1,
+			    (sizeof(size_t) << 3) - 1)
+			CONF_HANDLE_SSIZE_T(lg_tcache_max, -1,
+			    (sizeof(size_t) << 3) - 1)
+#endif
+#ifdef JEMALLOC_PROF
+			CONF_HANDLE_BOOL(prof)
+			CONF_HANDLE_CHAR_P(prof_prefix, "jeprof")
+			CONF_HANDLE_SIZE_T(lg_prof_bt_max, 0, LG_PROF_BT_MAX)
+			CONF_HANDLE_BOOL(prof_active)
+			CONF_HANDLE_SSIZE_T(lg_prof_sample, 0,
+			    (sizeof(uint64_t) << 3) - 1)
+			CONF_HANDLE_BOOL(prof_accum)
+			CONF_HANDLE_SSIZE_T(lg_prof_tcmax, -1,
+			    (sizeof(size_t) << 3) - 1)
+			CONF_HANDLE_SSIZE_T(lg_prof_interval, -1,
+			    (sizeof(uint64_t) << 3) - 1)
+			CONF_HANDLE_BOOL(prof_gdump)
+			CONF_HANDLE_BOOL(prof_leak)
+#endif
+#ifdef JEMALLOC_SWAP
+			CONF_HANDLE_BOOL(overcommit)
+#endif
+			malloc_conf_error("Invalid conf pair", k, klen, v,
+			    vlen);
+#undef CONF_HANDLE_BOOL
+#undef CONF_HANDLE_SIZE_T
+#undef CONF_HANDLE_SSIZE_T
+#undef CONF_HANDLE_CHAR_P
+		}
+
+		/* Validate configuration of options that are inter-related. */
+		if (opt_lg_qspace_max+1 >= opt_lg_cspace_max) {
+			malloc_write("<jemalloc>: Invalid lg_[qc]space_max "
+			    "relationship; restoring defaults\n");
+			opt_lg_qspace_max = LG_QSPACE_MAX_DEFAULT;
+			opt_lg_cspace_max = LG_CSPACE_MAX_DEFAULT;
+		}
+	}
+}
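
The three passes above give the compiled-in malloc_conf string the lowest priority, then the /etc/malloc.conf symlink name, then the MALLOC_CONF environment variable. A hedged example of the compiled-in source follows; it assumes a build without a name prefix, in which case the public symbol is plain malloc_conf.

#include <stdlib.h>

/* Picked up by malloc_conf_init() before the first allocation. */
const char *malloc_conf = "lg_chunk:24,narenas:4,stats_print:true";

int
main(void)
{
	void *p = malloc(1);

	free(p);
	return (0);
}

At run time the same string could instead be placed in the MALLOC_CONF environment variable (or, per case 1 above, in the name of an /etc/malloc.conf symbolic link), and that setting takes precedence over the compiled-in value.
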
+
+static bool
+malloc_init_hard(void)
+{
 	arena_t *init_arenas[1];
 
 	malloc_mutex_lock(&init_lock);
@@ -347,287 +623,11 @@
 	}
 #endif
 
-	for (i = 0; i < 3; i++) {
-		unsigned j;
-
-		/* Get runtime configuration. */
-		switch (i) {
-		case 0:
-			if ((linklen = readlink("/etc/jemalloc.conf", buf,
-						sizeof(buf) - 1)) != -1) {
-				/*
-				 * Use the contents of the "/etc/jemalloc.conf"
-				 * symbolic link's name.
-				 */
-				buf[linklen] = '\0';
-				opts = buf;
-			} else {
-				/* No configuration specified. */
-				buf[0] = '\0';
-				opts = buf;
-			}
-			break;
-		case 1:
-			if ((opts = getenv("JEMALLOC_OPTIONS")) != NULL) {
-				/*
-				 * Do nothing; opts is already initialized to
-				 * the value of the JEMALLOC_OPTIONS
-				 * environment variable.
-				 */
-			} else {
-				/* No configuration specified. */
-				buf[0] = '\0';
-				opts = buf;
-			}
-			break;
-		case 2:
-			if (JEMALLOC_P(malloc_options) != NULL) {
-				/*
-				 * Use options that were compiled into the
-				 * program.
-				 */
-				opts = JEMALLOC_P(malloc_options);
-			} else {
-				/* No configuration specified. */
-				buf[0] = '\0';
-				opts = buf;
-			}
-			break;
-		default:
-			/* NOTREACHED */
-			assert(false);
-			buf[0] = '\0';
-			opts = buf;
-		}
-
-		for (j = 0; opts[j] != '\0'; j++) {
-			unsigned k, nreps;
-			bool nseen;
-
-			/* Parse repetition count, if any. */
-			for (nreps = 0, nseen = false;; j++, nseen = true) {
-				switch (opts[j]) {
-					case '0': case '1': case '2': case '3':
-					case '4': case '5': case '6': case '7':
-					case '8': case '9':
-						nreps *= 10;
-						nreps += opts[j] - '0';
-						break;
-					default:
-						goto MALLOC_OUT;
-				}
-			}
-MALLOC_OUT:
-			if (nseen == false)
-				nreps = 1;
-
-			for (k = 0; k < nreps; k++) {
-				switch (opts[j]) {
-				case 'a':
-					opt_abort = false;
-					break;
-				case 'A':
-					opt_abort = true;
-					break;
 #ifdef JEMALLOC_PROF
-				case 'b':
-					if (opt_lg_prof_bt_max > 0)
-						opt_lg_prof_bt_max--;
-					break;
-				case 'B':
-					if (opt_lg_prof_bt_max < LG_PROF_BT_MAX)
-						opt_lg_prof_bt_max++;
-					break;
+	prof_boot0();
 #endif
-				case 'c':
-					if (opt_lg_cspace_max - 1 >
-					    opt_lg_qspace_max &&
-					    opt_lg_cspace_max >
-					    LG_CACHELINE)
-						opt_lg_cspace_max--;
-					break;
-				case 'C':
-					if (opt_lg_cspace_max < PAGE_SHIFT
-					    - 1)
-						opt_lg_cspace_max++;
-					break;
-				case 'd':
-					if (opt_lg_dirty_mult + 1 <
-					    (sizeof(size_t) << 3))
-						opt_lg_dirty_mult++;
-					break;
-				case 'D':
-					if (opt_lg_dirty_mult >= 0)
-						opt_lg_dirty_mult--;
-					break;
-#ifdef JEMALLOC_PROF
-				case 'e':
-					opt_prof_active = false;
-					break;
-				case 'E':
-					opt_prof_active = true;
-					break;
-				case 'f':
-					opt_prof = false;
-					break;
-				case 'F':
-					opt_prof = true;
-					break;
-#endif
-#ifdef JEMALLOC_TCACHE
-				case 'g':
-					if (opt_lg_tcache_gc_sweep >= 0)
-						opt_lg_tcache_gc_sweep--;
-					break;
-				case 'G':
-					if (opt_lg_tcache_gc_sweep + 1 <
-					    (sizeof(size_t) << 3))
-						opt_lg_tcache_gc_sweep++;
-					break;
-				case 'h':
-					opt_tcache = false;
-					break;
-				case 'H':
-					opt_tcache = true;
-					break;
-#endif
-#ifdef JEMALLOC_PROF
-				case 'i':
-					if (opt_lg_prof_interval >= 0)
-						opt_lg_prof_interval--;
-					break;
-				case 'I':
-					if (opt_lg_prof_interval + 1 <
-					    (sizeof(uint64_t) << 3))
-						opt_lg_prof_interval++;
-					break;
-#endif
-#ifdef JEMALLOC_FILL
-				case 'j':
-					opt_junk = false;
-					break;
-				case 'J':
-					opt_junk = true;
-					break;
-#endif
-				case 'k':
-					/*
-					 * Chunks always require at least one
-					 * header page, plus one data page.
-					 */
-					if ((1U << (opt_lg_chunk - 1)) >=
-					    (2U << PAGE_SHIFT))
-						opt_lg_chunk--;
-					break;
-				case 'K':
-					if (opt_lg_chunk + 1 <
-					    (sizeof(size_t) << 3))
-						opt_lg_chunk++;
-					break;
-#ifdef JEMALLOC_PROF
-				case 'l':
-					opt_prof_leak = false;
-					break;
-				case 'L':
-					opt_prof_leak = true;
-					break;
-#endif
-#ifdef JEMALLOC_TCACHE
-				case 'm':
-					if (opt_lg_tcache_maxclass >= 0)
-						opt_lg_tcache_maxclass--;
-					break;
-				case 'M':
-					if (opt_lg_tcache_maxclass + 1 <
-					    (sizeof(size_t) << 3))
-						opt_lg_tcache_maxclass++;
-					break;
-#endif
-				case 'n':
-					opt_narenas_lshift--;
-					break;
-				case 'N':
-					opt_narenas_lshift++;
-					break;
-#ifdef JEMALLOC_SWAP
-				case 'o':
-					opt_overcommit = false;
-					break;
-				case 'O':
-					opt_overcommit = true;
-					break;
-#endif
-				case 'p':
-					opt_stats_print = false;
-					break;
-				case 'P':
-					opt_stats_print = true;
-					break;
-				case 'q':
-					if (opt_lg_qspace_max > LG_QUANTUM)
-						opt_lg_qspace_max--;
-					break;
-				case 'Q':
-					if (opt_lg_qspace_max + 1 <
-					    opt_lg_cspace_max)
-						opt_lg_qspace_max++;
-					break;
-#ifdef JEMALLOC_PROF
-				case 's':
-					if (opt_lg_prof_sample > 0)
-						opt_lg_prof_sample--;
-					break;
-				case 'S':
-					if (opt_lg_prof_sample + 1 <
-					    (sizeof(uint64_t) << 3))
-						opt_lg_prof_sample++;
-					break;
-				case 'u':
-					opt_prof_udump = false;
-					break;
-				case 'U':
-					opt_prof_udump = true;
-					break;
-#endif
-#ifdef JEMALLOC_SYSV
-				case 'v':
-					opt_sysv = false;
-					break;
-				case 'V':
-					opt_sysv = true;
-					break;
-#endif
-#ifdef JEMALLOC_XMALLOC
-				case 'x':
-					opt_xmalloc = false;
-					break;
-				case 'X':
-					opt_xmalloc = true;
-					break;
-#endif
-#ifdef JEMALLOC_FILL
-				case 'z':
-					opt_zero = false;
-					break;
-				case 'Z':
-					opt_zero = true;
-					break;
-#endif
-				default: {
-					char cbuf[2];
 
-					cbuf[0] = opts[j];
-					cbuf[1] = '\0';
-					malloc_write(
-					    "<jemalloc>: Unsupported character "
-					    "in malloc options: '");
-					malloc_write(cbuf);
-					malloc_write("'\n");
-				}
-				}
-			}
-		}
-	}
+	malloc_conf_init();
 
 	/* Register fork handlers. */
 	if (pthread_atfork(jemalloc_prefork, jemalloc_postfork,
@@ -662,7 +662,7 @@
 	}
 
 #ifdef JEMALLOC_PROF
-	prof_boot0();
+	prof_boot1();
 #endif
 
 	if (arena_boot()) {
@@ -679,6 +679,15 @@
 		return (true);
 	}
 
+#if (defined(JEMALLOC_STATS) && defined(NO_TLS))
+	/* Initialize allocation counters before any allocations can occur. */
+	if (pthread_key_create(&thread_allocated_tsd, thread_allocated_cleanup)
+	    != 0) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
+#endif
+
 	/*
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
@@ -697,19 +706,17 @@
 		return (true);
 	}
 
-#ifndef NO_TLS
 	/*
 	 * Assign the initial arena to the initial thread, in order to avoid
 	 * spurious creation of an extra arena if the application switches to
 	 * threaded mode.
 	 */
-	arenas_map = arenas[0];
-#endif
+	ARENA_SET(arenas[0]);
 
 	malloc_mutex_init(&arenas_lock);
 
 #ifdef JEMALLOC_PROF
-	if (prof_boot1()) {
+	if (prof_boot2()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
@@ -721,64 +728,40 @@
 	ncpus = malloc_ncpus();
 	malloc_mutex_lock(&init_lock);
 
-	if (ncpus > 1) {
+	if (opt_narenas == 0) {
 		/*
 		 * For SMP systems, create more than one arena per CPU by
 		 * default.
 		 */
-		opt_narenas_lshift += 2;
+		if (ncpus > 1)
+			opt_narenas = ncpus << 2;
+		else
+			opt_narenas = 1;
+	}
+	narenas = opt_narenas;
+	/*
+	 * Make sure that the arenas array can be allocated.  In practice, this
+	 * limit is enough to allow the allocator to function, but the ctl
+	 * machinery will fail to allocate memory at far lower limits.
+	 */
+	if (narenas > chunksize / sizeof(arena_t *)) {
+		char buf[UMAX2S_BUFSIZE];
+
+		narenas = chunksize / sizeof(arena_t *);
+		malloc_write("<jemalloc>: Reducing narenas to limit (");
+		malloc_write(u2s(narenas, 10, buf));
+		malloc_write(")\n");
 	}
 
-	/* Determine how many arenas to use. */
-	narenas = ncpus;
-	if (opt_narenas_lshift > 0) {
-		if ((narenas << opt_narenas_lshift) > narenas)
-			narenas <<= opt_narenas_lshift;
-		/*
-		 * Make sure not to exceed the limits of what base_alloc() can
-		 * handle.
-		 */
-		if (narenas * sizeof(arena_t *) > chunksize)
-			narenas = chunksize / sizeof(arena_t *);
-	} else if (opt_narenas_lshift < 0) {
-		if ((narenas >> -opt_narenas_lshift) < narenas)
-			narenas >>= -opt_narenas_lshift;
-		/* Make sure there is at least one arena. */
-		if (narenas == 0)
-			narenas = 1;
-	}
+	next_arena = (narenas > 0) ? 1 : 0;
 
 #ifdef NO_TLS
-	if (narenas > 1) {
-		static const unsigned primes[] = {1, 3, 5, 7, 11, 13, 17, 19,
-		    23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83,
-		    89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149,
-		    151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
-		    223, 227, 229, 233, 239, 241, 251, 257, 263};
-		unsigned nprimes, parenas;
-
-		/*
-		 * Pick a prime number of hash arenas that is more than narenas
-		 * so that direct hashing of pthread_self() pointers tends to
-		 * spread allocations evenly among the arenas.
-		 */
-		assert((narenas & 1) == 0); /* narenas must be even. */
-		nprimes = (sizeof(primes) >> LG_SIZEOF_INT);
-		parenas = primes[nprimes - 1]; /* In case not enough primes. */
-		for (i = 1; i < nprimes; i++) {
-			if (primes[i] > narenas) {
-				parenas = primes[i];
-				break;
-			}
-		}
-		narenas = parenas;
+	if (pthread_key_create(&arenas_tsd, NULL) != 0) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
 	}
 #endif
 
-#ifndef NO_TLS
-	next_arena = (narenas > 0) ? 1 : 0;
-#endif
-
 	/* Allocate and initialize arenas. */
 	arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas);
 	if (arenas == NULL) {
@@ -793,11 +776,35 @@
 	/* Copy the pointer to the one arena that was already initialized. */
 	arenas[0] = init_arenas[0];
 
+#ifdef JEMALLOC_ZONE
+	/* Register the custom zone. */
+	malloc_zone_register(create_zone());
+
+	/*
+	 * Convert the default szone to an "overlay zone" that is capable of
+	 * deallocating szone-allocated objects, but allocating new objects
+	 * from jemalloc.
+	 */
+	szone2ozone(malloc_default_zone());
+#endif
+
 	malloc_initialized = true;
 	malloc_mutex_unlock(&init_lock);
 	return (false);
 }
 
+
+#ifdef JEMALLOC_ZONE
+JEMALLOC_ATTR(constructor)
+void
+jemalloc_darwin_init(void)
+{
+
+	if (malloc_init_hard())
+		abort();
+}
+#endif
+
 /*
  * End initialization functions.
  */
@@ -812,8 +819,19 @@
 JEMALLOC_P(malloc)(size_t size)
 {
 	void *ret;
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+	size_t usize
+#  ifdef JEMALLOC_CC_SILENCE
+	    = 0
+#  endif
+	    ;
+#endif
 #ifdef JEMALLOC_PROF
-	prof_thr_cnt_t *cnt;
+	prof_thr_cnt_t *cnt
+#  ifdef JEMALLOC_CC_SILENCE
+	    = NULL
+#  endif
+	    ;
 #endif
 
 	if (malloc_init()) {
@@ -843,20 +861,26 @@
 
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
-		if ((cnt = prof_alloc_prep(size)) == NULL) {
+		usize = s2u(size);
+		if ((cnt = prof_alloc_prep(usize)) == NULL) {
 			ret = NULL;
 			goto OOM;
 		}
-		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <=
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
 		    small_maxclass) {
 			ret = imalloc(small_maxclass+1);
 			if (ret != NULL)
-				arena_prof_promoted(ret, size);
+				arena_prof_promoted(ret, usize);
 		} else
 			ret = imalloc(size);
 	} else
 #endif
+	{
+#ifdef JEMALLOC_STATS
+		usize = s2u(size);
+#endif
 		ret = imalloc(size);
+	}
 
 OOM:
 	if (ret == NULL) {
@@ -875,7 +899,13 @@
 #endif
 #ifdef JEMALLOC_PROF
 	if (opt_prof && ret != NULL)
-		prof_malloc(ret, cnt);
+		prof_malloc(ret, usize, cnt);
+#endif
+#ifdef JEMALLOC_STATS
+	if (ret != NULL) {
+		assert(usize == isalloc(ret));
+		ALLOCATED_ADD(usize, 0);
+	}
 #endif
 	return (ret);
 }
@@ -887,8 +917,19 @@
 {
 	int ret;
 	void *result;
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+	size_t usize
+#  ifdef JEMALLOC_CC_SILENCE
+	    = 0
+#  endif
+	    ;
+#endif
 #ifdef JEMALLOC_PROF
-	prof_thr_cnt_t *cnt;
+	prof_thr_cnt_t *cnt
+#  ifdef JEMALLOC_CC_SILENCE
+	    = NULL
+#  endif
+	    ;
 #endif
 
 	if (malloc_init())
@@ -934,24 +975,32 @@
 
 #ifdef JEMALLOC_PROF
 		if (opt_prof) {
-			if ((cnt = prof_alloc_prep(size)) == NULL) {
+			usize = sa2u(size, alignment, NULL);
+			if ((cnt = prof_alloc_prep(usize)) == NULL) {
 				result = NULL;
 				ret = EINVAL;
 			} else {
 				if (prof_promote && (uintptr_t)cnt !=
-				    (uintptr_t)1U && size <= small_maxclass) {
-					result = ipalloc(alignment,
-					    small_maxclass+1);
+				    (uintptr_t)1U && usize <= small_maxclass) {
+					result = ipalloc(small_maxclass+1,
+					    alignment, false);
 					if (result != NULL) {
 						arena_prof_promoted(result,
-						    size);
+						    usize);
 					}
-				} else
-					result = ipalloc(alignment, size);
+				} else {
+					result = ipalloc(size, alignment,
+					    false);
+				}
 			}
 		} else
 #endif
-			result = ipalloc(alignment, size);
+		{
+#ifdef JEMALLOC_STATS
+			usize = sa2u(size, alignment, NULL);
+#endif
+			result = ipalloc(size, alignment, false);
+		}
 	}
 
 	if (result == NULL) {
@@ -970,9 +1019,15 @@
 	ret = 0;
 
 RETURN:
+#ifdef JEMALLOC_STATS
+	if (result != NULL) {
+		assert(usize == isalloc(result));
+		ALLOCATED_ADD(usize, 0);
+	}
+#endif
 #ifdef JEMALLOC_PROF
 	if (opt_prof && result != NULL)
-		prof_malloc(result, cnt);
+		prof_malloc(result, usize, cnt);
 #endif
 	return (ret);
 }
@@ -984,8 +1039,19 @@
 {
 	void *ret;
 	size_t num_size;
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+	size_t usize
+#  ifdef JEMALLOC_CC_SILENCE
+	    = 0
+#  endif
+	    ;
+#endif
 #ifdef JEMALLOC_PROF
-	prof_thr_cnt_t *cnt;
+	prof_thr_cnt_t *cnt
+#  ifdef JEMALLOC_CC_SILENCE
+	    = NULL
+#  endif
+	    ;
 #endif
 
 	if (malloc_init()) {
@@ -1020,20 +1086,26 @@
 
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
-		if ((cnt = prof_alloc_prep(num_size)) == NULL) {
+		usize = s2u(num_size);
+		if ((cnt = prof_alloc_prep(usize)) == NULL) {
 			ret = NULL;
 			goto RETURN;
 		}
-		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && num_size
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize
 		    <= small_maxclass) {
 			ret = icalloc(small_maxclass+1);
 			if (ret != NULL)
-				arena_prof_promoted(ret, num_size);
+				arena_prof_promoted(ret, usize);
 		} else
 			ret = icalloc(num_size);
 	} else
 #endif
+	{
+#ifdef JEMALLOC_STATS
+		usize = s2u(num_size);
+#endif
 		ret = icalloc(num_size);
+	}
 
 RETURN:
 	if (ret == NULL) {
@@ -1049,7 +1121,13 @@
 
 #ifdef JEMALLOC_PROF
 	if (opt_prof && ret != NULL)
-		prof_malloc(ret, cnt);
+		prof_malloc(ret, usize, cnt);
+#endif
+#ifdef JEMALLOC_STATS
+	if (ret != NULL) {
+		assert(usize == isalloc(ret));
+		ALLOCATED_ADD(usize, 0);
+	}
 #endif
 	return (ret);
 }
@@ -1059,10 +1137,25 @@
 JEMALLOC_P(realloc)(void *ptr, size_t size)
 {
 	void *ret;
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+	size_t usize
+#  ifdef JEMALLOC_CC_SILENCE
+	    = 0
+#  endif
+	    ;
+	size_t old_size = 0;
+#endif
 #ifdef JEMALLOC_PROF
-	size_t old_size;
-	prof_thr_cnt_t *cnt;
-	prof_ctx_t *old_ctx;
+	prof_thr_cnt_t *cnt
+#  ifdef JEMALLOC_CC_SILENCE
+	    = NULL
+#  endif
+	    ;
+	prof_ctx_t *old_ctx
+#  ifdef JEMALLOC_CC_SILENCE
+	    = NULL
+#  endif
+	    ;
 #endif
 
 	if (size == 0) {
@@ -1073,9 +1166,11 @@
 #ifdef JEMALLOC_SYSV
 		else {
 			if (ptr != NULL) {
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+				old_size = isalloc(ptr);
+#endif
 #ifdef JEMALLOC_PROF
 				if (opt_prof) {
-					old_size = isalloc(ptr);
 					old_ctx = prof_ctx_get(ptr);
 					cnt = NULL;
 				}
@@ -1084,7 +1179,6 @@
 			}
 #ifdef JEMALLOC_PROF
 			else if (opt_prof) {
-				old_size = 0;
 				old_ctx = NULL;
 				cnt = NULL;
 			}
@@ -1099,24 +1193,33 @@
 		assert(malloc_initialized || malloc_initializer ==
 		    pthread_self());
 
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+		old_size = isalloc(ptr);
+#endif
 #ifdef JEMALLOC_PROF
 		if (opt_prof) {
-			old_size = isalloc(ptr);
+			usize = s2u(size);
 			old_ctx = prof_ctx_get(ptr);
-			if ((cnt = prof_alloc_prep(size)) == NULL) {
+			if ((cnt = prof_alloc_prep(usize)) == NULL) {
 				ret = NULL;
 				goto OOM;
 			}
 			if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U &&
-			    size <= small_maxclass) {
-				ret = iralloc(ptr, small_maxclass+1);
+			    usize <= small_maxclass) {
+				ret = iralloc(ptr, small_maxclass+1, 0, 0,
+				    false, false);
 				if (ret != NULL)
-					arena_prof_promoted(ret, size);
+					arena_prof_promoted(ret, usize);
 			} else
-				ret = iralloc(ptr, size);
+				ret = iralloc(ptr, size, 0, 0, false, false);
 		} else
 #endif
-			ret = iralloc(ptr, size);
+		{
+#ifdef JEMALLOC_STATS
+			usize = s2u(size);
+#endif
+			ret = iralloc(ptr, size, 0, 0, false, false);
+		}
 
 #ifdef JEMALLOC_PROF
 OOM:
@@ -1133,10 +1236,8 @@
 		}
 	} else {
 #ifdef JEMALLOC_PROF
-		if (opt_prof) {
-			old_size = 0;
+		if (opt_prof)
 			old_ctx = NULL;
-		}
 #endif
 		if (malloc_init()) {
 #ifdef JEMALLOC_PROF
@@ -1147,23 +1248,29 @@
 		} else {
 #ifdef JEMALLOC_PROF
 			if (opt_prof) {
-				if ((cnt = prof_alloc_prep(size)) == NULL)
+				usize = s2u(size);
+				if ((cnt = prof_alloc_prep(usize)) == NULL)
 					ret = NULL;
 				else {
 					if (prof_promote && (uintptr_t)cnt !=
-					    (uintptr_t)1U && size <=
+					    (uintptr_t)1U && usize <=
 					    small_maxclass) {
 						ret = imalloc(small_maxclass+1);
 						if (ret != NULL) {
 							arena_prof_promoted(ret,
-							    size);
+							    usize);
 						}
 					} else
 						ret = imalloc(size);
 				}
 			} else
 #endif
+			{
+#ifdef JEMALLOC_STATS
+				usize = s2u(size);
+#endif
 				ret = imalloc(size);
+			}
 		}
 
 		if (ret == NULL) {
@@ -1183,7 +1290,13 @@
 #endif
 #ifdef JEMALLOC_PROF
 	if (opt_prof)
-		prof_realloc(ret, cnt, ptr, old_size, old_ctx);
+		prof_realloc(ret, usize, cnt, old_size, old_ctx);
+#endif
+#ifdef JEMALLOC_STATS
+	if (ret != NULL) {
+		assert(usize == isalloc(ret));
+		ALLOCATED_ADD(usize, old_size);
+	}
 #endif
 	return (ret);
 }
@@ -1194,12 +1307,26 @@
 {
 
 	if (ptr != NULL) {
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+		size_t usize;
+#endif
+
 		assert(malloc_initialized || malloc_initializer ==
 		    pthread_self());
 
+#ifdef JEMALLOC_STATS
+		usize = isalloc(ptr);
+#endif
 #ifdef JEMALLOC_PROF
-		if (opt_prof)
-			prof_free(ptr);
+		if (opt_prof) {
+#  ifndef JEMALLOC_STATS
+			usize = isalloc(ptr);
+#  endif
+			prof_free(ptr, usize);
+		}
+#endif
+#ifdef JEMALLOC_STATS
+		ALLOCATED_ADD(0, usize);
 #endif
 		idalloc(ptr);
 	}
@@ -1210,6 +1337,57 @@
  */
 /******************************************************************************/
 /*
+ * Begin non-standard override functions.
+ *
+ * These overrides are omitted if JEMALLOC_PREFIX is defined, since the
+ * entire point is to avoid accidental mixed allocator usage.
+ */
+#ifndef JEMALLOC_PREFIX
+
+#ifdef JEMALLOC_OVERRIDE_MEMALIGN
+JEMALLOC_ATTR(malloc)
+JEMALLOC_ATTR(visibility("default"))
+void *
+JEMALLOC_P(memalign)(size_t alignment, size_t size)
+{
+	void *ret;
+#ifdef JEMALLOC_CC_SILENCE
+	int result =
+#endif
+	    JEMALLOC_P(posix_memalign)(&ret, alignment, size);
+#ifdef JEMALLOC_CC_SILENCE
+	if (result != 0)
+		return (NULL);
+#endif
+	return (ret);
+}
+#endif
+
+#ifdef JEMALLOC_OVERRIDE_VALLOC
+JEMALLOC_ATTR(malloc)
+JEMALLOC_ATTR(visibility("default"))
+void *
+JEMALLOC_P(valloc)(size_t size)
+{
+	void *ret;
+#ifdef JEMALLOC_CC_SILENCE
+	int result =
+#endif
+	    JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size);
+#ifdef JEMALLOC_CC_SILENCE
+	if (result != 0)
+		return (NULL);
+#endif
+	return (ret);
+}
+#endif
+
+#endif /* JEMALLOC_PREFIX */
+/*
+ * End non-standard override functions.
+ */
+/******************************************************************************/
+/*
  * Begin non-standard functions.
  */
 
@@ -1219,29 +1397,18 @@
 {
 	size_t ret;
 
+	assert(malloc_initialized || malloc_initializer == pthread_self());
+
+#ifdef JEMALLOC_IVSALLOC
+	ret = ivsalloc(ptr);
+#else
 	assert(ptr != NULL);
 	ret = isalloc(ptr);
+#endif
 
 	return (ret);
 }
 
-#ifdef JEMALLOC_SWAP
-JEMALLOC_ATTR(visibility("default"))
-int
-JEMALLOC_P(malloc_swap_enable)(const int *fds, unsigned nfds, int prezeroed)
-{
-
-	/*
-	 * Make sure malloc is initialized, because we need page size, chunk
-	 * size, etc.
-	 */
-	if (malloc_init())
-		return (-1);
-
-	return (chunk_swap_enable(fds, nfds, (prezeroed != 0)) ? -1 : 0);
-}
-#endif
-
 JEMALLOC_ATTR(visibility("default"))
 void
 JEMALLOC_P(malloc_stats_print)(void (*write_cb)(void *, const char *),
@@ -1286,6 +1453,247 @@
 	return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen));
 }
 
+JEMALLOC_INLINE void *
+iallocm(size_t size, size_t alignment, bool zero)
+{
+
+	if (alignment != 0)
+		return (ipalloc(size, alignment, zero));
+	else if (zero)
+		return (icalloc(size));
+	else
+		return (imalloc(size));
+}
+
+JEMALLOC_ATTR(nonnull(1))
+JEMALLOC_ATTR(visibility("default"))
+int
+JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags)
+{
+	void *p;
+	size_t usize;
+	size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
+	    & (SIZE_T_MAX-1));
+	bool zero = flags & ALLOCM_ZERO;
+#ifdef JEMALLOC_PROF
+	prof_thr_cnt_t *cnt;
+#endif
+
+	assert(ptr != NULL);
+	assert(size != 0);
+
+	if (malloc_init())
+		goto OOM;
+
+#ifdef JEMALLOC_PROF
+	if (opt_prof) {
+		usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment,
+		    NULL);
+		if ((cnt = prof_alloc_prep(usize)) == NULL)
+			goto OOM;
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
+		    small_maxclass) {
+			p = iallocm(small_maxclass+1, alignment, zero);
+			if (p == NULL)
+				goto OOM;
+			arena_prof_promoted(p, usize);
+		} else {
+			p = iallocm(size, alignment, zero);
+			if (p == NULL)
+				goto OOM;
+		}
+
+		if (rsize != NULL)
+			*rsize = usize;
+	} else
+#endif
+	{
+		p = iallocm(size, alignment, zero);
+		if (p == NULL)
+			goto OOM;
+#ifndef JEMALLOC_STATS
+		if (rsize != NULL)
+#endif
+		{
+			usize = (alignment == 0) ? s2u(size) : sa2u(size,
+			    alignment, NULL);
+#ifdef JEMALLOC_STATS
+			if (rsize != NULL)
+#endif
+				*rsize = usize;
+		}
+	}
+
+	*ptr = p;
+#ifdef JEMALLOC_STATS
+	assert(usize == isalloc(p));
+	ALLOCATED_ADD(usize, 0);
+#endif
+	return (ALLOCM_SUCCESS);
+OOM:
+#ifdef JEMALLOC_XMALLOC
+	if (opt_xmalloc) {
+		malloc_write("<jemalloc>: Error in allocm(): "
+		    "out of memory\n");
+		abort();
+	}
+#endif
+	*ptr = NULL;
+	return (ALLOCM_ERR_OOM);
+}
+
+JEMALLOC_ATTR(nonnull(1))
+JEMALLOC_ATTR(visibility("default"))
+int
+JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra,
+    int flags)
+{
+	void *p, *q;
+	size_t usize;
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+	size_t old_size;
+#endif
+	size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
+	    & (SIZE_T_MAX-1));
+	bool zero = flags & ALLOCM_ZERO;
+	bool no_move = flags & ALLOCM_NO_MOVE;
+#ifdef JEMALLOC_PROF
+	prof_thr_cnt_t *cnt;
+	prof_ctx_t *old_ctx;
+#endif
+
+	assert(ptr != NULL);
+	assert(*ptr != NULL);
+	assert(size != 0);
+	assert(SIZE_T_MAX - size >= extra);
+	assert(malloc_initialized || malloc_initializer == pthread_self());
+
+	p = *ptr;
+#ifdef JEMALLOC_PROF
+	if (opt_prof) {
+		/*
+		 * usize isn't knowable before iralloc() returns when extra is
+		 * non-zero.  Therefore, compute its maximum possible value and
+		 * use that in prof_alloc_prep() to decide whether to capture a
+		 * backtrace.  prof_realloc() will use the actual usize to
+		 * decide whether to sample.
+		 */
+		size_t max_usize = (alignment == 0) ? s2u(size+extra) :
+		    sa2u(size+extra, alignment, NULL);
+		old_size = isalloc(p);
+		old_ctx = prof_ctx_get(p);
+		if ((cnt = prof_alloc_prep(max_usize)) == NULL)
+			goto OOM;
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && max_usize
+		    <= small_maxclass) {
+			q = iralloc(p, small_maxclass+1, (small_maxclass+1 >=
+			    size+extra) ? 0 : size+extra - (small_maxclass+1),
+			    alignment, zero, no_move);
+			if (q == NULL)
+				goto ERR;
+			usize = isalloc(q);
+			arena_prof_promoted(q, usize);
+		} else {
+			q = iralloc(p, size, extra, alignment, zero, no_move);
+			if (q == NULL)
+				goto ERR;
+			usize = isalloc(q);
+		}
+		prof_realloc(q, usize, cnt, old_size, old_ctx);
+	} else
+#endif
+	{
+#ifdef JEMALLOC_STATS
+		old_size = isalloc(p);
+#endif
+		q = iralloc(p, size, extra, alignment, zero, no_move);
+		if (q == NULL)
+			goto ERR;
+#ifndef JEMALLOC_STATS
+		if (rsize != NULL)
+#endif
+		{
+			usize = isalloc(q);
+#ifdef JEMALLOC_STATS
+			if (rsize != NULL)
+#endif
+				*rsize = usize;
+		}
+	}
+
+	*ptr = q;
+#ifdef JEMALLOC_STATS
+	ALLOCATED_ADD(usize, old_size);
+#endif
+	return (ALLOCM_SUCCESS);
+ERR:
+	if (no_move)
+		return (ALLOCM_ERR_NOT_MOVED);
+#ifdef JEMALLOC_PROF
+OOM:
+#endif
+#ifdef JEMALLOC_XMALLOC
+	if (opt_xmalloc) {
+		malloc_write("<jemalloc>: Error in rallocm(): "
+		    "out of memory\n");
+		abort();
+	}
+#endif
+	return (ALLOCM_ERR_OOM);
+}
+
+JEMALLOC_ATTR(nonnull(1))
+JEMALLOC_ATTR(visibility("default"))
+int
+JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags)
+{
+	size_t sz;
+
+	assert(malloc_initialized || malloc_initializer == pthread_self());
+
+#ifdef JEMALLOC_IVSALLOC
+	sz = ivsalloc(ptr);
+#else
+	assert(ptr != NULL);
+	sz = isalloc(ptr);
+#endif
+	assert(rsize != NULL);
+	*rsize = sz;
+
+	return (ALLOCM_SUCCESS);
+}
+
+JEMALLOC_ATTR(nonnull(1))
+JEMALLOC_ATTR(visibility("default"))
+int
+JEMALLOC_P(dallocm)(void *ptr, int flags)
+{
+#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
+	size_t usize;
+#endif
+
+	assert(ptr != NULL);
+	assert(malloc_initialized || malloc_initializer == pthread_self());
+
+#ifdef JEMALLOC_STATS
+	usize = isalloc(ptr);
+#endif
+#ifdef JEMALLOC_PROF
+	if (opt_prof) {
+#  ifndef JEMALLOC_STATS
+		usize = isalloc(ptr);
+#  endif
+		prof_free(ptr, usize);
+	}
+#endif
+#ifdef JEMALLOC_STATS
+	ALLOCATED_ADD(0, usize);
+#endif
+	idalloc(ptr);
+
+	return (ALLOCM_SUCCESS);
+}
+
 /*
  * End non-standard functions.
  */
@@ -1293,12 +1701,10 @@
 
 /*
  * The following functions are used by threading libraries for protection of
- * malloc during fork().  These functions are only called if the program is
- * running in threaded mode, so there is no need to check whether the program
- * is threaded here.
+ * malloc during fork().
  */
 
-static void
+void
 jemalloc_prefork(void)
 {
 	unsigned i;
@@ -1324,7 +1730,7 @@
 #endif
 }
 
-static void
+void
 jemalloc_postfork(void)
 {
 	unsigned i;
@@ -1349,3 +1755,5 @@
 	}
 	malloc_mutex_unlock(&arenas_lock);
 }
+
+/******************************************************************************/
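As a usage note for the experimental *allocm() entry points added above: the sketch below is illustrative only, and assumes an unprefixed build (JEMALLOC_PREFIX undefined, so the symbols are plain allocm()/rallocm()/sallocm()/dallocm()) with the installed <jemalloc/jemalloc.h> header providing the ALLOCM_* flag and return-value constants.  Per the implementation above, the low ALLOCM_LG_ALIGN_MASK bits of flags carry lg(alignment), so passing 4 requests 16-byte alignment (the header may offer convenience macros for this, but the raw value is what the code above decodes).

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>  /* allocm() et al., ALLOCM_* constants. */

    int
    main(void)
    {
        void *p;
        size_t usize;

        /* 4096 zeroed bytes, 16-byte aligned (lg(16) == 4 in the low bits). */
        if (allocm(&p, &usize, 4096, 4 | ALLOCM_ZERO) != ALLOCM_SUCCESS)
            return 1;
        printf("usable size: %zu\n", usize);

        /* Try to double the allocation without letting it move. */
        if (rallocm(&p, &usize, 8192, 0, ALLOCM_NO_MOVE) == ALLOCM_ERR_NOT_MOVED)
            printf("in-place growth not possible\n");

        sallocm(p, &usize, 0);  /* Query the usable size of p. */
        dallocm(p, 0);
        return 0;
    }
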
diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c
index 3b6081a..3ecb18a 100644
--- a/jemalloc/src/mutex.c
+++ b/jemalloc/src/mutex.c
@@ -59,7 +59,11 @@
 
 	if (pthread_mutexattr_init(&attr) != 0)
 		return (true);
+#ifdef PTHREAD_MUTEX_ADAPTIVE_NP
 	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+#else
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
+#endif
 	if (pthread_mutex_init(mutex, &attr) != 0) {
 		pthread_mutexattr_destroy(&attr);
 		return (true);
@@ -68,3 +72,13 @@
 
 	return (false);
 }
+
+void
+malloc_mutex_destroy(malloc_mutex_t *mutex)
+{
+
+	if (pthread_mutex_destroy(mutex) != 0) {
+		malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
+		abort();
+	}
+}
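The PTHREAD_MUTEX_ADAPTIVE_NP guard above is a general portability pattern rather than anything jemalloc-specific: use the adaptive spin-then-block type where the platform exposes it as a macro, and fall back to the default type otherwise.  A minimal standalone sketch of the same idea using only pthreads (the _GNU_SOURCE define merely tries to expose the adaptive type on glibc):

    #define _GNU_SOURCE             /* May expose PTHREAD_MUTEX_ADAPTIVE_NP. */
    #include <pthread.h>
    #include <stdbool.h>

    /* Returns true on error, matching the malloc_mutex_init() convention. */
    static bool
    mutex_init_adaptive(pthread_mutex_t *mutex)
    {
        pthread_mutexattr_t attr;
        int type;

        if (pthread_mutexattr_init(&attr) != 0)
            return true;
    #ifdef PTHREAD_MUTEX_ADAPTIVE_NP
        type = PTHREAD_MUTEX_ADAPTIVE_NP;   /* Spin briefly before blocking. */
    #else
        type = PTHREAD_MUTEX_DEFAULT;       /* Portable fallback. */
    #endif
        pthread_mutexattr_settype(&attr, type);
        if (pthread_mutex_init(mutex, &attr) != 0) {
            pthread_mutexattr_destroy(&attr);
            return true;
        }
        pthread_mutexattr_destroy(&attr);
        return false;
    }
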
diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c
index 6d6910e..84ce1ba 100644
--- a/jemalloc/src/prof.c
+++ b/jemalloc/src/prof.c
@@ -12,8 +12,6 @@
 #include <libunwind.h>
 #endif
 
-#include <math.h>
-
 /******************************************************************************/
 /* Data. */
 
@@ -22,48 +20,30 @@
 size_t		opt_lg_prof_bt_max = LG_PROF_BT_MAX_DEFAULT;
 size_t		opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
 ssize_t		opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
-bool		opt_prof_udump = false;
+bool		opt_prof_gdump = false;
 bool		opt_prof_leak = false;
+bool		opt_prof_accum = true;
+ssize_t		opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT;
+char		opt_prof_prefix[PATH_MAX + 1];
 
 uint64_t	prof_interval;
 bool		prof_promote;
 
+unsigned	prof_bt_max;
+
+#ifndef NO_TLS
+__thread prof_tdata_t	*prof_tdata_tls
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+#endif
+pthread_key_t	prof_tdata_tsd;
+
 /*
  * Global hash of (prof_bt_t *)-->(prof_ctx_t *).  This is the master data
- * structure that knows about all backtraces ever captured.
+ * structure that knows about all backtraces currently captured.
  */
 static ckh_t		bt2ctx;
 static malloc_mutex_t	bt2ctx_mtx;
 
-/*
- * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread
- * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
- * objects.  Other threads may read the prof_thr_cnt_t contents, but no others
- * will ever write them.
- *
- * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
- * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
- * objects.
- */
-static __thread ckh_t	*bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec"));
-
-/*
- * Same contents as b2cnt_tls, but initialized such that the TSD destructor is
- * called when a thread exits, so that bt2cnt_tls contents can be merged,
- * unlinked, and deallocated.
- */
-static pthread_key_t	bt2cnt_tsd;
-
-/* (1U << opt_lg_prof_bt_max). */
-static unsigned		prof_bt_max;
-
-static __thread uint64_t prof_sample_prn_state
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-static __thread uint64_t prof_sample_threshold
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-static __thread uint64_t prof_sample_accum
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-
 static malloc_mutex_t	prof_dump_seq_mtx;
 static uint64_t		prof_dump_seq;
 static uint64_t		prof_dump_iseq;
@@ -85,26 +65,25 @@
 static malloc_mutex_t	enq_mtx;
 static bool		enq;
 static bool		enq_idump;
-static bool		enq_udump;
+static bool		enq_gdump;
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
 
 static prof_bt_t	*bt_dup(prof_bt_t *bt);
-static void	bt_init(prof_bt_t *bt, void **vec);
+static void	bt_destroy(prof_bt_t *bt);
 #ifdef JEMALLOC_PROF_LIBGCC
 static _Unwind_Reason_Code	prof_unwind_init_callback(
     struct _Unwind_Context *context, void *arg);
 static _Unwind_Reason_Code	prof_unwind_callback(
     struct _Unwind_Context *context, void *arg);
 #endif
-static void	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
-static prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
-static void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 static bool	prof_flush(bool propagate_err);
 static bool	prof_write(const char *s, bool propagate_err);
-static void	prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
+static void	prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
     size_t *leak_nctx);
+static void	prof_ctx_destroy(prof_ctx_t *ctx);
+static void	prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt);
 static bool	prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt,
     bool propagate_err);
 static bool	prof_dump_maps(bool propagate_err);
@@ -115,11 +94,11 @@
 static void	prof_bt_hash(const void *key, unsigned minbits, size_t *hash1,
     size_t *hash2);
 static bool	prof_bt_keycomp(const void *k1, const void *k2);
-static void	bt2cnt_thread_cleanup(void *arg);
+static void	prof_tdata_cleanup(void *arg);
 
 /******************************************************************************/
 
-static void
+void
 bt_init(prof_bt_t *bt, void **vec)
 {
 
@@ -127,6 +106,13 @@
 	bt->len = 0;
 }
 
+static void
+bt_destroy(prof_bt_t *bt)
+{
+
+	idalloc(bt);
+}
+
 static prof_bt_t *
 bt_dup(prof_bt_t *bt)
 {
@@ -165,7 +151,7 @@
 static inline void
 prof_leave(void)
 {
-	bool idump, udump;
+	bool idump, gdump;
 
 	malloc_mutex_unlock(&bt2ctx_mtx);
 
@@ -173,14 +159,14 @@
 	enq = false;
 	idump = enq_idump;
 	enq_idump = false;
-	udump = enq_udump;
-	enq_udump = false;
+	gdump = enq_gdump;
+	enq_gdump = false;
 	malloc_mutex_unlock(&enq_mtx);
 
 	if (idump)
 		prof_idump();
-	if (udump)
-		prof_udump();
+	if (gdump)
+		prof_gdump();
 }
 
 #ifdef JEMALLOC_PROF_LIBGCC
@@ -208,7 +194,7 @@
 	return (_URC_NO_REASON);
 }
 
-static void
+void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
 	prof_unwind_data_t data = {bt, nignore, max};
@@ -216,7 +202,7 @@
 	_Unwind_Backtrace(prof_unwind_callback, &data);
 }
 #elif defined(JEMALLOC_PROF_LIBUNWIND)
-static void
+void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
 	unw_context_t uc;
@@ -251,41 +237,29 @@
 	}
 }
 #else
-static void
+void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
-#define	NIGNORE	3
 #define	BT_FRAME(i)							\
-	if ((i) < NIGNORE + max) {					\
+	if ((i) < nignore + max) {					\
 		void *p;						\
 		if (__builtin_frame_address(i) == 0)			\
 			return;						\
 		p = __builtin_return_address(i);			\
 		if (p == NULL)						\
 			return;						\
-		if (i >= NIGNORE) {					\
-			bt->vec[(i) - NIGNORE] = p;			\
-			bt->len = (i) - NIGNORE + 1;			\
+		if (i >= nignore) {					\
+			bt->vec[(i) - nignore] = p;			\
+			bt->len = (i) - nignore + 1;			\
 		}							\
 	} else								\
 		return;
 
 	assert(max <= (1U << opt_lg_prof_bt_max));
 
-	/*
-	 * Ignore the first three frames, since they are:
-	 *
-	 *   0: prof_backtrace()
-	 *   1: prof_alloc_prep()
-	 *   2: malloc(), calloc(), etc.
-	 */
-#if 1
-	assert(nignore + 1 == NIGNORE);
-#else
 	BT_FRAME(0)
 	BT_FRAME(1)
 	BT_FRAME(2)
-#endif
 	BT_FRAME(3)
 	BT_FRAME(4)
 	BT_FRAME(5)
@@ -432,345 +406,119 @@
 }
 #endif
 
-static prof_thr_cnt_t *
+prof_thr_cnt_t *
 prof_lookup(prof_bt_t *bt)
 {
-	prof_thr_cnt_t *ret;
-	ckh_t *bt2cnt = bt2cnt_tls;
+	union {
+		prof_thr_cnt_t	*p;
+		void		*v;
+	} ret;
+	prof_tdata_t *prof_tdata;
 
-	if (bt2cnt == NULL) {
-		/* Initialize an empty cache for this thread. */
-		bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t));
-		if (bt2cnt == NULL)
+	prof_tdata = PROF_TCACHE_GET();
+	if (prof_tdata == NULL) {
+		prof_tdata = prof_tdata_init();
+		if (prof_tdata == NULL)
 			return (NULL);
-		if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash,
-		    prof_bt_keycomp)) {
-			idalloc(bt2cnt);
-			return (NULL);
-		}
-		bt2cnt_tls = bt2cnt;
-		pthread_setspecific(bt2cnt_tsd, bt2cnt);
 	}
 
-	if (ckh_search(bt2cnt, bt, NULL, (void **)&ret)) {
-		prof_bt_t *btkey;
-		prof_ctx_t *ctx;
+	if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) {
+		union {
+			prof_bt_t	*p;
+			void		*v;
+		} btkey;
+		union {
+			prof_ctx_t	*p;
+			void		*v;
+		} ctx;
 
 		/*
 		 * This thread's cache lacks bt.  Look for it in the global
 		 * cache.
 		 */
 		prof_enter();
-		if (ckh_search(&bt2ctx, bt, (void **)&btkey, (void **)&ctx)) {
-
+		if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) {
 			/* bt has never been seen before.  Insert it. */
-			ctx = (prof_ctx_t *)imalloc(sizeof(prof_ctx_t));
-			if (ctx == NULL) {
+			ctx.v = imalloc(sizeof(prof_ctx_t));
+			if (ctx.v == NULL) {
 				prof_leave();
 				return (NULL);
 			}
-			btkey = bt_dup(bt);
-			if (btkey == NULL) {
+			btkey.p = bt_dup(bt);
+			if (btkey.v == NULL) {
 				prof_leave();
-				idalloc(ctx);
+				idalloc(ctx.v);
 				return (NULL);
 			}
-			ctx->bt = btkey;
-			if (malloc_mutex_init(&ctx->lock)) {
+			ctx.p->bt = btkey.p;
+			if (malloc_mutex_init(&ctx.p->lock)) {
 				prof_leave();
-				idalloc(btkey);
-				idalloc(ctx);
+				idalloc(btkey.v);
+				idalloc(ctx.v);
 				return (NULL);
 			}
-			memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t));
-			ql_new(&ctx->cnts_ql);
-			if (ckh_insert(&bt2ctx, btkey, ctx)) {
+			memset(&ctx.p->cnt_merged, 0, sizeof(prof_cnt_t));
+			ql_new(&ctx.p->cnts_ql);
+			if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
 				/* OOM. */
 				prof_leave();
-				idalloc(btkey);
-				idalloc(ctx);
+				malloc_mutex_destroy(&ctx.p->lock);
+				idalloc(btkey.v);
+				idalloc(ctx.v);
 				return (NULL);
 			}
 		}
+		/*
+		 * Acquire ctx's lock before releasing bt2ctx_mtx, in order to
+		 * avoid a race condition with prof_ctx_destroy().
+		 */
+		malloc_mutex_lock(&ctx.p->lock);
 		prof_leave();
 
 		/* Link a prof_thr_cnt_t into ctx for this thread. */
-		ret = (prof_thr_cnt_t *)imalloc(sizeof(prof_thr_cnt_t));
-		if (ret == NULL)
-			return (NULL);
-		ql_elm_new(ret, link);
-		ret->ctx = ctx;
-		ret->epoch = 0;
-		memset(&ret->cnts, 0, sizeof(prof_cnt_t));
-		if (ckh_insert(bt2cnt, btkey, ret)) {
-			idalloc(ret);
-			return (NULL);
-		}
-		malloc_mutex_lock(&ctx->lock);
-		ql_tail_insert(&ctx->cnts_ql, ret, link);
-		malloc_mutex_unlock(&ctx->lock);
-	}
-
-	return (ret);
-}
-
-static inline void
-prof_sample_threshold_update(void)
-{
-	uint64_t r;
-	double u;
-
-	/*
-	 * Compute prof_sample_threshold as a geometrically distributed random
-	 * variable with mean (2^opt_lg_prof_sample).
-	 */
-	prn64(r, 53, prof_sample_prn_state, (uint64_t)1125899906842625LLU,
-	    1058392653243283975);
-	u = (double)r * (1.0/9007199254740992.0L);
-	prof_sample_threshold = (uint64_t)(log(u) /
-	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
-	    + (uint64_t)1U;
-}
-
-prof_thr_cnt_t *
-prof_alloc_prep(size_t size)
-{
-	prof_thr_cnt_t *ret;
-	void *vec[prof_bt_max];
-	prof_bt_t bt;
-
-	if (opt_prof_active == false) {
-		/* Sampling is currently inactive, so avoid sampling. */
-		ret = (prof_thr_cnt_t *)(uintptr_t)1U;
-	} else if (opt_lg_prof_sample == 0) {
-		/*
-		 * Don't bother with sampling logic, since sampling interval is
-		 * 1.
-		 */
-		bt_init(&bt, vec);
-		prof_backtrace(&bt, 2, prof_bt_max);
-		ret = prof_lookup(&bt);
-	} else {
-		if (prof_sample_threshold == 0) {
+		if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tdata->bt2cnt)
+		    == (ZU(1) << opt_lg_prof_tcmax)) {
+			assert(ckh_count(&prof_tdata->bt2cnt) > 0);
 			/*
-			 * Initialize.  Seed the prng differently for each
-			 * thread.
+			 * Flush the least recently used cnt in order to keep
+			 * bt2cnt from becoming too large.
 			 */
-			prof_sample_prn_state = (uint64_t)(uintptr_t)&size;
-			prof_sample_threshold_update();
-		}
-
-		/*
-		 * Determine whether to capture a backtrace based on whether
-		 * size is enough for prof_accum to reach
-		 * prof_sample_threshold.  However, delay updating these
-		 * variables until prof_{m,re}alloc(), because we don't know
-		 * for sure that the allocation will succeed.
-		 *
-		 * Use subtraction rather than addition to avoid potential
-		 * integer overflow.
-		 */
-		if (size >= prof_sample_threshold - prof_sample_accum) {
-			bt_init(&bt, vec);
-			prof_backtrace(&bt, 2, prof_bt_max);
-			ret = prof_lookup(&bt);
-		} else
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;
-	}
-
-	return (ret);
-}
-
-prof_ctx_t *
-prof_ctx_get(const void *ptr)
-{
-	prof_ctx_t *ret;
-	arena_chunk_t *chunk;
-
-	assert(ptr != NULL);
-
-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	if (chunk != ptr) {
-		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
-
-		ret = arena_prof_ctx_get(ptr);
-	} else
-		ret = huge_prof_ctx_get(ptr);
-
-	return (ret);
-}
-
-static void
-prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
-{
-	arena_chunk_t *chunk;
-
-	assert(ptr != NULL);
-
-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	if (chunk != ptr) {
-		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
-
-		arena_prof_ctx_set(ptr, ctx);
-	} else
-		huge_prof_ctx_set(ptr, ctx);
-}
-
-static inline void
-prof_sample_accum_update(size_t size)
-{
-
-	/* Sampling logic is unnecessary if the interval is 1. */
-	assert(opt_lg_prof_sample != 0);
-
-	/* Take care to avoid integer overflow. */
-	if (size >= prof_sample_threshold - prof_sample_accum) {
-		prof_sample_accum -= (prof_sample_threshold - size);
-		/* Compute new prof_sample_threshold. */
-		prof_sample_threshold_update();
-		while (prof_sample_accum >= prof_sample_threshold) {
-			prof_sample_accum -= prof_sample_threshold;
-			prof_sample_threshold_update();
-		}
-	} else
-		prof_sample_accum += size;
-}
-
-void
-prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
-{
-	size_t size;
-
-	assert(ptr != NULL);
-
-	if (opt_lg_prof_sample != 0) {
-		size = isalloc(ptr);
-		prof_sample_accum_update(size);
-	} else if ((uintptr_t)cnt > (uintptr_t)1U)
-		size = isalloc(ptr);
-
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		prof_ctx_set(ptr, cnt->ctx);
-
-		cnt->epoch++;
-		/*********/
-		mb_write();
-		/*********/
-		cnt->cnts.curobjs++;
-		cnt->cnts.curbytes += size;
-		cnt->cnts.accumobjs++;
-		cnt->cnts.accumbytes += size;
-		/*********/
-		mb_write();
-		/*********/
-		cnt->epoch++;
-		/*********/
-		mb_write();
-		/*********/
-	} else
-		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
-}
-
-void
-prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
-    size_t old_size, prof_ctx_t *old_ctx)
-{
-	size_t size;
-	prof_thr_cnt_t *told_cnt;
-
-	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
-
-	if (ptr != NULL) {
-		if (opt_lg_prof_sample != 0) {
-			size = isalloc(ptr);
-			prof_sample_accum_update(size);
-		} else if ((uintptr_t)cnt > (uintptr_t)1U)
-			size = isalloc(ptr);
-	}
-
-	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
-		told_cnt = prof_lookup(old_ctx->bt);
-		if (told_cnt == NULL) {
-			/*
-			 * It's too late to propagate OOM for this realloc(),
-			 * so operate directly on old_cnt->ctx->cnt_merged.
-			 */
-			malloc_mutex_lock(&old_ctx->lock);
-			old_ctx->cnt_merged.curobjs--;
-			old_ctx->cnt_merged.curbytes -= old_size;
-			malloc_mutex_unlock(&old_ctx->lock);
-			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
-		}
-	} else
-		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
-
-	if ((uintptr_t)told_cnt > (uintptr_t)1U)
-		told_cnt->epoch++;
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		prof_ctx_set(ptr, cnt->ctx);
-		cnt->epoch++;
-	} else
-		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
-	/*********/
-	mb_write();
-	/*********/
-	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
-		told_cnt->cnts.curobjs--;
-		told_cnt->cnts.curbytes -= old_size;
-	}
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		cnt->cnts.curobjs++;
-		cnt->cnts.curbytes += size;
-		cnt->cnts.accumobjs++;
-		cnt->cnts.accumbytes += size;
-	}
-	/*********/
-	mb_write();
-	/*********/
-	if ((uintptr_t)told_cnt > (uintptr_t)1U)
-		told_cnt->epoch++;
-	if ((uintptr_t)cnt > (uintptr_t)1U)
-		cnt->epoch++;
-	/*********/
-	mb_write(); /* Not strictly necessary. */
-}
-
-void
-prof_free(const void *ptr)
-{
-	prof_ctx_t *ctx = prof_ctx_get(ptr);
-
-	if ((uintptr_t)ctx > (uintptr_t)1) {
-		size_t size = isalloc(ptr);
-		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
-
-		if (tcnt != NULL) {
-			tcnt->epoch++;
-			/*********/
-			mb_write();
-			/*********/
-			tcnt->cnts.curobjs--;
-			tcnt->cnts.curbytes -= size;
-			/*********/
-			mb_write();
-			/*********/
-			tcnt->epoch++;
-			/*********/
-			mb_write();
-			/*********/
+			ret.p = ql_last(&prof_tdata->lru_ql, lru_link);
+			assert(ret.v != NULL);
+			ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, NULL,
+			    NULL);
+			ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
+			prof_ctx_merge(ret.p->ctx, ret.p);
+			/* ret can now be re-used. */
 		} else {
-			/*
-			 * OOM during free() cannot be propagated, so operate
-			 * directly on cnt->ctx->cnt_merged.
-			 */
-			malloc_mutex_lock(&ctx->lock);
-			ctx->cnt_merged.curobjs--;
-			ctx->cnt_merged.curbytes -= size;
-			malloc_mutex_unlock(&ctx->lock);
+			assert(opt_lg_prof_tcmax < 0 ||
+			    ckh_count(&prof_tdata->bt2cnt) < (ZU(1) <<
+			    opt_lg_prof_tcmax));
+			/* Allocate and partially initialize a new cnt. */
+			ret.v = imalloc(sizeof(prof_thr_cnt_t));
+			if (ret.p == NULL)
+				return (NULL);
+			ql_elm_new(ret.p, cnts_link);
+			ql_elm_new(ret.p, lru_link);
 		}
+		/* Finish initializing ret. */
+		ret.p->ctx = ctx.p;
+		ret.p->epoch = 0;
+		memset(&ret.p->cnts, 0, sizeof(prof_cnt_t));
+		if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) {
+			idalloc(ret.v);
+			return (NULL);
+		}
+		ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link);
+		ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
+		malloc_mutex_unlock(&ctx.p->lock);
+	} else {
+		/* Move ret to the front of the LRU. */
+		ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
+		ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link);
 	}
+
+	return (ret.p);
 }
 
 static bool
@@ -823,15 +571,15 @@
 }
 
 static void
-prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
+prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
 {
 	prof_thr_cnt_t *thr_cnt;
 	prof_cnt_t tcnt;
 
 	malloc_mutex_lock(&ctx->lock);
 
-	memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t));
-	ql_foreach(thr_cnt, &ctx->cnts_ql, link) {
+	memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t));
+	ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) {
 		volatile unsigned *epoch = &thr_cnt->epoch;
 
 		while (true) {
@@ -848,46 +596,108 @@
 				break;
 		}
 
-		ctx->cnt_dump.curobjs += tcnt.curobjs;
-		ctx->cnt_dump.curbytes += tcnt.curbytes;
-		ctx->cnt_dump.accumobjs += tcnt.accumobjs;
-		ctx->cnt_dump.accumbytes += tcnt.accumbytes;
-
-		if (tcnt.curobjs != 0)
-			(*leak_nctx)++;
+		ctx->cnt_summed.curobjs += tcnt.curobjs;
+		ctx->cnt_summed.curbytes += tcnt.curbytes;
+		if (opt_prof_accum) {
+			ctx->cnt_summed.accumobjs += tcnt.accumobjs;
+			ctx->cnt_summed.accumbytes += tcnt.accumbytes;
+		}
 	}
 
-	/* Merge into cnt_all. */
-	cnt_all->curobjs += ctx->cnt_dump.curobjs;
-	cnt_all->curbytes += ctx->cnt_dump.curbytes;
-	cnt_all->accumobjs += ctx->cnt_dump.accumobjs;
-	cnt_all->accumbytes += ctx->cnt_dump.accumbytes;
+	if (ctx->cnt_summed.curobjs != 0)
+		(*leak_nctx)++;
+
+	/* Add to cnt_all. */
+	cnt_all->curobjs += ctx->cnt_summed.curobjs;
+	cnt_all->curbytes += ctx->cnt_summed.curbytes;
+	if (opt_prof_accum) {
+		cnt_all->accumobjs += ctx->cnt_summed.accumobjs;
+		cnt_all->accumbytes += ctx->cnt_summed.accumbytes;
+	}
 
 	malloc_mutex_unlock(&ctx->lock);
 }
 
+static void
+prof_ctx_destroy(prof_ctx_t *ctx)
+{
+
+	/*
+	 * Check that ctx is still unused by any thread cache before destroying
+	 * it.  prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to
+	 * avoid a race condition with this function.
+	 */
+	prof_enter();
+	malloc_mutex_lock(&ctx->lock);
+	if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) {
+		assert(ctx->cnt_merged.curbytes == 0);
+		assert(ctx->cnt_merged.accumobjs == 0);
+		assert(ctx->cnt_merged.accumbytes == 0);
+		/* Remove ctx from bt2ctx. */
+		ckh_remove(&bt2ctx, ctx->bt, NULL, NULL);
+		prof_leave();
+		/* Destroy ctx. */
+		malloc_mutex_unlock(&ctx->lock);
+		bt_destroy(ctx->bt);
+		malloc_mutex_destroy(&ctx->lock);
+		idalloc(ctx);
+	} else {
+		malloc_mutex_unlock(&ctx->lock);
+		prof_leave();
+	}
+}
+
+static void
+prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
+{
+	bool destroy;
+
+	/* Merge cnt stats and detach from ctx. */
+	malloc_mutex_lock(&ctx->lock);
+	ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
+	ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
+	ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
+	ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
+	ql_remove(&ctx->cnts_ql, cnt, cnts_link);
+	if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL &&
+	    ctx->cnt_merged.curobjs == 0)
+		destroy = true;
+	else
+		destroy = false;
+	malloc_mutex_unlock(&ctx->lock);
+	if (destroy)
+		prof_ctx_destroy(ctx);
+}
+
 static bool
 prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err)
 {
 	char buf[UMAX2S_BUFSIZE];
 	unsigned i;
 
-	if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err)
+	if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) {
+		assert(ctx->cnt_summed.curbytes == 0);
+		assert(ctx->cnt_summed.accumobjs == 0);
+		assert(ctx->cnt_summed.accumbytes == 0);
+		return (false);
+	}
+
+	if (prof_write(u2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err)
 	    || prof_write(": ", propagate_err)
-	    || prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf),
+	    || prof_write(u2s(ctx->cnt_summed.curbytes, 10, buf),
 	    propagate_err)
 	    || prof_write(" [", propagate_err)
-	    || prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf),
+	    || prof_write(u2s(ctx->cnt_summed.accumobjs, 10, buf),
 	    propagate_err)
 	    || prof_write(": ", propagate_err)
-	    || prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf),
+	    || prof_write(u2s(ctx->cnt_summed.accumbytes, 10, buf),
 	    propagate_err)
 	    || prof_write("] @", propagate_err))
 		return (true);
 
 	for (i = 0; i < bt->len; i++) {
 		if (prof_write(" 0x", propagate_err)
-		    || prof_write(umax2s((uintptr_t)bt->vec[i], 16, buf),
+		    || prof_write(u2s((uintptr_t)bt->vec[i], 16, buf),
 		    propagate_err))
 			return (true);
 	}
@@ -916,7 +726,7 @@
 	memcpy(&mpath[i], s, slen);
 	i += slen;
 
-	s = umax2s(getpid(), 10, buf);
+	s = u2s(getpid(), 10, buf);
 	slen = strlen(s);
 	memcpy(&mpath[i], s, slen);
 	i += slen;
@@ -958,8 +768,14 @@
 {
 	prof_cnt_t cnt_all;
 	size_t tabind;
-	prof_bt_t *bt;
-	prof_ctx_t *ctx;
+	union {
+		prof_bt_t	*p;
+		void		*v;
+	} bt;
+	union {
+		prof_ctx_t	*p;
+		void		*v;
+	} ctx;
 	char buf[UMAX2S_BUFSIZE];
 	size_t leak_nctx;
 
@@ -979,20 +795,18 @@
 	/* Merge per thread profile stats, and sum them in cnt_all. */
 	memset(&cnt_all, 0, sizeof(prof_cnt_t));
 	leak_nctx = 0;
-	for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, (void **)&ctx)
-	    == false;) {
-		prof_ctx_merge(ctx, &cnt_all, &leak_nctx);
-	}
+	for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;)
+		prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx);
 
 	/* Dump profile header. */
 	if (prof_write("heap profile: ", propagate_err)
-	    || prof_write(umax2s(cnt_all.curobjs, 10, buf), propagate_err)
+	    || prof_write(u2s(cnt_all.curobjs, 10, buf), propagate_err)
 	    || prof_write(": ", propagate_err)
-	    || prof_write(umax2s(cnt_all.curbytes, 10, buf), propagate_err)
+	    || prof_write(u2s(cnt_all.curbytes, 10, buf), propagate_err)
 	    || prof_write(" [", propagate_err)
-	    || prof_write(umax2s(cnt_all.accumobjs, 10, buf), propagate_err)
+	    || prof_write(u2s(cnt_all.accumobjs, 10, buf), propagate_err)
 	    || prof_write(": ", propagate_err)
-	    || prof_write(umax2s(cnt_all.accumbytes, 10, buf), propagate_err))
+	    || prof_write(u2s(cnt_all.accumbytes, 10, buf), propagate_err))
 		goto ERROR;
 
 	if (opt_lg_prof_sample == 0) {
@@ -1000,16 +814,16 @@
 			goto ERROR;
 	} else {
 		if (prof_write("] @ heap_v2/", propagate_err)
-		    || prof_write(umax2s((uint64_t)1U << opt_lg_prof_sample, 10,
+		    || prof_write(u2s((uint64_t)1U << opt_lg_prof_sample, 10,
 		    buf), propagate_err)
 		    || prof_write("\n", propagate_err))
 			goto ERROR;
 	}
 
 	/* Dump per ctx profile stats. */
-	for (tabind = 0; ckh_iter(&bt2ctx, &tabind, (void **)&bt, (void **)&ctx)
+	for (tabind = 0; ckh_iter(&bt2ctx, &tabind, &bt.v, &ctx.v)
 	    == false;) {
-		if (prof_dump_ctx(ctx, bt, propagate_err))
+		if (prof_dump_ctx(ctx.p, bt.p, propagate_err))
 			goto ERROR;
 	}
 
@@ -1024,12 +838,12 @@
 
 	if (leakcheck && cnt_all.curbytes != 0) {
 		malloc_write("<jemalloc>: Leak summary: ");
-		malloc_write(umax2s(cnt_all.curbytes, 10, buf));
+		malloc_write(u2s(cnt_all.curbytes, 10, buf));
 		malloc_write((cnt_all.curbytes != 1) ? " bytes, " : " byte, ");
-		malloc_write(umax2s(cnt_all.curobjs, 10, buf));
+		malloc_write(u2s(cnt_all.curobjs, 10, buf));
 		malloc_write((cnt_all.curobjs != 1) ? " objects, " :
 		    " object, ");
-		malloc_write(umax2s(leak_nctx, 10, buf));
+		malloc_write(u2s(leak_nctx, 10, buf));
 		malloc_write((leak_nctx != 1) ? " contexts\n" : " context\n");
 		malloc_write("<jemalloc>: Run pprof on \"");
 		malloc_write(filename);
@@ -1059,31 +873,11 @@
 	 * Construct a filename of the form:
 	 *
 	 *   <prefix>.<pid>.<seq>.v<vseq>.heap\0
-	 * or
-	 *   jeprof.<pid>.<seq>.v<vseq>.heap\0
 	 */
 
 	i = 0;
 
-	/*
-	 * Use JEMALLOC_PROF_PREFIX if it's set, and if it is short enough to
-	 * avoid overflowing DUMP_FILENAME_BUFSIZE.  The result may exceed
-	 * PATH_MAX, but creat(2) will catch that problem.
-	 */
-	if ((s = getenv("JEMALLOC_PROF_PREFIX")) != NULL
-	    && strlen(s) + (DUMP_FILENAME_BUFSIZE - PATH_MAX) <= PATH_MAX) {
-		slen = strlen(s);
-		memcpy(&filename[i], s, slen);
-		i += slen;
-
-		s = ".";
-	} else
-		s = "jeprof.";
-	slen = strlen(s);
-	memcpy(&filename[i], s, slen);
-	i += slen;
-
-	s = umax2s(getpid(), 10, buf);
+	s = opt_prof_prefix;
 	slen = strlen(s);
 	memcpy(&filename[i], s, slen);
 	i += slen;
@@ -1093,7 +887,17 @@
 	memcpy(&filename[i], s, slen);
 	i += slen;
 
-	s = umax2s(prof_dump_seq, 10, buf);
+	s = u2s(getpid(), 10, buf);
+	slen = strlen(s);
+	memcpy(&filename[i], s, slen);
+	i += slen;
+
+	s = ".";
+	slen = strlen(s);
+	memcpy(&filename[i], s, slen);
+	i += slen;
+
+	s = u2s(prof_dump_seq, 10, buf);
 	prof_dump_seq++;
 	slen = strlen(s);
 	memcpy(&filename[i], s, slen);
@@ -1108,7 +912,7 @@
 	i++;
 
 	if (vseq != 0xffffffffffffffffLLU) {
-		s = umax2s(vseq, 10, buf);
+		s = u2s(vseq, 10, buf);
 		slen = strlen(s);
 		memcpy(&filename[i], s, slen);
 		i += slen;
@@ -1130,10 +934,12 @@
 	if (prof_booted == false)
 		return;
 
-	malloc_mutex_lock(&prof_dump_seq_mtx);
-	prof_dump_filename(filename, 'f', 0xffffffffffffffffLLU);
-	malloc_mutex_unlock(&prof_dump_seq_mtx);
-	prof_dump(filename, opt_prof_leak, false);
+	if (opt_prof_prefix[0] != '\0') {
+		malloc_mutex_lock(&prof_dump_seq_mtx);
+		prof_dump_filename(filename, 'f', 0xffffffffffffffffLLU);
+		malloc_mutex_unlock(&prof_dump_seq_mtx);
+		prof_dump(filename, opt_prof_leak, false);
+	}
 }
 
 void
@@ -1151,11 +957,13 @@
 	}
 	malloc_mutex_unlock(&enq_mtx);
 
-	malloc_mutex_lock(&prof_dump_seq_mtx);
-	prof_dump_filename(filename, 'i', prof_dump_iseq);
-	prof_dump_iseq++;
-	malloc_mutex_unlock(&prof_dump_seq_mtx);
-	prof_dump(filename, false, false);
+	if (opt_prof_prefix[0] != '\0') {
+		malloc_mutex_lock(&prof_dump_seq_mtx);
+		prof_dump_filename(filename, 'i', prof_dump_iseq);
+		prof_dump_iseq++;
+		malloc_mutex_unlock(&prof_dump_seq_mtx);
+		prof_dump(filename, false, false);
+	}
 }
 
 bool
@@ -1168,6 +976,8 @@
 
 	if (filename == NULL) {
 		/* No filename specified, so automatically generate one. */
+		if (opt_prof_prefix[0] == '\0')
+			return (true);
 		malloc_mutex_lock(&prof_dump_seq_mtx);
 		prof_dump_filename(filename_buf, 'm', prof_dump_mseq);
 		prof_dump_mseq++;
@@ -1178,7 +988,7 @@
 }
 
 void
-prof_udump(void)
+prof_gdump(void)
 {
 	char filename[DUMP_FILENAME_BUFSIZE];
 
@@ -1186,17 +996,19 @@
 		return;
 	malloc_mutex_lock(&enq_mtx);
 	if (enq) {
-		enq_udump = true;
+		enq_gdump = true;
 		malloc_mutex_unlock(&enq_mtx);
 		return;
 	}
 	malloc_mutex_unlock(&enq_mtx);
 
-	malloc_mutex_lock(&prof_dump_seq_mtx);
-	prof_dump_filename(filename, 'u', prof_dump_useq);
-	prof_dump_useq++;
-	malloc_mutex_unlock(&prof_dump_seq_mtx);
-	prof_dump(filename, false, false);
+	if (opt_prof_prefix[0] != '\0') {
+		malloc_mutex_lock(&prof_dump_seq_mtx);
+		prof_dump_filename(filename, 'u', prof_dump_useq);
+		prof_dump_useq++;
+		malloc_mutex_unlock(&prof_dump_seq_mtx);
+		prof_dump(filename, false, false);
+	}
 }
 
 static void
@@ -1239,52 +1051,69 @@
 	return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0);
 }
 
-static void
-bt2cnt_thread_cleanup(void *arg)
+prof_tdata_t *
+prof_tdata_init(void)
 {
-	ckh_t *bt2cnt;
+	prof_tdata_t *prof_tdata;
 
-	bt2cnt = bt2cnt_tls;
-	if (bt2cnt != NULL) {
-		ql_head(prof_thr_cnt_t) cnts_ql;
-		size_t tabind;
+	/* Initialize an empty cache for this thread. */
+	prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t));
+	if (prof_tdata == NULL)
+		return (NULL);
+
+	if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS,
+	    prof_bt_hash, prof_bt_keycomp)) {
+		idalloc(prof_tdata);
+		return (NULL);
+	}
+	ql_new(&prof_tdata->lru_ql);
+
+	prof_tdata->vec = imalloc(sizeof(void *) * prof_bt_max);
+	if (prof_tdata->vec == NULL) {
+
+		ckh_delete(&prof_tdata->bt2cnt);
+		idalloc(prof_tdata);
+		return (NULL);
+	}
+
+	prof_tdata->prn_state = 0;
+	prof_tdata->threshold = 0;
+	prof_tdata->accum = 0;
+
+	PROF_TCACHE_SET(prof_tdata);
+
+	return (prof_tdata);
+}
+
+static void
+prof_tdata_cleanup(void *arg)
+{
+	prof_tdata_t *prof_tdata;
+
+	prof_tdata = PROF_TCACHE_GET();
+	if (prof_tdata != NULL) {
 		prof_thr_cnt_t *cnt;
 
-		/* Iteratively merge cnt's into the global stats. */
-		ql_new(&cnts_ql);
-		tabind = 0;
-		while (ckh_iter(bt2cnt, &tabind, NULL, (void **)&cnt) ==
-		    false) {
-			prof_ctx_t *ctx = cnt->ctx;
-			/* Merge stats and detach from ctx. */
-			malloc_mutex_lock(&ctx->lock);
-			ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
-			ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
-			ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
-			ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
-			ql_remove(&ctx->cnts_ql, cnt, link);
-			malloc_mutex_unlock(&ctx->lock);
-
-			/*
-			 * Stash cnt for deletion after finishing with
-			 * ckh_iter().
-			 */
-			ql_tail_insert(&cnts_ql, cnt, link);
-		}
+		/*
+		 * Delete the hash table.  All of its contents can still be
+		 * iterated over via the LRU.
+		 */
+		ckh_delete(&prof_tdata->bt2cnt);
 
 		/*
-		 * Delete the hash table now that cnts_ql has a list of all
-		 * cnt's.
+		 * Iteratively merge cnt's into the global stats and delete
+		 * them.
 		 */
-		ckh_delete(bt2cnt);
-		idalloc(bt2cnt);
-		bt2cnt_tls = NULL;
-
-		/* Delete cnt's. */
-		while ((cnt = ql_last(&cnts_ql, link)) != NULL) {
-			ql_remove(&cnts_ql, cnt, link);
+		while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) {
+			prof_ctx_merge(cnt->ctx, cnt);
+			ql_remove(&prof_tdata->lru_ql, cnt, lru_link);
 			idalloc(cnt);
 		}
+
+		idalloc(prof_tdata->vec);
+
+		idalloc(prof_tdata);
+		PROF_TCACHE_SET(NULL);
 	}
 }
 
@@ -1292,6 +1121,14 @@
 prof_boot0(void)
 {
 
+	memcpy(opt_prof_prefix, PROF_PREFIX_DEFAULT,
+	    sizeof(PROF_PREFIX_DEFAULT));
+}
+
+void
+prof_boot1(void)
+{
+
 	/*
 	 * opt_prof and prof_promote must be in their final state before any
 	 * arenas are initialized, so this function must be executed early.
@@ -1303,7 +1140,7 @@
 		 * automatically dumped.
 		 */
 		opt_prof = true;
-		opt_prof_udump = false;
+		opt_prof_gdump = false;
 		prof_interval = 0;
 	} else if (opt_prof) {
 		if (opt_lg_prof_interval >= 0) {
@@ -1317,7 +1154,7 @@
 }
 
 bool
-prof_boot1(void)
+prof_boot2(void)
 {
 
 	if (opt_prof) {
@@ -1326,7 +1163,7 @@
 			return (true);
 		if (malloc_mutex_init(&bt2ctx_mtx))
 			return (true);
-		if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup)
+		if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup)
 		    != 0) {
 			malloc_write(
 			    "<jemalloc>: Error in pthread_key_create()\n");
@@ -1341,7 +1178,7 @@
 			return (true);
 		enq = false;
 		enq_idump = false;
-		enq_udump = false;
+		enq_gdump = false;
 
 		if (atexit(prof_fdump) != 0) {
 			malloc_write("<jemalloc>: Error in atexit()\n");
diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c
new file mode 100644
index 0000000..7753743
--- /dev/null
+++ b/jemalloc/src/rtree.c
@@ -0,0 +1,43 @@
+#define	RTREE_C_
+#include "jemalloc/internal/jemalloc_internal.h"
+
+rtree_t *
+rtree_new(unsigned bits)
+{
+	rtree_t *ret;
+	unsigned bits_per_level, height, i;
+
+	bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
+	height = bits / bits_per_level;
+	if (height * bits_per_level != bits)
+		height++;
+	assert(height * bits_per_level >= bits);
+
+	ret = (rtree_t*)base_alloc(offsetof(rtree_t, level2bits) +
+	    (sizeof(unsigned) * height));
+	if (ret == NULL)
+		return (NULL);
+	memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) *
+	    height));
+
+	malloc_mutex_init(&ret->mutex);
+	ret->height = height;
+	if (bits_per_level * height > bits)
+		ret->level2bits[0] = bits % bits_per_level;
+	else
+		ret->level2bits[0] = bits_per_level;
+	for (i = 1; i < height; i++)
+		ret->level2bits[i] = bits_per_level;
+
+	ret->root = (void**)base_alloc(sizeof(void *) << ret->level2bits[0]);
+	if (ret->root == NULL) {
+		/*
+		 * We leak the rtree here, since there's no generic base
+		 * deallocation.
+		 */
+		return (NULL);
+	}
+	memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]);
+
+	return (ret);
+}
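For a concrete feel for the height/level2bits computation in rtree_new(), the sketch below mirrors the arithmetic with assumed numbers (a 64-byte RTREE_NODESIZE, 8-byte pointers, and a 20-bit key are illustrative, not taken from the actual headers): 64/8 = 8 slots per node gives 3 key bits per level, so a 20-bit key needs 7 levels, with the root level absorbing the 2-bit remainder.

    #include <assert.h>
    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    int
    main(void)
    {
        /* Illustrative stand-ins for RTREE_NODESIZE, sizeof(void *), and the
         * key width passed to rtree_new(). */
        unsigned nodesize = 64, ptrsize = 8, bits = 20;
        unsigned bits_per_level, height, i;

        /* 64/8 == 8 slots is already a power of two, so pow2_ceil() would be
         * a no-op here and ffs(8) - 1 == 3. */
        bits_per_level = ffs((int)(nodesize / ptrsize)) - 1;
        height = bits / bits_per_level;
        if (height * bits_per_level != bits)
            height++;                       /* ceil(20 / 3) == 7 */
        assert(height * bits_per_level >= bits);

        /* The root level takes the remainder: 20 - 6*3 == 2 bits. */
        printf("level 0: %u bits\n", (bits_per_level * height > bits) ?
            bits % bits_per_level : bits_per_level);
        for (i = 1; i < height; i++)
            printf("level %u: %u bits\n", i, bits_per_level);
        return 0;
    }
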
diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c
index 9dc7529..3dfe0d2 100644
--- a/jemalloc/src/stats.c
+++ b/jemalloc/src/stats.c
@@ -57,12 +57,12 @@
 
 /*
  * We don't want to depend on vsnprintf() for production builds, since that can
- * cause unnecessary bloat for static binaries.  umax2s() provides minimal
- * integer printing functionality, so that malloc_printf() use can be limited to
+ * cause unnecessary bloat for static binaries.  u2s() provides minimal integer
+ * printing functionality, so that malloc_printf() use can be limited to
  * JEMALLOC_STATS code.
  */
 char *
-umax2s(uintmax_t x, unsigned base, char *s)
+u2s(uint64_t x, unsigned base, char *s)
 {
 	unsigned i;
 
@@ -72,8 +72,8 @@
 	case 10:
 		do {
 			i--;
-			s[i] = "0123456789"[x % 10];
-			x /= 10;
+			s[i] = "0123456789"[x % (uint64_t)10];
+			x /= (uint64_t)10;
 		} while (x > 0);
 		break;
 	case 16:
@@ -86,8 +86,9 @@
 	default:
 		do {
 			i--;
-			s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x % base];
-			x /= base;
+			s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x %
+			    (uint64_t)base];
+			x /= (uint64_t)base;
 		} while (x > 0);
 	}
 
@@ -374,6 +375,7 @@
 stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
     const char *opts)
 {
+	int err;
 	uint64_t epoch;
 	size_t u64sz;
 	char s[UMAX2S_BUFSIZE];
@@ -383,10 +385,27 @@
 	bool bins = true;
 	bool large = true;
 
-	/* Refresh stats, in case mallctl() was called by the application. */
+	/*
+	 * Refresh stats, in case mallctl() was called by the application.
+	 *
+	 * Check for OOM here, since refreshing the ctl cache can trigger
+	 * allocation.  In practice, none of the subsequent mallctl()-related
+	 * calls in this function will cause OOM if this one succeeds.
+	 */
 	epoch = 1;
 	u64sz = sizeof(uint64_t);
-	xmallctl("epoch", &epoch, &u64sz, &epoch, sizeof(uint64_t));
+	err = JEMALLOC_P(mallctl)("epoch", &epoch, &u64sz, &epoch,
+	    sizeof(uint64_t));
+	if (err != 0) {
+		if (err == EAGAIN) {
+			malloc_write("<jemalloc>: Memory allocation failure in "
+			    "mallctl(\"epoch\", ...)\n");
+			return;
+		}
+		malloc_write("<jemalloc>: Failure in mallctl(\"epoch\", "
+		    "...)\n");
+		abort();
+	}
 
 	if (write_cb == NULL) {
 		/*
@@ -430,10 +449,12 @@
 		bool bv;
 		unsigned uv;
 		ssize_t ssv;
-		size_t sv, bsz, ssz;
+		size_t sv, bsz, ssz, sssz, cpsz;
 
 		bsz = sizeof(bool);
 		ssz = sizeof(size_t);
+		sssz = sizeof(ssize_t);
+		cpsz = sizeof(const char *);
 
 		CTL_GET("version", &cpv, const char *);
 		write_cb(cbopaque, "Version: ");
@@ -444,113 +465,140 @@
 		write_cb(cbopaque, bv ? "enabled" : "disabled");
 		write_cb(cbopaque, "\n");
 
-		write_cb(cbopaque, "Boolean JEMALLOC_OPTIONS: ");
-		if ((err = JEMALLOC_P(mallctl)("opt.abort", &bv, &bsz, NULL, 0))
-		    == 0)
-			write_cb(cbopaque, bv ? "A" : "a");
-		if ((err = JEMALLOC_P(mallctl)("prof.active", &bv, &bsz,
-		    NULL, 0)) == 0)
-			write_cb(cbopaque, bv ? "E" : "e");
-		if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
-		    == 0)
-			write_cb(cbopaque, bv ? "F" : "f");
-		if ((err = JEMALLOC_P(mallctl)("opt.tcache", &bv, &bsz, NULL,
-		    0)) == 0)
-			write_cb(cbopaque, bv ? "H" : "h");
-		if ((err = JEMALLOC_P(mallctl)("opt.junk", &bv, &bsz, NULL, 0))
-		    == 0)
-			write_cb(cbopaque, bv ? "J" : "j");
-		if ((err = JEMALLOC_P(mallctl)("opt.prof_leak", &bv, &bsz, NULL,
-		    0)) == 0)
-			write_cb(cbopaque, bv ? "L" : "l");
-		if ((err = JEMALLOC_P(mallctl)("opt.overcommit", &bv, &bsz,
-		    NULL, 0)) == 0)
-			write_cb(cbopaque, bv ? "O" : "o");
-		if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz,
-		    NULL, 0)) == 0)
-			write_cb(cbopaque, bv ? "P" : "p");
-		if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz,
-		    NULL, 0)) == 0)
-			write_cb(cbopaque, bv ? "U" : "u");
-		if ((err = JEMALLOC_P(mallctl)("opt.sysv", &bv, &bsz, NULL, 0))
-		    == 0)
-			write_cb(cbopaque, bv ? "V" : "v");
-		if ((err = JEMALLOC_P(mallctl)("opt.xmalloc", &bv, &bsz, NULL,
-		    0)) == 0)
-			write_cb(cbopaque, bv ? "X" : "x");
-		if ((err = JEMALLOC_P(mallctl)("opt.zero", &bv, &bsz, NULL, 0))
-		    == 0)
-			write_cb(cbopaque, bv ? "Z" : "z");
-		write_cb(cbopaque, "\n");
+#define OPT_WRITE_BOOL(n)						\
+		if ((err = JEMALLOC_P(mallctl)("opt."#n, &bv, &bsz,	\
+		    NULL, 0)) == 0) {					\
+			write_cb(cbopaque, "  opt."#n": ");		\
+			write_cb(cbopaque, bv ? "true" : "false");	\
+			write_cb(cbopaque, "\n");			\
+		}
+#define OPT_WRITE_SIZE_T(n)						\
+		if ((err = JEMALLOC_P(mallctl)("opt."#n, &sv, &ssz,	\
+		    NULL, 0)) == 0) {					\
+			write_cb(cbopaque, "  opt."#n": ");		\
+			write_cb(cbopaque, u2s(sv, 10, s));		\
+			write_cb(cbopaque, "\n");			\
+		}
+#define OPT_WRITE_SSIZE_T(n)						\
+		if ((err = JEMALLOC_P(mallctl)("opt."#n, &ssv, &sssz,	\
+		    NULL, 0)) == 0) {					\
+			if (ssv >= 0) {					\
+				write_cb(cbopaque, "  opt."#n": ");	\
+				write_cb(cbopaque, u2s(ssv, 10, s));	\
+			} else {					\
+				write_cb(cbopaque, "  opt."#n": -");	\
+				write_cb(cbopaque, u2s(-ssv, 10, s));	\
+			}						\
+			write_cb(cbopaque, "\n");			\
+		}
+#define OPT_WRITE_CHAR_P(n)						\
+		if ((err = JEMALLOC_P(mallctl)("opt."#n, &cpv, &cpsz,	\
+		    NULL, 0)) == 0) {					\
+			write_cb(cbopaque, "  opt."#n": \"");		\
+			write_cb(cbopaque, cpv);			\
+			write_cb(cbopaque, "\"\n");			\
+		}
+
+		write_cb(cbopaque, "Run-time option settings:\n");
+		OPT_WRITE_BOOL(abort)
+		OPT_WRITE_SIZE_T(lg_qspace_max)
+		OPT_WRITE_SIZE_T(lg_cspace_max)
+		OPT_WRITE_SIZE_T(lg_chunk)
+		OPT_WRITE_SIZE_T(narenas)
+		OPT_WRITE_SSIZE_T(lg_dirty_mult)
+		OPT_WRITE_BOOL(stats_print)
+		OPT_WRITE_BOOL(junk)
+		OPT_WRITE_BOOL(zero)
+		OPT_WRITE_BOOL(sysv)
+		OPT_WRITE_BOOL(xmalloc)
+		OPT_WRITE_BOOL(tcache)
+		OPT_WRITE_SSIZE_T(lg_tcache_gc_sweep)
+		OPT_WRITE_SSIZE_T(lg_tcache_max)
+		OPT_WRITE_BOOL(prof)
+		OPT_WRITE_CHAR_P(prof_prefix)
+		OPT_WRITE_SIZE_T(lg_prof_bt_max)
+		OPT_WRITE_BOOL(prof_active)
+		OPT_WRITE_SSIZE_T(lg_prof_sample)
+		OPT_WRITE_BOOL(prof_accum)
+		OPT_WRITE_SSIZE_T(lg_prof_tcmax)
+		OPT_WRITE_SSIZE_T(lg_prof_interval)
+		OPT_WRITE_BOOL(prof_gdump)
+		OPT_WRITE_BOOL(prof_leak)
+		OPT_WRITE_BOOL(overcommit)
+
+#undef OPT_WRITE_BOOL
+#undef OPT_WRITE_SIZE_T
+#undef OPT_WRITE_SSIZE_T
+#undef OPT_WRITE_CHAR_P
 
 		write_cb(cbopaque, "CPUs: ");
-		write_cb(cbopaque, umax2s(ncpus, 10, s));
+		write_cb(cbopaque, u2s(ncpus, 10, s));
 		write_cb(cbopaque, "\n");
 
 		CTL_GET("arenas.narenas", &uv, unsigned);
 		write_cb(cbopaque, "Max arenas: ");
-		write_cb(cbopaque, umax2s(uv, 10, s));
+		write_cb(cbopaque, u2s(uv, 10, s));
 		write_cb(cbopaque, "\n");
 
 		write_cb(cbopaque, "Pointer size: ");
-		write_cb(cbopaque, umax2s(sizeof(void *), 10, s));
+		write_cb(cbopaque, u2s(sizeof(void *), 10, s));
 		write_cb(cbopaque, "\n");
 
 		CTL_GET("arenas.quantum", &sv, size_t);
 		write_cb(cbopaque, "Quantum size: ");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "\n");
 
 		CTL_GET("arenas.cacheline", &sv, size_t);
 		write_cb(cbopaque, "Cacheline size (assumed): ");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "\n");
 
 		CTL_GET("arenas.subpage", &sv, size_t);
 		write_cb(cbopaque, "Subpage spacing: ");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "\n");
 
 		if ((err = JEMALLOC_P(mallctl)("arenas.tspace_min", &sv, &ssz,
 		    NULL, 0)) == 0) {
 			write_cb(cbopaque, "Tiny 2^n-spaced sizes: [");
-			write_cb(cbopaque, umax2s(sv, 10, s));
+			write_cb(cbopaque, u2s(sv, 10, s));
 			write_cb(cbopaque, "..");
 
 			CTL_GET("arenas.tspace_max", &sv, size_t);
-			write_cb(cbopaque, umax2s(sv, 10, s));
+			write_cb(cbopaque, u2s(sv, 10, s));
 			write_cb(cbopaque, "]\n");
 		}
 
 		CTL_GET("arenas.qspace_min", &sv, size_t);
 		write_cb(cbopaque, "Quantum-spaced sizes: [");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "..");
 		CTL_GET("arenas.qspace_max", &sv, size_t);
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "]\n");
 
 		CTL_GET("arenas.cspace_min", &sv, size_t);
 		write_cb(cbopaque, "Cacheline-spaced sizes: [");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "..");
 		CTL_GET("arenas.cspace_max", &sv, size_t);
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "]\n");
 
 		CTL_GET("arenas.sspace_min", &sv, size_t);
 		write_cb(cbopaque, "Subpage-spaced sizes: [");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "..");
 		CTL_GET("arenas.sspace_max", &sv, size_t);
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, "]\n");
 
 		CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t);
 		if (ssv >= 0) {
 			write_cb(cbopaque,
 			    "Min active:dirty page ratio per arena: ");
-			write_cb(cbopaque, umax2s((1U << ssv), 10, s));
+			write_cb(cbopaque, u2s((1U << ssv), 10, s));
 			write_cb(cbopaque, ":1\n");
 		} else {
 			write_cb(cbopaque,
@@ -560,7 +608,7 @@
 		    &ssz, NULL, 0)) == 0) {
 			write_cb(cbopaque,
 			    "Maximum thread-cached size class: ");
-			write_cb(cbopaque, umax2s(sv, 10, s));
+			write_cb(cbopaque, u2s(sv, 10, s));
 			write_cb(cbopaque, "\n");
 		}
 		if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv,
@@ -570,39 +618,51 @@
 			CTL_GET("opt.tcache", &tcache_enabled, bool);
 			write_cb(cbopaque, "Thread cache GC sweep interval: ");
 			write_cb(cbopaque, tcache_enabled && ssv >= 0 ?
-			    umax2s(tcache_gc_sweep, 10, s) : "N/A");
+			    u2s(tcache_gc_sweep, 10, s) : "N/A");
 			write_cb(cbopaque, "\n");
 		}
 		if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
 		   == 0 && bv) {
 			CTL_GET("opt.lg_prof_bt_max", &sv, size_t);
 			write_cb(cbopaque, "Maximum profile backtrace depth: ");
-			write_cb(cbopaque, umax2s((1U << sv), 10, s));
+			write_cb(cbopaque, u2s((1U << sv), 10, s));
 			write_cb(cbopaque, "\n");
 
+			CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t);
+			write_cb(cbopaque,
+			    "Maximum per thread backtrace cache: ");
+			if (ssv >= 0) {
+				write_cb(cbopaque, u2s((1U << ssv), 10, s));
+				write_cb(cbopaque, " (2^");
+				write_cb(cbopaque, u2s(ssv, 10, s));
+				write_cb(cbopaque, ")\n");
+			} else
+				write_cb(cbopaque, "N/A\n");
+
 			CTL_GET("opt.lg_prof_sample", &sv, size_t);
 			write_cb(cbopaque, "Average profile sample interval: ");
-			write_cb(cbopaque, umax2s((1U << sv), 10, s));
+			write_cb(cbopaque, u2s((((uint64_t)1U) << sv), 10, s));
 			write_cb(cbopaque, " (2^");
-			write_cb(cbopaque, umax2s(sv, 10, s));
+			write_cb(cbopaque, u2s(sv, 10, s));
 			write_cb(cbopaque, ")\n");
 
 			CTL_GET("opt.lg_prof_interval", &ssv, ssize_t);
 			write_cb(cbopaque, "Average profile dump interval: ");
 			if (ssv >= 0) {
-				write_cb(cbopaque, umax2s((1U << ssv), 10, s));
+				write_cb(cbopaque, u2s((((uint64_t)1U) << ssv),
+				    10, s));
 				write_cb(cbopaque, " (2^");
-				write_cb(cbopaque, umax2s(ssv, 10, s));
+				write_cb(cbopaque, u2s(ssv, 10, s));
 				write_cb(cbopaque, ")\n");
 			} else
 				write_cb(cbopaque, "N/A\n");
 		}
 		CTL_GET("arenas.chunksize", &sv, size_t);
 		write_cb(cbopaque, "Chunk size: ");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		CTL_GET("opt.lg_chunk", &sv, size_t);
 		write_cb(cbopaque, " (2^");
-		write_cb(cbopaque, umax2s(sv, 10, s));
+		write_cb(cbopaque, u2s(sv, 10, s));
 		write_cb(cbopaque, ")\n");
 	}
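
The rewritten stats_print() above routes every line of output through the caller-supplied write_cb/cbopaque pair, so an application can capture the report instead of letting it go to stderr. Below is a minimal sketch of driving it through the public, name-mangled malloc_stats_print() entry point with a custom callback; the buffer size, the test header include, and the NULL opts argument are illustrative choices, not part of this patch. Passing NULL for opts requests the full report.

    #include <stdio.h>
    #include <string.h>

    #define JEMALLOC_MANGLE
    #include "jemalloc_test.h"

    #define BUF_SIZE 65536

    /* Append each chunk of stats output to a caller-provided buffer. */
    static void
    buf_write_cb(void *cbopaque, const char *s)
    {
    	char *buf = (char *)cbopaque;

    	strncat(buf, s, BUF_SIZE - strlen(buf) - 1);
    }

    int
    main(void)
    {
    	static char buf[BUF_SIZE];

    	buf[0] = '\0';
    	/* NULL opts requests the full report. */
    	JEMALLOC_P(malloc_stats_print)(buf_write_cb, buf, NULL);
    	fputs(buf, stdout);
    	return (0);
    }
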
 
diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c
index ace24ce..cbbe7a1 100644
--- a/jemalloc/src/tcache.c
+++ b/jemalloc/src/tcache.c
@@ -5,17 +5,19 @@
 /* Data. */
 
 bool	opt_tcache = true;
-ssize_t	opt_lg_tcache_maxclass = LG_TCACHE_MAXCLASS_DEFAULT;
+ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 ssize_t	opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT;
 
 /* Map of thread-specific caches. */
+#ifndef NO_TLS
 __thread tcache_t	*tcache_tls JEMALLOC_ATTR(tls_model("initial-exec"));
+#endif
 
 /*
  * Same contents as tcache, but initialized such that the TSD destructor is
  * called when a thread exits, so that the cache can be cleaned up.
  */
-static pthread_key_t		tcache_tsd;
+pthread_key_t		tcache_tsd;
 
 size_t				nhbins;
 size_t				tcache_maxclass;
@@ -93,10 +95,10 @@
 			flush = *(void **)ptr;
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (chunk->arena == arena) {
-				size_t pageind = (((uintptr_t)ptr -
-				    (uintptr_t)chunk) >> PAGE_SHIFT);
+				size_t pageind = ((uintptr_t)ptr -
+				    (uintptr_t)chunk) >> PAGE_SHIFT;
 				arena_chunk_map_t *mapelm =
-				    &chunk->map[pageind];
+				    &chunk->map[pageind-map_bias];
 				arena_dalloc_bin(arena, chunk, ptr, mapelm);
 			} else {
 				/*
@@ -202,12 +204,14 @@
 	size_t size;
 	unsigned i;
 
-	size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nhbins - 1));
+	size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins);
 	/*
 	 * Round up to the nearest multiple of the cacheline size, in order to
 	 * avoid the possibility of false cacheline sharing.
 	 *
-	 * That this works relies on the same logic as in ipalloc().
+	 * This relies on the same logic as in ipalloc(), but we
+	 * cannot directly call ipalloc() here due to tcache bootstrapping
+	 * issues.
 	 */
 	size = (size + CACHELINE_MASK) & (-CACHELINE);
 
@@ -239,8 +243,7 @@
 	for (; i < nhbins; i++)
 		tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE;
 
-	tcache_tls = tcache;
-	pthread_setspecific(tcache_tsd, tcache);
+	TCACHE_SET(tcache);
 
 	return (tcache);
 }
@@ -308,9 +311,9 @@
 	if (arena_salloc(tcache) <= small_maxclass) {
 		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
 		arena_t *arena = chunk->arena;
-		size_t pageind = (((uintptr_t)tcache - (uintptr_t)chunk) >>
-		    PAGE_SHIFT);
-		arena_chunk_map_t *mapelm = &chunk->map[pageind];
+		size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
+		    PAGE_SHIFT;
+		arena_chunk_map_t *mapelm = &chunk->map[pageind-map_bias];
 		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
 		    (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) <<
 		    PAGE_SHIFT));
@@ -328,11 +331,24 @@
 {
 	tcache_t *tcache = (tcache_t *)arg;
 
-	assert(tcache == tcache_tls);
-	if (tcache != NULL) {
+	if (tcache == (void *)(uintptr_t)1) {
+		/*
+		 * The previous time this destructor was called, we set the key
+		 * to 1 so that other destructors wouldn't cause re-creation of
+		 * the tcache.  This time, do nothing, so that the destructor
+		 * will not be called again.
+		 */
+	} else if (tcache == (void *)(uintptr_t)2) {
+		/*
+		 * Another destructor called an allocator function after this
+		 * destructor was called.  Reset tcache to 1 in order to
+		 * receive another callback.
+		 */
+		TCACHE_SET((uintptr_t)1);
+	} else if (tcache != NULL) {
 		assert(tcache != (void *)(uintptr_t)1);
 		tcache_destroy(tcache);
-		tcache_tls = (void *)(uintptr_t)1;
+		TCACHE_SET((uintptr_t)1);
 	}
 }
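
tcache_thread_cleanup() above uses sentinel key values so that a pthreads TSD destructor can cope with other destructors touching the allocator after it has already run. Below is a stripped-down sketch of the same pattern outside jemalloc; the names and the single-sentinel simplification are illustrative (the real code also distinguishes a second sentinel via TCACHE_SET).

    #include <pthread.h>
    #include <stdint.h>
    #include <stdlib.h>

    static pthread_key_t cache_tsd;

    static void
    cache_cleanup(void *arg)
    {

    	if (arg == (void *)(uintptr_t)1) {
    		/* Sentinel from a previous pass; nothing left to clean up. */
    		return;
    	}
    	free(arg);	/* Release the per-thread cache object. */
    	/*
    	 * Re-set the key to a sentinel: POSIX re-runs destructors for keys
    	 * that are non-NULL after a pass, so a later pass sees the sentinel
    	 * rather than a dangling pointer.
    	 */
    	pthread_setspecific(cache_tsd, (void *)(uintptr_t)1);
    }

    void
    cache_boot(void)
    {

    	pthread_key_create(&cache_tsd, cache_cleanup);
    }
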
 
@@ -368,16 +384,16 @@
 
 	if (opt_tcache) {
 		/*
-		 * If necessary, clamp opt_lg_tcache_maxclass, now that
+		 * If necessary, clamp opt_lg_tcache_max, now that
 		 * small_maxclass and arena_maxclass are known.
 		 */
-		if (opt_lg_tcache_maxclass < 0 || (1U <<
-		    opt_lg_tcache_maxclass) < small_maxclass)
+		if (opt_lg_tcache_max < 0 || (1U <<
+		    opt_lg_tcache_max) < small_maxclass)
 			tcache_maxclass = small_maxclass;
-		else if ((1U << opt_lg_tcache_maxclass) > arena_maxclass)
+		else if ((1U << opt_lg_tcache_max) > arena_maxclass)
 			tcache_maxclass = arena_maxclass;
 		else
-			tcache_maxclass = (1U << opt_lg_tcache_maxclass);
+			tcache_maxclass = (1U << opt_lg_tcache_max);
 
 		nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT);
 
diff --git a/jemalloc/src/zone.c b/jemalloc/src/zone.c
new file mode 100644
index 0000000..2c1b231
--- /dev/null
+++ b/jemalloc/src/zone.c
@@ -0,0 +1,354 @@
+#include "jemalloc/internal/jemalloc_internal.h"
+#ifndef JEMALLOC_ZONE
+#  error "This source file is for zones on Darwin (OS X)."
+#endif
+
+/******************************************************************************/
+/* Data. */
+
+static malloc_zone_t zone, szone;
+static struct malloc_introspection_t zone_introspect, ozone_introspect;
+
+/******************************************************************************/
+/* Function prototypes for non-inline static functions. */
+
+static size_t	zone_size(malloc_zone_t *zone, void *ptr);
+static void	*zone_malloc(malloc_zone_t *zone, size_t size);
+static void	*zone_calloc(malloc_zone_t *zone, size_t num, size_t size);
+static void	*zone_valloc(malloc_zone_t *zone, size_t size);
+static void	zone_free(malloc_zone_t *zone, void *ptr);
+static void	*zone_realloc(malloc_zone_t *zone, void *ptr, size_t size);
+#if (JEMALLOC_ZONE_VERSION >= 6)
+static void	*zone_memalign(malloc_zone_t *zone, size_t alignment,
+    size_t size);
+static void	zone_free_definite_size(malloc_zone_t *zone, void *ptr,
+    size_t size);
+#endif
+static void	*zone_destroy(malloc_zone_t *zone);
+static size_t	zone_good_size(malloc_zone_t *zone, size_t size);
+static void	zone_force_lock(malloc_zone_t *zone);
+static void	zone_force_unlock(malloc_zone_t *zone);
+static size_t	ozone_size(malloc_zone_t *zone, void *ptr);
+static void	ozone_free(malloc_zone_t *zone, void *ptr);
+static void	*ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size);
+static unsigned	ozone_batch_malloc(malloc_zone_t *zone, size_t size,
+    void **results, unsigned num_requested);
+static void	ozone_batch_free(malloc_zone_t *zone, void **to_be_freed,
+    unsigned num);
+#if (JEMALLOC_ZONE_VERSION >= 6)
+static void	ozone_free_definite_size(malloc_zone_t *zone, void *ptr,
+    size_t size);
+#endif
+static void	ozone_force_lock(malloc_zone_t *zone);
+static void	ozone_force_unlock(malloc_zone_t *zone);
+
+/******************************************************************************/
+/*
+ * Functions.
+ */
+
+static size_t
+zone_size(malloc_zone_t *zone, void *ptr)
+{
+
+	/*
+	 * There appear to be places within Darwin (such as setenv(3)) that
+	 * cause calls to this function with pointers that *no* zone owns.  If
+	 * we knew that all pointers were owned by *some* zone, we could split
+	 * our zone into two parts, and use one as the default allocator and
+	 * the other as the default deallocator/reallocator.  Since that will
+	 * not work in practice, we must check all pointers to assure that they
+	 * reside within a mapped chunk before determining size.
+	 */
+	return (ivsalloc(ptr));
+}
+
+static void *
+zone_malloc(malloc_zone_t *zone, size_t size)
+{
+
+	return (JEMALLOC_P(malloc)(size));
+}
+
+static void *
+zone_calloc(malloc_zone_t *zone, size_t num, size_t size)
+{
+
+	return (JEMALLOC_P(calloc)(num, size));
+}
+
+static void *
+zone_valloc(malloc_zone_t *zone, size_t size)
+{
+	void *ret = NULL; /* Assignment avoids useless compiler warning. */
+
+	JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size);
+
+	return (ret);
+}
+
+static void
+zone_free(malloc_zone_t *zone, void *ptr)
+{
+
+	JEMALLOC_P(free)(ptr);
+}
+
+static void *
+zone_realloc(malloc_zone_t *zone, void *ptr, size_t size)
+{
+
+	return (JEMALLOC_P(realloc)(ptr, size));
+}
+
+#if (JEMALLOC_ZONE_VERSION >= 6)
+static void *
+zone_memalign(malloc_zone_t *zone, size_t alignment, size_t size)
+{
+	void *ret = NULL; /* Assignment avoids useless compiler warning. */
+
+	JEMALLOC_P(posix_memalign)(&ret, alignment, size);
+
+	return (ret);
+}
+
+static void
+zone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size)
+{
+
+	assert(ivsalloc(ptr) == size);
+	JEMALLOC_P(free)(ptr);
+}
+#endif
+
+static void *
+zone_destroy(malloc_zone_t *zone)
+{
+
+	/* This function should never be called. */
+	assert(false);
+	return (NULL);
+}
+
+static size_t
+zone_good_size(malloc_zone_t *zone, size_t size)
+{
+	size_t ret;
+	void *p;
+
+	/*
+	 * Actually create an object of the appropriate size, then find out
+	 * how large it could have been without moving up to the next size
+	 * class.
+	 */
+	p = JEMALLOC_P(malloc)(size);
+	if (p != NULL) {
+		ret = isalloc(p);
+		JEMALLOC_P(free)(p);
+	} else
+		ret = size;
+
+	return (ret);
+}
+
+static void
+zone_force_lock(malloc_zone_t *zone)
+{
+
+	if (isthreaded)
+		jemalloc_prefork();
+}
+
+static void
+zone_force_unlock(malloc_zone_t *zone)
+{
+
+	if (isthreaded)
+		jemalloc_postfork();
+}
+
+malloc_zone_t *
+create_zone(void)
+{
+
+	zone.size = (void *)zone_size;
+	zone.malloc = (void *)zone_malloc;
+	zone.calloc = (void *)zone_calloc;
+	zone.valloc = (void *)zone_valloc;
+	zone.free = (void *)zone_free;
+	zone.realloc = (void *)zone_realloc;
+	zone.destroy = (void *)zone_destroy;
+	zone.zone_name = "jemalloc_zone";
+	zone.batch_malloc = NULL;
+	zone.batch_free = NULL;
+	zone.introspect = &zone_introspect;
+	zone.version = JEMALLOC_ZONE_VERSION;
+#if (JEMALLOC_ZONE_VERSION >= 6)
+	zone.memalign = zone_memalign;
+	zone.free_definite_size = zone_free_definite_size;
+#endif
+
+	zone_introspect.enumerator = NULL;
+	zone_introspect.good_size = (void *)zone_good_size;
+	zone_introspect.check = NULL;
+	zone_introspect.print = NULL;
+	zone_introspect.log = NULL;
+	zone_introspect.force_lock = (void *)zone_force_lock;
+	zone_introspect.force_unlock = (void *)zone_force_unlock;
+	zone_introspect.statistics = NULL;
+#if (JEMALLOC_ZONE_VERSION >= 6)
+	zone_introspect.zone_locked = NULL;
+#endif
+
+	return (&zone);
+}
+
+static size_t
+ozone_size(malloc_zone_t *zone, void *ptr)
+{
+	size_t ret;
+
+	ret = ivsalloc(ptr);
+	if (ret == 0)
+		ret = szone.size(zone, ptr);
+
+	return (ret);
+}
+
+static void
+ozone_free(malloc_zone_t *zone, void *ptr)
+{
+
+	if (ivsalloc(ptr) != 0)
+		JEMALLOC_P(free)(ptr);
+	else {
+		size_t size = szone.size(zone, ptr);
+		if (size != 0)
+			(szone.free)(zone, ptr);
+	}
+}
+
+static void *
+ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size)
+{
+	size_t oldsize;
+
+	if (ptr == NULL)
+		return (JEMALLOC_P(malloc)(size));
+
+	oldsize = ivsalloc(ptr);
+	if (oldsize != 0)
+		return (JEMALLOC_P(realloc)(ptr, size));
+	else {
+		oldsize = szone.size(zone, ptr);
+		if (oldsize == 0)
+			return (JEMALLOC_P(malloc)(size));
+		else {
+			void *ret = JEMALLOC_P(malloc)(size);
+			if (ret != NULL) {
+				memcpy(ret, ptr, (oldsize < size) ? oldsize :
+				    size);
+				(szone.free)(zone, ptr);
+			}
+			return (ret);
+		}
+	}
+}
+
+static unsigned
+ozone_batch_malloc(malloc_zone_t *zone, size_t size, void **results,
+    unsigned num_requested)
+{
+
+	/* Don't bother implementing this interface, since it isn't required. */
+	return (0);
+}
+
+static void
+ozone_batch_free(malloc_zone_t *zone, void **to_be_freed, unsigned num)
+{
+	unsigned i;
+
+	for (i = 0; i < num; i++)
+		ozone_free(zone, to_be_freed[i]);
+}
+
+#if (JEMALLOC_ZONE_VERSION >= 6)
+static void
+ozone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size)
+{
+
+	if (ivsalloc(ptr) != 0) {
+		assert(ivsalloc(ptr) == size);
+		JEMALLOC_P(free)(ptr);
+	} else {
+		assert(size == szone.size(zone, ptr));
+		szone.free_definite_size(zone, ptr, size);
+	}
+}
+#endif
+
+static void
+ozone_force_lock(malloc_zone_t *zone)
+{
+
+	/* jemalloc locking is taken care of by the normal jemalloc zone. */
+	szone.introspect->force_lock(zone);
+}
+
+static void
+ozone_force_unlock(malloc_zone_t *zone)
+{
+
+	/* jemalloc locking is taken care of by the normal jemalloc zone. */
+	szone.introspect->force_unlock(zone);
+}
+
+/*
+ * Overlay the default scalable zone (szone) such that existing allocations are
+ * drained, and further allocations come from jemalloc.  This is necessary
+ * because Core Foundation directly accesses and uses the szone before the
+ * jemalloc library is even loaded.
+ */
+void
+szone2ozone(malloc_zone_t *zone)
+{
+
+	/*
+	 * Stash a copy of the original szone so that we can call its
+	 * functions as needed.  Note that internally, the szone stores its
+	 * bookkeeping data structures immediately following the malloc_zone_t
+	 * header, so when calling szone functions, we need to pass a pointer
+	 * to the original zone structure.
+	 */
+	memcpy(&szone, zone, sizeof(malloc_zone_t));
+
+	zone->size = (void *)ozone_size;
+	zone->malloc = (void *)zone_malloc;
+	zone->calloc = (void *)zone_calloc;
+	zone->valloc = (void *)zone_valloc;
+	zone->free = (void *)ozone_free;
+	zone->realloc = (void *)ozone_realloc;
+	zone->destroy = (void *)zone_destroy;
+	zone->zone_name = "jemalloc_ozone";
+	zone->batch_malloc = ozone_batch_malloc;
+	zone->batch_free = ozone_batch_free;
+	zone->introspect = &ozone_introspect;
+	zone->version = JEMALLOC_ZONE_VERSION;
+#if (JEMALLOC_ZONE_VERSION >= 6)
+	zone->memalign = zone_memalign;
+	zone->free_definite_size = ozone_free_definite_size;
+#endif
+
+	ozone_introspect.enumerator = NULL;
+	ozone_introspect.good_size = (void *)zone_good_size;
+	ozone_introspect.check = NULL;
+	ozone_introspect.print = NULL;
+	ozone_introspect.log = NULL;
+	ozone_introspect.force_lock = (void *)ozone_force_lock;
+	ozone_introspect.force_unlock = (void *)ozone_force_unlock;
+	ozone_introspect.statistics = NULL;
+#if (JEMALLOC_ZONE_VERSION >= 6)
+	ozone_introspect.zone_locked = NULL;
+#endif
+}
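
zone.c only defines the jemalloc zone and the ozone overlay; it does not register them. Below is a hedged sketch of how an init path might wire them up with the standard Darwin zone calls; the function name and call site here are assumptions, not taken from this patch, and the actual ordering is handled by jemalloc's Darwin bootstrap code.

    #include <malloc/malloc.h>

    malloc_zone_t	*create_zone(void);
    void	szone2ozone(malloc_zone_t *zone);

    /* Hypothetical init-time hook; jemalloc's real call site lives elsewhere. */
    void
    darwin_zone_init(void)
    {

    	/* Register the jemalloc-backed zone for new allocations... */
    	malloc_zone_register(create_zone());
    	/*
    	 * ...and overlay the pre-existing default scalable zone so that
    	 * objects Core Foundation allocated before jemalloc loaded can still
    	 * be sized, freed, and reallocated.
    	 */
    	szone2ozone(malloc_default_zone());
    }
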
diff --git a/jemalloc/test/allocated.c b/jemalloc/test/allocated.c
new file mode 100644
index 0000000..64a1735
--- /dev/null
+++ b/jemalloc/test/allocated.c
@@ -0,0 +1,105 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+
+#define	JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+void *
+thread_start(void *arg)
+{
+	int err;
+	void *p;
+	uint64_t a0, a1, d0, d1;
+	size_t sz, usize;
+
+	sz = sizeof(a0);
+	if ((err = JEMALLOC_P(mallctl)("thread.allocated", &a0, &sz, NULL,
+	    0))) {
+		if (err == ENOENT) {
+#ifdef JEMALLOC_STATS
+			assert(false);
+#endif
+			goto RETURN;
+		}
+		fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
+		    strerror(err));
+		exit(1);
+	}
+
+	sz = sizeof(d0);
+	if ((err = JEMALLOC_P(mallctl)("thread.deallocated", &d0, &sz, NULL,
+	    0))) {
+		if (err == ENOENT) {
+#ifdef JEMALLOC_STATS
+			assert(false);
+#endif
+			goto RETURN;
+		}
+		fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
+		    strerror(err));
+		exit(1);
+	}
+
+	p = JEMALLOC_P(malloc)(1);
+	if (p == NULL) {
+		fprintf(stderr, "%s(): Error in malloc()\n", __func__);
+		exit(1);
+	}
+
+	sz = sizeof(a1);
+	JEMALLOC_P(mallctl)("thread.allocated", &a1, &sz, NULL, 0);
+
+	usize = JEMALLOC_P(malloc_usable_size)(p);
+	assert(a0 + usize <= a1);
+
+	JEMALLOC_P(free)(p);
+
+	sz = sizeof(d1);
+	JEMALLOC_P(mallctl)("thread.deallocated", &d1, &sz, NULL, 0);
+
+	assert(d0 + usize <= d1);
+
+RETURN:
+	return (NULL);
+}
+
+int
+main(void)
+{
+	int ret = 0;
+	pthread_t thread;
+
+	fprintf(stderr, "Test begin\n");
+
+	thread_start(NULL);
+
+	if (pthread_create(&thread, NULL, thread_start, NULL)
+	    != 0) {
+		fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+	pthread_join(thread, (void *)&ret);
+
+	thread_start(NULL);
+
+	if (pthread_create(&thread, NULL, thread_start, NULL)
+	    != 0) {
+		fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+	pthread_join(thread, (void *)&ret);
+
+	thread_start(NULL);
+
+RETURN:
+	fprintf(stderr, "Test end\n");
+	return (ret);
+}
diff --git a/jemalloc/test/allocated.exp b/jemalloc/test/allocated.exp
new file mode 100644
index 0000000..369a88d
--- /dev/null
+++ b/jemalloc/test/allocated.exp
@@ -0,0 +1,2 @@
+Test begin
+Test end
diff --git a/jemalloc/test/allocm.c b/jemalloc/test/allocm.c
new file mode 100644
index 0000000..59d0002
--- /dev/null
+++ b/jemalloc/test/allocm.c
@@ -0,0 +1,133 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#define	JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+#define CHUNK 0x400000
+/* #define MAXALIGN ((size_t)0x80000000000LLU) */
+#define MAXALIGN ((size_t)0x2000000LLU)
+#define NITER 4
+
+int
+main(void)
+{
+	int r;
+	void *p;
+	size_t sz, alignment, total, tsz;
+	unsigned i;
+	void *ps[NITER];
+
+	fprintf(stderr, "Test begin\n");
+
+	sz = 0;
+	r = JEMALLOC_P(allocm)(&p, &sz, 42, 0);
+	if (r != ALLOCM_SUCCESS) {
+		fprintf(stderr, "Unexpected allocm() error\n");
+		abort();
+	}
+	if (sz < 42)
+		fprintf(stderr, "Real size smaller than expected\n");
+	if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected dallocm() error\n");
+
+	r = JEMALLOC_P(allocm)(&p, NULL, 42, 0);
+	if (r != ALLOCM_SUCCESS) {
+		fprintf(stderr, "Unexpected allocm() error\n");
+		abort();
+	}
+	if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected dallocm() error\n");
+
+	r = JEMALLOC_P(allocm)(&p, NULL, 42, ALLOCM_ZERO);
+	if (r != ALLOCM_SUCCESS) {
+		fprintf(stderr, "Unexpected allocm() error\n");
+		abort();
+	}
+	if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected dallocm() error\n");
+
+#if LG_SIZEOF_PTR == 3
+	alignment = 0x8000000000000000LLU;
+	sz        = 0x8000000000000000LLU;
+#else
+	alignment = 0x80000000LU;
+	sz        = 0x80000000LU;
+#endif
+	r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment));
+	if (r == ALLOCM_SUCCESS) {
+		fprintf(stderr,
+		    "Expected error for allocm(&p, %zu, 0x%x)\n",
+		    sz, ALLOCM_ALIGN(alignment));
+	}
+
+#if LG_SIZEOF_PTR == 3
+	alignment = 0x4000000000000000LLU;
+	sz        = 0x8400000000000001LLU;
+#else
+	alignment = 0x40000000LU;
+	sz        = 0x84000001LU;
+#endif
+	r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment));
+	if (r == ALLOCM_SUCCESS) {
+		fprintf(stderr,
+		    "Expected error for allocm(&p, %zu, 0x%x)\n",
+		    sz, ALLOCM_ALIGN(alignment));
+	}
+
+	alignment = 0x10LLU;
+#if LG_SIZEOF_PTR == 3
+	sz   = 0xfffffffffffffff0LLU;
+#else
+	sz   = 0xfffffff0LU;
+#endif
+	r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment));
+	if (r == ALLOCM_SUCCESS) {
+		fprintf(stderr,
+		    "Expected error for allocm(&p, %zu, 0x%x)\n",
+		    sz, ALLOCM_ALIGN(alignment));
+	}
+
+	for (i = 0; i < NITER; i++)
+		ps[i] = NULL;
+
+	for (alignment = 8;
+	    alignment <= MAXALIGN;
+	    alignment <<= 1) {
+		total = 0;
+		fprintf(stderr, "Alignment: %zu\n", alignment);
+		for (sz = 1;
+		    sz < 3 * alignment && sz < (1U << 31);
+		    sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
+			for (i = 0; i < NITER; i++) {
+				r = JEMALLOC_P(allocm)(&ps[i], NULL, sz,
+				    ALLOCM_ALIGN(alignment) | ALLOCM_ZERO);
+				if (r != ALLOCM_SUCCESS) {
+					fprintf(stderr,
+					    "Error for size %zu (0x%zx): %d\n",
+					    sz, sz, r);
+					exit(1);
+				}
+				if ((uintptr_t)ps[i] & (alignment-1)) {
+					fprintf(stderr,
+					    "%p inadequately aligned for"
+					    " alignment: %zu\n", ps[i],
+					    alignment);
+				}
+				JEMALLOC_P(sallocm)(ps[i], &tsz, 0);
+				total += tsz;
+				if (total >= (MAXALIGN << 1))
+					break;
+			}
+			for (i = 0; i < NITER; i++) {
+				if (ps[i] != NULL) {
+					JEMALLOC_P(dallocm)(ps[i], 0);
+					ps[i] = NULL;
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "Test end\n");
+	return (0);
+}
diff --git a/jemalloc/test/allocm.exp b/jemalloc/test/allocm.exp
new file mode 100644
index 0000000..b5061c7
--- /dev/null
+++ b/jemalloc/test/allocm.exp
@@ -0,0 +1,25 @@
+Test begin
+Alignment: 8
+Alignment: 16
+Alignment: 32
+Alignment: 64
+Alignment: 128
+Alignment: 256
+Alignment: 512
+Alignment: 1024
+Alignment: 2048
+Alignment: 4096
+Alignment: 8192
+Alignment: 16384
+Alignment: 32768
+Alignment: 65536
+Alignment: 131072
+Alignment: 262144
+Alignment: 524288
+Alignment: 1048576
+Alignment: 2097152
+Alignment: 4194304
+Alignment: 8388608
+Alignment: 16777216
+Alignment: 33554432
+Test end
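
allocm.c above focuses on boundary and error cases. For reference, here is a minimal happy-path sketch of the experimental API: allocate at least 100 zeroed bytes on a 64-byte boundary, report the real usable size, and release the object. The sizes and alignment are arbitrary choices for illustration.

    #include <stdio.h>

    #define JEMALLOC_MANGLE
    #include "jemalloc_test.h"

    int
    main(void)
    {
    	void *p;
    	size_t rsz;

    	/* At least 100 zeroed bytes, aligned to a 64-byte boundary. */
    	if (JEMALLOC_P(allocm)(&p, &rsz, 100, ALLOCM_ALIGN(64) | ALLOCM_ZERO)
    	    != ALLOCM_SUCCESS)
    		return (1);
    	fprintf(stderr, "Usable size: %zu\n", rsz);	/* rsz >= 100 */
    	JEMALLOC_P(dallocm)(p, 0);
    	return (0);
    }
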
diff --git a/jemalloc/test/jemalloc_test.h.in b/jemalloc/test/jemalloc_test.h.in
new file mode 100644
index 0000000..0c48895
--- /dev/null
+++ b/jemalloc/test/jemalloc_test.h.in
@@ -0,0 +1,6 @@
+/*
+ * This header should be included by tests, rather than directly including
+ * jemalloc/jemalloc.h, because --with-install-suffix may cause the header to
+ * have a different name.
+ */
+#include "jemalloc/jemalloc@install_suffix@.h"
diff --git a/jemalloc/test/posix_memalign.c b/jemalloc/test/posix_memalign.c
new file mode 100644
index 0000000..3e306c0
--- /dev/null
+++ b/jemalloc/test/posix_memalign.c
@@ -0,0 +1,121 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#define	JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+#define CHUNK 0x400000
+/* #define MAXALIGN ((size_t)0x80000000000LLU) */
+#define MAXALIGN ((size_t)0x2000000LLU)
+#define NITER 4
+
+int
+main(void)
+{
+	size_t alignment, size, total;
+	unsigned i;
+	int err;
+	void *p, *ps[NITER];
+
+	fprintf(stderr, "Test begin\n");
+
+	/* Test error conditions. */
+	for (alignment = 0; alignment < sizeof(void *); alignment++) {
+		err = JEMALLOC_P(posix_memalign)(&p, alignment, 1);
+		if (err != EINVAL) {
+			fprintf(stderr,
+			    "Expected error for invalid alignment %zu\n",
+			    alignment);
+		}
+	}
+
+	for (alignment = sizeof(size_t); alignment < MAXALIGN;
+	    alignment <<= 1) {
+		err = JEMALLOC_P(posix_memalign)(&p, alignment + 1, 1);
+		if (err == 0) {
+			fprintf(stderr,
+			    "Expected error for invalid alignment %zu\n",
+			    alignment + 1);
+		}
+	}
+
+#if LG_SIZEOF_PTR == 3
+	alignment = 0x8000000000000000LLU;
+	size      = 0x8000000000000000LLU;
+#else
+	alignment = 0x80000000LU;
+	size      = 0x80000000LU;
+#endif
+	err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
+	if (err == 0) {
+		fprintf(stderr,
+		    "Expected error for posix_memalign(&p, %zu, %zu)\n",
+		    alignment, size);
+	}
+
+#if LG_SIZEOF_PTR == 3
+	alignment = 0x4000000000000000LLU;
+	size      = 0x8400000000000001LLU;
+#else
+	alignment = 0x40000000LU;
+	size      = 0x84000001LU;
+#endif
+	err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
+	if (err == 0) {
+		fprintf(stderr,
+		    "Expected error for posix_memalign(&p, %zu, %zu)\n",
+		    alignment, size);
+	}
+
+	alignment = 0x10LLU;
+#if LG_SIZEOF_PTR == 3
+	size = 0xfffffffffffffff0LLU;
+#else
+	size = 0xfffffff0LU;
+#endif
+	err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
+	if (err == 0) {
+		fprintf(stderr,
+		    "Expected error for posix_memalign(&p, %zu, %zu)\n",
+		    alignment, size);
+	}
+
+	for (i = 0; i < NITER; i++)
+		ps[i] = NULL;
+
+	for (alignment = 8;
+	    alignment <= MAXALIGN;
+	    alignment <<= 1) {
+		total = 0;
+		fprintf(stderr, "Alignment: %zu\n", alignment);
+		for (size = 1;
+		    size < 3 * alignment && size < (1U << 31);
+		    size += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
+			for (i = 0; i < NITER; i++) {
+				err = JEMALLOC_P(posix_memalign)(&ps[i],
+				    alignment, size);
+				if (err) {
+					fprintf(stderr,
+					    "Error for size %zu (0x%zx): %s\n",
+					    size, size, strerror(err));
+					exit(1);
+				}
+				total += JEMALLOC_P(malloc_usable_size)(ps[i]);
+				if (total >= (MAXALIGN << 1))
+					break;
+			}
+			for (i = 0; i < NITER; i++) {
+				if (ps[i] != NULL) {
+					JEMALLOC_P(free)(ps[i]);
+					ps[i] = NULL;
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "Test end\n");
+	return (0);
+}
diff --git a/jemalloc/test/posix_memalign.exp b/jemalloc/test/posix_memalign.exp
new file mode 100644
index 0000000..b5061c7
--- /dev/null
+++ b/jemalloc/test/posix_memalign.exp
@@ -0,0 +1,25 @@
+Test begin
+Alignment: 8
+Alignment: 16
+Alignment: 32
+Alignment: 64
+Alignment: 128
+Alignment: 256
+Alignment: 512
+Alignment: 1024
+Alignment: 2048
+Alignment: 4096
+Alignment: 8192
+Alignment: 16384
+Alignment: 32768
+Alignment: 65536
+Alignment: 131072
+Alignment: 262144
+Alignment: 524288
+Alignment: 1048576
+Alignment: 2097152
+Alignment: 4194304
+Alignment: 8388608
+Alignment: 16777216
+Alignment: 33554432
+Test end
diff --git a/jemalloc/test/rallocm.c b/jemalloc/test/rallocm.c
new file mode 100644
index 0000000..a8cadeb
--- /dev/null
+++ b/jemalloc/test/rallocm.c
@@ -0,0 +1,117 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define	JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+int
+main(void)
+{
+	void *p, *q;
+	size_t sz, tsz;
+	int r;
+
+	fprintf(stderr, "Test begin\n");
+
+	r = JEMALLOC_P(allocm)(&p, &sz, 42, 0);
+	if (r != ALLOCM_SUCCESS) {
+		fprintf(stderr, "Unexpected allocm() error\n");
+		abort();
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz != sz) {
+		fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 5, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz != sz) {
+		fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_ERR_NOT_MOVED)
+		fprintf(stderr, "Unexpected rallocm() result\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz != sz) {
+		fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, 0);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q == p)
+		fprintf(stderr, "Expected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	p = q;
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, 0);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q == p)
+		fprintf(stderr, "Expected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	p = q;
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, 0);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	p = q;
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	sz = tsz;
+
+	JEMALLOC_P(dallocm)(p, 0);
+
+	fprintf(stderr, "Test end\n");
+	return (0);
+}
diff --git a/jemalloc/test/rallocm.exp b/jemalloc/test/rallocm.exp
new file mode 100644
index 0000000..369a88d
--- /dev/null
+++ b/jemalloc/test/rallocm.exp
@@ -0,0 +1,2 @@
+Test begin
+Test end
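
rallocm.c above steps through the ALLOCM_NO_MOVE behavior one call at a time. A compact sketch of the usual pattern built on it follows: attempt an in-place resize first, and fall back to a moving reallocation only when rallocm() reports ALLOCM_ERR_NOT_MOVED. The helper name is illustrative.

    #include <stddef.h>

    #define JEMALLOC_MANGLE
    #include "jemalloc_test.h"

    /* Try to grow *pp to newsize in place; move it only if that fails. */
    int
    grow(void **pp, size_t *szp, size_t newsize)
    {
    	int r = JEMALLOC_P(rallocm)(pp, szp, newsize, 0, ALLOCM_NO_MOVE);

    	if (r == ALLOCM_ERR_NOT_MOVED) {
    		/* In-place resize was not possible; let the object move. */
    		r = JEMALLOC_P(rallocm)(pp, szp, newsize, 0, 0);
    	}
    	return (r);
    }
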
diff --git a/jemalloc/test/thread_arena.c b/jemalloc/test/thread_arena.c
new file mode 100644
index 0000000..bd884e1
--- /dev/null
+++ b/jemalloc/test/thread_arena.c
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#define	JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+void *
+thread_start(void *arg)
+{
+	unsigned main_arena_ind = *(unsigned *)arg;
+	void *p;
+	unsigned arena_ind;
+	size_t size;
+	int err;
+
+	p = JEMALLOC_P(malloc)(1);
+	if (p == NULL) {
+		fprintf(stderr, "%s(): Error in malloc()\n", __func__);
+		return (void *)1;
+	}
+
+	size = sizeof(arena_ind);
+	if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size,
+	    &main_arena_ind, sizeof(main_arena_ind)))) {
+		fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
+		    strerror(err));
+		return (void *)1;
+	}
+
+	return (NULL);
+}
+
+int
+main(void)
+{
+	int ret = 0;
+	void *p;
+	unsigned arena_ind;
+	size_t size;
+	int err;
+	pthread_t thread;
+
+	fprintf(stderr, "Test begin\n");
+
+	p = JEMALLOC_P(malloc)(1);
+	if (p == NULL) {
+		fprintf(stderr, "%s(): Error in malloc()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+
+	size = sizeof(arena_ind);
+	if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, NULL,
+	    0))) {
+		fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
+		    strerror(err));
+		ret = 1;
+		goto RETURN;
+	}
+
+	if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind)
+	    != 0) {
+		fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+	pthread_join(thread, (void *)&ret);
+
+	if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind)
+	    != 0) {
+		fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+	pthread_join(thread, (void *)&ret);
+
+RETURN:
+	fprintf(stderr, "Test end\n");
+	return (ret);
+}
diff --git a/jemalloc/test/thread_arena.exp b/jemalloc/test/thread_arena.exp
new file mode 100644
index 0000000..369a88d
--- /dev/null
+++ b/jemalloc/test/thread_arena.exp
@@ -0,0 +1,2 @@
+Test begin
+Test end
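
thread_arena.c above shows that writing an arena index back through the "thread.arena" mallctl moves the calling thread onto that arena. Below is a small helper sketch of the same call for pinning worker threads to a chosen arena; the function name is hypothetical and error handling is left to the caller.

    #include <stddef.h>

    #define JEMALLOC_MANGLE
    #include "jemalloc_test.h"

    /* Bind the calling thread to the arena with the given index. */
    int
    bind_to_arena(unsigned target)
    {
    	unsigned old;
    	size_t sz = sizeof(old);

    	/* Writes the new index and reads back the previous one. */
    	return (JEMALLOC_P(mallctl)("thread.arena", &old, &sz, &target,
    	    sizeof(target)));
    }
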