diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000..b0138d4
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,125 @@
+version: 2
+
+references:
+  # Shared step: enable i386 packages and install the PowerPC/ARM/AArch64 cross
+  # toolchains. Add "- *install-dependencies" to the start of a job's steps to
+  # run it.
+  install-dependencies: &install-dependencies
+    run:
+      name: Install dependencies
+      # TODO: We can split these dependencies up by job to reduce installation
+      # time.
+      command: |
+        sudo dpkg --add-architecture i386
+        sudo apt-get -y -qq update
+        sudo apt-get -y install \
+            gcc-multilib-powerpc-linux-gnu gcc-arm-linux-gnueabi \
+            libc6-dev-armel-cross gcc-aarch64-linux-gnu libc6-dev-arm64-cross \
+            libc6-dev-ppc64-powerpc-cross
+
+jobs:
+  # First half of the short tests. Chain with ';' (never '&&') so bash -e fails the step on any error.
+  short-tests-0:
+    # TODO: Create a small custom docker image with all the dependencies we need
+    #       preinstalled to reduce installation time.
+    docker:
+      - image: circleci/buildpack-deps:bionic
+    steps:
+      - checkout
+      - *install-dependencies
+      - run:
+          name: Test
+          command: |
+            cc -v; CFLAGS="-O0 -Werror" make all; make clean
+            make c99build         ; make clean
+            make c11build         ; make clean
+            make aarch64build     ; make clean
+            make -j regressiontest; make clean
+            make shortest         ; make clean
+            make cxxtest          ; make clean
+  # Second half of the short tests; runs in parallel with short-tests-0.
+  short-tests-1:
+    docker:
+      - image: circleci/buildpack-deps:bionic
+    steps:
+      - checkout
+      - *install-dependencies
+      - run:
+          name: Test
+          command: |
+            make gnu90build; make clean
+            make gnu99build; make clean
+            make ppc64build; make clean
+            make ppcbuild  ; make clean
+            make armbuild  ; make clean
+            make -C tests test-legacy test-longmatch test-symbols; make clean
+            make -C lib libzstd-nomt; make clean
+  # This job is only run on release tags.
+  # It builds the source tarball and its sha256sum and stores both as CircleCI
+  # artifacts. NOTE(review): no step below reads GITHUB_TOKEN or uploads to the
+  # tagged GitHub release, despite the job name -- confirm whether that is intended.
+  publish-github-release:
+    docker:
+      - image: cibuilds/github:0.12.0
+    environment:
+      CIRCLE_ARTIFACTS: /tmp/circleci-artifacts
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            apk add -q gzip coreutils
+      - run:
+          name: Publish
+          command: |
+            export VERSION=$(echo $CIRCLE_TAG | tail -c +2)
+            export ZSTD_VERSION=zstd-$VERSION
+            git archive $CIRCLE_TAG --prefix $ZSTD_VERSION/ --format tar \
+                        -o $ZSTD_VERSION.tar
+            gzip -9 $ZSTD_VERSION.tar
+            sha256sum $ZSTD_VERSION.tar.gz > $ZSTD_VERSION.tar.gz.sha256sum
+            mkdir -p $CIRCLE_ARTIFACTS
+            cp $ZSTD_VERSION.tar.gz $ZSTD_VERSION.tar.gz.sha256sum $CIRCLE_ARTIFACTS
+      - store_artifacts:
+          path: /tmp/circleci-artifacts
+
+workflows:
+  version: 2
+  commit:
+    jobs:
+      # Run the two test jobs in parallel; the /.*/ tag filter matches every tag.
+      - short-tests-0:
+          filters:
+            tags:
+              only: /.*/
+      - short-tests-1:
+          filters:
+            tags:
+              only: /.*/
+      # Only run on release tags (vX.Y.Z), never for branches, after both test jobs.
+      - publish-github-release:
+          requires:
+            - short-tests-0
+            - short-tests-1
+          filters:
+            branches:
+              ignore: /.*/
+            tags:
+              only: /^v\d+\.\d+\.\d+$/
+
+  # Longer tests
+    #- make -C tests test-zstd-nolegacy && make clean
+    #- pyenv global 3.4.4; make -C tests versionsTest && make clean
+    #- make zlibwrapper         && make clean
+    #- gcc -v; make -C tests test32 MOREFLAGS="-I/usr/include/x86_64-linux-gnu" && make clean
+    #- make uasan               && make clean
+    #- make asan32              && make clean
+    #- make -C tests test32 CC=clang MOREFLAGS="-g -fsanitize=address -I/usr/include/x86_64-linux-gnu"
+  # Valgrind tests
+    #- CFLAGS="-O1 -g" make -C zlibWrapper valgrindTest && make clean
+    #- make -C tests valgrindTest && make clean
+  # ARM, AArch64, PowerPC, PowerPC64 tests
+    #- make ppctest             && make clean
+    #- make ppc64test           && make clean
+    #- make armtest             && make clean
+    #- make aarch64test         && make clean
diff --git a/.travis.yml b/.travis.yml
index 0967f60..284e8a4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,28 +10,38 @@
 matrix:
   include:
     # Ubuntu 14.04
-    - env: Cmd='make gcc6install && CC=gcc-6 make -j all && make clean && CC=gcc-6 make clean uasan-test-zstd'
-    - env: Cmd='make gcc6install libc6install && CC=gcc-6 make clean uasan-test-zstd32'
-    - env: Cmd='make gcc7install && CC=gcc-7 make clean uasan-test-zstd'
+    - env: Cmd='make test'
+
+    - env: Cmd='make gcc6install && CC=gcc-6 CFLAGS=-Werror make -j all
+             && make clean && CC=gcc-6 make -j uasan-test-zstd </dev/null'   # test when stdin is not a tty
+    - env: Cmd='make gcc6install libc6install
+             && make clean && CC=gcc-6 make -j uasan-test-zstd32'
+    - env: Cmd='make gcc7install && make clean && CC=gcc-7 make -j uasan-test-zstd'
+    - env: Cmd='make gcc8install && CC=gcc-8 CFLAGS="-Werror -O3" make -j all'
     - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-test-zstd'
 
+    - env: Cmd='make staticAnalyze'
+
     - env: Cmd='make gcc6install && CC=gcc-6 make clean uasan-fuzztest'
-    - env: Cmd='make gcc6install libc6install && CC=gcc-6 CFLAGS=-m32 make clean uasan-fuzztest'
+    - env: Cmd='make gcc6install libc6install
+             && make clean && CC=gcc-6 CFLAGS=-m32 make uasan-fuzztest'
     - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-fuzztest'
     - env: Cmd='make clang38install && CC=clang-3.8 make clean tsan-test-zstream'
 
-    - env: Cmd='make -C tests test-fuzzer-stackmode'
+    - env: Cmd='make -j uasanregressiontest
+             && make clean && make -j msanregressiontest'
 
-    - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest'
+    - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest
+             && make clean && make -C tests test-fuzzer-stackmode'
 
     - env: Cmd='make arminstall && make armfuzz'
     - env: Cmd='make arminstall && make aarch64fuzz'
-
     - env: Cmd='make ppcinstall && make ppcfuzz'
     - env: Cmd='make ppcinstall && make ppc64fuzz'
-    - env: Cmd='make -j uasanregressiontest && make clean && make -j msanregressiontest'
 
-    - env: Cmd='make lz4install && make -C tests test-lz4 test-pool && make clean && bash tests/libzstd_partial_builds.sh'
+    - env: Cmd='make lz4install && make -C tests test-lz4
+             && make clean && make -C tests test-pool
+             && make clean && bash tests/libzstd_partial_builds.sh'
 
     # tag-specific test
     - if: tag =~ ^v[0-9]\.[0-9]
@@ -52,6 +62,7 @@
   only:
   - dev
   - master
+  - travisTest
 
 script:
   - JOB_NUMBER=$(echo $TRAVIS_JOB_NUMBER | sed -e 's:[0-9][0-9]*\.\(.*\):\1:')
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..0f7ad8b
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,5 @@
+# Code of Conduct
+
+Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
+Please read the [full text](https://code.fb.com/codeofconduct/)
+so that you can understand what actions will and will not be tolerated.
diff --git a/Makefile b/Makefile
index 59af7a0..9ee0b0b 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@
 EXT =
 endif
 
+## default: Build lib-release and zstd-release
 .PHONY: default
 default: lib-release zstd-release
 
@@ -30,10 +31,9 @@
 all: allmost examples manual contrib
 
 .PHONY: allmost
-allmost: allzstd
-	$(MAKE) -C $(ZWRAPDIR) all
+allmost: allzstd zlibwrapper
 
-#skip zwrapper, can't build that on alternate architectures without the proper zlib installed
+# skip zwrapper, can't build that on alternate architectures without the proper zlib installed
 .PHONY: allzstd
 allzstd: lib
 	$(MAKE) -C $(PRGDIR) all
@@ -44,8 +44,8 @@
 	$(MAKE) -C $(PRGDIR) zstd32
 	$(MAKE) -C $(TESTDIR) all32
 
-.PHONY: lib lib-release
-lib lib-release:
+.PHONY: lib lib-release libzstd.a
+lib lib-release :
 	@$(MAKE) -C $(ZSTDDIR) $@
 
 .PHONY: zstd zstd-release
@@ -59,12 +59,13 @@
 	cp $(PRGDIR)/zstd$(EXT) ./zstdmt$(EXT)
 
 .PHONY: zlibwrapper
-zlibwrapper:
-	$(MAKE) -C $(ZWRAPDIR) test
+zlibwrapper: lib
+	$(MAKE) -C $(ZWRAPDIR) all
 
 .PHONY: test
+test: MOREFLAGS += -g -DDEBUGLEVEL=1 -Werror
 test:
-	$(MAKE) -C $(PRGDIR) allVariants MOREFLAGS+="-g -DDEBUGLEVEL=1"
+	MOREFLAGS="$(MOREFLAGS)" $(MAKE) -j -C $(PRGDIR) allVariants
 	$(MAKE) -C $(TESTDIR) $@
 
 .PHONY: shortest
@@ -87,6 +88,7 @@
 	$(MAKE) -C contrib/pzstd all
 	$(MAKE) -C contrib/seekable_format/examples all
 	$(MAKE) -C contrib/adaptive-compression all
+	$(MAKE) -C contrib/largeNbDicts all
 
 .PHONY: cleanTabs
 cleanTabs:
@@ -103,6 +105,7 @@
 	@$(MAKE) -C contrib/pzstd $@ > $(VOID)
 	@$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
 	@$(MAKE) -C contrib/adaptive-compression $@ > $(VOID)
+	@$(MAKE) -C contrib/largeNbDicts $@ > $(VOID)
 	@$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
 	@$(RM) -r lz4
 	@echo Cleaning completed
@@ -110,14 +113,31 @@
 #------------------------------------------------------------------------------
 # make install is validated only for Linux, macOS, Hurd and some BSD targets
 #------------------------------------------------------------------------------
-ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT))
+ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT Haiku))
 
 HOST_OS = POSIX
-CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON
+CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON -DCMAKE_BUILD_TYPE=Release
 
+EGREP = egrep --color=never
+
+# Print a two column output of targets and their description. To add a target description, put a
+# comment in the Makefile with the format "## <TARGET>: <DESCRIPTION>", as done for "list" below.
+# The recipe sticks to POSIX sh constructs because make runs recipes with /bin/sh by default.
+## list: Print all targets and their descriptions (if provided)
 .PHONY: list
 list:
-	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+	@TARGETS=$$($(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null \
+		| awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' \
+		| $(EGREP) -v  -e '^[^[:alnum:]]' | sort); \
+	{ \
+	    printf "Target Name\tDescription\n"; \
+	    printf "%s\t%s\n" "----------------" "----------------------------------------"; \
+	    for target in $$TARGETS; do \
+	        line=$$($(EGREP) "^##[[:space:]]+$$target:" $(lastword $(MAKEFILE_LIST))); \
+	        description=$$(echo $$line | awk '{i=index($$0,":"); print substr($$0,i+1)}' | xargs); \
+	        printf "%s\t%s\n" "$$target" "$$description"; \
+	    done \
+	} | column -t -s "$$(printf '\t')"
 
 .PHONY: install clangtest armtest usan asan uasan
 install:
@@ -197,7 +217,7 @@
 
 clangtest: clean
 	clang -v
-	$(MAKE) all CXX=clang-++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation"
+	$(MAKE) all CXX=clang++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation"
 
 armtest: clean
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
@@ -286,6 +306,9 @@
 gcc7install: apt-add-repo
 	APT_PACKAGES="libc6-dev-i386 gcc-multilib gcc-7 gcc-7-multilib" $(MAKE) apt-install
 
+gcc8install: apt-add-repo
+	APT_PACKAGES="libc6-dev-i386 gcc-multilib gcc-8 gcc-8-multilib" $(MAKE) apt-install
+
 gpp6install: apt-add-repo
 	APT_PACKAGES="libc6-dev-i386 g++-multilib gcc-6 g++-6 g++-6-multilib" $(MAKE) apt-install
 
@@ -317,23 +340,23 @@
 
 c90build: clean
 	$(CC) -v
-	CFLAGS="-std=c90" $(MAKE) allmost  # will fail, due to missing support for `long long`
+	CFLAGS="-std=c90 -Werror" $(MAKE) allmost  # will fail, due to missing support for `long long`
 
 gnu90build: clean
 	$(CC) -v
-	CFLAGS="-std=gnu90" $(MAKE) allmost
+	CFLAGS="-std=gnu90 -Werror" $(MAKE) allmost
 
 c99build: clean
 	$(CC) -v
-	CFLAGS="-std=c99" $(MAKE) allmost
+	CFLAGS="-std=c99 -Werror" $(MAKE) allmost
 
 gnu99build: clean
 	$(CC) -v
-	CFLAGS="-std=gnu99" $(MAKE) allmost
+	CFLAGS="-std=gnu99 -Werror" $(MAKE) allmost
 
 c11build: clean
 	$(CC) -v
-	CFLAGS="-std=c11" $(MAKE) allmost
+	CFLAGS="-std=c11 -Werror" $(MAKE) allmost
 
 bmix64build: clean
 	$(CC) -v
@@ -347,7 +370,10 @@
 	$(CC) -v
 	CFLAGS="-O3 -mbmi -m32 -Werror" $(MAKE) -C $(TESTDIR) test
 
-staticAnalyze: clean
+# static analyzer test uses clang's scan-build
+# does not analyze zlibWrapper, due to detected issues in zlib source code
+staticAnalyze: SCANBUILD ?= scan-build
+staticAnalyze:
 	$(CC) -v
-	CPPFLAGS=-g scan-build --status-bugs -v $(MAKE) all
+	CC=$(CC) CPPFLAGS=-g $(SCANBUILD) --status-bugs -v $(MAKE) allzstd examples contrib
 endif
diff --git a/NEWS b/NEWS
index e3bfb24..40805c1 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,17 @@
+v1.3.6
+perf: much faster dictionary builder, by @jenniferliu
+perf: faster dictionary compression on small data when using multiple contexts, by @felixhandte
+perf: faster dictionary decompression when using a very large number of dictionaries simultaneously
+cli : fix : no longer overwrites destination when source does not exist (#1082)
+cli : new command --adapt, for automatic compression level adaptation
+api : fix : block api can be streamed with > 4 GB, reported by @catid
+api : reduced ZSTD_DDict size by 2 KB
+api : minimum negative compression level is defined, and can be queried using ZSTD_minCLevel().
+build: support Haiku target, by @korli
+build: Read Legacy format is limited to v0.5+ by default. Can be changed at compile time with macro ZSTD_LEGACY_SUPPORT.
+doc : zstd_compression_format.md updated to match wording in IETF RFC 8478
+misc: tests/paramgrill, a parameter optimizer, by @GeorgeLu97
+
 v1.3.5
 perf: much faster dictionary compression, by @felixhandte
 perf: small quality improvement for dictionary generation, by @terrelln
diff --git a/README.md b/README.md
index 17edecb..dc99dc0 100644
--- a/README.md
+++ b/README.md
@@ -121,6 +121,8 @@
 It can generate Makefiles or other build scripts
 to create `zstd` binary, and `libzstd` dynamic and static libraries.
 
+By default, `CMAKE_BUILD_TYPE` is set to `Release`.
+
 #### Meson
 
 A Meson project is provided within `contrib/meson`.
diff --git a/appveyor.yml b/appveyor.yml
index 742f612..2b674ce 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -181,15 +181,15 @@
     - COMPILER: "gcc"
       HOST:     "mingw"
       PLATFORM: "x64"
-      SCRIPT:   "make allzstd"
+      SCRIPT:   "CPPFLAGS=-DDEBUGLEVEL=2 CFLAGS=-Werror make -j allzstd DEBUGLEVEL=2"
     - COMPILER: "gcc"
       HOST:     "mingw"
       PLATFORM: "x86"
-      SCRIPT:   "make allzstd"
+      SCRIPT:   "CFLAGS=-Werror make -j allzstd"
     - COMPILER: "clang"
       HOST:     "mingw"
       PLATFORM: "x64"
-      SCRIPT:   "MOREFLAGS='--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion' make allzstd"
+      SCRIPT:   "CFLAGS='--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion' make -j allzstd"
 
     - COMPILER: "visual"
       HOST:     "visual"
diff --git a/build/.gitignore b/build/.gitignore
index b00e709..1ceb70e 100644
--- a/build/.gitignore
+++ b/build/.gitignore
@@ -18,3 +18,14 @@
 
 # CMake
 cmake/build/
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Testing
+Makefile
+cmake_install.cmake
+install_manifest.txt
+compile_commands.json
+CTestTestfile.cmake
+build
+lib
diff --git a/build/VS2008/fullbench/fullbench.vcproj b/build/VS2008/fullbench/fullbench.vcproj
index 2ce7f74..a31883a 100644
--- a/build/VS2008/fullbench/fullbench.vcproj
+++ b/build/VS2008/fullbench/fullbench.vcproj
@@ -389,6 +389,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\programs\bench.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\tests\fullbench.c"
 				>
 			</File>
diff --git a/build/VS2008/fuzzer/fuzzer.vcproj b/build/VS2008/fuzzer/fuzzer.vcproj
index f6ea1f8..4d444ca 100644
--- a/build/VS2008/fuzzer/fuzzer.vcproj
+++ b/build/VS2008/fuzzer/fuzzer.vcproj
@@ -337,6 +337,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
 			</File>
@@ -483,6 +487,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
 			</File>
diff --git a/build/VS2008/zstd/zstd.vcproj b/build/VS2008/zstd/zstd.vcproj
index 5d9f683..595733d 100644
--- a/build/VS2008/zstd/zstd.vcproj
+++ b/build/VS2008/zstd/zstd.vcproj
@@ -45,7 +45,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\dictBuilder;$(SolutionDir)..\..\lib\compress"
-				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
@@ -122,7 +122,7 @@
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\dictBuilder;$(SolutionDir)..\..\lib\compress"
-				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="true"
 				UsePrecompiledHeader="0"
@@ -197,7 +197,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\dictBuilder;$(SolutionDir)..\..\lib\compress"
-				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
@@ -275,7 +275,7 @@
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\dictBuilder;$(SolutionDir)..\..\lib\compress"
-				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="true"
 				UsePrecompiledHeader="0"
@@ -349,6 +349,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
 			</File>
@@ -523,6 +527,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
 			</File>
diff --git a/build/VS2008/zstdlib/zstdlib.vcproj b/build/VS2008/zstdlib/zstdlib.vcproj
index 7234b02..5f89ebf 100644
--- a/build/VS2008/zstdlib/zstdlib.vcproj
+++ b/build/VS2008/zstdlib/zstdlib.vcproj
@@ -45,7 +45,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\programs\legacy;$(SolutionDir)..\..\lib\dictBuilder"
-				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
@@ -121,7 +121,7 @@
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\programs\legacy;$(SolutionDir)..\..\lib\dictBuilder"
-				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="true"
 				UsePrecompiledHeader="0"
@@ -195,7 +195,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\programs\legacy;$(SolutionDir)..\..\lib\dictBuilder"
-				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
@@ -272,7 +272,7 @@
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				AdditionalIncludeDirectories="$(SolutionDir)..\..\lib;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\programs\legacy;$(SolutionDir)..\..\lib\dictBuilder"
-				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE"
+				PreprocessorDefinitions="ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="true"
 				UsePrecompiledHeader="0"
@@ -333,6 +333,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
 			</File>
@@ -503,6 +507,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
 			</File>
diff --git a/build/VS2010/fullbench-dll/fullbench-dll.vcxproj b/build/VS2010/fullbench-dll/fullbench-dll.vcxproj
index e697318..6939d44 100644
--- a/build/VS2010/fullbench-dll/fullbench-dll.vcxproj
+++ b/build/VS2010/fullbench-dll/fullbench-dll.vcxproj
@@ -167,11 +167,13 @@
   <ItemGroup>
     <ClCompile Include="..\..\..\lib\common\xxhash.c" />
     <ClCompile Include="..\..\..\programs\datagen.c" />
+    <ClCompile Include="..\..\..\programs\bench.c" />
     <ClCompile Include="..\..\..\tests\fullbench.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\lib\zstd.h" />
     <ClInclude Include="..\..\..\programs\datagen.h" />
+    <ClInclude Include="..\..\..\programs\bench.h" />
     <ClInclude Include="..\..\..\programs\util.h" />
   </ItemGroup>
   <ItemGroup>
diff --git a/build/VS2010/fullbench/fullbench.vcxproj b/build/VS2010/fullbench/fullbench.vcxproj
index 19faf92..d2276c3 100644
--- a/build/VS2010/fullbench/fullbench.vcxproj
+++ b/build/VS2010/fullbench/fullbench.vcxproj
@@ -176,6 +176,7 @@
     <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
     <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
     <ClCompile Include="..\..\..\programs\datagen.c" />
+    <ClCompile Include="..\..\..\programs\bench.c" />
     <ClCompile Include="..\..\..\tests\fullbench.c" />
   </ItemGroup>
   <ItemGroup>
@@ -197,6 +198,7 @@
     <ClInclude Include="..\..\..\lib\legacy\zstd_legacy.h" />
     <ClInclude Include="..\..\..\programs\datagen.h" />
     <ClInclude Include="..\..\..\programs\util.h" />
+    <ClInclude Include="..\..\..\programs\bench.h" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/build/VS2010/fuzzer/fuzzer.vcxproj b/build/VS2010/fuzzer/fuzzer.vcxproj
index f0d1ab0..6077cd2 100644
--- a/build/VS2010/fuzzer/fuzzer.vcxproj
+++ b/build/VS2010/fuzzer/fuzzer.vcxproj
@@ -176,6 +176,7 @@
     <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
     <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\programs\datagen.c" />
@@ -199,6 +200,7 @@
     <ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
+    <ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
     <ClInclude Include="..\..\..\lib\legacy\zstd_legacy.h" />
     <ClInclude Include="..\..\..\programs\datagen.h" />
     <ClInclude Include="..\..\..\programs\util.h" />
diff --git a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
index 92d518d..e7e906e 100644
--- a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
+++ b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
@@ -43,6 +43,7 @@
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
@@ -161,7 +162,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -181,7 +182,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>true</TreatWarningAsError>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -201,7 +202,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <EnablePREfast>false</EnablePREfast>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
@@ -223,7 +224,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>false</TreatWarningAsError>
       <EnablePREfast>false</EnablePREfast>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
diff --git a/build/VS2010/libzstd/libzstd.vcxproj b/build/VS2010/libzstd/libzstd.vcxproj
index c306fce..7c2af2b 100644
--- a/build/VS2010/libzstd/libzstd.vcxproj
+++ b/build/VS2010/libzstd/libzstd.vcxproj
@@ -43,6 +43,7 @@
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
@@ -158,7 +159,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -178,7 +179,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>true</TreatWarningAsError>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -198,7 +199,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <EnablePREfast>false</EnablePREfast>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
@@ -220,7 +221,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_DLL_EXPORT=1;ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>false</TreatWarningAsError>
       <EnablePREfast>false</EnablePREfast>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
diff --git a/build/VS2010/zstd/zstd.vcxproj b/build/VS2010/zstd/zstd.vcxproj
index 4af2813..aea18b2 100644
--- a/build/VS2010/zstd/zstd.vcxproj
+++ b/build/VS2010/zstd/zstd.vcxproj
@@ -40,6 +40,7 @@
     <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
     <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
@@ -61,6 +62,7 @@
     <ClInclude Include="..\..\..\lib\common\xxhash.h" />
     <ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
+    <ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
     <ClInclude Include="..\..\..\lib\common\fse.h" />
     <ClInclude Include="..\..\..\lib\common\huf.h" />
@@ -167,7 +169,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>true</TreatWarningAsError>
       <EnablePREfast>false</EnablePREfast>
     </ClCompile>
@@ -183,7 +185,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>true</TreatWarningAsError>
       <EnablePREfast>false</EnablePREfast>
     </ClCompile>
@@ -201,7 +203,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <EnablePREfast>false</EnablePREfast>
       <TreatWarningAsError>false</TreatWarningAsError>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
@@ -222,7 +224,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=4;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ZSTD_MULTITHREAD=1;ZSTD_LEGACY_SUPPORT=5;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <TreatWarningAsError>false</TreatWarningAsError>
       <EnablePREfast>false</EnablePREfast>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
diff --git a/build/VS_scripts/build.generic.cmd b/build/VS_scripts/build.generic.cmd
index bae42fd..a7ca4d0 100644
--- a/build/VS_scripts/build.generic.cmd
+++ b/build/VS_scripts/build.generic.cmd
@@ -34,7 +34,7 @@
 SET msbuild_vs2017enterprise="%programfiles(x86)%\Microsoft Visual Studio\2017\Enterprise\MSBuild\15.0\Bin\MSBuild.exe"
 IF %msbuild_version% == VS2013 SET msbuild="%programfiles(x86)%\MSBuild\12.0\Bin\MSBuild.exe"
 IF %msbuild_version% == VS2015 SET msbuild="%programfiles(x86)%\MSBuild\14.0\Bin\MSBuild.exe"
-IF %msbuild_version% == VS2017Community SET msbuild="%msbuild_vs2017community%
+IF %msbuild_version% == VS2017Community SET msbuild=%msbuild_vs2017community%
 IF %msbuild_version% == VS2017Professional SET msbuild=%msbuild_vs2017professional%
 IF %msbuild_version% == VS2017Enterprise SET msbuild=%msbuild_vs2017enterprise%
 IF %msbuild_version% == VS2017 (
diff --git a/build/cmake/.gitignore b/build/cmake/.gitignore
index ad4283f..2e51e89 100644
--- a/build/cmake/.gitignore
+++ b/build/cmake/.gitignore
@@ -1,3 +1,6 @@
+# cmake working directory
+cmakeBuild
+
 # cmake artefacts
 CMakeCache.txt
 CMakeFiles
diff --git a/build/cmake/CMakeLists.txt b/build/cmake/CMakeLists.txt
index fd9bc2b..1e2921d 100644
--- a/build/cmake/CMakeLists.txt
+++ b/build/cmake/CMakeLists.txt
@@ -10,6 +10,10 @@
 PROJECT(zstd)
 CMAKE_MINIMUM_REQUIRED(VERSION 2.8.9)
 SET(ZSTD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
+
+# Ensure Release build even if not invoked via Makefile
+SET(CMAKE_BUILD_TYPE "Release")
+
 LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 INCLUDE(GNUInstallDirs)
 
diff --git a/build/cmake/contrib/gen_html/CMakeLists.txt b/build/cmake/contrib/gen_html/CMakeLists.txt
index 958e9d1..c4062d4 100644
--- a/build/cmake/contrib/gen_html/CMakeLists.txt
+++ b/build/cmake/contrib/gen_html/CMakeLists.txt
@@ -27,4 +27,4 @@
                   ${GENHTML_BINARY} "${LIBVERSION}" "${LIBRARY_DIR}/zstd.h" "${PROJECT_BINARY_DIR}/zstd_manual.html"
                   DEPENDS gen_html COMMENT "Update zstd manual")
 
-INSTALL(FILES "${PROJECT_BINARY_DIR}/zstd_manual.html" DESTINATION "${CMAKE_INSTALL_PREFIX}/${DOC_INSTALL_DIR}")
+INSTALL(FILES "${PROJECT_BINARY_DIR}/zstd_manual.html" DESTINATION "${CMAKE_INSTALL_DOCDIR}")
diff --git a/build/cmake/lib/CMakeLists.txt b/build/cmake/lib/CMakeLists.txt
index c4c2f81..ffc196d 100644
--- a/build/cmake/lib/CMakeLists.txt
+++ b/build/cmake/lib/CMakeLists.txt
@@ -14,7 +14,7 @@
 OPTION(ZSTD_BUILD_SHARED "BUILD SHARED LIBRARIES" ON)
 
 IF(NOT ZSTD_BUILD_SHARED AND NOT ZSTD_BUILD_STATIC)
-    MESSAGE(SEND_ERROR "You need to build at least one flavor of libstd")
+    MESSAGE(SEND_ERROR "You need to build at least one flavor of libzstd")
 ENDIF()
 
 # Define library directory, where sources and header files are located
@@ -47,6 +47,7 @@
         ${LIBRARY_DIR}/decompress/huf_decompress.c
         ${LIBRARY_DIR}/decompress/zstd_decompress.c
         ${LIBRARY_DIR}/dictBuilder/cover.c
+        ${LIBRARY_DIR}/dictBuilder/fastcover.c
         ${LIBRARY_DIR}/dictBuilder/divsufsort.c
         ${LIBRARY_DIR}/dictBuilder/zdict.c
         ${LIBRARY_DIR}/deprecated/zbuff_common.c
@@ -74,6 +75,7 @@
         ${LIBRARY_DIR}/compress/zstd_ldm.h
         ${LIBRARY_DIR}/compress/zstdmt_compress.h
         ${LIBRARY_DIR}/dictBuilder/zdict.h
+        ${LIBRARY_DIR}/dictBuilder/cover.h
         ${LIBRARY_DIR}/deprecated/zbuff.h)
 
 IF (ZSTD_LEGACY_SUPPORT)
@@ -178,6 +180,7 @@
     ${LIBRARY_DIR}/zstd.h
     ${LIBRARY_DIR}/deprecated/zbuff.h
     ${LIBRARY_DIR}/dictBuilder/zdict.h
+    ${LIBRARY_DIR}/dictBuilder/cover.h
     ${LIBRARY_DIR}/common/zstd_errors.h
     DESTINATION "include")
 
diff --git a/build/cmake/tests/CMakeLists.txt b/build/cmake/tests/CMakeLists.txt
index 11c0db1..9f4c64c 100644
--- a/build/cmake/tests/CMakeLists.txt
+++ b/build/cmake/tests/CMakeLists.txt
@@ -40,7 +40,10 @@
 SET(TESTS_DIR ${ZSTD_SOURCE_DIR}/tests)
 INCLUDE_DIRECTORIES(${TESTS_DIR} ${PROGRAMS_DIR} ${LIBRARY_DIR} ${LIBRARY_DIR}/common ${LIBRARY_DIR}/compress ${LIBRARY_DIR}/dictBuilder)
 
-ADD_EXECUTABLE(fullbench ${PROGRAMS_DIR}/datagen.c ${TESTS_DIR}/fullbench.c)
+ADD_EXECUTABLE(datagen ${PROGRAMS_DIR}/datagen.c ${TESTS_DIR}/datagencli.c)
+TARGET_LINK_LIBRARIES(datagen libzstd_static)
+
+ADD_EXECUTABLE(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/bench.c ${TESTS_DIR}/fullbench.c)
 TARGET_LINK_LIBRARIES(fullbench libzstd_static)
 
 ADD_EXECUTABLE(fuzzer ${PROGRAMS_DIR}/datagen.c ${TESTS_DIR}/fuzzer.c)
@@ -49,7 +52,4 @@
 IF (UNIX)
     ADD_EXECUTABLE(paramgrill ${PROGRAMS_DIR}/bench.c ${PROGRAMS_DIR}/datagen.c ${TESTS_DIR}/paramgrill.c)
     TARGET_LINK_LIBRARIES(paramgrill libzstd_static m) #m is math library
-
-    ADD_EXECUTABLE(datagen ${PROGRAMS_DIR}/datagen.c ${TESTS_DIR}/datagencli.c)
-    TARGET_LINK_LIBRARIES(datagen libzstd_static)
 ENDIF (UNIX)
diff --git a/circle.yml b/circle.yml
deleted file mode 100644
index ed50d59..0000000
--- a/circle.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-dependencies:
-  override:
-    - sudo dpkg --add-architecture i386
-    - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; sudo apt-get -y -qq update
-    - sudo apt-get -y install gcc-powerpc-linux-gnu gcc-arm-linux-gnueabi libc6-dev-armel-cross gcc-aarch64-linux-gnu libc6-dev-arm64-cross
-
-test:
-  override:
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then cc -v; CFLAGS="-O0 -Werror" make all && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make gnu90build   && make clean; fi
-      :
-        parallel: true
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then make c99build     && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make gnu99build   && make clean; fi
-      :
-        parallel: true
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then make c11build     && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make ppc64build   && make clean; fi
-      :
-        parallel: true
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then make aarch64build && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make ppcbuild     && make clean; fi
-      :
-        parallel: true
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then make -j regressiontest && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make armbuild     && make clean; fi
-      :
-        parallel: true
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then make shortest     && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make -C tests test-legacy test-longmatch test-symbols && make clean; fi
-      :
-        parallel: true
-    - ? |
-        if [[ "$CIRCLE_NODE_INDEX" == "0" ]]                                    ; then make cxxtest      && make clean; fi &&
-        if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make -C lib libzstd-nomt && make clean; fi
-      :
-        parallel: true
-
-  post:
-    - echo Circle CI tests finished
-
-  # Longer tests
-    #- make -C tests test-zstd-nolegacy && make clean
-    #- pyenv global 3.4.4; make -C tests versionsTest && make clean
-    #- make zlibwrapper         && make clean
-    #- gcc -v; make -C tests test32 MOREFLAGS="-I/usr/include/x86_64-linux-gnu" && make clean
-    #- make uasan               && make clean
-    #- make asan32              && make clean
-    #- make -C tests test32 CC=clang MOREFLAGS="-g -fsanitize=address -I/usr/include/x86_64-linux-gnu"
-  # Valgrind tests
-    #- CFLAGS="-O1 -g" make -C zlibWrapper valgrindTest && make clean
-    #- make -C tests valgrindTest && make clean
-  # ARM, AArch64, PowerPC, PowerPC64 tests
-    #- make ppctest             && make clean
-    #- make ppc64test           && make clean
-    #- make armtest             && make clean
-    #- make aarch64test         && make clean
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
new file mode 100644
index 0000000..72ce04f
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
@@ -0,0 +1,44 @@
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3
+INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+
+RANDOM_FILE := ../randomDictBuilder/random.c
+IO_FILE := ../randomDictBuilder/io.c
+
+all: run clean
+
+.PHONY: run
+run: benchmark
+	echo "Benchmarking with $(ARG)"
+	./benchmark $(ARG)
+
+.PHONY: test
+test: benchmarkTest clean
+
+.PHONY: benchmarkTest
+benchmarkTest: benchmark test.sh
+	sh test.sh
+
+benchmark: benchmark.o io.o random.o libzstd.a
+	$(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark
+
+benchmark.o: benchmark.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c
+
+random.o: $(RANDOM_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE)
+
+io.o: $(IO_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
+
+libzstd.a:
+	$(MAKE) -C ../../../lib libzstd.a
+	mv ../../../lib/libzstd.a .
+
+.PHONY: clean
+clean:
+	rm -f *.o benchmark libzstd.a
+	$(MAKE) -C ../../../lib clean
+	echo "Cleaning is completed"
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
new file mode 100644
index 0000000..6a6c7f1
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -0,0 +1,849 @@
+Benchmarking Dictionary Builder
+
+### Permitted Argument:
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
+
+### Running Test:
+make test
+
+### Usage:
+Benchmark given input files: make ARG= followed by permitted arguments
+
+### Examples:
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
+
+### Benchmarking Result:
+- The first COVER row is the optimizing run of cover; the second COVER row reuses the optimized d and k found by the first.
+- For every f value of fastCover, the first row is the optimizing run of fastCover and the second row reuses the optimized d and k found by the first. This is repeated for accel values from 1 to 10.
+- Fourth column is chosen d and fifth column is chosen k
+
+github:
+NODICT       0.000004       2.999642        
+RANDOM       0.024560       8.791189        
+LEGACY       0.727109       8.173529        
+COVER       40.565676       10.652243        8          1298
+COVER       3.608284       10.652243        8          1298
+FAST f=15 a=1       4.181024       10.570882        8          1154
+FAST f=15 a=1       0.040788       10.570882        8          1154
+FAST f=15 a=2       3.548352       10.574287        6          1970
+FAST f=15 a=2       0.035535       10.574287        6          1970
+FAST f=15 a=3       3.287364       10.613950        6          1010
+FAST f=15 a=3       0.032182       10.613950        6          1010
+FAST f=15 a=4       3.184976       10.573883        6          1058
+FAST f=15 a=4       0.029878       10.573883        6          1058
+FAST f=15 a=5       3.045513       10.580640        8          1154
+FAST f=15 a=5       0.022162       10.580640        8          1154
+FAST f=15 a=6       3.003296       10.583677        6          1010
+FAST f=15 a=6       0.028091       10.583677        6          1010
+FAST f=15 a=7       2.952655       10.622551        6          1106
+FAST f=15 a=7       0.02724       10.622551        6          1106
+FAST f=15 a=8       2.945674       10.614657        6          1010
+FAST f=15 a=8       0.027264       10.614657        6          1010
+FAST f=15 a=9       3.153439       10.564018        8          1154
+FAST f=15 a=9       0.020635       10.564018        8          1154
+FAST f=15 a=10       2.950416       10.511454        6          1010
+FAST f=15 a=10       0.026606       10.511454        6          1010
+FAST f=16 a=1       3.970029       10.681035        8          1154
+FAST f=16 a=1       0.038188       10.681035        8          1154
+FAST f=16 a=2       3.422892       10.484978        6          1874
+FAST f=16 a=2       0.034702       10.484978        6          1874
+FAST f=16 a=3       3.215836       10.632631        8          1154
+FAST f=16 a=3       0.026084       10.632631        8          1154
+FAST f=16 a=4       3.081353       10.626533        6          1106
+FAST f=16 a=4       0.030032       10.626533        6          1106
+FAST f=16 a=5       3.041241       10.545027        8          1922
+FAST f=16 a=5       0.022882       10.545027        8          1922
+FAST f=16 a=6       2.989390       10.638284        6          1874
+FAST f=16 a=6       0.028308       10.638284        6          1874
+FAST f=16 a=7       3.001581       10.797136        6          1106
+FAST f=16 a=7       0.027479       10.797136        6          1106
+FAST f=16 a=8       2.984107       10.658356        8          1058
+FAST f=16 a=8       0.021099       10.658356        8          1058
+FAST f=16 a=9       2.925788       10.523869        6          1010
+FAST f=16 a=9       0.026905       10.523869        6          1010
+FAST f=16 a=10       2.889605       10.745841        6          1874
+FAST f=16 a=10       0.026846       10.745841        6          1874
+FAST f=17 a=1       4.031953       10.672080        8          1202
+FAST f=17 a=1       0.040658       10.672080        8          1202
+FAST f=17 a=2       3.458107       10.589352        8          1106
+FAST f=17 a=2       0.02926       10.589352        8          1106
+FAST f=17 a=3       3.291189       10.662714        8          1154
+FAST f=17 a=3       0.026531       10.662714        8          1154
+FAST f=17 a=4       3.154950       10.549456        8          1346
+FAST f=17 a=4       0.024991       10.549456        8          1346
+FAST f=17 a=5       3.092271       10.541670        6          1202
+FAST f=17 a=5       0.038285       10.541670        6          1202
+FAST f=17 a=6       3.166146       10.729112        6          1874
+FAST f=17 a=6       0.038217       10.729112        6          1874
+FAST f=17 a=7       3.035467       10.810485        6          1106
+FAST f=17 a=7       0.036655       10.810485        6          1106
+FAST f=17 a=8       3.035668       10.530532        6          1058
+FAST f=17 a=8       0.037715       10.530532        6          1058
+FAST f=17 a=9       2.987917       10.589802        8          1922
+FAST f=17 a=9       0.02217       10.589802        8          1922
+FAST f=17 a=10       2.981647       10.722579        8          1106
+FAST f=17 a=10       0.021948       10.722579        8          1106
+FAST f=18 a=1       4.067144       10.634943        8          1154
+FAST f=18 a=1       0.041386       10.634943        8          1154
+FAST f=18 a=2       3.507377       10.546230        6          1970
+FAST f=18 a=2       0.037572       10.546230        6          1970
+FAST f=18 a=3       3.323015       10.648061        8          1154
+FAST f=18 a=3       0.028306       10.648061        8          1154
+FAST f=18 a=4       3.216735       10.705402        6          1010
+FAST f=18 a=4       0.030755       10.705402        6          1010
+FAST f=18 a=5       3.175794       10.588154        8          1874
+FAST f=18 a=5       0.025315       10.588154        8          1874
+FAST f=18 a=6       3.127459       10.751104        8          1106
+FAST f=18 a=6       0.023897       10.751104        8          1106
+FAST f=18 a=7       3.083017       10.780402        6          1106
+FAST f=18 a=7       0.029158       10.780402        6          1106
+FAST f=18 a=8       3.069700       10.547226        8          1346
+FAST f=18 a=8       0.024046       10.547226        8          1346
+FAST f=18 a=9       3.056591       10.674759        6          1010
+FAST f=18 a=9       0.028496       10.674759        6          1010
+FAST f=18 a=10       3.063588       10.737578        8          1106
+FAST f=18 a=10       0.023033       10.737578        8          1106
+FAST f=19 a=1       4.164041       10.650333        8          1154
+FAST f=19 a=1       0.042906       10.650333        8          1154
+FAST f=19 a=2       3.585409       10.577066        6          1058
+FAST f=19 a=2       0.038994       10.577066        6          1058
+FAST f=19 a=3       3.439643       10.639403        8          1154
+FAST f=19 a=3       0.028427       10.639403        8          1154
+FAST f=19 a=4       3.268869       10.554410        8          1298
+FAST f=19 a=4       0.026866       10.554410        8          1298
+FAST f=19 a=5       3.238225       10.615109        6          1010
+FAST f=19 a=5       0.03078       10.615109        6          1010
+FAST f=19 a=6       3.199558       10.609782        6          1874
+FAST f=19 a=6       0.030099       10.609782        6          1874
+FAST f=19 a=7       3.132395       10.794753        6          1106
+FAST f=19 a=7       0.028964       10.794753        6          1106
+FAST f=19 a=8       3.148446       10.554842        8          1298
+FAST f=19 a=8       0.024277       10.554842        8          1298
+FAST f=19 a=9       3.108324       10.668763        6          1010
+FAST f=19 a=9       0.02896       10.668763        6          1010
+FAST f=19 a=10       3.159863       10.757347        8          1106
+FAST f=19 a=10       0.023351       10.757347        8          1106
+FAST f=20 a=1       4.462698       10.661788        8          1154
+FAST f=20 a=1       0.047174       10.661788        8          1154
+FAST f=20 a=2       3.820269       10.678612        6          1106
+FAST f=20 a=2       0.040807       10.678612        6          1106
+FAST f=20 a=3       3.644955       10.648424        8          1154
+FAST f=20 a=3       0.031398       10.648424        8          1154
+FAST f=20 a=4       3.546257       10.559756        8          1298
+FAST f=20 a=4       0.029856       10.559756        8          1298
+FAST f=20 a=5       3.485248       10.646637        6          1010
+FAST f=20 a=5       0.033756       10.646637        6          1010
+FAST f=20 a=6       3.490438       10.775824        8          1106
+FAST f=20 a=6       0.028338       10.775824        8          1106
+FAST f=20 a=7       3.631289       10.801795        6          1106
+FAST f=20 a=7       0.035228       10.801795        6          1106
+FAST f=20 a=8       3.758936       10.545116        8          1346
+FAST f=20 a=8       0.027495       10.545116        8          1346
+FAST f=20 a=9       3.707024       10.677454        6          1010
+FAST f=20 a=9       0.031326       10.677454        6          1010
+FAST f=20 a=10       3.586593       10.756017        8          1106
+FAST f=20 a=10       0.027122       10.756017        8          1106
+FAST f=21 a=1       5.701396       10.655398        8          1154
+FAST f=21 a=1       0.067744       10.655398        8          1154
+FAST f=21 a=2       5.270542       10.650743        6          1106
+FAST f=21 a=2       0.052999       10.650743        6          1106
+FAST f=21 a=3       4.945294       10.652380        8          1154
+FAST f=21 a=3       0.052678       10.652380        8          1154
+FAST f=21 a=4       4.894079       10.543185        8          1298
+FAST f=21 a=4       0.04997       10.543185        8          1298
+FAST f=21 a=5       4.785417       10.630321        6          1010
+FAST f=21 a=5       0.045294       10.630321        6          1010
+FAST f=21 a=6       4.789381       10.664477        6          1874
+FAST f=21 a=6       0.046578       10.664477        6          1874
+FAST f=21 a=7       4.302955       10.805179        6          1106
+FAST f=21 a=7       0.041205       10.805179        6          1106
+FAST f=21 a=8       4.034630       10.551211        8          1298
+FAST f=21 a=8       0.040121       10.551211        8          1298
+FAST f=21 a=9       4.523868       10.799114        6          1010
+FAST f=21 a=9       0.043592       10.799114        6          1010
+FAST f=21 a=10       4.760736       10.750255        8          1106
+FAST f=21 a=10       0.043483       10.750255        8          1106
+FAST f=22 a=1       6.743064       10.640537        8          1154
+FAST f=22 a=1       0.086967       10.640537        8          1154
+FAST f=22 a=2       6.121739       10.626638        6          1970
+FAST f=22 a=2       0.066337       10.626638        6          1970
+FAST f=22 a=3       5.248851       10.640688        8          1154
+FAST f=22 a=3       0.054935       10.640688        8          1154
+FAST f=22 a=4       5.436579       10.588333        8          1298
+FAST f=22 a=4       0.064113       10.588333        8          1298
+FAST f=22 a=5       5.812815       10.652653        6          1010
+FAST f=22 a=5       0.058189       10.652653        6          1010
+FAST f=22 a=6       5.745472       10.666437        6          1874
+FAST f=22 a=6       0.057188       10.666437        6          1874
+FAST f=22 a=7       5.716393       10.806911        6          1106
+FAST f=22 a=7       0.056       10.806911        6          1106
+FAST f=22 a=8       5.698799       10.530784        8          1298
+FAST f=22 a=8       0.0583       10.530784        8          1298
+FAST f=22 a=9       5.710533       10.777391        6          1010
+FAST f=22 a=9       0.054945       10.777391        6          1010
+FAST f=22 a=10       5.685395       10.745023        8          1106
+FAST f=22 a=10       0.056526       10.745023        8          1106
+FAST f=23 a=1       7.836923       10.638828        8          1154
+FAST f=23 a=1       0.099522       10.638828        8          1154
+FAST f=23 a=2       6.627834       10.631061        6          1970
+FAST f=23 a=2       0.066769       10.631061        6          1970
+FAST f=23 a=3       5.602533       10.647288        8          1154
+FAST f=23 a=3       0.064513       10.647288        8          1154
+FAST f=23 a=4       6.005580       10.568747        8          1298
+FAST f=23 a=4       0.062022       10.568747        8          1298
+FAST f=23 a=5       5.481816       10.676921        6          1010
+FAST f=23 a=5       0.058959       10.676921        6          1010
+FAST f=23 a=6       5.460444       10.666194        6          1874
+FAST f=23 a=6       0.057687       10.666194        6          1874
+FAST f=23 a=7       5.659822       10.800377        6          1106
+FAST f=23 a=7       0.06783       10.800377        6          1106
+FAST f=23 a=8       6.826940       10.522167        8          1298
+FAST f=23 a=8       0.070533       10.522167        8          1298
+FAST f=23 a=9       6.804757       10.577799        8          1682
+FAST f=23 a=9       0.069949       10.577799        8          1682
+FAST f=23 a=10       6.774933       10.742093        8          1106
+FAST f=23 a=10       0.068395       10.742093        8          1106
+FAST f=24 a=1       8.444110       10.632783        8          1154
+FAST f=24 a=1       0.094357       10.632783        8          1154
+FAST f=24 a=2       7.289578       10.631061        6          1970
+FAST f=24 a=2       0.098515       10.631061        6          1970
+FAST f=24 a=3       8.619780       10.646289        8          1154
+FAST f=24 a=3       0.098041       10.646289        8          1154
+FAST f=24 a=4       8.508455       10.555199        8          1298
+FAST f=24 a=4       0.093885       10.555199        8          1298
+FAST f=24 a=5       8.471145       10.674363        6          1010
+FAST f=24 a=5       0.088676       10.674363        6          1010
+FAST f=24 a=6       8.426727       10.667228        6          1874
+FAST f=24 a=6       0.087247       10.667228        6          1874
+FAST f=24 a=7       8.356826       10.803027        6          1106
+FAST f=24 a=7       0.085835       10.803027        6          1106
+FAST f=24 a=8       6.756811       10.522049        8          1298
+FAST f=24 a=8       0.07107       10.522049        8          1298
+FAST f=24 a=9       6.548169       10.571882        8          1682
+FAST f=24 a=9       0.0713       10.571882        8          1682
+FAST f=24 a=10       8.238079       10.736453        8          1106
+FAST f=24 a=10       0.07004       10.736453        8          1106
+
+
+hg-commands:
+NODICT       0.000005       2.425276        
+RANDOM       0.046332       3.490331        
+LEGACY       0.720351       3.911682        
+COVER       45.507731       4.132653        8          386
+COVER       1.868810       4.132653        8          386
+FAST f=15 a=1       4.561427       3.866894        8          1202
+FAST f=15 a=1       0.048946       3.866894        8          1202
+FAST f=15 a=2       3.574462       3.892119        8          1538
+FAST f=15 a=2       0.033677       3.892119        8          1538
+FAST f=15 a=3       3.230227       3.888791        6          1346
+FAST f=15 a=3       0.034312       3.888791        6          1346
+FAST f=15 a=4       3.042388       3.899739        8          1010
+FAST f=15 a=4       0.024307       3.899739        8          1010
+FAST f=15 a=5       2.800148       3.896220        8          818
+FAST f=15 a=5       0.022331       3.896220        8          818
+FAST f=15 a=6       2.706518       3.882039        8          578
+FAST f=15 a=6       0.020955       3.882039        8          578
+FAST f=15 a=7       2.701820       3.885430        6          866
+FAST f=15 a=7       0.026074       3.885430        6          866
+FAST f=15 a=8       2.604445       3.906932        8          1826
+FAST f=15 a=8       0.021789       3.906932        8          1826
+FAST f=15 a=9       2.598568       3.870324        6          1682
+FAST f=15 a=9       0.026004       3.870324        6          1682
+FAST f=15 a=10       2.575920       3.920783        8          1442
+FAST f=15 a=10       0.020228       3.920783        8          1442
+FAST f=16 a=1       4.630623       4.001430        8          770
+FAST f=16 a=1       0.047497       4.001430        8          770
+FAST f=16 a=2       3.674721       3.974431        8          1874
+FAST f=16 a=2       0.035761       3.974431        8          1874
+FAST f=16 a=3       3.338384       3.978703        8          1010
+FAST f=16 a=3       0.029436       3.978703        8          1010
+FAST f=16 a=4       3.004412       3.983035        8          1010
+FAST f=16 a=4       0.025744       3.983035        8          1010
+FAST f=16 a=5       2.881892       3.987710        8          770
+FAST f=16 a=5       0.023211       3.987710        8          770
+FAST f=16 a=6       2.807410       3.952717        8          1298
+FAST f=16 a=6       0.023199       3.952717        8          1298
+FAST f=16 a=7       2.819623       3.994627        8          770
+FAST f=16 a=7       0.021806       3.994627        8          770
+FAST f=16 a=8       2.740092       3.954032        8          1826
+FAST f=16 a=8       0.0226       3.954032        8          1826
+FAST f=16 a=9       2.682564       3.969879        6          1442
+FAST f=16 a=9       0.026324       3.969879        6          1442
+FAST f=16 a=10       2.657959       3.969755        8          674
+FAST f=16 a=10       0.020413       3.969755        8          674
+FAST f=17 a=1       4.729228       4.046000        8          530
+FAST f=17 a=1       0.049703       4.046000        8          530
+FAST f=17 a=2       3.764510       3.991519        8          1970
+FAST f=17 a=2       0.038195       3.991519        8          1970
+FAST f=17 a=3       3.416992       4.006296        6          914
+FAST f=17 a=3       0.036244       4.006296        6          914
+FAST f=17 a=4       3.145626       3.979182        8          1970
+FAST f=17 a=4       0.028676       3.979182        8          1970
+FAST f=17 a=5       2.995070       4.050070        8          770
+FAST f=17 a=5       0.025707       4.050070        8          770
+FAST f=17 a=6       2.911833       4.040024        8          770
+FAST f=17 a=6       0.02453       4.040024        8          770
+FAST f=17 a=7       2.894796       4.015884        8          818
+FAST f=17 a=7       0.023956       4.015884        8          818
+FAST f=17 a=8       2.789962       4.039303        8          530
+FAST f=17 a=8       0.023219       4.039303        8          530
+FAST f=17 a=9       2.787625       3.996762        8          1634
+FAST f=17 a=9       0.023651       3.996762        8          1634
+FAST f=17 a=10       2.754796       4.005059        8          1058
+FAST f=17 a=10       0.022537       4.005059        8          1058
+FAST f=18 a=1       4.779117       4.038214        8          242
+FAST f=18 a=1       0.048814       4.038214        8          242
+FAST f=18 a=2       3.829753       4.045768        8          722
+FAST f=18 a=2       0.036541       4.045768        8          722
+FAST f=18 a=3       3.495053       4.021497        8          770
+FAST f=18 a=3       0.032648       4.021497        8          770
+FAST f=18 a=4       3.221395       4.039623        8          770
+FAST f=18 a=4       0.027818       4.039623        8          770
+FAST f=18 a=5       3.059369       4.050414        8          530
+FAST f=18 a=5       0.026296       4.050414        8          530
+FAST f=18 a=6       3.019292       4.010714        6          962
+FAST f=18 a=6       0.031104       4.010714        6          962
+FAST f=18 a=7       2.949322       4.031439        6          770
+FAST f=18 a=7       0.030745       4.031439        6          770
+FAST f=18 a=8       2.876425       4.032088        6          386
+FAST f=18 a=8       0.027407       4.032088        6          386
+FAST f=18 a=9       2.850958       4.053372        8          674
+FAST f=18 a=9       0.023799       4.053372        8          674
+FAST f=18 a=10       2.884352       4.020148        8          1730
+FAST f=18 a=10       0.024401       4.020148        8          1730
+FAST f=19 a=1       4.815669       4.061203        8          674
+FAST f=19 a=1       0.051425       4.061203        8          674
+FAST f=19 a=2       3.951356       4.013822        8          1442
+FAST f=19 a=2       0.039968       4.013822        8          1442
+FAST f=19 a=3       3.554682       4.050425        8          722
+FAST f=19 a=3       0.032725       4.050425        8          722
+FAST f=19 a=4       3.242585       4.054677        8          722
+FAST f=19 a=4       0.028194       4.054677        8          722
+FAST f=19 a=5       3.105909       4.064524        8          818
+FAST f=19 a=5       0.02675       4.064524        8          818
+FAST f=19 a=6       3.059901       4.036857        8          1250
+FAST f=19 a=6       0.026396       4.036857        8          1250
+FAST f=19 a=7       3.016151       4.068234        6          770
+FAST f=19 a=7       0.031501       4.068234        6          770
+FAST f=19 a=8       2.962902       4.077509        8          530
+FAST f=19 a=8       0.023333       4.077509        8          530
+FAST f=19 a=9       2.899607       4.067328        8          530
+FAST f=19 a=9       0.024553       4.067328        8          530
+FAST f=19 a=10       2.950978       4.059901        8          434
+FAST f=19 a=10       0.023852       4.059901        8          434
+FAST f=20 a=1       5.259834       4.027579        8          1634
+FAST f=20 a=1       0.061123       4.027579        8          1634
+FAST f=20 a=2       4.382150       4.025093        8          1634
+FAST f=20 a=2       0.048009       4.025093        8          1634
+FAST f=20 a=3       4.104323       4.060842        8          530
+FAST f=20 a=3       0.040965       4.060842        8          530
+FAST f=20 a=4       3.853340       4.023504        6          914
+FAST f=20 a=4       0.041072       4.023504        6          914
+FAST f=20 a=5       3.728841       4.018089        6          1634
+FAST f=20 a=5       0.037469       4.018089        6          1634
+FAST f=20 a=6       3.683045       4.069138        8          578
+FAST f=20 a=6       0.028011       4.069138        8          578
+FAST f=20 a=7       3.726973       4.063160        8          722
+FAST f=20 a=7       0.028437       4.063160        8          722
+FAST f=20 a=8       3.555073       4.057690        8          386
+FAST f=20 a=8       0.027588       4.057690        8          386
+FAST f=20 a=9       3.551095       4.067253        8          482
+FAST f=20 a=9       0.025976       4.067253        8          482
+FAST f=20 a=10       3.490127       4.068518        8          530
+FAST f=20 a=10       0.025971       4.068518        8          530
+FAST f=21 a=1       7.343816       4.064945        8          770
+FAST f=21 a=1       0.085035       4.064945        8          770
+FAST f=21 a=2       5.930894       4.048206        8          386
+FAST f=21 a=2       0.067349       4.048206        8          386
+FAST f=21 a=3       6.770775       4.063417        8          578
+FAST f=21 a=3       0.077104       4.063417        8          578
+FAST f=21 a=4       6.889409       4.066761        8          626
+FAST f=21 a=4       0.0717       4.066761        8          626
+FAST f=21 a=5       6.714896       4.051813        8          914
+FAST f=21 a=5       0.071026       4.051813        8          914
+FAST f=21 a=6       6.539890       4.047263        8          1922
+FAST f=21 a=6       0.07127       4.047263        8          1922
+FAST f=21 a=7       6.511052       4.068373        8          482
+FAST f=21 a=7       0.065467       4.068373        8          482
+FAST f=21 a=8       6.458788       4.071597        8          482
+FAST f=21 a=8       0.063817       4.071597        8          482
+FAST f=21 a=9       6.377591       4.052905        8          434
+FAST f=21 a=9       0.063112       4.052905        8          434
+FAST f=21 a=10       6.360752       4.047773        8          530
+FAST f=21 a=10       0.063606       4.047773        8          530
+FAST f=22 a=1       10.523471       4.040812        8          962
+FAST f=22 a=1       0.14214       4.040812        8          962
+FAST f=22 a=2       9.454758       4.059396        8          914
+FAST f=22 a=2       0.118343       4.059396        8          914
+FAST f=22 a=3       9.043197       4.043019        8          1922
+FAST f=22 a=3       0.109798       4.043019        8          1922
+FAST f=22 a=4       8.716261       4.044819        8          770
+FAST f=22 a=4       0.099687       4.044819        8          770
+FAST f=22 a=5       8.529472       4.070576        8          530
+FAST f=22 a=5       0.093127       4.070576        8          530
+FAST f=22 a=6       8.424241       4.070565        8          722
+FAST f=22 a=6       0.093703       4.070565        8          722
+FAST f=22 a=7       8.403391       4.070591        8          578
+FAST f=22 a=7       0.089763       4.070591        8          578
+FAST f=22 a=8       8.285221       4.089171        8          530
+FAST f=22 a=8       0.087716       4.089171        8          530
+FAST f=22 a=9       8.282506       4.047470        8          722
+FAST f=22 a=9       0.089773       4.047470        8          722
+FAST f=22 a=10       8.241809       4.064151        8          818
+FAST f=22 a=10       0.090413       4.064151        8          818
+FAST f=23 a=1       12.389208       4.051635        6          530
+FAST f=23 a=1       0.147796       4.051635        6          530
+FAST f=23 a=2       11.300910       4.042835        6          914
+FAST f=23 a=2       0.133178       4.042835        6          914
+FAST f=23 a=3       10.879455       4.047415        8          626
+FAST f=23 a=3       0.129571       4.047415        8          626
+FAST f=23 a=4       10.522718       4.038269        6          914
+FAST f=23 a=4       0.118121       4.038269        6          914
+FAST f=23 a=5       10.348043       4.066884        8          434
+FAST f=23 a=5       0.112098       4.066884        8          434
+FAST f=23 a=6       10.238630       4.048635        8          1010
+FAST f=23 a=6       0.120281       4.048635        8          1010
+FAST f=23 a=7       10.213255       4.061809        8          530
+FAST f=23 a=7       0.1121       4.061809        8          530
+FAST f=23 a=8       10.107879       4.074104        8          818
+FAST f=23 a=8       0.116544       4.074104        8          818
+FAST f=23 a=9       10.063424       4.064811        8          674
+FAST f=23 a=9       0.109045       4.064811        8          674
+FAST f=23 a=10       10.035801       4.054918        8          530
+FAST f=23 a=10       0.108735       4.054918        8          530
+FAST f=24 a=1       14.963878       4.073490        8          722
+FAST f=24 a=1       0.206344       4.073490        8          722
+FAST f=24 a=2       13.833472       4.036100        8          962
+FAST f=24 a=2       0.17486       4.036100        8          962
+FAST f=24 a=3       13.404631       4.026281        6          1106
+FAST f=24 a=3       0.153961       4.026281        6          1106
+FAST f=24 a=4       13.041164       4.065448        8          674
+FAST f=24 a=4       0.155509       4.065448        8          674
+FAST f=24 a=5       12.879412       4.054636        8          674
+FAST f=24 a=5       0.148282       4.054636        8          674
+FAST f=24 a=6       12.773736       4.081376        8          530
+FAST f=24 a=6       0.142563       4.081376        8          530
+FAST f=24 a=7       12.711310       4.059834        8          770
+FAST f=24 a=7       0.149321       4.059834        8          770
+FAST f=24 a=8       12.635459       4.052050        8          1298
+FAST f=24 a=8       0.15095       4.052050        8          1298
+FAST f=24 a=9       12.558104       4.076516        8          722
+FAST f=24 a=9       0.144361       4.076516        8          722
+FAST f=24 a=10       10.661348       4.062137        8          818
+FAST f=24 a=10       0.108232       4.062137        8          818
+
+
+hg-changelog:
+NODICT       0.000017       1.377590        
+RANDOM       0.186171       2.097487        
+LEGACY       1.670867       2.058907        
+COVER       173.561948       2.189685        8          98
+COVER       4.811180       2.189685        8          98
+FAST f=15 a=1       18.685906       2.129682        8          434
+FAST f=15 a=1       0.173376       2.129682        8          434
+FAST f=15 a=2       12.928259       2.131890        8          482
+FAST f=15 a=2       0.102582       2.131890        8          482
+FAST f=15 a=3       11.132343       2.128027        8          386
+FAST f=15 a=3       0.077122       2.128027        8          386
+FAST f=15 a=4       10.120683       2.125797        8          434
+FAST f=15 a=4       0.065175       2.125797        8          434
+FAST f=15 a=5       9.479092       2.127697        8          386
+FAST f=15 a=5       0.057905       2.127697        8          386
+FAST f=15 a=6       9.159523       2.127132        8          1682
+FAST f=15 a=6       0.058604       2.127132        8          1682
+FAST f=15 a=7       8.724003       2.129914        8          434
+FAST f=15 a=7       0.0493       2.129914        8          434
+FAST f=15 a=8       8.595001       2.127137        8          338
+FAST f=15 a=8       0.0474       2.127137        8          338
+FAST f=15 a=9       8.356405       2.125512        8          482
+FAST f=15 a=9       0.046126       2.125512        8          482
+FAST f=15 a=10       8.207111       2.126066        8          338
+FAST f=15 a=10       0.043292       2.126066        8          338
+FAST f=16 a=1       18.464436       2.144040        8          242
+FAST f=16 a=1       0.172156       2.144040        8          242
+FAST f=16 a=2       12.844825       2.148171        8          194
+FAST f=16 a=2       0.099619       2.148171        8          194
+FAST f=16 a=3       11.082568       2.140837        8          290
+FAST f=16 a=3       0.079165       2.140837        8          290
+FAST f=16 a=4       10.066749       2.144405        8          386
+FAST f=16 a=4       0.068411       2.144405        8          386
+FAST f=16 a=5       9.501121       2.140720        8          386
+FAST f=16 a=5       0.061316       2.140720        8          386
+FAST f=16 a=6       9.179332       2.139478        8          386
+FAST f=16 a=6       0.056322       2.139478        8          386
+FAST f=16 a=7       8.849438       2.142412        8          194
+FAST f=16 a=7       0.050493       2.142412        8          194
+FAST f=16 a=8       8.810919       2.143454        8          434
+FAST f=16 a=8       0.051304       2.143454        8          434
+FAST f=16 a=9       8.553900       2.140339        8          194
+FAST f=16 a=9       0.047285       2.140339        8          194
+FAST f=16 a=10       8.398027       2.143130        8          386
+FAST f=16 a=10       0.046386       2.143130        8          386
+FAST f=17 a=1       18.644657       2.157192        8          98
+FAST f=17 a=1       0.173884       2.157192        8          98
+FAST f=17 a=2       13.071242       2.159830        8          146
+FAST f=17 a=2       0.10388       2.159830        8          146
+FAST f=17 a=3       11.332366       2.153654        6          194
+FAST f=17 a=3       0.08983       2.153654        6          194
+FAST f=17 a=4       10.362413       2.156813        8          242
+FAST f=17 a=4       0.070389       2.156813        8          242
+FAST f=17 a=5       9.808159       2.155098        6          338
+FAST f=17 a=5       0.072661       2.155098        6          338
+FAST f=17 a=6       9.451165       2.153845        6          146
+FAST f=17 a=6       0.064959       2.153845        6          146
+FAST f=17 a=7       9.163097       2.155424        6          242
+FAST f=17 a=7       0.064323       2.155424        6          242
+FAST f=17 a=8       9.047276       2.156640        8          242
+FAST f=17 a=8       0.053382       2.156640        8          242
+FAST f=17 a=9       8.807671       2.152396        8          146
+FAST f=17 a=9       0.049617       2.152396        8          146
+FAST f=17 a=10       8.649827       2.152370        8          146
+FAST f=17 a=10       0.047849       2.152370        8          146
+FAST f=18 a=1       18.809502       2.168116        8          98
+FAST f=18 a=1       0.175226       2.168116        8          98
+FAST f=18 a=2       13.756502       2.170870        6          242
+FAST f=18 a=2       0.119507       2.170870        6          242
+FAST f=18 a=3       12.059748       2.163094        6          98
+FAST f=18 a=3       0.093912       2.163094        6          98
+FAST f=18 a=4       11.410294       2.172372        8          98
+FAST f=18 a=4       0.073048       2.172372        8          98
+FAST f=18 a=5       10.560297       2.166388        8          98
+FAST f=18 a=5       0.065136       2.166388        8          98
+FAST f=18 a=6       10.071390       2.162672        8          98
+FAST f=18 a=6       0.059402       2.162672        8          98
+FAST f=18 a=7       10.084214       2.166624        6          194
+FAST f=18 a=7       0.073276       2.166624        6          194
+FAST f=18 a=8       9.953226       2.167454        8          98
+FAST f=18 a=8       0.053659       2.167454        8          98
+FAST f=18 a=9       8.982461       2.161593        6          146
+FAST f=18 a=9       0.05955       2.161593        6          146
+FAST f=18 a=10       8.986092       2.164373        6          242
+FAST f=18 a=10       0.059135       2.164373        6          242
+FAST f=19 a=1       18.908277       2.176021        8          98
+FAST f=19 a=1       0.177316       2.176021        8          98
+FAST f=19 a=2       13.471313       2.176103        8          98
+FAST f=19 a=2       0.106344       2.176103        8          98
+FAST f=19 a=3       11.571406       2.172812        8          98
+FAST f=19 a=3       0.083293       2.172812        8          98
+FAST f=19 a=4       10.632775       2.177770        6          146
+FAST f=19 a=4       0.079864       2.177770        6          146
+FAST f=19 a=5       10.030190       2.175574        6          146
+FAST f=19 a=5       0.07223       2.175574        6          146
+FAST f=19 a=6       9.717818       2.169997        8          98
+FAST f=19 a=6       0.060049       2.169997        8          98
+FAST f=19 a=7       9.397531       2.172770        8          146
+FAST f=19 a=7       0.057188       2.172770        8          146
+FAST f=19 a=8       9.281061       2.175822        8          98
+FAST f=19 a=8       0.053711       2.175822        8          98
+FAST f=19 a=9       9.165242       2.169849        6          146
+FAST f=19 a=9       0.059898       2.169849        6          146
+FAST f=19 a=10       9.048763       2.173394        8          98
+FAST f=19 a=10       0.049757       2.173394        8          98
+FAST f=20 a=1       21.166917       2.183923        6          98
+FAST f=20 a=1       0.205425       2.183923        6          98
+FAST f=20 a=2       15.642753       2.182349        6          98
+FAST f=20 a=2       0.135957       2.182349        6          98
+FAST f=20 a=3       14.053730       2.173544        6          98
+FAST f=20 a=3       0.11266       2.173544        6          98
+FAST f=20 a=4       15.270019       2.183656        8          98
+FAST f=20 a=4       0.107892       2.183656        8          98
+FAST f=20 a=5       15.497927       2.174661        6          98
+FAST f=20 a=5       0.100305       2.174661        6          98
+FAST f=20 a=6       13.973505       2.172391        8          98
+FAST f=20 a=6       0.087565       2.172391        8          98
+FAST f=20 a=7       14.083296       2.172443        8          98
+FAST f=20 a=7       0.078062       2.172443        8          98
+FAST f=20 a=8       12.560048       2.175581        8          98
+FAST f=20 a=8       0.070282       2.175581        8          98
+FAST f=20 a=9       13.078645       2.173975        6          146
+FAST f=20 a=9       0.081041       2.173975        6          146
+FAST f=20 a=10       12.823328       2.177778        8          98
+FAST f=20 a=10       0.074522       2.177778        8          98
+FAST f=21 a=1       29.825370       2.183057        6          98
+FAST f=21 a=1       0.334453       2.183057        6          98
+FAST f=21 a=2       29.476474       2.182752        8          98
+FAST f=21 a=2       0.286602       2.182752        8          98
+FAST f=21 a=3       25.937186       2.175867        8          98
+FAST f=21 a=3       0.17626       2.175867        8          98
+FAST f=21 a=4       20.413865       2.179780        8          98
+FAST f=21 a=4       0.206085       2.179780        8          98
+FAST f=21 a=5       20.541889       2.178328        6          146
+FAST f=21 a=5       0.199157       2.178328        6          146
+FAST f=21 a=6       21.090670       2.174443        6          146
+FAST f=21 a=6       0.190645       2.174443        6          146
+FAST f=21 a=7       20.221569       2.177384        6          146
+FAST f=21 a=7       0.184278       2.177384        6          146
+FAST f=21 a=8       20.322357       2.179456        6          98
+FAST f=21 a=8       0.178458       2.179456        6          98
+FAST f=21 a=9       20.683912       2.174396        6          146
+FAST f=21 a=9       0.190829       2.174396        6          146
+FAST f=21 a=10       20.840865       2.174905        8          98
+FAST f=21 a=10       0.172515       2.174905        8          98
+FAST f=22 a=1       36.822827       2.181612        6          98
+FAST f=22 a=1       0.437389       2.181612        6          98
+FAST f=22 a=2       30.616902       2.183142        8          98
+FAST f=22 a=2       0.324284       2.183142        8          98
+FAST f=22 a=3       28.472482       2.178130        8          98
+FAST f=22 a=3       0.236538       2.178130        8          98
+FAST f=22 a=4       25.847028       2.181878        8          98
+FAST f=22 a=4       0.263744       2.181878        8          98
+FAST f=22 a=5       27.095881       2.180775        8          98
+FAST f=22 a=5       0.24988       2.180775        8          98
+FAST f=22 a=6       25.939172       2.170916        8          98
+FAST f=22 a=6       0.240033       2.170916        8          98
+FAST f=22 a=7       27.064194       2.177849        8          98
+FAST f=22 a=7       0.242383       2.177849        8          98
+FAST f=22 a=8       25.140221       2.178216        8          98
+FAST f=22 a=8       0.237601       2.178216        8          98
+FAST f=22 a=9       25.505283       2.177455        6          146
+FAST f=22 a=9       0.223217       2.177455        6          146
+FAST f=22 a=10       24.529362       2.176705        6          98
+FAST f=22 a=10       0.222876       2.176705        6          98
+FAST f=23 a=1       39.127310       2.183006        6          98
+FAST f=23 a=1       0.417338       2.183006        6          98
+FAST f=23 a=2       32.468161       2.183524        6          98
+FAST f=23 a=2       0.351645       2.183524        6          98
+FAST f=23 a=3       31.577620       2.172604        6          98
+FAST f=23 a=3       0.319659       2.172604        6          98
+FAST f=23 a=4       30.129247       2.183932        6          98
+FAST f=23 a=4       0.307239       2.183932        6          98
+FAST f=23 a=5       29.103376       2.183529        6          146
+FAST f=23 a=5       0.285533       2.183529        6          146
+FAST f=23 a=6       29.776045       2.174367        8          98
+FAST f=23 a=6       0.276846       2.174367        8          98
+FAST f=23 a=7       28.940407       2.178022        6          146
+FAST f=23 a=7       0.274082       2.178022        6          146
+FAST f=23 a=8       29.256009       2.179462        6          98
+FAST f=23 a=8       0.26949       2.179462        6          98
+FAST f=23 a=9       29.347312       2.170407        8          98
+FAST f=23 a=9       0.265034       2.170407        8          98
+FAST f=23 a=10       29.140081       2.171762        8          98
+FAST f=23 a=10       0.259183       2.171762        8          98
+FAST f=24 a=1       44.871179       2.182115        6          98
+FAST f=24 a=1       0.509433       2.182115        6          98
+FAST f=24 a=2       38.694867       2.180549        8          98
+FAST f=24 a=2       0.406695       2.180549        8          98
+FAST f=24 a=3       38.363769       2.172821        8          98
+FAST f=24 a=3       0.359581       2.172821        8          98
+FAST f=24 a=4       36.580797       2.184142        8          98
+FAST f=24 a=4       0.340614       2.184142        8          98
+FAST f=24 a=5       33.125701       2.183301        8          98
+FAST f=24 a=5       0.324874       2.183301        8          98
+FAST f=24 a=6       34.776068       2.173019        6          146
+FAST f=24 a=6       0.340397       2.173019        6          146
+FAST f=24 a=7       34.417625       2.176561        6          146
+FAST f=24 a=7       0.308223       2.176561        6          146
+FAST f=24 a=8       35.470291       2.182161        6          98
+FAST f=24 a=8       0.307724       2.182161        6          98
+FAST f=24 a=9       34.927252       2.172682        6          146
+FAST f=24 a=9       0.300598       2.172682        6          146
+FAST f=24 a=10       33.238355       2.173395        6          98
+FAST f=24 a=10       0.249916       2.173395        6          98
+
+
+hg-manifest:
+NODICT       0.000004       1.866377        
+RANDOM       0.696346       2.309436        
+LEGACY       7.064527       2.506977        
+COVER       876.312865       2.582528        8          434
+COVER       35.684533       2.582528        8          434
+FAST f=15 a=1       76.618201       2.404013        8          1202
+FAST f=15 a=1       0.700722       2.404013        8          1202
+FAST f=15 a=2       49.213058       2.409248        6          1826
+FAST f=15 a=2       0.473393       2.409248        6          1826
+FAST f=15 a=3       41.753197       2.409677        8          1490
+FAST f=15 a=3       0.336848       2.409677        8          1490
+FAST f=15 a=4       38.648295       2.407996        8          1538
+FAST f=15 a=4       0.283952       2.407996        8          1538
+FAST f=15 a=5       36.144936       2.402895        8          1874
+FAST f=15 a=5       0.270128       2.402895        8          1874
+FAST f=15 a=6       35.484675       2.394873        8          1586
+FAST f=15 a=6       0.251637       2.394873        8          1586
+FAST f=15 a=7       34.280599       2.397311        8          1778
+FAST f=15 a=7       0.23984       2.397311        8          1778
+FAST f=15 a=8       32.122572       2.396089        6          1490
+FAST f=15 a=8       0.251508       2.396089        6          1490
+FAST f=15 a=9       29.909842       2.390092        6          1970
+FAST f=15 a=9       0.251233       2.390092        6          1970
+FAST f=15 a=10       30.102938       2.400086        6          1682
+FAST f=15 a=10       0.23688       2.400086        6          1682
+FAST f=16 a=1       67.750401       2.475460        6          1346
+FAST f=16 a=1       0.796035       2.475460        6          1346
+FAST f=16 a=2       52.812027       2.480860        6          1730
+FAST f=16 a=2       0.480384       2.480860        6          1730
+FAST f=16 a=3       44.179259       2.469304        8          1970
+FAST f=16 a=3       0.332657       2.469304        8          1970
+FAST f=16 a=4       37.612728       2.478208        6          1970
+FAST f=16 a=4       0.32498       2.478208        6          1970
+FAST f=16 a=5       35.056222       2.475568        6          1298
+FAST f=16 a=5       0.302824       2.475568        6          1298
+FAST f=16 a=6       34.713012       2.486079        8          1730
+FAST f=16 a=6       0.24755       2.486079        8          1730
+FAST f=16 a=7       33.713687       2.477180        6          1682
+FAST f=16 a=7       0.280358       2.477180        6          1682
+FAST f=16 a=8       31.571412       2.475418        8          1538
+FAST f=16 a=8       0.241241       2.475418        8          1538
+FAST f=16 a=9       31.608069       2.478263        8          1922
+FAST f=16 a=9       0.241764       2.478263        8          1922
+FAST f=16 a=10       31.358002       2.472263        8          1442
+FAST f=16 a=10       0.221661       2.472263        8          1442
+FAST f=17 a=1       66.185775       2.536085        6          1346
+FAST f=17 a=1       0.713549       2.536085        6          1346
+FAST f=17 a=2       50.365000       2.546105        8          1298
+FAST f=17 a=2       0.467846       2.546105        8          1298
+FAST f=17 a=3       42.712843       2.536250        8          1298
+FAST f=17 a=3       0.34047       2.536250        8          1298
+FAST f=17 a=4       39.514227       2.535555        8          1442
+FAST f=17 a=4       0.302989       2.535555        8          1442
+FAST f=17 a=5       35.189292       2.524925        8          1202
+FAST f=17 a=5       0.273451       2.524925        8          1202
+FAST f=17 a=6       35.791683       2.523466        8          1202
+FAST f=17 a=6       0.268261       2.523466        8          1202
+FAST f=17 a=7       37.416136       2.526625        6          1010
+FAST f=17 a=7       0.277558       2.526625        6          1010
+FAST f=17 a=8       37.084707       2.533274        6          1250
+FAST f=17 a=8       0.285104       2.533274        6          1250
+FAST f=17 a=9       34.183814       2.532765        8          1298
+FAST f=17 a=9       0.235133       2.532765        8          1298
+FAST f=17 a=10       31.149235       2.528722        8          1346
+FAST f=17 a=10       0.232679       2.528722        8          1346
+FAST f=18 a=1       72.942176       2.559857        6          386
+FAST f=18 a=1       0.718618       2.559857        6          386
+FAST f=18 a=2       51.690440       2.559572        8          290
+FAST f=18 a=2       0.403978       2.559572        8          290
+FAST f=18 a=3       45.344908       2.561040        8          962
+FAST f=18 a=3       0.357205       2.561040        8          962
+FAST f=18 a=4       39.804522       2.558446        8          1010
+FAST f=18 a=4       0.310526       2.558446        8          1010
+FAST f=18 a=5       38.134888       2.561811        8          626
+FAST f=18 a=5       0.273743       2.561811        8          626
+FAST f=18 a=6       35.091890       2.555518        8          722
+FAST f=18 a=6       0.260135       2.555518        8          722
+FAST f=18 a=7       34.639523       2.562938        8          290
+FAST f=18 a=7       0.234294       2.562938        8          290
+FAST f=18 a=8       36.076431       2.563567        8          1586
+FAST f=18 a=8       0.274075       2.563567        8          1586
+FAST f=18 a=9       36.376433       2.560950        8          722
+FAST f=18 a=9       0.240106       2.560950        8          722
+FAST f=18 a=10       32.624790       2.559340        8          578
+FAST f=18 a=10       0.234704       2.559340        8          578
+FAST f=19 a=1       70.513761       2.572441        8          194
+FAST f=19 a=1       0.726112       2.572441        8          194
+FAST f=19 a=2       59.263032       2.574560        8          482
+FAST f=19 a=2       0.451554       2.574560        8          482
+FAST f=19 a=3       51.509594       2.571546        6          194
+FAST f=19 a=3       0.393014       2.571546        6          194
+FAST f=19 a=4       55.393906       2.573386        8          482
+FAST f=19 a=4       0.38819       2.573386        8          482
+FAST f=19 a=5       43.201736       2.567589        8          674
+FAST f=19 a=5       0.292155       2.567589        8          674
+FAST f=19 a=6       42.911687       2.572666        6          434
+FAST f=19 a=6       0.303988       2.572666        6          434
+FAST f=19 a=7       44.687591       2.573613        6          290
+FAST f=19 a=7       0.308721       2.573613        6          290
+FAST f=19 a=8       37.372868       2.571039        6          194
+FAST f=19 a=8       0.287137       2.571039        6          194
+FAST f=19 a=9       36.074230       2.566473        6          482
+FAST f=19 a=9       0.280721       2.566473        6          482
+FAST f=19 a=10       33.731720       2.570306        8          194
+FAST f=19 a=10       0.224073       2.570306        8          194
+FAST f=20 a=1       79.670634       2.581146        6          290
+FAST f=20 a=1       0.899986       2.581146        6          290
+FAST f=20 a=2       58.827141       2.579782        8          386
+FAST f=20 a=2       0.602288       2.579782        8          386
+FAST f=20 a=3       51.289004       2.579627        8          722
+FAST f=20 a=3       0.446091       2.579627        8          722
+FAST f=20 a=4       47.711068       2.581508        8          722
+FAST f=20 a=4       0.473007       2.581508        8          722
+FAST f=20 a=5       47.402929       2.578062        6          434
+FAST f=20 a=5       0.497131       2.578062        6          434
+FAST f=20 a=6       54.797102       2.577365        8          482
+FAST f=20 a=6       0.515061       2.577365        8          482
+FAST f=20 a=7       51.370877       2.583050        8          386
+FAST f=20 a=7       0.402878       2.583050        8          386
+FAST f=20 a=8       51.437931       2.574875        6          242
+FAST f=20 a=8       0.453094       2.574875        6          242
+FAST f=20 a=9       44.105456       2.576700        6          242
+FAST f=20 a=9       0.456633       2.576700        6          242
+FAST f=20 a=10       44.447580       2.578305        8          338
+FAST f=20 a=10       0.409121       2.578305        8          338
+FAST f=21 a=1       113.031686       2.582449        6          242
+FAST f=21 a=1       1.456971       2.582449        6          242
+FAST f=21 a=2       97.700932       2.582124        8          194
+FAST f=21 a=2       1.072078       2.582124        8          194
+FAST f=21 a=3       96.563648       2.585479        8          434
+FAST f=21 a=3       0.949528       2.585479        8          434
+FAST f=21 a=4       90.597813       2.582366        6          386
+FAST f=21 a=4       0.76944       2.582366        6          386
+FAST f=21 a=5       86.815980       2.579043        8          434
+FAST f=21 a=5       0.858167       2.579043        8          434
+FAST f=21 a=6       91.235820       2.578378        8          530
+FAST f=21 a=6       0.684274       2.578378        8          530
+FAST f=21 a=7       84.392788       2.581243        8          386
+FAST f=21 a=7       0.814386       2.581243        8          386
+FAST f=21 a=8       82.052310       2.582547        8          338
+FAST f=21 a=8       0.822633       2.582547        8          338
+FAST f=21 a=9       74.696074       2.579319        8          194
+FAST f=21 a=9       0.811028       2.579319        8          194
+FAST f=21 a=10       76.211170       2.578766        8          290
+FAST f=21 a=10       0.809715       2.578766        8          290
+FAST f=22 a=1       138.976871       2.580478        8          194
+FAST f=22 a=1       1.748932       2.580478        8          194
+FAST f=22 a=2       120.164097       2.583633        8          386
+FAST f=22 a=2       1.333239       2.583633        8          386
+FAST f=22 a=3       111.986474       2.582566        6          194
+FAST f=22 a=3       1.305734       2.582566        6          194
+FAST f=22 a=4       108.548148       2.583068        6          194
+FAST f=22 a=4       1.314026       2.583068        6          194
+FAST f=22 a=5       103.173017       2.583495        6          290
+FAST f=22 a=5       1.228664       2.583495        6          290
+FAST f=22 a=6       108.421262       2.582349        8          530
+FAST f=22 a=6       1.076773       2.582349        8          530
+FAST f=22 a=7       103.284127       2.581022        8          386
+FAST f=22 a=7       1.112117       2.581022        8          386
+FAST f=22 a=8       96.330279       2.581073        8          290
+FAST f=22 a=8       1.109303       2.581073        8          290
+FAST f=22 a=9       97.651348       2.580075        6          194
+FAST f=22 a=9       0.933032       2.580075        6          194
+FAST f=22 a=10       101.660621       2.584886        8          194
+FAST f=22 a=10       0.796823       2.584886        8          194
+FAST f=23 a=1       159.322978       2.581474        6          242
+FAST f=23 a=1       2.015878       2.581474        6          242
+FAST f=23 a=2       134.331775       2.581619        8          194
+FAST f=23 a=2       1.545845       2.581619        8          194
+FAST f=23 a=3       127.724552       2.579888        6          338
+FAST f=23 a=3       1.444496       2.579888        6          338
+FAST f=23 a=4       126.077675       2.578137        6          242
+FAST f=23 a=4       1.364394       2.578137        6          242
+FAST f=23 a=5       124.914027       2.580843        8          338
+FAST f=23 a=5       1.116059       2.580843        8          338
+FAST f=23 a=6       122.874153       2.577637        6          338
+FAST f=23 a=6       1.164584       2.577637        6          338
+FAST f=23 a=7       123.099257       2.582715        6          386
+FAST f=23 a=7       1.354042       2.582715        6          386
+FAST f=23 a=8       122.026753       2.577681        8          194
+FAST f=23 a=8       1.210966       2.577681        8          194
+FAST f=23 a=9       121.164312       2.584599        6          290
+FAST f=23 a=9       1.174859       2.584599        6          290
+FAST f=23 a=10       117.462222       2.580358        8          194
+FAST f=23 a=10       1.075258       2.580358        8          194
+FAST f=24 a=1       169.539659       2.581642        6          194
+FAST f=24 a=1       1.916804       2.581642        6          194
+FAST f=24 a=2       160.539270       2.580421        6          290
+FAST f=24 a=2       1.71087       2.580421        6          290
+FAST f=24 a=3       155.455874       2.580449        6          242
+FAST f=24 a=3       1.60307       2.580449        6          242
+FAST f=24 a=4       147.630320       2.582953        6          338
+FAST f=24 a=4       1.396364       2.582953        6          338
+FAST f=24 a=5       133.767428       2.580589        6          290
+FAST f=24 a=5       1.19933       2.580589        6          290
+FAST f=24 a=6       146.437535       2.579453        8          194
+FAST f=24 a=6       1.385405       2.579453        8          194
+FAST f=24 a=7       147.227507       2.584155        8          386
+FAST f=24 a=7       1.48942       2.584155        8          386
+FAST f=24 a=8       138.005773       2.584115        8          194
+FAST f=24 a=8       1.352       2.584115        8          194
+FAST f=24 a=9       141.442625       2.582902        8          290
+FAST f=24 a=9       1.39647       2.582902        8          290
+FAST f=24 a=10       142.157446       2.582701        8          434
+FAST f=24 a=10       1.498889       2.582701        8          434
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
new file mode 100644
index 0000000..b193456
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -0,0 +1,442 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include <time.h>
+#include "random.h"
+#include "dictBuilder.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include "io.h"
+#include "util.h"
+#include "zdict.h"
+
+
+
+/*-*************************************
+*  Console display
+***************************************/
+/* DISPLAY() : printf-style output sent to stderr. */
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+/* DISPLAYLEVEL() : DISPLAY() only when `displayLevel` >= l.
+ * `displayLevel` is not defined here : a variable of that name must be in scope
+ * wherever the macro is expanded.
+ * Note : expands to a bare `if` statement, so avoid using it as the body of an
+ * unbraced `if` that is followed by an `else`. */
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+/* Minimum delay between two DISPLAYUPDATE() outputs : SEC_TO_MICRO / 6 */
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+/* Time of the last DISPLAYUPDATE() output */
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+/* DISPLAYUPDATE() : rate-limited DISPLAY().
+ * Prints only when `displayLevel` >= l, and either more than g_refreshRate has elapsed
+ * since g_displayClock or `displayLevel` >= 4. Each output resets g_displayClock;
+ * stderr is flushed when `displayLevel` >= 4. */
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/*-*************************************
+*  Constants
+***************************************/
+/* Dictionary capacity used by main() for every builder.
+ * NOTE(review): `KB` is not defined in this file; presumably a size-suffix macro
+ * provided by one of the included zstd headers - confirm. */
+static const unsigned g_defaultMaxDictSize = 110 KB;
+/* Compression level used when no parameter set supplies one */
+#define DEFAULT_CLEVEL 3
+/* Notification (verbosity) level used when no parameter set supplies one */
+#define DEFAULT_DISPLAYLEVEL 2
+
+
+/*-*************************************
+*  Struct
+***************************************/
+/* dictInfo : a trained dictionary, as produced by createDictFromFiles()
+ * and released by freeDictInfo(). */
+typedef struct {
+  const void* dictBuffer;   /* heap buffer holding the dictionary content; freed by freeDictInfo() */
+  size_t dictSize;          /* number of bytes used in dictBuffer; 0 means "no dictionary" */
+} dictInfo;
+
+
+/*-*************************************
+* Dictionary related operations
+***************************************/
+/** createDictFromFiles() :
+ *  Train a dictionary on the samples described by `info`, using the algorithm selected
+ *  by the first non-NULL parameter set, tested in this order : random, cover, legacy,
+ *  fastCover. When all four are NULL, nothing is trained and the dictionary size is 0.
+ *  For cover and fastCover, the optimizing trainer is used when either k or d is 0.
+ *  @param info         samples to train on (srcBuffer, samplesSizes and nbSamples are read)
+ *  @param maxDictSize  capacity, in bytes, of the dictionary buffer that gets allocated
+ *  @return a heap-allocated dictInfo owning the dictionary buffer (release it with
+ *          freeDictInfo()), or NULL when training fails or the dictInfo cannot be allocated.
+ *  Note : terminates the program (exit code 12) if the dictionary buffer cannot be allocated.
+ */
+dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
+                  ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams,
+                  ZDICT_legacy_params_t *legacyParams, ZDICT_fastCover_params_t *fastParams) {
+    /* name required by DISPLAYLEVEL() */
+    unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel :
+                                  coverParams ? coverParams->zParams.notificationLevel :
+                                  legacyParams ? legacyParams->zParams.notificationLevel :
+                                  fastParams ? fastParams->zParams.notificationLevel :
+                                  DEFAULT_DISPLAYLEVEL;   /* no dict */
+    void* const dictBuffer = malloc(maxDictSize);
+    dictInfo* dInfo = NULL;
+    size_t dictSize = 0;   /* stays 0 when no parameter set is provided */
+
+    /* Checks */
+    if (!dictBuffer) {
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    }
+
+    /* Train with the selected algorithm */
+    if (randomParams) {
+        dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
+                                                info->samplesSizes, info->nbSamples, *randomParams);
+    } else if (coverParams) {
+        /* Run the optimize version if either k or d is not provided.
+         * The optimizer receives the struct by pointer; main() reuses the struct afterwards,
+         * presumably relying on the selected k and d being written back. */
+        if (!coverParams->d || !coverParams->k) {
+            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                           info->samplesSizes, info->nbSamples, coverParams);
+        } else {
+            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                   info->samplesSizes, info->nbSamples, *coverParams);
+        }
+    } else if (legacyParams) {
+        dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
+                                                info->samplesSizes, info->nbSamples, *legacyParams);
+    } else if (fastParams) {
+        /* Run the optimize version if either k or d is not provided */
+        if (!fastParams->d || !fastParams->k) {
+            dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                               info->samplesSizes, info->nbSamples, fastParams);
+        } else {
+            dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                       info->samplesSizes, info->nbSamples, *fastParams);
+        }
+    }
+
+    if (ZDICT_isError(dictSize)) {
+        DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+        free(dictBuffer);
+        return NULL;
+    }
+
+    /* Hand the buffer over to a dictInfo; do not leak it if that allocation fails */
+    dInfo = (dictInfo *)malloc(sizeof(dictInfo));
+    if (!dInfo) {
+        DISPLAYLEVEL(1, "not enough memory to store dictionary info \n");
+        free(dictBuffer);
+        return NULL;
+    }
+    dInfo->dictBuffer = dictBuffer;
+    dInfo->dictSize = dictSize;
+    return dInfo;
+}
+
+
+/** compressWithDict() :
+ *  Compress every sample of `srcInfo` independently, using the dictionary held in `dInfo`
+ *  when its size is non-zero and plain compression at `compressionLevel` otherwise.
+ *  @param srcInfo           samples to compress (srcBuffer, samplesSizes and nbSamples are read)
+ *  @param dInfo             dictionary to use; must not be NULL (dictSize == 0 means "no dictionary")
+ *  @param compressionLevel  level used to build the CDict, or to compress without dictionary
+ *  @param displayLevel      verbosity; the two totals are printed when it is >= 2
+ *  @return compression ratio (total original size / total compressed size),
+ *          or -1 on allocation failure, compression error, or when nothing was compressed
+ */
+double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) {
+  /* Local variables */
+  size_t totalCompressedSize = 0;
+  size_t totalOriginalSize = 0;
+  const unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0;
+  const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer;
+  double cRatio = -1;   /* error value, replaced only once everything succeeded */
+  size_t dstCapacity;
+  int i;
+
+  /* Pointers */
+  ZSTD_CDict *cdict = NULL;
+  ZSTD_CCtx* cctx = NULL;
+  size_t *offsets = NULL;
+  void* dst = NULL;
+
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    for (i = 0; i < srcInfo->nbSamples; i++) {
+      maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+
+  /* Allocate the offsets table and the cctx; validate everything before first use */
+  offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t));
+  cctx = ZSTD_createCCtx();
+  if (!cctx || !dst || !offsets) {
+    goto _cleanup;
+  }
+
+  /* Calculate offset for each sample : offsets[i] is where sample i starts in srcBuffer */
+  offsets[0] = 0;
+  for (i = 1; i <= srcInfo->nbSamples; i++) {
+    offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1];
+  }
+
+  /* Create CDict if there's a dictionary stored on buffer */
+  if (hasDict) {
+    cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
+    if (!cdict) {
+      goto _cleanup;
+    }
+  }
+
+  /* Compress each sample and sum their sizes */
+  for (i = 0; i < srcInfo->nbSamples; i++) {
+    size_t compressedSize;
+    if (hasDict) {
+      compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict);
+    } else {
+      compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel);
+    }
+    if (ZSTD_isError(compressedSize)) {
+      goto _cleanup;
+    }
+    totalCompressedSize += compressedSize;
+  }
+
+  /* Sum original sizes */
+  for (i = 0; i < srcInfo->nbSamples; i++) {
+    totalOriginalSize += srcInfo->samplesSizes[i];
+  }
+
+  /* Calculate compression ratio.
+   * size_t is cast explicitly : it is not `unsigned long` on every platform. */
+  DISPLAYLEVEL(2, "original size is %lu\n", (unsigned long)totalOriginalSize);
+  DISPLAYLEVEL(2, "compressed size is %lu\n", (unsigned long)totalCompressedSize);
+  if (totalCompressedSize == 0) {
+    /* nothing was compressed : a ratio would be a division by zero */
+    goto _cleanup;
+  }
+  cRatio = (double)totalOriginalSize/(double)totalCompressedSize;
+
+_cleanup:
+  free(dst);
+  free(offsets);
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  return cRatio;
+}
+
+
+/** freeDictInfo() :
+ *  Release a dictInfo together with the dictionary buffer it owns.
+ *  Passing NULL is allowed and does nothing.
+ */
+void freeDictInfo(dictInfo* info) {
+  if (info != NULL) {
+    /* free(NULL) is a no-op, so the buffer needs no separate test */
+    free((void*)(info->dictBuffer));
+    free(info);
+  }
+}
+
+
+
+/*-********************************************************
+  *  Benchmarking functions
+**********************************************************/
+/** benchmarkDictBuilder() :
+ *  Time the training of one dictionary, then report the compression ratio obtained on
+ *  the same samples with that dictionary.
+ *  The builder is chosen by the first non-NULL parameter set (random, cover, legacy,
+ *  fastCover, in that order); with none, the samples are compressed without dictionary.
+ *  @return 0 when both training and compression succeed, 1 otherwise
+ */
+int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam,
+                        ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam,
+                        ZDICT_fastCover_params_t *fastParam) {
+  /* Settings of the selected builder; defaults apply when no builder is given */
+  const ZDICT_params_t* selected = NULL;
+  const char* name = "NODICT";
+  unsigned displayLevel = DEFAULT_DISPLAYLEVEL;   /* name required by DISPLAYLEVEL() */
+  unsigned cLevel = DEFAULT_CLEVEL;
+  int result = 1;   /* failure until every step has succeeded */
+
+  if (randomParam) {
+    name = "RANDOM";
+    selected = &randomParam->zParams;
+  } else if (coverParam) {
+    name = "COVER";
+    selected = &coverParam->zParams;
+  } else if (legacyParam) {
+    name = "LEGACY";
+    selected = &legacyParam->zParams;
+  } else if (fastParam) {
+    name = "FAST";
+    selected = &fastParam->zParams;
+  }
+  if (selected) {
+    displayLevel = selected->notificationLevel;
+    cLevel = selected->compressionLevel;
+  }
+
+  {
+    /* Training is the only part that gets timed */
+    const UTIL_time_t start = UTIL_getTime();
+    dictInfo* const trained = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam, fastParam);
+    const U64 elapsedMicro = UTIL_clockSpanMicro(start);
+    const double elapsedSec = elapsedMicro / (double)SEC_TO_MICRO;
+
+    if (trained == NULL) {
+      DISPLAYLEVEL(1, "%s does not train successfully\n", name);
+    } else {
+      double ratio;
+      DISPLAYLEVEL(1, "%s took %f seconds to execute \n", name, elapsedSec);
+
+      /* A negative ratio is the error signal of compressWithDict() */
+      ratio = compressWithDict(srcInfo, trained, cLevel, displayLevel);
+      if (ratio < 0) {
+        DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name);
+      } else {
+        DISPLAYLEVEL(1, "Compression ratio with %s dictionary is %f\n", name, ratio);
+        result = 0;
+      }
+    }
+
+    /* Also reached with NULL after a failed training */
+    freeDictInfo(trained);
+  }
+  return result;
+}
+
+
+
+/** main() :
+ *  Benchmark the dictionary builders on the files or directories given as "in=<path>"
+ *  arguments. Runs, in order : no dictionary, random, legacy, cover (optimized, then
+ *  with the resulting parameters) and fastCover for f in [15, 24] and accel in [1, 10]
+ *  (one optimized run followed by five runs with the resulting parameters).
+ *  Stops at the first failing benchmark.
+ *  @return 0 on success, 1 on invalid argument, allocation failure or benchmark failure
+ */
+int main(int argCount, const char* argv[])
+{
+  const int displayLevel = DEFAULT_DISPLAYLEVEL;   /* name required by DISPLAYLEVEL() */
+  int result = 0;
+
+  /* Initialize arguments to default values */
+  const unsigned k = 200;
+  unsigned f;
+  unsigned accel;
+  unsigned i;
+  int argNb;   /* signed, to compare against argCount without a sign mismatch */
+  const unsigned cLevel = DEFAULT_CLEVEL;
+  const unsigned dictID = 0;
+  const unsigned maxDictSize = g_defaultMaxDictSize;
+  const size_t blockSize = 0;
+
+  /* Initialize table to store input files */
+  const char** filenameTable = (const char**)malloc((size_t)argCount * sizeof(const char*));
+  unsigned filenameIdx = 0;
+
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  const int followLinks = 0;
+  const char** extendedFileList = NULL;
+
+  /* Declared up front so that no `goto _cleanup` jumps over an initialization */
+  sampleInfo* srcInfo = NULL;
+  ZDICT_params_t zParams;
+
+  if (!filenameTable) {
+    DISPLAYLEVEL(1, "benchmark: not enough memory\n");
+    return 1;
+  }
+
+  /* Parse arguments */
+  for (argNb = 1; argNb < argCount; argNb++) {
+    const char* argument = argv[argNb];
+    if (longCommandWArg(&argument, "in=")) {
+      filenameTable[filenameIdx] = argument;
+      filenameIdx++;
+      continue;
+    }
+    DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n");
+    free((void*)filenameTable);   /* nothing else has been allocated yet */
+    return 1;
+  }
+
+  /* Get the list of all files recursively (because followLinks==0)*/
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
+  if (extendedFileList) {
+    unsigned u;
+    for (u=0; u<fileNamesNb; u++) { DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]); }
+    /* The expanded list replaces the argument table from here on */
+    free((void*)filenameTable);
+    filenameTable = extendedFileList;
+    filenameIdx = fileNamesNb;
+  }
+
+  /* get sampleInfo */
+  srcInfo = getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, displayLevel);
+  if (!srcInfo) {
+    /* NOTE(review): getSampleInfo() is defined elsewhere; this guard assumes it may return NULL */
+    DISPLAYLEVEL(1, "benchmark: failed to load samples\n");
+    result = 1;
+    goto _cleanup;
+  }
+
+  /* set up zParams */
+  zParams.compressionLevel = cLevel;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+
+  /* with no dict */
+  {
+    const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, NULL);
+    if(noDictResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
+  /* for random */
+  {
+    ZDICT_random_params_t randomParam;
+    memset(&randomParam, 0, sizeof(randomParam));   /* no field left indeterminate, as for cover / fastCover */
+    randomParam.zParams = zParams;
+    randomParam.k = k;
+    const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\n", randomParam.k);
+    if(randomResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
+  /* for legacy */
+  {
+    ZDICT_legacy_params_t legacyParam;
+    memset(&legacyParam, 0, sizeof(legacyParam));
+    legacyParam.zParams = zParams;
+    legacyParam.selectivityLevel = 9;
+    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL);
+    DISPLAYLEVEL(2, "selectivityLevel=%u\n", legacyParam.selectivityLevel);
+    if(legacyResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
+  /* for cover */
+  {
+    /* for cover (optimizing k and d) : k and d are left at 0 */
+    ZDICT_cover_params_t coverParam;
+    memset(&coverParam, 0, sizeof(coverParam));
+    coverParam.zParams = zParams;
+    coverParam.splitPoint = 1.0;
+    coverParam.steps = 40;
+    coverParam.nbThreads = 1;
+    const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
+    if(coverOptResult) {
+      result = 1;
+      goto _cleanup;
+    }
+
+    /* for cover (with k and d provided) :
+     * reuses the same struct, presumably filled in with k and d by the optimizing run above */
+    const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
+    if(coverResult) {
+      result = 1;
+      goto _cleanup;
+    }
+
+  }
+
+  /* for fastCover */
+  for (f = 15; f < 25; f++){
+    DISPLAYLEVEL(2, "current f is %u\n", f);
+    for (accel = 1; accel < 11; accel++) {
+      DISPLAYLEVEL(2, "current accel is %u\n", accel);
+      /* for fastCover (optimizing k and d) : k and d are left at 0 */
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.f = f;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      fastParam.accel = accel;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }
+
+      /* for fastCover (with k and d provided) : repeated 5 times with the same parameters */
+      for (i = 0; i < 5; i++) {
+        const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+        DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+        if(fastResult) {
+          result = 1;
+          goto _cleanup;
+        }
+      }
+    }
+  }
+
+
+  /* Free allocated memory */
+_cleanup:
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  /* Without an expanded list, filenameTable is still the table allocated above */
+  if (!extendedFileList) free((void*)filenameTable);
+  if (srcInfo) freeSampleInfo(srcInfo);
+  return result;
+}
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h
new file mode 100644
index 0000000..781ec8c
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h
@@ -0,0 +1,6 @@
+/* ZDICT_trainFromBuffer_legacy() :
+ * Declaration of the legacy dictionary trainer, used by the benchmark as
+ * dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, capacity, samples, sizes, nbSamples, params).
+ * Known issue : samplesBuffer needs to be followed by a noisy guard band.
+ * Workaround : duplicate the buffer, and add the noise.
+ * NOTE(review): ZDICT_legacy_params_t and size_t are not declared by this header;
+ * the header declaring them must be included first - confirm the include order. */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                    ZDICT_legacy_params_t params);
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
new file mode 100644
index 0000000..5eaf593
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
@@ -0,0 +1,2 @@
+# Smoke test: run the benchmark on the zstd lib/common sources.
+# The echoed path must match the path actually passed to ./benchmark.
+echo "Benchmark with in=../../../lib/common"
+./benchmark in=../../../lib/common
diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile
new file mode 100644
index 0000000..3ba2479
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/Makefile
@@ -0,0 +1,54 @@
+# Build and run the experimental fastCover dictionary builder.
+#   make ARG="in=<path> ..."  build, run ./main with $(ARG), then clean
+#   make test                 build, run the tests, then clean
+
+# Arguments forwarded to ./main by the `run` target
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3 -g
+INCLUDES := -I ../../../programs -I ../randomDictBuilder -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+
+# File I/O helpers are shared with the random dictionary builder
+IO_FILE := ../randomDictBuilder/io.c
+
+TEST_INPUT := ../../../lib
+TEST_OUTPUT := fastCoverDict
+
+# `all` is a command, not a file: keep it working even if a file named `all` exists
+.PHONY: all
+all: main run clean
+
+.PHONY: test
+test: main testrun testshell clean
+
+.PHONY: run
+run:
+	echo "Building a fastCover dictionary with given arguments"
+	./main $(ARG)
+
+main: main.o io.o fastCover.o libzstd.a
+	$(CC) $(CFLAGS) main.o io.o fastCover.o libzstd.a -o main
+
+main.o: main.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c main.c
+
+fastCover.o: fastCover.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c fastCover.c
+
+io.o: $(IO_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
+
+# Build the static library in the zstd lib directory, then move it here
+libzstd.a:
+	$(MAKE) MOREFLAGS=-g -C ../../../lib libzstd.a
+	mv ../../../lib/libzstd.a .
+
+# Train a dictionary on $(TEST_INPUT), benchmark it with the zstd CLI, then delete it
+.PHONY: testrun
+testrun: main
+	echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) "
+	./main in=$(TEST_INPUT) out=$(TEST_OUTPUT)
+	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
+	rm -f $(TEST_OUTPUT)
+
+.PHONY: testshell
+testshell: test.sh
+	sh test.sh
+	echo "Finish running test.sh"
+
+# Removes local build products and also cleans the zstd lib directory
+.PHONY: clean
+clean:
+	rm -f *.o main libzstd.a
+	$(MAKE) -C ../../../lib clean
+	echo "Cleaning is completed"
diff --git a/contrib/experimental_dict_builders/fastCover/README.md b/contrib/experimental_dict_builders/fastCover/README.md
new file mode 100644
index 0000000..ad37774
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/README.md
@@ -0,0 +1,24 @@
+FastCover Dictionary Builder
+
+### Permitted Arguments:
+- Input File/Directory (in=fileName): required; file/directory used to build the dictionary; a directory is processed recursively; multiple files/directories can be given, each preceded by "in="
+- Output Dictionary (out=dictName): if not provided, defaults to fastCoverDict
+- Dictionary ID (dictID=#): nonnegative number; if not provided, defaults to 0
+- Maximum Dictionary Size (maxdict=#): positive number; in bytes; if not provided, defaults to 110KB
+- Size of Selected Segment (k=#): positive number; in bytes; if not provided, defaults to 200
+- Size of Dmer (d=#): either 6 or 8; if not provided, defaults to 8
+- Number of steps (steps=#): positive number; if not provided, defaults to 32
+- Percentage of samples used for training (split=#): positive number; if not provided, defaults to 100
+
+
+### Running Test:
+make test
+
+
+### Usage:
+To build a FASTCOVER dictionary with the provided arguments: make ARG= followed by arguments
+If k or d is not provided, the optimize version of FASTCOVER is run.
+
+### Examples:
+make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
new file mode 100644
index 0000000..02c155a
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -0,0 +1,809 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "fastCover.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define FASTCOVER_MAX_F 32
+#define DEFAULT_SPLITPOINT 1.0
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+static const U64 prime6bytes = 227718039650203ULL; /* multiplier of the 6-byte hash */
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; } /* drops the upper 16 bits of u, multiplies, keeps the top h bits; h must be non-zero */
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } /* note: loads a full 64-bit word from p */
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; /* multiplier of the 8-byte hash */
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } /* multiplies all 64 bits of u, keeps the top h bits; h must be non-zero */
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } /* loads a 64-bit word from p */
+
+
+/**
+ * Hash the d-byte value at p (d == 6, otherwise the 8-byte hash) to an index in [0, 2^h).
+ * The mask is built in 64 bits because `1 << h` on an int is undefined for h >= 31.
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
+  const size_t mask = (size_t)(((U64)1 << h) - 1);
+  const size_t hash = (d == 6) ? ZSTD_hash6Ptr(p, h) : ZSTD_hash8Ptr(p, h);
+  return hash & mask;
+}
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;        /* concatenated bytes of all samples (the caller's buffer, not owned) */
+  size_t *offsets;            /* nbSamples + 1 entries; sample i spans [offsets[i], offsets[i+1]) */
+  const size_t *samplesSizes; /* size of each sample, in input order (not owned) */
+  size_t nbSamples;           /* total number of samples */
+  size_t nbTrainSamples;      /* number of leading samples used for training */
+  size_t nbTestSamples;       /* number of samples used to evaluate a dictionary */
+  size_t nbDmers;             /* number of dmer positions, derived from the training data size and d */
+  U32 *freqs;                 /* 2^f counters: occurrences of each dmer hash in the training samples */
+  U16 *segmentFreqs;          /* 2^f scratch counters used while sliding a segment window */
+  unsigned d;                 /* dmer length in bytes */
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Adds up the first `nbSamples` entries of `samplesSizes` (0 when there are none).
+ */
+static size_t FASTCOVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t total = 0;
+  unsigned remaining = nbSamples;
+  while (remaining > 0) {
+    total += samplesSizes[--remaining];
+  }
+  return total;
+}
+
+
+/*-*************************************
+*  fast functions
+***************************************/
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin; /* position in ctx->samples of the first dmer of the range */
+  U32 end;   /* one past the position of the last dmer of the range */
+  U32 score; /* sum of freqs[] over the distinct dmer hashes in the range */
+} FASTCOVER_segment_t;
+
+
+/**
+ * Selects the best segment in the epoch [begin, end) and returns it.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}), counting each hash value once
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0 in `freqs`.
+ */
+static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                                  U32 *freqs, U32 begin,U32 end,
+                                                  ZDICT_fastCover_params_t parameters) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 dmersInK = k - d + 1;
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  FASTCOVER_segment_t bestSegment = {0, 0, 0};
+  FASTCOVER_segment_t activeSegment;
+  /* NOTE(review): ctx->segmentFreqs is written through a const ctx; confirm no two callers share it concurrently */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+  {
+    /* Slide the activeSegment through the whole epoch.
+     * Save the best segment in bestSegment.
+     */
+    while (activeSegment.end < end) {
+      /* Get hash value of current dmer */
+      const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
+      /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+      if (ctx->segmentFreqs[index] == 0) {
+        activeSegment.score += freqs[index];
+      }
+      ctx->segmentFreqs[index] += 1;
+      /* Increment end of segment */
+      activeSegment.end += 1;
+      /* If the window is now too large, drop the first position */
+      if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+        /* Get hash value of the dmer to be eliminated from active segment */
+        const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
+        ctx->segmentFreqs[delIndex] -= 1;
+        /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+        if (ctx->segmentFreqs[delIndex] == 0) {
+          activeSegment.score -= freqs[delIndex];
+        }
+        /* Increment start of segment */
+        activeSegment.begin += 1;
+      }
+      /* If this segment is the best so far save it */
+      if (activeSegment.score > bestSegment.score) {
+        bestSegment = activeSegment;
+      }
+    }
+    /* Undo the counts of the positions still inside the window, leaving segmentFreqs as it was on entry */
+    while (activeSegment.begin < end) {
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
+      ctx->segmentFreqs[delIndex] -= 1;
+      activeSegment.begin += 1;
+    }
+  }
+  {
+    /* Trim off the zero frequency head and tail from the segment. */
+    U32 newBegin = bestSegment.end;
+    U32 newEnd = bestSegment.begin;
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
+      U32 freq = freqs[index];
+      if (freq != 0) {
+        newBegin = MIN(newBegin, pos);
+        newEnd = pos + 1;
+      }
+    }
+    bestSegment.begin = newBegin;
+    bestSegment.end = newEnd;
+  }
+  {
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
+      freqs[i] = 0;
+    }
+  }
+  return bestSegment;
+}
+
+/**
+ * Validates a parameter set against the dictionary capacity.
+ *
+ * parameters  : candidate k, d, f and splitPoint (other fields are not examined)
+ * maxDictSize : capacity of the dictionary buffer, in bytes
+ *
+ * Yields 1 when every constraint below holds and 0 otherwise.
+ */
+static int FASTCOVER_checkParameters(ZDICT_fastCover_params_t parameters,
+                                 size_t maxDictSize) {
+  const unsigned k = parameters.k;
+  const unsigned d = parameters.d;
+  const unsigned f = parameters.f;
+  const double split = parameters.splitPoint;
+
+  /* k, d, and f must all be supplied (non-zero) */
+  const int allPresent = (d != 0) && (k != 0) && (f != 0);
+  /* only dmers of 6 or 8 bytes are supported */
+  const int dSupported = (d == 6) || (d == 8);
+  /* f may not exceed FASTCOVER_MAX_F */
+  const int fInRange = !(f > FASTCOVER_MAX_F);
+  /* a segment must fit in the dictionary: k <= maxDictSize */
+  const int kFits = !(k > maxDictSize);
+  /* a dmer must fit in a segment: d <= k */
+  const int dFits = !(d > k);
+  /* splitPoint must not be <= 0 nor > 1 */
+  const int splitValid = !(split <= 0 || split > 1);
+
+  /* every rule has to pass */
+  return allPresent && dSupported && fInRange
+      && kFits && dFits && splitValid;
+}
+
+
+/**
+ * Releases the buffers owned by a context set up by `FASTCOVER_ctx_init()`.
+ * A NULL context is accepted. Each released member is reset to NULL.
+ */
+static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
+  if (ctx != NULL) {
+    /* free(NULL) is a no-op, so members need no individual guard */
+    /* per-window scratch counters */
+    free(ctx->segmentFreqs);
+    ctx->segmentFreqs = NULL;
+
+    /* dmer hash frequencies */
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+
+    /* sample offsets */
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+  }
+}
+
+/**
+ * Count how often each dmer hash occurs in the training samples.
+ * `freqs` receives the counts, indexed by FASTCOVER_hashPtrToIndex(., f, ctx->d).
+ */
+static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){
+  /* Hashing loads 8 bytes even when d == 6: only hash positions with 8 readable bytes */
+  const size_t readLength = MAX(ctx->d, sizeof(U64));
+  size_t i;
+  for (i = 0; i < ctx->nbTrainSamples; i++) {
+    const size_t currSampleEnd = ctx->offsets[i+1];
+    size_t start; /* start of current dmer */
+    for (start = ctx->offsets[i]; start + readLength <= currSampleEnd; start++) {
+      freqs[FASTCOVER_hashPtrToIndex(ctx->samples + start, f, ctx->d)]++;
+    }
+  }
+}
+
+/**
+ * Prepare a context for dictionary building.
+ * The context depends only on `d`, `splitPoint` and `f`, so it can be reused
+ * for several values of k.
+ * Returns 1 on success or zero on error; on error nothing stays allocated.
+ * On success the context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d, double splitPoint, unsigned f) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = FASTCOVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+  /* Hashing always loads 8 bytes, so every dmer position needs MAX(d, 8) readable bytes */
+  const size_t readLength = MAX(d, sizeof(U64));
+  /* Number of counters per table, computed in size_t: `1 << f` overflows int for f >= 31 */
+  const size_t tableSize = (size_t)1 << f;
+  /* Checks */
+  if (trainingSamplesSize < readLength) {
+    DISPLAYLEVEL(1, "Training samples size is too small (%u bytes), minimum size is %u bytes\n",
+                 (U32)trainingSamplesSize, (U32)readLength);
+    return 0;
+  }
+  if (totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, (U32)testSamplesSize);
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
+  ctx->nbDmers = trainingSamplesSize - readLength + 1;
+  ctx->d = d;
+  /* The offsets of each file, plus the two zero-initialized tables of 2^f counters */
+  ctx->offsets = (size_t *)malloc(((size_t)nbSamples + 1) * sizeof(size_t));
+  ctx->freqs = (U32 *)calloc(tableSize, sizeof(U32));
+  ctx->segmentFreqs = (U16 *)calloc(tableSize, sizeof(U16));
+  if (!ctx->offsets || !ctx->freqs || !ctx->segmentFreqs) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    FASTCOVER_ctx_destroy(ctx);
+    return 0;
+  }
+  /* Fill offsets from the samplesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  FASTCOVER_computeFrequency(ctx->freqs, f, ctx);
+  return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ * Segments are written at the end of dictBuffer, growing towards the front.
+ * Returns `tail`: the selected content occupies dictBuffer[tail, dictBufferCapacity).
+ * `freqs` is handed to FASTCOVER_selectSegment(), which modifies it.
+ */
+static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs,
+                                    void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_fastCover_params_t parameters){
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size; segments are picked from each epoch in turn. */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
+  const U32 epochSize = (U32)(ctx->nbDmers / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs (wrapping around after the last one) until there
+   * are no more segments or the dictionary is full. */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    FASTCOVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters);
+
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+
+/**
+ * FASTCOVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except FASTCOVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct fast_best_s {
+  ZSTD_pthread_mutex_t mutex;           /* held while the fields below are read or updated */
+  ZSTD_pthread_cond_t cond;             /* broadcast when liveJobs drops to 0 */
+  size_t liveJobs;                      /* jobs started but not yet finished */
+  void *dict;                           /* malloc'ed copy of the best dictionary so far, or NULL */
+  size_t dictSize;                      /* size in bytes of the dictionary stored in `dict` */
+  ZDICT_fastCover_params_t parameters;  /* parameters that produced `dict` */
+  size_t compressedSize;                /* best (smallest) compressed size so far; starts at (size_t)-1 */
+} FASTCOVER_best_t;
+
+/**
+ * Puts a `FASTCOVER_best_t` into its empty state: no jobs, no dictionary.
+ */
+static void FASTCOVER_best_init(FASTCOVER_best_t *best) {
+  if (!best) return; /* a NULL target is tolerated */
+  memset(&best->parameters, 0, sizeof(best->parameters));
+  best->compressedSize = (size_t)-1;
+  best->dictSize = 0;
+  best->dict = NULL;
+  best->liveJobs = 0;
+  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
+  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
+}
+
+/**
+ * Blocks the caller until every started job has finished (liveJobs == 0).
+ */
+static void FASTCOVER_best_wait(FASTCOVER_best_t *best) {
+  if (best != NULL) {
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    /* re-check the counter after every wake-up */
+    while (best->liveJobs > 0) {
+      ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Waits for outstanding jobs via FASTCOVER_best_wait(), then releases the
+ * saved dictionary and destroys the mutex and condition variable.
+ * A NULL argument is ignored.
+ */
+static void FASTCOVER_best_destroy(FASTCOVER_best_t *best) {
+  if (best != NULL) {
+    FASTCOVER_best_wait(best);
+    /* free(NULL) is a no-op, so an unset dictionary needs no guard */
+    free(best->dict);
+    ZSTD_pthread_mutex_destroy(&best->mutex);
+    ZSTD_pthread_cond_destroy(&best->cond);
+  }
+}
+
+/**
+ * Registers one more in-flight job; call it right before launching the job.
+ * The counter is updated while holding the mutex.
+ * A NULL argument is ignored.
+ */
+static void FASTCOVER_best_start(FASTCOVER_best_t *best) {
+  if (best != NULL) {
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    best->liveJobs += 1;
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ * The mutex is released and waiters are woken even if the copy cannot be allocated.
+ */
+static void FASTCOVER_best_finish(FASTCOVER_best_t *best, size_t compressedSize,
+                              ZDICT_fastCover_params_t parameters, void *dict,
+                              size_t dictSize) {
+  if (!best) {
+    return;
+  }
+  {
+    size_t liveJobs;
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    --best->liveJobs;
+    liveJobs = best->liveJobs;
+    /* If the new dictionary is better */
+    if (compressedSize < best->compressedSize) {
+      /* Allocate space if necessary */
+      if (!best->dict || best->dictSize < dictSize) {
+        free(best->dict); /* free(NULL) is a no-op */
+        best->dict = malloc(dictSize);
+      }
+      if (best->dict) {
+        /* Save the dictionary, parameters, and size */
+        memcpy(best->dict, dict, dictSize);
+        best->dictSize = dictSize;
+        best->parameters = parameters;
+        best->compressedSize = compressedSize;
+      } else {
+        /* Out of memory: record the failure, then fall through to unlock and signal */
+        best->compressedSize = ERROR(GENERIC);
+        best->dictSize = 0;
+      }
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+    if (liveJobs == 0) {
+      ZSTD_pthread_cond_broadcast(&best->cond);
+    }
+  }
+}
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+  const FASTCOVER_ctx_t *ctx;          /* context the job reads its samples and frequencies from */
+  FASTCOVER_best_t *best;              /* where the job reports its result */
+  size_t dictBufferCapacity;           /* size of the dictionary buffer the job allocates */
+  ZDICT_fastCover_params_t parameters; /* parameter set to evaluate */
+} FASTCOVER_tryParameters_data_t;
+
+/**
+ * Tries a set of parameters and updates the FASTCOVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_fastCover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  const size_t tableSize = (size_t)1 << parameters.f; /* in size_t: `1 << f` overflows int for f >= 31 */
+  /* Private copy of the context, so that concurrent jobs do not share segmentFreqs */
+  FASTCOVER_ctx_t jobCtx = *ctx;
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32*) malloc(tableSize * sizeof(U32));
+  U16 *const segmentFreqs = (U16 *)calloc(tableSize, sizeof(U16));
+  if (!dict || !freqs || !segmentFreqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  jobCtx.segmentFreqs = segmentFreqs;
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, tableSize * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = FASTCOVER_buildDictionary(&jobCtx, freqs, dict,
+                                              dictBufferCapacity, parameters);
+
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
+        parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  {
+    /* Pointers */
+    ZSTD_CCtx *cctx;
+    ZSTD_CDict *cdict;
+    void *dst;
+    /* Local variables */
+    size_t dstCapacity;
+    size_t i;
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {
+      size_t maxSampleSize = 0;
+      i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
+      for (; i < ctx->nbSamples; ++i) {
+        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
+      }
+      dstCapacity = ZSTD_compressBound(maxSampleSize);
+      dst = malloc(dstCapacity);
+    }
+    /* Create the cctx and cdict */
+    cctx = ZSTD_createCCtx();
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
+    if (!dst || !cctx || !cdict) {
+      goto _compressCleanup;
+    }
+    /* Compress each sample and sum their sizes (or error) */
+    totalCompressedSize = dictBufferCapacity;
+    i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
+    for (; i < ctx->nbSamples; ++i) {
+      const size_t size = ZSTD_compress_usingCDict(
+          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
+          ctx->samplesSizes[i], cdict);
+      if (ZSTD_isError(size)) {
+        totalCompressedSize = ERROR(GENERIC);
+        goto _compressCleanup;
+      }
+      totalCompressedSize += size;
+    }
+  _compressCleanup:
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    free(dst);
+  }
+
+_cleanup:
+  FASTCOVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  free(dict);
+  free(freqs);
+  free(segmentFreqs);
+}
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) {
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    parameters.splitPoint = 1.0;
+    /* Initialize global data */
+    g_displayLevel = parameters.zParams.notificationLevel;
+    /* Checks */
+    if (!FASTCOVER_checkParameters(parameters, dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Initialize context */
+    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            parameters.d, parameters.splitPoint, parameters.f)) {
+      DISPLAYLEVEL(1, "Failed to initialize context\n");
+      return ERROR(GENERIC);
+    }
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                dictBufferCapacity, parameters);
+
+      const size_t dictionarySize = ZDICT_finalizeDictionary(
+          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+          samplesBuffer, samplesSizes, (unsigned)ctx.nbTrainSamples,
+          parameters.zParams);
+      if (!ZSTD_isError(dictionarySize)) {
+          DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                      (U32)dictionarySize);
+      }
+      FASTCOVER_ctx_destroy(&ctx);
+      return dictionarySize;
+    }
+}
+
+
+
+/* Tries a grid of (d, k) values (a non-zero d or k is used as-is) and keeps the dictionary with the smallest compressed size. */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_fastCover_params_t *parameters) {
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint =
+        parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations =
+        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? 23 : parameters->f;
+
+    /* Local variables */
+    const int displayLevel = parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    FASTCOVER_best_t best;
+    POOL_ctx *pool = NULL;
+
+    /* Checks: reported at the caller's level, since g_displayLevel is only set further down */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(GENERIC);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    FASTCOVER_best_init(&best);
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                      kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        FASTCOVER_best_destroy(&best);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+            sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          FASTCOVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(GENERIC);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = *parameters;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.f = f;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.zParams.notificationLevel = g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity)) {
+          DISPLAYLEVEL(1, "fastCover parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        FASTCOVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                           (U32)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      FASTCOVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with output of the best parameters */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        FASTCOVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      *parameters = best.parameters;
+      memcpy(dictBuffer, best.dict, dictSize);
+      FASTCOVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+}
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.h b/contrib/experimental_dict_builders/fastCover/fastCover.h
new file mode 100644
index 0000000..958e9f4
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.h
@@ -0,0 +1,57 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: d must be 6 or 8, and d <= k */
+    unsigned f;                  /* log of size of frequency array : constraint: 0 < f <= 32 : the optimizer replaces 0 with 23 */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
+    ZDICT_params_t zParams;
+} ZDICT_fastCover_params_t;
+
+
+/*! ZDICT_optimizeTrainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ *  All of the parameters are optional: a zero value selects the default (f defaults to 23).
+ *  If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
+ *  if steps is zero it defaults to its default value (40).
+ *  If k is non-zero then we don't check multiple values of k, otherwise we check about `steps` values in [50, 2000].
+ *
+ *  @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ */
+ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+     const size_t *samplesSizes, unsigned nbSamples,
+     ZDICT_fastCover_params_t *parameters);
+
+
+/*! ZDICT_trainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ *  d, k, and f are required (d must be 6 or 8); splitPoint is ignored and treated as 1.0.
+ *  @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters);
diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c
new file mode 100644
index 0000000..df7d918
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/main.c
@@ -0,0 +1,183 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "fastCover.h"
+#include "io.h"
+#include "util.h"
+#include "zdict.h"
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/*-*************************************
+*  Constants
+***************************************/
+static const unsigned g_defaultMaxDictSize = 110 KB;
+#define DEFAULT_CLEVEL 3
+
+
+/*-*************************************
+*  FASTCOVER
+***************************************/
+int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info,
+                          unsigned maxDictSize,
+                          ZDICT_fastCover_params_t *params) {
+    /* Trains a fastCover dictionary from the samples in `info` and writes it to
+     * `dictFileName`. Returns 0 on success, 1 when training reports an error. */
+    unsigned const displayLevel = params->zParams.notificationLevel;   /* read by DISPLAYLEVEL() */
+    int const mustOptimize = (params->d == 0) || (params->k == 0);
+    void* const dictBuffer = malloc(maxDictSize);
+    size_t dictSize;
+    int status;
+
+    if (dictBuffer == NULL)
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+
+    /* Run the optimize version if either k or d is not provided */
+    if (mustOptimize) {
+        dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                             info->samplesSizes, info->nbSamples, params);
+    } else {
+        dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                             info->samplesSizes, info->nbSamples, *params);
+    }
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100));
+
+    status = ZDICT_isError(dictSize) ? 1 : 0;
+    if (status) {
+        DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+    } else {
+        /* save dict */
+        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+        saveDict(dictFileName, dictBuffer, dictSize);
+    }
+
+    /* clean up : reached on both the success and the failure path */
+    free(dictBuffer);
+    return status;
+}
+
+
+
+int main(int argCount, const char* argv[])
+{
+  /* CLI driver: parses key=value arguments, expands the `in=` entries into a file list, loads the
+   * samples, then trains and saves a fastCover dictionary. Returns 0 on success, 1 otherwise. */
+  int displayLevel = 2;
+  int operationResult = 0;
+
+  /* Initialize arguments to default values */
+  unsigned k = 0;
+  unsigned d = 0;
+  unsigned f = 23;
+  unsigned steps = 32;
+  unsigned nbThreads = 1;
+  unsigned split = 100;
+  const char* outputFile = "fastCoverDict";
+  unsigned dictID = 0;
+  unsigned maxDictSize = g_defaultMaxDictSize;
+
+  /* Initialize table to store input files (one slot per argument is always enough) */
+  const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
+  unsigned filenameIdx = 0;
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  int followLinks = 0; /* follow directory recursively */
+  const char** extendedFileList = NULL;
+  if (!filenameTable) EXM_THROW(12, "not enough memory for file name table");
+
+  /* Parse arguments */
+  for (int i = 1; i < argCount; i++) {
+    const char* argument = argv[i];
+    if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "d=")) { d = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "f=")) { f = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "steps=")) { steps = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "split=")) { split = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "in=")) {
+      filenameTable[filenameIdx] = argument;
+      filenameIdx++;
+      continue;
+    }
+    if (longCommandWArg(&argument, "out=")) {
+      outputFile = argument;
+      continue;
+    }
+    DISPLAYLEVEL(1, "Incorrect parameters\n");
+    free((void*)filenameTable);   /* the early exit must not leak the table */
+    return 1;
+  }
+
+  /* Get the list of all files recursively (because followLinks==0)*/
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
+  if (extendedFileList) {
+      unsigned u;
+      for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+      free((void*)filenameTable);
+      filenameTable = extendedFileList;
+      filenameIdx = fileNamesNb;
+  }
+
+  size_t blockSize = 0;
+  /* Set up zParams */
+  ZDICT_params_t zParams;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+
+  /* Set up fastCover params */
+  ZDICT_fastCover_params_t params;
+  params.zParams = zParams;
+  params.k = k;
+  params.d = d;
+  params.f = f;
+  params.steps = steps;
+  params.nbThreads = nbThreads;
+  params.splitPoint = (double)split/100;
+
+  /* Build dictionary */
+  sampleInfo* info = getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
+  operationResult = FASTCOVER_trainFromFiles(outputFile, info, maxDictSize, &params);
+
+  /* Free allocated memory */
+  if (!extendedFileList) free((void*)filenameTable);   /* no extended list: still the malloc'd table */
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  freeSampleInfo(info);
+  return operationResult;
+}
diff --git a/contrib/experimental_dict_builders/fastCover/test.sh b/contrib/experimental_dict_builders/fastCover/test.sh
new file mode 100644
index 0000000..f86915b
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/test.sh
@@ -0,0 +1,15 @@
+echo "Building fastCover dictionary with in=../../../lib/common f=20 out=dict1"
+./main in=../../../lib/common f=20 out=dict1
+zstd -be3 -D dict1 -r ../../../lib/common -q
+echo "Building fastCover dictionary with in=../../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000"
+./main in=../../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../../lib/common -q
+echo "Building fastCover dictionary with 2 sample sources"
+./main in=../../../lib/common in=../../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../../lib/common -q
+echo "Removing dict1 dict2 dict3"
+rm -f dict1 dict2 dict3
+# Negative tests: ./main must exit non-zero for each invocation below.
+echo "Testing with invalid parameters, should fail"
+./main in=../../../lib/common r=10 && exit 1   # abort if the unknown argument was accepted
+! ./main in=../../../lib/common d=10
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/Makefile b/contrib/experimental_dict_builders/randomDictBuilder/Makefile
new file mode 100644
index 0000000..bbd40e4
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/Makefile
@@ -0,0 +1,52 @@
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3
+INCLUDES := -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+
+TEST_INPUT := ../../../lib
+TEST_OUTPUT := randomDict
+# `all` and `test` finish with `clean`, so their prerequisites must run in the listed order.
+.NOTPARALLEL:
+.PHONY: all test
+all: main run clean
+test: main testrun testshell clean
+
+.PHONY: run
+run: main
+	echo "Building a random dictionary with given arguments"
+	./main $(ARG)
+
+main: main.o io.o random.o libzstd.a
+	$(CC) $(CFLAGS) main.o io.o random.o libzstd.a -o main
+
+main.o: main.c io.h random.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c main.c
+
+random.o: random.c random.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c random.c
+
+io.o: io.c io.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c io.c
+
+libzstd.a:
+	$(MAKE) -C ../../../lib libzstd.a
+	mv ../../../lib/libzstd.a .
+
+.PHONY: testrun
+testrun: main
+	echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) "
+	./main in=$(TEST_INPUT) out=$(TEST_OUTPUT)
+	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
+	rm -f $(TEST_OUTPUT)
+
+.PHONY: testshell
+testshell: test.sh main
+	sh test.sh
+	echo "Finish running test.sh"
+
+.PHONY: clean
+clean:
+	rm -f *.o main libzstd.a
+	$(MAKE) -C ../../../lib clean
+	echo "Cleaning is completed"
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/README.md b/contrib/experimental_dict_builders/randomDictBuilder/README.md
new file mode 100644
index 0000000..da12a42
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/README.md
@@ -0,0 +1,20 @@
+Random Dictionary Builder
+
+### Permitted Arguments:
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
+Output Dictionary (out=dictName): if not provided, default to defaultDict
+Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
+Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
+Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
+
+### Running Test:
+make test
+
+
+### Usage:
+To build a random dictionary with the provided arguments: make ARG= followed by arguments
+
+
+### Examples:
+make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/io.c b/contrib/experimental_dict_builders/randomDictBuilder/io.c
new file mode 100644
index 0000000..bfe39ea
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/io.c
@@ -0,0 +1,284 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "io.h"
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/*-*************************************
+*  Constants
+***************************************/
+
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
+                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+
+
+/*-*************************************
+*  Commandline related functions
+***************************************/
+/* readU32FromChar(): parse a decimal number with optional K/M (then optional `i`, `B`) suffix at *stringPtr, advancing it. */
+unsigned readU32FromChar(const char** stringPtr){
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned const max = (((unsigned)(-1)) / 10) - 1, maxK = ((unsigned)(-1)) >> 10;
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        if (result > max) { fprintf(stderr, "%s \n", errorMsg); exit(1); }   /* report before exiting */
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        if (result > maxK) { fprintf(stderr, "%s \n", errorMsg); exit(1); }
+        result <<= 10;   /* K : multiply by 1024 */
+        if (**stringPtr=='M') {
+            if (result > maxK) { fprintf(stderr, "%s \n", errorMsg); exit(1); }
+            result <<= 10;   /* M : multiply by 1024 once more */
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
+    size_t const prefixLen = strlen(longCommand);
+    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) return 0;   /* no match : *stringPtr untouched */
+    *stringPtr += prefixLen;   /* step over the matched prefix */
+    return 1;
+}
+
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples :
+ *  once a chunk does not fit in the remaining space, the rest of that file is skipped.
+ *  Also provides the size of each sample into sampleSizes table,
+ *  which holds sstSize entries (the caller in this file sizes it from getFileStats()).
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
+                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
+                          size_t targetChunkSize, unsigned displayLevel) {
+    char* const buff = (char*)buffer;
+    size_t pos = 0;   /* write offset into buff == total bytes loaded so far */
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;   /* unknown size : nothing to load */
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);   /* each sample is capped at SAMPLESIZE_MAX */
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;   /* chunk does not fit : give up on this file */
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;   /* a whole chunk is consumed, even if only toLoad bytes were kept */
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {   /* skip the part of the chunk that was not loaded */
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");   /* overwrite the progress line with blanks */
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
+/* getRand() : advances the state at *src (multiply, xor, rotate by 13)
+ * and returns the new state shifted right by 5. */
+static U32 getRand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 const mixed = (U32)(*src * prime1) ^ prime2;
+    U32 const next = rotl32(mixed, 13);
+    *src = next;   /* the caller's seed becomes the new state */
+    return next >> 5;
+}
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. The seed is fixed. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;   /* entries still to place; starting at nbFiles avoids `nbFiles - 1` wrapping when nbFiles==0 */
+    for (i = nbFiles; i > 1; --i) {
+        unsigned const j = getRand(&seed) % i;   /* pick among the first i entries */
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i-1];
+        fileNamesTable[i-1] = tmp;
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+/* findMaxMem() : probes malloc() with a size derived from requiredMem, lowering it by 8 MB per
+ * failed attempt. Returns the size that was allocated minus one 8 MB step; the probe is freed. */
+size_t findMaxMem(unsigned long long requiredMem) {
+    size_t const step = 8 MB;
+    /* next multiple of 8 MB (1<<23) strictly above requiredMem, plus one extra step */
+    unsigned long long candidate = (((requiredMem >> 23) + 1) << 23) + step;
+    void* probe;
+
+    if (candidate > g_maxMemory) candidate = g_maxMemory;
+    do {
+        probe = malloc((size_t)candidate);
+        candidate -= step;   /* also applied after the successful attempt */
+    } while (probe == NULL);
+    free(probe);
+    return (size_t)candidate;
+}
+
+/* saveDict() : writes the buffSize bytes at buff into dictFileName; any failure exits through EXM_THROW */
+void saveDict(const char* dictFileName,
+                         const void* buff, size_t buffSize) {
+    FILE* const out = fopen(dictFileName, "wb");
+    if (out==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    if (fwrite(buff, 1, buffSize, out) != buffSize)
+        EXM_THROW(4, "%s : write error", dictFileName);
+    if (fclose(out) != 0)   /* closing flushes whatever is still buffered */
+        EXM_THROW(5, "%s : flush error", dictFileName);
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
+                              size_t chunkSize, unsigned displayLevel) {
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));   /* all counters start at zero */
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;   /* unknown size counts as empty */
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);   /* chunks per file, rounded up; 1 when not chunking */
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);   /* at most SAMPLESIZE_MAX bytes per sample */
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;   /* counts every chunk of the file at the capped size */
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);   /* NOTE(review): depends only on chunkSize, not on file sizes - confirm intended */
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+
+
+
+/* getSampleInfo() : loads the (shuffled) input files into one buffer; memory failures and too few samples exit via EXM_THROW() */
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel) {
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+    sampleInfo* const info = (sampleInfo *)malloc(sizeof(sampleInfo));
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer) || (!info))
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    /* loadFiles() can load fewer samples than fs.nbSamples when the buffer is too small; entries of
+     * sampleSizes past its return value are never written, so only report what was actually loaded. */
+    info->nbSamples = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
+                        fileNamesTable, nbFiles, chunkSize, displayLevel);
+    info->samplesSizes = sampleSizes;
+    info->srcBuffer = srcBuffer;
+
+    return info;
+}
+
+
+void freeSampleInfo(sampleInfo *info) {
+    if (info == NULL) return;   /* nothing to release */
+    /* free(NULL) does nothing, so the members need no individual guards */
+    free((void*)(info->samplesSizes));
+    free((void*)(info->srcBuffer));
+    free(info);
+}
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/io.h b/contrib/experimental_dict_builders/randomDictBuilder/io.h
new file mode 100644
index 0000000..0ee2460
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/io.h
@@ -0,0 +1,60 @@
+#ifndef RANDOM_DICT_BUILDER_IO_H   /* include guard : the typedefs below cannot be defined twice */
+#define RANDOM_DICT_BUILDER_IO_H
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "zstd_internal.h" /* includes zstd.h */
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+/*-*************************************
+*  Structs
+***************************************/
+typedef struct {
+    U64 totalSizeToLoad;          /* amount of sample data expected to be loaded, in bytes */
+    unsigned oneSampleTooLarge;   /* non-zero when samples are flagged as very large */
+    unsigned nbSamples;           /* number of samples expected */
+} fileStats;
+
+typedef struct {
+  const void* srcBuffer;          /* buffer where the samples are stored */
+  const size_t *samplesSizes;     /* size of each sample */
+  size_t nbSamples;               /* total number of samples */
+}sampleInfo;
+
+/*! getSampleInfo():
+ *  Load from input files and add samples to buffer
+ * @return: a sampleInfo struct containing information about buffer where samples are stored,
+ *          size of each sample, and total number of samples
+ */
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel);
+
+/*! freeSampleInfo():
+ *  Free memory allocated for info
+ */
+void freeSampleInfo(sampleInfo *info);
+
+/*! saveDict():
+ *  Save data stored on buff to dictFileName
+ */
+void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
+
+/*! readU32FromChar():
+ *  Read the unsigned decimal number at *stringPtr (optional K or M suffix)
+ *  and advance *stringPtr past the characters that were read.
+ */
+unsigned readU32FromChar(const char** stringPtr);
+
+/** longCommandWArg() :
+ *  check if *stringPtr starts with longCommand.
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ * @return 0 and doesn't modify *stringPtr otherwise.
+ */
+unsigned longCommandWArg(const char** stringPtr, const char* longCommand);
+
+#endif /* RANDOM_DICT_BUILDER_IO_H */
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c
new file mode 100644
index 0000000..3ad8857
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c
@@ -0,0 +1,161 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "random.h"
+#include "io.h"
+#include "util.h"
+#include "zdict.h"
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/*-*************************************
+*  Constants
+***************************************/
+static const unsigned g_defaultMaxDictSize = 110 KB;
+#define DEFAULT_CLEVEL 3
+#define DEFAULT_k 200
+#define DEFAULT_OUTPUTFILE "defaultDict"
+#define DEFAULT_DICTID 0
+
+
+
+/*-*************************************
+*  RANDOM
+***************************************/
+int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
+                          unsigned maxDictSize,
+                          ZDICT_random_params_t *params) {
+    /* Trains a random dictionary from the samples in `info` and saves it to `dictFileName`.
+     * Returns 0 on success, 1 when training reports an error. */
+    unsigned const displayLevel = params->zParams.notificationLevel;   /* read by DISPLAYLEVEL() */
+    void* const dictBuffer = malloc(maxDictSize);
+    size_t dictSize;
+    int failed;
+
+    /* Checks */
+    if (dictBuffer == NULL)
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+
+    dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
+                                         info->samplesSizes, info->nbSamples, *params);
+    DISPLAYLEVEL(2, "k=%u\n", params->k);
+
+    failed = ZDICT_isError(dictSize) ? 1 : 0;
+    if (failed) {
+        DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+    } else {
+        /* save dict */
+        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+        saveDict(dictFileName, dictBuffer, dictSize);
+    }
+
+    /* clean up : reached on both the success and the failure path */
+    free(dictBuffer);
+    return failed;
+}
+
+
+
+int main(int argCount, const char* argv[])
+{
+  /* CLI driver for the random dictionary builder; returns 0 on success, 1 otherwise. */
+  int displayLevel = 2;
+  int operationResult = 0;
+
+  /* Initialize arguments to default values */
+  unsigned k = DEFAULT_k;
+  const char* outputFile = DEFAULT_OUTPUTFILE;
+  unsigned dictID = DEFAULT_DICTID;
+  unsigned maxDictSize = g_defaultMaxDictSize;
+
+  /* Initialize table to store input files (one slot per argument is always enough) */
+  const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
+  unsigned filenameIdx = 0;
+  if (!filenameTable) EXM_THROW(12, "not enough memory for file name table");
+
+  /* Parse arguments */
+  for (int i = 1; i < argCount; i++) {
+    const char* argument = argv[i];
+    if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "in=")) {
+      filenameTable[filenameIdx] = argument;
+      filenameIdx++;
+      continue;
+    }
+    if (longCommandWArg(&argument, "out=")) {
+      outputFile = argument;
+      continue;
+    }
+    DISPLAYLEVEL(1, "Incorrect parameters\n");
+    free((void*)filenameTable);   /* the early exit must not leak the table */
+    return 1;
+  }
+
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  int followLinks = 0; /* follow directory recursively */
+  const char** extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
+  if (extendedFileList) {
+      unsigned u;
+      for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+      free((void*)filenameTable);
+      filenameTable = extendedFileList;
+      filenameIdx = fileNamesNb;
+  }
+
+  size_t blockSize = 0;
+  ZDICT_random_params_t params;
+  ZDICT_params_t zParams;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+  params.zParams = zParams;
+  params.k = k;
+
+  sampleInfo* info = getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
+  operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params);
+
+  /* Free allocated memory */
+  if (!extendedFileList) free((void*)filenameTable);   /* no extended list: still the malloc'd table */
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  freeSampleInfo(info);
+
+  return operationResult;
+}
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/random.c b/contrib/experimental_dict_builders/randomDictBuilder/random.c
new file mode 100644
index 0000000..5276bea
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/random.c
@@ -0,0 +1,163 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>            /* fprintf */
+#include <stdlib.h>           /* malloc, free, qsort */
+#include <string.h>           /* memset */
+#include <time.h>             /* clock */
+#include "random.h"
+#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+
+
+/* ********************************************************
+*  Random Dictionary Builder
+**********************************************************/
+/**
+ * Adds up the sizes of all nbSamples samples and returns the total.
+ */
+static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t total = 0;
+  unsigned remaining = nbSamples;
+  while (remaining > 0) {
+    total += samplesSizes[--remaining];
+  }
+  return total;
+}
+
+
+/**
+ * A segment is an inclusive range in the source.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+} RANDOM_segment_t;
+
+
+/**
+ * Picks, with rand(), one of the (totalSamplesSize - k + 1) windows of k bytes.
+ * Both bounds of the returned segment are inclusive.
+ */
+static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize,
+                                            ZDICT_random_params_t parameters) {
+    const U32 segmentLength = parameters.k;
+    /* Number of distinct start offsets. k <= totalSamplesSize is not verified
+     * here : the subtraction wraps around when that does not hold. */
+    const size_t nbStartOffsets = totalSamplesSize - segmentLength + 1;
+    const unsigned start = (unsigned)(rand() % nbStartOffsets);
+    RANDOM_segment_t chosen;
+
+    chosen.begin = start;
+    chosen.end = start + segmentLength - 1;
+
+    return chosen;
+}
+
+
+/**
+ * Validates the training parameters against the dictionary capacity.
+ * Returns 1 when 0 < k <= maxDictSize, and 0 otherwise.
+ */
+static int RANDOM_checkParameters(ZDICT_random_params_t parameters,
+                                  size_t maxDictSize) {
+    const unsigned k = parameters.k;
+
+    /* k is a required parameter : 0 is rejected */
+    const int isProvided = (k != 0);
+
+    /* a segment cannot be larger than the dictionary it goes into */
+    const int fitsInDict = (k <= maxDictSize);
+
+    return isProvided && fitsInDict;
+}
+
+
+/**
+ * Fills dictBuffer from its end towards its start with randomly selected
+ * segments of `samples`. Returns the free space left, 0 once the loop ends.
+ */
+static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples,
+                                    void *dictBuffer, size_t dictBufferCapacity,
+                                    ZDICT_random_params_t parameters) {
+    const int displayLevel = parameters.zParams.notificationLevel;
+    BYTE *const out = (BYTE *)dictBuffer;
+    size_t remaining;
+
+    for (remaining = dictBufferCapacity; remaining > 0; ) {
+      const RANDOM_segment_t pick = RANDOM_selectSegment(totalSamplesSize, parameters);
+      /* the last segment is truncated to whatever room is left */
+      size_t copySize = pick.end - pick.begin + 1;
+      if (copySize >= remaining) copySize = remaining;
+
+      remaining -= copySize;
+      memcpy(out + remaining, samples + pick.begin, copySize);
+      DISPLAYUPDATE(
+          2, "\r%u%%       ",
+          (U32)(((dictBufferCapacity - remaining) * 100) / dictBufferCapacity));
+    }
+
+    return remaining;
+}
+
+
+
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_random_params_t parameters) {
+      const int displayLevel = parameters.zParams.notificationLevel;
+      BYTE* const dict = (BYTE*)dictBuffer;
+      /* Checks */
+      if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) {
+          DISPLAYLEVEL(1, "k is incorrect\n");
+          return ERROR(GENERIC);
+      }
+      if (nbSamples == 0) {
+        DISPLAYLEVEL(1, "Random must have at least one input file\n");
+        return ERROR(GENERIC);
+      }
+      if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+        DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                     ZDICT_DICTSIZE_MIN);
+        return ERROR(dstSize_tooSmall);
+      }
+      const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
+      const BYTE *const samples = (const BYTE *)samplesBuffer;
+
+      DISPLAYLEVEL(2, "Building dictionary\n");
+      {
+        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples,
+                                  dictBuffer, dictBufferCapacity, parameters);
+        const size_t dictSize = ZDICT_finalizeDictionary(
+            dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+            samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+        if (!ZSTD_isError(dictSize)) {
+            DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                          (U32)dictSize);
+        }
+        return dictSize;
+      }
+}
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/random.h b/contrib/experimental_dict_builders/randomDictBuilder/random.h
new file mode 100644
index 0000000..352775f
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/random.h
@@ -0,0 +1,29 @@
+#ifndef RANDOM_DICT_BUILDER_H
+#define RANDOM_DICT_BUILDER_H
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+typedef struct {
+    unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
+    ZDICT_params_t zParams;
+} ZDICT_random_params_t;
+
+/*! ZDICT_trainFromBuffer_random():
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_random_params_t parameters);
+#endif /* RANDOM_DICT_BUILDER_H */
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/test.sh b/contrib/experimental_dict_builders/randomDictBuilder/test.sh
new file mode 100644
index 0000000..1eb732e
--- /dev/null
+++ b/contrib/experimental_dict_builders/randomDictBuilder/test.sh
@@ -0,0 +1,14 @@
+echo "Building random dictionary with in=../../lib/common k=200 out=dict1"
+./main in=../../../lib/common k=200 out=dict1
+zstd -be3 -D dict1 -r ../../../lib/common -q
+echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
+./main in=../../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../../lib/common -q
+echo "Building random dictionary with 2 sample sources"
+./main in=../../../lib/common in=../../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../../lib/common -q
+echo "Removing dict1 dict2 dict3"
+rm -f dict1 dict2 dict3
+
+echo "Testing with invalid parameters, should fail"
+! ./main r=10
diff --git a/contrib/largeNbDicts/.gitignore b/contrib/largeNbDicts/.gitignore
new file mode 100644
index 0000000..e77c4e4
--- /dev/null
+++ b/contrib/largeNbDicts/.gitignore
@@ -0,0 +1,2 @@
+# build artifacts
+largeNbDicts
diff --git a/contrib/largeNbDicts/Makefile b/contrib/largeNbDicts/Makefile
new file mode 100644
index 0000000..624140f
--- /dev/null
+++ b/contrib/largeNbDicts/Makefile
@@ -0,0 +1,49 @@
+# ################################################################
+# Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under both the BSD-style license (found in the
+# LICENSE file in the root directory of this source tree) and the GPLv2 (found
+# in the COPYING file in the root directory of this source tree).
+# ################################################################
+
+PROGDIR = ../../programs
+LIBDIR  = ../../lib
+
+LIBZSTD = $(LIBDIR)/libzstd.a
+
+CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/common -I$(LIBDIR)/dictBuilder -I$(PROGDIR)
+
+CFLAGS  ?= -O3
+CFLAGS  += -std=gnu99
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+            -Wstrict-aliasing=1 -Wswitch-enum \
+            -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
+            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
+            -Wredundant-decls
+CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
+
+.PHONY: default all clean
+default: largeNbDicts
+
+all : largeNbDicts
+
+largeNbDicts: bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
+
+.PHONY: $(LIBZSTD)
+$(LIBZSTD):
+	$(MAKE) -C $(LIBDIR) libzstd.a
+
+bench.o  : $(PROGDIR)/bench.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
+
+datagen.o: $(PROGDIR)/datagen.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
+
+xxhash.o : $(LIBDIR)/common/xxhash.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
+
+clean:
+	$(RM) *.o
+	$(RM) largeNbDicts
diff --git a/contrib/largeNbDicts/README.md b/contrib/largeNbDicts/README.md
new file mode 100644
index 0000000..f29bcdf
--- /dev/null
+++ b/contrib/largeNbDicts/README.md
@@ -0,0 +1,25 @@
+largeNbDicts
+=====================
+
+`largeNbDicts` is a benchmark test tool
+dedicated to the specific scenario of
+dictionary decompression using a very large number of dictionaries.
+When dictionaries are constantly changing, they are always "cold",
+suffering from increased latency due to cache misses.
+
+The tool was created to investigate performance in this scenario
+and to experiment with mitigation techniques.
+
+Command line :
+```
+largeNbDicts [Options] filename(s)
+
+Options :
+-r           : recursively load all files in subdirectories (default: off)
+-B#          : split input into blocks of size # (default: no split)
+-#           : use compression level # (default: 3)
+-D #         : use # as a dictionary (default: create one)
+-i#          : nb benchmark rounds (default: 6)
+--nbDicts=#  : set nb of dictionaries to # (default: one per block)
+-h           : help (this text)
+```
diff --git a/contrib/largeNbDicts/largeNbDicts.c b/contrib/largeNbDicts/largeNbDicts.c
new file mode 100644
index 0000000..e0bc553
--- /dev/null
+++ b/contrib/largeNbDicts/largeNbDicts.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* largeNbDicts
+ * This is a benchmark test tool
+ * dedicated to the specific case of dictionary decompression
+ * using a very large nb of dictionaries
+ * thus suffering latency from lots of cache misses.
+ * It's created in a bid to investigate performance and find optimizations. */
+
+
+/*---  Dependencies  ---*/
+
+#include <stddef.h>   /* size_t */
+#include <stdlib.h>   /* malloc, free, abort */
+#include <stdio.h>    /* fprintf */
+#include <assert.h>   /* assert */
+
+#include "util.h"
+#include "bench.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zdict.h"
+
+
+/*---  Constants  --- */
+
+#define KB  *(1<<10)
+#define MB  *(1<<20)
+
+#define BLOCKSIZE_DEFAULT 0  /* no slicing into blocks */
+#define DICTSIZE  (4 KB)
+#define CLEVEL_DEFAULT 3
+
+#define BENCH_TIME_DEFAULT_S   6
+#define RUN_TIME_DEFAULT_MS    1000
+#define BENCH_TIME_DEFAULT_MS (BENCH_TIME_DEFAULT_S * RUN_TIME_DEFAULT_MS)
+
+#define DISPLAY_LEVEL_DEFAULT 3
+
+#define BENCH_SIZE_MAX (1200 MB)
+
+
+/*---  Macros  ---*/
+#define CONTROL(c)   { if (!(c)) abort(); }
+#undef MIN
+#define MIN(a,b)     ((a) < (b) ? (a) : (b))
+
+
+/*---  Display Macros  ---*/
+
+#define DISPLAY(...)         fprintf(stdout, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
+static int g_displayLevel = DISPLAY_LEVEL_DEFAULT;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+
+
+/*---  buffer_t  ---*/
+
+typedef struct {
+    void* ptr;
+    size_t size;
+    size_t capacity;
+} buffer_t;
+
+static const buffer_t kBuffNull = { NULL, 0, 0 };
+
+/* createBuffer() :
+ * allocates `capacity` bytes (must be > 0), initial size is 0.
+ * @return : kBuffNull if any error */
+static buffer_t createBuffer(size_t capacity)
+{
+    assert(capacity > 0);
+
+    buffer_t created = kBuffNull;
+    created.ptr = malloc(capacity);
+    /* capacity is only recorded when allocation succeeds, so a failure returns kBuffNull */
+    if (created.ptr != NULL) created.capacity = capacity;
+    return created;
+}
+
+static void freeBuffer(buffer_t buff)   /* releases the memory referenced by buff.ptr */
+{
+    free(buff.ptr);   /* buff is received by value : caller's copy still holds the freed pointer */
+}
+
+
+static void fillBuffer_fromHandle(buffer_t* buff, FILE* f)
+{
+    /* reads up to `capacity` bytes; `size` records how many were actually read */
+    buff->size = fread(buff->ptr, 1, buff->capacity, f);
+}
+
+
+/* @return : buffer holding the whole content of `fileName`,
+ *           or kBuffNull if file size is unknown or file can't be opened */
+static buffer_t createBuffer_fromFile(const char* fileName)
+{
+    U64 const fileSize = UTIL_getFileSize(fileName);
+    if (fileSize == UTIL_FILESIZE_UNKNOWN) return kBuffNull;
+
+    size_t const bufferSize = (size_t) fileSize;
+    assert((U64)bufferSize == fileSize);   /* check overflow */
+
+    FILE* const f = fopen(fileName, "rb");
+    if (f == NULL) return kBuffNull;
+
+    buffer_t buff = createBuffer(bufferSize);
+    CONTROL(buff.ptr != NULL);
+
+    fillBuffer_fromHandle(&buff, f);
+    CONTROL(buff.size == buff.capacity);
+
+    fclose(f);   /* do nothing specific if fclose() fails */
+    return buff;
+}
+
+
+/* createDictionaryBuffer() :
+ * loads dictionary from file `dictionaryName` when one is provided,
+ * otherwise trains one, of at most `requestedDictSize` bytes, from the source blocks.
+ * @return : kBuffNull if dictionary file can't be loaded */
+static buffer_t
+createDictionaryBuffer(const char* dictionaryName,
+                       const void* srcBuffer,
+                       const size_t* srcBlockSizes, unsigned nbBlocks,
+                       size_t requestedDictSize)
+{
+    if (dictionaryName != NULL) {
+        DISPLAYLEVEL(3, "loading dictionary %s \n", dictionaryName);
+        return createBuffer_fromFile(dictionaryName);  /* note : result might be kBuffNull */
+    }
+
+    /* no dictionary file : train one from the source blocks */
+    DISPLAYLEVEL(3, "creating dictionary, of target size %u bytes \n",
+                    (unsigned)requestedDictSize);
+    buffer_t trained;
+    trained.capacity = requestedDictSize;
+    trained.ptr = malloc(requestedDictSize);
+    CONTROL(trained.ptr != NULL);
+
+    trained.size = ZDICT_trainFromBuffer(trained.ptr, requestedDictSize,
+                                         srcBuffer,
+                                         srcBlockSizes, nbBlocks);
+    CONTROL(!ZSTD_isError(trained.size));
+
+    return trained;
+}
+
+
+/*! loadFiles() :
+ *  Loads `buffer`, with content from files listed within `fileNamesTable`.
+ *  Fills `buffer` entirely. Directories and files of unknown size are skipped,
+ *  and their `fileSizes` entry is set to 0.
+ *  I/O failures abort the program, including in builds where assert() is disabled.
+ * @return : 0 on success */
+static int loadFiles(void* buffer, size_t bufferSize,
+                     size_t* fileSizes,
+                     const char* const * fileNamesTable, unsigned nbFiles)
+{
+    size_t pos = 0, totalSize = 0;
+
+    for (unsigned n=0; n<nbFiles; n++) {
+        U64 fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        if (UTIL_isDirectory(fileNamesTable[n])
+          || (fileSize == UTIL_FILESIZE_UNKNOWN)) {
+            fileSizes[n] = 0;
+            continue;
+        }
+
+        FILE* const f = fopen(fileNamesTable[n], "rb");
+        CONTROL(f != NULL);   /* runtime condition : must be checked even with NDEBUG */
+
+        assert(pos <= bufferSize);
+        CONTROL(fileSize <= bufferSize - pos);   /* never write beyond end of buffer */
+
+        {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
+            CONTROL(readSize == fileSize);   /* incomplete read */
+            pos += readSize;
+        }
+        fileSizes[n] = (size_t)fileSize;
+        totalSize += (size_t)fileSize;
+        fclose(f);
+    }
+
+    /* loaded files are expected to fill the buffer exactly */
+    assert(totalSize == bufferSize);
+    return 0;
+}
+
+
+
+/*---  slice_collection_t  ---*/
+
+typedef struct {
+    void** slicePtrs;
+    size_t* capacities;
+    size_t nbSlices;
+} slice_collection_t;
+
+static const slice_collection_t kNullCollection = { NULL, NULL, 0 };
+
+static void freeSliceCollection(slice_collection_t collection)
+{
+    free(collection.slicePtrs);
+    free(collection.capacities);
+}
+
+/* shrinkSizes() :
+ * overwrites each slice capacity of collection with its `newSizes` counterpart.
+ * no `newSizes` entry may be larger than the capacity it replaces */
+void shrinkSizes(slice_collection_t collection,
+                 const size_t* newSizes)  /* presumed same size as collection */
+{
+    size_t* const caps = collection.capacities;
+    for (size_t n = collection.nbSlices; n-- > 0; ) {
+        assert(newSizes[n] <= caps[n]);
+        caps[n] = newSizes[n];
+    }
+}
+
+
+/* splitSlices() :
+ * nbSlices : if == 0, nbSlices is automatically determined from srcSlices and blockSize.
+ *            otherwise, creates exactly nbSlices slices,
+ *            by either truncating input (when smaller)
+ *            or repeating input from beginning.
+ * @return : kNullCollection if source contains no data, or if allocation fails */
+static slice_collection_t
+splitSlices(slice_collection_t srcSlices, size_t blockSize, size_t nbSlices)
+{
+    if (blockSize==0) blockSize = (size_t)(-1);   /* means "do not cut" */
+    /* count blocks : ceil(capacity / blockSize) per slice, empty slices produce none */
+    size_t nbSrcBlocks = 0;
+    for (size_t ssnb=0; ssnb < srcSlices.nbSlices; ssnb++) {
+        size_t const capacity = srcSlices.capacities[ssnb];
+        nbSrcBlocks += (capacity / blockSize) + ((capacity % blockSize) != 0);
+    }
+    /* nothing to slice : this also guarantees that the filling loop below terminates */
+    if (nbSrcBlocks == 0) return kNullCollection;
+
+    if (nbSlices == 0) nbSlices = nbSrcBlocks;
+
+    void** const sliceTable = (void**)malloc(nbSlices * sizeof(*sliceTable));
+    size_t* const capacities = (size_t*)malloc(nbSlices * sizeof(*capacities));
+    if (sliceTable == NULL || capacities == NULL) {
+        free(sliceTable);
+        free(capacities);
+        return kNullCollection;
+    }
+
+    /* start from first source slice, wrap around when more slices are requested */
+    size_t sliceNb = 0;
+    for (size_t ssnb = 0; sliceNb < nbSlices; ssnb = (ssnb + 1) % srcSlices.nbSlices) {
+        size_t const capacity = srcSlices.capacities[ssnb];
+        char* const ptr = (char*)srcSlices.slicePtrs[ssnb];
+        for (size_t pos = 0; pos < capacity && sliceNb < nbSlices; sliceNb++) {
+            size_t const size = MIN(blockSize, capacity - pos);
+            sliceTable[sliceNb] = ptr + pos;
+            capacities[sliceNb] = size;
+            pos += size;   /* advancing by `size` cannot overflow, unlike `blockSize` */
+        }
+    }
+
+    slice_collection_t result;
+    result.nbSlices = nbSlices;
+    result.slicePtrs = sliceTable;
+    result.capacities = capacities;
+    return result;
+}
+
+
+static size_t sliceCollection_totalCapacity(slice_collection_t sc)
+{
+    size_t total = 0;   /* sum of the capacities of all slices */
+    size_t left = sc.nbSlices;
+    while (left > 0) total += sc.capacities[--left];
+    return total;
+}
+
+
+/* ---  buffer collection  --- */
+
+typedef struct {
+    buffer_t buffer;
+    slice_collection_t slices;
+} buffer_collection_t;
+
+
+static void freeBufferCollection(buffer_collection_t bc)
+{
+    freeBuffer(bc.buffer);
+    freeSliceCollection(bc.slices);
+}
+
+
+/* createBufferCollection_fromSliceCollectionSizes() :
+ * allocates one buffer able to hold all slices of `sc`,
+ * and divides it into contiguous slices of same capacities as those of `sc`.
+ * aborts (CONTROL) if any allocation fails */
+static buffer_collection_t
+createBufferCollection_fromSliceCollectionSizes(slice_collection_t sc)
+{
+    size_t const nbSlices = sc.nbSlices;
+    buffer_collection_t bc;
+
+    bc.buffer = createBuffer(sliceCollection_totalCapacity(sc));
+    CONTROL(bc.buffer.ptr != NULL);
+
+    bc.slices.nbSlices = nbSlices;
+    bc.slices.slicePtrs = (void**)malloc(nbSlices * sizeof(*bc.slices.slicePtrs));
+    CONTROL(bc.slices.slicePtrs != NULL);
+
+    bc.slices.capacities = (size_t*)malloc(nbSlices * sizeof(*bc.slices.capacities));
+    CONTROL(bc.slices.capacities != NULL);
+
+    /* lay out slices back to back within the buffer */
+    char* cursor = (char*)bc.buffer.ptr;
+    for (size_t n=0; n < nbSlices; n++) {
+        bc.slices.capacities[n] = sc.capacities[n];
+        bc.slices.slicePtrs[n] = cursor;
+        cursor += sc.capacities[n];
+    }
+
+    return bc;
+}
+
+
+/* createBufferCollection_fromFiles() :
+ * loads all files into one buffer, and references each file as one slice, in table order.
+ * aborts (CONTROL) on unusable input or allocation failure */
+static buffer_collection_t
+createBufferCollection_fromFiles(const char* const * fileNamesTable, unsigned nbFiles)
+{
+    U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
+    CONTROL(totalSizeToLoad != UTIL_FILESIZE_UNKNOWN);
+    CONTROL(totalSizeToLoad <= BENCH_SIZE_MAX);
+    size_t const loadedSize = (size_t)totalSizeToLoad;
+    CONTROL(loadedSize > 0);
+    void* const srcBuffer = malloc(loadedSize);
+    CONTROL(srcBuffer != NULL);
+
+    CONTROL(nbFiles > 0);
+    size_t* const fileSizes = (size_t*)calloc(nbFiles, sizeof(*fileSizes));
+    CONTROL(fileSizes != NULL);
+
+    /* Load input buffer */
+    int const errorCode = loadFiles(srcBuffer, loadedSize,
+                                    fileSizes,
+                                    fileNamesTable, nbFiles);
+    CONTROL(errorCode == 0);
+
+    void** sliceTable = (void**)malloc(nbFiles * sizeof(*sliceTable));
+    CONTROL(sliceTable != NULL);
+
+    /* every file gets a slice : trailing files can be empty, so do not stop at pos == loadedSize */
+    char* const ptr = (char*)srcBuffer;
+    size_t pos = 0;
+    for (unsigned fileNb = 0; fileNb < nbFiles; fileNb++) {
+        sliceTable[fileNb] = ptr + pos;
+        pos += fileSizes[fileNb];
+    }
+    assert(pos == loadedSize);
+
+    buffer_t buffer;
+    buffer.ptr = srcBuffer;
+    buffer.capacity = loadedSize;
+    buffer.size = loadedSize;
+
+    slice_collection_t slices;
+    slices.slicePtrs = sliceTable;
+    slices.capacities = fileSizes;
+    slices.nbSlices = nbFiles;
+
+    buffer_collection_t bc;
+    bc.buffer = buffer;
+    bc.slices = slices;
+    return bc;
+}
+
+
+
+
+/*---  ddict_collection_t  ---*/
+
+typedef struct {
+    ZSTD_DDict** ddicts;
+    size_t nbDDict;
+} ddict_collection_t;
+
+static const ddict_collection_t kNullDDictCollection = { NULL, 0 };
+
+static void freeDDictCollection(ddict_collection_t ddictc)
+{
+    /* release every DDict first, then the table which references them */
+    size_t remaining = ddictc.nbDDict;
+    while (remaining > 0) ZSTD_freeDDict(ddictc.ddicts[--remaining]);
+    free(ddictc.ddicts);
+}
+
+/* returns .ddicts=NULL if operation fails */
+static ddict_collection_t createDDictCollection(const void* dictBuffer, size_t dictSize, size_t nbDDict)
+{
+    ZSTD_DDict** const ddicts = malloc(nbDDict * sizeof(ZSTD_DDict*));
+    if (ddicts==NULL) return kNullDDictCollection;
+    for (size_t dictNb=0; dictNb < nbDDict; dictNb++) {
+        ddicts[dictNb] = ZSTD_createDDict(dictBuffer, dictSize);
+        if (ddicts[dictNb] == NULL) {   /* release the partially built collection */
+            while (dictNb > 0) ZSTD_freeDDict(ddicts[--dictNb]);
+            free(ddicts);
+            return kNullDDictCollection;
+        }
+    }
+    return (ddict_collection_t){ ddicts, nbDDict };
+}
+
+
+/* mess with addresses, so that linear scanning dictionaries != linear address scanning */
+void shuffleDictionaries(ddict_collection_t dicts)
+{
+    size_t const nbDicts = dicts.nbDDict;
+    ZSTD_DDict** const table = dicts.ddicts;
+    for (size_t r=0; r<nbDicts; r++) {   /* pass 1 : swap each entry with a random one */
+        size_t const d = rand() % nbDicts;
+        ZSTD_DDict* const saved = table[r];
+        table[r] = table[d];
+        table[d] = saved;
+    }
+    for (size_t r=0; r<nbDicts; r++) {   /* pass 2 : swap nbDicts random pairs */
+        size_t const d1 = rand() % nbDicts, d2 = rand() % nbDicts;
+        ZSTD_DDict* const saved = table[d2];
+        table[d2] = table[d1];
+        table[d1] = saved;
+    }
+}
+
+
+/* ---   Compression  --- */
+
+/* compressBlocks() :
+ * compresses each source block into its destination slice, using `cdict` when provided.
+ * @return : total compressed size of all blocks (0 when there is no block).
+ *        aborts (CONTROL) if context creation or any compression fails */
+static size_t compressBlocks(size_t* cSizes,   /* optional (can be NULL). If present, must contain at least nbBlocks fields */
+                             slice_collection_t dstBlockBuffers,
+                             slice_collection_t srcBlockBuffers,
+                             ZSTD_CDict* cdict, int cLevel)
+{
+    size_t const nbBlocks = srcBlockBuffers.nbSlices;
+    assert(dstBlockBuffers.nbSlices == srcBlockBuffers.nbSlices);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    CONTROL(cctx != NULL);
+    size_t totalCSize = 0;
+    for (size_t blockNb=0; blockNb < nbBlocks; blockNb++) {
+        size_t cBlockSize;
+        if (cdict == NULL) {
+            cBlockSize = ZSTD_compressCCtx(cctx,
+                            dstBlockBuffers.slicePtrs[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.slicePtrs[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cLevel);
+        } else {
+            cBlockSize = ZSTD_compress_usingCDict(cctx,
+                            dstBlockBuffers.slicePtrs[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.slicePtrs[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cdict);
+        }
+        CONTROL(!ZSTD_isError(cBlockSize));
+        if (cSizes) cSizes[blockNb] = cBlockSize;
+        totalCSize += cBlockSize;
+    }
+    ZSTD_freeCCtx(cctx);   /* context is local to this function : release it before returning */
+    return totalCSize;
+}
+
+
+/* ---  Benchmark  --- */
+
+typedef struct {
+    ZSTD_DCtx* dctx;
+    size_t nbDicts;
+    size_t dictNb;
+    ddict_collection_t dictionaries;
+} decompressInstructions;
+
+decompressInstructions createDecompressInstructions(ddict_collection_t dictionaries)
+{
+    decompressInstructions di;
+    di.dctx = ZSTD_createDCtx();
+    assert(di.dctx != NULL);
+    di.nbDicts = dictionaries.nbDDict;
+    di.dictNb = 0;
+    di.dictionaries = dictionaries;
+    return di;
+}
+
+void freeDecompressInstructions(decompressInstructions di)
+{
+    ZSTD_freeDCtx(di.dctx);
+}
+
+/* benched function :
+ * decompresses one block, using the dictionaries of the collection in rotation */
+size_t decompress(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* payload)
+{
+    decompressInstructions* const di = (decompressInstructions*) payload;
+    ZSTD_DDict* const ddict = di->dictionaries.ddicts[di->dictNb];
+
+    /* following call will use next dictionary, wrapping around after last one */
+    di->dictNb = di->dictNb + 1;
+    if (di->dictNb >= di->nbDicts) di->dictNb = 0;
+
+    return ZSTD_decompress_usingDDict(di->dctx,
+                                      dst, dstCapacity,
+                                      src, srcSize, ddict);
+}
+
+
+static int benchMem(slice_collection_t dstBlocks,   /* benchMem() : times decompress() over all blocks, displays best speed */
+                    slice_collection_t srcBlocks,   /* slices given as source to the benched function */
+                    ddict_collection_t dictionaries,   /* dictionaries used by decompress(), in rotation */
+                    int nbRounds)   /* total time budget is nbRounds * RUN_TIME_DEFAULT_MS */
+{
+    assert(dstBlocks.nbSlices == srcBlocks.nbSlices);
+
+    unsigned const ms_per_round = RUN_TIME_DEFAULT_MS;
+    unsigned const total_time_ms = nbRounds * ms_per_round;
+
+    double bestSpeed = 0.;
+
+    BMK_timedFnState_t* const benchState =
+            BMK_createTimedFnState(total_time_ms, ms_per_round);
+    decompressInstructions di = createDecompressInstructions(dictionaries);
+
+    for (;;) {   /* one timed run per iteration, until benchState reports completion */
+        BMK_runOutcome_t const outcome = BMK_benchTimedFn(benchState,
+                                decompress, &di,
+                                NULL, NULL,
+                                dstBlocks.nbSlices,
+                                (const void* const *)srcBlocks.slicePtrs, srcBlocks.capacities,
+                                dstBlocks.slicePtrs, dstBlocks.capacities,
+                                NULL);
+        CONTROL(BMK_isSuccessful_runOutcome(outcome));   /* abort if the run failed */
+
+        BMK_runTime_t const result = BMK_extract_runTime(outcome);
+        U64 const dTime_ns = result.nanoSecPerRun;
+        double const dTime_sec = (double)dTime_ns / 1000000000;
+        size_t const srcSize = result.sumOfReturn;   /* sum of decompress() return values, used as amount of data processed */
+        double const dSpeed_MBps = (double)srcSize / dTime_sec / (1 MB);
+        if (dSpeed_MBps > bestSpeed) bestSpeed = dSpeed_MBps;   /* only the best speed so far is reported */
+        DISPLAY("Decompression Speed : %.1f MB/s \r", bestSpeed);   /* \r : next display overwrites the same line */
+        fflush(stdout);
+        if (BMK_isCompleted_TimedFn(benchState)) break;
+    }
+    DISPLAY("\n");
+
+    freeDecompressInstructions(di);
+    BMK_freeTimedFnState(benchState);
+
+    return 0;   /* success */
+}
+
+
+/*! bench() :
+ *  fileNameTable : names of the `nbFiles` files to load for benchmarking purpose
+ *  dictionary : optional (can be NULL), file to load as dictionary,
+ *              if none provided : will be calculated on the fly by the program.
+ * @return : 0 if success, 1+ otherwise */
+int bench(const char** fileNameTable, unsigned nbFiles,
+          const char* dictionary,
+          size_t blockSize, int clevel,
+          unsigned nbDictMax, unsigned nbBlocks,
+          int nbRounds)
+{
+    int result = 0;
+
+    DISPLAYLEVEL(3, "loading %u files... \n", nbFiles);
+    buffer_collection_t const srcs = createBufferCollection_fromFiles(fileNameTable, nbFiles);
+    CONTROL(srcs.buffer.ptr != NULL);
+    buffer_t srcBuffer = srcs.buffer;
+    size_t const srcSize = srcBuffer.size;
+    DISPLAYLEVEL(3, "created src buffer of size %.1f MB \n",
+                    (double)srcSize / (1 MB));
+
+    slice_collection_t const srcSlices = splitSlices(srcs.slices, blockSize, nbBlocks);
+    nbBlocks = (unsigned)(srcSlices.nbSlices);
+    DISPLAYLEVEL(3, "split input into %u blocks ", nbBlocks);
+    if (blockSize)
+        DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
+    DISPLAYLEVEL(3, "\n");
+
+
+    size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
+    CONTROL(dstCapacities != NULL);
+    size_t dstBufferCapacity = 0;
+    for (size_t bnb=0; bnb<nbBlocks; bnb++) {
+        dstCapacities[bnb] = ZSTD_compressBound(srcSlices.capacities[bnb]);
+        dstBufferCapacity += dstCapacities[bnb];
+    }
+
+    buffer_t dstBuffer = createBuffer(dstBufferCapacity);
+    CONTROL(dstBuffer.ptr != NULL);
+
+    void** const sliceTable = malloc(nbBlocks * sizeof(*sliceTable));
+    CONTROL(sliceTable != NULL);
+
+    {   char* const ptr = dstBuffer.ptr;
+        size_t pos = 0;
+        for (size_t snb=0; snb < nbBlocks; snb++) {
+            sliceTable[snb] = ptr + pos;
+            pos += dstCapacities[snb];
+    }   }
+
+    slice_collection_t dstSlices;
+    dstSlices.capacities = dstCapacities;
+    dstSlices.slicePtrs = sliceTable;
+    dstSlices.nbSlices = nbBlocks;
+
+
+    /* dictionary determination */
+    buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
+                                srcBuffer.ptr,
+                                srcSlices.capacities, nbBlocks,
+                                DICTSIZE);
+    CONTROL(dictBuffer.ptr != NULL);
+
+    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer.ptr, dictBuffer.size, clevel);
+    CONTROL(cdict != NULL);
+
+    size_t const cTotalSizeNoDict = compressBlocks(NULL, dstSlices, srcSlices, NULL, clevel);
+    CONTROL(cTotalSizeNoDict != 0);
+    DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f  (%u bytes) \n",
+                    clevel,
+                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+
+    size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
+    CONTROL(cSizes != NULL);
+
+    size_t const cTotalSize = compressBlocks(cSizes, dstSlices, srcSlices, cdict, clevel);
+    CONTROL(cTotalSize != 0);
+    DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f  (%u bytes) \n",
+                    (unsigned)dictBuffer.size,
+                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+
+    /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
+    shrinkSizes(dstSlices, cSizes);
+
+    size_t const dictMem = ZSTD_estimateDDictSize(dictBuffer.size, ZSTD_dlm_byCopy);
+    unsigned const nbDicts = nbDictMax ? nbDictMax : nbBlocks;
+    size_t const allDictMem = dictMem * nbDicts;
+    DISPLAYLEVEL(3, "generating %u dictionaries, using %.1f MB of memory \n",
+                    nbDicts, (double)allDictMem / (1 MB));
+
+    ddict_collection_t const dictionaries = createDDictCollection(dictBuffer.ptr, dictBuffer.size, nbDicts);
+    CONTROL(dictionaries.ddicts != NULL);
+
+    shuffleDictionaries(dictionaries);
+
+    buffer_collection_t resultCollection = createBufferCollection_fromSliceCollectionSizes(srcSlices);
+    CONTROL(resultCollection.buffer.ptr != NULL);
+
+    result = benchMem(resultCollection.slices, dstSlices, dictionaries, nbRounds);
+
+    /* free all heap objects in reverse order */
+    freeBufferCollection(resultCollection);
+    freeDDictCollection(dictionaries);
+    free(cSizes);
+    ZSTD_freeCDict(cdict);
+    freeBuffer(dictBuffer);
+    freeSliceCollection(dstSlices);
+    freeBuffer(dstBuffer);
+    freeSliceCollection(srcSlices);
+    freeBufferCollection(srcs);
+
+    return result;
+}
+
+
+
+/* ---  Command Line  --- */
+
+/*! readU32FromChar() :
+ * @return : unsigned integer value read from input in `char` format (0 if no digit is present).
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ *  Note : function will exit() program if value overflows; checks are explicit (not assert()), so they remain active with NDEBUG */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;   /* conservative : below this bound, result*10 + 9 cannot wrap */
+        if (result > max) { DISPLAY("error : numeric value too large \n"); exit(1); }
+        result = (result * 10) + (unsigned)(**stringPtr - '0');
+        (*stringPtr)++;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;   /* largest value that can still be multiplied by 1024 */
+        int const isMega = (**stringPtr=='M');           /* `M` is applied as `K` twice */
+        for (int round = 0; round <= isMega; round++) {
+            if (result > maxK) { DISPLAY("error : numeric value too large \n"); exit(1); }
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;   /* optional `i` of KiB / MiB */
+        if (**stringPtr=='B') (*stringPtr)++;   /* optional `B` */
+    }
+    return result;
+}
+
+/** longCommandWArg() :
+ *  tests whether *stringPtr begins with longCommand (prefix comparison).
+ *  On match, moves *stringPtr just past longCommand and @return 1.
+ *  Otherwise leaves *stringPtr untouched and @return 0.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const prefixLen = strlen(longCommand);
+    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) return 0;   /* not this command */
+    *stringPtr += prefixLen;   /* caller continues parsing right after the command name */
+    return 1;
+}
+
+
+int usage(const char* exeName)
+{   /* prints the whole help text in one formatted call; @return : 0, so callers can `return usage(...)` */
+    DISPLAY (" \n"
+             " %s [Options] filename(s) \n"
+             " \n"
+             "Options : \n"
+             "-r          : recursively load all files in subdirectories (default: off) \n"
+             "-B#         : split input into blocks of size # (default: no split) \n"
+             "-#          : use compression level # (default: %u) \n"
+             "-D #        : use # as a dictionary (default: create one) \n"
+             "-i#         : nb benchmark rounds (default: %u) \n"
+             "--nbBlocks=#: use # blocks for bench (default: one per file) \n"
+             "--nbDicts=# : create # dictionaries for bench (default: one per block) \n"
+             "-h          : help (this text) \n", exeName, CLEVEL_DEFAULT, BENCH_TIME_DEFAULT_S);
+    return 0;
+}
+
+int bad_usage(const char* exeName)
+{   /* error banner followed by the regular help text; @return : 1, used by main() as failure status */
+    DISPLAY (" bad usage : \n");
+    (void)usage(exeName);   /* usage()'s own return value (0) is irrelevant here */
+    return 1;
+}
+
+int main (int argc, const char** argv)
+{   /* parses the command line, gathers file names, runs bench(); @return : bench() result, 0 after -h, 1 on usage or allocation error */
+    int recursiveMode = 0;
+    int nbRounds = BENCH_TIME_DEFAULT_S;
+    const char* const exeName = argv[0];
+
+    if (argc < 2) return bad_usage(exeName);
+    /* each argument yields at most one file name, so argc slots are always enough */
+    const char** nameTable = (const char**)malloc(argc * sizeof(const char*));
+    if (nameTable == NULL) { DISPLAY(" error : not enough memory \n"); return 1; }   /* explicit check : assert() vanishes with NDEBUG */
+    unsigned nameIdx = 0;
+
+    const char* dictionary = NULL;
+    int cLevel = CLEVEL_DEFAULT;
+    size_t blockSize = BLOCKSIZE_DEFAULT;
+    unsigned nbDicts = 0;  /* determine nbDicts automatically: 1 dictionary per block */
+    unsigned nbBlocks = 0; /* determine nbBlocks automatically, from source and blockSize */
+    /* tests are ordered : exact matches, then named prefixes, then the bare "-" prefix (compression level) */
+    for (int argNb = 1; argNb < argc ; argNb++) {
+        const char* argument = argv[argNb];
+        if (!strcmp(argument, "-h")) { free(nameTable); return usage(exeName); }
+        if (!strcmp(argument, "-r")) { recursiveMode = 1; continue; }
+        if (!strcmp(argument, "-D")) { argNb++; if (argNb >= argc) { free(nameTable); return bad_usage(exeName); } dictionary = argv[argNb]; continue; }   /* "-D" requires a following argument */
+        if (longCommandWArg(&argument, "-i")) { nbRounds = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--dictionary=")) { dictionary = argument; continue; }
+        if (longCommandWArg(&argument, "-B")) { blockSize = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--blockSize=")) { blockSize = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--nbDicts=")) { nbDicts = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--nbBlocks=")) { nbBlocks = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--clevel=")) { cLevel = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "-")) { cLevel = readU32FromChar(&argument); continue; }   /* NOTE : matches any remaining argument starting with '-' */
+        /* anything that's not a command is a filename */
+        nameTable[nameIdx++] = argument;
+    }
+
+    const char** filenameTable = nameTable;
+    unsigned nbFiles = nameIdx;
+    char* buffer_containing_filenames = NULL;
+
+    if (recursiveMode) {   /* -r : the file list is produced by UTIL_createFileList() from the names given on the command line */
+#ifndef UTIL_HAS_CREATEFILELIST
+        assert(0);   /* missing capability, do not run */
+#endif
+        filenameTable = UTIL_createFileList(nameTable, nameIdx, &buffer_containing_filenames, &nbFiles, 1 /* follow_links */);
+    }
+
+    int result = bench(filenameTable, nbFiles, dictionary, blockSize, cLevel, nbDicts, nbBlocks, nbRounds);
+    /* NOTE(review): the table returned by UTIL_createFileList() is neither NULL-checked nor released here - confirm its contract */
+    free(buffer_containing_filenames);
+    free(nameTable);
+
+    return result;
+}
diff --git a/contrib/meson/meson.build b/contrib/meson/meson.build
index 079c045..98c9b02 100644
--- a/contrib/meson/meson.build
+++ b/contrib/meson/meson.build
@@ -18,6 +18,7 @@
     join_paths(common_dir, 'error_private.c'),
     join_paths(common_dir, 'xxhash.c'),
     join_paths(compress_dir, 'fse_compress.c'),
+    join_paths(compress_dir, 'hist.c'),
     join_paths(compress_dir, 'huf_compress.c'),
     join_paths(compress_dir, 'zstd_compress.c'),
     join_paths(compress_dir, 'zstd_fast.c'),
@@ -130,6 +131,7 @@
 if target_machine.system() != 'windows'
     paramgrill = executable('paramgrill',
                             datagen_c, join_paths(tests_dir, 'paramgrill.c'),
+                            join_paths(programs_dir, 'bench.c'),
                             include_directories: test_includes,
                             link_with: libzstd,
                             dependencies: libm)
diff --git a/contrib/premake/premake4.lua b/contrib/premake/premake4.lua
new file mode 100644
index 0000000..6675e2e
--- /dev/null
+++ b/contrib/premake/premake4.lua
@@ -0,0 +1,6 @@
+-- Example GENie/premake4 script : include zstd.lua (which exposes a project_zstd function), then add the zstd project to a solution
+dofile('zstd.lua')
+
+solution 'example'
+	configurations { 'Debug', 'Release' }
+	project_zstd('../../lib/')	-- only the library directory is passed; it ends with '/' because zstd.lua appends sub-paths to it directly
diff --git a/contrib/premake/zstd.lua b/contrib/premake/zstd.lua
new file mode 100644
index 0000000..df1ace3
--- /dev/null
+++ b/contrib/premake/zstd.lua
@@ -0,0 +1,80 @@
+-- This GENie/premake file copies the behavior of the Makefile in the lib folder.
+-- Basic usage: project_zstd(ZSTD_DIR) ; ZSTD_DIR must end with '/', since sub-paths are appended to it directly.
+-- All other arguments are optional : passing nil (or nothing) selects the default set below.
+function project_zstd(dir, compression, decompression, deprecated, dictbuilder, legacy)
+	if compression == nil then compression = true end       -- add compress/ sources
+	if decompression == nil then decompression = true end   -- add decompress/ sources
+	if deprecated == nil then deprecated = false end        -- add deprecated/ sources
+	if dictbuilder == nil then dictbuilder = false end      -- add dictBuilder/ sources
+
+	if legacy == nil then legacy = 0 end                    -- value given to ZSTD_LEGACY_SUPPORT ; 0 adds no legacy sources
+
+	if not compression then   -- dictbuilder and deprecated are only built together with compression
+		dictbuilder = false
+		deprecated = false
+	end
+
+	if not decompression then   -- legacy and deprecated are only built together with decompression
+		legacy = 0
+		deprecated = false
+	end
+
+	project 'zstd'
+		kind 'StaticLib'
+		language 'C'
+
+		files {
+			dir .. 'zstd.h',
+			dir .. 'common/**.c',
+			dir .. 'common/**.h'
+		}
+
+		if compression then
+			files {
+				dir .. 'compress/**.c',
+				dir .. 'compress/**.h'
+			}
+		end
+
+		if decompression then
+			files {
+				dir .. 'decompress/**.c',
+				dir .. 'decompress/**.h'
+			}
+		end
+
+		if dictbuilder then
+			files {
+				dir .. 'dictBuilder/**.c',
+				dir .. 'dictBuilder/**.h'
+			}
+		end
+
+		if deprecated then
+			files {
+				dir .. 'deprecated/**.c',
+				dir .. 'deprecated/**.h'
+			}
+		end
+
+		if legacy ~= 0 then
+			-- ZSTD_LEGACY_SUPPORT=N enables decoding of legacy formats v0.N and later,
+			-- so sources zstd_v0N .. zstd_v07 are required (none when N >= 8), as in the lib Makefile
+			for version = legacy, 7 do
+				files { dir .. 'legacy/zstd_v0' .. version .. '.*' }
+			end
+			includedirs {
+				dir .. 'legacy'
+			}
+		end
+
+		includedirs {
+			dir,
+			dir .. 'common'
+		}
+
+		defines {
+			'XXH_NAMESPACE=ZSTD_',
+			'ZSTD_LEGACY_SUPPORT=' .. legacy
+		}
+end
diff --git a/contrib/pzstd/Options.cpp b/contrib/pzstd/Options.cpp
index 1590d85..2123f88 100644
--- a/contrib/pzstd/Options.cpp
+++ b/contrib/pzstd/Options.cpp
@@ -18,17 +18,6 @@
 #include <thread>
 #include <vector>
 
-#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) ||     \
-    defined(__CYGWIN__)
-#include <io.h> /* _isatty */
-#define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
-#elif defined(_POSIX_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE) || (defined(__APPLE__) && defined(__MACH__)) || \
-      defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)  /* https://sourceforge.net/p/predef/wiki/OperatingSystems/ */
-#include <unistd.h> /* isatty */
-#define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
-#else
-#define IS_CONSOLE(stdStream) 0
-#endif
 
 namespace pzstd {
 
diff --git a/contrib/pzstd/Pzstd.cpp b/contrib/pzstd/Pzstd.cpp
index 1eb4ce1..6c580b3 100644
--- a/contrib/pzstd/Pzstd.cpp
+++ b/contrib/pzstd/Pzstd.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
  */
+#include "platform.h"   /* Large Files support, SET_BINARY_MODE */
 #include "Pzstd.h"
 #include "SkippableFrame.h"
 #include "utils/FileSystem.h"
@@ -21,14 +22,6 @@
 #include <memory>
 #include <string>
 
-#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
-#  include <fcntl.h>    /* _O_BINARY */
-#  include <io.h>       /* _setmode, _isatty */
-#  define SET_BINARY_MODE(file) { if (_setmode(_fileno(file), _O_BINARY) == -1) perror("Cannot set _O_BINARY"); }
-#else
-#  include <unistd.h>   /* isatty */
-#  define SET_BINARY_MODE(file)
-#endif
 
 namespace pzstd {
 
diff --git a/contrib/seekable_format/examples/seekable_compression.c b/contrib/seekable_format/examples/seekable_compression.c
index 9485bf2..9a331a8 100644
--- a/contrib/seekable_format/examples/seekable_compression.c
+++ b/contrib/seekable_format/examples/seekable_compression.c
@@ -101,7 +101,7 @@
     free(buffOut);
 }
 
-static const char* createOutFilename_orDie(const char* filename)
+static char* createOutFilename_orDie(const char* filename)
 {
     size_t const inL = strlen(filename);
     size_t const outL = inL + 5;
@@ -109,7 +109,7 @@
     memset(outSpace, 0, outL);
     strcat(outSpace, filename);
     strcat(outSpace, ".zst");
-    return (const char*)outSpace;
+    return (char*)outSpace;
 }
 
 int main(int argc, const char** argv) {
@@ -124,8 +124,9 @@
     {   const char* const inFileName = argv[1];
         unsigned const frameSize = (unsigned)atoi(argv[2]);
 
-        const char* const outFileName = createOutFilename_orDie(inFileName);
+        char* const outFileName = createOutFilename_orDie(inFileName);
         compressFile_orDie(inFileName, outFileName, 5, frameSize);
+        free(outFileName);
     }
 
     return 0;
diff --git a/contrib/seekable_format/examples/seekable_decompression.c b/contrib/seekable_format/examples/seekable_decompression.c
index 9cd2329..7050e0f 100644
--- a/contrib/seekable_format/examples/seekable_decompression.c
+++ b/contrib/seekable_format/examples/seekable_decompression.c
@@ -84,7 +84,7 @@
 }
 
 
-static void decompressFile_orDie(const char* fname, unsigned startOffset, unsigned endOffset)
+static void decompressFile_orDie(const char* fname, off_t startOffset, off_t endOffset)
 {
     FILE* const fin  = fopen_orDie(fname, "rb");
     FILE* const fout = stdout;
@@ -129,8 +129,8 @@
 
     {
         const char* const inFilename = argv[1];
-        unsigned const startOffset = (unsigned) atoi(argv[2]);
-        unsigned const endOffset = (unsigned) atoi(argv[3]);
+        off_t const startOffset = atoll(argv[2]);
+        off_t const endOffset = atoll(argv[3]);
         decompressFile_orDie(inFilename, startOffset, endOffset);
     }
 
diff --git a/contrib/seekable_format/zstdseek_decompress.c b/contrib/seekable_format/zstdseek_decompress.c
index b006ff8..b4c4875 100644
--- a/contrib/seekable_format/zstdseek_decompress.c
+++ b/contrib/seekable_format/zstdseek_decompress.c
@@ -56,6 +56,7 @@
 
 #include <stdlib.h> /* malloc, free */
 #include <stdio.h>  /* FILE* */
+#include <assert.h>
 
 #define XXH_STATIC_LINKING_ONLY
 #define XXH_NAMESPACE ZSTD_
@@ -112,7 +113,7 @@
 
 static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
 {
-    buffWrapper_t* buff = (buffWrapper_t*) opaque;
+    buffWrapper_t* const buff = (buffWrapper_t*) opaque;
     unsigned long long newOffset;
     switch (origin) {
     case SEEK_SET:
@@ -124,6 +125,8 @@
     case SEEK_END:
         newOffset = (unsigned long long)buff->size - offset;
         break;
+    default:
+        assert(0);  /* not possible */
     }
     if (newOffset > buff->size) {
         return -1;
@@ -310,8 +313,8 @@
             /* compute cumulative positions */
             for (; idx < numFrames; idx++) {
                 if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) {
-                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
                     U32 const offset = SEEKABLE_BUFF_SIZE - pos;
+                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset);
                     memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */
                     CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead));
                     remaining -= toRead;
diff --git a/doc/images/cdict_v136.png b/doc/images/cdict_v136.png
new file mode 100644
index 0000000..4a6d456
--- /dev/null
+++ b/doc/images/cdict_v136.png
Binary files differ
diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md
index 62430e4..e562e62 100644
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@@ -16,7 +16,7 @@
 
 ### Version
 
-0.2.8 (30/05/18)
+0.3.0 (25/09/18)
 
 
 Introduction
@@ -72,7 +72,7 @@
 and a set of parameters which tells the decoder how to decompress it.
 
 A frame encapsulates one or multiple __blocks__.
-Each block can be compressed or not,
+Each block contains arbitrary content, which is described by its header,
 and has a guaranteed maximum content size, which depends on frame parameters.
 Unlike frames, each block depends on previous blocks for proper decoding.
 However, each block can be decompressed without waiting for its successor,
@@ -488,20 +488,20 @@
 __`Size_Format` for `Raw_Literals_Block` and `RLE_Literals_Block`__ :
 
 `Size_Format` uses 1 _or_ 2 bits.
-Its value is : `Size_Format = (Header[0]>>2) & 3`
+Its value is : `Size_Format = (Literals_Section_Header[0]>>2) & 3`
 
 - `Size_Format` == 00 or 10 : `Size_Format` uses 1 bit.
                `Regenerated_Size` uses 5 bits (0-31).
                `Literals_Section_Header` uses 1 byte.
-               `Regenerated_Size = Header[0]>>3`
+               `Regenerated_Size = Literals_Section_Header[0]>>3`
 - `Size_Format` == 01 : `Size_Format` uses 2 bits.
                `Regenerated_Size` uses 12 bits (0-4095).
                `Literals_Section_Header` uses 2 bytes.
-               `Regenerated_Size = (Header[0]>>4) + (Header[1]<<4)`
+               `Regenerated_Size = (Literals_Section_Header[0]>>4) + (Literals_Section_Header[1]<<4)`
 - `Size_Format` == 11 : `Size_Format` uses 2 bits.
                `Regenerated_Size` uses 20 bits (0-1048575).
                `Literals_Section_Header` uses 3 bytes.
-               `Regenerated_Size = (Header[0]>>4) + (Header[1]<<4) + (Header[2]<<12)`
+               `Regenerated_Size = (Literals_Section_Header[0]>>4) + (Literals_Section_Header[1]<<4) + (Literals_Section_Header[2]<<12)`
 
 Only Stream1 is present for these cases.
 Note : it's allowed to represent a short value (for example `13`)
@@ -591,7 +591,7 @@
 A match copy command specifies an offset and a length.
 
 When all _sequences_ are decoded,
-if there are literals left in the _literal section_,
+if there are literals left in the _literals section_,
 these bytes are added at the end of the block.
 
 This is described in more detail in [Sequence Execution](#sequence-execution).
@@ -608,7 +608,7 @@
 | -------------------------- | ------------------------- | ---------------- | ---------------------- | --------- |
 
 To decode the `Sequences_Section`, it's required to know its size.
-This size is deduced from the literals section size:
+Its size is deduced from the size of `Literals_Section`:
 `Sequences_Section_Size = Block_Size - Literals_Section_Size`.
 
 
@@ -805,7 +805,7 @@
 
 ##### Decoding a sequence
 For each of the symbol types, the FSE state can be used to determine the appropriate code.
-The code then defines the baseline and number of bits to read for each type.
+The code then defines the `Baseline` and `Number_of_Bits` to read for each type.
 See the [description of the codes] for how to determine these values.
 
 [description of the codes]: #the-codes-for-literals-lengths-match-lengths-and-offsets
@@ -872,8 +872,8 @@
 
 Each sequence consists of a tuple of (`literals_length`, `offset_value`, `match_length`),
 decoded as described in the [Sequences Section](#sequences-section).
-To execute a sequence, first copy `literals_length` bytes from the literals section
-to the output.
+To execute a sequence, first copy `literals_length` bytes
+from the decoded literals to the output.
 
 Then `match_length` bytes are copied from previous decoded data.
 The offset to copy from is determined by `offset_value`:
@@ -1192,6 +1192,8 @@
 The last symbol's `Weight` is deduced from previously decoded ones,
 by completing to the nearest power of 2.
 This power of 2 gives `Max_Number_of_Bits`, the depth of the current tree.
+`Max_Number_of_Bits` must be <= 11,
+otherwise the representation is considered corrupted.
 
 __Example__ :
 Let's presume the following Huffman tree must be described :
@@ -1216,12 +1218,12 @@
 |   `Weight`    |  4  |  3  |  2  |  0  |  1  |
 
 The decoder will do the inverse operation :
-having collected weights of literals from `0` to `4`,
-it knows the last literal, `5`, is present with a non-zero weight.
-The weight of `5` can be determined by advancing to the next power of 2.
+having collected weights of literal symbols from `0` to `4`,
+it knows the last literal, `5`, is present with a non-zero `Weight`.
+The `Weight` of `5` can be determined by advancing to the next power of 2.
 The sum of `2^(Weight-1)` (excluding 0's) is :
 `8 + 4 + 2 + 0 + 1 = 15`.
-Nearest power of 2 is 16.
+The nearest larger power of 2 is 16.
 Therefore, `Max_Number_of_Bits = 4` and `Weight[5] = 16-15 = 1`.
 
 #### Huffman Tree header
@@ -1233,18 +1235,24 @@
   the series of weights is compressed using FSE (see below).
   The length of the FSE-compressed series is equal to `headerByte` (0-127).
 
-- if `headerByte` >= 128 : this is a direct representation,
-  where each `Weight` is written directly as a 4 bits field (0-15).
-  They are encoded forward, 2 weights to a byte with the first weight taking
-  the top four bits and the second taking the bottom four (e.g. the following
-  operations could be used to read the weights:
-  `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.).
-  The full representation occupies `Ceiling(Number_of_Symbols/2)` bytes,
-  meaning it uses only full bytes even if `Number_of_Symbols` is odd.
-  `Number_of_Symbols = headerByte - 127`.
-  Note that maximum `Number_of_Symbols` is 255-127 = 128.
-  If any literal has a value > 128, raw header mode is not possible.
-  In such case, it's necessary to use FSE compression.
+- if `headerByte` >= 128 :
+  + the series of weights uses a direct representation,
+    where each `Weight` is encoded directly as a 4 bits field (0-15).
+  + They are encoded forward, 2 weights to a byte,
+    first weight taking the top four bits and second one taking the bottom four.
+    * e.g. the following operations could be used to read the weights:
+      `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.
+  + The full representation occupies `Ceiling(Number_of_Weights/2)` bytes,
+    meaning it uses only full bytes even if `Number_of_Weights` is odd.
+  + `Number_of_Weights = headerByte - 127`.
+    * Note that maximum `Number_of_Weights` is 255-127 = 128,
+      therefore, only up to 128 `Weight` values can be encoded using direct representation.
+    * Since the last non-zero `Weight` is _not_ encoded,
+      this scheme is compatible with alphabet sizes of up to 129 symbols,
+      hence including literal symbol 128.
+    * If any literal symbol > 128 has a non-zero `Weight`,
+      direct representation is not possible.
+      In such case, it's necessary to use FSE compression.
 
 
 #### Finite State Entropy (FSE) compression of Huffman weights
@@ -1257,7 +1265,7 @@
 Compressed size is provided by `headerByte`.
 It's also necessary to know its _maximum possible_ decompressed size,
 which is `255`, since literal values span from `0` to `255`,
-and last symbol's weight is not represented.
+and last symbol's `Weight` is not represented.
 
 An FSE bitstream starts by a header, describing probabilities distribution.
 It will create a Decoding Table.
@@ -1267,7 +1275,7 @@
 The Huffman header compression uses 2 states,
 which share the same FSE distribution table.
 The first state (`State1`) encodes the even indexed symbols,
-and the second (`State2`) encodes the odd indexes.
+and the second (`State2`) encodes the odd indexed symbols.
 `State1` is initialized first, and then `State2`, and they take turns
 decoding a single symbol and updating their state.
 For more details on these FSE operations, see the [FSE section](#fse).
@@ -1288,7 +1296,7 @@
 Symbols are sorted by `Weight`.
 Within same `Weight`, symbols keep natural sequential order.
 Symbols with a `Weight` of zero are removed.
-Then, starting from lowest weight, prefix codes are distributed in sequential order.
+Then, starting from lowest `Weight`, prefix codes are distributed in sequential order.
 
 __Example__ :
 Let's presume the following list of weights has been decoded :
@@ -1315,7 +1323,7 @@
 that is starting from the end down to the beginning.
 Therefore it's necessary to know the size of each bitstream.
 
-It's also necessary to know exactly which _bit_ is the latest.
+It's also necessary to know exactly which _bit_ is the last one.
 This is detected by a final bit flag :
 the highest bit of latest byte is a final-bit-flag.
 Consequently, a last byte of `0` is not possible.
@@ -1621,6 +1629,8 @@
 
 Version changes
 ---------------
+- 0.3.0 : minor edits to match RFC8478
+- 0.2.9 : clarifications for huffman weights direct representation, by Ulrich Kunitz
 - 0.2.8 : clarifications for IETF RFC discuss
 - 0.2.7 : clarifications from IETF RFC review, by Vijay Gurbani and Nick Terrell
 - 0.2.6 : fixed an error in huffman example, by Ulrich Kunitz
diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html
index bd79200..4a8985f 100644
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@@ -1,10 +1,10 @@
 <html>
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
-<title>zstd 1.3.5 Manual</title>
+<title>zstd 1.3.6 Manual</title>
 </head>
 <body>
-<h1>zstd 1.3.5 Manual</h1>
+<h1>zstd 1.3.6 Manual</h1>
 <hr>
 <a name="Contents"></a><h2>Contents</h2>
 <ol>
@@ -18,39 +18,48 @@
 <li><a href="#Chapter8">Streaming</a></li>
 <li><a href="#Chapter9">Streaming compression - HowTo</a></li>
 <li><a href="#Chapter10">Streaming decompression - HowTo</a></li>
-<li><a href="#Chapter11">START OF ADVANCED AND EXPERIMENTAL FUNCTIONS</a></li>
-<li><a href="#Chapter12">Advanced types</a></li>
-<li><a href="#Chapter13">Frame size functions</a></li>
-<li><a href="#Chapter14">ZSTD_frameHeaderSize() :</a></li>
-<li><a href="#Chapter15">Memory management</a></li>
-<li><a href="#Chapter16">Advanced compression functions</a></li>
-<li><a href="#Chapter17">Advanced decompression functions</a></li>
-<li><a href="#Chapter18">Advanced streaming functions</a></li>
-<li><a href="#Chapter19">Buffer-less and synchronous inner streaming functions</a></li>
-<li><a href="#Chapter20">Buffer-less streaming compression (synchronous mode)</a></li>
-<li><a href="#Chapter21">Buffer-less streaming decompression (synchronous mode)</a></li>
-<li><a href="#Chapter22">New advanced API (experimental)</a></li>
-<li><a href="#Chapter23">ZSTD_getFrameHeader_advanced() :</a></li>
-<li><a href="#Chapter24">Block level API</a></li>
+<li><a href="#Chapter11">ADVANCED AND EXPERIMENTAL FUNCTIONS</a></li>
+<li><a href="#Chapter12">Frame size functions</a></li>
+<li><a href="#Chapter13">Memory management</a></li>
+<li><a href="#Chapter14">Advanced compression functions</a></li>
+<li><a href="#Chapter15">Advanced decompression functions</a></li>
+<li><a href="#Chapter16">Advanced streaming functions</a></li>
+<li><a href="#Chapter17">Buffer-less and synchronous inner streaming functions</a></li>
+<li><a href="#Chapter18">Buffer-less streaming compression (synchronous mode)</a></li>
+<li><a href="#Chapter19">Buffer-less streaming decompression (synchronous mode)</a></li>
+<li><a href="#Chapter20">New advanced API (experimental)</a></li>
+<li><a href="#Chapter21">Block level API</a></li>
 </ol>
 <hr>
 <a name="Chapter1"></a><h2>Introduction</h2><pre>
-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression ratio).
+
   Compression can be done in:
     - a single step (described as Simple API)
     - a single step, reusing a context (described as Explicit context)
     - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
 
-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
 <BR></pre>
 
 <a name="Chapter2"></a><h2>Version</h2><pre></pre>
@@ -181,7 +190,8 @@
 </b><p>  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
   ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
   ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
-  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict 
+  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict.
+  Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data. 
 </p></pre><BR>
 
 <pre><b>size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
@@ -195,7 +205,9 @@
 </b><p>  Compression using a digested Dictionary.
   Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
   Note that compression level is decided during dictionary creation.
-  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) 
+  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no).
+  Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary.
+         But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx(). 
 </p></pre><BR>
 
 <pre><b>ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
@@ -321,15 +333,16 @@
 </b></pre><BR>
 <pre><b>size_t ZSTD_DStreamOutSize(void);   </b>/*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */<b>
 </b></pre><BR>
-<a name="Chapter11"></a><h2>START OF ADVANCED AND EXPERIMENTAL FUNCTIONS</h2><pre> The definitions in this section are considered experimental.
+<a name="Chapter11"></a><h2>ADVANCED AND EXPERIMENTAL FUNCTIONS</h2><pre>
+ The definitions in this section are considered experimental.
  They should never be used with a dynamic library, as prototypes may change in the future.
  They are provided for advanced scenarios.
  Use them only in association with static linking.
  
 <BR></pre>
 
-<a name="Chapter12"></a><h2>Advanced types</h2><pre></pre>
-
+<pre><b>int ZSTD_minCLevel(void);  </b>/*!< minimum negative compression level allowed */<b>
+</b></pre><BR>
 <pre><b>typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2,
                ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy;   </b>/* from faster to stronger */<b>
 </b></pre><BR>
@@ -365,7 +378,7 @@
     ZSTD_dlm_byRef,      </b>/**< Reference dictionary content -- the dictionary buffer must outlive its users. */<b>
 } ZSTD_dictLoadMethod_e;
 </b></pre><BR>
-<a name="Chapter13"></a><h2>Frame size functions</h2><pre></pre>
+<a name="Chapter12"></a><h2>Frame size functions</h2><pre></pre>
 
 <pre><b>size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
 </b><p>  `src` should point to the start of a ZSTD encoded frame or skippable frame
@@ -398,12 +411,13 @@
             however it does mean that all frame data must be present and valid. 
 </p></pre><BR>
 
-<a name="Chapter14"></a><h2>ZSTD_frameHeaderSize() :</h2><pre>  srcSize must be >= ZSTD_frameHeaderSize_prefix.
+<pre><b>size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+</b><p>  srcSize must be >= ZSTD_frameHeaderSize_prefix.
  @return : size of the Frame Header,
            or an error code (if srcSize is too small) 
-<BR></pre>
+</p></pre><BR>
 
-<a name="Chapter15"></a><h2>Memory management</h2><pre></pre>
+<a name="Chapter13"></a><h2>Memory management</h2><pre></pre>
 
 <pre><b>size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
 size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
@@ -493,7 +507,7 @@
  
 </p></pre><BR>
 
-<a name="Chapter16"></a><h2>Advanced compression functions</h2><pre></pre>
+<a name="Chapter14"></a><h2>Advanced compression functions</h2><pre></pre>
 
 <pre><b>ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
 </b><p>  Create a digested dictionary for compression
@@ -535,7 +549,7 @@
 </b><p>   Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters 
 </p></pre><BR>
 
-<a name="Chapter17"></a><h2>Advanced decompression functions</h2><pre></pre>
+<a name="Chapter15"></a><h2>Advanced decompression functions</h2><pre></pre>
 
 <pre><b>unsigned ZSTD_isFrame(const void* buffer, size_t size);
 </b><p>  Tells if the content of `buffer` starts with a valid Frame Identifier.
@@ -575,7 +589,7 @@
   When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. 
 </p></pre><BR>
 
-<a name="Chapter18"></a><h2>Advanced streaming functions</h2><pre></pre>
+<a name="Chapter16"></a><h2>Advanced streaming functions</h2><pre></pre>
 
 <h3>Advanced Streaming compression functions</h3><pre></pre><b><pre>size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   </b>/**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */<b>
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); </b>/**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. Note: dict is loaded with ZSTD_dm_auto (treated as a full zstd dictionary if it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.*/<b>
@@ -599,6 +613,7 @@
     unsigned long long ingested;
     unsigned long long consumed;
     unsigned long long produced;
+    unsigned currentJobID;
 } ZSTD_frameProgression;
 </b></pre><BR>
 <h3>Advanced Streaming decompression functions</h3><pre></pre><b><pre>typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
@@ -607,14 +622,14 @@
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  </b>/**< note : ddict is referenced, it must outlive decompression session */<b>
 size_t ZSTD_resetDStream(ZSTD_DStream* zds);  </b>/**< re-use decompression parameters from previous init; saves dictionary loading */<b>
 </pre></b><BR>
-<a name="Chapter19"></a><h2>Buffer-less and synchronous inner streaming functions</h2><pre>
+<a name="Chapter17"></a><h2>Buffer-less and synchronous inner streaming functions</h2><pre>
   This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
   But it's also a complex one, with several restrictions, documented below.
   Prefer normal streaming API for an easier experience.
  
 <BR></pre>
 
-<a name="Chapter20"></a><h2>Buffer-less streaming compression (synchronous mode)</h2><pre>
+<a name="Chapter18"></a><h2>Buffer-less streaming compression (synchronous mode)</h2><pre>
   A ZSTD_CCtx object is required to track streaming operations.
   Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
   ZSTD_CCtx object can be re-used multiple times within successive compression operations.
@@ -650,7 +665,7 @@
 size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   </b>/* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */<b>
 size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); </b>/**<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */<b>
 </pre></b><BR>
-<a name="Chapter21"></a><h2>Buffer-less streaming decompression (synchronous mode)</h2><pre>
+<a name="Chapter19"></a><h2>Buffer-less streaming decompression (synchronous mode)</h2><pre>
   A ZSTD_DCtx object is required to track streaming operations.
   Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
   A ZSTD_DCtx object can be re-used multiple times.
@@ -741,7 +756,7 @@
 </pre></b><BR>
 <pre><b>typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 </b></pre><BR>
-<a name="Chapter22"></a><h2>New advanced API (experimental)</h2><pre></pre>
+<a name="Chapter20"></a><h2>New advanced API (experimental)</h2><pre></pre>
 
 <pre><b>typedef enum {
     </b>/* Opened question : should we have a format ZSTD_f_auto ?<b>
@@ -964,16 +979,21 @@
                            const void* prefix, size_t prefixSize,
                            ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
-  Decompression need same prefix to properly regenerate data.
-  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+  Decompression will need same prefix to properly regenerate data.
+  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+  Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
  @result : 0, or an error code (which can be tested with ZSTD_isError()).
   Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
   Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
            Its contain must remain unmodified up to end of compression (ZSTD_e_end).
-  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+           ensure that the window size is large enough to contain the entire source.
+           See ZSTD_p_windowLog.
+  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
            It's a CPU consuming operation, with non-negligible impact on latency.
            If there is a need to use same prefix multiple times, consider loadDictionary instead.
-  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+  Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
            Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. 
 </p></pre><BR>
 
@@ -1140,6 +1160,8 @@
                         const void* prefix, size_t prefixSize,
                         ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
+  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+  and must use the same prefix as the one used during compression.
   Prefix is **only used once**. Reference is discarded at end of frame.
   End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0.
  @result : 0, or an error code (which can be tested with ZSTD_isError()).
@@ -1171,9 +1193,11 @@
  
 </p></pre><BR>
 
-<a name="Chapter23"></a><h2>ZSTD_getFrameHeader_advanced() :</h2><pre>  same as ZSTD_getFrameHeader(),
+<pre><b>size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr,
+            const void* src, size_t srcSize, ZSTD_format_e format);
+</b><p>  same as ZSTD_getFrameHeader(),
   with added capability to select a format (like ZSTD_f_zstd1_magicless) 
-<BR></pre>
+</p></pre><BR>
 
 <pre><b>size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx,
                                ZSTD_outBuffer* output,
@@ -1207,7 +1231,7 @@
  
 </p></pre><BR>
 
-<a name="Chapter24"></a><h2>Block level API</h2><pre></pre>
+<a name="Chapter21"></a><h2>Block level API</h2><pre></pre>
 
 <pre><b></b><p>    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
     User will have to take in charge required information to regenerate data, such as compressed and content sizes.
diff --git a/examples/multiple_streaming_compression.c b/examples/multiple_streaming_compression.c
index e395aef..4308a2e 100644
--- a/examples/multiple_streaming_compression.c
+++ b/examples/multiple_streaming_compression.c
@@ -158,7 +158,8 @@
     }
 
     freeResources(ress);
-    /* success */
+    free(ofnBuffer);
+
     printf("compressed %i files \n", argc-1);
 
     return 0;
diff --git a/examples/streaming_compression.c b/examples/streaming_compression.c
index f76364d..9287ff3 100644
--- a/examples/streaming_compression.c
+++ b/examples/streaming_compression.c
@@ -73,7 +73,11 @@
     ZSTD_CStream* const cstream = ZSTD_createCStream();
     if (cstream==NULL) { fprintf(stderr, "ZSTD_createCStream() error \n"); exit(10); }
     size_t const initResult = ZSTD_initCStream(cstream, cLevel);
-    if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
+    if (ZSTD_isError(initResult)) {
+        fprintf(stderr, "ZSTD_initCStream() error : %s \n",
+                    ZSTD_getErrorName(initResult));
+        exit(11);
+    }
 
     size_t read, toRead = buffInSize;
     while( (read = fread_orDie(buffIn, toRead, fin)) ) {
@@ -81,7 +85,11 @@
         while (input.pos < input.size) {
             ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
             toRead = ZSTD_compressStream(cstream, &output , &input);   /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */
-            if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); }
+            if (ZSTD_isError(toRead)) {
+                fprintf(stderr, "ZSTD_compressStream() error : %s \n",
+                                ZSTD_getErrorName(toRead));
+                exit(12);
+            }
             if (toRead > buffInSize) toRead = buffInSize;   /* Safely handle case when `buffInSize` is manually changed to a value < ZSTD_CStreamInSize()*/
             fwrite_orDie(buffOut, output.pos, fout);
         }
@@ -100,15 +108,15 @@
 }
 
 
-static const char* createOutFilename_orDie(const char* filename)
+static char* createOutFilename_orDie(const char* filename)
 {
     size_t const inL = strlen(filename);
     size_t const outL = inL + 5;
-    void* outSpace = malloc_orDie(outL);
+    void* const outSpace = malloc_orDie(outL);
     memset(outSpace, 0, outL);
     strcat(outSpace, filename);
     strcat(outSpace, ".zst");
-    return (const char*)outSpace;
+    return (char*)outSpace;
 }
 
 int main(int argc, const char** argv)
@@ -124,8 +132,10 @@
 
     const char* const inFilename = argv[1];
 
-    const char* const outFilename = createOutFilename_orDie(inFilename);
+    char* const outFilename = createOutFilename_orDie(inFilename);
     compressFile_orDie(inFilename, outFilename, 1);
 
+    free(outFilename);   /* not strictly required, since program execution stops there,
+                          * but some static analyzers may complain otherwise */
     return 0;
 }
diff --git a/lib/BUCK b/lib/BUCK
index dbe8885..bd93b08 100644
--- a/lib/BUCK
+++ b/lib/BUCK
@@ -69,6 +69,7 @@
     ]),
     headers=subdir_glob([
         ('dictBuilder', 'divsufsort.h'),
+        ('dictBuilder', 'cover.h'),
     ]),
     srcs=glob(['dictBuilder/*.c']),
     deps=[':common'],
diff --git a/lib/Makefile b/lib/Makefile
index 9cedd53..9711f75 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -19,15 +19,20 @@
 VERSION?= $(LIBVER)
 
 CPPFLAGS+= -I. -I./common -DXXH_NAMESPACE=ZSTD_
+ifeq ($(OS),Windows_NT)   # MinGW assumed
+CPPFLAGS   += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
+endif
 CFLAGS  ?= -O3
-DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
             -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-            -Wredundant-decls
+            -Wredundant-decls -Wmissing-prototypes
 CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
 FLAGS    = $(CPPFLAGS) $(CFLAGS)
 
+GREP = grep --color=never
+
 ZSTDCOMMON_FILES := $(sort $(wildcard common/*.c))
 ZSTDCOMP_FILES := $(sort $(wildcard compress/*.c))
 ZSTDDECOMP_FILES := $(sort $(wildcard decompress/*.c))
@@ -35,7 +40,7 @@
 ZDEPR_FILES := $(sort $(wildcard deprecated/*.c))
 ZSTD_FILES := $(ZSTDCOMMON_FILES)
 
-ZSTD_LEGACY_SUPPORT ?= 4
+ZSTD_LEGACY_SUPPORT ?= 5
 ZSTD_LIB_COMPRESSION ?= 1
 ZSTD_LIB_DECOMPRESSION ?= 1
 ZSTD_LIB_DICTBUILDER ?= 1
@@ -52,11 +57,11 @@
 endif
 
 ifneq ($(ZSTD_LIB_COMPRESSION), 0)
-	ZSTD_FILES += $(ZSTDCOMP_FILES) 
+	ZSTD_FILES += $(ZSTDCOMP_FILES)
 endif
 
 ifneq ($(ZSTD_LIB_DECOMPRESSION), 0)
-	ZSTD_FILES += $(ZSTDDECOMP_FILES) 
+	ZSTD_FILES += $(ZSTDDECOMP_FILES)
 endif
 
 ifneq ($(ZSTD_LIB_DEPRECATED), 0)
@@ -69,7 +74,7 @@
 
 ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
 ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
-	ZSTD_FILES += $(shell ls legacy/*.c | grep 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
+	ZSTD_FILES += $(shell ls legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
 endif
 	CPPFLAGS += -I./legacy
 endif
@@ -91,8 +96,6 @@
 	SHARED_EXT_VER = $(SHARED_EXT).$(LIBVER)
 endif
 
-LIBZSTD = libzstd.$(SHARED_EXT_VER)
-
 
 .PHONY: default all clean install uninstall
 
@@ -108,19 +111,28 @@
 libzstd.a-mt: CPPFLAGS += -DZSTD_MULTITHREAD
 libzstd.a-mt: libzstd.a
 
+ifneq (,$(filter Windows%,$(OS)))
+
+LIBZSTD = dll\libzstd.dll
+$(LIBZSTD): $(ZSTD_FILES)
+	@echo compiling dynamic library $(LIBVER)
+	@$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -shared $^ -o $@
+	dlltool -D $@ -d dll\libzstd.def -l dll\libzstd.lib
+
+else
+
+LIBZSTD = libzstd.$(SHARED_EXT_VER)
 $(LIBZSTD): LDFLAGS += -shared -fPIC -fvisibility=hidden
 $(LIBZSTD): $(ZSTD_FILES)
 	@echo compiling dynamic library $(LIBVER)
-ifneq (,$(filter Windows%,$(OS)))
-	@$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -shared $^ -o dll\libzstd.dll
-	dlltool -D dll\libzstd.dll -d dll\libzstd.def -l dll\libzstd.lib
-else
 	@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
 	@echo creating versioned links
 	@ln -sf $@ libzstd.$(SHARED_EXT_MAJOR)
 	@ln -sf $@ libzstd.$(SHARED_EXT)
+
 endif
 
+
 libzstd : $(LIBZSTD)
 
 libzstd-mt : CPPFLAGS += -DZSTD_MULTITHREAD
@@ -154,7 +166,7 @@
 #-----------------------------------------------------------------------------
 # make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets
 #-----------------------------------------------------------------------------
-ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS))
+ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku))
 
 DESTDIR     ?=
 # directory variables : GNU conventions prefer lowercase
diff --git a/lib/README.md b/lib/README.md
index 75debe8..0966c7a 100644
--- a/lib/README.md
+++ b/lib/README.md
@@ -13,7 +13,7 @@
 - `make install` : install libraries in default system directories
 
 `libzstd` default scope includes compression, decompression, dictionary building,
-and decoding support for legacy formats >= v0.4.0.
+and decoding support for legacy formats >= v0.5.0.
 
 
 #### API
@@ -48,23 +48,24 @@
         This module depends on both `lib/common` and `lib/compress` .
 - `lib/legacy` : source code to decompress legacy zstd formats, starting from `v0.1.0`.
         This module depends on `lib/common` and `lib/decompress`.
-        To enable this feature, it's required to define `ZSTD_LEGACY_SUPPORT` during compilation.
-        Typically, with `gcc`, add argument `-DZSTD_LEGACY_SUPPORT=1`.
-        Using higher number limits versions supported.
+        To enable this feature, define `ZSTD_LEGACY_SUPPORT` during compilation.
+        Specifying a number limits versions supported to that version onward.
         For example, `ZSTD_LEGACY_SUPPORT=2` means : "support legacy formats >= v0.2.0".
         `ZSTD_LEGACY_SUPPORT=3` means : "support legacy formats >= v0.3.0", and so on.
-        Starting v0.8.0, all versions of `zstd` produce frames compliant with specification.
-        As a consequence, `ZSTD_LEGACY_SUPPORT=8` (or more) doesn't trigger legacy support.
-        Also, `ZSTD_LEGACY_SUPPORT=0` means "do __not__ support legacy formats".
+        Currently, the default library setting is `ZSTD_LEGACY_SUPPORT=5`.
+        It can be changed at build time to any other value.
+        Note that any number >= 8 translates into "do __not__ support legacy formats",
+        since all versions of `zstd` >= v0.8 are compatible with v1+ specification.
+        `ZSTD_LEGACY_SUPPORT=0` also means "do __not__ support legacy formats".
         Once enabled, this capability is transparently triggered within decompression functions.
         It's also possible to invoke directly legacy API, as exposed in `lib/legacy/zstd_legacy.h`.
         Each version also provides an additional dedicated set of advanced API.
         For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` .
         Note : `lib/legacy` only supports _decoding_ legacy formats.
-- Similarly, you can define `ZSTD_LIB_COMPRESSION, ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`, 
-        and `ZSTD_LIB_DEPRECATED` as 0 to forgo compilation of the corresponding features. This will 
+- Similarly, you can define `ZSTD_LIB_COMPRESSION, ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`,
+        and `ZSTD_LIB_DEPRECATED` as 0 to forgo compilation of the corresponding features. This will
         also disable compilation of all dependencies (eg. `ZSTD_LIB_COMPRESSION=0` will also disable
-        dictBuilder). 
+        dictBuilder).
 
 
 #### Multithreading support
diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 366ed2b..07f875e 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -88,15 +88,37 @@
   #endif
 #endif
 
-/* prefetch */
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
-#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#  define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
-#elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#  define PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+/* prefetch
+ * can be disabled by defining the NO_PREFETCH macro.
+ * All prefetch invocations use a single default locality 2,
+ * generating instruction prefetcht1,
+ * which, according to Intel, means "load data into L2 cache".
+ * This is a good enough "middle ground" for the time being,
+ * though in theory, it would be better to specialize locality depending on data being prefetched.
+ * Tests could not determine any sensible difference based on locality value. */
+#if defined(NO_PREFETCH)
+#  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
 #else
-#  define PREFETCH(ptr)   /* disabled */
-#endif
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define PREFETCH(ptr)   __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#  else
+#    define PREFETCH(ptr)   (void)(ptr)  /* disabled */
+#  endif
+#endif  /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)  {            \
+    const char* const _ptr = (const char*)(p);  \
+    size_t const _size = (size_t)(s);     \
+    size_t _pos;                          \
+    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
+        PREFETCH(_ptr + _pos);            \
+    }                                     \
+}
 
 /* disable warnings */
 #ifdef _MSC_VER    /* Visual Studio */
diff --git a/lib/common/cpu.h b/lib/common/cpu.h
index 88e0ebf..eeb428a 100644
--- a/lib/common/cpu.h
+++ b/lib/common/cpu.h
@@ -36,7 +36,7 @@
     U32 f1d = 0;
     U32 f7b = 0;
     U32 f7c = 0;
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
     int reg[4];
     __cpuid((int*)reg, 0);
     {
diff --git a/lib/common/mem.h b/lib/common/mem.h
index 47d2300..2051bca 100644
--- a/lib/common/mem.h
+++ b/lib/common/mem.h
@@ -57,11 +57,23 @@
   typedef  uint64_t U64;
   typedef   int64_t S64;
 #else
+# include <limits.h>
+#if CHAR_BIT != 8
+#  error "this implementation requires char to be exactly 8-bit type"
+#endif
   typedef unsigned char      BYTE;
+#if USHRT_MAX != 65535
+#  error "this implementation requires short to be exactly 16-bit type"
+#endif
   typedef unsigned short      U16;
   typedef   signed short      S16;
+#if UINT_MAX != 4294967295
+#  error "this implementation requires int to be exactly 32-bit type"
+#endif
   typedef unsigned int        U32;
   typedef   signed int        S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such case, <stdint.h> is preferred */
   typedef unsigned long long  U64;
   typedef   signed long long  S64;
 #endif
diff --git a/lib/common/xxhash.c b/lib/common/xxhash.c
index 9d9c0e9..532b816 100644
--- a/lib/common/xxhash.c
+++ b/lib/common/xxhash.c
@@ -98,6 +98,7 @@
 /* Modify the local functions below should you wish to use some other memory routines */
 /* for malloc(), free() */
 #include <stdlib.h>
+#include <stddef.h>     /* size_t */
 static void* XXH_malloc(size_t s) { return malloc(s); }
 static void  XXH_free  (void* p)  { free(p); }
 /* for memcpy() */
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index b4c1af5..e75adfa 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -79,8 +79,7 @@
 static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
 static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
 
-#define ZSTD_FRAMEIDSIZE 4
-static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE;  /* magic number size */
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
 
 #define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
 static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
@@ -193,6 +192,8 @@
     BYTE* llCode;
     BYTE* mlCode;
     BYTE* ofCode;
+    size_t maxNbSeq;
+    size_t maxNbLit;
     U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
     U32   longLengthPos;
 } seqStore_t;
diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c
index 07b3ab8..4408f0e 100644
--- a/lib/compress/fse_compress.c
+++ b/lib/compress/fse_compress.c
@@ -83,7 +83,9 @@
  * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
  * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
  */
-size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
 {
     U32 const tableSize = 1 << tableLog;
     U32 const tableMask = tableSize - 1;
@@ -101,10 +103,14 @@
     if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
     tableU16[-2] = (U16) tableLog;
     tableU16[-1] = (U16) maxSymbolValue;
-    assert(tableLog < 16);   /* required for the threshold strategy to work */
+    assert(tableLog < 16);   /* required for threshold strategy to work */
 
     /* For explanations on how to distribute symbol values over the table :
-    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+     #ifdef __clang_analyzer__
+     memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+     #endif
 
     /* symbol start positions */
     {   U32 u;
@@ -124,13 +130,15 @@
         U32 symbol;
         for (symbol=0; symbol<=maxSymbolValue; symbol++) {
             int nbOccurences;
-            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+            int const freq = normalizedCounter[symbol];
+            for (nbOccurences=0; nbOccurences<freq; nbOccurences++) {
                 tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
                 position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
         }   }
 
-        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+        assert(position==0);  /* Must have initialized all positions */
     }
 
     /* Build table */
@@ -201,9 +209,10 @@
     return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
 }
 
-static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
-                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                                       unsigned writeIsSafe)
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                   const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                         unsigned writeIsSafe)
 {
     BYTE* const ostart = (BYTE*) header;
     BYTE* out = ostart;
@@ -212,13 +221,12 @@
     const int tableSize = 1 << tableLog;
     int remaining;
     int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
+    U32 bitStream = 0;
+    int bitCount = 0;
+    unsigned symbol = 0;
+    unsigned const alphabetSize = maxSymbolValue + 1;
+    int previousIs0 = 0;
 
-    bitStream = 0;
-    bitCount  = 0;
     /* Table Size */
     bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
     bitCount  += 4;
@@ -228,48 +236,53 @@
     threshold = tableSize;
     nbBits = tableLog+1;
 
-    while (remaining>1) {  /* stops at 1 */
-        if (previous0) {
-            unsigned start = charnum;
-            while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24) {
+    while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+        if (previousIs0) {
+            unsigned start = symbol;
+            while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+            if (symbol == alphabetSize) break;   /* incorrect distribution */
+            while (symbol >= start+24) {
                 start+=24;
                 bitStream += 0xFFFFU << bitCount;
-                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend-2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE) bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out+=2;
                 bitStream>>=16;
             }
-            while (charnum >= start+3) {
+            while (symbol >= start+3) {
                 start+=3;
                 bitStream += 3 << bitCount;
                 bitCount += 2;
             }
-            bitStream += (charnum-start) << bitCount;
+            bitStream += (symbol-start) << bitCount;
             bitCount += 2;
             if (bitCount>16) {
-                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend - 2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE)bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out += 2;
                 bitStream >>= 16;
                 bitCount -= 16;
         }   }
-        {   int count = normalizedCounter[charnum++];
-            int const max = (2*threshold-1)-remaining;
+        {   int count = normalizedCounter[symbol++];
+            int const max = (2*threshold-1) - remaining;
             remaining -= count < 0 ? -count : count;
             count++;   /* +1 for extra accuracy */
-            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            if (count>=threshold)
+                count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
             bitStream += count << bitCount;
             bitCount  += nbBits;
             bitCount  -= (count<max);
-            previous0  = (count==1);
+            previousIs0  = (count==1);
             if (remaining<1) return ERROR(GENERIC);
             while (remaining<threshold) { nbBits--; threshold>>=1; }
         }
         if (bitCount>16) {
-            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
             out[0] = (BYTE)bitStream;
             out[1] = (BYTE)(bitStream>>8);
             out += 2;
@@ -277,19 +290,23 @@
             bitCount -= 16;
     }   }
 
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
     /* flush remaining bitStream */
-    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
     out[0] = (BYTE)bitStream;
     out[1] = (BYTE)(bitStream>>8);
     out+= (bitCount+7) /8;
 
-    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
-
     return (out-ostart);
 }
 
 
-size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
     if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
     if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
@@ -297,26 +314,13 @@
     if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
         return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
 
-    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
 }
 
 
 /*-**************************************************************
 *  FSE Compression Code
 ****************************************************************/
-/*! FSE_sizeof_CTable() :
-    FSE_CTable is a variable size structure which contains :
-    `U16 tableLog;`
-    `U16 maxSymbolValue;`
-    `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
-    `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
-Allocation is manual (C standard does not support variable-size structures).
-*/
-size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
-    return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-}
 
 FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
 {
@@ -331,7 +335,7 @@
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
-    U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
     U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
     U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
@@ -394,6 +398,9 @@
     }
     ToDistribute = (1 << tableLog) - distributed;
 
+    if (ToDistribute == 0)
+        return 0;
+
     if ((total / ToDistribute) > lowOne) {
         /* risk of rounding to zero */
         lowOne = (U32)((total * 3) / (ToDistribute * 2));
diff --git a/lib/compress/hist.h b/lib/compress/hist.h
index 788470d..8b1991a 100644
--- a/lib/compress/hist.h
+++ b/lib/compress/hist.h
@@ -50,7 +50,7 @@
 size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
                   const void* src, size_t srcSize);
 
-unsigned HIST_isError(size_t code);  /*< tells if a return value is an error code */
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */
 
 
 /* --- advanced histogram functions --- */
diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c
index 9cdaa5d..4c40572 100644
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -82,7 +82,7 @@
  * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
  */
 #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
-size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
 {
     BYTE* const ostart = (BYTE*) dst;
     BYTE* op = ostart;
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index c668625..5f6280a 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -46,7 +46,6 @@
     size_t workspaceSize;
     ZSTD_matchState_t matchState;
     ZSTD_compressedBlockState_t cBlockState;
-    ZSTD_compressionParameters cParams;
     ZSTD_customMem customMem;
     U32 dictID;
 };  /* typedef'd to ZSTD_CDict within "zstd.h" */
@@ -679,6 +678,9 @@
     CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
     CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
     CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+    ZSTD_STATIC_ASSERT(ZSTD_TARGETLENGTH_MIN == 0);
+    if (cParams.targetLength > ZSTD_TARGETLENGTH_MAX)
+        return ERROR(parameter_outOfBound);
     if ((U32)(cParams.strategy) > (U32)ZSTD_btultra)
         return ERROR(parameter_unsupported);
     return 0;
@@ -699,6 +701,9 @@
     CLAMP(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
     CLAMP(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
     CLAMP(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+    ZSTD_STATIC_ASSERT(ZSTD_TARGETLENGTH_MIN == 0);
+    if (cParams.targetLength > ZSTD_TARGETLENGTH_MAX)
+        cParams.targetLength = ZSTD_TARGETLENGTH_MAX;
     CLAMP(cParams.strategy, ZSTD_fast, ZSTD_btultra);
     return cParams;
 }
@@ -805,7 +810,7 @@
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
         U32    const divider = (cParams.searchLength==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
         size_t const entropySpace = HUF_WORKSPACE_SIZE;
         size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
         size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
@@ -900,9 +905,27 @@
         fp.ingested = cctx->consumedSrcSize + buffered;
         fp.consumed = cctx->consumedSrcSize;
         fp.produced = cctx->producedCSize;
+        fp.flushed  = cctx->producedCSize;   /* simplified; some data might still be left within streaming output buffer */
+        fp.currentJobID = 0;
+        fp.nbActiveWorkers = 0;
         return fp;
 }   }
 
+/*! ZSTD_toFlushNow() :
+ *  @return : the value of ZSTDMT_toFlushNow() when built with ZSTD_MULTITHREAD and nbWorkers >= 1, else 0.
+ *  Only useful for multithreading scenarios currently (nbWorkers >= 1). */
+size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        return ZSTDMT_toFlushNow(cctx->mtctx);   /* delegate to the context's mtctx */
+    }
+#endif
+    (void)cctx;   /* cctx is not read at all in builds without ZSTD_MULTITHREAD */
+    return 0;   /* over-simplification; could also check whether the context is currently running in streaming mode and, if so, report how many bytes are left to be flushed within the output buffer */
+}
+
+
 
 static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1,
                                   ZSTD_compressionParameters cParams2)
@@ -913,6 +936,20 @@
          & ((cParams1.searchLength==3) == (cParams2.searchLength==3));  /* hashlog3 space */
 }
 
+static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
+                                    ZSTD_compressionParameters cParams2)
+{   /* Debug helper : asserts that the seven fields listed below match in both parameter sets. */
+    (void)cParams1;   /* the two casts keep the parameters "used" */
+    (void)cParams2;   /* when assert() is compiled out (NDEBUG) */
+    assert(cParams1.windowLog    == cParams2.windowLog);
+    assert(cParams1.chainLog     == cParams2.chainLog);
+    assert(cParams1.hashLog      == cParams2.hashLog);
+    assert(cParams1.searchLog    == cParams2.searchLog);
+    assert(cParams1.searchLength == cParams2.searchLength);
+    assert(cParams1.targetLength == cParams2.targetLength);
+    assert(cParams1.strategy     == cParams2.strategy);
+}
+
 /** The parameters are equivalent if ldm is not enabled in both sets or
  *  all the parameters are equivalent. */
 static U32 ZSTD_equivalentLdmParams(ldmParams_t ldmParams1,
@@ -931,33 +968,51 @@
 /* ZSTD_sufficientBuff() :
  * check internal buffers exist for streaming if buffPol == ZSTDb_buffered .
  * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */
-static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1,
+static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t maxNbSeq1,
+                            size_t maxNbLit1,
                             ZSTD_buffered_policy_e buffPol2,
                             ZSTD_compressionParameters cParams2,
                             U64 pledgedSrcSize)
 {
     size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize));
     size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2);
+    size_t const maxNbSeq2 = blockSize2 / ((cParams2.searchLength == 3) ? 3 : 4);
+    size_t const maxNbLit2 = blockSize2;
     size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0;
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u",
-                (U32)windowSize2, cParams2.windowLog);
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u",
-                (U32)blockSize2, (U32)blockSize1);
-    return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is neededBufferSize2=%u <= bufferSize1=%u",
+                (U32)neededBufferSize2, (U32)bufferSize1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbSeq2=%u <= maxNbSeq1=%u",
+                (U32)maxNbSeq2, (U32)maxNbSeq1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbLit2=%u <= maxNbLit1=%u",
+                (U32)maxNbLit2, (U32)maxNbLit1);
+    return (maxNbLit2 <= maxNbLit1)
+         & (maxNbSeq2 <= maxNbSeq1)
          & (neededBufferSize2 <= bufferSize1);
 }
 
 /** Equivalence for resetCCtx purposes */
 static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1,
                                  ZSTD_CCtx_params params2,
-                                 size_t buffSize1, size_t blockSize1,
+                                 size_t buffSize1,
+                                 size_t maxNbSeq1, size_t maxNbLit1,
                                  ZSTD_buffered_policy_e buffPol2,
                                  U64 pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize);
-    return ZSTD_equivalentCParams(params1.cParams, params2.cParams) &&
-           ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) &&
-           ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize);
+    if (!ZSTD_equivalentCParams(params1.cParams, params2.cParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentCParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentLdmParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_sufficientBuff(buffSize1, maxNbSeq1, maxNbLit1, buffPol2,
+                             params2.cParams, pledgedSrcSize)) {
+      DEBUGLOG(4, "ZSTD_sufficientBuff() == 0");
+      return 0;
+    }
+    return 1;
 }
 
 static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
@@ -996,6 +1051,7 @@
 
     cctx->blockSize = blockSize;   /* previous block size could be different even for same windowLog, due to pledgedSrcSize */
     cctx->appliedParams = params;
+    cctx->blockState.matchState.cParams = params.cParams;
     cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
     cctx->consumedSrcSize = 0;
     cctx->producedCSize = 0;
@@ -1032,6 +1088,9 @@
 
     ms->hashLog3 = hashLog3;
     memset(&ms->window, 0, sizeof(ms->window));
+    ms->window.dictLimit = 1;    /* start from 1, so that 1st position is valid */
+    ms->window.lowLimit = 1;     /* it ensures first and later CCtx usages compress the same */
+    ms->window.nextSrc = ms->window.base + 1;   /* see issue #1241 */
     ZSTD_invalidateMatchState(ms);
 
     /* opt parser space */
@@ -1057,6 +1116,8 @@
     ms->hashTable3 = ms->chainTable + chainSize;
     ptr = ms->hashTable3 + h3Size;
 
+    ms->cParams = *cParams;
+
     assert(((size_t)ptr & 3) == 0);
     return ptr;
 }
@@ -1082,8 +1143,9 @@
 
     if (crp == ZSTDcrp_continue) {
         if (ZSTD_equivalentParams(zc->appliedParams, params,
-                                zc->inBuffSize, zc->blockSize,
-                                zbuff, pledgedSrcSize)) {
+                                  zc->inBuffSize,
+                                  zc->seqStore.maxNbSeq, zc->seqStore.maxNbLit,
+                                  zbuff, pledgedSrcSize)) {
             DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%zu)",
                         zc->appliedParams.cParams.windowLog, zc->blockSize);
             zc->workSpaceOversizedDuration += (zc->workSpaceOversizedDuration > 0);   /* if it was too large, it still is */
@@ -1104,7 +1166,7 @@
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
         U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
         size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
         size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
         size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
@@ -1144,7 +1206,6 @@
                 if (zc->workSpace == NULL) return ERROR(memory_allocation);
                 zc->workSpaceSize = neededSpace;
                 zc->workSpaceOversizedDuration = 0;
-                ptr = zc->workSpace;
 
                 /* Statically sized space.
                  * entropyWorkspace never moves,
@@ -1159,6 +1220,7 @@
 
         /* init params */
         zc->appliedParams = params;
+        zc->blockState.matchState.cParams = params.cParams;
         zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
         zc->consumedSrcSize = 0;
         zc->producedCSize = 0;
@@ -1195,13 +1257,18 @@
         ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1);
 
         /* sequences storage */
+        zc->seqStore.maxNbSeq = maxNbSeq;
         zc->seqStore.sequencesStart = (seqDef*)ptr;
         ptr = zc->seqStore.sequencesStart + maxNbSeq;
         zc->seqStore.llCode = (BYTE*) ptr;
         zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
         zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
         zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
-        ptr = zc->seqStore.litStart + blockSize;
+        /* ZSTD_wildcopy() is used to copy into the literals buffer,
+         * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+         */
+        zc->seqStore.maxNbLit = blockSize;
+        ptr = zc->seqStore.litStart + blockSize + WILDCOPY_OVERLENGTH;
 
         /* ldm bucketOffsets table */
         if (params.ldmParams.enableLdm) {
@@ -1235,54 +1302,59 @@
     assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
 }
 
-static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
-                            const ZSTD_CDict* cdict,
-                            ZSTD_CCtx_params params,
-                            U64 pledgedSrcSize,
-                            ZSTD_buffered_policy_e zbuff)
+/* These are the approximate sizes for each strategy past which copying the
+ * dictionary tables into the working context is faster than using them
+ * in-place.
+ */
+static const size_t attachDictSizeCutoffs[(unsigned)ZSTD_btultra+1] = {
+    8 KB, /* unused */
+    8 KB, /* ZSTD_fast */
+    16 KB, /* ZSTD_dfast */
+    32 KB, /* ZSTD_greedy */
+    32 KB, /* ZSTD_lazy */
+    32 KB, /* ZSTD_lazy2 */
+    32 KB, /* ZSTD_btlazy2 */
+    32 KB, /* ZSTD_btopt */
+    8 KB /* ZSTD_btultra */
+};
+
+static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
+                                 ZSTD_CCtx_params params,
+                                 U64 pledgedSrcSize)
 {
-    /* We have a choice between copying the dictionary context into the working
-     * context, or referencing the dictionary context from the working context
-     * in-place. We decide here which strategy to use. */
-    const U64 attachDictSizeCutoffs[(unsigned)ZSTD_btultra+1] = {
-        8 KB, /* unused */
-        8 KB, /* ZSTD_fast */
-        16 KB, /* ZSTD_dfast */
-        32 KB, /* ZSTD_greedy */
-        32 KB, /* ZSTD_lazy */
-        32 KB, /* ZSTD_lazy2 */
-        32 KB, /* ZSTD_btlazy2 */
-        32 KB, /* ZSTD_btopt */
-        8 KB /* ZSTD_btultra */
-    };
-    const int attachDict = ( pledgedSrcSize <= attachDictSizeCutoffs[cdict->cParams.strategy]
-                          || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
-                          || params.attachDictPref == ZSTD_dictForceAttach )
-                        && params.attachDictPref != ZSTD_dictForceCopy
-                        && !params.forceWindow /* dictMatchState isn't correctly
-                                                * handled in _enforceMaxDist */
-                        && ZSTD_equivalentCParams(cctx->appliedParams.cParams,
-                                                  cdict->cParams);
+    size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
+    return ( pledgedSrcSize <= cutoff
+          || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+          || params.attachDictPref == ZSTD_dictForceAttach )
+        && params.attachDictPref != ZSTD_dictForceCopy
+        && !params.forceWindow; /* dictMatchState isn't correctly
+                                 * handled in _enforceMaxDist */
+}
 
-    DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", (U32)pledgedSrcSize);
-
-
-    {   unsigned const windowLog = params.cParams.windowLog;
+static size_t ZSTD_resetCCtx_byAttachingCDict(
+    ZSTD_CCtx* cctx,
+    const ZSTD_CDict* cdict,
+    ZSTD_CCtx_params params,
+    U64 pledgedSrcSize,
+    ZSTD_buffered_policy_e zbuff)
+{
+    {
+        const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+        unsigned const windowLog = params.cParams.windowLog;
         assert(windowLog != 0);
-        /* Copy only compression parameters related to tables. */
-        params.cParams = cdict->cParams;
+        /* Resize working context table params for input only, since the dict
+         * has its own tables. */
+        params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0);
         params.cParams.windowLog = windowLog;
         ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
-                                attachDict ? ZSTDcrp_continue : ZSTDcrp_noMemset,
-                                zbuff);
-        assert(cctx->appliedParams.cParams.strategy == cdict->cParams.strategy);
-        assert(cctx->appliedParams.cParams.hashLog == cdict->cParams.hashLog);
-        assert(cctx->appliedParams.cParams.chainLog == cdict->cParams.chainLog);
+                                ZSTDcrp_continue, zbuff);
+        assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
     }
 
-    if (attachDict) {
-        const U32 cdictLen = (U32)( cdict->matchState.window.nextSrc
+    {
+        const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc
                                   - cdict->matchState.window.base);
+        const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
         if (cdictLen == 0) {
             /* don't even attach dictionaries with no contents */
             DEBUGLOG(4, "skipping attaching empty dictionary");
@@ -1292,41 +1364,13 @@
 
             /* prep working match state so dict matches never have negative indices
              * when they are translated to the working context's index space. */
-            if (cctx->blockState.matchState.window.dictLimit < cdictLen) {
+            if (cctx->blockState.matchState.window.dictLimit < cdictEnd) {
                 cctx->blockState.matchState.window.nextSrc =
-                    cctx->blockState.matchState.window.base + cdictLen;
+                    cctx->blockState.matchState.window.base + cdictEnd;
                 ZSTD_window_clear(&cctx->blockState.matchState.window);
             }
             cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
         }
-    } else {
-        DEBUGLOG(4, "copying dictionary into context");
-        /* copy tables */
-        {   size_t const chainSize = (cdict->cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict->cParams.chainLog);
-            size_t const hSize =  (size_t)1 << cdict->cParams.hashLog;
-            size_t const tableSpace = (chainSize + hSize) * sizeof(U32);
-            assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
-            assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize);
-            assert((U32*)cdict->matchState.chainTable == (U32*)cdict->matchState.hashTable + hSize);  /* chainTable must follow hashTable */
-            assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize);
-            memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
-        }
-
-        /* Zero the hashTable3, since the cdict never fills it */
-        {   size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3;
-            assert(cdict->matchState.hashLog3 == 0);
-            memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
-        }
-
-        /* copy dictionary offsets */
-        {
-            ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
-            ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
-            dstMatchState->window       = srcMatchState->window;
-            dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
-            dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
-            dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
-        }
     }
 
     cctx->dictID = cdict->dictID;
@@ -1337,6 +1381,83 @@
     return 0;
 }
 
+/*! ZSTD_resetCCtx_byCopyingCDict() :
+ *  Resets `cctx` using the cdict's table parameters (the caller's windowLog is kept),
+ *  then copies the cdict's tables, window, update positions, dictID and block state into it.
+ * @return : 0 on success, or the error code reported by ZSTD_resetCCtx_internal() */
+static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            ZSTD_CCtx_params params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+
+    DEBUGLOG(4, "copying dictionary into context");
+    {   unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Copy only compression parameters related to tables. */
+        params.cParams = *cdict_cParams;
+        params.cParams.windowLog = windowLog;
+        /* a failed reset must be reported : the copies below presume correctly sized tables */
+        {   size_t const resetError = ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                                              ZSTDcrp_noMemset, zbuff);
+            if (ZSTD_isError(resetError)) return resetError;
+        }
+        assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+        assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
+        assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
+    }
+    /* copy tables */
+    {   size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog);
+        size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
+        size_t const tableSpace = (chainSize + hSize) * sizeof(U32);
+        assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize);
+        assert((U32*)cdict->matchState.chainTable == (U32*)cdict->matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize);
+        memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
+    }
+    /* Zero the hashTable3, since the cdict never fills it */
+    {   size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3;
+        assert(cdict->matchState.hashLog3 == 0);
+        memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+    }
+    /* copy dictionary offsets */
+    {   ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+        ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+        dstMatchState->window       = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+    cctx->dictID = cdict->dictID;
+    /* copy block state */
+    memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+    return 0;
+}
+
+/* Resets `cctx` from `cdict`. The dictionary context is either copied into
+ * the working context, or referenced from it in-place (attached);
+ * ZSTD_shouldAttachDict() selects which of the two strategies is used. */
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            ZSTD_CCtx_params params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", (U32)pledgedSrcSize);
+
+    if (!ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
+        /* copy mode : duplicate the dictionary tables into the working context */
+        return ZSTD_resetCCtx_byCopyingCDict(
+            cctx, cdict, params, pledgedSrcSize, zbuff);
+    }
+    /* attach mode : use the dictionary tables in-place */
+    return ZSTD_resetCCtx_byAttachingCDict(
+        cctx, cdict, params, pledgedSrcSize, zbuff);
+}
+
 /*! ZSTD_copyCCtx_internal() :
  *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
  *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
@@ -1481,15 +1602,15 @@
 
 /* See doc/zstd_compression_format.md for detailed format description */
 
-size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+static size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
 {
+    U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
     if (srcSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    MEM_writeLE24(dst, cBlockHeader24);
     memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
-    MEM_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw);
-    return ZSTD_blockHeaderSize+srcSize;
+    return ZSTD_blockHeaderSize + srcSize;
 }
 
-
 static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE* const)dst;
@@ -1644,6 +1765,7 @@
     BYTE* const mlCodeTable = seqStorePtr->mlCode;
     U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     U32 u;
+    assert(nbSeq <= seqStorePtr->maxNbSeq);
     for (u=0; u<nbSeq; u++) {
         U32 const llv = sequences[u].litLength;
         U32 const mlv = sequences[u].matchLength;
@@ -2040,7 +2162,7 @@
 
 #endif
 
-size_t ZSTD_encodeSequences(
+static size_t ZSTD_encodeSequences(
             void* dst, size_t dstCapacity,
             FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
             FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
@@ -2232,13 +2354,6 @@
         if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
 
-    /* We check that dictionaries have offset codes available for the first
-     * block. After the first block, the offcode table might not have large
-     * enough codes to represent the offsets in the data.
-     */
-    if (nextEntropy->fse.offcode_repeatMode == FSE_repeat_valid)
-        nextEntropy->fse.offcode_repeatMode = FSE_repeat_check;
-
     return cSize;
 }
 
@@ -2305,12 +2420,18 @@
                                         const void* src, size_t srcSize)
 {
     ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+    size_t cSize;
     DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%zu, dictLimit=%u, nextToUpdate=%u)",
                 dstCapacity, ms->window.dictLimit, ms->nextToUpdate);
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+
+    /* Assert that we have correctly flushed the ctx params into the ms's copy */
+    ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
 
     if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
         ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.searchLength);
-        return 0;   /* don't even attempt compression below a certain srcSize */
+        cSize = 0;
+        goto out;  /* don't even attempt compression below a certain srcSize */
     }
     ZSTD_resetSeqStore(&(zc->seqStore));
     ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;   /* required for optimal parser to read stats from dictionary */
@@ -2343,7 +2464,6 @@
                 ZSTD_ldm_blockCompress(&zc->externSeqStore,
                                        ms, &zc->seqStore,
                                        zc->blockState.nextCBlock->rep,
-                                       &zc->appliedParams.cParams,
                                        src, srcSize);
             assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
         } else if (zc->appliedParams.ldmParams.enableLdm) {
@@ -2360,31 +2480,38 @@
                 ZSTD_ldm_blockCompress(&ldmSeqStore,
                                        ms, &zc->seqStore,
                                        zc->blockState.nextCBlock->rep,
-                                       &zc->appliedParams.cParams,
                                        src, srcSize);
             assert(ldmSeqStore.pos == ldmSeqStore.size);
         } else {   /* not long range mode */
             ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode);
-            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, &zc->appliedParams.cParams, src, srcSize);
+            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
         }
         {   const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
             ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
     }   }
 
     /* encode sequences and literals */
-    {   size_t const cSize = ZSTD_compressSequences(&zc->seqStore,
-                                &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
-                                &zc->appliedParams,
-                                dst, dstCapacity,
-                                srcSize, zc->entropyWorkspace, zc->bmi2);
-        if (ZSTD_isError(cSize) || cSize == 0) return cSize;
-        /* confirm repcodes and entropy tables */
-        {   ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
-            zc->blockState.prevCBlock = zc->blockState.nextCBlock;
-            zc->blockState.nextCBlock = tmp;
-        }
-        return cSize;
+    cSize = ZSTD_compressSequences(&zc->seqStore,
+            &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+            &zc->appliedParams,
+            dst, dstCapacity,
+            srcSize, zc->entropyWorkspace, zc->bmi2);
+
+out:
+    if (!ZSTD_isError(cSize) && cSize != 0) {
+        /* confirm repcodes and entropy tables when emitting a compressed block */
+        ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+        zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+        zc->blockState.nextCBlock = tmp;
     }
+    /* We check that dictionaries have offset codes available for the first
+     * block. After the first block, the offcode table might not have large
+     * enough codes to represent the offsets in the data.
+     */
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;
 }
 
 
@@ -2426,7 +2553,6 @@
             ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
             ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
             ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
-
             ZSTD_reduceIndex(cctx, correction);
             if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
             else ms->nextToUpdate -= correction;
@@ -2442,11 +2568,8 @@
             if (ZSTD_isError(cSize)) return cSize;
 
             if (cSize == 0) {  /* block is not compressible */
-                U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3);
-                if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
-                MEM_writeLE32(op, cBlockHeader24);   /* 4th byte will be overwritten */
-                memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
-                cSize = ZSTD_blockHeaderSize + blockSize;
+                cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+                if (ZSTD_isError(cSize)) return cSize;
             } else {
                 U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
                 MEM_writeLE24(op, cBlockHeader24);
@@ -2545,7 +2668,7 @@
                         const void* src, size_t srcSize,
                                U32 frame, U32 lastFrameChunk)
 {
-    ZSTD_matchState_t* ms = &cctx->blockState.matchState;
+    ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
     size_t fhSize = 0;
 
     DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
@@ -2566,8 +2689,25 @@
     if (!ZSTD_window_update(&ms->window, src, srcSize)) {
         ms->nextToUpdate = ms->window.dictLimit;
     }
-    if (cctx->appliedParams.ldmParams.enableLdm)
+    if (cctx->appliedParams.ldmParams.enableLdm) {
         ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
+    }
+
+    if (!frame) {
+        /* overflow check and correction for block mode */
+        if (ZSTD_window_needOverflowCorrection(ms->window, (const char*)src + srcSize)) {
+            U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy);
+            U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, 1 << cctx->appliedParams.cParams.windowLog, src);
+            ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+            ZSTD_reduceIndex(cctx, correction);
+            if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
+            else ms->nextToUpdate -= correction;
+            ms->loadedDictEnd = 0;
+            ms->dictMatchState = NULL;
+        }
+    }
 
     DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (U32)cctx->blockSize);
     {   size_t const cSize = frame ?
@@ -2609,6 +2749,7 @@
 {
     size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
     if (srcSize > blockSizeMax) return ERROR(srcSize_wrong);
+
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }
 
@@ -2622,34 +2763,36 @@
 {
     const BYTE* const ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
-    ZSTD_compressionParameters const* cParams = &params->cParams;
 
     ZSTD_window_update(&ms->window, src, srcSize);
     ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
 
+    /* Assert that the ms params match the params we're being given */
+    ZSTD_assertEqualCParams(params->cParams, ms->cParams);
+
     if (srcSize <= HASH_READ_SIZE) return 0;
 
     switch(params->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, cParams, iend, dtlm);
+        ZSTD_fillHashTable(ms, iend, dtlm);
         break;
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, cParams, iend, dtlm);
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
         break;
 
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
         if (srcSize >= HASH_READ_SIZE)
-            ZSTD_insertAndFindFirstIndex(ms, cParams, iend-HASH_READ_SIZE);
+            ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE);
         break;
 
     case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
     case ZSTD_btopt:
     case ZSTD_btultra:
         if (srcSize >= HASH_READ_SIZE)
-            ZSTD_updateTree(ms, cParams, iend-HASH_READ_SIZE, iend);
+            ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
         break;
 
     default:
@@ -2813,13 +2956,13 @@
 
 /*! ZSTD_compressBegin_internal() :
  * @return : 0, or an error code */
-size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
-                             const void* dict, size_t dictSize,
-                             ZSTD_dictContentType_e dictContentType,
-                             ZSTD_dictTableLoadMethod_e dtlm,
-                             const ZSTD_CDict* cdict,
-                             ZSTD_CCtx_params params, U64 pledgedSrcSize,
-                             ZSTD_buffered_policy_e zbuff)
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                                    ZSTD_buffered_policy_e zbuff)
 {
     DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params.cParams.windowLog);
     /* params are supposed to be fully validated at this point */
@@ -3073,7 +3216,7 @@
 {
     DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (U32)dictContentType);
     assert(!ZSTD_checkCParams(cParams));
-    cdict->cParams = cParams;
+    cdict->matchState.cParams = cParams;
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
         cdict->dictBuffer = NULL;
         cdict->dictContent = dictBuffer;
@@ -3227,7 +3370,7 @@
 ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
 {
     assert(cdict != NULL);
-    return cdict->cParams;
+    return cdict->matchState.cParams;
 }
 
 /* ZSTD_compressBegin_usingCDict_advanced() :
@@ -3332,9 +3475,11 @@
 static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
                     const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
                     const ZSTD_CDict* const cdict,
-                    ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize)
+                    ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_resetCStream_internal");
+    /* Finalize the compression parameters */
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
     /* params are supposed to be fully validated at this point */
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
@@ -3363,7 +3508,6 @@
     DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize);
     if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
     params.fParams.contentSizeFlag = 1;
-    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, 0);
     return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
 }
 
@@ -3376,6 +3520,7 @@
                     ZSTD_CCtx_params params, unsigned long long pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_initCStream_internal");
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
 
@@ -3442,25 +3587,21 @@
                 (U32)pledgedSrcSize, params.fParams.contentSizeFlag);
     CHECK_F( ZSTD_checkCParams(params.cParams) );
     if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */
-    {   ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-        return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize);
-    }
+    zcs->requestedParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, zcs->requestedParams, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
-    ZSTD_CCtx_params const cctxParams =
-            ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+    ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, zcs->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
 {
     U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;  /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
-    ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize);
+    ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, zcs->requestedParams, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
@@ -3701,6 +3842,7 @@
               || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
                 ZSTD_CCtx_reset(cctx);
             }
+            DEBUGLOG(5, "completed ZSTD_compress_generic delegating to ZSTDMT_compressStream_generic");
             return flushMin;
     }   }
 #endif
@@ -3756,6 +3898,7 @@
 
 #define ZSTD_MAX_CLEVEL     22
 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
 
 static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
 {   /* "default" - guarantees a monotonically increasing memory budget */
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index d31542c..43f7c14 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -140,6 +140,7 @@
     U32* chainTable;
     optState_t opt;         /* optimal parser state */
     const ZSTD_matchState_t *dictMatchState;
+    ZSTD_compressionParameters cParams;
 };
 
 typedef struct {
@@ -264,7 +265,7 @@
 
 typedef size_t (*ZSTD_blockCompressor) (
         ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode);
 
 
@@ -314,8 +315,10 @@
                pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode);
     }
 #endif
+    assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
     /* copy Literals */
-    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB);
+    assert(seqStorePtr->maxNbLit <= 128 KB);
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
     ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
     seqStorePtr->lit += litLength;
 
diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c
index 7fc11eb..7b9e18e 100644
--- a/lib/compress/zstd_double_fast.c
+++ b/lib/compress/zstd_double_fast.c
@@ -13,9 +13,9 @@
 
 
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              ZSTD_compressionParameters const* cParams,
                               void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashLarge = ms->hashTable;
     U32  const hBitsL = cParams->hashLog;
     U32  const mls = cParams->searchLength;
@@ -51,9 +51,10 @@
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_doubleFast_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        void const* src, size_t srcSize,
         U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
 {
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32* const hashLong = ms->hashTable;
     const U32 hBitsL = cParams->hashLog;
     U32* const hashSmall = ms->chainTable;
@@ -70,6 +71,9 @@
     U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams =
+                                     dictMode == ZSTD_dictMatchState ?
+                                     &dms->cParams : NULL;
     const U32* const dictHashLong  = dictMode == ZSTD_dictMatchState ?
                                      dms->hashTable : NULL;
     const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
@@ -85,6 +89,10 @@
     const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
                                      prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                      0;
+    const U32 dictHBitsL           = dictMode == ZSTD_dictMatchState ?
+                                     dictCParams->hashLog : hBitsL;
+    const U32 dictHBitsS           = dictMode == ZSTD_dictMatchState ?
+                                     dictCParams->chainLog : hBitsS;
     const U32 dictAndPrefixLength  = (U32)(ip - prefixLowest + dictEnd - dictStart);
 
     assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
@@ -109,6 +117,8 @@
         U32 offset;
         size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
         size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
         U32 const current = (U32)(ip-base);
         U32 const matchIndexL = hashLong[h2];
         U32 matchIndexS = hashSmall[h];
@@ -141,17 +151,17 @@
             goto _match_stored;
         }
 
-        /* check prefix long match */
-        if ( (matchIndexL > prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) {
-            mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
-            offset = (U32)(ip-matchLong);
-            while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
-            goto _match_found;
-        }
-
-        /* check dictMatchState long match */
-        if (dictMode == ZSTD_dictMatchState) {
-            U32 const dictMatchIndexL = dictHashLong[h2];
+        if (matchIndexL > prefixLowestIndex) {
+            /* check prefix long match */
+            if (MEM_read64(matchLong) == MEM_read64(ip)) {
+                mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+                offset = (U32)(ip-matchLong);
+                while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        } else if (dictMode == ZSTD_dictMatchState) {
+            /* check dictMatchState long match */
+            U32 const dictMatchIndexL = dictHashLong[dictHL];
             const BYTE* dictMatchL = dictBase + dictMatchIndexL;
             assert(dictMatchL < dictEnd);
 
@@ -163,14 +173,14 @@
             }
         }
 
-        /* check prefix short match */
-        if ( (matchIndexS > prefixLowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) {
-            goto _search_next_long;
-        }
-
-        /* check dictMatchState short match */
-        if (dictMode == ZSTD_dictMatchState) {
-            U32 const dictMatchIndexS = dictHashSmall[h];
+        if (matchIndexS > prefixLowestIndex) {
+            /* check prefix short match */
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+            }
+        } else if (dictMode == ZSTD_dictMatchState) {
+            /* check dictMatchState short match */
+            U32 const dictMatchIndexS = dictHashSmall[dictHS];
             match = dictBase + dictMatchIndexS;
             matchIndexS = dictMatchIndexS + dictIndexDelta;
 
@@ -186,22 +196,23 @@
 
         {
             size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
             U32 const matchIndexL3 = hashLong[hl3];
             const BYTE* matchL3 = base + matchIndexL3;
             hashLong[hl3] = current + 1;
 
             /* check prefix long +1 match */
-            if ( (matchIndexL3 > prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) {
-                mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
-                ip++;
-                offset = (U32)(ip-matchL3);
-                while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
-                goto _match_found;
-            }
-
-            /* check dict long +1 match */
-            if (dictMode == ZSTD_dictMatchState) {
-                U32 const dictMatchIndexL3 = dictHashLong[hl3];
+            if (matchIndexL3 > prefixLowestIndex) {
+                if (MEM_read64(matchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+                    ip++;
+                    offset = (U32)(ip-matchL3);
+                    while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            } else if (dictMode == ZSTD_dictMatchState) {
+                /* check dict long +1 match */
+                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
                 const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                 assert(dictMatchL3 < dictEnd);
                 if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
@@ -296,49 +307,50 @@
 
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    const U32 mls = cParams->searchLength;
+    const U32 mls = ms->cParams.searchLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4, ZSTD_noDict);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
     case 5 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5, ZSTD_noDict);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
     case 6 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6, ZSTD_noDict);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
     case 7 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7, ZSTD_noDict);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
     }
 }
 
 
 size_t ZSTD_compressBlock_doubleFast_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    const U32 mls = cParams->searchLength;
+    const U32 mls = ms->cParams.searchLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
     case 5 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
     case 6 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
     case 7 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
     }
 }
 
 
 static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        void const* src, size_t srcSize,
         U32 const mls /* template */)
 {
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32* const hashLong = ms->hashTable;
     U32  const hBitsL = cParams->hashLog;
     U32* const hashSmall = ms->chainTable;
@@ -469,19 +481,19 @@
 
 size_t ZSTD_compressBlock_doubleFast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const mls = cParams->searchLength;
+    U32 const mls = ms->cParams.searchLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
     case 5 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
     case 6 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
     case 7 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
     }
 }
diff --git a/lib/compress/zstd_double_fast.h b/lib/compress/zstd_double_fast.h
index c475021..4fa31ac 100644
--- a/lib/compress/zstd_double_fast.h
+++ b/lib/compress/zstd_double_fast.h
@@ -19,17 +19,16 @@
 #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
 
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              ZSTD_compressionParameters const* cParams,
                               void const* end, ZSTD_dictTableLoadMethod_e dtlm);
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_doubleFast_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_doubleFast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 
 #if defined (__cplusplus)
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c
index 37a7151..2477465 100644
--- a/lib/compress/zstd_fast.c
+++ b/lib/compress/zstd_fast.c
@@ -13,9 +13,9 @@
 
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        ZSTD_compressionParameters const* cParams,
                         void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
     U32  const hBits = cParams->hashLog;
     U32  const mls = cParams->searchLength;
@@ -45,10 +45,13 @@
 size_t ZSTD_compressBlock_fast_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
-        U32 const hlog, U32 stepSize, U32 const mls,
-        ZSTD_dictMode_e const dictMode)
+        U32 const mls, ZSTD_dictMode_e const dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
@@ -61,6 +64,9 @@
     U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams =
+                                     dictMode == ZSTD_dictMatchState ?
+                                     &dms->cParams : NULL;
     const U32* const dictHashTable = dictMode == ZSTD_dictMatchState ?
                                      dms->hashTable : NULL;
     const U32 dictStartIndex       = dictMode == ZSTD_dictMatchState ?
@@ -75,6 +81,8 @@
                                      prefixStartIndex - (U32)(dictEnd - dictBase) :
                                      0;
     const U32 dictAndPrefixLength  = (U32)(ip - prefixStart + dictEnd - dictStart);
+    const U32 dictHLog             = dictMode == ZSTD_dictMatchState ?
+                                     dictCParams->hashLog : hlog;
 
     assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
 
@@ -84,7 +92,6 @@
         || prefixStartIndex >= (U32)(dictEnd - dictBase));
 
     /* init */
-    stepSize += !stepSize;  /* support stepSize of 0 */
     ip += (dictAndPrefixLength == 0);
     if (dictMode == ZSTD_noDict) {
         U32 const maxRep = (U32)(ip - prefixStart);
@@ -124,10 +131,10 @@
             mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             ip++;
             ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
-        } else if ( (matchIndex <= prefixStartIndex)
-                 || (MEM_read32(match) != MEM_read32(ip)) ) {
+        } else if ( (matchIndex <= prefixStartIndex) ) {
             if (dictMode == ZSTD_dictMatchState) {
-                U32 const dictMatchIndex = dictHashTable[h];
+                size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
+                U32 const dictMatchIndex = dictHashTable[dictHash];
                 const BYTE* dictMatch = dictBase + dictMatchIndex;
                 if (dictMatchIndex <= dictStartIndex ||
                     MEM_read32(dictMatch) != MEM_read32(ip)) {
@@ -151,6 +158,11 @@
                 ip += ((ip-anchor) >> kSearchStrength) + stepSize;
                 continue;
             }
+        } else if (MEM_read32(match) != MEM_read32(ip)) {
+            /* it's not a match, and we're not going to check the dictionary */
+            assert(stepSize >= 1);
+            ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+            continue;
         } else {
             /* found a regular match */
             U32 const offset = (U32)(ip-match);
@@ -168,6 +180,7 @@
 
         if (ip <= ilimit) {
             /* Fill Table */
+            assert(base+current+2 > istart);  /* check base overflow */
             hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;  /* here because current+2 could be > iend-8 */
             hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
 
@@ -219,55 +232,56 @@
 
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const hlog = cParams->hashLog;
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32 const mls = cParams->searchLength;
-    U32 const stepSize = cParams->targetLength;
     assert(ms->dictMatchState == NULL);
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4, ZSTD_noDict);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
     case 5 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5, ZSTD_noDict);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
     case 6 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6, ZSTD_noDict);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
     case 7 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7, ZSTD_noDict);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
     }
 }
 
 size_t ZSTD_compressBlock_fast_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const hlog = cParams->hashLog;
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32 const mls = cParams->searchLength;
-    U32 const stepSize = cParams->targetLength;
     assert(ms->dictMatchState != NULL);
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
     case 5 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
     case 6 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
     case 7 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7, ZSTD_dictMatchState);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
     }
 }
 
 
 static size_t ZSTD_compressBlock_fast_extDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        void const* src, size_t srcSize,
-        U32 const hlog, U32 stepSize, U32 const mls)
+        void const* src, size_t srcSize, U32 const mls)
 {
-    U32* hashTable = ms->hashTable;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const istart = (const BYTE*)src;
@@ -282,8 +296,6 @@
     const BYTE* const ilimit = iend - 8;
     U32 offset_1=rep[0], offset_2=rep[1];
 
-    stepSize += !stepSize;   /* support stepSize == 0 */
-
     /* Search Loop */
     while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
         const size_t h = ZSTD_hashPtr(ip, hlog, mls);
@@ -360,21 +372,20 @@
 
 size_t ZSTD_compressBlock_fast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const hlog = cParams->hashLog;
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32 const mls = cParams->searchLength;
-    U32 const stepSize = cParams->targetLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
     case 5 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
     case 6 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
     case 7 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
     }
 }
diff --git a/lib/compress/zstd_fast.h b/lib/compress/zstd_fast.h
index 7e7435f..b74a88c 100644
--- a/lib/compress/zstd_fast.h
+++ b/lib/compress/zstd_fast.h
@@ -19,17 +19,16 @@
 #include "zstd_compress_internal.h"
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        ZSTD_compressionParameters const* cParams,
                         void const* end, ZSTD_dictTableLoadMethod_e dtlm);
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_fast_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_fast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 #if defined (__cplusplus)
 }
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index bfe9449..4ca69e3 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -16,11 +16,12 @@
 *  Binary Tree search
 ***************************************/
 
-void ZSTD_updateDUBT(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+static void
+ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                 const BYTE* ip, const BYTE* iend,
                 U32 mls)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
     U32  const hashLog = cParams->hashLog;
 
@@ -59,11 +60,12 @@
  *  sort one already inserted but unsorted position
  *  assumption : current >= btlow == (current - btmask)
  *  doesn't fail */
-static void ZSTD_insertDUBT1(
-                 ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+static void
+ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
                  U32 current, const BYTE* inputEnd,
                  U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32*   const bt = ms->chainTable;
     U32    const btLog  = cParams->chainLog - 1;
     U32    const btMask = (1 << btLog) - 1;
@@ -140,17 +142,19 @@
 }
 
 
-static size_t ZSTD_DUBT_findBetterDictMatch (
-        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+static size_t
+ZSTD_DUBT_findBetterDictMatch (
+        ZSTD_matchState_t* ms,
         const BYTE* const ip, const BYTE* const iend,
         size_t* offsetPtr,
-        size_t bestLength,
         U32 nbCompares,
         U32 const mls,
-        const ZSTD_dictMode_e dictMode) {
+        const ZSTD_dictMode_e dictMode)
+{
     const ZSTD_matchState_t * const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
     const U32 * const dictHashTable = dms->hashTable;
-    U32         const hashLog = cParams->hashLog;
+    U32         const hashLog = dmsCParams->hashLog;
     size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
     U32               dictMatchIndex = dictHashTable[h];
 
@@ -164,11 +168,11 @@
     U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
 
     U32*        const dictBt = dms->chainTable;
-    U32         const btLog  = cParams->chainLog - 1;
+    U32         const btLog  = dmsCParams->chainLog - 1;
     U32         const btMask = (1 << btLog) - 1;
     U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
 
-    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    size_t commonLengthSmaller=0, commonLengthLarger=0, bestLength=0;
     U32 matchEndIdx = current+8+1;
 
     (void)dictMode;
@@ -187,15 +191,16 @@
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
-                DEBUGLOG(9, "ZSTD_DUBT_findBestDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
                     current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
                 bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
             }
-            if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
             }
         }
 
+        DEBUGLOG(9, "matchLength:%6u, match:%p, prefixStart:%p, ip:%p", (U32)matchLength, (const void*)match, (const void*)prefixStart, (const void*)ip);
         if (match[matchLength] < ip[matchLength]) {
             if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
             commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
@@ -210,7 +215,7 @@
 
     if (bestLength >= MINMATCH) {
         U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
-        DEBUGLOG(8, "ZSTD_DUBT_findBestDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                     current, (U32)bestLength, (U32)*offsetPtr, mIndex);
     }
     return bestLength;
@@ -218,13 +223,14 @@
 }
 
 
-static size_t ZSTD_DUBT_findBestMatch (
-                            ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                            const BYTE* const ip, const BYTE* const iend,
-                            size_t* offsetPtr,
-                            U32 const mls,
-                            const ZSTD_dictMode_e dictMode)
+static size_t
+ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iend,
+                        size_t* offsetPtr,
+                        U32 const mls,
+                        const ZSTD_dictMode_e dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32*   const hashTable = ms->hashTable;
     U32    const hashLog = cParams->hashLog;
     size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
@@ -275,7 +281,7 @@
     while (matchIndex) {  /* will end on matchIndex == 0 */
         U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
         U32 const nextCandidateIdx = *nextCandidateIdxPtr;
-        ZSTD_insertDUBT1(ms, cParams, matchIndex, iend,
+        ZSTD_insertDUBT1(ms, matchIndex, iend,
                          nbCandidates, unsortLimit, dictMode);
         matchIndex = nextCandidateIdx;
         nbCandidates++;
@@ -340,7 +346,12 @@
         *smallerPtr = *largerPtr = 0;
 
         if (dictMode == ZSTD_dictMatchState && nbCompares) {
-            bestLength = ZSTD_DUBT_findBetterDictMatch(ms, cParams, ip, iend, offsetPtr, bestLength, nbCompares, mls, dictMode);
+            /* The dictionary search starts from a best length of 0 : run it on a
+             * private offset, and only adopt its result when it is strictly longer
+             * than the match already found in the working window. */
+            size_t dictOffset = *offsetPtr;
+            size_t const dictLength = ZSTD_DUBT_findBetterDictMatch(ms, ip, iend, &dictOffset, nbCompares, mls, dictMode);
+            if (dictLength > bestLength) bestLength = dictLength, *offsetPtr = dictOffset;
         }
 
         assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
@@ -356,64 +362,64 @@
 
 
 /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
-FORCE_INLINE_TEMPLATE size_t ZSTD_BtFindBestMatch (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* const ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 mls /* template */,
-                        const ZSTD_dictMode_e dictMode)
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+                const BYTE* const ip, const BYTE* const iLimit,
+                      size_t* offsetPtr,
+                const U32 mls /* template */,
+                const ZSTD_dictMode_e dictMode)
 {
     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, dictMode);
+    ZSTD_updateDUBT(ms, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
 }
 
 
-static size_t ZSTD_BtFindBestMatch_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* ip, const BYTE* const iLimit,
-                        size_t* offsetPtr)
+static size_t
+ZSTD_BtFindBestMatch_selectMLS (  ZSTD_matchState_t* ms,
+                            const BYTE* ip, const BYTE* const iLimit,
+                                  size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
-    case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
     case 7 :
-    case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
     }
 }
 
 
 static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
-    case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
     case 7 :
-    case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
     }
 }
 
 
 static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
-    case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
     case 7 :
-    case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
     }
 }
 
@@ -427,7 +433,8 @@
 /* Update chains up to ip (excluded)
    Assumption : always within prefix (i.e. not within extDict) */
 static U32 ZSTD_insertAndFindFirstIndex_internal(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
+                        const ZSTD_compressionParameters* const cParams,
                         const BYTE* ip, U32 const mls)
 {
     U32* const hashTable  = ms->hashTable;
@@ -449,22 +456,21 @@
     return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
 }
 
-U32 ZSTD_insertAndFindFirstIndex(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* ip)
-{
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, cParams->searchLength);
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.searchLength);
 }
 
 
 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_HcFindBestMatch_generic (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iLimit,
                         size_t* offsetPtr,
                         const U32 mls, const ZSTD_dictMode_e dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const chainTable = ms->chainTable;
     const U32 chainSize = (1 << cParams->chainLog);
     const U32 chainMask = chainSize-1;
@@ -509,14 +515,16 @@
     if (dictMode == ZSTD_dictMatchState) {
         const ZSTD_matchState_t* const dms = ms->dictMatchState;
         const U32* const dmsChainTable = dms->chainTable;
+        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
+        const U32 dmsChainMask         = dmsChainSize - 1;
         const U32 dmsLowestIndex       = dms->window.dictLimit;
         const BYTE* const dmsBase      = dms->window.base;
         const BYTE* const dmsEnd       = dms->window.nextSrc;
         const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
         const U32 dmsIndexDelta        = dictLimit - dmsSize;
-        const U32 dmsMinChain = dmsSize > chainSize ? dmsSize - chainSize : 0;
+        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
 
-        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, cParams->hashLog, mls)];
+        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
 
         for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
             size_t currentMl=0;
@@ -533,7 +541,7 @@
             }
 
             if (matchIndex <= dmsMinChain) break;
-            matchIndex = dmsChainTable[matchIndex & chainMask];
+            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
         }
     }
 
@@ -542,49 +550,49 @@
 
 
 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
-    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
     case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
     }
 }
 
 
 static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
-    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
     case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
     }
 }
 
 
 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
-    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
     case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
     }
 }
 
@@ -596,7 +604,6 @@
 size_t ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
                         U32 rep[ZSTD_REP_NUM],
-                        ZSTD_compressionParameters const* cParams,
                         const void* src, size_t srcSize,
                         const U32 searchMethod, const U32 depth,
                         ZSTD_dictMode_e const dictMode)
@@ -611,7 +618,7 @@
     const BYTE* const prefixLowest = base + prefixLowestIndex;
 
     typedef size_t (*searchMax_f)(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
     searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
         (searchMethod ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
@@ -632,8 +639,6 @@
                                      0;
     const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
 
-    (void)dictMode;
-
     /* init */
     ip += (dictAndPrefixLength == 0);
     ms->nextToUpdate3 = ms->nextToUpdate;
@@ -676,8 +681,8 @@
         }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 99999999;
-            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+        {   size_t offsetFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
             if (ml2 > matchLength)
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
@@ -714,8 +719,8 @@
                         matchLength = mlRep, offset = 0, start = ip;
                 }
             }
-            {   size_t offset2=99999999;
-                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+            {   size_t offset2=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -749,8 +754,8 @@
                             matchLength = mlRep, offset = 0, start = ip;
                     }
                 }
-                {   size_t offset2=99999999;
-                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                {   size_t offset2=999999999;
+                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -833,58 +838,58 @@
 
 size_t ZSTD_compressBlock_btlazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_lazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_lazy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_btlazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_lazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_lazy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_greedy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_dictMatchState);
 }
 
 
@@ -892,7 +897,6 @@
 size_t ZSTD_compressBlock_lazy_extDict_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
                         U32 rep[ZSTD_REP_NUM],
-                        ZSTD_compressionParameters const* cParams,
                         const void* src, size_t srcSize,
                         const U32 searchMethod, const U32 depth)
 {
@@ -910,7 +914,7 @@
     const BYTE* const dictStart  = dictBase + lowestIndex;
 
     typedef size_t (*searchMax_f)(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
     searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
 
@@ -940,8 +944,8 @@
         }   }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 99999999;
-            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+        {   size_t offsetFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
             if (ml2 > matchLength)
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
@@ -973,8 +977,8 @@
             }   }
 
             /* search match, depth 1 */
-            {   size_t offset2=99999999;
-                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+            {   size_t offset2=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -1003,8 +1007,8 @@
                 }   }
 
                 /* search match, depth 2 */
-                {   size_t offset2=99999999;
-                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                {   size_t offset2=999999999;
+                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -1060,31 +1064,31 @@
 
 size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 0);
 }
 
 size_t ZSTD_compressBlock_lazy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 1);
 }
 
 size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 2);
 }
 
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 1, 2);
 }
diff --git a/lib/compress/zstd_lazy.h b/lib/compress/zstd_lazy.h
index c299de6..ef85a6d 100644
--- a/lib/compress/zstd_lazy.h
+++ b/lib/compress/zstd_lazy.h
@@ -17,50 +17,48 @@
 
 #include "zstd_compress_internal.h"
 
-U32 ZSTD_insertAndFindFirstIndex(
-        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-        const BYTE* ip);
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
 
 void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK */
 
 size_t ZSTD_compressBlock_btlazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_btlazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_greedy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 #if defined (__cplusplus)
 }
diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c
index 215f55c..6238dde 100644
--- a/lib/compress/zstd_ldm.c
+++ b/lib/compress/zstd_ldm.c
@@ -218,19 +218,18 @@
  *  The tables for the other strategies are filled within their
  *  block compressors. */
 static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
-                                      ZSTD_compressionParameters const* cParams,
                                       void const* end)
 {
     const BYTE* const iend = (const BYTE*)end;
 
-    switch(cParams->strategy)
+    switch(ms->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, cParams, iend, ZSTD_dtlm_fast);
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
         break;
 
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, cParams, iend, ZSTD_dtlm_fast);
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
         break;
 
     case ZSTD_greedy:
@@ -591,8 +590,9 @@
 
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
     ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+    void const* src, size_t srcSize)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     unsigned const minMatch = cParams->searchLength;
     ZSTD_blockCompressor const blockCompressor =
         ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms));
@@ -620,13 +620,12 @@
 
         /* Fill tables for block compressor */
         ZSTD_ldm_limitTableUpdate(ms, ip);
-        ZSTD_ldm_fillFastTables(ms, cParams, ip);
+        ZSTD_ldm_fillFastTables(ms, ip);
         /* Run the block compressor */
         DEBUGLOG(5, "calling block compressor on segment of size %u", sequence.litLength);
         {
             size_t const newLitLength =
-                blockCompressor(ms, seqStore, rep, cParams, ip,
-                                sequence.litLength);
+                blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
             ip += sequence.litLength;
             /* Update the repcodes */
             for (i = ZSTD_REP_NUM - 1; i > 0; i--)
@@ -641,8 +640,7 @@
     }
     /* Fill the tables for the block compressor */
     ZSTD_ldm_limitTableUpdate(ms, ip);
-    ZSTD_ldm_fillFastTables(ms, cParams, ip);
+    ZSTD_ldm_fillFastTables(ms, ip);
     /* Compress the last literals */
-    return blockCompressor(ms, seqStore, rep, cParams,
-                           ip, iend - ip);
+    return blockCompressor(ms, seqStore, rep, ip, iend - ip);
 }
diff --git a/lib/compress/zstd_ldm.h b/lib/compress/zstd_ldm.h
index 96588ad..21fba4d 100644
--- a/lib/compress/zstd_ldm.h
+++ b/lib/compress/zstd_ldm.h
@@ -61,7 +61,6 @@
  */
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-            ZSTD_compressionParameters const* cParams,
             void const* src, size_t srcSize);
 
 /**
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 476cdc1..8af69a9 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -360,10 +360,11 @@
  *  ip : assumed <= iend-8 .
  * @return : nb of positions added */
 static U32 ZSTD_insertBt1(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iend,
                 U32 const mls, const int extDict)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32*   const hashTable = ms->hashTable;
     U32    const hashLog = cParams->hashLog;
     size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
@@ -471,7 +472,7 @@
 
 FORCE_INLINE_TEMPLATE
 void ZSTD_updateTree_internal(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iend,
                 const U32 mls, const ZSTD_dictMode_e dictMode)
 {
@@ -482,24 +483,22 @@
                 idx, target, dictMode);
 
     while(idx < target)
-        idx += ZSTD_insertBt1(ms, cParams, base+idx, iend, mls, dictMode == ZSTD_extDict);
+        idx += ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict);
     ms->nextToUpdate = target;
 }
 
-void ZSTD_updateTree(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                const BYTE* ip, const BYTE* iend)
-{
-    ZSTD_updateTree_internal(ms, cParams, ip, iend, cParams->searchLength, ZSTD_noDict);
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+    ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.searchLength, ZSTD_noDict);
 }
 
 FORCE_INLINE_TEMPLATE
 U32 ZSTD_insertBtAndGetAllMatches (
-                    ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                    ZSTD_matchState_t* ms,
                     const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
                     U32 rep[ZSTD_REP_NUM], U32 const ll0,
                     ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
     const BYTE* const base = ms->window.base;
     U32 const current = (U32)(ip-base);
@@ -527,12 +526,17 @@
     U32 nbCompares = 1U << cParams->searchLog;
 
     const ZSTD_matchState_t* dms    = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+    const ZSTD_compressionParameters* const dmsCParams =
+                                      dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
     const BYTE* const dmsBase       = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
     const BYTE* const dmsEnd        = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
     U32         const dmsHighLimit  = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
     U32         const dmsLowLimit   = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
     U32         const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
-    U32         const dmsBtLow      = dictMode == ZSTD_dictMatchState && btMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - btMask : dmsLowLimit;
+    U32         const dmsHashLog    = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog;
+    U32         const dmsBtLog      = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog;
+    U32         const dmsBtMask     = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0;
+    U32         const dmsBtLow      = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit;
 
     size_t bestLength = lengthToBeat-1;
     DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current);
@@ -667,11 +671,12 @@
     *smallerPtr = *largerPtr = 0;
 
     if (dictMode == ZSTD_dictMatchState && nbCompares) {
-        U32 dictMatchIndex = dms->hashTable[h];
+        size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
+        U32 dictMatchIndex = dms->hashTable[dmsH];
         const U32* const dmsBt = dms->chainTable;
         commonLengthSmaller = commonLengthLarger = 0;
         while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) {
-            const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & btMask);
+            const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask);
             size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
             const BYTE* match = dmsBase + dictMatchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart);
@@ -713,23 +718,24 @@
 
 
 FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
                         U32 rep[ZSTD_REP_NUM], U32 const ll0,
                         ZSTD_match_t* matches, U32 const lengthToBeat)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const matchLengthSearch = cParams->searchLength;
     DEBUGLOG(8, "ZSTD_BtGetAllMatches");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, dictMode);
+    ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode);
     switch(matchLengthSearch)
     {
-    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 3);
+    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 3);
     default :
-    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 4);
-    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 5);
+    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 4);
+    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 5);
     case 7 :
-    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 6);
+    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 6);
     }
 }
 
@@ -741,7 +747,7 @@
     U32 rep[3];
 } repcodes_t;
 
-repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
+static repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
 {
     repcodes_t newReps;
     if (offset >= ZSTD_REP_NUM) {  /* full offset */
@@ -772,7 +778,6 @@
 ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                                seqStore_t* seqStore,
                                U32 rep[ZSTD_REP_NUM],
-                               const ZSTD_compressionParameters* cParams,
                                const void* src, size_t srcSize,
                                const int optLevel, const ZSTD_dictMode_e dictMode)
 {
@@ -784,6 +789,7 @@
     const BYTE* const ilimit = iend - 8;
     const BYTE* const base = ms->window.base;
     const BYTE* const prefixStart = base + ms->window.dictLimit;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
 
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
     U32 const minMatch = (cParams->searchLength == 3) ? 3 : 4;
@@ -806,7 +812,7 @@
         /* find first match */
         {   U32 const litlen = (U32)(ip - anchor);
             U32 const ll0 = !litlen;
-            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, dictMode, rep, ll0, matches, minMatch);
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, ip, iend, dictMode, rep, ll0, matches, minMatch);
             if (!nbMatches) { ip++; continue; }
 
             /* initialize opt[0] */
@@ -903,7 +909,7 @@
                 U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
                 U32 const previousPrice = opt[cur].price;
                 U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
-                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, dictMode, opt[cur].rep, ll0, matches, minMatch);
+                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, inr, iend, dictMode, opt[cur].rep, ll0, matches, minMatch);
                 U32 matchNb;
                 if (!nbMatches) {
                     DEBUGLOG(7, "rPos:%u : no match found", cur);
@@ -970,7 +976,7 @@
             U32 seqPos = cur;
 
             DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
-                        last_pos, cur);
+                        last_pos, cur); (void)last_pos;
             assert(storeEnd < ZSTD_OPT_NUM);
             DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
                         storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
@@ -1033,10 +1039,10 @@
 
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock_btopt");
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
 }
 
 
@@ -1064,7 +1070,7 @@
 
 size_t ZSTD_compressBlock_btultra(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
 #if 0
@@ -1082,7 +1088,7 @@
         assert(ms->nextToUpdate >= ms->window.dictLimit
             && ms->nextToUpdate <= ms->window.dictLimit + 1);
         memcpy(tmpRep, rep, sizeof(tmpRep));
-        ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
+        ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
         ZSTD_resetSeqStore(seqStore);
         /* invalidate first scan from history */
         ms->window.base -= srcSize;
@@ -1094,33 +1100,33 @@
         ZSTD_upscaleStats(&ms->opt);
     }
 #endif
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_btopt_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_btultra_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
 }
 
 size_t ZSTD_compressBlock_btultra_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
 }
diff --git a/lib/compress/zstd_opt.h b/lib/compress/zstd_opt.h
index 63dbe79..eeadb60 100644
--- a/lib/compress/zstd_opt.h
+++ b/lib/compress/zstd_opt.h
@@ -17,30 +17,29 @@
 
 #include "zstd_compress_internal.h"
 
-void ZSTD_updateTree(
-        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-        const BYTE* ip, const BYTE* iend);  /* used in ZSTD_loadDictionaryContent() */
+/* used in ZSTD_loadDictionaryContent() */
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
 
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_btopt_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 #if defined (__cplusplus)
 }
diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index 6daedca..f4aba1d 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -37,7 +37,9 @@
 #define ZSTD_RESIZE_SEQPOOL 0
 
 /* ======   Debug   ====== */
-#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) && !defined(_MSC_VER)
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \
+    && !defined(_MSC_VER) \
+    && !defined(__MINGW32__)
 
 #  include <stdio.h>
 #  include <unistd.h>
@@ -247,8 +249,8 @@
 /* store buffer for later re-use, up to pool capacity */
 static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
 {
-    if (buf.start == NULL) return;   /* compatible with release on NULL */
     DEBUGLOG(5, "ZSTDMT_releaseBuffer");
+    if (buf.start == NULL) return;   /* compatible with release on NULL */
     ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
     if (bufPool->nbBuffers < bufPool->totalBuffers) {
         bufPool->bTable[bufPool->nbBuffers++] = buf;  /* stored for later use */
@@ -318,7 +320,8 @@
 
 static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
 {
-    ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    if (seqPool == NULL) return NULL;
     ZSTDMT_setNbSeq(seqPool, 0);
     return seqPool;
 }
@@ -539,6 +542,7 @@
     /* Wait for our turn */
     ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
     while (serialState->nextJobID < jobID) {
+        DEBUGLOG(5, "wait for serialState->cond");
         ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
     }
     /* A future job may error and skip our job */
@@ -628,32 +632,32 @@
     unsigned frameChecksumNeeded;        /* used only by mtctx */
 } ZSTDMT_jobDescription;
 
+#define JOB_ERROR(e) {                          \
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
+    job->cSize = e;                             \
+    ZSTD_pthread_mutex_unlock(&job->job_mutex); \
+    goto _endJob;                               \
+}
+
 /* ZSTDMT_compressionJob() is a POOL_function type */
-void ZSTDMT_compressionJob(void* jobDescription)
+static void ZSTDMT_compressionJob(void* jobDescription)
 {
     ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
     ZSTD_CCtx_params jobParams = job->params;   /* do not modify job->params ! copy it, modify the copy */
     ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
     rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool);
     buffer_t dstBuff = job->dstBuff;
+    size_t lastCBlockSize = 0;
 
     /* ressources */
-    if (cctx==NULL) {
-        job->cSize = ERROR(memory_allocation);
-        goto _endJob;
-    }
+    if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation));
     if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
         dstBuff = ZSTDMT_getBuffer(job->bufPool);
-        if (dstBuff.start==NULL) {
-            job->cSize = ERROR(memory_allocation);
-            goto _endJob;
-        }
+        if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation));
         job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
     }
-    if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL) {
-        job->cSize = ERROR(memory_allocation);
-        goto _endJob;
-    }
+    if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL)
+        JOB_ERROR(ERROR(memory_allocation));
 
     /* Don't compute the checksum for chunks, since we compute it externally,
      * but write it in the header.
@@ -667,30 +671,26 @@
     if (job->cdict) {
         size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, jobParams, job->fullFrameSize);
         assert(job->firstJob);  /* only allowed for first job */
-        if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
+        if (ZSTD_isError(initError)) JOB_ERROR(initError);
     } else {  /* srcStart points at reloaded section */
         U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
         {   size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob);
-            if (ZSTD_isError(forceWindowError)) {
-                job->cSize = forceWindowError;
-                goto _endJob;
-        }   }
+            if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError);
+        }
         {   size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
                                         job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
                                         ZSTD_dtlm_fast,
                                         NULL, /*cdict*/
                                         jobParams, pledgedSrcSize);
-            if (ZSTD_isError(initError)) {
-                job->cSize = initError;
-                goto _endJob;
-    }   }   }
+            if (ZSTD_isError(initError)) JOB_ERROR(initError);
+    }   }
 
     /* Perform serial step as early as possible, but after CCtx initialization */
     ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
 
     if (!job->firstJob) {  /* flush and overwrite frame header when it's not first job */
         size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
-        if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; }
+        if (ZSTD_isError(hSize)) JOB_ERROR(hSize);
         DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
         ZSTD_invalidateRepCodes(cctx);
     }
@@ -708,7 +708,7 @@
         assert(job->cSize == 0);
         for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
             size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize);
-            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
             ip += chunkSize;
             op += cSize; assert(op < oend);
             /* stats */
@@ -721,18 +721,16 @@
             ZSTD_pthread_mutex_unlock(&job->job_mutex);
         }
         /* last block */
-        assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
+        assert(chunkSize > 0);
+        assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
         if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
             size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
             size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
             size_t const cSize = (job->lastJob) ?
                  ZSTD_compressEnd     (cctx, op, oend-op, ip, lastBlockSize) :
                  ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
-            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
-            /* stats */
-            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
-            job->cSize += cSize;
-            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
+            lastCBlockSize = cSize;
     }   }
 
 _endJob:
@@ -745,7 +743,9 @@
     ZSTDMT_releaseCCtx(job->cctxPool, cctx);
     /* report */
     ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
-    job->consumed = job->src.size;
+    if (ZSTD_isError(job->cSize)) assert(lastCBlockSize == 0);
+    job->cSize += lastCBlockSize;
+    job->consumed = job->src.size;  /* when job->consumed == job->src.size , compression job is presumed completed */
     ZSTD_pthread_cond_signal(&job->job_cond);
     ZSTD_pthread_mutex_unlock(&job->job_mutex);
 }
@@ -930,7 +930,7 @@
         unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask;
         ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
         while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
-            DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);   /* we want to block when waiting for data to flush */
+            DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);   /* we want to block when waiting for data to flush */
             ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
         }
         ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
@@ -1055,7 +1055,7 @@
 
 
 /*! ZSTDMT_updateCParams_whileCompressing() :
- *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  Updates a selected set of compression parameters, remaining compatible with currently active frame.
  *  New parameters will be applied to next compression job. */
 void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams)
 {
@@ -1073,26 +1073,33 @@
 /* ZSTDMT_getFrameProgression():
  * tells how much data has been consumed (input) and produced (output) for current frame.
  * able to count progression inside worker threads.
- * Note : mutex will be acquired during statistics collection. */
+ * Note : mutex will be acquired during statistics collection inside workers. */
 ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
 {
     ZSTD_frameProgression fps;
-    DEBUGLOG(6, "ZSTDMT_getFrameProgression");
-    fps.consumed = mtctx->consumed;
-    fps.produced = mtctx->produced;
+    DEBUGLOG(5, "ZSTDMT_getFrameProgression");
     fps.ingested = mtctx->consumed + mtctx->inBuff.filled;
+    fps.consumed = mtctx->consumed;
+    fps.produced = fps.flushed = mtctx->produced;
+    fps.currentJobID = mtctx->nextJobID;
+    fps.nbActiveWorkers = 0;
     {   unsigned jobNb;
         unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);
         DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
                     mtctx->doneJobID, lastJobNb, mtctx->jobReady)
         for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) {
             unsigned const wJobID = jobNb & mtctx->jobIDMask;
-            ZSTD_pthread_mutex_lock(&mtctx->jobs[wJobID].job_mutex);
-            {   size_t const cResult = mtctx->jobs[wJobID].cSize;
+            ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID];
+            ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
+            {   size_t const cResult = jobPtr->cSize;
                 size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
-                fps.consumed += mtctx->jobs[wJobID].consumed;
-                fps.ingested += mtctx->jobs[wJobID].src.size;
+                size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
+                assert(flushed <= produced);
+                fps.ingested += jobPtr->src.size;
+                fps.consumed += jobPtr->consumed;
                 fps.produced += produced;
+                fps.flushed  += flushed;
+                fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size);
             }
             ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
         }
@@ -1101,6 +1108,34 @@
 }
 
 
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx)
+{   /* @return : nb of bytes already produced by the oldest not-fully-flushed job (doneJobID) but not yet flushed; 0 if no job is active or if that job is in error */
+    size_t toFlush;   /* result, computed while holding the job's mutex */
+    unsigned const jobID = mtctx->doneJobID;
+    assert(jobID <= mtctx->nextJobID);
+    if (jobID == mtctx->nextJobID) return 0;   /* no active job => nothing to flush */
+
+    /* look into oldest non-fully-flushed job */
+    {   unsigned const wJobID = jobID & mtctx->jobIDMask;   /* masked job number, used as index into mtctx->jobs[] */
+        ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID];
+        ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);   /* job fields are read under the job's mutex */
+        {   size_t const cResult = jobPtr->cSize;   /* either a size or an error code, hence the ZSTD_isError() checks below */
+            size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
+            size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
+            assert(flushed <= produced);
+            toFlush = produced - flushed;
+            if (toFlush==0 && (jobPtr->consumed >= jobPtr->src.size)) {
+                /* invariant check : doneJobID is not fully flushed, so with toFlush==0 it must still have input left to compress; a fully-consumed job reaching this branch is a logic error, which the assert below flags */
+                assert(jobPtr->consumed < jobPtr->src.size);
+            }
+        }
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);   /* same mutex as jobPtr->job_mutex */
+    }
+
+    return toFlush;
+}
+
+
 /* ------------------------------------------ */
 /* =====   Multi-threaded compression   ===== */
 /* ------------------------------------------ */
@@ -1495,7 +1530,7 @@
         mtctx->jobs[jobID].jobID = mtctx->nextJobID;
         mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0);
         mtctx->jobs[jobID].lastJob = endFrame;
-        mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag;
+        mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID>0);
         mtctx->jobs[jobID].dstFlushed = 0;
 
         /* Update the round buffer pos and clear the input buffer to be reset */
@@ -1543,6 +1578,8 @@
 
 
 /*! ZSTDMT_flushProduced() :
+ *  flush whatever data has been produced but not yet flushed in current job.
+ *  move to next job if current one is fully flushed.
  * `output` : `pos` will be updated with amount of data flushed .
  * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush .
  * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */
@@ -1571,7 +1608,7 @@
     /* try to flush something */
     {   size_t cSize = mtctx->jobs[wJobID].cSize;                  /* shared */
         size_t const srcConsumed = mtctx->jobs[wJobID].consumed;   /* shared */
-        size_t const srcSize = mtctx->jobs[wJobID].src.size;        /* read-only, could be done after mutex lock, but no-declaration-after-statement */
+        size_t const srcSize = mtctx->jobs[wJobID].src.size;       /* read-only, could be done after mutex lock, but no-declaration-after-statement */
         ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
         if (ZSTD_isError(cSize)) {
             DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
@@ -1591,6 +1628,7 @@
             mtctx->jobs[wJobID].cSize += 4;  /* can write this shared value, as worker is no longer active */
             mtctx->jobs[wJobID].frameChecksumNeeded = 0;
         }
+
         if (cSize > 0) {   /* compression is ongoing or completed */
             size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos);
             DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)",
@@ -1604,11 +1642,12 @@
             output->pos += toFlush;
             mtctx->jobs[wJobID].dstFlushed += toFlush;  /* can write : this value is only used by mtctx */
 
-            if ( (srcConsumed == srcSize)    /* job completed */
+            if ( (srcConsumed == srcSize)    /* job is completed */
               && (mtctx->jobs[wJobID].dstFlushed == cSize) ) {   /* output buffer fully flushed => free this job position */
                 DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
                         mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
                 ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
+                DEBUGLOG(5, "dstBuffer released");
                 mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
                 mtctx->jobs[wJobID].cSize = 0;   /* ensure this job slot is considered "not started" in future check */
                 mtctx->consumed += srcSize;
@@ -1685,6 +1724,7 @@
     range_t extDict;
     range_t prefix;
 
+    DEBUGLOG(5, "ZSTDMT_doesOverlapWindow");
     extDict.start = window.dictBase + window.lowLimit;
     extDict.size = window.dictLimit - window.lowLimit;
 
@@ -1705,12 +1745,13 @@
 {
     if (mtctx->params.ldmParams.enableLdm) {
         ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
+        DEBUGLOG(5, "ZSTDMT_waitForLdmComplete");
         DEBUGLOG(5, "source  [0x%zx, 0x%zx)",
                     (size_t)buffer.start,
                     (size_t)buffer.start + buffer.capacity);
         ZSTD_PTHREAD_MUTEX_LOCK(mutex);
         while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) {
-            DEBUGLOG(6, "Waiting for LDM to finish...");
+            DEBUGLOG(5, "Waiting for LDM to finish...");
             ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex);
         }
         DEBUGLOG(6, "Done waiting for LDM to finish");
@@ -1730,6 +1771,7 @@
     size_t const target = mtctx->targetSectionSize;
     buffer_t buffer;
 
+    DEBUGLOG(5, "ZSTDMT_tryGetInputRange");
     assert(mtctx->inBuff.buffer.start == NULL);
     assert(mtctx->roundBuff.capacity >= target);
 
@@ -1743,7 +1785,7 @@
         buffer.start = start;
         buffer.capacity = prefixSize;
         if (ZSTDMT_isOverlapped(buffer, inUse)) {
-            DEBUGLOG(6, "Waiting for buffer...");
+            DEBUGLOG(5, "Waiting for buffer...");
             return 0;
         }
         ZSTDMT_waitForLdmComplete(mtctx, buffer);
@@ -1755,7 +1797,7 @@
     buffer.capacity = target;
 
     if (ZSTDMT_isOverlapped(buffer, inUse)) {
-        DEBUGLOG(6, "Waiting for buffer...");
+        DEBUGLOG(5, "Waiting for buffer...");
         return 0;
     }
     assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
@@ -1828,8 +1870,10 @@
                 /* It is only possible for this operation to fail if there are
                  * still compression jobs ongoing.
                  */
+                DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed");
                 assert(mtctx->doneJobID != mtctx->nextJobID);
-            }
+            } else
+                DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start);
         }
         if (mtctx->inBuff.buffer.start != NULL) {
             size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
@@ -1857,6 +1901,7 @@
     /* check for potential compressed data ready to be flushed */
     {   size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */
         if (input->pos < input->size) return MAX(remainingToFlush, 1);  /* input not consumed : do not end flush yet */
+        DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush);
         return remainingToFlush;
     }
 }
diff --git a/lib/compress/zstdmt_compress.h b/lib/compress/zstdmt_compress.h
index 34a475a..12ad9f8 100644
--- a/lib/compress/zstdmt_compress.h
+++ b/lib/compress/zstdmt_compress.h
@@ -119,11 +119,21 @@
  * ===  Not exposed in libzstd. Never invoke directly   ===
  * ======================================================== */
 
+ /*! ZSTDMT_toFlushNow()
+  *  Tell how many bytes are ready to be flushed immediately.
+  *  Probe the oldest active job (not yet entirely flushed) and check its output buffer.
+  *  If it returns 0, either there is no active job,
+  *  or the oldest job is still active but everything it has produced so far has already been flushed,
+  *  in which case flushing is limited by the speed of the oldest job. */
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx);
+
+/*! ZSTDMT_CCtxParam_setMTCtxParameter()
+ *  like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_params */
 size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);
 
-/* ZSTDMT_CCtxParam_setNbWorkers()
- * Set nbWorkers, and clamp it.
- * Also reset jobSize and overlapLog */
+/*! ZSTDMT_CCtxParam_setNbWorkers()
+ *  Set nbWorkers, and clamp it.
+ *  Also reset jobSize and overlapLog */
 size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
 
 /*! ZSTDMT_updateCParams_whileCompressing() :
@@ -131,9 +141,9 @@
  *  New parameters will be applied to next compression job. */
 void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
 
-/* ZSTDMT_getFrameProgression():
- * tells how much data has been consumed (input) and produced (output) for current frame.
- * able to count progression inside worker threads.
+/*! ZSTDMT_getFrameProgression():
+ *  tells how much data has been consumed (input) and produced (output) for current frame.
+ *  able to count progression inside worker threads.
  */
 ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
 
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c
index a696261..83ecaff 100644
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@@ -533,9 +533,9 @@
     }
 }
 
-size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src,
-                             size_t srcSize, void* workSpace,
-                             size_t wkspSize)
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+                       const void* src, size_t srcSize,
+                             void* workSpace, size_t wkspSize)
 {
     U32 tableLog, maxW, sizeOfSort, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 8f4589d..711b5b6 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -40,7 +40,6 @@
 #  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1)
 #endif
 
-
 /*!
  *  NO_FORWARD_PROGRESS_MAX :
  *  maximum allowed nb of calls to ZSTD_decompressStream() and ZSTD_decompress_generic()
@@ -52,11 +51,13 @@
 #  define ZSTD_NO_FORWARD_PROGRESS_MAX 16
 #endif
 
+
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "cpu.h"
+#include "compiler.h"    /* prefetch */
+#include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
@@ -68,6 +69,9 @@
 #  include "zstd_legacy.h"
 #endif
 
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+
 
 /*-*************************************
 *  Errors
@@ -110,11 +114,10 @@
 #define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
 
 typedef struct {
-    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];
-    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];
-    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
     HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
-    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
     U32 rep[ZSTD_REP_NUM];
 } ZSTD_entropyDTables_t;
 
@@ -125,6 +128,7 @@
     const ZSTD_seqSymbol* OFTptr;
     const HUF_DTable* HUFptr;
     ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
     const void* previousDstEnd;   /* detect continuity */
     const void* prefixStart;      /* start of current segment */
     const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
@@ -138,7 +142,6 @@
     U32 fseEntropy;
     XXH64_state_t xxhState;
     size_t headerSize;
-    U32 dictID;
     ZSTD_format_e format;
     const BYTE* litPtr;
     ZSTD_customMem customMem;
@@ -147,9 +150,13 @@
     size_t staticSize;
     int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
 
-    /* streaming */
+    /* dictionary */
     ZSTD_DDict* ddictLocal;
-    const ZSTD_DDict* ddict;
+    const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+
+    /* streaming */
     ZSTD_dStreamStage streamStage;
     char*  inBuff;
     size_t inBuffSize;
@@ -185,7 +192,7 @@
 static size_t ZSTD_startingInputLength(ZSTD_format_e format)
 {
     size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ?
-                    ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize :
+                    ZSTD_frameHeaderSize_prefix - ZSTD_FRAMEIDSIZE :
                     ZSTD_frameHeaderSize_prefix;
     ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE);
     /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
@@ -200,6 +207,8 @@
     dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
     dctx->ddict       = NULL;
     dctx->ddictLocal  = NULL;
+    dctx->dictEnd     = NULL;
+    dctx->ddictIsCold = 0;
     dctx->inBuff      = NULL;
     dctx->inBuffSize  = 0;
     dctx->outBuffSize = 0;
@@ -278,7 +287,7 @@
  *  Note 3 : Skippable Frame Identifiers are considered valid. */
 unsigned ZSTD_isFrame(const void* buffer, size_t size)
 {
-    if (size < ZSTD_frameIdSize) return 0;
+    if (size < ZSTD_FRAMEIDSIZE) return 0;
     {   U32 const magic = MEM_readLE32(buffer);
         if (magic == ZSTD_MAGICNUMBER) return 1;
         if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
@@ -330,7 +339,9 @@
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);
 
+    memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers do not understand that zfhPtr is going to be read only if return value is zero, since they are 2 different signals */
     if (srcSize < minInputSize) return minInputSize;
+    if (src==NULL) return ERROR(GENERIC);   /* invalid parameter */
 
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
@@ -339,7 +350,7 @@
             if (srcSize < ZSTD_skippableHeaderSize)
                 return ZSTD_skippableHeaderSize; /* magic number + frame length */
             memset(zfhPtr, 0, sizeof(*zfhPtr));
-            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize);
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);
             zfhPtr->frameType = ZSTD_skippableFrame;
             return 0;
         }
@@ -451,7 +462,7 @@
             size_t skippableSize;
             if (srcSize < ZSTD_skippableHeaderSize)
                 return ERROR(srcSize_wrong);
-            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize)
+            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_FRAMEIDSIZE)
                           + ZSTD_skippableHeaderSize;
             if (srcSize < skippableSize) {
                 return ZSTD_CONTENTSIZE_ERROR;
@@ -540,6 +551,7 @@
 static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
                           const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
@@ -556,6 +568,9 @@
     return regenSize;
 }
 
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize);
 /*! ZSTD_decodeLiteralsBlock() :
  * @return : nb of bytes read from src (< srcSize )
  *  note : symbol not declared but exposed for fullbench */
@@ -572,6 +587,7 @@
         case set_repeat:
             if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
             /* fall-through */
+
         case set_compressed:
             if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
             {   size_t lhSize, litSize, litCSize;
@@ -603,15 +619,20 @@
                 if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
                 if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
 
+                /* prefetch huffman table if cold */
+                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+                }
+
                 if (HUF_isError((litEncType==set_repeat) ?
                                     ( singleStream ?
                                         HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
                                         HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) :
                                     ( singleStream ?
                                         HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                         dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) :
+                                                                         dctx->workspace, sizeof(dctx->workspace), dctx->bmi2) :
                                         HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                           dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2))))
+                                                                           dctx->workspace, sizeof(dctx->workspace), dctx->bmi2))))
                     return ERROR(corruption_detected);
 
                 dctx->litPtr = dctx->litBuffer;
@@ -883,7 +904,8 @@
                                  symbolEncodingType_e type, U32 max, U32 maxLog,
                                  const void* src, size_t srcSize,
                                  const U32* baseValue, const U32* nbAdditionalBits,
-                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable)
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+                                 int ddictIsCold, int nbSeq)
 {
     switch(type)
     {
@@ -902,6 +924,12 @@
         return 0;
     case set_repeat:
         if (!flagRepeatTable) return ERROR(corruption_detected);
+        /* prefetch FSE table if used */
+        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+            const void* const pStart = *DTablePtr;
+            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+            PREFETCH_AREA(pStart, pSize);
+        }
         return 0;
     case set_compressed :
         {   U32 tableLog;
@@ -947,6 +975,9 @@
                     67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
                     0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
 
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize);
 
 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                              const void* src, size_t srcSize)
@@ -954,25 +985,25 @@
     const BYTE* const istart = (const BYTE* const)src;
     const BYTE* const iend = istart + srcSize;
     const BYTE* ip = istart;
+    int nbSeq;
     DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
     /* check */
     if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
 
     /* SeqHead */
-    {   int nbSeq = *ip++;
-        if (!nbSeq) { *nbSeqPtr=0; return 1; }
-        if (nbSeq > 0x7F) {
-            if (nbSeq == 0xFF) {
-                if (ip+2 > iend) return ERROR(srcSize_wrong);
-                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
-            } else {
-                if (ip >= iend) return ERROR(srcSize_wrong);
-                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
-            }
+    nbSeq = *ip++;
+    if (!nbSeq) { *nbSeqPtr=0; return 1; }
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+        } else {
+            if (ip >= iend) return ERROR(srcSize_wrong);
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
         }
-        *nbSeqPtr = nbSeq;
     }
+    *nbSeqPtr = nbSeq;
 
     /* FSE table descriptors */
     if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
@@ -986,7 +1017,8 @@
                                                       LLtype, MaxLL, LLFSELog,
                                                       ip, iend-ip,
                                                       LL_base, LL_bits,
-                                                      LL_defaultDTable, dctx->fseEntropy);
+                                                      LL_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
             ip += llhSize;
         }
@@ -995,7 +1027,8 @@
                                                       OFtype, MaxOff, OffFSELog,
                                                       ip, iend-ip,
                                                       OF_base, OF_bits,
-                                                      OF_defaultDTable, dctx->fseEntropy);
+                                                      OF_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
             ip += ofhSize;
         }
@@ -1004,12 +1037,23 @@
                                                       MLtype, MaxML, MLFSELog,
                                                       ip, iend-ip,
                                                       ML_base, ML_bits,
-                                                      ML_defaultDTable, dctx->fseEntropy);
+                                                      ML_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
             ip += mlhSize;
         }
     }
 
+    /* prefetch dictionary content */
+    if (dctx->ddictIsCold) {
+        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
+        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
+        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
+        const void* const pStart = (const char*)dctx->dictEnd - pSize;
+        PREFETCH_AREA(pStart, pSize);
+        dctx->ddictIsCold = 0;
+    }
+
     return ip-istart;
 }
 
@@ -1676,7 +1720,8 @@
     /* isLongOffset must be true if there are long offsets.
      * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
      * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit)
+     * In block mode, window size is not known, so we have to be conservative.
+     * (note: but it could be evaluated from current-lowLimit)
      */
     ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
@@ -1743,10 +1788,10 @@
 }
 
 
-static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE value, size_t length)
 {
     if (length > dstCapacity) return ERROR(dstSize_tooSmall);
-    memset(dst, byte, length);
+    memset(dst, value, length);
     return length;
 }
 
@@ -1763,7 +1808,7 @@
 #endif
     if ( (srcSize >= ZSTD_skippableHeaderSize)
       && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) {
-        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize);
+        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE);
     } else {
         const BYTE* ip = (const BYTE*)src;
         const BYTE* const ipstart = ip;
@@ -1797,7 +1842,6 @@
         if (zfh.checksumFlag) {   /* Final frame content checksum */
             if (remainingSize < 4) return ERROR(srcSize_wrong);
             ip += 4;
-            remainingSize -= 4;
         }
 
         return ip - ipstart;
@@ -1885,9 +1929,6 @@
     return op-ostart;
 }
 
-static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
-static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
-
 static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                         void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize,
@@ -1896,6 +1937,8 @@
 {
     void* const dststart = dst;
     int moreThan1Frame = 0;
+
+    DEBUGLOG(5, "ZSTD_decompressMultiFrame");
     assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */
 
     if (ddict) {
@@ -1932,7 +1975,7 @@
                 size_t skippableSize;
                 if (srcSize < ZSTD_skippableHeaderSize)
                     return ERROR(srcSize_wrong);
-                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize)
+                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE)
                               + ZSTD_skippableHeaderSize;
                 if (srcSize < skippableSize) return ERROR(srcSize_wrong);
 
@@ -2057,7 +2100,7 @@
     case ZSTDds_getFrameHeaderSize :
         assert(src != NULL);
         if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
-            assert(srcSize >= ZSTD_frameIdSize);  /* to read skippable magic number */
+            assert(srcSize >= ZSTD_FRAMEIDSIZE);  /* to read skippable magic number */
             if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
                 memcpy(dctx->headerBuffer, src, srcSize);
                 dctx->expected = ZSTD_skippableHeaderSize - srcSize;  /* remaining to load to get full skippable frame header */
@@ -2167,7 +2210,7 @@
         assert(src != NULL);
         assert(srcSize <= ZSTD_skippableHeaderSize);
         memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize);   /* complete skippable header */
-        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
         dctx->stage = ZSTDds_skipFrame;
         return 0;
 
@@ -2191,21 +2234,27 @@
     return 0;
 }
 
-/* ZSTD_loadEntropy() :
- * dict : must point at beginning of a valid zstd dictionary
+/*! ZSTD_loadEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
  * @return : size of entropy tables read */
-static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize)
+static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy,
+                         const void* const dict, size_t const dictSize)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
     const BYTE* const dictEnd = dictPtr + dictSize;
 
     if (dictSize <= 8) return ERROR(dictionary_corrupted);
+    assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);   /* dict must be valid */
     dictPtr += 8;   /* skip header = magic + dictID */
 
-
-    {   size_t const hSize = HUF_readDTableX2_wksp(
-            entropy->hufTable, dictPtr, dictEnd - dictPtr,
-            entropy->workspace, sizeof(entropy->workspace));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+    ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+    {   void* const workspace = &entropy->LLTable;   /* use fse tables as temporary workspace; implies fse tables are grouped together */
+        size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+        size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+                                                dictPtr, dictEnd - dictPtr,
+                                                workspace, workspaceSize);
         if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
         dictPtr += hSize;
     }
@@ -2216,7 +2265,7 @@
         if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
         if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted);
         if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->OFTable,
+        ZSTD_buildFSETable( entropy->OFTable,
                             offcodeNCount, offcodeMaxValue,
                             OF_base, OF_bits,
                             offcodeLog);
@@ -2229,7 +2278,7 @@
         if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted);
         if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->MLTable,
+        ZSTD_buildFSETable( entropy->MLTable,
                             matchlengthNCount, matchlengthMaxValue,
                             ML_base, ML_bits,
                             matchlengthLog);
@@ -2242,7 +2291,7 @@
         if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted);
         if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->LLTable,
+        ZSTD_buildFSETable( entropy->LLTable,
                             litlengthNCount, litlengthMaxValue,
                             LL_base, LL_bits,
                             litlengthLog);
@@ -2268,7 +2317,7 @@
         if (magic != ZSTD_MAGIC_DICTIONARY) {
             return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
     }   }
-    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 
     /* load entropy tables */
     {   size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
@@ -2282,7 +2331,6 @@
     return ZSTD_refDictContent(dctx, dict, dictSize);
 }
 
-/* Note : this function cannot fail */
 size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
 {
     assert(dctx != NULL);
@@ -2328,42 +2376,53 @@
 
 static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);
     return ddict->dictContent;
 }
 
 static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);
     return ddict->dictSize;
 }
 
-size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
 {
-    CHECK_F( ZSTD_decompressBegin(dstDCtx) );
-    if (ddict) {   /* support begin on NULL */
-        dstDCtx->dictID = ddict->dictID;
-        dstDCtx->prefixStart = ddict->dictContent;
-        dstDCtx->virtualStart = ddict->dictContent;
-        dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
-        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+    DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+    assert(dctx != NULL);
+    if (ddict) {
+        dctx->ddictIsCold = (dctx->dictEnd != (const char*)ddict->dictContent + ddict->dictSize);
+        DEBUGLOG(4, "DDict is %s",
+                    dctx->ddictIsCold ? "~cold~" : "hot!");
+    }
+    CHECK_F( ZSTD_decompressBegin(dctx) );
+    if (ddict) {   /* NULL ddict is equivalent to no dictionary */
+        dctx->dictID = ddict->dictID;
+        dctx->prefixStart = ddict->dictContent;
+        dctx->virtualStart = ddict->dictContent;
+        dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+        dctx->previousDstEnd = dctx->dictEnd;
         if (ddict->entropyPresent) {
-            dstDCtx->litEntropy = 1;
-            dstDCtx->fseEntropy = 1;
-            dstDCtx->LLTptr = ddict->entropy.LLTable;
-            dstDCtx->MLTptr = ddict->entropy.MLTable;
-            dstDCtx->OFTptr = ddict->entropy.OFTable;
-            dstDCtx->HUFptr = ddict->entropy.hufTable;
-            dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
-            dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
-            dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+            dctx->litEntropy = 1;
+            dctx->fseEntropy = 1;
+            dctx->LLTptr = ddict->entropy.LLTable;
+            dctx->MLTptr = ddict->entropy.MLTable;
+            dctx->OFTptr = ddict->entropy.OFTable;
+            dctx->HUFptr = ddict->entropy.hufTable;
+            dctx->entropy.rep[0] = ddict->entropy.rep[0];
+            dctx->entropy.rep[1] = ddict->entropy.rep[1];
+            dctx->entropy.rep[2] = ddict->entropy.rep[2];
         } else {
-            dstDCtx->litEntropy = 0;
-            dstDCtx->fseEntropy = 0;
+            dctx->litEntropy = 0;
+            dctx->fseEntropy = 0;
         }
     }
     return 0;
 }
 
-static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType)
+static size_t
+ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict,
+                         ZSTD_dictContentType_e dictContentType)
 {
     ddict->dictID = 0;
     ddict->entropyPresent = 0;
@@ -2381,10 +2440,12 @@
             return 0;   /* pure content mode */
         }
     }
-    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize);
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
 
     /* load entropy tables */
-    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy,
+                              ddict->dictContent, ddict->dictSize),
+             dictionary_corrupted );
     ddict->entropyPresent = 1;
     return 0;
 }
@@ -2398,6 +2459,7 @@
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
         ddict->dictBuffer = NULL;
         ddict->dictContent = dict;
+        if (!dict) dictSize = 0;
     } else {
         void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
         ddict->dictBuffer = internalBuffer;
@@ -2422,14 +2484,15 @@
     if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
 
     {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
-        if (!ddict) return NULL;
+        if (ddict == NULL) return NULL;
         ddict->cMem = customMem;
-
-        if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) {
-            ZSTD_freeDDict(ddict);
-            return NULL;
-        }
-
+        {   size_t const initResult = ZSTD_initDDict_internal(ddict,
+                                            dict, dictSize,
+                                            dictLoadMethod, dictContentType);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeDDict(ddict);
+                return NULL;
+        }   }
         return ddict;
     }
 }
@@ -2456,23 +2519,25 @@
 
 
 const ZSTD_DDict* ZSTD_initStaticDDict(
-                                void* workspace, size_t workspaceSize,
+                                void* sBuffer, size_t sBufferSize,
                                 const void* dict, size_t dictSize,
                                 ZSTD_dictLoadMethod_e dictLoadMethod,
                                 ZSTD_dictContentType_e dictContentType)
 {
-    size_t const neededSpace =
-            sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
-    ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace;
-    assert(workspace != NULL);
+    size_t const neededSpace = sizeof(ZSTD_DDict)
+                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+    assert(sBuffer != NULL);
     assert(dict != NULL);
-    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
-    if (workspaceSize < neededSpace) return NULL;
+    if ((size_t)sBuffer & 7) return NULL;   /* 8-aligned */
+    if (sBufferSize < neededSpace) return NULL;
     if (dictLoadMethod == ZSTD_dlm_byCopy) {
         memcpy(ddict+1, dict, dictSize);  /* local copy */
         dict = ddict+1;
     }
-    if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) ))
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType) ))
         return NULL;
     return ddict;
 }
@@ -2510,7 +2575,7 @@
 {
     if (dictSize < 8) return 0;
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
-    return MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 }
 
 /*! ZSTD_getDictID_fromDDict() :
@@ -2586,12 +2651,15 @@
 }
 
 
-/* *** Initialization *** */
+/* ***  Initialization  *** */
 
 size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
 size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
 
-size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+                                   const void* dict, size_t dictSize,
+                                         ZSTD_dictLoadMethod_e dictLoadMethod,
+                                         ZSTD_dictContentType_e dictContentType)
 {
     if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
     ZSTD_freeDDict(dctx->ddictLocal);
@@ -2645,13 +2713,6 @@
     return ZSTD_initDStream_usingDict(zds, NULL, 0);
 }
 
-size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
-{
-    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
-    dctx->ddict = ddict;
-    return 0;
-}
-
 /* ZSTD_initDStream_usingDDict() :
  * ddict will just be referenced, and must outlive decompression session
  * this function cannot fail */
@@ -2690,6 +2751,13 @@
     return 0;
 }
 
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    dctx->ddict = ddict;
+    return 0;
+}
+
 size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
 {
     if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
@@ -2855,7 +2923,7 @@
             CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));
 
             if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
-                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize);
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
                 zds->stage = ZSTDds_skipFrame;
             } else {
                 CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize));
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index 448f713..6b4af69 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -29,6 +29,7 @@
 #include "mem.h" /* read */
 #include "pool.h"
 #include "threading.h"
+#include "cover.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
@@ -39,6 +40,7 @@
 *  Constants
 ***************************************/
 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define DEFAULT_SPLITPOINT 1.0
 
 /*-*************************************
 *  Console display
@@ -184,7 +186,7 @@
 }
 
 /**
- * Destroyes a map that is inited with COVER_map_init().
+ * Destroys a map that was initialized with COVER_map_init().
  */
 static void COVER_map_destroy(COVER_map_t *map) {
   if (map->data) {
@@ -203,6 +205,8 @@
   size_t *offsets;
   const size_t *samplesSizes;
   size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
   U32 *suffix;
   size_t suffixSize;
   U32 *freqs;
@@ -220,9 +224,9 @@
 /**
  * Returns the sum of the sample sizes.
  */
-static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
   size_t sum = 0;
-  size_t i;
+  unsigned i;
   for (i = 0; i < nbSamples; ++i) {
     sum += samplesSizes[i];
   }
@@ -377,14 +381,6 @@
   ctx->suffix[dmerId] = freq;
 }
 
-/**
- * A segment is a range in the source as well as the score of the segment.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-  U32 score;
-} COVER_segment_t;
 
 /**
  * Selects the best segment in an epoch.
@@ -494,6 +490,10 @@
   if (parameters.d > parameters.k) {
     return 0;
   }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
+    return 0;
+  }
   return 1;
 }
 
@@ -531,9 +531,14 @@
  */
 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                           const size_t *samplesSizes, unsigned nbSamples,
-                          unsigned d) {
+                          unsigned d, double splitPoint) {
   const BYTE *const samples = (const BYTE *)samplesBuffer;
   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
   /* Checks */
   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
@@ -541,15 +546,29 @@
                  (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
     return 0;
   }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there is at least one testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
+    return 0;
+  }
   /* Zero the context */
   memset(ctx, 0, sizeof(*ctx));
-  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
-               (U32)totalSamplesSize);
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (U32)testSamplesSize);
   ctx->samples = samples;
   ctx->samplesSizes = samplesSizes;
   ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
   /* Partial suffix array */
-  ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
   /* Maps index to the dmerID */
   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -563,7 +582,7 @@
   ctx->freqs = NULL;
   ctx->d = d;
 
-  /* Fill offsets from the samlesSizes */
+  /* Fill offsets from the samplesSizes */
   {
     U32 i;
     ctx->offsets[0] = 0;
@@ -665,7 +684,7 @@
   BYTE* const dict = (BYTE*)dictBuffer;
   COVER_ctx_t ctx;
   COVER_map_t activeDmers;
-
+  parameters.splitPoint = 1.0;
   /* Initialize global data */
   g_displayLevel = parameters.zParams.notificationLevel;
   /* Checks */
@@ -684,7 +703,7 @@
   }
   /* Initialize context and activeDmers */
   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
-                      parameters.d)) {
+                      parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
@@ -711,28 +730,65 @@
   }
 }
 
-/**
- * COVER_best_t is used for two purposes:
- * 1. Synchronizing threads.
- * 2. Saving the best parameters and dictionary.
- *
- * All of the methods except COVER_best_init() are thread safe if zstd is
- * compiled with multithreaded support.
- */
-typedef struct COVER_best_s {
-  ZSTD_pthread_mutex_t mutex;
-  ZSTD_pthread_cond_t cond;
-  size_t liveJobs;
-  void *dict;
-  size_t dictSize;
-  ZDICT_cover_params_t parameters;
-  size_t compressedSize;
-} COVER_best_t;
+
+
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                    const size_t *samplesSizes, const BYTE *samples,
+                                    size_t *offsets,
+                                    size_t nbTrainSamples, size_t nbSamples,
+                                    BYTE *const dict, size_t dictBufferCapacity) {
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Pointers */
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Local variables */
+  size_t dstCapacity;
+  size_t i;
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+    for (; i < nbSamples; ++i) {
+      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (!dst || !cctx || !cdict) {
+    goto _compressCleanup;
+  }
+  /* Compress each sample and sum their sizes (or error) */
+  totalCompressedSize = dictBufferCapacity;
+  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  for (; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = ERROR(GENERIC);
+      goto _compressCleanup;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  if (dst) {
+    free(dst);
+  }
+  return totalCompressedSize;
+}
+
 
 /**
  * Initialize the `COVER_best_t`.
  */
-static void COVER_best_init(COVER_best_t *best) {
+void COVER_best_init(COVER_best_t *best) {
   if (best==NULL) return; /* compatible with init on NULL */
   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
@@ -746,7 +802,7 @@
 /**
  * Wait until liveJobs == 0.
  */
-static void COVER_best_wait(COVER_best_t *best) {
+void COVER_best_wait(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -760,7 +816,7 @@
 /**
  * Call COVER_best_wait() and then destroy the COVER_best_t.
  */
-static void COVER_best_destroy(COVER_best_t *best) {
+void COVER_best_destroy(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -776,7 +832,7 @@
  * Called when a thread is about to be launched.
  * Increments liveJobs.
  */
-static void COVER_best_start(COVER_best_t *best) {
+void COVER_best_start(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -790,7 +846,7 @@
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
  * If this dictionary is the best so far save it and its parameters.
  */
-static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                               ZDICT_cover_params_t parameters, void *dict,
                               size_t dictSize) {
   if (!best) {
@@ -821,10 +877,10 @@
       best->parameters = parameters;
       best->compressedSize = compressedSize;
     }
-    ZSTD_pthread_mutex_unlock(&best->mutex);
     if (liveJobs == 0) {
       ZSTD_pthread_cond_broadcast(&best->cond);
     }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
   }
 }
 
@@ -839,7 +895,7 @@
 } COVER_tryParameters_data_t;
 
 /**
- * Tries a set of parameters and upates the COVER_best_t with the results.
+ * Tries a set of parameters and updates the COVER_best_t with the results.
  * This function is thread safe if zstd is compiled with multithreaded support.
  * It takes its parameters as an *OWNING* opaque pointer to support threading.
  */
@@ -870,7 +926,7 @@
                                               dictBufferCapacity, parameters);
     dictBufferCapacity = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
         parameters.zParams);
     if (ZDICT_isError(dictBufferCapacity)) {
       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
@@ -878,49 +934,10 @@
     }
   }
   /* Check total compressed size */
-  {
-    /* Pointers */
-    ZSTD_CCtx *cctx;
-    ZSTD_CDict *cdict;
-    void *dst;
-    /* Local variables */
-    size_t dstCapacity;
-    size_t i;
-    /* Allocate dst with enough space to compress the maximum sized sample */
-    {
-      size_t maxSampleSize = 0;
-      for (i = 0; i < ctx->nbSamples; ++i) {
-        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
-      }
-      dstCapacity = ZSTD_compressBound(maxSampleSize);
-      dst = malloc(dstCapacity);
-    }
-    /* Create the cctx and cdict */
-    cctx = ZSTD_createCCtx();
-    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
-                             parameters.zParams.compressionLevel);
-    if (!dst || !cctx || !cdict) {
-      goto _compressCleanup;
-    }
-    /* Compress each sample and sum their sizes (or error) */
-    totalCompressedSize = dictBufferCapacity;
-    for (i = 0; i < ctx->nbSamples; ++i) {
-      const size_t size = ZSTD_compress_usingCDict(
-          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
-          ctx->samplesSizes[i], cdict);
-      if (ZSTD_isError(size)) {
-        totalCompressedSize = ERROR(GENERIC);
-        goto _compressCleanup;
-      }
-      totalCompressedSize += size;
-    }
-  _compressCleanup:
-    ZSTD_freeCCtx(cctx);
-    ZSTD_freeCDict(cdict);
-    if (dst) {
-      free(dst);
-    }
-  }
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
 
 _cleanup:
   COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
@@ -941,6 +958,8 @@
     ZDICT_cover_params_t *parameters) {
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
+  const double splitPoint =
+      parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
@@ -958,6 +977,10 @@
   POOL_ctx *pool = NULL;
 
   /* Checks */
+  if (splitPoint <= 0 || splitPoint > 1) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
   if (kMinK < kMaxD || kMaxK < kMinK) {
     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
     return ERROR(GENERIC);
@@ -988,7 +1011,7 @@
     /* Initialize the context for this value of d */
     COVER_ctx_t ctx;
     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
-    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
       LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
       COVER_best_destroy(&best);
       POOL_free(pool);
@@ -1013,6 +1036,7 @@
       data->parameters = *parameters;
       data->parameters.k = k;
       data->parameters.d = d;
+      data->parameters.splitPoint = splitPoint;
       data->parameters.steps = kSteps;
       data->parameters.zParams.notificationLevel = g_displayLevel;
       /* Check the parameters */
diff --git a/lib/dictBuilder/cover.h b/lib/dictBuilder/cover.h
new file mode 100644
index 0000000..82e2e1c
--- /dev/null
+++ b/lib/dictBuilder/cover.h
@@ -0,0 +1,83 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ *  Returns the dictionary size plus the compressed size of the test samples using it, or an error code.
+ */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity);
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+void COVER_best_init(COVER_best_t *best);
+
+/**
+ * Wait until liveJobs == 0.
+ */
+void COVER_best_wait(COVER_best_t *best);
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+void COVER_best_destroy(COVER_best_t *best);
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+void COVER_best_start(COVER_best_t *best);
+
+/**
+ * Called when a thread finishes executing, whether on error or on success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                       ZDICT_cover_params_t parameters, void *dict,
+                       size_t dictSize);
diff --git a/lib/dictBuilder/divsufsort.c b/lib/dictBuilder/divsufsort.c
index 60cceb0..ead9220 100644
--- a/lib/dictBuilder/divsufsort.c
+++ b/lib/dictBuilder/divsufsort.c
@@ -1637,7 +1637,7 @@
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else {
           assert(((s == 0) && (T[s] == c1)) || (s < 0));
@@ -1701,7 +1701,7 @@
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else if(s != 0) {
           *j = ~s;
@@ -1785,7 +1785,7 @@
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else if(s != 0) {
           *j = ~s;
diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c
new file mode 100644
index 0000000..dfee457
--- /dev/null
+++ b/lib/dictBuilder/fastcover.c
@@ -0,0 +1,728 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "cover.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define FASTCOVER_MAX_F 31
+#define FASTCOVER_MAX_ACCEL 10
+#define DEFAULT_SPLITPOINT 0.75
+#define DEFAULT_F 20
+#define DEFAULT_ACCEL 1
+
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+
+/**
+ * Hash the d-byte value at p (d == 6, otherwise 8 bytes) to an index < 2^h.
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
+  if (d == 6) {  /* mask is built in size_t: `1 << 31` would overflow a signed int */
+    return ZSTD_hash6Ptr(p, h) & (((size_t)1 << h) - 1);
+  }
+  return ZSTD_hash8Ptr(p, h) & (((size_t)1 << h) - 1);
+}
+
+
+/*-*************************************
+* Acceleration
+***************************************/
+typedef struct {
+  unsigned finalize;    /* Percentage of training samples used for ZDICT_finalizeDictionary */
+  unsigned skip;        /* Number of dmer skipped between each dmer counted in computeFrequency */
+} FASTCOVER_accel_t;
+
+
+static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+  { 100, 0 },   /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+  { 100, 0 },   /* accel = 1 */
+  { 50, 1 },   /* accel = 2 */
+  { 34, 2 },   /* accel = 3 */
+  { 25, 3 },   /* accel = 4 */
+  { 20, 4 },   /* accel = 5 */
+  { 17, 5 },   /* accel = 6 */
+  { 14, 6 },   /* accel = 7 */
+  { 13, 7 },   /* accel = 8 */
+  { 11, 8 },   /* accel = 9 */
+  { 10, 9 },   /* accel = 10 */
+};
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;            /* caller's sample buffer (not freed by ctx_destroy) */
+  size_t *offsets;                /* nbSamples+1 cumulative sample offsets; freed by ctx_destroy */
+  const size_t *samplesSizes;     /* caller's per-sample sizes (not freed by ctx_destroy) */
+  size_t nbSamples;               /* total number of samples */
+  size_t nbTrainSamples;          /* samples whose dmers are counted in freqs */
+  size_t nbTestSamples;           /* size of the test set (equals nbSamples when splitPoint is 1) */
+  size_t nbDmers;                 /* trainingSamplesSize - max(d, 8) + 1 */
+  U32 *freqs;                     /* 2^f counters indexed by dmer hash; freed by ctx_destroy */
+  unsigned d;                     /* dmer length in bytes */
+  unsigned f;                     /* log2 of the number of entries in freqs */
+  FASTCOVER_accel_t accelParams;  /* finalize percentage and skip stride */
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = sum of F(h) over the distinct values h in {S_1, ..., S_{k-d+1}}
+ *
+ * Once a dmer with hash value d is in the dictionary we set F(d) = 0.
+ */
+static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                              U32 *freqs, U32 begin, U32 end,
+                                              ZDICT_cover_params_t parameters,
+                                              U16* segmentFreqs) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 f = ctx->f;
+  const U32 dmersInK = k - d + 1;
+
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+
+  /* segmentFreqs counts the dmer hashes inside the active window; callers pass it zeroed */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer */
+    const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+    /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+    if (segmentFreqs[index] == 0) {
+      activeSegment.score += freqs[index];
+    }
+    /* Increment end of segment and segmentFreqs */
+    activeSegment.end += 1;
+    segmentFreqs[index] += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+      segmentFreqs[delIndex] -= 1;
+      /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+      if (segmentFreqs[delIndex] == 0) {
+        activeSegment.score -= freqs[delIndex];
+      }
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+
+  /* Undo the counts of the dmers still in the window so segmentFreqs is all zero again */
+  while (activeSegment.begin < end) {
+    const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+    segmentFreqs[delIndex] -= 1;
+    activeSegment.begin += 1;
+  }
+
+  {
+    /* Zero the frequency of the hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+      freqs[i] = 0;
+    }
+  }
+
+  return bestSegment;
+}
+
+
+/** Returns 1 if (parameters, maxDictSize, f, accel) are usable by FASTCOVER, 0 otherwise. */
+static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+                                     size_t maxDictSize, unsigned f, unsigned accel) {
+  /* k and d are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F */
+  if (f > FASTCOVER_MAX_F || f == 0) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  /* 0 < accel <= FASTCOVER_MAX_ACCEL (last valid index of the acceleration table) */
+  if (accel > FASTCOVER_MAX_ACCEL || accel == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Free the buffers (freqs, offsets) of a context set up by `FASTCOVER_ctx_init()`; NULL is a no-op.
+ */
+static void
+FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
+{
+    if (!ctx) return;
+
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+}
+
+
+/**
+ * Count in freqs the hash index of the dmers of each training sample, visiting every (skip+1)-th position.
+ */
+static void
+FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
+{
+    const unsigned f = ctx->f;
+    const unsigned d = ctx->d;
+    const unsigned skip = ctx->accelParams.skip;
+    const unsigned readLength = MAX(d, 8);
+    size_t i;
+    assert(ctx->nbTrainSamples >= 5);
+    assert(ctx->nbTrainSamples <= ctx->nbSamples);
+    for (i = 0; i < ctx->nbTrainSamples; i++) {
+        size_t start = ctx->offsets[i];  /* start of current dmer */
+        size_t const currSampleEnd = ctx->offsets[i+1];
+        while (start + readLength <= currSampleEnd) {
+            const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
+            freqs[dmerIndex]++;
+            start = start + skip + 1;
+        }
+    }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * The context depends on `d`, `f`, `splitPoint` and `accelParams` but not on `k`,
+ * so it can be reused for several values of `k`.
+ * Returns 1 on success or zero on error.
+ * On success the context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+static int
+FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
+                   const void* samplesBuffer,
+                   const size_t* samplesSizes, unsigned nbSamples,
+                   unsigned d, double splitPoint, unsigned f,
+                   FASTCOVER_accel_t accelParams)
+{
+    const BYTE* const samples = (const BYTE*)samplesBuffer;
+    const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+    /* Split samples into testing and training sets */
+    const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+    const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+    const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+    const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+
+    /* Checks: input smaller than one dmer read, or too large (the message only mentions the latter) */
+    if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+        totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+        DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                    (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+        return 0;
+    }
+
+    /* Check if there are at least 5 training samples */
+    if (nbTrainSamples < 5) {
+        DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+        return 0;
+    }
+
+    /* Check if there is at least one testing sample */
+    if (nbTestSamples < 1) {
+        DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+        return 0;
+    }
+
+    /* Zero the context */
+    memset(ctx, 0, sizeof(*ctx));
+    DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+                    (U32)trainingSamplesSize);
+    DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+                    (U32)testSamplesSize);
+
+    ctx->samples = samples;
+    ctx->samplesSizes = samplesSizes;
+    ctx->nbSamples = nbSamples;
+    ctx->nbTrainSamples = nbTrainSamples;
+    ctx->nbTestSamples = nbTestSamples;
+    ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+    ctx->d = d;
+    ctx->f = f;
+    ctx->accelParams = accelParams;
+
+    /* The offsets of each sample: nbSamples + 1 entries, the last one is the total size */
+    ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
+    if (ctx->offsets == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return 0;
+    }
+
+    /* Fill offsets from the samplesSizes */
+    {   U32 i;
+        ctx->offsets[0] = 0;
+        assert(nbSamples >= 5);
+        for (i = 1; i <= nbSamples; ++i) {
+            ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+        }
+    }
+
+    /* Initialize frequency array of size 2^f */
+    ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
+    if (ctx->freqs == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return 0;
+    }
+
+    DISPLAYLEVEL(2, "Computing frequencies\n");
+    FASTCOVER_computeFrequency(ctx->freqs, ctx);
+
+    return 1;
+}
+
+
+/**
+ * Given the prepared context, fill dictBuffer from the back; returns the offset where the content starts.
+ */
+static size_t
+FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
+                          U32* freqs,
+                          void* dictBuffer, size_t dictBufferCapacity,
+                          ZDICT_cover_params_t parameters,
+                          U16* segmentFreqs)
+{
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
+  const U32 epochSize = (U32)(ctx->nbDmers / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+    const FASTCOVER_ctx_t* ctx;       /* context read by the job (not freed by it) */
+    COVER_best_t* best;               /* receives the result through COVER_best_finish() */
+    size_t dictBufferCapacity;        /* capacity of the dictionary buffer the job allocates */
+    ZDICT_cover_params_t parameters;  /* parameter set to try */
+} FASTCOVER_tryParameters_data_t;
+
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void *opaque)
+{
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Initialize array to keep track of frequency of dmer within activeSegment */
+  U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
+  /* Allocate the candidate dictionary and a private copy of the frequencies */
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+  if (!segmentFreqs || !dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+  /* Build the dictionary */
+  { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+                                                  parameters, segmentFreqs);
+    const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
+_cleanup:
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  free(segmentFreqs);
+  free(dict);
+  free(freqs);
+}
+
+
+/** Copy the fields FASTCOVER shares with COVER into *coverParams; other fields are left untouched. */
+static void FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+                                           ZDICT_cover_params_t* coverParams)
+{
+    coverParams->zParams    = fastCoverParams.zParams;
+    coverParams->splitPoint = fastCoverParams.splitPoint;
+    coverParams->nbThreads  = fastCoverParams.nbThreads;
+    coverParams->steps      = fastCoverParams.steps;
+    coverParams->d          = fastCoverParams.d;
+    coverParams->k          = fastCoverParams.k;
+}
+
+
+/** Fill *fastCoverParams from the COVER parameter set plus the FASTCOVER-only values f and accel. */
+static void FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+                                               ZDICT_fastCover_params_t* fastCoverParams,
+                                               unsigned f, unsigned accel)
+{
+    fastCoverParams->f          = f;
+    fastCoverParams->accel      = accel;
+    fastCoverParams->zParams    = coverParams.zParams;
+    fastCoverParams->splitPoint = coverParams.splitPoint;
+    fastCoverParams->nbThreads  = coverParams.nbThreads;
+    fastCoverParams->steps      = coverParams.steps;
+    fastCoverParams->d          = coverParams.d;
+    fastCoverParams->k          = coverParams.k;
+}
+
+
+/** Builds one dictionary with the given k and d (f and accel default when 0), training on every sample.
+ *  Returns the dictionary size, or an error code (memory_allocation if the scratch table is unavailable). */
+ZDICTLIB_API size_t
+ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* samplesBuffer, const size_t* samplesSizes,
+                                unsigned nbSamples, ZDICT_fastCover_params_t parameters)
+{
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* Initialize global data */
+    g_displayLevel = parameters.zParams.notificationLevel;
+    /* splitPoint is always forced to 1.0 here; f and accel are assigned if not provided */
+    parameters.splitPoint = 1.0;
+    parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+    parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+    /* Convert to cover parameter */
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(parameters, &coverParams);
+    /* Checks */
+    if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f, parameters.accel)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Assign corresponding FASTCOVER_accel_t to accelParams (accel was range-checked above) */
+    accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
+    /* Initialize context */
+    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, coverParams.d, parameters.splitPoint, parameters.f, accelParams)) {
+      DISPLAYLEVEL(1, "Failed to initialize context\n");
+      return ERROR(GENERIC);
+    }
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      /* Array tracking the frequency of each dmer within the activeSegment */
+      U16* const segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+      size_t dictionarySize = ERROR(memory_allocation);
+      if (segmentFreqs == NULL) {  /* FASTCOVER_buildDictionary() would index a NULL table */
+        DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+      } else {
+        const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                  dictBufferCapacity, coverParams, segmentFreqs);
+        const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+        dictionarySize = ZDICT_finalizeDictionary(dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+                                                  samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+        if (!ZSTD_isError(dictionarySize)) {
+          DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)dictionarySize);
+        }
+      }
+      FASTCOVER_ctx_destroy(&ctx);
+      free(segmentFreqs);
+      return dictionarySize;
+    }
+}
+
+
+/**
+ * Tries every (d, k) pair of the search grid, keeps the best dictionary reported through
+ * COVER_best_finish() and writes its parameters to *parameters. Returns its size or an error code.
+ */
+ZDICTLIB_API size_t
+ZDICT_optimizeTrainFromBuffer_fastCover(
+                    void* dictBuffer, size_t dictBufferCapacity,
+                    const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters)
+{
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint = parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations = (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+    const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    /* Local variables */
+    const int displayLevel = parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    COVER_best_t best;
+    POOL_ctx *pool = NULL;
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(GENERIC);
+    }
+    if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+      return ERROR(GENERIC);
+    }
+    if (f == 0 || f > FASTCOVER_MAX_F) {  /* reject before any 2^f-sized table is allocated */
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect f\n");
+      return ERROR(GENERIC);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n", ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    COVER_best_init(&best);
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+    accelParams = FASTCOVER_defaultAccelParameters[accel];
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(sizeof(*data));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          COVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(GENERIC);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = coverParams;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.zParams.notificationLevel = g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, data->ctx->f, accel)) {
+          DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        COVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ", (U32)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      /* Jobs hold a pointer to ctx: wait for all of them before destroying it */
+      COVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    {  /* Fill the output buffer and parameters with output of the best parameters */
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+      memcpy(dictBuffer, best.dict, dictSize);
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+}
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index 2024e0b..2964b69 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -293,7 +293,7 @@
             refinedEnd = refinedStart + selectedCount;
         }
 
-        /* evaluate gain based on new ref */
+        /* evaluate gain based on new dict */
         start = refinedStart;
         pos = suffix[refinedStart];
         end = start;
@@ -341,7 +341,7 @@
         for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
             savings[i] = savings[i-1] + (lengthList[i] * (i-3));
 
-        DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+        DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f)  \n",
                      (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
 
         solution.pos = (U32)pos;
@@ -581,7 +581,7 @@
 
 typedef struct
 {
-    ZSTD_CCtx* ref;    /* contains reference to dictionary */
+    ZSTD_CDict* dict;    /* dictionary */
     ZSTD_CCtx* zc;     /* working context */
     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
 } EStats_ress_t;
@@ -597,8 +597,9 @@
     size_t cSize;
 
     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
-    {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
-        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+    {   size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
+
     }
     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
@@ -697,7 +698,7 @@
     short litLengthNCount[MaxLL+1];
     U32 repOffset[MAXREPOFFSET];
     offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
-    EStats_ress_t esr;
+    EStats_ress_t esr = { NULL, NULL, NULL };
     ZSTD_parameters params;
     U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
     size_t pos = 0, errorCode;
@@ -708,14 +709,6 @@
 
     /* init */
     DEBUGLOG(4, "ZDICT_analyzeEntropy");
-    esr.ref = ZSTD_createCCtx();
-    esr.zc = ZSTD_createCCtx();
-    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
-    if (!esr.ref || !esr.zc || !esr.workPlace) {
-        eSize = ERROR(memory_allocation);
-        DISPLAYLEVEL(1, "Not enough memory \n");
-        goto _cleanup;
-    }
     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
     for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
     for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
@@ -726,12 +719,15 @@
     memset(bestRepOffset, 0, sizeof(bestRepOffset));
     if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
-    {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
-        if (ZSTD_isError(beginResult)) {
-            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
-            eSize = ERROR(GENERIC);
-            goto _cleanup;
-    }   }
+
+    esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.dict || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
 
     /* collect stats on all samples */
     for (u=0; u<nbFiles; u++) {
@@ -856,7 +852,7 @@
     eSize += 12;
 
 _cleanup:
-    ZSTD_freeCCtx(esr.ref);
+    ZSTD_freeCDict(esr.dict);
     ZSTD_freeCCtx(esr.zc);
     free(esr.workPlace);
 
@@ -867,8 +863,8 @@
 
 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
                           const void* customDictContent, size_t dictContentSize,
-                          const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                          ZDICT_params_t params)
+                          const void* samplesBuffer, const size_t* samplesSizes,
+                          unsigned nbSamples, ZDICT_params_t params)
 {
     size_t hSize;
 #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
@@ -914,9 +910,10 @@
 }
 
 
-size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
-                                                 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                                 ZDICT_params_t params)
+static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
+        void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+        ZDICT_params_t params)
 {
     int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
     U32 const notificationLevel = params.notificationLevel;
@@ -947,7 +944,11 @@
     return MIN(dictBufferCapacity, hSize+dictContentSize);
 }
 
-
+/* Hidden declaration for dbio.c */
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
+                            void* dictBuffer, size_t maxDictSize,
+                            const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                            ZDICT_legacy_params_t params);
 /*! ZDICT_trainFromBuffer_unsafe_legacy() :
 *   Warning : `samplesBuffer` must be followed by noisy guard band.
 *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
@@ -991,8 +992,10 @@
             U32 const pos = dictList[u].pos;
             U32 const length = dictList[u].length;
             U32 const printedLength = MIN(40, length);
-            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
+                free(dictList);
                 return ERROR(GENERIC);   /* should never happen */
+            }
             DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
                          u, length, pos, dictList[u].savings);
             ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
@@ -1082,17 +1085,17 @@
 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
-    ZDICT_cover_params_t params;
+    ZDICT_fastCover_params_t params;
     DEBUGLOG(3, "ZDICT_trainFromBuffer");
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
     /* Default to level 6 since no compression level information is available */
-    params.zParams.compressionLevel = 6;
+    params.zParams.compressionLevel = 3;
 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
     params.zParams.notificationLevel = DEBUGLEVEL;
 #endif
-    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+    return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
                                                samplesBuffer, samplesSizes, nbSamples,
                                                &params);
 }
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h
index ad459c2..d57d59f 100644
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -39,7 +39,8 @@
 
 /*! ZDICT_trainFromBuffer():
  *  Train a dictionary from an array of samples.
- *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
+ *  f=20, and accel=1.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
  *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
  *  The resulting dictionary will be saved into `dictBuffer`.
@@ -52,7 +53,8 @@
  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
  */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+                                    const void* samplesBuffer,
+                                    const size_t* samplesSizes, unsigned nbSamples);
 
 
 /*======   Helper functions   ======*/
@@ -84,11 +86,22 @@
 typedef struct {
     unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
     unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
-    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
     unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
     ZDICT_params_t zParams;
 } ZDICT_cover_params_t;
 
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned f;                  /* log of size of frequency array : constraint: 0 < f <= 31 : 0 means default (20) */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
+    unsigned accel;              /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
+    ZDICT_params_t zParams;
+} ZDICT_fastCover_params_t;
 
 /*! ZDICT_trainFromBuffer_cover():
  *  Train a dictionary from an array of samples using the COVER algorithm.
@@ -115,9 +128,9 @@
  * dictionary constructed with those parameters is stored in `dictBuffer`.
  *
  * All of the parameters d, k, steps are optional.
- * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
  * if steps is zero it defaults to its default value.
- * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
  *
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *           or an error code, which can be tested with ZDICT_isError().
@@ -129,6 +142,48 @@
     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
           ZDICT_cover_params_t* parameters);
 
+/*! ZDICT_trainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  d and k are required.
+ *  All other parameters are optional, will use default values if not provided
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
+                    size_t dictBufferCapacity, const void *samplesBuffer,
+                    const size_t *samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_fastCover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations (specifically, k and d combinations)
+ * and picks the best parameters. `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
+ * All of the parameters d, k, steps, f, and accel are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
+ * If f is zero, default value of 20 is used.
+ * If accel is zero, default value of 1 is used.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
+                    size_t dictBufferCapacity, const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters);
+
 /*! ZDICT_finalizeDictionary():
  * Given a custom content as a basis for dictionary, and a set of samples,
  * finalize dictionary by adding headers and statistics.
diff --git a/lib/legacy/zstd_v01.c b/lib/legacy/zstd_v01.c
index ae1cb2c..c007e7c 100644
--- a/lib/legacy/zstd_v01.c
+++ b/lib/legacy/zstd_v01.c
@@ -668,11 +668,17 @@
         switch(srcSize)
         {
             case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+                    /* fallthrough */
             case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+                    /* fallthrough */
             case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+                    /* fallthrough */
             case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+                    /* fallthrough */
             case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+                    /* fallthrough */
             case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+                    /* fallthrough */
             default:;
         }
         contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
@@ -1458,7 +1464,7 @@
 *   Decompression code
 **************************************************************/
 
-size_t ZSTDv01_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+static size_t ZSTDv01_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
 {
     const BYTE* const in = (const BYTE* const)src;
     BYTE headerFlags;
@@ -1511,7 +1517,7 @@
 }
 
 
-size_t ZSTDv01_decodeLiteralsBlock(void* ctx,
+static size_t ZSTDv01_decodeLiteralsBlock(void* ctx,
                                 void* dst, size_t maxDstSize,
                           const BYTE** litStart, size_t* litSize,
                           const void* src, size_t srcSize)
@@ -1563,7 +1569,7 @@
 }
 
 
-size_t ZSTDv01_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+static size_t ZSTDv01_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
                          FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
                          const void* src, size_t srcSize)
 {
diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c
index 8bc0ece..c09ef8c 100644
--- a/lib/legacy/zstd_v02.c
+++ b/lib/legacy/zstd_v02.c
@@ -399,11 +399,17 @@
         switch(srcSize)
         {
             case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+                    /* fallthrough */
             case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+                    /* fallthrough */
             case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+                    /* fallthrough */
             case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+                    /* fallthrough */
             case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+                    /* fallthrough */
             case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+                    /* fallthrough */
             default:;
         }
         contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
diff --git a/lib/legacy/zstd_v03.c b/lib/legacy/zstd_v03.c
index 54445af..0c4cdf6 100644
--- a/lib/legacy/zstd_v03.c
+++ b/lib/legacy/zstd_v03.c
@@ -402,11 +402,17 @@
         switch(srcSize)
         {
             case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+                    /* fallthrough */
             case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+                    /* fallthrough */
             case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+                    /* fallthrough */
             case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+                    /* fallthrough */
             case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+                    /* fallthrough */
             case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+                    /* fallthrough */
             default:;
         }
         contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
diff --git a/lib/legacy/zstd_v04.c b/lib/legacy/zstd_v04.c
index a2e2cfa..e852bb9 100644
--- a/lib/legacy/zstd_v04.c
+++ b/lib/legacy/zstd_v04.c
@@ -1093,6 +1093,7 @@
     if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
 
     /* Init, lay down lowprob symbols */
+    memset(tableDecode, 0, sizeof(FSE_DECODE_TYPE) * (maxSymbolValue+1) );   /* useless init, but keep static analyzer happy, and we don't need to performance optimize legacy decoders */
     DTableH.tableLog = (U16)tableLog;
     for (s=0; s<=maxSymbolValue; s++)
     {
@@ -3621,8 +3622,3 @@
 
 ZSTD_DCtx* ZSTDv04_createDCtx(void) { return ZSTD_createDCtx(); }
 size_t ZSTDv04_freeDCtx(ZSTD_DCtx* dctx) { return ZSTD_freeDCtx(dctx); }
-
-size_t ZSTDv04_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize)
-{
-    return ZSTD_getFrameParams(params, src, srcSize);
-}
diff --git a/lib/legacy/zstd_v05.c b/lib/legacy/zstd_v05.c
index a5e1b1f..a1580a2 100644
--- a/lib/legacy/zstd_v05.c
+++ b/lib/legacy/zstd_v05.c
@@ -1224,6 +1224,7 @@
     if (tableLog > FSEv05_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
 
     /* Init, lay down lowprob symbols */
+    memset(tableDecode, 0, sizeof(FSEv05_FUNCTION_TYPE) * (maxSymbolValue+1) );   /* useless init, but keep static analyzer happy, and we don't need to performance optimize legacy decoders */
     DTableH.tableLog = (U16)tableLog;
     for (s=0; s<=maxSymbolValue; s++) {
         if (normalizedCounter[s]==-1) {
@@ -2658,6 +2659,7 @@
     BYTE headerBuffer[ZSTDv05_frameHeaderSize_max];
 };  /* typedef'd to ZSTDv05_DCtx within "zstd_static.h" */
 
+size_t ZSTDv05_sizeofDCtx (void); /* Hidden declaration */
 size_t ZSTDv05_sizeofDCtx (void) { return sizeof(ZSTDv05_DCtx); }
 
 size_t ZSTDv05_decompressBegin(ZSTDv05_DCtx* dctx)
@@ -2822,7 +2824,7 @@
 }
 
 
-size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+static size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
 {
     const BYTE* const in = (const BYTE* const)src;
     BYTE headerFlags;
@@ -2845,6 +2847,7 @@
 
 static size_t ZSTDv05_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
@@ -2853,8 +2856,8 @@
 
 /*! ZSTDv05_decodeLiteralsBlock() :
     @return : nb of bytes read from src (< srcSize ) */
-size_t ZSTDv05_decodeLiteralsBlock(ZSTDv05_DCtx* dctx,
-                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+static size_t ZSTDv05_decodeLiteralsBlock(ZSTDv05_DCtx* dctx,
+                                    const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
     const BYTE* const istart = (const BYTE*) src;
 
@@ -2988,7 +2991,7 @@
 }
 
 
-size_t ZSTDv05_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+static size_t ZSTDv05_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
                          FSEv05_DTable* DTableLL, FSEv05_DTable* DTableML, FSEv05_DTable* DTableOffb,
                          const void* src, size_t srcSize, U32 flagStaticTable)
 {
@@ -3297,11 +3300,11 @@
     BYTE* const ostart = (BYTE* const)dst;
     BYTE* op = ostart;
     BYTE* const oend = ostart + maxDstSize;
-    size_t errorCode, dumpsLength;
+    size_t errorCode, dumpsLength=0;
     const BYTE* litPtr = dctx->litPtr;
     const BYTE* const litEnd = litPtr + dctx->litSize;
-    int nbSeq;
-    const BYTE* dumps;
+    int nbSeq=0;
+    const BYTE* dumps = NULL;
     U32* DTableLL = dctx->LLTable;
     U32* DTableML = dctx->MLTable;
     U32* DTableOffb = dctx->OffTable;
@@ -3410,10 +3413,10 @@
     BYTE* const oend = ostart + maxDstSize;
     size_t remainingSize = srcSize;
     blockProperties_t blockProperties;
+    memset(&blockProperties, 0, sizeof(blockProperties));
 
     /* Frame Header */
-    {
-        size_t frameHeaderSize;
+    {   size_t frameHeaderSize;
         if (srcSize < ZSTDv05_frameHeaderSize_min+ZSTDv05_blockHeaderSize) return ERROR(srcSize_wrong);
         frameHeaderSize = ZSTDv05_decodeFrameHeader_Part1(dctx, src, ZSTDv05_frameHeaderSize_min);
         if (ZSTDv05_isError(frameHeaderSize)) return frameHeaderSize;
diff --git a/lib/legacy/zstd_v06.c b/lib/legacy/zstd_v06.c
index 8b068b3..60d8d6f 100644
--- a/lib/legacy/zstd_v06.c
+++ b/lib/legacy/zstd_v06.c
@@ -1250,9 +1250,7 @@
 /* **************************************************************
 *  HUF Error Management
 ****************************************************************/
-unsigned HUFv06_isError(size_t code) { return ERR_isError(code); }
-
-const char* HUFv06_getErrorName(size_t code) { return ERR_getErrorName(code); }
+static unsigned HUFv06_isError(size_t code) { return ERR_isError(code); }
 
 
 /*-**************************************************************
@@ -2823,7 +2821,8 @@
     BYTE headerBuffer[ZSTDv06_FRAMEHEADERSIZE_MAX];
 };  /* typedef'd to ZSTDv06_DCtx within "zstd_static.h" */
 
-size_t ZSTDv06_sizeofDCtx (void) { return sizeof(ZSTDv06_DCtx); }   /* non published interface */
+size_t ZSTDv06_sizeofDCtx (void); /* Hidden declaration */
+size_t ZSTDv06_sizeofDCtx (void) { return sizeof(ZSTDv06_DCtx); }
 
 size_t ZSTDv06_decompressBegin(ZSTDv06_DCtx* dctx)
 {
@@ -3022,7 +3021,7 @@
 
 /*! ZSTDv06_getcBlockSize() :
 *   Provides the size of compressed block from block header `src` */
-size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+static size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
 {
     const BYTE* const in = (const BYTE* const)src;
     U32 cSize;
@@ -3041,6 +3040,7 @@
 
 static size_t ZSTDv06_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
@@ -3049,7 +3049,7 @@
 
 /*! ZSTDv06_decodeLiteralsBlock() :
     @return : nb of bytes read from src (< srcSize ) */
-size_t ZSTDv06_decodeLiteralsBlock(ZSTDv06_DCtx* dctx,
+static size_t ZSTDv06_decodeLiteralsBlock(ZSTDv06_DCtx* dctx,
                           const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
     const BYTE* const istart = (const BYTE*) src;
@@ -3183,7 +3183,7 @@
     @return : nb bytes read from src,
               or an error code if it fails, testable with ZSTDv06_isError()
 */
-size_t ZSTDv06_buildSeqTable(FSEv06_DTable* DTable, U32 type, U32 max, U32 maxLog,
+static size_t ZSTDv06_buildSeqTable(FSEv06_DTable* DTable, U32 type, U32 max, U32 maxLog,
                                  const void* src, size_t srcSize,
                                  const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable)
 {
@@ -3213,7 +3213,7 @@
 }
 
 
-size_t ZSTDv06_decodeSeqHeaders(int* nbSeqPtr,
+static size_t ZSTDv06_decodeSeqHeaders(int* nbSeqPtr,
                              FSEv06_DTable* DTableLL, FSEv06_DTable* DTableML, FSEv06_DTable* DTableOffb, U32 flagRepeatTable,
                              const void* src, size_t srcSize)
 {
@@ -3358,7 +3358,7 @@
 }
 
 
-size_t ZSTDv06_execSequence(BYTE* op,
+static size_t ZSTDv06_execSequence(BYTE* op,
                                 BYTE* const oend, seq_t sequence,
                                 const BYTE** litPtr, const BYTE* const litLimit,
                                 const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
@@ -4006,7 +4006,7 @@
                     if (ZSTDv06_isError(hSize)) return hSize;
                     if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
                         memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
-                        zbd->lhSize += iend-ip; ip = iend; notDone = 0;
+                        zbd->lhSize += iend-ip;
                         *dstCapacityPtr = 0;
                         return (hSize - zbd->lhSize) + ZSTDv06_blockHeaderSize;   /* remaining header bytes + next block header */
                     }
diff --git a/lib/legacy/zstd_v07.c b/lib/legacy/zstd_v07.c
index 70b170f..c7bb7a5 100644
--- a/lib/legacy/zstd_v07.c
+++ b/lib/legacy/zstd_v07.c
@@ -2628,7 +2628,7 @@
 
 
 
-void* ZSTDv07_defaultAllocFunction(void* opaque, size_t size)
+static void* ZSTDv07_defaultAllocFunction(void* opaque, size_t size)
 {
     void* address = malloc(size);
     (void)opaque;
@@ -2636,7 +2636,7 @@
     return address;
 }
 
-void ZSTDv07_defaultFreeFunction(void* opaque, void* address)
+static void ZSTDv07_defaultFreeFunction(void* opaque, void* address)
 {
     (void)opaque;
     /* if (address) printf("free %p opaque=%p \n", address, opaque); */
@@ -3150,10 +3150,10 @@
     const BYTE* ip = (const BYTE*)src;
 
     if (srcSize < ZSTDv07_frameHeaderSize_min) return ZSTDv07_frameHeaderSize_min;
+    memset(fparamsPtr, 0, sizeof(*fparamsPtr));
     if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) {
         if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
             if (srcSize < ZSTDv07_skippableHeaderSize) return ZSTDv07_skippableHeaderSize; /* magic number + skippable frame length */
-            memset(fparamsPtr, 0, sizeof(*fparamsPtr));
             fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4);
             fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
             return 0;
@@ -3175,11 +3175,13 @@
         U32 windowSize = 0;
         U32 dictID = 0;
         U64 frameContentSize = 0;
-        if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits, which must be zero */
+        if ((fhdByte & 0x08) != 0)   /* reserved bits, which must be zero */
+            return ERROR(frameParameter_unsupported);
         if (!directMode) {
             BYTE const wlByte = ip[pos++];
             U32 const windowLog = (wlByte >> 3) + ZSTDv07_WINDOWLOG_ABSOLUTEMIN;
-            if (windowLog > ZSTDv07_WINDOWLOG_MAX) return ERROR(frameParameter_unsupported);
+            if (windowLog > ZSTDv07_WINDOWLOG_MAX)
+                return ERROR(frameParameter_unsupported);
             windowSize = (1U << windowLog);
             windowSize += (windowSize >> 3) * (wlByte&7);
         }
@@ -3201,7 +3203,8 @@
             case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
         }
         if (!windowSize) windowSize = (U32)frameContentSize;
-        if (windowSize > windowSizeMax) return ERROR(frameParameter_unsupported);
+        if (windowSize > windowSizeMax)
+            return ERROR(frameParameter_unsupported);
         fparamsPtr->frameContentSize = frameContentSize;
         fparamsPtr->windowSize = windowSize;
         fparamsPtr->dictID = dictID;
@@ -3220,11 +3223,10 @@
                    - frame header not completely provided (`srcSize` too small) */
 unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize)
 {
-    {   ZSTDv07_frameParams fparams;
-        size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
-        if (frResult!=0) return 0;
-        return fparams.frameContentSize;
-    }
+    ZSTDv07_frameParams fparams;
+    size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
+    if (frResult!=0) return 0;
+    return fparams.frameContentSize;
 }
 
 
@@ -3248,7 +3250,7 @@
 
 /*! ZSTDv07_getcBlockSize() :
 *   Provides the size of compressed block from block header `src` */
-size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+static size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
 {
     const BYTE* const in = (const BYTE* const)src;
     U32 cSize;
@@ -3275,7 +3277,7 @@
 
 /*! ZSTDv07_decodeLiteralsBlock() :
     @return : nb of bytes read from src (< srcSize ) */
-size_t ZSTDv07_decodeLiteralsBlock(ZSTDv07_DCtx* dctx,
+static size_t ZSTDv07_decodeLiteralsBlock(ZSTDv07_DCtx* dctx,
                           const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
     const BYTE* const istart = (const BYTE*) src;
@@ -3409,7 +3411,7 @@
     @return : nb bytes read from src,
               or an error code if it fails, testable with ZSTDv07_isError()
 */
-size_t ZSTDv07_buildSeqTable(FSEv07_DTable* DTable, U32 type, U32 max, U32 maxLog,
+static size_t ZSTDv07_buildSeqTable(FSEv07_DTable* DTable, U32 type, U32 max, U32 maxLog,
                                  const void* src, size_t srcSize,
                                  const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable)
 {
@@ -3439,7 +3441,7 @@
 }
 
 
-size_t ZSTDv07_decodeSeqHeaders(int* nbSeqPtr,
+static size_t ZSTDv07_decodeSeqHeaders(int* nbSeqPtr,
                              FSEv07_DTable* DTableLL, FSEv07_DTable* DTableML, FSEv07_DTable* DTableOffb, U32 flagRepeatTable,
                              const void* src, size_t srcSize)
 {
@@ -3771,7 +3773,7 @@
 }
 
 
-size_t ZSTDv07_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+static size_t ZSTDv07_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
 {
     if (length > dstCapacity) return ERROR(dstSize_tooSmall);
     memset(dst, byte, length);
@@ -3851,7 +3853,7 @@
 *   It avoids reloading the dictionary each time.
 *   `preparedDCtx` must have been properly initialized using ZSTDv07_decompressBegin_usingDict().
 *   Requires 2 contexts : 1 for reference (preparedDCtx), which will not be modified, and 1 to run the decompression operation (dctx) */
-size_t ZSTDv07_decompress_usingPreparedDCtx(ZSTDv07_DCtx* dctx, const ZSTDv07_DCtx* refDCtx,
+static size_t ZSTDv07_decompress_usingPreparedDCtx(ZSTDv07_DCtx* dctx, const ZSTDv07_DCtx* refDCtx,
                                          void* dst, size_t dstCapacity,
                                    const void* src, size_t srcSize)
 {
@@ -4146,7 +4148,7 @@
     ZSTDv07_DCtx* refContext;
 };  /* typedef'd tp ZSTDv07_CDict within zstd.h */
 
-ZSTDv07_DDict* ZSTDv07_createDDict_advanced(const void* dict, size_t dictSize, ZSTDv07_customMem customMem)
+static ZSTDv07_DDict* ZSTDv07_createDDict_advanced(const void* dict, size_t dictSize, ZSTDv07_customMem customMem)
 {
     if (!customMem.customAlloc && !customMem.customFree)
         customMem = defaultCustomMem;
diff --git a/lib/zstd.h b/lib/zstd.h
index 0c20bb7..7b6964b 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -35,31 +35,43 @@
 #endif
 
 
-/*******************************************************************************************************
+/*******************************************************************************
   Introduction
 
-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
   Compression can be done in:
     - a single step (described as Simple API)
     - a single step, reusing a context (described as Explicit context)
     - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
 
-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
-*********************************************************************************************************/
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
+*******************************************************************************/
 
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
 #define ZSTD_VERSION_MINOR    3
-#define ZSTD_VERSION_RELEASE  5
+#define ZSTD_VERSION_RELEASE  6
 
 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< useful to check dll version */
@@ -68,7 +80,7 @@
 #define ZSTD_QUOTE(str) #str
 #define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
 #define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
-ZSTDLIB_API const char* ZSTD_versionString(void);   /* added in v1.3.0 */
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* v1.3.0+ */
 
 /***************************************
 *  Default constant
@@ -211,7 +223,8 @@
  *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
  *  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
  *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
- *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */
+ *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict.
+ *  Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data. */
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
                                          int compressionLevel);
 
@@ -223,7 +236,9 @@
  *  Compression using a digested Dictionary.
  *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
  *  Note that compression level is decided during dictionary creation.
- *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no)
+ *  Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary.
+ *         But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx(). */
 ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
                                             void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize,
@@ -315,7 +330,7 @@
 * *******************************************************************/
 
 typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
-                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
 /*===== ZSTD_CStream management functions =====*/
 ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
 ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
@@ -370,21 +385,28 @@
 
 
 
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
 /****************************************************************************************
- * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
+ *   ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
  * The definitions in this section are considered experimental.
  * They should never be used with a dynamic library, as prototypes may change in the future.
  * They are provided for advanced scenarios.
  * Use them only in association with static linking.
  * ***************************************************************************************/
 
-#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
-#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+ZSTDLIB_API int ZSTD_minCLevel(void);  /*!< minimum negative compression level allowed */
 
-/* --- Constants ---*/
-#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
+/* ---  Constants  ---*/
+#define ZSTD_MAGICNUMBER            0xFD2FB528   /* v0.8+ */
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* v0.7+ */
 #define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
-#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* >= v0.7.0 */
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX   (1<<ZSTD_BLOCKSIZELOG_MAX)   /* define, for static allocation */
 
 #define ZSTD_WINDOWLOG_MAX_32   30
 #define ZSTD_WINDOWLOG_MAX_64   31
@@ -401,8 +423,10 @@
 #define ZSTD_SEARCHLOG_MIN       1
 #define ZSTD_SEARCHLENGTH_MAX    7   /* only for ZSTD_fast, other strategies are limited to 6 */
 #define ZSTD_SEARCHLENGTH_MIN    3   /* only for ZSTD_btopt, other strategies are limited to 4 */
-#define ZSTD_LDM_MINMATCH_MIN    4
+#define ZSTD_TARGETLENGTH_MAX  ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN    0   /* note : comparing this constant to an unsigned results in a tautological test */
 #define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_MINMATCH_MIN    4
 #define ZSTD_LDM_BUCKETSIZELOG_MAX 8
 
 #define ZSTD_FRAMEHEADERSIZE_PREFIX 5   /* minimum input size to know frame header size */
@@ -414,7 +438,8 @@
 static const size_t ZSTD_skippableHeaderSize = 8;  /* magic number + skippable frame length */
 
 
-/*--- Advanced types ---*/
+
+/* ---  Advanced types  --- */
 typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2,
                ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy;   /* from faster to stronger */
 
@@ -489,7 +514,7 @@
  *            however it does mean that all frame data must be present and valid. */
 ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
 
-/** ZSTD_frameHeaderSize() :
+/*! ZSTD_frameHeaderSize() :
  *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
  * @return : size of the Frame Header,
  *           or an error code (if srcSize is too small) */
@@ -721,29 +746,48 @@
 
 /*! ZSTD_resetCStream() :
  *  start a new compression job, using same parameters from previous job.
- *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place..
+ *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
  *  Note that zcs must be init at least once before using ZSTD_resetCStream().
  *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
  *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
  *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
  *  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
- * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ */
 ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
 
 typedef struct {
-    unsigned long long ingested;
-    unsigned long long consumed;
-    unsigned long long produced;
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
 } ZSTD_frameProgression;
 
-/* ZSTD_getFrameProgression():
+/* ZSTD_getFrameProgression() :
  * tells how much data has been ingested (read from input)
  * consumed (input actually compressed) and produced (output) for current frame.
- * Therefore, (ingested - consumed) is amount of input data buffered internally, not yet compressed.
- * Can report progression inside worker threads (multi-threading and non-blocking mode).
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
  */
-ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ *  Tell how many bytes are ready to be flushed immediately.
+ *  Useful for multithreading scenarios (nbWorkers >= 1).
+ *  Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ *  and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ *  if @return == 0, it means either :
+ *  + there is no active job (could be checked with ZSTD_getFrameProgression()), or
+ *  + oldest job is still actively compressing data,
+ *    but everything it has produced has also been flushed so far,
+ *    therefore flushing speed is currently limited by production speed of oldest job
+ *    irrespective of the speed of concurrent newer jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
 
 
 
@@ -1149,16 +1193,21 @@
 
 /*! ZSTD_CCtx_refPrefix() :
  *  Reference a prefix (single-usage dictionary) for next compression job.
- *  Decompression need same prefix to properly regenerate data.
- *  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+ *  Decompression will need same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
  *           Its contain must remain unmodified up to end of compression (ZSTD_e_end).
- *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_p_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
  *           It's a CPU consuming operation, with non-negligible impact on latency.
  *           If there is a need to use same prefix multiple times, consider loadDictionary instead.
- *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *  Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
  *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. */
 ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                        const void* prefix, size_t prefixSize);
@@ -1341,6 +1390,8 @@
 
 /*! ZSTD_DCtx_refPrefix() :
  *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
  *  Prefix is **only used once**. Reference is discarded at end of frame.
  *  End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0.
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
@@ -1379,7 +1430,7 @@
 ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
 
 
-/** ZSTD_getFrameHeader_advanced() :
+/*! ZSTD_getFrameHeader_advanced() :
  *  same as ZSTD_getFrameHeader(),
  *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
 ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr,
@@ -1451,8 +1502,6 @@
         Use ZSTD_insertBlock() for such a case.
 */
 
-#define ZSTD_BLOCKSIZELOG_MAX 17
-#define ZSTD_BLOCKSIZE_MAX   (1<<ZSTD_BLOCKSIZELOG_MAX)   /* define, for static allocation */
 /*=====   Raw zstd block functions  =====*/
 ZSTDLIB_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
 ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
diff --git a/programs/Makefile b/programs/Makefile
index 4202764..8faa511 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -27,9 +27,11 @@
 LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
 LIBVER  := $(shell echo $(LIBVER_SCRIPT))
 
-ZSTD_VERSION=$(LIBVER)
+ZSTD_VERSION = $(LIBVER)
 
-ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version "), 1)
+GREP = grep --color=never
+
+ifeq ($(shell $(CC) -v 2>&1 | $(GREP) -c "gcc version "), 1)
 ALIGN_LOOP = -falign-loops=32
 else
 ALIGN_LOOP =
@@ -38,12 +40,15 @@
 CPPFLAGS+= -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress \
            -I$(ZSTDDIR)/dictBuilder \
            -DXXH_NAMESPACE=ZSTD_
+ifeq ($(OS),Windows_NT)   # MinGW assumed
+CPPFLAGS   += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
+endif
 CFLAGS  ?= -O3
 DEBUGFLAGS+=-Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
             -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-            -Wredundant-decls
+            -Wredundant-decls -Wmissing-prototypes
 CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
 FLAGS    = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
 
@@ -55,11 +60,11 @@
 ZDICT_FILES := $(ZSTDDIR)/dictBuilder/*.c
 ZSTDDECOMP_O = $(ZSTDDIR)/decompress/zstd_decompress.o
 
-ZSTD_LEGACY_SUPPORT ?= 4
+ZSTD_LEGACY_SUPPORT ?= 5
 ZSTDLEGACY_FILES :=
 ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
 ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
-	ZSTDLEGACY_FILES += $(shell ls $(ZSTDDIR)/legacy/*.c | grep 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
+	ZSTDLEGACY_FILES += $(shell ls $(ZSTDDIR)/legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
 endif
 	CPPFLAGS += -I$(ZSTDDIR)/legacy
 else
@@ -129,6 +134,15 @@
 LZ4_MSG := $(NO_LZ4_MSG)
 endif
 
+# enable backtrace symbol names for Linux/Darwin
+ALL_SYMBOLS := 0
+ifeq (,$(filter Windows%, $(OS)))
+ifeq ($(ALL_SYMBOLS), 1)
+DEBUGFLAGS_LD+=-rdynamic
+endif
+endif
+
+
 .PHONY: default
 default: zstd-release
 
@@ -141,7 +155,7 @@
 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)
 
 zstd : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP) $(LZMACPP) $(LZ4CPP)
-zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD)
+zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD) $(DEBUGFLAGS_LD)
 zstd : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 zstd : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
 	@echo "$(THREAD_MSG)"
@@ -155,10 +169,11 @@
 
 .PHONY: zstd-release
 zstd-release: DEBUGFLAGS :=
+zstd-release: DEBUGFLAGS_LD :=
 zstd-release: zstd
 
 zstd32 : CPPFLAGS += $(THREAD_CPP)
-zstd32 : LDFLAGS += $(THREAD_LD) 
+zstd32 : LDFLAGS += $(THREAD_LD)
 zstd32 : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 zstd32 : $(ZSTDLIB_FILES) zstdcli.c fileio.c bench.c datagen.c dibio.c
 ifneq (,$(filter Windows%,$(OS)))
@@ -245,11 +260,29 @@
 #-----------------------------------------------------------------------------
 # make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets
 #-----------------------------------------------------------------------------
-ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS))
+ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku))
 
+EGREP = egrep --color=never
+
+# Print a two column output of targets and their description. To add a target description, put a
+# comment in the Makefile with the format "## <TARGET>: <DESCRIPTION>".  For example:
+#
+## list: Print all targets and their descriptions (if provided)
 .PHONY: list
 list:
-	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+	@TARGETS=$$($(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null \
+		| awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' \
+		| $(EGREP) -v  -e '^[^[:alnum:]]' | sort); \
+	{ \
+	    printf "Target Name\tDescription\n"; \
+	    printf "%0.s-" {1..16}; printf "\t"; printf "%0.s-" {1..40}; printf "\n"; \
+	    for target in $$TARGETS; do \
+	        line=$$($(EGREP) "^##[[:space:]]+$$target:" $(lastword $(MAKEFILE_LIST))); \
+	        description=$$(echo $$line | awk '{i=index($$0,":"); print substr($$0,i+1)}' | xargs); \
+	        printf "$$target\t$$description\n"; \
+	    done \
+	} | column -t -s $$'\t'
+
 
 DESTDIR     ?=
 # directory variables : GNU conventions prefer lowercase
diff --git a/programs/README.md b/programs/README.md
index a308fcc..804cb8b 100644
--- a/programs/README.md
+++ b/programs/README.md
@@ -61,6 +61,13 @@
   In which case, linking stage will fail if `lz4` library cannot be found.
   This is useful to prevent silent feature disabling.
 
+- __ALL_SYMBOLS__ : `zstd` can display a stack backtrace if the execution
+  generates a runtime exception. By default, this feature may be
+  degraded/disabled on some platforms unless additional compiler directives are
+  applied. When triaging a runtime issue, enabling this feature can provide
+  more context to determine the location of the fault.
+  Example : `make zstd ALL_SYMBOLS=1`
+
 
 #### Aggregation of parameters
 CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
@@ -150,7 +157,8 @@
 
 Dictionary builder :
 --train ## : create a dictionary from a training set of files
---train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args
+--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args
+--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fastcover algorithm with optional args
 --train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
  -o file : `file` is dictionary name (default: dictionary)
 --maxdict=# : limit dictionary to specified size (default: 112640)
@@ -185,7 +193,7 @@
 
 Compression Speed vs Ratio | Decompression Speed
 ---------------------------|---------------------
-![Compression Speed vs Ratio](../doc/images/ldmCspeed.png "Compression Speed vs Ratio") | ![Decompression Speed](../doc/images/ldmDspeed.png "Decompression Speed")
+![Compression Speed vs Ratio](https://raw.githubusercontent.com/facebook/zstd/v1.3.3/doc/images/ldmCspeed.png "Compression Speed vs Ratio") | ![Decompression Speed](https://raw.githubusercontent.com/facebook/zstd/v1.3.3/doc/images/ldmDspeed.png "Decompression Speed")
 
 | Method | Compression ratio | Compression speed | Decompression speed  |
 |:-------|------------------:|-------------------------:|---------------------------:|
@@ -208,10 +216,24 @@
 [Silesia compression corpus]: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
 
 | Method | Compression ratio | Compression speed | Decompression speed  |
-|:-------|------------------:|-------------------------:|---------------------------:|
-| `zstd -1`   | `2.878`   | `231.7 MB/s`  | `594.4 MB/s`  |
-| `zstd -1 --long` | `2.929` | `106.5 MB/s` | `517.9 MB/s` |
-| `zstd -5`  | `3.274`    | `77.1 MB/s`  | `464.2 MB/s`  |
-| `zstd -5 --long` | `3.319` | `51.7 MB/s` | `371.9 MB/s` |
-| `zstd -10` | `3.523`    | `16.4 MB/s`   | `489.2 MB/s`  |
-| `zstd -10 --long`| `3.566` | `16.2 MB/s` | `415.7 MB/s`  |
+|:-------|------------------:|------------------:|---------------------:|
+| `zstd -1`        | `2.878` | `231.7 MB/s`      | `594.4 MB/s`   |
+| `zstd -1 --long` | `2.929` | `106.5 MB/s`      | `517.9 MB/s`   |
+| `zstd -5`        | `3.274` | `77.1 MB/s`       | `464.2 MB/s`   |
+| `zstd -5 --long` | `3.319` | `51.7 MB/s`       | `371.9 MB/s`   |
+| `zstd -10`       | `3.523` | `16.4 MB/s`       | `489.2 MB/s`   |
+| `zstd -10 --long`| `3.566` | `16.2 MB/s`       | `415.7 MB/s`   |
+
+
+#### zstdgrep
+
+`zstdgrep` is a utility that makes it possible to `grep` a `.zst` compressed file directly.
+It's used the same way as normal `grep`, for example :
+`zstdgrep pattern file.zst`
+
+`zstdgrep` is _not_ compatible with dictionary compression.
+
+To search in a file compressed with a dictionary,
+it's necessary to decompress it using `zstd` or `zstdcat`,
+and then pipe the result to `grep`. For example :
+`zstdcat -D dictionary -qc -- file.zst | grep pattern`
diff --git a/programs/bench.c b/programs/bench.c
index 09697d1..326c1c1 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -42,6 +42,7 @@
 #include "datagen.h"     /* RDG_genBuffer */
 #include "xxhash.h"
 #include "bench.h"
+#include "zstd_errors.h"
 
 
 /* *************************************
@@ -62,9 +63,11 @@
 #define MB *(1 <<20)
 #define GB *(1U<<30)
 
-static const size_t maxMemory = (sizeof(size_t)==4)  ?  (2 GB - 64 MB) : (size_t)(1ULL << ((sizeof(size_t)*8)-31));
+#define BMK_RUNTEST_DEFAULT_MS 1000
 
-static U32 g_compressibilityDefault = 50;
+static const size_t maxMemory = (sizeof(size_t)==4)  ?
+                    /* 32-bit */ (2 GB - 64 MB) :
+                    /* 64-bit */ (size_t)(1ULL << ((sizeof(size_t)*8)-31));
 
 
 /* *************************************
@@ -90,88 +93,59 @@
 #  define DEBUG 0
 #endif
 #define DEBUGOUTPUT(...) { if (DEBUG) DISPLAY(__VA_ARGS__); }
-#define EXM_THROW(error, ...)  {                      \
+
+#define EXM_THROW_INT(errorNum, ...)  {               \
     DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
-    DISPLAYLEVEL(1, "Error %i : ", error);            \
+    DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
     DISPLAYLEVEL(1, __VA_ARGS__);                     \
     DISPLAYLEVEL(1, " \n");                           \
-    exit(error);                                      \
+    return errorNum;                                  \
 }
 
+#define RETURN_ERROR(errorNum, retType, ...)  {       \
+    retType r;                                        \
+    memset(&r, 0, sizeof(retType));                   \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
+    DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
+    DISPLAYLEVEL(1, __VA_ARGS__);                     \
+    DISPLAYLEVEL(1, " \n");                           \
+    r.tag = errorNum;                                 \
+    return r;                                         \
+}
+
+/* error without displaying */
+#define RETURN_QUIET_ERROR(errorNum, retType, ...)  { \
+    retType r;                                        \
+    memset(&r, 0, sizeof(retType));                   \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
+    DEBUGOUTPUT("Error %i : ", errorNum);             \
+    DEBUGOUTPUT(__VA_ARGS__);                         \
+    DEBUGOUTPUT(" \n");                               \
+    r.tag = errorNum;                                 \
+    return r;                                         \
+}
 
 /* *************************************
 *  Benchmark Parameters
 ***************************************/
-static int g_additionalParam = 0;
-static U32 g_decodeOnly = 0;
 
-void BMK_setAdditionalParam(int additionalParam) { g_additionalParam=additionalParam; }
-
-
-//TODO : Deal with DISPLAYLEVEL for all these set functions
-
-static U32 g_nbSeconds = BMK_TIMETEST_DEFAULT_S;
-
-void BMK_setNbSeconds(unsigned nbSeconds)
-{
-    g_nbSeconds = nbSeconds;
-    DISPLAY("- test >= %u seconds per compression / decompression - \n", g_nbSeconds);
+BMK_advancedParams_t BMK_initAdvancedParams(void) {
+    BMK_advancedParams_t const res = {
+        BMK_both, /* mode */
+        BMK_TIMETEST_DEFAULT_S, /* nbSeconds */
+        0, /* blockSize */
+        0, /* nbWorkers */
+        0, /* realTime */
+        0, /* additionalParam */
+        0, /* ldmFlag */
+        0, /* ldmMinMatch */
+        0, /* ldmHashLog */
+        0, /* ldmBucketSizeLog */
+        0  /* ldmHashEveryLog */
+    };
+    return res;
 }
 
-static size_t g_blockSize = 0;
-
-void BMK_setBlockSize(size_t blockSize)
-{
-    g_blockSize = blockSize;
-    if (g_blockSize) DISPLAY("using blocks of size %u KB \n", (U32)(blockSize>>10));
-}
-
-void BMK_setDecodeOnlyMode(unsigned decodeFlag) { g_decodeOnly = (decodeFlag>0); }
-
-static U32 g_nbWorkers = 0;
-
-void BMK_setNbWorkers(unsigned nbWorkers) {
-#ifndef ZSTD_MULTITHREAD
-    if (nbWorkers > 0) DISPLAY("Note : multi-threading is disabled \n");
-#endif
-    g_nbWorkers = nbWorkers;
-}
-
-static U32 g_realTime = 0;
-void BMK_setRealTime(unsigned priority) {
-    g_realTime = (priority>0);
-}
-
-static U32 g_separateFiles = 0;
-void BMK_setSeparateFiles(unsigned separate) {
-    g_separateFiles = (separate>0);
-}
-
-static U32 g_ldmFlag = 0;
-void BMK_setLdmFlag(unsigned ldmFlag) {
-    g_ldmFlag = ldmFlag;
-}
-
-static U32 g_ldmMinMatch = 0;
-void BMK_setLdmMinMatch(unsigned ldmMinMatch) {
-    g_ldmMinMatch = ldmMinMatch;
-}
-
-static U32 g_ldmHashLog = 0;
-void BMK_setLdmHashLog(unsigned ldmHashLog) {
-    g_ldmHashLog = ldmHashLog;
-}
-
-#define BMK_LDM_PARAM_NOTSET 9999
-static U32 g_ldmBucketSizeLog = BMK_LDM_PARAM_NOTSET;
-void BMK_setLdmBucketSizeLog(unsigned ldmBucketSizeLog) {
-    g_ldmBucketSizeLog = ldmBucketSizeLog;
-}
-
-static U32 g_ldmHashEveryLog = BMK_LDM_PARAM_NOTSET;
-void BMK_setLdmHashEveryLog(unsigned ldmHashEveryLog) {
-    g_ldmHashEveryLog = ldmHashEveryLog;
-}
 
 /* ********************************************************
 *  Bench functions
@@ -191,341 +165,770 @@
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 #define MAX(a,b)    ((a) > (b) ? (a) : (b))
 
-BMK_return_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
-                        const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
-                        const void* dictBuffer, size_t dictBufferSize,
-                        ZSTD_CCtx* ctx, ZSTD_DCtx* dctx,
-                        int displayLevel, const char* displayName)
+static void BMK_initCCtx(ZSTD_CCtx* ctx,   /* resets ctx, then applies every benchmark setting and loads the dictionary */
+    const void* dictBuffer, size_t dictBufferSize, int cLevel,
+    const ZSTD_compressionParameters* comprParams, const BMK_advancedParams_t* adv) {
+    ZSTD_CCtx_reset(ctx);
+    ZSTD_CCtx_resetParameters(ctx);
+    if (adv->nbWorkers==1) {   /* nbWorkers==1 is downgraded to 0 : prefer synchronous mode */
+        ZSTD_CCtx_setParameter(ctx, ZSTD_p_nbWorkers, 0);
+    } else {
+        ZSTD_CCtx_setParameter(ctx, ZSTD_p_nbWorkers, adv->nbWorkers);
+    }
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionLevel, cLevel);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_enableLongDistanceMatching, adv->ldmFlag);   /* long distance matching settings, taken from adv */
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmMinMatch, adv->ldmMinMatch);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmHashLog, adv->ldmHashLog);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmBucketSizeLog, adv->ldmBucketSizeLog);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmHashEveryLog, adv->ldmHashEveryLog);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_windowLog, comprParams->windowLog);   /* explicit compression parameters, taken from comprParams */
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_hashLog, comprParams->hashLog);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_chainLog, comprParams->chainLog);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_searchLog, comprParams->searchLog);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_minMatch, comprParams->searchLength);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_targetLength, comprParams->targetLength);
+    ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionStrategy, comprParams->strategy);
+    ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize);   /* NOTE(review): return codes of this call and of the setters above are not checked */
+}
 
+static void BMK_initDCtx(ZSTD_DCtx* dctx,   /* resets dctx, then loads the dictionary into it */
+    const void* dictBuffer, size_t dictBufferSize) {
+    ZSTD_DCtx_reset(dctx);
+    ZSTD_DCtx_loadDictionary(dctx, dictBuffer, dictBufferSize);   /* NOTE(review): return code is not checked */
+}
+
+
+typedef struct {   /* arguments of BMK_initCCtx(), grouped so they can travel behind one void* payload */
+    ZSTD_CCtx* cctx;
+    const void* dictBuffer;
+    size_t dictBufferSize;
+    int cLevel;
+    const ZSTD_compressionParameters* comprParams;
+    const BMK_advancedParams_t* adv;
+} BMK_initCCtxArgs;
+
+static size_t local_initCCtx(void* payload) {   /* init callback : payload must point to a BMK_initCCtxArgs */
+    BMK_initCCtxArgs* ag = (BMK_initCCtxArgs*)payload;
+    BMK_initCCtx(ag->cctx, ag->dictBuffer, ag->dictBufferSize, ag->cLevel, ag->comprParams, ag->adv);
+    return 0;   /* always returns 0 : BMK_initCCtx() has no result to forward */
+}
+
+typedef struct {   /* arguments of BMK_initDCtx(), grouped so they can travel behind one void* payload */
+    ZSTD_DCtx* dctx;
+    const void* dictBuffer;
+    size_t dictBufferSize;
+} BMK_initDCtxArgs;
+
+static size_t local_initDCtx(void* payload) {   /* init callback : payload must point to a BMK_initDCtxArgs */
+    BMK_initDCtxArgs* ag = (BMK_initDCtxArgs*)payload;
+    BMK_initDCtx(ag->dctx, ag->dictBuffer, ag->dictBufferSize);
+    return 0;   /* always returns 0 : BMK_initDCtx() has no result to forward */
+}
+
+
+/* Compresses one block with ZSTD_compress_generic(..., ZSTD_e_end); `addArgs` is the ZSTD_CCtx* to use */
+static size_t local_defaultCompress(
+                    const void* srcBuffer, size_t srcSize,
+                    void* dstBuffer, size_t dstSize,
+                    void* addArgs)
 {
-    size_t const blockSize = ((g_blockSize>=32 && !g_decodeOnly) ? g_blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
-    U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
-    blockParam_t* const blockTable = (blockParam_t*) malloc(maxNbBlocks * sizeof(blockParam_t));
-    size_t const maxCompressedSize = ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);   /* add some room for safety */
-    void* const compressedBuffer = malloc(maxCompressedSize);
-    void* resultBuffer = malloc(srcSize);
-    BMK_return_t results;
+    size_t moreToFlush = 1;
+    ZSTD_CCtx* const cctx = (ZSTD_CCtx*)addArgs;
+    ZSTD_inBuffer in;
+    ZSTD_outBuffer out;
+    in.src = srcBuffer; in.size = srcSize; in.pos = 0;
+    out.dst = dstBuffer; out.size = dstSize; out.pos = 0;
+    while (moreToFlush) {   /* loop until the compressor reports nothing left to flush */
+        if(out.pos == out.size) {   /* no room left in dstBuffer while data remains */
+            return (size_t)-ZSTD_error_dstSize_tooSmall;
+        }
+        moreToFlush = ZSTD_compress_generic(cctx, &out, &in, ZSTD_e_end);
+        if (ZSTD_isError(moreToFlush)) {
+            return moreToFlush;   /* forward the error code to the caller */
+        }
+    }
+    return out.pos;   /* nb of bytes written into dstBuffer */
+}
 
+/* Decompresses one block with ZSTD_decompress_generic(); `addArgs` is the ZSTD_DCtx* to use */
+static size_t local_defaultDecompress(
+                    const void* srcBuffer, size_t srcSize,
+                    void* dstBuffer, size_t dstCapacity,
+                    void* addArgs)
+{
+    size_t moreToFlush = 1;
+    ZSTD_DCtx* const dctx = (ZSTD_DCtx*)addArgs;
+    ZSTD_inBuffer in;
+    ZSTD_outBuffer out;
+    in.src = srcBuffer; in.size = srcSize; in.pos = 0;
+    out.dst = dstBuffer; out.size = dstCapacity; out.pos = 0;
+    while (moreToFlush) {   /* loop until the decompressor reports nothing left to flush */
+        if(out.pos == out.size) {   /* no room left in dstBuffer while data remains */
+            return (size_t)-ZSTD_error_dstSize_tooSmall;
+        }
+        moreToFlush = ZSTD_decompress_generic(dctx, &out, &in);
+        if (ZSTD_isError(moreToFlush)) {
+            return moreToFlush;   /* forward the error code to the caller */
+        }
+    }
+    return out.pos;   /* nb of bytes written into dstBuffer */
+
+}
+
+
+/*===  Benchmarking an arbitrary function  ===*/
+
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome)
+{
+    return outcome.tag == 0;   /* tag 0 marks a valid result; the error constructor sets it to 1 */
+}
+
+/* warning : this function will stop program execution if outcome is invalid !
+ *           check outcome validity first, using BMK_isSuccessful_runOutcome() */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome)
+{
+    assert(outcome.tag == 0);   /* the validity check is this assert() */
+    return outcome.internal_never_use_directly;
+}
+
+static BMK_runOutcome_t BMK_runOutcome_error(void)   /* builds an outcome flagged as failed */
+{
+    BMK_runOutcome_t b;
+    memset(&b, 0, sizeof(b));   /* zero the payload so an error outcome carries no stale data */
+    b.tag = 1;   /* non-zero tag == error */
+    return b;
+}
+
+static BMK_runOutcome_t BMK_setValid_runTime(BMK_runTime_t runTime)   /* wraps runTime into a successful outcome */
+{
+    BMK_runOutcome_t outcome;
+    outcome.tag = 0;   /* tag 0 == valid result */
+    outcome.internal_never_use_directly = runTime;
+    return outcome;
+}
+
+
+/* BMK_benchFunction() : times `nbLoops` passes of benchFn over `blockCount` blocks. */
+/* initFn is optional (provide NULL if none); it is called once, after the timer has started, so it is measured too. */
+/* benchFn must return a size_t compliant with ZSTD_isError() for error values. */
+/* Each benchFn call receives one source block and the destination buffer of the same index. */
+/* blockResults is optional (provide NULL if not required) : it receives benchFn's result for each block, taken from the first pass. */
+/* On success : nanoSecPerRun = total time / nbLoops, and sumOfReturn = sum of benchFn's first-pass results. */
+/* note : time per loop could be zero if run time < timer resolution */
+BMK_runOutcome_t BMK_benchFunction(
+            BMK_benchFn_t benchFn, void* benchPayload,
+            BMK_initFn_t initFn, void* initPayload,
+            size_t blockCount,
+            const void* const * srcBlockBuffers, const size_t* srcBlockSizes,
+            void* const * dstBlockBuffers, const size_t* dstBlockCapacities,
+            size_t* blockResults,
+            unsigned nbLoops)
+{
+    size_t dstSize = 0;
+
+    if(!nbLoops) {
+        RETURN_QUIET_ERROR(2, BMK_runOutcome_t, "nbLoops must be nonzero ");
+    }
+
+    /* init : pre-fill every destination buffer */
+    {   size_t i;
+        for(i = 0; i < blockCount; i++) {
+            memset(dstBlockBuffers[i], 0xE5, dstBlockCapacities[i]);  /* warm up and erase result buffer */
+        }
+#if 0
+        /* based on testing these seem to lower accuracy of multiple calls of 1 nbLoops vs 1 call of multiple nbLoops
+         * (Makes former slower)
+         */
+        UTIL_sleepMilli(5);  /* give processor time to other processes */
+        UTIL_waitForNextTick();
+#endif
+    }
+
+    /* benchmark : the timer starts before initFn is called */
+    {   UTIL_time_t const clockStart = UTIL_getTime();
+        unsigned loopNb, blockNb;
+        if (initFn != NULL) initFn(initPayload);   /* initFn's return value is ignored */
+        for (loopNb = 0; loopNb < nbLoops; loopNb++) {
+            for (blockNb = 0; blockNb < blockCount; blockNb++) {
+                size_t const res = benchFn(srcBlockBuffers[blockNb], srcBlockSizes[blockNb],
+                                    dstBlockBuffers[blockNb], dstBlockCapacities[blockNb],
+                                    benchPayload);
+                if(ZSTD_isError(res)) {
+                    RETURN_QUIET_ERROR(2, BMK_runOutcome_t,
+                        "Function benchmark failed on block %u of size %u : %s",
+                        blockNb, (U32)dstBlockCapacities[blockNb], ZSTD_getErrorName(res));   /* NOTE(review): the "size" printed is the destination capacity */
+                } else if (loopNb == 0) {   /* results are recorded during the first pass only */
+                    dstSize += res;
+                    if (blockResults != NULL) blockResults[blockNb] = res;
+            }   }
+        }  /* for (loopNb = 0; loopNb < nbLoops; loopNb++) */
+
+        {   U64 const totalTime = UTIL_clockSpanNano(clockStart);
+            BMK_runTime_t rt;
+            rt.nanoSecPerRun = totalTime / nbLoops;   /* integer division : average duration of one pass */
+            rt.sumOfReturn = dstSize;
+            return BMK_setValid_runTime(rt);
+    }   }
+}
+
+
+/* ====  Benchmarking any function, providing intermediate results  ==== */
+
+struct BMK_timedFnState_s {
+    U64 timeSpent_ns;        /* sum of the durations of completed measurement runs */
+    U64 timeBudget_ns;       /* total budget, derived from total_ms at reset */
+    U64 runBudget_ns;        /* target duration of a single run, derived from run_ms at reset */
+    BMK_runTime_t fastestRun;   /* set to maximum values at reset */
+    unsigned nbLoops;        /* nb of loops requested for the next run; starts at 1 */
+    UTIL_time_t coolTime;    /* start of the current active period, used by overheat protection */
+};  /* typedef'd to BMK_timedFnState_t within bench.h */
+
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms)   /* allocates and initializes a state; release it with BMK_freeTimedFnState() */
+{
+    BMK_timedFnState_t* const r = (BMK_timedFnState_t*)malloc(sizeof(*r));
+    if (r == NULL) return NULL;   /* malloc() error */
+    BMK_resetTimedFnState(r, total_ms, run_ms);   /* every field is set here */
+    return r;
+}
+
+void BMK_freeTimedFnState(BMK_timedFnState_t* state) {   /* releases a state obtained from BMK_createTimedFnState() */
+    free(state);   /* free(NULL) is a no-op, so a NULL state is accepted */
+}
+
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms)   /* (re)starts a timed benchmark with new budgets */
+{
+    if (!total_ms) total_ms = 1 ;   /* budgets are at least 1 ms */
+    if (!run_ms) run_ms = 1;
+    if (run_ms > total_ms) run_ms = total_ms;   /* a single run cannot exceed the total budget */
+    timedFnState->timeSpent_ns = 0;
+    timedFnState->timeBudget_ns = (U64)total_ms * TIMELOOP_NANOSEC / 1000;   /* presumably ms -> ns, with TIMELOOP_NANOSEC == 1 second in ns */
+    timedFnState->runBudget_ns = (U64)run_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->fastestRun.nanoSecPerRun = (U64)(-1LL);   /* maximum value : any measured run compares as faster */
+    timedFnState->fastestRun.sumOfReturn = (size_t)(-1LL);
+    timedFnState->nbLoops = 1;
+    timedFnState->coolTime = UTIL_getTime();   /* active period starts now */
+}
+
+/* Tells if the total time budget set in timedFnState for all runs is spent.
+ * NOTE(review) : a failed BMK_benchTimedFn() run returns before timeSpent_ns is updated, so an error alone does not make this return 1. */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState)
+{
+    return (timedFnState->timeSpent_ns >= timedFnState->timeBudget_ns);
+}
+
+
+#define MINUSABLETIME  (TIMELOOP_NANOSEC / 2)  /* 0.5 seconds */
+
+BMK_runOutcome_t BMK_benchTimedFn(   /* repeats BMK_benchFunction() until one run lasts at least half of the run budget */
+            BMK_timedFnState_t* cont,   /* timing state : timeSpent_ns, nbLoops and coolTime are updated here */
+            BMK_benchFn_t benchFn, void* benchPayload,
+            BMK_initFn_t initFn, void* initPayload,
+            size_t blockCount,
+            const void* const* srcBlockBuffers, const size_t* srcBlockSizes,
+            void * const * dstBlockBuffers, const size_t * dstBlockCapacities,
+            size_t* blockResults)
+{
+    U64 const runBudget_ns = cont->runBudget_ns;
+    U64 const runTimeMin_ns = runBudget_ns / 2;   /* shorter runs are not reported */
+    int completed = 0;
+    BMK_runTime_t bestRunTime = cont->fastestRun;   /* NOTE(review): cont->fastestRun is read here but never written back by this function */
+
+    while (!completed) {
+        BMK_runOutcome_t runResult;
+
+        /* Overheat protection */
+        if (UTIL_clockSpanMicro(cont->coolTime) > ACTIVEPERIOD_MICROSEC) {
+            DEBUGOUTPUT("\rcooling down ...    \r");
+            UTIL_sleep(COOLPERIOD_SEC);
+            cont->coolTime = UTIL_getTime();
+        }
+
+        /* one measurement : cont->nbLoops passes over all blocks */
+        runResult = BMK_benchFunction(benchFn, benchPayload,
+                                    initFn, initPayload,
+                                    blockCount,
+                                    srcBlockBuffers, srcBlockSizes,
+                                    dstBlockBuffers, dstBlockCapacities,
+                                    blockResults,
+                                    cont->nbLoops);
+
+        if(!BMK_isSuccessful_runOutcome(runResult)) { /* error : move out, cont->timeSpent_ns is left unchanged */
+            return BMK_runOutcome_error();
+        }
+
+        {   BMK_runTime_t const newRunTime = BMK_extract_runTime(runResult);
+            U64 const loopDuration_ns = newRunTime.nanoSecPerRun * cont->nbLoops;   /* approximate duration of the whole run */
+
+            cont->timeSpent_ns += loopDuration_ns;
+
+            /* estimate nbLoops for next run to last approximately runBudget_ns */
+            if (loopDuration_ns > (runBudget_ns / 50)) {
+                U64 const fastestRun_ns = MIN(bestRunTime.nanoSecPerRun, newRunTime.nanoSecPerRun);
+                cont->nbLoops = (U32)(runBudget_ns / fastestRun_ns) + 1;
+            } else {
+                /* previous run was too short : blindly increase workload by x multiplier */
+                const unsigned multiplier = 10;
+                assert(cont->nbLoops < ((unsigned)-1) / multiplier);  /* avoid overflow */
+                cont->nbLoops *= multiplier;
+            }
+
+            if(loopDuration_ns < runTimeMin_ns) {
+                /* don't report results for which benchmark run time was too small : increased risks of rounding errors */
+                assert(completed == 0);
+                continue;
+            } else {
+                if(newRunTime.nanoSecPerRun < bestRunTime.nanoSecPerRun) {
+                    bestRunTime = newRunTime;
+                }
+                completed = 1;   /* this run was long enough to be reported */
+            }
+        }
+    }   /* while (!completed) */
+
+    return BMK_setValid_runTime(bestRunTime);   /* the better of the accepted run and the initial cont->fastestRun */
+}
+
+
+/* ================================================================= */
+/*      Benchmark Zstandard, mem-to-mem scenarios                    */
+/* ================================================================= */
+
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome)
+{
+    return outcome.tag == 0;   /* tag 0 marks a valid result; the error constructor sets it to 1 */
+}
+
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome)   /* check BMK_isSuccessful_benchOutcome() first */
+{
+    assert(outcome.tag == 0);   /* extracting from an error outcome trips this assert() */
+    return outcome.internal_never_use_directly;
+}
+
+static BMK_benchOutcome_t BMK_benchOutcome_error(void)   /* builds an outcome flagged as failed */
+{
+    BMK_benchOutcome_t b;
+    memset(&b, 0, sizeof(b));   /* zero the payload so an error outcome carries no stale data */
+    b.tag = 1;   /* non-zero tag == error */
+    return b;
+}
+
+static BMK_benchOutcome_t BMK_benchOutcome_setValidResult(BMK_benchResult_t result)   /* wraps result into a successful outcome */
+{
+    BMK_benchOutcome_t b;
+    b.tag = 0;   /* tag 0 == valid result */
+    b.internal_never_use_directly = result;
+    return b;
+}
+
+
+/* benchMem with no allocation */
+static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
+            const void** srcPtrs, size_t* srcSizes,
+            void** cPtrs, size_t* cCapacities, size_t* cSizes,
+            void** resPtrs, size_t* resSizes,
+            void** resultBufferPtr, void* compressedBuffer,
+            size_t maxCompressedSize,
+            BMK_timedFnState_t* timeStateCompress,
+            BMK_timedFnState_t* timeStateDecompress,
+
+            const void* srcBuffer, size_t srcSize,
+            const size_t* fileSizes, unsigned nbFiles,
+            const int cLevel, const ZSTD_compressionParameters* comprParams,
+            const void* dictBuffer, size_t dictBufferSize,
+            ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
+            int displayLevel, const char* displayName,
+            const BMK_advancedParams_t* adv)
+{
+    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize);  /* avoid div by 0 */
+    BMK_benchResult_t benchResult;
     size_t const loadedCompressedSize = srcSize;
     size_t cSize = 0;
     double ratio = 0.;
     U32 nbBlocks;
 
-    /* checks */
-    if (!compressedBuffer || !resultBuffer || !blockTable)
-        EXM_THROW(31, "allocation error : not enough memory");
-
-    if(!ctx || !dctx) 
-        EXM_THROW(31, "error: passed in null context");
+    assert(cctx != NULL); assert(dctx != NULL);
 
     /* init */
-    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* display last 17 characters */
-    if (g_nbWorkers==1) g_nbWorkers=0;   /* prefer synchronous mode */
-
-    if (g_decodeOnly) {  /* benchmark only decompression : source must be already compressed */
+    memset(&benchResult, 0, sizeof(benchResult));
+    if (strlen(displayName)>17) displayName += strlen(displayName) - 17;   /* display last 17 characters */
+    if (adv->mode == BMK_decodeOnly) {  /* benchmark only decompression : source must be already compressed */
         const char* srcPtr = (const char*)srcBuffer;
         U64 totalDSize64 = 0;
         U32 fileNb;
         for (fileNb=0; fileNb<nbFiles; fileNb++) {
             U64 const fSize64 = ZSTD_findDecompressedSize(srcPtr, fileSizes[fileNb]);
-            if (fSize64==0) EXM_THROW(32, "Impossible to determine original size ");
+            if (fSize64==0) RETURN_ERROR(32, BMK_benchOutcome_t, "Impossible to determine original size ");
             totalDSize64 += fSize64;
             srcPtr += fileSizes[fileNb];
         }
         {   size_t const decodedSize = (size_t)totalDSize64;
-            if (totalDSize64 > decodedSize) EXM_THROW(32, "original size is too large");   /* size_t overflow */
-            free(resultBuffer);
-            resultBuffer = malloc(decodedSize);
-            if (!resultBuffer) EXM_THROW(33, "not enough memory");
+            assert((U64)decodedSize == totalDSize64);   /* check overflow */
+            free(*resultBufferPtr);
+            *resultBufferPtr = malloc(decodedSize);
+            if (!(*resultBufferPtr)) {
+                RETURN_ERROR(33, BMK_benchOutcome_t, "not enough memory");
+            }
+            if (totalDSize64 > decodedSize) {  /* size_t overflow */
+                free(*resultBufferPtr);
+                RETURN_ERROR(32, BMK_benchOutcome_t, "original size is too large");
+            }
             cSize = srcSize;
             srcSize = decodedSize;
             ratio = (double)srcSize / (double)cSize;
-    }   }
+        }
+    }
 
-    /* Init blockTable data */
+    /* Init data blocks  */
     {   const char* srcPtr = (const char*)srcBuffer;
         char* cPtr = (char*)compressedBuffer;
-        char* resPtr = (char*)resultBuffer;
+        char* resPtr = (char*)(*resultBufferPtr);
         U32 fileNb;
         for (nbBlocks=0, fileNb=0; fileNb<nbFiles; fileNb++) {
             size_t remaining = fileSizes[fileNb];
-            U32 const nbBlocksforThisFile = g_decodeOnly ? 1 : (U32)((remaining + (blockSize-1)) / blockSize);
+            U32 const nbBlocksforThisFile = (adv->mode == BMK_decodeOnly) ? 1 : (U32)((remaining + (blockSize-1)) / blockSize);
             U32 const blockEnd = nbBlocks + nbBlocksforThisFile;
             for ( ; nbBlocks<blockEnd; nbBlocks++) {
                 size_t const thisBlockSize = MIN(remaining, blockSize);
-                blockTable[nbBlocks].srcPtr = (const void*)srcPtr;
-                blockTable[nbBlocks].srcSize = thisBlockSize;
-                blockTable[nbBlocks].cPtr = (void*)cPtr;
-                blockTable[nbBlocks].cRoom = g_decodeOnly ? thisBlockSize : ZSTD_compressBound(thisBlockSize);
-                blockTable[nbBlocks].cSize = blockTable[nbBlocks].cRoom;
-                blockTable[nbBlocks].resPtr = (void*)resPtr;
-                blockTable[nbBlocks].resSize = g_decodeOnly ? (size_t) ZSTD_findDecompressedSize(srcPtr, thisBlockSize) : thisBlockSize;
+                srcPtrs[nbBlocks] = srcPtr;
+                srcSizes[nbBlocks] = thisBlockSize;
+                cPtrs[nbBlocks] = cPtr;
+                cCapacities[nbBlocks] = (adv->mode == BMK_decodeOnly) ? thisBlockSize : ZSTD_compressBound(thisBlockSize);
+                resPtrs[nbBlocks] = resPtr;
+                resSizes[nbBlocks] = (adv->mode == BMK_decodeOnly) ? (size_t) ZSTD_findDecompressedSize(srcPtr, thisBlockSize) : thisBlockSize;
                 srcPtr += thisBlockSize;
-                cPtr += blockTable[nbBlocks].cRoom;
+                cPtr += cCapacities[nbBlocks];
                 resPtr += thisBlockSize;
                 remaining -= thisBlockSize;
-    }   }   }
+            }
+        }
+    }
 
-    /* warmimg up memory */
-    if (g_decodeOnly) {
+    /* warming up `compressedBuffer` */
+    if (adv->mode == BMK_decodeOnly) {
         memcpy(compressedBuffer, srcBuffer, loadedCompressedSize);
     } else {
         RDG_genBuffer(compressedBuffer, maxCompressedSize, 0.10, 0.50, 1);
     }
 
     /* Bench */
-    {   U64 fastestC = (U64)(-1LL), fastestD = (U64)(-1LL);
-        U64 const crcOrig = g_decodeOnly ? 0 : XXH64(srcBuffer, srcSize, 0);
-        UTIL_time_t coolTime;
-        U64 const maxTime = (g_nbSeconds * TIMELOOP_NANOSEC) + 1;
-        U32 nbDecodeLoops = (U32)((100 MB) / (srcSize+1)) + 1;  /* initial conservative speed estimate */
-        U32 nbCompressionLoops = (U32)((2 MB) / (srcSize+1)) + 1;  /* initial conservative speed estimate */
-        U64 totalCTime=0, totalDTime=0;
-        U32 cCompleted=g_decodeOnly, dCompleted=0;
+    {   U64 const crcOrig = (adv->mode == BMK_decodeOnly) ? 0 : XXH64(srcBuffer, srcSize, 0);
 #       define NB_MARKS 4
-        const char* const marks[NB_MARKS] = { " |", " /", " =",  "\\" };
+        const char* marks[NB_MARKS] = { " |", " /", " =", " \\" };
         U32 markNb = 0;
+        int compressionCompleted = (adv->mode == BMK_decodeOnly);
+        int decompressionCompleted = (adv->mode == BMK_compressOnly);
+        BMK_initCCtxArgs cctxprep;
+        BMK_initDCtxArgs dctxprep;
+        cctxprep.cctx = cctx;
+        cctxprep.dictBuffer = dictBuffer;
+        cctxprep.dictBufferSize = dictBufferSize;
+        cctxprep.cLevel = cLevel;
+        cctxprep.comprParams = comprParams;
+        cctxprep.adv = adv;
+        dctxprep.dctx = dctx;
+        dctxprep.dictBuffer = dictBuffer;
+        dctxprep.dictBufferSize = dictBufferSize;
 
-        coolTime = UTIL_getTime();
-        DISPLAYLEVEL(2, "\r%79s\r", "");
-        while (!cCompleted || !dCompleted) {
+        DISPLAYLEVEL(2, "\r%70s\r", "");   /* blank line */
+        DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize);
 
-            /* overheat protection */
-            if (UTIL_clockSpanMicro(coolTime) > ACTIVEPERIOD_MICROSEC) {
-                DISPLAYLEVEL(2, "\rcooling down ...    \r");
-                UTIL_sleep(COOLPERIOD_SEC);
-                coolTime = UTIL_getTime();
-            }
+        while (!(compressionCompleted && decompressionCompleted)) {
 
-            if (!g_decodeOnly) {
-                /* Compression */
-                DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize);
-                if (!cCompleted) memset(compressedBuffer, 0xE5, maxCompressedSize);  /* warm up and erase result buffer */
+            if (!compressionCompleted) {
+                BMK_runOutcome_t const cOutcome =
+                        BMK_benchTimedFn( timeStateCompress,
+                                        &local_defaultCompress, cctx,
+                                        &local_initCCtx, &cctxprep,
+                                        nbBlocks,
+                                        srcPtrs, srcSizes,
+                                        cPtrs, cCapacities,
+                                        cSizes);
 
-                UTIL_sleepMilli(5);  /* give processor time to other processes */
-                UTIL_waitForNextTick();
-
-                if (!cCompleted) {   /* still some time to do compression tests */
-                    U32 nbLoops = 0;
-                    UTIL_time_t const clockStart = UTIL_getTime();
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_nbWorkers, g_nbWorkers);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionLevel, cLevel);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_enableLongDistanceMatching, g_ldmFlag);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmMinMatch, g_ldmMinMatch);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmHashLog, g_ldmHashLog);
-                    if (g_ldmBucketSizeLog != BMK_LDM_PARAM_NOTSET) {
-                      ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmBucketSizeLog, g_ldmBucketSizeLog);
-                    }
-                    if (g_ldmHashEveryLog != BMK_LDM_PARAM_NOTSET) {
-                      ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmHashEveryLog, g_ldmHashEveryLog);
-                    }
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_windowLog, comprParams->windowLog);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_hashLog, comprParams->hashLog);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_chainLog, comprParams->chainLog);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_searchLog, comprParams->searchLog);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_minMatch, comprParams->searchLength);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_targetLength, comprParams->targetLength);
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionStrategy, comprParams->strategy);
-                    ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize);
-
-                    if (!g_nbSeconds) nbCompressionLoops=1;
-                    for (nbLoops=0; nbLoops<nbCompressionLoops; nbLoops++) {
-                        U32 blockNb;
-                        for (blockNb=0; blockNb<nbBlocks; blockNb++) {
-#if 0   /* direct compression function, for occasional comparison */
-                            ZSTD_parameters const params = ZSTD_getParams(cLevel, blockTable[blockNb].srcSize, dictBufferSize);
-                            blockTable[blockNb].cSize = ZSTD_compress_advanced(ctx,
-                                                            blockTable[blockNb].cPtr, blockTable[blockNb].cRoom,
-                                                            blockTable[blockNb].srcPtr, blockTable[blockNb].srcSize,
-                                                            dictBuffer, dictBufferSize,
-                                                            params);
-#else
-                            size_t moreToFlush = 1;
-                            ZSTD_outBuffer out;
-                            ZSTD_inBuffer in;
-                            in.src = blockTable[blockNb].srcPtr;
-                            in.size = blockTable[blockNb].srcSize;
-                            in.pos = 0;
-                            out.dst = blockTable[blockNb].cPtr;
-                            out.size = blockTable[blockNb].cRoom;
-                            out.pos = 0;
-                            while (moreToFlush) {
-                                moreToFlush = ZSTD_compress_generic(ctx,
-                                                    &out, &in, ZSTD_e_end);
-                                if (ZSTD_isError(moreToFlush))
-                                    EXM_THROW(1, "ZSTD_compress_generic() error : %s",
-                                                ZSTD_getErrorName(moreToFlush));
-                            }
-                            blockTable[blockNb].cSize = out.pos;
-#endif
-                    }   }
-                    {   U64 const loopDuration = UTIL_clockSpanNano(clockStart);
-                        if (loopDuration > 0) {
-                            if (loopDuration < fastestC * nbCompressionLoops)
-                                fastestC = loopDuration / nbCompressionLoops;
-                            nbCompressionLoops = (U32)(TIMELOOP_NANOSEC / fastestC) + 1;
-                        } else {
-                            assert(nbCompressionLoops < 40000000);  /* avoid overflow */
-                            nbCompressionLoops *= 100;
-                        }
-                        totalCTime += loopDuration;
-                        cCompleted = (totalCTime >= maxTime);  /* end compression tests */
-                }   }
-
-                cSize = 0;
-                { U32 blockNb; for (blockNb=0; blockNb<nbBlocks; blockNb++) cSize += blockTable[blockNb].cSize; }
-                ratio = (double)srcSize / (double)cSize;
-                results.result.cSize = cSize;
-                markNb = (markNb+1) % NB_MARKS;
-                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                    double const compressionSpeed = ((double)srcSize / fastestC) * 1000;
-                    int const cSpeedAccuracy = (compressionSpeed < 10.) ? 2 : 1;
-                    results.result.cSpeed = compressionSpeed * 1000000;
-                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
-                            marks[markNb], displayName, (U32)srcSize, (U32)cSize,
-                            ratioAccuracy, ratio,
-                            cSpeedAccuracy, compressionSpeed );
+                if (!BMK_isSuccessful_runOutcome(cOutcome)) {
+                    return BMK_benchOutcome_error();
                 }
-            }  /* if (!g_decodeOnly) */
 
-#if 0       /* disable decompression test */
-            dCompleted=1;
-            (void)totalDTime; (void)fastestD; (void)crcOrig;   /* unused when decompression disabled */
-#else
-            /* Decompression */
-            if (!dCompleted) memset(resultBuffer, 0xD6, srcSize);  /* warm result buffer */
-
-            UTIL_sleepMilli(5); /* give processor time to other processes */
-            UTIL_waitForNextTick();
-
-            if (!dCompleted) {
-                U32 nbLoops = 0;
-                ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuffer, dictBufferSize);
-                UTIL_time_t const clockStart = UTIL_getTime();
-                if (!ddict) EXM_THROW(2, "ZSTD_createDDict() allocation failure");
-                if (!g_nbSeconds) nbDecodeLoops = 1;
-                for (nbLoops=0; nbLoops < nbDecodeLoops; nbLoops++) {
-                    U32 blockNb;
-                    for (blockNb=0; blockNb<nbBlocks; blockNb++) {
-                        size_t const regenSize = ZSTD_decompress_usingDDict(dctx,
-                            blockTable[blockNb].resPtr, blockTable[blockNb].resSize,
-                            blockTable[blockNb].cPtr, blockTable[blockNb].cSize,
-                            ddict);
-                        if (ZSTD_isError(regenSize)) {
-                            EXM_THROW(2, "ZSTD_decompress_usingDDict() failed on block %u of size %u : %s  \n",
-                                      blockNb, (U32)blockTable[blockNb].cSize, ZSTD_getErrorName(regenSize));
-                        }
-                        blockTable[blockNb].resSize = regenSize;
+                {   BMK_runTime_t const cResult = BMK_extract_runTime(cOutcome);
+                    cSize = cResult.sumOfReturn;
+                    ratio = (double)srcSize / cSize;
+                    {   BMK_benchResult_t newResult;
+                        newResult.cSpeed = ((U64)srcSize * TIMELOOP_NANOSEC / cResult.nanoSecPerRun);
+                        benchResult.cSize = cSize;
+                        if (newResult.cSpeed > benchResult.cSpeed)
+                            benchResult.cSpeed = newResult.cSpeed;
                 }   }
-                ZSTD_freeDDict(ddict);
-                {   U64 const loopDuration = UTIL_clockSpanNano(clockStart);
-                    if (loopDuration > 0) {
-                        if (loopDuration < fastestD * nbDecodeLoops)
-                            fastestD = loopDuration / nbDecodeLoops;
-                        nbDecodeLoops = (U32)(TIMELOOP_NANOSEC / fastestD) + 1;
-                    } else {
-                        assert(nbDecodeLoops < 40000000);  /* avoid overflow */
-                        nbDecodeLoops *= 100;
-                    }
-                    totalDTime += loopDuration;
-                    dCompleted = (totalDTime >= maxTime);
-            }   }
 
-            markNb = (markNb+1) % NB_MARKS;
-            {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                double const compressionSpeed = ((double)srcSize / fastestC) * 1000;
-                int const cSpeedAccuracy = (compressionSpeed < 10.) ? 2 : 1;
-                double const decompressionSpeed = ((double)srcSize / fastestD) * 1000;
-                results.result.cSpeed = compressionSpeed * 1000000;
-                results.result.dSpeed = decompressionSpeed * 1000000;
-                DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
-                        marks[markNb], displayName, (U32)srcSize, (U32)cSize,
-                        ratioAccuracy, ratio,
-                        cSpeedAccuracy, compressionSpeed,
-                        decompressionSpeed);
+                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
+                    markNb = (markNb+1) % NB_MARKS;
+                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
+                            marks[markNb], displayName,
+                            (U32)srcSize, (U32)cSize,
+                            ratioAccuracy, ratio,
+                            benchResult.cSpeed < (10 MB) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT);
+                }
+                compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
             }
 
-            /* CRC Checking */
-            {   U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
-                if (!g_decodeOnly && (crcOrig!=crcCheck)) {
-                    size_t u;
-                    DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x   \n", displayName, (unsigned)crcOrig, (unsigned)crcCheck);
-                    for (u=0; u<srcSize; u++) {
-                        if (((const BYTE*)srcBuffer)[u] != ((const BYTE*)resultBuffer)[u]) {
-                            U32 segNb, bNb, pos;
-                            size_t bacc = 0;
-                            DISPLAY("Decoding error at pos %u ", (U32)u);
-                            for (segNb = 0; segNb < nbBlocks; segNb++) {
-                                if (bacc + blockTable[segNb].srcSize > u) break;
-                                bacc += blockTable[segNb].srcSize;
-                            }
-                            pos = (U32)(u - bacc);
-                            bNb = pos / (128 KB);
-                            DISPLAY("(sample %u, block %u, pos %u) \n", segNb, bNb, pos);
-                            if (u>5) {
-                                int n;
-                                DISPLAY("origin: ");
-                                for (n=-5; n<0; n++) DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
-                                DISPLAY(" :%02X:  ", ((const BYTE*)srcBuffer)[u]);
-                                for (n=1; n<3; n++) DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
-                                DISPLAY(" \n");
-                                DISPLAY("decode: ");
-                                for (n=-5; n<0; n++) DISPLAY("%02X ", ((const BYTE*)resultBuffer)[u+n]);
-                                DISPLAY(" :%02X:  ", ((const BYTE*)resultBuffer)[u]);
-                                for (n=1; n<3; n++) DISPLAY("%02X ", ((const BYTE*)resultBuffer)[u+n]);
-                                DISPLAY(" \n");
-                            }
-                            break;
+            if(!decompressionCompleted) {
+                BMK_runOutcome_t const dOutcome =
+                        BMK_benchTimedFn(timeStateDecompress,
+                                        &local_defaultDecompress, dctx,
+                                        &local_initDCtx, &dctxprep,
+                                        nbBlocks,
+                                        (const void *const *)cPtrs, cSizes,
+                                        resPtrs, resSizes,
+                                        NULL);
+
+                if(!BMK_isSuccessful_runOutcome(dOutcome)) {
+                    return BMK_benchOutcome_error();
+                }
+
+                {   BMK_runTime_t const dResult = BMK_extract_runTime(dOutcome);
+                    U64 const newDSpeed = (srcSize * TIMELOOP_NANOSEC / dResult.nanoSecPerRun);
+                    if (newDSpeed > benchResult.dSpeed)
+                        benchResult.dSpeed = newDSpeed;
+                }
+
+                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
+                    markNb = (markNb+1) % NB_MARKS;
+                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
+                            marks[markNb], displayName,
+                            (U32)srcSize, (U32)benchResult.cSize,
+                            ratioAccuracy, ratio,
+                            benchResult.cSpeed < (10 MB) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT,
+                            (double)benchResult.dSpeed / MB_UNIT);
+                }
+                decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
+            }
+        }   /* while (!(compressionCompleted && decompressionCompleted)) */
+
+        /* CRC Checking */
+        {   const BYTE* resultBuffer = (const BYTE*)(*resultBufferPtr);
+            U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
+            if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) {
+                size_t u;
+                DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x   \n", displayName, (unsigned)crcOrig, (unsigned)crcCheck);
+                for (u=0; u<srcSize; u++) {
+                    if (((const BYTE*)srcBuffer)[u] != resultBuffer[u]) {
+                        U32 segNb, bNb, pos;
+                        size_t bacc = 0;
+                        DISPLAY("Decoding error at pos %u ", (U32)u);
+                        for (segNb = 0; segNb < nbBlocks; segNb++) {
+                            if (bacc + srcSizes[segNb] > u) break;
+                            bacc += srcSizes[segNb];
                         }
-                        if (u==srcSize-1) {  /* should never happen */
-                            DISPLAY("no difference detected\n");
-                    }   }
-                    break;
-            }   }   /* CRC Checking */
-#endif
-        }   /* for (testNb = 1; testNb <= (g_nbSeconds + !g_nbSeconds); testNb++) */
+                        pos = (U32)(u - bacc);
+                        bNb = pos / (128 KB);
+                        DISPLAY("(sample %u, block %u, pos %u) \n", segNb, bNb, pos);
+                        if (u>5) {
+                            int n;
+                            DISPLAY("origin: ");
+                            for (n=-5; n<0; n++) DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
+                            DISPLAY(" :%02X:  ", ((const BYTE*)srcBuffer)[u]);
+                            for (n=1; n<3; n++) DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
+                            DISPLAY(" \n");
+                            DISPLAY("decode: ");
+                            for (n=-5; n<0; n++) DISPLAY("%02X ", resultBuffer[u+n]);
+                            DISPLAY(" :%02X:  ", resultBuffer[u]);
+                            for (n=1; n<3; n++) DISPLAY("%02X ", resultBuffer[u+n]);
+                            DISPLAY(" \n");
+                        }
+                        break;
+                    }
+                    if (u==srcSize-1) {  /* should never happen */
+                        DISPLAY("no difference detected\n");
+                    }
+                }
+            }
+        }   /* CRC Checking */
 
         if (displayLevel == 1) {   /* hidden display mode -q, used by python speed benchmark */
-            double const cSpeed = ((double)srcSize / fastestC) * 1000;
-            double const dSpeed = ((double)srcSize / fastestD) * 1000;
-            if (g_additionalParam)
-                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, g_additionalParam);
-            else
+            double const cSpeed = (double)benchResult.cSpeed / MB_UNIT;
+            double const dSpeed = (double)benchResult.dSpeed / MB_UNIT;
+            if (adv->additionalParam) {
+                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
+            } else {
                 DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
+            }
         }
+
         DISPLAYLEVEL(2, "%2i#\n", cLevel);
     }   /* Bench */
 
-    /* clean up */
-    free(blockTable);
-    free(compressedBuffer);
-    free(resultBuffer);
-    results.errorCode = 0;
-    return results;
+    benchResult.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx);
+    return BMK_benchOutcome_setValidResult(benchResult);
 }
 
-static void BMK_benchMemCtxless(const void* srcBuffer, size_t srcSize,
-                        const size_t* fileSizes, unsigned nbFiles, 
-                        int cLevel, const ZSTD_compressionParameters* const comprParams,
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                         const void* dictBuffer, size_t dictBufferSize,
-                         int displayLevel, const char* displayName) 
+                        int displayLevel, const char* displayName, const BMK_advancedParams_t* adv)
+/* Allocates all working buffers, timer states and contexts, runs BMK_benchMemAdvancedNoAlloc(), frees everything, then returns its outcome (or an error). */
 {
-    ZSTD_CCtx* ctx = ZSTD_createCCtx();
-    ZSTD_DCtx* dctx = ZSTD_createDCtx();
-    if(ctx == NULL || dctx == NULL) {
-        EXM_THROW(12, "not enough memory for contexts");
+    int const dstParamsError = !dstBuffer ^ !dstCapacity;  /* dstBuffer and dstCapacity must be both set, or both unset (NULL, 0) */
+
+    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
+    U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
+
+    /* per-block tables (the former blockTable, split by field), maxNbBlocks entries each */
+    const void ** const srcPtrs = (const void**)malloc(maxNbBlocks * sizeof(void*));
+    size_t* const srcSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+
+    void ** const cPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
+    size_t* const cSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+    size_t* const cCapacities = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    void ** const resPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
+    size_t* const resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
+    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    /* destination capacity : the caller's dstCapacity when provided, otherwise a computed bound */
+    const size_t maxCompressedSize = dstCapacity ? dstCapacity : ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);
+    /* allocate a destination buffer only when the caller did not supply one */
+    void* const internalDstBuffer = dstBuffer ? NULL : malloc(maxCompressedSize);
+    void* const compressedBuffer = dstBuffer ? dstBuffer : internalDstBuffer;
+
+    BMK_benchOutcome_t outcome = BMK_benchOutcome_error();  /* error by default */
+    /* NULL when srcSize==0 : that case is then reported as an allocation error below */
+    void* resultBuffer = srcSize ? malloc(srcSize) : NULL;
+    /* true if any of the allocations / creations above returned NULL */
+    int allocationincomplete = !srcPtrs || !srcSizes || !cPtrs ||
+        !cSizes || !cCapacities || !resPtrs || !resSizes ||
+        !timeStateCompress || !timeStateDecompress ||
+        !cctx || !dctx ||
+        !compressedBuffer || !resultBuffer;
+
+    /* run the benchmark only when everything was allocated and the dst parameters are coherent */
+    if (!allocationincomplete && !dstParamsError) {
+        outcome = BMK_benchMemAdvancedNoAlloc(srcPtrs, srcSizes,
+                                            cPtrs, cCapacities, cSizes,
+                                            resPtrs, resSizes,
+                                            &resultBuffer,
+                                            compressedBuffer, maxCompressedSize,
+                                            timeStateCompress, timeStateDecompress,
+                                            srcBuffer, srcSize,
+                                            fileSizes, nbFiles,
+                                            cLevel, comprParams,
+                                            dictBuffer, dictBufferSize,
+                                            cctx, dctx,
+                                            displayLevel, displayName, adv);
     }
-    BMK_benchMem(srcBuffer, srcSize, 
-                fileSizes, nbFiles, 
-                cLevel, comprParams, 
-                dictBuffer, dictBufferSize, 
-                ctx, dctx, 
-                displayLevel, displayName);
-    ZSTD_freeCCtx(ctx);
+
+    /* clean up : executed on every path, whether or not the benchmark ran */
+    BMK_freeTimedFnState(timeStateCompress);
+    BMK_freeTimedFnState(timeStateDecompress);
+
+    ZSTD_freeCCtx(cctx);
     ZSTD_freeDCtx(dctx);
+
+    free(internalDstBuffer);   /* NULL when dstBuffer was supplied by the caller */
+    free(resultBuffer);
+
+    free((void*)srcPtrs);
+    free(srcSizes);
+    free(cPtrs);
+    free(cSizes);
+    free(cCapacities);
+    free(resPtrs);
+    free(resSizes);
+    /* report errors only now, after the clean up above */
+    if(allocationincomplete) {
+        RETURN_ERROR(31, BMK_benchOutcome_t, "allocation error : not enough memory");
+    }
+
+    if(dstParamsError) {
+        RETURN_ERROR(32, BMK_benchOutcome_t, "Dst parameters not coherent");
+    }
+    return outcome;
 }
 
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
+                        const void* dictBuffer, size_t dictBufferSize,
+                        int displayLevel, const char* displayName) {
+    /* Thin wrapper : forwards to BMK_benchMemAdvanced() with the parameters returned by BMK_initAdvancedParams(). */
+    BMK_advancedParams_t const adv = BMK_initAdvancedParams();
+    return BMK_benchMemAdvanced(srcBuffer, srcSize,
+                                NULL, 0,   /* dstBuffer, dstCapacity : none supplied, BMK_benchMemAdvanced() allocates its own */
+                                fileSizes, nbFiles,
+                                cLevel, comprParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName, &adv);
+}
+
+static BMK_benchOutcome_t BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
+                            const size_t* fileSizes, unsigned nbFiles,
+                            int cLevel, const ZSTD_compressionParameters* comprParams,
+                            const void* dictBuffer, size_t dictBufferSize,
+                            int displayLevel, const char* displayName,
+                            BMK_advancedParams_t const * const adv)
+{   /* Trims displayName, applies adv->realTime, prints the --quiet header, then benchmarks cLevel. */
+    const char* pch = strrchr(displayName, '\\'); /* Windows */
+    if (!pch) pch = strrchr(displayName, '/');    /* Linux */
+    if (pch) displayName = pch+1;                 /* display only what follows that separator */
+    /* adv->realTime : announce it, then switch to real-time priority */
+    if (adv->realTime) {
+        DISPLAYLEVEL(2, "Note : switching to real-time priority \n");
+        SET_REALTIME_PRIORITY;
+    }
+
+    if (displayLevel == 1 && !adv->additionalParam)   /* --quiet mode */
+        DISPLAY("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n",
+                ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING,
+                (U32)benchedSize, adv->nbSeconds, (U32)(adv->blockSize>>10));
+    /* no destination buffer supplied (NULL, 0) : BMK_benchMemAdvanced() allocates one internally */
+    return BMK_benchMemAdvanced(srcBuffer, benchedSize,
+                                NULL, 0,
+                                fileSizes, nbFiles,
+                                cLevel, comprParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName, adv);
+}
+
+BMK_benchOutcome_t BMK_syntheticTest(int cLevel, double compressibility,
+                          const ZSTD_compressionParameters* compressionParams,
+                          int displayLevel, const BMK_advancedParams_t* adv)
+{   /* Benchmarks cLevel on a buffer filled by RDG_genBuffer() rather than on files. */
+    char name[20] = {0};
+    size_t const benchedSize = 10000000;   /* size of the generated buffer, in bytes */
+    void* srcBuffer;
+    BMK_benchOutcome_t res;
+    /* levels above ZSTD_maxCLevel() are rejected with an error outcome */
+    if (cLevel > ZSTD_maxCLevel()) {
+        RETURN_ERROR(15, BMK_benchOutcome_t, "Invalid Compression Level");
+    }
+
+    /* Memory allocation */
+    srcBuffer = malloc(benchedSize);
+    if (!srcBuffer) RETURN_ERROR(21, BMK_benchOutcome_t, "not enough memory");
+
+    /* Fill input buffer */
+    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
+
+    /* Bench : the buffer is presented as a single file of benchedSize bytes */
+    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
+    res = BMK_benchCLevel(srcBuffer, benchedSize,
+                    &benchedSize /* fileSizes */, 1 /* nbFiles */,
+                    cLevel, compressionParams,
+                    NULL, 0,  /* no dictionary */
+                    displayLevel, name, adv);
+
+    /* clean up */
+    free(srcBuffer);
+
+    return res;
+}
+
+
+
 static size_t BMK_findMaxMem(U64 requiredMem)
 {
     size_t const step = 64 MB;
@@ -538,52 +941,19 @@
     do {
         testmem = (BYTE*)malloc((size_t)requiredMem);
         requiredMem -= step;
-    } while (!testmem);
+    } while (!testmem && requiredMem > 0);
 
     free(testmem);
     return (size_t)(requiredMem);
 }
 
-/* returns average stats over all range [cLevel, cLevelLast] */
-static void BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
-                            const size_t* fileSizes, unsigned nbFiles,
-                            const int cLevel, const int cLevelLast, const ZSTD_compressionParameters* comprParams,
-                            const void* dictBuffer, size_t dictBufferSize,
-                            int displayLevel, const char* displayName)
-{
-    int l;
-
-    const char* pch = strrchr(displayName, '\\'); /* Windows */
-    if (!pch) pch = strrchr(displayName, '/'); /* Linux */
-    if (pch) displayName = pch+1;
-
-    if (g_realTime) {
-        DISPLAYLEVEL(2, "Note : switching to real-time priority \n");
-        SET_REALTIME_PRIORITY;
-    }
-
-    if (displayLevel == 1 && !g_additionalParam)
-        DISPLAY("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n", ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING, (U32)benchedSize, g_nbSeconds, (U32)(g_blockSize>>10));
-
-    for (l=cLevel; l <= cLevelLast; l++) {
-        if (l==0) continue;  /* skip level 0 */
-        BMK_benchMemCtxless(srcBuffer, benchedSize,
-                    fileSizes, nbFiles, 
-                    l, comprParams, 
-                    dictBuffer, dictBufferSize, 
-                    displayLevel, displayName);
-    }
-
-    return;
-}
-
-
 /*! BMK_loadFiles() :
  *  Loads `buffer` with content of files listed within `fileNamesTable`.
  *  At most, fills `buffer` entirely. */
-static void BMK_loadFiles(void* buffer, size_t bufferSize,
-                          size_t* fileSizes, const char* const * const fileNamesTable, 
-                          unsigned nbFiles, int displayLevel)
+static int BMK_loadFiles(void* buffer, size_t bufferSize,
+                         size_t* fileSizes,
+                         const char* const * fileNamesTable, unsigned nbFiles,
+                         int displayLevel)
 {
     size_t pos = 0, totalSize = 0;
     unsigned n;
@@ -601,44 +971,69 @@
             continue;
         }
         f = fopen(fileNamesTable[n], "rb");
-        if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]);
+        if (f==NULL) EXM_THROW_INT(10, "impossible to open file %s", fileNamesTable[n]);
         DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[n]);
         if (fileSize > bufferSize-pos) fileSize = bufferSize-pos, nbFiles=n;   /* buffer too small - stop after this file */
-        { size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
-          if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
-          pos += readSize; }
+        {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
+            if (readSize != (size_t)fileSize) EXM_THROW_INT(11, "could not read %s", fileNamesTable[n]);
+            pos += readSize;
+        }
         fileSizes[n] = (size_t)fileSize;
         totalSize += (size_t)fileSize;
         fclose(f);
     }
 
-    if (totalSize == 0) EXM_THROW(12, "no data to bench");
+    if (totalSize == 0) EXM_THROW_INT(12, "no data to bench");
+    return 0;
 }
 
-static void BMK_benchFileTable(const char* const * const fileNamesTable, unsigned const nbFiles,
-                               const char* const dictFileName, int const cLevel, int const cLevelLast,
-                               const ZSTD_compressionParameters* const compressionParams, int displayLevel)
+BMK_benchOutcome_t BMK_benchFilesAdvanced(
+                        const char* const * fileNamesTable, unsigned nbFiles,
+                        const char* dictFileName, int cLevel,
+                        const ZSTD_compressionParameters* compressionParams,
+                        int displayLevel, const BMK_advancedParams_t* adv)
 {
-    void* srcBuffer;
+    void* srcBuffer = NULL;
     size_t benchedSize;
     void* dictBuffer = NULL;
     size_t dictBufferSize = 0;
-    size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
+    size_t* fileSizes = NULL;
+    BMK_benchOutcome_t res;
     U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
 
-    if (!fileSizes) EXM_THROW(12, "not enough memory for fileSizes");
+    if (!nbFiles) {
+        RETURN_ERROR(14, BMK_benchOutcome_t, "No Files to Benchmark");
+    }
+
+    if (cLevel > ZSTD_maxCLevel()) {
+        RETURN_ERROR(15, BMK_benchOutcome_t, "Invalid Compression Level");
+    }
+
+    fileSizes = (size_t*)calloc(nbFiles, sizeof(size_t));
+    if (!fileSizes) RETURN_ERROR(12, BMK_benchOutcome_t, "not enough memory for fileSizes");
 
     /* Load dictionary */
     if (dictFileName != NULL) {
         U64 const dictFileSize = UTIL_getFileSize(dictFileName);
-        if (dictFileSize > 64 MB)
-            EXM_THROW(10, "dictionary file %s too large", dictFileName);
+        if (dictFileSize > 64 MB) {
+            free(fileSizes);
+            RETURN_ERROR(10, BMK_benchOutcome_t, "dictionary file %s too large", dictFileName);
+        }
         dictBufferSize = (size_t)dictFileSize;
         dictBuffer = malloc(dictBufferSize);
-        if (dictBuffer==NULL)
-            EXM_THROW(11, "not enough memory for dictionary (%u bytes)",
+        if (dictBuffer==NULL) {
+            free(fileSizes);
+            RETURN_ERROR(11, BMK_benchOutcome_t, "not enough memory for dictionary (%u bytes)",
                             (U32)dictBufferSize);
-        BMK_loadFiles(dictBuffer, dictBufferSize, fileSizes, &dictFileName, 1, displayLevel);
+        }
+
+        {   int const errorCode = BMK_loadFiles(dictBuffer, dictBufferSize,
+                                                fileSizes, &dictFileName /*?*/,
+                                                1 /*?*/, displayLevel);
+            if (errorCode) {
+                res = BMK_benchOutcome_error();
+                goto _cleanUp;
+        }   }
     }
 
     /* Memory allocation & restrictions */
@@ -646,97 +1041,49 @@
     if ((U64)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
     if (benchedSize < totalSizeToLoad)
         DISPLAY("Not enough memory; testing %u MB only...\n", (U32)(benchedSize >> 20));
-    srcBuffer = malloc(benchedSize);
-    if (!srcBuffer) EXM_THROW(12, "not enough memory");
+
+    srcBuffer = benchedSize ? malloc(benchedSize) : NULL;
+    if (!srcBuffer) {
+        free(dictBuffer);
+        free(fileSizes);
+        RETURN_ERROR(12, BMK_benchOutcome_t, "not enough memory");
+    }
 
     /* Load input buffer */
-    BMK_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles, displayLevel);
-
-    /* Bench */
-    if (g_separateFiles) {
-        const BYTE* srcPtr = (const BYTE*)srcBuffer;
-        U32 fileNb;
-        BMK_result_t* resultarray = (BMK_result_t*)malloc(sizeof(BMK_result_t) * nbFiles);
-        if(resultarray == NULL) EXM_THROW(12, "not enough memory");
-        for (fileNb=0; fileNb<nbFiles; fileNb++) {
-            size_t const fileSize = fileSizes[fileNb];
-            BMK_benchCLevel(srcPtr, fileSize,
-                            fileSizes+fileNb, 1, 
-                            cLevel, cLevelLast, compressionParams,
-                            dictBuffer, dictBufferSize, 
-                            displayLevel, fileNamesTable[fileNb]);
-            srcPtr += fileSize;
-        }
-
-    } else {
-        char mfName[20] = {0};
-        snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
-        {   const char* const displayName = (nbFiles > 1) ? mfName : fileNamesTable[0];
-            BMK_benchCLevel(srcBuffer, benchedSize, 
-                            fileSizes, nbFiles, 
-                            cLevel, cLevelLast, compressionParams,
-                            dictBuffer, dictBufferSize, 
-                            displayLevel, displayName);
+    {   int const errorCode = BMK_loadFiles(srcBuffer, benchedSize,
+                                        fileSizes, fileNamesTable, nbFiles,
+                                        displayLevel);
+        if (errorCode) {
+            res = BMK_benchOutcome_error();
+            goto _cleanUp;
     }   }
 
-    /* clean up */
+    /* Bench */
+    {   char mfName[20] = {0};
+        snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
+        {   const char* const displayName = (nbFiles > 1) ? mfName : fileNamesTable[0];
+            res = BMK_benchCLevel(srcBuffer, benchedSize,
+                                fileSizes, nbFiles,
+                                cLevel, compressionParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName,
+                                adv);
+    }   }
+
+_cleanUp:
     free(srcBuffer);
     free(dictBuffer);
     free(fileSizes);
+    return res;
 }
 
 
-static void BMK_syntheticTest(int cLevel, int cLevelLast, double compressibility,
-                              const ZSTD_compressionParameters* compressionParams,
-                              int displayLevel)
+BMK_benchOutcome_t BMK_benchFiles(
+                    const char* const * fileNamesTable, unsigned nbFiles,
+                    const char* dictFileName,
+                    int cLevel, const ZSTD_compressionParameters* compressionParams,
+                    int displayLevel)
 {
-    char name[20] = {0};
-    size_t benchedSize = 10000000;
-    void* const srcBuffer = malloc(benchedSize);
-
-    /* Memory allocation */
-    if (!srcBuffer) EXM_THROW(21, "not enough memory");
-
-    /* Fill input buffer */
-    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
-
-    /* Bench */
-    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
-    BMK_benchCLevel(srcBuffer, benchedSize, 
-                    &benchedSize, 1, 
-                    cLevel, cLevelLast, compressionParams, 
-                    NULL, 0, 
-                    displayLevel, name);
-
-    /* clean up */
-    free(srcBuffer);
-}
-
-
-static void BMK_benchFilesFull(const char** fileNamesTable, unsigned nbFiles,
-                   const char* dictFileName, 
-                   int cLevel, int cLevelLast, 
-                   const ZSTD_compressionParameters* compressionParams, int displayLevel)
-{
-    double const compressibility = (double)g_compressibilityDefault / 100;
-
-    if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
-    if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
-    if (cLevelLast < cLevel) cLevelLast = cLevel;
-    if (cLevelLast > cLevel)
-        DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
-
-    if (nbFiles == 0)
-        BMK_syntheticTest(cLevel, cLevelLast, compressibility, compressionParams, displayLevel);
-    else
-        BMK_benchFileTable(fileNamesTable, nbFiles, dictFileName, cLevel, cLevelLast, compressionParams, displayLevel);
-}
-
-int BMK_benchFiles(const char** fileNamesTable, unsigned nbFiles,
-                   const char* dictFileName,
-                   int cLevel, int cLevelLast,
-                   const ZSTD_compressionParameters* compressionParams,
-                   int displayLevel) {
-    BMK_benchFilesFull(fileNamesTable, nbFiles, dictFileName, cLevel, cLevelLast, compressionParams, displayLevel);
-    return 0;
+    BMK_advancedParams_t const adv = BMK_initAdvancedParams();   /* thin wrapper : no caller-supplied advanced parameters at this entry point */
+    return BMK_benchFilesAdvanced(fileNamesTable, nbFiles, dictFileName, cLevel, compressionParams, displayLevel, &adv);
 }
diff --git a/programs/bench.h b/programs/bench.h
index 0ba6f89..13ca5b5 100644
--- a/programs/bench.h
+++ b/programs/bench.h
@@ -15,50 +15,286 @@
 #ifndef BENCH_H_121279284357
 #define BENCH_H_121279284357
 
+/* ===  Dependencies  === */
 #include <stddef.h>   /* size_t */
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressionParameters */
 #include "zstd.h"     /* ZSTD_compressionParameters */
 
+
+/* ===  Constants  === */
+
+#define MB_UNIT 1000000
+
+
+/* ===  Benchmark functions  === */
+
+/* Creates a variant `variantName`, able to express "error or valid result".
+ * Functions with return type `variantName`
+ * must first check if result is valid, using BMK_isSuccessful_*(),
+ * and only then can extract `baseType`.
+ */
+#define VARIANT_ERROR_RESULT(baseType, variantName)  \
+                                             \
+typedef struct {                             \
+    baseType internal_never_use_directly;    \
+    int tag;                                 \
+} variantName
+
+
 typedef struct {
     size_t cSize;
-    double cSpeed;   /* bytes / sec */
-    double dSpeed;
-} BMK_result_t;
+    unsigned long long cSpeed;   /* bytes / sec */
+    unsigned long long dSpeed;
+    size_t cMem;                 /* memory usage during compression */
+} BMK_benchResult_t;
 
-/* 0 = no Error */
-typedef struct {
-	int errorCode;
-	BMK_result_t result;
-} BMK_return_t;
+VARIANT_ERROR_RESULT(BMK_benchResult_t, BMK_benchOutcome_t);
 
-/* called in cli */
-int BMK_benchFiles(const char** fileNamesTable, unsigned nbFiles, const char* dictFileName,
-                   int cLevel, int cLevelLast, const ZSTD_compressionParameters* compressionParams, 
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome);
+
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_benchOutcome()
+ */
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome);
+
+
+/*! BMK_benchFiles() -- called by zstdcli */
+/*  Loads files from fileNamesTable into memory,
+ *  and an optional dictionary from dictFileName (can be NULL),
+ *  then uses benchMem().
+ *  fileNamesTable - name of files to benchmark.
+ *  nbFiles - number of files (size of fileNamesTable), must be > 0.
+ *  dictFileName - name of dictionary file to load.
+ *  cLevel - compression level to benchmark, errors if invalid.
+ *  compressionParams - advanced compression Parameters.
+ *  displayLevel - what gets printed:
+ *      0 : no display;
+ *      1 : errors;
+ *      2 : + result + interaction + warnings;
+ *      3 : + information;
+ *      4 : + debug
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
+ */
+BMK_benchOutcome_t BMK_benchFiles(
+                   const char* const * fileNamesTable, unsigned nbFiles,
+                   const char* dictFileName,
+                   int cLevel, const ZSTD_compressionParameters* compressionParams,
                    int displayLevel);
 
-/* basic benchmarking function, called in paramgrill
- * ctx, dctx must be valid */
-BMK_return_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+
+typedef enum {
+    BMK_both = 0,
+    BMK_decodeOnly = 1,
+    BMK_compressOnly = 2
+} BMK_mode_t;
+
+typedef struct {
+    BMK_mode_t mode;            /* 0: both, 1: decode only, 2: compress only */
+    unsigned nbSeconds;         /* default timing is in nbSeconds */
+    size_t blockSize;           /* Maximum size of each block*/
+    unsigned nbWorkers;         /* multithreading */
+    unsigned realTime;          /* real time priority */
+    int additionalParam;        /* used by python speed benchmark */
+    unsigned ldmFlag;           /* enables long distance matching */
+    unsigned ldmMinMatch;       /* below: parameters for long distance matching, see zstd.1.md */
+    unsigned ldmHashLog;
+    unsigned ldmBucketSizeLog;
+    unsigned ldmHashEveryLog;
+} BMK_advancedParams_t;
+
+/* returns default parameters used by nonAdvanced functions */
+BMK_advancedParams_t BMK_initAdvancedParams(void);
+
+/*! BMK_benchFilesAdvanced():
+ *  Same as BMK_benchFiles(),
+ *  with more controls, provided through BMK_advancedParams_t structure */
+BMK_benchOutcome_t BMK_benchFilesAdvanced(
+                   const char* const * fileNamesTable, unsigned nbFiles,
+                   const char* dictFileName,
+                   int cLevel, const ZSTD_compressionParameters* compressionParams,
+                   int displayLevel, const BMK_advancedParams_t* adv);
+
+/*! BMK_syntheticTest() -- called from zstdcli */
+/*  Generates a sample with datagen, using compressibility argument */
+/*  cLevel - compression level to benchmark, errors if invalid
+ *  compressibility - determines compressibility of sample
+ *  compressionParams - basic compression Parameters
+ *  displayLevel - see benchFiles
+ *  adv - see BMK_advancedParams_t
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
+ */
+BMK_benchOutcome_t BMK_syntheticTest(
+                              int cLevel, double compressibility,
+                              const ZSTD_compressionParameters* compressionParams,
+                              int displayLevel, const BMK_advancedParams_t* adv);
+
+
+
+/* ===  Benchmark Zstandard in a memory-to-memory scenario  === */
+
+/** BMK_benchMem() -- core benchmarking function, called in paramgrill
+ *  applies ZSTD_compress_generic() and ZSTD_decompress_generic() on data in srcBuffer
+ *  with specific compression parameters provided by other arguments using benchFunction
+ *  (cLevel, comprParams + adv in advanced Mode) */
+/*  srcBuffer - data source, expected to be valid compressed data if in Decode Only Mode
+ *  srcSize - size of data in srcBuffer
+ *  fileSizes - srcBuffer is considered cut into 1+ segments, to compress separately.
+ *              note : sum(fileSizes) must be == srcSize.  (<== ensure it's properly checked)
+ *  nbFiles - nb of segments
+ *  cLevel - compression level
+ *  comprParams - basic compression parameters
+ *  dictBuffer - a dictionary if used, null otherwise
+ *  dictBufferSize - size of dictBuffer, 0 otherwise
+ *  displayLevel - see BMK_benchFiles
+ *  displayName - name used by display
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
+ */
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
                         const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                         const void* dictBuffer, size_t dictBufferSize,
-                        ZSTD_CCtx* ctx, ZSTD_DCtx* dctx,
                         int displayLevel, const char* displayName);
 
-/* Set Parameters */
-void BMK_setNbSeconds(unsigned nbLoops);
-void BMK_setBlockSize(size_t blockSize);
-void BMK_setNbWorkers(unsigned nbWorkers);
-void BMK_setRealTime(unsigned priority);
-void BMK_setNotificationLevel(unsigned level);
-void BMK_setSeparateFiles(unsigned separate);
-void BMK_setAdditionalParam(int additionalParam);
-void BMK_setDecodeOnlyMode(unsigned decodeFlag);
-void BMK_setLdmFlag(unsigned ldmFlag);
-void BMK_setLdmMinMatch(unsigned ldmMinMatch);
-void BMK_setLdmHashLog(unsigned ldmHashLog);
-void BMK_setLdmBucketSizeLog(unsigned ldmBucketSizeLog);
-void BMK_setLdmHashEveryLog(unsigned ldmHashEveryLog);
+/* BMK_benchMemAdvanced() : same as BMK_benchMem()
+ * with following additional options :
+ * dstBuffer - destination buffer to write compressed output in, NULL if none provided.
+ * dstCapacity - capacity of destination buffer, give 0 if dstBuffer = NULL
+ * adv - see BMK_advancedParams_t
+ */
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
+                        const void* dictBuffer, size_t dictBufferSize,
+                        int displayLevel, const char* displayName,
+                        const BMK_advancedParams_t* adv);
+
+
+
+/* ====  Benchmarking any function, iterated on a set of blocks  ==== */
+
+typedef struct {
+    unsigned long long nanoSecPerRun;  /* time per iteration */
+    size_t sumOfReturn;       /* sum of return values */
+} BMK_runTime_t;
+
+VARIANT_ERROR_RESULT(BMK_runTime_t, BMK_runOutcome_t);
+
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome);
+
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_runOutcome()
+ */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome);
+
+
+
+typedef size_t (*BMK_benchFn_t)(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* customPayload);
+typedef size_t (*BMK_initFn_t)(void* initPayload);
+
+
+/* BMK_benchFunction() :
+ * This function times the execution of 2 argument functions, benchFn and initFn  */
+
+/* benchFn - (*benchFn)(srcBuffers[i], srcSizes[i], dstBuffers[i], dstCapacities[i], benchPayload)
+ *      is run nbLoops times
+ * initFn - (*initFn)(initPayload) is run once per benchmark, at the beginning.
+ *      This argument can be NULL, in which case nothing is run.
+ * blockCount - number of blocks. Size of all array parameters : srcBuffers, srcSizes, dstBuffers, dstCapacities, blockResults
+ * srcBuffers - an array of buffers to be operated on by benchFn
+ * srcSizes - an array of the sizes of above buffers
+ * dstBuffers - an array of buffers to be written into by benchFn
+ * dstCapacities - an array of the capacities of above buffers
+ * blockResults - Optional: store the return value of benchFn for each block. Use NULL if this result is not requested.
+ * nbLoops - defines number of times benchFn is run.
+ * @return: a variant, which expresses either an error, or can generate a valid BMK_runTime_t result.
+ *          Use BMK_isSuccessful_runOutcome() to check if function was successful.
+ *          If yes, extract the result with BMK_extract_runTime(),
+ *          it will contain :
+ *              .sumOfReturn : the sum of all return values of benchFn through all of blocks
+ *              .nanoSecPerRun : time per run of benchFn + (time for initFn / nbLoops)
+ *          .sumOfReturn is generally intended for functions which return a # of bytes written into dstBuffer,
+ *              in which case, this value will be the total amount of bytes written into dstBuffer.
+ */
+BMK_runOutcome_t BMK_benchFunction(
+                        BMK_benchFn_t benchFn, void* benchPayload,
+                        BMK_initFn_t initFn, void* initPayload,
+                        size_t blockCount,
+                        const void *const * srcBuffers, const size_t* srcSizes,
+                        void *const * dstBuffers, const size_t* dstCapacities,
+                        size_t* blockResults,
+                        unsigned nbLoops);
+
+
+
+/* ====  Benchmark any function, providing intermediate results  ==== */
+
+/* state information tracking benchmark session */
+typedef struct BMK_timedFnState_s BMK_timedFnState_t;
+
+/* BMK_createTimedFnState() and BMK_resetTimedFnState() :
+ * Create/Set BMK_timedFnState_t for next benchmark session,
+ * which shall last a minimum of total_ms milliseconds,
+ * producing intermediate results, paced at interval of (approximately) run_ms.
+ */
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms);
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms);
+void BMK_freeTimedFnState(BMK_timedFnState_t* state);
+
+
+/* Tells if duration of all benchmark runs has exceeded total_ms
+ */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState);
+
+
+/* BMK_benchTimedFn() :
+ * Similar to BMK_benchFunction(), most arguments being identical.
+ * Automatically determines `nbLoops` so that each result is regularly produced at interval of about run_ms.
+ * Note : minimum `nbLoops` is 1, therefore a run may last more than run_ms, and possibly even more than total_ms.
+ * Usage - initialize timedFnState, select benchmark duration (total_ms) and each measurement duration (run_ms)
+ *         call BMK_benchTimedFn() repeatedly, each measurement is supposed to last about run_ms
+ *         Check if total time budget is spent or exceeded, using BMK_isCompleted_TimedFn()
+ */
+BMK_runOutcome_t BMK_benchTimedFn(
+                    BMK_timedFnState_t* timedFnState,
+                    BMK_benchFn_t benchFn, void* benchPayload,
+                    BMK_initFn_t initFn, void* initPayload,
+                    size_t blockCount,
+                    const void *const * srcBlockBuffers, const size_t* srcBlockSizes,
+                    void *const * dstBlockBuffers, const size_t* dstBlockCapacities,
+                    size_t* blockResults);
+
+
+
+
 
 #endif   /* BENCH_H_121279284357 */
 
diff --git a/programs/datagen.c b/programs/datagen.c
index a489d6a..c838365 100644
--- a/programs/datagen.c
+++ b/programs/datagen.c
@@ -13,6 +13,7 @@
 /*-************************************
 *  Dependencies
 **************************************/
+#include "datagen.h"
 #include "platform.h"  /* SET_BINARY_MODE */
 #include <stdlib.h>    /* malloc, free */
 #include <stdio.h>     /* FILE, fwrite, fprintf */
@@ -91,7 +92,7 @@
     return (RDG_rand(seedPtr) & 0x1FF) + 0xF;
 }
 
-void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double matchProba, const BYTE* ldt, unsigned* seedPtr)
+static void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double matchProba, const BYTE* ldt, unsigned* seedPtr)
 {
     BYTE* const buffPtr = (BYTE*)buffer;
     U32 const matchProba32 = (U32)(32768 * matchProba);
diff --git a/programs/dibio.c b/programs/dibio.c
index 112259d..d3fd8cc 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -27,6 +27,7 @@
 #include <string.h>         /* memset */
 #include <stdio.h>          /* fprintf, fopen, ftello64 */
 #include <errno.h>          /* errno */
+#include <assert.h>
 
 #include "mem.h"            /* read */
 #include "error_private.h"
@@ -43,6 +44,7 @@
 #define SAMPLESIZE_MAX (128 KB)
 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
 #define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
+#define FASTCOVER_MEMMULT 1    /* rough estimation : memory cost to analyze 1 byte of sample */
 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
@@ -82,10 +84,6 @@
 /* ********************************************************
 *  Helper functions
 **********************************************************/
-unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); }
-
-const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
-
 #undef MIN
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 
@@ -165,6 +163,7 @@
 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
     U32 seed = 0xFD2FB528;
     unsigned i;
+    assert(nbFiles >= 1);
     for (i = nbFiles - 1; i > 0; --i) {
         unsigned const j = DiB_rand(&seed) % (i + 1);
         const char* const tmp = fileNamesTable[j];
@@ -269,16 +268,19 @@
 
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
-                       int optimizeCover)
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
 {
     unsigned const displayLevel = params ? params->zParams.notificationLevel :
                         coverParams ? coverParams->zParams.notificationLevel :
+                        fastCoverParams ? fastCoverParams->zParams.notificationLevel :
                         0;   /* should never happen */
     void* const dictBuffer = malloc(maxDictSize);
     fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
     size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
+    size_t const memMult = params ? MEMMULT :
+                           coverParams ? COVER_MEMMULT:
+                           FASTCOVER_MEMMULT;
     size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
     size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
     void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
@@ -310,7 +312,8 @@
     /* Load input buffer */
     DISPLAYLEVEL(3, "Shuffling input files\n");
     DiB_shuffle(fileNamesTable, nbFiles);
-    nbFiles = DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
 
     {   size_t dictSize;
         if (params) {
@@ -318,16 +321,36 @@
             dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
                                                            srcBuffer, sampleSizes, fs.nbSamples,
                                                            *params);
-        } else if (optimizeCover) {
-            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
-                                                           srcBuffer, sampleSizes, fs.nbSamples,
-                                                           coverParams);
-            if (!ZDICT_isError(dictSize)) {
-                DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
+        } else if (coverParams) {
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+                                                             srcBuffer, sampleSizes, fs.nbSamples,
+                                                             coverParams);
+              if (!ZDICT_isError(dictSize)) {
+                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
+                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
+                              coverParams->steps, splitPercentage);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                                     sampleSizes, fs.nbSamples, *coverParams);
             }
         } else {
-            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
-                                                   sampleSizes, fs.nbSamples, *coverParams);
+            assert(fastCoverParams != NULL);
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
+                                                              srcBuffer, sampleSizes, fs.nbSamples,
+                                                              fastCoverParams);
+              if (!ZDICT_isError(dictSize)) {
+                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
+                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
+                            fastCoverParams->accel);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
+                                                        sampleSizes, fs.nbSamples, *fastCoverParams);
+            }
         }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
diff --git a/programs/dibio.h b/programs/dibio.h
index 499e303..ea163fe 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -33,7 +33,7 @@
 */
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
-                       int optimizeCover);
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
 
 #endif
diff --git a/programs/fileio.c b/programs/fileio.c
index 0175b31..ed3a29c 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -20,6 +20,11 @@
 #  define _POSIX_SOURCE 1          /* disable %llu warnings with MinGW on Windows */
 #endif
 
+#if !defined(BACKTRACES_ENABLE) && \
+   (defined(__linux__) || (defined(__APPLE__) && defined(__MACH__)) )
+#  define BACKTRACES_ENABLE 1
+#endif
+
 
 /*-*************************************
 *  Includes
@@ -29,17 +34,20 @@
 #include <stdio.h>      /* fprintf, fopen, fread, _fileno, stdin, stdout */
 #include <stdlib.h>     /* malloc, free */
 #include <string.h>     /* strcmp, strlen */
+#include <assert.h>
 #include <errno.h>      /* errno */
+#include <signal.h>
+#ifdef BACKTRACES_ENABLE
+#  include <execinfo.h>   /* backtrace, backtrace_symbols */
+#endif
 
 #if defined (_MSC_VER)
 #  include <sys/stat.h>
 #  include <io.h>
 #endif
 
-#include "debug.h"
-#include "mem.h"
+#include "mem.h"       /* U32, U64 */
 #include "fileio.h"
-#include "util.h"
 
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
 #include "zstd.h"
@@ -71,6 +79,7 @@
 #define MB *(1<<20)
 #define GB *(1U<<30)
 
+#define ADAPT_WINDOWLOG_DEFAULT 23   /* 8 MB */
 #define DICTSIZE_MAX (32 MB)   /* protection against large input (attack scenario) */
 
 #define FNSPACE 30
@@ -105,7 +114,7 @@
 #define EXM_THROW(error, ...)                                             \
 {                                                                         \
     DISPLAYLEVEL(1, "zstd: ");                                            \
-    DEBUGLOG(1, "Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAYLEVEL(5, "Error defined at %s, line %i : \n", __FILE__, __LINE__); \
     DISPLAYLEVEL(1, "error %i : ", error);                                \
     DISPLAYLEVEL(1, __VA_ARGS__);                                         \
     DISPLAYLEVEL(1, " \n");                                               \
@@ -115,7 +124,7 @@
 #define CHECK_V(v, f)                                \
     v = f;                                           \
     if (ZSTD_isError(v)) {                           \
-        DEBUGLOG(1, "%s \n", #f);                    \
+        DISPLAYLEVEL(5, "%s \n", #f);                \
         EXM_THROW(11, "%s", ZSTD_getErrorName(v));   \
     }
 #define CHECK(f) { size_t err; CHECK_V(err, f); }
@@ -124,8 +133,6 @@
 /*-************************************
 *  Signal (Ctrl-C trapping)
 **************************************/
-#include  <signal.h>
-
 static const char* g_artefact = NULL;
 static void INThandler(int sig)
 {
@@ -157,7 +164,61 @@
 }
 
 
-/* ************************************************************
+/*-*********************************************************
+*  Termination signal trapping (Print debug stack trace)
+***********************************************************/
+#ifdef BACKTRACES_ENABLE
+
+#define MAX_STACK_FRAMES    50
+
+static void ABRThandler(int sig) {
+    const char* name;
+    void* addrlist[MAX_STACK_FRAMES];
+    char** symbollist;
+    U32 addrlen, i;
+
+    switch (sig) {
+        case SIGABRT: name = "SIGABRT"; break;
+        case SIGFPE: name = "SIGFPE"; break;
+        case SIGILL: name = "SIGILL"; break;
+        case SIGINT: name = "SIGINT"; break;
+        case SIGSEGV: name = "SIGSEGV"; break;
+        default: name = "UNKNOWN";
+    }
+
+    DISPLAY("Caught %s signal, printing stack:\n", name);
+    /* Retrieve current stack addresses. */
+    addrlen = backtrace(addrlist, MAX_STACK_FRAMES);
+    if (addrlen == 0) {
+        DISPLAY("\n");
+        return;
+    }
+    /* Create readable strings to each frame. */
+    symbollist = backtrace_symbols(addrlist, addrlen);
+    /* Print the stack trace, excluding calls handling the signal. */
+    for (i = ZSTD_START_SYMBOLLIST_FRAME; i < addrlen; i++) {
+        DISPLAY("%s\n", symbollist[i]);
+    }
+    free(symbollist);
+    /* Reset and raise the signal so default handler runs. */
+    signal(sig, SIG_DFL);
+    raise(sig);
+}
+#endif
+
+void FIO_addAbortHandler()
+{
+#ifdef BACKTRACES_ENABLE
+    signal(SIGABRT, ABRThandler);
+    signal(SIGFPE, ABRThandler);
+    signal(SIGILL, ABRThandler);
+    signal(SIGSEGV, ABRThandler);
+    signal(SIGBUS, ABRThandler);
+#endif
+}
+
+
+/*-************************************************************
 * Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW
 ***************************************************************/
 #if defined(_MSC_VER) && _MSC_VER >= 1400
@@ -226,6 +287,26 @@
         DISPLAYLEVEL(2, "Setting overlapLog is useless in single-thread mode \n");
     g_overlapLog = overlapLog;
 }
+static U32 g_adaptiveMode = 0;
+void FIO_setAdaptiveMode(unsigned adapt) {
+    if ((adapt>0) && (g_nbWorkers==0))
+        EXM_THROW(1, "Adaptive mode is not compatible with single thread mode \n");
+    g_adaptiveMode = adapt;
+}
+static int g_minAdaptLevel = -50;   /* initializing this value requires a constant, so ZSTD_minCLevel() doesn't work */
+void FIO_setAdaptMin(int minCLevel)
+{
+#ifndef ZSTD_NOCOMPRESS
+    assert(minCLevel >= ZSTD_minCLevel());
+#endif
+    g_minAdaptLevel = minCLevel;
+}
+static int g_maxAdaptLevel = 22;   /* initializing this value requires a constant, so ZSTD_maxCLevel() doesn't work */
+void FIO_setAdaptMax(int maxCLevel)
+{
+    g_maxAdaptLevel = maxCLevel;
+}
+
 static U32 g_ldmFlag = 0;
 void FIO_setLdmFlag(unsigned ldmFlag) {
     g_ldmFlag = (ldmFlag>0);
@@ -404,7 +485,7 @@
 
 static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
                                     U64 srcSize,
-                                    ZSTD_compressionParameters* comprParams) {
+                                    ZSTD_compressionParameters comprParams) {
     cRess_t ress;
     memset(&ress, 0, sizeof(ress));
 
@@ -425,6 +506,9 @@
         if (dictFileName && (dictBuffer==NULL))
             EXM_THROW(32, "allocation error : can't create dictBuffer");
 
+        if (g_adaptiveMode && !g_ldmFlag && !comprParams.windowLog)
+            comprParams.windowLog = ADAPT_WINDOWLOG_DEFAULT;
+
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_contentSizeFlag, 1) );  /* always enable content size when available (note: supposed to be default) */
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_dictIDFlag, g_dictIDFlag) );
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_checksumFlag, g_checksumFlag) );
@@ -441,13 +525,13 @@
             CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_ldmHashEveryLog, g_ldmHashEveryLog) );
         }
         /* compression parameters */
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_windowLog, comprParams->windowLog) );
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_chainLog, comprParams->chainLog) );
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_hashLog, comprParams->hashLog) );
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_searchLog, comprParams->searchLog) );
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_minMatch, comprParams->searchLength) );
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_targetLength, comprParams->targetLength) );
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionStrategy, (U32)comprParams->strategy) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_windowLog, comprParams.windowLog) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_chainLog, comprParams.chainLog) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_hashLog, comprParams.hashLog) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_searchLog, comprParams.searchLog) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_minMatch, comprParams.searchLength) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_targetLength, comprParams.targetLength) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionStrategy, (U32)comprParams.strategy) );
         /* multi-threading */
 #ifdef ZSTD_MULTITHREAD
         DISPLAYLEVEL(5,"set nb workers = %u \n", g_nbWorkers);
@@ -480,7 +564,8 @@
 
 
 #ifdef ZSTD_GZCOMPRESS
-static unsigned long long FIO_compressGzFrame(cRess_t* ress,
+static unsigned long long
+FIO_compressGzFrame(cRess_t* ress,
                     const char* srcFileName, U64 const srcFileSize,
                     int compressionLevel, U64* readsize)
 {
@@ -562,9 +647,10 @@
 
 
 #ifdef ZSTD_LZMACOMPRESS
-static unsigned long long FIO_compressLzmaFrame(cRess_t* ress,
-                            const char* srcFileName, U64 const srcFileSize,
-                            int compressionLevel, U64* readsize, int plain_lzma)
+static unsigned long long
+FIO_compressLzmaFrame(cRess_t* ress,
+                      const char* srcFileName, U64 const srcFileSize,
+                      int compressionLevel, U64* readsize, int plain_lzma)
 {
     unsigned long long inFileSize = 0, outFileSize = 0;
     lzma_stream strm = LZMA_STREAM_INIT;
@@ -637,9 +723,10 @@
 #define LZ4F_max64KB max64KB
 #endif
 static int FIO_LZ4_GetBlockSize_FromBlockId (int id) { return (1 << (8 + (2 * id))); }
-static unsigned long long FIO_compressLz4Frame(cRess_t* ress,
-                            const char* srcFileName, U64 const srcFileSize,
-                            int compressionLevel, U64* readsize)
+static unsigned long long
+FIO_compressLz4Frame(cRess_t* ress,
+                     const char* srcFileName, U64 const srcFileSize,
+                     int compressionLevel, U64* readsize)
 {
     const size_t blockSize = FIO_LZ4_GetBlockSize_FromBlockId(LZ4F_max64KB);
     unsigned long long inFileSize = 0, outFileSize = 0;
@@ -727,11 +814,6 @@
 #endif
 
 
-/*! FIO_compressFilename_internal() :
- *  same as FIO_compressFilename_extRess(), with `ress.desFile` already opened.
- *  @return : 0 : compression completed correctly,
- *            1 : missing or pb opening srcFileName
- */
 static unsigned long long
 FIO_compressZstdFrame(const cRess_t* ressPtr,
                       const char* srcFileName, U64 fileSize,
@@ -742,17 +824,28 @@
     FILE* const dstFile = ress.dstFile;
     U64 compressedfilesize = 0;
     ZSTD_EndDirective directive = ZSTD_e_continue;
+
+    /* stats */
+    ZSTD_frameProgression previous_zfp_update = { 0, 0, 0, 0, 0, 0 };
+    ZSTD_frameProgression previous_zfp_correction = { 0, 0, 0, 0, 0, 0 };
+    typedef enum { noChange, slower, faster } speedChange_e;
+    speedChange_e speedChange = noChange;
+    unsigned flushWaiting = 0;
+    unsigned inputPresented = 0;
+    unsigned inputBlocked = 0;
+    unsigned lastJobID = 0;
+
     DISPLAYLEVEL(6, "compression using zstd format \n");
 
     /* init */
     if (fileSize != UTIL_FILESIZE_UNKNOWN) {
         CHECK(ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize));
     }
-    (void)compressionLevel; (void)srcFileName;
+    (void)srcFileName;
 
     /* Main compression loop */
     do {
-        size_t result;
+        size_t stillToFlush;
         /* Fill input Buffer */
         size_t const inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile);
         ZSTD_inBuffer inBuff = { ress.srcBuffer, inSize, 0 };
@@ -762,41 +855,149 @@
         if ((inSize == 0) || (*readsize == fileSize))
             directive = ZSTD_e_end;
 
-        result = 1;
-        while (inBuff.pos != inBuff.size || (directive == ZSTD_e_end && result != 0)) {
+        stillToFlush = 1;
+        while ((inBuff.pos != inBuff.size)   /* input buffer must be entirely ingested */
+            || (directive == ZSTD_e_end && stillToFlush != 0) ) {
+
+            size_t const oldIPos = inBuff.pos;
             ZSTD_outBuffer outBuff = { ress.dstBuffer, ress.dstBufferSize, 0 };
-            CHECK_V(result, ZSTD_compress_generic(ress.cctx, &outBuff, &inBuff, directive));
+            size_t const toFlushNow = ZSTD_toFlushNow(ress.cctx);
+            CHECK_V(stillToFlush, ZSTD_compress_generic(ress.cctx, &outBuff, &inBuff, directive));
+
+            /* count stats */
+            inputPresented++;
+            if (oldIPos == inBuff.pos) inputBlocked++;  /* input buffer is full and can't take any more : input speed is faster than consumption rate */
+            if (!toFlushNow) flushWaiting = 1;
 
             /* Write compressed stream */
-            DISPLAYLEVEL(6, "ZSTD_compress_generic(end:%u) => intput pos(%u)<=(%u)size ; output generated %u bytes \n",
+            DISPLAYLEVEL(6, "ZSTD_compress_generic(end:%u) => input pos(%u)<=(%u)size ; output generated %u bytes \n",
                             (U32)directive, (U32)inBuff.pos, (U32)inBuff.size, (U32)outBuff.pos);
             if (outBuff.pos) {
                 size_t const sizeCheck = fwrite(ress.dstBuffer, 1, outBuff.pos, dstFile);
-                if (sizeCheck!=outBuff.pos)
+                if (sizeCheck != outBuff.pos)
                     EXM_THROW(25, "Write error : cannot write compressed block");
                 compressedfilesize += outBuff.pos;
             }
+
+            /* display notification; and adapt compression level */
             if (READY_FOR_UPDATE()) {
                 ZSTD_frameProgression const zfp = ZSTD_getFrameProgression(ress.cctx);
                 double const cShare = (double)zfp.produced / (zfp.consumed + !zfp.consumed/*avoid div0*/) * 100;
+
+                /* display progress notifications */
                 if (g_displayLevel >= 3) {
-                    DISPLAYUPDATE(3, "\r(L%i) Buffered :%4u MB - Consumed :%4u MB - Compressed :%4u MB => %.2f%%",
+                    DISPLAYUPDATE(3, "\r(L%i) Buffered :%4u MB - Consumed :%4u MB - Compressed :%4u MB => %.2f%% ",
                                 compressionLevel,
                                 (U32)((zfp.ingested - zfp.consumed) >> 20),
                                 (U32)(zfp.consumed >> 20),
                                 (U32)(zfp.produced >> 20),
                                 cShare );
-                } else {   /* g_displayLevel == 2 */
+                } else {   /* summarized notifications if == 2; */
                     DISPLAYLEVEL(2, "\rRead : %u ", (U32)(zfp.consumed >> 20));
                     if (fileSize != UTIL_FILESIZE_UNKNOWN)
                         DISPLAYLEVEL(2, "/ %u ", (U32)(fileSize >> 20));
                     DISPLAYLEVEL(2, "MB ==> %2.f%% ", cShare);
                     DELAY_NEXT_UPDATE();
                 }
-            }
-        }
+
+                /* adaptive mode : statistics measurement and speed correction */
+                if (g_adaptiveMode) {
+
+                    /* check output speed */
+                    if (zfp.currentJobID > 1) {  /* only possible if nbWorkers >= 1 */
+
+                        unsigned long long newlyProduced = zfp.produced - previous_zfp_update.produced;
+                        unsigned long long newlyFlushed = zfp.flushed - previous_zfp_update.flushed;
+                        assert(zfp.produced >= previous_zfp_update.produced);
+                        assert(g_nbWorkers >= 1);
+
+                        /* test if compression is blocked
+                         * either because output is slow and all buffers are full
+                         * or because input is slow and no job can start while waiting for at least one buffer to be filled.
+                         * note : exclude starting part, since currentJobID > 1 */
+                        if ( (zfp.consumed == previous_zfp_update.consumed)   /* no data compressed : no data available, or no more buffer to compress to, OR compression is really slow (compression of a single block is slower than update rate)*/
+                          && (zfp.nbActiveWorkers == 0)                       /* confirmed : no compression ongoing */
+                          ) {
+                            DISPLAYLEVEL(6, "all buffers full : compression stopped => slow down \n")
+                            speedChange = slower;
+                        }
+
+                        previous_zfp_update = zfp;
+
+                        if ( (newlyProduced > (newlyFlushed * 9 / 8))   /* compression produces more data than output can flush (though production can be spiky, due to work unit : (N==4)*block sizes) */
+                          && (flushWaiting == 0)                        /* flush speed was never slowed by lack of production, so it's operating at max capacity */
+                          ) {
+                            DISPLAYLEVEL(6, "compression faster than flush (%llu > %llu), and flushed was never slowed down by lack of production => slow down \n", newlyProduced, newlyFlushed);
+                            speedChange = slower;
+                        }
+                        flushWaiting = 0;
+                    }
+
+                    /* course correct only if there is at least one new job completed */
+                    if (zfp.currentJobID > lastJobID) {
+                        DISPLAYLEVEL(6, "compression level adaptation check \n")
+
+                        /* check input speed */
+                        if (zfp.currentJobID > g_nbWorkers+1) {   /* warm up period, to fill all workers */
+                            if (inputBlocked <= 0) {
+                                DISPLAYLEVEL(6, "input is never blocked => input is slower than ingestion \n");
+                                speedChange = slower;
+                            } else if (speedChange == noChange) {
+                                unsigned long long newlyIngested = zfp.ingested - previous_zfp_correction.ingested;
+                                unsigned long long newlyConsumed = zfp.consumed - previous_zfp_correction.consumed;
+                                unsigned long long newlyProduced = zfp.produced - previous_zfp_correction.produced;
+                                unsigned long long newlyFlushed  = zfp.flushed  - previous_zfp_correction.flushed;
+                                previous_zfp_correction = zfp;
+                                assert(inputPresented > 0);
+                                DISPLAYLEVEL(6, "input blocked %u/%u(%.2f) - ingested:%u vs %u:consumed - flushed:%u vs %u:produced \n",
+                                                inputBlocked, inputPresented, (double)inputBlocked/inputPresented*100,
+                                                (U32)newlyIngested, (U32)newlyConsumed,
+                                                (U32)newlyFlushed, (U32)newlyProduced);
+                                if ( (inputBlocked > inputPresented / 8)     /* input is waiting often, because input buffer is full : compression or output too slow */
+                                  && (newlyFlushed * 33 / 32 > newlyProduced)  /* flush everything that is produced */
+                                  && (newlyIngested * 33 / 32 > newlyConsumed) /* input speed as fast or faster than compression speed */
+                                ) {
+                                    DISPLAYLEVEL(6, "recommend faster as in(%llu) >= (%llu)comp(%llu) <= out(%llu) \n",
+                                                    newlyIngested, newlyConsumed, newlyProduced, newlyFlushed);
+                                    speedChange = faster;
+                                }
+                            }
+                            inputBlocked = 0;
+                            inputPresented = 0;
+                        }
+
+                        if (speedChange == slower) {
+                            DISPLAYLEVEL(6, "slower speed , higher compression \n")
+                            compressionLevel ++;
+                            if (compressionLevel > ZSTD_maxCLevel()) compressionLevel = ZSTD_maxCLevel();
+                            if (compressionLevel > g_maxAdaptLevel) compressionLevel = g_maxAdaptLevel;
+                            compressionLevel += (compressionLevel == 0);   /* skip 0 */
+                            ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionLevel, (unsigned)compressionLevel);
+                        }
+                        if (speedChange == faster) {
+                            DISPLAYLEVEL(6, "faster speed , lighter compression \n")
+                            compressionLevel --;
+                            if (compressionLevel < g_minAdaptLevel) compressionLevel = g_minAdaptLevel;
+                            compressionLevel -= (compressionLevel == 0);   /* skip 0 */
+                            ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionLevel, (unsigned)compressionLevel);
+                        }
+                        speedChange = noChange;
+
+                        lastJobID = zfp.currentJobID;
+                    }  /* if (zfp.currentJobID > lastJobID) */
+                }  /* if (g_adaptiveMode) */
+            }  /* if (READY_FOR_UPDATE()) */
+        }  /* while ((inBuff.pos != inBuff.size) */
     } while (directive != ZSTD_e_end);
 
+    if (ferror(srcFile)) {
+        EXM_THROW(26, "Read error : I/O error");
+    }
+    if (fileSize != UTIL_FILESIZE_UNKNOWN && *readsize != fileSize) {
+        EXM_THROW(27, "Read error : Incomplete read : %llu / %llu B",
+                (unsigned long long)*readsize, (unsigned long long)fileSize);
+    }
+
     return compressedfilesize;
 }
 
@@ -866,14 +1067,80 @@
 }
 
 
+/*! FIO_compressFilename_dstFile() :
+ *  open dstFileName, or pass-through if ress.dstFile != NULL,
+ *  then start compression with FIO_compressFilename_internal().
+ *  Manages source removal (--rm) and file permissions transfer.
+ *  note : ress.srcFile must be != NULL,
+ *  so reach this function through FIO_compressFilename_srcFile().
+ *  @return : 0 : compression completed correctly,
+ *            1 : pb
+ */
+static int FIO_compressFilename_dstFile(cRess_t ress,
+                                        const char* dstFileName,
+                                        const char* srcFileName,
+                                        int compressionLevel)
+{
+    int closeDstFile = 0;
+    int result;
+    stat_t statbuf;
+    int transfer_permissions = 0;
+
+    assert(ress.srcFile != NULL);
+
+    if (ress.dstFile == NULL) {
+        closeDstFile = 1;
+        DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s", dstFileName);
+        ress.dstFile = FIO_openDstFile(dstFileName);
+        if (ress.dstFile==NULL) return 1;  /* could not open dstFileName */
+        /* Must only be added after FIO_openDstFile() succeeds.
+         * Otherwise we may delete the destination file if it already exists,
+         * and the user presses Ctrl-C when asked if they wish to overwrite.
+         */
+        addHandler(dstFileName);
+
+        if ( strcmp (srcFileName, stdinmark)
+          && UTIL_getFileStat(srcFileName, &statbuf))
+            transfer_permissions = 1;
+    }
+
+    result = FIO_compressFilename_internal(ress, dstFileName, srcFileName, compressionLevel);
+
+    if (closeDstFile) {
+        FILE* const dstFile = ress.dstFile;
+        ress.dstFile = NULL;
+
+        clearHandler();
+
+        if (fclose(dstFile)) { /* error closing dstFile */
+            DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
+            result=1;
+        }
+        if ( (result != 0)  /* operation failure */
+          && strcmp(dstFileName, nulmark)     /* special case : don't remove() /dev/null */
+          && strcmp(dstFileName, stdoutmark)  /* special case : don't remove() stdout */
+          ) {
+            FIO_remove(dstFileName); /* remove compression artefact; note don't do anything special if remove() fails */
+        } else if ( strcmp(dstFileName, stdoutmark)
+                 && strcmp(dstFileName, nulmark)
+                 && transfer_permissions) {
+            UTIL_setFileStat(dstFileName, &statbuf);
+        }
+    }
+
+    return result;
+}
+
+
 /*! FIO_compressFilename_srcFile() :
- *  note : ress.destFile already opened
  *  @return : 0 : compression completed correctly,
  *            1 : missing or pb opening srcFileName
  */
-static int FIO_compressFilename_srcFile(cRess_t ress,
-                            const char* dstFileName, const char* srcFileName,
-                            int compressionLevel)
+static int
+FIO_compressFilename_srcFile(cRess_t ress,
+                             const char* dstFileName,
+                             const char* srcFileName,
+                             int compressionLevel)
 {
     int result;
 
@@ -884,12 +1151,16 @@
     }
 
     ress.srcFile = FIO_openSrcFile(srcFileName);
-    if (!ress.srcFile) return 1;   /* srcFile could not be opened */
+    if (ress.srcFile == NULL) return 1;   /* srcFile could not be opened */
 
-    result = FIO_compressFilename_internal(ress, dstFileName, srcFileName, compressionLevel);
+    result = FIO_compressFilename_dstFile(ress, dstFileName, srcFileName, compressionLevel);
 
     fclose(ress.srcFile);
-    if (g_removeSrcFile /* --rm */ && !result && strcmp(srcFileName, stdinmark)) {
+    ress.srcFile = NULL;
+    if ( g_removeSrcFile   /* --rm */
+      && result == 0       /* success */
+      && strcmp(srcFileName, stdinmark)   /* exception : don't erase stdin */
+      ) {
         /* We must clear the handler, since after this point calling it would
          * delete both the source and destination files.
          */
@@ -901,59 +1172,16 @@
 }
 
 
-/*! FIO_compressFilename_dstFile() :
- *  @return : 0 : compression completed correctly,
- *            1 : pb
- */
-static int FIO_compressFilename_dstFile(cRess_t ress,
-                                        const char* dstFileName,
-                                        const char* srcFileName,
-                                        int compressionLevel)
-{
-    int result;
-    stat_t statbuf;
-    int stat_result = 0;
-
-    DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s", dstFileName);
-    ress.dstFile = FIO_openDstFile(dstFileName);
-    if (ress.dstFile==NULL) return 1;  /* could not open dstFileName */
-    /* Must ony be added after FIO_openDstFile() succeeds.
-     * Otherwise we may delete the destination file if at already exists, and
-     * the user presses Ctrl-C when asked if they wish to overwrite.
-     */
-    addHandler(dstFileName);
-
-    if (strcmp (srcFileName, stdinmark) && UTIL_getFileStat(srcFileName, &statbuf))
-        stat_result = 1;
-    result = FIO_compressFilename_srcFile(ress, dstFileName, srcFileName, compressionLevel);
-    clearHandler();
-
-    if (fclose(ress.dstFile)) { /* error closing dstFile */
-        DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
-        result=1;
-    }
-    if ( (result != 0)  /* operation failure */
-      && strcmp(dstFileName, nulmark)      /* special case : don't remove() /dev/null */
-      && strcmp(dstFileName, stdoutmark) ) /* special case : don't remove() stdout */
-        FIO_remove(dstFileName); /* remove compression artefact; note don't do anything special if remove() fails */
-    else if ( strcmp(dstFileName, stdoutmark)
-           && strcmp(dstFileName, nulmark)
-           && stat_result)
-        UTIL_setFileStat(dstFileName, &statbuf);
-
-    return result;
-}
-
-
 int FIO_compressFilename(const char* dstFileName, const char* srcFileName,
-                         const char* dictFileName, int compressionLevel, ZSTD_compressionParameters* comprParams)
+                         const char* dictFileName, int compressionLevel,
+                         ZSTD_compressionParameters comprParams)
 {
     clock_t const start = clock();
     U64 const fileSize = UTIL_getFileSize(srcFileName);
     U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : fileSize;
 
     cRess_t const ress = FIO_createCResources(dictFileName, compressionLevel, srcSize, comprParams);
-    int const result = FIO_compressFilename_dstFile(ress, dstFileName, srcFileName, compressionLevel);
+    int const result = FIO_compressFilename_srcFile(ress, dstFileName, srcFileName, compressionLevel);
 
     double const seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
     DISPLAYLEVEL(4, "Completed in %.2f sec \n", seconds);
@@ -963,57 +1191,76 @@
 }
 
 
+/* FIO_determineCompressedName() :
+ * create a destination filename for compressed srcFileName.
+ * @return a pointer to it.
+ * This function never returns an error (it may abort() in case of pb)
+ */
+static const char*
+FIO_determineCompressedName(const char* srcFileName, const char* suffix)
+{
+    static size_t dfnbCapacity = 0;
+    static char* dstFileNameBuffer = NULL;   /* using static allocation : this function cannot be multi-threaded */
+
+    size_t const sfnSize = strlen(srcFileName);
+    size_t const suffixSize = strlen(suffix);
+
+    if (dfnbCapacity <= sfnSize+suffixSize+1) {  /* resize name buffer */
+        free(dstFileNameBuffer);
+        dfnbCapacity = sfnSize + suffixSize + 30;
+        dstFileNameBuffer = (char*)malloc(dfnbCapacity);
+        if (!dstFileNameBuffer) {
+            EXM_THROW(30, "zstd: %s", strerror(errno));
+    }   }
+    assert(dstFileNameBuffer != NULL);
+    strncpy(dstFileNameBuffer, srcFileName, sfnSize+1 /* Include null */);
+    strncat(dstFileNameBuffer, suffix, suffixSize);
+
+    return dstFileNameBuffer;
+}
+
+
+/* FIO_compressMultipleFilenames() :
+ * compress nbFiles files
+ * into one destination (outFileName)
+ * or into one file each (outFileName == NULL, but suffix != NULL).
+ */
 int FIO_compressMultipleFilenames(const char** inFileNamesTable, unsigned nbFiles,
                                   const char* outFileName, const char* suffix,
                                   const char* dictFileName, int compressionLevel,
-                                  ZSTD_compressionParameters* comprParams)
+                                  ZSTD_compressionParameters comprParams)
 {
-    int missed_files = 0;
-    size_t dfnSize = FNSPACE;
-    char*  dstFileName = (char*)malloc(FNSPACE);
-    size_t const suffixSize = suffix ? strlen(suffix) : 0;
+    int error = 0;
     U64 const firstFileSize = UTIL_getFileSize(inFileNamesTable[0]);
     U64 const firstSrcSize = (firstFileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : firstFileSize;
     U64 const srcSize = (nbFiles != 1) ? ZSTD_CONTENTSIZE_UNKNOWN : firstSrcSize ;
     cRess_t ress = FIO_createCResources(dictFileName, compressionLevel, srcSize, comprParams);
 
     /* init */
-    if (dstFileName==NULL)
-        EXM_THROW(27, "FIO_compressMultipleFilenames : allocation error for dstFileName");
-    if (outFileName == NULL && suffix == NULL)
-        EXM_THROW(28, "FIO_compressMultipleFilenames : dst unknown");  /* should never happen */
+    assert(outFileName != NULL || suffix != NULL);
 
-    /* loop on each file */
-    if (outFileName != NULL) {
-        unsigned u;
+    if (outFileName != NULL) {   /* output into a single destination (stdout typically) */
         ress.dstFile = FIO_openDstFile(outFileName);
-        if (ress.dstFile==NULL) {  /* could not open outFileName */
-            missed_files = nbFiles;
+        if (ress.dstFile == NULL) {  /* could not open outFileName */
+            error = 1;
         } else {
+            unsigned u;
             for (u=0; u<nbFiles; u++)
-                missed_files += FIO_compressFilename_srcFile(ress, outFileName, inFileNamesTable[u], compressionLevel);
+                error |= FIO_compressFilename_srcFile(ress, outFileName, inFileNamesTable[u], compressionLevel);
             if (fclose(ress.dstFile))
-                EXM_THROW(29, "Write error : cannot properly close stdout");
+                EXM_THROW(29, "Write error : cannot properly close %s", outFileName);
+            ress.dstFile = NULL;
         }
     } else {
         unsigned u;
         for (u=0; u<nbFiles; u++) {
-            size_t const ifnSize = strlen(inFileNamesTable[u]);
-            if (dfnSize <= ifnSize+suffixSize+1) {  /* resize name buffer */
-                free(dstFileName);
-                dfnSize = ifnSize + 20;
-                dstFileName = (char*)malloc(dfnSize);
-                if (!dstFileName) {
-                    EXM_THROW(30, "zstd: %s", strerror(errno));
-            }   }
-            strcpy(dstFileName, inFileNamesTable[u]);
-            strcat(dstFileName, suffix);
-            missed_files += FIO_compressFilename_dstFile(ress, dstFileName, inFileNamesTable[u], compressionLevel);
+            const char* const srcFileName = inFileNamesTable[u];
+            const char* const dstFileName = FIO_determineCompressedName(srcFileName, suffix);  /* cannot fail */
+            error |= FIO_compressFilename_srcFile(ress, dstFileName, srcFileName, compressionLevel);
     }   }
 
     FIO_freeCResources(ress);
-    free(dstFileName);
-    return missed_files;
+    return error;
 }
 
 #endif /* #ifndef ZSTD_NOCOMPRESS */
@@ -1202,12 +1449,12 @@
     if (err == 0) {
         unsigned long long const windowSize = header.windowSize;
         U32 const windowLog = FIO_highbit64(windowSize) + ((windowSize & (windowSize - 1)) != 0);
-        U32 const windowMB = (U32)((windowSize >> 20) + ((windowSize & ((1 MB) - 1)) != 0));
-        assert(windowSize < (U64)(1ULL << 52));
         assert(g_memLimit > 0);
         DISPLAYLEVEL(1, "%s : Window size larger than maximum : %llu > %u\n",
                         srcFileName, windowSize, g_memLimit);
         if (windowLog <= ZSTD_WINDOWLOG_MAX) {
+            U32 const windowMB = (U32)((windowSize >> 20) + ((windowSize & ((1 MB) - 1)) != 0));
+            assert(windowSize < (U64)(1ULL << 52));   /* ensure no overflow for windowMB */
             DISPLAYLEVEL(1, "%s : Use --long=%u or --memory=%uMB\n",
                             srcFileName, windowLog, windowMB);
             return;
@@ -1221,7 +1468,7 @@
  *  @return : size of decoded zstd frame, or an error code
 */
 #define FIO_ERROR_FRAME_DECODING   ((unsigned long long)(-2))
-unsigned long long FIO_decompressZstdFrame(dRess_t* ress,
+static unsigned long long FIO_decompressZstdFrame(dRess_t* ress,
                                        FILE* finput,
                                        const char* srcFileName,
                                        U64 alreadyDecoded)
@@ -1591,11 +1838,71 @@
     return 0;
 }
 
+/** FIO_decompressDstFile() :
+    open `dstFileName`,
+    or pass-through if ress.dstFile is already != 0,
+    then start decompression process (FIO_decompressFrames()).
+    @return : 0 : OK
+              1 : operation aborted
+*/
+static int FIO_decompressDstFile(dRess_t ress, FILE* srcFile,
+                                 const char* dstFileName, const char* srcFileName)
+{
+    int result;
+    stat_t statbuf;
+    int transfer_permissions = 0;
+    int releaseDstFile = 0;
+
+    if (ress.dstFile == NULL) {
+        releaseDstFile = 1;
+
+        ress.dstFile = FIO_openDstFile(dstFileName);
+        if (ress.dstFile==0) return 1;
+
+        /* Must only be added after FIO_openDstFile() succeeds.
+         * Otherwise we may delete the destination file if it already exists,
+         * and the user presses Ctrl-C when asked if they wish to overwrite.
+         */
+        addHandler(dstFileName);
+
+        if ( strcmp(srcFileName, stdinmark)   /* special case : don't transfer permissions from stdin */
+          && UTIL_getFileStat(srcFileName, &statbuf) )
+            transfer_permissions = 1;
+    }
+
+
+    result = FIO_decompressFrames(ress, srcFile, dstFileName, srcFileName);
+
+    if (releaseDstFile) {
+        FILE* const dstFile = ress.dstFile;
+        clearHandler();
+        ress.dstFile = NULL;
+        if (fclose(dstFile)) {
+            DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
+            result = 1;
+        }
+
+        if ( (result != 0)  /* operation failure */
+          && strcmp(dstFileName, nulmark)     /* special case : don't remove() /dev/null (#316) */
+          && strcmp(dstFileName, stdoutmark)  /* special case : don't remove() stdout */
+          ) {
+            FIO_remove(dstFileName);  /* remove decompression artefact; note: don't do anything special if remove() fails */
+        } else {  /* operation success */
+            if ( strcmp(dstFileName, stdoutmark) /* special case : don't chmod stdout */
+              && strcmp(dstFileName, nulmark)    /* special case : don't chmod /dev/null */
+              && transfer_permissions )          /* file permissions correctly extracted from src */
+                UTIL_setFileStat(dstFileName, &statbuf);  /* transfer file permissions from src into dst */
+        }
+    }
+
+    return result;
+}
+
 
 /** FIO_decompressSrcFile() :
-    Decompression `srcFileName` into `ress.dstFile`
+    Open `srcFileName`, transfer control to FIO_decompressDstFile()
     @return : 0 : OK
-              1 : operation not started
+              1 : error
 */
 static int FIO_decompressSrcFile(dRess_t ress, const char* dstFileName, const char* srcFileName)
 {
@@ -1609,16 +1916,17 @@
 
     srcFile = FIO_openSrcFile(srcFileName);
     if (srcFile==NULL) return 1;
+    ress.srcBufferLoaded = 0;
 
-    result = FIO_decompressFrames(ress, srcFile, dstFileName, srcFileName);
+    result = FIO_decompressDstFile(ress, srcFile, dstFileName, srcFileName);
 
     /* Close file */
     if (fclose(srcFile)) {
         DISPLAYLEVEL(1, "zstd: %s: %s \n", srcFileName, strerror(errno));  /* error should not happen */
         return 1;
     }
-    if ( g_removeSrcFile /* --rm */
-      && (result==0)     /* decompression successful */
+    if ( g_removeSrcFile  /* --rm */
+      && (result==0)      /* decompression successful */
       && strcmp(srcFileName, stdinmark) ) /* not stdin */ {
         /* We must clear the handler, since after this point calling it would
          * delete both the source and destination files.
@@ -1633,73 +1941,94 @@
 }
 
 
-/** FIO_decompressFile_extRess() :
-    decompress `srcFileName` into `dstFileName`
-    @return : 0 : OK
-              1 : operation aborted (src not available, dst already taken, etc.)
-*/
-static int FIO_decompressDstFile(dRess_t ress,
-                                 const char* dstFileName, const char* srcFileName)
-{
-    int result;
-    stat_t statbuf;
-    int stat_result = 0;
-
-    ress.dstFile = FIO_openDstFile(dstFileName);
-    if (ress.dstFile==0) return 1;
-    /* Must ony be added after FIO_openDstFile() succeeds.
-     * Otherwise we may delete the destination file if at already exists, and
-     * the user presses Ctrl-C when asked if they wish to overwrite.
-     */
-    addHandler(dstFileName);
-
-    if ( strcmp(srcFileName, stdinmark)
-      && UTIL_getFileStat(srcFileName, &statbuf) )
-        stat_result = 1;
-    result = FIO_decompressSrcFile(ress, dstFileName, srcFileName);
-    clearHandler();
-
-    if (fclose(ress.dstFile)) {
-        DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
-        result = 1;
-    }
-
-    if ( (result != 0)  /* operation failure */
-      && strcmp(dstFileName, nulmark)      /* special case : don't remove() /dev/null (#316) */
-      && strcmp(dstFileName, stdoutmark) ) /* special case : don't remove() stdout */
-        FIO_remove(dstFileName);  /* remove decompression artefact; note don't do anything special if remove() fails */
-    else {  /* operation success */
-        if ( strcmp(dstFileName, stdoutmark) /* special case : don't chmod stdout */
-          && strcmp(dstFileName, nulmark)    /* special case : don't chmod /dev/null */
-          && stat_result )                   /* file permissions correctly extracted from src */
-            UTIL_setFileStat(dstFileName, &statbuf);  /* transfer file permissions from src into dst */
-    }
-
-    signal(SIGINT, SIG_DFL);
-
-    return result;
-}
-
 
 int FIO_decompressFilename(const char* dstFileName, const char* srcFileName,
                            const char* dictFileName)
 {
     dRess_t const ress = FIO_createDResources(dictFileName);
 
-    int const decodingError = FIO_decompressDstFile(ress, dstFileName, srcFileName);
+    int const decodingError = FIO_decompressSrcFile(ress, dstFileName, srcFileName);
 
     FIO_freeDResources(ress);
     return decodingError;
 }
 
 
-#define MAXSUFFIXSIZE 8
-int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles,
-                                    const char* outFileName,
-                                    const char* dictFileName)
+/* FIO_determineDstName() :
+ * create a destination filename from a srcFileName.
+ * @return a pointer to it.
+ * @return == NULL if there is an error */
+static const char*
+FIO_determineDstName(const char* srcFileName)
 {
-    int skippedFiles = 0;
-    int missingFiles = 0;
+    static size_t dfnbCapacity = 0;
+    static char* dstFileNameBuffer = NULL;   /* using static allocation : this function cannot be multi-threaded */
+
+    size_t const sfnSize = strlen(srcFileName);
+    size_t suffixSize;
+    const char* const suffixPtr = strrchr(srcFileName, '.');
+    if (suffixPtr == NULL) {
+        DISPLAYLEVEL(1, "zstd: %s: unknown suffix -- ignored \n",
+                        srcFileName);
+        return NULL;
+    }
+    suffixSize = strlen(suffixPtr);
+
+    /* check suffix is authorized */
+    if (sfnSize <= suffixSize
+        || (   strcmp(suffixPtr, ZSTD_EXTENSION)
+        #ifdef ZSTD_GZDECOMPRESS
+            && strcmp(suffixPtr, GZ_EXTENSION)
+        #endif
+        #ifdef ZSTD_LZMADECOMPRESS
+            && strcmp(suffixPtr, XZ_EXTENSION)
+            && strcmp(suffixPtr, LZMA_EXTENSION)
+        #endif
+        #ifdef ZSTD_LZ4DECOMPRESS
+            && strcmp(suffixPtr, LZ4_EXTENSION)
+        #endif
+            ) ) {
+        const char* suffixlist = ZSTD_EXTENSION
+        #ifdef ZSTD_GZDECOMPRESS
+            "/" GZ_EXTENSION
+        #endif
+        #ifdef ZSTD_LZMADECOMPRESS
+            "/" XZ_EXTENSION "/" LZMA_EXTENSION
+        #endif
+        #ifdef ZSTD_LZ4DECOMPRESS
+            "/" LZ4_EXTENSION
+        #endif
+        ;
+        DISPLAYLEVEL(1, "zstd: %s: unknown suffix (%s expected) -- ignored \n",
+                     srcFileName, suffixlist);
+        return NULL;
+    }
+
+    /* allocate enough space to write dstFilename into it */
+    if (dfnbCapacity+suffixSize <= sfnSize+1) {
+        free(dstFileNameBuffer);
+        dfnbCapacity = sfnSize + 20;
+        dstFileNameBuffer = (char*)malloc(dfnbCapacity);
+        if (dstFileNameBuffer==NULL)
+            EXM_THROW(74, "not enough memory for dstFileName");
+    }
+
+    /* return dst name == src name truncated from suffix */
+    assert(dstFileNameBuffer != NULL);
+    memcpy(dstFileNameBuffer, srcFileName, sfnSize - suffixSize);
+    dstFileNameBuffer[sfnSize-suffixSize] = '\0';
+    return dstFileNameBuffer;
+
+    /* note : dstFileNameBuffer memory is not going to be freed */
+}
+
+
+int
+FIO_decompressMultipleFilenames(const char* srcNamesTable[], unsigned nbFiles,
+                                const char* outFileName,
+                                const char* dictFileName)
+{
+    int error = 0;
     dRess_t ress = FIO_createDResources(dictFileName);
 
     if (outFileName) {
@@ -1707,66 +2036,22 @@
         ress.dstFile = FIO_openDstFile(outFileName);
         if (ress.dstFile == 0) EXM_THROW(71, "cannot open %s", outFileName);
         for (u=0; u<nbFiles; u++)
-            missingFiles += FIO_decompressSrcFile(ress, outFileName, srcNamesTable[u]);
+            error |= FIO_decompressSrcFile(ress, outFileName, srcNamesTable[u]);
         if (fclose(ress.dstFile))
             EXM_THROW(72, "Write error : cannot properly close output file");
     } else {
-        size_t suffixSize;
-        size_t dfnSize = FNSPACE;
         unsigned u;
-        char* dstFileName = (char*)malloc(FNSPACE);
-        if (dstFileName==NULL)
-            EXM_THROW(73, "not enough memory for dstFileName");
         for (u=0; u<nbFiles; u++) {   /* create dstFileName */
             const char* const srcFileName = srcNamesTable[u];
-            const char* const suffixPtr = strrchr(srcFileName, '.');
-            size_t const sfnSize = strlen(srcFileName);
-            if (!suffixPtr) {
-                DISPLAYLEVEL(1, "zstd: %s: unknown suffix -- ignored \n",
-                                srcFileName);
-                skippedFiles++;
-                continue;
-            }
-            suffixSize = strlen(suffixPtr);
-            if (dfnSize+suffixSize <= sfnSize+1) {
-                free(dstFileName);
-                dfnSize = sfnSize + 20;
-                dstFileName = (char*)malloc(dfnSize);
-                if (dstFileName==NULL)
-                    EXM_THROW(74, "not enough memory for dstFileName");
-            }
-            if (sfnSize <= suffixSize
-                || (strcmp(suffixPtr, GZ_EXTENSION)
-                    && strcmp(suffixPtr, XZ_EXTENSION)
-                    && strcmp(suffixPtr, ZSTD_EXTENSION)
-                    && strcmp(suffixPtr, LZMA_EXTENSION)
-                    && strcmp(suffixPtr, LZ4_EXTENSION)) ) {
-                const char* suffixlist = ZSTD_EXTENSION
-                #ifdef ZSTD_GZCOMPRESS
-                    "/" GZ_EXTENSION
-                #endif
-                #ifdef ZSTD_LZMACOMPRESS
-                    "/" XZ_EXTENSION "/" LZMA_EXTENSION
-                #endif
-                #ifdef ZSTD_LZ4COMPRESS
-                    "/" LZ4_EXTENSION
-                #endif
-                ;
-                DISPLAYLEVEL(1, "zstd: %s: unknown suffix (%s expected) -- ignored \n",
-                             srcFileName, suffixlist);
-                skippedFiles++;
-                continue;
-            } else {
-                memcpy(dstFileName, srcFileName, sfnSize - suffixSize);
-                dstFileName[sfnSize-suffixSize] = '\0';
-            }
-            missingFiles += FIO_decompressDstFile(ress, dstFileName, srcFileName);
+            const char* const dstFileName = FIO_determineDstName(srcFileName);
+            if (dstFileName == NULL) { error=1; continue; }
+
+            error |= FIO_decompressSrcFile(ress, dstFileName, srcFileName);
         }
-        free(dstFileName);
     }
 
     FIO_freeDResources(ress);
-    return missingFiles + skippedFiles;
+    return error;
 }
 
 
@@ -1786,22 +2071,19 @@
     U32 nbFiles;
 } fileInfo_t;
 
-/** getFileInfo() :
- *  Reads information from file, stores in *info
- * @return : 0 if successful
- *           1 for frame analysis error
- *           2 for file not compressed with zstd
- *           3 for cases in which file could not be opened.
- */
-static int getFileInfo_fileConfirmed(fileInfo_t* info, const char* inFileName){
-    int detectError = 0;
-    FILE* const srcFile = FIO_openSrcFile(inFileName);
-    if (srcFile == NULL) {
-        DISPLAY("Error: could not open source file %s\n", inFileName);
-        return 3;
-    }
-    info->compressedSize = UTIL_getFileSize(inFileName);
+typedef enum { info_success=0, info_frame_error=1, info_not_zstd=2, info_file_error=3 } InfoError;
 
+#define ERROR_IF(c,n,...) {   /* when c is true : emit message through DISPLAYLEVEL(1, ...), then return n */ \
+    if (c) {                           \
+        DISPLAYLEVEL(1, __VA_ARGS__);  \
+        DISPLAYLEVEL(1, " \n");        \
+        return n;   /* leaves the function that invokes the macro */ \
+    }                                  \
+}
+
+static InfoError
+FIO_analyzeFrames(fileInfo_t* info, FILE* const srcFile)
+{
     /* begin analyzing frame */
     for ( ; ; ) {
         BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
@@ -1811,130 +2093,111 @@
               && (numBytesRead == 0)
               && (info->compressedSize > 0)
               && (info->compressedSize != UTIL_FILESIZE_UNKNOWN) ) {
-                break;
+                break;  /* correct end of file => success */
             }
-            else if (feof(srcFile)) {
-                DISPLAY("Error: reached end of file with incomplete frame\n");
-                detectError = 2;
-                break;
-            }
-            else {
-                DISPLAY("Error: did not reach end of file but ran out of frames\n");
-                detectError = 1;
-                break;
-            }
+            ERROR_IF(feof(srcFile), info_not_zstd, "Error: reached end of file with incomplete frame");
+            ERROR_IF(1, info_frame_error, "Error: did not reach end of file but ran out of frames");
         }
         {   U32 const magicNumber = MEM_readLE32(headerBuffer);
             /* Zstandard frame */
             if (magicNumber == ZSTD_MAGICNUMBER) {
                 ZSTD_frameHeader header;
                 U64 const frameContentSize = ZSTD_getFrameContentSize(headerBuffer, numBytesRead);
-                if (frameContentSize == ZSTD_CONTENTSIZE_ERROR || frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN) {
+                if ( frameContentSize == ZSTD_CONTENTSIZE_ERROR
+                  || frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN ) {
                     info->decompUnavailable = 1;
                 } else {
                     info->decompressedSize += frameContentSize;
                 }
-                if (ZSTD_getFrameHeader(&header, headerBuffer, numBytesRead) != 0) {
-                    DISPLAY("Error: could not decode frame header\n");
-                    detectError = 1;
-                    break;
-                }
+                ERROR_IF(ZSTD_getFrameHeader(&header, headerBuffer, numBytesRead) != 0,
+                        info_frame_error, "Error: could not decode frame header");
                 info->windowSize = header.windowSize;
                 /* move to the end of the frame header */
                 {   size_t const headerSize = ZSTD_frameHeaderSize(headerBuffer, numBytesRead);
-                    if (ZSTD_isError(headerSize)) {
-                        DISPLAY("Error: could not determine frame header size\n");
-                        detectError = 1;
-                        break;
-                    }
-                    {   int const ret = fseek(srcFile, ((long)headerSize)-((long)numBytesRead), SEEK_CUR);
-                        if (ret != 0) {
-                            DISPLAY("Error: could not move to end of frame header\n");
-                            detectError = 1;
-                            break;
-                }   }   }
+                    ERROR_IF(ZSTD_isError(headerSize), info_frame_error, "Error: could not determine frame header size");
+                    ERROR_IF(fseek(srcFile, ((long)headerSize)-((long)numBytesRead), SEEK_CUR) != 0,
+                            info_frame_error, "Error: could not move to end of frame header");
+                }
 
-                /* skip the rest of the blocks in the frame */
+                /* skip all blocks in the frame */
                 {   int lastBlock = 0;
                     do {
                         BYTE blockHeaderBuffer[3];
-                        size_t const readBytes = fread(blockHeaderBuffer, 1, 3, srcFile);
-                        if (readBytes != 3) {
-                            DISPLAY("There was a problem reading the block header\n");
-                            detectError = 1;
-                            break;
-                        }
+                        ERROR_IF(fread(blockHeaderBuffer, 1, 3, srcFile) != 3,
+                                info_frame_error, "Error while reading block header");
                         {   U32 const blockHeader = MEM_readLE24(blockHeaderBuffer);
                             U32 const blockTypeID = (blockHeader >> 1) & 3;
                             U32 const isRLE = (blockTypeID == 1);
                             U32 const isWrongBlock = (blockTypeID == 3);
                             long const blockSize = isRLE ? 1 : (long)(blockHeader >> 3);
-                            if (isWrongBlock) {
-                                DISPLAY("Error: unsupported block type \n");
-                                detectError = 1;
-                                break;
-                            }
+                            ERROR_IF(isWrongBlock, info_frame_error, "Error: unsupported block type");
                             lastBlock = blockHeader & 1;
-                            {   int const ret = fseek(srcFile, blockSize, SEEK_CUR);
-                                if (ret != 0) {
-                                    DISPLAY("Error: could not skip to end of block\n");
-                                    detectError = 1;
-                                    break;
-                        }   }   }
+                            ERROR_IF(fseek(srcFile, blockSize, SEEK_CUR) != 0,
+                                    info_frame_error, "Error: could not skip to end of block");
+                        }
                     } while (lastBlock != 1);
-
-                    if (detectError) break;
                 }
 
                 /* check if checksum is used */
                 {   BYTE const frameHeaderDescriptor = headerBuffer[4];
                     int const contentChecksumFlag = (frameHeaderDescriptor & (1 << 2)) >> 2;
                     if (contentChecksumFlag) {
-                        int const ret = fseek(srcFile, 4, SEEK_CUR);
                         info->usesCheck = 1;
-                        if (ret != 0) {
-                            DISPLAY("Error: could not skip past checksum\n");
-                            detectError = 1;
-                            break;
-                }   }   }
+                        ERROR_IF(fseek(srcFile, 4, SEEK_CUR) != 0,
+                                info_frame_error, "Error: could not skip past checksum");
+                }   }
                 info->numActualFrames++;
             }
             /* Skippable frame */
             else if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
                 U32 const frameSize = MEM_readLE32(headerBuffer + 4);
                 long const seek = (long)(8 + frameSize - numBytesRead);
-                int const ret = LONG_SEEK(srcFile, seek, SEEK_CUR);
-                if (ret != 0) {
-                    DISPLAY("Error: could not find end of skippable frame\n");
-                    detectError = 1;
-                    break;
-                }
+                ERROR_IF(LONG_SEEK(srcFile, seek, SEEK_CUR) != 0,
+                        info_frame_error, "Error: could not find end of skippable frame");
                 info->numSkippableFrames++;
             }
             /* unknown content */
             else {
-                detectError = 2;
-                break;
+                return info_not_zstd;
             }
-        }
-    }  /* end analyzing frame */
-    fclose(srcFile);
-    info->nbFiles = 1;
-    return detectError;
+        }  /* magic number analysis */
+    }  /* end analyzing frames */
+    return info_success;
 }
 
-static int getFileInfo(fileInfo_t* info, const char* srcFileName)
+/* getFileInfo_fileConfirmed() : opens inFileName, fills *info through FIO_analyzeFrames(), closes the file. @return : info_file_error if the file cannot be opened, otherwise the status of FIO_analyzeFrames() */
+static InfoError
+getFileInfo_fileConfirmed(fileInfo_t* info, const char* inFileName)
 {
+    InfoError status;
+    FILE* const srcFile = FIO_openSrcFile(inFileName);
+    ERROR_IF(srcFile == NULL, info_file_error, "Error: could not open source file %s", inFileName);
+    /* FIO_analyzeFrames() reads info->compressedSize, so it is recorded before the analysis starts */
+    info->compressedSize = UTIL_getFileSize(inFileName);
+    status = FIO_analyzeFrames(info, srcFile);
+    /* single exit point once the file is open : it gets closed whatever the analysis status */
+    fclose(srcFile);
+    info->nbFiles = 1;
+    return status;
+}
+
+
+/** getFileInfo() :
+ *  Reads information from file srcFileName, stores it in *info
+ * @return : info_file_error when srcFileName is not a regular file, otherwise the InfoError status of getFileInfo_fileConfirmed()
+ */
+static InfoError
+getFileInfo(fileInfo_t* info, const char* srcFileName)
+{
+    ERROR_IF(!UTIL_isRegularFile(srcFileName),
+            info_file_error, "Error : %s is not a file", srcFileName);
     return getFileInfo_fileConfirmed(info, srcFileName);
 }
 
 
-static void displayInfo(const char* inFileName, const fileInfo_t* info, int displayLevel){
+static void
+displayInfo(const char* inFileName, const fileInfo_t* info, int displayLevel)
+{
     unsigned const unit = info->compressedSize < (1 MB) ? (1 KB) : (1 MB);
     const char* const unitStr = info->compressedSize < (1 MB) ? "KB" : "MB";
     double const windowSizeUnit = (double)info->windowSize / unit;
@@ -1992,52 +2255,62 @@
     return total;
 }
 
-static int FIO_listFile(fileInfo_t* total, const char* inFileName, int displayLevel){
+static int
+FIO_listFile(fileInfo_t* total, const char* inFileName, int displayLevel)
+{
     fileInfo_t info;
     memset(&info, 0, sizeof(info));
-    {   int const error = getFileInfo(&info, inFileName);
-        if (error == 1) {
+    {   InfoError const error = getFileInfo(&info, inFileName);
+        if (error == info_frame_error) {
             /* display error, but provide output */
-            DISPLAY("An error occurred while getting file info \n");
+            DISPLAYLEVEL(1, "Error while parsing %s \n", inFileName);
         }
-        else if (error == 2) {
+        else if (error == info_not_zstd) {
             DISPLAYOUT("File %s not compressed by zstd \n", inFileName);
             if (displayLevel > 2) DISPLAYOUT("\n");
             return 1;
         }
-        else if (error == 3) {
+        else if (error == info_file_error) {
             /* error occurred while opening the file */
             if (displayLevel > 2) DISPLAYOUT("\n");
             return 1;
         }
         displayInfo(inFileName, &info, displayLevel);
         *total = FIO_addFInfo(*total, info);
+        assert(error>=0 || error<=1);
         return error;
     }
 }
 
-int FIO_listMultipleFiles(unsigned numFiles, const char** filenameTable, int displayLevel){
+int FIO_listMultipleFiles(unsigned numFiles, const char** filenameTable, int displayLevel)
+{
+    /* ensure no specified input is stdin (needs fseek() capability) */
+    {   unsigned u;
+        for (u=0; u<numFiles;u++) {
+            ERROR_IF(!strcmp (filenameTable[u], stdinmark),
+                    1, "zstd: --list does not support reading from standard input");
+    }   }
 
-    if (!IS_CONSOLE(stdin)) {
-        DISPLAYOUT("zstd: --list does not support reading from standard input\n");
+    if (numFiles == 0) {
+        if (!IS_CONSOLE(stdin)) {
+            DISPLAYLEVEL(1, "zstd: --list does not support reading from standard input \n");
+        }
+        DISPLAYLEVEL(1, "No files given \n");
         return 1;
     }
 
-    if (numFiles == 0) {
-        DISPLAYOUT("No files given\n");
-        return 0;
-    }
     if (displayLevel <= 2) {
         DISPLAYOUT("Frames  Skips  Compressed  Uncompressed  Ratio  Check  Filename\n");
     }
     {   int error = 0;
-        unsigned u;
         fileInfo_t total;
         memset(&total, 0, sizeof(total));
         total.usesCheck = 1;
-        for (u=0; u<numFiles;u++) {
-            error |= FIO_listFile(&total, filenameTable[u], displayLevel);
-        }
+        /* --list each file, and check for any error */
+        {   unsigned u;
+            for (u=0; u<numFiles;u++) {
+                error |= FIO_listFile(&total, filenameTable[u], displayLevel);
+        }   }
         if (numFiles > 1 && displayLevel <= 2) {   /* display total */
             unsigned const unit = total.compressedSize < (1 MB) ? (1 KB) : (1 MB);
             const char* const unitStr = total.compressedSize < (1 MB) ? "KB" : "MB";
diff --git a/programs/fileio.h b/programs/fileio.h
index 69c83f7..4c7049c 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -48,20 +48,23 @@
 ***************************************/
 void FIO_setCompressionType(FIO_compressionType_t compressionType);
 void FIO_overwriteMode(void);
-void FIO_setNotificationLevel(unsigned level);
-void FIO_setSparseWrite(unsigned sparse);  /**< 0: no sparse; 1: disable on stdout; 2: always enabled */
-void FIO_setDictIDFlag(unsigned dictIDFlag);
-void FIO_setChecksumFlag(unsigned checksumFlag);
-void FIO_setRemoveSrcFile(unsigned flag);
-void FIO_setMemLimit(unsigned memLimit);
-void FIO_setNbWorkers(unsigned nbWorkers);
+void FIO_setAdaptiveMode(unsigned adapt);
+void FIO_setAdaptMin(int minCLevel);
+void FIO_setAdaptMax(int maxCLevel);
 void FIO_setBlockSize(unsigned blockSize);
-void FIO_setOverlapLog(unsigned overlapLog);
+void FIO_setChecksumFlag(unsigned checksumFlag);
+void FIO_setDictIDFlag(unsigned dictIDFlag);
+void FIO_setLdmBucketSizeLog(unsigned ldmBucketSizeLog);
 void FIO_setLdmFlag(unsigned ldmFlag);
+void FIO_setLdmHashEveryLog(unsigned ldmHashEveryLog);
 void FIO_setLdmHashLog(unsigned ldmHashLog);
 void FIO_setLdmMinMatch(unsigned ldmMinMatch);
-void FIO_setLdmBucketSizeLog(unsigned ldmBucketSizeLog);
-void FIO_setLdmHashEveryLog(unsigned ldmHashEveryLog);
+void FIO_setMemLimit(unsigned memLimit);
+void FIO_setNbWorkers(unsigned nbWorkers);
+void FIO_setNotificationLevel(unsigned level);
+void FIO_setOverlapLog(unsigned overlapLog);
+void FIO_setRemoveSrcFile(unsigned flag);
+void FIO_setSparseWrite(unsigned sparse);  /**< 0: no sparse; 1: disable on stdout; 2: always enabled */
 
 
 /*-*************************************
@@ -70,7 +73,7 @@
 /** FIO_compressFilename() :
     @return : 0 == ok;  1 == pb with src file. */
 int FIO_compressFilename (const char* outfilename, const char* infilename, const char* dictFileName,
-                          int compressionLevel, ZSTD_compressionParameters* comprParams);
+                          int compressionLevel, ZSTD_compressionParameters comprParams);
 
 /** FIO_decompressFilename() :
     @return : 0 == ok;  1 == pb with src file. */
@@ -78,6 +81,7 @@
 
 int FIO_listMultipleFiles(unsigned numFiles, const char** filenameTable, int displayLevel);
 
+
 /*-*************************************
 *  Multiple File functions
 ***************************************/
@@ -86,7 +90,7 @@
 int FIO_compressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles,
                                   const char* outFileName, const char* suffix,
                                   const char* dictFileName, int compressionLevel,
-                                  ZSTD_compressionParameters* comprParams);
+                                  ZSTD_compressionParameters comprParams);
 
 /** FIO_decompressMultipleFilenames() :
     @return : nb of missing or skipped files */
@@ -95,6 +99,15 @@
                                     const char* dictFileName);
 
 
+/*-*************************************
+*  Advanced stuff (should actually be hosted elsewhere)
+***************************************/
+
+/* custom crash signal handler */
+void FIO_addAbortHandler(void);
+
+
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/programs/platform.h b/programs/platform.h
index a550eb1..155ebcd 100644
--- a/programs/platform.h
+++ b/programs/platform.h
@@ -50,53 +50,67 @@
 /* *********************************************************
 *  Turn on Large Files support (>4GB) for 32-bit Linux/Unix
 ***********************************************************/
-#if !defined(__64BIT__) || defined(__MINGW32__)       /* No point defining Large file for 64 bit but MinGW-w64 requires it */
+#if !defined(__64BIT__) || defined(__MINGW32__)    /* No point defining Large file for 64 bit but MinGW-w64 requires it */
 #  if !defined(_FILE_OFFSET_BITS)
-#    define _FILE_OFFSET_BITS 64                      /* turn off_t into a 64-bit type for ftello, fseeko */
+#    define _FILE_OFFSET_BITS 64                   /* turn off_t into a 64-bit type for ftello, fseeko */
 #  endif
-#  if !defined(_LARGEFILE_SOURCE)                     /* obsolete macro, replaced with _FILE_OFFSET_BITS */
-#    define _LARGEFILE_SOURCE 1                       /* Large File Support extension (LFS) - fseeko, ftello */
+#  if !defined(_LARGEFILE_SOURCE)                  /* obsolete macro, replaced with _FILE_OFFSET_BITS */
+#    define _LARGEFILE_SOURCE 1                    /* Large File Support extension (LFS) - fseeko, ftello */
 #  endif
 #  if defined(_AIX) || defined(__hpux)
-#    define _LARGE_FILES                              /* Large file support on 32-bits AIX and HP-UX */
+#    define _LARGE_FILES                           /* Large file support on 32-bits AIX and HP-UX */
 #  endif
 #endif
 
 
 /* ************************************************************
 *  Detect POSIX version
-*  PLATFORM_POSIX_VERSION = -1 for non-Unix e.g. Windows
-*  PLATFORM_POSIX_VERSION = 0 for Unix-like non-POSIX
-*  PLATFORM_POSIX_VERSION >= 1 is equal to found _POSIX_VERSION
+*  PLATFORM_POSIX_VERSION = 0 for non-Unix e.g. Windows
+*  PLATFORM_POSIX_VERSION = 1 for Unix-like but non-POSIX
+*  PLATFORM_POSIX_VERSION > 1 is equal to found _POSIX_VERSION
+*  Value of PLATFORM_POSIX_VERSION can be forced on command line
 ***************************************************************/
-#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)) /* UNIX-like OS */ \
-   || defined(__midipix__) || defined(__VMS))
+#ifndef PLATFORM_POSIX_VERSION
+
 #  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \
      || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)  /* BSD distros */
+     /* exception rule : force posix version to 200112L,
+      * note: it's better to use unistd.h's _POSIX_VERSION whenever possible */
 #    define PLATFORM_POSIX_VERSION 200112L
-#  else
+
+/* try to determine posix version through official unistd.h's _POSIX_VERSION (http://pubs.opengroup.org/onlinepubs/7908799/xsh/unistd.h.html).
+ * note : there is no simple way to know in advance if <unistd.h> is present or not on target system,
+ * Posix specification mandates its presence and its content, but target system must respect this spec.
+ * It's necessary to _not_ #include <unistd.h> whenever target OS is not unix-like
+ * otherwise it will block preprocessing stage.
+ * The following list of build macros tries to "guess" if target OS is likely unix-like, and therefore can #include <unistd.h>
+ */
+#  elif !defined(_WIN32) \
+     && (defined(__unix__) || defined(__unix) \
+     || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__))
+
 #    if defined(__linux__) || defined(__linux)
 #      ifndef _POSIX_C_SOURCE
-#        define _POSIX_C_SOURCE 200112L  /* use feature test macro */
+#        define _POSIX_C_SOURCE 200112L  /* feature test macro : https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html */
 #      endif
 #    endif
 #    include <unistd.h>  /* declares _POSIX_VERSION */
 #    if defined(_POSIX_VERSION)  /* POSIX compliant */
 #      define PLATFORM_POSIX_VERSION _POSIX_VERSION
 #    else
-#      define PLATFORM_POSIX_VERSION 0
+#      define PLATFORM_POSIX_VERSION 1
 #    endif
-#  endif
-#endif
-#if !defined(PLATFORM_POSIX_VERSION)
-#  define PLATFORM_POSIX_VERSION -1
-#endif
 
+#  else  /* non-unix target platform (like Windows) */
+#    define PLATFORM_POSIX_VERSION 0
+#  endif
+
+#endif   /* PLATFORM_POSIX_VERSION */
 
 /*-*********************************************
 *  Detect if isatty() and fileno() are available
 ************************************************/
-#if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) \
+#if (defined(__linux__) && (PLATFORM_POSIX_VERSION > 1)) \
  || (PLATFORM_POSIX_VERSION >= 200112L) \
  || defined(__DJGPP__) \
  || defined(__MSYS__)
@@ -148,6 +162,34 @@
 #endif
 
 
+#ifndef ZSTD_START_SYMBOLLIST_FRAME
+#  ifdef __linux__
+#    define ZSTD_START_SYMBOLLIST_FRAME 2
+#  elif defined __APPLE__
+#    define ZSTD_START_SYMBOLLIST_FRAME 4
+#  else
+#    define ZSTD_START_SYMBOLLIST_FRAME 0
+#  endif
+#endif
+
+
+#ifndef ZSTD_SETPRIORITY_SUPPORT
+   /* mandates presence of <sys/resource.h> and support for setpriority() : http://man7.org/linux/man-pages/man2/setpriority.2.html */
+#  define ZSTD_SETPRIORITY_SUPPORT (PLATFORM_POSIX_VERSION >= 200112L)
+#endif
+
+
+#ifndef ZSTD_NANOSLEEP_SUPPORT
+   /* mandates support of nanosleep() within <time.h> : http://man7.org/linux/man-pages/man2/nanosleep.2.html */
+#  if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) \
+   || (PLATFORM_POSIX_VERSION >= 200112L)
+#     define ZSTD_NANOSLEEP_SUPPORT 1
+#  else
+#     define ZSTD_NANOSLEEP_SUPPORT 0
+#  endif
+#endif
+
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/programs/util.h b/programs/util.h
index 4392a5b..67aa7a5 100644
--- a/programs/util.h
+++ b/programs/util.h
@@ -20,13 +20,13 @@
 /*-****************************************
 *  Dependencies
 ******************************************/
-#include "platform.h"     /* PLATFORM_POSIX_VERSION */
+#include "platform.h"     /* PLATFORM_POSIX_VERSION, ZSTD_NANOSLEEP_SUPPORT, ZSTD_SETPRIORITY_SUPPORT */
 #include <stdlib.h>       /* malloc */
 #include <stddef.h>       /* size_t, ptrdiff_t */
 #include <stdio.h>        /* fprintf */
 #include <string.h>       /* strncmp */
 #include <sys/types.h>    /* stat, utime */
-#include <sys/stat.h>     /* stat */
+#include <sys/stat.h>     /* stat, chmod */
 #if defined(_MSC_VER)
 #  include <sys/utime.h>  /* utime */
 #  include <io.h>         /* _chmod */
@@ -53,32 +53,34 @@
 #endif
 
 
-/*-****************************************
-*  Sleep functions: Windows - Posix - others
-******************************************/
+/*-*************************************************
+*  Sleep & priority functions: Windows - Posix - others
+***************************************************/
 #if defined(_WIN32)
 #  include <windows.h>
 #  define SET_REALTIME_PRIORITY SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS)
 #  define UTIL_sleep(s) Sleep(1000*s)
 #  define UTIL_sleepMilli(milli) Sleep(milli)
-#elif PLATFORM_POSIX_VERSION >= 0 /* Unix-like operating system */
-#  include <unistd.h>
-#  include <sys/resource.h> /* setpriority */
-#  if defined(PRIO_PROCESS)
-#    define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20)
-#  else
-#    define SET_REALTIME_PRIORITY /* disabled */
-#  endif
+
+#elif PLATFORM_POSIX_VERSION > 0 /* Unix-like operating system */
+#  include <unistd.h>   /* sleep */
 #  define UTIL_sleep(s) sleep(s)
-#  if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) || (PLATFORM_POSIX_VERSION >= 200112L)  /* nanosleep requires POSIX.1-2001 */
+#  if ZSTD_NANOSLEEP_SUPPORT   /* necessarily defined in platform.h */
 #      define UTIL_sleepMilli(milli) { struct timespec t; t.tv_sec=0; t.tv_nsec=milli*1000000ULL; nanosleep(&t, NULL); }
 #  else
 #      define UTIL_sleepMilli(milli) /* disabled */
 #  endif
-#else
-#  define SET_REALTIME_PRIORITY      /* disabled */
+#  if ZSTD_SETPRIORITY_SUPPORT
+#    include <sys/resource.h> /* setpriority */
+#    define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20)
+#  else
+#    define SET_REALTIME_PRIORITY /* disabled */
+#  endif
+
+#else  /* unknown non-unix operating system */
 #  define UTIL_sleep(s)          /* disabled */
 #  define UTIL_sleepMilli(milli) /* disabled */
+#  define SET_REALTIME_PRIORITY  /* disabled */
 #endif
 
 
@@ -119,6 +121,7 @@
 #if defined(_WIN32)   /* Windows */
     #define UTIL_TIME_INITIALIZER { { 0, 0 } }
     typedef LARGE_INTEGER UTIL_time_t;
+
     UTIL_STATIC UTIL_time_t UTIL_getTime(void) { UTIL_time_t x; QueryPerformanceCounter(&x); return x; }
     UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd)
     {
@@ -148,6 +151,7 @@
     #include <mach/mach_time.h>
     #define UTIL_TIME_INITIALIZER 0
     typedef U64 UTIL_time_t;
+
     UTIL_STATIC UTIL_time_t UTIL_getTime(void) { return mach_absolute_time(); }
     UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd)
     {
@@ -170,11 +174,16 @@
         return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom);
     }
 
-#elif (PLATFORM_POSIX_VERSION >= 200112L) && (defined __UCLIBC__ || ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) || __GLIBC__ > 2))
+#elif (PLATFORM_POSIX_VERSION >= 200112L) \
+   && (defined(__UCLIBC__)                \
+      || (defined(__GLIBC__)              \
+          && ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) \
+             || (__GLIBC__ > 2))))
 
     #define UTIL_TIME_INITIALIZER { 0, 0 }
     typedef struct timespec UTIL_freq_t;
     typedef struct timespec UTIL_time_t;
+
     UTIL_STATIC UTIL_time_t UTIL_getTime(void)
     {
         UTIL_time_t time;
@@ -182,6 +191,7 @@
             UTIL_DISPLAYLEVEL(1, "ERROR: Failed to get time\n");   /* we could also exit() */
         return time;
     }
+
     UTIL_STATIC UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end)
     {
         UTIL_time_t diff;
@@ -194,6 +204,7 @@
         }
         return diff;
     }
+
     UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t begin, UTIL_time_t end)
     {
         UTIL_time_t const diff = UTIL_getSpanTime(begin, end);
@@ -202,6 +213,7 @@
         micro += diff.tv_nsec / 1000ULL;
         return micro;
     }
+
     UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t begin, UTIL_time_t end)
     {
         UTIL_time_t const diff = UTIL_getSpanTime(begin, end);
@@ -210,6 +222,7 @@
         nano += diff.tv_nsec;
         return nano;
     }
+
 #else   /* relies on standard C (note : clock_t measurements can be wrong when using multi-threading) */
     typedef clock_t UTIL_time_t;
     #define UTIL_TIME_INITIALIZER 0
@@ -319,15 +332,20 @@
 
 UTIL_STATIC U32 UTIL_isLink(const char* infilename)
 {
-#if defined(_WIN32)
-    /* no symlinks on windows */
-    (void)infilename;
-#else
+/* lstat() is only called when one of its feature-test macros is set (guards as listed in https://linux.die.net/man/2/lstat); otherwise this function always returns 0 */
+#ifndef __STRICT_ANSI__
+#if defined(_BSD_SOURCE) \
+    || (defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE >= 500)) \
+    || (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) \
+    || (defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) \
+    || (defined(__APPLE__) && defined(__MACH__))
     int r;
     stat_t statbuf;
     r = lstat(infilename, &statbuf);
     if (!r && S_ISLNK(statbuf.st_mode)) return 1;
 #endif
+#endif   /* !__STRICT_ANSI__ */
+    (void)infilename;   /* keeps the parameter referenced when the lstat() section is compiled out */
     return 0;
 }
 
@@ -526,7 +544,10 @@
  * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer)
  * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called.
  */
-UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks)
+UTIL_STATIC const char**
+UTIL_createFileList(const char **inputNames, unsigned inputNamesNb,
+                    char** allocatedBuffer, unsigned* allocatedNamesNb,
+                    int followLinks)
 {
     size_t pos;
     unsigned i, nbFiles;
diff --git a/programs/windres/zstd32.res b/programs/windres/zstd32.res
index 26800ab..276cb20 100644
--- a/programs/windres/zstd32.res
+++ b/programs/windres/zstd32.res
Binary files differ
diff --git a/programs/windres/zstd64.res b/programs/windres/zstd64.res
index 723d73a..3eb0162 100644
--- a/programs/windres/zstd64.res
+++ b/programs/windres/zstd64.res
Binary files differ
diff --git a/programs/zstd.1 b/programs/zstd.1
index b136bb3..27ee668 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -1,5 +1,5 @@
 .
-.TH "ZSTD" "1" "2018-06-27" "zstd 1.3.5" "User Commands"
+.TH "ZSTD" "1" "September 2018" "zstd 1.3.5" "User Commands"
 .
 .SH "NAME"
 \fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
@@ -100,6 +100,10 @@
 \fB#\fR compression level [1\-19] (default: 3)
 .
 .TP
+\fB\-\-fast[=#]\fR
+switch to ultra\-fast compression levels\. If \fB=#\fR is not present, it defaults to \fB1\fR\. The higher the value, the faster the compression speed, at the cost of some compression ratio\. This setting overwrites compression level if one was set previously\. Similarly, if a compression level is set after \fB\-\-fast\fR, it overrides it\.
+.
+.TP
 \fB\-\-ultra\fR
 unlocks high compression levels 20+ (maximum 22), using a lot more memory\. Note that decompression will also require more memory when using these levels\.
 .
@@ -111,16 +115,16 @@
 Note: If \fBwindowLog\fR is set to larger than 27, \fB\-\-long=windowLog\fR or \fB\-\-memory=windowSize\fR needs to be passed to the decompressor\.
 .
 .TP
-\fB\-\-fast[=#]\fR
-switch to ultra\-fast compression levels\. If \fB=#\fR is not present, it defaults to \fB1\fR\. The higher the value, the faster the compression speed, at the cost of some compression ratio\. This setting overwrites compression level if one was set previously\. Similarly, if a compression level is set after \fB\-\-fast\fR, it overrides it\.
-.
-.TP
 \fB\-T#\fR, \fB\-\-threads=#\fR
 Compress using \fB#\fR working threads (default: 1)\. If \fB#\fR is 0, attempt to detect and use the number of physical CPU cores\. In all cases, the nb of threads is capped to ZSTDMT_NBTHREADS_MAX==200\. This modifier does nothing if \fBzstd\fR is compiled without multithread support\.
 .
 .TP
 \fB\-\-single\-thread\fR
-Does not spawn a thread for compression, use caller thread instead\. This is the only available mode when multithread support is disabled\. In this mode, compression is serialized with I/O\. (This is different from \fB\-T1\fR, which spawns 1 compression thread in parallel of I/O)\. Single\-thread mode also features lower memory usage\.
+Does not spawn a thread for compression, use a single thread for both I/O and compression\. In this mode, compression is serialized with I/O, which is slightly slower\. (This is different from \fB\-T1\fR, which spawns 1 compression thread in parallel of I/O)\. This mode is the only one available when multithread support is disabled\. Single\-thread mode features lower memory usage\. Final compressed result is slightly different from \fB\-T1\fR\.
+.
+.TP
+\fB\-\-adapt\fR
+\fBzstd\fR will dynamically adapt compression level to perceived I/O conditions\. Compression level adaptation can be observed live by using command \fB\-v\fR\. The feature works when combined with multi\-threading and \fB\-\-long\fR mode\. It does not work with \fB\-\-single\-thread\fR\. It sets window size to 8 MB by default (can be changed manually, see \fBwlog\fR)\. Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible\. \fInote\fR : at the time of this writing, \fB\-\-adapt\fR can remain stuck at low speed when combined with multiple worker threads (>=2)\.
 .
 .TP
 \fB\-D file\fR
@@ -194,7 +198,7 @@
 Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
 .
 .IP
-Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
+Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. The cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Equivalent to \fB\-\-train\-fastcover=d=8,steps=4\fR\.
 .
 .TP
 \fB\-o file\fR
@@ -217,11 +221,11 @@
 A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\.
 .
 .TP
-\fB\-\-train\-cover[=k#,d=#,steps=#]\fR
-Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\.
+\fB\-\-train\-cover[=k=#,d=#,steps=#,split=#]\fR
+Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or split <= 0, then the default value of 100 is used\. Requires that \fId\fR <= \fIk\fR\.
 .
 .IP
-Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\.
+Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. If \fIsplit\fR is 100, all input samples are used for both training and testing to find optimal \fId\fR and \fIk\fR to build dictionary\. Supports multithreading if \fBzstd\fR is compiled with threading support\.
 .
 .IP
 Examples:
@@ -238,6 +242,25 @@
 .IP
 \fBzstd \-\-train\-cover=k=50 FILEs\fR
 .
+.IP
+\fBzstd \-\-train\-cover=k=50,split=60 FILEs\fR
+.
+.TP
+\fB\-\-train\-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#]\fR
+Same as cover but with extra parameters \fIf\fR and \fIaccel\fR and a different default value of \fIsplit\fR\. If \fIsplit\fR is not specified, then it tries \fIsplit\fR = 75\. If \fIf\fR is not specified, then it tries \fIf\fR = 20\. Requires that 0 < \fIf\fR < 32\. If \fIaccel\fR is not specified, then it tries \fIaccel\fR = 1\. Requires that 0 < \fIaccel\fR <= 10\. Requires that \fId\fR = 6 or \fId\fR = 8\.
+.
+.IP
+\fIf\fR is log of size of array that keeps track of frequency of subsegments of size \fId\fR\. The subsegment is hashed to an index in the range [0,2^\fIf\fR \- 1]\. It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency\. Using a higher \fIf\fR reduces collision but takes longer\.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-fastcover FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-fastcover=d=8,f=15,accel=2 FILEs\fR
+.
 .TP
 \fB\-\-train\-legacy[=selectivity=#]\fR
 Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index c8b8d8d..c0c0469 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -102,6 +102,13 @@
 
 * `-#`:
     `#` compression level \[1-19] (default: 3)
+* `--fast[=#]`:
+    switch to ultra-fast compression levels.
+    If `=#` is not present, it defaults to `1`.
+    The higher the value, the faster the compression speed,
+    at the cost of some compression ratio.
+    This setting overwrites compression level if one was set previously.
+    Similarly, if a compression level is set after `--fast`, it overrides it.
 * `--ultra`:
     unlocks high compression levels 20+ (maximum 22), using a lot more memory.
     Note that decompression will also require more memory when using these levels.
@@ -115,25 +122,28 @@
 
     Note: If `windowLog` is set to larger than 27, `--long=windowLog` or
     `--memory=windowSize` needs to be passed to the decompressor.
-* `--fast[=#]`:
-    switch to ultra-fast compression levels.
-    If `=#` is not present, it defaults to `1`.
-    The higher the value, the faster the compression speed,
-    at the cost of some compression ratio.
-    This setting overwrites compression level if one was set previously.
-    Similarly, if a compression level is set after `--fast`, it overrides it.
-
 * `-T#`, `--threads=#`:
     Compress using `#` working threads (default: 1).
     If `#` is 0, attempt to detect and use the number of physical CPU cores.
     In all cases, the nb of threads is capped to ZSTDMT_NBTHREADS_MAX==200.
     This modifier does nothing if `zstd` is compiled without multithread support.
 * `--single-thread`:
-    Does not spawn a thread for compression, use caller thread instead.
-    This is the only available mode when multithread support is disabled.
-    In this mode, compression is serialized with I/O.
+    Does not spawn a thread for compression, use a single thread for both I/O and compression.
+    In this mode, compression is serialized with I/O, which is slightly slower.
     (This is different from `-T1`, which spawns 1 compression thread in parallel of I/O).
-    Single-thread mode also features lower memory usage.
+    This mode is the only one available when multithread support is disabled.
+    Single-thread mode features lower memory usage.
+    Final compressed result is slightly different from `-T1`.
+* `--adapt[=min=#,max=#]` :
+    `zstd` will dynamically adapt compression level to perceived I/O conditions.
+    Compression level adaptation can be observed live by using command `-v`.
+    Adaptation can be constrained between supplied `min` and `max` levels.
+    The feature works when combined with multi-threading and `--long` mode.
+    It does not work with `--single-thread`.
+    It sets window size to 8 MB by default (can be changed manually, see `wlog`).
+    Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible.
+    _note_ : at the time of this writing, `--adapt` can remain stuck at low speed
+    when combined with multiple worker threads (>=2).
 * `-D file`:
     use `file` as Dictionary to compress or decompress FILE(s)
 * `--no-dictID`:
@@ -200,9 +210,10 @@
     (for example, 10 MB for a 100 KB dictionary).
 
     Supports multithreading if `zstd` is compiled with threading support.
-    Additional parameters can be specified with `--train-cover`.
+    Additional parameters can be specified with `--train-fastcover`.
     The legacy dictionary builder can be accessed with `--train-legacy`.
-    Equivalent to `--train-cover=d=8,steps=4`.
+    The cover dictionary builder can be accessed with `--train-cover`.
+    Equivalent to `--train-fastcover=d=8,steps=4`.
 * `-o file`:
     Dictionary saved into `file` (default name: dictionary).
 * `--maxdict=#`:
@@ -223,11 +234,12 @@
     This compares favorably to 4 bytes default.
     However, it's up to the dictionary manager to not assign twice the same ID to
     2 different dictionaries.
-* `--train-cover[=k#,d=#,steps=#]`:
+* `--train-cover[=k=#,d=#,steps=#,split=#]`:
     Select parameters for the default dictionary builder algorithm named cover.
     If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
     If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
     If _steps_ is not specified, then the default value of 40 is used.
+    If _split_ is not specified or split <= 0, then the default value of 100 is used.
     Requires that _d_ <= _k_.
 
     Selects segments of size _k_ with highest score to put in the dictionary.
@@ -237,6 +249,8 @@
     algorithm will run faster with d <= _8_.
     Good values for _k_ vary widely based on the input data, but a safe range is
     [2 * _d_, 2000].
+    If _split_ is 100, all input samples are used for both training and testing
+    to find optimal _d_ and _k_ to build dictionary.
     Supports multithreading if `zstd` is compiled with threading support.
 
     Examples:
@@ -249,6 +263,28 @@
 
     `zstd --train-cover=k=50 FILEs`
 
+    `zstd --train-cover=k=50,split=60 FILEs`
+
+* `--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#]`:
+    Same as cover but with extra parameters _f_ and _accel_ and a different default value of _split_.
+    If _split_ is not specified, then it tries _split_ = 75.
+    If _f_ is not specified, then it tries _f_ = 20.
+    Requires that 0 < _f_ < 32.
+    If _accel_ is not specified, then it tries _accel_ = 1.
+    Requires that 0 < _accel_ <= 10.
+    Requires that _d_ = 6 or _d_ = 8.
+
+    _f_ is log of size of array that keeps track of frequency of subsegments of size _d_.
+    The subsegment is hashed to an index in the range [0,2^_f_ - 1].
+    It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency.
+    Using a higher _f_ reduces collision but takes longer.
+
+    Examples:
+
+    `zstd --train-fastcover FILEs`
+
+    `zstd --train-fastcover=d=8,f=15,accel=2 FILEs`
+
 * `--train-legacy[=selectivity=#]`:
     Use legacy dictionary builder algorithm with the given dictionary
     _selectivity_ (default: 9).
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index be62046..1545d1c 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -32,13 +32,13 @@
 #include <errno.h>    /* errno */
 #include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
 #ifndef ZSTD_NOBENCH
-#  include "bench.h"  /* BMK_benchFiles, BMK_SetNbSeconds */
+#  include "bench.h"  /* BMK_benchFiles */
 #endif
 #ifndef ZSTD_NODICT
 #  include "dibio.h"  /* ZDICT_cover_params_t, DiB_trainFromFiles() */
 #endif
-#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_maxCLevel */
-#include "zstd.h"     /* ZSTD_VERSION_STRING */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_minCLevel */
+#include "zstd.h"     /* ZSTD_VERSION_STRING, ZSTD_maxCLevel */
 
 
 /*-************************************
@@ -85,6 +85,10 @@
 static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT;
 
 
+#define DEFAULT_ACCEL 1
+
+typedef enum { cover, fastCover, legacy } dictType;
+
 /*-************************************
 *  Display Macros
 **************************************/
@@ -135,6 +139,7 @@
     DISPLAY( "--ultra : enable levels beyond %i, up to %i (requires more memory)\n", ZSTDCLI_CLEVEL_MAX, ZSTD_maxCLevel());
     DISPLAY( "--long[=#]: enable long distance matching with given window log (default: %u)\n", g_defaultMaxWindowLog);
     DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1);
+    DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n");
 #ifdef ZSTD_MULTITHREAD
     DISPLAY( " -T#    : spawns # compression threads (default: 1, 0==# cores) \n");
     DISPLAY( " -B#    : select size of each job (default: 0==automatic) \n");
@@ -170,7 +175,8 @@
     DISPLAY( "\n");
     DISPLAY( "Dictionary builder : \n");
     DISPLAY( "--train ## : create a dictionary from a training set of files \n");
-    DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
     DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
     DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
     DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@@ -282,10 +288,42 @@
         if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+          unsigned splitPercentage = readU32FromChar(&stringPtr);
+          params->splitPoint = (double)splitPercentage / 100.0;
+          if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
         return 0;
     }
     if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100));
+    return 1;
+}
+
+/**
+ * parseFastCoverParameters() :
+ * reads fastcover parameters from *stringPtr (e.g. "--train-fastcover=k=48,d=8,f=20,steps=32,accel=2") into *params ; *params is zeroed first, so fields that are not given are left at 0
+ * @return 1 means that fastcover parameters were correct
+ * @return 0 in case of malformed parameters
+ */
+static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_params_t* params)
+{
+    memset(params, 0, sizeof(*params));
+    for (; ;) {
+        if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "f=")) { params->f = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "accel=")) { params->accel = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+          unsigned splitPercentage = readU32FromChar(&stringPtr);
+          params->splitPoint = (double)splitPercentage / 100.0;   /* given as a percentage, stored as a fraction */
+          if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
+        return 0;   /* unknown parameter name */
+    }
+    if (stringPtr[0] != 0) return 0;   /* unexpected characters after the last parameter */
+    DISPLAYLEVEL(4, "fastcover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
     return 1;
 }
 
@@ -310,11 +348,48 @@
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
+    params.splitPoint = 1.0;
+    return params;
+}
+
+static ZDICT_fastCover_params_t defaultFastCoverParams(void)
+{   /* builds the fastcover parameter set that main() uses to initialize fastCoverParams */
+    ZDICT_fastCover_params_t params;
+    memset(&params, 0, sizeof(params));   /* every field not assigned below stays at 0 */
+    params.d = 8;
+    params.f = 20;   /* documented in zstd.1.md as the log of the size of the frequency array */
+    params.steps = 4;
+    params.splitPoint = 0.75; /* i.e. split=75 ; different from default splitPoint of cover */
+    params.accel = DEFAULT_ACCEL;   /* DEFAULT_ACCEL is defined as 1 near the top of this file */
     return params;
 }
 #endif
 
 
+/** parseAdaptParameters() :
+ *  reads adapt parameters from *stringPtr (e.g. "--adapt=min=1,max=19") and stores them into *adaptMinPtr and *adaptMaxPtr.
+ *  Both adaptMinPtr and adaptMaxPtr must be already allocated and correctly initialized.
+ *  There is no guarantee that any of these values will be updated.
+ *  @return 1 means that parsing was successful,
+ *  @return 0 in case of malformed parameters, or when the resulting min is larger than max
+ */
+static unsigned parseAdaptParameters(const char* stringPtr, int* adaptMinPtr, int* adaptMaxPtr)
+{
+    for ( ; ;) {
+        if (longCommandWArg(&stringPtr, "min=")) { *adaptMinPtr = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "max=")) { *adaptMaxPtr = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        DISPLAYLEVEL(4, "invalid compression parameter \n");
+        return 0;
+    }
+    if (stringPtr[0] != 0) return 0; /* check the end of string */
+    if (*adaptMinPtr > *adaptMaxPtr) {   /* compares the final values, including any left untouched by the caller's initialization */
+        DISPLAYLEVEL(4, "incoherent adaptation limits \n");
+        return 0;
+    }
+    return 1;
+}
+
+
 /** parseCompressionParameters() :
  *  reads compression parameters from *stringPtr (e.g. "--zstd=wlog=23,clog=23,hlog=22,slog=6,slen=3,tlen=48,strat=6") into *params
  *  @return 1 means that compression parameters were correct
@@ -379,6 +454,15 @@
 
 #define CLEAN_RETURN(i) { operationResult = (i); goto _end; }
 
+#ifdef ZSTD_NOCOMPRESS
+/* symbols from compression library are not defined and should not be invoked */
+# define MINCLEVEL  -50
+# define MAXCLEVEL   22
+#else
+# define MINCLEVEL  ZSTD_minCLevel()
+# define MAXCLEVEL  ZSTD_maxCLevel()
+#endif
+
 int main(int argCount, const char* argv[])
 {
     int argNb,
@@ -388,6 +472,9 @@
         ldmFlag = 0,
         main_pause = 0,
         nbWorkers = 0,
+        adapt = 0,
+        adaptMin = MINCLEVEL,
+        adaptMax = MAXCLEVEL,
         nextArgumentIsOutFileName = 0,
         nextArgumentIsMaxDict = 0,
         nextArgumentIsDictID = 0,
@@ -398,6 +485,7 @@
         setRealTimePrio = 0,
         singleThread = 0,
         ultra=0;
+    double compressibility = 0.5;
     unsigned bench_nbSeconds = 3;   /* would be better if this value was synchronized from bench */
     size_t blockSize = 0;
     zstd_operation_mode operation = zom_compress;
@@ -423,14 +511,16 @@
 #endif
 #ifndef ZSTD_NODICT
     ZDICT_cover_params_t coverParams = defaultCoverParams();
-    int cover = 1;
+    ZDICT_fastCover_params_t fastCoverParams = defaultFastCoverParams();
+    dictType dict = fastCover;
+#endif
+#ifndef ZSTD_NOBENCH
+    BMK_advancedParams_t benchParams = BMK_initAdvancedParams();
 #endif
 
 
     /* init */
     (void)recursive; (void)cLevelLast;    /* not used when ZSTD_NOBENCH set */
-    (void)dictCLevel; (void)dictSelect; (void)dictID;  (void)maxDictSize; /* not used when ZSTD_NODICT set */
-    (void)ultra; (void)cLevel; (void)ldmFlag; /* not used when ZSTD_NOCOMPRESS set */
     (void)memLimit;   /* not used when ZSTD_NODECOMPRESS set */
     if (filenameTable==NULL) { DISPLAY("zstd: %s \n", strerror(errno)); exit(1); }
     filenameTable[0] = stdinmark;
@@ -441,7 +531,7 @@
 #endif
 
     /* preset behaviors */
-    if (exeNameMatch(programName, ZSTD_ZSTDMT)) nbWorkers=0;
+    if (exeNameMatch(programName, ZSTD_ZSTDMT)) nbWorkers=0, singleThread=0;
     if (exeNameMatch(programName, ZSTD_UNZSTD)) operation=zom_decompress;
     if (exeNameMatch(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }   /* supports multiple formats */
     if (exeNameMatch(programName, ZSTD_ZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }  /* behave like zcat, also supports multiple formats */
@@ -456,6 +546,9 @@
     if (exeNameMatch(programName, ZSTD_UNLZ4)) { operation=zom_decompress; FIO_setCompressionType(FIO_lz4Compression); }                                   /* behave like unlz4, also supports multiple formats */
     memset(&compressionParams, 0, sizeof(compressionParams));
 
+    /* init crash handler */
+    FIO_addAbortHandler();
+
     /* command switches */
     for (argNb=1; argNb<argCount; argNb++) {
         const char* argument = argv[argNb];
@@ -493,13 +586,15 @@
                     if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; }
                     if (!strcmp(argument, "--no-sparse")) { FIO_setSparseWrite(0); continue; }
                     if (!strcmp(argument, "--test")) { operation=zom_test; continue; }
-                    if (!strcmp(argument, "--train")) { operation=zom_train; outFileName=g_defaultDictName; continue; }
+                    if (!strcmp(argument, "--train")) { operation=zom_train; if (outFileName==NULL) outFileName=g_defaultDictName; continue; }
                     if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
                     if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
                     if (!strcmp(argument, "--no-dictID")) { FIO_setDictIDFlag(0); continue; }
                     if (!strcmp(argument, "--keep")) { FIO_setRemoveSrcFile(0); continue; }
                     if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
                     if (!strcmp(argument, "--priority=rt")) { setRealTimePrio = 1; continue; }
+                    if (!strcmp(argument, "--adapt")) { adapt = 1; continue; }
+                    if (longCommandWArg(&argument, "--adapt=")) { adapt = 1; if (!parseAdaptParameters(argument, &adaptMin, &adaptMax)) CLEAN_RETURN(badusage(programName)); continue; }
                     if (!strcmp(argument, "--single-thread")) { nbWorkers = 0; singleThread = 1; continue; }
                     if (!strcmp(argument, "--format=zstd")) { suffix = ZSTD_EXTENSION; FIO_setCompressionType(FIO_zstdCompression); continue; }
 #ifdef ZSTD_GZCOMPRESS
@@ -517,18 +612,31 @@
 #ifndef ZSTD_NODICT
                     if (longCommandWArg(&argument, "--train-cover")) {
                       operation = zom_train;
-                      outFileName = g_defaultDictName;
-                      cover = 1;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
+                      dict = cover;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
                       else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
                       else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
                       continue;
                     }
+                    if (longCommandWArg(&argument, "--train-fastcover")) {
+                      operation = zom_train;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
+                      dict = fastCover;
+                      /* Allow optional arguments following an = */
+                      if (*argument == 0) { memset(&fastCoverParams, 0, sizeof(fastCoverParams)); }
+                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+                      else if (!parseFastCoverParameters(argument, &fastCoverParams)) { CLEAN_RETURN(badusage(programName)); }
+                      continue;
+                    }
                     if (longCommandWArg(&argument, "--train-legacy")) {
                       operation = zom_train;
-                      outFileName = g_defaultDictName;
-                      cover = 0;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
+                      dict = legacy;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { continue; }
                       else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
@@ -560,12 +668,15 @@
                             compressionParams.windowLog = ldmWindowLog;
                         continue;
                     }
+#ifndef ZSTD_NOCOMPRESS   /* linking ZSTD_minCLevel() requires compression support */
                     if (longCommandWArg(&argument, "--fast")) {
                         /* Parse optional acceleration factor */
                         if (*argument == '=') {
+                            U32 const maxFast = (U32)-ZSTD_minCLevel();
                             U32 fastLevel;
                             ++argument;
                             fastLevel = readU32FromChar(&argument);
+                            if (fastLevel > maxFast) fastLevel = maxFast;
                             if (fastLevel) {
                               dictCLevel = cLevel = -(int)fastLevel;
                             } else {
@@ -579,6 +690,7 @@
                         }
                         continue;
                     }
+#endif
                     /* fall-through, will trigger bad_usage() later on */
                 }
 
@@ -609,7 +721,7 @@
                          /* Decoding */
                     case 'd':
 #ifndef ZSTD_NOBENCH
-                            BMK_setDecodeOnlyMode(1);
+                            benchParams.mode = BMK_decodeOnly;
                             if (operation==zom_bench) { argument++; break; }  /* benchmark decode (hidden option) */
 #endif
                             operation=zom_decompress; argument++; break;
@@ -702,11 +814,19 @@
                     case 'p': argument++;
 #ifndef ZSTD_NOBENCH
                         if ((*argument>='0') && (*argument<='9')) {
-                            BMK_setAdditionalParam(readU32FromChar(&argument));
+                            benchParams.additionalParam = (int)readU32FromChar(&argument);
                         } else
 #endif
                             main_pause=1;
                         break;
+
+                        /* Select compressibility of synthetic sample */
+                    case 'P':
+                    {   argument++;
+                        compressibility = (double)readU32FromChar(&argument) / 100;
+                    }
+                    break;
+
                         /* unknown command */
                     default : CLEAN_RETURN(badusage(programName));
                     }
@@ -764,7 +884,7 @@
         DISPLAYLEVEL(3, "Note: %d physical core(s) detected \n", nbWorkers);
     }
 #else
-    (void)singleThread;
+    (void)singleThread; (void)nbWorkers;
 #endif
 
 #ifdef UTIL_HAS_CREATEFILELIST
@@ -807,23 +927,48 @@
     /* Check if benchmark is selected */
     if (operation==zom_bench) {
 #ifndef ZSTD_NOBENCH
-        BMK_setSeparateFiles(separateFiles);
-        BMK_setBlockSize(blockSize);
-        BMK_setNbWorkers(nbWorkers);
-        BMK_setRealTime(setRealTimePrio);
-        BMK_setNbSeconds(bench_nbSeconds);
-        BMK_setLdmFlag(ldmFlag);
-        BMK_setLdmMinMatch(g_ldmMinMatch);
-        BMK_setLdmHashLog(g_ldmHashLog);
+        benchParams.blockSize = blockSize;
+        benchParams.nbWorkers = nbWorkers;
+        benchParams.realTime = setRealTimePrio;
+        benchParams.nbSeconds = bench_nbSeconds;
+        benchParams.ldmFlag = ldmFlag;
+        benchParams.ldmMinMatch = g_ldmMinMatch;
+        benchParams.ldmHashLog = g_ldmHashLog;
         if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) {
-            BMK_setLdmBucketSizeLog(g_ldmBucketSizeLog);
+            benchParams.ldmBucketSizeLog = g_ldmBucketSizeLog;
         }
         if (g_ldmHashEveryLog != LDM_PARAM_DEFAULT) {
-            BMK_setLdmHashEveryLog(g_ldmHashEveryLog);
+            benchParams.ldmHashEveryLog = g_ldmHashEveryLog;
         }
-        BMK_benchFiles(filenameTable, filenameIdx, dictFileName, cLevel, cLevelLast, &compressionParams, g_displayLevel);
+
+        if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
+        if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
+        if (cLevelLast < cLevel) cLevelLast = cLevel;
+        if (cLevelLast > cLevel)
+            DISPLAYLEVEL(3, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
+        if(filenameIdx) {
+            if(separateFiles) {
+                unsigned i;
+                for(i = 0; i < filenameIdx; i++) {
+                    int c;
+                    DISPLAYLEVEL(3, "Benchmarking %s \n", filenameTable[i]);
+                    for(c = cLevel; c <= cLevelLast; c++) {
+                        BMK_benchFilesAdvanced(&filenameTable[i], 1, dictFileName, c, &compressionParams, g_displayLevel, &benchParams);
+                    }
+                }
+            } else {
+                for(; cLevel <= cLevelLast; cLevel++) {
+                    BMK_benchFilesAdvanced(filenameTable, filenameIdx, dictFileName, cLevel, &compressionParams, g_displayLevel, &benchParams);
+                }
+            }
+        } else {
+            for(; cLevel <= cLevelLast; cLevel++) {
+                BMK_syntheticTest(cLevel, compressibility, &compressionParams, g_displayLevel, &benchParams);
+            }
+        }
+
 #else
-        (void)bench_nbSeconds; (void)blockSize; (void)setRealTimePrio; (void)separateFiles;
+        (void)bench_nbSeconds; (void)blockSize; (void)setRealTimePrio; (void)separateFiles; (void)compressibility;
 #endif
         goto _end;
     }
@@ -835,18 +980,27 @@
         zParams.compressionLevel = dictCLevel;
         zParams.notificationLevel = g_displayLevel;
         zParams.dictID = dictID;
-        if (cover) {
+        if (dict == cover) {
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = nbWorkers;
             coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, NULL, optimize);
+        } else if (dict == fastCover) {
+            int const optimize = !fastCoverParams.k || !fastCoverParams.d;
+            fastCoverParams.nbThreads = nbWorkers;
+            fastCoverParams.zParams = zParams;
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, NULL, &fastCoverParams, optimize);
         } else {
             ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.selectivityLevel = dictSelect;
             dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, NULL, 0);
         }
+#else
+        (void)dictCLevel; (void)dictSelect; (void)dictID;  (void)maxDictSize; /* not used when ZSTD_NODICT set */
+        DISPLAYLEVEL(1, "training mode not available \n");
+        operationResult = 1;
 #endif
         goto _end;
     }
@@ -889,24 +1043,25 @@
 #ifndef ZSTD_NOCOMPRESS
         FIO_setNbWorkers(nbWorkers);
         FIO_setBlockSize((U32)blockSize);
+        if (g_overlapLog!=OVERLAP_LOG_DEFAULT) FIO_setOverlapLog(g_overlapLog);
         FIO_setLdmFlag(ldmFlag);
         FIO_setLdmHashLog(g_ldmHashLog);
         FIO_setLdmMinMatch(g_ldmMinMatch);
-        if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) {
-            FIO_setLdmBucketSizeLog(g_ldmBucketSizeLog);
-        }
-        if (g_ldmHashEveryLog != LDM_PARAM_DEFAULT) {
-            FIO_setLdmHashEveryLog(g_ldmHashEveryLog);
-        }
+        if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) FIO_setLdmBucketSizeLog(g_ldmBucketSizeLog);
+        if (g_ldmHashEveryLog != LDM_PARAM_DEFAULT) FIO_setLdmHashEveryLog(g_ldmHashEveryLog);
+        FIO_setAdaptiveMode(adapt);
+        FIO_setAdaptMin(adaptMin);
+        FIO_setAdaptMax(adaptMax);
+        if (adaptMin > cLevel) cLevel = adaptMin;
+        if (adaptMax < cLevel) cLevel = adaptMax;
 
-        if (g_overlapLog!=OVERLAP_LOG_DEFAULT) FIO_setOverlapLog(g_overlapLog);
         if ((filenameIdx==1) && outFileName)
-          operationResult = FIO_compressFilename(outFileName, filenameTable[0], dictFileName, cLevel, &compressionParams);
+          operationResult = FIO_compressFilename(outFileName, filenameTable[0], dictFileName, cLevel, compressionParams);
         else
-          operationResult = FIO_compressMultipleFilenames(filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, &compressionParams);
+          operationResult = FIO_compressMultipleFilenames(filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams);
 #else
-        (void)suffix;
-        DISPLAY("Compression not supported\n");
+        (void)suffix; (void)adapt; (void)ultra; (void)cLevel; (void)ldmFlag; /* not used when ZSTD_NOCOMPRESS set */
+        DISPLAY("Compression not supported \n");
 #endif
     } else {  /* decompression or test */
 #ifndef ZSTD_NODECOMPRESS
@@ -923,7 +1078,7 @@
         else
             operationResult = FIO_decompressMultipleFilenames(filenameTable, filenameIdx, outFileName, dictFileName);
 #else
-        DISPLAY("Decompression not supported\n");
+        DISPLAY("Decompression not supported \n");
 #endif
     }
 
diff --git a/tests/.gitignore b/tests/.gitignore
index 4911b2d..da53625 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -26,6 +26,7 @@
 checkTag
 zcat
 zstdcat
+tm
 
 # Tmp test directory
 zstdtest
diff --git a/tests/Makefile b/tests/Makefile
index 4bd43ea..2a96829 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -27,12 +27,15 @@
 DEBUGFLAGS  = -g -DDEBUGLEVEL=$(DEBUGLEVEL)
 CPPFLAGS   += -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress \
               -I$(ZSTDDIR)/dictBuilder -I$(ZSTDDIR)/deprecated -I$(PRGDIR)
+ifeq ($(OS),Windows_NT)   # MinGW assumed
+CPPFLAGS   += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
+endif
 CFLAGS     ?= -O3
 CFLAGS     += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow                 \
               -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
               -Wstrict-prototypes -Wundef -Wformat-security                   \
               -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings      \
-              -Wredundant-decls
+              -Wredundant-decls -Wmissing-prototypes
 CFLAGS     += $(DEBUGFLAGS) $(MOREFLAGS)
 FLAGS       = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
 
@@ -78,7 +81,8 @@
 default: fullbench
 	@echo $(ZSTDMT_OBJECTS)
 
-all: fullbench fuzzer zstreamtest paramgrill datagen decodecorpus roundTripCrash
+all: fullbench fuzzer zstreamtest paramgrill datagen decodecorpus roundTripCrash \
+     fullbench-lib
 
 all32: fullbench32 fuzzer32 zstreamtest32
 
@@ -88,13 +92,8 @@
 
 dll: fuzzer-dll zstreamtest-dll
 
-zstd:
-	$(MAKE) -C $(PRGDIR) $@ MOREFLAGS+="$(DEBUGFLAGS)"
-
-zstd32:
-	$(MAKE) -C $(PRGDIR) $@ MOREFLAGS+="$(DEBUGFLAGS)"
-
-zstd-nolegacy:
+.PHONY: zstd zstd32 zstd-nolegacy  # must be phony: only the external makefile knows how to build them, or whether they need an update
+zstd zstd32 zstd-nolegacy:
 	$(MAKE) -C $(PRGDIR) $@ MOREFLAGS+="$(DEBUGFLAGS)"
 
 gzstd:
@@ -131,13 +130,14 @@
 fullbench32: CPPFLAGS += -m32
 fullbench fullbench32 : CPPFLAGS += $(MULTITHREAD_CPP)
 fullbench fullbench32 : LDFLAGS += $(MULTITHREAD_LD)
-fullbench fullbench32 : DEBUGFLAGS =   # turn off assert() for speed measurements
+fullbench fullbench32 : DEBUGFLAGS = -DNDEBUG  # turn off assert() for speed measurements
 fullbench fullbench32 : $(ZSTD_FILES)
-fullbench fullbench32 : $(PRGDIR)/datagen.c fullbench.c
+fullbench fullbench32 : $(PRGDIR)/datagen.c $(PRGDIR)/bench.c fullbench.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)
 
+fullbench-lib : CPPFLAGS += -DXXH_NAMESPACE=ZSTD_
 fullbench-lib : zstd-staticLib
-fullbench-lib : $(PRGDIR)/datagen.c fullbench.c
+fullbench-lib : $(PRGDIR)/datagen.c $(PRGDIR)/bench.c fullbench.c
 	$(CC) $(FLAGS) $(filter %.c,$^) -o $@$(EXT) $(ZSTDDIR)/libzstd.a
 
 # note : broken : requires unavailable symbols
@@ -202,7 +202,7 @@
 zstreamtest-dll : $(ZSTREAM_LOCAL_FILES)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)
 
-paramgrill : DEBUGFLAGS =   # turn off assert() for speed measurements
+paramgrill : DEBUGFLAGS =  # turn off assert() by default for speed measurements
 paramgrill : $(ZSTD_FILES) $(PRGDIR)/bench.c $(PRGDIR)/datagen.c paramgrill.c
 	$(CC) $(FLAGS) $^ -lm -o $@$(EXT)
 
@@ -245,13 +245,14 @@
 
 clean:
 	$(MAKE) -C $(ZSTDDIR) clean
+	$(MAKE) -C $(PRGDIR) clean
 	@$(RM) -fR $(TESTARTEFACT)
 	@$(RM) -f core *.o tmp* result* *.gcda dictionary *.zst \
         $(PRGDIR)/zstd$(EXT) $(PRGDIR)/zstd32$(EXT) \
         fullbench$(EXT) fullbench32$(EXT) \
         fullbench-lib$(EXT) fullbench-dll$(EXT) \
         fuzzer$(EXT) fuzzer32$(EXT) zbufftest$(EXT) zbufftest32$(EXT) \
-        fuzzer-dll$(EXT) zstreamtest-dll$(EXT) zbufftest-dll$(EXT)\
+        fuzzer-dll$(EXT) zstreamtest-dll$(EXT) zbufftest-dll$(EXT) \
         zstreamtest$(EXT) zstreamtest32$(EXT) \
         datagen$(EXT) paramgrill$(EXT) roundTripCrash$(EXT) longmatch$(EXT) \
         symbols$(EXT) invalidDictionaries$(EXT) legacy$(EXT) poolTests$(EXT) \
@@ -301,11 +302,6 @@
 list:
 	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
 
-.PHONY: zstd-playTests
-zstd-playTests: datagen
-	file $(ZSTD)
-	ZSTD="$(QEMU_SYS) $(ZSTD)" ./playTests.sh $(ZSTDRTTEST)
-
 .PHONY: shortest
 shortest: ZSTDRTTEST=
 shortest: test-zstd
@@ -323,14 +319,21 @@
 
 test-all: test test32 valgrindTest test-decodecorpus-cli
 
+
+.PHONY: test-zstd test-zstd32 test-zstd-nolegacy
 test-zstd: ZSTD = $(PRGDIR)/zstd
-test-zstd: zstd zstd-playTests
+test-zstd: zstd
 
 test-zstd32: ZSTD = $(PRGDIR)/zstd32
-test-zstd32: zstd32 zstd-playTests
+test-zstd32: zstd32
 
 test-zstd-nolegacy: ZSTD = $(PRGDIR)/zstd-nolegacy
-test-zstd-nolegacy: zstd-nolegacy zstd-playTests
+test-zstd-nolegacy: zstd-nolegacy
+
+test-zstd test-zstd32 test-zstd-nolegacy: datagen
+	file $(ZSTD)
+	ZSTD="$(QEMU_SYS) $(ZSTD)" ./playTests.sh $(ZSTDRTTEST)
+
 
 test-gzstd: gzstd
 	$(PRGDIR)/zstd -f README.md test-zstd-speed.py
diff --git a/tests/README.md b/tests/README.md
index 24a28ab..f28766b 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -88,3 +88,56 @@
 will choose a random seed, and for 1 minute,
 generate random test frames and ensure that the
 zstd library correctly decompresses them in both simple and streaming modes.
+
+#### `paramgrill` - tool for generating compression table parameters and optimizing parameters on file given constraints
+
+Full list of arguments
+```
+ -T#          : set level 1 speed objective
+ -B#          : cut input into blocks of size # (default : single block)
+ -S           : benchmarks a single run (example command: -Sl3w10h12)
+    w# - windowLog
+    h# - hashLog
+    c# - chainLog
+    s# - searchLog
+    l# - searchLength
+    t# - targetLength
+    S# - strategy
+    L# - level
+ --zstd=      : Single run, parameter selection syntax same as zstdcli with more parameters
+                    (Added forceAttachDictionary / fadt) 
+                    When invoked with --optimize, this represents the sample to exceed. 
+ --optimize=  : find parameters to maximize compression ratio given parameters
+                    Can use all --zstd= commands to constrain the type of solution found in addition to the following constraints
+    cSpeed=   : Minimum compression speed
+    dSpeed=   : Minimum decompression speed
+    cMem=     : Maximum compression memory
+    lvl=      : Searches for solutions which are strictly better than that compression level in both ratio and cSpeed
+    stc=      : When invoked with lvl=, represents percentage slack in ratio/cSpeed allowed for a solution to be considered (Default 100%)
+              : In normal operation, represents percentage slack in choosing viable starting strategy selection in choosing the default parameters
+                    (Lower value will begin with stronger strategies) (Default 90%)
+    speedRatio=   (accepts decimals)
+              : determines value of gains in speed vs gains in ratio
+                    when determining overall winner (default 5 (1% ratio = 5% speed)).
+    tries=    : Maximum number of random restarts on a single strategy before switching (Default 5)
+                    Higher values will make optimizer run longer, more chances to find better solution.
+    memLog    : Limits the log of the size of each memotable (1 per strategy). Will use hash tables when state space is larger than max size. 
+                    Setting memLog = 0 turns off memoization 
+ --display=   : specify which parameters are included in the output
+                    can use all --zstd parameter names and 'cParams' as a shorthand for all parameters used in ZSTD_compressionParameters 
+                    (Default: display all params available)
+ -P#          : generated sample compressibility (when no file is provided)
+ -t#          : Caps runtime of operation in seconds (default : 99999 seconds (about 28 hours))
+ -v           : Prints Benchmarking output
+ -D           : Next argument dictionary file
+ -s           : Benchmark all files separately
+ -q           : Quiet, repeat for more quiet
+                  -q Prints parameters + results whenever a new best is found
+                  -qq Only prints parameters whenever a new best is found, prints final parameters + results
+                  -qqq Only print final parameters + results
+                  -qqqq Only prints final parameter set in the form --zstd=
+ -v           : Verbose, cancels quiet, repeat for more volume
+                  -v Prints all candidate parameters and results
+
+```
+ Any inputs afterwards are treated as files to benchmark.
diff --git a/tests/decodecorpus.c b/tests/decodecorpus.c
index 936d307..2c22760 100644
--- a/tests/decodecorpus.c
+++ b/tests/decodecorpus.c
@@ -620,6 +620,8 @@
 }
 
 static inline void initSeqStore(seqStore_t *seqStore) {
+    seqStore->maxNbSeq = MAX_NB_SEQ;
+    seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX;
     seqStore->sequencesStart = SEQUENCE_BUFFER;
     seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
     seqStore->llCode = SEQUENCE_LLCODE;
diff --git a/tests/fullbench.c b/tests/fullbench.c
index 6abdd4d..b05f153 100644
--- a/tests/fullbench.c
+++ b/tests/fullbench.c
@@ -30,6 +30,7 @@
 #include "zstd.h"        /* ZSTD_versionString */
 #include "util.h"        /* time functions */
 #include "datagen.h"
+#include "bench.h"       /* CustomBench*/
 
 
 /*_************************************
@@ -45,9 +46,13 @@
 #define KNUTH      2654435761U
 #define MAX_MEM    (1984 MB)
 
+#define DEFAULT_CLEVEL 1
+
 #define COMPRESSIBILITY_DEFAULT 0.50
 static const size_t g_sampleSize = 10000000;
 
+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
+
 
 /*_************************************
 *  Macros
@@ -93,14 +98,26 @@
 /*_*******************************************************
 *  Benchmark wrappers
 *********************************************************/
-size_t local_ZSTD_compress(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
+
+static ZSTD_CCtx* g_zcc = NULL;
+
+static size_t
+local_ZSTD_compress(const void* src, size_t srcSize,
+                    void* dst, size_t dstSize,
+                    void* buff2)
 {
-    (void)buff2;
-    return ZSTD_compress(dst, dstSize, src, srcSize, 1);
+    ZSTD_parameters p;
+    ZSTD_frameParameters f = { 1 /* contentSizeHeader*/, 0, 0 };
+    p.fParams = f;
+    p.cParams = *(ZSTD_compressionParameters*)buff2;
+    return ZSTD_compress_advanced (g_zcc, dst, dstSize, src, srcSize, NULL ,0, p);
+    //return ZSTD_compress(dst, dstSize, src, srcSize, cLevel);
 }
 
 static size_t g_cSize = 0;
-size_t local_ZSTD_decompress(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
+static size_t local_ZSTD_decompress(const void* src, size_t srcSize,
+                                    void* dst, size_t dstSize,
+                                    void* buff2)
 {
     (void)src; (void)srcSize;
     return ZSTD_decompress(dst, dstSize, buff2, g_cSize);
@@ -110,14 +127,14 @@
 
 #ifndef ZSTD_DLL_IMPORT
 extern size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
-size_t local_ZSTD_decodeLiteralsBlock(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
+static size_t local_ZSTD_decodeLiteralsBlock(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
 {
     (void)src; (void)srcSize; (void)dst; (void)dstSize;
     return ZSTD_decodeLiteralsBlock((ZSTD_DCtx*)g_zdc, buff2, g_cSize);
 }
 
 extern size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeq, const void* src, size_t srcSize);
-size_t local_ZSTD_decodeSeqHeaders(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
+static size_t local_ZSTD_decodeSeqHeaders(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
 {
     int nbSeq;
     (void)src; (void)srcSize; (void)dst; (void)dstSize;
@@ -126,12 +143,18 @@
 #endif
 
 static ZSTD_CStream* g_cstream= NULL;
-size_t local_ZSTD_compressStream(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t
+local_ZSTD_compressStream(const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity,
+                          void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
-    (void)buff2;
-    ZSTD_initCStream(g_cstream, 1);
+    ZSTD_parameters p;
+    ZSTD_frameParameters f = {1 /* contentSizeHeader*/, 0, 0};
+    p.fParams = f;
+    p.cParams = *(ZSTD_compressionParameters*)buff2;
+    ZSTD_initCStream_advanced(g_cstream, NULL, 0, p, ZSTD_CONTENTSIZE_UNKNOWN);
     buffOut.dst = dst;
     buffOut.size = dstCapacity;
     buffOut.pos = 0;
@@ -143,12 +166,14 @@
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_end(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t
+local_ZSTD_compress_generic_end(const void* src, size_t srcSize,
+                                void* dst, size_t dstCapacity,
+                                void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
     (void)buff2;
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, 1);
     buffOut.dst = dst;
     buffOut.size = dstCapacity;
     buffOut.pos = 0;
@@ -159,12 +184,14 @@
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_continue(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t
+local_ZSTD_compress_generic_continue(const void* src, size_t srcSize,
+                                     void* dst, size_t dstCapacity,
+                                     void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
     (void)buff2;
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, 1);
     buffOut.dst = dst;
     buffOut.size = dstCapacity;
     buffOut.pos = 0;
@@ -176,12 +203,14 @@
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_T2_end(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t
+local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize,
+                                   void* dst, size_t dstCapacity,
+                                   void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
     (void)buff2;
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, 1);
     ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_nbWorkers, 2);
     buffOut.dst = dst;
     buffOut.size = dstCapacity;
@@ -193,12 +222,14 @@
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_T2_continue(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t
+local_ZSTD_compress_generic_T2_continue(const void* src, size_t srcSize,
+                                        void* dst, size_t dstCapacity,
+                                        void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
     (void)buff2;
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, 1);
     ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_nbWorkers, 2);
     buffOut.dst = dst;
     buffOut.size = dstCapacity;
@@ -212,7 +243,10 @@
 }
 
 static ZSTD_DStream* g_dstream= NULL;
-static size_t local_ZSTD_decompressStream(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t
+local_ZSTD_decompressStream(const void* src, size_t srcSize,
+                            void* dst, size_t dstCapacity,
+                            void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -228,34 +262,52 @@
     return buffOut.pos;
 }
 
-static ZSTD_CCtx* g_zcc = NULL;
-
 #ifndef ZSTD_DLL_IMPORT
-size_t local_ZSTD_compressContinue(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t local_ZSTD_compressContinue(const void* src, size_t srcSize,
+                                          void* dst, size_t dstCapacity,
+                                          void* buff2)
 {
-    (void)buff2;
-    ZSTD_compressBegin(g_zcc, 1 /* compressionLevel */);
+    ZSTD_parameters p;
+    ZSTD_frameParameters f = { 1 /* contentSizeHeader*/, 0, 0 };
+    p.fParams = f;
+    p.cParams = *(ZSTD_compressionParameters*)buff2;
+    ZSTD_compressBegin_advanced(g_zcc, NULL, 0, p, srcSize);
     return ZSTD_compressEnd(g_zcc, dst, dstCapacity, src, srcSize);
 }
 
 #define FIRST_BLOCK_SIZE 8
-size_t local_ZSTD_compressContinue_extDict(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t local_ZSTD_compressContinue_extDict(const void* src, size_t srcSize,
+                                                  void* dst, size_t dstCapacity,
+                                                  void* buff2)
 {
     BYTE firstBlockBuf[FIRST_BLOCK_SIZE];
 
-    (void)buff2;
+    ZSTD_parameters p;
+    ZSTD_frameParameters f = { 1, 0, 0 };
+    p.fParams = f;
+    p.cParams = *(ZSTD_compressionParameters*)buff2;
+    ZSTD_compressBegin_advanced(g_zcc, NULL, 0, p, srcSize);
     memcpy(firstBlockBuf, src, FIRST_BLOCK_SIZE);
-    ZSTD_compressBegin(g_zcc, 1);
 
-    {   size_t const compressResult = ZSTD_compressContinue(g_zcc, dst, dstCapacity, firstBlockBuf, FIRST_BLOCK_SIZE);
-        if (ZSTD_isError(compressResult)) { DISPLAY("local_ZSTD_compressContinue_extDict error : %s\n", ZSTD_getErrorName(compressResult)); return compressResult; }
+    {   size_t const compressResult = ZSTD_compressContinue(g_zcc,
+                                            dst, dstCapacity,
+                                            firstBlockBuf, FIRST_BLOCK_SIZE);
+        if (ZSTD_isError(compressResult)) {
+            DISPLAY("local_ZSTD_compressContinue_extDict error : %s\n",
+                    ZSTD_getErrorName(compressResult));
+            return compressResult;
+        }
         dst = (BYTE*)dst + compressResult;
         dstCapacity -= compressResult;
     }
-    return ZSTD_compressEnd(g_zcc, dst, dstCapacity, (const BYTE*)src + FIRST_BLOCK_SIZE, srcSize - FIRST_BLOCK_SIZE);
+    return ZSTD_compressEnd(g_zcc, dst, dstCapacity,
+                            (const BYTE*)src + FIRST_BLOCK_SIZE,
+                            srcSize - FIRST_BLOCK_SIZE);
 }
 
-size_t local_ZSTD_decompressContinue(void* dst, size_t dstCapacity, void* buff2, const void* src, size_t srcSize)
+static size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize,
+                                            void* dst, size_t dstCapacity,
+                                            void* buff2)
 {
     size_t regeneratedSize = 0;
     const BYTE* ip = (const BYTE*)buff2;
@@ -263,7 +315,7 @@
     BYTE* op = (BYTE*)dst;
     size_t remainingCapacity = dstCapacity;
 
-    (void)src; (void)srcSize;
+    (void)src; (void)srcSize;  /* unused */
     ZSTD_decompressBegin(g_zdc);
     while (ip < iend) {
         size_t const iSize = ZSTD_nextSrcSizeToDecompress(g_zdc);
@@ -282,27 +334,30 @@
 /*_*******************************************************
 *  Bench functions
 *********************************************************/
-static size_t benchMem(const void* src, size_t srcSize, U32 benchNb)
+static size_t benchMem(U32 benchNb,
+                       const void* src, size_t srcSize,
+                       int cLevel, ZSTD_compressionParameters cparams)
 {
+    size_t dstBuffSize = ZSTD_compressBound(srcSize);
     BYTE*  dstBuff;
-    size_t const dstBuffSize = ZSTD_compressBound(srcSize);
+    void*  dstBuff2;
     void*  buff2;
     const char* benchName;
-    size_t (*benchFunction)(void* dst, size_t dstSize, void* verifBuff, const void* src, size_t srcSize);
-    double bestTime = 100000000.;
+    BMK_benchFn_t benchFunction;
+    int errorcode = 0;
 
     /* Selection */
     switch(benchNb)
     {
     case 1:
-        benchFunction = local_ZSTD_compress; benchName = "compress(1)";
+        benchFunction = local_ZSTD_compress; benchName = "compress";
         break;
     case 2:
         benchFunction = local_ZSTD_decompress; benchName = "decompress";
         break;
 #ifndef ZSTD_DLL_IMPORT
     case 11:
-        benchFunction = local_ZSTD_compressContinue; benchName = "compressContinue(1)";
+        benchFunction = local_ZSTD_compressContinue; benchName = "compressContinue";
         break;
     case 12:
         benchFunction = local_ZSTD_compressContinue_extDict; benchName = "compressContinue_extDict";
@@ -318,7 +373,7 @@
         break;
 #endif
     case 41:
-        benchFunction = local_ZSTD_compressStream; benchName = "compressStream(1)";
+        benchFunction = local_ZSTD_compressStream; benchName = "compressStream";
         break;
     case 42:
         benchFunction = local_ZSTD_decompressStream; benchName = "decompressStream";
@@ -341,32 +396,65 @@
 
     /* Allocation */
     dstBuff = (BYTE*)malloc(dstBuffSize);
-    buff2 = malloc(dstBuffSize);
-    if ((!dstBuff) || (!buff2)) {
+    dstBuff2 = malloc(dstBuffSize);
+    if ((!dstBuff) || (!dstBuff2)) {
         DISPLAY("\nError: not enough memory!\n");
-        free(dstBuff); free(buff2);
+        free(dstBuff); free(dstBuff2);
         return 12;
     }
+    buff2 = dstBuff2;
     if (g_zcc==NULL) g_zcc = ZSTD_createCCtx();
     if (g_zdc==NULL) g_zdc = ZSTD_createDCtx();
     if (g_cstream==NULL) g_cstream = ZSTD_createCStream();
     if (g_dstream==NULL) g_dstream = ZSTD_createDStream();
 
+    /* DISPLAY("params: cLevel %d, wlog %d hlog %d clog %d slog %d slen %d tlen %d strat %d \n",
+          cLevel, cparams->windowLog, cparams->hashLog, cparams->chainLog, cparams->searchLog,
+          cparams->searchLength, cparams->targetLength, cparams->strategy); */
+
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionLevel, cLevel);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_windowLog, cparams.windowLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_hashLog, cparams.hashLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_chainLog, cparams.chainLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_searchLog, cparams.searchLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_minMatch, cparams.searchLength);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_targetLength, cparams.targetLength);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionStrategy, cparams.strategy);
+
+
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, cLevel);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_windowLog, cparams.windowLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_hashLog, cparams.hashLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_chainLog, cparams.chainLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_searchLog, cparams.searchLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_minMatch, cparams.searchLength);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_targetLength, cparams.targetLength);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionStrategy, cparams.strategy);
+
     /* Preparation */
     switch(benchNb)
     {
+    case 1:
+        buff2 = &cparams;
+        break;
     case 2:
-        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, 1);
+        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
         break;
 #ifndef ZSTD_DLL_IMPORT
+    case 11:
+        buff2 = &cparams;
+        break;
+    case 12:
+        buff2 = &cparams;
+        break;
     case 13 :
-        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, 1);
+        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
         break;
     case 31:  /* ZSTD_decodeLiteralsBlock */
         {   blockProperties_t bp;
             ZSTD_frameHeader zfp;
             size_t frameHeaderSize, skippedSize;
-            g_cSize = ZSTD_compress(dstBuff, dstBuffSize, src, srcSize, 1);
+            g_cSize = ZSTD_compress(dstBuff, dstBuffSize, src, srcSize, cLevel);
             frameHeaderSize = ZSTD_getFrameHeader(&zfp, dstBuff, ZSTD_frameHeaderSize_min);
             if (frameHeaderSize==0) frameHeaderSize = ZSTD_frameHeaderSize_min;
             ZSTD_getcBlockSize(dstBuff+frameHeaderSize, dstBuffSize, &bp);  /* Get 1st block type */
@@ -386,8 +474,8 @@
             const BYTE* ip = dstBuff;
             const BYTE* iend;
             size_t frameHeaderSize, cBlockSize;
-            ZSTD_compress(dstBuff, dstBuffSize, src, srcSize, 1);   /* it would be better to use direct block compression here */
-            g_cSize = ZSTD_compress(dstBuff, dstBuffSize, src, srcSize, 1);
+            ZSTD_compress(dstBuff, dstBuffSize, src, srcSize, cLevel);   /* it would be better to use direct block compression here */
+            g_cSize = ZSTD_compress(dstBuff, dstBuffSize, src, srcSize, cLevel);
             frameHeaderSize = ZSTD_getFrameHeader(&zfp, dstBuff, ZSTD_frameHeaderSize_min);
             if (frameHeaderSize==0) frameHeaderSize = ZSTD_frameHeaderSize_min;
             ip += frameHeaderSize;   /* Skip frame Header */
@@ -409,8 +497,11 @@
     case 31:
         goto _cleanOut;
 #endif
+    case 41 :
+        buff2 = &cparams;
+        break;
     case 42 :
-        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, 1);
+        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
         break;
 
     /* test functions */
@@ -419,138 +510,190 @@
     default : ;
     }
 
-     /* warming up memory */
+     /* warming up dstBuff */
     { size_t i; for (i=0; i<dstBuffSize; i++) dstBuff[i]=(BYTE)i; }
 
     /* benchmark loop */
-    {   U32 loopNb;
-        U32 nbRounds = (U32)((50 MB) / (srcSize+1)) + 1;   /* initial conservative speed estimate */
-#       define TIME_SEC_MICROSEC    (1*1000000ULL) /* 1 second */
-#       define TIME_SEC_NANOSEC     (1*1000000000ULL) /* 1 second */
-        DISPLAY("%2i- %-30.30s : \r", benchNb, benchName);
-        for (loopNb = 1; loopNb <= g_nbIterations; loopNb++) {
-            UTIL_time_t clockStart;
-            size_t benchResult=0;
-            U32 roundNb;
+    {   BMK_timedFnState_t* const tfs = BMK_createTimedFnState(g_nbIterations * 1000, 1000);
+        BMK_runTime_t bestResult;
+        bestResult.sumOfReturn = 0;
+        bestResult.nanoSecPerRun = (unsigned long long)(-1LL);
+        assert(tfs != NULL);
+        for (;;) {
+            void* const dstBuffv = dstBuff;
+            BMK_runOutcome_t const bOutcome =
+                    BMK_benchTimedFn( tfs,
+                            benchFunction, buff2,
+                            NULL, NULL,   /* initFn */
+                            1,  /* blockCount */
+                            &src, &srcSize,
+                            &dstBuffv, &dstBuffSize,
+                            NULL);
 
-            UTIL_sleepMilli(5);  /* give processor time to other processes */
-            UTIL_waitForNextTick();
-            clockStart = UTIL_getTime();
-            for (roundNb=0; roundNb < nbRounds; roundNb++) {
-                benchResult = benchFunction(dstBuff, dstBuffSize, buff2, src, srcSize);
-                if (ZSTD_isError(benchResult)) {
-                    DISPLAY("ERROR ! %s() => %s !! \n", benchName, ZSTD_getErrorName(benchResult));
-                    exit(1);
-            }   }
-            {   U64 const clockSpanNano = UTIL_clockSpanNano(clockStart);
-                double const averageTime = (double)clockSpanNano / TIME_SEC_NANOSEC / nbRounds;
-                if (clockSpanNano > 0) {
-                    if (averageTime < bestTime) bestTime = averageTime;
-                    assert(bestTime > (1./2000000000));
-                    nbRounds = (U32)(1. / bestTime);   /* aim for 1 sec */
-                    DISPLAY("%2i- %-30.30s : %7.1f MB/s  (%9u)\r",
-                            loopNb, benchName,
-                            (double)srcSize / (1 MB) / bestTime,
-                            (U32)benchResult);
-                } else {
-                    assert(nbRounds < 40000000);  /* avoid overflow */
-                    nbRounds *= 100;
-                }
-    }   }   }
-    DISPLAY("%2u\n", benchNb);
+            if (!BMK_isSuccessful_runOutcome(bOutcome)) {
+                DISPLAY("ERROR benchmarking function ! ! \n");
+                BMK_freeTimedFnState(tfs);   /* the goto below skips the regular free after the loop: release the timer state here */
+                errorcode = 1; goto _cleanOut;
+            }
+
+            {   BMK_runTime_t const newResult = BMK_extract_runTime(bOutcome);
+                if (newResult.nanoSecPerRun < bestResult.nanoSecPerRun )
+                    bestResult.nanoSecPerRun = newResult.nanoSecPerRun;
+                DISPLAY("\r%2u#%-29.29s:%8.1f MB/s  (%8u) ",
+                        benchNb, benchName,
+                        (double)srcSize * TIMELOOP_NANOSEC / bestResult.nanoSecPerRun / MB_UNIT,
+                        (unsigned)newResult.sumOfReturn );
+            }
+
+            if ( BMK_isCompleted_TimedFn(tfs) ) break;
+        }
+        BMK_freeTimedFnState(tfs);
+    }
+    DISPLAY("\n");
 
 _cleanOut:
     free(dstBuff);
-    free(buff2);
+    free(dstBuff2);
     ZSTD_freeCCtx(g_zcc); g_zcc=NULL;
     ZSTD_freeDCtx(g_zdc); g_zdc=NULL;
     ZSTD_freeCStream(g_cstream); g_cstream=NULL;
     ZSTD_freeDStream(g_dstream); g_dstream=NULL;
-    return 0;
+    return errorcode;
 }
 
 
-static int benchSample(U32 benchNb)
+static int benchSample(U32 benchNb,
+                       int cLevel, ZSTD_compressionParameters cparams)
 {
     size_t const benchedSize = g_sampleSize;
-    const char* name = "Sample 10MiB";
+    const char* const name = "Sample 10MiB";
 
     /* Allocation */
-    void* origBuff = malloc(benchedSize);
+    void* const origBuff = malloc(benchedSize);
     if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); return 12; }
 
     /* Fill buffer */
     RDG_genBuffer(origBuff, benchedSize, g_compressibility, 0.0, 0);
 
     /* bench */
-    DISPLAY("\r%79s\r", "");
+    DISPLAY("\r%70s\r", "");
     DISPLAY(" %s : \n", name);
-    if (benchNb)
-        benchMem(origBuff, benchedSize, benchNb);
-    else
-        for (benchNb=0; benchNb<100; benchNb++) benchMem(origBuff, benchedSize, benchNb);
+    if (benchNb) {
+        benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+    } else {  /* 0 == run all tests */
+        for (benchNb=0; benchNb<100; benchNb++) {
+            benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+    }   }
 
     free(origBuff);
     return 0;
 }
 
 
-static int benchFiles(const char** fileNamesTable, const int nbFiles, U32 benchNb)
+static int benchFiles(U32 benchNb,
+                      const char** fileNamesTable, const int nbFiles,
+                      int cLevel, ZSTD_compressionParameters cparams)
 {
     /* Loop for each file */
     int fileIdx;
     for (fileIdx=0; fileIdx<nbFiles; fileIdx++) {
         const char* const inFileName = fileNamesTable[fileIdx];
         FILE* const inFile = fopen( inFileName, "rb" );
-        U64   inFileSize;
         size_t benchedSize;
-        void* origBuff;
 
         /* Check file existence */
         if (inFile==NULL) { DISPLAY( "Pb opening %s\n", inFileName); return 11; }
 
         /* Memory allocation & restrictions */
-        inFileSize = UTIL_getFileSize(inFileName);
-        if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
-            DISPLAY( "Cannot measure size of %s\n", inFileName);
-            fclose(inFile);
-            return 11;
-        }
-        benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
-        if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
-        if (benchedSize < inFileSize)
-            DISPLAY("Not enough memory for '%s' full size; testing %u MB only...\n", inFileName, (U32)(benchedSize>>20));
-
-        /* Alloc */
-        origBuff = malloc(benchedSize);
-        if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); fclose(inFile); return 12; }
-
-        /* Fill input buffer */
-        DISPLAY("Loading %s...       \r", inFileName);
-        {
-            size_t readSize = fread(origBuff, 1, benchedSize, inFile);
-            fclose(inFile);
-            if (readSize != benchedSize) {
-                DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
-                free(origBuff);
-                return 13;
+        {   U64 const inFileSize = UTIL_getFileSize(inFileName);
+            if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
+                DISPLAY( "Cannot measure size of %s\n", inFileName);
+                fclose(inFile);
+                return 11;
+            }
+            benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
+            if ((U64)benchedSize > inFileSize)
+                benchedSize = (size_t)inFileSize;
+            if ((U64)benchedSize < inFileSize) {
+                DISPLAY("Not enough memory for '%s' full size; testing %u MB only... \n",
+                        inFileName, (U32)(benchedSize>>20));
         }   }
 
-        /* bench */
-        DISPLAY("\r%79s\r", "");
-        DISPLAY(" %s : \n", inFileName);
-        if (benchNb)
-            benchMem(origBuff, benchedSize, benchNb);
-        else
-            for (benchNb=0; benchNb<100; benchNb++) benchMem(origBuff, benchedSize, benchNb);
+        /* Alloc */
+        {   void* const origBuff = malloc(benchedSize);
+            if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); fclose(inFile); return 12; }
 
-        free(origBuff);
-    }
+            /* Fill input buffer */
+            DISPLAY("Loading %s...       \r", inFileName);
+            {   size_t const readSize = fread(origBuff, 1, benchedSize, inFile);
+                fclose(inFile);
+                if (readSize != benchedSize) {
+                    DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
+                    free(origBuff);
+                    return 13;
+            }   }
+
+            /* bench */
+            DISPLAY("\r%70s\r", "");   /* blank line */
+            DISPLAY(" %s : \n", inFileName);
+            if (benchNb) {
+                benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+            } else {   /* 0 == run all tests */
+                U32 fnNb;   /* local counter: benchNb must stay 0, otherwise only the first file would run all tests */
+                for (fnNb=0; fnNb<100; fnNb++) benchMem(fnNb, origBuff, benchedSize, cLevel, cparams);
+            }
+
+            free(origBuff);
+    }   }
 
     return 0;
 }
 
 
+
+/*_*******************************************************
+*  Argument Parsing
+*********************************************************/
+
+#define ERROR_OUT(msg) { DISPLAY("%s \n", msg); exit(1); }
+
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) ERROR_OUT(errorMsg);
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) ERROR_OUT(errorMsg);
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) ERROR_OUT(errorMsg);
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
+
+/*_*******************************************************
+*  Command line
+*********************************************************/
+
 static int usage(const char* exename)
 {
     DISPLAY( "Usage :\n");
@@ -567,6 +710,8 @@
     DISPLAY( " -b#    : test only function # \n");
     DISPLAY( " -i#    : iteration loops [1-9](default : %i)\n", NBLOOPS);
     DISPLAY( " -P#    : sample compressibility (default : %.1f%%)\n", COMPRESSIBILITY_DEFAULT * 100);
+    DISPLAY( " -l#    : benchmark functions at that compression level (default : %i)\n", DEFAULT_CLEVEL);
+    DISPLAY( " --zstd : custom parameter selection. Format same as zstdcli \n");
     return 0;
 }
 
@@ -579,23 +724,45 @@
 
 int main(int argc, const char** argv)
 {
-    int i, filenamesStart=0, result;
-    const char* exename = argv[0];
+    int argNb, filenamesStart=0, result;
+    const char* const exename = argv[0];
     const char* input_filename = NULL;
     U32 benchNb = 0, main_pause = 0;
+    int cLevel = DEFAULT_CLEVEL;
+    ZSTD_compressionParameters cparams = ZSTD_getCParams(cLevel, 0, 0);
 
     DISPLAY(WELCOME_MESSAGE);
     if (argc<1) return badusage(exename);
 
-    for(i=1; i<argc; i++) {
-        const char* argument = argv[i];
+    for (argNb=1; argNb<argc; argNb++) {
+        const char* argument = argv[argNb];
         assert(argument != NULL);
 
-        /* Commands (note : aggregated commands are allowed) */
-        if (argument[0]=='-') {
+        if (longCommandWArg(&argument, "--zstd=")) {
+            for ( ; ;) {
+                if (longCommandWArg(&argument, "windowLog=") || longCommandWArg(&argument, "wlog=")) { cparams.windowLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "chainLog=") || longCommandWArg(&argument, "clog=")) { cparams.chainLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "hashLog=") || longCommandWArg(&argument, "hlog=")) { cparams.hashLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "searchLog=") || longCommandWArg(&argument, "slog=")) { cparams.searchLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "searchLength=") || longCommandWArg(&argument, "slen=")) { cparams.searchLength = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "targetLength=") || longCommandWArg(&argument, "tlen=")) { cparams.targetLength = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "strategy=") || longCommandWArg(&argument, "strat=")) { cparams.strategy = (ZSTD_strategy)(readU32FromChar(&argument)); if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "level=") || longCommandWArg(&argument, "lvl=")) { cLevel = (int)readU32FromChar(&argument); cparams = ZSTD_getCParams(cLevel, 0, 0); if (argument[0]==',') { argument++; continue; } else break; }
+                DISPLAY("invalid compression parameter \n");
+                return 1;
+            }
 
-            while (argument[1]!=0) {
-                argument++;
+            /* check end of string */
+            if (argument[0] != 0) {
+                DISPLAY("invalid --zstd= format \n");
+                return 1;
+            } else {
+                continue;
+            }
+
+        } else if (argument[0]=='-') { /* Commands (note : aggregated commands are allowed) */
+            argument++;
+            while (argument[0]!=0) {
 
                 switch(argument[0])
                 {
@@ -608,33 +775,25 @@
 
                     /* Select specific algorithm to bench */
                 case 'b':
-                    benchNb = 0;
-                    while ((argument[1]>= '0') && (argument[1]<= '9')) {
-                        benchNb *= 10;
-                        benchNb += argument[1] - '0';
-                        argument++;
-                    }
+                    argument++;
+                    benchNb = readU32FromChar(&argument);
                     break;
 
                     /* Modify Nb Iterations */
                 case 'i':
-                    if ((argument[1] >='0') && (argument[1] <='9')) {
-                        int iters = argument[1] - '0';
-                        BMK_SetNbIterations(iters);
-                        argument++;
-                    }
+                    argument++;
+                    BMK_SetNbIterations((int)readU32FromChar(&argument));
                     break;
 
                     /* Select compressibility of synthetic sample */
                 case 'P':
-                    {   U32 proba32 = 0;
-                        while ((argument[1]>= '0') && (argument[1]<= '9')) {
-                            proba32 *= 10;
-                            proba32 += argument[1] - '0';
-                            argument++;
-                        }
-                        g_compressibility = (double)proba32 / 100.;
-                    }
+                    argument++;
+                    g_compressibility = (double)readU32FromChar(&argument) / 100.;
+                    break;
+                case 'l':   /* Select compression level; cparams is reset to that level's parameters */
+                    argument++;
+                    cLevel = (int)readU32FromChar(&argument);   /* explicit cast, as done for "level=" and -i# */
+                    cparams = ZSTD_getCParams(cLevel, 0, 0);
                     break;
 
                     /* Unknown command */
@@ -645,13 +804,15 @@
         }
 
         /* first provided filename is input */
-        if (!input_filename) { input_filename=argument; filenamesStart=i; continue; }
+        if (!input_filename) { input_filename=argument; filenamesStart=argNb; continue; }
     }
 
+
+
     if (filenamesStart==0)   /* no input file */
-        result = benchSample(benchNb);
+        result = benchSample(benchNb, cLevel, cparams);
     else
-        result = benchFiles(argv+filenamesStart, argc-filenamesStart, benchNb);
+        result = benchFiles(benchNb, argv+filenamesStart, argc-filenamesStart, cLevel, cparams);
 
     if (main_pause) { int unused; printf("press enter...\n"); unused = getchar(); (void)unused; }
 
diff --git a/tests/fuzz/fuzz.py b/tests/fuzz/fuzz.py
index 84c28c6..8ce293a 100755
--- a/tests/fuzz/fuzz.py
+++ b/tests/fuzz/fuzz.py
@@ -13,6 +13,7 @@
 import contextlib
 import os
 import re
+import shlex
 import shutil
 import subprocess
 import sys
@@ -349,11 +350,11 @@
     targets = args.TARGET
     cc = args.cc
     cxx = args.cxx
-    cppflags = [args.cppflags]
-    cflags = [args.cflags]
-    ldflags = [args.ldflags]
-    cxxflags = [args.cxxflags]
-    mflags = [args.mflags] if args.mflags else []
+    cppflags = shlex.split(args.cppflags)
+    cflags = shlex.split(args.cflags)
+    ldflags = shlex.split(args.ldflags)
+    cxxflags = shlex.split(args.cxxflags)
+    mflags = shlex.split(args.mflags)
     # Flags to be added to both cflags and cxxflags
     common_flags = []
 
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index f7bacd5..5616285 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -27,6 +27,7 @@
 #include <string.h>       /* strcmp */
 #include <assert.h>
 #define ZSTD_STATIC_LINKING_ONLY  /* ZSTD_compressContinue, ZSTD_compressBlock */
+#include "fse.h"
 #include "zstd.h"         /* ZSTD_VERSION_STRING */
 #include "zstd_errors.h"  /* ZSTD_getErrorCode */
 #include "zstdmt_compress.h"
@@ -71,6 +72,8 @@
 *********************************************************/
 #undef MIN
 #undef MAX
+/* Declaring the function so it isn't unused */
+void FUZ_bug976(void);
 void FUZ_bug976(void)
 {   /* these constants shall not depend on MIN() macro */
     assert(ZSTD_HASHLOG_MAX < 31);
@@ -178,13 +181,9 @@
         (U32)(count.totalMalloc >> 10));
 }
 
-static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
+static int FUZ_mallocTests_internal(unsigned seed, double compressibility, unsigned part,
+                void* inBuffer, size_t inSize, void* outBuffer, size_t outSize)
 {
-    size_t const inSize = 64 MB + 16 MB + 4 MB + 1 MB + 256 KB + 64 KB; /* 85.3 MB */
-    size_t const outSize = ZSTD_compressBound(inSize);
-    void* const inBuffer = malloc(inSize);
-    void* const outBuffer = malloc(outSize);
-
     /* test only played in verbose mode, as they are long */
     if (g_displayLevel<3) return 0;
 
@@ -269,6 +268,28 @@
     return 0;
 }
 
+static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
+{
+    size_t const inSize = 64 MB + 16 MB + 4 MB + 1 MB + 256 KB + 64 KB; /* 85.3 MB */
+    size_t const outSize = ZSTD_compressBound(inSize);
+    void* const inBuffer = malloc(inSize);
+    void* const outBuffer = malloc(outSize);
+    int result;
+
+    /* Abort if either buffer could not be allocated */
+    if (!inBuffer || !outBuffer) {
+        DISPLAY("Not enough memory, aborting \n");
+        exit(1);
+    }
+
+    result = FUZ_mallocTests_internal(seed, compressibility, part,
+                    inBuffer, inSize, outBuffer, outSize);
+
+    free(inBuffer);
+    free(outBuffer);
+    return result;
+}
+
 #else
 
 static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
@@ -314,9 +335,13 @@
         DISPLAYLEVEL(3, "OK : %s \n", errorString);
     }
 
+    DISPLAYLEVEL(3, "test%3i : min compression level : ", testNb++);
+    {   int const mcl = ZSTD_minCLevel();
+        DISPLAYLEVEL(3, "%i (OK) \n", mcl);
+    }
 
     DISPLAYLEVEL(3, "test%3i : compress %u bytes : ", testNb++, (U32)CNBuffSize);
-    {   ZSTD_CCtx* cctx = ZSTD_createCCtx();
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
         if (cctx==NULL) goto _output_error;
         CHECKPLUS(r, ZSTD_compressCCtx(cctx,
                             compressedBuffer, compressedBufferSize,
@@ -411,6 +436,26 @@
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3d : re-using a CCtx should compress the same : ", testNb++);
+    {   int i;
+        for (i=0; i<20; i++)
+            ((char*)CNBuffer)[i] = (char)i;   /* ensure no match during initial section */
+        memcpy((char*)CNBuffer + 20, CNBuffer, 10);   /* create one match, starting from beginning of sample, which is the difficult case (see #1241) */
+        for (i=1; i<=19; i++) {
+            ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+            size_t size1, size2;
+            DISPLAYLEVEL(5, "l%i ", i);
+            size1 = ZSTD_compressCCtx(cctx, compressedBuffer, compressedBufferSize, CNBuffer, 30, i);
+            CHECK_Z(size1);
+            size2 = ZSTD_compressCCtx(cctx, compressedBuffer, compressedBufferSize, CNBuffer, 30, i);
+            CHECK_Z(size2);
+            CHECK_EQ(size1, size2);
+
+            ZSTD_freeCCtx(cctx);
+        }
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     DISPLAYLEVEL(3, "test%3d : ZSTD_CCtx_getParameter() : ", testNb++);
     {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
         ZSTD_outBuffer out = {NULL, 0, 0};
@@ -1248,6 +1293,20 @@
           if (r != blockSize) goto _output_error; }
         DISPLAYLEVEL(3, "OK \n");
 
+        /* very long stream of block compression */
+        DISPLAYLEVEL(3, "test%3i : Huge block streaming compression test : ", testNb++);
+        CHECK( ZSTD_compressBegin(cctx, -99) );  /* we just want to quickly overflow internal U32 index */
+        CHECK( ZSTD_getBlockSize(cctx) >= blockSize);
+        {   U64 const toCompress = 5000000000ULL;   /* > 4 GB */
+            U64 compressed = 0;
+            while (compressed < toCompress) {
+                size_t const blockCSize = ZSTD_compressBlock(cctx, compressedBuffer, ZSTD_compressBound(blockSize), CNBuffer, blockSize);
+                if (ZSTD_isError(blockCSize)) goto _output_error;   /* check this block's own result, not the stale cSize */
+                compressed += blockCSize;
+            }
+        }
+        DISPLAYLEVEL(3, "OK \n");
+
         /* dictionary block compression */
         DISPLAYLEVEL(3, "test%3i : Dictionary Block compression test : ", testNb++);
         CHECK( ZSTD_compressBegin_usingDict(cctx, CNBuffer, dictSize, 5) );
@@ -1336,6 +1395,24 @@
                 ((BYTE*)CNBuffer)[i+1] = _3BytesSeqs[id][1];
                 ((BYTE*)CNBuffer)[i+2] = _3BytesSeqs[id][2];
     }   }   }
+    DISPLAYLEVEL(3, "test%3i : growing nbSeq : ", testNb++);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const maxNbSeq = _3BYTESTESTLENGTH / 3;
+        size_t const bound = ZSTD_compressBound(_3BYTESTESTLENGTH);
+        size_t nbSeq = 1;
+        while (nbSeq <= maxNbSeq) {
+          CHECK(ZSTD_compressCCtx(cctx, compressedBuffer, bound, CNBuffer, nbSeq * 3, 19));
+          /* Check every sequence for the first 100, then skip more rapidly. */
+          if (nbSeq < 100) {
+            ++nbSeq;
+          } else {
+            nbSeq += (nbSeq >> 2);
+          }
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     DISPLAYLEVEL(3, "test%3i : compress lots 3-bytes sequences : ", testNb++);
     { CHECK_V(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(_3BYTESTESTLENGTH),
                                  CNBuffer, _3BYTESTESTLENGTH, 19) );
@@ -1347,8 +1424,26 @@
       if (r != _3BYTESTESTLENGTH) goto _output_error; }
     DISPLAYLEVEL(3, "OK \n");
 
-    DISPLAYLEVEL(3, "test%3i : incompressible data and ill suited dictionary : ", testNb++);
+
+    DISPLAYLEVEL(3, "test%3i : growing literals buffer : ", testNb++);
     RDG_genBuffer(CNBuffer, CNBuffSize, 0.0, 0.1, seed);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const bound = ZSTD_compressBound(CNBuffSize);
+        size_t size = 1;
+        while (size <= CNBuffSize) {
+          CHECK(ZSTD_compressCCtx(cctx, compressedBuffer, bound, CNBuffer, size, 3));
+          /* Check every size for the first 100, then skip more rapidly. */
+          if (size < 100) {
+            ++size;
+          } else {
+            size += (size >> 2);
+          }
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3i : incompressible data and ill suited dictionary : ", testNb++);
     {   /* Train a dictionary on low characters */
         size_t dictSize = 16 KB;
         void* const dictBuffer = malloc(dictSize);
@@ -1423,6 +1518,24 @@
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : testing FSE_normalizeCount() PR#1255: ", testNb++);
+    {
+        short norm[32];
+        unsigned count[32];
+        unsigned const tableLog = 5;
+        size_t const nbSeq = 32;
+        unsigned const maxSymbolValue = 31;
+        size_t i;
+
+        for (i = 0; i < 32; ++i)
+            count[i] = 1;
+        /* Calling FSE_normalizeCount() on a uniform distribution should not
+         * cause a division by zero.
+         */
+        FSE_normalizeCount(norm, tableLog, count, nbSeq, maxSymbolValue);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
 _end:
     free(CNBuffer);
     free(compressedBuffer);
@@ -1496,7 +1609,6 @@
     size_t const dstBufferSize = (size_t)1<<maxSampleLog;
     size_t const cBufferSize   = ZSTD_compressBound(dstBufferSize);
     BYTE* cNoiseBuffer[5];
-    BYTE* srcBuffer;   /* jumping pointer */
     BYTE* const cBuffer = (BYTE*) malloc (cBufferSize);
     BYTE* const dstBuffer = (BYTE*) malloc (dstBufferSize);
     BYTE* const mirrorBuffer = (BYTE*) malloc (dstBufferSize);
@@ -1505,7 +1617,7 @@
     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
     U32 result = 0;
     U32 testNb = 0;
-    U32 coreSeed = seed, lseed = 0;
+    U32 coreSeed = seed;
     UTIL_time_t const startClock = UTIL_getTime();
     U64 const maxClockSpan = maxDurationS * SEC_TO_MICRO;
     int const cLevelLimiter = bigTests ? 3 : 2;
@@ -1526,13 +1638,14 @@
     RDG_genBuffer(cNoiseBuffer[2], srcBufferSize, compressibility, 0., coreSeed);
     RDG_genBuffer(cNoiseBuffer[3], srcBufferSize, 0.95, 0., coreSeed);    /* highly compressible */
     RDG_genBuffer(cNoiseBuffer[4], srcBufferSize, 1.00, 0., coreSeed);    /* sparse content */
-    srcBuffer = cNoiseBuffer[2];
 
     /* catch up testNb */
     for (testNb=1; testNb < startTest; testNb++) FUZ_rand(&coreSeed);
 
     /* main test loop */
     for ( ; (testNb <= nbTests) || (UTIL_clockSpanMicro(startClock) < maxClockSpan); testNb++ ) {
+        BYTE* srcBuffer;   /* jumping pointer */
+        U32 lseed;
         size_t sampleSize, maxTestSize, totalTestSize;
         size_t cSize, totalCSize, totalGenSize;
         U64 crcOrig;
@@ -1763,11 +1876,9 @@
         CHECK (totalGenSize != totalTestSize, "streaming decompressed data : wrong size")
         CHECK (totalCSize != cSize, "compressed data should be fully read")
         {   U64 const crcDest = XXH64(dstBuffer, totalTestSize, 0);
-            if (crcDest!=crcOrig) {
-                size_t const errorPos = findDiff(mirrorBuffer, dstBuffer, totalTestSize);
-                CHECK (1, "streaming decompressed data corrupted : byte %u / %u  (%02X!=%02X)",
-                   (U32)errorPos, (U32)totalTestSize, dstBuffer[errorPos], mirrorBuffer[errorPos]);
-        }   }
+            CHECK(crcOrig != crcDest, "streaming decompressed data corrupted (pos %u / %u)",
+                (U32)findDiff(mirrorBuffer, dstBuffer, totalTestSize), (U32)totalTestSize);
+        }
     }   /* for ( ; (testNb <= nbTests) */
     DISPLAY("\r%u fuzzer tests completed   \n", testNb-1);
 
diff --git a/tests/legacy.c b/tests/legacy.c
index 847e1d2..e1cf82f 100644
--- a/tests/legacy.c
+++ b/tests/legacy.c
@@ -36,7 +36,7 @@
 const char* const EXPECTED; /* content is at end of file */
 
 
-int testSimpleAPI(void)
+static int testSimpleAPI(void)
 {
     size_t const size = strlen(EXPECTED);
     char* const output = malloc(size);
@@ -71,7 +71,8 @@
     return 0;
 }
 
-int testStreamingAPI(void)
+
+static int testStreamingAPI(void)
 {
     size_t const outBuffSize = ZSTD_DStreamOutSize();
     char* const outBuff = malloc(outBuffSize);
diff --git a/tests/longmatch.c b/tests/longmatch.c
index ed38615..1271e9a 100644
--- a/tests/longmatch.c
+++ b/tests/longmatch.c
@@ -17,25 +17,25 @@
 #define ZSTD_STATIC_LINKING_ONLY
 #include "zstd.h"
 
-int compress(ZSTD_CStream *ctx, ZSTD_outBuffer out, const void *data, size_t size) {
+static int
+compress(ZSTD_CStream *ctx, ZSTD_outBuffer out, const void *data, size_t size)
+{
   ZSTD_inBuffer in = { data, size, 0 };
   while (in.pos < in.size) {
     ZSTD_outBuffer tmp = out;
     const size_t rc = ZSTD_compressStream(ctx, &tmp, &in);
-    if (ZSTD_isError(rc)) {
-      return 1;
-    }
+    if (ZSTD_isError(rc)) return 1;
   }
-  {
-    ZSTD_outBuffer tmp = out;
+  { ZSTD_outBuffer tmp = out;
     const size_t rc = ZSTD_flushStream(ctx, &tmp);
     if (rc != 0) { return 1; }
   }
   return 0;
 }
 
-int main(int argc, const char** argv) {
-  ZSTD_CStream *ctx;
+int main(int argc, const char** argv)
+{
+  ZSTD_CStream* ctx;
   ZSTD_parameters params;
   size_t rc;
   unsigned windowLog;
diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index db45220..7a4be85 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -17,7 +17,6 @@
 #include <stdio.h>     /* fprintf, fopen, ftello64 */
 #include <string.h>    /* strcmp */
 #include <math.h>      /* log */
-#include <time.h>
 #include <assert.h>
 
 #include "mem.h"
@@ -27,6 +26,8 @@
 #include "xxhash.h"
 #include "util.h"
 #include "bench.h"
+#include "zstd_errors.h"
+#include "zstd_internal.h"     /* should not be needed */
 
 
 /*-************************************
@@ -36,13 +37,7 @@
 #define AUTHOR "Yann Collet"
 #define WELCOME_MESSAGE "*** %s %s %i-bits, by %s ***\n", PROGRAM_DESCRIPTION, ZSTD_VERSION_STRING, (int)(sizeof(void*)*8), AUTHOR
 
-
-#define KB *(1<<10)
-#define MB *(1<<20)
-#define GB *(1ULL<<30)
-
-#define NBLOOPS    2
-#define TIMELOOP  (2 * SEC_TO_MICRO)
+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
 #define NB_LEVELS_TRACKED 22   /* ensured being >= ZSTD_maxCLevel() in BMK_init_level_constraints() */
 
 static const size_t maxMemory = (sizeof(size_t)==4)  ?  (2 GB - 64 MB) : (size_t)(1ULL << ((sizeof(size_t)*8)-31));
@@ -52,43 +47,306 @@
 static const U64 g_maxVariationTime = 60 * SEC_TO_MICRO;
 static const int g_maxNbVariations = 64;
 
+
 /*-************************************
 *  Macros
 **************************************/
 #define DISPLAY(...)  fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(n, ...) if(g_displayLevel >= n) { fprintf(stderr, __VA_ARGS__); }
+#define DEBUGOUTPUT(...) { if (DEBUG) DISPLAY(__VA_ARGS__); }
+
+#define TIMED 0
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
 
 #undef MIN
 #undef MAX
 #define MIN(a,b)   ( (a) < (b) ? (a) : (b) )
 #define MAX(a,b)   ( (a) > (b) ? (a) : (b) )
 #define CUSTOM_LEVEL 99
+#define BASE_CLEVEL 1
+
+#define FADT_MIN 0
+#define FADT_MAX ((U32)-1)
+
+#define WLOG_RANGE (ZSTD_WINDOWLOG_MAX - ZSTD_WINDOWLOG_MIN + 1)
+#define CLOG_RANGE (ZSTD_CHAINLOG_MAX - ZSTD_CHAINLOG_MIN + 1)
+#define HLOG_RANGE (ZSTD_HASHLOG_MAX - ZSTD_HASHLOG_MIN + 1)
+#define SLOG_RANGE (ZSTD_SEARCHLOG_MAX - ZSTD_SEARCHLOG_MIN + 1)
+#define SLEN_RANGE (ZSTD_SEARCHLENGTH_MAX - ZSTD_SEARCHLENGTH_MIN + 1)
+#define TLEN_RANGE 17
+#define STRT_RANGE (ZSTD_btultra - ZSTD_fast + 1)
+#define FADT_RANGE 3
+
+#define CHECKTIME(r) { if(BMK_timeSpan(g_time) > g_timeLimit_s) { DEBUGOUTPUT("Time Limit Reached\n"); return r; } }
+#define CHECKTIMEGT(ret, val, _gototag) {if(BMK_timeSpan(g_time) > g_timeLimit_s) { DEBUGOUTPUT("Time Limit Reached\n"); ret = val; goto _gototag; } }
+
+#define PARAM_UNSET ((U32)-2) /* can't be -1 b/c fadt uses -1 */
+
+static const char* g_stratName[ZSTD_btultra+1] = {
+                "(none)       ", "ZSTD_fast    ", "ZSTD_dfast   ",
+                "ZSTD_greedy  ", "ZSTD_lazy    ", "ZSTD_lazy2   ",
+                "ZSTD_btlazy2 ", "ZSTD_btopt   ", "ZSTD_btultra "};
+
+static const U32 tlen_table[TLEN_RANGE] = { 0, 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 256, 512, 999 };
+
 
 /*-************************************
-*  Benchmark Parameters
+*  Setup for Adding new params
 **************************************/
 
-static double g_grillDuration_s = 99999;   /* about 27 hours */
-static U32 g_nbIterations = NBLOOPS;
-static double g_compressibility = COMPRESSIBILITY_DEFAULT;
-static U32 g_blockSize = 0;
-static U32 g_rand = 1;
-static U32 g_singleRun = 0;
-static U32 g_target = 0;
-static U32 g_noSeed = 0;
-static ZSTD_compressionParameters g_params = { 0, 0, 0, 0, 0, 0, ZSTD_greedy };
+/* indices for each of the variables; the order matches every NUM_PARAMS-sized table below (mintable, maxtable, rangetable, g_paramNames, ...) */
+typedef enum {
+    wlog_ind = 0, /* windowLog */
+    clog_ind = 1, /* chainLog */
+    hlog_ind = 2, /* hashLog */
+    slog_ind = 3, /* searchLog */
+    slen_ind = 4, /* searchLength */
+    tlen_ind = 5, /* targetLength */
+    strt_ind = 6, /* strategy */
+    fadt_ind = 7, /* forceAttachDict */
+    NUM_PARAMS = 8 /* number of parameters : used as array length, rejected as a parameter by rangeMap() / invRangeMap() */
+} varInds_t;
 
-void BMK_SetNbIterations(int nbLoops)
-{
-    g_nbIterations = nbLoops;
-    DISPLAY("- %u iterations -\n", g_nbIterations);
+typedef struct {
+    U32 vals[NUM_PARAMS]; /* one value per parameter, indexed by varInds_t; PARAM_UNSET marks an entry that is not set */
+} paramValues_t;
+
+/* minimum value of parameters */
+static const U32 mintable[NUM_PARAMS] =
+        { ZSTD_WINDOWLOG_MIN, ZSTD_CHAINLOG_MIN, ZSTD_HASHLOG_MIN, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLENGTH_MIN, ZSTD_TARGETLENGTH_MIN, ZSTD_fast, FADT_MIN };
+
+/* maximum value of parameters */
+static const U32 maxtable[NUM_PARAMS] =
+        { ZSTD_WINDOWLOG_MAX, ZSTD_CHAINLOG_MAX, ZSTD_HASHLOG_MAX, ZSTD_SEARCHLOG_MAX, ZSTD_SEARCHLENGTH_MAX, ZSTD_TARGETLENGTH_MAX, ZSTD_btultra, FADT_MAX };
+
+/* # of values parameters can take on */
+static const U32 rangetable[NUM_PARAMS] =
+        { WLOG_RANGE, CLOG_RANGE, HLOG_RANGE, SLOG_RANGE, SLEN_RANGE, TLEN_RANGE, STRT_RANGE, FADT_RANGE };
+
+/* ZSTD_cctxSetParameter() index to set */
+static const ZSTD_cParameter cctxSetParamTable[NUM_PARAMS] =
+        { ZSTD_p_windowLog, ZSTD_p_chainLog, ZSTD_p_hashLog, ZSTD_p_searchLog, ZSTD_p_minMatch, ZSTD_p_targetLength, ZSTD_p_compressionStrategy, ZSTD_p_forceAttachDict };
+
+/* names of parameters */
+static const char* g_paramNames[NUM_PARAMS] =
+        { "windowLog", "chainLog", "hashLog","searchLog", "searchLength", "targetLength", "strategy", "forceAttachDict" };
+
+/* shortened names of parameters */
+static const char* g_shortParamNames[NUM_PARAMS] =
+        { "wlog", "clog", "hlog","slog", "slen", "tlen", "strt", "fadt" };
+
+/* rangeMap() :
+ * converts a range index into the parameter value it stands for.
+ * ind is first clamped into [0, rangetable[param] - 1].
+ * @return (U32)-1, after an error message, when param is NUM_PARAMS. */
+static U32 rangeMap(varInds_t param, int ind) {
+    int const lastInd = (int)rangetable[param] - 1;
+    int const idx = (ind > lastInd) ? MAX(lastInd, 0) : MAX(ind, 0);
+    switch(param) {   /* every enumerator is listed : using default: triggers -Wswitch-enum */
+        case NUM_PARAMS:
+            DISPLAY("Error, not a valid param\n ");
+            return (U32)-1;
+        case fadt_ind:   /* indices 0, 1, 2 stand for -1, 0, 1 */
+            return idx - 1;
+        case tlen_ind:   /* targetLength values come from a lookup table */
+            return tlen_table[idx];
+        case wlog_ind: case clog_ind: case hlog_ind:
+        case slog_ind: case slen_ind: case strt_ind:
+            return mintable[param] + idx;   /* contiguous range starting at the minimum */
+    }
+    return 0; /* should never happen, stop compiler warnings */
 }
 
+/* invRangeMap() :
+ * inverse of rangeMap() : converts a parameter value, first clamped into
+ * [mintable[param], maxtable[param]], into its range index.
+ * For tlen_ind, a value absent from tlen_table maps to the index of the first
+ * larger entry (TLEN_RANGE if there is none; rangeMap() clamps that back).
+ * @return -2, after an error message, when param is NUM_PARAMS. */
+static int invRangeMap(varInds_t param, U32 value) {
+    value = MIN(MAX(mintable[param], value), maxtable[param]);
+    switch(param) {
+        case tlen_ind: /* binary search (lower bound) in tlen_table, which is sorted ascending */
+        {
+            int lo = 0, hi = TLEN_RANGE;
+            while(lo < hi) {
+                int const mid = (lo + hi) / 2;
+                if(tlen_table[mid] < value) {
+                    lo = mid + 1;
+                } else if(tlen_table[mid] == value) { /* the `else` matters : without it the "< value" case also ran `hi = mid` and ended the search early */
+                    return mid;
+                } else {
+                    hi = mid;
+                }
+            }
+            return lo;
+        }
+        case fadt_ind: /* -1, 0, 1 -> 0, 1, 2 */
+            return (int)value + 1;
+        case wlog_ind: case clog_ind: case hlog_ind: /* every enumerator is listed : using default: triggers -Wswitch-enum */
+        case slog_ind: case slen_ind: case strt_ind:
+            return value - mintable[param];
+        case NUM_PARAMS:
+            DISPLAY("Error, not a valid param\n ");
+            return -2;
+    }
+    return 0; /* should never happen, stop compiler warnings */
+}
+
+/* prints `value` of parameter `param` into f; a non-zero `width` is handed to printf as the field width */
+static void displayParamVal(FILE* f, varInds_t param, U32 value, int width) {
+    switch(param) {
+        case NUM_PARAMS: DISPLAY("Error, not a valid param\n "); break;
+        case strt_ind:   /* strategy is shown by name */
+            if(!width) { fprintf(f, "%s", g_stratName[value]); break; }
+            fprintf(f, "%*s", width, g_stratName[value]); break;
+        case fadt_ind:   /* forceAttachDict is shown as a signed number */
+            if(!width) { fprintf(f, "%d", (int)value); break; }
+            fprintf(f, "%*d", width, (int)value); break;
+        case wlog_ind: case clog_ind: case hlog_ind: case slog_ind: case slen_ind: case tlen_ind:
+            if(!width) { fprintf(f, "%u", value); break; }
+            fprintf(f, "%*u", width, value); break;
+    }
+}
+
+
+/*-************************************
+*  Benchmark Parameters/Global Variables
+**************************************/
+
+typedef BYTE U8;
+
+/* General Utility */
+static U32 g_timeLimit_s = 99999;   /* about 27 hours */
+static UTIL_time_t g_time; /* to be used to compare solution finding speeds to compare to original */
+static U32 g_blockSize = 0;
+static U32 g_rand = 1;
+
+/* Display */
+static int g_displayLevel = 3;
+static BYTE g_silenceParams[NUM_PARAMS];
+
+/* Mode Selection */
+static U32 g_singleRun = 0;
+static U32 g_optimizer = 0;
+static int g_optmode = 0;
+
+/* For cLevel Table generation */
+static U32 g_target = 0;
+static U32 g_noSeed = 0;
+
+/* For optimizer */
+static paramValues_t g_params; /* Initialized at the beginning of main w/ emptyParams() function */
+static double g_ratioMultiplier = 5.;
+static U32 g_strictness = PARAM_UNSET; /* range 1 - 100, measure of how strict  */
+static BMK_benchResult_t g_lvltarget;
+
+typedef enum {   /* value stored in memoTable_t.tableType */
+    directMap,
+    xxhashMap,
+    noMemo
+} memoTableType_t;
+
+typedef struct {
+    memoTableType_t tableType;
+    BYTE* table;
+    size_t tableLen; /* presumably the size of `table` - TODO confirm against the allocation code */
+    varInds_t varArray[NUM_PARAMS]; /* parameters that may be varied : paramVariation() draws from entries [0, varLen) */
+    size_t varLen; /* number of entries of varArray in use */
+} memoTable_t;
+
+typedef struct {   /* a benchmark result stored together with a parameter set */
+    BMK_benchResult_t result;
+    paramValues_t params;
+} winnerInfo_t;
+
+typedef struct {   /* limits checked by feasible() : minimum cSpeed, minimum dSpeed, maximum cMem */
+    U32 cSpeed;  /* bytes / sec */
+    U32 dSpeed;
+    U32 cMem;    /* bytes */
+} constraint_t;
+
+typedef struct winner_ll_node winner_ll_node;
+struct winner_ll_node {   /* node of the singly linked g_winners list, filled by insertWinner() */
+    winnerInfo_t res;
+    winner_ll_node* next; /* NULL on the last node */
+};
+
+static winner_ll_node* g_winners; /* linked list sorted ascending by cSize & cSpeed */
+
+/*
+ * Additional Global Variables (Defined Above Use)
+ * g_level_constraint
+ * g_alreadyTested
+ * g_maxTries
+ * g_clockGranularity
+ */
+
+
 /*-*******************************************************
-*  Private functions
+*  General Util Functions
 *********************************************************/
 
-/* accuracy in seconds only, span can be multiple years */
-static double BMK_timeSpan(time_t tStart) { return difftime(time(NULL), tStart); }
+/* sanitizeParams() :
+ * zeroes the parameters that are useless for the selected strategy, so that
+ * equivalent settings look the same when memoized or displayed. */
+static paramValues_t sanitizeParams(paramValues_t params)
+{
+    U32 const strat = params.vals[strt_ind];
+    int const keepsTlen = (strat == ZSTD_btopt) || (strat == ZSTD_btultra) || (strat == ZSTD_fast);
+    if (strat == ZSTD_fast) { params.vals[clog_ind] = 0; }
+    if (strat == ZSTD_fast || strat == ZSTD_dfast) { params.vals[slog_ind] = 0; }
+    if (!keepsTlen) { params.vals[tlen_ind] = 0; }
+
+    return params;
+}
+
+/* converts a paramValues_t into ZSTD_compressionParameters; forceAttachDict has no field there and is dropped */
+static ZSTD_compressionParameters pvalsToCParams(paramValues_t p) {
+    ZSTD_compressionParameters cParams;
+    memset(&cParams, 0, sizeof(cParams));   /* any field not assigned below stays 0 */
+    cParams.strategy     = p.vals[strt_ind];
+    cParams.targetLength = p.vals[tlen_ind];
+    cParams.searchLength = p.vals[slen_ind];
+    cParams.searchLog    = p.vals[slog_ind];
+    cParams.hashLog      = p.vals[hlog_ind];
+    cParams.chainLog     = p.vals[clog_ind];
+    cParams.windowLog    = p.vals[wlog_ind];
+    return cParams;
+}
+
+/* converts ZSTD_compressionParameters into a paramValues_t; parameters listed after strt_ind have no cParams field and get their minimum */
+static paramValues_t cParamsToPVals(ZSTD_compressionParameters c) {
+    paramValues_t pvals;
+    varInds_t extra = strt_ind + 1;
+    while(extra < NUM_PARAMS) {
+        pvals.vals[extra] = mintable[extra];
+        extra++;
+    }
+    pvals.vals[strt_ind] = c.strategy;
+    pvals.vals[tlen_ind] = c.targetLength;
+    pvals.vals[slen_ind] = c.searchLength;
+    pvals.vals[slog_ind] = c.searchLog;
+    pvals.vals[hlog_ind] = c.hashLog;
+    pvals.vals[clog_ind] = c.chainLog;
+    pvals.vals[wlog_ind] = c.windowLog;
+    return pvals;
+}
+
+/* equivalent of ZSTD_adjustCParams for paramValues_t : parameters outside cParams keep their input value, except forceAttachDict, forced to 0 when dictSize == 0 */
+static paramValues_t adjustParams(paramValues_t p, const size_t maxBlockSize, const size_t dictSize) {
+    paramValues_t const ot = p;
+    varInds_t i;
+    p = cParamsToPVals(ZSTD_adjustCParams(pvalsToCParams(p), maxBlockSize, dictSize));
+    for(i = strt_ind + 1; i < NUM_PARAMS; i++) {   /* retain value of all other parameters */
+        p.vals[i] = ot.vals[i];
+    }
+    /* must follow the loop above : fadt_ind is one of the entries it restores, so a reset done earlier is overwritten */
+    if(!dictSize) { p.vals[fadt_ind] = 0; }
+    return p;
+}
 
 static size_t BMK_findMaxMem(U64 requiredMem)
 {
@@ -98,23 +356,25 @@
     requiredMem = (((requiredMem >> 26) + 1) << 26);
     if (requiredMem > maxMemory) requiredMem = maxMemory;
 
-    requiredMem += 2*step;
-    while (!testmem) {
-        requiredMem -= step;
+    requiredMem += 2 * step;
+    while (!testmem && requiredMem > 0) {
         testmem = malloc ((size_t)requiredMem);
+        requiredMem -= step;
     }
 
     free (testmem);
-    return (size_t) (requiredMem - step);
+    return (size_t) requiredMem;
 }
 
+/* time elapsed since tStart, in whole seconds (microsecond span, truncated); span can be multiple years */
+static U32 BMK_timeSpan(const UTIL_time_t tStart) { U64 const elapsedMicro = UTIL_clockSpanMicro(tStart); return (U32)(elapsedMicro / 1000000ULL); }
 
 static U32 FUZ_rotl32(U32 x, U32 r)
 {
     return ((x << r) | (x >> (32 - r)));
 }
 
-U32 FUZ_rand(U32* src)
+static U32 FUZ_rand(U32* src)
 {
     const U32 prime1 = 2654435761U;
     const U32 prime2 = 2246822519U;
@@ -126,78 +386,477 @@
     return rand32 >> 5;
 }
 
-/** longCommandWArg() :
- *  check if *stringPtr is the same as longCommand.
- *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- * @return 0 and doesn't modify *stringPtr otherwise.
- * from zstdcli.c
+/* allows zeros */
+#define CLAMPCHECK(val,min,max) {                     \
+    if (((val)<(min)) | ((val)>(max))) {              \
+        DISPLAY("INVALID PARAMETER CONSTRAINTS\n");   \
+        return 0;                                     \
+}   }
+
+static int paramValid(const paramValues_t paramTarget) {   /* @return 1 when every value lies within [mintable, maxtable]; otherwise prints a message and returns 0 */
+    U32 idx = 0;
+    for( ; idx < NUM_PARAMS; idx++) {
+        if((paramTarget.vals[idx] < mintable[idx]) | (paramTarget.vals[idx] > maxtable[idx])) { DISPLAY("INVALID PARAMETER CONSTRAINTS\n"); return 0; }
+    }
+    return 1;
+}
+
+/* @return paramTarget in which every PARAM_UNSET entry is replaced by the minimum of that parameter */
+static paramValues_t cParamUnsetMin(paramValues_t paramTarget) {
+    varInds_t v = 0;
+    for( ; v < NUM_PARAMS; v++) {
+        if(paramTarget.vals[v] != PARAM_UNSET) { continue; }
+        paramTarget.vals[v] = mintable[v];
+    }
+    return paramTarget;
+}
+
+/* emptyParams() :
+ * @return a paramValues_t in which every parameter is PARAM_UNSET */
+static paramValues_t emptyParams(void) {
+    paramValues_t unset;
+    U32 idx = NUM_PARAMS;
+    while(idx > 0) { unset.vals[--idx] = PARAM_UNSET; }
+    return unset;
+}
+
+static winnerInfo_t initWinnerInfo(const paramValues_t p) {   /* @return a winner holding params p and a placeholder result : zero speeds, (size_t)-1 size and memory */
+    winnerInfo_t worst;
+    worst.params = p;
+    worst.result.cSize = (size_t)-1;
+    worst.result.cMem = (size_t)-1;
+    worst.result.dSpeed = 0.;
+    worst.result.cSpeed = 0.;
+    return worst;
+}
+
+/* @return base, with every entry that is set in mask (not PARAM_UNSET) copied over it */
+static paramValues_t overwriteParams(paramValues_t base, const paramValues_t mask) {
+    U32 idx = 0;
+    for( ; idx < NUM_PARAMS; idx++) {
+        U32 const forced = mask.vals[idx];
+        if(forced != PARAM_UNSET) { base.vals[idx] = forced; }
+    }
+    return base;
+}
+
+static void paramVaryOnce(const varInds_t paramIndex, const int amt, paramValues_t* ptr) {   /* moves one parameter by amt positions of its range; rangeMap() clamps the new position into the range */
+    U32* const slot = &ptr->vals[paramIndex];  *slot = rangeMap(paramIndex, invRangeMap(paramIndex, *slot) + amt);
+}
+
+/* paramVariation() : applies nbChanges random one-step moves to *ptr, each on a parameter drawn from the varArray of the
+ * memo table of the current strategy; restarts from the original *ptr until the outcome passes paramValid() */
+static void paramVariation(paramValues_t* ptr, memoTable_t* mtAll, const U32 nbChanges)
+{
+    paramValues_t candidate;
+    do {
+        U32 n = nbChanges;
+        candidate = *ptr;
+        while(n-- > 0) {
+            memoTable_t const* const mt = &mtAll[candidate.vals[strt_ind]];
+            U32 const changeID = (U32)FUZ_rand(&g_rand) % (mt->varLen << 1);
+            paramVaryOnce(mt->varArray[changeID >> 1], (changeID & 1) ? 1 : -1, &candidate);   /* low bit selects the direction */
+        }
+    } while(!paramValid(candidate));
+    *ptr = candidate;
+}
+
+/* randomParams() : @return a parameter set in which each value is drawn at random from its own range */
+static paramValues_t randomParams(void) {
+    paramValues_t drawn;
+    varInds_t v;
+    for(v = 0; v < NUM_PARAMS; v++) {
+        drawn.vals[v] = rangeMap(v, (int)(FUZ_rand(&g_rand) % rangetable[v]));
+    }
+    return drawn;
+}
+
+static U64 g_clockGranularity = 100000000ULL;
+
+/* findClockGranularity() : lowers g_clockGranularity to the smallest non-zero step seen between two successive
+ * clock readings; stops once 10 advancing steps in a row fail to lower it */
+static void findClockGranularity(void) {
+    UTIL_time_t clockStart = UTIL_getTime();
+    U64 previous = 0, current = 0;
+    int stableSteps = 0;
+    while(stableSteps < 10) {
+        previous = current;
+        current = UTIL_clockSpanNano(clockStart);
+        if(previous >= current) { continue; }   /* clock has not advanced since the last reading */
+        if(current - previous < g_clockGranularity) {
+            g_clockGranularity = current - previous;
+            stableSteps = 0;
+        } else {
+            stableSteps++;
+        }
+    }
+    DEBUGOUTPUT("Granularity: %llu\n", (unsigned long long)g_clockGranularity);
+}
+
+/*-************************************
+*  Optimizer Util Functions
+**************************************/
+
+/* feasible() : @return 1 when results meet every limit of target (and, in g_optmode, are no larger than g_lvltarget.cSize), else 0 */
+static int feasible(const BMK_benchResult_t results, const constraint_t target) {
+    if(!(results.cSpeed >= target.cSpeed)) { return 0; }   /* compresses too slowly */
+    if(!(results.dSpeed >= target.dSpeed)) { return 0; }   /* decompresses too slowly */
+    if(!(results.cMem <= target.cMem)) { return 0; }       /* uses too much memory */
+    return !g_optmode || (results.cSize <= g_lvltarget.cSize);
+}
+
+/* hill climbing value for part 1 */
+/* Scoring here is a linear reward for all set constraints normalized between 0 to 1
+ * (with 0 at 0 and 1 being fully fulfilling the constraint), summed with a logarithmic
+ * bonus to exceeding the constraint value. We also give linear ratio for compression ratio.
+ * The constant factors are experimental.
  */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
-{
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
+static double resultScore(const BMK_benchResult_t res, const size_t srcSize, const constraint_t target) {
+    double cs = 0., ds = 0., rt, cm = 0.;   /* a ratio stays 0 when its constraint is not set */
+    const double r1 = 1, r2 = 0.1, rtr = 0.5;   /* weights : linear part, logarithmic bonus, compression ratio */
+    double ret;
+    if(target.cSpeed) { cs = res.cSpeed / (double)target.cSpeed; }
+    if(target.dSpeed) { ds = res.dSpeed / (double)target.dSpeed; }
+    if(target.cMem != (U32)-1) { cm = (double)target.cMem / res.cMem; }   /* (U32)-1 is treated as "no memory constraint" (relaxTarget() sets it) */
+    rt = ((double)srcSize / res.cSize);
+
+    ret = (MIN(1, cs) + MIN(1, ds)  + MIN(1, cm))*r1 + rt * rtr +
+         (MAX(0, log(cs))+ MAX(0, log(ds))+ MAX(0, log(cm))) * r2;
+
+    return ret;
 }
 
-/*-*******************************************************
-*  Bench functions
-*********************************************************/
-
-typedef struct
-{
-    const char* srcPtr;
-    size_t srcSize;
-    char*  cPtr;
-    size_t cRoom;
-    size_t cSize;
-    char*  resPtr;
-    size_t resSize;
-} blockParam_t;
-
-
-const char* g_stratName[ZSTD_btultra+1] = {
-                "(none)       ", "ZSTD_fast    ", "ZSTD_dfast   ",
-                "ZSTD_greedy  ", "ZSTD_lazy    ", "ZSTD_lazy2   ",
-                "ZSTD_btlazy2 ", "ZSTD_btopt   ", "ZSTD_btultra "};
-
-/* TODO: support additional parameters (more files, fileSizes) */
-
-//TODO: benchMem dctx can't = NULL in new system
-static size_t
-BMK_benchParam(BMK_result_t* resultPtr,
-               const void* srcBuffer, size_t srcSize,
-               ZSTD_CCtx* ctx, ZSTD_DCtx* dctx, 
-               const ZSTD_compressionParameters cParams) {
-
-
-    BMK_return_t res = BMK_benchMem(srcBuffer,srcSize, &srcSize, 1, 0, &cParams, NULL, 0, ctx, dctx, 0, "File");
-    *resultPtr = res.result;
-    return res.errorCode;
+/* resultDistLvl() : @return 0 when result1 is slower or larger than lvlRes; otherwise its relative ratio gain * g_ratioMultiplier + its relative speed gain */
+static double resultDistLvl(const BMK_benchResult_t result1, const BMK_benchResult_t lvlRes) {
+    double const normalizedCSpeedGain1 = ((double)result1.cSpeed / lvlRes.cSpeed) - 1;   /* cast first : division and "- 1" must happen in floating point, whatever the type of cSpeed */
+    double const normalizedRatioGain1 = ((double)lvlRes.cSize / result1.cSize) - 1;
+    if(normalizedRatioGain1 < 0 || normalizedCSpeedGain1 < 0) {
+        return 0.0;
+    }
+    return normalizedRatioGain1 * g_ratioMultiplier + normalizedCSpeedGain1;
 }
 
-static void BMK_printWinner(FILE* f, U32 cLevel, BMK_result_t result, ZSTD_compressionParameters params, size_t srcSize)
+/* compareResultLT() : @return 1 when result2 is strictly better than result1, else 0.
+ * A feasible result beats an infeasible one; two feasible results are ranked by resultDistLvl() in g_optmode, otherwise by
+ * size, then compression speed, then decompression speed; two infeasible results are ranked by resultScore(). */
+static int compareResultLT(const BMK_benchResult_t result1, const BMK_benchResult_t result2, const constraint_t target, size_t srcSize) {
+    int const ok1 = feasible(result1, target);
+    int const ok2 = feasible(result2, target);
+    if(!ok1 || !ok2) { return ok2 || (!ok1 && (resultScore(result1, srcSize, target) < resultScore(result2, srcSize, target))); }
+    if(g_optmode) { return resultDistLvl(result1, g_lvltarget) < resultDistLvl(result2, g_lvltarget); }
+    if(result1.cSize != result2.cSize) { return result1.cSize > result2.cSize; }
+    if(result1.cSpeed != result2.cSpeed) { return result2.cSpeed > result1.cSpeed; }
+    return result2.dSpeed > result1.dSpeed;
+}
+
+static constraint_t relaxTarget(constraint_t target) {   /* @return target without memory limit and with both speed limits scaled to g_strictness percent */
+    target.dSpeed = (U32)(target.dSpeed * (((double)g_strictness) / 100));
+    target.cSpeed = (U32)(target.cSpeed * (((double)g_strictness) / 100));
+    target.cMem = (U32)-1;
+    return target;
+}
+
+static void optimizerAdjustInput(paramValues_t* pc, const size_t maxBlockSize) {   /* clamps the set entries of *pc into range, then makes them mutually consistent, warning on each change; PARAM_UNSET entries are never touched */
+    varInds_t v;
+    for(v = 0; v < NUM_PARAMS; v++) {
+        if(pc->vals[v] != PARAM_UNSET) {
+            U32 newval = MIN(MAX(pc->vals[v], mintable[v]), maxtable[v]);
+            if(newval != pc->vals[v]) {
+                pc->vals[v] = newval;
+                DISPLAY("Warning: parameter %s not in valid range, adjusting to ", g_paramNames[v]); displayParamVal(stderr, v, newval, 0); DISPLAY("\n");
+            }
+        }
+    }
+
+    if(pc->vals[wlog_ind] != PARAM_UNSET) {
+
+        U32 sshb = maxBlockSize > 1 ? ZSTD_highbit32((U32)(maxBlockSize-1)) + 1 : 1;
+        /* the "> 1" guard avoids ZSTD_highbit32(0); sshb is presumably ceil(log2(maxBlockSize)) - TODO confirm against ZSTD_highbit32() */
+
+        if(maxBlockSize < (1ULL << 31) && sshb + 1 < pc->vals[wlog_ind]) {   /* windowLog exceeds sshb by more than 1 */
+            U32 adjust = MAX(mintable[wlog_ind], sshb);
+            if(adjust != pc->vals[wlog_ind]) {
+                pc->vals[wlog_ind] = adjust;
+                DISPLAY("Warning: windowLog larger than src/block size, adjusted to %u\n", pc->vals[wlog_ind]);
+            }
+        }
+    }
+
+    if(pc->vals[wlog_ind] != PARAM_UNSET && pc->vals[clog_ind] != PARAM_UNSET) {
+        U32 maxclog;
+        if(pc->vals[strt_ind] == PARAM_UNSET || pc->vals[strt_ind] >= (U32)ZSTD_btlazy2) {   /* unset strategy, or ZSTD_btlazy2 and above : chainLog may exceed windowLog by 1 */
+            maxclog = pc->vals[wlog_ind] + 1;
+        } else {
+            maxclog = pc->vals[wlog_ind];
+        }
+
+        if(pc->vals[clog_ind] > maxclog) {
+            pc->vals[clog_ind] = maxclog;
+            DISPLAY("Warning: chainlog too much larger than windowLog size, adjusted to %u\n", pc->vals[clog_ind]);
+        }
+    }
+
+    if(pc->vals[wlog_ind] != PARAM_UNSET && pc->vals[hlog_ind] != PARAM_UNSET) {
+        if(pc->vals[wlog_ind] + 1 < pc->vals[hlog_ind]) {
+            pc->vals[hlog_ind] = pc->vals[wlog_ind] + 1;
+            DISPLAY("Warning: hashlog too much larger than windowLog size, adjusted to %u\n", pc->vals[hlog_ind]);
+        }
+    }
+
+    if(pc->vals[slog_ind] != PARAM_UNSET && pc->vals[clog_ind] != PARAM_UNSET) {
+        if(pc->vals[slog_ind] > pc->vals[clog_ind]) {
+            pc->vals[clog_ind] = pc->vals[slog_ind];   /* chainLog is raised; searchLog is left as given */
+            DISPLAY("Warning: searchLog larger than chainLog, adjusted to %u\n", pc->vals[slog_ind]);
+        }
+    }
+}
+
+/* redundantParams() : @return 1 when paramValues exceed the memory limit of target or are mutually inconsistent, else 0 */
+static int redundantParams(const paramValues_t paramValues, const constraint_t target, const size_t maxBlockSize) {
+    U32 const wlog = paramValues.vals[wlog_ind], clog = paramValues.vals[clog_ind];
+    if(ZSTD_estimateCStreamSize_usingCParams(pvalsToCParams(paramValues)) > (size_t)target.cMem) { return 1; }   /* Uses too much memory */
+    if((1ULL << (wlog - 1)) >= maxBlockSize && wlog != mintable[wlog_ind]) { return 1; }   /* wlog too much bigger than src size */
+    if(clog > (wlog + (paramValues.vals[strt_ind] > ZSTD_btlazy2))) { return 1; }   /* chainLog larger than windowLog. NOTE(review): optimizerAdjustInput() grants the +1 from ">= ZSTD_btlazy2" - confirm which bound is intended */
+    if(paramValues.vals[slog_ind] > clog) { return 1; }   /* searchLog larger than chainLog */
+    return paramValues.vals[hlog_ind] > wlog + 1;   /* hashLog larger than windowLog + 1 */
+}
+
+/*-************************************
+*  Display Functions
+**************************************/
+
+/* BMK_translateAdvancedParams() :
+ * writes params to f as one "--zstd=name=value,..." line; entries flagged in g_silenceParams are skipped */
+static void BMK_translateAdvancedParams(FILE* f, const paramValues_t params) {
+    int nbPrinted = 0;
+    varInds_t v = 0;
+    fprintf(f,"--zstd=");
+    for ( ; v < NUM_PARAMS; v++) {
+        if (g_silenceParams[v]) { continue; }
+        if (nbPrinted++) { fprintf(f, ","); }
+        fprintf(f,"%s=", g_paramNames[v]);
+        if (v != strt_ind) { displayParamVal(f, v, params.vals[v], 0); }
+        else { fprintf(f,"%u", params.vals[v]); }   /* strategy is written as a number, not by name */
+    }
+    fprintf(f, "\n");
+}
+
+/* BMK_displayOneResult() :
+ * writes one table row to f : "{", the non-silenced parameters (unset ones shown at their minimum), then a
+ * trailing comment with the compression ratio and the compression / decompression speeds in MB/s. */
+static void BMK_displayOneResult(FILE* f, winnerInfo_t res, const size_t srcSize)
+{
+    int nbPrinted = 0;
+    varInds_t v = 0;
+    double ratio = 0;   /* stays 0 when the compressed size is 0 */
+    res.params = cParamUnsetMin(res.params);
+    fprintf(f, "    {");
+    for ( ; v < NUM_PARAMS; v++) {
+        if (g_silenceParams[v]) { continue; }
+        if (nbPrinted++) { fprintf(f, ","); }
+        displayParamVal(f, v, res.params.vals[v], 3);
+    }
+    if (res.result.cSize) { ratio = (double)srcSize / res.result.cSize; }
+    {   double const cMBps = (double)res.result.cSpeed / MB_UNIT;
+        double const dMBps = (double)res.result.dSpeed / MB_UNIT;
+        fprintf(f, " },     /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
+                            ratio, cMBps, dMBps);
+    }
+}
+
+/* Writes to f one result line : optional elapsed time, level label, then the BMK_displayOneResult() row */
+/* always prints; restricting output to improved results is done by the caller BMK_printWinnerOpt() */
+static void BMK_printWinner(FILE* f, const int cLevel, const BMK_benchResult_t result, const paramValues_t params, const size_t srcSize)
 {
     char lvlstr[15] = "Custom Level";
-    DISPLAY("\r%79s\r", "");
-    fprintf(f,"    {%3u,%3u,%3u,%3u,%3u,%3u, %s },  ",
-            params.windowLog, params.chainLog, params.hashLog, params.searchLog, params.searchLength,
-            params.targetLength, g_stratName[(U32)(params.strategy)]);
+    winnerInfo_t w;
+    w.params = params;
+    w.result = result;
+
+    fprintf(f, "\r%79s\r", "");   /* overwrite the current line with 79 spaces, then return to its start */
+
     if(cLevel != CUSTOM_LEVEL) {
-        snprintf(lvlstr, 15, "  Level %2u  ", cLevel);
+        snprintf(lvlstr, 15, "  Level %2d  ", cLevel);
     }
-    fprintf(f,
-        "/* %s */   /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
-        lvlstr, (double)srcSize / result.cSize, result.cSpeed / 1000000., result.dSpeed / 1000000.);
+
+    if(TIMED) {   /* TIMED is defined as 0 above : optional hours:minutes:seconds prefix, measured from g_time */
+        const U64 time = UTIL_clockSpanNano(g_time);
+        const U64 minutes = time / (60ULL * TIMELOOP_NANOSEC);
+        fprintf(f, "%1lu:%2lu:%05.2f - ", (unsigned long) minutes / 60,(unsigned long) minutes % 60, (double)(time - minutes * TIMELOOP_NANOSEC * 60ULL)/TIMELOOP_NANOSEC);
+    }
+
+    fprintf(f, "/* %s */   ", lvlstr);
+    BMK_displayOneResult(f, w, srcSize);
 }
 
+/* comparison function: */
+/* strictly better, strictly worse, equal, speed-side adv, size-side adv */
+#define WORSE_RESULT 0
+#define BETTER_RESULT 1
+#define ERROR_RESULT 2
 
-typedef struct {
-    BMK_result_t result;
-    ZSTD_compressionParameters params;
-} winnerInfo_t;
+#define SPEED_RESULT 4
+#define SIZE_RESULT 5
+/* speedSizeCompare() : ranks r2 against r1 on compression speed and compressed size.
+ * @return BETTER_RESULT : r2 is faster and not larger     WORSE_RESULT : r2 is neither faster nor smaller
+ *         SPEED_RESULT  : r2 is faster but larger          SIZE_RESULT  : r2 is smaller but not faster
+ * (maybe have epsilon-eq to limit table size?) */
+static int speedSizeCompare(const BMK_benchResult_t r1, const BMK_benchResult_t r2) {
+    int const r2Faster  = (r1.cSpeed < r2.cSpeed);
+    int const r2Smaller = (r1.cSize > r2.cSize);
+    int const r2Larger  = (r1.cSize < r2.cSize);
+
+    if(r2Faster) {
+        return r2Larger ? SPEED_RESULT : BETTER_RESULT;
+    }
+    return r2Smaller ? SIZE_RESULT : WORSE_RESULT;
+}
 
-static void BMK_printWinners2(FILE* f, const winnerInfo_t* winners, size_t srcSize)
+/* @return 0 when w was stored in the g_winners list, 1 when it was not (infeasible, not better than an existing node, or malloc failure) */
+/* order kept by the code below : each successor is smaller and not faster, i.e. speedSizeCompare(n, n->next) == SIZE_RESULT. NOTE(review): this comment used to state SPEED_RESULT - confirm the intended order */
+static int insertWinner(const winnerInfo_t w, const constraint_t targetConstraints) {
+    BMK_benchResult_t r = w.result;
+    winner_ll_node* cur_node = g_winners;
+    /* infeasible results never enter the list */
+    if(!feasible(r, targetConstraints)) {
+        return 1;
+    }
+
+    if(g_winners == NULL) {   /* empty list : w becomes the first node */
+        winner_ll_node* first_node = malloc(sizeof(winner_ll_node));
+        if(first_node == NULL) {
+            return 1;
+        }
+        first_node->next = NULL;
+        first_node->res = w;
+        g_winners = first_node;
+        return 0;
+    }
+
+    while(cur_node->next != NULL) {
+        switch(speedSizeCompare(cur_node->res.result, r)) {
+            case WORSE_RESULT:
+            {
+                return 1; /* w is neither faster nor smaller than an existing node : do not insert */
+            }
+            case BETTER_RESULT:
+            {   /* w beats cur_node : drop its content by pulling the next node in; the same cur_node is compared again */
+                winner_ll_node* tmp;
+                cur_node->res = cur_node->next->res;
+                tmp = cur_node->next;
+                cur_node->next = cur_node->next->next;
+                free(tmp);
+                break;
+            }
+            case SIZE_RESULT:
+            {   /* w is smaller but not faster : its place is further down the list */
+                cur_node = cur_node->next;
+                break;
+            }
+            case SPEED_RESULT: /* w is faster but larger : cur_node takes w, its former content moves to a new node right after it */
+            {
+                winner_ll_node* newnode = malloc(sizeof(winner_ll_node));
+                if(newnode == NULL) {
+                    return 1;
+                }
+                newnode->res = cur_node->res;
+                cur_node->res = w;
+                newnode->next = cur_node->next;
+                cur_node->next = newnode;
+                return 0;
+            }
+        }
+
+    }
+
+    assert(cur_node->next == NULL);   /* cur_node is the last node : same decisions, without a successor to pull from */
+    switch(speedSizeCompare(cur_node->res.result, r)) {
+        case WORSE_RESULT:
+        {
+            return 1; /* w is neither faster nor smaller than the last node : do not insert */
+        }
+        case BETTER_RESULT:
+        {
+            cur_node->res = w;   /* w replaces the last node in place */
+            return 0;
+        }
+        case SIZE_RESULT:
+        {   /* w is smaller but not faster : append it as the new last node */
+            winner_ll_node* newnode = malloc(sizeof(winner_ll_node));
+            if(newnode == NULL) {
+                return 1;
+            }
+            newnode->res = w;
+            newnode->next = NULL;
+            cur_node->next = newnode;
+            return 0;
+        }
+        case SPEED_RESULT: /* w is faster but larger : cur_node takes w, its former content moves to a new last node */
+        {
+            winner_ll_node* newnode = malloc(sizeof(winner_ll_node));
+            if(newnode == NULL) {
+                return 1;
+            }
+            newnode->res = cur_node->res;
+            cur_node->res = w;
+            newnode->next = cur_node->next;
+            cur_node->next = newnode;
+            return 0;
+        }
+        default:
+            return 1;
+    }
+}
+
+/* BMK_printWinnerOpt() : prints result when it beats the best result seen so far (which it then replaces), or always when DEBUG or
+ * g_displayLevel >= 4. In g_optmode with g_optimizer, the result is also offered to g_winners and the whole table is redrawn. */
+static void BMK_printWinnerOpt(FILE* f, const U32 cLevel, const BMK_benchResult_t result, const paramValues_t params, const constraint_t targetConstraints, const size_t srcSize)
+{
+    /* global winner used for constraints */
+                                    /* cSize, cSpeed, dSpeed, cMem */
+    static winnerInfo_t g_winner = { { (size_t)-1LL, 0, 0, (size_t)-1LL }, { { PARAM_UNSET, PARAM_UNSET, PARAM_UNSET, PARAM_UNSET, PARAM_UNSET, PARAM_UNSET, PARAM_UNSET, PARAM_UNSET } } };
+    int const isNewWinner = compareResultLT(g_winner.result, result, targetConstraints, srcSize);   /* evaluated once : its inputs do not change before g_winner is updated below */
+    if(DEBUG || isNewWinner || g_displayLevel >= 4) {
+        if(DEBUG && isNewWinner) {
+            DISPLAY("New Winner: \n");
+        }
+        if(g_displayLevel >= 2) { BMK_printWinner(f, cLevel, result, params, srcSize); }
+        if(isNewWinner) {
+            if(g_displayLevel >= 1) { BMK_translateAdvancedParams(f, params); }
+            g_winner.result = result;
+            g_winner.params = params;
+        }
+    }
+
+    if(g_optmode && g_optimizer && (DEBUG || g_displayLevel == 3)) {
+        winnerInfo_t w;
+        winner_ll_node* n;
+        w.result = result;
+        w.params = params;
+        insertWinner(w, targetConstraints);
+
+        if(!DEBUG) { fprintf(f, "\033c"); }   /* ESC c : resets the terminal, wiping the previously drawn table */
+        fprintf(f, "\n");
+
+        /* the table */
+        fprintf(f, "================================\n");
+        for(n = g_winners; n != NULL; n = n->next) {
+            BMK_displayOneResult(f, n->res, srcSize);
+        }
+        fprintf(f, "================================\n");
+        fprintf(f, "Level Bounds: R: > %.3f AND C: < %.1f MB/s \n\n",
+            (double)srcSize / g_lvltarget.cSize, (double)g_lvltarget.cSpeed / MB_UNIT);
+
+        fprintf(f, "Overall Winner: \n");
+        BMK_displayOneResult(f, g_winner, srcSize);
+        BMK_translateAdvancedParams(f, g_winner.params);
+
+        fprintf(f, "Latest BMK: \n");
+        BMK_displayOneResult(f, w, srcSize);
+    }
+}
+
+static void BMK_printWinners2(FILE* f, const winnerInfo_t* winners, const size_t srcSize)
 {
     int cLevel;
 
@@ -209,7 +868,7 @@
 }
 
 
-static void BMK_printWinners(FILE* f, const winnerInfo_t* winners, size_t srcSize)
+static void BMK_printWinners(FILE* f, const winnerInfo_t* winners, const size_t srcSize)
 {
     fseek(f, 0, SEEK_SET);
     BMK_printWinners2(f, winners, srcSize);
@@ -218,9 +877,770 @@
 }
 
 
+/*-*******************************************************
+*  Functions to Benchmark
+*********************************************************/
+
+/* Arguments handed to local_initCCtx() through its void* payload. */
 typedef struct {
-    double cSpeed_min;
-    double dSpeed_min;
+    ZSTD_CCtx* cctx;                    /* compression context that gets reset and configured */
+    const void* dictBuffer;             /* dictionary content given to ZSTD_CCtx_loadDictionary() */
+    size_t dictBufferSize;              /* size passed alongside dictBuffer */
+    int cLevel;                         /* value set for ZSTD_p_compressionLevel */
+    const paramValues_t* comprParams;   /* per-parameter values; PARAM_UNSET entries are skipped */
+} BMK_initCCtxArgs;
+
+/* Init callback: resets the compression context carried by `payload`
+ * (a BMK_initCCtxArgs*), then applies the compression level, every
+ * explicitly set parameter, and finally the dictionary.
+ * Return values of the zstd calls are not inspected; always returns 0. */
+static size_t local_initCCtx(void* payload) {
+    const BMK_initCCtxArgs* const args = (const BMK_initCCtxArgs*)payload;
+    ZSTD_CCtx* const cctx = args->cctx;
+    const paramValues_t* const params = args->comprParams;
+    varInds_t v;
+
+    ZSTD_CCtx_reset(cctx);
+    ZSTD_CCtx_resetParameters(cctx);
+    ZSTD_CCtx_setParameter(cctx, ZSTD_p_compressionLevel, args->cLevel);
+
+    for (v = 0; v < NUM_PARAMS; v++) {
+        if (params->vals[v] == PARAM_UNSET) { continue; }   /* nothing requested for this parameter */
+        ZSTD_CCtx_setParameter(cctx, cctxSetParamTable[v], params->vals[v]);
+    }
+    ZSTD_CCtx_loadDictionary(cctx, args->dictBuffer, args->dictBufferSize);
+
+    return 0;
+}
+
+/* Arguments handed to local_initDCtx() through its void* payload. */
+typedef struct {
+    ZSTD_DCtx* dctx;            /* decompression context that gets reset */
+    const void* dictBuffer;     /* dictionary content given to ZSTD_DCtx_loadDictionary() */
+    size_t dictBufferSize;      /* size passed alongside dictBuffer */
+} BMK_initDCtxArgs;
+
+/* Init callback: resets the decompression context carried by `payload`
+ * (a BMK_initDCtxArgs*) and loads its dictionary. Always returns 0. */
+static size_t local_initDCtx(void* payload) {
+    const BMK_initDCtxArgs* const args = (const BMK_initDCtxArgs*)payload;
+    ZSTD_DCtx* const dctx = args->dctx;
+
+    ZSTD_DCtx_reset(dctx);
+    ZSTD_DCtx_loadDictionary(dctx, args->dictBuffer, args->dictBufferSize);
+    return 0;
+}
+
+/* Compression callback; `addArgs` is the ZSTD_CCtx* to use.
+ * Feeds the whole source to ZSTD_compress_generic() with ZSTD_e_end until it
+ * reports nothing left to flush.
+ * Returns the number of bytes written to dstBuffer, or a zstd error code. */
+static size_t local_defaultCompress(
+    const void* srcBuffer, size_t srcSize,
+    void* dstBuffer, size_t dstSize,
+    void* addArgs) {
+    ZSTD_CCtx* const cctx = (ZSTD_CCtx*)addArgs;
+    ZSTD_inBuffer input;
+    ZSTD_outBuffer output;
+    size_t remaining;
+
+    input.src = srcBuffer;
+    input.size = srcSize;
+    input.pos = 0;
+    output.dst = dstBuffer;
+    output.size = dstSize;
+    output.pos = 0;
+
+    assert(dstSize == ZSTD_compressBound(srcSize)); /* specific to this version, which is only used in paramgrill */
+    do {
+        /* a full output buffer with data still pending means dst is too small */
+        if (output.pos == output.size) {
+            return (size_t)-ZSTD_error_dstSize_tooSmall;
+        }
+        remaining = ZSTD_compress_generic(cctx, &output, &input, ZSTD_e_end);
+        if (ZSTD_isError(remaining)) {
+            return remaining;
+        }
+    } while (remaining != 0);
+
+    return output.pos;
+}
+
+/* Decompression callback; `addArgs` is the ZSTD_DCtx* to use.
+ * Calls ZSTD_decompress_generic() until it reports nothing left to flush.
+ * Returns the number of bytes written to dstBuffer, or a zstd error code. */
+static size_t local_defaultDecompress(
+    const void* srcBuffer, size_t srcSize,
+    void* dstBuffer, size_t dstSize,
+    void* addArgs) {
+    ZSTD_DCtx* const ctx = (ZSTD_DCtx*)addArgs;
+    ZSTD_inBuffer input;
+    ZSTD_outBuffer output;
+    size_t remaining;
+
+    input.src = srcBuffer;
+    input.size = srcSize;
+    input.pos = 0;
+    output.dst = dstBuffer;
+    output.size = dstSize;
+    output.pos = 0;
+
+    do {
+        /* a full output buffer with data still pending means dst is too small */
+        if (output.pos == output.size) {
+            return (size_t)-ZSTD_error_dstSize_tooSmall;
+        }
+        remaining = ZSTD_decompress_generic(ctx, &output, &input);
+        if (ZSTD_isError(remaining)) {
+            return remaining;
+        }
+    } while (remaining != 0);
+
+    return output.pos;
+}
+
+/*-************************************
+*  Data Initialization Functions
+**************************************/
+
+/* Source data split into blocks, plus the matching compression (dst) and
+ * decompression (res) buffers. Filled in by createBuffersFromMemory(). */
+typedef struct {
+    void* srcBuffer;        /* whole source; released by freeBuffers() */
+    size_t srcSize;         /* sum of all file sizes */
+    const void** srcPtrs;   /* start of each source block inside srcBuffer */
+    size_t* srcSizes;       /* size of each source block */
+    void** dstPtrs;         /* per-block compressed output; all carved from the allocation at dstPtrs[0] */
+    size_t* dstCapacities;  /* ZSTD_compressBound() of the matching source block */
+    size_t* dstSizes;       /* initialised to dstCapacities; handed to the benchmark as the per-block result sizes */
+    void** resPtrs;         /* per-block decompressed output; all carved from the allocation at resPtrs[0] */
+    size_t* resSizes;       /* equal to srcSizes */
+    size_t nbBlocks;        /* number of valid entries in the arrays above */
+    size_t maxBlockSize;    /* largest value in srcSizes */
+} buffers_t;
+
+/* Compression / decompression contexts and the optional dictionary,
+ * set up by createContexts() and released by freeContexts(). */
+typedef struct {
+    size_t dictSize;    /* size of dictBuffer; 0 when no dictionary file was given */
+    void* dictBuffer;   /* dictionary file content, or NULL when no dictionary file was given */
+    ZSTD_CCtx* cctx;    /* created with ZSTD_createCCtx() */
+    ZSTD_DCtx* dctx;    /* created with ZSTD_createDCtx() */
+} contexts_t;
+
+/* Releases every allocation referenced by `b` except b.srcBuffer. */
+static void freeNonSrcBuffers(const buffers_t b) {
+    /* entry [0] of dstPtrs / resPtrs is the base of the allocation that all
+     * other entries point into, so it is the only element to free */
+    void* const dstBase = (b.dstPtrs != NULL) ? b.dstPtrs[0] : NULL;
+    void* const resBase = (b.resPtrs != NULL) ? b.resPtrs[0] : NULL;
+
+    free(b.srcPtrs);
+    free(b.srcSizes);
+
+    free(dstBase);
+    free(b.dstPtrs);
+    free(b.dstCapacities);
+    free(b.dstSizes);
+
+    free(resBase);
+    free(b.resPtrs);
+    free(b.resSizes);
+}
+
+/* Releases everything referenced by `b`; the source buffer is only released
+ * when the block-pointer table is non-NULL. */
+static void freeBuffers(const buffers_t b) {
+    void* const src = (b.srcPtrs != NULL) ? b.srcBuffer : NULL;
+    free(src);
+    freeNonSrcBuffers(b);
+}
+
+/* Splits an in-memory source (nbFiles files stored back to back in srcBuffer,
+ * with sizes in fileSizes) into blocks and allocates the matching compression
+ * and decompression buffers in `buff`.
+ * On success, srcBuffer is stored in buff and will be freed by freeBuffers.
+ * On failure, srcBuffer is not freed here.
+ * @return : 0 on success, 1 on failure (no data, or allocation error). */
+static int createBuffersFromMemory(buffers_t* buff, void * srcBuffer, const size_t nbFiles,
+    const size_t* fileSizes)
+{
+    size_t pos = 0, n, blockSize;
+    U32 maxNbBlocks, blockNb = 0;
+    buff->srcSize = 0;
+    for(n = 0; n < nbFiles; n++) {
+        buff->srcSize += fileSizes[n];
+    }
+
+    if(buff->srcSize == 0) {
+        DISPLAY("No data to bench\n");
+        return 1;
+    }
+
+    /* a zero g_blockSize means "do not split" : one block spans the whole source */
+    blockSize = g_blockSize ? g_blockSize : buff->srcSize;
+    /* upper bound : blocks never cross a file boundary, so each file can add one partial block */
+    maxNbBlocks = (U32) ((buff->srcSize + (blockSize-1)) / blockSize) + (U32)nbFiles;
+
+    /* pointer tables are calloc'ed so that entry [0] is NULL until its buffer is allocated */
+    buff->srcPtrs = (const void**)calloc(maxNbBlocks, sizeof(void*));
+    buff->srcSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    buff->dstPtrs = (void**)calloc(maxNbBlocks, sizeof(void*));
+    buff->dstCapacities = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+    buff->dstSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    buff->resPtrs = (void**)calloc(maxNbBlocks, sizeof(void*));
+    buff->resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    if(!buff->srcPtrs || !buff->srcSizes || !buff->dstPtrs || !buff->dstCapacities || !buff->dstSizes || !buff->resPtrs || !buff->resSizes) {
+        DISPLAY("alloc error\n");
+        freeNonSrcBuffers(*buff);
+        return 1;
+    }
+
+    buff->srcBuffer = srcBuffer;
+    buff->srcPtrs[0] = (const void*)buff->srcBuffer;
+    /* one allocation holds all compressed blocks; the extra 1024 bytes per block
+     * presumably cover the per-block overhead of ZSTD_compressBound() - TODO confirm */
+    buff->dstPtrs[0] = malloc(ZSTD_compressBound(buff->srcSize) + (maxNbBlocks * 1024));
+    buff->resPtrs[0] = malloc(buff->srcSize);
+
+    if(!buff->dstPtrs[0] || !buff->resPtrs[0]) {
+        DISPLAY("alloc error\n");
+        freeNonSrcBuffers(*buff);
+        return 1;
+    }
+
+    /* cut each file into blockSize chunks */
+    for(n = 0; n < nbFiles; n++) {
+        size_t pos_end = pos + fileSizes[n];
+        for(; pos < pos_end; blockNb++) {
+            buff->srcPtrs[blockNb] = (const void*)((char*)srcBuffer + pos);
+            buff->srcSizes[blockNb] = blockSize;
+            pos += blockSize;
+        }
+
+        /* the last block of a file only holds the remainder (between 1 and blockSize bytes) */
+        if(fileSizes[n] > 0) { buff->srcSizes[blockNb - 1] = ((fileSizes[n] - 1) % blockSize) + 1; }
+        /* the loop above may step past the end of the file : realign on the next file's start */
+        pos = pos_end;
+    }
+
+    buff->dstCapacities[0] = ZSTD_compressBound(buff->srcSizes[0]);
+    buff->dstSizes[0] = buff->dstCapacities[0];
+    buff->resSizes[0] = buff->srcSizes[0];
+    buff->maxBlockSize = buff->srcSizes[0];
+
+    /* carve the remaining dst / res blocks out of the allocations made at index 0 */
+    for(n = 1; n < blockNb; n++) {
+        buff->dstPtrs[n] = ((char*)buff->dstPtrs[n-1]) + buff->dstCapacities[n-1];
+        buff->resPtrs[n] = ((char*)buff->resPtrs[n-1]) + buff->resSizes[n-1];
+        buff->dstCapacities[n] = ZSTD_compressBound(buff->srcSizes[n]);
+        buff->dstSizes[n] = buff->dstCapacities[n];
+        buff->resSizes[n] = buff->srcSizes[n];
+
+        buff->maxBlockSize = MAX(buff->maxBlockSize, buff->srcSizes[n]);
+    }
+
+    buff->nbBlocks = blockNb;
+
+    return 0;
+}
+
+/* Loads the listed files into one contiguous buffer and builds `buff` from it
+ * through createBuffersFromMemory().
+ * Directories and files of unknown size are ignored. When the files do not all
+ * fit in memory, the last file that still fits partially is truncated and the
+ * remaining files are not loaded.
+ * @return : 0 on success, non-zero on failure (nothing to load, allocation
+ *           error, unreadable file). On failure nothing is left allocated here. */
+static int createBuffers(buffers_t* buff, const char* const * const fileNamesTable,
+                          size_t nbFiles) {
+    size_t pos = 0;
+    size_t n;
+    size_t totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, (U32)nbFiles);
+    size_t benchedSize = MIN(BMK_findMaxMem(totalSizeToLoad * 3) / 3, totalSizeToLoad);
+    size_t* fileSizes = (size_t*)calloc(nbFiles, sizeof(size_t));   /* zero-filled : skipped files count as empty */
+    void* srcBuffer = NULL;
+    int ret = 0;
+
+    if(!totalSizeToLoad || !benchedSize) {
+        ret = 1;
+        DISPLAY("Nothing to Bench\n");
+        goto _cleanUp;
+    }
+
+    srcBuffer = malloc(benchedSize);
+
+    if(!fileSizes || !srcBuffer) {
+        ret = 1;
+        goto _cleanUp;
+    }
+
+    for(n = 0; n < nbFiles; n++) {
+        FILE* f;
+        U64 fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        if (UTIL_isDirectory(fileNamesTable[n])) {
+            DISPLAY("Ignoring %s directory...       \n", fileNamesTable[n]);
+            continue;
+        }
+        if (fileSize == UTIL_FILESIZE_UNKNOWN) {
+            DISPLAY("Cannot evaluate size of %s, ignoring ... \n", fileNamesTable[n]);
+            continue;
+        }
+        f = fopen(fileNamesTable[n], "rb");
+        if (f==NULL) {
+            /* nothing to close : calling fclose() on a NULL stream is undefined behavior */
+            DISPLAY("impossible to open file %s\n", fileNamesTable[n]);
+            ret = 10;
+            goto _cleanUp;
+        }
+
+        DISPLAYLEVEL(2, "Loading %s...       \r", fileNamesTable[n]);
+
+        if (fileSize + pos > benchedSize) {
+            /* buffer too small - truncate this file and stop after it.
+             * nbFiles becomes n+1 so that the truncated file is still part of
+             * the set handed to createBuffersFromMemory(). */
+            fileSize = benchedSize - pos;
+            nbFiles = n + 1;
+        }
+        {
+            char* buffer = (char*)(srcBuffer);
+            size_t const readSize = fread((buffer)+pos, 1, (size_t)fileSize, f);
+            fclose(f);
+            if (readSize != (size_t)fileSize) {
+                DISPLAY("could not read %s", fileNamesTable[n]);
+                ret = 1;
+                goto _cleanUp;
+            }
+
+            fileSizes[n] = readSize;
+            pos += readSize;
+        }
+    }
+
+    ret = createBuffersFromMemory(buff, srcBuffer, nbFiles, fileSizes);
+
+_cleanUp:
+    /* on success, ownership of srcBuffer has moved into buff */
+    if(ret) { free(srcBuffer); }
+    free(fileSizes);
+    return ret;
+}
+
+/* Releases both zstd contexts and the dictionary buffer held by `ctx`. */
+static void freeContexts(const contexts_t ctx) {
+    ZSTD_freeDCtx(ctx.dctx);
+    ZSTD_freeCCtx(ctx.cctx);
+    free(ctx.dictBuffer);
+}
+
+/* Creates the compression and decompression contexts and, when dictFileName
+ * is not NULL, loads the dictionary file into ctx->dictBuffer.
+ * @return : 0 on success, 1 on failure.
+ * On failure, freeContexts() has already been called on *ctx. */
+static int createContexts(contexts_t* ctx, const char* dictFileName) {
+    FILE* f;
+    size_t readSize;
+    ctx->cctx = ZSTD_createCCtx();
+    ctx->dctx = ZSTD_createDCtx();
+    /* context creation failure is only checked through assert() */
+    assert(ctx->cctx != NULL);
+    assert(ctx->dctx != NULL);
+
+    /* no dictionary requested */
+    if(dictFileName == NULL) {
+        ctx->dictSize = 0;
+        ctx->dictBuffer = NULL;
+        return 0;
+    }
+    {   U64 const dictFileSize = UTIL_getFileSize(dictFileName);
+        assert(dictFileSize != UTIL_FILESIZE_UNKNOWN);
+        ctx->dictSize = dictFileSize;
+        assert((U64)ctx->dictSize == dictFileSize); /* check overflow */
+    }
+    /* allocation result is checked further down, together with the size limit */
+    ctx->dictBuffer = malloc(ctx->dictSize);
+
+    f = fopen(dictFileName, "rb");
+
+    if (f==NULL) {
+        DISPLAY("unable to open file\n");
+        freeContexts(*ctx);
+        return 1;
+    }
+
+    /* note : an allocation failure is reported with the same message as an oversized dictionary */
+    if (ctx->dictSize > 64 MB || !(ctx->dictBuffer)) {
+        DISPLAY("dictionary too large\n");
+        fclose(f);
+        freeContexts(*ctx);
+        return 1;
+    }
+    readSize = fread(ctx->dictBuffer, 1, ctx->dictSize, f);
+    fclose(f);
+    if (readSize != ctx->dictSize) {
+        DISPLAY("unable to read file\n");
+        freeContexts(*ctx);
+        return 1;
+    }
+    return 0;
+}
+
+/*-************************************
+*  Optimizer Memoization Functions
+**************************************/
+
+/* Copies into varNew the entries of varArray that are kept for strategy `strat`,
+ * pruning useless params.
+ * varArray itself is left untouched (it is needed again when iterating over strategies).
+ * @return : number of entries written to varNew */
+static size_t sanitizeVarArray(varInds_t* varNew, const size_t varLength, const varInds_t* varArray, const ZSTD_strategy strat) {
+    size_t src;
+    size_t kept = 0;
+    for (src = 0; src < varLength; src++) {
+        const varInds_t v = varArray[src];
+        const int clogPruned = (v == clog_ind) && (strat == ZSTD_fast);
+        const int slogPruned = (v == slog_ind) && (strat == ZSTD_fast || strat == ZSTD_dfast);
+        const int tlenPruned = (v == tlen_ind)
+                            && (strat != ZSTD_btopt) && (strat != ZSTD_btultra) && (strat != ZSTD_fast);
+        if (clogPruned || slogPruned || tlenPruned) { continue; }
+        varNew[kept] = v;
+        kept++;
+    }
+    return kept;
+}
+
+/* Builds the list of free (PARAM_UNSET) parameters of paramConstraints.
+ * `res` must have room for NUM_PARAMS entries.
+ * fadt_ind is left out when no dictionary is in use.
+ * @return : number of entries written to res */
+static size_t variableParams(const paramValues_t paramConstraints, varInds_t* res, const int usingDictionary) {
+    size_t count = 0;
+    varInds_t v;
+    for (v = 0; v < NUM_PARAMS; v++) {
+        if (paramConstraints.vals[v] != PARAM_UNSET) { continue; }   /* fixed by the caller */
+        if (v == fadt_ind && !usingDictionary) { continue; }         /* don't use fadt if no dictionary */
+        res[count] = v;
+        count++;
+    }
+    return count;
+}
+
+/* Number of memo table slots needed for the given free variables :
+ * the product of their ranges, strategy excluded (one table per strategy). */
+static size_t memoTableLen(const varInds_t* varyParams, const size_t varyLen) {
+    size_t slots = 1;
+    size_t k = 0;
+    while (k < varyLen) {
+        const varInds_t v = varyParams[k];
+        if (v != strt_ind) { slots *= rangetable[v]; }
+        k++;
+    }
+    return slots;
+}
+
+/* Maps a set of compression parameters to its slot in a direct-mapped memo table.
+ * The index is a mixed-radix number whose digits are the range positions of the
+ * free variables; strategy is excluded from the memotable index. */
+static unsigned memoTableIndDirect(const paramValues_t* ptr, const varInds_t* varyParams, const size_t varyLen) {
+    unsigned slot = 0;
+    size_t k;
+    for (k = 0; k < varyLen; k++) {
+        const varInds_t v = varyParams[k];
+        if (v != strt_ind) {
+            slot *= rangetable[v];
+            slot += (unsigned)invRangeMap(v, ptr->vals[v]);
+        }
+    }
+    return slot;
+}
+
+/* Reads the memo value stored for parameters `p` in the table of p's strategy.
+ * @return : the stored byte, or 0 when that strategy has no memo table */
+static size_t memoTableGet(const memoTable_t* memoTableArray, const paramValues_t p) {
+    const memoTable_t mt = memoTableArray[p.vals[strt_ind]];
+    if (mt.tableType == directMap) {
+        return mt.table[memoTableIndDirect(&p, mt.varArray, mt.varLen)];
+    }
+    if (mt.tableType == xxhashMap) {
+        return mt.table[(XXH64(&p.vals, sizeof(U32) * NUM_PARAMS, 0) >> 3) % mt.tableLen];
+    }
+    /* noMemo, or an unexpected table type */
+    return 0;
+}
+
+/* Stores `value` as the memo entry of parameters `p` in the table of p's strategy.
+ * Does nothing when that strategy has no memo table. */
+static void memoTableSet(const memoTable_t* memoTableArray, const paramValues_t p, const BYTE value) {
+    const memoTable_t mt = memoTableArray[p.vals[strt_ind]];
+    if (mt.tableType == directMap) {
+        mt.table[memoTableIndDirect(&p, mt.varArray, mt.varLen)] = value;
+    } else if (mt.tableType == xxhashMap) {
+        mt.table[(XXH64(&p.vals, sizeof(U32) * NUM_PARAMS, 0) >> 3) % mt.tableLen] = value;
+    }
+    /* noMemo : nothing to record */
+}
+
+/* Releases the table of every strategy (indices 1 .. ZSTD_btultra), then the
+ * array itself. Accepts NULL. */
+static void freeMemoTableArray(memoTable_t* const mtAll) {
+    if (mtAll != NULL) {
+        int strat = 1;
+        while (strat <= (int)ZSTD_btultra) {
+            free(mtAll[strat].table);
+            strat++;
+        }
+        free(mtAll);
+    }
+}
+
+/* Allocates and initialises one memo table per strategy (index 0 is unused).
+ * p            : parameter constraints; a set strategy restricts table allocation to that strategy
+ * varyParams   : unsanitized list of free variables, of length varyLen
+ * memoTableLog : 0 disables memoization; otherwise, unless it is PARAM_UNSET, tables needing
+ *                more than (1 << memoTableLog) slots become hash tables of that size
+ * @return : the array, to be released with freeMemoTableArray(), or NULL on allocation failure */
+static memoTable_t* createMemoTableArray(const paramValues_t p, const varInds_t* const varyParams, const size_t varyLen, const U32 memoTableLog) {
+    /* zero-filled, so every .table starts as NULL and can always be passed to free() */
+    memoTable_t* mtAll = (memoTable_t*)calloc(sizeof(memoTable_t),(ZSTD_btultra + 1));
+    ZSTD_strategy i, stratMin = ZSTD_fast, stratMax = ZSTD_btultra;
+
+    if(mtAll == NULL) {
+        return NULL;
+    }
+
+    /* per-strategy list of variables, done for every strategy */
+    for(i = 1; i <= (int)ZSTD_btultra; i++) {
+        mtAll[i].varLen = sanitizeVarArray(mtAll[i].varArray, varyLen, varyParams, i);
+    }
+
+    /* no memoization */
+    if(memoTableLog == 0) {
+        for(i = 1; i <= (int)ZSTD_btultra; i++) {
+            mtAll[i].tableType = noMemo;
+            mtAll[i].table = NULL;
+            mtAll[i].tableLen = 0;
+        }
+        return mtAll;
+    }
+
+
+    /* a fixed strategy only needs its own table */
+    if(p.vals[strt_ind] != PARAM_UNSET) {
+        stratMin = p.vals[strt_ind];
+        stratMax = p.vals[strt_ind];
+    }
+
+
+    for(i = stratMin; i <= stratMax; i++) {
+        size_t mtl = memoTableLen(mtAll[i].varArray, mtAll[i].varLen);
+        mtAll[i].tableType = directMap;
+
+        if(memoTableLog != PARAM_UNSET && mtl > (1ULL << memoTableLog)) { /* use hash table */ /* provide some option to only use hash tables? */
+            mtAll[i].tableType = xxhashMap;
+            mtl = (1ULL << memoTableLog);
+        }
+
+        mtAll[i].table = (BYTE*)calloc(sizeof(BYTE), mtl);
+        mtAll[i].tableLen = mtl;
+
+        if(mtAll[i].table == NULL) {
+            /* releases the tables allocated so far, and the array */
+            freeMemoTableArray(mtAll);
+            return NULL;
+        }
+    }
+
+    return mtAll;
+}
+
+/* Sets pc to a random, not yet measured set of parameters for strategy `st`.
+ * Only the free variables recorded in the strategy's memo table (varArray[0 .. varLen-1])
+ * are randomized; every other parameter keeps the value already present in pc.
+ * Makes at most tableLen attempts at finding a combination whose memo entry is
+ * still 0, and keeps the last draw if none is found. */
+static void randomConstrainedParams(paramValues_t* pc, const memoTable_t* memoTableArray, const ZSTD_strategy st)
+{
+    size_t attempt;
+    const memoTable_t mt = memoTableArray[st];
+    pc->vals[strt_ind] = st;
+    for(attempt = 0; attempt < mt.tableLen; attempt++) {
+        size_t i;
+        /* stop at varLen : entries of varArray beyond it are not free variables,
+         * and reading them would randomize a parameter the caller has constrained */
+        for(i = 0; i < mt.varLen; i++) {
+            varInds_t v = mt.varArray[i];
+            if(v == strt_ind) continue;
+            pc->vals[v] = rangeMap(v, FUZ_rand(&g_rand) % rangetable[v]);
+        }
+
+        if(!(memoTableGet(memoTableArray, *pc))) break; /* only pick unpicked params. */
+    }
+}
+
+/*-************************************
+*  Benchmarking Functions
+**************************************/
+
+/* Replicate functionality of benchMemAdvanced, but with pre-split src / dst buffers */
+/* The purpose is so that sufficient information is returned so that a decompression call to benchMemInvertible is possible */
+/* BMK_benchMemAdvanced(srcBuffer,srcSize, dstBuffer, dstSize, fileSizes, nbFiles, 0, &cParams, dictBuffer, dictSize, ctx, dctx, 0, "File", &adv); */
+/* nbSeconds used in same way as in BMK_advancedParams_t */
+/* if in decodeOnly, then srcPtr's will be compressed blocks, and uncompressedBlocks will be written to dstPtrs */
+/* dictionary nullable, nothing else though. */
+/* note : it would be better if this function was in bench.c, sharing code with benchMemAdvanced(), since it's technically a part of it */
+/* @return : an outcome with tag 0 wrapping the measured result on success,
+ *           or a zeroed outcome with tag 1 as soon as one timed run fails.
+ * The reported cMem is (1 << windowLog) + ZSTD_sizeof_CCtx(cctx). */
+static BMK_benchOutcome_t
+BMK_benchMemInvertible( buffers_t buf, contexts_t ctx,
+                        int cLevel, const paramValues_t* comprParams,
+                        BMK_mode_t mode, unsigned nbSeconds)
+{
+    U32 i;
+    BMK_benchResult_t bResult;
+    const void *const *const srcPtrs = (const void *const *const)buf.srcPtrs;
+    size_t const *const srcSizes = buf.srcSizes;
+    void** const dstPtrs = buf.dstPtrs;
+    size_t const *const dstCapacities = buf.dstCapacities;
+    size_t* const dstSizes = buf.dstSizes;
+    void** const resPtrs = buf.resPtrs;
+    size_t const *const resSizes = buf.resSizes;
+    const void* dictBuffer = ctx.dictBuffer;
+    const size_t dictBufferSize = ctx.dictSize;
+    const size_t nbBlocks = buf.nbBlocks;
+    const size_t srcSize = buf.srcSize;
+    ZSTD_CCtx* cctx = ctx.cctx;
+    ZSTD_DCtx* dctx = ctx.dctx;
+
+    /* init */
+    memset(&bResult, 0, sizeof(bResult));
+
+    /* warming up memory */
+    for (i = 0; i < buf.nbBlocks; i++) {
+        if (mode != BMK_decodeOnly) {
+            RDG_genBuffer(dstPtrs[i], dstCapacities[i], 0.10, 0.50, 1);
+        } else {
+            RDG_genBuffer(resPtrs[i], resSizes[i], 0.10, 0.50, 1);
+        }
+    }
+
+    /* Bench */
+    {
+        /* init args */
+        /* a phase that is not part of `mode` starts out as already completed */
+        int compressionCompleted = (mode == BMK_decodeOnly);
+        int decompressionCompleted = (mode == BMK_compressOnly);
+        BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(nbSeconds * 1000, 1000);
+        BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(nbSeconds * 1000, 1000);
+        BMK_initCCtxArgs cctxprep;
+        BMK_initDCtxArgs dctxprep;
+        cctxprep.cctx = cctx;
+        cctxprep.dictBuffer = dictBuffer;
+        cctxprep.dictBufferSize = dictBufferSize;
+        cctxprep.cLevel = cLevel;
+        cctxprep.comprParams = comprParams;
+        dctxprep.dctx = dctx;
+        dctxprep.dictBuffer = dictBuffer;
+        dctxprep.dictBufferSize = dictBufferSize;
+
+        assert(timeStateCompress != NULL);
+        assert(timeStateDecompress != NULL);
+        while(!compressionCompleted) {
+            /* compressed sizes are written into dstSizes, which the decompression phase reads back */
+            BMK_runOutcome_t const cOutcome = BMK_benchTimedFn(timeStateCompress,
+                                            &local_defaultCompress, cctx,
+                                            &local_initCCtx, &cctxprep,
+                                            nbBlocks,
+                                            srcPtrs, srcSizes,
+                                            dstPtrs, dstCapacities,
+                                            dstSizes);
+
+            if (!BMK_isSuccessful_runOutcome(cOutcome)) {
+                BMK_benchOutcome_t bOut;
+                memset(&bOut, 0, sizeof(bOut));
+                bOut.tag = 1;   /* should rather be a function or a constant */
+                BMK_freeTimedFnState(timeStateCompress);
+                BMK_freeTimedFnState(timeStateDecompress);
+                return bOut;
+            }
+            {   BMK_runTime_t const rResult = BMK_extract_runTime(cOutcome);
+                bResult.cSpeed = (srcSize * TIMELOOP_NANOSEC) / rResult.nanoSecPerRun;
+                bResult.cSize = rResult.sumOfReturn;
+            }
+            compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
+        }
+
+        while (!decompressionCompleted) {
+            BMK_runOutcome_t const dOutcome = BMK_benchTimedFn(timeStateDecompress,
+                                        &local_defaultDecompress, dctx,
+                                        &local_initDCtx, &dctxprep,
+                                        nbBlocks,
+                                        (const void* const*)dstPtrs, dstSizes,
+                                        resPtrs, resSizes,
+                                        NULL);
+
+            if (!BMK_isSuccessful_runOutcome(dOutcome)) {
+                BMK_benchOutcome_t bOut;
+                memset(&bOut, 0, sizeof(bOut));
+                bOut.tag = 1;   /* should rather be a function or a constant */
+                BMK_freeTimedFnState(timeStateCompress);
+                BMK_freeTimedFnState(timeStateDecompress);
+                return bOut;
+            }
+            {   BMK_runTime_t const rResult = BMK_extract_runTime(dOutcome);
+                bResult.dSpeed = (srcSize * TIMELOOP_NANOSEC) / rResult.nanoSecPerRun;
+            }
+            decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
+        }
+
+        BMK_freeTimedFnState(timeStateCompress);
+        BMK_freeTimedFnState(timeStateDecompress);
+    }
+
+    /* Memory usage */
+    /* shift a size_t, not an int : (1 << 31) overflows a 32-bit int */
+    bResult.cMem = ((size_t)1 << (comprParams->vals[wlog_ind])) + ZSTD_sizeof_CCtx(cctx);
+
+    {   BMK_benchOutcome_t bOut;
+        bOut.tag = 0;
+        bOut.internal_never_use_directly = bResult;  /* should be a function */
+        return bOut;
+    }
+}
+
+/* Benchmarks cParams in both directions (BMK_both) with a 3 second budget,
+ * at BASE_CLEVEL.
+ * @return : 0 on success, with the measurement stored in *resultPtr;
+ *           1 on failure, in which case *resultPtr is left untouched */
+static int BMK_benchParam ( BMK_benchResult_t* resultPtr,
+                            buffers_t buf, contexts_t ctx,
+                            paramValues_t cParams)
+{
+    BMK_benchOutcome_t const outcome =
+        BMK_benchMemInvertible(buf, ctx, BASE_CLEVEL, &cParams, BMK_both, 3);
+
+    if (BMK_isSuccessful_benchOutcome(outcome)) {
+        *resultPtr = BMK_extract_benchResult(outcome);
+        return 0;
+    }
+    return 1;
+}
+
+
+/* CBENCHMARK() : conditional benchmark run.
+ * When `conditional` is true, runs BMK_benchMemInvertible() in `mode` for `sec` seconds
+ * and copies the measured fields relevant to `mode` into `resultvar`
+ * (cSpeed / cSize / cMem unless decode-only, dSpeed unless compress-only).
+ * Expects `buf`, `ctx` and `cParams` to be in scope at the expansion site.
+ * On benchmark failure, makes the enclosing function return ERROR_RESULT.
+ * `tmpret` is not referenced by the expansion. */
+#define CBENCHMARK(conditional, resultvar, tmpret, mode, sec) {                                                 \
+    if(conditional) {                                                                                           \
+        BMK_benchOutcome_t const outcome = BMK_benchMemInvertible(buf, ctx, BASE_CLEVEL, &cParams, mode, sec);  \
+        if (!BMK_isSuccessful_benchOutcome(outcome)) {                                                          \
+            DEBUGOUTPUT("Benchmarking failed\n");                                                               \
+            return ERROR_RESULT;                                                                                \
+        }                                                                                                       \
+        {   BMK_benchResult_t const tmpResult = BMK_extract_benchResult(outcome);                               \
+            if (mode != BMK_decodeOnly)  {                                                                      \
+                resultvar.cSpeed = tmpResult.cSpeed;                                                            \
+                resultvar.cSize = tmpResult.cSize;                                                              \
+                resultvar.cMem = tmpResult.cMem;                                                                \
+            }                                                                                                   \
+            if (mode != BMK_compressOnly) { resultvar.dSpeed = tmpResult.dSpeed; }                              \
+    }   }                                                                                                       \
+}
+
+/* Benchmarking which stops when we are sufficiently sure the solution is infeasible / worse than the winner */
+/* VARIANCE : extra multiplicative margin applied to measured speeds when building the optimistic estimate */
+#define VARIANCE 1.2
+/* allBench() :
+ * Benchmarks cParams (both directions, 2 second budget) and compares the outcome with *winnerResult.
+ * resultPtr    : receives the measured result whenever the benchmark itself succeeds
+ * winnerResult : current best result; only read here
+ * feas         : non-zero => compare with feasible() / compareResultLT();
+ *                zero     => compare with resultScore()
+ * @return : BETTER_RESULT, WORSE_RESULT, or ERROR_RESULT when benchmarking failed */
+static int allBench(BMK_benchResult_t* resultPtr,
+                const buffers_t buf, const contexts_t ctx,
+                const paramValues_t cParams,
+                const constraint_t target,
+                BMK_benchResult_t* winnerResult, int feas)
+{
+    BMK_benchResult_t benchres;
+    /* fallback uncertainty factors, used when the measured speed is 0 */
+    double uncertaintyConstantC = 3., uncertaintyConstantD = 3.;
+    double winnerRS;
+
+    BMK_benchOutcome_t const outcome = BMK_benchMemInvertible(buf, ctx, BASE_CLEVEL, &cParams, BMK_both, 2);
+    if (!BMK_isSuccessful_benchOutcome(outcome)) {
+        DEBUGOUTPUT("Benchmarking failed \n");
+        return ERROR_RESULT;
+    }
+    benchres = BMK_extract_benchResult(outcome);
+
+    winnerRS = resultScore(*winnerResult, buf.srcSize, target);
+    DEBUGOUTPUT("WinnerScore: %f \n ", winnerRS);
+
+    *resultPtr = benchres;
+
+    /* anything with worse ratio in feas is definitely worse, discard */
+    /* NOTE(review): the test below discards results whose cSize is SMALLER than the winner's,
+     * i.e. a better ratio, which looks inverted relative to the comment above - confirm intent */
+    if(feas && benchres.cSize < winnerResult->cSize && !g_optmode) {
+        return WORSE_RESULT;
+    }
+
+    /* calculate uncertainty in compression / decompression runs */
+    /* factor = (loop duration + 2 clock granularities) / loop duration */
+    if (benchres.cSpeed) {
+        U64 const loopDurationC = (((U64)buf.srcSize * TIMELOOP_NANOSEC) / benchres.cSpeed);
+        uncertaintyConstantC = ((loopDurationC + (double)(2 * g_clockGranularity))/loopDurationC);
+    }
+
+    if (benchres.dSpeed) {
+        U64 const loopDurationD = (((U64)buf.srcSize * TIMELOOP_NANOSEC) / benchres.dSpeed);
+        uncertaintyConstantD = ((loopDurationD + (double)(2 * g_clockGranularity))/loopDurationD);
+    }
+
+    /* optimistic assumption of benchres */
+    {   BMK_benchResult_t resultMax = benchres;
+        resultMax.cSpeed *= uncertaintyConstantC * VARIANCE;
+        resultMax.dSpeed *= uncertaintyConstantD * VARIANCE;
+
+        /* disregard infeasible results in feas mode */
+        /* disregard if resultMax < winner in infeas mode */
+        if((feas && !feasible(resultMax, target)) ||
+          (!feas && (winnerRS > resultScore(resultMax, buf.srcSize, target)))) {
+            return WORSE_RESULT;
+        }
+    }
+
+    /* compare by resultScore when in infeas */
+    /* compare by compareResultLT when in feas */
+    if((!feas && (resultScore(benchres, buf.srcSize, target) > resultScore(*winnerResult, buf.srcSize, target))) ||
+       (feas && (compareResultLT(*winnerResult, benchres, target, buf.srcSize))) )  {
+        return BETTER_RESULT;
+    } else {
+        return WORSE_RESULT;
+    }
+}
+
+
+/* memo values at or above this threshold mark parameters that must not be benchmarked again */
+#define INFEASIBLE_THRESHOLD 200
+/* Memoized benchmarking, won't benchmark anything which has already been benchmarked before. */
+/* Parameters flagged in the memo table, or reported redundant by redundantParams(),
+ * are answered with WORSE_RESULT without running anything.
+ * Otherwise runs allBench(), prints the result, and flags the parameters in the
+ * memo table when the result is better or when running in feasible mode.
+ * @return : the outcome of allBench(), or WORSE_RESULT when skipped */
+static int benchMemo(BMK_benchResult_t* resultPtr,
+                const buffers_t buf, const contexts_t ctx,
+                const paramValues_t cParams,
+                const constraint_t target,
+                BMK_benchResult_t* winnerResult, memoTable_t* const memoTableArray,
+                const int feas) {
+    static int bmcount = 0;   /* number of benchmarks actually run through this function */
+    int res;
+
+    if ( memoTableGet(memoTableArray, cParams) >= INFEASIBLE_THRESHOLD
+      || redundantParams(cParams, target, buf.maxBlockSize) ) {
+        return WORSE_RESULT;
+    }
+
+    res = allBench(resultPtr, buf, ctx, cParams, target, winnerResult, feas);
+
+    if(DEBUG && !(bmcount % 250)) {
+        DISPLAY("Count: %d\n", bmcount);
+    }
+    bmcount++;   /* counted on every run, so the progress line shows up every 250 benchmarks */
+    BMK_printWinnerOpt(stdout, CUSTOM_LEVEL, *resultPtr, cParams, target, buf.srcSize);
+
+    if(res == BETTER_RESULT || feas) {
+        memoTableSet(memoTableArray, cParams, 255); /* what happens if collisions are frequent */
+    }
+    return res;
+}
+
+
+/* Per-level limits : speed floors (cSpeed_min, dSpeed_min) and upper bounds on
+ * window log and strategy (windowLog_max, strategy_max). */
+typedef struct {
+    U64 cSpeed_min;   /* presumably the minimum compression speed for the level - confirm against its use */
+    U64 dSpeed_min;   /* candidates with a lower dSpeed are rejected as not fast enough for the level */
     U32 windowLog_max;
     ZSTD_strategy strategy_max;
 } level_constraints_t;
@@ -246,15 +1666,14 @@
     }   }
 }
 
-static int BMK_seed(winnerInfo_t* winners, const ZSTD_compressionParameters params,
-              const void* srcBuffer, size_t srcSize,
-                    ZSTD_CCtx* ctx, ZSTD_DCtx* dctx)
+static int BMK_seed(winnerInfo_t* winners, const paramValues_t params,
+                    const buffers_t buf, const contexts_t ctx)
 {
-    BMK_result_t testResult;
+    BMK_benchResult_t testResult;
     int better = 0;
     int cLevel;
 
-    BMK_benchParam(&testResult, srcBuffer, srcSize, ctx, dctx, params);
+    BMK_benchParam(&testResult, buf, ctx, params);
 
 
     for (cLevel = 1; cLevel <= NB_LEVELS_TRACKED; cLevel++) {
@@ -262,32 +1681,32 @@
             continue;   /* not fast enough for this level */
         if (testResult.dSpeed < g_level_constraint[cLevel].dSpeed_min)
             continue;   /* not fast enough for this level */
-        if (params.windowLog > g_level_constraint[cLevel].windowLog_max)
+        if (params.vals[wlog_ind] > g_level_constraint[cLevel].windowLog_max)
             continue;   /* too much memory for this level */
-        if (params.strategy > g_level_constraint[cLevel].strategy_max)
+        if (params.vals[strt_ind] > g_level_constraint[cLevel].strategy_max)
             continue;   /* forbidden strategy for this level */
         if (winners[cLevel].result.cSize==0) {
             /* first solution for this cLevel */
             winners[cLevel].result = testResult;
             winners[cLevel].params = params;
-            BMK_printWinner(stdout, cLevel, testResult, params, srcSize);
+            BMK_printWinner(stdout, cLevel, testResult, params, buf.srcSize);
             better = 1;
             continue;
         }
 
         if ((double)testResult.cSize <= ((double)winners[cLevel].result.cSize * (1. + (0.02 / cLevel))) ) {
             /* Validate solution is "good enough" */
-            double W_ratio = (double)srcSize / testResult.cSize;
-            double O_ratio = (double)srcSize / winners[cLevel].result.cSize;
+            double W_ratio = (double)buf.srcSize / testResult.cSize;
+            double O_ratio = (double)buf.srcSize / winners[cLevel].result.cSize;
             double W_ratioNote = log (W_ratio);
             double O_ratioNote = log (O_ratio);
-            size_t W_DMemUsed = (1 << params.windowLog) + (16 KB);
-            size_t O_DMemUsed = (1 << winners[cLevel].params.windowLog) + (16 KB);
+            size_t W_DMemUsed = (1 << params.vals[wlog_ind]) + (16 KB);
+            size_t O_DMemUsed = (1 << winners[cLevel].params.vals[wlog_ind]) + (16 KB);
             double W_DMemUsed_note = W_ratioNote * ( 40 + 9*cLevel) - log((double)W_DMemUsed);
             double O_DMemUsed_note = O_ratioNote * ( 40 + 9*cLevel) - log((double)O_DMemUsed);
 
-            size_t W_CMemUsed = (1 << params.windowLog) + ZSTD_estimateCCtxSize_usingCParams(params);
-            size_t O_CMemUsed = (1 << winners[cLevel].params.windowLog) + ZSTD_estimateCCtxSize_usingCParams(winners[cLevel].params);
+            size_t W_CMemUsed = (1 << params.vals[wlog_ind]) + ZSTD_estimateCCtxSize_usingCParams(pvalsToCParams(params));
+            size_t O_CMemUsed = (1 << winners[cLevel].params.vals[wlog_ind]) + ZSTD_estimateCCtxSize_usingCParams(pvalsToCParams(winners[cLevel].params));
             double W_CMemUsed_note = W_ratioNote * ( 50 + 13*cLevel) - log((double)W_CMemUsed);
             double O_CMemUsed_note = O_ratioNote * ( 50 + 13*cLevel) - log((double)O_CMemUsed);
 
@@ -317,16 +1736,16 @@
                 /* too large compression speed difference for the compression benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Compression Speed : %5.3f @ %4.1f MB/s  vs  %5.3f @ %4.1f MB/s   : not enough for level %i\n",
-                         W_ratio, testResult.cSpeed / 1000000,
-                         O_ratio, winners[cLevel].result.cSpeed / 1000000.,   cLevel);
+                         W_ratio, (double)testResult.cSpeed / MB_UNIT,
+                         O_ratio, (double)winners[cLevel].result.cSpeed / MB_UNIT,   cLevel);
                 continue;
             }
             if (W_DSpeed_note   < O_DSpeed_note  ) {
                 /* too large decompression speed difference for the compression benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Decompression Speed : %5.3f @ %4.1f MB/s  vs  %5.3f @ %4.1f MB/s   : not enough for level %i\n",
-                         W_ratio, testResult.dSpeed / 1000000.,
-                         O_ratio, winners[cLevel].result.dSpeed / 1000000.,   cLevel);
+                         W_ratio, (double)testResult.dSpeed / MB_UNIT,
+                         O_ratio, (double)winners[cLevel].result.dSpeed / MB_UNIT,   cLevel);
                 continue;
             }
 
@@ -335,7 +1754,7 @@
 
             winners[cLevel].result = testResult;
             winners[cLevel].params = params;
-            BMK_printWinner(stdout, cLevel, testResult, params, srcSize);
+            BMK_printWinner(stdout, cLevel, testResult, params, buf.srcSize);
 
             better = 1;
     }   }
@@ -343,158 +1762,72 @@
     return better;
 }
 
-
-/* nullified useless params, to ensure count stats */
-static ZSTD_compressionParameters* sanitizeParams(ZSTD_compressionParameters params)
-{
-    g_params = params;
-    if (params.strategy == ZSTD_fast)
-        g_params.chainLog = 0, g_params.searchLog = 0;
-    if (params.strategy == ZSTD_dfast)
-        g_params.searchLog = 0;
-    if (params.strategy != ZSTD_btopt && params.strategy != ZSTD_btultra)
-        g_params.targetLength = 0;
-    return &g_params;
-}
-
-
-static void paramVariation(ZSTD_compressionParameters* ptr)
-{
-    ZSTD_compressionParameters p;
-    U32 validated = 0;
-    while (!validated) {
-        U32 nbChanges = (FUZ_rand(&g_rand) & 3) + 1;
-        p = *ptr;
-        for ( ; nbChanges ; nbChanges--) {
-            const U32 changeID = FUZ_rand(&g_rand) % 14;
-            switch(changeID)
-            {
-            case 0:
-                p.chainLog++; break;
-            case 1:
-                p.chainLog--; break;
-            case 2:
-                p.hashLog++; break;
-            case 3:
-                p.hashLog--; break;
-            case 4:
-                p.searchLog++; break;
-            case 5:
-                p.searchLog--; break;
-            case 6:
-                p.windowLog++; break;
-            case 7:
-                p.windowLog--; break;
-            case 8:
-                p.searchLength++; break;
-            case 9:
-                p.searchLength--; break;
-            case 10:
-                p.strategy = (ZSTD_strategy)(((U32)p.strategy)+1); break;
-            case 11:
-                p.strategy = (ZSTD_strategy)(((U32)p.strategy)-1); break;
-            case 12:
-                p.targetLength *= 1 + ((double)(FUZ_rand(&g_rand)&255)) / 256.; break;
-            case 13:
-                p.targetLength /= 1 + ((double)(FUZ_rand(&g_rand)&255)) / 256.; break;
-            }
-        }
-        validated = !ZSTD_isError(ZSTD_checkCParams(p));
-    }
-    *ptr = p;
-}
-
+/*-************************************
+*  Compression Level Table Generation Functions
+**************************************/
 
 #define PARAMTABLELOG   25
 #define PARAMTABLESIZE (1<<PARAMTABLELOG)
 #define PARAMTABLEMASK (PARAMTABLESIZE-1)
 static BYTE g_alreadyTested[PARAMTABLESIZE] = {0};   /* init to zero */
 
-#define NB_TESTS_PLAYED(p) \
-    g_alreadyTested[(XXH64(sanitizeParams(p), sizeof(p), 0) >> 3) & PARAMTABLEMASK]
-
+static BYTE* NB_TESTS_PLAYED(paramValues_t p) {
+    ZSTD_compressionParameters p2 = pvalsToCParams(sanitizeParams(p));
+    return &g_alreadyTested[(XXH64((void*)&p2, sizeof(p2), 0) >> 3) & PARAMTABLEMASK];
+}
 
 static void playAround(FILE* f, winnerInfo_t* winners,
-                       ZSTD_compressionParameters params,
-                       const void* srcBuffer, size_t srcSize,
-                       ZSTD_CCtx* ctx, ZSTD_DCtx* dctx)
+                       paramValues_t p,
+                       const buffers_t buf, const contexts_t ctx)
 {
-    int nbVariations = 0;
+    int nbVariations = 0, i;
     UTIL_time_t const clockStart = UTIL_getTime();
 
     while (UTIL_clockSpanMicro(clockStart) < g_maxVariationTime) {
-        ZSTD_compressionParameters p = params;
+        BYTE* b;
 
         if (nbVariations++ > g_maxNbVariations) break;
-        paramVariation(&p);
+
+        do { for(i = 0; i < 4; i++) { paramVaryOnce(FUZ_rand(&g_rand) % (strt_ind + 1), ((FUZ_rand(&g_rand) & 1) << 1) - 1, &p); } }
+        while(!paramValid(p));
 
         /* exclude faster if already played params */
-        if (FUZ_rand(&g_rand) & ((1 << NB_TESTS_PLAYED(p))-1))
+        if (FUZ_rand(&g_rand) & ((1 << *NB_TESTS_PLAYED(p))-1))
             continue;
 
         /* test */
-        NB_TESTS_PLAYED(p)++;
-        if (!BMK_seed(winners, p, srcBuffer, srcSize, ctx, dctx)) continue;
+        b = NB_TESTS_PLAYED(p);
+        (*b)++;
+        if (!BMK_seed(winners, p, buf, ctx)) continue;
 
         /* improvement found => search more */
-        BMK_printWinners(f, winners, srcSize);
-        playAround(f, winners, p, srcBuffer, srcSize, ctx, dctx);
+        BMK_printWinners(f, winners, buf.srcSize);
+        playAround(f, winners, p, buf, ctx);
     }
 
 }
 
-
-static ZSTD_compressionParameters randomParams(void)
-{
-    ZSTD_compressionParameters p;
-    U32 validated = 0;
-    while (!validated) {
-        /* totally random entry */
-        p.chainLog   = (FUZ_rand(&g_rand) % (ZSTD_CHAINLOG_MAX+1 - ZSTD_CHAINLOG_MIN)) + ZSTD_CHAINLOG_MIN;
-        p.hashLog    = (FUZ_rand(&g_rand) % (ZSTD_HASHLOG_MAX+1 - ZSTD_HASHLOG_MIN)) + ZSTD_HASHLOG_MIN;
-        p.searchLog  = (FUZ_rand(&g_rand) % (ZSTD_SEARCHLOG_MAX+1 - ZSTD_SEARCHLOG_MIN)) + ZSTD_SEARCHLOG_MIN;
-        p.windowLog  = (FUZ_rand(&g_rand) % (ZSTD_WINDOWLOG_MAX+1 - ZSTD_WINDOWLOG_MIN)) + ZSTD_WINDOWLOG_MIN;
-        p.searchLength=(FUZ_rand(&g_rand) % (ZSTD_SEARCHLENGTH_MAX+1 - ZSTD_SEARCHLENGTH_MIN)) + ZSTD_SEARCHLENGTH_MIN;
-        p.targetLength=(FUZ_rand(&g_rand) % (512));
-        p.strategy   = (ZSTD_strategy) (FUZ_rand(&g_rand) % (ZSTD_btultra +1));
-        validated = !ZSTD_isError(ZSTD_checkCParams(p));
-    }
-    return p;
-}
-
 static void BMK_selectRandomStart(
                        FILE* f, winnerInfo_t* winners,
-                       const void* srcBuffer, size_t srcSize,
-                       ZSTD_CCtx* ctx, ZSTD_DCtx* dctx)
+                       const buffers_t buf, const contexts_t ctx)
 {
     U32 const id = FUZ_rand(&g_rand) % (NB_LEVELS_TRACKED+1);
-    if ((id==0) || (winners[id].params.windowLog==0)) {
+    if ((id==0) || (winners[id].params.vals[wlog_ind]==0)) {
         /* use some random entry */
-        ZSTD_compressionParameters const p = ZSTD_adjustCParams(randomParams(), srcSize, 0);
-        playAround(f, winners, p, srcBuffer, srcSize, ctx, dctx);
+        paramValues_t const p = adjustParams(cParamsToPVals(pvalsToCParams(randomParams())), /* defaults nonCompression parameters */
+            buf.srcSize, 0);
+        playAround(f, winners, p, buf, ctx);
     } else {
-        playAround(f, winners, winners[id].params, srcBuffer, srcSize, ctx, dctx);
+        playAround(f, winners, winners[id].params, buf, ctx);
     }
 }
 
-
-static void BMK_benchOnce(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx, const void* srcBuffer, size_t srcSize)
+static void BMK_benchFullTable(const buffers_t buf, const contexts_t ctx)
 {
-    BMK_result_t testResult;
-    g_params = ZSTD_adjustCParams(g_params, srcSize, 0);
-    BMK_benchParam(&testResult, srcBuffer, srcSize, cctx, dctx, g_params);
-    DISPLAY("Compression Ratio: %.3f  Compress Speed: %.1f MB/s Decompress Speed: %.1f MB/s\n", (double)srcSize / testResult.cSize, 
-        testResult.cSpeed / 1000000, testResult.dSpeed / 1000000);
-    return;
-}
-
-static void BMK_benchFullTable(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx, const void* srcBuffer, size_t srcSize)
-{
-    ZSTD_compressionParameters params;
+    paramValues_t params;
     winnerInfo_t winners[NB_LEVELS_TRACKED+1];
     const char* const rfName = "grillResults.txt";
     FILE* const f = fopen(rfName, "w");
-    const size_t blockSize = g_blockSize ? g_blockSize : srcSize;   /* cut by block or not ? */
 
     /* init */
     assert(g_singleRun==0);
@@ -502,12 +1835,12 @@
     if (f==NULL) { DISPLAY("error opening %s \n", rfName); exit(1); }
 
     if (g_target) {
-        BMK_init_level_constraints(g_target*1000000);
+        BMK_init_level_constraints(g_target * MB_UNIT);
     } else {
         /* baseline config for level 1 */
-        ZSTD_compressionParameters const l1params = ZSTD_getCParams(1, blockSize, 0);
-        BMK_result_t testResult;
-        BMK_benchParam(&testResult, srcBuffer, srcSize, cctx, dctx, l1params);
+        paramValues_t const l1params = cParamsToPVals(ZSTD_getCParams(1, buf.maxBlockSize, ctx.dictSize));
+        BMK_benchResult_t testResult;
+        BMK_benchParam(&testResult, buf, ctx, l1params);
         BMK_init_level_constraints((int)((testResult.cSpeed * 31) / 32));
     }
 
@@ -515,251 +1848,544 @@
     {   const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
         int i;
         for (i=0; i<=maxSeeds; i++) {
-            params = ZSTD_getCParams(i, blockSize, 0);
-            BMK_seed(winners, params, srcBuffer, srcSize, cctx, dctx);
+            params = cParamsToPVals(ZSTD_getCParams(i, buf.maxBlockSize, 0));
+            BMK_seed(winners, params, buf, ctx);
     }   }
-    BMK_printWinners(f, winners, srcSize);
+    BMK_printWinners(f, winners, buf.srcSize);
 
     /* start tests */
-    {   const time_t grillStart = time(NULL);
+    {   const UTIL_time_t grillStart = UTIL_getTime();
         do {
-            BMK_selectRandomStart(f, winners, srcBuffer, srcSize, cctx, dctx);
-        } while (BMK_timeSpan(grillStart) < g_grillDuration_s);
+            BMK_selectRandomStart(f, winners, buf, ctx);
+        } while (BMK_timeSpan(grillStart) < g_timeLimit_s);
     }
 
     /* end summary */
-    BMK_printWinners(f, winners, srcSize);
+    BMK_printWinners(f, winners, buf.srcSize);
     DISPLAY("grillParams operations completed \n");
 
     /* clean up*/
     fclose(f);
 }
 
-static void BMK_benchMem_usingCCtx(ZSTD_CCtx* const cctx, ZSTD_DCtx* const dctx, const void* srcBuffer, size_t srcSize)
-{
-    if (g_singleRun)
-        return BMK_benchOnce(cctx, dctx, srcBuffer, srcSize);
-    else
-        return BMK_benchFullTable(cctx, dctx, srcBuffer, srcSize);
-}
 
-static void BMK_benchMemCCtxInit(const void* srcBuffer, size_t srcSize)
-{
-    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
-    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
-    if (cctx==NULL || dctx==NULL) { DISPLAY("Context Creation failed \n"); exit(1); }
-    BMK_benchMem_usingCCtx(cctx, dctx, srcBuffer, srcSize);
-    ZSTD_freeCCtx(cctx);
-}
+/*-************************************
+*  Single Benchmark Functions
+**************************************/
 
+static int benchOnce(const buffers_t buf, const contexts_t ctx, const int cLevel) {
+    BMK_benchResult_t testResult;
+    g_params = adjustParams(overwriteParams(cParamsToPVals(ZSTD_getCParams(cLevel, buf.maxBlockSize, ctx.dictSize)), g_params), buf.maxBlockSize, ctx.dictSize);
 
-static int benchSample(void)
-{
-    const char* const name = "Sample 10MB";
-    size_t const benchedSize = 10000000;
+    if (BMK_benchParam(&testResult, buf, ctx, g_params)) {
+        DISPLAY("Error during benchmarking\n");
+        return 1;
+    }
 
-    void* origBuff = malloc(benchedSize);
-    if (!origBuff) { perror("not enough memory"); return 12; }
+    BMK_printWinner(stdout, CUSTOM_LEVEL, testResult, g_params, buf.srcSize);
 
-    /* Fill buffer */
-    RDG_genBuffer(origBuff, benchedSize, g_compressibility, 0.0, 0);
-
-    /* bench */
-    DISPLAY("\r%79s\r", "");
-    DISPLAY("using %s %i%%: \n", name, (int)(g_compressibility*100));
-    BMK_benchMemCCtxInit(origBuff, benchedSize);
-
-    free(origBuff);
     return 0;
 }
 
+static int benchSample(double compressibility, int cLevel)
+{
+    const char* const name = "Sample 10MB";
+    size_t const benchedSize = 10 MB;
+    void* const srcBuffer = malloc(benchedSize);
+    int ret = 0;
+
+    buffers_t buf;
+    contexts_t ctx;
+
+    if(srcBuffer == NULL) {
+        DISPLAY("Out of Memory\n");
+        return 2;
+    }
+
+    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
+
+    if(createBuffersFromMemory(&buf, srcBuffer, 1, &benchedSize)) {
+        DISPLAY("Buffer Creation Error\n");
+        free(srcBuffer);
+        return 3;
+    }
+
+    if(createContexts(&ctx, NULL)) {
+        DISPLAY("Context Creation Error\n");
+        freeBuffers(buf);
+        return 1;
+    }
+
+    /* bench */
+    DISPLAY("\r%79s\r", "");
+    DISPLAY("using %s %i%%: \n", name, (int)(compressibility*100));
+
+    if(g_singleRun) {
+        ret = benchOnce(buf, ctx, cLevel);
+    } else {
+        BMK_benchFullTable(buf, ctx);
+    }
+
+    freeBuffers(buf);
+    freeContexts(ctx);
+
+    return ret;
+}
 
 /* benchFiles() :
  * note: while this function takes a table of filenames,
  * in practice, only the first filename will be used */
-int benchFiles(const char** fileNamesTable, int nbFiles)
+static int benchFiles(const char** fileNamesTable, int nbFiles,
+                      const char* dictFileName, int cLevel)
 {
-    int fileIdx=0;
+    buffers_t buf;
+    contexts_t ctx;
+    int ret = 0;
 
-    /* Loop for each file */
-    while (fileIdx<nbFiles) {
-        const char* const inFileName = fileNamesTable[fileIdx++];
-        FILE* const inFile = fopen( inFileName, "rb" );
-        U64 const inFileSize = UTIL_getFileSize(inFileName);
-        size_t benchedSize;
-        void* origBuff;
-
-        /* Check file existence */
-        if (inFile==NULL) {
-            DISPLAY( "Pb opening %s\n", inFileName);
-            return 11;
-        }
-        if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
-            DISPLAY("Pb evaluating size of %s \n", inFileName);
-            fclose(inFile);
-            return 11;
-        }
-
-        /* Memory allocation */
-        benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
-        if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
-        if (benchedSize < inFileSize)
-            DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", inFileName, (int)(benchedSize>>20));
-        origBuff = malloc(benchedSize);
-        if (origBuff==NULL) {
-            DISPLAY("\nError: not enough memory!\n");
-            fclose(inFile);
-            return 12;
-        }
-
-        /* Fill input buffer */
-        DISPLAY("Loading %s...       \r", inFileName);
-        {   size_t const readSize = fread(origBuff, 1, benchedSize, inFile);
-            fclose(inFile);
-            if(readSize != benchedSize) {
-                DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
-                free(origBuff);
-                return 13;
-        }   }
-
-        /* bench */
-        DISPLAY("\r%79s\r", "");
-        DISPLAY("using %s : \n", inFileName);
-        BMK_benchMemCCtxInit(origBuff, benchedSize);
-
-        /* clean */
-        free(origBuff);
+    if (createBuffers(&buf, fileNamesTable, nbFiles)) {
+        DISPLAY("unable to load files\n");
+        return 1;
     }
 
-    return 0;
-}
-
-
-static void BMK_translateAdvancedParams(ZSTD_compressionParameters params)
-{
-    DISPLAY("--zstd=windowLog=%u,chainLog=%u,hashLog=%u,searchLog=%u,searchLength=%u,targetLength=%u,strategy=%u \n",
-             params.windowLog, params.chainLog, params.hashLog, params.searchLog, params.searchLength, params.targetLength, (U32)(params.strategy));
-}
-
-/* optimizeForSize():
- * targetSpeed : expressed in MB/s */
-int optimizeForSize(const char* inFileName, U32 targetSpeed)
-{
-    FILE* const inFile = fopen( inFileName, "rb" );
-    U64 const inFileSize = UTIL_getFileSize(inFileName);
-    size_t benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
-    void* origBuff;
-    /* Init */
-    if (inFile==NULL) { DISPLAY( "Pb opening %s\n", inFileName); return 11; }
-    if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
-        DISPLAY("Pb evaluatin size of %s \n", inFileName);
-        fclose(inFile);
-        return 11;
+    if (createContexts(&ctx, dictFileName)) {
+        DISPLAY("unable to load dictionary\n");
+        freeBuffers(buf);
+        return 2;
     }
 
-    /* Memory allocation & restrictions */
-    if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
-    if (benchedSize < inFileSize) {
-        DISPLAY("Not enough memory for '%s' \n", inFileName);
-        fclose(inFile);
-        return 11;
-    }
-
-    /* Alloc */
-    origBuff = malloc(benchedSize);
-    if(!origBuff) {
-        DISPLAY("\nError: not enough memory!\n");
-        fclose(inFile);
-        return 12;
-    }
-
-    /* Fill input buffer */
-    DISPLAY("Loading %s...       \r", inFileName);
-    {   size_t const readSize = fread(origBuff, 1, benchedSize, inFile);
-        fclose(inFile);
-        if(readSize != benchedSize) {
-            DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
-            free(origBuff);
-            return 13;
-    }   }
-
-    /* bench */
     DISPLAY("\r%79s\r", "");
-    DISPLAY("optimizing for %s - limit speed %u MB/s \n", inFileName, targetSpeed);
-    targetSpeed *= 1000000;
-    {   ZSTD_CCtx* const ctx = ZSTD_createCCtx();
-        ZSTD_DCtx* const dctx = ZSTD_createDCtx();
-        winnerInfo_t winner;
-        BMK_result_t candidate;
-        const size_t blockSize = g_blockSize ? g_blockSize : benchedSize;
+    if (nbFiles == 1) {
+        DISPLAY("using %s : \n", fileNamesTable[0]);
+    } else {
+        DISPLAY("using %d Files : \n", nbFiles);
+    }
 
-        /* init */
-        if (ctx==NULL) { DISPLAY("\n ZSTD_createCCtx error \n"); free(origBuff); return 14;}
-        memset(&winner, 0, sizeof(winner));
-        winner.result.cSize = (size_t)(-1);
+    if (g_singleRun) {
+        ret = benchOnce(buf, ctx, cLevel);
+    } else {
+        BMK_benchFullTable(buf, ctx);
+    }
 
-        /* find best solution from default params */
-        {   const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
-            int i;
-            for (i=1; i<=maxSeeds; i++) {
-                ZSTD_compressionParameters const CParams = ZSTD_getCParams(i, blockSize, 0);
-                BMK_benchParam(&candidate, origBuff, benchedSize, ctx, dctx, CParams);
-                if (candidate.cSpeed < (double)targetSpeed) {
+    freeBuffers(buf);
+    freeContexts(ctx);
+    return ret;
+}
+
+
+/*-************************************
+*  Local Optimization Functions
+**************************************/
+
+/* One iteration of hill climbing. Specifically, it first tries all
+ * valid parameter configurations w/ manhattan distance 1 and picks the best one.
+ * Failing that, it progressively tries candidates further and further away (up to #dim + 2).
+ * If it finds a candidate exceeding winnerInfo, it will repeat. Otherwise, it will stop the
+ * current stage of hill climbing.
+ * Each iteration of hill climbing proceeds in 2 'phases'. Phase 1 climbs according to
+ * the resultScore function, which is effectively a linear increase in reward until it reaches
+ * the constraint-satisfying value, at which point any excess results in only logarithmic reward.
+ * This aims to find some constraint-satisfying point.
+ * Phase 2 optimizes in accordance with what the original function sets out to maximize, with
+ * all feasible solutions valued over all infeasible solutions.
+ */
+
+/* sanitize all params here.
+ * all generation after random should be sanitized. (maybe sanitize random)
+ */
+static winnerInfo_t climbOnce(const constraint_t target,
+                memoTable_t* mtAll,
+                const buffers_t buf, const contexts_t ctx,
+                const paramValues_t init)
+{
+    /*
+     * cparam - currently considered 'center'
+     * candidate - params to benchmark/results
+     * winner - best option found so far.
+     */
+    paramValues_t cparam = init;
+    winnerInfo_t candidateInfo, winnerInfo;
+    int better = 1;
+    int feas = 0;
+
+    winnerInfo = initWinnerInfo(init);
+    candidateInfo = winnerInfo;
+
+    {   winnerInfo_t bestFeasible1 = initWinnerInfo(cparam);
+        DEBUGOUTPUT("Climb Part 1\n");
+        while(better) {
+            int offset;
+            size_t i, dist;
+            const size_t varLen = mtAll[cparam.vals[strt_ind]].varLen;
+            better = 0;
+            DEBUGOUTPUT("Start\n");
+            cparam = winnerInfo.params;
+            candidateInfo.params = cparam;
+             /* all dist-1 candidates */
+            for (i = 0; i < varLen; i++) {
+                for (offset = -1; offset <= 1; offset += 2) {
+                    CHECKTIME(winnerInfo);
+                    candidateInfo.params = cparam;
+                    paramVaryOnce(mtAll[cparam.vals[strt_ind]].varArray[i], offset, &candidateInfo.params);
+
+                    if(paramValid(candidateInfo.params)) {
+                        int res;
+                        res = benchMemo(&candidateInfo.result, buf, ctx,
+                            sanitizeParams(candidateInfo.params), target, &winnerInfo.result, mtAll, feas);
+                        DEBUGOUTPUT("Res: %d\n", res);
+                        if(res == BETTER_RESULT) { /* synonymous with better when called w/ infeasibleBM */
+                            winnerInfo = candidateInfo;
+                            better = 1;
+                            if(compareResultLT(bestFeasible1.result, winnerInfo.result, target, buf.srcSize)) {
+                                bestFeasible1 = winnerInfo;
+                            }
+                        }
+                    }
+                }
+            }   /* for (i = 0; i < varLen; i++) */
+
+            if(better) {
+                continue;
+            }
+
+            for(dist = 2; dist < varLen + 2; dist++) { /* varLen is # dimensions */
+                for(i = 0; i < (1 << varLen) / varLen + 2; i++) {
+                    int res;
+                    CHECKTIME(winnerInfo);
+                    candidateInfo.params = cparam;
+                    /* param error checking already done here */
+                    paramVariation(&candidateInfo.params, mtAll, (U32)dist);
+
+                    res = benchMemo(&candidateInfo.result,
+                                buf, ctx,
+                                sanitizeParams(candidateInfo.params), target,
+                                &winnerInfo.result, mtAll, feas);
+                    DEBUGOUTPUT("Res: %d\n", res);
+                    if (res == BETTER_RESULT) { /* synonymous with better in this case*/
+                        winnerInfo = candidateInfo;
+                        better = 1;
+                        if (compareResultLT(bestFeasible1.result, winnerInfo.result, target, buf.srcSize)) {
+                            bestFeasible1 = winnerInfo;
+                        }
+                        break;
+                    }
+                }
+
+                if (better) {
                     break;
                 }
-                if ( (candidate.cSize < winner.result.cSize)
-                   | ((candidate.cSize == winner.result.cSize) & (candidate.cSpeed > winner.result.cSpeed)) )
-                {
-                    winner.params = CParams;
-                    winner.result = candidate;
-                    BMK_printWinner(stdout, i, winner.result, winner.params, benchedSize);
-            }   }
+            }   /* for(dist = 2; dist < varLen + 2; dist++) */
+
+            if (!better) { /* infeas -> feas -> stop */
+                if (feas) return winnerInfo;
+                feas = 1;
+                better = 1;
+                winnerInfo = bestFeasible1; /* note with change, bestFeasible may not necessarily be feasible, but if one has been benchmarked, it will be. */
+                DEBUGOUTPUT("Climb Part 2\n");
+            }
         }
-
-        BMK_printWinner(stdout, CUSTOM_LEVEL, winner.result, winner.params, benchedSize);
-
-        BMK_translateAdvancedParams(winner.params);
-
-        /* start tests */
-        {   time_t const grillStart = time(NULL);
-            do {
-                ZSTD_compressionParameters params = winner.params;
-                paramVariation(&params);
-                if ((FUZ_rand(&g_rand) & 31) == 3) params = randomParams();  /* totally random config to improve search space */
-                params = ZSTD_adjustCParams(params, blockSize, 0);
-
-                /* exclude faster if already played set of params */
-                if (FUZ_rand(&g_rand) & ((1 << NB_TESTS_PLAYED(params))-1)) continue;
-
-                /* test */
-                NB_TESTS_PLAYED(params)++;
-                BMK_benchParam(&candidate, origBuff, benchedSize, ctx, dctx, params);
-
-                /* improvement found => new winner */
-                if ( (candidate.cSpeed > targetSpeed)
-                   & ( (candidate.cSize < winner.result.cSize)
-                     | ((candidate.cSize == winner.result.cSize) & (candidate.cSpeed > winner.result.cSpeed)) )  )
-                {
-                    winner.params = params;
-                    winner.result = candidate;
-                    BMK_printWinner(stdout, CUSTOM_LEVEL, winner.result, winner.params, benchedSize);
-                    BMK_translateAdvancedParams(winner.params);
-                }
-            } while (BMK_timeSpan(grillStart) < g_grillDuration_s);
-        }
-        /* end summary */
-
-        BMK_printWinner(stdout, CUSTOM_LEVEL, winner.result, winner.params, benchedSize);
-        BMK_translateAdvancedParams(winner.params);
-        DISPLAY("grillParams size - optimizer completed \n");
-
-        /* clean up*/
-        ZSTD_freeCCtx(ctx);
-        ZSTD_freeDCtx(dctx);
+        winnerInfo = bestFeasible1;
     }
 
-    free(origBuff);
-    return 0;
+    return winnerInfo;
+}
+
+/* Optimizes for a fixed strategy */
+
+/* flexible parameters: iterations of failed climbing (or if we do non-random, maybe this is when everything is close to visited)
+   weight more on visit for bad results, less on good results/more on later results / ones with more failures.
+   allocate memoTable here.
+ */
+static winnerInfo_t optimizeFixedStrategy(
+    const buffers_t buf, const contexts_t ctx,
+    const constraint_t target, paramValues_t paramTarget,
+    const ZSTD_strategy strat,
+    memoTable_t* memoTableArray, const int tries) {
+    int i = 0;
+
+    paramValues_t init;
+    winnerInfo_t winnerInfo, candidateInfo;
+    winnerInfo = initWinnerInfo(emptyParams());
+    /* so climb is given the right fixed strategy */
+    paramTarget.vals[strt_ind] = strat;
+    /* to pass ZSTD_checkCParams */
+    paramTarget = cParamUnsetMin(paramTarget);
+
+    init = paramTarget;
+
+    for(i = 0; i < tries; i++) {
+        DEBUGOUTPUT("Restart\n");
+        do { randomConstrainedParams(&init, memoTableArray, strat); } while(redundantParams(init, target, buf.maxBlockSize));
+        candidateInfo = climbOnce(target, memoTableArray, buf, ctx, init);
+        if(compareResultLT(winnerInfo.result, candidateInfo.result, target, buf.srcSize)) {
+            winnerInfo = candidateInfo;
+            BMK_printWinnerOpt(stdout, CUSTOM_LEVEL, winnerInfo.result, winnerInfo.params, target, buf.srcSize);
+            i = 0;
+            continue;
+        }
+        CHECKTIME(winnerInfo);
+        i++;
+    }
+    return winnerInfo;
+}
+
+/* goes best, best-1, best+1, best-2, ... */
+/* return 0 if nothing remaining */
+static int nextStrategy(const int currentStrategy, const int bestStrategy) {
+    if(bestStrategy <= currentStrategy) {
+        int candidate = 2 * bestStrategy - currentStrategy - 1;
+        if(candidate < 1) {
+            candidate = currentStrategy + 1;
+            if(candidate > (int)ZSTD_btultra) {
+                return 0;
+            } else {
+                return candidate;
+            }
+        } else {
+            return candidate;
+        }
+    } else { /* bestStrategy > currentStrategy */
+        int candidate = 2 * bestStrategy - currentStrategy;
+        if(candidate > (int)ZSTD_btultra) {
+            candidate = currentStrategy - 1;
+            if(candidate < 1) {
+                return 0;
+            } else {
+                return candidate;
+            }
+        } else {
+            return candidate;
+        }
+    }
+}
+
+/* experiment with playing with this and decay value */
+
+/* main fn called when using --optimize */
+/* Does strategy selection by benchmarking default compression levels,
+ * then optimizes by strategy, starting with the best one and
+ * progressively moving further away by number.
+ * args:
+ * fileNamesTable - list of files to benchmark
+ * nbFiles - length of fileNamesTable
+ * dictFileName - name of dictionary file if one, else NULL
+ * target - performance constraints (cSpeed, dSpeed, cMem)
+ * paramTarget - parameter constraints (i.e. restriction search space to where strategy = ZSTD_fast)
+ * cLevelOpt - compression level to exceed (all solutions must be > lvl in cSpeed + ratio)
+ */
+
+static int g_maxTries = 5;
+#define TRY_DECAY 1
+
+static int optimizeForSize(const char* const * const fileNamesTable, const size_t nbFiles, const char* dictFileName, constraint_t target, paramValues_t paramTarget,
+    const int cLevelOpt, const int cLevelRun, const U32 memoTableLog)
+{   /* Parameter-search driver for the given files : returns 0 on success, non-zero when setup, a benchmark run or the search itself fails. */
+    varInds_t varArray [NUM_PARAMS];
+    int ret = 0;
+    const size_t varLen = variableParams(paramTarget, varArray, dictFileName != NULL);
+    winnerInfo_t winner = initWinnerInfo(emptyParams());
+    memoTable_t* allMT = NULL;
+    paramValues_t paramBase;
+    contexts_t ctx;
+    buffers_t buf;
+    g_time = UTIL_getTime();
+
+    if(createBuffers(&buf, fileNamesTable, nbFiles)) {
+        DISPLAY("unable to load files\n");
+        return 1;
+    }
+
+    if(createContexts(&ctx, dictFileName)) {
+        DISPLAY("unable to load dictionary\n");
+        freeBuffers(buf);
+        return 2;
+    }
+
+    if(nbFiles == 1) {
+        DISPLAYLEVEL(2, "Loading %s...       \r", fileNamesTable[0]);
+    } else {
+        DISPLAYLEVEL(2, "Loading %lu Files...       \r", (unsigned long)nbFiles);
+    }
+
+    /* sanitize paramTarget */
+    optimizerAdjustInput(&paramTarget, buf.maxBlockSize);
+    paramBase = cParamUnsetMin(paramTarget);
+
+    allMT = createMemoTableArray(paramTarget, varArray, varLen, memoTableLog);
+
+    if (!allMT) {
+        DISPLAY("MemoTable Init Error\n");
+        ret = 2;
+        goto _cleanUp;
+    }
+
+    /* default strictnesses */
+    if (g_strictness == PARAM_UNSET) {
+        if(g_optmode) {
+            g_strictness = 100;
+        } else {
+            g_strictness = 90;
+        }
+    } else {
+        if(0 >= g_strictness || g_strictness > 100) {
+            DISPLAY("Strictness Outside of Bounds\n");
+            ret = 4;
+            goto _cleanUp;
+        }
+    }
+
+    /* use level'ing mode instead of normal target mode */
+    if (g_optmode) {
+        winner.params = cParamsToPVals(ZSTD_getCParams(cLevelOpt, buf.maxBlockSize, ctx.dictSize));
+        if(BMK_benchParam(&winner.result, buf, ctx, winner.params)) {
+            ret = 3;
+            goto _cleanUp;
+        }
+
+        g_lvltarget = winner.result;   /* reference result, loosened below by the strictness percentage */
+        g_lvltarget.cSpeed *= ((double)g_strictness) / 100;
+        g_lvltarget.dSpeed *= ((double)g_strictness) / 100;
+        g_lvltarget.cSize /= ((double)g_strictness) / 100;
+
+        target.cSpeed = (U32)g_lvltarget.cSpeed;
+        target.dSpeed = (U32)g_lvltarget.dSpeed;
+
+        BMK_printWinnerOpt(stdout, cLevelOpt, winner.result, winner.params, target, buf.srcSize);
+    }
+
+    /* Don't want it to return anything worse than the best known result */
+    if (g_singleRun) {
+        BMK_benchResult_t res;
+        g_params = adjustParams(overwriteParams(cParamsToPVals(ZSTD_getCParams(cLevelRun, buf.maxBlockSize, ctx.dictSize)), g_params), buf.maxBlockSize, ctx.dictSize);
+        if (BMK_benchParam(&res, buf, ctx, g_params)) {
+            ret = 45;
+            goto _cleanUp;
+        }
+        if(compareResultLT(winner.result, res, relaxTarget(target), buf.srcSize)) {
+            winner.result = res;
+            winner.params = g_params;
+        }
+    }
+
+    /* bench */
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    if(nbFiles == 1) {
+        DISPLAYLEVEL(2, "optimizing for %s", fileNamesTable[0]);
+    } else {
+        DISPLAYLEVEL(2, "optimizing for %lu Files", (unsigned long)nbFiles);
+    }
+
+    if(target.cSpeed != 0) { DISPLAYLEVEL(2," - limit compression speed %u MB/s", target.cSpeed >> 20); }
+    if(target.dSpeed != 0) { DISPLAYLEVEL(2, " - limit decompression speed %u MB/s", target.dSpeed >> 20); }
+    if(target.cMem != (U32)-1) { DISPLAYLEVEL(2, " - limit memory %u MB", target.cMem >> 20); }
+
+    DISPLAYLEVEL(2, "\n");
+    findClockGranularity();
+
+    {   paramValues_t CParams;
+
+        /* find best solution from default params */
+        {
+            /* strategy selection */
+            const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
+            DEBUGOUTPUT("Strategy Selection\n");
+            if(paramTarget.vals[strt_ind] == PARAM_UNSET) {
+                BMK_benchResult_t candidate;
+                int i;
+                for (i=1; i<=maxSeeds; i++) {
+                    int ec;
+                    CParams = overwriteParams(cParamsToPVals(ZSTD_getCParams(i, buf.maxBlockSize, ctx.dictSize)), paramTarget);
+                    ec = BMK_benchParam(&candidate, buf, ctx, CParams);
+                    /* `candidate` is only trusted when the benchmark succeeded : never print or compare it after a failure */
+                    if(!ec) { BMK_printWinnerOpt(stdout, i, candidate, CParams, target, buf.srcSize); }
+                    if(!ec && compareResultLT(winner.result, candidate, relaxTarget(target), buf.srcSize)) {
+                        winner.result = candidate;
+                        winner.params = CParams;
+                    }
+
+                    CHECKTIMEGT(ret, 0, _displayCleanUp); /* if pass time limit, stop */
+                    /* if the current params are too slow, just stop. */
+                    if(!ec && target.cSpeed > candidate.cSpeed * 3 / 2) { break; }
+                }
+
+                BMK_printWinnerOpt(stdout, CUSTOM_LEVEL, winner.result, winner.params, target, buf.srcSize);
+            }
+        }
+
+        DEBUGOUTPUT("Real Opt\n");
+        /* start 'real' optimization */
+        {
+            int bestStrategy = (int)winner.params.vals[strt_ind];
+            if(paramTarget.vals[strt_ind] == PARAM_UNSET) {
+                int st = bestStrategy;
+                int tries = g_maxTries;
+
+                {
+                    /* one iteration of hill climbing with the level-defined parameters. */
+                    winnerInfo_t w1 = climbOnce(target, allMT, buf, ctx, winner.params);
+                    if(compareResultLT(winner.result, w1.result, target, buf.srcSize)) {
+                        winner = w1;
+                    }
+                    CHECKTIMEGT(ret, 0, _displayCleanUp);
+                }
+
+                while(st && tries > 0) {   /* ends once st becomes 0 or the try budget is used up */
+                    winnerInfo_t wc;
+                    DEBUGOUTPUT("StrategySwitch: %s\n", g_stratName[st]);
+
+                    wc = optimizeFixedStrategy(buf, ctx, target, paramBase, st, allMT, tries);
+
+                    if(compareResultLT(winner.result, wc.result, target, buf.srcSize)) {
+                        winner = wc;
+                        tries = g_maxTries;   /* an improvement restores the full budget */
+                        bestStrategy = st;
+                    } else {
+                        st = nextStrategy(st, bestStrategy);
+                        tries -= TRY_DECAY;
+                    }
+                    CHECKTIMEGT(ret, 0, _displayCleanUp);
+                }
+            } else {
+                winner = optimizeFixedStrategy(buf, ctx, target, paramBase, paramTarget.vals[strt_ind], allMT, g_maxTries);
+            }
+
+        }
+
+        /* no solution found */
+        if(winner.result.cSize == (size_t)-1) {
+            ret = 1;
+            DISPLAY("No feasible solution found\n");
+            goto _cleanUp;
+        }
+        /* end summary ; the CHECKTIMEGT() call sites above also target this label */
+_displayCleanUp:
+        if(g_displayLevel >= 0) { BMK_displayOneResult(stdout, winner, buf.srcSize); }
+        BMK_translateAdvancedParams(stdout, winner.params);
+        DISPLAYLEVEL(1, "grillParams size - optimizer completed \n");
+
+    }
+_cleanUp:
+    freeContexts(ctx);
+    freeBuffers(buf);
+    freeMemoTableArray(allMT);
+    return ret;
+}
+
+/*-************************************
+*  CLI parsing functions
+**************************************/
+
+/** longCommandWArg() :
+ *  tests whether the string at *stringPtr begins with longCommand.
+ *  On a match, *stringPtr is moved just past that prefix and the function returns 1;
+ *  on a mismatch, *stringPtr is left as it was and the function returns 0.
+ * from zstdcli.c
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const prefixLen = strlen(longCommand);
+    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) return 0;
+    *stringPtr += prefixLen;   /* consume the matched prefix */
+    return 1;
 }
 
 static void errorOut(const char* msg)
@@ -775,7 +2401,9 @@
 static unsigned readU32FromChar(const char** stringPtr)
 {
     const char errorMsg[] = "error: numeric value too large";
+    unsigned sign = 1;
     unsigned result = 0;
+    if(**stringPtr == '-') { sign = (unsigned)-1; (*stringPtr)++; }
     while ((**stringPtr >='0') && (**stringPtr <='9')) {
         unsigned const max = (((unsigned)(-1)) / 10) - 1;
         if (result > max) errorOut(errorMsg);
@@ -793,6 +2421,22 @@
         if (**stringPtr=='i') (*stringPtr)++;
         if (**stringPtr=='B') (*stringPtr)++;
     }
+    return result * sign;
+}
+
+static double readDoubleFromChar(const char** stringPtr)
+{   /* reads an unsigned decimal ("12" or "12.5" ; no sign, no exponent) and moves *stringPtr past it */
+    const char* cursor = *stringPtr;
+    double result = 0, scale = 10;
+    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
+        result *= 10; result += *cursor - '0';
+    }
+    if (*cursor == '.') {   /* optional fractional part */
+        for (cursor++; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
+            result += (double)(*cursor - '0') / scale; scale *= 10;
+        }
+    }
+    *stringPtr = cursor;
     return result;
 }
 
@@ -809,15 +2453,16 @@
 static int usage_advanced(void)
 {
     DISPLAY( "\nAdvanced options :\n");
-    DISPLAY( " -T#    : set level 1 speed objective \n");
-    DISPLAY( " -B#    : cut input into blocks of size # (default : single block) \n");
-    DISPLAY( " -i#    : iteration loops [1-9](default : %i) \n", NBLOOPS);
-    DISPLAY( " -O#    : find Optimized parameters for # MB/s compression speed (default : 0) \n");
-    DISPLAY( " -S     : Single run \n");
-    DISPLAY( " --zstd : Single run, parameter selection same as zstdcli \n");
-    DISPLAY( " -P#    : generated sample compressibility (default : %.1f%%) \n", COMPRESSIBILITY_DEFAULT * 100);
-    DISPLAY( " -t#    : Caps runtime of operation in seconds (default : %u seconds (%.1f hours)) \n", (U32)g_grillDuration_s, g_grillDuration_s / 3600);
-    DISPLAY( " -v     : Prints Benchmarking output\n");
+    DISPLAY( " -T#          : set level 1 speed objective \n");
+    DISPLAY( " -B#          : cut input into blocks of size # (default : single block) \n");
+    DISPLAY( " --optimize=  : same as -O with more verbose syntax (see README.md)\n");   /* NOTE(review): the -O# help line is gone from this list - confirm -O is still accepted */
+    DISPLAY( " -S           : Single run \n");
+    DISPLAY( " --zstd       : Single run, parameter selection same as zstdcli \n");
+    DISPLAY( " -P#          : generated sample compressibility (default : %.1f%%) \n", COMPRESSIBILITY_DEFAULT * 100);
+    DISPLAY( " -t#          : Caps runtime of operation in seconds (default : %u seconds (%.1f hours)) \n", g_timeLimit_s, (double)g_timeLimit_s / 3600);
+    DISPLAY( " -v           : Prints Benchmarking output\n");
+    DISPLAY( " -D           : Next argument dictionary file\n");
+    DISPLAY( " -s           : Separate Files\n");
     return 0;
 }
 
@@ -828,41 +2473,87 @@
     return 1;
 }
 
+#define PARSE_SUB_ARGS(stringLong, stringShort, variable) { if (longCommandWArg(&argument, stringLong) || longCommandWArg(&argument, stringShort)) { variable = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; } }   /* only usable directly inside a loop with `argument` in scope : on a name match, reads the value into `variable`, then `continue`s past a ',' or `break`s out of the loop */
+/* parse_params() :
+ *  tries to read one "name=value" item at *argptr, where name is the short or long name of any parameter.
+ *  On success, stores the value in pv->vals[], moves *argptr past the value and returns 1.
+ *  Otherwise leaves *argptr where it was and returns 0. */
+static int parse_params(const char** argptr, paramValues_t* pv) {
+    varInds_t idx;
+    for(idx = 0; idx < NUM_PARAMS; idx++) {
+        const char* cursor = *argptr;   /* every candidate name is matched from the same starting point */
+        if(!longCommandWArg(&cursor, g_shortParamNames[idx]) && !longCommandWArg(&cursor, g_paramNames[idx])) {
+            continue;   /* neither spelling matches */
+        }
+        if(*cursor != '=') { continue; }   /* a name that is not followed by '=' doesn't count */
+        cursor++;
+        pv->vals[idx] = readU32FromChar(&cursor);
+        *argptr = cursor;
+        return 1;
+    }
+    return 0;
+}
+
+/*-************************************
+*  Main
+**************************************/
+
 int main(int argc, const char** argv)
 {
     int i,
         filenamesStart=0,
         result;
     const char* exename=argv[0];
-    const char* input_filename=0;
-    U32 optimizer = 0;
+    const char* input_filename = NULL;
+    const char* dictFileName = NULL;
     U32 main_pause = 0;
-    U32 targetSpeed = 0;
+    int cLevelOpt = 0, cLevelRun = 0;
+    int seperateFiles = 0;
+    double compressibility = COMPRESSIBILITY_DEFAULT;
+    U32 memoTableLog = PARAM_UNSET;
+    constraint_t target = { 0, 0, (U32)-1 };
+
+    paramValues_t paramTarget = emptyParams();
+    g_params = emptyParams();
 
     assert(argc>=1);   /* for exename */
 
-    /* Welcome message */
-    DISPLAY(WELCOME_MESSAGE);
-
     for(i=1; i<argc; i++) {
         const char* argument = argv[i];
+        DEBUGOUTPUT("%d: %s\n", i, argument);
         assert(argument != NULL);
 
         if(!strcmp(argument,"--no-seed")) { g_noSeed = 1; continue; }
 
-        /* Decode command (note : aggregated commands are allowed) */
-        if (longCommandWArg(&argument, "--zstd=")) {
-            g_singleRun = 1;
-            g_params = ZSTD_getCParams(2, g_blockSize, 0);
+        if (longCommandWArg(&argument, "--optimize=")) {
+            g_optimizer = 1;
             for ( ; ;) {
-                if (longCommandWArg(&argument, "windowLog=") || longCommandWArg(&argument, "wlog=")) { g_params.windowLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "chainLog=") || longCommandWArg(&argument, "clog=")) { g_params.chainLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "hashLog=") || longCommandWArg(&argument, "hlog=")) { g_params.hashLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "searchLog=") || longCommandWArg(&argument, "slog=")) { g_params.searchLog = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "searchLength=") || longCommandWArg(&argument, "slen=")) { g_params.searchLength = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "targetLength=") || longCommandWArg(&argument, "tlen=")) { g_params.targetLength = readU32FromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "strategy=") || longCommandWArg(&argument, "strat=")) { g_params.strategy = (ZSTD_strategy)(readU32FromChar(&argument)); if (argument[0]==',') { argument++; continue; } else break; }
-                if (longCommandWArg(&argument, "level=") || longCommandWArg(&argument, "lvl=")) { g_params = ZSTD_getCParams(readU32FromChar(&argument), g_blockSize, 0); if (argument[0]==',') { argument++; continue; } else break; }
+                if(parse_params(&argument, &paramTarget)) { if(argument[0] == ',') { argument++; continue; } else break; }
+                PARSE_SUB_ARGS("compressionSpeed=" ,  "cSpeed=", target.cSpeed);
+                PARSE_SUB_ARGS("decompressionSpeed=", "dSpeed=", target.dSpeed);
+                PARSE_SUB_ARGS("compressionMemory=" , "cMem=", target.cMem);
+                PARSE_SUB_ARGS("strict=", "stc=", g_strictness);
+                PARSE_SUB_ARGS("maxTries=", "tries=", g_maxTries);
+                PARSE_SUB_ARGS("memoLimitLog=", "memLog=", memoTableLog);
+                if (longCommandWArg(&argument, "level=") || longCommandWArg(&argument, "lvl=")) { cLevelOpt = readU32FromChar(&argument); g_optmode = 1; if (argument[0]==',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "speedForRatio=") || longCommandWArg(&argument, "speedRatio=")) { g_ratioMultiplier = readDoubleFromChar(&argument); if (argument[0]==',') { argument++; continue; } else break; }
+
+                DISPLAY("invalid optimization parameter \n");
+                return 1;
+            }
+
+            if (argument[0] != 0) {
+                DISPLAY("invalid --optimize= format\n");
+                return 1; /* check the end of string */
+            }
+            continue;
+        } else if (longCommandWArg(&argument, "--zstd=")) {
+        /* Decode command (note : aggregated commands are allowed) */
+            g_singleRun = 1;
+            for ( ; ;) {
+                if(parse_params(&argument, &g_params)) { if(argument[0] == ',') { argument++; continue; } else break; }
+                if (longCommandWArg(&argument, "level=") || longCommandWArg(&argument, "lvl=")) { cLevelRun = readU32FromChar(&argument); g_params = emptyParams(); if (argument[0]==',') { argument++; continue; } else break; }
+
                 DISPLAY("invalid compression parameter \n");
                 return 1;
             }
@@ -871,7 +2562,45 @@
                 DISPLAY("invalid --zstd= format\n");
                 return 1; /* check the end of string */
             }
+            continue;
             /* if not return, success */
+
+        } else if (longCommandWArg(&argument, "--display=")) {
+            /* Decode command (note : aggregated commands are allowed) */
+            memset(g_silenceParams, 1, sizeof(g_silenceParams));
+            for ( ; ;) {
+                int found = 0;
+                varInds_t v;
+                for(v = 0; v < NUM_PARAMS; v++) {
+                    if(longCommandWArg(&argument, g_shortParamNames[v]) || longCommandWArg(&argument, g_paramNames[v])) {
+                        g_silenceParams[v] = 0;
+                        found = 1;
+                    }
+                }
+                if(longCommandWArg(&argument, "compressionParameters") || longCommandWArg(&argument, "cParams")) {
+                    for(v = 0; v <= strt_ind; v++) {
+                        g_silenceParams[v] = 0;
+                    }
+                    found = 1;
+                }
+
+
+                if(found) {
+                    if(argument[0]==',') {
+                        continue;
+                    } else {
+                        break;
+                    }
+                }
+                DISPLAY("invalid parameter name parameter \n");
+                return 1;
+            }
+
+            if (argument[0] != 0) {
+                DISPLAY("invalid --display format\n");
+                return 1; /* check the end of string */
+            }
+            continue;
         } else if (argument[0]=='-') {
             argument++;
 
@@ -885,66 +2614,58 @@
 
                     /* Pause at the end (hidden option) */
                 case 'p': main_pause = 1; argument++; break;
-                    /* Modify Nb Iterations */
-
-                case 'i':
-                    argument++;
-                    g_nbIterations = readU32FromChar(&argument);
-                    break;
 
                     /* Sample compressibility (when no file provided) */
                 case 'P':
                     argument++;
                     {   U32 const proba32 = readU32FromChar(&argument);
-                        g_compressibility = (double)proba32 / 100.;
+                        compressibility = (double)proba32 / 100.;
                     }
                     break;
 
-                case 'O':
-                    argument++;
-                    optimizer = 1;
-                    targetSpeed = readU32FromChar(&argument);
-                    break;
-
                     /* Run Single conf */
                 case 'S':
                     g_singleRun = 1;
                     argument++;
-                    g_params = ZSTD_getCParams(2, g_blockSize, 0);
                     for ( ; ; ) {
                         switch(*argument)
                         {
                         case 'w':
                             argument++;
-                            g_params.windowLog = readU32FromChar(&argument);
+                            g_params.vals[wlog_ind] = readU32FromChar(&argument);
                             continue;
                         case 'c':
                             argument++;
-                            g_params.chainLog = readU32FromChar(&argument);
+                            g_params.vals[clog_ind] = readU32FromChar(&argument);
                             continue;
                         case 'h':
                             argument++;
-                            g_params.hashLog = readU32FromChar(&argument);
+                            g_params.vals[hlog_ind] = readU32FromChar(&argument);
                             continue;
                         case 's':
                             argument++;
-                            g_params.searchLog = readU32FromChar(&argument);
+                            g_params.vals[slog_ind] = readU32FromChar(&argument);
                             continue;
                         case 'l':  /* search length */
                             argument++;
-                            g_params.searchLength = readU32FromChar(&argument);
+                            g_params.vals[slen_ind] = readU32FromChar(&argument);
                             continue;
                         case 't':  /* target length */
                             argument++;
-                            g_params.targetLength = readU32FromChar(&argument);
+                            g_params.vals[tlen_ind] = readU32FromChar(&argument);
                             continue;
                         case 'S':  /* strategy */
                             argument++;
-                            g_params.strategy = (ZSTD_strategy)readU32FromChar(&argument);
+                            g_params.vals[strt_ind] = readU32FromChar(&argument);
+                            continue;
+                        case 'f':  /* forceAttachDict */
+                            argument++;
+                            g_params.vals[fadt_ind] = readU32FromChar(&argument);
                             continue;
                         case 'L':
-                            {   int const cLevel = readU32FromChar(&argument);
-                                g_params = ZSTD_getCParams(cLevel, g_blockSize, 0);
+                            {   argument++;
+                                cLevelRun = readU32FromChar(&argument);
+                                g_params = emptyParams();
                                 continue;
                             }
                         default : ;
@@ -970,7 +2691,32 @@
                     /* caps runtime (in seconds) */
                 case 't':
                     argument++;
-                    g_grillDuration_s = (double)readU32FromChar(&argument);
+                    g_timeLimit_s = readU32FromChar(&argument);
+                    break;
+
+                case 's':
+                    argument++;
+                    seperateFiles = 1;
+                    break;
+
+                case 'q':
+                    while (argument[0] == 'q') { argument++; g_displayLevel--; }
+                    break;
+
+                case 'v':
+                    while (argument[0] == 'v') { argument++; g_displayLevel++; }
+                    break;
+
+                /* load dictionary file (only applicable for optimizer rn) */
+                case 'D':
+                    if(i == argc - 1) { /* last argument, return error. */
+                        DISPLAY("Dictionary file expected but not given : %d\n", i);
+                        return 1;
+                    } else {
+                        i++;
+                        dictFileName = argv[i];
+                        argument += strlen(argument);
+                    }
                     break;
 
                     /* Unknown command */
@@ -984,19 +2730,35 @@
         if (!input_filename) { input_filename=argument; filenamesStart=i; continue; }
     }
 
+    /* Welcome message */
+    DISPLAYLEVEL(2, WELCOME_MESSAGE);
+
     if (filenamesStart==0) {
-        if (optimizer) {
+        if (g_optimizer) {
             DISPLAY("Optimizer Expects File\n");
             return 1;
         } else {
-            result = benchSample();
+            result = benchSample(compressibility, cLevelRun);
         }
     } else {
-        if (optimizer) {
-            result = optimizeForSize(input_filename, targetSpeed);
+        if(seperateFiles) {
+            for(i = 0; i < argc - filenamesStart; i++) {
+                if (g_optimizer) {
+                    result = optimizeForSize(argv+filenamesStart + i, 1, dictFileName, target, paramTarget, cLevelOpt, cLevelRun, memoTableLog);
+                    if(result) { DISPLAY("Error on File %d", i); return result; }
+                } else {
+                    result = benchFiles(argv+filenamesStart + i, 1, dictFileName, cLevelRun);
+                    if(result) { DISPLAY("Error on File %d", i); return result; }
+                }
+            }
         } else {
-            result = benchFiles(argv+filenamesStart, argc-filenamesStart);
-    }   }
+            if (g_optimizer) {
+                result = optimizeForSize(argv+filenamesStart, argc-filenamesStart, dictFileName, target, paramTarget, cLevelOpt, cLevelRun, memoTableLog);
+            } else {
+                result = benchFiles(argv+filenamesStart, argc-filenamesStart, dictFileName, cLevelRun);
+            }
+        }
+    }
 
     if (main_pause) { int unused; printf("press enter...\n"); unused = getchar(); (void)unused; }
 
diff --git a/tests/playTests.sh b/tests/playTests.sh
index 09a7377..b86a0dc 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -48,6 +48,12 @@
     $DIFF -q tmp.md5.1 tmp.md5.2
 }
 
+truncateLastByte() {  # writes file "$1" minus its final byte to stdout (stand-in for GNU-only `head -c -1`)
+	dd bs=1 count=$(($(wc -c < "$1") - 1)) if="$1"  # no GNU-specific status=none : dd's statistics go to stderr, the piped data is unaffected
+}
+
+UNAME=$(uname)
+
 isTerminal=false
 if [ -t 0 ] && [ -t 1 ]
 then
@@ -56,7 +62,10 @@
 
 isWindows=false
 INTOVOID="/dev/null"
-DEVDEVICE="/dev/random"
+case "$UNAME" in
+  GNU) DEVDEVICE="/dev/random" ;;
+  *) DEVDEVICE="/dev/zero" ;;
+esac
 case "$OS" in
   Windows*)
     isWindows=true
@@ -65,7 +74,6 @@
     ;;
 esac
 
-UNAME=$(uname)
 case "$UNAME" in
   Darwin) MD5SUM="md5 -r" ;;
   FreeBSD) MD5SUM="gmd5sum" ;;
@@ -95,6 +103,7 @@
 fi
 
 
+
 $ECHO "\n===>  simple tests "
 
 ./datagen > tmp
@@ -104,11 +113,13 @@
 $ZSTD -df tmp.zst                 # trivial decompression case (overwrites tmp)
 $ECHO "test : too large compression level => auto-fix"
 $ZSTD -99 -f tmp  # too large compression level, automatic sized down
+$ZSTD -5000000000 -f tmp && die "too large numeric value : must fail"
 $ECHO "test : --fast aka negative compression levels"
 $ZSTD --fast -f tmp  # == -1
 $ZSTD --fast=3 -f tmp  # == -3
-$ZSTD --fast=200000 -f tmp  # == no compression
-! $ZSTD -c --fast=0 tmp > $INTOVOID # should fail
+$ZSTD --fast=200000 -f tmp  # too low compression level, automatic fixed
+$ZSTD --fast=5000000000 -f tmp && die "too large numeric value : must fail"
+$ZSTD -c --fast=0 tmp > $INTOVOID && die "--fast must not accept value 0"
 $ECHO "test : too large numeric argument"
 $ZSTD --fast=9999999999 -f tmp  && die "should have refused numeric value"
 $ECHO "test : compress to stdout"
@@ -169,6 +180,8 @@
 $ZSTD -q tmpro && die "should have refused to overwrite read-only file"
 $ZSTD -q -f tmpro
 rm -f tmpro tmpro.zst
+
+
 $ECHO "test : file removal"
 $ZSTD -f --rm tmp
 test ! -f tmp  # tmp should no longer be present
@@ -185,9 +198,19 @@
 rm tmp
 $ZSTD -f tmp && die "tmp not present : should have failed"
 test ! -f tmp.zst  # tmp.zst should not be created
+$ECHO "test : -d -f do not delete destination when source is not present"
+touch tmp    # create destination file
+$ZSTD -d -f tmp.zst && die "attempt to decompress a non existing file"
+test -f tmp  # destination file should still be present
+$ECHO "test : -f do not delete destination when source is not present"
+rm tmp         # erase source file
+touch tmp.zst  # create destination file
+$ZSTD -f tmp && die "attempt to compress a non existing file"
+test -f tmp.zst  # destination file should still be present
+rm tmp*
+
 
 $ECHO "test : compress multiple files"
-rm tmp*
 $ECHO hello > tmp1
 $ECHO world > tmp2
 $ZSTD tmp1 tmp2 -o "$INTOVOID"
@@ -400,28 +423,54 @@
 $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : not enough input source"
 ./datagen -P0 -g10M > tmp
 $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : source is pure noise"
-rm tmp*
+$ECHO "- Test -o before --train"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
 
 
-$ECHO "\n===>  cover dictionary builder : advanced options "
+$ECHO "\n===>  fastCover dictionary builder : advanced options "
 
 TESTFILE=../programs/zstdcli.c
 ./datagen > tmpDict
 $ECHO "- Create first dictionary"
-$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c -o tmpDict
+$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c -o tmpDict
 cp $TESTFILE tmp
 $ZSTD -f tmp -D tmpDict
 $ZSTD -d tmp.zst -D tmpDict -fo result
 $DIFF $TESTFILE result
 $ECHO "- Create second (different) dictionary"
-$ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
+$ZSTD --train-fastcover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
 $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
 $ECHO "- Create dictionary with short dictID"
-$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1
+$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
-$ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
-rm tmp*
+$ZSTD --train-fastcover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
+$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
+$ZSTD --train-fastcover=split=90 -r *.c ../programs/*.c
+$ZSTD --train-fastcover=split=80 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using all samples for both training and testing"
+$ZSTD --train-fastcover=split=100 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using f=16"
+$ZSTD --train-fastcover=f=16 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using accel=2"
+$ZSTD --train-fastcover=accel=2 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using accel=10"
+$ZSTD --train-fastcover=accel=10 -r *.c ../programs/*.c
+$ECHO "- Create dictionary with multithreading"
+$ZSTD --train-fastcover -T4 -r *.c ../programs/*.c
+$ECHO "- Test -o before --train-fastcover"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-fastcover *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-fastcover *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
+
 
 $ECHO "\n===>  legacy dictionary builder "
 
@@ -441,7 +490,13 @@
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
 $ZSTD --train-legacy -s9 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
-rm tmp*
+$ECHO "- Test -o before --train-legacy"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-legacy *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-legacy *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
 
 
 $ECHO "\n===>  integrity tests "
@@ -524,7 +579,7 @@
     $ZSTD -f --format=gzip tmp
     $ZSTD -f tmp
     cat tmp.gz tmp.zst tmp.gz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "gzip mode not supported"
@@ -544,16 +599,16 @@
 if [ $LZMAMODE -eq 1 ]; then
     $ECHO "xz support detected"
     XZEXE=1
-    xz -V && lzma -V || XZEXE=0
+    xz -Q -V && lzma -Q -V || XZEXE=0
     if [ $XZEXE -eq 1 ]; then
         $ECHO "Testing zstd xz and lzma support"
         ./datagen > tmp
         $ZSTD --format=lzma -f tmp
         $ZSTD --format=xz -f tmp
-        xz -t -v tmp.xz
-        xz -t -v tmp.lzma
-        xz -f -k tmp
-        lzma -f -k --lzma1 tmp
+        xz -Q -t -v tmp.xz
+        xz -Q -t -v tmp.lzma
+        xz -Q -f -k tmp
+        lzma -Q -f -k --lzma1 tmp
         $ZSTD -d -f -v tmp.xz
         $ZSTD -d -f -v tmp.lzma
         rm tmp*
@@ -565,13 +620,13 @@
         $ECHO "Testing xz and lzma symlinks"
         ./datagen > tmp
         ./xz tmp
-        xz -d tmp.xz
+        xz -Q -d tmp.xz
         ./lzma tmp
-        lzma -d tmp.lzma
+        lzma -Q -d tmp.lzma
         $ECHO "Testing unxz and unlzma symlinks"
-        xz tmp
+        xz -Q tmp
         ./xz -d tmp.xz
-        lzma tmp
+        lzma -Q tmp
         ./lzma -d tmp.lzma
         rm xz unxz lzma unlzma
         rm tmp*
@@ -591,8 +646,8 @@
     $ZSTD -f --format=lzma tmp
     $ZSTD -f tmp
     cat tmp.xz tmp.lzma tmp.zst tmp.lzma tmp.xz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
-    head -c -1 tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "xz mode not supported"
@@ -628,7 +683,7 @@
     $ZSTD -f --format=lz4 tmp
     $ZSTD -f tmp
     cat tmp.lz4 tmp.zst tmp.lz4 tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "lz4 mode not supported"
@@ -726,26 +781,32 @@
 $ZSTD -lv *.zst
 
 $ECHO "\n===>  zstd --list/-l error detection tests "
-! $ZSTD -l tmp1 tmp1.zst
-! $ZSTD --list tmp*
-! $ZSTD -lv tmp1*
-! $ZSTD --list -v tmp2 tmp12.zst
+$ZSTD -l tmp1 tmp1.zst && die "-l must fail on non-zstd file"
+$ZSTD --list tmp* && die "-l must fail on non-zstd file"
+$ZSTD -lv tmp1* && die "-l must fail on non-zstd file"
+$ZSTD --list -v tmp2 tmp12.zst && die "-l must fail on non-zstd file"
 
-$ECHO "\n===>  zstd --list/-l exits 1 when stdin is piped in"
-! echo "piped STDIN" | $ZSTD --list
+$ECHO "\n===>  zstd --list/-l errors when presented with stdin / no files"
+$ZSTD -l && die "-l must fail on empty list of files"
+$ZSTD -l - && die "-l does not work on stdin"
+$ZSTD -l < tmp1.zst && die "-l does not work on stdin"
+$ZSTD -l - < tmp1.zst && die "-l does not work on stdin"
+$ZSTD -l - tmp1.zst && die "-l does not work on stdin"
+$ZSTD -l - tmp1.zst < tmp1.zst && die "-l does not work on stdin"
+$ZSTD -l tmp1.zst < tmp2.zst # this will check tmp1.zst, but not tmp2.zst, which is not an error : zstd simply doesn't read stdin in this case. It must not error just because stdin is not a tty
 
 $ECHO "\n===>  zstd --list/-l test with null files "
 ./datagen -g0 > tmp5
 $ZSTD tmp5
 $ZSTD -l tmp5.zst
-! $ZSTD -l tmp5*
+$ZSTD -l tmp5* && die "-l must fail on non-zstd file"
 $ZSTD -lv tmp5.zst | grep "Decompressed Size: 0.00 KB (0 B)"  # check that 0 size is present in header
-! $ZSTD -lv tmp5*
+$ZSTD -lv tmp5* && die "-l must fail on non-zstd file"
 
 $ECHO "\n===>  zstd --list/-l test with no content size field "
 ./datagen -g513K | $ZSTD > tmp6.zst
 $ZSTD -l tmp6.zst
-! $ZSTD -lv tmp6.zst | grep "Decompressed Size:"  # must NOT be present in header
+$ZSTD -lv tmp6.zst | grep "Decompressed Size:"  && die "Field :Decompressed Size: should not be available in this compressed file"
 
 $ECHO "\n===>   zstd --list/-l test with no checksum "
 $ZSTD -f --no-check tmp1
@@ -765,11 +826,22 @@
 roundTripTest -g1M -P50 "1 --single-thread --long=29" " --zstd=wlog=28 --memory=512MB"
 
 
+$ECHO "\n===>   adaptive mode "
+roundTripTest -g270000000 " --adapt"
+roundTripTest -g27000000 " --adapt=min=1,max=4"
+$ECHO "===>   test: --adapt must fail on incoherent bounds "
+./datagen > tmp
+$ZSTD -f -vv --adapt=min=10,max=9 tmp && die "--adapt must fail on incoherent bounds"
+
+
 if [ "$1" != "--test-large-data" ]; then
     $ECHO "Skipping large data tests"
     exit 0
 fi
 
+
+#############################################################################
+
 $ECHO "\n===>   large files tests "
 
 roundTripTest -g270000000 1
@@ -824,4 +896,37 @@
     $ECHO "\n**** no multithreading, skipping zstdmt tests **** "
 fi
 
-rm tmp*
+
+$ECHO "\n===>  cover dictionary builder : advanced options "
+
+TESTFILE=../programs/zstdcli.c
+./datagen > tmpDict
+$ECHO "- Create first dictionary"
+$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c -o tmpDict
+cp $TESTFILE tmp
+$ZSTD -f tmp -D tmpDict
+$ZSTD -d tmp.zst -D tmpDict -fo result
+$DIFF $TESTFILE result
+$ECHO "- Create second (different) dictionary"
+$ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
+$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
+$ECHO "- Create dictionary with short dictID"
+$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
+cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
+$ECHO "- Create dictionary with size limit"
+$ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
+$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
+$ZSTD --train-cover=split=90 -r *.c ../programs/*.c
+$ZSTD --train-cover=split=80 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using all samples for both training and testing"
+$ZSTD --train-cover=split=100 -r *.c ../programs/*.c
+$ECHO "- Test -o before --train-cover"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-cover *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-cover *.c ../programs/*.c
+test -f dictionary
+rm -f tmp* dictionary
+
+
+rm -f tmp*
diff --git a/tests/rateLimiter.py b/tests/rateLimiter.py
new file mode 100755
index 0000000..da0baf0
--- /dev/null
+++ b/tests/rateLimiter.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+# ################################################################
+# Copyright (c) 2018-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under both the BSD-style license (found in the
+# LICENSE file in the root directory of this source tree) and the GPLv2 (found
+# in the COPYING file in the root directory of this source tree).
+# ##########################################################################
+
+# Rate limiter, replacement for pv
+# this rate limiter does not "catch up" after a blocking period
+# Limitations:
+# - only accepts limit speed in MB/s
+
+import sys
+import time
+
+MB = 1024 * 1024
+rate = float(sys.argv[1]) * MB
+start = time.time()
+total_read = 0
+
+# sys.stderr.close()  # remove error message, for Ctrl+C
+
+try:
+  buf = " "
+  while len(buf):
+    now = time.time()
+    to_read = max(int(rate * (now - start)), 1)
+    max_buf_size = 1 * MB
+    to_read = min(to_read, max_buf_size)
+    start = now
+
+    buf = sys.stdin.buffer.read(to_read)
+    sys.stdout.buffer.write(buf)
+
+except (KeyboardInterrupt, BrokenPipeError) as e:
+    pass
diff --git a/tests/roundTripCrash.c b/tests/roundTripCrash.c
index 7d937fc..90afcd4 100644
--- a/tests/roundTripCrash.c
+++ b/tests/roundTripCrash.c
@@ -212,7 +212,7 @@
 static void fileCheck(const char* fileName, int testCCtxParams)
 {
     size_t const fileSize = getFileSize(fileName);
-    void* buffer = malloc(fileSize);
+    void* const buffer = malloc(fileSize + !fileSize /* avoid 0 */);
     if (!buffer) {
         fprintf(stderr, "not enough memory \n");
         exit(4);
diff --git a/tests/symbols.c b/tests/symbols.c
index c0bed2e..b370821 100644
--- a/tests/symbols.c
+++ b/tests/symbols.c
@@ -144,6 +144,8 @@
 /* zdict.h: advanced functions */
   &ZDICT_trainFromBuffer_cover,
   &ZDICT_optimizeTrainFromBuffer_cover,
+  &ZDICT_trainFromBuffer_fastCover,
+  &ZDICT_optimizeTrainFromBuffer_fastCover,
   &ZDICT_finalizeDictionary,
   &ZDICT_trainFromBuffer_legacy,
   &ZDICT_addEntropyTablesFromBuffer,
diff --git a/tests/test-zstd-versions.py b/tests/test-zstd-versions.py
index f2deac1..8e88b86 100755
--- a/tests/test-zstd-versions.py
+++ b/tests/test-zstd-versions.py
@@ -213,7 +213,7 @@
     print('Retrieve all release tags :')
     os.chdir(clone_dir)
     alltags = get_git_tags() + [head]
-    tags = [t for t in alltags if t >= 'v0.4.0']
+    tags = [t for t in alltags if t >= 'v0.5.0']
     print(tags)
 
     # Build all release zstd
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index 22c49cb..f47451a 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -84,7 +84,7 @@
     @return : a 27 bits random value, from a 32-bits `seed`.
     `seed` is also modified */
 #define FUZ_rotl32(x,r) ((x << r) | (x >> (32 - r)))
-unsigned int FUZ_rand(unsigned int* seedPtr)
+static unsigned int FUZ_rand(unsigned int* seedPtr)
 {
     static const U32 prime2 = 2246822519U;
     U32 rand32 = *seedPtr;
@@ -135,34 +135,34 @@
     size_t filled;
 } buffer_t;
 
-static const buffer_t g_nullBuffer = { NULL, 0 , 0 };
+static const buffer_t kBuffNull = { NULL, 0 , 0 };
+
+static void FUZ_freeDictionary(buffer_t dict)
+{
+    free(dict.start);
+}
 
 static buffer_t FUZ_createDictionary(const void* src, size_t srcSize, size_t blockSize, size_t requestedDictSize)
 {
-    buffer_t dict = { NULL, 0, 0 };
+    buffer_t dict = kBuffNull;
     size_t const nbBlocks = (srcSize + (blockSize-1)) / blockSize;
-    size_t* const blockSizes = (size_t*) malloc(nbBlocks * sizeof(size_t));
-    if (!blockSizes) return dict;
+    size_t* const blockSizes = (size_t*)malloc(nbBlocks * sizeof(size_t));
+    if (!blockSizes) return kBuffNull;
     dict.start = malloc(requestedDictSize);
-    if (!dict.start) { free(blockSizes); return dict; }
+    if (!dict.start) { free(blockSizes); return kBuffNull; }
     {   size_t nb;
         for (nb=0; nb<nbBlocks-1; nb++) blockSizes[nb] = blockSize;
         blockSizes[nbBlocks-1] = srcSize - (blockSize * (nbBlocks-1));
     }
     {   size_t const dictSize = ZDICT_trainFromBuffer(dict.start, requestedDictSize, src, blockSizes, (unsigned)nbBlocks);
         free(blockSizes);
-        if (ZDICT_isError(dictSize)) { free(dict.start); return g_nullBuffer; }
+        if (ZDICT_isError(dictSize)) { FUZ_freeDictionary(dict); return kBuffNull; }
         dict.size = requestedDictSize;
         dict.filled = dictSize;
-        return dict;   /* how to return dictSize ? */
+        return dict;
     }
 }
 
-static void FUZ_freeDictionary(buffer_t dict)
-{
-    free(dict.start);
-}
-
 /* Round trips data and updates xxh with the decompressed data produced */
 static size_t SEQ_roundTrip(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
                             XXH64_state_t* xxh, void* data, size_t size,
@@ -276,7 +276,7 @@
 
     ZSTD_inBuffer  inBuff, inBuff2;
     ZSTD_outBuffer outBuff;
-    buffer_t dictionary = g_nullBuffer;
+    buffer_t dictionary = kBuffNull;
     size_t const dictSize = 128 KB;
     unsigned dictID = 0;
 
@@ -600,7 +600,6 @@
         size_t const initError = ZSTD_initCStream_usingCDict(zc, cdict);
         DISPLAYLEVEL(5, "ZSTD_initCStream_usingCDict result : %u ", (U32)initError);
         if (ZSTD_isError(initError)) goto _output_error;
-        cSize = 0;
         outBuff.dst = compressedBuffer;
         outBuff.size = compressedBufferSize;
         outBuff.pos = 0;
@@ -718,7 +717,6 @@
         ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem);
         size_t const initError = ZSTD_initCStream_usingCDict_advanced(zc, cdict, fParams, CNBufferSize);
         if (ZSTD_isError(initError)) goto _output_error;
-        cSize = 0;
         outBuff.dst = compressedBuffer;
         outBuff.size = compressedBufferSize;
         outBuff.pos = 0;
@@ -969,6 +967,26 @@
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_srcSize sets requestedParams : ", testNb++);
+    {   unsigned level;
+        CHECK_Z(ZSTD_initCStream_srcSize(zc, 11, ZSTD_CONTENTSIZE_UNKNOWN));
+        CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level));
+        CHECK(level != 11, "Compression level does not match");
+        ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN);
+        CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level));
+        CHECK(level != 11, "Compression level does not match");
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_advanced sets requestedParams : ", testNb++);
+    {   ZSTD_parameters const params = ZSTD_getParams(9, 0, 0);
+        CHECK_Z(ZSTD_initCStream_advanced(zc, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN));
+        CHECK(badParameters(zc, params), "Compression parameters do not match");
+        ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN);
+        CHECK(badParameters(zc, params), "Compression parameters do not match");
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     /* Overlen overwriting window data bug */
     DISPLAYLEVEL(3, "test%3i : wildcopy doesn't overwrite potential match data : ", testNb++);
     {   /* This test has a window size of 1024 bytes and consists of 3 blocks:
@@ -1002,6 +1020,97 @@
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : dictionary + uncompressible block + reusing tables checks offset table validity: ", testNb++);
+    {   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(
+            dictionary.start, dictionary.filled,
+            ZSTD_dlm_byRef, ZSTD_dct_fullDict,
+            ZSTD_getCParams(3, 0, dictionary.filled),
+            ZSTD_defaultCMem);
+        const size_t inbufsize = 2 * 128 * 1024; /* 2 blocks */
+        const size_t outbufsize = ZSTD_compressBound(inbufsize);
+        size_t inbufpos = 0;   /* number of bytes of inbuf filled so far */
+        size_t cursegmentlen;
+        BYTE *inbuf = (BYTE *)malloc(inbufsize);
+        BYTE *outbuf = (BYTE *)malloc(outbufsize);
+        BYTE *checkbuf = (BYTE *)malloc(inbufsize);
+        size_t ret;
+
+        CHECK(cdict == NULL, "failed to alloc cdict");
+        CHECK(inbuf == NULL || outbuf == NULL || checkbuf == NULL, "failed to alloc test buffers");
+
+        /* first block is uncompressible */
+        cursegmentlen = 128 * 1024;
+        RDG_genBuffer(inbuf + inbufpos, cursegmentlen, 0., 0., seed);
+        inbufpos += cursegmentlen;
+
+        /* second block is compressible */
+        cursegmentlen = 128 * 1024 - 256;
+        RDG_genBuffer(inbuf + inbufpos, cursegmentlen, 0.05, 0., seed);
+        inbufpos += cursegmentlen;
+
+        /* and includes a very long backref (byte cast : offset must not depend on the type of .start) */
+        cursegmentlen = 128;
+        memcpy(inbuf + inbufpos, (const BYTE *)dictionary.start + 256, cursegmentlen);
+        inbufpos += cursegmentlen;
+
+        /* and includes a second very long backref, copied from 128 bytes earlier in the dictionary */
+        cursegmentlen = 128;
+        memcpy(inbuf + inbufpos, (const BYTE *)dictionary.start + 128, cursegmentlen);
+        inbufpos += cursegmentlen;
+
+        ret = ZSTD_compress_usingCDict(zc, outbuf, outbufsize, inbuf, inbufpos, cdict);
+        CHECK_Z(ret);
+
+        ret = ZSTD_decompress_usingDict(zd, checkbuf, inbufsize, outbuf, ret, dictionary.start, dictionary.filled);
+        CHECK_Z(ret);
+
+        CHECK(memcmp(inbuf, checkbuf, inbufpos), "start and finish buffers don't match");
+
+        ZSTD_freeCDict(cdict);
+        free(inbuf);
+        free(outbuf);
+        free(checkbuf);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3i : dictionary + small blocks + reusing tables checks offset table validity: ", testNb++);
+    {   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(
+            dictionary.start, dictionary.filled,
+            ZSTD_dlm_byRef, ZSTD_dct_fullDict,
+            ZSTD_getCParams(3, 0, dictionary.filled),
+            ZSTD_defaultCMem);
+        ZSTD_outBuffer out = {compressedBuffer, compressedBufferSize, 0};
+        int remainingInput = 256 * 1024;
+        int offset;
+        CHECK(cdict == NULL, "failed to alloc cdict");   /* do not reference a missing dictionary */
+        ZSTD_CCtx_reset(zc);
+        CHECK_Z(ZSTD_CCtx_resetParameters(zc));
+        CHECK_Z(ZSTD_CCtx_refCDict(zc, cdict));
+        CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_p_checksumFlag, 1));
+        /* Write a bunch of 6 byte blocks */
+        while (remainingInput > 0) {
+          char testBuffer[6] = "\xAA\xAA\xAA\xAA\xAA\xAA";
+          const size_t kSmallBlockSize = sizeof(testBuffer);
+          ZSTD_inBuffer in = {testBuffer, kSmallBlockSize, 0};
+
+          CHECK_Z(ZSTD_compress_generic(zc, &out, &in, ZSTD_e_flush));
+          CHECK(in.pos != in.size, "input not fully consumed");
+          remainingInput -= (int)kSmallBlockSize;   /* keep the countdown in signed arithmetic */
+        }
+        /* Write several very long offset matches into the dictionary */
+        for (offset = 1024; offset >= 0; offset -= 128) {
+          ZSTD_inBuffer in = {(const BYTE *)dictionary.start + offset, 128, 0};   /* byte offset */
+          ZSTD_EndDirective flush = offset > 0 ? ZSTD_e_continue : ZSTD_e_end;   /* last chunk ends the frame */
+          CHECK_Z(ZSTD_compress_generic(zc, &out, &in, flush));
+          CHECK(in.pos != in.size, "input not fully consumed");
+        }
+        /* Ensure decompression works */
+        CHECK_Z(ZSTD_decompress_usingDict(zd, decodedBuffer, CNBufferSize, out.dst, out.pos, dictionary.start, dictionary.filled));
+
+        ZSTD_freeCDict(cdict);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
 _end:
     FUZ_freeDictionary(dictionary);
     ZSTD_freeCStream(zc);
@@ -1737,7 +1846,8 @@
             }
             {   U64 const pledgedSrcSize = (FUZ_rand(&lseed) & 3) ? ZSTD_CONTENTSIZE_UNKNOWN : maxTestSize;
                 ZSTD_compressionParameters cParams = ZSTD_getCParams(cLevel, pledgedSrcSize, dictSize);
-                static const U32 windowLogMax = 24;
+                const U32 windowLogMax = bigTests ? 24 : 20;
+                const U32 searchLogMax = bigTests ? 15 : 13;
                 if (dictSize)
                     DISPLAYLEVEL(5, "t%u: with dictionary of size : %zu \n", testNb, dictSize);
 
@@ -1747,6 +1857,7 @@
                 cParams.hashLog += (FUZ_rand(&lseed) & 3) - 1;
                 cParams.chainLog += (FUZ_rand(&lseed) & 3) - 1;
                 cParams.searchLog += (FUZ_rand(&lseed) & 3) - 1;
+                cParams.searchLog = MIN(searchLogMax, cParams.searchLog);
                 cParams.searchLength += (FUZ_rand(&lseed) & 3) - 1;
                 cParams.targetLength = (U32)((cParams.targetLength + 1 ) * (0.5 + ((double)(FUZ_rand(&lseed) & 127) / 128)));
                 cParams = ZSTD_adjustCParams(cParams, pledgedSrcSize, dictSize);
@@ -1791,8 +1902,9 @@
                     CHECK_Z( ZSTD_CCtx_setPledgedSrcSize(zc, pledgedSrcSize) );
                 }
 
-                /* multi-threading parameters */
-                {   U32 const nbThreadsCandidate = (FUZ_rand(&lseed) & 4) + 1;
+                /* multi-threading parameters. Only adjust occasionally for small tests. */
+                if (bigTests || (FUZ_rand(&lseed) & 0xF) == 0xF) {
+                    U32 const nbThreadsCandidate = (FUZ_rand(&lseed) & 4) + 1;
                     U32 const nbThreadsAdjusted = (windowLogMalus < nbThreadsCandidate) ? nbThreadsCandidate - windowLogMalus : 1;
                     U32 const nbThreads = MIN(nbThreadsAdjusted, nbThreadsMax);
                     DISPLAYLEVEL(5, "t%u: nbThreads : %u \n", testNb, nbThreads);
@@ -1969,7 +2081,7 @@
 /*-*******************************************************
 *  Command line
 *********************************************************/
-int FUZ_usage(const char* programName)
+static int FUZ_usage(const char* programName)
 {
     DISPLAY( "Usage :\n");
     DISPLAY( "      %s [args]\n", programName);
diff --git a/zlibWrapper/examples/minigzip.c b/zlibWrapper/examples/minigzip.c
index 521d047..f67be09 100644
--- a/zlibWrapper/examples/minigzip.c
+++ b/zlibWrapper/examples/minigzip.c
@@ -18,6 +18,8 @@
 
 /* @(#) $Id$ */
 
+#define _POSIX_SOURCE /* fileno */
+
 #include "zstd_zlibwrapper.h"
 #include <stdio.h>
 
@@ -470,12 +472,8 @@
         exit(1);
     }
 
-#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
-    snprintf(outfile, sizeof(outfile), "%s%s", file, GZ_SUFFIX);
-#else
     strcpy(outfile, file);
     strcat(outfile, GZ_SUFFIX);
-#endif
 
     in = fopen(file, "rb");
     if (in == NULL) {
@@ -510,11 +508,7 @@
         exit(1);
     }
 
-#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
-    snprintf(buf, sizeof(buf), "%s", file);
-#else
     strcpy(buf, file);
-#endif
 
     if (len > SUFFIX_LEN && strcmp(file+len-SUFFIX_LEN, GZ_SUFFIX) == 0) {
         infile = file;
@@ -523,11 +517,7 @@
     } else {
         outfile = file;
         infile = buf;
-#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
-        snprintf(buf + len, sizeof(buf) - len, "%s", GZ_SUFFIX);
-#else
         strcat(infile, GZ_SUFFIX);
-#endif
     }
     in = gzopen(infile, "rb");
     if (in == NULL) {
@@ -565,11 +555,7 @@
     gzFile file;
     char *bname, outmode[20];
 
-#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
-    snprintf(outmode, sizeof(outmode), "%s", "wb6 ");
-#else
     strcpy(outmode, "wb6 ");
-#endif
 
     prog = argv[0];
     bname = strrchr(argv[0], '/');
diff --git a/zlibWrapper/examples/zwrapbench.c b/zlibWrapper/examples/zwrapbench.c
index a4dfbb6..d2d6073 100644
--- a/zlibWrapper/examples/zwrapbench.c
+++ b/zlibWrapper/examples/zwrapbench.c
@@ -573,10 +573,10 @@
     do {
         testmem = (BYTE*)malloc((size_t)requiredMem);
         requiredMem -= step;
-    } while (!testmem);
+    } while (!testmem && requiredMem);   /* do not allocate zero bytes */
 
     free(testmem);
-    return (size_t)(requiredMem);
+    return (size_t)(requiredMem+1);  /* avoid zero */
 }
 
 static void BMK_benchCLevel(void* srcBuffer, size_t benchedSize,
@@ -734,7 +734,7 @@
     if ((U64)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
     if (benchedSize < totalSizeToLoad)
         DISPLAY("Not enough memory; testing %u MB only...\n", (U32)(benchedSize >> 20));
-    srcBuffer = malloc(benchedSize);
+    srcBuffer = malloc(benchedSize + !benchedSize);
     if (!srcBuffer) EXM_THROW(12, "not enough memory");
 
     /* Load input buffer */
diff --git a/zlibWrapper/gzguts.h b/zlibWrapper/gzguts.h
index 84651b8..05bf4d9 100644
--- a/zlibWrapper/gzguts.h
+++ b/zlibWrapper/gzguts.h
@@ -1,5 +1,5 @@
 /* gzguts.h contains minimal changes required to be compiled with zlibWrapper:
- * - #include "zlib.h" was changed to #include "zstd_zlibwrapper.h"        
+ * - #include "zlib.h" was changed to #include "zstd_zlibwrapper.h"
  * - gz_statep was converted to union to work with -Wstrict-aliasing=1      */
 
 /* gzguts.h -- zlib internal header definitions for gz* operations
@@ -44,7 +44,7 @@
 #  include <io.h>
 #endif
 
-#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(_WIN32)
 #  define WIDECHAR
 #endif
 
diff --git a/zlibWrapper/gzlib.c b/zlibWrapper/gzlib.c
index 8235cff..3070dd8 100644
--- a/zlibWrapper/gzlib.c
+++ b/zlibWrapper/gzlib.c
@@ -111,7 +111,7 @@
         return NULL;
 
     /* allocate gzFile structure to return */
-    state = (gz_statep)(gz_state*)malloc(sizeof(gz_state));
+    state.state = (gz_state*)malloc(sizeof(gz_state));
     if (state.state == NULL)
         return NULL;
     state.state->size = 0;            /* no buffers allocated yet */
@@ -266,7 +266,7 @@
     gz_reset(state);
 
     /* return stream */
-    return (gzFile)state.file;
+    return state.file;
 }
 
 /* -- see zlib.h -- */
diff --git a/zlibWrapper/gzwrite.c b/zlibWrapper/gzwrite.c
index d1250b9..21d5f84 100644
--- a/zlibWrapper/gzwrite.c
+++ b/zlibWrapper/gzwrite.c
@@ -6,6 +6,8 @@
  * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
  */
 
+#include <assert.h>
+
 #include "gzguts.h"
 
 /* Local functions */
@@ -24,7 +26,7 @@
     z_streamp strm = &(state.state->strm);
 
     /* allocate input buffer (double size for gzprintf) */
-    state.state->in = (unsigned char *)malloc(state.state->want << 1);
+    state.state->in = (unsigned char*)malloc(state.state->want << 1);
     if (state.state->in == NULL) {
         gz_error(state, Z_MEM_ERROR, "out of memory");
         return -1;
@@ -33,7 +35,7 @@
     /* only need output buffer and deflate state if compressing */
     if (!state.state->direct) {
         /* allocate output buffer */
-        state.state->out = (unsigned char *)malloc(state.state->want);
+        state.state->out = (unsigned char*)malloc(state.state->want);
         if (state.state->out == NULL) {
             free(state.state->in);
             gz_error(state, Z_MEM_ERROR, "out of memory");
@@ -284,6 +286,7 @@
     gz_statep state;
 
     /* get internal structure */
+    assert(size != 0);
     if (file == NULL)
         return 0;
     state = (gz_statep)file;
@@ -294,7 +297,7 @@
 
     /* compute bytes to read -- error on overflow */
     len = nitems * size;
-    if (size && len / size != nitems) {
+    if (size && (len / size != nitems)) {
         gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
         return 0;
     }
