Make heuristic match skipping more aggressive.
This is the Go equivalent of an algorithmic change in the C++ snappy
code:
https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc
The discussion is at:
https://groups.google.com/d/topic/snappy-compression/3Qa3fASLkNA/discussion
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-8 680.57 679.12 1.00x
BenchmarkWordsEncode1e2-8 49.90 49.65 0.99x
BenchmarkWordsEncode1e3-8 213.28 212.75 1.00x
BenchmarkWordsEncode1e4-8 247.05 245.76 0.99x
BenchmarkWordsEncode1e5-8 180.68 179.95 1.00x
BenchmarkWordsEncode1e6-8 205.65 204.83 1.00x
BenchmarkRandomEncode-8 5678.83 11217.33 1.98x
Benchmark_ZFlat0-8 422.83 423.18 1.00x
Benchmark_ZFlat1-8 269.60 271.01 1.01x
Benchmark_ZFlat2-8 5517.16 11517.40 2.09x
Benchmark_ZFlat3-8 92.47 92.39 1.00x
Benchmark_ZFlat4-8 954.63 2947.73 3.09x
Benchmark_ZFlat5-8 419.71 419.87 1.00x
Benchmark_ZFlat6-8 184.13 183.45 1.00x
Benchmark_ZFlat7-8 175.83 175.89 1.00x
Benchmark_ZFlat8-8 193.49 193.84 1.00x
Benchmark_ZFlat9-8 169.02 168.59 1.00x
Benchmark_ZFlat10-8 500.19 499.85 1.00x
Benchmark_ZFlat11-8 271.20 270.60 1.00x
diff --git a/cmd/snappytool/main.cpp b/cmd/snappytool/main.cpp
index db28a89..fc31f51 100644
--- a/cmd/snappytool/main.cpp
+++ b/cmd/snappytool/main.cpp
@@ -1,6 +1,9 @@
/*
To build the snappytool binary:
g++ main.cpp /usr/lib/libsnappy.a -o snappytool
+or, if you have built the C++ snappy library from source:
+g++ main.cpp /path/to/your/snappy/.libs/libsnappy.a -o snappytool
+after running "make" from your snappy checkout directory.
*/
#include <errno.h>
diff --git a/encode.go b/encode.go
index 0fc1cc9..9055405 100644
--- a/encode.go
+++ b/encode.go
@@ -204,12 +204,13 @@
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
- // scanned, look at every third byte, etc.. When a match is found,
- // immediately go back to looking at every byte. This is a small loss
- // (~5% performance, ~0.1% density) for compressible data due to more
- // bookkeeping, but for non-compressible data (such as JPEG) it's a
- // huge win since the compressor quickly "realizes" the data is
- // incompressible and doesn't bother looking for matches everywhere.
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
//
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
@@ -220,8 +221,9 @@
candidate := 0
for {
s = nextS
- nextS = s + skip>>5
- skip++
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
if nextS > sLimit {
goto emitRemainder
}
diff --git a/snappy_test.go b/snappy_test.go
index 714e2a2..665d2bb 100644
--- a/snappy_test.go
+++ b/snappy_test.go
@@ -459,6 +459,12 @@
}
}
+// TODO: pi.txt no longer compresses at all, after the Go code implemented the
+// same algorithmic change as the C++ code:
+// https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc
+//
+// We should use a more compressible test file.
+
func TestDecodeGoldenInput(t *testing.T) {
src, err := ioutil.ReadFile("testdata/pi.txt.rawsnappy")
if err != nil {
@@ -532,6 +538,7 @@
if msg := skipTestSameEncodingAsCpp(); msg != "" {
t.Skip(msg)
}
+ failed := false
for i, tf := range testFiles {
if err := downloadBenchmarkFiles(t, tf.filename); err != nil {
t.Fatalf("failed to download testdata: %s", err)
@@ -542,8 +549,14 @@
}
if err := runTestSameEncodingAsCpp(data); err != nil {
t.Errorf("i=%d: %v", i, err)
+ failed = true
}
}
+ if failed {
+ t.Errorf("was the snappytool program built against the C++ snappy library version " +
+			"d53de187 or later, committed on 2016-04-05? See " +
+ "https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc")
+ }
}
// TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
@@ -595,7 +608,7 @@
// incompressible and the second half is very compressible. The encoded form's
// length should be closer to 50% of the original length than 100%.
func TestEncodeNoiseThenRepeats(t *testing.T) {
- for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
+ for _, origLen := range []int{256 * 1024, 2048 * 1024} {
src := make([]byte, origLen)
rng := rand.New(rand.NewSource(1))
firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
diff --git a/testdata/pi.txt.rawsnappy b/testdata/pi.txt.rawsnappy
index 3378c72..9620784 100644
--- a/testdata/pi.txt.rawsnappy
+++ b/testdata/pi.txt.rawsnappy
Binary files differ