Fix heuristic match skipping.

The heuristic was introduced in 4e2aa98e, based on the C++ Snappy
implementation, but the Go code contained a flawed optimization. The C++
code used an explicit skip variable:

    uint32 bytes_between_hash_lookups = skip++ >> 5;
    next_ip = ip + bytes_between_hash_lookups;

whereas the Go code optimized this to be an implicit skip:

    s += 1 + (s-lit)>>5

This is equivalent for small s values (relative to lit, the last hash
table hit), but diverges for large ones. This Go program demonstrates
the difference:

    // main prints the encoder skipping behavior when seeing no hash table hits.
    func main() {
        s0, s1 := 0, 0
        skip := 32
        for i := 0; i < 300; i++ {
            // This is the C++ Snappy algorithm.
            bytes_between_hash_lookups := skip >> 5
            skip++
            s0 += bytes_between_hash_lookups

            // This is the Go Snappy algorithm.
            s1 += 1 + s1>>5

            // The intention was that the Go algorithm behaves the same as the C++
            // one, but it doesn't.
            if i%10 == 0 {
                fmt.Printf("%d\t%d\t%d\n", i, s0, s1)
            }
        }
    }

It prints (columns: iteration i, C++ position s0, Go position s1):

    0       1       1
    10      11      11
    20      21      21
    30      31      31
    40      50      50
    50      70      73
    60      90      105
    70      117     149
    80      147     208
    90      177     288
    100     212     398
    110     252     548
    120     292     752
    130     335     1030
    140     385     1408
    150     435     1922
    160     486     2619
    170     546     3568
    180     606     4861
    190     666     6617
    200     735     9005
    210     805     12257
    220     875     16681
    230     952     22697
    240     1032    30881
    250     1112    42015
    260     1197    57161
    270     1287    77764
    280     1377    105791
    290     1470    143914

The C++ algorithm is quadratic. The Go algorithm is exponential.

This commit re-introduces the explicit skip variable, so that the Go
implementation matches the C++ implementation.

For completeness, benchmark numbers are included below, but the worse
numbers merely reflect that the old Go algorithm was too aggressive on
skipping ahead on incompressible input (RandomEncode, ZFlat2 and ZFlat4),
and so after an initial warm-up period, it was essentially performing not
much more than a memcpy.
Memcpy is indeed fast in terms of MB/s, but it doesn't compress at all,
which obviously defeats the whole purpose of a compression format like
Snappy.

    benchmark                     old MB/s     new MB/s     speedup
    BenchmarkWordsEncode1e1-4     3.65         3.77         1.03x
    BenchmarkWordsEncode1e2-4     29.22        29.35        1.00x
    BenchmarkWordsEncode1e3-4     99.46        101.20       1.02x
    BenchmarkWordsEncode1e4-4     118.11       121.54       1.03x
    BenchmarkWordsEncode1e5-4     90.37        91.72        1.01x
    BenchmarkWordsEncode1e6-4     107.49       108.88       1.01x
    BenchmarkRandomEncode-4       7679.09      4491.97      0.58x
    Benchmark_ZFlat0-4            229.41       233.79       1.02x
    Benchmark_ZFlat1-4            115.10       116.83       1.02x
    Benchmark_ZFlat2-4            7256.88      3003.79      0.41x
    Benchmark_ZFlat3-4            53.39        54.02        1.01x
    Benchmark_ZFlat4-4            1873.63      289.28       0.15x
    Benchmark_ZFlat5-4            233.29       234.95       1.01x
    Benchmark_ZFlat6-4            101.33       102.79       1.01x
    Benchmark_ZFlat7-4            95.26        96.63        1.01x
    Benchmark_ZFlat8-4            105.66       106.89       1.01x
    Benchmark_ZFlat9-4            92.04        93.11        1.01x
    Benchmark_ZFlat10-4           265.68       265.93       1.00x
    Benchmark_ZFlat11-4           149.72       151.32       1.01x

These numbers were generated on an amd64 machine, but on a different
machine than the one used for other recent commits. The raw MB/s numbers
are therefore not directly comparable, although the speedup numbers
should be.

commit: d1d908a252c22fd7afd36190d5cffb144aa8f777 [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Sun Feb 14 16:54:35 2016 +1100
committer: Nigel Tao <nigeltao@golang.org> Sun Feb 14 16:54:35 2016 +1100
tree: 747c65252cd7e9ae49c6d916954c50006c40b2c2
parent: c2359a1bd0bd4a2de4f1bd92ccd045fb60d0a994 [diff]
diff --git a/encode.go b/encode.go
index b9fe4dc..89109b6 100644
--- a/encode.go
+++ b/encode.go

@@ -137,6 +137,22 @@
 		s   int32 // The iterator position.
 		t   int32 // The last position with the same hash as s.
 		lit int32 // The start position of any pending literal bytes.
+
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned, look at every third byte, etc.. When a match is found,
+		// immediately go back to looking at every byte. This is a small loss
+		// (~5% performance, ~0.1% density) for compressible data due to more
+		// bookkeeping, but for non-compressible data (such as JPEG) it's a
+		// huge win since the compressor quickly "realizes" the data is
+		// incompressible and doesn't bother looking for matches everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip uint32 = 32
 	)
 	for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
 		// Update the hash table.
@@ -150,10 +166,11 @@
 		t, *p = *p-1, s+1
 		// If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte.
 		if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] {
-			// Skip multiple bytes if the last match was >= 32 bytes prior.
-			s += 1 + (s-lit)>>5
+			s += int32(skip >> 5)
+			skip++
 			continue
 		}
+		skip = 32
 		// Otherwise, we have a match. First, emit any pending literal bytes.
 		if lit != s {
 			d += emitLiteral(dst[d:], src[lit:s])

diff --git a/snappy_test.go b/snappy_test.go
index c365e9c..6584403 100644
--- a/snappy_test.go
+++ b/snappy_test.go

@@ -66,7 +66,7 @@
 	for n := 1; n < 20000; n += 23 {
 		b := make([]byte, n)
 		for i := range b {
-			b[i] = uint8(rng.Uint32())
+			b[i] = uint8(rng.Intn(256))
 		}
 		if err := roundtrip(b, nil, nil); err != nil {
 			t.Fatal(err)
@@ -237,6 +237,26 @@
 	}
 }
 
+// TestEncodeNoiseThenRepeats encodes a 32K block for which the first half is
+// very incompressible and the second half is very compressible. The encoded
+// form's length should be closer to 50% of the original length than 100%.
+func TestEncodeNoiseThenRepeats(t *testing.T) {
+	const origLen = 32768
+	src := make([]byte, origLen)
+	rng := rand.New(rand.NewSource(1))
+	firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
+	for i := range firstHalf {
+		firstHalf[i] = uint8(rng.Intn(256))
+	}
+	for i := range secondHalf {
+		secondHalf[i] = uint8(i >> 8)
+	}
+	dst := Encode(nil, src)
+	if got, want := len(dst), origLen*3/4; got >= want {
+		t.Fatalf("got %d encoded bytes, want less than %d", got, want)
+	}
+}
+
 func cmp(a, b []byte) error {
 	if len(a) != len(b) {
 		return fmt.Errorf("got %d bytes, want %d", len(a), len(b))
commit	d1d908a252c22fd7afd36190d5cffb144aa8f777	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Sun Feb 14 16:54:35 2016 +1100
committer	Nigel Tao <nigeltao@golang.org>	Sun Feb 14 16:54:35 2016 +1100
tree	747c65252cd7e9ae49c6d916954c50006c40b2c2
parent	c2359a1bd0bd4a2de4f1bd92ccd045fb60d0a994 [diff]