Change the encoder's hash table values from int to int32.

Doing s/int/int32/ in "var table [maxTableSize]int" saves 64 KiB of
stack space that needed zeroing. maxTableSize is 1<<14, or 16384.

The benchmarks show the biggest effect for small src lengths, or for
mostly incompressible data such as the JPEG file (possibly because the
multiple-byte skipping means that the src is effectively short).

On amd64:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     3.05         5.71         1.87x
BenchmarkWordsEncode1e2-8     26.98        44.87        1.66x
BenchmarkWordsEncode1e3-8     130.87       156.72       1.20x
BenchmarkWordsEncode1e4-8     162.48       180.89       1.11x
BenchmarkWordsEncode1e5-8     132.35       131.27       0.99x
BenchmarkWordsEncode1e6-8     159.97       158.49       0.99x
BenchmarkRandomEncode-8       12340.86     13485.69     1.09x
Benchmark_ZFlat0-8            329.92       329.17       1.00x
Benchmark_ZFlat1-8            165.06       164.46       1.00x
Benchmark_ZFlat2-8            8955.25      10530.49     1.18x
Benchmark_ZFlat3-8            47.79        80.06        1.68x
Benchmark_ZFlat4-8            2650.55      2732.00      1.03x
Benchmark_ZFlat5-8            336.52       334.94       1.00x
Benchmark_ZFlat6-8            147.99       145.85       0.99x
Benchmark_ZFlat7-8            136.32       137.20       1.01x
Benchmark_ZFlat8-8            153.03       152.15       0.99x
Benchmark_ZFlat9-8            133.18       131.74       0.99x
Benchmark_ZFlat10-8           376.02       378.28       1.01x
Benchmark_ZFlat11-8           224.16       216.81       0.97x

Thanks to Klaus Post for the original suggestion on
https://github.com/golang/snappy/pull/23 but I hesitate to accept that
pull request in its entirety as it makes many changes, some more
complicated than this separable, self-contained s/int/int32/ change.
diff --git a/encode.go b/encode.go
index 297e628..8e3b8c4 100644
--- a/encode.go
+++ b/encode.go
@@ -52,7 +52,7 @@
 }
 
 // emitCopy writes a copy chunk and returns the number of bytes written.
-func emitCopy(dst []byte, offset, length int) int {
+func emitCopy(dst []byte, offset, length int32) int {
 	i := 0
 	for length > 0 {
 		x := length - 4
@@ -88,12 +88,36 @@
 	// The block starts with the varint-encoded length of the decompressed bytes.
 	d := binary.PutUvarint(dst, uint64(len(src)))
 
+	for len(src) > 0 {
+		p := src
+		src = nil
+		if len(p) > maxInternalEncodeSrcLen {
+			p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+		}
+		d += encode(dst[d:], p)
+	}
+	return dst[:d]
+}
+
+// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
+// (internal) encode function, it is safe to have the s variable (which indexes
+// the src slice), and therefore the hash table entries, to have type int32
+// instead of int.
+const maxInternalEncodeSrcLen = 0x40000000
+
+// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
+// that the varint-encoded length of the decompressed bytes has already been
+// written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	0 < len(src) &&
+//	len(src) <= maxInternalEncodeSrcLen &&
+// 	maxInternalEncodeSrcLen < math.MaxInt32.
+func encode(dst, src []byte) (d int) {
 	// Return early if src is short.
 	if len(src) <= 4 {
-		if len(src) != 0 {
-			d += emitLiteral(dst[d:], src)
-		}
-		return dst[:d]
+		return emitLiteral(dst, src)
 	}
 
 	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
@@ -103,15 +127,15 @@
 		shift--
 		tableSize *= 2
 	}
-	var table [maxTableSize]int
+	var table [maxTableSize]int32
 
 	// Iterate over the source bytes.
 	var (
-		s   int // The iterator position.
-		t   int // The last position with the same hash as s.
-		lit int // The start position of any pending literal bytes.
+		s   int32 // The iterator position.
+		t   int32 // The last position with the same hash as s.
+		lit int32 // The start position of any pending literal bytes.
 	)
-	for uint(s+3) < uint(len(src)) { // The uint conversions catch overflow from the +3.
+	for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
 		// Update the hash table.
 		b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3]
 		h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24
@@ -134,7 +158,7 @@
 		// Extend the match to be as long as possible.
 		s0 := s
 		s, t = s+4, t+4
-		for s < len(src) && src[s] == src[t] {
+		for int(s) < len(src) && src[s] == src[t] {
 			s++
 			t++
 		}
@@ -144,10 +168,10 @@
 	}
 
 	// Emit any final pending literal bytes and return.
-	if lit != len(src) {
+	if int(lit) != len(src) {
 		d += emitLiteral(dst[d:], src[lit:])
 	}
-	return dst[:d]
+	return d
 }
 
 // MaxEncodedLen returns the maximum length of a snappy block, given its