Change the encoder's hash table values from int to int32.
Doing s/int/int32/ in "var table [maxTableSize]int" saves 64 KiB of
stack space that needed zero'ing. maxTableSize is 1<<14, or 16384.
The benchmarks show the biggest effect for small src lengths, or for
mostly uncompressible data such as the JPEG file (possibly because the
multiple-byte skipping means that the src is effectively short).
On amd64:
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-8 3.05 5.71 1.87x
BenchmarkWordsEncode1e2-8 26.98 44.87 1.66x
BenchmarkWordsEncode1e3-8 130.87 156.72 1.20x
BenchmarkWordsEncode1e4-8 162.48 180.89 1.11x
BenchmarkWordsEncode1e5-8 132.35 131.27 0.99x
BenchmarkWordsEncode1e6-8 159.97 158.49 0.99x
BenchmarkRandomEncode-8 12340.86 13485.69 1.09x
Benchmark_ZFlat0-8 329.92 329.17 1.00x
Benchmark_ZFlat1-8 165.06 164.46 1.00x
Benchmark_ZFlat2-8 8955.25 10530.49 1.18x
Benchmark_ZFlat3-8 47.79 80.06 1.68x
Benchmark_ZFlat4-8 2650.55 2732.00 1.03x
Benchmark_ZFlat5-8 336.52 334.94 1.00x
Benchmark_ZFlat6-8 147.99 145.85 0.99x
Benchmark_ZFlat7-8 136.32 137.20 1.01x
Benchmark_ZFlat8-8 153.03 152.15 0.99x
Benchmark_ZFlat9-8 133.18 131.74 0.99x
Benchmark_ZFlat10-8 376.02 378.28 1.01x
Benchmark_ZFlat11-8 224.16 216.81 0.97x
Thanks to Klaus Post for the original suggestion on
https://github.com/golang/snappy/pull/23 but I hesitate to accept that
pull request in its entirety as it makes many changes, some more
complicated than this separable, self-contained s/int/int32/ change.
diff --git a/encode.go b/encode.go
index 297e628..8e3b8c4 100644
--- a/encode.go
+++ b/encode.go
@@ -52,7 +52,7 @@
}
// emitCopy writes a copy chunk and returns the number of bytes written.
-func emitCopy(dst []byte, offset, length int) int {
+func emitCopy(dst []byte, offset, length int32) int {
i := 0
for length > 0 {
x := length - 4
@@ -88,12 +88,36 @@
// The block starts with the varint-encoded length of the decompressed bytes.
d := binary.PutUvarint(dst, uint64(len(src)))
+ for len(src) > 0 {
+ p := src
+ src = nil
+ if len(p) > maxInternalEncodeSrcLen {
+ p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+ }
+ d += encode(dst[d:], p)
+ }
+ return dst[:d]
+}
+
+// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
+// (internal) encode function, it is safe to have the s variable (which indexes
+// the src slice), and therefore the hash table entries, to have type int32
+// instead of int.
+const maxInternalEncodeSrcLen = 0x40000000
+
+// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
+// that the varint-encoded length of the decompressed bytes has already been
+// written.
+//
+// It also assumes that:
+// len(dst) >= MaxEncodedLen(len(src)) &&
+// 0 < len(src) &&
+// len(src) <= maxInternalEncodeSrcLen &&
+// maxInternalEncodeSrcLen < math.MaxInt32.
+func encode(dst, src []byte) (d int) {
// Return early if src is short.
if len(src) <= 4 {
- if len(src) != 0 {
- d += emitLiteral(dst[d:], src)
- }
- return dst[:d]
+ return emitLiteral(dst, src)
}
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
@@ -103,15 +127,15 @@
shift--
tableSize *= 2
}
- var table [maxTableSize]int
+ var table [maxTableSize]int32
// Iterate over the source bytes.
var (
- s int // The iterator position.
- t int // The last position with the same hash as s.
- lit int // The start position of any pending literal bytes.
+ s int32 // The iterator position.
+ t int32 // The last position with the same hash as s.
+ lit int32 // The start position of any pending literal bytes.
)
- for uint(s+3) < uint(len(src)) { // The uint conversions catch overflow from the +3.
+ for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
// Update the hash table.
b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3]
h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24
@@ -134,7 +158,7 @@
// Extend the match to be as long as possible.
s0 := s
s, t = s+4, t+4
- for s < len(src) && src[s] == src[t] {
+ for int(s) < len(src) && src[s] == src[t] {
s++
t++
}
@@ -144,10 +168,10 @@
}
// Emit any final pending literal bytes and return.
- if lit != len(src) {
+ if int(lit) != len(src) {
d += emitLiteral(dst[d:], src[lit:])
}
- return dst[:d]
+ return d
}
// MaxEncodedLen returns the maximum length of a snappy block, given its