Change the encoder's hash table values from int32 to uint16.
Doing s/int32/uint16/ in "var table [maxTableSize]int32" halves the table
from 64 KiB to 32 KiB, saving 32 KiB of stack space that needed zeroing.
maxTableSize is 1<<14, or 16384.
We couldn't do this before, in commit cc71ae7c, since we didn't have
maxBlockSize = 65536 at the time; without that bound, table values (offsets
into src) could have exceeded the range of a uint16.
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-4 468.81 465.82 0.99x
BenchmarkWordsEncode1e2-4 32.62 60.23 1.85x
BenchmarkWordsEncode1e3-4 138.36 174.61 1.26x
BenchmarkWordsEncode1e4-4 170.79 172.43 1.01x
BenchmarkWordsEncode1e5-4 131.02 134.25 1.02x
BenchmarkWordsEncode1e6-4 149.33 153.05 1.02x
BenchmarkRandomEncode-4 5933.57 6846.03 1.15x
Benchmark_ZFlat0-4 306.98 310.74 1.01x
Benchmark_ZFlat1-4 194.65 198.74 1.02x
Benchmark_ZFlat2-4 6784.51 8110.98 1.20x
Benchmark_ZFlat3-4 64.06 123.43 1.93x
Benchmark_ZFlat4-4 2102.05 2224.84 1.06x
Benchmark_ZFlat5-4 303.89 307.19 1.01x
Benchmark_ZFlat6-4 132.74 136.37 1.03x
Benchmark_ZFlat7-4 126.50 130.72 1.03x
Benchmark_ZFlat8-4 140.35 143.61 1.02x
Benchmark_ZFlat9-4 121.73 125.62 1.03x
Benchmark_ZFlat10-4 360.89 365.62 1.01x
Benchmark_ZFlat11-4 195.92 199.96 1.02x
diff --git a/encode.go b/encode.go
index 9055405..1a8a821 100644
--- a/encode.go
+++ b/encode.go
@@ -178,13 +178,15 @@
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+ // The table element type is uint16, as s < sLimit and sLimit < len(src)
+ // and len(src) <= maxBlockSize and maxBlockSize == 65536.
const maxTableSize = 1 << 14
shift, tableSize := uint32(32-8), 1<<8
for tableSize < maxTableSize && tableSize < len(src) {
shift--
tableSize *= 2
}
- var table [maxTableSize]int32
+ var table [maxTableSize]uint16
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -228,7 +230,7 @@
goto emitRemainder
}
candidate = int(table[nextHash])
- table[nextHash] = int32(s)
+ table[nextHash] = uint16(s)
nextHash = hash(load32(src, nextS), shift)
if load32(src, s) == load32(src, candidate) {
break
@@ -269,10 +271,10 @@
// three load32 calls.
x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift)
- table[prevHash] = int32(s - 1)
+ table[prevHash] = uint16(s - 1)
currHash := hash(uint32(x>>8), shift)
candidate = int(table[currHash])
- table[currHash] = int32(s)
+ table[currHash] = uint16(s)
if uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift)
s++