Eliminate some bounds checks on hash table accesses in the encoder.
It appears to be a small win (at most about 2% on these benchmarks):
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-4 465.69 471.77 1.01x
BenchmarkWordsEncode1e2-4 60.18 60.27 1.00x
BenchmarkWordsEncode1e3-4 174.42 176.26 1.01x
BenchmarkWordsEncode1e4-4 172.40 175.95 1.02x
BenchmarkWordsEncode1e5-4 134.42 134.86 1.00x
BenchmarkWordsEncode1e6-4 153.09 154.03 1.01x
BenchmarkRandomEncode-4 6504.88 6553.55 1.01x
Benchmark_ZFlat0-4 310.55 313.22 1.01x
Benchmark_ZFlat1-4 198.43 199.73 1.01x
Benchmark_ZFlat2-4 7915.02 8052.65 1.02x
Benchmark_ZFlat3-4 123.07 123.53 1.00x
Benchmark_ZFlat4-4 2220.35 2230.80 1.00x
Benchmark_ZFlat5-4 307.05 309.51 1.01x
Benchmark_ZFlat6-4 136.35 137.19 1.01x
Benchmark_ZFlat7-4 130.67 131.33 1.01x
Benchmark_ZFlat8-4 143.17 144.47 1.01x
Benchmark_ZFlat9-4 125.40 125.85 1.00x
Benchmark_ZFlat10-4 364.30 370.35 1.02x
Benchmark_ZFlat11-4 200.04 199.80 1.00x
diff --git a/encode.go b/encode.go
index 1a8a821..59d44d7 100644
--- a/encode.go
+++ b/encode.go
@@ -180,7 +180,12 @@
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
// The table element type is uint16, as s < sLimit and sLimit < len(src)
// and len(src) <= maxBlockSize and maxBlockSize == 65536.
- const maxTableSize = 1 << 14
+ const (
+ maxTableSize = 1 << 14
+ // tableMask is redundant, but helps the compiler eliminate bounds
+ // checks.
+ tableMask = maxTableSize - 1
+ )
shift, tableSize := uint32(32-8), 1<<8
for tableSize < maxTableSize && tableSize < len(src) {
shift--
@@ -229,8 +234,8 @@
if nextS > sLimit {
goto emitRemainder
}
- candidate = int(table[nextHash])
- table[nextHash] = uint16(s)
+ candidate = int(table[nextHash&tableMask])
+ table[nextHash&tableMask] = uint16(s)
nextHash = hash(load32(src, nextS), shift)
if load32(src, s) == load32(src, candidate) {
break
@@ -271,10 +276,10 @@
// three load32 calls.
x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift)
- table[prevHash] = uint16(s - 1)
+ table[prevHash&tableMask] = uint16(s - 1)
currHash := hash(uint32(x>>8), shift)
- candidate = int(table[currHash])
- table[currHash] = uint16(s)
+ candidate = int(table[currHash&tableMask])
+ table[currHash&tableMask] = uint16(s)
if uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift)
s++