Eliminate some bounds checks on the encoder's hash table accesses.

It seems like a small win, with throughput (MB/s) changing by roughly 0-2%:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-4     465.69       471.77       1.01x
BenchmarkWordsEncode1e2-4     60.18        60.27        1.00x
BenchmarkWordsEncode1e3-4     174.42       176.26       1.01x
BenchmarkWordsEncode1e4-4     172.40       175.95       1.02x
BenchmarkWordsEncode1e5-4     134.42       134.86       1.00x
BenchmarkWordsEncode1e6-4     153.09       154.03       1.01x
BenchmarkRandomEncode-4       6504.88      6553.55      1.01x
Benchmark_ZFlat0-4            310.55       313.22       1.01x
Benchmark_ZFlat1-4            198.43       199.73       1.01x
Benchmark_ZFlat2-4            7915.02      8052.65      1.02x
Benchmark_ZFlat3-4            123.07       123.53       1.00x
Benchmark_ZFlat4-4            2220.35      2230.80      1.00x
Benchmark_ZFlat5-4            307.05       309.51       1.01x
Benchmark_ZFlat6-4            136.35       137.19       1.01x
Benchmark_ZFlat7-4            130.67       131.33       1.01x
Benchmark_ZFlat8-4            143.17       144.47       1.01x
Benchmark_ZFlat9-4            125.40       125.85       1.00x
Benchmark_ZFlat10-4           364.30       370.35       1.02x
Benchmark_ZFlat11-4           200.04       199.80       1.00x
diff --git a/encode.go b/encode.go
index 1a8a821..59d44d7 100644
--- a/encode.go
+++ b/encode.go
@@ -180,7 +180,12 @@
 	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
 	// The table element type is uint16, as s < sLimit and sLimit < len(src)
 	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const maxTableSize = 1 << 14
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but masking table indices with it helps the
+		// compiler eliminate bounds checks.
+		tableMask = maxTableSize - 1
+	)
 	shift, tableSize := uint32(32-8), 1<<8
 	for tableSize < maxTableSize && tableSize < len(src) {
 		shift--
@@ -229,8 +234,8 @@
 			if nextS > sLimit {
 				goto emitRemainder
 			}
-			candidate = int(table[nextHash])
-			table[nextHash] = uint16(s)
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
 			nextHash = hash(load32(src, nextS), shift)
 			if load32(src, s) == load32(src, candidate) {
 				break
@@ -271,10 +276,10 @@
 			// three load32 calls.
 			x := load64(src, s-1)
 			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash] = uint16(s - 1)
+			table[prevHash&tableMask] = uint16(s - 1)
 			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash])
-			table[currHash] = uint16(s)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
 			if uint32(x>>8) != load32(src, candidate) {
 				nextHash = hash(uint32(x>>16), shift)
 				s++