Change the encoder's hash table values from int32 to uint16.

Doing s/int32/uint16/ in "var table [maxTableSize]int32" saves 32 KiB of stack
space that needed zeroing: maxTableSize is 1<<14, or 16384 entries, and each
entry shrinks from 4 bytes to 2, so 16384 * 2 = 32768 bytes less to zero.

We couldn't do this before, in commit cc71ae7c, since we didn't have
maxBlockSize = 65536 at the time.

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-4     468.81       465.82       0.99x
BenchmarkWordsEncode1e2-4     32.62        60.23        1.85x
BenchmarkWordsEncode1e3-4     138.36       174.61       1.26x
BenchmarkWordsEncode1e4-4     170.79       172.43       1.01x
BenchmarkWordsEncode1e5-4     131.02       134.25       1.02x
BenchmarkWordsEncode1e6-4     149.33       153.05       1.02x
BenchmarkRandomEncode-4       5933.57      6846.03      1.15x
Benchmark_ZFlat0-4            306.98       310.74       1.01x
Benchmark_ZFlat1-4            194.65       198.74       1.02x
Benchmark_ZFlat2-4            6784.51      8110.98      1.20x
Benchmark_ZFlat3-4            64.06        123.43       1.93x
Benchmark_ZFlat4-4            2102.05      2224.84      1.06x
Benchmark_ZFlat5-4            303.89       307.19       1.01x
Benchmark_ZFlat6-4            132.74       136.37       1.03x
Benchmark_ZFlat7-4            126.50       130.72       1.03x
Benchmark_ZFlat8-4            140.35       143.61       1.02x
Benchmark_ZFlat9-4            121.73       125.62       1.03x
Benchmark_ZFlat10-4           360.89       365.62       1.01x
Benchmark_ZFlat11-4           195.92       199.96       1.02x
diff --git a/encode.go b/encode.go
index 9055405..1a8a821 100644
--- a/encode.go
+++ b/encode.go
@@ -178,13 +178,15 @@
 // 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlock(dst, src []byte) (d int) {
 	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
 	const maxTableSize = 1 << 14
 	shift, tableSize := uint32(32-8), 1<<8
 	for tableSize < maxTableSize && tableSize < len(src) {
 		shift--
 		tableSize *= 2
 	}
-	var table [maxTableSize]int32
+	var table [maxTableSize]uint16
 
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -228,7 +230,7 @@
 				goto emitRemainder
 			}
 			candidate = int(table[nextHash])
-			table[nextHash] = int32(s)
+			table[nextHash] = uint16(s)
 			nextHash = hash(load32(src, nextS), shift)
 			if load32(src, s) == load32(src, candidate) {
 				break
@@ -269,10 +271,10 @@
 			// three load32 calls.
 			x := load64(src, s-1)
 			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash] = int32(s - 1)
+			table[prevHash] = uint16(s - 1)
 			currHash := hash(uint32(x>>8), shift)
 			candidate = int(table[currHash])
-			table[currHash] = int32(s)
+			table[currHash] = uint16(s)
 			if uint32(x>>8) != load32(src, candidate) {
 				nextHash = hash(uint32(x>>16), shift)
 				s++