Change the encoder's hash table values from int to int32.

Doing s/int/int32/ in "var table [maxTableSize]int" saves 64 KiB of
stack space that needed zeroing. maxTableSize is 1<<14, or 16384.

The benchmarks show the biggest effect for small src lengths, or for
mostly incompressible data such as the JPEG file (possibly because the
multiple-byte skipping means that the src is effectively short).

On amd64:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     3.05         5.71         1.87x
BenchmarkWordsEncode1e2-8     26.98        44.87        1.66x
BenchmarkWordsEncode1e3-8     130.87       156.72       1.20x
BenchmarkWordsEncode1e4-8     162.48       180.89       1.11x
BenchmarkWordsEncode1e5-8     132.35       131.27       0.99x
BenchmarkWordsEncode1e6-8     159.97       158.49       0.99x
BenchmarkRandomEncode-8       12340.86     13485.69     1.09x
Benchmark_ZFlat0-8            329.92       329.17       1.00x
Benchmark_ZFlat1-8            165.06       164.46       1.00x
Benchmark_ZFlat2-8            8955.25      10530.49     1.18x
Benchmark_ZFlat3-8            47.79        80.06        1.68x
Benchmark_ZFlat4-8            2650.55      2732.00      1.03x
Benchmark_ZFlat5-8            336.52       334.94       1.00x
Benchmark_ZFlat6-8            147.99       145.85       0.99x
Benchmark_ZFlat7-8            136.32       137.20       1.01x
Benchmark_ZFlat8-8            153.03       152.15       0.99x
Benchmark_ZFlat9-8            133.18       131.74       0.99x
Benchmark_ZFlat10-8           376.02       378.28       1.01x
Benchmark_ZFlat11-8           224.16       216.81       0.97x

Thanks to Klaus Post for the original suggestion on
https://github.com/golang/snappy/pull/23 but I hesitate to accept that
pull request in its entirety as it makes many changes, some more
complicated than this separable, self-contained s/int/int32/ change.
diff --git a/encode.go b/encode.go
index 297e628..8e3b8c4 100644
--- a/encode.go
+++ b/encode.go
@@ -52,7 +52,7 @@
 }
 
 // emitCopy writes a copy chunk and returns the number of bytes written.
-func emitCopy(dst []byte, offset, length int) int {
+func emitCopy(dst []byte, offset, length int32) int {
 	i := 0
 	for length > 0 {
 		x := length - 4
@@ -88,12 +88,36 @@
 	// The block starts with the varint-encoded length of the decompressed bytes.
 	d := binary.PutUvarint(dst, uint64(len(src)))
 
+	for len(src) > 0 {
+		p := src
+		src = nil
+		if len(p) > maxInternalEncodeSrcLen {
+			p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+		}
+		d += encode(dst[d:], p)
+	}
+	return dst[:d]
+}
+
+// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
+// (internal) encode function, it is safe to have the s variable (which indexes
+// the src slice), and therefore the hash table entries, to have type int32
+// instead of int.
+const maxInternalEncodeSrcLen = 0x40000000
+
+// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
+// that the varint-encoded length of the decompressed bytes has already been
+// written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	0 < len(src) &&
+//	len(src) <= maxInternalEncodeSrcLen &&
+// 	maxInternalEncodeSrcLen < math.MaxInt32.
+func encode(dst, src []byte) (d int) {
 	// Return early if src is short.
 	if len(src) <= 4 {
-		if len(src) != 0 {
-			d += emitLiteral(dst[d:], src)
-		}
-		return dst[:d]
+		return emitLiteral(dst, src)
 	}
 
 	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
@@ -103,15 +127,15 @@
 		shift--
 		tableSize *= 2
 	}
-	var table [maxTableSize]int
+	var table [maxTableSize]int32
 
 	// Iterate over the source bytes.
 	var (
-		s   int // The iterator position.
-		t   int // The last position with the same hash as s.
-		lit int // The start position of any pending literal bytes.
+		s   int32 // The iterator position.
+		t   int32 // The last position with the same hash as s.
+		lit int32 // The start position of any pending literal bytes.
 	)
-	for uint(s+3) < uint(len(src)) { // The uint conversions catch overflow from the +3.
+	for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
 		// Update the hash table.
 		b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3]
 		h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24
@@ -134,7 +158,7 @@
 		// Extend the match to be as long as possible.
 		s0 := s
 		s, t = s+4, t+4
-		for s < len(src) && src[s] == src[t] {
+		for int(s) < len(src) && src[s] == src[t] {
 			s++
 			t++
 		}
@@ -144,10 +168,10 @@
 	}
 
 	// Emit any final pending literal bytes and return.
-	if lit != len(src) {
+	if int(lit) != len(src) {
 		d += emitLiteral(dst[d:], src[lit:])
 	}
-	return dst[:d]
+	return d
 }
 
 // MaxEncodedLen returns the maximum length of a snappy block, given its