Encode copies of length 65, 66 or 67 as 5 bytes, not 6.

The benchmarks don't show a big change either way, but the output is
shorter, and it matches what the C++ snappy code does.

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     5.77         5.77         1.00x
BenchmarkWordsEncode1e2-8     47.15        47.26        1.00x
BenchmarkWordsEncode1e3-8     180.77       183.25       1.01x
BenchmarkWordsEncode1e4-8     202.01       198.96       0.98x
BenchmarkWordsEncode1e5-8     145.66       144.68       0.99x
BenchmarkWordsEncode1e6-8     174.12       172.31       0.99x
BenchmarkRandomEncode-8       4522.91      4495.78      0.99x
Benchmark_ZFlat0-8            359.70       359.79       1.00x
Benchmark_ZFlat1-8            181.18       181.82       1.00x
Benchmark_ZFlat2-8            4612.52      4557.46      0.99x
Benchmark_ZFlat3-8            85.65        84.82        0.99x
Benchmark_ZFlat4-8            559.51       558.52       1.00x
Benchmark_ZFlat5-8            354.88       352.91       0.99x
Benchmark_ZFlat6-8            156.14       156.26       1.00x
Benchmark_ZFlat7-8            148.18       148.12       1.00x
Benchmark_ZFlat8-8            162.68       162.21       1.00x
Benchmark_ZFlat9-8            141.81       142.32       1.00x
Benchmark_ZFlat10-8           399.79       401.94       1.01x
Benchmark_ZFlat11-8           237.43       235.91       0.99x
diff --git a/encode.go b/encode.go
index 38ebe95..939fb58 100644
--- a/encode.go
+++ b/encode.go
@@ -55,26 +55,42 @@
 // emitCopy writes a copy chunk and returns the number of bytes written.
 func emitCopy(dst []byte, offset, length int32) int {
 	i := 0
-	for length > 0 {
-		x := length - 4
-		if 0 <= x && x < 1<<3 && offset < 1<<11 {
-			dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1
-			dst[i+1] = uint8(offset)
-			i += 2
-			break
-		}
-
-		x = length
-		if x > 1<<6 {
-			x = 1 << 6
-		}
-		dst[i+0] = uint8(x-1)<<2 | tagCopy2
+	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
+	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
+	// length emitted down below is is a little lower (at 60 = 64 - 4), because
+	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
+	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
+	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
+	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
+	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
+	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
+	for length >= 68 {
+		// Emit a length 64 copy, encoded as 3 bytes.
+		dst[i+0] = 63<<2 | tagCopy2
 		dst[i+1] = uint8(offset)
 		dst[i+2] = uint8(offset >> 8)
 		i += 3
-		length -= x
+		length -= 64
 	}
-	return i
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		dst[i+0] = 59<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 60
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		return i + 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes.
+	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	dst[i+1] = uint8(offset)
+	return i + 2
 }
 
 // Encode returns the encoded form of src. The returned slice may be a sub-
diff --git a/snappy_test.go b/snappy_test.go
index 65c0013..3d70a26 100644
--- a/snappy_test.go
+++ b/snappy_test.go
@@ -553,20 +553,36 @@
 	defer w.Close()
 	w.Write([]byte("abcd")) // Not compressible.
 	w.Flush()
-	w.Write(bytes.Repeat([]byte{'A'}, 100)) // Compressible.
+	w.Write(bytes.Repeat([]byte{'A'}, 150)) // Compressible.
 	w.Flush()
+	// The next chunk is also compressible, but a naive, greedy encoding of the
+	// overall length 67 copy as a length 64 copy (the longest expressible as a
+	// tagCopy1 or tagCopy2) plus a length 3 remainder would be two 3-byte
+	// tagCopy2 tags (6 bytes), since the minimum length for a tagCopy1 is 4
+	// bytes. Instead, we could do it shorter, in 5 bytes: a 3-byte tagCopy2
+	// (of length 60) and a 2-byte tagCopy1 (of length 7).
+	w.Write(bytes.Repeat([]byte{'B'}, 68))
+	w.Flush()
+
 	got := buf.String()
 	want := strings.Join([]string{
 		magicChunk,
 		"\x01\x08\x00\x00", // Uncompressed chunk, 8 bytes long (including 4 byte checksum).
 		"\x68\x10\xe6\xb6", // Checksum.
 		"\x61\x62\x63\x64", // Uncompressed payload: "abcd".
-		"\x00\x0d\x00\x00", // Compressed chunk, 13 bytes long (including 4 byte checksum).
-		"\x37\xcb\xbc\x9d", // Checksum.
-		"\x64",             // Compressed payload: Uncompressed length (varint encoded): 100.
+		"\x00\x11\x00\x00", // Compressed chunk, 17 bytes long (including 4 byte checksum).
+		"\x5f\xeb\xf2\x10", // Checksum.
+		"\x96\x01",         // Compressed payload: Uncompressed length (varint encoded): 150.
 		"\x00\x41",         // Compressed payload: tagLiteral, length=1, "A".
 		"\xfe\x01\x00",     // Compressed payload: tagCopy2,   length=64, offset=1.
-		"\x8a\x01\x00",     // Compressed payload: tagCopy2,   length=35, offset=1.
+		"\xfe\x01\x00",     // Compressed payload: tagCopy2,   length=64, offset=1.
+		"\x52\x01\x00",     // Compressed payload: tagCopy2,   length=21, offset=1.
+		"\x00\x0c\x00\x00", // Compressed chunk, 12 bytes long (including 4 byte checksum).
+		"\x27\x50\xe4\x4e", // Checksum.
+		"\x44",             // Compressed payload: Uncompressed length (varint encoded): 68.
+		"\x00\x42",         // Compressed payload: tagLiteral, length=1, "B".
+		"\xee\x01\x00",     // Compressed payload: tagCopy2,   length=60, offset=1.
+		"\x0d\x01",         // Compressed payload: tagCopy1,   length=7, offset=1.
 	}, "")
 	if got != want {
 		t.Fatalf("\ngot:  % x\nwant: % x", got, want)