Optimize asm for decoding copy fragments.

Relative to the previous commit:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     518.05       518.80       1.00x
BenchmarkWordsDecode1e2-8     776.28       871.43       1.12x
BenchmarkWordsDecode1e3-8     995.41       1411.32      1.42x
BenchmarkWordsDecode1e4-8     615.92       1469.60      2.39x
BenchmarkWordsDecode1e5-8     453.95       771.07       1.70x
BenchmarkWordsDecode1e6-8     453.74       872.19       1.92x
Benchmark_UFlat0-8            863.12       1129.79      1.31x
Benchmark_UFlat1-8            766.01       1075.37      1.40x
Benchmark_UFlat2-8            15463.36     15617.45     1.01x
Benchmark_UFlat3-8            1388.63      1438.15      1.04x
Benchmark_UFlat4-8            4367.79      4838.37      1.11x
Benchmark_UFlat5-8            844.84       1075.46      1.27x
Benchmark_UFlat6-8            442.42       811.70       1.83x
Benchmark_UFlat7-8            437.68       781.87       1.79x
Benchmark_UFlat8-8            458.19       819.38       1.79x
Benchmark_UFlat9-8            423.36       724.43       1.71x
Benchmark_UFlat10-8           1023.05      1193.70      1.17x
Benchmark_UFlat11-8           507.18       879.15       1.73x
diff --git a/decode_amd64.s b/decode_amd64.s
index b0513d0..c38bd68 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -23,7 +23,7 @@
 //	+ R11	src_base
 //	+ R12	src_len
 //	+ R13	src_base + src_len
-//	- R14	unused
+//	- R14	used by doCopy
 //	- R15	used by doCopy
 //
 // The registers R8-R13 (marked with a "+") are set at the start of the
@@ -299,10 +299,37 @@
 	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 	//
 	// Set:
+	//	- R14 = len(dst)-d
 	//	- R15 = &dst[d-offset]
+	MOVQ R10, R14
+	SUBQ DI, R14
 	MOVQ DI, R15
 	SUBQ DX, R15
 
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8.
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto verySlowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ CX, $16
+	JGT  verySlowForwardCopy
+	CMPQ DX, $8
+	JLT  verySlowForwardCopy
+	CMPQ R14, $16
+	JLT  verySlowForwardCopy
+	MOVQ 0(R15), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(R15), BX
+	MOVQ BX, 8(DI)
+	ADDQ CX, DI
+	JMP  loop
+
 verySlowForwardCopy:
 	// verySlowForwardCopy is a simple implementation of forward copy. In C
 	// parlance, this is a do/while loop instead of a while loop, since we know