Optimize asm for decoding literal fragments.

Relative to the previous commit:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     519.36       518.05       1.00x
BenchmarkWordsDecode1e2-8     691.63       776.28       1.12x
BenchmarkWordsDecode1e3-8     858.97       995.41       1.16x
BenchmarkWordsDecode1e4-8     581.86       615.92       1.06x
BenchmarkWordsDecode1e5-8     380.78       453.95       1.19x
BenchmarkWordsDecode1e6-8     403.12       453.74       1.13x
Benchmark_UFlat0-8            784.21       863.12       1.10x
Benchmark_UFlat1-8            625.49       766.01       1.22x
Benchmark_UFlat2-8            15366.67     15463.36     1.01x
Benchmark_UFlat3-8            1321.47      1388.63      1.05x
Benchmark_UFlat4-8            4338.83      4367.79      1.01x
Benchmark_UFlat5-8            770.24       844.84       1.10x
Benchmark_UFlat6-8            386.10       442.42       1.15x
Benchmark_UFlat7-8            376.79       437.68       1.16x
Benchmark_UFlat8-8            400.47       458.19       1.14x
Benchmark_UFlat9-8            362.89       423.36       1.17x
Benchmark_UFlat10-8           943.89       1023.05      1.08x
Benchmark_UFlat11-8           493.98       507.18       1.03x
diff --git a/decode_amd64.s b/decode_amd64.s
index 2e6ac59..b0513d0 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -94,6 +94,47 @@
 	MOVQ R13, BX
 	SUBQ SI, BX
 
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT  callMemmove
+	CMPQ AX, $16
+	JLT  callMemmove
+	CMPQ BX, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
+	// loads and stores. This technique probably wouldn't be as effective on
+	// architectures that are fussier about alignment.
+	MOVQ 0(SI), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(SI), BX
+	MOVQ BX, 8(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+callMemmove:
 	// if length > len(dst)-d || length > len(src)-s { etc }
 	CMPQ CX, AX
 	JGT  errCorrupt
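
For readers less familiar with the assembly, the fast path added above corresponds
roughly to the pure Go sketch below. This is only an illustration under the same
assumptions the asm makes (at least 16 spare bytes in both dst and src); the
function name fastLiteralCopy and its bool return are invented for this sketch and
are not part of the package or of decode_other.go.

package main

import "fmt"

// fastLiteralCopy is an illustrative sketch (not part of the snappy package)
// of the short-literal fast path. It copies a literal of `length` bytes from
// src[s:] to dst[d:], always moving 16 bytes when that is safe, and reports
// whether the fast path applied.
func fastLiteralCopy(dst, src []byte, d, s, length int) bool {
	// Mirror of the assembly guard:
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { fall back }.
	if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
		return false // caller falls back on copy (runtime·memmove in the asm)
	}
	// Copy 16 bytes even though only `length` bytes are needed. The bytes
	// past d+length are scratch: a later iteration overwrites them, or the
	// decode fails and dst is discarded. The assembly does this as two
	// unaligned 8-byte loads and stores.
	copy(dst[d:d+16], src[s:s+16])
	return true
}

func main() {
	src := []byte("hello, snappy....") // 17 bytes, so len(src)-s >= 16
	dst := make([]byte, 32)
	if fastLiteralCopy(dst, src, 0, 0, 5) {
		fmt.Printf("%q\n", dst[:5]) // prints "hello"
	}
}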