Optimize the doLit copy: use a single 16-byte load and store instead of two 8-byte ones.

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     528.93       528.93       1.00x
BenchmarkWordsDecode1e2-8     983.60       999.00       1.02x
BenchmarkWordsDecode1e3-8     1474.03      1513.22      1.03x
BenchmarkWordsDecode1e4-8     1523.38      1561.36      1.02x
BenchmarkWordsDecode1e5-8     792.34       800.00       1.01x
BenchmarkWordsDecode1e6-8     881.58       885.13       1.00x
Benchmark_UFlat0-8            2168.73      2224.25      1.03x
Benchmark_UFlat1-8            1431.99      1446.11      1.01x
Benchmark_UFlat2-8            15392.46     15301.72     0.99x
Benchmark_UFlat3-8            1825.26      1841.57      1.01x
Benchmark_UFlat4-8            10885.32     11384.32     1.05x
Benchmark_UFlat5-8            1955.55      2002.59      1.02x
Benchmark_UFlat6-8            833.99       829.35       0.99x
Benchmark_UFlat7-8            794.80       793.35       1.00x
Benchmark_UFlat8-8            859.01       854.84       1.00x
Benchmark_UFlat9-8            731.84       726.50       0.99x
Benchmark_UFlat10-8           2775.21      2898.57      1.04x
Benchmark_UFlat11-8           1032.75      1032.47      1.00x
diff --git a/decode_amd64.s b/decode_amd64.s
index 1486aba..c33f5bf 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -112,7 +112,7 @@
 	CMPQ BX, $16
 	JLT  callMemmove
 
-	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
 	// (Decode's documentation says that dst and src must not overlap.)
 	//
 	// This always copies 16 bytes, instead of only length bytes, but that's
@@ -120,13 +120,11 @@
 	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 	// non-nil error), so the overrun will be ignored.
 	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
-	// loads and stores. This technique probably wouldn't be as effective on
-	// architectures that are fussier about alignment.
-	MOVQ 0(SI), AX
-	MOVQ AX, 0(DI)
-	MOVQ 8(SI), BX
-	MOVQ BX, 8(DI)
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(SI), X0
+	MOVOU X0, 0(DI)
 
 	// d += length
 	// s += length
@@ -310,7 +308,9 @@
 	//
 	// First, try using two 8-byte load/stores, similar to the doLit technique
 	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
-	// still OK if offset >= 8.
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
 	//
 	// if length > 16 || offset < 8 || len(dst)-d < 16 {
 	//   goto slowForwardCopy