Optimize the doLit copy: use a single 16-byte load and store instead of two 8-byte ones.

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     528.93       528.93       1.00x
BenchmarkWordsDecode1e2-8     983.60       999.00       1.02x
BenchmarkWordsDecode1e3-8     1474.03      1513.22      1.03x
BenchmarkWordsDecode1e4-8     1523.38      1561.36      1.02x
BenchmarkWordsDecode1e5-8     792.34       800.00       1.01x
BenchmarkWordsDecode1e6-8     881.58       885.13       1.00x
Benchmark_UFlat0-8            2168.73      2224.25      1.03x
Benchmark_UFlat1-8            1431.99      1446.11      1.01x
Benchmark_UFlat2-8            15392.46     15301.72     0.99x
Benchmark_UFlat3-8            1825.26      1841.57      1.01x
Benchmark_UFlat4-8            10885.32     11384.32     1.05x
Benchmark_UFlat5-8            1955.55      2002.59      1.02x
Benchmark_UFlat6-8            833.99       829.35       0.99x
Benchmark_UFlat7-8            794.80       793.35       1.00x
Benchmark_UFlat8-8            859.01       854.84       1.00x
Benchmark_UFlat9-8            731.84       726.50       0.99x
Benchmark_UFlat10-8           2775.21      2898.57      1.04x
Benchmark_UFlat11-8           1032.75      1032.47      1.00x
diff --git a/decode_amd64.s b/decode_amd64.s
index 1486aba..c33f5bf 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -112,7 +112,7 @@
 	CMPQ BX, $16
 	JLT  callMemmove
 
-	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
 	// (Decode's documentation says that dst and src must not overlap.)
 	//
 	// This always copies 16 bytes, instead of only length bytes, but that's
@@ -120,13 +120,11 @@
 	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 	// non-nil error), so the overrun will be ignored.
 	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
-	// loads and stores. This technique probably wouldn't be as effective on
-	// architectures that are fussier about alignment.
-	MOVQ 0(SI), AX
-	MOVQ AX, 0(DI)
-	MOVQ 8(SI), BX
-	MOVQ BX, 8(DI)
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(SI), X0
+	MOVOU X0, 0(DI)
 
 	// d += length
 	// s += length
@@ -310,7 +308,9 @@
 	//
 	// First, try using two 8-byte load/stores, similar to the doLit technique
 	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
-	// still OK if offset >= 8.
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
 	//
 	// if length > 16 || offset < 8 || len(dst)-d < 16 {
 	//   goto slowForwardCopy