Optimize asm for decoding copy fragments.

Relative to the previous commit:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     518.05       518.80       1.00x
BenchmarkWordsDecode1e2-8     776.28       871.43       1.12x
BenchmarkWordsDecode1e3-8     995.41       1411.32      1.42x
BenchmarkWordsDecode1e4-8     615.92       1469.60      2.39x
BenchmarkWordsDecode1e5-8     453.95       771.07       1.70x
BenchmarkWordsDecode1e6-8     453.74       872.19       1.92x
Benchmark_UFlat0-8            863.12       1129.79      1.31x
Benchmark_UFlat1-8            766.01       1075.37      1.40x
Benchmark_UFlat2-8            15463.36     15617.45     1.01x
Benchmark_UFlat3-8            1388.63      1438.15      1.04x
Benchmark_UFlat4-8            4367.79      4838.37      1.11x
Benchmark_UFlat5-8            844.84       1075.46      1.27x
Benchmark_UFlat6-8            442.42       811.70       1.83x
Benchmark_UFlat7-8            437.68       781.87       1.79x
Benchmark_UFlat8-8            458.19       819.38       1.79x
Benchmark_UFlat9-8            423.36       724.43       1.71x
Benchmark_UFlat10-8           1023.05      1193.70      1.17x
Benchmark_UFlat11-8           507.18       879.15       1.73x

commit: 4c1fc8e426266f00229956994142877543e8b514 [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Fri Feb 26 17:21:48 2016 +1100
committer: Nigel Tao <nigeltao@golang.org> Fri Feb 26 17:21:48 2016 +1100
tree: 933464732990407ee05625fb7d6a27a94188246c
parent: 8c7c9dec5965484f0a81268ce7985fe31e5d5955 [diff]
diff --git a/decode_amd64.s b/decode_amd64.s
index b0513d0..c38bd68 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s

@@ -23,7 +23,7 @@
 //	+ R11	src_base
 //	+ R12	src_len
 //	+ R13	src_base + src_len
-//	- R14	unused
+//	- R14	used by doCopy
 //	- R15	used by doCopy
 //
 // The registers R8-R13 (marked with a "+") are set at the start of the
@@ -299,10 +299,37 @@
 	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 	//
 	// Set:
+	//	- R14 = len(dst)-d
 	//	- R15 = &dst[d-offset]
+	MOVQ R10, R14
+	SUBQ DI, R14
 	MOVQ DI, R15
 	SUBQ DX, R15
 
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8.
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto verySlowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ CX, $16
+	JGT  verySlowForwardCopy
+	CMPQ DX, $8
+	JLT  verySlowForwardCopy
+	CMPQ R14, $16
+	JLT  verySlowForwardCopy
+	MOVQ 0(R15), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(R15), BX
+	MOVQ BX, 8(DI)
+	ADDQ CX, DI
+	JMP  loop
+
 verySlowForwardCopy:
 	// verySlowForwardCopy is a simple implementation of forward copy. In C
 	// parlance, this is a do/while loop instead of a while loop, since we know
commit	4c1fc8e426266f00229956994142877543e8b514	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Fri Feb 26 17:21:48 2016 +1100
committer	Nigel Tao <nigeltao@golang.org>	Fri Feb 26 17:21:48 2016 +1100
tree	933464732990407ee05625fb7d6a27a94188246c
parent	8c7c9dec5965484f0a81268ce7985fe31e5d5955 [diff]