Optimize asm for decoding literal fragments.

Relative to the previous commit:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     519.36       518.05       1.00x
BenchmarkWordsDecode1e2-8     691.63       776.28       1.12x
BenchmarkWordsDecode1e3-8     858.97       995.41       1.16x
BenchmarkWordsDecode1e4-8     581.86       615.92       1.06x
BenchmarkWordsDecode1e5-8     380.78       453.95       1.19x
BenchmarkWordsDecode1e6-8     403.12       453.74       1.13x
Benchmark_UFlat0-8            784.21       863.12       1.10x
Benchmark_UFlat1-8            625.49       766.01       1.22x
Benchmark_UFlat2-8            15366.67     15463.36     1.01x
Benchmark_UFlat3-8            1321.47      1388.63      1.05x
Benchmark_UFlat4-8            4338.83      4367.79      1.01x
Benchmark_UFlat5-8            770.24       844.84       1.10x
Benchmark_UFlat6-8            386.10       442.42       1.15x
Benchmark_UFlat7-8            376.79       437.68       1.16x
Benchmark_UFlat8-8            400.47       458.19       1.14x
Benchmark_UFlat9-8            362.89       423.36       1.17x
Benchmark_UFlat10-8           943.89       1023.05      1.08x
Benchmark_UFlat11-8           493.98       507.18       1.03x
diff --git a/decode_amd64.s b/decode_amd64.s
index 2e6ac59..b0513d0 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -94,6 +94,47 @@
 	MOVQ R13, BX
 	SUBQ SI, BX
 
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT  callMemmove
+	CMPQ AX, $16
+	JLT  callMemmove
+	CMPQ BX, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
+	// loads and stores. This technique probably wouldn't be as effective on
+	// architectures that are fussier about alignment.
+	MOVQ 0(SI), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(SI), BX
+	MOVQ BX, 8(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+callMemmove:
 	// if length > len(dst)-d || length > len(src)-s { etc }
 	CMPQ CX, AX
 	JGT  errCorrupt
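
For readers less familiar with the assembly, the fast path added above corresponds
roughly to the pure Go sketch below. This is only an illustration under the same
assumptions the asm makes (at least 16 spare bytes in both dst and src); the
function name fastLiteralCopy and its bool return are invented for this sketch and
are not part of the package or of decode_other.go.

package main

import "fmt"

// fastLiteralCopy is an illustrative sketch (not part of the snappy package)
// of the short-literal fast path. It copies a literal of `length` bytes from
// src[s:] to dst[d:], always moving 16 bytes when that is safe, and reports
// whether the fast path applied.
func fastLiteralCopy(dst, src []byte, d, s, length int) bool {
	// Mirror of the assembly guard:
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { fall back }.
	if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
		return false // caller falls back on copy (runtime·memmove in the asm)
	}
	// Copy 16 bytes even though only `length` bytes are needed. The bytes
	// past d+length are scratch: a later iteration overwrites them, or the
	// decode fails and dst is discarded. The assembly does this as two
	// unaligned 8-byte loads and stores.
	copy(dst[d:d+16], src[s:s+16])
	return true
}

func main() {
	src := []byte("hello, snappy....") // 17 bytes, so len(src)-s >= 16
	dst := make([]byte, 32)
	if fastLiteralCopy(dst, src, 0, 0, 5) {
		fmt.Printf("%q\n", dst[:5]) // prints "hello"
	}
}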