Optimize asm for decoding literal fragments.
Relative to the previous commit:
benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8       519.36       518.05       1.00x
BenchmarkWordsDecode1e2-8       691.63       776.28       1.12x
BenchmarkWordsDecode1e3-8       858.97       995.41       1.16x
BenchmarkWordsDecode1e4-8       581.86       615.92       1.06x
BenchmarkWordsDecode1e5-8       380.78       453.95       1.19x
BenchmarkWordsDecode1e6-8       403.12       453.74       1.13x
Benchmark_UFlat0-8              784.21       863.12       1.10x
Benchmark_UFlat1-8              625.49       766.01       1.22x
Benchmark_UFlat2-8            15366.67     15463.36       1.01x
Benchmark_UFlat3-8             1321.47      1388.63       1.05x
Benchmark_UFlat4-8             4338.83      4367.79       1.01x
Benchmark_UFlat5-8              770.24       844.84       1.10x
Benchmark_UFlat6-8              386.10       442.42       1.15x
Benchmark_UFlat7-8              376.79       437.68       1.16x
Benchmark_UFlat8-8              400.47       458.19       1.14x
Benchmark_UFlat9-8              362.89       423.36       1.17x
Benchmark_UFlat10-8             943.89      1023.05       1.08x
Benchmark_UFlat11-8             493.98       507.18       1.03x
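For reference, the fast path added in the diff below corresponds roughly to this Go-level sketch. It is illustrative only: the helper name fastLiteralCopy and its signature are invented here, and the real decoder (decode_other.go and the assembly) inlines this logic in its decode loop rather than calling a helper.

package main

import "fmt"

// fastLiteralCopy is a hypothetical helper (not part of this patch) showing
// the check and the copy that the new fast path performs. It reports whether
// the 16-byte trick applied; if not, the caller must do an exact-length copy
// (the callMemmove path in the assembly).
func fastLiteralCopy(dst, src []byte, d, s, length int) bool {
	if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
		return false
	}
	// Always copy 16 bytes; bytes beyond `length` are overwritten by later
	// iterations (valid input) or discarded along with dst (corrupt input).
	copy(dst[d:d+16], src[s:s+16])
	return true
}

func main() {
	src := []byte("hello world, this is a literal!!")
	dst := make([]byte, 64)
	if fastLiteralCopy(dst, src, 0, 0, 5) {
		fmt.Printf("%q\n", dst[:5]) // "hello"
	}
}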
diff --git a/decode_amd64.s b/decode_amd64.s
index 2e6ac59..b0513d0 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -94,6 +94,47 @@
 	MOVQ R13, BX
 	SUBQ SI, BX
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT callMemmove
+	CMPQ AX, $16
+	JLT callMemmove
+	CMPQ BX, $16
+	JLT callMemmove
+
+	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
+	// loads and stores. This technique probably wouldn't be as effective on
+	// architectures that are fussier about alignment.
+	MOVQ 0(SI), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(SI), BX
+	MOVQ BX, 8(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP loop
+
+callMemmove:
 	// if length > len(dst)-d || length > len(src)-s { etc }
 	CMPQ CX, AX
 	JGT errCorrupt
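The two 8-byte load/store pairs (the four MOVQs) map to the Go sketch below. Again this is illustrative: copy16 is an invented name, and the assembly inlines the moves directly. On amd64, modern Go compilers turn the binary.LittleEndian calls into single unaligned 8-byte loads and stores, and since the same byte order is used for both the load and the store, the net effect is a byte-for-byte copy of 16 bytes.

package main

import (
	"encoding/binary"
	"fmt"
)

// copy16 is a hypothetical helper mirroring the four MOVQs above: it always
// moves 16 bytes as two 8-byte words, regardless of the literal's true length.
func copy16(dst, src []byte) {
	binary.LittleEndian.PutUint64(dst[0:8], binary.LittleEndian.Uint64(src[0:8]))
	binary.LittleEndian.PutUint64(dst[8:16], binary.LittleEndian.Uint64(src[8:16]))
}

func main() {
	src := []byte("0123456789abcdefXXXXXXXXXXXXXXXX")
	dst := make([]byte, 32)
	copy16(dst, src)              // copies 16 bytes even when the literal is shorter
	fmt.Printf("%q\n", dst[:16])  // "0123456789abcdef"
}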