Optimize asm for decoding copy fragments.
Relative to the previous commit:
benchmark old MB/s new MB/s speedup
BenchmarkWordsDecode1e1-8 518.05 518.80 1.00x
BenchmarkWordsDecode1e2-8 776.28 871.43 1.12x
BenchmarkWordsDecode1e3-8 995.41 1411.32 1.42x
BenchmarkWordsDecode1e4-8 615.92 1469.60 2.39x
BenchmarkWordsDecode1e5-8 453.95 771.07 1.70x
BenchmarkWordsDecode1e6-8 453.74 872.19 1.92x
Benchmark_UFlat0-8 863.12 1129.79 1.31x
Benchmark_UFlat1-8 766.01 1075.37 1.40x
Benchmark_UFlat2-8 15463.36 15617.45 1.01x
Benchmark_UFlat3-8 1388.63 1438.15 1.04x
Benchmark_UFlat4-8 4367.79 4838.37 1.11x
Benchmark_UFlat5-8 844.84 1075.46 1.27x
Benchmark_UFlat6-8 442.42 811.70 1.83x
Benchmark_UFlat7-8 437.68 781.87 1.79x
Benchmark_UFlat8-8 458.19 819.38 1.79x
Benchmark_UFlat9-8 423.36 724.43 1.71x
Benchmark_UFlat10-8 1023.05 1193.70 1.17x
Benchmark_UFlat11-8 507.18 879.15 1.73x
diff --git a/decode_amd64.s b/decode_amd64.s
index b0513d0..c38bd68 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -23,7 +23,7 @@
// + R11 src_base
// + R12 src_len
// + R13 src_base + src_len
-// - R14 unused
+// - R14 used by doCopy
// - R15 used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
@@ -299,10 +299,37 @@
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
//
// Set:
+ // - R14 = len(dst)-d
// - R15 = &dst[d-offset]
+ MOVQ R10, R14
+ SUBQ DI, R14
MOVQ DI, R15
SUBQ DX, R15
+ // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+ //
+ // First, try using two 8-byte load/stores, similar to the doLit technique
+ // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+ // still OK if offset >= 8.
+ //
+ // if length > 16 || offset < 8 || len(dst)-d < 16 {
+ // goto verySlowForwardCopy
+ // }
+ // copy 16 bytes
+ // d += length
+ CMPQ CX, $16
+ JGT verySlowForwardCopy
+ CMPQ DX, $8
+ JLT verySlowForwardCopy
+ CMPQ R14, $16
+ JLT verySlowForwardCopy
+ MOVQ 0(R15), AX
+ MOVQ AX, 0(DI)
+ MOVQ 8(R15), BX
+ MOVQ BX, 8(DI)
+ ADDQ CX, DI
+ JMP loop
+
verySlowForwardCopy:
// verySlowForwardCopy is a simple implementation of forward copy. In C
// parlance, this is a do/while loop instead of a while loop, since we know