Rearrange the extendMatch register allocation.
This minimizes the diff in a follow-up commit that manually inlines this function.
It's not an optimization per se, but for the record:
name              old speed      new speed      delta
WordsEncode1e1-8   700MB/s ± 1%   701MB/s ± 0%    ~      (p=0.393 n=10+10)
WordsEncode1e2-8   460MB/s ± 1%   460MB/s ± 0%    ~      (p=0.393 n=10+10)
WordsEncode1e3-8   478MB/s ± 2%   480MB/s ± 0%    ~      (p=0.912 n=10+10)
WordsEncode1e4-8   414MB/s ± 0%   416MB/s ± 0%  +0.64%   (p=0.000 n=9+10)
WordsEncode1e5-8   296MB/s ± 1%   297MB/s ± 0%    ~      (p=0.113 n=9+10)
WordsEncode1e6-8   345MB/s ± 0%   345MB/s ± 0%    ~      (p=0.949 n=8+10)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%    ~      (p=0.278 n=9+10)
_ZFlat0-8          888MB/s ± 1%   891MB/s ± 1%  +0.35%   (p=0.010 n=10+9)
_ZFlat1-8          471MB/s ± 1%   471MB/s ± 0%    ~      (p=0.447 n=10+9)
_ZFlat2-8         16.2GB/s ± 3%  16.2GB/s ± 3%    ~      (p=0.912 n=10+10)
_ZFlat3-8          675MB/s ± 1%   676MB/s ± 0%    ~      (p=0.150 n=9+10)
_ZFlat4-8         8.31GB/s ± 1%  8.36GB/s ± 1%  +0.65%   (p=0.035 n=10+10)
_ZFlat5-8          850MB/s ± 0%   852MB/s ± 0%    ~      (p=0.182 n=9+10)
_ZFlat6-8          316MB/s ± 0%   316MB/s ± 0%    ~      (p=0.762 n=10+8)
_ZFlat7-8          294MB/s ± 1%   296MB/s ± 0%  +0.51%   (p=0.006 n=9+8)
_ZFlat8-8          330MB/s ± 1%   331MB/s ± 1%    ~      (p=0.881 n=9+9)
_ZFlat9-8          273MB/s ± 0%   274MB/s ± 0%  +0.23%   (p=0.043 n=10+8)
_ZFlat10-8        1.17GB/s ± 1%  1.17GB/s ± 0%    ~      (p=0.922 n=10+9)
_ZFlat11-8         461MB/s ± 0%   462MB/s ± 0%    ~      (p=0.219 n=10+9)
Also:
name           old time/op  new time/op  delta
ExtendMatch-8  7.92µs ± 2%  7.80µs ± 2%  -1.51%  (p=0.002 n=10+9)
Note that this is time/op rather than MB/s, so a negative delta is better,
although it's quite possibly all just noise.
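
For reference, extendMatch returns the largest k such that k <= len(src) and
src[i:i+k-j] and src[j:k] have the same contents. The pure-Go fallback in this
package is essentially the loop below (reproduced here for context, not part
of this diff):

func extendMatch(src []byte, i, j int) int {
	// Walk i and j forward in lockstep until the bytes diverge or j
	// reaches the end of src, then return j as the new match end.
	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
	}
	return j
}

The assembly below computes the same result, but compares 8 bytes at a time
where it can.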
diff --git a/encode_amd64.s b/encode_amd64.s
index fda505c..40dcde8 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -169,33 +169,37 @@
// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
-// - CX &src[0]
-// - DX &src[len(src)]
-// - SI &src[i]
-// - DI &src[j]
-// - R9 &src[len(src) - 8]
+// - DX &src[0]
+// - SI &src[j]
+// - R13 &src[len(src) - 8]
+// - R14 &src[len(src)]
+// - R15 &src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
- MOVQ src_base+0(FP), CX
- MOVQ src_len+8(FP), DX
- MOVQ i+24(FP), SI
- MOVQ j+32(FP), DI
- ADDQ CX, DX
- ADDQ CX, SI
- ADDQ CX, DI
- MOVQ DX, R9
- SUBQ $8, R9
+ MOVQ src_base+0(FP), DX
+ MOVQ src_len+8(FP), R14
+ MOVQ i+24(FP), R15
+ MOVQ j+32(FP), SI
+ ADDQ DX, R14
+ ADDQ DX, R15
+ ADDQ DX, SI
+ MOVQ R14, R13
+ SUBQ $8, R13
cmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
- CMPQ DI, R9
+ CMPQ SI, R13
JA cmp1
- MOVQ (SI), AX
- MOVQ (DI), BX
+ MOVQ (R15), AX
+ MOVQ (SI), BX
CMPQ AX, BX
JNE bsf
+ ADDQ $8, R15
ADDQ $8, SI
- ADDQ $8, DI
JMP cmp8
bsf:
@@ -206,29 +210,29 @@
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
- ADDQ BX, DI
+ ADDQ BX, SI
// Convert from &src[ret] to ret.
- SUBQ CX, DI
- MOVQ DI, ret+40(FP)
+ SUBQ DX, SI
+ MOVQ SI, ret+40(FP)
RET
cmp1:
// In src's tail, compare 1 byte at a time.
- CMPQ DI, DX
+ CMPQ SI, R14
JAE extendMatchEnd
- MOVB (SI), AX
- MOVB (DI), BX
+ MOVB (R15), AX
+ MOVB (SI), BX
CMPB AX, BX
JNE extendMatchEnd
+ ADDQ $1, R15
ADDQ $1, SI
- ADDQ $1, DI
JMP cmp1
extendMatchEnd:
// Convert from &src[ret] to ret.
- SUBQ CX, DI
- MOVQ DI, ret+40(FP)
+ SUBQ DX, SI
+ MOVQ SI, ret+40(FP)
RET
// ----------------------------------------------------------------------------
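
For illustration, the cmp8/bsf/cmp1 structure of the assembly corresponds
roughly to the Go sketch below. The function name and the main harness are
hypothetical, written only to mirror the assembly; it assumes a little-endian
target, as amd64 is. math/bits.TrailingZeros64 plays the role of BSFQ, and the
division by 8 is the SHRQ $3 that turns a bit index into a byte index:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// extendMatchWordwise mirrors the assembly's control flow: cmp8 compares
// 8 bytes at a time while at least 8 bytes remain, bsf locates the first
// differing byte via trailing zeros of the XOR, and cmp1 handles the tail
// one byte at a time.
func extendMatchWordwise(src []byte, i, j int) int {
	// cmp8: the assembly guards this with CMPQ SI, R13, where R13 holds
	// &src[len(src)-8], i.e. the loop runs while j+8 <= len(src).
	for j+8 <= len(src) {
		x := binary.LittleEndian.Uint64(src[i:])
		y := binary.LittleEndian.Uint64(src[j:])
		if x != y {
			// bsf: TrailingZeros64 is BSFQ, and /8 is SHRQ $3,
			// converting the first differing bit into a byte offset.
			// x != y guarantees x^y is nonzero, so the result is
			// well defined (BSFQ on zero is undefined).
			return j + bits.TrailingZeros64(x^y)/8
		}
		i += 8
		j += 8
	}
	// cmp1: in src's tail, compare 1 byte at a time.
	for j < len(src) && src[i] == src[j] {
		i++
		j++
	}
	return j
}

func main() {
	src := []byte("abcdefgh_abcdefgX")
	// The bytes at i=0 and j=9 match for 7 bytes ("abcdefg"), so the
	// match extends to index 16.
	fmt.Println(extendMatchWordwise(src, 0, 9)) // prints 16
}

Little-endian loads are what make this work: the first mismatching byte is the
lowest-addressed one, so after the XOR it contributes the lowest set bits,
which is exactly what BSFQ (or TrailingZeros64) finds.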