Rearrange the emitCopy register allocation.
This minimizes the diff in a follow-up commit that manually inlines emitCopy into encodeBlock.
It's not an optimization per se, but for the record:
name old speed new speed delta
WordsEncode1e1-8 711MB/s ± 1% 700MB/s ± 1% -1.64% (p=0.000 n=9+10)
WordsEncode1e2-8 407MB/s ± 1% 430MB/s ± 0% +5.57% (p=0.000 n=10+10)
WordsEncode1e3-8 441MB/s ± 1% 447MB/s ± 0% +1.52% (p=0.000 n=8+8)
WordsEncode1e4-8 311MB/s ± 1% 322MB/s ± 0% +3.69% (p=0.000 n=9+10)
WordsEncode1e5-8 267MB/s ± 0% 267MB/s ± 1% ~ (p=0.068 n=8+10)
WordsEncode1e6-8 312MB/s ± 1% 314MB/s ± 0% +0.45% (p=0.000 n=9+10)
RandomEncode-8 14.4GB/s ± 2% 14.4GB/s ± 2% ~ (p=0.739 n=10+10)
_ZFlat0-8 792MB/s ± 1% 801MB/s ± 0% +1.11% (p=0.000 n=8+9)
_ZFlat1-8 435MB/s ± 1% 437MB/s ± 0% ~ (p=0.857 n=9+10)
_ZFlat2-8 16.0GB/s ± 4% 16.3GB/s ± 1% ~ (p=0.143 n=10+10)
_ZFlat3-8 613MB/s ± 0% 634MB/s ± 0% +3.54% (p=0.000 n=8+10)
_ZFlat4-8 7.96GB/s ± 1% 7.97GB/s ± 1% ~ (p=0.829 n=8+10)
_ZFlat5-8 770MB/s ± 0% 773MB/s ± 0% +0.33% (p=0.000 n=8+9)
_ZFlat6-8 283MB/s ± 0% 283MB/s ± 0% +0.13% (p=0.043 n=8+9)
_ZFlat7-8 264MB/s ± 2% 265MB/s ± 0% +0.61% (p=0.000 n=9+9)
_ZFlat8-8 297MB/s ± 3% 299MB/s ± 0% ~ (p=0.161 n=9+9)
_ZFlat9-8 247MB/s ± 1% 247MB/s ± 0% ~ (p=0.465 n=8+9)
_ZFlat10-8 1.03GB/s ± 0% 1.05GB/s ± 1% +1.75% (p=0.000 n=9+9)
_ZFlat11-8 409MB/s ± 0% 412MB/s ± 0% +0.64% (p=0.000 n=8+8)
diff --git a/encode_amd64.s b/encode_amd64.s
index 48386b7..81205ac 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -86,55 +86,59 @@
// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
-// - BX offset
-// - CX length
+// - AX length
// - SI &dst[0]
// - DI &dst[i]
+// - R11 offset
+//
+// The unusual register allocation of AX and R11 for local variables matches
+// the allocation used at the call site in encodeBlock, which makes it easier
+// to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI
MOVQ DI, SI
- MOVQ offset+24(FP), BX
- MOVQ length+32(FP), CX
+ MOVQ offset+24(FP), R11
+ MOVQ length+32(FP), AX
loop0:
// for length >= 68 { etc }
- CMPL CX, $68
+ CMPL AX, $68
JLT step1
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
- MOVW BX, 1(DI)
+ MOVW R11, 1(DI)
ADDQ $3, DI
- SUBL $64, CX
+ SUBL $64, AX
JMP loop0
step1:
// if length > 64 { etc }
- CMPL CX, $64
+ CMPL AX, $64
JLE step2
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
- MOVW BX, 1(DI)
+ MOVW R11, 1(DI)
ADDQ $3, DI
- SUBL $60, CX
+ SUBL $60, AX
step2:
// if length >= 12 || offset >= 2048 { goto step3 }
- CMPL CX, $12
+ CMPL AX, $12
JGE step3
- CMPL BX, $2048
+ CMPL R11, $2048
JGE step3
// Emit the remaining copy, encoded as 2 bytes.
- MOVB BX, 1(DI)
- SHRL $8, BX
- SHLB $5, BX
- SUBB $4, CX
- SHLB $2, CX
- ORB CX, BX
- ORB $1, BX
- MOVB BX, 0(DI)
+ MOVB R11, 1(DI)
+ SHRL $8, R11
+ SHLB $5, R11
+ SUBB $4, AX
+ SHLB $2, AX
+ ORB AX, R11
+ ORB $1, R11
+ MOVB R11, 0(DI)
ADDQ $2, DI
// Return the number of bytes written.
@@ -144,11 +148,11 @@
step3:
// Emit the remaining copy, encoded as 3 bytes.
- SUBL $1, CX
- SHLB $2, CX
- ORB $2, CX
- MOVB CX, 0(DI)
- MOVW BX, 1(DI)
+ SUBL $1, AX
+ SHLB $2, AX
+ ORB $2, AX
+ MOVB AX, 0(DI)
+ MOVW R11, 1(DI)
ADDQ $3, DI
// Return the number of bytes written.