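Inline the emitLiteral call.

Instead of pushing arguments and calling ·emitLiteral, the call site now writes the 1-, 2- or 3-byte literal length encoding itself and then calls runtime·memmove directly. Benchmark results (old vs. new):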
name old speed new speed delta
WordsEncode1e1-8 712MB/s ± 1% 700MB/s ± 1% -1.65% (p=0.000 n=10+10)
WordsEncode1e2-8 467MB/s ± 0% 460MB/s ± 1% -1.53% (p=0.000 n=9+10)
WordsEncode1e3-8 483MB/s ± 0% 478MB/s ± 2% -0.98% (p=0.007 n=9+10)
WordsEncode1e4-8 353MB/s ± 1% 414MB/s ± 0% +17.03% (p=0.000 n=10+9)
WordsEncode1e5-8 293MB/s ± 0% 296MB/s ± 1% +1.03% (p=0.000 n=8+9)
WordsEncode1e6-8 345MB/s ± 0% 345MB/s ± 0% ~ (p=0.332 n=9+8)
RandomEncode-8 14.4GB/s ± 2% 14.4GB/s ± 2% ~ (p=1.000 n=10+9)
_ZFlat0-8 863MB/s ± 0% 888MB/s ± 1% +2.86% (p=0.000 n=9+10)
_ZFlat1-8 471MB/s ± 0% 471MB/s ± 1% ~ (p=0.897 n=8+10)
_ZFlat2-8 16.2GB/s ± 2% 16.2GB/s ± 3% ~ (p=0.631 n=10+10)
_ZFlat3-8 659MB/s ± 1% 675MB/s ± 1% +2.32% (p=0.000 n=9+9)
_ZFlat4-8 8.29GB/s ± 1% 8.31GB/s ± 1% ~ (p=0.315 n=10+10)
_ZFlat5-8 836MB/s ± 1% 850MB/s ± 0% +1.78% (p=0.000 n=9+9)
_ZFlat6-8 315MB/s ± 0% 316MB/s ± 0% +0.39% (p=0.002 n=9+10)
_ZFlat7-8 293MB/s ± 1% 294MB/s ± 1% ~ (p=0.139 n=10+9)
_ZFlat8-8 331MB/s ± 1% 330MB/s ± 1% ~ (p=0.356 n=10+9)
_ZFlat9-8 273MB/s ± 1% 273MB/s ± 0% ~ (p=0.280 n=10+10)
_ZFlat10-8 1.12GB/s ± 1% 1.17GB/s ± 1% +4.12% (p=0.000 n=10+10)
_ZFlat11-8 460MB/s ± 0% 461MB/s ± 0% +0.34% (p=0.006 n=8+10)
diff --git a/encode_amd64.s b/encode_amd64.s
index 6a21444..fda505c 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -57,14 +57,14 @@
MOVW BX, 1(DI)
ADDQ $3, DI
ADDQ $3, DX
- JMP emitLiteralEnd
+ JMP memmove
twoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
ADDQ $2, DX
- JMP emitLiteralEnd
+ JMP memmove
oneByte:
SHLB $2, BX
@@ -72,7 +72,7 @@
ADDQ $1, DI
ADDQ $1, DX
-emitLiteralEnd:
+memmove:
MOVQ DX, ret+48(FP)
// copy(dst[i:], lit)
@@ -400,32 +400,64 @@
CMPQ AX, $16
JLE emitLiteralFastPath
- // d += emitLiteral(dst[d:], src[nextEmit:s])
+ // ----------------------------------------
+ // Begin inline of the emitLiteral call.
//
- // Push args.
- MOVQ DI, 0(SP)
- MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
- MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
- MOVQ R10, 24(SP)
- MOVQ AX, 32(SP)
- MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
+ // d += emitLiteral(dst[d:], src[nextEmit:s])
+ MOVL AX, BX
+ SUBL $1, BX
+
+ CMPL BX, $60
+ JLT inlineEmitLiteralOneByte
+ CMPL BX, $256
+ JLT inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+ MOVB $0xf4, 0(DI)
+ MOVW BX, 1(DI)
+ ADDQ $3, DI
+ JMP inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+ MOVB $0xf0, 0(DI)
+ MOVB BX, 1(DI)
+ ADDQ $2, DI
+ JMP inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+ SHLB $2, BX
+ MOVB BX, 0(DI)
+ ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
// Spill local variables (registers) onto the stack; call; unspill.
+ //
+ // copy(dst[i:], lit)
+ //
+ // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+ // DI, R10 and AX as arguments.
+ MOVQ DI, 0(SP)
+ MOVQ R10, 8(SP)
+ MOVQ AX, 16(SP)
+ // Finish the "d +=" part of "d += emitLiteral(etc)".
+ ADDQ AX, DI
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
MOVQ R15, 112(SP)
- CALL ·emitLiteral(SB)
+ CALL runtime·memmove(SB)
MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 72(SP), SI
MOVQ 80(SP), DI
MOVQ 88(SP), R9
MOVQ 112(SP), R15
-
- // Finish the "d +=" part of "d += emitLiteral(etc)".
- ADDQ 48(SP), DI
JMP inner1
+inlineEmitLiteralEnd:
+ // End inline of the emitLiteral call.
+ // ----------------------------------------
+
emitLiteralFastPath:
// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
MOVB AX, BX