Inline the emitCopy call.

name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 1%   712MB/s ± 1%   +1.64%  (p=0.000 n=10+10)
WordsEncode1e2-8   429MB/s ± 0%   467MB/s ± 0%   +8.86%  (p=0.000 n=9+9)
WordsEncode1e3-8   447MB/s ± 0%   483MB/s ± 0%   +8.20%  (p=0.000 n=9+9)
WordsEncode1e4-8   322MB/s ± 1%   353MB/s ± 1%   +9.76%  (p=0.000 n=10+10)
WordsEncode1e5-8   268MB/s ± 0%   293MB/s ± 0%   +9.42%  (p=0.000 n=9+8)
WordsEncode1e6-8   313MB/s ± 0%   345MB/s ± 0%  +10.06%  (p=0.000 n=8+9)
RandomEncode-8    14.4GB/s ± 1%  14.4GB/s ± 2%     ~     (p=0.829 n=8+10)
_ZFlat0-8          797MB/s ± 2%   863MB/s ± 0%   +8.39%  (p=0.000 n=9+9)
_ZFlat1-8          435MB/s ± 1%   471MB/s ± 0%   +8.34%  (p=0.000 n=9+8)
_ZFlat2-8         16.1GB/s ± 2%  16.2GB/s ± 2%     ~     (p=0.165 n=10+10)
_ZFlat3-8          633MB/s ± 0%   659MB/s ± 1%   +4.12%  (p=0.000 n=10+9)
_ZFlat4-8         7.95GB/s ± 1%  8.29GB/s ± 1%   +4.22%  (p=0.000 n=10+10)
_ZFlat5-8          771MB/s ± 0%   836MB/s ± 1%   +8.33%  (p=0.000 n=10+9)
_ZFlat6-8          283MB/s ± 0%   315MB/s ± 0%  +11.19%  (p=0.000 n=10+9)
_ZFlat7-8          265MB/s ± 0%   293MB/s ± 1%  +10.73%  (p=0.000 n=9+10)
_ZFlat8-8          299MB/s ± 0%   331MB/s ± 1%  +10.74%  (p=0.000 n=9+10)
_ZFlat9-8          246MB/s ± 1%   273MB/s ± 1%  +10.90%  (p=0.000 n=10+10)
_ZFlat10-8        1.05GB/s ± 1%  1.12GB/s ± 1%   +7.02%  (p=0.000 n=10+10)
_ZFlat11-8         411MB/s ± 0%   460MB/s ± 0%  +11.79%  (p=0.000 n=10+8)
diff --git a/encode_amd64.s b/encode_amd64.s index a233b59..6a21444 100644 --- a/encode_amd64.s +++ b/encode_amd64.s
@@ -472,15 +472,14 @@ MOVQ SI, 32(SP) // Spill local variables (registers) onto the stack; call; unspill. - // - // We don't need to unspill CX or R9 as we are just about to call another - // function. MOVQ DI, 80(SP) MOVQ R11, 96(SP) MOVQ R12, 104(SP) CALL ·extendMatch(SB) + MOVQ 56(SP), CX MOVQ 64(SP), DX MOVQ 80(SP), DI + MOVQ 88(SP), R9 MOVQ 96(SP), R11 MOVQ 104(SP), R12 @@ -489,29 +488,69 @@ MOVQ 40(SP), SI ADDQ DX, SI - // d += emitCopy(dst[d:], base-candidate, s-base) + // ---------------------------------------- + // Begin inline of the emitCopy call. // - // Push args. - MOVQ DI, 0(SP) - MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ R11, 24(SP) + // d += emitCopy(dst[d:], base-candidate, s-base) + + // !!! length := s - base MOVQ SI, AX SUBQ R12, AX - MOVQ AX, 32(SP) - // Spill local variables (registers) onto the stack; call; unspill. - MOVQ SI, 72(SP) - MOVQ DI, 80(SP) - CALL ·emitCopy(SB) - MOVQ 56(SP), CX - MOVQ 64(SP), DX - MOVQ 72(SP), SI - MOVQ 80(SP), DI - MOVQ 88(SP), R9 +inlineEmitCopyLoop0: + // for length >= 68 { etc } + CMPL AX, $68 + JLT inlineEmitCopyStep1 - // Finish the "d +=" part of "d += emitCopy(etc)". - ADDQ 40(SP), DI + // Emit a length 64 copy, encoded as 3 bytes. + MOVB $0xfe, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + SUBL $64, AX + JMP inlineEmitCopyLoop0 + +inlineEmitCopyStep1: + // if length > 64 { etc } + CMPL AX, $64 + JLE inlineEmitCopyStep2 + + // Emit a length 60 copy, encoded as 3 bytes. + MOVB $0xee, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + SUBL $60, AX + +inlineEmitCopyStep2: + // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } + CMPL AX, $12 + JGE inlineEmitCopyStep3 + CMPL R11, $2048 + JGE inlineEmitCopyStep3 + + // Emit the remaining copy, encoded as 2 bytes. 
+ MOVB R11, 1(DI) + SHRL $8, R11 + SHLB $5, R11 + SUBB $4, AX + SHLB $2, AX + ORB AX, R11 + ORB $1, R11 + MOVB R11, 0(DI) + ADDQ $2, DI + JMP inlineEmitCopyEnd + +inlineEmitCopyStep3: + // Emit the remaining copy, encoded as 3 bytes. + SUBL $1, AX + SHLB $2, AX + ORB $2, AX + MOVB AX, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + +inlineEmitCopyEnd: + // End inline of the emitCopy call. + // ---------------------------------------- // nextEmit = s MOVQ SI, R10