Inline the extendMatch call.
Compared to the previous commit:
name old speed new speed delta
WordsEncode1e1-8 701MB/s ± 0% 699MB/s ± 1% ~ (p=0.123 n=10+10)
WordsEncode1e2-8 460MB/s ± 0% 583MB/s ± 1% +26.64% (p=0.000 n=10+10)
WordsEncode1e3-8 480MB/s ± 0% 647MB/s ± 2% +34.85% (p=0.000 n=10+10)
WordsEncode1e4-8 416MB/s ± 0% 451MB/s ± 0% +8.30% (p=0.000 n=10+8)
WordsEncode1e5-8 297MB/s ± 0% 355MB/s ± 2% +19.50% (p=0.000 n=10+9)
WordsEncode1e6-8 345MB/s ± 0% 433MB/s ± 2% +25.47% (p=0.000 n=10+9)
RandomEncode-8 14.4GB/s ± 2% 14.3GB/s ± 3% ~ (p=0.075 n=10+10)
_ZFlat0-8 891MB/s ± 1% 1040MB/s ± 0% +16.67% (p=0.000 n=9+9)
_ZFlat1-8 471MB/s ± 0% 535MB/s ± 1% +13.68% (p=0.000 n=9+10)
_ZFlat2-8 16.2GB/s ± 3% 16.4GB/s ± 1% ~ (p=0.122 n=10+8)
_ZFlat3-8 676MB/s ± 0% 762MB/s ± 0% +12.62% (p=0.000 n=10+9)
_ZFlat4-8 8.36GB/s ± 1% 9.47GB/s ± 1% +13.28% (p=0.000 n=10+10)
_ZFlat5-8 852MB/s ± 0% 986MB/s ± 1% +15.79% (p=0.000 n=10+9)
_ZFlat6-8 316MB/s ± 0% 380MB/s ± 1% +20.41% (p=0.000 n=8+9)
_ZFlat7-8 296MB/s ± 0% 353MB/s ± 0% +19.44% (p=0.000 n=8+10)
_ZFlat8-8 331MB/s ± 1% 399MB/s ± 0% +20.53% (p=0.000 n=9+8)
_ZFlat9-8 274MB/s ± 0% 329MB/s ± 0% +20.27% (p=0.000 n=8+9)
_ZFlat10-8 1.17GB/s ± 0% 1.35GB/s ± 1% +15.15% (p=0.000 n=9+9)
_ZFlat11-8 462MB/s ± 0% 608MB/s ± 0% +31.54% (p=0.000 n=9+9)
The net effect of the past four inlining commits, when compared to just
before c3defccc "Inline the emitCopy call":
name old speed new speed delta
WordsEncode1e1-8 701MB/s ± 1% 699MB/s ± 1% ~ (p=0.353 n=10+10)
WordsEncode1e2-8 429MB/s ± 0% 583MB/s ± 1% +35.95% (p=0.000 n=9+10)
WordsEncode1e3-8 447MB/s ± 0% 647MB/s ± 2% +44.85% (p=0.000 n=9+10)
WordsEncode1e4-8 322MB/s ± 1% 451MB/s ± 0% +40.00% (p=0.000 n=10+8)
WordsEncode1e5-8 268MB/s ± 0% 355MB/s ± 2% +32.41% (p=0.000 n=9+9)
WordsEncode1e6-8 313MB/s ± 0% 433MB/s ± 2% +38.28% (p=0.000 n=8+9)
RandomEncode-8 14.4GB/s ± 1% 14.3GB/s ± 3% ~ (p=0.897 n=8+10)
_ZFlat0-8 797MB/s ± 2% 1040MB/s ± 0% +30.53% (p=0.000 n=9+9)
_ZFlat1-8 435MB/s ± 1% 535MB/s ± 1% +22.97% (p=0.000 n=9+10)
_ZFlat2-8 16.1GB/s ± 2% 16.4GB/s ± 1% +1.47% (p=0.001 n=10+8)
_ZFlat3-8 633MB/s ± 0% 762MB/s ± 0% +20.32% (p=0.000 n=10+9)
_ZFlat4-8 7.95GB/s ± 1% 9.47GB/s ± 1% +19.11% (p=0.000 n=10+10)
_ZFlat5-8 771MB/s ± 0% 986MB/s ± 1% +27.83% (p=0.000 n=10+9)
_ZFlat6-8 283MB/s ± 0% 380MB/s ± 1% +34.46% (p=0.000 n=10+9)
_ZFlat7-8 265MB/s ± 0% 353MB/s ± 0% +33.29% (p=0.000 n=9+10)
_ZFlat8-8 299MB/s ± 0% 399MB/s ± 0% +33.36% (p=0.000 n=9+8)
_ZFlat9-8 246MB/s ± 1% 329MB/s ± 0% +33.58% (p=0.000 n=10+9)
_ZFlat10-8 1.05GB/s ± 1% 1.35GB/s ± 1% +28.35% (p=0.000 n=10+9)
_ZFlat11-8 411MB/s ± 0% 608MB/s ± 0% +47.82% (p=0.000 n=10+9)
diff --git a/encode_amd64.s b/encode_amd64.s
index 40dcde8..adfd979 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -251,8 +251,8 @@
// - R10 . &src[nextEmit]
// - R11 96 prevHash, currHash, nextHash, offset
// - R12 104 &src[base], skip
-// - R13 . &src[nextS]
-// - R14 . len(src), bytesBetweenHashLookups, x
+// - R13 . &src[nextS], &src[len(src) - 8]
+// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
// - R15 112 candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
@@ -444,8 +444,7 @@
MOVQ DI, 0(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
- // Finish the "d +=" part of "d += emitLiteral(etc)".
- ADDQ AX, DI
+ ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
MOVQ R15, 112(SP)
@@ -494,35 +493,65 @@
SUBQ R15, R11
SUBQ DX, R11
- // s = extendMatch(src, candidate+4, s+4)
+ // ----------------------------------------
+ // Begin inline of the extendMatch call.
//
- // Push args.
- MOVQ DX, 0(SP)
+ // s = extendMatch(src, candidate+4, s+4)
+
+ // !!! R14 = &src[len(src)]
MOVQ src_len+32(FP), R14
- MOVQ R14, 8(SP)
- MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
+ ADDQ DX, R14
+
+ // !!! R13 = &src[len(src) - 8]
+ MOVQ R14, R13
+ SUBQ $8, R13
+
+ // !!! R15 = &src[candidate + 4]
ADDQ $4, R15
- MOVQ R15, 24(SP)
+ ADDQ DX, R15
+
+ // !!! s += 4
ADDQ $4, SI
- SUBQ DX, SI
- MOVQ SI, 32(SP)
- // Spill local variables (registers) onto the stack; call; unspill.
- MOVQ DI, 80(SP)
- MOVQ R11, 96(SP)
- MOVQ R12, 104(SP)
- CALL ·extendMatch(SB)
- MOVQ 56(SP), CX
- MOVQ 64(SP), DX
- MOVQ 80(SP), DI
- MOVQ 88(SP), R9
- MOVQ 96(SP), R11
- MOVQ 104(SP), R12
+inlineExtendMatchCmp8:
+ // As long as we are 8 or more bytes before the end of src, we can load and
+ // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+ CMPQ SI, R13
+ JA inlineExtendMatchCmp1
+ MOVQ (R15), AX
+ MOVQ (SI), BX
+ CMPQ AX, BX
+ JNE inlineExtendMatchBSF
+ ADDQ $8, R15
+ ADDQ $8, SI
+ JMP inlineExtendMatchCmp8
- // Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
- // register holds &src[s], not s.
- MOVQ 40(SP), SI
- ADDQ DX, SI
+inlineExtendMatchBSF:
+ // If those 8 bytes were not equal, XOR the two 8 byte values, and return
+ // the index of the first byte that differs. The BSF instruction finds the
+ // least significant 1 bit, the amd64 architecture is little-endian, and
+ // the shift by 3 converts a bit index to a byte index.
+ XORQ AX, BX
+ BSFQ BX, BX
+ SHRQ $3, BX
+ ADDQ BX, SI
+ JMP inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+ // In src's tail, compare 1 byte at a time.
+ CMPQ SI, R14
+ JAE inlineExtendMatchEnd
+ MOVB (R15), AX
+ MOVB (SI), BX
+ CMPB AX, BX
+ JNE inlineExtendMatchEnd
+ ADDQ $1, R15
+ ADDQ $1, SI
+ JMP inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+ // End inline of the extendMatch call.
+ // ----------------------------------------
// ----------------------------------------
// Begin inline of the emitCopy call.