Add a fast path for short emitLiteral calls.

For literals of at most 16 bytes, the assembly now writes the one-byte
literal tag and performs the copy inline, as a single 16-byte load and
store, instead of calling the emitLiteral routine.

Compared to the previous commit:
name              old speed      new speed      delta
WordsEncode1e1-8   667MB/s ± 0%   677MB/s ± 1%    +1.57%  (p=0.008 n=5+5)
WordsEncode1e2-8   353MB/s ± 1%   428MB/s ± 0%   +21.37%  (p=0.008 n=5+5)
WordsEncode1e3-8   383MB/s ± 1%   446MB/s ± 1%   +16.65%  (p=0.008 n=5+5)
WordsEncode1e4-8   277MB/s ± 1%   316MB/s ± 0%   +13.93%  (p=0.008 n=5+5)
WordsEncode1e5-8   248MB/s ± 0%   269MB/s ± 0%    +8.57%  (p=0.008 n=5+5)
WordsEncode1e6-8   296MB/s ± 0%   314MB/s ± 1%    +6.08%  (p=0.008 n=5+5)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 1%         ~  (p=1.000 n=5+5)
_ZFlat0-8          748MB/s ± 0%   792MB/s ± 0%    +5.87%  (p=0.008 n=5+5)
_ZFlat1-8          406MB/s ± 0%   436MB/s ± 1%    +7.42%  (p=0.008 n=5+5)
_ZFlat2-8         16.1GB/s ± 1%  16.2GB/s ± 1%         ~  (p=0.421 n=5+5)
_ZFlat3-8          604MB/s ± 0%   632MB/s ± 1%    +4.49%  (p=0.008 n=5+5)
_ZFlat4-8         7.62GB/s ± 1%  8.00GB/s ± 0%    +5.03%  (p=0.008 n=5+5)
_ZFlat5-8          729MB/s ± 0%   768MB/s ± 0%    +5.26%  (p=0.008 n=5+5)
_ZFlat6-8          267MB/s ± 0%   282MB/s ± 1%    +5.92%  (p=0.008 n=5+5)
_ZFlat7-8          248MB/s ± 0%   264MB/s ± 1%    +6.48%  (p=0.008 n=5+5)
_ZFlat8-8          282MB/s ± 0%   298MB/s ± 0%    +5.87%  (p=0.008 n=5+5)
_ZFlat9-8          231MB/s ± 0%   247MB/s ± 0%    +6.79%  (p=0.008 n=5+5)
_ZFlat10-8         972MB/s ± 0%  1027MB/s ± 0%    +5.64%  (p=0.008 n=5+5)
_ZFlat11-8         401MB/s ± 0%   411MB/s ± 0%    +2.43%  (p=0.008 n=5+5)
The net effect of the past three commits, compared to just before
commit 68801229 "Write the encoder's encodeBlock in asm":
name              old speed       new speed        delta
WordsEncode1e1-8    665MB/s ± 0%    677MB/s ± 1%    +1.80%  (p=0.016 n=4+5)
WordsEncode1e2-8   85.0MB/s ± 0%  428.3MB/s ± 0%  +403.65%  (p=0.016 n=4+5)
WordsEncode1e3-8    234MB/s ± 2%    446MB/s ± 1%   +90.90%  (p=0.008 n=5+5)
WordsEncode1e4-8    233MB/s ± 0%    316MB/s ± 0%   +35.22%  (p=0.008 n=5+5)
WordsEncode1e5-8    214MB/s ± 1%    269MB/s ± 0%   +25.45%  (p=0.008 n=5+5)
WordsEncode1e6-8    258MB/s ± 0%    314MB/s ± 1%   +21.82%  (p=0.008 n=5+5)
RandomEncode-8     13.1GB/s ± 1%   14.4GB/s ± 1%   +10.31%  (p=0.008 n=5+5)
_ZFlat0-8           630MB/s ± 0%    792MB/s ± 0%   +25.71%  (p=0.016 n=4+5)
_ZFlat1-8           326MB/s ± 0%    436MB/s ± 1%   +33.89%  (p=0.016 n=4+5)
_ZFlat2-8          13.9GB/s ± 1%   16.2GB/s ± 1%   +16.27%  (p=0.008 n=5+5)
_ZFlat3-8           177MB/s ± 1%    632MB/s ± 1%  +257.58%  (p=0.008 n=5+5)
_ZFlat4-8          6.19GB/s ± 1%   8.00GB/s ± 0%   +29.32%  (p=0.008 n=5+5)
_ZFlat5-8           615MB/s ± 0%    768MB/s ± 0%   +24.91%  (p=0.008 n=5+5)
_ZFlat6-8           231MB/s ± 0%    282MB/s ± 1%   +21.95%  (p=0.008 n=5+5)
_ZFlat7-8           215MB/s ± 1%    264MB/s ± 1%   +22.83%  (p=0.008 n=5+5)
_ZFlat8-8           246MB/s ± 0%    298MB/s ± 0%   +21.46%  (p=0.008 n=5+5)
_ZFlat9-8           202MB/s ± 0%    247MB/s ± 0%   +22.17%  (p=0.008 n=5+5)
_ZFlat10-8          803MB/s ± 0%   1027MB/s ± 0%   +27.93%  (p=0.008 n=5+5)
_ZFlat11-8          351MB/s ± 0%    411MB/s ± 0%   +16.92%  (p=0.008 n=5+5)
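
A rough Go-level sketch of the new fast path (the function and its name
are illustrative only; the real implementation is the assembly in the
diff below):

    // emitLiteralFast sketches what the new emitLiteralFastPath label
    // computes: for a literal of at most 16 bytes, write the one-byte
    // tag uint8(len(lit)-1)<<2 and copy the bytes inline, instead of
    // calling the general emitLiteral routine. The asm version copies
    // a fixed 16 bytes with a single MOVOU load/store pair,
    // deliberately overrunning by up to 15 bytes; Go's copy cannot
    // express that overrun, so this sketch copies exactly len(lit)
    // bytes.
    func emitLiteralFast(dst, lit []byte) int {
        n := len(lit) // 1 <= n <= 16, so the one-byte tag always fits.
        dst[0] = uint8(n-1) << 2
        copy(dst[1:], lit)
        return 1 + n
    }
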
diff --git a/encode.go b/encode.go
index b0ff679..8749689 100644
--- a/encode.go
+++ b/encode.go
@@ -48,8 +48,6 @@
// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
// the encoding loop will fix up the copy overrun, and this inputMargin ensures
// that we don't overrun the dst and src buffers.
-//
-// TODO: implement this fast path.
const inputMargin = 16 - 1

// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
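
The 16-byte over-copy is safe because encodeBlock's main loop never
runs within inputMargin bytes of the end of src. A minimal sketch of
that invariant (the loop shape is simplified, and encodeBlockShape and
sLimit are illustrative names):

    const inputMargin = 16 - 1

    // encodeBlockShape: the loop stops once fewer than inputMargin
    // bytes of src remain, so a fixed 16-byte load from the pending
    // literal, and the matching 16-byte store into dst, always stay
    // within the buffers.
    func encodeBlockShape(dst, src []byte) {
        sLimit := len(src) - inputMargin
        for s := 1; s < sLimit; s++ {
            // Hashing, match finding and the emitLiteral/emitCopy
            // calls are elided.
        }
        // Bytes at and after sLimit are flushed as one final literal
        // via the general emitLiteral, never the fast path.
    }
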
diff --git a/encode_amd64.s b/encode_amd64.s
index 92f0a39..a91f4ba 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -358,6 +358,13 @@
//
// A 4-byte match has been found. We'll later see etc.

+ // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+ // on inputMargin in encode.go.
+ MOVQ SI, AX
+ SUBQ R10, AX
+ CMPQ AX, $16
+ JLE emitLiteralFastPath
+
// d += emitLiteral(dst[d:], src[nextEmit:s])
//
// Push args.
@@ -365,8 +372,6 @@
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ R10, 24(SP)
- MOVQ SI, AX
- SUBQ R10, AX
MOVQ AX, 32(SP)
MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

@@ -384,6 +389,28 @@

// Finish the "d +=" part of "d += emitLiteral(etc)".
ADDQ 48(SP), DI
+ JMP inner1
+
+emitLiteralFastPath:
+ // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+ MOVB AX, BX
+ SUBB $1, BX
+ SHLB $2, BX
+ MOVB BX, (DI)
+ ADDQ $1, DI
+
+ // !!! Implement the copy from lit to dst as a 16-byte load and store.
+ // (Encode's documentation says that dst and src must not overlap.)
+ //
+ // This always copies 16 bytes, instead of only len(lit) bytes, but that's
+ // OK. Subsequent iterations will fix up the overrun.
+ //
+ // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+ // 16-byte loads and stores. This technique probably wouldn't be as
+ // effective on architectures that are fussier about alignment.
+ MOVOU 0(R10), X0
+ MOVOU X0, 0(DI)
+ ADDQ AX, DI

inner1:
// for { etc }
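
For reference, the tag byte arithmetic in the fast path above can be
checked with a few lines of Go:

    package main

    import "fmt"

    func main() {
        // The one-byte Snappy literal tag is uint8(len(lit)-1)<<2: the
        // low two bits 00 mark a literal element, and the high six bits
        // hold len(lit)-1. This form covers lengths up to 60, so the
        // fast path's lengths 1..16 always fit. For example, an 11-byte
        // literal gets tag (11-1)<<2 = 0x28.
        for n := 1; n <= 16; n++ {
            fmt.Printf("len(lit)=%2d -> tag 0x%02X\n", n, uint8(n-1)<<2)
        }
    }

Note that after the 16-byte MOVOU store, DI advances by only AX (the
actual literal length), so the up-to-15 stray bytes written past the
literal are overwritten by whatever is emitted next.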