Add a fast path for short emitLiteral calls.

Compared to the previous commit:
name              old speed      new speed      delta
WordsEncode1e1-8   667MB/s ± 0%   677MB/s ± 1%   +1.57%  (p=0.008 n=5+5)
WordsEncode1e2-8   353MB/s ± 1%   428MB/s ± 0%  +21.37%  (p=0.008 n=5+5)
WordsEncode1e3-8   383MB/s ± 1%   446MB/s ± 1%  +16.65%  (p=0.008 n=5+5)
WordsEncode1e4-8   277MB/s ± 1%   316MB/s ± 0%  +13.93%  (p=0.008 n=5+5)
WordsEncode1e5-8   248MB/s ± 0%   269MB/s ± 0%   +8.57%  (p=0.008 n=5+5)
WordsEncode1e6-8   296MB/s ± 0%   314MB/s ± 1%   +6.08%  (p=0.008 n=5+5)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 1%     ~     (p=1.000 n=5+5)
_ZFlat0-8          748MB/s ± 0%   792MB/s ± 0%   +5.87%  (p=0.008 n=5+5)
_ZFlat1-8          406MB/s ± 0%   436MB/s ± 1%   +7.42%  (p=0.008 n=5+5)
_ZFlat2-8         16.1GB/s ± 1%  16.2GB/s ± 1%     ~     (p=0.421 n=5+5)
_ZFlat3-8          604MB/s ± 0%   632MB/s ± 1%   +4.49%  (p=0.008 n=5+5)
_ZFlat4-8         7.62GB/s ± 1%  8.00GB/s ± 0%   +5.03%  (p=0.008 n=5+5)
_ZFlat5-8          729MB/s ± 0%   768MB/s ± 0%   +5.26%  (p=0.008 n=5+5)
_ZFlat6-8          267MB/s ± 0%   282MB/s ± 1%   +5.92%  (p=0.008 n=5+5)
_ZFlat7-8          248MB/s ± 0%   264MB/s ± 1%   +6.48%  (p=0.008 n=5+5)
_ZFlat8-8          282MB/s ± 0%   298MB/s ± 0%   +5.87%  (p=0.008 n=5+5)
_ZFlat9-8          231MB/s ± 0%   247MB/s ± 0%   +6.79%  (p=0.008 n=5+5)
_ZFlat10-8         972MB/s ± 0%  1027MB/s ± 0%   +5.64%  (p=0.008 n=5+5)
_ZFlat11-8         401MB/s ± 0%   411MB/s ± 0%   +2.43%  (p=0.008 n=5+5)

The net effect of the past three commits, when compared to just before
68801229 "Write the encoder's encodeBlock in asm":
name              old speed      new speed       delta
WordsEncode1e1-8   665MB/s ± 0%    677MB/s ± 1%    +1.80%  (p=0.016 n=4+5)
WordsEncode1e2-8  85.0MB/s ± 0%  428.3MB/s ± 0%  +403.65%  (p=0.016 n=4+5)
WordsEncode1e3-8   234MB/s ± 2%    446MB/s ± 1%   +90.90%  (p=0.008 n=5+5)
WordsEncode1e4-8   233MB/s ± 0%    316MB/s ± 0%   +35.22%  (p=0.008 n=5+5)
WordsEncode1e5-8   214MB/s ± 1%    269MB/s ± 0%   +25.45%  (p=0.008 n=5+5)
WordsEncode1e6-8   258MB/s ± 0%    314MB/s ± 1%   +21.82%  (p=0.008 n=5+5)
RandomEncode-8    13.1GB/s ± 1%   14.4GB/s ± 1%   +10.31%  (p=0.008 n=5+5)
_ZFlat0-8          630MB/s ± 0%    792MB/s ± 0%   +25.71%  (p=0.016 n=4+5)
_ZFlat1-8          326MB/s ± 0%    436MB/s ± 1%   +33.89%  (p=0.016 n=4+5)
_ZFlat2-8         13.9GB/s ± 1%   16.2GB/s ± 1%   +16.27%  (p=0.008 n=5+5)
_ZFlat3-8          177MB/s ± 1%    632MB/s ± 1%  +257.58%  (p=0.008 n=5+5)
_ZFlat4-8         6.19GB/s ± 1%   8.00GB/s ± 0%   +29.32%  (p=0.008 n=5+5)
_ZFlat5-8          615MB/s ± 0%    768MB/s ± 0%   +24.91%  (p=0.008 n=5+5)
_ZFlat6-8          231MB/s ± 0%    282MB/s ± 1%   +21.95%  (p=0.008 n=5+5)
_ZFlat7-8          215MB/s ± 1%    264MB/s ± 1%   +22.83%  (p=0.008 n=5+5)
_ZFlat8-8          246MB/s ± 0%    298MB/s ± 0%   +21.46%  (p=0.008 n=5+5)
_ZFlat9-8          202MB/s ± 0%    247MB/s ± 0%   +22.17%  (p=0.008 n=5+5)
_ZFlat10-8         803MB/s ± 0%   1027MB/s ± 0%   +27.93%  (p=0.008 n=5+5)
_ZFlat11-8         351MB/s ± 0%    411MB/s ± 0%   +16.92%  (p=0.008 n=5+5)
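
Before the diff, a Go-level sketch of what the new assembly does (the
function name, signature, and driver below are invented for illustration;
the actual change is assembly-only): when the pending literal is 1 to 16
bytes, skip the emitLiteral call, write the one-byte tag directly, and
issue a fixed 16-byte copy.

package main

import "fmt"

// inputMargin mirrors the constant in encode.go.
const inputMargin = 16 - 1

// emitLiteralFast is a Go rendering of the assembly fast path; the name and
// signature are made up for this sketch. It emits src[nextEmit:s] as a
// literal, where 1 <= s-nextEmit <= 16, and relies on the inputMargin
// invariant: nextEmit+16 <= len(src), and dst has at least 17 bytes free.
func emitLiteralFast(dst, src []byte, nextEmit, s int) int {
	n := s - nextEmit

	// One-byte literal tag: for lengths 1 to 60, snappy encodes
	// uint8(length-1)<<2, with the low two bits 00 meaning "literal".
	// n <= 16 here, so the one-byte form always applies.
	dst[0] = uint8(n-1) << 2

	// Copy a fixed 16 bytes instead of n bytes. On amd64 this is one
	// unaligned 16-byte load and store (the MOVOU pair in the diff). The
	// bytes past n are scratch that later output overwrites.
	copy(dst[1:17], src[nextEmit:nextEmit+16])

	return 1 + n
}

func main() {
	src := []byte("Hello, snappy! ................") // headroom after the literal
	dst := make([]byte, 32)
	d := emitLiteralFast(dst, src, 0, 5) // emit "Hello"
	fmt.Printf("d=%d tag=%#04x lit=%q\n", d, dst[0], dst[1:d])
	// Prints: d=6 tag=0x10 lit="Hello"
}
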
diff --git a/encode.go b/encode.go
index b0ff679..8749689 100644
--- a/encode.go
+++ b/encode.go
@@ -48,8 +48,6 @@
 // can copy up to 15 bytes too much, but that's OK as subsequent iterations of
 // the encoding loop will fix up the copy overrun, and this inputMargin ensures
 // that we don't overrun the dst and src buffers.
-//
-// TODO: implement this fast path.
 const inputMargin = 16 - 1
 
 // minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
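
(The deleted TODO is what this commit implements. To restate the overrun
arithmetic in the surviving comment with concrete numbers: the s <= sLimit
bound below is paraphrased from the Go encoder's main loop and is not part
of this diff.)

package main

import "fmt"

const inputMargin = 16 - 1

func main() {
	// The Go encoder keeps its main loop at s <= sLimit, where
	// sLimit = len(src) - inputMargin.
	srcLen := 100
	sLimit := srcLen - inputMargin

	// The fast path's literal is src[nextEmit:s] with nextEmit < s, so the
	// worst case for its fixed 16-byte load is a 1-byte literal ending at
	// sLimit:
	s := sLimit
	nextEmit := s - 1
	fmt.Println(nextEmit+16 <= srcLen) // true: the 16-byte load stays in src
}
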
diff --git a/encode_amd64.s b/encode_amd64.s
index 92f0a39..a91f4ba 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -358,6 +358,13 @@
 	//
 	// A 4-byte match has been found. We'll later see etc.
 
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
 	// d += emitLiteral(dst[d:], src[nextEmit:s])
 	//
 	// Push args.
@@ -365,8 +372,6 @@
 	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
 	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
 	MOVQ R10, 24(SP)
-	MOVQ SI, AX
-	SUBQ R10, AX
 	MOVQ AX, 32(SP)
 	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
 
@@ -384,6 +389,28 @@
 
 	// Finish the "d +=" part of "d += emitLiteral(etc)".
 	ADDQ 48(SP), DI
+	JMP  inner1
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
 
 inner1:
 	// for { etc }
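
As a round-trip sanity check on the one-byte encoding emitted above (this
decoding arithmetic follows the snappy format description and is
illustrative; it is not this repository's decoder):

package main

import "fmt"

func main() {
	// The fast path emits tag = uint8(len(lit)-1)<<2. Decoding inverts it:
	// low two bits 00 mean "literal"; for tag>>2 < 60 the upper six bits
	// hold the length minus one. len(lit) <= 16 keeps us in that range,
	// with a worst case of tag 0x3c for a 16-byte literal.
	for _, n := range []int{1, 5, 16} {
		tag := uint8(n-1) << 2
		fmt.Printf("len=%2d tag=%#04x literal=%t decodedLen=%d\n",
			n, tag, tag&0x03 == 0, int(tag>>2)+1)
	}
}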