Inline the emitLiteral call.

name              old speed      new speed      delta
WordsEncode1e1-8   712MB/s ± 1%   700MB/s ± 1%   -1.65%  (p=0.000 n=10+10)
WordsEncode1e2-8   467MB/s ± 0%   460MB/s ± 1%   -1.53%  (p=0.000 n=9+10)
WordsEncode1e3-8   483MB/s ± 0%   478MB/s ± 2%   -0.98%  (p=0.007 n=9+10)
WordsEncode1e4-8   353MB/s ± 1%   414MB/s ± 0%  +17.03%  (p=0.000 n=10+9)
WordsEncode1e5-8   293MB/s ± 0%   296MB/s ± 1%   +1.03%  (p=0.000 n=8+9)
WordsEncode1e6-8   345MB/s ± 0%   345MB/s ± 0%     ~     (p=0.332 n=9+8)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%     ~     (p=1.000 n=10+9)
_ZFlat0-8          863MB/s ± 0%   888MB/s ± 1%   +2.86%  (p=0.000 n=9+10)
_ZFlat1-8          471MB/s ± 0%   471MB/s ± 1%     ~     (p=0.897 n=8+10)
_ZFlat2-8         16.2GB/s ± 2%  16.2GB/s ± 3%     ~     (p=0.631 n=10+10)
_ZFlat3-8          659MB/s ± 1%   675MB/s ± 1%   +2.32%  (p=0.000 n=9+9)
_ZFlat4-8         8.29GB/s ± 1%  8.31GB/s ± 1%     ~     (p=0.315 n=10+10)
_ZFlat5-8          836MB/s ± 1%   850MB/s ± 0%   +1.78%  (p=0.000 n=9+9)
_ZFlat6-8          315MB/s ± 0%   316MB/s ± 0%   +0.39%  (p=0.002 n=9+10)
_ZFlat7-8          293MB/s ± 1%   294MB/s ± 1%     ~     (p=0.139 n=10+9)
_ZFlat8-8          331MB/s ± 1%   330MB/s ± 1%     ~     (p=0.356 n=10+9)
_ZFlat9-8          273MB/s ± 1%   273MB/s ± 0%     ~     (p=0.280 n=10+10)
_ZFlat10-8        1.12GB/s ± 1%  1.17GB/s ± 1%   +4.12%  (p=0.000 n=10+10)
_ZFlat11-8         460MB/s ± 0%   461MB/s ± 0%   +0.34%  (p=0.006 n=8+10)

commit: 5a44a9da2110bee7a93f552127af8a74bad44fe8 [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Fri Apr 29 13:20:53 2016 +1000
committer: Nigel Tao <nigeltao@golang.org> Fri Apr 29 13:20:53 2016 +1000
tree: 9937478769cb04b8f2fd6f26c0e5606a32a1891f
parent: c3defccc353d53ff7091dba7a646f482bf15dc98 [diff]
diff --git a/encode_amd64.s b/encode_amd64.s
index 6a21444..fda505c 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s

@@ -57,14 +57,14 @@
 	MOVW BX, 1(DI)
 	ADDQ $3, DI
 	ADDQ $3, DX
-	JMP  emitLiteralEnd
+	JMP  memmove
 
 twoBytes:
 	MOVB $0xf0, 0(DI)
 	MOVB BX, 1(DI)
 	ADDQ $2, DI
 	ADDQ $2, DX
-	JMP  emitLiteralEnd
+	JMP  memmove
 
 oneByte:
 	SHLB $2, BX
@@ -72,7 +72,7 @@
 	ADDQ $1, DI
 	ADDQ $1, DX
 
-emitLiteralEnd:
+memmove:
 	MOVQ DX, ret+48(FP)
 
 	// copy(dst[i:], lit)
@@ -400,32 +400,64 @@
 	CMPQ AX, $16
 	JLE  emitLiteralFastPath
 
-	// d += emitLiteral(dst[d:], src[nextEmit:s])
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
 	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R10, 24(SP)
-	MOVQ AX, 32(SP)
-	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
 
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
 	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ AX, DI
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
-	CALL ·emitLiteral(SB)
+	CALL runtime·memmove(SB)
 	MOVQ 56(SP), CX
 	MOVQ 64(SP), DX
 	MOVQ 72(SP), SI
 	MOVQ 80(SP), DI
 	MOVQ 88(SP), R9
 	MOVQ 112(SP), R15
-
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ 48(SP), DI
 	JMP  inner1
 
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
 emitLiteralFastPath:
 	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
 	MOVB AX, BX
commit	5a44a9da2110bee7a93f552127af8a74bad44fe8	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Fri Apr 29 13:20:53 2016 +1000
committer	Nigel Tao <nigeltao@golang.org>	Fri Apr 29 13:20:53 2016 +1000
tree	9937478769cb04b8f2fd6f26c0e5606a32a1891f
parent	c3defccc353d53ff7091dba7a646f482bf15dc98 [diff]