Rearrange the emitCopy register allocation.

This minimizes the diff in a follow-up commit that manually inlines emitCopy.

It's not an optimization per se, but for the record:
name              old speed      new speed      delta
WordsEncode1e1-8   711MB/s ± 1%   700MB/s ± 1%  -1.64%   (p=0.000 n=9+10)
WordsEncode1e2-8   407MB/s ± 1%   430MB/s ± 0%  +5.57%  (p=0.000 n=10+10)
WordsEncode1e3-8   441MB/s ± 1%   447MB/s ± 0%  +1.52%    (p=0.000 n=8+8)
WordsEncode1e4-8   311MB/s ± 1%   322MB/s ± 0%  +3.69%   (p=0.000 n=9+10)
WordsEncode1e5-8   267MB/s ± 0%   267MB/s ± 1%    ~      (p=0.068 n=8+10)
WordsEncode1e6-8   312MB/s ± 1%   314MB/s ± 0%  +0.45%   (p=0.000 n=9+10)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%    ~     (p=0.739 n=10+10)
_ZFlat0-8          792MB/s ± 1%   801MB/s ± 0%  +1.11%    (p=0.000 n=8+9)
_ZFlat1-8          435MB/s ± 1%   437MB/s ± 0%    ~      (p=0.857 n=9+10)
_ZFlat2-8         16.0GB/s ± 4%  16.3GB/s ± 1%    ~     (p=0.143 n=10+10)
_ZFlat3-8          613MB/s ± 0%   634MB/s ± 0%  +3.54%   (p=0.000 n=8+10)
_ZFlat4-8         7.96GB/s ± 1%  7.97GB/s ± 1%    ~      (p=0.829 n=8+10)
_ZFlat5-8          770MB/s ± 0%   773MB/s ± 0%  +0.33%    (p=0.000 n=8+9)
_ZFlat6-8          283MB/s ± 0%   283MB/s ± 0%  +0.13%    (p=0.043 n=8+9)
_ZFlat7-8          264MB/s ± 2%   265MB/s ± 0%  +0.61%    (p=0.000 n=9+9)
_ZFlat8-8          297MB/s ± 3%   299MB/s ± 0%    ~       (p=0.161 n=9+9)
_ZFlat9-8          247MB/s ± 1%   247MB/s ± 0%    ~       (p=0.465 n=8+9)
_ZFlat10-8        1.03GB/s ± 0%  1.05GB/s ± 1%  +1.75%    (p=0.000 n=9+9)
_ZFlat11-8         409MB/s ± 0%   412MB/s ± 0%  +0.64%    (p=0.000 n=8+8)
diff --git a/encode_amd64.s b/encode_amd64.s
index 48386b7..81205ac 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -86,55 +86,59 @@
 // func emitCopy(dst []byte, offset, length int) int
 //
 // All local variables fit into registers. The register allocation:
-//	- BX	offset
-//	- CX	length
+//	- AX	length
 //	- SI	&dst[0]
 //	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of AX and R11 for local variables matches
+// the allocation used at the call site in encodeBlock, which makes it easier
+// to manually inline this function.
 TEXT ·emitCopy(SB), NOSPLIT, $0-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ DI, SI
-	MOVQ offset+24(FP), BX
-	MOVQ length+32(FP), CX
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX
 
 loop0:
 	// for length >= 68 { etc }
-	CMPL CX, $68
+	CMPL AX, $68
 	JLT  step1
 
 	// Emit a length 64 copy, encoded as 3 bytes.
 	MOVB $0xfe, 0(DI)
-	MOVW BX, 1(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
-	SUBL $64, CX
+	SUBL $64, AX
 	JMP  loop0
 
 step1:
 	// if length > 64 { etc }
-	CMPL CX, $64
+	CMPL AX, $64
 	JLE  step2
 
 	// Emit a length 60 copy, encoded as 3 bytes.
 	MOVB $0xee, 0(DI)
-	MOVW BX, 1(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
-	SUBL $60, CX
+	SUBL $60, AX
 
 step2:
 	// if length >= 12 || offset >= 2048 { goto step3 }
-	CMPL CX, $12
+	CMPL AX, $12
 	JGE  step3
-	CMPL BX, $2048
+	CMPL R11, $2048
 	JGE  step3
 
 	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB BX, 1(DI)
-	SHRL $8, BX
-	SHLB $5, BX
-	SUBB $4, CX
-	SHLB $2, CX
-	ORB  CX, BX
-	ORB  $1, BX
-	MOVB BX, 0(DI)
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
 	ADDQ $2, DI
 
 	// Return the number of bytes written.
@@ -144,11 +148,11 @@
 
 step3:
 	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, CX
-	SHLB $2, CX
-	ORB  $2, CX
-	MOVB CX, 0(DI)
-	MOVW BX, 1(DI)
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
 
 	// Return the number of bytes written.