Rearrange the emitCopy register allocation.

This minimizes the diff in a follow-up commit that manually inlines emitCopy.

It's not an optimization per se, but for the record:
name              old speed      new speed      delta
WordsEncode1e1-8   711MB/s ± 1%   700MB/s ± 1%  -1.64%   (p=0.000 n=9+10)
WordsEncode1e2-8   407MB/s ± 1%   430MB/s ± 0%  +5.57%  (p=0.000 n=10+10)
WordsEncode1e3-8   441MB/s ± 1%   447MB/s ± 0%  +1.52%    (p=0.000 n=8+8)
WordsEncode1e4-8   311MB/s ± 1%   322MB/s ± 0%  +3.69%   (p=0.000 n=9+10)
WordsEncode1e5-8   267MB/s ± 0%   267MB/s ± 1%    ~      (p=0.068 n=8+10)
WordsEncode1e6-8   312MB/s ± 1%   314MB/s ± 0%  +0.45%   (p=0.000 n=9+10)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%    ~     (p=0.739 n=10+10)
_ZFlat0-8          792MB/s ± 1%   801MB/s ± 0%  +1.11%    (p=0.000 n=8+9)
_ZFlat1-8          435MB/s ± 1%   437MB/s ± 0%    ~      (p=0.857 n=9+10)
_ZFlat2-8         16.0GB/s ± 4%  16.3GB/s ± 1%    ~     (p=0.143 n=10+10)
_ZFlat3-8          613MB/s ± 0%   634MB/s ± 0%  +3.54%   (p=0.000 n=8+10)
_ZFlat4-8         7.96GB/s ± 1%  7.97GB/s ± 1%    ~      (p=0.829 n=8+10)
_ZFlat5-8          770MB/s ± 0%   773MB/s ± 0%  +0.33%    (p=0.000 n=8+9)
_ZFlat6-8          283MB/s ± 0%   283MB/s ± 0%  +0.13%    (p=0.043 n=8+9)
_ZFlat7-8          264MB/s ± 2%   265MB/s ± 0%  +0.61%    (p=0.000 n=9+9)
_ZFlat8-8          297MB/s ± 3%   299MB/s ± 0%    ~       (p=0.161 n=9+9)
_ZFlat9-8          247MB/s ± 1%   247MB/s ± 0%    ~       (p=0.465 n=8+9)
_ZFlat10-8        1.03GB/s ± 0%  1.05GB/s ± 1%  +1.75%    (p=0.000 n=9+9)
_ZFlat11-8         409MB/s ± 0%   412MB/s ± 0%  +0.64%    (p=0.000 n=8+8)
diff --git a/encode_amd64.s b/encode_amd64.s
index 48386b7..81205ac 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -86,55 +86,59 @@
 // func emitCopy(dst []byte, offset, length int) int
 //
 // All local variables fit into registers. The register allocation:
-//	- BX	offset
-//	- CX	length
+//	- AX	length
 //	- SI	&dst[0]
 //	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of AX and R11 for local variables matches
+// the allocation used at the call site in encodeBlock, which makes it easier
+// to manually inline this function.
 TEXT ·emitCopy(SB), NOSPLIT, $0-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ DI, SI
-	MOVQ offset+24(FP), BX
-	MOVQ length+32(FP), CX
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX
 
 loop0:
 	// for length >= 68 { etc }
-	CMPL CX, $68
+	CMPL AX, $68
 	JLT  step1
 
 	// Emit a length 64 copy, encoded as 3 bytes.
 	MOVB $0xfe, 0(DI)
-	MOVW BX, 1(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
-	SUBL $64, CX
+	SUBL $64, AX
 	JMP  loop0
 
 step1:
 	// if length > 64 { etc }
-	CMPL CX, $64
+	CMPL AX, $64
 	JLE  step2
 
 	// Emit a length 60 copy, encoded as 3 bytes.
 	MOVB $0xee, 0(DI)
-	MOVW BX, 1(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
-	SUBL $60, CX
+	SUBL $60, AX
 
 step2:
 	// if length >= 12 || offset >= 2048 { goto step3 }
-	CMPL CX, $12
+	CMPL AX, $12
 	JGE  step3
-	CMPL BX, $2048
+	CMPL R11, $2048
 	JGE  step3
 
 	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB BX, 1(DI)
-	SHRL $8, BX
-	SHLB $5, BX
-	SUBB $4, CX
-	SHLB $2, CX
-	ORB  CX, BX
-	ORB  $1, BX
-	MOVB BX, 0(DI)
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
 	ADDQ $2, DI
 
 	// Return the number of bytes written.
@@ -144,11 +148,11 @@
 
 step3:
 	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, CX
-	SHLB $2, CX
-	ORB  $2, CX
-	MOVB CX, 0(DI)
-	MOVW BX, 1(DI)
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
 
 	// Return the number of bytes written.