Rearrange the emitLiteral register allocation.

This minimizes the diff in a follow-up commit that manually inlines emitLiteral
at its call site in encodeBlock.

It's not an optimization per se, but for the record, all deltas are within
measurement noise:
name              old speed      new speed      delta
WordsEncode1e1-8   698MB/s ± 1%   701MB/s ± 1%   ~     (p=0.165 n=10+10)
WordsEncode1e2-8   428MB/s ± 0%   429MB/s ± 0%   ~       (p=0.489 n=9+9)
WordsEncode1e3-8   446MB/s ± 0%   447MB/s ± 0%   ~       (p=0.476 n=9+9)
WordsEncode1e4-8   321MB/s ± 1%   322MB/s ± 1%   ~     (p=0.593 n=10+10)
WordsEncode1e5-8   267MB/s ± 1%   268MB/s ± 0%   ~       (p=0.287 n=9+9)
WordsEncode1e6-8   313MB/s ± 1%   313MB/s ± 0%   ~       (p=0.190 n=9+8)
RandomEncode-8    14.4GB/s ± 1%  14.4GB/s ± 1%   ~       (p=0.673 n=9+8)
_ZFlat0-8          800MB/s ± 0%   797MB/s ± 2%   ~       (p=0.387 n=9+9)
_ZFlat1-8          436MB/s ± 1%   435MB/s ± 1%   ~       (p=0.169 n=9+9)
_ZFlat2-8         16.2GB/s ± 1%  16.1GB/s ± 2%   ~     (p=0.063 n=10+10)
_ZFlat3-8          633MB/s ± 1%   633MB/s ± 0%   ~      (p=0.661 n=9+10)
_ZFlat4-8         7.96GB/s ± 1%  7.95GB/s ± 1%   ~     (p=0.796 n=10+10)
_ZFlat5-8          771MB/s ± 0%   771MB/s ± 0%   ~     (p=0.929 n=10+10)
_ZFlat6-8          283MB/s ± 1%   283MB/s ± 0%   ~     (p=0.912 n=10+10)
_ZFlat7-8          265MB/s ± 0%   265MB/s ± 0%   ~       (p=0.649 n=9+9)
_ZFlat8-8          299MB/s ± 0%   299MB/s ± 0%   ~       (p=0.748 n=9+9)
_ZFlat9-8          246MB/s ± 1%   246MB/s ± 1%   ~      (p=0.921 n=9+10)
_ZFlat10-8        1.05GB/s ± 1%  1.05GB/s ± 1%   ~     (p=0.089 n=10+10)
_ZFlat11-8         410MB/s ± 0%   411MB/s ± 0%   ~     (p=0.190 n=10+10)
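
For context, the literal chunk encoding that this assembly implements matches
the pure-Go fallback in encode_other.go; roughly (a sketch, not the exact
source):

	// emitLiteral writes a literal chunk and returns the number of bytes
	// written.
	//
	// It assumes that 1 <= len(lit) && len(lit) <= 65536.
	func emitLiteral(dst, lit []byte) int {
		i, n := 0, uint(len(lit)-1)
		switch {
		case n < 60:
			// One-byte tag: n packed above the 2-bit tagLiteral.
			dst[0] = uint8(n)<<2 | tagLiteral
			i = 1
		case n < 1<<8:
			// Two-byte tag: 60<<2 == 0xf0, then one length byte.
			dst[0] = 60<<2 | tagLiteral
			dst[1] = uint8(n)
			i = 2
		default:
			// Three-byte tag: 61<<2 == 0xf4, then two length bytes.
			dst[0] = 61<<2 | tagLiteral
			dst[1] = uint8(n)
			dst[2] = uint8(n >> 8)
			i = 3
		}
		return i + copy(dst[i:], lit)
	}

In the new allocation below, AX carries len(lit), BX the tag value n, DX the
return value i, DI &dst[i] and R10 &lit[0].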
diff --git a/encode_amd64.s b/encode_amd64.s
index 81205ac..a233b59 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -28,19 +28,23 @@
 // func emitLiteral(dst, lit []byte) int
 //
 // All local variables fit into registers. The register allocation:
-//	- AX	return value
+//	- AX	len(lit)
 //	- BX	n
-//	- CX	len(lit)
-//	- SI	&lit[0]
+//	- DX	return value
 //	- DI	&dst[i]
+//	- R10	&lit[0]
 //
 // The 24 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
 TEXT ·emitLiteral(SB), NOSPLIT, $24-56
 	MOVQ dst_base+0(FP), DI
-	MOVQ lit_base+24(FP), SI
-	MOVQ lit_len+32(FP), CX
-	MOVQ CX, AX
-	MOVL CX, BX
+	MOVQ lit_base+24(FP), R10
+	MOVQ lit_len+32(FP), AX
+	MOVQ AX, DX
+	MOVL AX, BX
 	SUBL $1, BX
 
 	CMPL BX, $60
@@ -52,32 +56,32 @@
 	MOVB $0xf4, 0(DI)
 	MOVW BX, 1(DI)
 	ADDQ $3, DI
-	ADDQ $3, AX
+	ADDQ $3, DX
 	JMP  emitLiteralEnd
 
 twoBytes:
 	MOVB $0xf0, 0(DI)
 	MOVB BX, 1(DI)
 	ADDQ $2, DI
-	ADDQ $2, AX
+	ADDQ $2, DX
 	JMP  emitLiteralEnd
 
 oneByte:
 	SHLB $2, BX
 	MOVB BX, 0(DI)
 	ADDQ $1, DI
-	ADDQ $1, AX
+	ADDQ $1, DX
 
 emitLiteralEnd:
-	MOVQ AX, ret+48(FP)
+	MOVQ DX, ret+48(FP)
 
 	// copy(dst[i:], lit)
 	//
 	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
-	// DI, SI and CX as arguments.
+	// DI, R10 and AX as arguments.
 	MOVQ DI, 0(SP)
-	MOVQ SI, 8(SP)
-	MOVQ CX, 16(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
 	CALL runtime·memmove(SB)
 	RET
 
@@ -91,9 +95,9 @@
 //	- DI	&dst[i]
 //	- R11	offset
 //
-// The unusual register allocation of AX and R11 for local variables matches
-// the allocation used at the call site in encodeBlock, which makes it easier
-// to manually inline this function.
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
 TEXT ·emitCopy(SB), NOSPLIT, $0-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ DI, SI
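
For completeness, emitCopy's chunk encoding likewise corresponds to the
pure-Go fallback in encode_other.go; roughly (again a sketch, not the exact
source, with tagCopy1 == 0x01 and tagCopy2 == 0x02 per the Snappy format):

	// emitCopy writes a copy chunk and returns the number of bytes written.
	//
	// It assumes that 1 <= offset && offset <= 65535 and 4 <= length.
	func emitCopy(dst []byte, offset, length int) int {
		i := 0
		for length >= 68 {
			// Emit a length-64 copy, encoded as 3 bytes.
			dst[i+0] = 63<<2 | tagCopy2
			dst[i+1] = uint8(offset)
			dst[i+2] = uint8(offset >> 8)
			i += 3
			length -= 64
		}
		if length > 64 {
			// Emit a length-60 copy, encoded as 3 bytes.
			dst[i+0] = 59<<2 | tagCopy2
			dst[i+1] = uint8(offset)
			dst[i+2] = uint8(offset >> 8)
			i += 3
			length -= 60
		}
		if length >= 12 || offset >= 2048 {
			// Emit the remaining copy, encoded as 3 bytes.
			dst[i+0] = uint8(length-1)<<2 | tagCopy2
			dst[i+1] = uint8(offset)
			dst[i+2] = uint8(offset >> 8)
			return i + 3
		}
		// Emit the remaining copy, encoded as 2 bytes: 3 high bits of the
		// offset, then length-4, packed above the 2-bit tagCopy1.
		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
		dst[i+1] = uint8(offset)
		return i + 2
	}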