Rearrange the extendMatch register allocation.

This minimizes the diff in a follow-up commit that manually inlines extendMatch
into its encodeBlock call site.

It's not an optimization per se, but for the record:
name              old speed      new speed      delta
WordsEncode1e1-8   700MB/s ± 1%   701MB/s ± 0%    ~     (p=0.393 n=10+10)
WordsEncode1e2-8   460MB/s ± 1%   460MB/s ± 0%    ~     (p=0.393 n=10+10)
WordsEncode1e3-8   478MB/s ± 2%   480MB/s ± 0%    ~     (p=0.912 n=10+10)
WordsEncode1e4-8   414MB/s ± 0%   416MB/s ± 0%  +0.64%   (p=0.000 n=9+10)
WordsEncode1e5-8   296MB/s ± 1%   297MB/s ± 0%    ~      (p=0.113 n=9+10)
WordsEncode1e6-8   345MB/s ± 0%   345MB/s ± 0%    ~      (p=0.949 n=8+10)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%    ~      (p=0.278 n=9+10)
_ZFlat0-8          888MB/s ± 1%   891MB/s ± 1%  +0.35%   (p=0.010 n=10+9)
_ZFlat1-8          471MB/s ± 1%   471MB/s ± 0%    ~      (p=0.447 n=10+9)
_ZFlat2-8         16.2GB/s ± 3%  16.2GB/s ± 3%    ~     (p=0.912 n=10+10)
_ZFlat3-8          675MB/s ± 1%   676MB/s ± 0%    ~      (p=0.150 n=9+10)
_ZFlat4-8         8.31GB/s ± 1%  8.36GB/s ± 1%  +0.65%  (p=0.035 n=10+10)
_ZFlat5-8          850MB/s ± 0%   852MB/s ± 0%    ~      (p=0.182 n=9+10)
_ZFlat6-8          316MB/s ± 0%   316MB/s ± 0%    ~      (p=0.762 n=10+8)
_ZFlat7-8          294MB/s ± 1%   296MB/s ± 0%  +0.51%    (p=0.006 n=9+8)
_ZFlat8-8          330MB/s ± 1%   331MB/s ± 1%    ~       (p=0.881 n=9+9)
_ZFlat9-8          273MB/s ± 0%   274MB/s ± 0%  +0.23%   (p=0.043 n=10+8)
_ZFlat10-8        1.17GB/s ± 1%  1.17GB/s ± 0%    ~      (p=0.922 n=10+9)
_ZFlat11-8         461MB/s ± 0%   462MB/s ± 0%    ~      (p=0.219 n=10+9)

Also:
name           old time/op  new time/op  delta
ExtendMatch-8  7.92µs ± 2%  7.80µs ± 2%  -1.51%  (p=0.002 n=10+9)
Note that this benchmark reports time/op instead of MB/s, so a negative delta
is better, although the difference is quite possibly just noise.
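
For context, extendMatch's behavior in plain Go is roughly the loop below.
This is only an illustrative sketch of the semantics, not code touched by this
change:

func extendMatch(src []byte, i, j int) int {
	// Extend the match of src[i:] against src[j:] one byte at a time and
	// return the new j. Assumes 0 <= i && i < j && j <= len(src).
	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
	}
	return j
}
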
diff --git a/encode_amd64.s b/encode_amd64.s
index fda505c..40dcde8 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -169,33 +169,37 @@
 // func extendMatch(src []byte, i, j int) int
 //
 // All local variables fit into registers. The register allocation:
-//	- CX	&src[0]
-//	- DX	&src[len(src)]
-//	- SI	&src[i]
-//	- DI	&src[j]
-//	- R9	&src[len(src) - 8]
+//	- DX	&src[0]
+//	- SI	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
 TEXT ·extendMatch(SB), NOSPLIT, $0-48
-	MOVQ src_base+0(FP), CX
-	MOVQ src_len+8(FP), DX
-	MOVQ i+24(FP), SI
-	MOVQ j+32(FP), DI
-	ADDQ CX, DX
-	ADDQ CX, SI
-	ADDQ CX, DI
-	MOVQ DX, R9
-	SUBQ $8, R9
+	MOVQ src_base+0(FP), DX
+	MOVQ src_len+8(FP), R14
+	MOVQ i+24(FP), R15
+	MOVQ j+32(FP), SI
+	ADDQ DX, R14
+	ADDQ DX, R15
+	ADDQ DX, SI
+	MOVQ R14, R13
+	SUBQ $8, R13
 
 cmp8:
 	// As long as we are 8 or more bytes before the end of src, we can load and
 	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ DI, R9
+	CMPQ SI, R13
 	JA   cmp1
-	MOVQ (SI), AX
-	MOVQ (DI), BX
+	MOVQ (R15), AX
+	MOVQ (SI), BX
 	CMPQ AX, BX
 	JNE  bsf
+	ADDQ $8, R15
 	ADDQ $8, SI
-	ADDQ $8, DI
 	JMP  cmp8
 
 bsf:
@@ -206,29 +210,29 @@
 	XORQ AX, BX
 	BSFQ BX, BX
 	SHRQ $3, BX
-	ADDQ BX, DI
+	ADDQ BX, SI
 
 	// Convert from &src[ret] to ret.
-	SUBQ CX, DI
-	MOVQ DI, ret+40(FP)
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
 	RET
 
 cmp1:
 	// In src's tail, compare 1 byte at a time.
-	CMPQ DI, DX
+	CMPQ SI, R14
 	JAE  extendMatchEnd
-	MOVB (SI), AX
-	MOVB (DI), BX
+	MOVB (R15), AX
+	MOVB (SI), BX
 	CMPB AX, BX
 	JNE  extendMatchEnd
+	ADDQ $1, R15
 	ADDQ $1, SI
-	ADDQ $1, DI
 	JMP  cmp1
 
 extendMatchEnd:
 	// Convert from &src[ret] to ret.
-	SUBQ CX, DI
-	MOVQ DI, ret+40(FP)
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
 	RET
 
 // ----------------------------------------------------------------------------
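
For readers less familiar with the BSFQ/SHRQ idiom in the cmp8 and bsf blocks
above, a rough Go equivalent of extendMatch's fast path follows. It is only an
illustrative sketch, not part of this change, and extendMatchRef is a
hypothetical name:

import (
	"encoding/binary"
	"math/bits"
)

func extendMatchRef(src []byte, i, j int) int {
	// cmp8: while j is at least 8 bytes before the end of src, load and
	// compare 8 bytes at a time.
	for j+8 <= len(src) {
		x := binary.LittleEndian.Uint64(src[i:])
		y := binary.LittleEndian.Uint64(src[j:])
		if x != y {
			// bsf: the lowest set bit of x^y lies in the first differing
			// byte. BSFQ finds that bit's index and SHRQ $3 converts it to
			// a byte offset, which works because the 8-byte loads are
			// little-endian.
			return j + bits.TrailingZeros64(x^y)>>3
		}
		i, j = i+8, j+8
	}
	// cmp1: in src's tail, compare 1 byte at a time.
	for j < len(src) && src[i] == src[j] {
		i, j = i+1, j+1
	}
	return j
}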