Inline the extendMatch call.

Compared to the previous commit:
name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 0%   699MB/s ± 1%     ~     (p=0.123 n=10+10)
WordsEncode1e2-8   460MB/s ± 0%   583MB/s ± 1%  +26.64%  (p=0.000 n=10+10)
WordsEncode1e3-8   480MB/s ± 0%   647MB/s ± 2%  +34.85%  (p=0.000 n=10+10)
WordsEncode1e4-8   416MB/s ± 0%   451MB/s ± 0%   +8.30%   (p=0.000 n=10+8)
WordsEncode1e5-8   297MB/s ± 0%   355MB/s ± 2%  +19.50%   (p=0.000 n=10+9)
WordsEncode1e6-8   345MB/s ± 0%   433MB/s ± 2%  +25.47%   (p=0.000 n=10+9)
RandomEncode-8    14.4GB/s ± 2%  14.3GB/s ± 3%     ~     (p=0.075 n=10+10)
_ZFlat0-8          891MB/s ± 1%  1040MB/s ± 0%  +16.67%    (p=0.000 n=9+9)
_ZFlat1-8          471MB/s ± 0%   535MB/s ± 1%  +13.68%   (p=0.000 n=9+10)
_ZFlat2-8         16.2GB/s ± 3%  16.4GB/s ± 1%     ~      (p=0.122 n=10+8)
_ZFlat3-8          676MB/s ± 0%   762MB/s ± 0%  +12.62%   (p=0.000 n=10+9)
_ZFlat4-8         8.36GB/s ± 1%  9.47GB/s ± 1%  +13.28%  (p=0.000 n=10+10)
_ZFlat5-8          852MB/s ± 0%   986MB/s ± 1%  +15.79%   (p=0.000 n=10+9)
_ZFlat6-8          316MB/s ± 0%   380MB/s ± 1%  +20.41%    (p=0.000 n=8+9)
_ZFlat7-8          296MB/s ± 0%   353MB/s ± 0%  +19.44%   (p=0.000 n=8+10)
_ZFlat8-8          331MB/s ± 1%   399MB/s ± 0%  +20.53%    (p=0.000 n=9+8)
_ZFlat9-8          274MB/s ± 0%   329MB/s ± 0%  +20.27%    (p=0.000 n=8+9)
_ZFlat10-8        1.17GB/s ± 0%  1.35GB/s ± 1%  +15.15%    (p=0.000 n=9+9)
_ZFlat11-8         462MB/s ± 0%   608MB/s ± 0%  +31.54%    (p=0.000 n=9+9)

The net effect of the past four inlining commits, when compared to just
before c3defccc "Inline the emitCopy call":
name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 1%   699MB/s ± 1%     ~     (p=0.353 n=10+10)
WordsEncode1e2-8   429MB/s ± 0%   583MB/s ± 1%  +35.95%   (p=0.000 n=9+10)
WordsEncode1e3-8   447MB/s ± 0%   647MB/s ± 2%  +44.85%   (p=0.000 n=9+10)
WordsEncode1e4-8   322MB/s ± 1%   451MB/s ± 0%  +40.00%   (p=0.000 n=10+8)
WordsEncode1e5-8   268MB/s ± 0%   355MB/s ± 2%  +32.41%    (p=0.000 n=9+9)
WordsEncode1e6-8   313MB/s ± 0%   433MB/s ± 2%  +38.28%    (p=0.000 n=8+9)
RandomEncode-8    14.4GB/s ± 1%  14.3GB/s ± 3%     ~      (p=0.897 n=8+10)
_ZFlat0-8          797MB/s ± 2%  1040MB/s ± 0%  +30.53%    (p=0.000 n=9+9)
_ZFlat1-8          435MB/s ± 1%   535MB/s ± 1%  +22.97%   (p=0.000 n=9+10)
_ZFlat2-8         16.1GB/s ± 2%  16.4GB/s ± 1%   +1.47%   (p=0.001 n=10+8)
_ZFlat3-8          633MB/s ± 0%   762MB/s ± 0%  +20.32%   (p=0.000 n=10+9)
_ZFlat4-8         7.95GB/s ± 1%  9.47GB/s ± 1%  +19.11%  (p=0.000 n=10+10)
_ZFlat5-8          771MB/s ± 0%   986MB/s ± 1%  +27.83%   (p=0.000 n=10+9)
_ZFlat6-8          283MB/s ± 0%   380MB/s ± 1%  +34.46%   (p=0.000 n=10+9)
_ZFlat7-8          265MB/s ± 0%   353MB/s ± 0%  +33.29%   (p=0.000 n=9+10)
_ZFlat8-8          299MB/s ± 0%   399MB/s ± 0%  +33.36%    (p=0.000 n=9+8)
_ZFlat9-8          246MB/s ± 1%   329MB/s ± 0%  +33.58%   (p=0.000 n=10+9)
_ZFlat10-8        1.05GB/s ± 1%  1.35GB/s ± 1%  +28.35%   (p=0.000 n=10+9)
_ZFlat11-8         411MB/s ± 0%   608MB/s ± 0%  +47.82%   (p=0.000 n=10+9)
diff --git a/encode_amd64.s b/encode_amd64.s
index 40dcde8..adfd979 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -251,8 +251,8 @@
 //	- R10	.	&src[nextEmit]
 //	- R11	96	prevHash, currHash, nextHash, offset
 //	- R12	104	&src[base], skip
-//	- R13	.	&src[nextS]
-//	- R14	.	len(src), bytesBetweenHashLookups, x
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
 //	- R15	112	candidate
 //
 // The second column (56, 64, etc) is the stack offset to spill the registers
@@ -444,8 +444,7 @@
 	MOVQ DI, 0(SP)
 	MOVQ R10, 8(SP)
 	MOVQ AX, 16(SP)
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ AX, DI
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
@@ -494,35 +493,65 @@
 	SUBQ R15, R11
 	SUBQ DX, R11
 
-	// s = extendMatch(src, candidate+4, s+4)
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
 	//
-	// Push args.
-	MOVQ DX, 0(SP)
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
 	MOVQ src_len+32(FP), R14
-	MOVQ R14, 8(SP)
-	MOVQ R14, 16(SP)         // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
 	ADDQ $4, R15
-	MOVQ R15, 24(SP)
+	ADDQ DX, R15
+
+	// !!! s += 4
 	ADDQ $4, SI
-	SUBQ DX, SI
-	MOVQ SI, 32(SP)
 
-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ DI, 80(SP)
-	MOVQ R11, 96(SP)
-	MOVQ R12, 104(SP)
-	CALL ·extendMatch(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
-	MOVQ 96(SP), R11
-	MOVQ 104(SP), R12
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
 
-	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
-	// register holds &src[s], not s.
-	MOVQ 40(SP), SI
-	ADDQ DX, SI
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
 
 	// ----------------------------------------
 	// Begin inline of the emitCopy call.