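Inline the emitLiteral call.

For literals longer than 16 bytes, the assembly no longer pushes arguments
and calls ·emitLiteral. It now writes the one-, two- or three-byte literal
tag inline and calls runtime·memmove directly to copy the literal bytes.
The emitLiteralEnd label is also renamed to memmove.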

name              old speed      new speed      delta
WordsEncode1e1-8   712MB/s ± 1%   700MB/s ± 1%   -1.65%  (p=0.000 n=10+10)
WordsEncode1e2-8   467MB/s ± 0%   460MB/s ± 1%   -1.53%   (p=0.000 n=9+10)
WordsEncode1e3-8   483MB/s ± 0%   478MB/s ± 2%   -0.98%   (p=0.007 n=9+10)
WordsEncode1e4-8   353MB/s ± 1%   414MB/s ± 0%  +17.03%   (p=0.000 n=10+9)
WordsEncode1e5-8   293MB/s ± 0%   296MB/s ± 1%   +1.03%    (p=0.000 n=8+9)
WordsEncode1e6-8   345MB/s ± 0%   345MB/s ± 0%     ~       (p=0.332 n=9+8)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%     ~      (p=1.000 n=10+9)
_ZFlat0-8          863MB/s ± 0%   888MB/s ± 1%   +2.86%   (p=0.000 n=9+10)
_ZFlat1-8          471MB/s ± 0%   471MB/s ± 1%     ~      (p=0.897 n=8+10)
_ZFlat2-8         16.2GB/s ± 2%  16.2GB/s ± 3%     ~     (p=0.631 n=10+10)
_ZFlat3-8          659MB/s ± 1%   675MB/s ± 1%   +2.32%    (p=0.000 n=9+9)
_ZFlat4-8         8.29GB/s ± 1%  8.31GB/s ± 1%     ~     (p=0.315 n=10+10)
_ZFlat5-8          836MB/s ± 1%   850MB/s ± 0%   +1.78%    (p=0.000 n=9+9)
_ZFlat6-8          315MB/s ± 0%   316MB/s ± 0%   +0.39%   (p=0.002 n=9+10)
_ZFlat7-8          293MB/s ± 1%   294MB/s ± 1%     ~      (p=0.139 n=10+9)
_ZFlat8-8          331MB/s ± 1%   330MB/s ± 1%     ~      (p=0.356 n=10+9)
_ZFlat9-8          273MB/s ± 1%   273MB/s ± 0%     ~     (p=0.280 n=10+10)
_ZFlat10-8        1.12GB/s ± 1%  1.17GB/s ± 1%   +4.12%  (p=0.000 n=10+10)
_ZFlat11-8         460MB/s ± 0%   461MB/s ± 0%   +0.34%   (p=0.006 n=8+10)
diff --git a/encode_amd64.s b/encode_amd64.s
index 6a21444..fda505c 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -57,14 +57,14 @@
 	MOVW BX, 1(DI)
 	ADDQ $3, DI
 	ADDQ $3, DX
-	JMP  emitLiteralEnd
+	JMP  memmove
 
 twoBytes:
 	MOVB $0xf0, 0(DI)
 	MOVB BX, 1(DI)
 	ADDQ $2, DI
 	ADDQ $2, DX
-	JMP  emitLiteralEnd
+	JMP  memmove
 
 oneByte:
 	SHLB $2, BX
@@ -72,7 +72,7 @@
 	ADDQ $1, DI
 	ADDQ $1, DX
 
-emitLiteralEnd:
+memmove:
 	MOVQ DX, ret+48(FP)
 
 	// copy(dst[i:], lit)
@@ -400,32 +400,64 @@
 	CMPQ AX, $16
 	JLE  emitLiteralFastPath
 
-	// d += emitLiteral(dst[d:], src[nextEmit:s])
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
 	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R10, 24(SP)
-	MOVQ AX, 32(SP)
-	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
 
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
 	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ AX, DI
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
-	CALL ·emitLiteral(SB)
+	CALL runtime·memmove(SB)
 	MOVQ 56(SP), CX
 	MOVQ 64(SP), DX
 	MOVQ 72(SP), SI
 	MOVQ 80(SP), DI
 	MOVQ 88(SP), R9
 	MOVQ 112(SP), R15
-
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ 48(SP), DI
 	JMP  inner1
 
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
 emitLiteralFastPath:
 	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
 	MOVB AX, BX