Add a fast path for short emitLiteral calls.

Compared to the previous commit:
name              old speed      new speed      delta
WordsEncode1e1-8   667MB/s ± 0%   677MB/s ± 1%   +1.57%  (p=0.008 n=5+5)
WordsEncode1e2-8   353MB/s ± 1%   428MB/s ± 0%  +21.37%  (p=0.008 n=5+5)
WordsEncode1e3-8   383MB/s ± 1%   446MB/s ± 1%  +16.65%  (p=0.008 n=5+5)
WordsEncode1e4-8   277MB/s ± 1%   316MB/s ± 0%  +13.93%  (p=0.008 n=5+5)
WordsEncode1e5-8   248MB/s ± 0%   269MB/s ± 0%   +8.57%  (p=0.008 n=5+5)
WordsEncode1e6-8   296MB/s ± 0%   314MB/s ± 1%   +6.08%  (p=0.008 n=5+5)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 1%     ~     (p=1.000 n=5+5)
_ZFlat0-8          748MB/s ± 0%   792MB/s ± 0%   +5.87%  (p=0.008 n=5+5)
_ZFlat1-8          406MB/s ± 0%   436MB/s ± 1%   +7.42%  (p=0.008 n=5+5)
_ZFlat2-8         16.1GB/s ± 1%  16.2GB/s ± 1%     ~     (p=0.421 n=5+5)
_ZFlat3-8          604MB/s ± 0%   632MB/s ± 1%   +4.49%  (p=0.008 n=5+5)
_ZFlat4-8         7.62GB/s ± 1%  8.00GB/s ± 0%   +5.03%  (p=0.008 n=5+5)
_ZFlat5-8          729MB/s ± 0%   768MB/s ± 0%   +5.26%  (p=0.008 n=5+5)
_ZFlat6-8          267MB/s ± 0%   282MB/s ± 1%   +5.92%  (p=0.008 n=5+5)
_ZFlat7-8          248MB/s ± 0%   264MB/s ± 1%   +6.48%  (p=0.008 n=5+5)
_ZFlat8-8          282MB/s ± 0%   298MB/s ± 0%   +5.87%  (p=0.008 n=5+5)
_ZFlat9-8          231MB/s ± 0%   247MB/s ± 0%   +6.79%  (p=0.008 n=5+5)
_ZFlat10-8         972MB/s ± 0%  1027MB/s ± 0%   +5.64%  (p=0.008 n=5+5)
_ZFlat11-8         401MB/s ± 0%   411MB/s ± 0%   +2.43%  (p=0.008 n=5+5)

The net effect of the past three commits, when compared to just before
68801229 "Write the encoder's encodeBlock in asm":
name              old speed      new speed       delta
WordsEncode1e1-8   665MB/s ± 0%    677MB/s ± 1%    +1.80%  (p=0.016 n=4+5)
WordsEncode1e2-8  85.0MB/s ± 0%  428.3MB/s ± 0%  +403.65%  (p=0.016 n=4+5)
WordsEncode1e3-8   234MB/s ± 2%    446MB/s ± 1%   +90.90%  (p=0.008 n=5+5)
WordsEncode1e4-8   233MB/s ± 0%    316MB/s ± 0%   +35.22%  (p=0.008 n=5+5)
WordsEncode1e5-8   214MB/s ± 1%    269MB/s ± 0%   +25.45%  (p=0.008 n=5+5)
WordsEncode1e6-8   258MB/s ± 0%    314MB/s ± 1%   +21.82%  (p=0.008 n=5+5)
RandomEncode-8    13.1GB/s ± 1%   14.4GB/s ± 1%   +10.31%  (p=0.008 n=5+5)
_ZFlat0-8          630MB/s ± 0%    792MB/s ± 0%   +25.71%  (p=0.016 n=4+5)
_ZFlat1-8          326MB/s ± 0%    436MB/s ± 1%   +33.89%  (p=0.016 n=4+5)
_ZFlat2-8         13.9GB/s ± 1%   16.2GB/s ± 1%   +16.27%  (p=0.008 n=5+5)
_ZFlat3-8          177MB/s ± 1%    632MB/s ± 1%  +257.58%  (p=0.008 n=5+5)
_ZFlat4-8         6.19GB/s ± 1%   8.00GB/s ± 0%   +29.32%  (p=0.008 n=5+5)
_ZFlat5-8          615MB/s ± 0%    768MB/s ± 0%   +24.91%  (p=0.008 n=5+5)
_ZFlat6-8          231MB/s ± 0%    282MB/s ± 1%   +21.95%  (p=0.008 n=5+5)
_ZFlat7-8          215MB/s ± 1%    264MB/s ± 1%   +22.83%  (p=0.008 n=5+5)
_ZFlat8-8          246MB/s ± 0%    298MB/s ± 0%   +21.46%  (p=0.008 n=5+5)
_ZFlat9-8          202MB/s ± 0%    247MB/s ± 0%   +22.17%  (p=0.008 n=5+5)
_ZFlat10-8         803MB/s ± 0%   1027MB/s ± 0%   +27.93%  (p=0.008 n=5+5)
_ZFlat11-8         351MB/s ± 0%    411MB/s ± 0%   +16.92%  (p=0.008 n=5+5)
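
Before the diff, a Go-level sketch of what the new assembly does (the
function name, signature, and driver below are invented for illustration;
the actual change is assembly-only): when the pending literal is 1 to 16
bytes, skip the emitLiteral call, write the one-byte tag directly, and
issue a fixed 16-byte copy.

package main

import "fmt"

// inputMargin mirrors the constant in encode.go.
const inputMargin = 16 - 1

// emitLiteralFast is a Go rendering of the assembly fast path; the name and
// signature are made up for this sketch. It emits src[nextEmit:s] as a
// literal, where 1 <= s-nextEmit <= 16, and relies on the inputMargin
// invariant: nextEmit+16 <= len(src), and dst has at least 17 bytes free.
func emitLiteralFast(dst, src []byte, nextEmit, s int) int {
	n := s - nextEmit

	// One-byte literal tag: for lengths 1 to 60, snappy encodes
	// uint8(length-1)<<2, with the low two bits 00 meaning "literal".
	// n <= 16 here, so the one-byte form always applies.
	dst[0] = uint8(n-1) << 2

	// Copy a fixed 16 bytes instead of n bytes. On amd64 this is one
	// unaligned 16-byte load and store (the MOVOU pair in the diff). The
	// bytes past n are scratch that later output overwrites.
	copy(dst[1:17], src[nextEmit:nextEmit+16])

	return 1 + n
}

func main() {
	src := []byte("Hello, snappy! ................") // headroom after the literal
	dst := make([]byte, 32)
	d := emitLiteralFast(dst, src, 0, 5) // emit "Hello"
	fmt.Printf("d=%d tag=%#04x lit=%q\n", d, dst[0], dst[1:d])
	// Prints: d=6 tag=0x10 lit="Hello"
}
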
diff --git a/encode.go b/encode.go
index b0ff679..8749689 100644
--- a/encode.go
+++ b/encode.go
@@ -48,8 +48,6 @@
 // can copy up to 15 bytes too much, but that's OK as subsequent iterations of
 // the encoding loop will fix up the copy overrun, and this inputMargin ensures
 // that we don't overrun the dst and src buffers.
-//
-// TODO: implement this fast path.
 const inputMargin = 16 - 1
 
 // minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
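
(The deleted TODO is what this commit implements. To restate the overrun
arithmetic in the surviving comment with concrete numbers: the s <= sLimit
bound below is paraphrased from the Go encoder's main loop and is not part
of this diff.)

package main

import "fmt"

const inputMargin = 16 - 1

func main() {
	// The Go encoder keeps its main loop at s <= sLimit, where
	// sLimit = len(src) - inputMargin.
	srcLen := 100
	sLimit := srcLen - inputMargin

	// The fast path's literal is src[nextEmit:s] with nextEmit < s, so the
	// worst case for its fixed 16-byte load is a 1-byte literal ending at
	// sLimit:
	s := sLimit
	nextEmit := s - 1
	fmt.Println(nextEmit+16 <= srcLen) // true: the 16-byte load stays in src
}
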
diff --git a/encode_amd64.s b/encode_amd64.s
index 92f0a39..a91f4ba 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -358,6 +358,13 @@
 	//
 	// A 4-byte match has been found. We'll later see etc.
 
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
 	// d += emitLiteral(dst[d:], src[nextEmit:s])
 	//
 	// Push args.
@@ -365,8 +372,6 @@
 	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
 	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
 	MOVQ R10, 24(SP)
-	MOVQ SI, AX
-	SUBQ R10, AX
 	MOVQ AX, 32(SP)
 	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
 
@@ -384,6 +389,28 @@
 
 	// Finish the "d +=" part of "d += emitLiteral(etc)".
 	ADDQ 48(SP), DI
+	JMP  inner1
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
 
 inner1:
 	// for { etc }
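
As a round-trip sanity check on the one-byte encoding emitted above (this
decoding arithmetic follows the snappy format description and is
illustrative; it is not this repository's decoder):

package main

import "fmt"

func main() {
	// The fast path emits tag = uint8(len(lit)-1)<<2. Decoding inverts it:
	// low two bits 00 mean "literal"; for tag>>2 < 60 the upper six bits
	// hold the length minus one. len(lit) <= 16 keeps us in that range,
	// with a worst case of tag 0x3c for a 16-byte literal.
	for _, n := range []int{1, 5, 16} {
		tag := uint8(n-1) << 2
		fmt.Printf("len=%2d tag=%#04x literal=%t decodedLen=%d\n",
			n, tag, tag&0x03 == 0, int(tag>>2)+1)
	}
}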