Write the encoder's emitLiteral in asm.

name              old speed      new speed      delta
WordsEncode1e1-8   665MB/s ± 0%   667MB/s ± 0%  +0.35%  (p=0.008 n=5+5)
WordsEncode1e2-8  83.8MB/s ± 1%  85.1MB/s ± 0%  +1.47%  (p=0.008 n=5+5)
WordsEncode1e3-8   231MB/s ± 1%   235MB/s ± 0%  +1.81%  (p=0.008 n=5+5)
WordsEncode1e4-8   232MB/s ± 1%   234MB/s ± 0%  +0.78%  (p=0.016 n=5+5)
WordsEncode1e5-8   212MB/s ± 1%   216MB/s ± 0%  +1.55%  (p=0.008 n=5+5)
WordsEncode1e6-8   257MB/s ± 0%   258MB/s ± 0%  +0.68%  (p=0.008 n=5+5)
RandomEncode-8    13.2GB/s ± 1%  13.2GB/s ± 1%    ~     (p=0.548 n=5+5)
_ZFlat0-8          629MB/s ± 0%   629MB/s ± 0%    ~     (p=0.690 n=5+5)
_ZFlat1-8          324MB/s ± 0%   325MB/s ± 0%    ~     (p=0.222 n=5+5)
_ZFlat2-8         13.9GB/s ± 1%  13.7GB/s ± 5%    ~     (p=0.310 n=5+5)
_ZFlat3-8          176MB/s ± 1%   177MB/s ± 0%    ~     (p=0.548 n=5+5)
_ZFlat4-8         6.12GB/s ± 0%  6.15GB/s ± 2%    ~     (p=0.151 n=5+5)
_ZFlat5-8          614MB/s ± 0%   614MB/s ± 0%    ~     (p=0.548 n=5+5)
_ZFlat6-8          230MB/s ± 0%   231MB/s ± 2%    ~     (p=0.151 n=5+5)
_ZFlat7-8          214MB/s ± 0%   215MB/s ± 2%    ~     (p=0.151 n=5+5)
_ZFlat8-8          244MB/s ± 0%   246MB/s ± 0%  +0.71%  (p=0.016 n=5+4)
_ZFlat9-8          200MB/s ± 0%   202MB/s ± 0%  +0.95%  (p=0.016 n=5+4)
_ZFlat10-8         797MB/s ± 0%   794MB/s ± 2%    ~     (p=1.000 n=5+5)
_ZFlat11-8         351MB/s ± 1%   350MB/s ± 0%    ~     (p=0.730 n=5+4)
diff --git a/encode.go b/encode.go
index 0ba5512..25c64c2 100644
--- a/encode.go
+++ b/encode.go
@@ -21,30 +21,6 @@
 		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
 }
 
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-//
-// It assumes that:
-//	dst is long enough to hold the encoded bytes
-//	1 <= len(lit) && len(lit) <= 65536
-func emitLiteral(dst, lit []byte) int {
-	i, n := 0, uint(len(lit)-1)
-	switch {
-	case n < 60:
-		dst[0] = uint8(n)<<2 | tagLiteral
-		i = 1
-	case n < 1<<8:
-		dst[0] = 60<<2 | tagLiteral
-		dst[1] = uint8(n)
-		i = 2
-	default:
-		dst[0] = 61<<2 | tagLiteral
-		dst[1] = uint8(n)
-		dst[2] = uint8(n >> 8)
-		i = 3
-	}
-	return i + copy(dst[i:], lit)
-}
-
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
diff --git a/encode_amd64.go b/encode_amd64.go
index 4423b20..60fb084 100644
--- a/encode_amd64.go
+++ b/encode_amd64.go
@@ -8,6 +8,11 @@
 
 package snappy
 
+// emitLiteral has the same semantics as in encode_other.go; its body is the assembly in encode_amd64.s.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
 // emitCopy has the same semantics as in encode_other.go.
 //
 //go:noescape
diff --git a/encode_amd64.s b/encode_amd64.s
index 1b727ff..91cf939 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -13,6 +13,64 @@
 
 // ----------------------------------------------------------------------------
 
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	return value (header size i plus len(lit))
+//	- BX	n, i.e. len(lit) - 1
+//	- CX	len(lit)
+//	- SI	&lit[0]
+//	- DI	&dst[i]
+//
+// The 24 bytes of stack space hold the three arguments to runtime·memmove.
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ lit_base+24(FP), SI
+	MOVQ lit_len+32(FP), CX
+	MOVQ CX, AX  // AX = len(lit); the header size is added per branch below.
+	MOVL CX, BX
+	SUBL $1, BX  // BX = n = len(lit) - 1.
+
+	CMPL BX, $60  // JLT is a signed compare; n >= 0 given the assumed 1 <= len(lit).
+	JLT  oneByte  // n < 60: n is packed into the tag byte.
+	CMPL BX, $256
+	JLT  twoBytes  // n < 256: n takes one byte after the tag.
+
+threeBytes:
+	MOVB $0xf4, 0(DI)  // 0xf4 == 61<<2, the tag byte for a two-byte length.
+	MOVW BX, 1(DI)  // 16-bit store of n into dst[1] and dst[2].
+	ADDQ $3, DI
+	ADDQ $3, AX
+	JMP  end
+
+twoBytes:
+	MOVB $0xf0, 0(DI)  // 0xf0 == 60<<2, the tag byte for a one-byte length.
+	MOVB BX, 1(DI)  // dst[1] = low byte of n.
+	ADDQ $2, DI
+	ADDQ $2, AX
+	JMP  end
+
+oneByte:
+	SHLB $2, BX  // Low byte of BX = n<<2.
+	MOVB BX, 0(DI)  // NOTE(review): no tagLiteral bits are ORed in; presumably tagLiteral is 0 - confirm.
+	ADDQ $1, DI
+	ADDQ $1, AX
+
+end:
+	MOVQ AX, ret+48(FP)  // Store the result before calling memmove.
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, SI and CX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
 // func emitCopy(dst []byte, offset, length int) int
 //
 // All local variables fit into registers. The register allocation:
diff --git a/encode_other.go b/encode_other.go
index 2d28510..6ed2eb9 100644
--- a/encode_other.go
+++ b/encode_other.go
@@ -6,6 +6,30 @@
 
 package snappy
 
+// emitLiteral writes a literal chunk (a 1-3 byte header, then lit) to dst and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+	i, n := 0, uint(len(lit)-1) // i is the header size; n is the literal length minus one.
+	switch {
+	case n < 60: // n is packed into the tag byte itself.
+		dst[0] = uint8(n)<<2 | tagLiteral
+		i = 1
+	case n < 1<<8: // n takes one byte after the tag.
+		dst[0] = 60<<2 | tagLiteral
+		dst[1] = uint8(n)
+		i = 2
+	default: // n takes two bytes after the tag, low byte first.
+		dst[0] = 61<<2 | tagLiteral
+		dst[1] = uint8(n)
+		dst[2] = uint8(n >> 8)
+		i = 3
+	}
+	return i + copy(dst[i:], lit) // header bytes plus the literal bytes copied.
+}
+
 // emitCopy writes a copy chunk and returns the number of bytes written.
 //
 // It assumes that: