Write the encoder's emitLiteral in asm.

Benchmark throughput before (old) and after (new) this change:

name old speed new speed delta
WordsEncode1e1-8 665MB/s ± 0% 667MB/s ± 0% +0.35% (p=0.008 n=5+5)
WordsEncode1e2-8 83.8MB/s ± 1% 85.1MB/s ± 0% +1.47% (p=0.008 n=5+5)
WordsEncode1e3-8 231MB/s ± 1% 235MB/s ± 0% +1.81% (p=0.008 n=5+5)
WordsEncode1e4-8 232MB/s ± 1% 234MB/s ± 0% +0.78% (p=0.016 n=5+5)
WordsEncode1e5-8 212MB/s ± 1% 216MB/s ± 0% +1.55% (p=0.008 n=5+5)
WordsEncode1e6-8 257MB/s ± 0% 258MB/s ± 0% +0.68% (p=0.008 n=5+5)
RandomEncode-8 13.2GB/s ± 1% 13.2GB/s ± 1% ~ (p=0.548 n=5+5)
_ZFlat0-8 629MB/s ± 0% 629MB/s ± 0% ~ (p=0.690 n=5+5)
_ZFlat1-8 324MB/s ± 0% 325MB/s ± 0% ~ (p=0.222 n=5+5)
_ZFlat2-8 13.9GB/s ± 1% 13.7GB/s ± 5% ~ (p=0.310 n=5+5)
_ZFlat3-8 176MB/s ± 1% 177MB/s ± 0% ~ (p=0.548 n=5+5)
_ZFlat4-8 6.12GB/s ± 0% 6.15GB/s ± 2% ~ (p=0.151 n=5+5)
_ZFlat5-8 614MB/s ± 0% 614MB/s ± 0% ~ (p=0.548 n=5+5)
_ZFlat6-8 230MB/s ± 0% 231MB/s ± 2% ~ (p=0.151 n=5+5)
_ZFlat7-8 214MB/s ± 0% 215MB/s ± 2% ~ (p=0.151 n=5+5)
_ZFlat8-8 244MB/s ± 0% 246MB/s ± 0% +0.71% (p=0.016 n=5+4)
_ZFlat9-8 200MB/s ± 0% 202MB/s ± 0% +0.95% (p=0.016 n=5+4)
_ZFlat10-8 797MB/s ± 0% 794MB/s ± 2% ~ (p=1.000 n=5+5)
_ZFlat11-8 351MB/s ± 1% 350MB/s ± 0% ~ (p=0.730 n=5+4)
diff --git a/encode.go b/encode.go
index 0ba5512..25c64c2 100644
--- a/encode.go
+++ b/encode.go
@@ -21,30 +21,6 @@
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-//
-// It assumes that:
-// dst is long enough to hold the encoded bytes
-// 1 <= len(lit) && len(lit) <= 65536
-func emitLiteral(dst, lit []byte) int {
- i, n := 0, uint(len(lit)-1)
- switch {
- case n < 60:
- dst[0] = uint8(n)<<2 | tagLiteral
- i = 1
- case n < 1<<8:
- dst[0] = 60<<2 | tagLiteral
- dst[1] = uint8(n)
- i = 2
- default:
- dst[0] = 61<<2 | tagLiteral
- dst[1] = uint8(n)
- dst[2] = uint8(n >> 8)
- i = 3
- }
- return i + copy(dst[i:], lit)
-}
-
// Encode returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
diff --git a/encode_amd64.go b/encode_amd64.go
index 4423b20..60fb084 100644
--- a/encode_amd64.go
+++ b/encode_amd64.go
@@ -8,6 +8,11 @@
package snappy
+// emitLiteral is implemented in assembly (encode_amd64.s) and has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
// emitCopy has the same semantics as in encode_other.go.
//
//go:noescape
diff --git a/encode_amd64.s b/encode_amd64.s
index 1b727ff..91cf939 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -13,6 +13,64 @@
// ----------------------------------------------------------------------------
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+// - AX return value (header length plus len(lit))
+// - BX n, which is len(lit) - 1
+// - CX len(lit)
+// - SI &lit[0]
+// - DI &dst[i]
+//
+// The 24 bytes of stack space is to call runtime·memmove (three 8-byte arguments).
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+ MOVQ dst_base+0(FP), DI
+ MOVQ lit_base+24(FP), SI
+ MOVQ lit_len+32(FP), CX
+ MOVQ CX, AX // AX starts at len(lit); the header length is added below.
+ MOVL CX, BX
+ SUBL $1, BX // BX = n = len(lit) - 1.
+
+ CMPL BX, $60
+ JLT oneByte // n < 60: n is stored in the tag byte itself.
+ CMPL BX, $256
+ JLT twoBytes // n < 1<<8: tag byte plus one byte holding n.
+
+threeBytes:
+ MOVB $0xf4, 0(DI) // 0xf4 equals 61<<2, matching 61<<2 | tagLiteral in the Go version.
+ MOVW BX, 1(DI) // Low 16 bits of n; amd64 stores the least significant byte first.
+ ADDQ $3, DI // DI = &dst[3], where the literal bytes go.
+ ADDQ $3, AX // Count the 3-byte header in the return value.
+ JMP end
+
+twoBytes:
+ MOVB $0xf0, 0(DI) // 0xf0 equals 60<<2, matching 60<<2 | tagLiteral in the Go version.
+ MOVB BX, 1(DI) // n fits in one byte here.
+ ADDQ $2, DI // DI = &dst[2].
+ ADDQ $2, AX // Count the 2-byte header in the return value.
+ JMP end
+
+oneByte:
+ SHLB $2, BX // n<<2; no OR with tagLiteral is done (presumably tagLiteral is 0 - confirm).
+ MOVB BX, 0(DI)
+ ADDQ $1, DI // DI = &dst[1].
+ ADDQ $1, AX // Count the 1-byte header in the return value.
+
+end:
+ MOVQ AX, ret+48(FP) // The result is stored before memmove is called.
+
+ // copy(dst[i:], lit)
+ //
+ // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+ // DI, SI and CX as arguments.
+ MOVQ DI, 0(SP)
+ MOVQ SI, 8(SP)
+ MOVQ CX, 16(SP)
+ CALL runtime·memmove(SB)
+ RET
+
+// ----------------------------------------------------------------------------
+
// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
diff --git a/encode_other.go b/encode_other.go
index 2d28510..6ed2eb9 100644
--- a/encode_other.go
+++ b/encode_other.go
@@ -6,6 +6,30 @@
package snappy
+// emitLiteral writes a literal chunk (a 1-3 byte header followed by lit) to dst and returns the number of bytes written.
+//
+// It assumes that:
+// dst is long enough to hold the encoded bytes
+// 1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+ i, n := 0, uint(len(lit)-1) // i is the header length; n is the encoded length, len(lit)-1.
+ switch {
+ case n < 60: // n is packed into the tag byte itself.
+ dst[0] = uint8(n)<<2 | tagLiteral
+ i = 1
+ case n < 1<<8: // Tag value 60: one extra byte holds n.
+ dst[0] = 60<<2 | tagLiteral
+ dst[1] = uint8(n)
+ i = 2
+ default: // Tag value 61: two extra bytes hold n, low byte first.
+ dst[0] = 61<<2 | tagLiteral
+ dst[1] = uint8(n)
+ dst[2] = uint8(n >> 8)
+ i = 3
+ }
+ return i + copy(dst[i:], lit) // Header bytes plus the literal bytes copied after them.
+}
+
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that: