Rewrite the core of the decoder in asm.

This is an experiment. A future commit may roll back this commit if it
turns out that the complexity and inherent unsafety of asm code
outweighs the performance benefits.

The new asm code is covered by existing tests: TestDecode,
TestDecodeLengthOffset and TestDecodeGoldenInput. These tests were
checked in by previous commits, to make it clear that they pass both
before and after this new implementation. This commit is purely an
optimization; there should be no other change in behavior.
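
As a rough sketch (not part of this commit, and assuming the standard
Go toolchain), the covering tests can be run on their own with
something like:

	go test -run 'TestDecode|TestDecodeLengthOffset|TestDecodeGoldenInput'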

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     498.83       519.36       1.04x
BenchmarkWordsDecode1e2-8     445.12       691.63       1.55x
BenchmarkWordsDecode1e3-8     530.29       858.97       1.62x
BenchmarkWordsDecode1e4-8     361.08       581.86       1.61x
BenchmarkWordsDecode1e5-8     270.69       380.78       1.41x
BenchmarkWordsDecode1e6-8     290.91       403.12       1.39x
Benchmark_UFlat0-8            543.87       784.21       1.44x
Benchmark_UFlat1-8            449.84       625.49       1.39x
Benchmark_UFlat2-8            15511.96     15366.67     0.99x
Benchmark_UFlat3-8            873.92       1321.47      1.51x
Benchmark_UFlat4-8            2978.58      4338.83      1.46x
Benchmark_UFlat5-8            536.04       770.24       1.44x
Benchmark_UFlat6-8            278.33       386.10       1.39x
Benchmark_UFlat7-8            271.63       376.79       1.39x
Benchmark_UFlat8-8            288.86       400.47       1.39x
Benchmark_UFlat9-8            262.13       362.89       1.38x
Benchmark_UFlat10-8           640.03       943.89       1.47x
Benchmark_UFlat11-8           356.37       493.98       1.39x

The numbers above are pure Go vs the new asm; about a 1.4x improvement.
As a data point, the numbers below are pure Go vs pure Go with bounds
checking disabled:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     498.83       516.68       1.04x
BenchmarkWordsDecode1e2-8     445.12       495.22       1.11x
BenchmarkWordsDecode1e3-8     530.29       612.44       1.15x
BenchmarkWordsDecode1e4-8     361.08       374.12       1.04x
BenchmarkWordsDecode1e5-8     270.69       300.66       1.11x
BenchmarkWordsDecode1e6-8     290.91       325.22       1.12x
Benchmark_UFlat0-8            543.87       655.85       1.21x
Benchmark_UFlat1-8            449.84       516.04       1.15x
Benchmark_UFlat2-8            15511.96     15291.13     0.99x
Benchmark_UFlat3-8            873.92       1063.07      1.22x
Benchmark_UFlat4-8            2978.58      3615.30      1.21x
Benchmark_UFlat5-8            536.04       639.51       1.19x
Benchmark_UFlat6-8            278.33       309.44       1.11x
Benchmark_UFlat7-8            271.63       301.89       1.11x
Benchmark_UFlat8-8            288.86       322.38       1.12x
Benchmark_UFlat9-8            262.13       289.92       1.11x
Benchmark_UFlat10-8           640.03       787.34       1.23x
Benchmark_UFlat11-8           356.37       403.35       1.13x

In other words, eliminating bounds checking gets you about a 1.15x
improvement. All the other benefits of hand-written asm get you another
1.2x over and above that.
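
As a sketch of how such numbers can be gathered (the exact invocations
are an assumption, not recorded in this commit): the asm-vs-Go
comparison comes from running the benchmarks before and after this
change, and the bounds-check-disabled numbers from passing the
compiler's -B flag, e.g.:

	go test -bench=.              # regular build
	go test -gcflags=-B -bench=.  # compile with bounds checking disabled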

For reference, I've copy/pasted the "go tool compile -S -B -o /dev/null
main.go" output at http://play.golang.org/p/vOs4Z7Qf1X
diff --git a/decode.go b/decode.go
index d4bb615..7be590c 100644
--- a/decode.go
+++ b/decode.go
@@ -43,6 +43,12 @@
 	return int(v), n, nil
 }
 
+const (
+	decodeErrCodeCorrupt                  = 1
+	decodeErrCodeUnsupportedLiteralLength = 2
+	decodeErrCodeUnsupportedCopy4Tag      = 3
+)
+
 // Decode returns the decoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire decoded block.
 // Otherwise, a newly allocated slice will be returned.
@@ -58,88 +64,15 @@
 	} else {
 		dst = make([]byte, dLen)
 	}
-
-	var d, offset, length int
-	for s < len(src) {
-		switch src[s] & 0x03 {
-		case tagLiteral:
-			x := uint32(src[s] >> 2)
-			switch {
-			case x < 60:
-				s++
-			case x == 60:
-				s += 2
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return nil, ErrCorrupt
-				}
-				x = uint32(src[s-1])
-			case x == 61:
-				s += 3
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return nil, ErrCorrupt
-				}
-				x = uint32(src[s-2]) | uint32(src[s-1])<<8
-			case x == 62:
-				s += 4
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return nil, ErrCorrupt
-				}
-				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-			case x == 63:
-				s += 5
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return nil, ErrCorrupt
-				}
-				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-			}
-			length = int(x) + 1
-			if length <= 0 {
-				return nil, errUnsupportedLiteralLength
-			}
-			if length > len(dst)-d || length > len(src)-s {
-				return nil, ErrCorrupt
-			}
-			copy(dst[d:], src[s:s+length])
-			d += length
-			s += length
-			continue
-
-		case tagCopy1:
-			s += 2
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return nil, ErrCorrupt
-			}
-			length = 4 + int(src[s-2])>>2&0x7
-			offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
-
-		case tagCopy2:
-			s += 3
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return nil, ErrCorrupt
-			}
-			length = 1 + int(src[s-3])>>2
-			offset = int(src[s-2]) | int(src[s-1])<<8
-
-		case tagCopy4:
-			return nil, errUnsupportedCopy4Tag
-		}
-
-		if offset <= 0 || d < offset || length > len(dst)-d {
-			return nil, ErrCorrupt
-		}
-		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
-		// the built-in copy function, this byte-by-byte copy always runs
-		// forwards, even if the slices overlap. Conceptually, this is:
-		//
-		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
-		for end := d + length; d != end; d++ {
-			dst[d] = dst[d-offset]
-		}
+	switch decode(dst, src[s:]) {
+	case 0:
+		return dst, nil
+	case decodeErrCodeUnsupportedLiteralLength:
+		return nil, errUnsupportedLiteralLength
+	case decodeErrCodeUnsupportedCopy4Tag:
+		return nil, errUnsupportedCopy4Tag
 	}
-	if d != dLen {
-		return nil, ErrCorrupt
-	}
-	return dst[:d], nil
+	return nil, ErrCorrupt
 }
 
 // NewReader returns a new Reader that decompresses from r, using the framing
diff --git a/decode_amd64.go b/decode_amd64.go
new file mode 100644
index 0000000..32bce47
--- /dev/null
+++ b/decode_amd64.go
@@ -0,0 +1,10 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+// decode has the same semantics as in decode_other.go.
+//
+//go:noescape
+func decode(dst, src []byte) int
diff --git a/decode_amd64.s b/decode_amd64.s
new file mode 100644
index 0000000..2e6ac59
--- /dev/null
+++ b/decode_amd64.s
@@ -0,0 +1,308 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func decode(dst, src []byte) int
+//
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- AX	scratch
+//	- BX	scratch
+//	- CX	length or x
+//	- DX	offset
+//	- SI	&src[s]
+//	- DI	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	unused
+//	- R15	used by doCopy
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
+// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
+TEXT ·decode(SB), NOSPLIT, $48-56
+	// Initialize SI, DI and R8-R13.
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, DI
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, SI
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+loop:
+	// for s < len(src)
+	CMPQ SI, R13
+	JEQ  end
+
+	// CX = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBLZX (SI), CX
+	MOVL    CX, BX
+	ANDL    $3, BX
+	CMPL    BX, $1
+	JAE     tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	SHRL $2, CX
+	CMPL CX, $60
+	JAE  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	INCQ SI
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that CX == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// CX can hold 64 bits, so the increment cannot overflow.
+	INCQ CX
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// AX = len(dst) - d
+	// BX = len(src) - s
+	MOVQ R10, AX
+	SUBQ DI, AX
+	MOVQ R13, BX
+	SUBQ SI, BX
+
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMPQ CX, AX
+	JGT  errCorrupt
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADDQ CX, SI
+	SUBQ $58, SI
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// case x == 60:
+	CMPL CX, $61
+	JEQ  tagLit61
+	JA   tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBLZX -1(SI), CX
+	JMP     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVWLZX -2(SI), CX
+	JMP     doLit
+
+tagLit62Plus:
+	CMPL CX, $62
+	JA   tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVWLZX -3(SI), CX
+	MOVBLZX -1(SI), BX
+	SHLL    $16, BX
+	ORL     BX, CX
+	JMP     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVL -4(SI), CX
+	JMP  doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADDQ $3, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(src[s-2]) | int(src[s-1])<<8
+	MOVWQZX -2(SI), DX
+	JMP     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- BX == src[s] & 0x03
+	//	- CX == src[s]
+	CMPQ BX, $2
+	JEQ  tagCopy2
+	JA   errUC4T
+
+	// case tagCopy1:
+	// s += 2
+	ADDQ $2, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
+	MOVQ    CX, DX
+	ANDQ    $0xe0, DX
+	SHLQ    $3, DX
+	MOVBQZX -1(SI), BX
+	ORQ     BX, DX
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	SHRQ $2, CX
+	ANDQ $7, CX
+	ADDQ $4, CX
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- CX == length && CX > 0
+	//	- DX == offset
+
+	// if offset <= 0 { etc }
+	CMPQ DX, $0
+	JLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVQ DI, BX
+	SUBQ R8, BX
+	CMPQ BX, DX
+	JLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVQ R10, BX
+	SUBQ DI, BX
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R15 = &dst[d-offset]
+	MOVQ DI, R15
+	SUBQ DX, R15
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), BX
+	MOVB BX, (DI)
+	INCQ R15
+	INCQ DI
+	DECQ CX
+	JNZ  verySlowForwardCopy
+	JMP  loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMPQ DI, R10
+	JNE  errCorrupt
+
+	// return 0
+	MOVQ $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVQ $1, ret+48(FP)
+	RET
+
+errUC4T:
+	// return decodeErrCodeUnsupportedCopy4Tag
+	MOVQ $3, ret+48(FP)
+	RET
diff --git a/decode_other.go b/decode_other.go
new file mode 100644
index 0000000..1a8114a
--- /dev/null
+++ b/decode_other.go
@@ -0,0 +1,96 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package snappy
+
+// decode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func decode(dst, src []byte) int {
+	var d, s, offset, length int
+	for s < len(src) {
+		switch src[s] & 0x03 {
+		case tagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			length = int(x) + 1
+			if length <= 0 {
+				return decodeErrCodeUnsupportedLiteralLength
+			}
+			if length > len(dst)-d || length > len(src)-s {
+				return decodeErrCodeCorrupt
+			}
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 4 + int(src[s-2])>>2&0x7
+			offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
+
+		case tagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-3])>>2
+			offset = int(src[s-2]) | int(src[s-1])<<8
+
+		case tagCopy4:
+			return decodeErrCodeUnsupportedCopy4Tag
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
+		// the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		for end := d + length; d != end; d++ {
+			dst[d] = dst[d-offset]
+		}
+	}
+	if d != len(dst) {
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}