Optimize asm for decoding copy fragments some more.

Relative to the previous commit:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     518.80       508.74       0.98x
BenchmarkWordsDecode1e2-8     871.43       962.52       1.10x
BenchmarkWordsDecode1e3-8     1411.32      1435.51      1.02x
BenchmarkWordsDecode1e4-8     1469.60      1514.02      1.03x
BenchmarkWordsDecode1e5-8     771.07       807.73       1.05x
BenchmarkWordsDecode1e6-8     872.19       892.24       1.02x
Benchmark_UFlat0-8            1129.79      2200.22      1.95x
Benchmark_UFlat1-8            1075.37      1446.09      1.34x
Benchmark_UFlat2-8            15617.45     14706.88     0.94x
Benchmark_UFlat3-8            1438.15      1787.82      1.24x
Benchmark_UFlat4-8            4838.37      10683.24     2.21x
Benchmark_UFlat5-8            1075.46      1965.33      1.83x
Benchmark_UFlat6-8            811.70       833.52       1.03x
Benchmark_UFlat7-8            781.87       792.85       1.01x
Benchmark_UFlat8-8            819.38       854.75       1.04x
Benchmark_UFlat9-8            724.43       730.21       1.01x
Benchmark_UFlat10-8           1193.70      2775.98      2.33x
Benchmark_UFlat11-8           879.15       1037.94      1.18x

As with other recent commits, the new asm code is covered by existing
tests: TestDecode, TestDecodeLengthOffset and TestDecodeGoldenInput.
There is also a new test, TestSlowForwardCopyOverrun, for the
slowForwardCopy algorithm.

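For reference, here is the slowForwardCopy technique written out as plain
Go, mirroring the pseudocode comments in the new assembly. This is a
sketch only: the function name and signature are illustrative and are not
part of the package.

	// slowForwardCopy8 expands a short repeating pattern and then copies
	// 8 bytes at a time, possibly writing up to 10 bytes past d+length;
	// later iterations of the decode loop overwrite any such overrun. It
	// assumes 0 < offset <= d and that dst has at least 10 bytes of
	// slack beyond d+length.
	func slowForwardCopy8(dst []byte, d, offset, length int) int {
		// makeOffsetAtLeast8: double the pattern until offset >= 8,
		// so that the 8-byte copies below repeat it correctly.
		for offset < 8 {
			copy(dst[d:d+8], dst[d-offset:d-offset+8])
			length -= offset
			d += offset
			offset += offset // d-offset is unchanged
		}
		// fixUpSlowForwardCopy: remember where the remaining bytes
		// go and compute the final d (length may now be negative).
		a := d
		d += length
		// finishSlowForwardCopy: 8-byte copies until length <= 0.
		for length > 0 {
			copy(dst[a:a+8], dst[a-offset:a-offset+8])
			a += 8
			length -= 8
		}
		return d
	}

The new TestSlowForwardCopyOverrun models these same three steps, but only
records the high-water mark of the writes rather than performing them.
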
As a data point, the "new MB/s" numbers are now in the same ballpark as
the benchmark numbers that I get from the C++ snappy implementation on
the same machine:

BM_UFlat/0   2.4GB/s    html
BM_UFlat/1   1.4GB/s    urls
BM_UFlat/2   21.1GB/s   jpg
BM_UFlat/3   1.5GB/s    jpg_200
BM_UFlat/4   10.2GB/s   pdf
BM_UFlat/5   2.1GB/s    html4
BM_UFlat/6   990.6MB/s  txt1
BM_UFlat/7   930.1MB/s  txt2
BM_UFlat/8   1.0GB/s    txt3
BM_UFlat/9   849.7MB/s  txt4
BM_UFlat/10  2.9GB/s    pb
BM_UFlat/11  1.2GB/s    gaviota

As another data point, here is the amd64 asm code as of this commit
compared to the most recent pure Go implementation, revision 03ee571c:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     498.83       508.74       1.02x
BenchmarkWordsDecode1e2-8     445.12       962.52       2.16x
BenchmarkWordsDecode1e3-8     530.29       1435.51      2.71x
BenchmarkWordsDecode1e4-8     361.08       1514.02      4.19x
BenchmarkWordsDecode1e5-8     270.69       807.73       2.98x
BenchmarkWordsDecode1e6-8     290.91       892.24       3.07x
Benchmark_UFlat0-8            543.87       2200.22      4.05x
Benchmark_UFlat1-8            449.84       1446.09      3.21x
Benchmark_UFlat2-8            15511.96     14706.88     0.95x
Benchmark_UFlat3-8            873.92       1787.82      2.05x
Benchmark_UFlat4-8            2978.58      10683.24     3.59x
Benchmark_UFlat5-8            536.04       1965.33      3.67x
Benchmark_UFlat6-8            278.33       833.52       2.99x
Benchmark_UFlat7-8            271.63       792.85       2.92x
Benchmark_UFlat8-8            288.86       854.75       2.96x
Benchmark_UFlat9-8            262.13       730.21       2.79x
Benchmark_UFlat10-8           640.03       2775.98      4.34x
Benchmark_UFlat11-8           356.37       1037.94      2.91x

The UFlat2 case decodes a JPEG file. JPEG is already a compressed binary
format, so Snappy gets very little extra compression out of it, and
decompression collapses to not much more than repeatedly invoking
runtime.memmove. Optimizing the snappy code per se therefore doesn't have
a huge impact on that particular benchmark number.
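In pure Go terms, the per-fragment work for such a stream is essentially
one bulk literal copy, something like the sketch below (simplified; bounds
checks and tag parsing are omitted, and the helper name is illustrative):

	// copyLiteral copies one literal fragment of the given length and
	// returns the advanced positions. The bulk copy is what the Go
	// runtime implements as memmove.
	func copyLiteral(dst, src []byte, d, s, length int) (newD, newS int) {
		n := copy(dst[d:], src[s:s+length])
		return d + n, s + n
	}
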
diff --git a/decode_amd64.s b/decode_amd64.s
index c38bd68..1486aba 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -318,11 +318,11 @@
 	// copy 16 bytes
 	// d += length
 	CMPQ CX, $16
-	JGT  verySlowForwardCopy
+	JGT  slowForwardCopy
 	CMPQ DX, $8
-	JLT  verySlowForwardCopy
+	JLT  slowForwardCopy
 	CMPQ R14, $16
-	JLT  verySlowForwardCopy
+	JLT  slowForwardCopy
 	MOVQ 0(R15), AX
 	MOVQ AX, 0(DI)
 	MOVQ 8(R15), BX
@@ -330,6 +330,102 @@
 	ADDQ CX, DI
 	JMP  loop
 
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy steps below.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R14
+	CMPQ CX, R14
+	JGT  verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together mean that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMPQ DX, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R15), BX
+	MOVQ BX, (DI)
+	SUBQ DX, CX
+	ADDQ DX, DI
+	ADDQ DX, DX
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by DI being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ DI, AX
+	ADDQ CX, DI
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ CX, $0
+	JLE  loop
+	MOVQ (R15), BX
+	MOVQ BX, (AX)
+	ADDQ $8, R15
+	ADDQ $8, AX
+	SUBQ $8, CX
+	JMP  finishSlowForwardCopy
+
 verySlowForwardCopy:
 	// verySlowForwardCopy is a simple implementation of forward copy. In C
 	// parlance, this is a do/while loop instead of a while loop, since we know
diff --git a/snappy_test.go b/snappy_test.go
index cf3879e..65c0013 100644
--- a/snappy_test.go
+++ b/snappy_test.go
@@ -450,6 +450,51 @@
 	}
 }
 
+// TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
+// described in decode_amd64.s and its claim of a 10 byte overrun worst case.
+func TestSlowForwardCopyOverrun(t *testing.T) {
+	const base = 100
+
+	for length := 1; length < 18; length++ {
+		for offset := 1; offset < 18; offset++ {
+			highWaterMark := base
+			d := base
+			l := length
+			o := offset
+
+			// makeOffsetAtLeast8
+			for o < 8 {
+				if end := d + 8; highWaterMark < end {
+					highWaterMark = end
+				}
+				l -= o
+				d += o
+				o += o
+			}
+
+			// fixUpSlowForwardCopy
+			a := d
+			d += l
+
+			// finishSlowForwardCopy
+			for l > 0 {
+				if end := a + 8; highWaterMark < end {
+					highWaterMark = end
+				}
+				a += 8
+				l -= 8
+			}
+
+			dWant := base + length
+			overrun := highWaterMark - dWant
+			if d != dWant || overrun < 0 || 10 < overrun {
+				t.Errorf("length=%d, offset=%d: d and overrun: got (%d, %d), want (%d, something in [0, 10])",
+					length, offset, d, overrun, dWant)
+			}
+		}
+	}
+}
+
 // TestEncodeNoiseThenRepeats encodes input for which the first half is very
 // incompressible and the second half is very compressible. The encoded form's
 // length should be closer to 50% of the original length than 100%.