Optimize asm for decoding copy fragments some more.

Relative to the previous commit:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     518.80       508.74       0.98x
BenchmarkWordsDecode1e2-8     871.43       962.52       1.10x
BenchmarkWordsDecode1e3-8     1411.32      1435.51      1.02x
BenchmarkWordsDecode1e4-8     1469.60      1514.02      1.03x
BenchmarkWordsDecode1e5-8     771.07       807.73       1.05x
BenchmarkWordsDecode1e6-8     872.19       892.24       1.02x
Benchmark_UFlat0-8            1129.79      2200.22      1.95x
Benchmark_UFlat1-8            1075.37      1446.09      1.34x
Benchmark_UFlat2-8            15617.45     14706.88     0.94x
Benchmark_UFlat3-8            1438.15      1787.82      1.24x
Benchmark_UFlat4-8            4838.37      10683.24     2.21x
Benchmark_UFlat5-8            1075.46      1965.33      1.83x
Benchmark_UFlat6-8            811.70       833.52       1.03x
Benchmark_UFlat7-8            781.87       792.85       1.01x
Benchmark_UFlat8-8            819.38       854.75       1.04x
Benchmark_UFlat9-8            724.43       730.21       1.01x
Benchmark_UFlat10-8           1193.70      2775.98      2.33x
Benchmark_UFlat11-8           879.15       1037.94      1.18x

As with other recent commits, the new asm code is covered by existing
tests: TestDecode, TestDecodeLengthOffset and TestDecodeGoldenInput.
There is also a new test, TestSlowForwardCopyOverrun, for the
slowForwardCopy algorithm.

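For reference, here is the slowForwardCopy technique written out as plain
Go, mirroring the pseudocode comments in the new assembly. This is a
sketch only: the function name and signature are illustrative and are not
part of the package.

	// slowForwardCopy8 expands a short repeating pattern and then copies
	// 8 bytes at a time, possibly writing up to 10 bytes past d+length;
	// later iterations of the decode loop overwrite any such overrun. It
	// assumes 0 < offset <= d and that dst has at least 10 bytes of
	// slack beyond d+length.
	func slowForwardCopy8(dst []byte, d, offset, length int) int {
		// makeOffsetAtLeast8: double the pattern until offset >= 8,
		// so that the 8-byte copies below repeat it correctly.
		for offset < 8 {
			copy(dst[d:d+8], dst[d-offset:d-offset+8])
			length -= offset
			d += offset
			offset += offset // d-offset is unchanged
		}
		// fixUpSlowForwardCopy: remember where the remaining bytes
		// go and compute the final d (length may now be negative).
		a := d
		d += length
		// finishSlowForwardCopy: 8-byte copies until length <= 0.
		for length > 0 {
			copy(dst[a:a+8], dst[a-offset:a-offset+8])
			a += 8
			length -= 8
		}
		return d
	}

The new TestSlowForwardCopyOverrun models these same three steps, but only
records the high-water mark of the writes rather than performing them.
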
As a data point, the "new MB/s" numbers are now in the same ballpark as
the benchmark numbers that I get from the C++ snappy implementation on
the same machine:

BM_UFlat/0   2.4GB/s    html
BM_UFlat/1   1.4GB/s    urls
BM_UFlat/2   21.1GB/s   jpg
BM_UFlat/3   1.5GB/s    jpg_200
BM_UFlat/4   10.2GB/s   pdf
BM_UFlat/5   2.1GB/s    html4
BM_UFlat/6   990.6MB/s  txt1
BM_UFlat/7   930.1MB/s  txt2
BM_UFlat/8   1.0GB/s    txt3
BM_UFlat/9   849.7MB/s  txt4
BM_UFlat/10  2.9GB/s    pb
BM_UFlat/11  1.2GB/s    gaviota

As another data point, here is the amd64 asm code as of this commit
compared to the most recent pure Go implementation, revision 03ee571c:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsDecode1e1-8     498.83       508.74       1.02x
BenchmarkWordsDecode1e2-8     445.12       962.52       2.16x
BenchmarkWordsDecode1e3-8     530.29       1435.51      2.71x
BenchmarkWordsDecode1e4-8     361.08       1514.02      4.19x
BenchmarkWordsDecode1e5-8     270.69       807.73       2.98x
BenchmarkWordsDecode1e6-8     290.91       892.24       3.07x
Benchmark_UFlat0-8            543.87       2200.22      4.05x
Benchmark_UFlat1-8            449.84       1446.09      3.21x
Benchmark_UFlat2-8            15511.96     14706.88     0.95x
Benchmark_UFlat3-8            873.92       1787.82      2.05x
Benchmark_UFlat4-8            2978.58      10683.24     3.59x
Benchmark_UFlat5-8            536.04       1965.33      3.67x
Benchmark_UFlat6-8            278.33       833.52       2.99x
Benchmark_UFlat7-8            271.63       792.85       2.92x
Benchmark_UFlat8-8            288.86       854.75       2.96x
Benchmark_UFlat9-8            262.13       730.21       2.79x
Benchmark_UFlat10-8           640.03       2775.98      4.34x
Benchmark_UFlat11-8           356.37       1037.94      2.91x

The UFlat2 case decodes a JPEG file. JPEG is already a compressed binary
format, so Snappy gets very little extra compression out of it, and
decompression collapses to not much more than repeatedly invoking
runtime.memmove. Optimizing the snappy code per se therefore doesn't have
a huge impact on that particular benchmark number.
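In pure Go terms, the per-fragment work for such a stream is essentially
one bulk literal copy, something like the sketch below (simplified; bounds
checks and tag parsing are omitted, and the helper name is illustrative):

	// copyLiteral copies one literal fragment of the given length and
	// returns the advanced positions. The bulk copy is what the Go
	// runtime implements as memmove.
	func copyLiteral(dst, src []byte, d, s, length int) (newD, newS int) {
		n := copy(dst[d:], src[s:s+length])
		return d + n, s + n
	}
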
diff --git a/decode_amd64.s b/decode_amd64.s
index c38bd68..1486aba 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -318,11 +318,11 @@
 	// copy 16 bytes
 	// d += length
 	CMPQ CX, $16
-	JGT  verySlowForwardCopy
+	JGT  slowForwardCopy
 	CMPQ DX, $8
-	JLT  verySlowForwardCopy
+	JLT  slowForwardCopy
 	CMPQ R14, $16
-	JLT  verySlowForwardCopy
+	JLT  slowForwardCopy
 	MOVQ 0(R15), AX
 	MOVQ AX, 0(DI)
 	MOVQ 8(R15), BX
@@ -330,6 +330,102 @@
 	ADDQ CX, DI
 	JMP  loop
 
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy steps below.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R14
+	CMPQ CX, R14
+	JGT  verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together mean that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMPQ DX, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R15), BX
+	MOVQ BX, (DI)
+	SUBQ DX, CX
+	ADDQ DX, DI
+	ADDQ DX, DX
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by DI being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ DI, AX
+	ADDQ CX, DI
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ CX, $0
+	JLE  loop
+	MOVQ (R15), BX
+	MOVQ BX, (AX)
+	ADDQ $8, R15
+	ADDQ $8, AX
+	SUBQ $8, CX
+	JMP  finishSlowForwardCopy
+
 verySlowForwardCopy:
 	// verySlowForwardCopy is a simple implementation of forward copy. In C
 	// parlance, this is a do/while loop instead of a while loop, since we know
diff --git a/snappy_test.go b/snappy_test.go
index cf3879e..65c0013 100644
--- a/snappy_test.go
+++ b/snappy_test.go
@@ -450,6 +450,51 @@
 	}
 }
 
+// TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
+// described in decode_amd64.s and its claim of a 10 byte overrun worst case.
+func TestSlowForwardCopyOverrun(t *testing.T) {
+	const base = 100
+
+	for length := 1; length < 18; length++ {
+		for offset := 1; offset < 18; offset++ {
+			highWaterMark := base
+			d := base
+			l := length
+			o := offset
+
+			// makeOffsetAtLeast8
+			for o < 8 {
+				if end := d + 8; highWaterMark < end {
+					highWaterMark = end
+				}
+				l -= o
+				d += o
+				o += o
+			}
+
+			// fixUpSlowForwardCopy
+			a := d
+			d += l
+
+			// finishSlowForwardCopy
+			for l > 0 {
+				if end := a + 8; highWaterMark < end {
+					highWaterMark = end
+				}
+				a += 8
+				l -= 8
+			}
+
+			dWant := base + length
+			overrun := highWaterMark - dWant
+			if d != dWant || overrun < 0 || 10 < overrun {
+				t.Errorf("length=%d, offset=%d: d and overrun: got (%d, %d), want (%d, something in [0, 10])",
+					length, offset, d, overrun, dWant)
+			}
+		}
+	}
+}
+
 // TestEncodeNoiseThenRepeats encodes input for which the first half is very
 // incompressible and the second half is very compressible. The encoded form's
 // length should be closer to 50% of the original length than 100%.