Use 64K blocks when encoding long inputs. This enables future optimizations, such as an encoder's hash table entry being uint16 instead of int32.

commit: bf2ded9d81f5c22b62cf76363673c6f9765a6703 [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Mon Feb 22 12:44:36 2016 +1100
committer: Nigel Tao <nigeltao@golang.org> Mon Feb 22 12:44:36 2016 +1100
tree: 57d94b39b503b9a7d5359d29848052c676b2a47b
parent: d1d908a252c22fd7afd36190d5cffb144aa8f777 [diff]
diff --git a/decode.go b/decode.go
index 6c5dd66..efb1f6f 100644
--- a/decode.go
+++ b/decode.go

@@ -140,8 +140,8 @@
 func NewReader(r io.Reader) *Reader {
 	return &Reader{
 		r:       r,
-		decoded: make([]byte, maxUncompressedChunkLen),
-		buf:     make([]byte, maxEncodedLenOfMaxUncompressedChunkLen+checksumSize),
+		decoded: make([]byte, maxBlockSize),
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
 	}
 }
 

diff --git a/encode.go b/encode.go
index 89109b6..407b613 100644
--- a/encode.go
+++ b/encode.go

@@ -94,30 +94,22 @@
 	for len(src) > 0 {
 		p := src
 		src = nil
-		if len(p) > maxInternalEncodeSrcLen {
-			p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+		if len(p) > maxBlockSize {
+			p, src = p[:maxBlockSize], p[maxBlockSize:]
 		}
-		d += encode(dst[d:], p)
+		d += encodeBlock(dst[d:], p)
 	}
 	return dst[:d]
 }
 
-// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
-// (internal) encode function, it is safe to have the s variable (which indexes
-// the src slice), and therefore the hash table entries, to have type int32
-// instead of int.
-const maxInternalEncodeSrcLen = 0x40000000
-
-// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
-// that the varint-encoded length of the decompressed bytes has already been
-// written.
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
 //
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	0 < len(src) &&
-//	len(src) <= maxInternalEncodeSrcLen &&
-// 	maxInternalEncodeSrcLen < math.MaxInt32.
-func encode(dst, src []byte) (d int) {
+//	0 < len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
 	// Return early if src is short.
 	if len(src) <= 4 {
 		return emitLiteral(dst, src)
@@ -258,7 +250,7 @@
 func NewBufferedWriter(w io.Writer) *Writer {
 	return &Writer{
 		w:    w,
-		ibuf: make([]byte, 0, maxUncompressedChunkLen),
+		ibuf: make([]byte, 0, maxBlockSize),
 		obuf: make([]byte, obufLen),
 	}
 }
@@ -342,8 +334,8 @@
 		}
 
 		var uncompressed []byte
-		if len(p) > maxUncompressedChunkLen {
-			uncompressed, p = p[:maxUncompressedChunkLen], p[maxUncompressedChunkLen:]
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
 		} else {
 			uncompressed, p = p, nil
 		}

diff --git a/snappy.go b/snappy.go
index 1c2b671..0102542 100644
--- a/snappy.go
+++ b/snappy.go

@@ -46,18 +46,25 @@
 	chunkHeaderSize = 4
 	magicChunk      = "\xff\x06\x00\x00" + magicBody
 	magicBody       = "sNaPpY"
-	// https://github.com/google/snappy/blob/master/framing_format.txt says
-	// that "the uncompressed data in a chunk must be no longer than 65536 bytes".
-	maxUncompressedChunkLen = 65536
 
-	// maxEncodedLenOfMaxUncompressedChunkLen equals
-	// MaxEncodedLen(maxUncompressedChunkLen), but is hard coded to be a const
-	// instead of a variable, so that obufLen can also be a const. Their
-	// equivalence is confirmed by TestMaxEncodedLenOfMaxUncompressedChunkLen.
-	maxEncodedLenOfMaxUncompressedChunkLen = 76490
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
+	// https://github.com/google/snappy/blob/master/framing_format.txt says
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
+
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
 
 	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
-	obufLen       = obufHeaderLen + maxEncodedLenOfMaxUncompressedChunkLen
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
 )
 
 const (

diff --git a/snappy_test.go b/snappy_test.go
index 6584403..83d3ba5 100644
--- a/snappy_test.go
+++ b/snappy_test.go

@@ -23,9 +23,9 @@
 	testdata = flag.String("testdata", "testdata", "Directory containing the test data")
 )
 
-func TestMaxEncodedLenOfMaxUncompressedChunkLen(t *testing.T) {
-	got := maxEncodedLenOfMaxUncompressedChunkLen
-	want := MaxEncodedLen(maxUncompressedChunkLen)
+func TestMaxEncodedLenOfMaxBlockSize(t *testing.T) {
+	got := maxEncodedLenOfMaxBlockSize
+	want := MaxEncodedLen(maxBlockSize)
 	if got != want {
 		t.Fatalf("got %d, want %d", got, want)
 	}
@@ -237,23 +237,24 @@
 	}
 }
 
-// TestEncodeNoiseThenRepeats encodes a 32K block for which the first half is
-// very incompressible and the second half is very compressible. The encoded
-// form's length should be closer to 50% of the original length than 100%.
+// TestEncodeNoiseThenRepeats encodes input for which the first half is very
+// incompressible and the second half is very compressible. The encoded form's
+// length should be closer to 50% of the original length than 100%.
 func TestEncodeNoiseThenRepeats(t *testing.T) {
-	const origLen = 32768
-	src := make([]byte, origLen)
-	rng := rand.New(rand.NewSource(1))
-	firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
-	for i := range firstHalf {
-		firstHalf[i] = uint8(rng.Intn(256))
-	}
-	for i := range secondHalf {
-		secondHalf[i] = uint8(i >> 8)
-	}
-	dst := Encode(nil, src)
-	if got, want := len(dst), origLen*3/4; got >= want {
-		t.Fatalf("got %d encoded bytes, want less than %d", got, want)
+	for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
+		src := make([]byte, origLen)
+		rng := rand.New(rand.NewSource(1))
+		firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
+		for i := range firstHalf {
+			firstHalf[i] = uint8(rng.Intn(256))
+		}
+		for i := range secondHalf {
+			secondHalf[i] = uint8(i >> 8)
+		}
+		dst := Encode(nil, src)
+		if got, want := len(dst), origLen*3/4; got >= want {
+			t.Errorf("origLen=%d: got %d encoded bytes, want less than %d", origLen, got, want)
+		}
 	}
 }
 
@@ -272,7 +273,7 @@
 func TestFramingFormat(t *testing.T) {
 	// src is comprised of alternating 1e5-sized sequences of random
 	// (incompressible) bytes and repeated (compressible) bytes. 1e5 was chosen
-	// because it is larger than maxUncompressedChunkLen (64k).
+	// because it is larger than maxBlockSize (64k).
 	src := make([]byte, 1e6)
 	rng := rand.New(rand.NewSource(1))
 	for i := 0; i < 10; i++ {
@@ -330,7 +331,7 @@
 	// Test all 32 possible sub-sequences of these 5 input slices.
 	//
 	// Their lengths sum to 400,000, which is over 6 times the Writer ibuf
-	// capacity: 6 * maxUncompressedChunkLen is 393,216.
+	// capacity: 6 * maxBlockSize is 393,216.
 	inputs := [][]byte{
 		bytes.Repeat([]byte{'a'}, 40000),
 		bytes.Repeat([]byte{'b'}, 150000),
commit	bf2ded9d81f5c22b62cf76363673c6f9765a6703	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Mon Feb 22 12:44:36 2016 +1100
committer	Nigel Tao <nigeltao@golang.org>	Mon Feb 22 12:44:36 2016 +1100
tree	57d94b39b503b9a7d5359d29848052c676b2a47b
parent	d1d908a252c22fd7afd36190d5cffb144aa8f777 [diff]