Use 64K blocks when encoding long inputs.
This enables future optimizations, such as storing each encoder hash table
entry as a uint16 instead of an int32.
diff --git a/decode.go b/decode.go
index 6c5dd66..efb1f6f 100644
--- a/decode.go
+++ b/decode.go
@@ -140,8 +140,8 @@
func NewReader(r io.Reader) *Reader {
return &Reader{
r: r,
- decoded: make([]byte, maxUncompressedChunkLen),
- buf: make([]byte, maxEncodedLenOfMaxUncompressedChunkLen+checksumSize),
+ decoded: make([]byte, maxBlockSize),
+ buf: make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
}
}
diff --git a/encode.go b/encode.go
index 89109b6..407b613 100644
--- a/encode.go
+++ b/encode.go
@@ -94,30 +94,22 @@
for len(src) > 0 {
p := src
src = nil
- if len(p) > maxInternalEncodeSrcLen {
- p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+ if len(p) > maxBlockSize {
+ p, src = p[:maxBlockSize], p[maxBlockSize:]
}
- d += encode(dst[d:], p)
+ d += encodeBlock(dst[d:], p)
}
return dst[:d]
}
-// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
-// (internal) encode function, it is safe to have the s variable (which indexes
-// the src slice), and therefore the hash table entries, to have type int32
-// instead of int.
-const maxInternalEncodeSrcLen = 0x40000000
-
-// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
-// that the varint-encoded length of the decompressed bytes has already been
-// written.
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
-// 0 < len(src) &&
-// len(src) <= maxInternalEncodeSrcLen &&
-// maxInternalEncodeSrcLen < math.MaxInt32.
-func encode(dst, src []byte) (d int) {
+// 0 < len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
// Return early if src is short.
if len(src) <= 4 {
return emitLiteral(dst, src)
@@ -258,7 +250,7 @@
func NewBufferedWriter(w io.Writer) *Writer {
return &Writer{
w: w,
- ibuf: make([]byte, 0, maxUncompressedChunkLen),
+ ibuf: make([]byte, 0, maxBlockSize),
obuf: make([]byte, obufLen),
}
}
@@ -342,8 +334,8 @@
}
var uncompressed []byte
- if len(p) > maxUncompressedChunkLen {
- uncompressed, p = p[:maxUncompressedChunkLen], p[maxUncompressedChunkLen:]
+ if len(p) > maxBlockSize {
+ uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
} else {
uncompressed, p = p, nil
}
diff --git a/snappy.go b/snappy.go
index 1c2b671..0102542 100644
--- a/snappy.go
+++ b/snappy.go
@@ -46,18 +46,25 @@
chunkHeaderSize = 4
magicChunk = "\xff\x06\x00\x00" + magicBody
magicBody = "sNaPpY"
- // https://github.com/google/snappy/blob/master/framing_format.txt says
- // that "the uncompressed data in a chunk must be no longer than 65536 bytes".
- maxUncompressedChunkLen = 65536
- // maxEncodedLenOfMaxUncompressedChunkLen equals
- // MaxEncodedLen(maxUncompressedChunkLen), but is hard coded to be a const
- // instead of a variable, so that obufLen can also be a const. Their
- // equivalence is confirmed by TestMaxEncodedLenOfMaxUncompressedChunkLen.
- maxEncodedLenOfMaxUncompressedChunkLen = 76490
+ // maxBlockSize is the maximum size of the input to encodeBlock. It is not
+ // part of the wire format per se, but some parts of the encoder assume
+ // that an offset within a block (at most 65535) fits into a uint16.
+ //
+ // Also, for the framing format (Writer type instead of Encode function),
+ // https://github.com/google/snappy/blob/master/framing_format.txt says
+ // that "the uncompressed data in a chunk must be no longer than 65536
+ // bytes".
+ maxBlockSize = 65536
+
+ // maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+ // hard coded to be a const instead of a variable, so that obufLen can also
+ // be a const. Their equivalence is confirmed by
+ // TestMaxEncodedLenOfMaxBlockSize.
+ maxEncodedLenOfMaxBlockSize = 76490
obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
- obufLen = obufHeaderLen + maxEncodedLenOfMaxUncompressedChunkLen
+ obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize
)
const (
diff --git a/snappy_test.go b/snappy_test.go
index 6584403..83d3ba5 100644
--- a/snappy_test.go
+++ b/snappy_test.go
@@ -23,9 +23,9 @@
testdata = flag.String("testdata", "testdata", "Directory containing the test data")
)
-func TestMaxEncodedLenOfMaxUncompressedChunkLen(t *testing.T) {
- got := maxEncodedLenOfMaxUncompressedChunkLen
- want := MaxEncodedLen(maxUncompressedChunkLen)
+func TestMaxEncodedLenOfMaxBlockSize(t *testing.T) {
+ got := maxEncodedLenOfMaxBlockSize
+ want := MaxEncodedLen(maxBlockSize)
if got != want {
t.Fatalf("got %d, want %d", got, want)
}
@@ -237,23 +237,24 @@
}
}
-// TestEncodeNoiseThenRepeats encodes a 32K block for which the first half is
-// very incompressible and the second half is very compressible. The encoded
-// form's length should be closer to 50% of the original length than 100%.
+// TestEncodeNoiseThenRepeats encodes input for which the first half is very
+// incompressible and the second half is very compressible. The encoded form's
+// length should be closer to 50% of the original length than 100%.
func TestEncodeNoiseThenRepeats(t *testing.T) {
- const origLen = 32768
- src := make([]byte, origLen)
- rng := rand.New(rand.NewSource(1))
- firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
- for i := range firstHalf {
- firstHalf[i] = uint8(rng.Intn(256))
- }
- for i := range secondHalf {
- secondHalf[i] = uint8(i >> 8)
- }
- dst := Encode(nil, src)
- if got, want := len(dst), origLen*3/4; got >= want {
- t.Fatalf("got %d encoded bytes, want less than %d", got, want)
+ for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
+ src := make([]byte, origLen)
+ rng := rand.New(rand.NewSource(1))
+ firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
+ for i := range firstHalf {
+ firstHalf[i] = uint8(rng.Intn(256))
+ }
+ for i := range secondHalf {
+ secondHalf[i] = uint8(i >> 8)
+ }
+ dst := Encode(nil, src)
+ if got, want := len(dst), origLen*3/4; got >= want {
+ t.Errorf("origLen=%d: got %d encoded bytes, want less than %d", origLen, got, want)
+ }
}
}
@@ -272,7 +273,7 @@
func TestFramingFormat(t *testing.T) {
// src is comprised of alternating 1e5-sized sequences of random
// (incompressible) bytes and repeated (compressible) bytes. 1e5 was chosen
- // because it is larger than maxUncompressedChunkLen (64k).
+ // because it is larger than maxBlockSize (64k).
src := make([]byte, 1e6)
rng := rand.New(rand.NewSource(1))
for i := 0; i < 10; i++ {
@@ -330,7 +331,7 @@
// Test all 32 possible sub-sequences of these 5 input slices.
//
// Their lengths sum to 400,000, which is over 6 times the Writer ibuf
- // capacity: 6 * maxUncompressedChunkLen is 393,216.
+ // capacity: 6 * maxBlockSize is 393,216.
inputs := [][]byte{
bytes.Repeat([]byte{'a'}, 40000),
bytes.Repeat([]byte{'b'}, 150000),