Have flatecut emit Stored blocks if shorter
diff --git a/lib/flatecut/flatecut.go b/lib/flatecut/flatecut.go
index 07919bb..0557168 100644
--- a/lib/flatecut/flatecut.go
+++ b/lib/flatecut/flatecut.go
@@ -34,6 +34,7 @@
errInternalInconsistentDecodedLen = errors.New("flatecut: internal: inconsistent decodedLen")
errInternalNoProgress = errors.New("flatecut: internal: no progress")
+ errInternalReplaceWithSingleBlock = errors.New("flatecut: internal: replace with single block")
errInternalSomeProgress = errors.New("flatecut: internal: some progress")
errInvalidBadBlockLength = errors.New("flatecut: invalid input: bad block length")
@@ -324,15 +325,47 @@
}
}
-// cutEmpty sets encoding[:2] to contain valid DEFLATE-encoded data. Decoding
-// that data produces zero bytes.
-func cutEmpty(encoded []byte, maxEncodedLen int) (encodedLen int, decodedLen int, retErr error) {
+// cutSingleBlock is an implementation of Cut whose output consists of a single
+// DEFLATE block.
+//
+// If maxEncodedLen is sufficiently large, this will be a Stored block (i.e. a
+// header followed by literal bytes). Otherwise, it will set encoding[:2] so
+// that decoding produces zero bytes.
+//
+// A precondition is that maxEncodedLen >= SmallestValidMaxEncodedLen.
+func cutSingleBlock(encoded []byte, maxEncodedLen int) (encodedLen int, decodedLen int, retErr error) {
if maxEncodedLen < SmallestValidMaxEncodedLen {
panic("unreachable")
}
+
+ // Try re-encoding as a single Stored block: up to 0xFFFF literal bytes,
+ // with a 5 byte prefix.
+ if maxEncodedLen > 5 {
+ n := maxEncodedLen - 5
+ if n > 0xFFFF {
+ n = 0xFFFF
+ }
+
+ buf := make([]byte, n)
+ n, err := io.ReadFull(flate.NewReader(bytes.NewReader(encoded)), buf)
+ if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
+ return 0, 0, err
+ }
+
+ if n > 0 {
+ encoded[0] = 0x01 // finalBlock = true, blockType = 0 (Stored).
+ encoded[1] = uint8(n >> 0)
+ encoded[2] = uint8(n >> 8)
+ encoded[3] = ^encoded[1]
+ encoded[4] = ^encoded[2]
+ copy(encoded[5:], buf[:n])
+ return n + 5, n, nil
+ }
+ }
+
// Set encoded[:2] to hold:
// - 1 bit ...._...._...._...1 finalBlock = true.
- // - 2 bits ...._...._...._.01. blockType = 1 (static Huffman).
+ // - 2 bits ...._...._...._.01. blockType = 1 (Static Huffman).
// - 7 bits ...._..00_0000_0... litLenSymbol = 256 (end-of-block).
// - 6 bits 0000_00.._...._.... padding.
encoded[0] = 0x03
@@ -438,9 +471,9 @@
case 0:
err = c.doStored()
case 1:
- err = c.doStaticHuffman()
+ err = c.doStaticHuffman(prevFinalBlockIndex < 0)
case 2:
- err = c.doDynamicHuffman()
+ err = c.doDynamicHuffman(prevFinalBlockIndex < 0)
case 3:
return 0, 0, errInvalidBadBlockType
}
@@ -460,7 +493,7 @@
case errInternalNoProgress:
if prevFinalBlockIndex < 0 {
- return cutEmpty(c.bits.bytes, c.maxEncodedLen)
+ return cutSingleBlock(c.bits.bytes, c.maxEncodedLen)
}
// Un-read to just before the finalBlock bit.
@@ -482,6 +515,9 @@
mask := uint32(1) << n
c.bits.bytes[finalBlockIndex-1] |= uint8(mask)
+ case errInternalReplaceWithSingleBlock:
+ return cutSingleBlock(c.bits.bytes, c.maxEncodedLen)
+
default:
return 0, 0, err
@@ -543,7 +579,7 @@
return errInternalSomeProgress
}
-func (c *cutter) doStaticHuffman() error {
+func (c *cutter) doStaticHuffman(isFirstBlock bool) error {
const (
numLCodes = 288
numDCodes = 32
@@ -568,10 +604,10 @@
lengths[i] = 5
}
- return c.doHuffman(lengths[:numLCodes], lengths[numLCodes:])
+ return c.doHuffman(isFirstBlock, lengths[:numLCodes], lengths[numLCodes:])
}
-func (c *cutter) doDynamicHuffman() error {
+func (c *cutter) doDynamicHuffman(isFirstBlock bool) error {
numLCodes := 257 + c.bits.take(5)
if numLCodes < 0 {
return errInvalidNotEnoughData
@@ -644,10 +680,10 @@
}
}
- return c.doHuffman(lengths[:numLCodes], lengths[numLCodes:])
+ return c.doHuffman(isFirstBlock, lengths[:numLCodes], lengths[numLCodes:])
}
-func (c *cutter) doHuffman(lLengths []uint32, dLengths []uint32) error {
+func (c *cutter) doHuffman(isFirstBlock bool, lLengths []uint32, dLengths []uint32) error {
err := error(nil)
if c.endCodeBits, c.endCodeNBits, err = c.lHuff.construct(lLengths); err != nil {
return err
@@ -730,6 +766,19 @@
if checkpointIndex < 0 {
return errInternalNoProgress
}
+
+ // If we're the first block in the DEFLATE stream, check if we would be
+ // better off replacing the Huffman block with a Stored block.
+ if isFirstBlock && (c.maxEncodedLen > 5) {
+ n := c.maxEncodedLen - 5
+ if n > 0xFFFF {
+ n = 0xFFFF
+ }
+ if c.decodedLen < int32(n) {
+ return errInternalReplaceWithSingleBlock
+ }
+ }
+
c.bits.index = checkpointIndex
c.bits.nBits = checkpointNBits
for c.bits.nBits >= 8 {
diff --git a/lib/flatecut/flatecut_test.go b/lib/flatecut/flatecut_test.go
index 5ba070e..5b0035e 100644
--- a/lib/flatecut/flatecut_test.go
+++ b/lib/flatecut/flatecut_test.go
@@ -42,6 +42,14 @@
return flate.NewReader(r), nil
}
+func decodeFlate(src []byte) string {
+ dst, err := ioutil.ReadAll(flate.NewReader(bytes.NewReader(src)))
+ if err != nil {
+ return err.Error()
+ }
+ return string(dst)
+}
+
func TestHuffmanDecode(t *testing.T) {
// This exercises the "ABCDEFGH" example from RFC 1951 section 3.2.2,
// discussed in the "type huffman" doc comment.
@@ -112,38 +120,112 @@
}
}
-func TestDegenerateHuffmanUnused(t *testing.T) {
- decode := func(src []byte) string {
- dst, err := ioutil.ReadAll(
- flate.NewReader(bytes.NewReader(src)))
- if err != nil {
- return err.Error()
- }
- return string(dst)
- }
+// A DEFLATE encoding of the first 64 bytes of the AUTHORS file in the
+// repository root directory.
+//
+// The encoding uses a Dynamic Huffman block, but one whose H-D Huffman
+// distance tree is degenerate (there's a mapping for the "0" code but no
+// mapping for the "1" code) and unused.
+//
+// The first 25-30 bytes contain the encoded H-L and H-D Huffman trees. The
+// last 30-35 bytes contain the actual payload, encoded with H-L.
+//
+// The high 7 bits of the final byte is unused padding.
+const (
+ degenerateHuffmanDec = "# This is the official list of Wuffs authors for copyright purpo"
+ degenerateHuffmanEnc = "" +
+ "\x05\xc0\xc1\x09\xc5\x30\x0c\x03\xd0\x55\x04\x7f\xa5\x0f\x3d\x87" +
+ "\x50\xd5\x82\x80\x82\xed\x1c\xba\x7d\xdf\x0f\xff\x50\x41\x85\x8e" +
+ "\x1b\x26\x35\x35\x16\x96\xaa\x61\xe2\x3a\x64\x61\x9c\x0e\x67\x81" +
+ "\x4e\x4c\xef\x37\xf5\x44\x63\x9f\xdc\xfe\x00"
+)
- full, err := ioutil.ReadFile("../../test/data/artificial/deflate-degenerate-huffman-unused.deflate")
- if err != nil {
- t.Fatalf("ReadFile: %v", err)
+func TestReplaceHuffmanWithStored(t *testing.T) {
+ const dec = degenerateHuffmanDec
+ const enc = degenerateHuffmanEnc
+ if (len(dec) != 64) || (len(enc) != 59) {
+ panic("inconsistent const string lengths")
}
- if len(full) != 15 {
- t.Fatalf("len(full): got %d, want %d", len(full), 15)
- }
-
- if got, want := decode(full), "foo"; got != want {
+ if got, want := decodeFlate([]byte(enc)), dec; got != want {
t.Fatalf("before Cut: got %q, want %q", got, want)
}
- if encLen, decLen, err := Cut(nil, full, 14); err != nil {
+ for i := 4; i <= 59; i += 5 {
+ b := []byte(enc)
+ encLen, decLen, err := Cut(nil, b, i)
+ if err != nil {
+ t.Errorf("i=%d: %v", i, err)
+ continue
+ }
+ if encLen < 1 {
+ t.Errorf("i=%d: encLen: got %d, want >= 1", i, encLen)
+ continue
+ } else if encLen > len(enc) {
+ t.Errorf("i=%d: encLen: got %d, want <= %d", i, encLen, len(enc))
+ continue
+ }
+ // If we can make some progress (decLen > 0), even if the input uses a
+ // Huffman block, one option is to re-encode to a single Stored block
+ // (for 5 bytes of overhead). It's single because len(dec) < 0xFFFF.
+ // Regardless of whether the cut form uses a Huffman or Stored block,
+ // we should be able to produce at least i-5 bytes of decoded output.
+ if (decLen > 0) && (i > 5) && (decLen < i-5) {
+ t.Errorf("i=%d: decLen: got %d, want >= %d", i, decLen, i-5)
+ continue
+ } else if decLen > len(dec) {
+ t.Errorf("i=%d: decLen: got %d, want <= %d", i, decLen, len(dec))
+ continue
+ }
+
+ if got, want := decodeFlate(b[:encLen]), dec[:decLen]; got != want {
+ t.Errorf("i=%d: after Cut: got %q, want %q", i, got, want)
+ continue
+ }
+
+ // Check that we are using a space-efficient block type.
+ {
+ got := (b[0] >> 1) & 3
+ want := uint8(0xFF)
+
+ switch i {
+ case 4:
+ want = 1 // Static Huffman, for a decLen of 0.
+ case 9:
+ want = 0 // Stored.
+ case 59:
+ want = 2 // Dynamic Huffman.
+ default:
+ continue
+ }
+
+ if got != want {
+ t.Errorf("i=%d: block type: got %d, want %d", i, got, want)
+ }
+ }
+ }
+}
+
+func TestDegenerateHuffmanUnused(t *testing.T) {
+ const dec = degenerateHuffmanDec
+ const enc = degenerateHuffmanEnc
+
+ // Cutting 1 byte off the end of the encoded form will lead to cutting n
+ // bytes off the decoded form. Coincidentally, n equals 1, even though each
+ // decoded byte (8 bits) is packed into smaller number of bits, as most of
+ // the final encoded byte's bits are unused padding.
+ const n = 1
+
+ b := []byte(enc)
+ encLen, decLen, err := Cut(nil, b, len(enc)-1)
+ if err != nil {
t.Fatalf("Cut: %v", err)
- } else if encLen != 14 {
- t.Fatalf("encLen: got %d, want %d", encLen, 14)
- } else if decLen != 2 {
- t.Fatalf("decLen: got %d, want %d", decLen, 2)
+ } else if encLen != len(enc)-1 {
+ t.Fatalf("encLen: got %d, want %d", encLen, len(enc)-1)
+ } else if decLen != len(dec)-n {
+ t.Fatalf("decLen: got %d, want %d", decLen, len(dec)-n)
}
- if got, want := decode(full[:14]), "fo"; got != want {
+ if got, want := decodeFlate(b[:encLen]), dec[:decLen]; got != want {
t.Fatalf("after Cut: got %q, want %q", got, want)
}
-
}