Add a look-up table to flatecut

name    old time/op  new time/op  delta
pkg:github.com/google/wuffs/lib/flatecut goos:linux goarch:amd64
Cut-56  1.50ms ± 1%  1.27ms ± 0%  -14.96%  (p=0.008 n=5+5)
pkg:github.com/google/wuffs/lib/zlibcut goos:linux goarch:amd64
Cut-56  2.94ms ± 0%  2.72ms ± 1%   -7.39%  (p=0.008 n=5+5)
diff --git a/lib/flatecut/flatecut.go b/lib/flatecut/flatecut.go
index ff8c3dd..d02f9fd 100644
--- a/lib/flatecut/flatecut.go
+++ b/lib/flatecut/flatecut.go
@@ -131,6 +131,9 @@
 //   symbols: [maxNumCodes]int32{
 //     0: 'F', 1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'E', 6: 'G', 7: 'H',
 //   },
+//   lookUpTable: [256]uint32{
+//     etc,
+//   },
 // }
 //
 // Continuing the example from the RFC, decoding "1110" from the bitstream will
@@ -138,9 +141,43 @@
 type huffman struct {
 	counts  [maxCodeBits + 1]uint32
 	symbols [maxNumCodes]int32
+
+	// lookUpTable holds cached results of the slowDecode method.
+	//
+	// The uint8 key is the next 8 bits of input.
+	//
+	// The uint32 value's low 16 bits are the symbol, the high 16 bits are the
+	// number of bits used to encode that symbol.
+	//
+	// Zero means that the entry is invalid. For example, the encoded symbol
+	// might be longer than 8 bits.
+	lookUpTable [256]uint32
 }
 
 func (h *huffman) decode(b *bitstream) int32 {
+	if b.index < len(b.bytes) {
+		b.bits |= uint32(b.bytes[b.index]) << b.nBits
+		b.nBits += 8
+		b.index++
+		if x := h.lookUpTable[b.bits&0xFF]; x != 0 {
+			n := x >> 16
+			b.bits >>= n
+			b.nBits -= n
+			if b.nBits >= 8 {
+				b.index--
+				b.nBits -= 8
+				b.bits &= (uint32(1) << b.nBits) - 1
+			}
+			return int32(x & 0xFFFF)
+		}
+		b.index--
+		b.nBits -= 8
+		b.bits &= (uint32(1) << b.nBits) - 1
+	}
+	return h.slowDecode(b)
+}
+
+func (h *huffman) slowDecode(b *bitstream) int32 {
 	code := uint32(0)     // The i bits from the bitstream.
 	first := uint32(0)    // The first Huffman code of length i.
 	symIndex := uint32(0) // How many elements of h.symbols we've gone past.
@@ -244,9 +281,27 @@
 			offsets[length]++
 		}
 	}
+	h.constructLookUpTable()
 	return endCodeBits, endCodeNBits, nil
 }
 
+func (h *huffman) constructLookUpTable() {
+	b := bitstream{
+		bytes: []byte{0x00},
+	}
+	for i := range h.lookUpTable {
+		b.bytes[0] = uint8(i)
+		b.index = 0
+		b.bits = 0
+		b.nBits = 0
+		if x := h.slowDecode(&b); x >= 0 {
+			h.lookUpTable[i] = ((8 - b.nBits) << 16) | uint32(x)
+		} else {
+			h.lookUpTable[i] = 0
+		}
+	}
+}
+
 // cutEmpty sets encoding[:2] to contain valid DEFLATE-encoded data. Decoding
 // that data produces zero bytes.
 func cutEmpty(encoded []byte, maxEncodedLen int) (encodedLen int, decodedLen int, retErr error) {
diff --git a/lib/flatecut/flatecut_test.go b/lib/flatecut/flatecut_test.go
index 50abd39..95d94e7 100644
--- a/lib/flatecut/flatecut_test.go
+++ b/lib/flatecut/flatecut_test.go
@@ -89,6 +89,7 @@
 			0: 'F', 1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'E', 6: 'G', 7: 'H',
 		},
 	}
+	h.constructLookUpTable()
 
 	b := &bitstream{
 		bytes: encoded,