Have flatecut's huffman.decode load more bits

name    old time/op  new time/op  delta
pkg:github.com/google/wuffs/lib/flatecut goos:linux goarch:amd64
Cut-56  1.03ms ± 0%  0.92ms ± 0%  -10.44%  (p=0.008 n=5+5)
pkg:github.com/google/wuffs/lib/zlibcut goos:linux goarch:amd64
Cut-56  2.44ms ± 0%  2.34ms ± 1%   -4.19%  (p=0.008 n=5+5)
diff --git a/lib/flatecut/flatecut.go b/lib/flatecut/flatecut.go
index b0b7391..b574222 100644
--- a/lib/flatecut/flatecut.go
+++ b/lib/flatecut/flatecut.go
@@ -84,6 +84,11 @@
 	mostNegativeInt32 = -0x80000000
 )
 
+func loadU32LE(b []byte) uint32 {
+	_ = b[3] // single up-front bounds check (panics if len(b) < 4), a hint so the compiler can skip re-checking b[0..3] below; see golang.org/issue/14808
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
 type bitstream struct {
 	// bytes[index] is the next byte to load into the 'bits' field.
 	bytes []byte
@@ -157,6 +162,13 @@
 func (h *huffman) decode(b *bitstream) int32 {
 	if b.nBits >= 8 {
 		// No-op.
+	} else if b.index < (len(b.bytes) - 4) {
+		// This is "Variant 4" of
+		// https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+		u := loadU32LE(b.bytes[b.index:])
+		b.bits |= u << b.nBits
+		b.index += int((31 - b.nBits) >> 3)
+		b.nBits |= 24
 	} else if b.index < len(b.bytes) {
 		b.bits |= uint32(b.bytes[b.index]) << b.nBits
 		b.nBits += 8