Have flatecut use a 64-bit bit buffer instead of a 32-bit one

name    old time/op  new time/op  delta
pkg:github.com/google/wuffs/lib/flatecut goos:linux goarch:amd64
Cut-56   919µs ± 0%   867µs ± 0%  -5.70%  (p=0.008 n=5+5)
pkg:github.com/google/wuffs/lib/zlibcut goos:linux goarch:amd64
Cut-56  2.34ms ± 1%  2.28ms ± 0%  -2.46%  (p=0.008 n=5+5)
diff --git a/lib/flatecut/flatecut.go b/lib/flatecut/flatecut.go
index b574222..f359a52 100644
--- a/lib/flatecut/flatecut.go
+++ b/lib/flatecut/flatecut.go
@@ -84,9 +84,11 @@
 	mostNegativeInt32 = -0x80000000
 )
 
-func loadU32LE(b []byte) uint32 {
-	_ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+func loadU64LE(b []byte) uint64 {
+	_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+
 }
 
 type bitstream struct {
@@ -96,7 +98,7 @@
 
 	// The low nBits bits of the 'bits' field hold the next bits (in LSB-first
 	// order).
-	bits  uint32
+	bits  uint64
 	nBits uint32
 }
 
@@ -105,13 +107,13 @@
 		if b.index >= len(b.bytes) {
 			return mostNegativeInt32
 		}
-		b.bits |= uint32(b.bytes[b.index]) << b.nBits
+		b.bits |= uint64(b.bytes[b.index]) << b.nBits
 		b.nBits += 8
 		b.index++
 	}
 
 	mask := ((uint32(1)) << nBits) - 1
-	ret := b.bits & mask
+	ret := uint32(b.bits) & mask
 	b.bits >>= nBits
 	b.nBits -= nBits
 	return int32(ret)
@@ -162,15 +164,15 @@
 func (h *huffman) decode(b *bitstream) int32 {
 	if b.nBits >= 8 {
 		// No-op.
-	} else if b.index < (len(b.bytes) - 4) {
+	} else if b.index < (len(b.bytes) - 8) {
 		// This is "Variant 4" of
 		// https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
-		u := loadU32LE(b.bytes[b.index:])
+		u := loadU64LE(b.bytes[b.index:])
 		b.bits |= u << b.nBits
-		b.index += int((31 - b.nBits) >> 3)
-		b.nBits |= 24
+		b.index += int((63 - b.nBits) >> 3)
+		b.nBits |= 56
 	} else if b.index < len(b.bytes) {
-		b.bits |= uint32(b.bytes[b.index]) << b.nBits
+		b.bits |= uint64(b.bytes[b.index]) << b.nBits
 		b.nBits += 8
 		b.index++
 	} else {
@@ -202,12 +204,12 @@
 			if b.index >= len(b.bytes) {
 				return mostNegativeInt32
 			}
-			b.bits = uint32(b.bytes[b.index])
+			b.bits = uint64(b.bytes[b.index])
 			b.nBits = 8
 			b.index++
 		}
 
-		code |= b.bits & 1
+		code |= uint32(b.bits & 1)
 		b.bits >>= 1
 		b.nBits -= 1