Flatten the lzw.decoder.suffixes array

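The flattened layout benchmarks slower than the nested array[4096] array[8]
layout (see the numbers below). This commit will soon be followed by a
rollback, but it is committed anyway so that we can refer to these numbers in
the git log.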

name                                     old speed      new speed      delta

wuffs_lzw_decode_20k/clang5               265MB/s ± 0%   255MB/s ± 0%  -3.85%  (p=0.008 n=5+5)
wuffs_lzw_decode_100k/clang5              452MB/s ± 1%   418MB/s ± 1%  -7.62%  (p=0.008 n=5+5)

wuffs_lzw_decode_20k/gcc7                 263MB/s ± 1%   255MB/s ± 1%  -3.21%  (p=0.008 n=5+5)
wuffs_lzw_decode_100k/gcc7                482MB/s ± 2%   444MB/s ± 1%  -7.85%  (p=0.008 n=5+5)

wuffs_gif_decode_1k_bw/clang5             241MB/s ± 1%   239MB/s ± 2%    ~     (p=0.310 n=5+5)
wuffs_gif_decode_1k_color/clang5          141MB/s ± 1%   135MB/s ± 0%  -4.30%  (p=0.008 n=5+5)
wuffs_gif_decode_10k_bgra/clang5          721MB/s ± 0%   692MB/s ± 1%  -4.00%  (p=0.008 n=5+5)
wuffs_gif_decode_10k_indexed/clang5       194MB/s ± 1%   186MB/s ± 0%  -4.03%  (p=0.008 n=5+5)
wuffs_gif_decode_20k/clang5               246MB/s ± 1%   237MB/s ± 1%  -3.60%  (p=0.008 n=5+5)
wuffs_gif_decode_100k_artificial/clang5   551MB/s ± 0%   536MB/s ± 0%  -2.73%  (p=0.008 n=5+5)
wuffs_gif_decode_100k_realistic/clang5    227MB/s ± 0%   219MB/s ± 0%  -3.74%  (p=0.008 n=5+5)
wuffs_gif_decode_1000k/clang5             231MB/s ± 0%   222MB/s ± 0%  -3.88%  (p=0.008 n=5+5)
wuffs_gif_decode_anim_screencap/clang5   1.10GB/s ± 0%  1.07GB/s ± 0%  -2.35%  (p=0.008 n=5+5)

wuffs_gif_decode_1k_bw/gcc7               265MB/s ± 1%   262MB/s ± 0%  -1.49%  (p=0.008 n=5+5)
wuffs_gif_decode_1k_color/gcc7            145MB/s ± 1%   142MB/s ± 1%  -2.52%  (p=0.008 n=5+5)
wuffs_gif_decode_10k_bgra/gcc7            622MB/s ± 1%   596MB/s ± 1%  -4.18%  (p=0.008 n=5+5)
wuffs_gif_decode_10k_indexed/gcc7         194MB/s ± 0%   185MB/s ± 0%  -4.81%  (p=0.008 n=5+5)
wuffs_gif_decode_20k/gcc7                 234MB/s ± 1%   230MB/s ± 0%  -1.80%  (p=0.008 n=5+5)
wuffs_gif_decode_100k_artificial/gcc7     506MB/s ± 1%   490MB/s ± 1%  -3.14%  (p=0.008 n=5+5)
wuffs_gif_decode_100k_realistic/gcc7      210MB/s ± 1%   203MB/s ± 1%  -2.89%  (p=0.008 n=5+5)
wuffs_gif_decode_1000k/gcc7               213MB/s ± 0%   207MB/s ± 0%  -2.66%  (p=0.008 n=5+5)
wuffs_gif_decode_anim_screencap/gcc7     1.06GB/s ± 0%  1.04GB/s ± 0%  -2.38%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index f93fd9a..d9ebee5 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -2896,7 +2896,7 @@
     uint32_t f_output_ri;
     uint32_t f_output_wi;
     uint32_t f_read_from_return_value;
-    uint8_t f_suffixes[4096][8];
+    uint8_t f_suffixes[32768];
     uint16_t f_prefixes[4096];
     uint16_t f_lm1s[4096];
     uint8_t f_output[8199];
@@ -7856,7 +7856,7 @@
     v_i = 0;
     while (v_i < self->private_impl.f_clear_code) {
       self->private_impl.f_lm1s[v_i] = 0;
-      self->private_impl.f_suffixes[v_i][0] = ((uint8_t)(v_i));
+      self->private_impl.f_suffixes[((8 * v_i) + 0)] = ((uint8_t)(v_i));
       v_i += 1;
     }
   label_0_continue:;
@@ -7991,15 +7991,16 @@
         if ((v_lm1_a % 8) != 0) {
           self->private_impl.f_prefixes[v_save_code] =
               self->private_impl.f_prefixes[v_prev_code];
-          memcpy(self->private_impl.f_suffixes[v_save_code],
-                 self->private_impl.f_suffixes[v_prev_code],
-                 sizeof(self->private_impl.f_suffixes[v_save_code]));
-          self->private_impl.f_suffixes[v_save_code][(v_lm1_a % 8)] =
+          memcpy((self->private_impl.f_suffixes) + ((8 * v_save_code)),
+                 (self->private_impl.f_suffixes) + ((8 * v_prev_code)), 8);
+          self->private_impl
+              .f_suffixes[((8 * v_save_code) + ((uint32_t)((v_lm1_a % 8))))] =
               ((uint8_t)(v_code));
         } else {
           self->private_impl.f_prefixes[v_save_code] =
               ((uint16_t)(v_prev_code));
-          self->private_impl.f_suffixes[v_save_code][0] = ((uint8_t)(v_code));
+          self->private_impl.f_suffixes[((8 * v_save_code) + 0)] =
+              ((uint8_t)(v_code));
         }
         v_save_code += 1;
         if (v_width < 12) {
@@ -8029,7 +8030,7 @@
       v_steps = (((uint32_t)(self->private_impl.f_lm1s[v_c])) >> 3);
       while (true) {
         memcpy((self->private_impl.f_output) + (v_o),
-               (self->private_impl.f_suffixes[v_c]), 8);
+               (self->private_impl.f_suffixes) + ((8 * v_c)), 8);
         if (v_steps <= 0) {
           goto label_1_break;
         }
@@ -8038,7 +8039,7 @@
         v_c = ((uint32_t)(self->private_impl.f_prefixes[v_c]));
       }
     label_1_break:;
-      v_first_byte = self->private_impl.f_suffixes[v_c][0];
+      v_first_byte = self->private_impl.f_suffixes[((8 * v_c) + 0)];
       if (v_code == v_save_code) {
         self->private_impl.f_output[v_output_wi] = v_first_byte;
         v_output_wi = ((v_output_wi + 1) & 8191);
@@ -8049,15 +8050,15 @@
         if ((v_lm1_b % 8) != 0) {
           self->private_impl.f_prefixes[v_save_code] =
               self->private_impl.f_prefixes[v_prev_code];
-          memcpy(self->private_impl.f_suffixes[v_save_code],
-                 self->private_impl.f_suffixes[v_prev_code],
-                 sizeof(self->private_impl.f_suffixes[v_save_code]));
-          self->private_impl.f_suffixes[v_save_code][(v_lm1_b % 8)] =
+          memcpy((self->private_impl.f_suffixes) + ((8 * v_save_code)),
+                 (self->private_impl.f_suffixes) + ((8 * v_prev_code)), 8);
+          self->private_impl
+              .f_suffixes[((8 * v_save_code) + ((uint32_t)((v_lm1_b % 8))))] =
               v_first_byte;
         } else {
           self->private_impl.f_prefixes[v_save_code] =
               ((uint16_t)(v_prev_code));
-          self->private_impl.f_suffixes[v_save_code][0] =
+          self->private_impl.f_suffixes[((8 * v_save_code) + 0)] =
               ((uint8_t)(v_first_byte));
         }
         v_save_code += 1;
diff --git a/std/lzw/decode_lzw.wuffs b/std/lzw/decode_lzw.wuffs
index f7ec898..70156be 100644
--- a/std/lzw/decode_lzw.wuffs
+++ b/std/lzw/decode_lzw.wuffs
@@ -47,7 +47,7 @@
 	read_from_return_value base.u32,
 
 	// read_from per-code state.
-	suffixes array[4096] array[8] base.u8,
+	suffixes array[4096 * 8] base.u8,
 	prefixes array[4096] base.u16[..4095],
 	// lm1s is the "length minus 1"s of the values for the implicit key-value
 	// table in this decoder. See std/lzw/README.md for more detail.
@@ -89,7 +89,7 @@
 	while i < this.clear_code {
 		assert i < 256 via "a < b: a < c; c <= b"(c:this.clear_code)
 		this.lm1s[i] = 0
-		this.suffixes[i][0] = i as base.u8
+		this.suffixes[(8 * i) + 0] = i as base.u8
 		i += 1
 	}
 
@@ -196,12 +196,15 @@
 				this.lm1s[save_code] = lm1_a
 
 				if (lm1_a % 8) != 0 {
+					assert (8 * save_code) <= ((8 * save_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
+					assert (8 * prev_code) <= ((8 * prev_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
 					this.prefixes[save_code] = this.prefixes[prev_code]
-					this.suffixes[save_code] = this.suffixes[prev_code]
-					this.suffixes[save_code][lm1_a % 8] = code as base.u8
+					this.suffixes[(8 * save_code):(8 * save_code) + 8].copy_from_slice!(
+						s:this.suffixes[(8 * prev_code):(8 * prev_code) + 8])
+					this.suffixes[(8 * save_code) + ((lm1_a % 8) as base.u32)] = code as base.u8
 				} else {
 					this.prefixes[save_code] = prev_code as base.u16
-					this.suffixes[save_code][0] = code as base.u8
+					this.suffixes[(8 * save_code) + 0] = code as base.u8
 				}
 
 				save_code += 1
@@ -241,11 +244,12 @@
 			steps = (this.lm1s[c] as base.u32) >> 3
 			while true {
 				assert o <= (o + 8) via "a <= (a + b): 0 <= b"(b:8)
+				assert (8 * c) <= ((8 * c) + 8) via "a <= (a + b): 0 <= b"(b:8)
 
 				// The final "8" is redundant semantically, but helps the
 				// wuffs-c code generator recognize that both slices have the
 				// same constant length, and hence produce efficient C code.
-				this.output[o:o + 8].copy_from_slice!(s:this.suffixes[c][:8])
+				this.output[o:o + 8].copy_from_slice!(s:this.suffixes[(8 * c):(8 * c) + 8])
 
 				if steps <= 0 {
 					break
@@ -257,7 +261,7 @@
 				o = (o ~mod- 8) & 8191
 				c = this.prefixes[c] as base.u32
 			}
-			first_byte = this.suffixes[c][0]
+			first_byte = this.suffixes[(8 * c) + 0]
 
 			if code == save_code {
 				this.output[output_wi] = first_byte
@@ -269,12 +273,15 @@
 				this.lm1s[save_code] = lm1_b
 
 				if (lm1_b % 8) != 0 {
+					assert (8 * save_code) <= ((8 * save_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
+					assert (8 * prev_code) <= ((8 * prev_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
 					this.prefixes[save_code] = this.prefixes[prev_code]
-					this.suffixes[save_code] = this.suffixes[prev_code]
-					this.suffixes[save_code][lm1_b % 8] = first_byte
+					this.suffixes[(8 * save_code):(8 * save_code) + 8].copy_from_slice!(
+						s:this.suffixes[(8 * prev_code):(8 * prev_code) + 8])
+					this.suffixes[(8 * save_code) + ((lm1_b % 8) as base.u32)] = first_byte
 				} else {
 					this.prefixes[save_code] = prev_code as base.u16
-					this.suffixes[save_code][0] = first_byte as base.u8
+					this.suffixes[(8 * save_code) + 0] = first_byte as base.u8
 				}
 
 				save_code += 1