Flatten the lzw.decoder.suffixes array
Flattening the array turns out to be a performance regression (see the
benchmark numbers below), so this commit will soon be followed by a
rollback. It is committed anyway so that we can refer to these numbers in
the git log.
name old speed new speed delta
wuffs_lzw_decode_20k/clang5 265MB/s ± 0% 255MB/s ± 0% -3.85% (p=0.008 n=5+5)
wuffs_lzw_decode_100k/clang5 452MB/s ± 1% 418MB/s ± 1% -7.62% (p=0.008 n=5+5)
wuffs_lzw_decode_20k/gcc7 263MB/s ± 1% 255MB/s ± 1% -3.21% (p=0.008 n=5+5)
wuffs_lzw_decode_100k/gcc7 482MB/s ± 2% 444MB/s ± 1% -7.85% (p=0.008 n=5+5)
wuffs_gif_decode_1k_bw/clang5 241MB/s ± 1% 239MB/s ± 2% ~ (p=0.310 n=5+5)
wuffs_gif_decode_1k_color/clang5 141MB/s ± 1% 135MB/s ± 0% -4.30% (p=0.008 n=5+5)
wuffs_gif_decode_10k_bgra/clang5 721MB/s ± 0% 692MB/s ± 1% -4.00% (p=0.008 n=5+5)
wuffs_gif_decode_10k_indexed/clang5 194MB/s ± 1% 186MB/s ± 0% -4.03% (p=0.008 n=5+5)
wuffs_gif_decode_20k/clang5 246MB/s ± 1% 237MB/s ± 1% -3.60% (p=0.008 n=5+5)
wuffs_gif_decode_100k_artificial/clang5 551MB/s ± 0% 536MB/s ± 0% -2.73% (p=0.008 n=5+5)
wuffs_gif_decode_100k_realistic/clang5 227MB/s ± 0% 219MB/s ± 0% -3.74% (p=0.008 n=5+5)
wuffs_gif_decode_1000k/clang5 231MB/s ± 0% 222MB/s ± 0% -3.88% (p=0.008 n=5+5)
wuffs_gif_decode_anim_screencap/clang5 1.10GB/s ± 0% 1.07GB/s ± 0% -2.35% (p=0.008 n=5+5)
wuffs_gif_decode_1k_bw/gcc7 265MB/s ± 1% 262MB/s ± 0% -1.49% (p=0.008 n=5+5)
wuffs_gif_decode_1k_color/gcc7 145MB/s ± 1% 142MB/s ± 1% -2.52% (p=0.008 n=5+5)
wuffs_gif_decode_10k_bgra/gcc7 622MB/s ± 1% 596MB/s ± 1% -4.18% (p=0.008 n=5+5)
wuffs_gif_decode_10k_indexed/gcc7 194MB/s ± 0% 185MB/s ± 0% -4.81% (p=0.008 n=5+5)
wuffs_gif_decode_20k/gcc7 234MB/s ± 1% 230MB/s ± 0% -1.80% (p=0.008 n=5+5)
wuffs_gif_decode_100k_artificial/gcc7 506MB/s ± 1% 490MB/s ± 1% -3.14% (p=0.008 n=5+5)
wuffs_gif_decode_100k_realistic/gcc7 210MB/s ± 1% 203MB/s ± 1% -2.89% (p=0.008 n=5+5)
wuffs_gif_decode_1000k/gcc7 213MB/s ± 0% 207MB/s ± 0% -2.66% (p=0.008 n=5+5)
wuffs_gif_decode_anim_screencap/gcc7 1.06GB/s ± 0% 1.04GB/s ± 0% -2.38% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index f93fd9a..d9ebee5 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -2896,7 +2896,7 @@
uint32_t f_output_ri;
uint32_t f_output_wi;
uint32_t f_read_from_return_value;
- uint8_t f_suffixes[4096][8];
+ uint8_t f_suffixes[32768];
uint16_t f_prefixes[4096];
uint16_t f_lm1s[4096];
uint8_t f_output[8199];
@@ -7856,7 +7856,7 @@
v_i = 0;
while (v_i < self->private_impl.f_clear_code) {
self->private_impl.f_lm1s[v_i] = 0;
- self->private_impl.f_suffixes[v_i][0] = ((uint8_t)(v_i));
+ self->private_impl.f_suffixes[((8 * v_i) + 0)] = ((uint8_t)(v_i));
v_i += 1;
}
label_0_continue:;
@@ -7991,15 +7991,16 @@
if ((v_lm1_a % 8) != 0) {
self->private_impl.f_prefixes[v_save_code] =
self->private_impl.f_prefixes[v_prev_code];
- memcpy(self->private_impl.f_suffixes[v_save_code],
- self->private_impl.f_suffixes[v_prev_code],
- sizeof(self->private_impl.f_suffixes[v_save_code]));
- self->private_impl.f_suffixes[v_save_code][(v_lm1_a % 8)] =
+ memcpy((self->private_impl.f_suffixes) + ((8 * v_save_code)),
+ (self->private_impl.f_suffixes) + ((8 * v_prev_code)), 8);
+ self->private_impl
+ .f_suffixes[((8 * v_save_code) + ((uint32_t)((v_lm1_a % 8))))] =
((uint8_t)(v_code));
} else {
self->private_impl.f_prefixes[v_save_code] =
((uint16_t)(v_prev_code));
- self->private_impl.f_suffixes[v_save_code][0] = ((uint8_t)(v_code));
+ self->private_impl.f_suffixes[((8 * v_save_code) + 0)] =
+ ((uint8_t)(v_code));
}
v_save_code += 1;
if (v_width < 12) {
@@ -8029,7 +8030,7 @@
v_steps = (((uint32_t)(self->private_impl.f_lm1s[v_c])) >> 3);
while (true) {
memcpy((self->private_impl.f_output) + (v_o),
- (self->private_impl.f_suffixes[v_c]), 8);
+ (self->private_impl.f_suffixes) + ((8 * v_c)), 8);
if (v_steps <= 0) {
goto label_1_break;
}
@@ -8038,7 +8039,7 @@
v_c = ((uint32_t)(self->private_impl.f_prefixes[v_c]));
}
label_1_break:;
- v_first_byte = self->private_impl.f_suffixes[v_c][0];
+ v_first_byte = self->private_impl.f_suffixes[((8 * v_c) + 0)];
if (v_code == v_save_code) {
self->private_impl.f_output[v_output_wi] = v_first_byte;
v_output_wi = ((v_output_wi + 1) & 8191);
@@ -8049,15 +8050,15 @@
if ((v_lm1_b % 8) != 0) {
self->private_impl.f_prefixes[v_save_code] =
self->private_impl.f_prefixes[v_prev_code];
- memcpy(self->private_impl.f_suffixes[v_save_code],
- self->private_impl.f_suffixes[v_prev_code],
- sizeof(self->private_impl.f_suffixes[v_save_code]));
- self->private_impl.f_suffixes[v_save_code][(v_lm1_b % 8)] =
+ memcpy((self->private_impl.f_suffixes) + ((8 * v_save_code)),
+ (self->private_impl.f_suffixes) + ((8 * v_prev_code)), 8);
+ self->private_impl
+ .f_suffixes[((8 * v_save_code) + ((uint32_t)((v_lm1_b % 8))))] =
v_first_byte;
} else {
self->private_impl.f_prefixes[v_save_code] =
((uint16_t)(v_prev_code));
- self->private_impl.f_suffixes[v_save_code][0] =
+ self->private_impl.f_suffixes[((8 * v_save_code) + 0)] =
((uint8_t)(v_first_byte));
}
v_save_code += 1;
diff --git a/std/lzw/decode_lzw.wuffs b/std/lzw/decode_lzw.wuffs
index f7ec898..70156be 100644
--- a/std/lzw/decode_lzw.wuffs
+++ b/std/lzw/decode_lzw.wuffs
@@ -47,7 +47,7 @@
read_from_return_value base.u32,
// read_from per-code state.
- suffixes array[4096] array[8] base.u8,
+ suffixes array[4096 * 8] base.u8,
prefixes array[4096] base.u16[..4095],
// lm1s is the "length minus 1"s of the values for the implicit key-value
// table in this decoder. See std/lzw/README.md for more detail.
@@ -89,7 +89,7 @@
while i < this.clear_code {
assert i < 256 via "a < b: a < c; c <= b"(c:this.clear_code)
this.lm1s[i] = 0
- this.suffixes[i][0] = i as base.u8
+ this.suffixes[(8 * i) + 0] = i as base.u8
i += 1
}
@@ -196,12 +196,15 @@
this.lm1s[save_code] = lm1_a
if (lm1_a % 8) != 0 {
+ assert (8 * save_code) <= ((8 * save_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
+ assert (8 * prev_code) <= ((8 * prev_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
this.prefixes[save_code] = this.prefixes[prev_code]
- this.suffixes[save_code] = this.suffixes[prev_code]
- this.suffixes[save_code][lm1_a % 8] = code as base.u8
+ this.suffixes[(8 * save_code):(8 * save_code) + 8].copy_from_slice!(
+ s:this.suffixes[(8 * prev_code):(8 * prev_code) + 8])
+ this.suffixes[(8 * save_code) + ((lm1_a % 8) as base.u32)] = code as base.u8
} else {
this.prefixes[save_code] = prev_code as base.u16
- this.suffixes[save_code][0] = code as base.u8
+ this.suffixes[(8 * save_code) + 0] = code as base.u8
}
save_code += 1
@@ -241,11 +244,12 @@
steps = (this.lm1s[c] as base.u32) >> 3
while true {
assert o <= (o + 8) via "a <= (a + b): 0 <= b"(b:8)
+ assert (8 * c) <= ((8 * c) + 8) via "a <= (a + b): 0 <= b"(b:8)
// The final "8" is redundant semantically, but helps the
// wuffs-c code generator recognize that both slices have the
// same constant length, and hence produce efficient C code.
- this.output[o:o + 8].copy_from_slice!(s:this.suffixes[c][:8])
+ this.output[o:o + 8].copy_from_slice!(s:this.suffixes[(8 * c):(8 * c) + 8])
if steps <= 0 {
break
@@ -257,7 +261,7 @@
o = (o ~mod- 8) & 8191
c = this.prefixes[c] as base.u32
}
- first_byte = this.suffixes[c][0]
+ first_byte = this.suffixes[(8 * c) + 0]
if code == save_code {
this.output[output_wi] = first_byte
@@ -269,12 +273,15 @@
this.lm1s[save_code] = lm1_b
if (lm1_b % 8) != 0 {
+ assert (8 * save_code) <= ((8 * save_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
+ assert (8 * prev_code) <= ((8 * prev_code) + 8) via "a <= (a + b): 0 <= b"(b:8)
this.prefixes[save_code] = this.prefixes[prev_code]
- this.suffixes[save_code] = this.suffixes[prev_code]
- this.suffixes[save_code][lm1_b % 8] = first_byte
+ this.suffixes[(8 * save_code):(8 * save_code) + 8].copy_from_slice!(
+ s:this.suffixes[(8 * prev_code):(8 * prev_code) + 8])
+ this.suffixes[(8 * save_code) + ((lm1_b % 8) as base.u32)] = first_byte
} else {
this.prefixes[save_code] = prev_code as base.u16
- this.suffixes[save_code][0] = first_byte as base.u8
+ this.suffixes[(8 * save_code) + 0] = first_byte as base.u8
}
save_code += 1