blob: 73734d318415ee62ec470929ef3e36c6757c1959 [file] [log] [blame]
// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------
// This file contains a hand-written C benchmark of different strategies for
// decoding PNG data.
//
// For a PNG image with width W and height H, the H rows can be decompressed
// one-at-a-time or all-at-once. Roughly speaking, this corresponds to H versus
// 1 call into the zlib decoder. The former (call it "fragmented dst") requires
// less scratch-space memory than the latter ("full dst"): 2 * bytes_per_row
// instead of H * bytes_per row, but the latter can be faster.
//
// The zlib-compressed data can be split into multiple IDAT chunks. Similarly,
// these chunks can be decompressed separately ("fragmented IDAT") or together
// ("full IDAT"), again providing a memory vs speed trade-off.
//
// This program reports the speed of combining the independent frag/full dst
// and frag/full IDAT techniques.
//
// For example, with gcc 7.3 (and -O3) as of January 2019:
//
// On ../test/data/hat.png (90 × 112 pixels):
// name time/op relative
// FragDstFragIDAT/gcc 289µs ± 1% 1.00x
// FragDstFullIDAT/gcc 288µs ± 0% 1.00x
// FullDstFragIDAT/gcc 149µs ± 1% 1.93x
// FullDstFullIDAT/gcc 148µs ± 1% 1.95x
//
// On ../test/data/hibiscus.regular.png (312 × 442 pixels):
// name time/op relative
// FragDstFragIDAT/gcc 2.49ms ± 0% 1.00x
// FragDstFullIDAT/gcc 2.49ms ± 0% 1.00x
// FullDstFragIDAT/gcc 2.08ms ± 0% 1.20x
// FullDstFullIDAT/gcc 2.02ms ± 1% 1.23x
//
// On ../test/data/harvesters.png (1165 × 859 pixels):
// name time/op relative
// FragDstFragIDAT/gcc 15.6ms ± 2% 1.00x
// FragDstFullIDAT/gcc 15.4ms ± 0% 1.01x
// FullDstFragIDAT/gcc 14.4ms ± 0% 1.08x
// FullDstFullIDAT/gcc 14.1ms ± 0% 1.10x
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
// Wuffs ships as a "single file C library" or "header file library" as per
// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
//
// To use that single file as a "foo.c"-like implementation, instead of a
// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
// compiling it.
#define WUFFS_IMPLEMENTATION
// Defining the WUFFS_CONFIG__STATIC_FUNCTIONS macro is optional, but when
// combined with WUFFS_IMPLEMENTATION, it demonstrates making all of Wuffs'
// functions have static storage.
//
// This can help the compiler ignore or discard unused code, which can produce
// faster compiles and smaller binaries. Other motivations are discussed in the
// "ALLOW STATIC IMPLEMENTATION" section of
// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt
#define WUFFS_CONFIG__STATIC_FUNCTIONS
// If building this program in an environment that doesn't easily accommodate
// relative includes, you can use the script/inline-c-relative-includes.go
// program to generate a stand-alone C file.
#include "../release/c/wuffs-unsupported-snapshot.c"
// The order matters here. Clang also defines "__GNUC__".
#if defined(__clang__)
const char* g_cc = "clang";
const char* g_cc_version = __clang_version__;
#elif defined(__GNUC__)
const char* g_cc = "gcc";
const char* g_cc_version = __VERSION__;
#elif defined(_MSC_VER)
const char* g_cc = "cl";
const char* g_cc_version = "???";
#else
const char* g_cc = "cc";
const char* g_cc_version = "???";
#endif
static inline uint32_t //
load_u32be(uint8_t* p) {
return ((uint32_t)(p[0]) << 24) | ((uint32_t)(p[1]) << 16) |
((uint32_t)(p[2]) << 8) | ((uint32_t)(p[3]) << 0);
}
// Limit the input PNG image (and therefore its IDAT data) to (64 MiB - 1 byte)
// compressed, in up to 1024 IDAT chunks, and 256 MiB and 16384 × 16384 pixels
// uncompressed. This is a limitation of this program (which uses the Wuffs
// standard library), not a limitation of Wuffs per se.
#define DST_BUFFER_ARRAY_SIZE (256 * 1024 * 1024)
#define SRC_BUFFER_ARRAY_SIZE (64 * 1024 * 1024)
#define MAX_DIMENSION (16384)
#define MAX_IDAT_CHUNKS (1024)
uint8_t g_dst_buffer_array[DST_BUFFER_ARRAY_SIZE] = {0};
size_t g_dst_len = 0;
uint8_t g_src_buffer_array[SRC_BUFFER_ARRAY_SIZE] = {0};
size_t g_src_len = 0;
uint8_t g_idat_buffer_array[SRC_BUFFER_ARRAY_SIZE] = {0};
// The n'th IDAT chunk data (where n is a zero-based count) is in
// g_idat_buffer_array[i:j], where i = g_idat_splits[n+0] and j =
// g_idat_splits[n+1].
size_t g_idat_splits[MAX_IDAT_CHUNKS + 1] = {0};
uint32_t g_num_idat_chunks = 0;
#define WORK_BUFFER_ARRAY_SIZE \
WUFFS_ZLIB__DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE
#if WORK_BUFFER_ARRAY_SIZE > 0
uint8_t g_work_buffer_array[WORK_BUFFER_ARRAY_SIZE];
#else
// Not all C/C++ compilers support 0-length arrays.
uint8_t g_work_buffer_array[1];
#endif
uint32_t g_width = 0;
uint32_t g_height = 0;
uint64_t g_bytes_per_pixel = 0;
uint64_t g_bytes_per_row = 0;
uint64_t g_bytes_per_frame = 0;
const char* //
read_stdin() {
while (g_src_len < SRC_BUFFER_ARRAY_SIZE) {
const int stdin_fd = 0;
ssize_t n = read(stdin_fd, g_src_buffer_array + g_src_len,
SRC_BUFFER_ARRAY_SIZE - g_src_len);
if (n > 0) {
g_src_len += n;
} else if (n == 0) {
return NULL;
} else if (errno == EINTR) {
// No-op.
} else {
return strerror(errno);
}
}
return "input is too large";
}
const char* //
process_png_chunks(uint8_t* p, size_t n) {
while (n > 0) {
// Process the 8 byte chunk header.
if (n < 8) {
return "invalid PNG chunk";
}
uint32_t chunk_len = load_u32be(p + 0);
uint32_t chunk_type = load_u32be(p + 4);
p += 8;
n -= 8;
// Process the chunk payload.
if (n < chunk_len) {
return "short PNG chunk data";
}
switch (chunk_type) {
case 0x49484452: // "IHDR"
if (chunk_len != 13) {
return "invalid PNG IDAT chunk";
}
g_width = load_u32be(p + 0);
g_height = load_u32be(p + 4);
if ((g_width == 0) || (g_height == 0)) {
return "image dimensions are too small";
}
if ((g_width > MAX_DIMENSION) || (g_height > MAX_DIMENSION)) {
return "image dimensions are too large";
}
if (p[8] != 8) {
return "unsupported PNG bit depth";
}
if (g_bytes_per_pixel != 0) {
return "duplicate PNG IHDR chunk";
}
// Process the color type, as per the PNG spec table 11.1.
switch (p[9]) {
case 0:
g_bytes_per_pixel = 1;
break;
case 2:
g_bytes_per_pixel = 3;
break;
case 3:
g_bytes_per_pixel = 1;
break;
case 4:
g_bytes_per_pixel = 2;
break;
case 6:
g_bytes_per_pixel = 4;
break;
default:
return "unsupported PNG color type";
}
if (p[12] != 0) {
return "unsupported PNG interlacing";
}
break;
case 0x49444154: // "IDAT"
if (g_num_idat_chunks == MAX_IDAT_CHUNKS - 1) {
return "too many IDAT chunks";
}
memcpy(g_idat_buffer_array + g_idat_splits[g_num_idat_chunks], p,
chunk_len);
g_idat_splits[g_num_idat_chunks + 1] =
g_idat_splits[g_num_idat_chunks] + chunk_len;
g_num_idat_chunks++;
break;
}
p += chunk_len;
n -= chunk_len;
// Process (and ignore) the 4 byte chunk footer (a checksum).
if (n < 4) {
return "invalid PNG chunk";
}
p += 4;
n -= 4;
}
return NULL;
}
const char* //
decode_once(bool frag_dst, bool frag_idat) {
wuffs_zlib__decoder dec;
wuffs_base__status status =
wuffs_zlib__decoder__initialize(&dec, sizeof dec, WUFFS_VERSION, 0);
if (!wuffs_base__status__is_ok(&status)) {
return wuffs_base__status__message(&status);
}
wuffs_base__io_buffer dst = ((wuffs_base__io_buffer){
.data = ((wuffs_base__slice_u8){
.ptr = g_dst_buffer_array,
.len = g_bytes_per_frame,
}),
});
wuffs_base__io_buffer idat = ((wuffs_base__io_buffer){
.data = ((wuffs_base__slice_u8){
.ptr = g_idat_buffer_array,
.len = SRC_BUFFER_ARRAY_SIZE,
}),
.meta = ((wuffs_base__io_buffer_meta){
.wi = g_idat_splits[g_num_idat_chunks],
.ri = 0,
.pos = 0,
.closed = true,
}),
});
uint32_t i = 0; // Number of dst fragments processed, if frag_dst.
if (frag_dst) {
dst.data.len = g_bytes_per_row;
}
uint32_t j = 0; // Number of IDAT fragments processed, if frag_idat.
if (frag_idat) {
idat.meta.wi = g_idat_splits[1];
idat.meta.closed = (g_num_idat_chunks == 1);
}
while (true) {
status =
wuffs_zlib__decoder__transform_io(&dec, &dst, &idat,
((wuffs_base__slice_u8){
.ptr = g_work_buffer_array,
.len = WORK_BUFFER_ARRAY_SIZE,
}));
if (wuffs_base__status__is_ok(&status)) {
break;
}
if ((status.repr == wuffs_base__suspension__short_write) && frag_dst &&
(i < g_height - 1)) {
i++;
dst.data.len = g_bytes_per_row * (i + 1);
continue;
}
if ((status.repr == wuffs_base__suspension__short_read) && frag_idat &&
(j < g_num_idat_chunks - 1)) {
j++;
idat.meta.wi = g_idat_splits[j + 1];
idat.meta.closed = (g_num_idat_chunks == j + 1);
continue;
}
return wuffs_base__status__message(&status);
}
if (dst.meta.wi != g_bytes_per_frame) {
return "unexpected number of bytes decoded";
}
return NULL;
}
const char* //
decode(bool frag_dst, bool frag_idat) {
int reps;
if (g_bytes_per_frame < 100000) {
reps = 1000;
} else if (g_bytes_per_frame < 1000000) {
reps = 100;
} else if (g_bytes_per_frame < 10000000) {
reps = 10;
} else {
reps = 1;
}
struct timeval bench_start_tv;
gettimeofday(&bench_start_tv, NULL);
for (int i = 0; i < reps; i++) {
const char* msg = decode_once(frag_dst, frag_idat);
if (msg) {
return msg;
}
}
struct timeval bench_finish_tv;
gettimeofday(&bench_finish_tv, NULL);
int64_t micros =
(int64_t)(bench_finish_tv.tv_sec - bench_start_tv.tv_sec) * 1000000 +
(int64_t)(bench_finish_tv.tv_usec - bench_start_tv.tv_usec);
uint64_t nanos = 1;
if (micros > 0) {
nanos = (uint64_t)(micros)*1000;
}
printf("Benchmark%sDst%sIDAT/%s\t%8d\t%8" PRIu64 " ns/op\n",
frag_dst ? "Frag" : "Full", //
frag_idat ? "Frag" : "Full", //
g_cc, reps, nanos / reps);
return NULL;
}
int //
fail(const char* msg) {
const int stderr_fd = 2;
write(stderr_fd, msg, strnlen(msg, 4095));
write(stderr_fd, "\n", 1);
return 1;
}
int //
main(int argc, char** argv) {
const char* msg = read_stdin();
if (msg) {
return fail(msg);
}
if ((g_src_len < 8) || strncmp((const char*)(g_src_buffer_array),
"\x89PNG\x0D\x0A\x1A\x0A", 8)) {
return fail("invalid PNG");
}
msg = process_png_chunks(g_src_buffer_array + 8, g_src_len - 8);
if (msg) {
return fail(msg);
}
if (g_bytes_per_pixel == 0) {
return fail("missing PNG IHDR chunk");
}
if (g_num_idat_chunks == 0) {
return fail("missing PNG IDAT chunk");
}
// The +1 here is for the per-row filter byte.
g_bytes_per_row = (uint64_t)g_width * g_bytes_per_pixel + 1;
g_bytes_per_frame = (uint64_t)g_height * g_bytes_per_row;
if (g_bytes_per_frame > DST_BUFFER_ARRAY_SIZE) {
return fail("decompressed data is too large");
}
printf("# %s version %s\n#\n", g_cc, g_cc_version);
printf(
"# The output format, including the \"Benchmark\" prefixes, is "
"compatible with the\n"
"# https://godoc.org/golang.org/x/perf/cmd/benchstat tool. To install "
"it, first\n"
"# install Go, then run \"go get golang.org/x/perf/cmd/benchstat\".\n");
for (int i = 0; i < 5; i++) {
msg = decode(true, true);
if (msg) {
return fail(msg);
}
msg = decode(true, false);
if (msg) {
return fail(msg);
}
msg = decode(false, true);
if (msg) {
return fail(msg);
}
msg = decode(false, false);
if (msg) {
return fail(msg);
}
}
return 0;
}