Add LIT_MEM define to use more memory for a small deflate speedup.
A bug fix in zlib 1.2.12 resulted in a slight slowdown (1-2%) of
deflate. This commit provides the option to #define LIT_MEM, which
uses more memory to reverse most of that slowdown. The memory for
the pending buffer and symbol buffers is increased by 25%, which
increases the total memory usage with the default parameters by
about 6%.
diff --git a/deflate.c b/deflate.c
index 5e72cd9..263bbc8 100644
--- a/deflate.c
+++ b/deflate.c
@@ -493,7 +493,11 @@
* symbols from which it is being constructed.
*/
+#ifdef LIT_MEM
+ s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 5);
+#else
s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 4);
+#endif
s->pending_buf_size = (ulg)s->lit_bufsize * 4;
if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
@@ -503,8 +507,14 @@
deflateEnd (strm);
return Z_MEM_ERROR;
}
+#ifdef LIT_MEM
+ s->d_buf = (ushf *)(s->pending_buf + (s->lit_bufsize << 1));
+ s->l_buf = s->pending_buf + (s->lit_bufsize << 2);
+ s->sym_end = s->lit_bufsize - 1;
+#else
s->sym_buf = s->pending_buf + s->lit_bufsize;
s->sym_end = (s->lit_bufsize - 1) * 3;
+#endif
/* We avoid equality with lit_bufsize*3 because of wraparound at 64K
* on 16 bit machines and because stored blocks are restricted to
* 64K-1 bytes.
@@ -720,9 +730,15 @@
if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
s = strm->state;
+#ifdef LIT_MEM
+ if (bits < 0 || bits > 16 ||
+ (uchf *)s->d_buf < s->pending_out + ((Buf_size + 7) >> 3))
+ return Z_BUF_ERROR;
+#else
if (bits < 0 || bits > 16 ||
s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3))
return Z_BUF_ERROR;
+#endif
do {
put = Buf_size - s->bi_valid;
if (put > bits)
@@ -1308,7 +1324,12 @@
zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
+#ifdef LIT_MEM
+ ds->d_buf = (ushf *)(ds->pending_buf + (ds->lit_bufsize << 1));
+ ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2);
+#else
ds->sym_buf = ds->pending_buf + ds->lit_bufsize;
+#endif
ds->l_desc.dyn_tree = ds->dyn_ltree;
ds->d_desc.dyn_tree = ds->dyn_dtree;
diff --git a/deflate.h b/deflate.h
index 8696791..69fe3a9 100644
--- a/deflate.h
+++ b/deflate.h
@@ -23,6 +23,10 @@
# define GZIP
#endif
+/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
+ the cost of a larger memory footprint */
+/* #define LIT_MEM */
+
/* ===========================================================================
* Internal compression state.
*/
@@ -217,7 +221,12 @@
/* Depth of each subtree used as tie breaker for trees of equal frequency
*/
+#ifdef LIT_MEM
+ ushf *d_buf; /* buffer for distances */
+ uchf *l_buf; /* buffer for literals/lengths */
+#else
uchf *sym_buf; /* buffer for distances and literals/lengths */
+#endif
uInt lit_bufsize;
/* Size of match buffer for literals/lengths. There are 4 reasons for
@@ -239,7 +248,7 @@
* - I can't count above 4
*/
- uInt sym_next; /* running index in sym_buf */
+ uInt sym_next; /* running index in symbol buffer */
uInt sym_end; /* symbol table full when sym_next reaches this */
ulg opt_len; /* bit length of current block with optimal trees */
@@ -318,6 +327,25 @@
extern const uch ZLIB_INTERNAL _dist_code[];
#endif
+#ifdef LIT_MEM
+# define _tr_tally_lit(s, c, flush) \
+ { uch cc = (c); \
+ s->d_buf[s->sym_next] = 0; \
+ s->l_buf[s->sym_next++] = cc; \
+ s->dyn_ltree[cc].Freq++; \
+ flush = (s->sym_next == s->sym_end); \
+ }
+# define _tr_tally_dist(s, distance, length, flush) \
+ { uch len = (uch)(length); \
+ ush dist = (ush)(distance); \
+ s->d_buf[s->sym_next] = dist; \
+ s->l_buf[s->sym_next++] = len; \
+ dist--; \
+ s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
+ s->dyn_dtree[d_code(dist)].Freq++; \
+ flush = (s->sym_next == s->sym_end); \
+ }
+#else
# define _tr_tally_lit(s, c, flush) \
{ uch cc = (c); \
s->sym_buf[s->sym_next++] = 0; \
@@ -337,6 +365,7 @@
s->dyn_dtree[d_code(dist)].Freq++; \
flush = (s->sym_next == s->sym_end); \
}
+#endif
#else
# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
# define _tr_tally_dist(s, distance, length, flush) \
diff --git a/trees.c b/trees.c
index 8dbdc40..5ca23e9 100644
--- a/trees.c
+++ b/trees.c
@@ -899,14 +899,19 @@
const ct_data *dtree) {
unsigned dist; /* distance of matched string */
int lc; /* match length or unmatched char (if dist == 0) */
- unsigned sx = 0; /* running index in sym_buf */
+ unsigned sx = 0; /* running index in symbol buffers */
unsigned code; /* the code to send */
int extra; /* number of extra bits to send */
if (s->sym_next != 0) do {
+#ifdef LIT_MEM
+ dist = s->d_buf[sx];
+ lc = s->l_buf[sx++];
+#else
dist = s->sym_buf[sx++] & 0xff;
dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8;
lc = s->sym_buf[sx++];
+#endif
if (dist == 0) {
send_code(s, lc, ltree); /* send a literal byte */
Tracecv(isgraph(lc), (stderr," '%c' ", lc));
@@ -931,8 +936,12 @@
}
} /* literal or match pair ? */
- /* Check that the overlay between pending_buf and sym_buf is ok: */
+ /* Check for no overlay of pending_buf on needed symbols */
+#ifdef LIT_MEM
+ Assert(s->pending < (s->lit_bufsize << 1) + sx, "pendingBuf overflow");
+#else
Assert(s->pending < s->lit_bufsize + sx, "pendingBuf overflow");
+#endif
} while (sx < s->sym_next);
@@ -1082,9 +1091,14 @@
* the current block must be flushed.
*/
int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc) {
+#ifdef LIT_MEM
+ s->d_buf[s->sym_next] = (ush)dist;
+ s->l_buf[s->sym_next++] = (uch)lc;
+#else
s->sym_buf[s->sym_next++] = (uch)dist;
s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
s->sym_buf[s->sym_next++] = (uch)lc;
+#endif
if (dist == 0) {
/* lc is the unmatched char */
s->dyn_ltree[lc].Freq++;