/* pigz.c -- parallel implementation of gzip
* Copyright (C) 2007 Mark Adler
* Version 1.1 28 January 2007 Mark Adler
*/
/* Version history:
1.0 17 Jan 2007 First version
1.1 28 Jan 2007 Avoid void * arithmetic (some compilers don't get that)
Add note about requiring zlib 1.2.3
Allow compression level 0 (no compression)
Completely rewrite parallelism -- add a write thread
Use deflateSetDictionary() to make use of history
Tune argument defaults to best performance on four cores
*/
/*
pigz compresses from stdin to stdout using threads to make use of multiple
processors and cores. The input is broken up into 128 KB chunks, and each
is compressed separately. The CRC for each chunk is also calculated
separately. The compressed chunks are written in order to the output,
   and the overall CRC is calculated from the CRCs of the chunks.
The compressed data format generated is the gzip format using the deflate
compression method. First a gzip header is written, followed by raw deflate
partial streams. They are partial, in that they do not have a terminating
block. At the end, the deflate stream is terminated with a final empty
static block, and lastly a gzip trailer is written with the CRC and the
number of input bytes.
Each raw deflate partial stream is terminated by an empty stored block
(using the Z_SYNC_FLUSH option of zlib), in order to end that partial
bit stream at a byte boundary. That allows the partial streams to be
   concatenated simply as sequences of bytes. This adds a very small four
or five byte overhead to the output for each input chunk.
   zlib's crc32_combine() routine allows the calculation of the CRC of the
   entire input using the independent CRCs of the chunks. pigz requires zlib
version 1.2.3 or later, since that is the first version that provides the
crc32_combine() function.
pigz uses the POSIX pthread library for thread control and communication.
*/
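/* The resulting output stream is therefore laid out as:
     ten-byte gzip header
     raw deflate data for chunk 1, ended by an empty stored block
     raw deflate data for chunk 2, ended by an empty stored block
     ...
     raw deflate data for the last chunk, ended by an empty stored block
     two-byte final empty static block terminating the deflate stream
     eight-byte gzip trailer: CRC-32, then total input length modulo 2^32
 */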
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
#include "zlib.h"
#define local static
/* exit with error */
local void bail(char *msg)
{
fprintf(stderr, "pigz abort: %s\n", msg);
exit(1);
}
/* read up to len bytes into buf, repeating read() calls as needed */
local size_t readn(int desc, unsigned char *buf, size_t len)
{
ssize_t ret;
size_t got;
got = 0;
while (len) {
ret = read(desc, buf, len);
if (ret < 0)
bail("read error");
if (ret == 0)
break;
buf += ret;
len -= ret;
got += ret;
}
return got;
}
/* write len bytes, repeating write() calls as needed */
local void writen(int desc, unsigned char *buf, size_t len)
{
ssize_t ret;
while (len) {
ret = write(desc, buf, len);
if (ret < 1)
bail("write error");
buf += ret;
len -= ret;
}
}
/* a flag variable for communication between two threads */
struct flag {
int value; /* value of flag */
pthread_mutex_t lock; /* lock for checking and changing flag */
pthread_cond_t cond; /* condition for signaling on flag change */
};
/* initialize a flag for use, starting with value val */
local void flag_init(struct flag *me, int val)
{
me->value = val;
pthread_mutex_init(&(me->lock), NULL);
pthread_cond_init(&(me->cond), NULL);
}
/* set the flag to val, signal another thread that may be waiting for it */
local void flag_set(struct flag *me, int val)
{
pthread_mutex_lock(&(me->lock));
me->value = val;
pthread_cond_signal(&(me->cond));
pthread_mutex_unlock(&(me->lock));
}
/* if the flag is not already val, wait for some other thread to set it to val */
local void flag_wait(struct flag *me, int val)
{
pthread_mutex_lock(&(me->lock));
while (me->value != val)
pthread_cond_wait(&(me->cond), &(me->lock));
pthread_mutex_unlock(&(me->lock));
}
/* if flag is equal to val, wait for some other thread to change it */
local void flag_wait_not(struct flag *me, int val)
{
pthread_mutex_lock(&(me->lock));
while (me->value == val)
pthread_cond_wait(&(me->cond), &(me->lock));
pthread_mutex_unlock(&(me->lock));
}
/* clean up the flag when done with it */
local void flag_done(struct flag *me)
{
pthread_cond_destroy(&(me->cond));
pthread_mutex_destroy(&(me->lock));
}
/* a unit of work to feed to compress_thread() -- it is assumed that the out
buffer is large enough to hold the maximum size len bytes could deflate to,
plus five bytes for the final sync marker */
struct work {
size_t len; /* length of input */
unsigned long crc; /* crc of input */
unsigned char *buf; /* input */
unsigned char *out; /* space for output (guaranteed big enough) */
z_stream strm; /* pre-initialized z_stream */
struct flag busy; /* busy flag indicating work unit in use */
pthread_t comp; /* this compression thread */
};
/* busy flag values */
#define IDLE 0 /* compress and writing done -- can start compress */
#define COMP 1 /* compress -- input and output buffers in use */
#define WRITE 2 /* compress done, writing output -- can read input */
/* read-only globals (set by main/read thread before others started) */
local int ind; /* input file descriptor */
local int outd; /* output file descriptor */
local int level; /* compression level */
local int procs; /* number of compression threads (>= 2) */
local size_t size; /* uncompressed input size per thread (>= 32K) */
local struct work *jobs; /* work units: jobs[0..procs-1] */
/* next and previous jobs[] indices */
#define NEXT(n) ((n) == procs - 1 ? 0 : (n) + 1)
#define PREV(n) ((n) == 0 ? procs - 1 : (n) - 1)
/* sliding dictionary size for deflate */
#define DICT 32768U
/* largest power of 2 that fits in an unsigned int -- used to limit requests
to zlib functions that use unsigned int lengths */
#define MAX ((((unsigned)-1) >> 1) + 1)
/* compress thread: compress the input in the provided work unit and compute
its crc -- assume that the amount of space at job->out is guaranteed to be
enough for the compressed output, as determined by the maximum expansion
of deflate compression -- use the input in the previous work unit (if there
is one) to set the deflate dictionary for better compression */
local void *compress_thread(void *arg)
{
size_t len; /* input length for this work unit */
unsigned long crc; /* crc of input data */
struct work *prev; /* previous work unit */
struct work *job = arg; /* work unit for this thread */
z_stream *strm = &(job->strm); /* zlib stream for this work unit */
/* reset state for a new compressed stream */
(void)deflateReset(strm);
/* initialize input, output, and crc */
strm->next_in = job->buf;
strm->next_out = job->out;
len = job->len;
crc = crc32(0L, Z_NULL, 0);
/* set dictionary if this isn't the first work unit, and if we will be
       compressing something (the read thread ensures that the dictionary
data in the previous work unit is still there) */
prev = jobs + PREV(job - jobs);
if (prev->buf != NULL && len != 0)
deflateSetDictionary(strm, prev->buf + (size - DICT), DICT);
/* run MAX-sized amounts of input through deflate and crc32 -- this loop
is needed for those cases where the integer type is smaller than the
size_t type, or when len is close to the limit of the size_t type */
while (len > MAX) {
strm->avail_in = MAX;
strm->avail_out = (unsigned)-1;
crc = crc32(crc, strm->next_in, strm->avail_in);
(void)deflate(strm, Z_NO_FLUSH);
len -= MAX;
}
/* run last piece through deflate and crc32, follow with a sync marker */
if (len) {
strm->avail_in = len;
strm->avail_out = (unsigned)-1;
crc = crc32(crc, strm->next_in, strm->avail_in);
(void)deflate(strm, Z_SYNC_FLUSH);
}
/* don't need to Z_FINISH, since we'd delete the last two bytes anyway */
/* return result */
job->crc = crc;
return NULL;
}
/* put a 4-byte integer into a byte array in LSB order */
#define PUT4(a,b) (*(a)=(b),(a)[1]=(b)>>8,(a)[2]=(b)>>16,(a)[3]=(b)>>24)
/* write thread: wait for compression threads to complete, write output in
order, also write gzip header and trailer around the compressed data */
local void *write_thread(void *arg)
{
int n; /* compress thread index */
size_t len; /* length of input processed */
unsigned long tot; /* total uncompressed size (overflow ok) */
unsigned long crc; /* CRC-32 of uncompressed data */
unsigned char wrap[10]; /* gzip header or trailer */
/* write simple gzip header */
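    /* (bytes: 0x1f 0x8b magic, CM = 8 for deflate, no flags, zero mtime,
       XFL filled in below from the compression level, OS = 3 for Unix) */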
memcpy(wrap, "\037\213\10\0\0\0\0\0\0\3", 10);
wrap[8] = level == 9 ? 2 : (level == 1 ? 4 : 0);
writen(outd, wrap, 10);
/* process output of compress threads until end of input */
tot = 0;
crc = crc32(0L, Z_NULL, 0);
n = 0;
do {
/* wait for compress thread to start, then wait to complete */
flag_wait(&(jobs[n].busy), COMP);
pthread_join(jobs[n].comp, NULL);
/* now that compress is done, allow read thread to use input buffer */
flag_set(&(jobs[n].busy), WRITE);
/* write compressed data and update length and crc */
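        /* (next_out - out is the number of compressed bytes produced, since
           avail_out was set large enough that deflate never ran out) */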
writen(outd, jobs[n].out, jobs[n].strm.next_out - jobs[n].out);
len = jobs[n].len;
tot += len;
crc = crc32_combine(crc, jobs[n].crc, len);
/* release this work unit and go to the next work unit */
flag_set(&(jobs[n].busy), IDLE);
n = NEXT(n);
/* an input buffer less than size in length indicates end of input */
} while (len == size);
/* write final static block and gzip trailer (crc and len mod 2^32) */
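    /* (0x03 0x00 is a static block with the final-block bit set, containing
       only the end-of-block code) */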
wrap[0] = 3; wrap[1] = 0;
PUT4(wrap + 2, crc);
PUT4(wrap + 6, tot);
writen(outd, wrap, 10);
return NULL;
}
/* one-time initialization of a work unit -- this is where we set the deflate
compression level and request raw deflate, and also where we set the size
of the output buffer to guarantee enough space for a worst-case deflate
ending with a Z_SYNC_FLUSH */
local void job_init(struct work *job)
{
int ret; /* deflateInit2() return value */
job->buf = malloc(size);
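    /* size >> 11 is roughly 0.05% of size, intended to cover deflate's
       worst-case expansion of incompressible input, and the ten extra bytes
       leave room for the four- or five-byte empty stored block added by
       Z_SYNC_FLUSH */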
job->out = malloc(size + (size >> 11) + 10);
job->strm.zfree = Z_NULL;
job->strm.zalloc = Z_NULL;
job->strm.opaque = Z_NULL;
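    /* a windowBits of -15 requests raw deflate (no zlib or gzip wrapper)
       with a 32K window; 8 is zlib's default memLevel */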
ret = deflateInit2(&(job->strm), level, Z_DEFLATED, -15, 8,
Z_DEFAULT_STRATEGY);
if (job->buf == NULL || job->out == NULL || ret != Z_OK)
bail("not enough memory");
}
/* compress ind to outd in the gzip format, using multiple threads for the
compression and crc calculation and another thread for writing the output --
the read thread is the main thread */
local void read_thread(void)
{
int n; /* general index */
size_t got; /* amount read */
pthread_attr_t attr; /* thread attributes (left at defaults) */
pthread_t write; /* write thread */
/* set defaults (not all pthread implementations default to joinable) */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
/* allocate and set up work list (individual work units will be initialized
       as needed, in case the input is short), ensure that allocation size
arithmetic does not overflow */
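    /* (the first two tests catch overflow of the output buffer size computed
       in job_init(), the third catches overflow of the jobs[] allocation) */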
if (size + (size >> 11) + 10 < (size >> 11) + 10 ||
(ssize_t)(size + (size >> 11) + 10) < 0 ||
((size_t)0 - 1) / procs <= sizeof(struct work) ||
(jobs = malloc(procs * sizeof(struct work))) == NULL)
bail("not enough memory");
for (n = 0; n < procs; n++) {
jobs[n].buf = NULL;
flag_init(&(jobs[n].busy), IDLE);
}
/* start write thread */
pthread_create(&write, &attr, write_thread, NULL);
/* read from input and start compress threads (write thread will pick up
the output of the compress threads) */
n = 0;
do {
/* initialize this work unit if it's the first time it's used */
if (jobs[n].buf == NULL)
job_init(jobs + n);
/* read input data, but wait for last compress on this work unit to be
done, and wait for the dictionary to be used by the last compress on
the next work unit */
flag_wait_not(&(jobs[n].busy), COMP);
flag_wait_not(&(jobs[NEXT(n)].busy), COMP);
got = readn(ind, jobs[n].buf, size);
/* start compress thread, but wait for write to be done first */
flag_wait(&(jobs[n].busy), IDLE);
jobs[n].len = got;
pthread_create(&(jobs[n].comp), &attr, compress_thread, jobs + n);
/* mark work unit so write thread knows compress was started */
flag_set(&(jobs[n].busy), COMP);
/* go to the next work unit */
n = NEXT(n);
/* do until end of input, indicated by a read less than size */
} while (got == size);
/* wait for the write thread to complete -- the write thread will join with
all of the compress threads, so this waits for all of the threads to
complete */
pthread_join(write, NULL);
/* free up all requested resources and return */
for (n = procs - 1; n >= 0; n--) {
flag_done(&(jobs[n].busy));
(void)deflateEnd(&(jobs[n].strm));
free(jobs[n].out);
free(jobs[n].buf);
}
free(jobs);
pthread_attr_destroy(&attr);
}
/* Process arguments for level, size, and procs, then compress from stdin to
stdout in the gzip format. Note that procs must be at least two in
order to provide a dictionary in one work unit for the other work
unit, and that size must be at least 32K to store a full dictionary. */
int main(int argc, char **argv)
{
int n; /* general index */
int get; /* command line parameters to get */
char *arg; /* command line argument */
    /* set defaults -- 32 processes and 128K buffers were found to provide
       good utilization of four cores (about 97%) and to balance the overall
execution time impact of more threads against more dictionary
processing for a fixed amount of memory -- the memory usage for these
settings and full use of all work units (at least 4 MB of input) is
16.2 MB
*/
level = Z_DEFAULT_COMPRESSION;
procs = 32;
size = 131072UL;
/* process command-line arguments */
get = 0;
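    /* (a set bit 0 in get means the next argument is the -b value, a set
       bit 1 means it is the -p value) */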
for (n = 1; n < argc; n++) {
arg = argv[n];
if (*arg == '-') {
while (*++arg)
if (*arg >= '0' && *arg <= '9') /* compression level */
level = *arg - '0';
else if (*arg == 'b') /* chunk size in K */
get |= 1;
else if (*arg == 'p') /* number of processes */
get |= 2;
else if (*arg == 'h') { /* help */
fputs("usage: pigz [-0..9] [-b blocksizeinK]", stderr);
fputs(" [-p processes] < foo > foo.gz\n", stderr);
return 0;
}
else
bail("invalid option");
}
else if (get & 1) {
if (get & 2)
bail("you need to separate the -b and -p options");
size = (size_t)(atol(arg)) << 10; /* chunk size */
if (size < DICT)
bail("invalid option");
get = 0;
}
else if (get & 2) {
procs = atoi(arg); /* processes */
if (procs < 2)
bail("invalid option");
get = 0;
}
else
bail("invalid option (you need to pipe input and output)");
}
if (get)
bail("missing option argument");
/* do parallel compression from stdin to stdout (the read thread starts up
the write thread and the compression threads, and they all join before
the read thread returns) */
ind = 0;
outd = 1;
read_thread();
/* done */
return 0;
}