3 /* gzjoin -- command to join gzip files into one gzip file
5 Copyright (C) 2004 Mark Adler, all rights reserved
6 version 1.0, 11 Dec 2004
8 This software is provided 'as-is', without any express or implied
9 warranty. In no event will the author be held liable for any damages
10 arising from the use of this software.
12 Permission is granted to anyone to use this software for any purpose,
13 including commercial applications, and to alter it and redistribute it
14 freely, subject to the following restrictions:
16 1. The origin of this software must not be misrepresented; you must not
17 claim that you wrote the original software. If you use this software
18 in a product, an acknowledgment in the product documentation would be
19 appreciated but is not required.
20 2. Altered source versions must be plainly marked as such, and must not be
21 misrepresented as being the original software.
22 3. This notice may not be removed or altered from any source distribution.
24 Mark Adler madler@alumni.caltech.edu
30 * 1.0 11 Dec 2004 - First version
31 * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
35 gzjoin takes one or more gzip files on the command line and writes out a
36 single gzip file that will uncompress to the concatenation of the
37 uncompressed data from the individual gzip files. gzjoin does this without
38 having to recompress any of the data and without having to calculate a new
39 crc32 for the concatenated uncompressed data. gzjoin does however have to
40 decompress all of the input data in order to find the bits in the compressed
41 data that need to be modified to concatenate the streams.
43 gzjoin does not do an integrity check on the input gzip files other than
44 checking the gzip header and decompressing the compressed data. They are
45 otherwise assumed to be complete and correct.
47 Each joint between gzip files removes at least 18 bytes of previous trailer
48 and subsequent header, and inserts an average of about three bytes to the
49 compressed data in order to connect the streams. The output gzip file
50 has a minimal ten-byte gzip header with no file name or modification time.
52 This program was written to illustrate the use of the Z_BLOCK option of
53 inflate() and the crc32_combine() function. gzjoin will not compile with
54 versions of zlib earlier than 1.2.3.
57 #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
58 #include <stdlib.h> /* exit(), malloc(), free() */
59 #include <fcntl.h> /* open() */
60 #include <unistd.h> /* close(), read(), lseek() */
62 /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
66 /* exit with an error (return a value to allow use in an expression) */
/* why1, why2: two strings printed back to back on stderr, placed between the
   gzjoin error prefix and the output incomplete suffix of the format below.
   The int return type lets a call appear as an operand of an expression,
   which is how the bget() macro uses it. */
67 local
int bail(char *why1
, char *why2
)
69 fprintf(stderr
, "gzjoin error: %s%s, output incomplete\n", why1
, why2
);
74 /* -- simple buffered file input with access to the buffer -- */
/* CHUNK is the size passed to malloc() for each input buffer and for the
   discard buffer in gzcopy(); bskip() masks with CHUNK - 1, which is why the
   value has to be a power of two. */
76 #define CHUNK 32768 /* must be a power of two and fit in unsigned */
78 /* bin buffered input file type */
/* One record per open input file.  bload() reads from fd into buf and
   increases left; bget() consumes a byte by advancing next and decrementing
   left; zpull() hands next and left to zlib as the available input. */
80 char *name
; /* name of file for error messages */
81 int fd
; /* file descriptor */
82 unsigned left
; /* bytes remaining at next */
83 unsigned char *next
; /* next byte to read */
84 unsigned char *buf
; /* allocated buffer of length CHUNK */
87 /* close a buffered file and free allocated memory */
/* in: pointer to the bin record to dispose of (the type bopen() returns).
   NOTE(review): the function body is not part of this listing; confirm how a
   NULL pointer or a partially initialized record is handled against the
   full source. */
88 local
void bclose(bin
*in
)
99 /* open a buffered file for input, return a pointer to type bin, or NULL on failure */
/* name: path handed to open() with O_RDONLY.  The bin record and its
   CHUNK-byte buffer are both obtained from malloc(). */
101 local bin
*bopen(char *name
)
105 in
= malloc(sizeof(bin
));
/* the buffer allocation and the open() are both attempted before either
   result is examined, so the single test below covers the two failures */
108 in
->buf
= malloc(CHUNK
);
109 in
->fd
= open(name
, O_RDONLY
, 0);
110 if (in
->buf
== NULL
|| in
->fd
== -1) {
120 /* load buffer from file, return -1 on read error, 0 or 1 on success, with
121 1 indicating that end-of-file was reached */
/* in: buffered input whose buffer is to be filled from in->fd. */
122 local
int bload(bin
*in
)
/* append after the in->left bytes already held, asking read() for at most
   the space that remains in the CHUNK-byte buffer */
132 len
= (long)read(in
->fd
, in
->buf
+ in
->left
, CHUNK
- in
->left
);
135 in
->left
+= (unsigned)len
;
/* keep reading until read() returns 0 or the buffer holds CHUNK bytes, so a
   short read does not end the fill early */
136 } while (len
!= 0 && in
->left
< CHUNK
);
/* a final read of zero bytes is reported as end of file (1), otherwise 0 */
137 return len
== 0 ? 1 : 0;
/* get a byte from the file, bail if end of file

   in is a pointer to a bin.  When the buffer is empty, bload() is called
   first to refill it; if it is still empty afterwards, bail() is invoked with
   the unexpected-end-of-file message and the file name.  Otherwise the byte
   at in->next is the value of the expression, next advances by one and left
   drops by one.  The value of bload() itself is discarded.

   Every use of the argument is parenthesized so that an expression such as
   &file or p + 1 can be passed safely.  The argument is still evaluated more
   than once, so it must not have side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))
145 /* get a four-byte little-endian unsigned integer from file */
/* in: buffered input to read from.  Bytes are fetched with bget(), which
   bails on end of file, so no error value is returned from here. */
146 local
unsigned long bget4(bin
*in
)
/* each further byte is converted to unsigned long before the shift, so the
   shift is not carried out in int, and is then added in at bits 8, 16, 24 */
151 val
+= (unsigned long)(bget(in
)) << 8;
152 val
+= (unsigned long)(bget(in
)) << 16;
153 val
+= (unsigned long)(bget(in
)) << 24;
157 /* skip bytes in file */
/* in: buffered input to advance.
   skip: number of bytes to pass over.
   Bytes already in the buffer are used first; larger skips move the file
   position with lseek().  Running past the end of the file is reported
   through bail(). */
158 local
void bskip(bin
*in
, unsigned skip
)
164 /* easy case -- skip bytes in buffer */
165 if (skip
<= in
->left
) {
171 /* skip what's in buffer, discard buffer contents */
175 /* seek past multiples of CHUNK bytes */
/* CHUNK is a power of two, so this mask yields skip modulo CHUNK */
179 left
= skip
& (CHUNK
- 1);
181 /* exact number of chunks: seek all the way minus one byte to check
182 for end-of-file with a read */
/* NOTE(review): the value returned by lseek() is discarded here; only the
   one-byte read() that follows detects a position beyond the end of file */
183 lseek(in
->fd
, skip
- 1, SEEK_CUR
);
184 if (read(in
->fd
, in
->buf
, 1) != 1)
185 bail("unexpected end of file on ", in
->name
);
189 /* skip the integral chunks, update skip with remainder */
/* NOTE(review): the value returned by lseek() is discarded here as well */
190 lseek(in
->fd
, skip
- left
, SEEK_CUR
);
194 /* read more input and skip remainder */
197 bail("unexpected end of file on ", in
->name
);
202 /* -- end of buffered input functions -- */
204 /* skip the gzip header from file in */
/* in: buffered input positioned at the first byte of a gzip file.  Invalid
   or unsupported headers are reported through bail(). */
205 local
void gzhead(bin
*in
)
209 /* verify gzip magic header and compression method */
/* the || chain short-circuits, so the three bytes are read in order and
   reading stops at the first one that does not match */
210 if (bget(in
) != 0x1f || bget(in
) != 0x8b || bget(in
) != 8)
211 bail(in
->name
, " is not a valid gzip file");
213 /* get and verify flags */
/* 0xe0 selects the top three bits of the flags byte */
215 if ((flags
& 0xe0) != 0)
216 bail("unknown reserved bits set in ", in
->name
);
218 /* skip modification time, extra flags, and os */
221 /* skip extra field if present */
/* second length byte is the high byte: the length is stored low byte first */
226 len
+= (unsigned)(bget(in
)) << 8;
230 /* skip file name if present */
/* consumes bytes up to and including the terminating zero byte */
232 while (bget(in
) != 0)
235 /* skip comment if present */
/* consumes bytes up to and including the terminating zero byte */
237 while (bget(in
) != 0)
240 /* skip header crc if present */
245 /* write a four-byte little-endian unsigned integer to out */
246 local
void put4(unsigned long val
, FILE *out
)
248 putc(val
& 0xff, out
);
249 putc((val
>> 8) & 0xff, out
);
250 putc((val
>> 16) & 0xff, out
);
251 putc((val
>> 24) & 0xff, out
);
254 /* Load up zlib stream from buffered input, bail if end of file */
/* strm: zlib stream whose input fields are set here.
   in: buffered input that supplies the bytes. */
255 local
void zpull(z_streamp strm
, bin
*in
)
260 bail("unexpected end of file on ", in
->name
);
/* no copy is made: zlib is pointed straight at the unread part of the input
   buffer, with avail_in set to the number of bytes remaining there */
261 strm
->avail_in
= in
->left
;
262 strm
->next_in
= in
->next
;
265 /* Write header for gzip file to out and initialize trailer. */
/* crc, tot: running CRC-32 and uncompressed length that gzcopy() updates.
   out: stream receiving the header.
   NOTE(review): the statement that initializes *tot is not visible in this
   listing -- confirm against the full source. */
266 local
void gzinit(unsigned long *crc
, unsigned long *tot
, FILE *out
)
/* ten bytes, in the order gzhead() parses them: magic 1f 8b, method 08, a
   zero flags byte, then six bytes for modification time, extra flags and os
   (all zero except the final ff) */
268 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out
);
/* crc32() called with a Z_NULL buffer returns the initial CRC value */
269 *crc
= crc32(0L, Z_NULL
, 0);
273 /* Copy the compressed data from name, zeroing the last block bit of the last
274 block if clr is true, and adding empty blocks as needed to get to a byte
275 boundary. If clr is false, then the last block becomes the last block of
276 the output, and the gzip trailer is written. crc and tot maintains the
277 crc and length (modulo 2^32) of the output for the trailer. The resulting
278 gzip file is written to out. gzinit() must be called before the first call
279 of gzcopy() to write the gzip header and to initialize crc and tot. */
/* name: path of the gzip file to append to the output.
   clr: true when this is not the final file, false for the final file.
   crc, tot: running CRC-32 and uncompressed length, updated in place.
   The input is inflated with Z_BLOCK into the junk buffer only to locate
   deflate block boundaries; what is written to out are the compressed input
   bytes between start and the current input position. */
280 local
void gzcopy(char *name
, int clr
, unsigned long *crc
, unsigned long *tot
,
283 int ret
; /* return value from zlib functions */
284 int pos
; /* where the "last block" bit is in byte */
285 int last
; /* true if processing the last block */
286 bin
*in
; /* buffered input file */
287 unsigned char *start
; /* start of compressed data in buffer */
288 unsigned char *junk
; /* buffer for uncompressed data -- discarded */
289 z_off_t len
; /* length of uncompressed data (support > 4 GB) */
290 z_stream strm
; /* zlib inflate stream */
292 /* open gzip file and skip header */
295 bail("could not open ", name
);
298 /* allocate buffer for uncompressed data and initialize raw inflate
300 junk
= malloc(CHUNK
);
301 strm
.zalloc
= Z_NULL
;
303 strm
.opaque
= Z_NULL
;
305 strm
.next_in
= Z_NULL
;
306 ret
= inflateInit2(&strm
, -15);
307 if (junk
== NULL
|| ret
!= Z_OK
)
308 bail("out of memory", "");
310 /* inflate and copy compressed data, clear last-block bit if requested */
313 start
= strm
.next_in
;
319 /* if input used and output done, write used input and get more */
320 if (strm
.avail_in
== 0 && strm
.avail_out
!= 0) {
321 fwrite(start
, 1, strm
.next_in
- start
, out
);
327 /* decompress -- return early when end-of-block reached */
/* the output window is reset to the whole junk buffer before every call, so
   earlier uncompressed data is simply overwritten */
328 strm
.avail_out
= CHUNK
;
329 strm
.next_out
= junk
;
330 ret
= inflate(&strm
, Z_BLOCK
);
333 bail("out of memory", "");
335 bail("invalid compressed data in ", in
->name
);
338 /* update length of uncompressed data */
339 len
+= CHUNK
- strm
.avail_out
;
341 /* check for block boundary (only get this when block copied out) */
/* inflate() reports through data_type: 128 is added when it returned at a
   block boundary, and the low three bits count the unused bits in the last
   input byte taken */
342 if (strm
.data_type
& 128) {
343 /* if that was the last block, then done */
347 /* number of unused bits in last byte */
348 pos
= strm
.data_type
& 7;
350 /* find the next last-block bit */
352 /* next last-block bit is in last used byte */
/* NOTE(review): pos is applied as a bit mask in the two statements below,
   while the visible assignment above stores a count of unused bits; the
   conversion from count to mask is not visible in this listing -- confirm */
354 last
= strm
.next_in
[-1] & pos
;
356 strm
.next_in
[-1] &= ~pos
;
359 /* next last-block bit is in next unused byte */
360 if (strm
.avail_in
== 0) {
361 /* don't have that byte yet -- get it */
362 fwrite(start
, 1, strm
.next_in
- start
, out
);
/* here the last-block flag is the lowest bit of the next input byte: it is
   saved in last, and the statement after that clears the bit in the buffer
   (any guarding condition is not visible in this listing) */
367 last
= strm
.next_in
[0] & 1;
369 strm
.next_in
[0] &= ~1;
374 /* update buffer with unused input */
375 in
->left
= strm
.avail_in
;
376 in
->next
= strm
.next_in
;
378 /* copy used input, write empty blocks to get to byte boundary */
379 pos
= strm
.data_type
& 7;
/* the final used byte is held back (note the - 1) because bits may still
   have to be merged into it before it is written */
380 fwrite(start
, 1, in
->next
- start
- 1, out
);
382 if (pos
== 0 || !clr
)
383 /* already at byte boundary, or last file: write last byte */
386 /* append empty blocks to last byte */
387 last
&= ((0x100 >> pos
) - 1); /* assure unused bits are zero */
389 /* odd -- append an empty stored block */
392 putc(0, out
); /* two more bits in block header */
393 fwrite("\0\0\xff\xff", 1, 4, out
);
396 /* even -- append 1, 2, or 3 empty fixed blocks */
402 putc(last
| 0x20, out
);
405 putc(last
| 0x80, out
);
411 /* update crc and tot */
/* bget4(in) reads the next four bytes of the input as a little-endian value
   and passes it to crc32_combine() as the CRC of the len uncompressed bytes
   of this file */
412 *crc
= crc32_combine(*crc
, bget4(in
), len
);
413 *tot
+= (unsigned long)len
;
420 /* write trailer if this is the last gzip file */
427 /* join the gzip files on the command line, write result to stdout */
/* argc, argv: the usual program arguments; every argument after the command
   name is taken as the path of a gzip file to join. */
428 int main(int argc
, char **argv
)
430 unsigned long crc
, tot
; /* running crc and total uncompressed length */
432 /* skip command name */
436 /* show usage if no arguments */
438 fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
443 /* join gzip files on command line and write to stdout */
444 gzinit(&crc
, &tot
, stdout
);
/* argc is passed as the clr argument of gzcopy().
   NOTE(review): for the last file to be treated as final, the value must
   reach zero on that call; the statement that adjusts argc is not visible in
   this listing -- confirm against the full source. */
446 gzcopy(*argv
++, argc
, &crc
, &tot
, stdout
);