2 Copyright (c) 1990-2002 Info-ZIP. All rights reserved.
4 See the accompanying file LICENSE, version 2000-Apr-09 or later
5 (the contents of which are also included in unzip.h) for terms of use.
6 If, for some reason, all these files are missing, the Info-ZIP license
7 also may be found at: ftp://ftp.info-zip.org/pub/infozip/license.html
9 /* explode.c -- by Mark Adler
10 version c15, 6 July 1996 */
14 - Starting with UnZip 5.41 of 16-April-2000, this source file
15 is covered by the Info-ZIP LICENSE cited above.
16 - Prior versions of this source file, found in UnZip source packages
17 up to UnZip 5.40, were put in the public domain.
18 The original copyright note by Mark Adler was:
19 "You can do whatever you like with this source file,
20 though I would prefer that if you modify it and
21 redistribute it that you include comments to that effect
22 with your name and the date. Thank you."
26 ---- --------- -------------- ------------------------------------
27 c1 30 Mar 92 M. Adler explode that uses huft_build from inflate
28 (this gives over a 70% speed improvement
29 over the original unimplode.c, which
30 decoded a bit at a time)
31 c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k.
32 c3 10 Apr 92 M. Adler added a little memory tracking if DEBUG
33 c4 11 Apr 92 M. Adler added NOMEMCPY to kill use of memcpy()
34 c5 21 Apr 92 M. Adler added the WSIZE #define to allow reducing
35 the 32K window size for specialized applications.
37 c6 31 May 92 M. Adler added typecasts to eliminate some warnings
38 c7 27 Jun 92 G. Roelofs added more typecasts.
39 c8 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch.
40 c9 19 Jul 93 J. Bush added more typecasts (to return values);
41 made l[256] array static for Amiga.
42 c10 8 Oct 93 G. Roelofs added used_csize for diagnostics; added
43 buf and unshrink arguments to flush();
44 undef'd various macros at end for Turbo C;
45 removed NEXTBYTE macro (now in unzip.h)
46 and bytebuf variable (not used); changed
47 memset() to memzero().
48 c11 9 Jan 94 M. Adler fixed incorrect used_csize calculation.
49 c12 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines
50 to avoid bug in Encore compiler.
51 c13 25 Aug 94 M. Adler fixed distance-length comment (orig c9 fix)
52 c14 22 Nov 95 S. Maxwell removed unnecessary "static" on auto array
53 c15 6 Jul 96 W. Haidinger added ulg typecasts to flush() calls.
54 c16 8 Feb 98 C. Spieler added ZCONST modifiers to const tables
55 and #ifdef DEBUG around debugging code.
56 c16b 25 Mar 98 C. Spieler modified DLL code for slide redirection.
57 c16d 05 Jul 99 C. Spieler take care of flush() return values and
58 stop processing in case of errors
59 c17 04 Feb 01 C. Spieler reorganized code to reduce repetitions
60 of large code parts; adapted huft decoding
61 to the changes in inflate's huft_build()
62 due to support of deflate64; fixed memory
63 leaks (huft tables were not freed when get_tree() failed)
65 c17b 16 Feb 02 C. Spieler changed type of the "extra lengths" array
66 "extra" from ush into uch (to save space)
71 Explode imploded (PKZIP method 6 compressed) data. This compression
72 method searches for as much of the current string of bytes (up to a length
73 of ~320) in the previous 4K or 8K bytes. If it doesn't find any matches
74 (of at least length 2 or 3), it codes the next byte. Otherwise, it codes
75 the length of the matched string and its distance backwards from the
76 current position. Single bytes ("literals") are preceded by a one (a
77 single bit) and are either uncoded (the eight bits go directly into the
78 compressed stream for a total of nine bits) or Huffman coded with a
79 supplied literal code tree. If literals are coded, then the minimum match
80 length is three, otherwise it is two.
82 There are therefore four kinds of imploded streams: 8K search with coded
83 literals (min match = 3), 4K search with coded literals (min match = 3),
84 8K with uncoded literals (min match = 2), and 4K with uncoded literals
85 (min match = 2). The kind of stream is identified in two bits of a
86 general purpose bit flag that is outside of the compressed stream.
88 Distance-length pairs for matched strings are preceded by a zero bit (to
89 distinguish them from literals) and are always coded. The distance comes
90 first and is either the low six (4K) or low seven (8K) bits of the
91 distance (uncoded), followed by the high six bits of the distance coded.
92 Then the length is six bits coded (0..63 + min match length), and if the
93 maximum such length is coded, then it's followed by another eight bits
94 (uncoded) to be added to the coded length. This gives a match length
95 range of 2..320 or 3..321 bytes.
97 The literal, length, and distance codes are all represented in a slightly
98 compressed form themselves. What is sent are the lengths of the codes for
99 each value, which is sufficient to construct the codes. Each byte of the
100 code representation is the code length (the low four bits representing
101 1..16), and the number of values sequentially with that length (the high
102 four bits also representing 1..16). There are 256 literal code values (if
103 literals are coded), 64 length code values, and 64 distance code values,
104 in that order at the beginning of the compressed stream. Each set of code
105 values is preceded (redundantly) with a byte indicating how many bytes are
106 in the code description that follows, in the range 1..256.
108 The codes themselves are decoded using tables made by huft_build() from
109 the bit lengths. That routine and its comments are in the inflate.c module.
113 #define __EXPLODE_C /* identifies this source module */
114 #define UNZIP_INTERNAL
115 #include "unzip.h" /* must supply slide[] (uch) array and NEXTBYTE macro */
/* Sliding-window size configuration for the explode routines.  WSIZE is the
   default window size.  wszimpl is the size the copy code works with: it is
   G._wsize when DLL slide redirection is compiled in, and WSIZE otherwise
   (halved when both USE_DEFLATE64 and INT_16BIT are defined).
   NOTE(review): the directives that open the WSIZE guard and separate the
   redirected from the non-redirected case are not present in this copy;
   confirm the nesting against the complete file. */
118 # define WSIZE 0x8000 /* window size--must be a power of two, and */
119 #endif /* at least 8K for zip's implode method */
121 #if (defined(DLL) && !defined(NO_SLIDE_REDIR))
122 # define wszimpl (unsigned)(G._wsize)
124 # if defined(USE_DEFLATE64) && defined(INT_16BIT)
125 # define wszimpl (unsigned)(WSIZE>>1)
126 # else /* !(USE_DEFLATE64 && INT_16BIT) */
127 # define wszimpl WSIZE
128 # endif /* !(USE_DEFLATE64 && INT_16BIT) */
/* Forward declarations: three file-local helpers followed by the explode()
   entry point.  OF(), __GPRO__ and __GPRO are project macros, presumably
   supplied by unzip.h (argument-list wrapper and globals parameter); confirm
   there.
   NOTE(review): the explode_lit declaration below stops after "int bd ,";
   its definition takes a trailing "unsigned bdl", so the end of this
   prototype appears to be missing from this copy. */
132 static int get_tree
OF((__GPRO__
unsigned *l
, unsigned n
));
133 static int explode_lit
OF((__GPRO__
struct huft
*tb
, struct huft
*tl
,
134 struct huft
*td
, int bb
, int bl
, int bd
,
136 static int explode_nolit
OF((__GPRO__
struct huft
*tl
, struct huft
*td
,
137 int bl
, int bd
, unsigned bdl
));
138 int explode
OF((__GPRO
));
141 /* The implode algorithm uses a sliding 4K or 8K byte window on the
142 uncompressed stream to find repeated byte strings. This is implemented
143 here as a circular buffer. The index is updated simply by incrementing
144 and then and'ing with 0x0fff (4K-1) or 0x1fff (8K-1). Here, the 32K
145 buffer of inflate is used, and it works just as well to always have
146 a 32K circular buffer, so the index is anded with 0x7fff. This is
147 done to allow the window to also be used as the output buffer. */
148 /* This must be supplied in an external module usable like "uch slide[8192];"
149 or "uch *slide;", where the latter would be malloc'ed. In unzip, slide[]
150 is actually a 32K area for use by inflate, which uses a 32K sliding window. */
/* Value of a decoding-table entry's e field that marks an invalid code.
   DECODEHUFT tests each entry with IS_INVALID_CODE() and makes the enclosing
   function return 1 when the test succeeds. */
154 #define INVALID_CODE 99
155 #define IS_INVALID_CODE(c) ((c) == INVALID_CODE)
157 /* Tables for length and distance */
/* Match lengths 2..65, one per length code (64 entries).  explode() passes
   this table to huft_build() on the path without a literal tree, where the
   minimum match length is 2. */
158 static ZCONST ush cplen2
[] =
159 {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
160 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
161 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
162 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65};
/* Match lengths 3..66, one per length code (64 entries).  explode() passes
   this table to huft_build() on the path with a literal tree, where the
   minimum match length is 3. */
163 static ZCONST ush cplen3
[] =
164 {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
165 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
166 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
167 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66};
/* Extra-bits table that explode() passes to huft_build() together with each
   of the length and distance base tables.  Every entry shown here is zero.
   NOTE(review): the end of the initializer is not present in this copy, so
   the final entries and the closing brace must be checked against the
   complete file. */
168 static ZCONST uch extra
[] =
169 {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* Distance base values 1 + 64*i for the 64 distance codes (1..4033).  The
   step of 64 matches the six low distance bits that the 4K variant stores
   uncoded; explode() passes this table to huft_build() for that variant. */
173 static ZCONST ush cpdist4
[] =
174 {1, 65, 129, 193, 257, 321, 385, 449, 513, 577, 641, 705,
175 769, 833, 897, 961, 1025, 1089, 1153, 1217, 1281, 1345, 1409, 1473,
176 1537, 1601, 1665, 1729, 1793, 1857, 1921, 1985, 2049, 2113, 2177,
177 2241, 2305, 2369, 2433, 2497, 2561, 2625, 2689, 2753, 2817, 2881,
178 2945, 3009, 3073, 3137, 3201, 3265, 3329, 3393, 3457, 3521, 3585,
179 3649, 3713, 3777, 3841, 3905, 3969, 4033};
/* Distance base values 1 + 128*i for the 64 distance codes (1..8065).  The
   step of 128 matches the seven low distance bits that the 8K variant stores
   uncoded; explode() passes this table to huft_build() when the 8K flag bit
   is set. */
180 static ZCONST ush cpdist8
[] =
181 {1, 129, 257, 385, 513, 641, 769, 897, 1025, 1153, 1281,
182 1409, 1537, 1665, 1793, 1921, 2049, 2177, 2305, 2433, 2561, 2689,
183 2817, 2945, 3073, 3201, 3329, 3457, 3585, 3713, 3841, 3969, 4097,
184 4225, 4353, 4481, 4609, 4737, 4865, 4993, 5121, 5249, 5377, 5505,
185 5633, 5761, 5889, 6017, 6145, 6273, 6401, 6529, 6657, 6785, 6913,
186 7041, 7169, 7297, 7425, 7553, 7681, 7809, 7937, 8065};
189 /* Macros for inflate() bit peeking and grabbing. The usage is:
NEEDBITS(j)
193 x = b & mask_bits[j];
DUMPBITS(j)
196 where NEEDBITS makes sure that b has at least j bits in it, and
197 DUMPBITS removes the bits from b. The macros use the variable k
198 for the number of bits in b. Normally, b and k are register
variables for speed. */
/* NEEDBITS(n): append bytes obtained from NEXTBYTE to the top of the bit
   buffer b until the bit count k is at least n.
   DUMPBITS(n): drop the n lowest bits from b and reduce k accordingly.
   Both macros operate on the b and k variables of the enclosing function. */
202 #define NEEDBITS(n) {while(k<(n)){b|=((ulg)NEXTBYTE)<<k;k+=8;}}
203 #define DUMPBITS(n) {b>>=(n);k-=(n);}
/* DECODEHUFT(htab, bits, mask): decode one symbol using table htab.  The
   table index is taken from the complemented bit buffer masked with mask.
   Decoding stops at an entry whose e field is at most 32.  An entry flagged
   with INVALID_CODE makes the enclosing function return 1.  Any other entry
   leads to a sub-table through t->v.t, indexed with mask_bits[e].  The macro
   uses the b, k, t and e variables of the enclosing function.
   NOTE(review): several continuation lines of this macro are not present in
   this copy. */
205 #define DECODEHUFT(htab, bits, mask) {\
206 NEEDBITS((unsigned)(bits))\
207 t = (htab) + ((~(unsigned)b)&(mask));\
210 if ((e=t->e) <= 32) break;\
211 if (IS_INVALID_CODE(e)) return 1;\
214 t = t->v.t + ((~(unsigned)b)&mask_bits[e]);\
219 static int get_tree(__G__ l
, n
)
221 unsigned *l
; /* bit lengths */
222 unsigned n
; /* number expected */
223 /* Get the bit lengths for a code representation from the compressed
224 stream. If get_tree() returns 4, then there is an error in the data.
225 Otherwise zero is returned. */
/* Parameters:
     l  array that receives one bit length per code value
     n  number of code values the caller expects to be described
   The description is a run-length list.  A leading byte gives the number of
   list bytes minus one.  Each list byte holds a bit length in its low four
   bits and a repeat count in its high four bits, both stored minus one. */
227 unsigned i
; /* bytes remaining in list */
228 unsigned k
; /* lengths entered */
229 unsigned j
; /* number of codes */
230 unsigned b
; /* bit length for those codes */
233 /* get bit lengths */
234 i
= NEXTBYTE
+ 1; /* length/count pairs to read */
235 k
= 0; /* next code */
/* j first holds the raw list byte, then is reduced to the repeat count */
237 b
= ((j
= NEXTBYTE
) & 0xf) + 1; /* bits in code (1..16) */
238 j
= ((j
& 0xf0) >> 4) + 1; /* codes with those bits (1..16) */
/* a run that would go past the n expected entries is reported as bad data */
240 return 4; /* don't overflow l[] */
/* a list that describes fewer than n codes is bad data as well */
245 return k
!= n
? 4 : 0; /* should have read n of them */
250 static int explode_lit(__G__ tb
, tl
, td
, bb
, bl
, bd
, bdl
)
252 struct huft
*tb
, *tl
, *td
; /* literal, length, and distance tables */
253 int bb
, bl
, bd
; /* number of bits decoded by those */
254 unsigned bdl
; /* number of distance low bits */
255 /* Decompress the imploded data using coded literals and a sliding
256 window (of size 2^(6+bdl) bytes). */
/* Parameters:
     tb, tl, td  decoding tables for literals, match lengths and the coded
                 high part of the distance
     bb, bl, bd  bit counts used to build the first-level index masks
     bdl         number of low distance bits stored uncoded in the stream
   Output bytes are written to redirSlide[] and handed to flush().  A
   non-zero flush() result is captured in retval.  DECODEHUFT makes this
   function return 1 when it meets an invalid code. */
258 ulg s
; /* bytes to decompress */
259 register unsigned e
; /* table entry flag/number of extra bits */
260 unsigned n
, d
; /* length and index for copy */
261 unsigned w
; /* current window position */
262 struct huft
*t
; /* pointer to table entry */
263 unsigned mb
, ml
, md
; /* masks for bb, bl, and bd bits */
264 unsigned mdl
; /* mask for bdl (distance lower) bits */
265 register ulg b
; /* bit buffer */
266 register unsigned k
; /* number of bits in bit buffer */
267 unsigned u
; /* true if unflushed */
268 int retval
= 0; /* error code returned: initialized to "no error" */
271 /* explode the coded data */
272 b
= k
= w
= 0; /* initialize bit buffer, window */
273 u
= 1; /* buffer unflushed */
274 mb
= mask_bits
[bb
]; /* precompute masks for speed */
277 mdl
= mask_bits
[bdl
];
279 while (s
> 0) /* do until ucsize bytes uncompressed */
282 if (b
& 1) /* then literal--decode it */
286 DECODEHUFT(tb
, bb
, mb
) /* get coded literal */
287 redirSlide
[w
++] = (uch
)t
->v
.n
;
290 if ((retval
= flush(__G__ redirSlide
, (ulg
)w
, 0)) != 0)
295 else /* else distance/length */
298 NEEDBITS(bdl
) /* get distance low bits */
299 d
= (unsigned)b
& mdl
;
301 DECODEHUFT(td
, bd
, md
) /* get coded distance high bits */
/* unsigned arithmetic: the result may wrap and is reduced modulo the window
   size further down, except when the slide is redirected */
302 d
= w
- d
- t
->v
.n
; /* construct offset */
303 DECODEHUFT(tl
, bl
, ml
) /* get coded length */
305 if (e
) /* get length extra bits */
/* one further uncoded byte is added to the coded length */
308 n
+= (unsigned)b
& 0xff;
/* charge the match against the remaining byte count, clamping at zero so
   the unsigned counter cannot wrap */
313 s
= (s
> (ulg
)n
? s
- (ulg
)n
: 0);
315 #if (defined(DLL) && !defined(NO_SLIDE_REDIR))
316 if (G
.redirect_slide
) {
317 /* &= w/ wszimpl not needed and wrong if redirect */
/* e = bytes that can be handled before the larger of the two indices
   reaches the end of the window */
320 e
= wszimpl
- (d
> w
? d
: w
);
/* same computation, with d first wrapped into the window */
323 e
= wszimpl
- ((d
&= wszimpl
-1) > w
? d
: w
);
/* zero-fill e bytes at the write position instead of copying */
328 memzero(redirSlide
+ w
, e
);
/* the indices are at least e apart, so this chunk can be block-copied */
334 if (w
- d
>= e
) /* (this test assumes unsigned comparison) */
336 memcpy(redirSlide
+ w
, redirSlide
+ d
, e
);
340 else /* do it slow to avoid memcpy() overlap */
341 #endif /* !NOMEMCPY */
343 redirSlide
[w
++] = redirSlide
[d
++];
347 if ((retval
= flush(__G__ redirSlide
, (ulg
)w
, 0)) != 0)
355 /* flush out redirSlide */
356 if ((retval
= flush(__G__ redirSlide
, (ulg
)w
, 0)) != 0)
358 if (G
.csize
+ G
.incnt
+ (k
>> 3)) /* should have read csize bytes, but */
359 { /* sometimes read one too many: k>>3 compensates */
/* record G.lrec.csize minus what the input counters still hold (G.csize,
   G.incnt and the whole bytes left in the bit buffer) */
360 G
.used_csize
= G
.lrec
.csize
- G
.csize
- G
.incnt
- (k
>> 3);
368 static int explode_nolit(__G__ tl
, td
, bl
, bd
, bdl
)
370 struct huft
*tl
, *td
; /* length and distance decoder tables */
371 int bl
, bd
; /* number of bits decoded by tl[] and td[] */
372 unsigned bdl
; /* number of distance low bits */
373 /* Decompress the imploded data using uncoded literals and a sliding
374 window (of size 2^(6+bdl) bytes). */
/* Parameters:
     tl, td  decoding tables for match lengths and the coded high part of
             the distance
     bl, bd  bit counts used to build the first-level index masks
     bdl     number of low distance bits stored uncoded in the stream
   Literals are taken straight from the bit buffer.  Output bytes are
   written to redirSlide[] and handed to flush().  A non-zero flush() result
   is captured in retval.  DECODEHUFT makes this function return 1 when it
   meets an invalid code. */
376 ulg s
; /* bytes to decompress */
377 register unsigned e
; /* table entry flag/number of extra bits */
378 unsigned n
, d
; /* length and index for copy */
379 unsigned w
; /* current window position */
380 struct huft
*t
; /* pointer to table entry */
381 unsigned ml
, md
; /* masks for bl and bd bits */
382 unsigned mdl
; /* mask for bdl (distance lower) bits */
383 register ulg b
; /* bit buffer */
384 register unsigned k
; /* number of bits in bit buffer */
385 unsigned u
; /* true if unflushed */
386 int retval
= 0; /* error code returned: initialized to "no error" */
389 /* explode the coded data */
390 b
= k
= w
= 0; /* initialize bit buffer, window */
391 u
= 1; /* buffer unflushed */
392 ml
= mask_bits
[bl
]; /* precompute masks for speed */
394 mdl
= mask_bits
[bdl
];
396 while (s
> 0) /* do until ucsize bytes uncompressed */
399 if (b
& 1) /* then literal--get eight bits */
/* the low byte of the bit buffer is the literal itself */
404 redirSlide
[w
++] = (uch
)b
;
407 if ((retval
= flush(__G__ redirSlide
, (ulg
)w
, 0)) != 0)
413 else /* else distance/length */
416 NEEDBITS(bdl
) /* get distance low bits */
417 d
= (unsigned)b
& mdl
;
419 DECODEHUFT(td
, bd
, md
) /* get coded distance high bits */
/* unsigned arithmetic: the result may wrap and is reduced modulo the window
   size further down, except when the slide is redirected */
420 d
= w
- d
- t
->v
.n
; /* construct offset */
421 DECODEHUFT(tl
, bl
, ml
) /* get coded length */
423 if (e
) /* get length extra bits */
/* one further uncoded byte is added to the coded length */
426 n
+= (unsigned)b
& 0xff;
/* charge the match against the remaining byte count, clamping at zero so
   the unsigned counter cannot wrap */
431 s
= (s
> (ulg
)n
? s
- (ulg
)n
: 0);
433 #if (defined(DLL) && !defined(NO_SLIDE_REDIR))
434 if (G
.redirect_slide
) {
435 /* &= w/ wszimpl not needed and wrong if redirect */
/* e = bytes that can be handled before the larger of the two indices
   reaches the end of the window */
438 e
= wszimpl
- (d
> w
? d
: w
);
/* same computation, with d first wrapped into the window */
441 e
= wszimpl
- ((d
&= wszimpl
-1) > w
? d
: w
);
/* zero-fill e bytes at the write position instead of copying */
446 memzero(redirSlide
+ w
, e
);
/* the indices are at least e apart, so this chunk can be block-copied */
452 if (w
- d
>= e
) /* (this test assumes unsigned comparison) */
454 memcpy(redirSlide
+ w
, redirSlide
+ d
, e
);
458 else /* do it slow to avoid memcpy() overlap */
459 #endif /* !NOMEMCPY */
461 redirSlide
[w
++] = redirSlide
[d
++];
465 if ((retval
= flush(__G__ redirSlide
, (ulg
)w
, 0)) != 0)
473 /* flush out redirSlide */
474 if ((retval
= flush(__G__ redirSlide
, (ulg
)w
, 0)) != 0)
476 if (G
.csize
+ G
.incnt
+ (k
>> 3)) /* should have read csize bytes, but */
477 { /* sometimes read one too many: k>>3 compensates */
/* record G.lrec.csize minus what the input counters still hold (G.csize,
   G.incnt and the whole bytes left in the bit buffer) */
478 G
.used_csize
= G
.lrec
.csize
- G
.csize
- G
.incnt
- (k
>> 3);
488 /* Explode an imploded compressed stream. Based on the general purpose
489 bit flag, decide on coded or uncoded literals, and an 8K or 4K sliding
490 window. Construct the literal (if any), length, and distance codes and
491 the tables needed to decode them (using huft_build() from inflate.c),
492 and call the appropriate routine for the type of data in the remainder
493 of the stream. The four routines are nearly identical, differing only
494 in whether the literal is decoded or simply read in, and in how many
495 bits are read in, uncoded, for the low distance bits. */
/* Bits of G.lrec.general_purpose_bit_flag examined below: value 4 selects
   the variant with a literal tree, value 2 selects the 8K window.  Results
   of get_tree(), huft_build() and the two decoders are collected in r. */
497 unsigned r
; /* return codes */
498 struct huft
*tb
; /* literal code table */
499 struct huft
*tl
; /* length code table */
500 struct huft
*td
; /* distance code table */
501 int bb
; /* bits for tb */
502 int bl
; /* bits for tl */
503 int bd
; /* bits for td */
504 unsigned bdl
; /* number of uncoded lower distance bits */
505 unsigned l
[256]; /* bit lengths for codes */
507 #if (defined(DLL) && !defined(NO_SLIDE_REDIR))
508 if (G
.redirect_slide
)
509 /* For 16-bit systems, it has already been checked at DLL entrance that
510 * the buffer size in G.redirect_size does not exceed unsigned range.
512 G
._wsize
= G
.redirect_size
, redirSlide
= G
.redirect_buffer
;
514 #if defined(USE_DEFLATE64) && defined(INT_16BIT)
515 /* For systems using 16-bit ints, reduce the used buffer size below
516 * the limit of "unsigned int" numbers range.
518 G
._wsize
= WSIZE
>>1, redirSlide
= slide
;
519 #else /* !(USE_DEFLATE64 && INT_16BIT) */
520 G
._wsize
= WSIZE
, redirSlide
= slide
;
521 #endif /* !(USE_DEFLATE64 && INT_16BIT) */
522 #endif /* DLL && !NO_SLIDE_REDIR */
524 /* Tune base table sizes. Note: I thought that to truly optimize speed,
525 I would have to select different bl, bd, and bb values for different
526 compressed file sizes. I was surprised to find out that the values of
527 7, 7, and 9 worked best over a very wide range of sizes, except that
528 bd = 8 worked marginally better for large compressed sizes. */
530 bd
= (G
.csize
+ G
.incnt
) > 200000L ? 8 : 7;
/* G.hufts is reset here and reported through Trace() after decoding */
533 G
.hufts
= 0; /* initialize huft's malloc'ed */
536 if (G
.lrec
.general_purpose_bit_flag
& 4)
537 /* With literal tree--minimum match length is 3 */
539 bb
= 9; /* base table size for literals */
540 if ((r
= get_tree(__G__ l
, 256)) != 0)
542 if ((r
= huft_build(__G__ l
, 256, 256, NULL
, NULL
, &tb
, &bb
)) != 0)
548 if ((r
= get_tree(__G__ l
, 64)) != 0) {
552 if ((r
= huft_build(__G__ l
, 64, 0, cplen3
, extra
, &tl
, &bl
)) != 0)
561 /* No literal tree--minimum match length is 2 */
563 tb
= (struct huft
*)NULL
;
564 if ((r
= get_tree(__G__ l
, 64)) != 0)
566 if ((r
= huft_build(__G__ l
, 64, 0, cplen2
, extra
, &tl
, &bl
)) != 0)
/* distance code lengths come next; if reading them fails, a literal table
   that was built earlier is released */
574 if ((r
= get_tree(__G__ l
, 64)) != 0) {
576 if (tb
!= (struct huft
*)NULL
) huft_free(tb
);
/* the distance base table depends on the window size flag */
579 if (G
.lrec
.general_purpose_bit_flag
& 2) /* true if 8K */
582 r
= huft_build(__G__ l
, 64, 0, cpdist8
, extra
, &td
, &bd
);
587 r
= huft_build(__G__ l
, 64, 0, cpdist4
, extra
, &td
, &bd
);
594 if (tb
!= (struct huft
*)NULL
) huft_free(tb
);
599 r
= explode_lit(__G__ tb
, tl
, td
, bb
, bl
, bd
, bdl
);
602 r
= explode_nolit(__G__ tl
, td
, bl
, bd
, bdl
);
607 Trace((stderr
, "<%u > ", G
.hufts
));
611 /* so explode.c and inflate.c can be compiled together into one object: */