1 // This file is part of Deark.
2 // Copyright (C) 2019-2022 Jason Summers
3 // See the file COPYING for terms of use.
7 #define DE_NOT_IN_MODULE
8 #include "deark-private.h"
9 #include "deark-fmtutil.h"
11 #define DELZW_CODE u32 // int type used in most cases
12 #define DELZW_CODE_MINRANGE u16 // int type used for parents in table entries
13 #define DELZW_MINMINCODESIZE 3
14 #define DELZW_MAXMAXCODESIZE 16
15 #define DELZW_NBITS_TO_MAXCODE(n) ((DELZW_CODE)((1<<(n))-1))
16 #define DELZW_NBITS_TO_NCODES(n) ((DELZW_CODE)(1<<(n)))
18 struct delzwctx_struct
;
19 typedef struct delzwctx_struct delzwctx
;
21 struct delzw_tableentry
{
22 DELZW_CODE_MINRANGE parent
;
24 #define DELZW_CODETYPE_INVALID 0x00
25 #define DELZW_CODETYPE_STATIC 0x01
26 #define DELZW_CODETYPE_DYN_UNUSED 0x02
27 #define DELZW_CODETYPE_DYN_USED 0x03
28 #define DELZW_CODETYPE_CLEAR 0x08
29 #define DELZW_CODETYPE_STOP 0x09
30 #define DELZW_CODETYPE_INC_CDSZ 0x0a
31 #define DELZW_CODETYPE_SPECIAL 0x0f
36 struct delzw_tableentry2
{
37 #define DELZW_NEXTPTR_NONE 0xffff // Note - This table is only used with 12-bit codes
38 DELZW_CODE_MINRANGE next
;
41 struct delzwctx_struct
{
43 struct de_dfilter_ctx
*dfctx
;
45 enum de_lzwfmt_enum fmt
;
47 struct de_lzw_params delzwp_copy
;
49 #define DELZW_HEADERTYPE_NONE 0
50 #define DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE 1
51 #define DELZW_HEADERTYPE_ARC1BYTE 2
54 u8 stop_on_invalid_code
; // Invalid code means "stop", not a fatal error
57 i64 output_expected_len
;
60 u8 unixcompress_has_clear_code
;
66 u8 early_codesize_inc
;
67 u8 has_partial_clearing
;
71 u8 header_unixcompress_mode
;
72 u8 header_unixcompress_max_codesize
;
73 u8 header_unixcompress_block_mode
; // = 1 or 0
76 #define DELZW_ERRCODE_OK 0
77 #define DELZW_ERRCODE_GENERIC_ERROR 1
78 #define DELZW_ERRCODE_BAD_CDATA 2
79 #define DELZW_ERRCODE_MALLOC_FAILED 3
80 #define DELZW_ERRCODE_WRITE_FAILED 7
81 #define DELZW_ERRCODE_INSUFFICIENT_CDATA 8
82 #define DELZW_ERRCODE_UNSUPPORTED_OPTION 9
83 #define DELZW_ERRCODE_INTERNAL_ERROR 10
86 #define DELZW_STATE_INIT 0
87 #define DELZW_STATE_READING_HEADER 1
88 #define DELZW_STATE_READING_CODES 2
89 #define DELZW_STATE_FINISHED 3
91 i64 total_nbytes_processed
;
92 i64 uncmpr_nbytes_written
;
94 i64 ncodes_in_this_bitgroup
;
95 i64 nbytes_left_to_skip
;
102 DELZW_CODE last_code_added
;
104 DELZW_CODE highest_code_ever_used
;
105 DELZW_CODE free_code_search_start
;
106 DELZW_CODE first_dynamic_code
;
107 u8 special_code_is_pending
;
109 struct de_bitbuf_lowlevel bbll
;
111 DELZW_CODE ct_capacity
;
112 DELZW_CODE ct_code_count
; // Note - Not always maintained if not needed
113 struct delzw_tableentry
*ct
;
114 struct delzw_tableentry2
*ct2
;
118 size_t valbuf_capacity
;
124 static void delzw_debugmsg(delzwctx
*dc
, int level
, const char *fmt
, ...)
125 de_gnuc_attribute ((format (printf
, 3, 4)));
127 static void delzw_debugmsg(delzwctx
*dc
, int level
, const char *fmt
, ...)
132 if(level
>dc
->debug_level
) return;
135 de_vsnprintf(msg
, sizeof(msg
), fmt
, ap
);
138 de_dbg(dc
->c
, "[delzw:i%"I64_FMT
"/o%"I64_FMT
"] %s",
139 (i64
)dc
->total_nbytes_processed
, (i64
)dc
->uncmpr_nbytes_written
, msg
);
142 static void delzw_dumptable(delzwctx
*dc
)
145 for(k
=0; k
<dc
->highest_code_ever_used
; k
++) {
146 delzw_debugmsg(dc
, 4, "[%d] ty=%d p=%d v=%d f=%d",
147 (int)k
, (int)dc
->ct
[k
].codetype
, (int)dc
->ct
[k
].parent
,
148 (int)dc
->ct
[k
].value
, (int)dc
->ct
[k
].flags
);
152 static void delzw_stop(delzwctx
*dc
, const char *reason
)
154 if(dc
->state
== DELZW_STATE_FINISHED
) return;
155 delzw_debugmsg(dc
, 2, "stopping due to %s", reason
);
156 dc
->state
= DELZW_STATE_FINISHED
;
159 static void delzw_set_errorf(delzwctx
*dc
, int errcode
, const char *fmt
, ...)
160 de_gnuc_attribute ((format (printf
, 3, 4)));
162 static void delzw_set_errorf(delzwctx
*dc
, int errcode
, const char *fmt
, ...)
166 delzw_stop(dc
, "error");
167 if(dc
->errcode
) return;
168 dc
->errcode
= errcode
;
170 de_vsnprintf(dc
->errmsg
, sizeof(dc
->errmsg
), fmt
, ap
);
174 static void delzw_set_error(delzwctx
*dc
, int errcode
, const char *msg
)
176 delzw_stop(dc
, "error");
177 if(dc
->errcode
) return;
178 dc
->errcode
= errcode
;
179 if(!msg
|| !msg
[0]) {
180 msg
= "LZW decompression error";
182 de_strlcpy(dc
->errmsg
, msg
, sizeof(dc
->errmsg
));
185 static void delzw_write(delzwctx
*dc
, const u8
*buf
, size_t n1
)
189 if(dc
->errcode
) return;
192 if(dc
->output_len_known
) {
193 if(dc
->uncmpr_nbytes_written
+ n
> dc
->output_expected_len
) {
194 n
= dc
->output_expected_len
- dc
->uncmpr_nbytes_written
;
199 dbuf_writebyte(dc
->dfctx
->dcmpro
->f
, buf
[0]);
202 dbuf_write(dc
->dfctx
->dcmpro
->f
, buf
, n
);
204 dc
->uncmpr_nbytes_written
+= n
;
207 static void delzw_process_unixcompress_3byteheader(delzwctx
*dc
)
209 if(dc
->header_buf
[0]!=0x1f || dc
->header_buf
[1]!=0x9d) {
210 delzw_set_error(dc
, DELZW_ERRCODE_BAD_CDATA
, "Not in compress format");
214 dc
->header_unixcompress_mode
= dc
->header_buf
[2];
215 dc
->header_unixcompress_max_codesize
= (dc
->header_unixcompress_mode
& 0x1f);
216 dc
->header_unixcompress_block_mode
= (dc
->header_unixcompress_mode
& 0x80) ? 1 : 0;
217 delzw_debugmsg(dc
, 2, "LZW mode=0x%02x, maxbits=%u, blockmode=%u",
218 (UI
)dc
->header_unixcompress_mode
,
219 (UI
)dc
->header_unixcompress_max_codesize
,
220 (UI
)dc
->header_unixcompress_block_mode
);
222 dc
->max_codesize
= (UI
)dc
->header_unixcompress_max_codesize
;
223 dc
->unixcompress_has_clear_code
= dc
->header_unixcompress_block_mode
;
226 static void delzw_process_arc_1byteheader(delzwctx
*dc
)
228 dc
->header_unixcompress_max_codesize
= (dc
->header_buf
[0] & 0x1f);
229 dc
->max_codesize
= (UI
)dc
->header_unixcompress_max_codesize
;
230 delzw_debugmsg(dc
, 2, "max code size: %u", dc
->max_codesize
);
233 // Is this a valid code with a value (a static, or in-use dynamic code)?
234 static int delzw_code_is_in_table(delzwctx
*dc
, DELZW_CODE code
)
236 u8 codetype
= dc
->ct
[code
].codetype
;
238 if(codetype
==DELZW_CODETYPE_STATIC
) return 1;
239 if(codetype
==DELZW_CODETYPE_DYN_USED
) return 1;
243 // Decode an LZW code to one or more values, and write the values.
244 // Updates dc->last_value.
245 static void delzw_emit_code(delzwctx
*dc
, DELZW_CODE code1
)
247 DELZW_CODE code
= code1
;
248 size_t valbuf_pos
= dc
->valbuf_capacity
; // = First entry that's used
251 if(code
>= dc
->ct_capacity
) {
252 delzw_set_errorf(dc
, DELZW_ERRCODE_GENERIC_ERROR
, "Bad LZW code (%d)", (int)code
);
257 // We must be in an infinite loop (probably an internal error).
258 delzw_set_error(dc
, DELZW_ERRCODE_GENERIC_ERROR
, NULL
);
259 if(dc
->debug_level
>=4) {
265 // valbuf is a stack, essentially. We fill it in the reverse direction,
266 // to make it simpler to write the final byte sequence.
269 if(dc
->ct
[code
].codetype
==DELZW_CODETYPE_DYN_UNUSED
) {
270 dc
->valbuf
[valbuf_pos
] = dc
->last_value
;
275 dc
->valbuf
[valbuf_pos
] = dc
->ct
[code
].value
;
277 if(dc
->ct
[code
].codetype
==DELZW_CODETYPE_STATIC
) {
278 dc
->last_value
= dc
->ct
[code
].value
;
282 // Traverse the tree, back toward the root codes.
283 code
= dc
->ct
[code
].parent
;
286 // Write out the collected values.
287 delzw_write(dc
, &dc
->valbuf
[valbuf_pos
], dc
->valbuf_capacity
- valbuf_pos
);
290 static void delzw_find_first_free_entry(delzwctx
*dc
, DELZW_CODE
*pentry
)
294 for(k
=dc
->free_code_search_start
; k
<dc
->ct_capacity
; k
++) {
295 if(dc
->ct
[k
].codetype
==DELZW_CODETYPE_DYN_UNUSED
) {
296 *pentry
= (DELZW_CODE
)k
;
301 *pentry
= (DELZW_CODE
)(dc
->ct_capacity
-1);
302 delzw_set_error(dc
, DELZW_ERRCODE_BAD_CDATA
, "LZW table unexpectedly full");
305 static void delzw_unixcompress_end_bitgroup(delzwctx
*dc
)
308 i64 nbits_left_to_skip
;
310 // The Unix 'compress' format has a quirk.
311 // The codes are written 8 at a time, with all 8 having the same codesize.
312 // The codesize cannot change in the middle of a block of 8. If it needs to,
313 // the remainder of the block is unused padding, which we must skip over.
314 // This is relevant when we encounter a clear code. It is also potentially
315 // relevant when the codesize is auto-incremented. But except possibly for
316 // the first group of codes (the 9-bit codes), the natural number of codes of
317 // a given size is always (?) a power of 2, and a multiple of 8. So, usually
318 // no padding is present at the auto-increment position.
319 // As it happens, when code 256 is used as the clear code, it reduces the
320 // natural number of 9-bit codes from 257 to 256, and since 256 is a multiple
321 // of 8, still no padding is present.
322 // But "v2" format does not use a clear code, and AFAICT it does have padding
323 // after the 9-bit codes.
325 ncodes_alloc
= ((dc
->ncodes_in_this_bitgroup
+ 7)/8)*8;
326 nbits_left_to_skip
= (ncodes_alloc
- dc
->ncodes_in_this_bitgroup
) * dc
->curr_codesize
;
329 // Each "bitgroup" has a whole number of bytes.
330 // When we get here, we've just read a code, so the bitreader's buffer can have no more than
332 // All of the bits in it will be part of the "bits to skip". After accounting for them, we'll
333 // be left with a whole number of *bytes* left to skip, which always start on a byte boundary
334 // in the input stream.
335 // So, whenever the main input loop needs to skip anything, it will be a whole byte, and the
336 // bitreader's buffer will be empty. That's good; it makes it easier to deal with this
339 if(nbits_left_to_skip
>0) {
340 delzw_debugmsg(dc
, 2, "padding bits: %d", (int)nbits_left_to_skip
);
343 dc
->ncodes_in_this_bitgroup
= 0;
344 if(dc
->bbll
.nbits_in_bitbuf
>7 || dc
->bbll
.nbits_in_bitbuf
>nbits_left_to_skip
) {
345 delzw_set_error(dc
, DELZW_ERRCODE_INTERNAL_ERROR
, NULL
);
349 nbits_left_to_skip
-= dc
->bbll
.nbits_in_bitbuf
;
350 if(nbits_left_to_skip
%8 != 0) {
351 delzw_set_error(dc
, DELZW_ERRCODE_INTERNAL_ERROR
, NULL
);
355 de_bitbuf_lowlevel_empty(&dc
->bbll
);
356 dc
->nbytes_left_to_skip
= nbits_left_to_skip
/8;
359 static void bbll_skip_to_byte_boundary(struct de_bitbuf_lowlevel
*bbll
)
363 n
= bbll
->nbits_in_bitbuf
% 8;
365 (void)de_bitbuf_lowlevel_get_bits(bbll
, n
);
369 static void delzw_increase_codesize(delzwctx
*dc
)
371 if(dc
->fmt
==DE_LZWFMT_UNIXCOMPRESS
) {
372 delzw_unixcompress_end_bitgroup(dc
);
374 else if(dc
->fmt
==DE_LZWFMT_ASC2COM
) {
375 bbll_skip_to_byte_boundary(&dc
->bbll
);
378 if(dc
->curr_codesize
<dc
->max_codesize
) {
380 delzw_debugmsg(dc
, 2, "increased code size to %u", dc
->curr_codesize
);
384 static DELZW_CODE
delzw_get_hashed_code(delzwctx
*dc
, DELZW_CODE code
,
391 h
= ((code
+(DELZW_CODE
)value
) | 0x0800) & 0xffff;
392 h
= ((h
*h
) >> 6) % dc
->ct_capacity
;
394 if(dc
->ct
[h
].codetype
==DELZW_CODETYPE_DYN_UNUSED
) {
398 // Collision - First, walk to the end of the duplicates list
400 while(dc
->ct2
[h
].next
!= DELZW_NEXTPTR_NONE
) {
404 if(count
> dc
->ct_capacity
) {
405 delzw_set_error(dc
, DELZW_ERRCODE_GENERIC_ERROR
, NULL
);
412 // Then search for an open slot
419 h
%= dc
->ct_capacity
;
421 if(dc
->ct
[h
].codetype
==DELZW_CODETYPE_DYN_UNUSED
)
425 if(count
> dc
->ct_capacity
) {
426 delzw_set_error(dc
, DELZW_ERRCODE_GENERIC_ERROR
, NULL
);
431 dc
->ct2
[saved_h
].next
= h
;
435 static void delzw_hashed_add_code_to_dict(delzwctx
*dc
, DELZW_CODE code
, u8 value
)
439 if(dc
->ct_code_count
>= dc
->ct_capacity
) {
443 idx
= delzw_get_hashed_code(dc
, code
, value
);
444 if(dc
->errcode
) return;
446 dc
->ct
[idx
].parent
= (DELZW_CODE_MINRANGE
)dc
->oldcode
;
447 dc
->ct
[idx
].value
= value
;
448 dc
->ct
[idx
].codetype
= DELZW_CODETYPE_DYN_USED
;
450 dc
->last_code_added
= idx
;
453 static void delzw_hashed_add_root_code_to_dict(delzwctx
*dc
, u8 value
)
457 idx
= delzw_get_hashed_code(dc
, 0xffff, value
);
458 if(dc
->errcode
) return;
460 dc
->ct
[idx
].value
= value
;
461 dc
->ct
[idx
].codetype
= DELZW_CODETYPE_STATIC
;
465 // Add a code to the dictionary.
466 // Sets delzw->last_code_added to the position where it was added.
467 static void delzw_add_to_dict(delzwctx
*dc
, DELZW_CODE parent
, u8 value
)
472 delzw_hashed_add_code_to_dict(dc
, parent
, value
);
476 if(dc
->fmt
==DE_LZWFMT_ZIPSHRINK
) {
477 delzw_find_first_free_entry(dc
, &newpos
);
480 newpos
= dc
->free_code_search_start
;
482 if(dc
->errcode
) return;
483 if(newpos
>= dc
->ct_capacity
) {
487 if(newpos
< dc
->first_dynamic_code
) {
488 delzw_set_error(dc
, DELZW_ERRCODE_GENERIC_ERROR
, NULL
);
492 dc
->ct
[newpos
].parent
= (DELZW_CODE_MINRANGE
)parent
;
493 dc
->ct
[newpos
].value
= value
;
494 dc
->ct
[newpos
].codetype
= DELZW_CODETYPE_DYN_USED
;
496 dc
->last_code_added
= newpos
;
497 dc
->free_code_search_start
= newpos
+1;
498 if(newpos
> dc
->highest_code_ever_used
) {
499 dc
->highest_code_ever_used
= newpos
;
502 if(dc
->auto_inc_codesize
) {
503 if(dc
->early_codesize_inc
) {
504 if(dc
->free_code_search_start
>=DELZW_NBITS_TO_MAXCODE(dc
->curr_codesize
)) {
505 delzw_increase_codesize(dc
);
509 if(dc
->free_code_search_start
>DELZW_NBITS_TO_MAXCODE(dc
->curr_codesize
)) {
510 delzw_increase_codesize(dc
);
516 static void delzw_process_data_code(delzwctx
*dc
, DELZW_CODE code
)
518 if(code
>= dc
->ct_capacity
) {
522 if(!dc
->have_oldcode
) {
523 // Special case for the first code.
524 delzw_emit_code(dc
, code
);
526 dc
->have_oldcode
= 1;
527 dc
->last_value
= dc
->ct
[code
].value
;
531 if(delzw_code_is_in_table(dc
, code
)) {
532 delzw_emit_code(dc
, code
);
533 if(dc
->errcode
) return;
535 // Let k = the first character of the translation of the code.
536 // Add <oldcode>k to the dictionary.
537 delzw_add_to_dict(dc
, dc
->oldcode
, dc
->last_value
);
540 if(code
>dc
->free_code_search_start
&& !dc
->has_partial_clearing
&& !dc
->is_hashed
) {
541 if(dc
->stop_on_invalid_code
) {
542 delzw_debugmsg(dc
, 1, "bad code: %d when max=%d (assuming data stops here)",
543 (int)code
, (int)dc
->free_code_search_start
);
544 delzw_stop(dc
, "bad LZW code");
547 delzw_set_errorf(dc
, DELZW_ERRCODE_BAD_CDATA
, "Bad LZW code (%d when max=%d)",
548 (int)code
, (int)dc
->free_code_search_start
);
552 // Let k = the first char of the translation of oldcode.
553 // Add <oldcode>k to the dictionary.
554 delzw_add_to_dict(dc
, dc
->oldcode
, dc
->last_value
);
555 if(dc
->errcode
) return;
557 // Write <oldcode>k to the output stream.
558 delzw_emit_code(dc
, dc
->last_code_added
);
564 static void delzw_clear_one_dynamic_code(delzwctx
*dc
, DELZW_CODE code
)
566 if(code
<dc
->first_dynamic_code
|| code
>=dc
->ct_capacity
) return;
567 dc
->ct
[code
].codetype
= DELZW_CODETYPE_DYN_UNUSED
;
568 dc
->ct
[code
].parent
= 0;
569 dc
->ct
[code
].value
= 0;
572 static void delzw_clear(delzwctx
*dc
)
576 delzw_debugmsg(dc
, 2, "clear code");
578 if(dc
->fmt
==DE_LZWFMT_UNIXCOMPRESS
) {
579 delzw_unixcompress_end_bitgroup(dc
);
582 for(i
=dc
->first_dynamic_code
; i
<=dc
->highest_code_ever_used
; i
++) {
583 delzw_clear_one_dynamic_code(dc
, i
);
586 dc
->curr_codesize
= dc
->min_codesize
;
587 dc
->free_code_search_start
= dc
->first_dynamic_code
;
588 dc
->have_oldcode
= 0;
590 dc
->last_code_added
= 0;
593 delzw_debugmsg(dc
, 2, "code size: %u", dc
->curr_codesize
);
596 static void delzw_partial_clear(delzwctx
*dc
)
600 delzw_debugmsg(dc
, 2, "partial clear code");
602 for(i
=dc
->first_dynamic_code
; i
<=dc
->highest_code_ever_used
; i
++) {
603 // If this code is in use
604 if(dc
->ct
[i
].codetype
==DELZW_CODETYPE_DYN_USED
) {
605 // and its parent is a dynamic code,
606 // mark its parent as having a child
607 if(dc
->ct
[i
].parent
>=257) {
608 dc
->ct
[dc
->ct
[i
].parent
].flags
= 1;
613 for(i
=dc
->first_dynamic_code
; i
<=dc
->highest_code_ever_used
; i
++) {
614 if(dc
->ct
[i
].flags
==0) {
615 // If this code has no children, clear it
616 delzw_clear_one_dynamic_code(dc
, i
);
619 // Leave all flags clear, for next time
624 dc
->free_code_search_start
= dc
->first_dynamic_code
;
627 static void delzw_process_code(delzwctx
*dc
, DELZW_CODE code
)
629 if(dc
->debug_level
>=3) {
630 delzw_debugmsg(dc
, 3, "%scode=%d oc=%d lca=%d lv=%d next=%d",
631 (dc
->special_code_is_pending
?"special":""),
633 (int)dc
->oldcode
, (int)dc
->last_code_added
, (int)dc
->last_value
,
634 (int)dc
->free_code_search_start
);
637 if(dc
->special_code_is_pending
) {
638 dc
->special_code_is_pending
= 0;
639 if(dc
->fmt
==DE_LZWFMT_ZIPSHRINK
) {
640 if(code
==1 && (dc
->curr_codesize
<dc
->max_codesize
)) {
641 delzw_increase_codesize(dc
);
644 delzw_partial_clear(dc
);
647 delzw_set_error(dc
, DELZW_ERRCODE_BAD_CDATA
, NULL
);
650 else if(dc
->fmt
==DE_LZWFMT_DWC
) {
651 if(code
==0) { // no-op?
655 // TODO: Find out what DWC special codes do
656 delzw_set_errorf(dc
, DELZW_ERRCODE_UNSUPPORTED_OPTION
,
657 "Unsupported special code: %u", (UI
)code
);
663 if(code
>= dc
->ct_capacity
) return;
665 switch(dc
->ct
[code
].codetype
) {
666 case DELZW_CODETYPE_STATIC
:
667 case DELZW_CODETYPE_DYN_UNUSED
:
668 case DELZW_CODETYPE_DYN_USED
:
669 delzw_process_data_code(dc
, code
);
671 case DELZW_CODETYPE_CLEAR
:
674 case DELZW_CODETYPE_STOP
:
675 delzw_stop(dc
, "stop code");
677 case DELZW_CODETYPE_INC_CDSZ
:
678 delzw_increase_codesize(dc
);
680 case DELZW_CODETYPE_SPECIAL
:
681 if(dc
->fmt
==DE_LZWFMT_ZIPSHRINK
&& code
==256) {
682 dc
->special_code_is_pending
= 1; // next code is an "escaped" code
686 delzw_set_errorf(dc
, DELZW_ERRCODE_UNSUPPORTED_OPTION
,
687 "Unsupported or invalid code (%u)", (UI
)code
);
692 if(dc
->fmt
==DE_LZWFMT_DWC
) {
693 if((dc
->code_counter
%512)==256) {
694 dc
->special_code_is_pending
= 1;
699 // Most configuration is done in delzw_on_codes_start(), not here.
700 // This function does need to do some things to handle formats that have a
702 static void delzw_on_decompression_start(delzwctx
*dc
)
704 if(dc
->fmt
!=DE_LZWFMT_ZIPSHRINK
&&
705 dc
->fmt
!=DE_LZWFMT_GIF
&&
706 dc
->fmt
!=DE_LZWFMT_UNIXCOMPRESS
&&
707 dc
->fmt
!=DE_LZWFMT_ZOOLZD
&&
708 dc
->fmt
!=DE_LZWFMT_TIFFNEW
&&
709 dc
->fmt
!=DE_LZWFMT_TIFFOLD
&&
710 dc
->fmt
!=DE_LZWFMT_ARC5
&&
711 dc
->fmt
!=DE_LZWFMT_DWC
&&
712 dc
->fmt
!=DE_LZWFMT_SHRINKIT1
&&
713 dc
->fmt
!=DE_LZWFMT_SHRINKIT2
&&
714 dc
->fmt
!=DE_LZWFMT_PAKLEO
&&
715 dc
->fmt
!=DE_LZWFMT_ASC2COM
)
717 delzw_set_error(dc
, DELZW_ERRCODE_UNSUPPORTED_OPTION
, "Unsupported LZW format");
721 if(dc
->fmt
==DE_LZWFMT_UNIXCOMPRESS
) {
722 if(dc
->delzwp_copy
.flags
& DE_LZWFLAG_HAS3BYTEHEADER
) {
723 dc
->header_type
= DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE
;
726 else if(dc
->delzwp_copy
.flags
& DE_LZWFLAG_HAS1BYTEHEADER
) {
727 dc
->unixcompress_has_clear_code
= 1;
728 dc
->header_type
= DELZW_HEADERTYPE_ARC1BYTE
;
732 dc
->unixcompress_has_clear_code
= 1;
733 dc
->max_codesize
= dc
->delzwp_copy
.max_code_size
;
736 if((dc
->delzwp_copy
.flags
& DE_LZWFLAG_TOLERATETRAILINGJUNK
) &&
737 !dc
->output_len_known
)
739 dc
->stop_on_invalid_code
= 1;
747 // Print dbg messages and warnings about the header
748 static void lzw_after_header_parsed(delzwctx
*dc
)
752 if(dc
->header_type
==DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE
) {
753 de_dbg(c
, "LZW mode: 0x%02x", (UI
)dc
->header_unixcompress_mode
);
755 de_dbg(c
, "maxbits: %u", (UI
)dc
->header_unixcompress_max_codesize
);
756 de_dbg(c
, "blockmode: %d", (int)dc
->header_unixcompress_block_mode
);
757 if(!dc
->header_unixcompress_block_mode
) {
758 de_warn(c
, "This file uses an obsolete compress'd format, which "
759 "might not be decompressed correctly");
761 de_dbg_indent(c
, -1);
763 else if(dc
->header_type
==DELZW_HEADERTYPE_ARC1BYTE
) {
764 de_dbg(c
, "LZW maxbits: %u", (UI
)dc
->header_unixcompress_max_codesize
);
768 static void set_std_static_codes(delzwctx
*dc
)
772 for(i
=0; i
<256; i
++) {
773 dc
->ct
[i
].codetype
= DELZW_CODETYPE_STATIC
;
774 dc
->ct
[i
].value
= (u8
)i
;
778 // Process the header, if any.
779 // Set any remaining params needed, and validate params.
780 // This is called upon encountering the first byte after the header.
781 // (If zero bytes of data were compressed, it might never be called.)
782 static void delzw_on_codes_start(delzwctx
*dc
)
785 UI default_max_codesize
= 0;
787 if(dc
->errcode
) goto done
;
789 if(dc
->header_size
> 0) {
790 delzw_debugmsg(dc
, 2, "processing header");
792 if(dc
->header_type
==DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE
) {
793 delzw_process_unixcompress_3byteheader(dc
);
795 else if(dc
->header_type
==DELZW_HEADERTYPE_ARC1BYTE
) {
796 delzw_process_arc_1byteheader(dc
);
799 lzw_after_header_parsed(dc
);
802 delzw_debugmsg(dc
, 2, "start of codes");
804 if(dc
->fmt
==DE_LZWFMT_UNIXCOMPRESS
) {
806 dc
->auto_inc_codesize
= 1;
808 else if(dc
->fmt
==DE_LZWFMT_GIF
) {
810 dc
->auto_inc_codesize
= 1;
811 dc
->min_codesize
= dc
->delzwp_copy
.gif_root_code_size
+ 1;
812 dc
->max_codesize
= 12;
814 else if(dc
->fmt
==DE_LZWFMT_ZIPSHRINK
) {
816 dc
->has_partial_clearing
= 1;
817 dc
->max_codesize
= 13;
819 else if(dc
->fmt
==DE_LZWFMT_ZOOLZD
) {
821 dc
->auto_inc_codesize
= 1;
822 default_max_codesize
= 13;
824 else if(dc
->fmt
==DE_LZWFMT_TIFFNEW
) {
825 dc
->auto_inc_codesize
= 1;
826 dc
->early_codesize_inc
= 1;
827 default_max_codesize
= 12;
829 else if(dc
->fmt
==DE_LZWFMT_TIFFOLD
) {
831 dc
->auto_inc_codesize
= 1;
832 default_max_codesize
= 12;
834 else if(dc
->fmt
==DE_LZWFMT_ARC5
) {
836 dc
->max_codesize
= 12;
837 dc
->min_codesize
= 12;
839 else if(dc
->fmt
==DE_LZWFMT_DWC
) {
840 dc
->early_codesize_inc
= 1;
841 dc
->auto_inc_codesize
= 1;
842 default_max_codesize
= 14;
844 else if(dc
->fmt
==DE_LZWFMT_SHRINKIT1
|| dc
->fmt
==DE_LZWFMT_SHRINKIT2
) {
846 default_max_codesize
= 12;
847 dc
->auto_inc_codesize
= 1;
848 dc
->early_codesize_inc
= 1;
850 else if(dc
->fmt
==DE_LZWFMT_PAKLEO
) {
851 default_max_codesize
= 14;
853 else if(dc
->fmt
==DE_LZWFMT_ASC2COM
) {
855 default_max_codesize
= 10;
858 if(dc
->min_codesize
==0) {
859 // 9 is the usual minimum codesize for general purpose LZW compression schemes
860 // with a variable code size.
861 dc
->min_codesize
= 9;
864 if(dc
->max_codesize
==0) {
865 dc
->max_codesize
= dc
->delzwp_copy
.max_code_size
?
866 dc
->delzwp_copy
.max_code_size
: default_max_codesize
;
869 if(dc
->min_codesize
<DELZW_MINMINCODESIZE
|| dc
->min_codesize
>DELZW_MAXMAXCODESIZE
||
870 dc
->max_codesize
<DELZW_MINMINCODESIZE
|| dc
->max_codesize
>DELZW_MAXMAXCODESIZE
||
871 dc
->min_codesize
>dc
->max_codesize
)
873 delzw_set_errorf(dc
, DELZW_ERRCODE_UNSUPPORTED_OPTION
, "Unsupported code size (%u,%u)",
874 dc
->min_codesize
, dc
->max_codesize
);
878 delzw_debugmsg(dc
, 2, "code size: %u, max=%u", dc
->min_codesize
, dc
->max_codesize
);
880 dc
->curr_codesize
= dc
->min_codesize
;
882 dc
->ct_capacity
= ((DELZW_CODE
)1)<<dc
->max_codesize
;
883 dc
->ct
= de_mallocarray(dc
->c
, dc
->ct_capacity
, sizeof(struct delzw_tableentry
));
885 dc
->ct2
= de_mallocarray(dc
->c
, dc
->ct_capacity
, sizeof(struct delzw_tableentry2
));
887 dc
->valbuf_capacity
= dc
->ct_capacity
;
888 dc
->valbuf
= de_malloc(dc
->c
, dc
->valbuf_capacity
);
890 if(dc
->fmt
==DE_LZWFMT_UNIXCOMPRESS
) {
891 set_std_static_codes(dc
);
893 if(dc
->unixcompress_has_clear_code
) {
894 dc
->ct
[256].codetype
= DELZW_CODETYPE_CLEAR
;
895 dc
->first_dynamic_code
= 257;
898 dc
->first_dynamic_code
= 256;
901 else if(dc
->fmt
==DE_LZWFMT_GIF
) {
902 DELZW_CODE n
= DELZW_NBITS_TO_NCODES(dc
->delzwp_copy
.gif_root_code_size
);
905 dc
->ct
[i
].codetype
= DELZW_CODETYPE_STATIC
;
906 dc
->ct
[i
].value
= (i
<=255)?((u8
)i
):0;
908 dc
->ct
[n
].codetype
= DELZW_CODETYPE_CLEAR
;
909 dc
->ct
[n
+1].codetype
= DELZW_CODETYPE_STOP
;
910 dc
->first_dynamic_code
= n
+2;
912 else if(dc
->fmt
==DE_LZWFMT_ZIPSHRINK
) {
913 set_std_static_codes(dc
);
914 dc
->ct
[256].codetype
= DELZW_CODETYPE_SPECIAL
;
915 dc
->first_dynamic_code
= 257;
917 else if(dc
->fmt
==DE_LZWFMT_ZOOLZD
) {
918 set_std_static_codes(dc
);
919 dc
->ct
[256].codetype
= DELZW_CODETYPE_CLEAR
;
920 dc
->ct
[257].codetype
= DELZW_CODETYPE_STOP
;
921 dc
->first_dynamic_code
= 258;
923 else if(dc
->fmt
==DE_LZWFMT_TIFFNEW
|| dc
->fmt
==DE_LZWFMT_TIFFOLD
) {
924 set_std_static_codes(dc
);
925 dc
->ct
[256].codetype
= DELZW_CODETYPE_CLEAR
;
926 dc
->ct
[257].codetype
= DELZW_CODETYPE_STOP
;
927 dc
->first_dynamic_code
= 258;
929 else if(dc
->fmt
==DE_LZWFMT_DWC
) {
930 set_std_static_codes(dc
);
931 dc
->first_dynamic_code
= 256;
933 else if(dc
->fmt
==DE_LZWFMT_SHRINKIT1
) {
934 set_std_static_codes(dc
);
935 dc
->ct
[256].codetype
= DELZW_CODETYPE_INVALID
; // ??
936 dc
->first_dynamic_code
= 257;
938 else if(dc
->fmt
==DE_LZWFMT_SHRINKIT2
) {
939 set_std_static_codes(dc
);
940 dc
->ct
[256].codetype
= DELZW_CODETYPE_CLEAR
;
941 dc
->first_dynamic_code
= 257;
943 else if(dc
->fmt
==DE_LZWFMT_PAKLEO
) {
944 set_std_static_codes(dc
);
945 dc
->ct
[256].codetype
= DELZW_CODETYPE_STOP
;
946 dc
->ct
[257].codetype
= DELZW_CODETYPE_INC_CDSZ
;
947 dc
->ct
[258].codetype
= DELZW_CODETYPE_CLEAR
;
948 dc
->first_dynamic_code
= 259;
950 else if(dc
->fmt
==DE_LZWFMT_ASC2COM
) {
951 set_std_static_codes(dc
);
952 dc
->ct
[256].codetype
= DELZW_CODETYPE_INVALID
;
953 dc
->ct
[257].codetype
= DELZW_CODETYPE_INC_CDSZ
;
954 dc
->ct
[258].codetype
= DELZW_CODETYPE_STOP
;
955 dc
->first_dynamic_code
= 259;
959 for(i
=0; i
<dc
->ct_capacity
; i
++) {
960 dc
->ct2
[i
].next
= DELZW_NEXTPTR_NONE
;
964 for(i
=dc
->first_dynamic_code
; i
<dc
->ct_capacity
; i
++) {
965 dc
->ct
[i
].codetype
= DELZW_CODETYPE_DYN_UNUSED
;
967 dc
->free_code_search_start
= dc
->first_dynamic_code
;
970 if(dc
->delzwp_copy
.arc5_has_stop_code
) {
971 dc
->ct
[0].codetype
= DELZW_CODETYPE_STOP
;
975 for(i
=0; i
<256; i
++) {
976 delzw_hashed_add_root_code_to_dict(dc
, (u8
)i
);
980 dc
->bbll
.is_lsb
= dc
->is_lsb
;
981 de_bitbuf_lowlevel_empty(&dc
->bbll
);
986 static int delzw_have_enough_output(delzwctx
*dc
)
988 if(dc
->output_len_known
) {
989 if(dc
->uncmpr_nbytes_written
>= dc
->output_expected_len
) {
996 static void delzw_process_byte(delzwctx
*dc
, u8 b
)
998 if(dc
->state
==DELZW_STATE_INIT
) {
999 delzw_on_decompression_start(dc
);
1000 dc
->state
= DELZW_STATE_READING_HEADER
;
1001 if(dc
->errcode
) return;
1004 if(dc
->state
==DELZW_STATE_READING_HEADER
) {
1005 if(dc
->total_nbytes_processed
< (i64
)dc
->header_size
) {
1006 dc
->header_buf
[dc
->total_nbytes_processed
] = b
;
1010 // (This is the first byte after the header.)
1011 delzw_on_codes_start(dc
);
1012 dc
->state
= DELZW_STATE_READING_CODES
;
1013 if(dc
->errcode
) return;
1016 if(dc
->state
==DELZW_STATE_READING_CODES
) {
1017 if(dc
->nbytes_left_to_skip
>0) {
1018 dc
->nbytes_left_to_skip
--;
1022 // Add a byte's worth of bits to the pending code
1023 de_bitbuf_lowlevel_add_byte(&dc
->bbll
, b
);
1029 if(dc
->errcode
) break;
1031 if(dc
->special_code_is_pending
&& dc
->fmt
==DE_LZWFMT_DWC
) {
1035 this_codesize
= dc
->curr_codesize
;
1038 if(dc
->bbll
.nbits_in_bitbuf
< this_codesize
) {
1042 code
= (DELZW_CODE
)de_bitbuf_lowlevel_get_bits(&dc
->bbll
, this_codesize
);
1043 dc
->ncodes_in_this_bitgroup
++;
1044 delzw_process_code(dc
, code
);
1046 if(dc
->state
!= DELZW_STATE_READING_CODES
) {
1049 if(dc
->nbytes_left_to_skip
>0) {
1056 static void delzw_addbuf(delzwctx
*dc
, const u8
*buf
, i64 buf_len
)
1060 if(dc
->debug_level
>=3) {
1061 delzw_debugmsg(dc
, 3, "received %"I64_FMT
" bytes of input", buf_len
);
1064 for(i
=0; i
<buf_len
; i
++) {
1065 if(dc
->errcode
) break;
1066 if(dc
->state
== DELZW_STATE_FINISHED
) break;
1067 if(delzw_have_enough_output(dc
)) {
1068 delzw_stop(dc
, "sufficient output");
1071 delzw_process_byte(dc
, buf
[i
]);
1072 dc
->total_nbytes_processed
++;
1076 static void delzw_finish(delzwctx
*dc
)
1080 if(dc
->output_len_known
&& (dc
->uncmpr_nbytes_written
==dc
->output_expected_len
)) {
1081 reason
= "end of input and sufficient output";
1084 reason
= "end of input";
1087 delzw_stop(dc
, reason
);
1090 void fmtutil_decompress_lzw(deark
*c
, struct de_dfilter_in_params
*dcmpri
,
1091 struct de_dfilter_out_params
*dcmpro
, struct de_dfilter_results
*dres
,
1092 struct de_lzw_params
*lzwp
)
1094 de_dfilter_decompress_oneshot(c
, dfilter_lzw_codec
, (void*)lzwp
,
1095 dcmpri
, dcmpro
, dres
);
1098 static void my_lzw_codec_finish(struct de_dfilter_ctx
*dfctx
)
1100 const char *modname
= "delzw";
1101 delzwctx
*dc
= (delzwctx
*)dfctx
->codec_private
;
1106 dfctx
->dres
->bytes_consumed
= dc
->total_nbytes_processed
;
1107 dfctx
->dres
->bytes_consumed_valid
= 1;
1110 de_dfilter_set_errorf(dfctx
->c
, dfctx
->dres
, modname
, "%s", dc
->errmsg
);
1114 static void my_lzw_codec_addbuf(struct de_dfilter_ctx
*dfctx
,
1115 const u8
*buf
, i64 buf_len
)
1117 delzwctx
*dc
= (delzwctx
*)dfctx
->codec_private
;
1120 delzw_addbuf(dc
, buf
, buf_len
);
1121 if(dc
->state
== DELZW_STATE_FINISHED
) {
1122 dfctx
->finished_flag
= 1;
1126 static void my_lzw_codec_command(struct de_dfilter_ctx
*dfctx
, int cmd
, UI flags
)
1128 delzwctx
*dc
= (delzwctx
*)dfctx
->codec_private
;
1130 if(dc
->fmt
==DE_LZWFMT_SHRINKIT2
) {
1131 if(cmd
==DE_DFILTER_COMMAND_FINISH_BLOCK
) {
1132 dc
->total_nbytes_processed
-= (i64
)(dc
->bbll
.nbits_in_bitbuf
/8);
1133 de_bitbuf_lowlevel_empty(&dc
->bbll
);
1135 else if(cmd
==DE_DFILTER_COMMAND_SOFTRESET
) {
1141 static void my_lzw_codec_destroy(struct de_dfilter_ctx
*dfctx
)
1143 deark
*c
= dfctx
->c
;
1144 delzwctx
*dc
= (delzwctx
*)dfctx
->codec_private
;
1149 if(dc
->ct2
) de_free(c
, dc
->ct2
);
1150 de_free(c
, dc
->valbuf
);
1153 dfctx
->codec_private
= NULL
;
1156 // codec_private_params is type struct de_lzw_params.
1157 void dfilter_lzw_codec(struct de_dfilter_ctx
*dfctx
, void *codec_private_params
)
1159 delzwctx
*dc
= NULL
;
1160 struct de_lzw_params
*delzwp
= (struct de_lzw_params
*)codec_private_params
;
1162 dc
= de_malloc(dfctx
->c
, sizeof(delzwctx
));
1165 dc
->debug_level
= dc
->c
->debug_level
;
1167 dfctx
->codec_private
= (void*)dc
;
1168 dfctx
->codec_finish_fn
= my_lzw_codec_finish
;
1169 dfctx
->codec_destroy_fn
= my_lzw_codec_destroy
;
1170 dfctx
->codec_addbuf_fn
= my_lzw_codec_addbuf
;
1171 dfctx
->codec_command_fn
= my_lzw_codec_command
;
1173 dc
->output_len_known
= dfctx
->dcmpro
->len_known
;
1174 dc
->output_expected_len
= dfctx
->dcmpro
->expected_len
;
1176 dc
->delzwp_copy
= *delzwp
; // struct copy
1177 dc
->fmt
= delzwp
->fmt
;
1180 /////////////////////////////////////////////////////////
1181 // 12-bit LZW (LZMW?) scheme used in some OS/2 formats
1183 #include "../foreign/dskdcmps.h"
1185 void fmtutil_ibmlzw_codectype1(deark
*c
, struct de_dfilter_in_params
*dcmpri
,
1186 struct de_dfilter_out_params
*dcmpro
, struct de_dfilter_results
*dres
,
1187 void *codec_private_params
)
1189 dskdcmps_run(c
, dcmpri
, dcmpro
, dres
);