rosprite: Cleanup and modernization
[deark.git] / src / fmtutil-lzw.c
blob1708327f5684f37280cb6c36cab25220ce07fcdd
1 // This file is part of Deark.
2 // Copyright (C) 2019-2022 Jason Summers
3 // See the file COPYING for terms of use.
5 // LZW decompressor
7 #define DE_NOT_IN_MODULE
8 #include "deark-private.h"
9 #include "deark-fmtutil.h"
11 #define DELZW_CODE u32 // int type used in most cases
12 #define DELZW_CODE_MINRANGE u16 // int type used for parents in table entries
13 #define DELZW_MINMINCODESIZE 3
14 #define DELZW_MAXMAXCODESIZE 16
15 #define DELZW_NBITS_TO_MAXCODE(n) ((DELZW_CODE)((1<<(n))-1))
16 #define DELZW_NBITS_TO_NCODES(n) ((DELZW_CODE)(1<<(n)))
18 struct delzwctx_struct;
19 typedef struct delzwctx_struct delzwctx;
21 struct delzw_tableentry {
22 DELZW_CODE_MINRANGE parent;
23 u8 value;
24 #define DELZW_CODETYPE_INVALID 0x00
25 #define DELZW_CODETYPE_STATIC 0x01
26 #define DELZW_CODETYPE_DYN_UNUSED 0x02
27 #define DELZW_CODETYPE_DYN_USED 0x03
28 #define DELZW_CODETYPE_CLEAR 0x08
29 #define DELZW_CODETYPE_STOP 0x09
30 #define DELZW_CODETYPE_INC_CDSZ 0x0a
31 #define DELZW_CODETYPE_SPECIAL 0x0f
32 u8 codetype;
33 u8 flags;
36 struct delzw_tableentry2 {
37 #define DELZW_NEXTPTR_NONE 0xffff // Note - This table is only used with 12-bit codes
38 DELZW_CODE_MINRANGE next;
41 struct delzwctx_struct {
42 deark *c;
43 struct de_dfilter_ctx *dfctx;
44 int debug_level;
45 enum de_lzwfmt_enum fmt;
47 struct de_lzw_params delzwp_copy;
49 #define DELZW_HEADERTYPE_NONE 0
50 #define DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE 1
51 #define DELZW_HEADERTYPE_ARC1BYTE 2
52 int header_type;
54 u8 stop_on_invalid_code; // Invalid code means "stop", not a fatal error
56 u8 output_len_known;
57 i64 output_expected_len;
59 u8 auto_inc_codesize;
60 u8 unixcompress_has_clear_code;
61 UI min_codesize;
62 UI max_codesize;
64 size_t header_size;
65 u8 is_lsb;
66 u8 early_codesize_inc;
67 u8 has_partial_clearing;
68 u8 is_hashed;
70 // Informational:
71 u8 header_unixcompress_mode;
72 u8 header_unixcompress_max_codesize;
73 u8 header_unixcompress_block_mode; // = 1 or 0
75 // Internal state:
76 #define DELZW_ERRCODE_OK 0
77 #define DELZW_ERRCODE_GENERIC_ERROR 1
78 #define DELZW_ERRCODE_BAD_CDATA 2
79 #define DELZW_ERRCODE_MALLOC_FAILED 3
80 #define DELZW_ERRCODE_WRITE_FAILED 7
81 #define DELZW_ERRCODE_INSUFFICIENT_CDATA 8
82 #define DELZW_ERRCODE_UNSUPPORTED_OPTION 9
83 #define DELZW_ERRCODE_INTERNAL_ERROR 10
84 int errcode;
86 #define DELZW_STATE_INIT 0
87 #define DELZW_STATE_READING_HEADER 1
88 #define DELZW_STATE_READING_CODES 2
89 #define DELZW_STATE_FINISHED 3
90 int state;
91 i64 total_nbytes_processed;
92 i64 uncmpr_nbytes_written;
94 i64 ncodes_in_this_bitgroup;
95 i64 nbytes_left_to_skip;
96 i64 code_counter;
98 UI curr_codesize;
100 u8 have_oldcode;
101 DELZW_CODE oldcode;
102 DELZW_CODE last_code_added;
103 u8 last_value;
104 DELZW_CODE highest_code_ever_used;
105 DELZW_CODE free_code_search_start;
106 DELZW_CODE first_dynamic_code;
107 u8 special_code_is_pending;
109 struct de_bitbuf_lowlevel bbll;
111 DELZW_CODE ct_capacity;
112 DELZW_CODE ct_code_count; // Note - Not always maintained if not needed
113 struct delzw_tableentry *ct;
114 struct delzw_tableentry2 *ct2;
116 u8 header_buf[3];
118 size_t valbuf_capacity;
119 u8 *valbuf;
121 char errmsg[80];
124 static void delzw_debugmsg(delzwctx *dc, int level, const char *fmt, ...)
125 de_gnuc_attribute ((format (printf, 3, 4)));
127 static void delzw_debugmsg(delzwctx *dc, int level, const char *fmt, ...)
129 va_list ap;
130 char msg[200];
132 if(level>dc->debug_level) return;
134 va_start(ap, fmt);
135 de_vsnprintf(msg, sizeof(msg), fmt, ap);
136 va_end(ap);
138 de_dbg(dc->c, "[delzw:i%"I64_FMT"/o%"I64_FMT"] %s",
139 (i64)dc->total_nbytes_processed, (i64)dc->uncmpr_nbytes_written, msg);
142 static void delzw_dumptable(delzwctx *dc)
144 DELZW_CODE k;
145 for(k=0; k<dc->highest_code_ever_used; k++) {
146 delzw_debugmsg(dc, 4, "[%d] ty=%d p=%d v=%d f=%d",
147 (int)k, (int)dc->ct[k].codetype, (int)dc->ct[k].parent,
148 (int)dc->ct[k].value, (int)dc->ct[k].flags);
152 static void delzw_stop(delzwctx *dc, const char *reason)
154 if(dc->state == DELZW_STATE_FINISHED) return;
155 delzw_debugmsg(dc, 2, "stopping due to %s", reason);
156 dc->state = DELZW_STATE_FINISHED;
159 static void delzw_set_errorf(delzwctx *dc, int errcode, const char *fmt, ...)
160 de_gnuc_attribute ((format (printf, 3, 4)));
162 static void delzw_set_errorf(delzwctx *dc, int errcode, const char *fmt, ...)
164 va_list ap;
166 delzw_stop(dc, "error");
167 if(dc->errcode) return;
168 dc->errcode = errcode;
169 va_start(ap, fmt);
170 de_vsnprintf(dc->errmsg, sizeof(dc->errmsg), fmt, ap);
171 va_end(ap);
174 static void delzw_set_error(delzwctx *dc, int errcode, const char *msg)
176 delzw_stop(dc, "error");
177 if(dc->errcode) return;
178 dc->errcode = errcode;
179 if(!msg || !msg[0]) {
180 msg = "LZW decompression error";
182 de_strlcpy(dc->errmsg, msg, sizeof(dc->errmsg));
185 static void delzw_write(delzwctx *dc, const u8 *buf, size_t n1)
187 i64 n;
189 if(dc->errcode) return;
190 n = (i64)n1;
192 if(dc->output_len_known) {
193 if(dc->uncmpr_nbytes_written + n > dc->output_expected_len) {
194 n = dc->output_expected_len - dc->uncmpr_nbytes_written;
197 if(n<1) return;
198 if(n==1) {
199 dbuf_writebyte(dc->dfctx->dcmpro->f, buf[0]);
201 else {
202 dbuf_write(dc->dfctx->dcmpro->f, buf, n);
204 dc->uncmpr_nbytes_written += n;
207 static void delzw_process_unixcompress_3byteheader(delzwctx *dc)
209 if(dc->header_buf[0]!=0x1f || dc->header_buf[1]!=0x9d) {
210 delzw_set_error(dc, DELZW_ERRCODE_BAD_CDATA, "Not in compress format");
211 return;
214 dc->header_unixcompress_mode = dc->header_buf[2];
215 dc->header_unixcompress_max_codesize = (dc->header_unixcompress_mode & 0x1f);
216 dc->header_unixcompress_block_mode = (dc->header_unixcompress_mode & 0x80) ? 1 : 0;
217 delzw_debugmsg(dc, 2, "LZW mode=0x%02x, maxbits=%u, blockmode=%u",
218 (UI)dc->header_unixcompress_mode,
219 (UI)dc->header_unixcompress_max_codesize,
220 (UI)dc->header_unixcompress_block_mode);
222 dc->max_codesize = (UI)dc->header_unixcompress_max_codesize;
223 dc->unixcompress_has_clear_code = dc->header_unixcompress_block_mode;
226 static void delzw_process_arc_1byteheader(delzwctx *dc)
228 dc->header_unixcompress_max_codesize = (dc->header_buf[0] & 0x1f);
229 dc->max_codesize = (UI)dc->header_unixcompress_max_codesize;
230 delzw_debugmsg(dc, 2, "max code size: %u", dc->max_codesize);
233 // Is this a valid code with a value (a static, or in-use dynamic code)?
234 static int delzw_code_is_in_table(delzwctx *dc, DELZW_CODE code)
236 u8 codetype = dc->ct[code].codetype;
238 if(codetype==DELZW_CODETYPE_STATIC) return 1;
239 if(codetype==DELZW_CODETYPE_DYN_USED) return 1;
240 return 0;
243 // Decode an LZW code to one or more values, and write the values.
244 // Updates dc->last_value.
245 static void delzw_emit_code(delzwctx *dc, DELZW_CODE code1)
247 DELZW_CODE code = code1;
248 size_t valbuf_pos = dc->valbuf_capacity; // = First entry that's used
250 while(1) {
251 if(code >= dc->ct_capacity) {
252 delzw_set_errorf(dc, DELZW_ERRCODE_GENERIC_ERROR, "Bad LZW code (%d)", (int)code);
253 return;
256 if(valbuf_pos==0) {
257 // We must be in an infinite loop (probably an internal error).
258 delzw_set_error(dc, DELZW_ERRCODE_GENERIC_ERROR, NULL);
259 if(dc->debug_level>=4) {
260 delzw_dumptable(dc);
262 return;
265 // valbuf is a stack, essentially. We fill it in the reverse direction,
266 // to make it simpler to write the final byte sequence.
267 valbuf_pos--;
269 if(dc->ct[code].codetype==DELZW_CODETYPE_DYN_UNUSED) {
270 dc->valbuf[valbuf_pos] = dc->last_value;
271 code = dc->oldcode;
272 continue;
275 dc->valbuf[valbuf_pos] = dc->ct[code].value;
277 if(dc->ct[code].codetype==DELZW_CODETYPE_STATIC) {
278 dc->last_value = dc->ct[code].value;
279 break;
282 // Traverse the tree, back toward the root codes.
283 code = dc->ct[code].parent;
286 // Write out the collected values.
287 delzw_write(dc, &dc->valbuf[valbuf_pos], dc->valbuf_capacity - valbuf_pos);
290 static void delzw_find_first_free_entry(delzwctx *dc, DELZW_CODE *pentry)
292 DELZW_CODE k;
294 for(k=dc->free_code_search_start; k<dc->ct_capacity; k++) {
295 if(dc->ct[k].codetype==DELZW_CODETYPE_DYN_UNUSED) {
296 *pentry = (DELZW_CODE)k;
297 return;
301 *pentry = (DELZW_CODE)(dc->ct_capacity-1);
302 delzw_set_error(dc, DELZW_ERRCODE_BAD_CDATA, "LZW table unexpectedly full");
305 static void delzw_unixcompress_end_bitgroup(delzwctx *dc)
307 i64 ncodes_alloc;
308 i64 nbits_left_to_skip;
310 // The Unix 'compress' format has a quirk.
311 // The codes are written 8 at a time, with all 8 having the same codesize.
312 // The codesize cannot change in the middle of a block of 8. If it needs to,
313 // the remainder of the block is unused padding, which we must skip over.
314 // This is relevant when we encounter a clear code. It is also potentially
315 // relevant when the codesize is auto-incremented. But except possibly for
316 // the first group of codes (the 9-bit codes), the natural number of codes of
317 // a given size is always (?) a power of 2, and a multiple of 8. So, usually
318 // no padding is present at the auto-increment position.
319 // As it happens, when code 256 is used as the clear code, it reduces the
320 // natural number of 9-bit codes from 257 to 256, and since 256 is a multiple
321 // of 8, still no padding is present.
322 // But "v2" format does not use a clear code, and AFAICT it does have padding
323 // after the 9-bit codes.
325 ncodes_alloc = ((dc->ncodes_in_this_bitgroup + 7)/8)*8;
326 nbits_left_to_skip = (ncodes_alloc - dc->ncodes_in_this_bitgroup) * dc->curr_codesize;
328 // My thinking:
329 // Each "bitgroup" has a whole number of bytes.
330 // When we get here, we've just read a code, so the bitreader's buffer can have no more than
331 // 7 bits in it.
332 // All of the bits in it will be part of the "bits to skip". After accounting for them, we'll
333 // be left with a whole number of *bytes* left to skip, which always start on a byte boundary
334 // in the input stream.
335 // So, whenever the main input loop needs to skip anything, it will be a whole byte, and the
336 // bitreader's buffer will be empty. That's good; it makes it easier to deal with this
337 // padding.
339 if(nbits_left_to_skip>0) {
340 delzw_debugmsg(dc, 2, "padding bits: %d", (int)nbits_left_to_skip);
343 dc->ncodes_in_this_bitgroup = 0;
344 if(dc->bbll.nbits_in_bitbuf>7 || dc->bbll.nbits_in_bitbuf>nbits_left_to_skip) {
345 delzw_set_error(dc, DELZW_ERRCODE_INTERNAL_ERROR, NULL);
346 return;
349 nbits_left_to_skip -= dc->bbll.nbits_in_bitbuf;
350 if(nbits_left_to_skip%8 != 0) {
351 delzw_set_error(dc, DELZW_ERRCODE_INTERNAL_ERROR, NULL);
352 return;
355 de_bitbuf_lowlevel_empty(&dc->bbll);
356 dc->nbytes_left_to_skip = nbits_left_to_skip/8;
359 static void bbll_skip_to_byte_boundary(struct de_bitbuf_lowlevel *bbll)
361 UI n;
363 n = bbll->nbits_in_bitbuf % 8;
364 if(n) {
365 (void)de_bitbuf_lowlevel_get_bits(bbll, n);
369 static void delzw_increase_codesize(delzwctx *dc)
371 if(dc->fmt==DE_LZWFMT_UNIXCOMPRESS) {
372 delzw_unixcompress_end_bitgroup(dc);
374 else if(dc->fmt==DE_LZWFMT_ASC2COM) {
375 bbll_skip_to_byte_boundary(&dc->bbll);
378 if(dc->curr_codesize<dc->max_codesize) {
379 dc->curr_codesize++;
380 delzw_debugmsg(dc, 2, "increased code size to %u", dc->curr_codesize);
384 static DELZW_CODE delzw_get_hashed_code(delzwctx *dc, DELZW_CODE code,
385 u8 value)
387 DELZW_CODE h;
388 DELZW_CODE saved_h;
389 u32 count;
391 h = ((code+(DELZW_CODE)value) | 0x0800) & 0xffff;
392 h = ((h*h) >> 6) % dc->ct_capacity;
394 if(dc->ct[h].codetype==DELZW_CODETYPE_DYN_UNUSED) {
395 return h;
398 // Collision - First, walk to the end of the duplicates list
399 count = 0;
400 while(dc->ct2[h].next != DELZW_NEXTPTR_NONE) {
401 h = dc->ct2[h].next;
403 count++;
404 if(count > dc->ct_capacity) {
405 delzw_set_error(dc, DELZW_ERRCODE_GENERIC_ERROR, NULL);
406 return 0;
410 saved_h = h;
412 // Then search for an open slot
413 count = 0;
414 while(1) {
415 if(count==0)
416 h += 101;
417 else
418 h += 1;
419 h %= dc->ct_capacity;
421 if(dc->ct[h].codetype==DELZW_CODETYPE_DYN_UNUSED)
422 break;
424 count++;
425 if(count > dc->ct_capacity) {
426 delzw_set_error(dc, DELZW_ERRCODE_GENERIC_ERROR, NULL);
427 return 0;
431 dc->ct2[saved_h].next = h;
432 return h;
435 static void delzw_hashed_add_code_to_dict(delzwctx *dc, DELZW_CODE code, u8 value)
437 DELZW_CODE idx;
439 if(dc->ct_code_count >= dc->ct_capacity) {
440 return;
443 idx = delzw_get_hashed_code(dc, code, value);
444 if(dc->errcode) return;
446 dc->ct[idx].parent = (DELZW_CODE_MINRANGE)dc->oldcode;
447 dc->ct[idx].value = value;
448 dc->ct[idx].codetype = DELZW_CODETYPE_DYN_USED;
449 dc->ct_code_count++;
450 dc->last_code_added = idx;
453 static void delzw_hashed_add_root_code_to_dict(delzwctx *dc, u8 value)
455 int idx;
457 idx = delzw_get_hashed_code(dc, 0xffff, value);
458 if(dc->errcode) return;
460 dc->ct[idx].value = value;
461 dc->ct[idx].codetype = DELZW_CODETYPE_STATIC;
462 dc->ct_code_count++;
465 // Add a code to the dictionary.
466 // Sets delzw->last_code_added to the position where it was added.
467 static void delzw_add_to_dict(delzwctx *dc, DELZW_CODE parent, u8 value)
469 DELZW_CODE newpos;
471 if(dc->is_hashed) {
472 delzw_hashed_add_code_to_dict(dc, parent, value);
473 return;
476 if(dc->fmt==DE_LZWFMT_ZIPSHRINK) {
477 delzw_find_first_free_entry(dc, &newpos);
479 else {
480 newpos = dc->free_code_search_start;
482 if(dc->errcode) return;
483 if(newpos >= dc->ct_capacity) {
484 return;
487 if(newpos < dc->first_dynamic_code) {
488 delzw_set_error(dc, DELZW_ERRCODE_GENERIC_ERROR, NULL);
489 return;
492 dc->ct[newpos].parent = (DELZW_CODE_MINRANGE)parent;
493 dc->ct[newpos].value = value;
494 dc->ct[newpos].codetype = DELZW_CODETYPE_DYN_USED;
495 dc->ct_code_count++;
496 dc->last_code_added = newpos;
497 dc->free_code_search_start = newpos+1;
498 if(newpos > dc->highest_code_ever_used) {
499 dc->highest_code_ever_used = newpos;
502 if(dc->auto_inc_codesize) {
503 if(dc->early_codesize_inc) {
504 if(dc->free_code_search_start>=DELZW_NBITS_TO_MAXCODE(dc->curr_codesize)) {
505 delzw_increase_codesize(dc);
508 else {
509 if(dc->free_code_search_start>DELZW_NBITS_TO_MAXCODE(dc->curr_codesize)) {
510 delzw_increase_codesize(dc);
516 static void delzw_process_data_code(delzwctx *dc, DELZW_CODE code)
518 if(code >= dc->ct_capacity) {
519 return;
522 if(!dc->have_oldcode) {
523 // Special case for the first code.
524 delzw_emit_code(dc, code);
525 dc->oldcode = code;
526 dc->have_oldcode = 1;
527 dc->last_value = dc->ct[code].value;
528 return;
531 if(delzw_code_is_in_table(dc, code)) {
532 delzw_emit_code(dc, code);
533 if(dc->errcode) return;
535 // Let k = the first character of the translation of the code.
536 // Add <oldcode>k to the dictionary.
537 delzw_add_to_dict(dc, dc->oldcode, dc->last_value);
539 else {
540 if(code>dc->free_code_search_start && !dc->has_partial_clearing && !dc->is_hashed) {
541 if(dc->stop_on_invalid_code) {
542 delzw_debugmsg(dc, 1, "bad code: %d when max=%d (assuming data stops here)",
543 (int)code, (int)dc->free_code_search_start);
544 delzw_stop(dc, "bad LZW code");
545 return;
547 delzw_set_errorf(dc, DELZW_ERRCODE_BAD_CDATA, "Bad LZW code (%d when max=%d)",
548 (int)code, (int)dc->free_code_search_start);
549 return;
552 // Let k = the first char of the translation of oldcode.
553 // Add <oldcode>k to the dictionary.
554 delzw_add_to_dict(dc, dc->oldcode, dc->last_value);
555 if(dc->errcode) return;
557 // Write <oldcode>k to the output stream.
558 delzw_emit_code(dc, dc->last_code_added);
561 dc->oldcode = code;
564 static void delzw_clear_one_dynamic_code(delzwctx *dc, DELZW_CODE code)
566 if(code<dc->first_dynamic_code || code>=dc->ct_capacity) return;
567 dc->ct[code].codetype = DELZW_CODETYPE_DYN_UNUSED;
568 dc->ct[code].parent = 0;
569 dc->ct[code].value = 0;
572 static void delzw_clear(delzwctx *dc)
574 DELZW_CODE i;
576 delzw_debugmsg(dc, 2, "clear code");
578 if(dc->fmt==DE_LZWFMT_UNIXCOMPRESS) {
579 delzw_unixcompress_end_bitgroup(dc);
582 for(i=dc->first_dynamic_code; i<=dc->highest_code_ever_used; i++) {
583 delzw_clear_one_dynamic_code(dc, i);
586 dc->curr_codesize = dc->min_codesize;
587 dc->free_code_search_start = dc->first_dynamic_code;
588 dc->have_oldcode = 0;
589 dc->oldcode = 0;
590 dc->last_code_added = 0;
591 dc->last_value = 0;
593 delzw_debugmsg(dc, 2, "code size: %u", dc->curr_codesize);
596 static void delzw_partial_clear(delzwctx *dc)
598 DELZW_CODE i;
600 delzw_debugmsg(dc, 2, "partial clear code");
602 for(i=dc->first_dynamic_code; i<=dc->highest_code_ever_used; i++) {
603 // If this code is in use
604 if(dc->ct[i].codetype==DELZW_CODETYPE_DYN_USED) {
605 // and its parent is a dynamic code,
606 // mark its parent as having a child
607 if(dc->ct[i].parent>=257) {
608 dc->ct[dc->ct[i].parent].flags = 1;
613 for(i=dc->first_dynamic_code; i<=dc->highest_code_ever_used; i++) {
614 if(dc->ct[i].flags==0) {
615 // If this code has no children, clear it
616 delzw_clear_one_dynamic_code(dc, i);
618 else {
619 // Leave all flags clear, for next time
620 dc->ct[i].flags = 0;
624 dc->free_code_search_start = dc->first_dynamic_code;
627 static void delzw_process_code(delzwctx *dc, DELZW_CODE code)
629 if(dc->debug_level>=3) {
630 delzw_debugmsg(dc, 3, "%scode=%d oc=%d lca=%d lv=%d next=%d",
631 (dc->special_code_is_pending?"special":""),
632 (int)code,
633 (int)dc->oldcode, (int)dc->last_code_added, (int)dc->last_value,
634 (int)dc->free_code_search_start);
637 if(dc->special_code_is_pending) {
638 dc->special_code_is_pending = 0;
639 if(dc->fmt==DE_LZWFMT_ZIPSHRINK) {
640 if(code==1 && (dc->curr_codesize<dc->max_codesize)) {
641 delzw_increase_codesize(dc);
643 else if(code==2) {
644 delzw_partial_clear(dc);
646 else {
647 delzw_set_error(dc, DELZW_ERRCODE_BAD_CDATA, NULL);
650 else if(dc->fmt==DE_LZWFMT_DWC) {
651 if(code==0) { // no-op?
654 else {
655 // TODO: Find out what DWC special codes do
656 delzw_set_errorf(dc, DELZW_ERRCODE_UNSUPPORTED_OPTION,
657 "Unsupported special code: %u", (UI)code);
660 return;
663 if(code >= dc->ct_capacity) return;
665 switch(dc->ct[code].codetype) {
666 case DELZW_CODETYPE_STATIC:
667 case DELZW_CODETYPE_DYN_UNUSED:
668 case DELZW_CODETYPE_DYN_USED:
669 delzw_process_data_code(dc, code);
670 break;
671 case DELZW_CODETYPE_CLEAR:
672 delzw_clear(dc);
673 break;
674 case DELZW_CODETYPE_STOP:
675 delzw_stop(dc, "stop code");
676 break;
677 case DELZW_CODETYPE_INC_CDSZ:
678 delzw_increase_codesize(dc);
679 break;
680 case DELZW_CODETYPE_SPECIAL:
681 if(dc->fmt==DE_LZWFMT_ZIPSHRINK && code==256) {
682 dc->special_code_is_pending = 1; // next code is an "escaped" code
684 break;
685 default:
686 delzw_set_errorf(dc, DELZW_ERRCODE_UNSUPPORTED_OPTION,
687 "Unsupported or invalid code (%u)", (UI)code);
688 break;
691 dc->code_counter++;
692 if(dc->fmt==DE_LZWFMT_DWC) {
693 if((dc->code_counter%512)==256) {
694 dc->special_code_is_pending = 1;
699 // Most configuration is done in delzw_on_codes_start(), not here.
700 // This function does need to do some things to handle formats that have a
701 // header.
702 static void delzw_on_decompression_start(delzwctx *dc)
704 if(dc->fmt!=DE_LZWFMT_ZIPSHRINK &&
705 dc->fmt!=DE_LZWFMT_GIF &&
706 dc->fmt!=DE_LZWFMT_UNIXCOMPRESS &&
707 dc->fmt!=DE_LZWFMT_ZOOLZD &&
708 dc->fmt!=DE_LZWFMT_TIFFNEW &&
709 dc->fmt!=DE_LZWFMT_TIFFOLD &&
710 dc->fmt!=DE_LZWFMT_ARC5 &&
711 dc->fmt!=DE_LZWFMT_DWC &&
712 dc->fmt!=DE_LZWFMT_SHRINKIT1 &&
713 dc->fmt!=DE_LZWFMT_SHRINKIT2 &&
714 dc->fmt!=DE_LZWFMT_PAKLEO &&
715 dc->fmt!=DE_LZWFMT_ASC2COM)
717 delzw_set_error(dc, DELZW_ERRCODE_UNSUPPORTED_OPTION, "Unsupported LZW format");
718 goto done;
721 if(dc->fmt==DE_LZWFMT_UNIXCOMPRESS) {
722 if(dc->delzwp_copy.flags & DE_LZWFLAG_HAS3BYTEHEADER) {
723 dc->header_type = DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE;
724 dc->header_size = 3;
726 else if(dc->delzwp_copy.flags & DE_LZWFLAG_HAS1BYTEHEADER) {
727 dc->unixcompress_has_clear_code = 1;
728 dc->header_type = DELZW_HEADERTYPE_ARC1BYTE;
729 dc->header_size = 1;
731 else {
732 dc->unixcompress_has_clear_code = 1;
733 dc->max_codesize = dc->delzwp_copy.max_code_size;
736 if((dc->delzwp_copy.flags & DE_LZWFLAG_TOLERATETRAILINGJUNK) &&
737 !dc->output_len_known)
739 dc->stop_on_invalid_code = 1;
743 done:
747 // Print dbg messages and warnings about the header
748 static void lzw_after_header_parsed(delzwctx *dc)
750 deark *c = dc->c;
752 if(dc->header_type==DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE) {
753 de_dbg(c, "LZW mode: 0x%02x", (UI)dc->header_unixcompress_mode);
754 de_dbg_indent(c, 1);
755 de_dbg(c, "maxbits: %u", (UI)dc->header_unixcompress_max_codesize);
756 de_dbg(c, "blockmode: %d", (int)dc->header_unixcompress_block_mode);
757 if(!dc->header_unixcompress_block_mode) {
758 de_warn(c, "This file uses an obsolete compress'd format, which "
759 "might not be decompressed correctly");
761 de_dbg_indent(c, -1);
763 else if(dc->header_type==DELZW_HEADERTYPE_ARC1BYTE) {
764 de_dbg(c, "LZW maxbits: %u", (UI)dc->header_unixcompress_max_codesize);
768 static void set_std_static_codes(delzwctx *dc)
770 DELZW_CODE i;
772 for(i=0; i<256; i++) {
773 dc->ct[i].codetype = DELZW_CODETYPE_STATIC;
774 dc->ct[i].value = (u8)i;
778 // Process the header, if any.
779 // Set any remaining params needed, and validate params.
780 // This is called upon encountering the first byte after the header.
781 // (If zero bytes of data were compressed, it might never be called.)
782 static void delzw_on_codes_start(delzwctx *dc)
784 DELZW_CODE i;
785 UI default_max_codesize = 0;
787 if(dc->errcode) goto done;
789 if(dc->header_size > 0) {
790 delzw_debugmsg(dc, 2, "processing header");
792 if(dc->header_type==DELZW_HEADERTYPE_UNIXCOMPRESS3BYTE) {
793 delzw_process_unixcompress_3byteheader(dc);
795 else if(dc->header_type==DELZW_HEADERTYPE_ARC1BYTE) {
796 delzw_process_arc_1byteheader(dc);
799 lzw_after_header_parsed(dc);
802 delzw_debugmsg(dc, 2, "start of codes");
804 if(dc->fmt==DE_LZWFMT_UNIXCOMPRESS) {
805 dc->is_lsb = 1;
806 dc->auto_inc_codesize = 1;
808 else if(dc->fmt==DE_LZWFMT_GIF) {
809 dc->is_lsb = 1;
810 dc->auto_inc_codesize = 1;
811 dc->min_codesize = dc->delzwp_copy.gif_root_code_size + 1;
812 dc->max_codesize = 12;
814 else if(dc->fmt==DE_LZWFMT_ZIPSHRINK) {
815 dc->is_lsb = 1;
816 dc->has_partial_clearing = 1;
817 dc->max_codesize = 13;
819 else if(dc->fmt==DE_LZWFMT_ZOOLZD) {
820 dc->is_lsb = 1;
821 dc->auto_inc_codesize = 1;
822 default_max_codesize = 13;
824 else if(dc->fmt==DE_LZWFMT_TIFFNEW) {
825 dc->auto_inc_codesize = 1;
826 dc->early_codesize_inc = 1;
827 default_max_codesize = 12;
829 else if(dc->fmt==DE_LZWFMT_TIFFOLD) {
830 dc->is_lsb = 1;
831 dc->auto_inc_codesize = 1;
832 default_max_codesize = 12;
834 else if(dc->fmt==DE_LZWFMT_ARC5) {
835 dc->is_hashed = 1;
836 dc->max_codesize = 12;
837 dc->min_codesize = 12;
839 else if(dc->fmt==DE_LZWFMT_DWC) {
840 dc->early_codesize_inc = 1;
841 dc->auto_inc_codesize = 1;
842 default_max_codesize = 14;
844 else if(dc->fmt==DE_LZWFMT_SHRINKIT1 || dc->fmt==DE_LZWFMT_SHRINKIT2) {
845 dc->is_lsb = 1;
846 default_max_codesize = 12;
847 dc->auto_inc_codesize = 1;
848 dc->early_codesize_inc = 1;
850 else if(dc->fmt==DE_LZWFMT_PAKLEO) {
851 default_max_codesize = 14;
853 else if(dc->fmt==DE_LZWFMT_ASC2COM) {
854 dc->is_lsb = 1;
855 default_max_codesize = 10;
858 if(dc->min_codesize==0) {
859 // 9 is the usual minimum codesize for general purpose LZW compression schemes
860 // with a variable code size.
861 dc->min_codesize = 9;
864 if(dc->max_codesize==0) {
865 dc->max_codesize = dc->delzwp_copy.max_code_size ?
866 dc->delzwp_copy.max_code_size : default_max_codesize;
869 if(dc->min_codesize<DELZW_MINMINCODESIZE || dc->min_codesize>DELZW_MAXMAXCODESIZE ||
870 dc->max_codesize<DELZW_MINMINCODESIZE || dc->max_codesize>DELZW_MAXMAXCODESIZE ||
871 dc->min_codesize>dc->max_codesize)
873 delzw_set_errorf(dc, DELZW_ERRCODE_UNSUPPORTED_OPTION, "Unsupported code size (%u,%u)",
874 dc->min_codesize, dc->max_codesize);
875 goto done;
878 delzw_debugmsg(dc, 2, "code size: %u, max=%u", dc->min_codesize, dc->max_codesize);
880 dc->curr_codesize = dc->min_codesize;
882 dc->ct_capacity = ((DELZW_CODE)1)<<dc->max_codesize;
883 dc->ct = de_mallocarray(dc->c, dc->ct_capacity, sizeof(struct delzw_tableentry));
884 if(dc->is_hashed) {
885 dc->ct2 = de_mallocarray(dc->c, dc->ct_capacity, sizeof(struct delzw_tableentry2));
887 dc->valbuf_capacity = dc->ct_capacity;
888 dc->valbuf = de_malloc(dc->c, dc->valbuf_capacity);
890 if(dc->fmt==DE_LZWFMT_UNIXCOMPRESS) {
891 set_std_static_codes(dc);
893 if(dc->unixcompress_has_clear_code) {
894 dc->ct[256].codetype = DELZW_CODETYPE_CLEAR;
895 dc->first_dynamic_code = 257;
897 else {
898 dc->first_dynamic_code = 256;
901 else if(dc->fmt==DE_LZWFMT_GIF) {
902 DELZW_CODE n = DELZW_NBITS_TO_NCODES(dc->delzwp_copy.gif_root_code_size);
904 for(i=0; i<n; i++) {
905 dc->ct[i].codetype = DELZW_CODETYPE_STATIC;
906 dc->ct[i].value = (i<=255)?((u8)i):0;
908 dc->ct[n].codetype = DELZW_CODETYPE_CLEAR;
909 dc->ct[n+1].codetype = DELZW_CODETYPE_STOP;
910 dc->first_dynamic_code = n+2;
912 else if(dc->fmt==DE_LZWFMT_ZIPSHRINK) {
913 set_std_static_codes(dc);
914 dc->ct[256].codetype = DELZW_CODETYPE_SPECIAL;
915 dc->first_dynamic_code = 257;
917 else if(dc->fmt==DE_LZWFMT_ZOOLZD) {
918 set_std_static_codes(dc);
919 dc->ct[256].codetype = DELZW_CODETYPE_CLEAR;
920 dc->ct[257].codetype = DELZW_CODETYPE_STOP;
921 dc->first_dynamic_code = 258;
923 else if(dc->fmt==DE_LZWFMT_TIFFNEW || dc->fmt==DE_LZWFMT_TIFFOLD) {
924 set_std_static_codes(dc);
925 dc->ct[256].codetype = DELZW_CODETYPE_CLEAR;
926 dc->ct[257].codetype = DELZW_CODETYPE_STOP;
927 dc->first_dynamic_code = 258;
929 else if(dc->fmt==DE_LZWFMT_DWC) {
930 set_std_static_codes(dc);
931 dc->first_dynamic_code = 256;
933 else if(dc->fmt==DE_LZWFMT_SHRINKIT1) {
934 set_std_static_codes(dc);
935 dc->ct[256].codetype = DELZW_CODETYPE_INVALID; // ??
936 dc->first_dynamic_code = 257;
938 else if(dc->fmt==DE_LZWFMT_SHRINKIT2) {
939 set_std_static_codes(dc);
940 dc->ct[256].codetype = DELZW_CODETYPE_CLEAR;
941 dc->first_dynamic_code = 257;
943 else if(dc->fmt==DE_LZWFMT_PAKLEO) {
944 set_std_static_codes(dc);
945 dc->ct[256].codetype = DELZW_CODETYPE_STOP;
946 dc->ct[257].codetype = DELZW_CODETYPE_INC_CDSZ;
947 dc->ct[258].codetype = DELZW_CODETYPE_CLEAR;
948 dc->first_dynamic_code = 259;
950 else if(dc->fmt==DE_LZWFMT_ASC2COM) {
951 set_std_static_codes(dc);
952 dc->ct[256].codetype = DELZW_CODETYPE_INVALID;
953 dc->ct[257].codetype = DELZW_CODETYPE_INC_CDSZ;
954 dc->ct[258].codetype = DELZW_CODETYPE_STOP;
955 dc->first_dynamic_code = 259;
958 if(dc->is_hashed) {
959 for(i=0; i<dc->ct_capacity; i++) {
960 dc->ct2[i].next = DELZW_NEXTPTR_NONE;
964 for(i=dc->first_dynamic_code; i<dc->ct_capacity; i++) {
965 dc->ct[i].codetype = DELZW_CODETYPE_DYN_UNUSED;
967 dc->free_code_search_start = dc->first_dynamic_code;
969 if(dc->is_hashed) {
970 if(dc->delzwp_copy.arc5_has_stop_code) {
971 dc->ct[0].codetype = DELZW_CODETYPE_STOP;
972 dc->ct_code_count++;
975 for(i=0; i<256; i++) {
976 delzw_hashed_add_root_code_to_dict(dc, (u8)i);
980 dc->bbll.is_lsb = dc->is_lsb;
981 de_bitbuf_lowlevel_empty(&dc->bbll);
982 done:
986 static int delzw_have_enough_output(delzwctx *dc)
988 if(dc->output_len_known) {
989 if(dc->uncmpr_nbytes_written >= dc->output_expected_len) {
990 return 1;
993 return 0;
996 static void delzw_process_byte(delzwctx *dc, u8 b)
998 if(dc->state==DELZW_STATE_INIT) {
999 delzw_on_decompression_start(dc);
1000 dc->state = DELZW_STATE_READING_HEADER;
1001 if(dc->errcode) return;
1004 if(dc->state==DELZW_STATE_READING_HEADER) {
1005 if(dc->total_nbytes_processed < (i64)dc->header_size) {
1006 dc->header_buf[dc->total_nbytes_processed] = b;
1007 return;
1010 // (This is the first byte after the header.)
1011 delzw_on_codes_start(dc);
1012 dc->state = DELZW_STATE_READING_CODES;
1013 if(dc->errcode) return;
1016 if(dc->state==DELZW_STATE_READING_CODES) {
1017 if(dc->nbytes_left_to_skip>0) {
1018 dc->nbytes_left_to_skip--;
1019 return;
1022 // Add a byte's worth of bits to the pending code
1023 de_bitbuf_lowlevel_add_byte(&dc->bbll, b);
1025 while(1) {
1026 DELZW_CODE code;
1027 UI this_codesize;
1029 if(dc->errcode) break;
1031 if(dc->special_code_is_pending && dc->fmt==DE_LZWFMT_DWC) {
1032 this_codesize = 3;
1034 else {
1035 this_codesize = dc->curr_codesize;
1038 if(dc->bbll.nbits_in_bitbuf < this_codesize) {
1039 break;
1042 code = (DELZW_CODE)de_bitbuf_lowlevel_get_bits(&dc->bbll, this_codesize);
1043 dc->ncodes_in_this_bitgroup++;
1044 delzw_process_code(dc, code);
1046 if(dc->state != DELZW_STATE_READING_CODES) {
1047 break;
1049 if(dc->nbytes_left_to_skip>0) {
1050 break;
1056 static void delzw_addbuf(delzwctx *dc, const u8 *buf, i64 buf_len)
1058 i64 i;
1060 if(dc->debug_level>=3) {
1061 delzw_debugmsg(dc, 3, "received %"I64_FMT" bytes of input", buf_len);
1064 for(i=0; i<buf_len; i++) {
1065 if(dc->errcode) break;
1066 if(dc->state == DELZW_STATE_FINISHED) break;
1067 if(delzw_have_enough_output(dc)) {
1068 delzw_stop(dc, "sufficient output");
1069 break;
1071 delzw_process_byte(dc, buf[i]);
1072 dc->total_nbytes_processed++;
1076 static void delzw_finish(delzwctx *dc)
1078 const char *reason;
1080 if(dc->output_len_known && (dc->uncmpr_nbytes_written==dc->output_expected_len)) {
1081 reason = "end of input and sufficient output";
1083 else {
1084 reason = "end of input";
1087 delzw_stop(dc, reason);
1090 void fmtutil_decompress_lzw(deark *c, struct de_dfilter_in_params *dcmpri,
1091 struct de_dfilter_out_params *dcmpro, struct de_dfilter_results *dres,
1092 struct de_lzw_params *lzwp)
1094 de_dfilter_decompress_oneshot(c, dfilter_lzw_codec, (void*)lzwp,
1095 dcmpri, dcmpro, dres);
1098 static void my_lzw_codec_finish(struct de_dfilter_ctx *dfctx)
1100 const char *modname = "delzw";
1101 delzwctx *dc = (delzwctx*)dfctx->codec_private;
1103 if(!dc) return;
1104 delzw_finish(dc);
1106 dfctx->dres->bytes_consumed = dc->total_nbytes_processed;
1107 dfctx->dres->bytes_consumed_valid = 1;
1109 if(dc->errcode) {
1110 de_dfilter_set_errorf(dfctx->c, dfctx->dres, modname, "%s", dc->errmsg);
1114 static void my_lzw_codec_addbuf(struct de_dfilter_ctx *dfctx,
1115 const u8 *buf, i64 buf_len)
1117 delzwctx *dc = (delzwctx*)dfctx->codec_private;
1119 if(!dc) return;
1120 delzw_addbuf(dc, buf, buf_len);
1121 if(dc->state == DELZW_STATE_FINISHED) {
1122 dfctx->finished_flag = 1;
1126 static void my_lzw_codec_command(struct de_dfilter_ctx *dfctx, int cmd, UI flags)
1128 delzwctx *dc = (delzwctx*)dfctx->codec_private;
1130 if(dc->fmt==DE_LZWFMT_SHRINKIT2) {
1131 if(cmd==DE_DFILTER_COMMAND_FINISH_BLOCK) {
1132 dc->total_nbytes_processed -= (i64)(dc->bbll.nbits_in_bitbuf/8);
1133 de_bitbuf_lowlevel_empty(&dc->bbll);
1135 else if(cmd==DE_DFILTER_COMMAND_SOFTRESET) {
1136 delzw_clear(dc);
1141 static void my_lzw_codec_destroy(struct de_dfilter_ctx *dfctx)
1143 deark *c = dfctx->c;
1144 delzwctx *dc = (delzwctx*)dfctx->codec_private;
1146 if(!dc) return;
1148 de_free(c, dc->ct);
1149 if(dc->ct2) de_free(c, dc->ct2);
1150 de_free(c, dc->valbuf);
1152 de_free(c, dc);
1153 dfctx->codec_private = NULL;
1156 // codec_private_params is type struct de_lzw_params.
1157 void dfilter_lzw_codec(struct de_dfilter_ctx *dfctx, void *codec_private_params)
1159 delzwctx *dc = NULL;
1160 struct de_lzw_params *delzwp = (struct de_lzw_params*)codec_private_params;
1162 dc = de_malloc(dfctx->c, sizeof(delzwctx));
1163 dc->c = dfctx->c;
1164 dc->dfctx = dfctx;
1165 dc->debug_level = dc->c->debug_level;
1167 dfctx->codec_private = (void*)dc;
1168 dfctx->codec_finish_fn = my_lzw_codec_finish;
1169 dfctx->codec_destroy_fn = my_lzw_codec_destroy;
1170 dfctx->codec_addbuf_fn = my_lzw_codec_addbuf;
1171 dfctx->codec_command_fn = my_lzw_codec_command;
1173 dc->output_len_known = dfctx->dcmpro->len_known;
1174 dc->output_expected_len = dfctx->dcmpro->expected_len;
1176 dc->delzwp_copy = *delzwp; // struct copy
1177 dc->fmt = delzwp->fmt;
1180 /////////////////////////////////////////////////////////
1181 // 12-bit LZW (LZMW?) scheme used in some OS/2 formats
1183 #include "../foreign/dskdcmps.h"
1185 void fmtutil_ibmlzw_codectype1(deark *c, struct de_dfilter_in_params *dcmpri,
1186 struct de_dfilter_out_params *dcmpro, struct de_dfilter_results *dres,
1187 void *codec_private_params)
1189 dskdcmps_run(c, dcmpri, dcmpro, dres);