arc: Support "distilled" (PAK) decompression
[deark.git] / foreign / lzhuf.h
blobb5164cb62a690c0da40b03270a3e364adde4870f
1 // See the file readme-lzhuf.txt for more information about this file.
2 // Modifications for Deark are Copyright (C) 2021 Jason Summers, and have the
3 // same terms of use as the main part of Deark.
4 // Alternatively, at your option, the modifications for Deark may be treated
5 // as public domain.
7 // Intro from the original software:
9 /**************************************************************
10 lzhuf.c
11 written by Haruyasu Yoshizaki 11/20/1988
12 some minor changes 4/6/1989
13 comments translated by Haruhiko Okumura 4/7/1989
14 **************************************************************/
16 // Max of #literals + #special_codes + #match_lengths.
17 // Must be at least 256+59 (lh1:58, CRLZH:59)
18 #define LZHUF_MAX_N_CHAR (256 + 60)
20 #define LZHUF_MAX_T (LZHUF_MAX_N_CHAR * 2 - 1) /* size of table */
21 #define LZHUF_MAX_FREQ 0x8000 /* updates tree when the */
22 /* root frequency comes to this value. */
24 struct lzahuf_ctx {
25 deark *c;
26 const char *modname;
27 u8 dbg_mode;
28 struct de_lh1_params lh1p;
29 struct de_dfilter_ctx *dfctx;
30 struct de_dfilter_out_params *dcmpro; // same as dfctx->dcmpro
31 struct de_dfilter_results *dres; // same as dfctx->dres
33 UI num_length_codes;
34 UI match_length_bias;
35 UI lzhuf_N_CHAR; /* kinds of characters (character code = 0..N_CHAR-1) */
36 UI lzhuf_T;
37 UI lzhuf_R; /* position of root */ /* (LZHUF_T - 1) */
38 UI num_special_codes;
39 UI dpparam_dcode_shift;
40 UI dpparam_dlen_bias;
41 UI dpparam_mask;
43 int errflag;
44 i64 total_nbytes_processed;
45 i64 nbytes_written;
47 struct de_lz77buffer *ringbuf;
48 struct de_bitbuf_lowlevel bbll;
50 u16 freq[LZHUF_MAX_T + 1]; /* frequency table */
52 u16 prnt[LZHUF_MAX_T + LZHUF_MAX_N_CHAR]; /* pointers to parent nodes, except for the */
53 /* elements [T..T + N_CHAR - 1] which are used to get */
54 /* the positions of leaves corresponding to the codes. */
56 u16 son[LZHUF_MAX_T]; /* pointers to child nodes (son[], son[] + 1) */
59 // These getters/setters are ugly, but it's too difficult for me to follow the
60 // dancing variables in the lzhuf code, and convince myself that there are no
61 // array overruns.
62 // Assuming the code is safe, these functions can be replaced by simple macros.
64 static u16 get_freq(struct lzahuf_ctx *cctx, UI idx)
66 if(idx < (UI)DE_ARRAYCOUNT(cctx->freq)) return cctx->freq[idx];
67 cctx->errflag = 1;
68 return 0;
71 static void set_freq(struct lzahuf_ctx *cctx, UI idx, u16 val)
73 if(idx < (UI)DE_ARRAYCOUNT(cctx->freq)) cctx->freq[idx] = val;
74 else cctx->errflag = 1;
77 static u16 get_son(struct lzahuf_ctx *cctx, UI idx)
79 if(idx < (UI)DE_ARRAYCOUNT(cctx->son)) return cctx->son[idx];
80 cctx->errflag = 1;
81 return 0;
84 static void set_son(struct lzahuf_ctx *cctx, UI idx, u16 val)
86 if(idx < (UI)DE_ARRAYCOUNT(cctx->son)) cctx->son[idx] = val;
87 else cctx->errflag = 1;
90 static u16 get_prnt(struct lzahuf_ctx *cctx, UI idx)
92 if(idx < (UI)DE_ARRAYCOUNT(cctx->prnt)) return cctx->prnt[idx];
93 cctx->errflag = 1;
94 return 0;
97 static void set_prnt(struct lzahuf_ctx *cctx, UI idx, u16 val)
99 if(idx < (UI)DE_ARRAYCOUNT(cctx->prnt)) cctx->prnt[idx] = val;
100 else cctx->errflag = 1;
103 static UI lzhuf_getbits(struct lzahuf_ctx *cctx, UI nbits)
105 if(cctx->bbll.nbits_in_bitbuf < nbits) {
106 cctx->errflag = 1;
107 return 0;
109 return (UI)de_bitbuf_lowlevel_get_bits(&cctx->bbll, nbits);
112 /* initialization of tree */
114 static void lzhuf_StartHuff(struct lzahuf_ctx *cctx)
116 UI i, j;
118 for (i = 0; i < cctx->lzhuf_N_CHAR; i++) {
119 set_freq(cctx, i, 1);
120 set_son(cctx, i, i + cctx->lzhuf_T);
121 set_prnt(cctx, i + cctx->lzhuf_T, i);
123 i = 0;
124 j = cctx->lzhuf_N_CHAR;
125 while (j <= cctx->lzhuf_R) {
126 set_freq(cctx, j, get_freq(cctx, i) + get_freq(cctx, i + 1));
127 set_son(cctx, j, i);
128 set_prnt(cctx, i, j);
129 set_prnt(cctx, i + 1, j);
130 i += 2;
131 j++;
133 set_freq(cctx, cctx->lzhuf_T, 0xffff);
134 set_prnt(cctx, cctx->lzhuf_R, 0);
138 /* reconstruction of tree */
140 static void lzhuf_reconst(struct lzahuf_ctx *cctx)
142 UI i, j, k;
143 UI f, l;
145 /* collect leaf nodes in the first half of the table */
146 /* and replace the freq by (freq + 1) / 2. */
147 j = 0;
148 for (i = 0; i < cctx->lzhuf_T; i++) {
149 if (get_son(cctx, i) >= cctx->lzhuf_T) {
150 set_freq(cctx, j, (get_freq(cctx, i) + 1) / 2);
151 set_son(cctx, j, get_son(cctx, i));
152 j++;
155 /* begin constructing tree by connecting sons */
156 for (i = 0, j = cctx->lzhuf_N_CHAR; j < cctx->lzhuf_T; i += 2, j++) {
157 k = i + 1;
158 set_freq(cctx, j, get_freq(cctx, i) + get_freq(cctx, k));
159 f = get_freq(cctx, j);
161 k = j - 1;
162 while(f < get_freq(cctx, k)) {
163 k--;
166 k++;
168 l = (j - k);
169 // son[] is smaller than freq[], so bounds check uses son[].
170 if(l > (UI)DE_ARRAYCOUNT(cctx->son) ||
171 k+1+l > (UI)DE_ARRAYCOUNT(cctx->son))
173 cctx->errflag = 1;
174 return;
177 de_memmove(&cctx->freq[k + 1], &cctx->freq[k], l*sizeof(cctx->freq[0]));
178 set_freq(cctx, k, f);
179 de_memmove(&cctx->son[k + 1], &cctx->son[k], l*sizeof(cctx->son[0]));
180 set_son(cctx, k, i);
182 /* connect prnt */
183 for (i = 0; i < cctx->lzhuf_T; i++) {
184 k = get_son(cctx, i);
185 if (k >= cctx->lzhuf_T) {
186 set_prnt(cctx, k, i);
187 } else {
188 set_prnt(cctx, k, i);
189 set_prnt(cctx, k + 1, i);
195 /* increment frequency of given code by one, and update tree */
197 static void lzhuf_update(struct lzahuf_ctx *cctx, UI c)
199 UI i, j, l;
200 UI k;
201 UI counter = 0;
202 UI r_freq;
204 r_freq = get_freq(cctx, cctx->lzhuf_R);
205 if(r_freq > LZHUF_MAX_FREQ) {
206 cctx->errflag = 1;
207 return;
209 if (r_freq == LZHUF_MAX_FREQ) {
210 lzhuf_reconst(cctx);
211 if(cctx->errflag) return;
213 c = get_prnt(cctx, c + cctx->lzhuf_T);
214 do {
215 if(counter > (UI)DE_ARRAYCOUNT(cctx->prnt)) { // infinite loop?
216 cctx->errflag = 1;
217 return;
219 counter++;
221 set_freq(cctx, c, get_freq(cctx, c)+1);
222 k = get_freq(cctx, c);
224 /* if the order is disturbed, exchange nodes */
225 l = c + 1;
226 if (k > get_freq(cctx, l)) {
228 do {
229 l++;
230 } while(k > get_freq(cctx, l));
232 l--;
233 set_freq(cctx, c, get_freq(cctx, l));
234 set_freq(cctx, l, k);
236 i = get_son(cctx, c);
237 set_prnt(cctx, i, l);
238 if (i < cctx->lzhuf_T) set_prnt(cctx, i + 1, l);
240 j = get_son(cctx, l);
241 set_son(cctx, l, i);
243 set_prnt(cctx, j, c);
244 if (j < cctx->lzhuf_T) set_prnt(cctx, j + 1, c);
245 set_son(cctx, c, j);
247 c = l;
249 c = get_prnt(cctx, c);
250 } while (c != 0); /* repeat up to root */
253 static UI lzhuf_DecodeChar(struct lzahuf_ctx *cctx)
255 UI c;
257 c = get_son(cctx, cctx->lzhuf_R);
259 /* travel from root to leaf, */
260 /* choosing the smaller child node (son[]) if the read bit is 0, */
261 /* the bigger (son[]+1) if 1 */
262 while (c < cctx->lzhuf_T) {
263 c += lzhuf_getbits(cctx, 1);
264 // There will never be more than 64 bits available, so a hypothetical
265 // infinite loop would be caught here.
266 if(cctx->errflag) return 0;
267 c = get_son(cctx, c);
269 c -= cctx->lzhuf_T;
270 lzhuf_update(cctx, c);
271 return c;
274 static UI lzhuf_DecodePosition(struct lzahuf_ctx *cctx)
276 UI i, j, c;
277 UI d_code, d_len;
279 /* recover upper bits from table */
280 i = lzhuf_getbits(cctx, 8);
281 if(cctx->errflag) return 0;
282 fmtutil_get_lzhuf_d_code_and_len(i, &d_code, &d_len);
283 c = d_code << cctx->dpparam_dcode_shift;
285 /* read lower bits verbatim */
286 j = d_len - cctx->dpparam_dlen_bias;
287 i = (i<<j) | lzhuf_getbits(cctx, j);
288 if(cctx->errflag) return 0;
289 i &= cctx->dpparam_mask;
290 return c | i;
293 static int lzah_have_enough_output(struct lzahuf_ctx *cctx)
295 if(cctx->dcmpro->len_known) {
296 if(cctx->nbytes_written >= cctx->dcmpro->expected_len) {
297 cctx->dfctx->finished_flag = 1;
298 return 1;
301 return 0;
304 static void lzah_lz77buf_writebytecb(struct de_lz77buffer *rb, u8 n)
306 struct lzahuf_ctx *cctx = (struct lzahuf_ctx*)rb->userdata;
308 if(lzah_have_enough_output(cctx)) {
309 return;
311 dbuf_writebyte(cctx->dcmpro->f, n);
312 cctx->nbytes_written++;
315 static void lzhuf_decodesubtree_nodepair(struct lzahuf_ctx *cctx, UI p1, u64 val, UI val_nbits,
316 char *b2buf, size_t b2buf_len);
318 static void lzhuf_decodesubtree_node(struct lzahuf_ctx *cctx, UI p1, u64 val, UI val_nbits,
319 char *b2buf, size_t b2buf_len)
321 if(cctx->son[p1] < cctx->lzhuf_T) { // [pointer node]
322 lzhuf_decodesubtree_nodepair(cctx, cctx->son[p1], val, val_nbits, b2buf, b2buf_len);
324 else { // [leaf node]
325 de_dbg(cctx->c, "code: \"%s\" = %d [%u]",
326 de_print_base2_fixed(b2buf, b2buf_len, val, val_nbits),
327 (int)cctx->son[p1], (UI)(cctx->son[p1]-cctx->lzhuf_T));
331 // Interpret son[p1] and son[p1+1]
332 static void lzhuf_decodesubtree_nodepair(struct lzahuf_ctx *cctx, UI p1, u64 val, UI val_nbits,
333 char *b2buf, size_t b2buf_len)
335 if(p1 >= cctx->lzhuf_T) return; // error
336 lzhuf_decodesubtree_node(cctx, p1, (val<<1), val_nbits+1, b2buf, b2buf_len);
337 lzhuf_decodesubtree_node(cctx, p1+1, (val<<1)|1, val_nbits+1, b2buf, b2buf_len);
340 static void lzhuf_dumptree(struct lzahuf_ctx *cctx)
342 UI i;
343 char b2buf[72];
345 de_dbg(cctx->c, "R: %u", (UI)cctx->lzhuf_R);
346 de_dbg(cctx->c, "T: %u", (UI)cctx->lzhuf_T);
347 lzhuf_decodesubtree_node(cctx, cctx->lzhuf_R, 0, 0, b2buf, sizeof(b2buf));
348 for(i=0; i<DE_ARRAYCOUNT(cctx->son); i++) {
349 de_dbg(cctx->c, "son[%u]: %u", i, (UI)cctx->son[i]);
353 static void lzhuf_Decode_init(struct lzahuf_ctx *cctx)
355 UI rb_size; // LZHUF_N
357 // Defaults, for standard LHarc -lh1-:
358 // codes 0-255 = literals 0x00-0xff
359 // codes 256-313 = match lengths 3-60
360 cctx->num_special_codes = 0;
361 cctx->num_length_codes = 58;
362 cctx->match_length_bias = 253;
363 cctx->dpparam_dcode_shift = 6;
364 cctx->dpparam_dlen_bias = 2;
365 cctx->dpparam_mask = 0x3f;
366 rb_size = 4096;
368 if(cctx->lh1p.is_crlzh11 || cctx->lh1p.is_crlzh20) {
369 // codes 0-255 = literals 0x00-0xff
370 // code 256 = "stop"
371 // codes 257-314 = match lengths 3-60
372 cctx->num_special_codes = 1;
373 cctx->num_length_codes = 58;
374 cctx->match_length_bias = 254;
375 rb_size = 2048;
376 if(cctx->lh1p.is_crlzh20) {
377 cctx->dpparam_dcode_shift = 5;
378 cctx->dpparam_dlen_bias = 3;
379 cctx->dpparam_mask = 0x1f;
382 else if(cctx->lh1p.is_arc_trimmed) {
383 // codes 0-255 = literals 0x00-0xff
384 // code 256 = "stop"
385 // codes 257-313 = match lengths 3-59
386 cctx->num_special_codes = 1;
387 cctx->num_length_codes = 57;
388 cctx->match_length_bias = 254;
390 else if(cctx->lh1p.is_dms_deep) {
391 rb_size = 16*1024;
392 cctx->dpparam_dcode_shift = 8;
393 cctx->dpparam_dlen_bias = 0;
394 cctx->dpparam_mask = 0xff;
397 cctx->lzhuf_N_CHAR = 256 + cctx->num_special_codes + cctx->num_length_codes;
398 if(cctx->lzhuf_N_CHAR > LZHUF_MAX_N_CHAR) {
399 cctx->errflag = 1;
400 goto done;
402 cctx->lzhuf_T = cctx->lzhuf_N_CHAR * 2 - 1;
403 cctx->lzhuf_R = cctx->lzhuf_T - 1;
405 cctx->ringbuf = de_lz77buffer_create(cctx->c, rb_size);
406 cctx->ringbuf->userdata = (void*)cctx;
407 cctx->ringbuf->writebyte_cb = lzah_lz77buf_writebytecb;
408 if(cctx->lh1p.history_fill_val!=0) {
409 de_lz77buffer_clear(cctx->ringbuf, cctx->lh1p.history_fill_val);
412 cctx->bbll.is_lsb = 0;
413 de_bitbuf_lowlevel_empty(&cctx->bbll);
415 lzhuf_StartHuff(cctx);
417 if(cctx->dbg_mode) {
418 lzhuf_dumptree(cctx);
421 done:
425 static void lzhuf_Decode_continue(struct lzahuf_ctx *cctx, const u8 *buf, i64 buf_len, int flush)
427 UI i, j, c;
428 i64 bufpos = 0;
429 char pos_descr[32];
431 pos_descr[0] = '\0';
432 if(cctx->dfctx->finished_flag) goto done;
434 while(1) {
435 if(cctx->errflag) goto done;
436 if(lzah_have_enough_output(cctx)) {
437 goto done;
440 // We're relying on the assumption that a compression instruction can
441 // never be more than 57 bits in size.
442 // The Huffman-encoded "character" maxes out, I think, at around 15 or
443 // 16 bits -- this follows indirectly from MAX_FREQ, which reduces the
444 // tree depth perdiodically.
445 // For a "match" instruction, an 8-bit code is then read.
446 // Then, some additional bits are read -- the most possible for the
447 // supported formats is 8, for DMS-Deep.
448 // So, about 16+8+8 = 32 bits should be sufficient.
450 // Top off the bitbuf, if possible.
451 while(bufpos<buf_len && cctx->bbll.nbits_in_bitbuf<=(64-8)) {
452 de_bitbuf_lowlevel_add_byte(&cctx->bbll, buf[bufpos++]);
453 cctx->total_nbytes_processed++;
456 if(!flush && cctx->bbll.nbits_in_bitbuf<=(64-8)) {
457 // Wait for more data before trying to continue
458 goto done;
461 if(cctx->c->debug_level>=4) {
462 de_bitbuf_describe_curpos(&cctx->bbll,
463 cctx->dfctx->input_file_offset+cctx->total_nbytes_processed,
464 pos_descr, sizeof(pos_descr));
467 c = lzhuf_DecodeChar(cctx);
468 if(cctx->errflag) goto done;
470 if(c==256 && cctx->num_special_codes>=1) {
471 if(cctx->c->debug_level>=4) {
472 de_dbg(cctx->c, "special @%s =%u", pos_descr, c);
474 cctx->dfctx->finished_flag = 1;
475 goto done;
477 if (c < 256) {
478 if(cctx->c->debug_level>=4) {
479 de_dbg(cctx->c, "lit @%s =%u", pos_descr, c);
481 de_lz77buffer_add_literal_byte(cctx->ringbuf, (u8)c);
483 else {
484 // i is the distance back
485 i = lzhuf_DecodePosition(cctx);
486 if(cctx->errflag) goto done;
488 // j is the match length
489 j = c - cctx->match_length_bias;
491 if(cctx->c->debug_level>=4) {
492 de_dbg(cctx->c, "match @%s dist=%u len=%u", pos_descr, i, j);
495 de_lz77buffer_copy_from_hist(cctx->ringbuf,
496 (UI)(cctx->ringbuf->curpos - (UI)i - 1), j);
500 done:
501 if(!cctx->dfctx->finished_flag && !cctx->errflag) {
502 if(bufpos<buf_len) {
503 // It shouldn't be possible to get here. If we do, it means some input
504 // bytes will be lost.
505 de_dfilter_set_generic_error(cctx->dfctx->c, cctx->dres, cctx->modname);
506 cctx->errflag = 1;