Edited some module descriptions
[deark.git] / src / fmtutil-huffman.c
blob7e4ebd64c18a37ac74021c1aa63f76e0a17f8792
1 // This file is part of Deark.
2 // Copyright (C) 2020 Jason Summers
3 // See the file COPYING for terms of use.
5 // Functions related to Huffman coding decompression
7 #define DE_NOT_IN_MODULE
8 #include "deark-private.h"
9 #include "deark-fmtutil.h"
11 #define NODE_REF_TYPE u32
12 #define MAX_MAX_NODES 66000
14 struct huffman_nval_pointer_data {
15 NODE_REF_TYPE noderef;
17 struct huffman_nval_value_data {
18 fmtutil_huffman_valtype value;
21 union huffman_nval_data {
22 struct huffman_nval_pointer_data hnpd;
23 struct huffman_nval_value_data hnvd;
26 struct huffman_node {
27 #define CHILDSTATUS_UNUSED 0
28 #define CHILDSTATUS_POINTER 1
29 #define CHILDSTATUS_VALUE 2
30 u8 child_status[2];
31 u8 depth;
32 union huffman_nval_data child[2];
35 struct huffman_lengths_arr_item {
36 fmtutil_huffman_valtype val;
37 UI len;
40 struct fmtutil_huffman_code_builder {
41 i64 lengths_arr_numalloc;
42 i64 lengths_arr_numused;
43 struct huffman_lengths_arr_item *lengths_arr; // array[lengths_arr_numalloc]
46 struct fmtutil_huffman_cursor {
47 NODE_REF_TYPE curr_noderef;
50 struct fmtutil_huffman_codebook {
51 i64 max_nodes;
52 NODE_REF_TYPE next_avail_node;
53 NODE_REF_TYPE nodes_alloc;
54 struct huffman_node *nodes; // array[nodes_alloc]
55 u8 has_null_code;
56 fmtutil_huffman_valtype value_of_null_code;
58 i64 num_codes;
59 UI max_bits;
62 // Ensure that at least n nodes are allocated (0 through n-1)
63 static int huffman_ensure_alloc(deark *c, struct fmtutil_huffman_codebook *bk, NODE_REF_TYPE n)
65 i64 new_nodes_alloc;
67 if(n <= bk->nodes_alloc) return 1;
68 if((i64)n > bk->max_nodes) return 0;
70 new_nodes_alloc = (i64)bk->nodes_alloc * 2;
71 if(new_nodes_alloc > bk->max_nodes) new_nodes_alloc = bk->max_nodes;
72 if(new_nodes_alloc < (i64)n) new_nodes_alloc = (i64)n;
73 if(new_nodes_alloc < 16) new_nodes_alloc = 16;
75 bk->nodes = de_reallocarray(c, bk->nodes, bk->nodes_alloc, sizeof(struct huffman_node),
76 new_nodes_alloc);
77 bk->nodes_alloc = (NODE_REF_TYPE)new_nodes_alloc;
78 return 1;
81 // Tracks the number of items with VALUE status ("codes").
82 static void huffman_setchildstatus(struct fmtutil_huffman_codebook *bk, NODE_REF_TYPE n,
83 u8 child_idx, u8 newstatus)
85 if(n>=bk->nodes_alloc) return;
86 if(child_idx>1) return;
88 if(bk->nodes[n].child_status[child_idx]==newstatus) return;
89 if(bk->nodes[n].child_status[child_idx]==CHILDSTATUS_VALUE) {
90 bk->num_codes--;
92 if(newstatus==CHILDSTATUS_VALUE) {
93 bk->num_codes++;
95 bk->nodes[n].child_status[child_idx] = newstatus;
98 // The size of the longest current code.
99 // This is mainly for debugging info -- it is not guaranteed to be correct if
100 // the tree was constructed improperly.
101 UI fmtutil_huffman_get_max_bits(struct fmtutil_huffman_codebook *bk)
103 return bk->max_bits;
106 // The number of codes (symbols) in the the tree.
107 // This is mainly for debugging info -- it is not guaranteed to be correct if
108 // the tree was constructed improperly.
109 i64 fmtutil_huffman_get_num_codes(struct fmtutil_huffman_codebook *bk)
111 if(bk->num_codes>=0) return bk->num_codes;
112 return 0;
115 void fmtutil_huffman_reset_cursor(struct fmtutil_huffman_cursor *cursor)
117 cursor->curr_noderef = 0;
120 // Add a code, adding to the current tree structure as needed. Codes can be
121 // added in any order.
123 // If inconsistent codes are added (i.e. a code is a prefix of another code, or
124 // the tree is left incomplete), we only promise that it will be safe to use
125 // the decoding functions. Such errors will not necessarily be detected.
127 // Note that adding the 0-length code is allowed.
128 int fmtutil_huffman_add_code(deark *c, struct fmtutil_huffman_codebook *bk,
129 u64 code, UI code_nbits, fmtutil_huffman_valtype val)
131 UI k;
132 NODE_REF_TYPE curr_noderef = 0; // Note that this may temporarily point to an unallocated node
133 int retval = 0;
135 if(code_nbits>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) goto done;
137 if(code_nbits<1) {
138 bk->value_of_null_code = val;
139 bk->has_null_code = 1;
140 retval = 1;
141 goto done;
143 bk->has_null_code = 0;
145 if(code_nbits > bk->max_bits) {
146 bk->max_bits = code_nbits;
149 // Iterate through the bits, high bit first.
150 for(k=0; k<code_nbits; k++) {
151 UI child_idx; // 0 or 1
153 // Make sure the current node exists
154 if(curr_noderef >= bk->nodes_alloc) {
155 if(!huffman_ensure_alloc(c, bk, curr_noderef+1)) goto done;
157 // Claim the current node, if necessary
158 if(curr_noderef >= bk->next_avail_node) {
159 bk->next_avail_node = curr_noderef+1;
160 bk->nodes[curr_noderef].depth = (u8)k;
163 child_idx = (code>>(code_nbits-1-k))&0x1;
165 if(k==code_nbits-1) {
166 // Reached the "leaf" node. Set the value for this child_idx.
167 huffman_setchildstatus(bk, curr_noderef, child_idx, CHILDSTATUS_VALUE);
168 bk->nodes[curr_noderef].child[child_idx].hnvd.value = val;
170 else {
171 // Not at the leaf node yet.
172 if(bk->nodes[curr_noderef].child_status[child_idx]==CHILDSTATUS_POINTER) {
173 // It's already a pointer.
174 curr_noderef = bk->nodes[curr_noderef].child[child_idx].hnpd.noderef;
176 else {
177 NODE_REF_TYPE next_noderef;
179 // It's not a pointer -- make it one.
180 if(bk->next_avail_node >= bk->max_nodes) goto done;
181 next_noderef = bk->next_avail_node;
182 huffman_setchildstatus(bk, curr_noderef, child_idx, CHILDSTATUS_POINTER);
183 bk->nodes[curr_noderef].child[child_idx].hnpd.noderef = next_noderef;
184 curr_noderef = next_noderef;
189 retval = 1;
191 done:
192 return retval;
195 // Caller supplies one bit of data to the decoder (the low bit of bitval).
196 // Returns:
197 // 1 = This was the last bit of a code; value returned in *pval
198 // 2 = Need more bits (*pval unchanged)
199 // 0 = Error (*pval unchanged)
200 // If return value is not 2, resets the cursor before returning.
201 // Note that, by itself, this function cannot read the zero-length code.
202 int fmtutil_huffman_decode_bit(struct fmtutil_huffman_codebook *bk, struct fmtutil_huffman_cursor *cursor,
203 u8 bitval, fmtutil_huffman_valtype *pval)
205 UI child_idx;
206 int retval = 0;
207 NODE_REF_TYPE curr_noderef = cursor->curr_noderef;
209 if(curr_noderef >= bk->nodes_alloc) goto done;
210 if(curr_noderef >= bk->next_avail_node) goto done;
211 child_idx = bitval & 0x01;
213 if(bk->nodes[curr_noderef].child_status[child_idx]==CHILDSTATUS_VALUE) {
214 *pval = bk->nodes[curr_noderef].child[child_idx].hnvd.value;
215 retval = 1;
216 goto done;
218 else if(bk->nodes[curr_noderef].child_status[child_idx]==CHILDSTATUS_POINTER) {
219 cursor->curr_noderef = bk->nodes[curr_noderef].child[child_idx].hnpd.noderef;
220 retval = 2;
221 goto done;
224 done:
225 if(retval!=2) {
226 fmtutil_huffman_reset_cursor(cursor);
228 return retval;
231 // Read the next Huffman code from a bitreader, and decode it.
232 // *pval will always be written to. On error, it will be set to 0.
233 // pnbits returns the number of bits read. Can be NULL.
234 // Return value:
235 // nonzero on success
236 // 0 on error - Can happen if the tree was not constructed properly, or on EOF
237 // (bitrd->eof_flag can distinguish these cases).
238 int fmtutil_huffman_read_next_value(struct fmtutil_huffman_codebook *bk,
239 struct de_bitreader *bitrd, fmtutil_huffman_valtype *pval, UI *pnbits)
241 int bitcount = 0;
242 int retval = 0;
243 struct fmtutil_huffman_cursor tmpcursor;
245 if(bitrd->eof_flag) goto done;
246 tmpcursor.curr_noderef = 0;
248 if(bk->has_null_code) {
249 *pval = bk->value_of_null_code;
250 retval = 1;
251 goto done;
254 while(1) {
255 int ret;
256 u8 b;
258 b = (u8)de_bitreader_getbits(bitrd, 1);
259 if(bitrd->eof_flag) goto done;
260 bitcount++;
261 if(bitcount>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) goto done; // Should be impossible
263 ret = fmtutil_huffman_decode_bit(bk, &tmpcursor, b, pval);
264 if(ret==1) { // finished the code
265 retval = 1;
266 goto done;
268 else if(ret!=2) { // decoding error
269 goto done;
272 done:
273 if(!retval) {
274 *pval = 0;
276 if(pnbits) {
277 *pnbits = retval ? bitcount : 0;
279 return retval;
282 // For debugging
283 void fmtutil_huffman_dump(deark *c, struct fmtutil_huffman_decoder *ht)
285 NODE_REF_TYPE k;
286 de_ucstring *tmps = NULL;
288 de_dbg(c, "internal huffman table:");
289 de_dbg_indent(c, 1);
291 de_dbg(c, "number of codes: %"I64_FMT, fmtutil_huffman_get_num_codes(ht->bk));
292 de_dbg(c, "max code size: %u bits", fmtutil_huffman_get_max_bits(ht->bk));
293 tmps = ucstring_create(c);
294 for(k=0; k<ht->bk->next_avail_node && k<ht->bk->nodes_alloc; k++) {
295 UI child_idx;
296 struct huffman_node *nd = &ht->bk->nodes[k];
298 ucstring_empty(tmps);
299 ucstring_printf(tmps, DE_ENCODING_LATIN1, "node[%u]: depth=%u (", (UI)k, (UI)nd->depth);
301 for(child_idx=0; child_idx<=1; child_idx++) {
302 if(child_idx==1) {
303 ucstring_append_sz(tmps, " ", DE_ENCODING_LATIN1);
305 if(nd->child_status[child_idx]==CHILDSTATUS_POINTER) {
306 ucstring_printf(tmps, DE_ENCODING_LATIN1, "next=%u",
307 (UI)nd->child[child_idx].hnpd.noderef);
309 else if(nd->child_status[child_idx]==CHILDSTATUS_VALUE) {
310 ucstring_printf(tmps, DE_ENCODING_LATIN1, "value=%d",
311 (int)nd->child[child_idx].hnvd.value);
313 else {
314 ucstring_append_sz(tmps, "unused", DE_ENCODING_LATIN1);
317 ucstring_printf(tmps, DE_ENCODING_LATIN1, ")");
318 de_dbg(c, "%s", ucstring_getpsz_d(tmps));
320 ucstring_destroy(tmps);
321 de_dbg_indent(c, -1);
324 // This is only used with fmtutil_huffman_make_canonical_tree().
325 // Call this first, once per item.
326 // The order that you supply the items matters, at least within the set of items
327 // having the same length.
328 // Cannot be used for zero-length items. If len==0, it's a successful no-op.
329 int fmtutil_huffman_record_a_code_length(deark *c, struct fmtutil_huffman_code_builder *builder,
330 fmtutil_huffman_valtype val, UI len)
332 if(len==0) return 1;
333 if(len > FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) return 0;
334 if(builder->lengths_arr_numused > MAX_MAX_NODES) return 0;
336 if(builder->lengths_arr_numused >= builder->lengths_arr_numalloc) {
337 i64 new_numalloc;
339 new_numalloc = builder->lengths_arr_numused + 128;
340 builder->lengths_arr = de_reallocarray(c, builder->lengths_arr, builder->lengths_arr_numalloc,
341 sizeof(struct huffman_lengths_arr_item), new_numalloc);
342 builder->lengths_arr_numalloc = new_numalloc;
344 builder->lengths_arr[builder->lengths_arr_numused].val = val;
345 builder->lengths_arr[builder->lengths_arr_numused++].len = len;
346 return 1;
349 // The usual canonical format - leaves are left-aligned
350 static int fmtutil_huffman_make_canonical_tree1(deark *c, struct fmtutil_huffman_codebook *bk,
351 struct fmtutil_huffman_code_builder *builder, UI max_sym_len_used)
353 UI symlen;
354 UI codes_count_total = 0;
355 UI prev_code_bit_length = 0;
356 u64 prev_code = 0;
357 int retval = 0;
358 char b2buf[72];
360 // For each possible symbol length...
361 for(symlen=1; symlen<=max_sym_len_used; symlen++) {
362 UI k;
364 // Find all the codes that use this symbol length, in order
365 for(k=0; k<(UI)builder->lengths_arr_numused; k++) {
366 int ret;
367 u64 thiscode;
369 if(builder->lengths_arr[k].len != symlen) continue;
370 // Found a code of the length we're looking for.
372 if(codes_count_total==0) {
373 thiscode = 0;
375 else {
376 thiscode = prev_code + 1;
377 if(symlen > prev_code_bit_length) {
378 thiscode <<= (symlen - prev_code_bit_length);
382 prev_code = thiscode;
383 prev_code_bit_length = symlen;
384 codes_count_total++;
386 if(c->debug_level>=3) {
387 de_dbg3(c, "code: \"%s\" = %d",
388 de_print_base2_fixed(b2buf, sizeof(b2buf), thiscode, symlen),
389 (int)builder->lengths_arr[k].val);
391 ret = fmtutil_huffman_add_code(c, bk, thiscode, symlen, builder->lengths_arr[k].val);
392 if(!ret) {
393 goto done;
397 retval = 1;
399 done:
400 return retval;
403 // "pack" style - branches are left-aligned
404 static int fmtutil_huffman_make_canonical_tree2(deark *c, struct fmtutil_huffman_codebook *bk,
405 struct fmtutil_huffman_code_builder *builder, UI max_sym_len_used)
407 UI symlen;
408 UI codes_count_total = 0;
409 UI prev_code_bit_length = 0;
410 u64 prev_code = 0;
411 int retval = 0;
412 char b2buf[72];
414 // For each possible symbol length...
415 for(symlen=max_sym_len_used; symlen>=1; symlen--) {
416 UI k;
418 // Find all the codes that use this symbol length, in order
419 for(k=0; k<(UI)builder->lengths_arr_numused; k++) {
420 int ret;
421 u64 this_code;
423 if(builder->lengths_arr[k].len != symlen) continue;
424 // Found a code of the length we're looking for.
426 if(codes_count_total==0) {
427 this_code = 0;
429 else {
430 this_code = (prev_code>>(prev_code_bit_length-symlen)) + 1;
433 prev_code = this_code;
434 prev_code_bit_length = symlen;
435 codes_count_total++;
437 if(c->debug_level>=3) {
438 de_dbg3(c, "code: \"%s\" = %d",
439 de_print_base2_fixed(b2buf, sizeof(b2buf), this_code, symlen),
440 (int)builder->lengths_arr[k].val);
442 ret = fmtutil_huffman_add_code(c, bk, this_code, symlen, builder->lengths_arr[k].val);
443 if(!ret) {
444 goto done;
448 retval = 1;
450 done:
451 return retval;
454 static void reverse_lengths_array(struct fmtutil_huffman_code_builder *builder)
456 i64 i;
457 i64 num_swaps = builder->lengths_arr_numused / 2;
458 struct huffman_lengths_arr_item tmpitem;
460 for(i=0; i<num_swaps; i++) {
461 tmpitem = builder->lengths_arr[i]; // struct copy
462 builder->lengths_arr[i] = builder->lengths_arr[builder->lengths_arr_numused-1-i];
463 builder->lengths_arr[builder->lengths_arr_numused-1-i] = tmpitem;
467 // Call this after calling huffman_record_item_length() (usually many times).
468 // Creates a canonical Huffman tree derived from the known code lengths.
469 int fmtutil_huffman_make_canonical_code(deark *c, struct fmtutil_huffman_codebook *bk,
470 struct fmtutil_huffman_code_builder *builder, UI flags)
472 UI max_sym_len_used;
473 UI i;
474 int saved_indent_level;
475 int retval = 0;
477 de_dbg_indent_save(c, &saved_indent_level);
478 de_dbg3(c, "derived huffman codebook:");
479 de_dbg_indent(c, 1);
481 if(!builder->lengths_arr) {
482 retval = 1;
483 goto done;
486 if(flags & FMTUTIL_MCTFLAG_LAST_CODE_FIRST) {
487 // Bit of a hack. Instead of each worker function having to have a way
488 // to read the array from back to front, reverse the order of items in
489 // the array, so we don't have to deal with it later.
490 reverse_lengths_array(builder);
493 // Find the maximum length
494 max_sym_len_used = 0;
495 for(i=0; i<(UI)builder->lengths_arr_numused; i++) {
496 if(builder->lengths_arr[i].len > max_sym_len_used) {
497 max_sym_len_used = builder->lengths_arr[i].len;
500 if(max_sym_len_used>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) {
501 goto done;
504 if(flags & FMTUTIL_MCTFLAG_LEFT_ALIGN_BRANCHES) {
505 retval = fmtutil_huffman_make_canonical_tree2(c, bk, builder, max_sym_len_used);
507 else {
508 retval = fmtutil_huffman_make_canonical_tree1(c, bk, builder, max_sym_len_used);
511 done:
512 de_dbg_indent_restore(c, saved_indent_level);
513 return retval;
516 static struct fmtutil_huffman_codebook *huffman_create_codebook(deark *c,
517 i64 initial_codes, i64 max_codes)
519 struct fmtutil_huffman_codebook *bk;
520 i64 initial_nodes;
522 bk = de_malloc(c, sizeof(struct fmtutil_huffman_codebook));
523 if(max_codes>0) {
524 bk->max_nodes = max_codes;
526 else {
527 bk->max_nodes = MAX_MAX_NODES;
529 if(bk->max_nodes > MAX_MAX_NODES) {
530 bk->max_nodes = MAX_MAX_NODES;
533 if(initial_codes>0) {
534 initial_nodes = initial_codes;
536 else {
537 initial_nodes = 1;
539 if(initial_nodes > MAX_MAX_NODES) {
540 initial_nodes = MAX_MAX_NODES;
543 huffman_ensure_alloc(c, bk, (NODE_REF_TYPE)initial_nodes);
544 bk->next_avail_node = 0;
545 bk->num_codes = 0;
546 bk->max_bits = 0;
547 return bk;
550 static void huffman_destroy_codebook(deark *c, struct fmtutil_huffman_codebook *bk)
552 if(!bk) return;
553 de_free(c, bk->nodes);
554 de_free(c, bk);
557 // initial_codes: If not 0, pre-allocate enough nodes for this many codes.
558 // max_codes: If not 0, attempting to add substantially more codes than this will fail.
559 struct fmtutil_huffman_decoder *fmtutil_huffman_create_decoder(deark *c, i64 initial_codes, i64 max_codes)
561 struct fmtutil_huffman_decoder *ht = NULL;
563 ht = de_malloc(c, sizeof(struct fmtutil_huffman_decoder));
564 ht->cursor = de_malloc(c, sizeof(struct fmtutil_huffman_cursor));
565 ht->bk = huffman_create_codebook(c, initial_codes, max_codes);
566 ht->builder = de_malloc(c, sizeof(struct fmtutil_huffman_code_builder));
567 fmtutil_huffman_reset_cursor(ht->cursor);
568 return ht;
571 void fmtutil_huffman_destroy_decoder(deark *c, struct fmtutil_huffman_decoder *ht)
573 if(!ht) return;
574 huffman_destroy_codebook(c, ht->bk);
575 ht->bk = NULL;
576 de_free(c, ht->builder->lengths_arr);
577 de_free(c, ht->builder);
578 de_free(c, ht->cursor);
579 de_free(c, ht);