Cleaned up some Huffman-related debug messages
[deark.git] / src / fmtutil-huffman.c
blob87d64e3d91c7ae55b298ec327127d68528b15869
1 // This file is part of Deark.
2 // Copyright (C) 2020 Jason Summers
3 // See the file COPYING for terms of use.
5 // Functions related to Huffman coding decompression
7 #define DE_NOT_IN_MODULE
8 #include "deark-private.h"
9 #include "deark-fmtutil.h"
11 #define NODE_REF_TYPE u32
12 #define MAX_MAX_NODES 66000
14 struct huffman_nval_pointer_data {
15 NODE_REF_TYPE noderef;
17 struct huffman_nval_value_data {
18 fmtutil_huffman_valtype value;
21 union huffman_nval_data {
22 struct huffman_nval_pointer_data hnpd;
23 struct huffman_nval_value_data hnvd;
26 struct huffman_node {
27 #define CHILDSTATUS_UNUSED 0
28 #define CHILDSTATUS_POINTER 1
29 #define CHILDSTATUS_VALUE 2
30 u8 child_status[2];
31 u8 depth;
32 union huffman_nval_data child[2];
35 struct huffman_lengths_arr_item {
36 fmtutil_huffman_valtype val;
37 UI len;
40 struct huffman_cursor {
41 NODE_REF_TYPE curr_noderef;
44 struct fmtutil_huffman_tree {
45 // In principle, the cursor should be separate, so we could have multiple
46 // cursors for one tree. But that's inconvenient, and it's not clear that
47 // it would be of any use in practice.
48 struct huffman_cursor cursor;
50 i64 max_nodes;
51 NODE_REF_TYPE next_avail_node;
52 NODE_REF_TYPE nodes_alloc;
53 struct huffman_node *nodes; // array[nodes_alloc]
54 u8 has_null_code;
55 fmtutil_huffman_valtype value_of_null_code;
57 i64 num_codes;
58 UI max_bits;
60 i64 lengths_arr_numalloc;
61 i64 lengths_arr_numused;
62 struct huffman_lengths_arr_item *lengths_arr; // array[lengths_arr_numalloc]
65 // Ensure that at least n nodes are allocated (0 through n-1)
66 static int huffman_ensure_alloc(deark *c, struct fmtutil_huffman_tree *ht, NODE_REF_TYPE n)
68 i64 new_nodes_alloc;
70 if(n <= ht->nodes_alloc) return 1;
71 if((i64)n > ht->max_nodes) return 0;
73 new_nodes_alloc = (i64)ht->nodes_alloc * 2;
74 if(new_nodes_alloc > ht->max_nodes) new_nodes_alloc = ht->max_nodes;
75 if(new_nodes_alloc < (i64)n) new_nodes_alloc = (i64)n;
76 if(new_nodes_alloc < 16) new_nodes_alloc = 16;
78 ht->nodes = de_reallocarray(c, ht->nodes, ht->nodes_alloc, sizeof(struct huffman_node),
79 new_nodes_alloc);
80 ht->nodes_alloc = (NODE_REF_TYPE)new_nodes_alloc;
81 return 1;
84 // Tracks the number of items with VALUE status ("codes").
85 static void huffman_setchildstatus(struct fmtutil_huffman_tree *ht, NODE_REF_TYPE n,
86 u8 child_idx, u8 newstatus)
88 if(n>=ht->nodes_alloc) return;
89 if(child_idx>1) return;
91 if(ht->nodes[n].child_status[child_idx]==newstatus) return;
92 if(ht->nodes[n].child_status[child_idx]==CHILDSTATUS_VALUE) {
93 ht->num_codes--;
95 if(newstatus==CHILDSTATUS_VALUE) {
96 ht->num_codes++;
98 ht->nodes[n].child_status[child_idx] = newstatus;
101 // The size of the longest current code.
102 // This is mainly for debugging info -- it is not guaranteed to be correct if
103 // the tree was constructed improperly.
104 UI fmtutil_huffman_get_max_bits(struct fmtutil_huffman_tree *ht)
106 return ht->max_bits;
109 // The number of codes (symbols) in the the tree.
110 // This is mainly for debugging info -- it is not guaranteed to be correct if
111 // the tree was constructed improperly.
112 i64 fmtutil_huffman_get_num_codes(struct fmtutil_huffman_tree *ht)
114 if(ht->num_codes>=0) return ht->num_codes;
115 return 0;
118 void fmtutil_huffman_reset_cursor(struct fmtutil_huffman_tree *ht)
120 ht->cursor.curr_noderef = 0;
123 // Add a code, adding to the current tree structure as needed. Codes can be
124 // added in any order.
126 // If inconsistent codes are added (i.e. a code is a prefix of another code, or
127 // the tree is left incomplete), we only promise that it will be safe to use
128 // the decoding functions. Such errors will not necessarily be detected.
130 // Note that adding the 0-length code is allowed.
131 int fmtutil_huffman_add_code(deark *c, struct fmtutil_huffman_tree *ht,
132 u64 code, UI code_nbits, fmtutil_huffman_valtype val)
134 UI k;
135 NODE_REF_TYPE curr_noderef = 0; // Note that this may temporarily point to an unallocated node
136 int retval = 0;
138 if(code_nbits>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) goto done;
140 if(code_nbits<1) {
141 ht->value_of_null_code = val;
142 ht->has_null_code = 1;
143 retval = 1;
144 goto done;
146 ht->has_null_code = 0;
148 if(code_nbits > ht->max_bits) {
149 ht->max_bits = code_nbits;
152 // Iterate through the bits, high bit first.
153 for(k=0; k<code_nbits; k++) {
154 UI child_idx; // 0 or 1
156 // Make sure the current node exists
157 if(curr_noderef >= ht->nodes_alloc) {
158 if(!huffman_ensure_alloc(c, ht, curr_noderef+1)) goto done;
160 // Claim the current node, if necessary
161 if(curr_noderef >= ht->next_avail_node) {
162 ht->next_avail_node = curr_noderef+1;
163 ht->nodes[curr_noderef].depth = (u8)k;
166 child_idx = (code>>(code_nbits-1-k))&0x1;
168 if(k==code_nbits-1) {
169 // Reached the "leaf" node. Set the value for this child_idx.
170 huffman_setchildstatus(ht, curr_noderef, child_idx, CHILDSTATUS_VALUE);
171 ht->nodes[curr_noderef].child[child_idx].hnvd.value = val;
173 else {
174 // Not at the leaf node yet.
175 if(ht->nodes[curr_noderef].child_status[child_idx]==CHILDSTATUS_POINTER) {
176 // It's already a pointer.
177 curr_noderef = ht->nodes[curr_noderef].child[child_idx].hnpd.noderef;
179 else {
180 NODE_REF_TYPE next_noderef;
182 // It's not a pointer -- make it one.
183 if(ht->next_avail_node >= ht->max_nodes) goto done;
184 next_noderef = ht->next_avail_node;
185 huffman_setchildstatus(ht, curr_noderef, child_idx, CHILDSTATUS_POINTER);
186 ht->nodes[curr_noderef].child[child_idx].hnpd.noderef = next_noderef;
187 curr_noderef = next_noderef;
192 retval = 1;
194 done:
195 return retval;
198 // Caller supplies one bit of data to the decoder (the low bit of bitval).
199 // Returns:
200 // 1 = This was the last bit of a code; value returned in *pval
201 // 2 = Need more bits (*pval unchanged)
202 // 0 = Error (*pval unchanged)
203 // If return value is not 2, resets the cursor before returning.
204 // Note that, by itself, this function cannot read the zero-length code.
205 int fmtutil_huffman_decode_bit(struct fmtutil_huffman_tree *ht, u8 bitval, fmtutil_huffman_valtype *pval)
207 UI child_idx;
208 int retval = 0;
209 NODE_REF_TYPE curr_noderef = ht->cursor.curr_noderef;
211 if(curr_noderef >= ht->nodes_alloc) goto done;
212 if(curr_noderef >= ht->next_avail_node) goto done;
213 child_idx = bitval & 0x01;
215 if(ht->nodes[curr_noderef].child_status[child_idx]==CHILDSTATUS_VALUE) {
216 *pval = ht->nodes[curr_noderef].child[child_idx].hnvd.value;
217 retval = 1;
218 goto done;
220 else if(ht->nodes[curr_noderef].child_status[child_idx]==CHILDSTATUS_POINTER) {
221 ht->cursor.curr_noderef = ht->nodes[curr_noderef].child[child_idx].hnpd.noderef;
222 retval = 2;
223 goto done;
226 done:
227 if(retval!=2) {
228 fmtutil_huffman_reset_cursor(ht);
230 return retval;
233 // Read the next Huffman code from a bitreader, and decode it.
234 // *pval will always be written to. On error, it will be set to 0.
235 // pnbits returns the number of bits read. Can be NULL.
236 // Return value:
237 // nonzero on success
238 // 0 on error - Can happen if the tree was not constructed properly, or on EOF
239 // (bitrd->eof_flag can distinguish these cases).
240 int fmtutil_huffman_read_next_value(struct fmtutil_huffman_tree *ht,
241 struct de_bitreader *bitrd, fmtutil_huffman_valtype *pval, UI *pnbits)
243 int bitcount = 0;
244 int retval = 0;
246 if(bitrd->eof_flag) goto done;
248 if(ht->has_null_code) {
249 *pval = ht->value_of_null_code;
250 retval = 1;
251 goto done;
254 while(1) {
255 int ret;
256 u8 b;
258 b = (u8)de_bitreader_getbits(bitrd, 1);
259 if(bitrd->eof_flag) goto done;
260 bitcount++;
261 if(bitcount>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) goto done; // Should be impossible
263 ret = fmtutil_huffman_decode_bit(ht, b, pval);
264 if(ret==1) { // finished the code
265 retval = 1;
266 goto done;
268 else if(ret!=2) { // decoding error
269 goto done;
272 done:
273 if(!retval) {
274 *pval = 0;
276 if(pnbits) {
277 *pnbits = retval ? bitcount : 0;
279 return retval;
282 // For debugging
283 void fmtutil_huffman_dump(deark *c, struct fmtutil_huffman_tree *ht)
285 NODE_REF_TYPE k;
286 de_ucstring *tmps = NULL;
288 de_dbg(c, "internal huffman table:");
289 de_dbg_indent(c, 1);
291 de_dbg(c, "number of codes: %"I64_FMT, fmtutil_huffman_get_num_codes(ht));
292 de_dbg(c, "max code size: %u bits", fmtutil_huffman_get_max_bits(ht));
293 tmps = ucstring_create(c);
294 for(k=0; k<ht->next_avail_node && k<ht->nodes_alloc; k++) {
295 UI child_idx;
296 struct huffman_node *nd = &ht->nodes[k];
298 ucstring_empty(tmps);
299 ucstring_printf(tmps, DE_ENCODING_LATIN1, "node[%u]: depth=%u (", (UI)k, (UI)nd->depth);
301 for(child_idx=0; child_idx<=1; child_idx++) {
302 if(child_idx==1) {
303 ucstring_append_sz(tmps, " ", DE_ENCODING_LATIN1);
305 if(nd->child_status[child_idx]==CHILDSTATUS_POINTER) {
306 ucstring_printf(tmps, DE_ENCODING_LATIN1, "next=%u",
307 (UI)nd->child[child_idx].hnpd.noderef);
309 else if(nd->child_status[child_idx]==CHILDSTATUS_VALUE) {
310 ucstring_printf(tmps, DE_ENCODING_LATIN1, "value=%d",
311 (int)nd->child[child_idx].hnvd.value);
313 else {
314 ucstring_append_sz(tmps, "unused", DE_ENCODING_LATIN1);
317 ucstring_printf(tmps, DE_ENCODING_LATIN1, ")");
318 de_dbg(c, "%s", ucstring_getpsz_d(tmps));
320 ucstring_destroy(tmps);
321 de_dbg_indent(c, -1);
324 // This is only used with fmtutil_huffman_make_canonical_tree().
325 // Call this first, once per item.
326 // The order that you supply the items matters, at least within the set of items
327 // having the same length.
328 // Cannot be used for zero-length items. If len==0, it's a successful no-op.
329 int fmtutil_huffman_record_a_code_length(deark *c, struct fmtutil_huffman_tree *ht,
330 fmtutil_huffman_valtype val, UI len)
332 if(len==0) return 1;
333 if(ht->lengths_arr_numused > MAX_MAX_NODES) return 0;
335 if(ht->lengths_arr_numused >= ht->lengths_arr_numalloc) {
336 i64 new_numalloc;
338 new_numalloc = ht->lengths_arr_numused + 128;
339 ht->lengths_arr = de_reallocarray(c, ht->lengths_arr, ht->lengths_arr_numalloc,
340 sizeof(struct huffman_lengths_arr_item), new_numalloc);
341 ht->lengths_arr_numalloc = new_numalloc;
343 ht->lengths_arr[ht->lengths_arr_numused].val = val;
344 ht->lengths_arr[ht->lengths_arr_numused++].len = len;
345 return 1;
348 // The usual canonical format - leaves are left-aligned
349 static int fmtutil_huffman_make_canonical_tree1(deark *c, struct fmtutil_huffman_tree *ht,
350 UI max_sym_len_used)
352 UI symlen;
353 UI codes_count_total = 0;
354 UI prev_code_bit_length = 0;
355 u64 prev_code = 0;
356 int retval = 0;
357 char b2buf[72];
359 // For each possible symbol length...
360 for(symlen=1; symlen<=max_sym_len_used; symlen++) {
361 UI k;
363 // Find all the codes that use this symbol length, in order
364 for(k=0; k<(UI)ht->lengths_arr_numused; k++) {
365 int ret;
366 u64 thiscode;
368 if(ht->lengths_arr[k].len != symlen) continue;
369 // Found a code of the length we're looking for.
371 if(codes_count_total==0) {
372 thiscode = 0;
374 else {
375 thiscode = prev_code + 1;
376 if(symlen > prev_code_bit_length) {
377 thiscode <<= (symlen - prev_code_bit_length);
381 prev_code = thiscode;
382 prev_code_bit_length = symlen;
383 codes_count_total++;
385 if(c->debug_level>=3) {
386 de_dbg3(c, "code: \"%s\" = %d",
387 de_print_base2_fixed(b2buf, sizeof(b2buf), thiscode, symlen),
388 (int)ht->lengths_arr[k].val);
390 ret = fmtutil_huffman_add_code(c, ht, thiscode, symlen, ht->lengths_arr[k].val);
391 if(!ret) {
392 goto done;
396 retval = 1;
398 done:
399 return retval;
402 // "pack" style - branches are left-aligned
403 static int fmtutil_huffman_make_canonical_tree2(deark *c, struct fmtutil_huffman_tree *ht,
404 UI max_sym_len_used)
406 UI symlen;
407 UI codes_count_total = 0;
408 UI prev_code_bit_length = 0;
409 u64 prev_code = 0;
410 int retval = 0;
411 char b2buf[72];
413 // For each possible symbol length...
414 for(symlen=max_sym_len_used; symlen>=1; symlen--) {
415 UI k;
417 // Find all the codes that use this symbol length, in order
418 for(k=0; k<(UI)ht->lengths_arr_numused; k++) {
419 int ret;
420 u64 this_code;
422 if(ht->lengths_arr[k].len != symlen) continue;
423 // Found a code of the length we're looking for.
425 if(codes_count_total==0) {
426 this_code = 0;
428 else {
429 this_code = (prev_code>>(prev_code_bit_length-symlen)) + 1;
432 prev_code = this_code;
433 prev_code_bit_length = symlen;
434 codes_count_total++;
436 if(c->debug_level>=3) {
437 de_dbg3(c, "code: \"%s\" = %d",
438 de_print_base2_fixed(b2buf, sizeof(b2buf), this_code, symlen),
439 (int)ht->lengths_arr[k].val);
441 ret = fmtutil_huffman_add_code(c, ht, this_code, symlen, ht->lengths_arr[k].val);
442 if(!ret) {
443 goto done;
447 retval = 1;
449 done:
450 return retval;
453 // Call this after calling huffman_record_item_length() (usually many times).
454 // Creates a canonical Huffman tree derived from the known code lengths.
455 int fmtutil_huffman_make_canonical_tree(deark *c, struct fmtutil_huffman_tree *ht, UI flags)
457 UI max_sym_len_used;
458 UI i;
459 int saved_indent_level;
460 int retval = 0;
462 de_dbg_indent_save(c, &saved_indent_level);
463 de_dbg3(c, "derived huffman codebook:");
464 de_dbg_indent(c, 1);
466 if(!ht->lengths_arr) {
467 retval = 1;
468 goto done;
471 // Find the maximum length
472 max_sym_len_used = 0;
473 for(i=0; i<(UI)ht->lengths_arr_numused; i++) {
474 if(ht->lengths_arr[i].len > max_sym_len_used) {
475 max_sym_len_used = ht->lengths_arr[i].len;
478 if(max_sym_len_used>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH) {
479 goto done;
482 if(flags & FMTUTIL_MCTFLAG_LEFT_ALIGN_BRANCHES) {
483 retval = fmtutil_huffman_make_canonical_tree2(c, ht, max_sym_len_used);
485 else {
486 retval = fmtutil_huffman_make_canonical_tree1(c, ht, max_sym_len_used);
489 done:
490 de_dbg_indent_restore(c, saved_indent_level);
491 return retval;
494 // initial_codes: If not 0, pre-allocate enough nodes for this many codes.
495 // max_codes: If not 0, attempting to add substantially more codes than this will fail.
496 struct fmtutil_huffman_tree *fmtutil_huffman_create_tree(deark *c, i64 initial_codes, i64 max_codes)
498 i64 initial_nodes;
499 struct fmtutil_huffman_tree *ht = NULL;
501 ht = de_malloc(c, sizeof(struct fmtutil_huffman_tree));
503 if(max_codes>0) {
504 ht->max_nodes = max_codes;
506 else {
507 ht->max_nodes = MAX_MAX_NODES;
509 if(ht->max_nodes > MAX_MAX_NODES) {
510 ht->max_nodes = MAX_MAX_NODES;
513 if(initial_codes>0) {
514 initial_nodes = initial_codes;
516 else {
517 initial_nodes = 1;
519 if(initial_nodes > MAX_MAX_NODES) {
520 initial_nodes = MAX_MAX_NODES;
523 huffman_ensure_alloc(c, ht, (NODE_REF_TYPE)initial_nodes);
524 ht->next_avail_node = 0;
525 ht->num_codes = 0;
526 ht->max_bits = 0;
528 return ht;
531 void fmtutil_huffman_destroy_tree(deark *c, struct fmtutil_huffman_tree *ht)
533 if(!ht) return;
534 de_free(c, ht->lengths_arr);
535 de_free(c, ht);