1 // This file is part of Deark.
2 // Copyright (C) 2020 Jason Summers
3 // See the file COPYING for terms of use.
5 // Functions related to Huffman coding decompression
7 #define DE_NOT_IN_MODULE
8 #include "deark-private.h"
9 #include "deark-fmtutil.h"
11 #define NODE_REF_TYPE u32
12 #define MAX_MAX_NODES 66000
14 struct huffman_nval_pointer_data
{
15 NODE_REF_TYPE noderef
;
17 struct huffman_nval_value_data
{
18 fmtutil_huffman_valtype value
;
21 union huffman_nval_data
{
22 struct huffman_nval_pointer_data hnpd
;
23 struct huffman_nval_value_data hnvd
;
27 #define CHILDSTATUS_UNUSED 0
28 #define CHILDSTATUS_POINTER 1
29 #define CHILDSTATUS_VALUE 2
32 union huffman_nval_data child
[2];
35 struct huffman_lengths_arr_item
{
36 fmtutil_huffman_valtype val
;
40 struct huffman_cursor
{
41 NODE_REF_TYPE curr_noderef
;
44 struct fmtutil_huffman_tree
{
45 // In principle, the cursor should be separate, so we could have multiple
46 // cursors for one tree. But that's inconvenient, and it's not clear that
47 // it would be of any use in practice.
48 struct huffman_cursor cursor
;
51 NODE_REF_TYPE next_avail_node
;
52 NODE_REF_TYPE nodes_alloc
;
53 struct huffman_node
*nodes
; // array[nodes_alloc]
55 fmtutil_huffman_valtype value_of_null_code
;
60 i64 lengths_arr_numalloc
;
61 i64 lengths_arr_numused
;
62 struct huffman_lengths_arr_item
*lengths_arr
; // array[lengths_arr_numalloc]
65 // Ensure that at least n nodes are allocated (0 through n-1)
66 static int huffman_ensure_alloc(deark
*c
, struct fmtutil_huffman_tree
*ht
, NODE_REF_TYPE n
)
70 if(n
<= ht
->nodes_alloc
) return 1;
71 if((i64
)n
> ht
->max_nodes
) return 0;
73 new_nodes_alloc
= (i64
)ht
->nodes_alloc
* 2;
74 if(new_nodes_alloc
> ht
->max_nodes
) new_nodes_alloc
= ht
->max_nodes
;
75 if(new_nodes_alloc
< (i64
)n
) new_nodes_alloc
= (i64
)n
;
76 if(new_nodes_alloc
< 16) new_nodes_alloc
= 16;
78 ht
->nodes
= de_reallocarray(c
, ht
->nodes
, ht
->nodes_alloc
, sizeof(struct huffman_node
),
80 ht
->nodes_alloc
= (NODE_REF_TYPE
)new_nodes_alloc
;
84 // Tracks the number of items with VALUE status ("codes").
85 static void huffman_setchildstatus(struct fmtutil_huffman_tree
*ht
, NODE_REF_TYPE n
,
86 u8 child_idx
, u8 newstatus
)
88 if(n
>=ht
->nodes_alloc
) return;
89 if(child_idx
>1) return;
91 if(ht
->nodes
[n
].child_status
[child_idx
]==newstatus
) return;
92 if(ht
->nodes
[n
].child_status
[child_idx
]==CHILDSTATUS_VALUE
) {
95 if(newstatus
==CHILDSTATUS_VALUE
) {
98 ht
->nodes
[n
].child_status
[child_idx
] = newstatus
;
101 // The size of the longest current code.
102 // This is mainly for debugging info -- it is not guaranteed to be correct if
103 // the tree was constructed improperly.
104 UI
fmtutil_huffman_get_max_bits(struct fmtutil_huffman_tree
*ht
)
109 // The number of codes (symbols) in the the tree.
110 // This is mainly for debugging info -- it is not guaranteed to be correct if
111 // the tree was constructed improperly.
112 i64
fmtutil_huffman_get_num_codes(struct fmtutil_huffman_tree
*ht
)
114 if(ht
->num_codes
>=0) return ht
->num_codes
;
118 void fmtutil_huffman_reset_cursor(struct fmtutil_huffman_tree
*ht
)
120 ht
->cursor
.curr_noderef
= 0;
123 // Add a code, adding to the current tree structure as needed. Codes can be
124 // added in any order.
126 // If inconsistent codes are added (i.e. a code is a prefix of another code, or
127 // the tree is left incomplete), we only promise that it will be safe to use
128 // the decoding functions. Such errors will not necessarily be detected.
130 // Note that adding the 0-length code is allowed.
131 int fmtutil_huffman_add_code(deark
*c
, struct fmtutil_huffman_tree
*ht
,
132 u64 code
, UI code_nbits
, fmtutil_huffman_valtype val
)
135 NODE_REF_TYPE curr_noderef
= 0; // Note that this may temporarily point to an unallocated node
138 if(code_nbits
>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) goto done
;
141 ht
->value_of_null_code
= val
;
142 ht
->has_null_code
= 1;
146 ht
->has_null_code
= 0;
148 if(code_nbits
> ht
->max_bits
) {
149 ht
->max_bits
= code_nbits
;
152 // Iterate through the bits, high bit first.
153 for(k
=0; k
<code_nbits
; k
++) {
154 UI child_idx
; // 0 or 1
156 // Make sure the current node exists
157 if(curr_noderef
>= ht
->nodes_alloc
) {
158 if(!huffman_ensure_alloc(c
, ht
, curr_noderef
+1)) goto done
;
160 // Claim the current node, if necessary
161 if(curr_noderef
>= ht
->next_avail_node
) {
162 ht
->next_avail_node
= curr_noderef
+1;
163 ht
->nodes
[curr_noderef
].depth
= (u8
)k
;
166 child_idx
= (code
>>(code_nbits
-1-k
))&0x1;
168 if(k
==code_nbits
-1) {
169 // Reached the "leaf" node. Set the value for this child_idx.
170 huffman_setchildstatus(ht
, curr_noderef
, child_idx
, CHILDSTATUS_VALUE
);
171 ht
->nodes
[curr_noderef
].child
[child_idx
].hnvd
.value
= val
;
174 // Not at the leaf node yet.
175 if(ht
->nodes
[curr_noderef
].child_status
[child_idx
]==CHILDSTATUS_POINTER
) {
176 // It's already a pointer.
177 curr_noderef
= ht
->nodes
[curr_noderef
].child
[child_idx
].hnpd
.noderef
;
180 NODE_REF_TYPE next_noderef
;
182 // It's not a pointer -- make it one.
183 if(ht
->next_avail_node
>= ht
->max_nodes
) goto done
;
184 next_noderef
= ht
->next_avail_node
;
185 huffman_setchildstatus(ht
, curr_noderef
, child_idx
, CHILDSTATUS_POINTER
);
186 ht
->nodes
[curr_noderef
].child
[child_idx
].hnpd
.noderef
= next_noderef
;
187 curr_noderef
= next_noderef
;
198 // Caller supplies one bit of data to the decoder (the low bit of bitval).
200 // 1 = This was the last bit of a code; value returned in *pval
201 // 2 = Need more bits (*pval unchanged)
202 // 0 = Error (*pval unchanged)
203 // If return value is not 2, resets the cursor before returning.
204 // Note that, by itself, this function cannot read the zero-length code.
205 int fmtutil_huffman_decode_bit(struct fmtutil_huffman_tree
*ht
, u8 bitval
, fmtutil_huffman_valtype
*pval
)
209 NODE_REF_TYPE curr_noderef
= ht
->cursor
.curr_noderef
;
211 if(curr_noderef
>= ht
->nodes_alloc
) goto done
;
212 if(curr_noderef
>= ht
->next_avail_node
) goto done
;
213 child_idx
= bitval
& 0x01;
215 if(ht
->nodes
[curr_noderef
].child_status
[child_idx
]==CHILDSTATUS_VALUE
) {
216 *pval
= ht
->nodes
[curr_noderef
].child
[child_idx
].hnvd
.value
;
220 else if(ht
->nodes
[curr_noderef
].child_status
[child_idx
]==CHILDSTATUS_POINTER
) {
221 ht
->cursor
.curr_noderef
= ht
->nodes
[curr_noderef
].child
[child_idx
].hnpd
.noderef
;
228 fmtutil_huffman_reset_cursor(ht
);
233 // Read the next Huffman code from a bitreader, and decode it.
234 // *pval will always be written to. On error, it will be set to 0.
235 // pnbits returns the number of bits read. Can be NULL.
237 // nonzero on success
238 // 0 on error - Can happen if the tree was not constructed properly, or on EOF
239 // (bitrd->eof_flag can distinguish these cases).
240 int fmtutil_huffman_read_next_value(struct fmtutil_huffman_tree
*ht
,
241 struct de_bitreader
*bitrd
, fmtutil_huffman_valtype
*pval
, UI
*pnbits
)
246 if(bitrd
->eof_flag
) goto done
;
248 if(ht
->has_null_code
) {
249 *pval
= ht
->value_of_null_code
;
258 b
= (u8
)de_bitreader_getbits(bitrd
, 1);
259 if(bitrd
->eof_flag
) goto done
;
261 if(bitcount
>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) goto done
; // Should be impossible
263 ret
= fmtutil_huffman_decode_bit(ht
, b
, pval
);
264 if(ret
==1) { // finished the code
268 else if(ret
!=2) { // decoding error
277 *pnbits
= retval
? bitcount
: 0;
283 void fmtutil_huffman_dump(deark
*c
, struct fmtutil_huffman_tree
*ht
)
286 de_ucstring
*tmps
= NULL
;
288 de_dbg(c
, "internal huffman table:");
291 de_dbg(c
, "number of codes: %"I64_FMT
, fmtutil_huffman_get_num_codes(ht
));
292 de_dbg(c
, "max code size: %u bits", fmtutil_huffman_get_max_bits(ht
));
293 tmps
= ucstring_create(c
);
294 for(k
=0; k
<ht
->next_avail_node
&& k
<ht
->nodes_alloc
; k
++) {
296 struct huffman_node
*nd
= &ht
->nodes
[k
];
298 ucstring_empty(tmps
);
299 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, "node[%u]: depth=%u (", (UI
)k
, (UI
)nd
->depth
);
301 for(child_idx
=0; child_idx
<=1; child_idx
++) {
303 ucstring_append_sz(tmps
, " ", DE_ENCODING_LATIN1
);
305 if(nd
->child_status
[child_idx
]==CHILDSTATUS_POINTER
) {
306 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, "next=%u",
307 (UI
)nd
->child
[child_idx
].hnpd
.noderef
);
309 else if(nd
->child_status
[child_idx
]==CHILDSTATUS_VALUE
) {
310 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, "value=%d",
311 (int)nd
->child
[child_idx
].hnvd
.value
);
314 ucstring_append_sz(tmps
, "unused", DE_ENCODING_LATIN1
);
317 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, ")");
318 de_dbg(c
, "%s", ucstring_getpsz_d(tmps
));
320 ucstring_destroy(tmps
);
321 de_dbg_indent(c
, -1);
324 // This is only used with fmtutil_huffman_make_canonical_tree().
325 // Call this first, once per item.
326 // The order that you supply the items matters, at least within the set of items
327 // having the same length.
328 // Cannot be used for zero-length items. If len==0, it's a successful no-op.
329 int fmtutil_huffman_record_a_code_length(deark
*c
, struct fmtutil_huffman_tree
*ht
,
330 fmtutil_huffman_valtype val
, UI len
)
333 if(ht
->lengths_arr_numused
> MAX_MAX_NODES
) return 0;
335 if(ht
->lengths_arr_numused
>= ht
->lengths_arr_numalloc
) {
338 new_numalloc
= ht
->lengths_arr_numused
+ 128;
339 ht
->lengths_arr
= de_reallocarray(c
, ht
->lengths_arr
, ht
->lengths_arr_numalloc
,
340 sizeof(struct huffman_lengths_arr_item
), new_numalloc
);
341 ht
->lengths_arr_numalloc
= new_numalloc
;
343 ht
->lengths_arr
[ht
->lengths_arr_numused
].val
= val
;
344 ht
->lengths_arr
[ht
->lengths_arr_numused
++].len
= len
;
348 // The usual canonical format - leaves are left-aligned
349 static int fmtutil_huffman_make_canonical_tree1(deark
*c
, struct fmtutil_huffman_tree
*ht
,
353 UI codes_count_total
= 0;
354 UI prev_code_bit_length
= 0;
359 // For each possible symbol length...
360 for(symlen
=1; symlen
<=max_sym_len_used
; symlen
++) {
363 // Find all the codes that use this symbol length, in order
364 for(k
=0; k
<(UI
)ht
->lengths_arr_numused
; k
++) {
368 if(ht
->lengths_arr
[k
].len
!= symlen
) continue;
369 // Found a code of the length we're looking for.
371 if(codes_count_total
==0) {
375 thiscode
= prev_code
+ 1;
376 if(symlen
> prev_code_bit_length
) {
377 thiscode
<<= (symlen
- prev_code_bit_length
);
381 prev_code
= thiscode
;
382 prev_code_bit_length
= symlen
;
385 if(c
->debug_level
>=3) {
386 de_dbg3(c
, "code: \"%s\" = %d",
387 de_print_base2_fixed(b2buf
, sizeof(b2buf
), thiscode
, symlen
),
388 (int)ht
->lengths_arr
[k
].val
);
390 ret
= fmtutil_huffman_add_code(c
, ht
, thiscode
, symlen
, ht
->lengths_arr
[k
].val
);
402 // "pack" style - branches are left-aligned
403 static int fmtutil_huffman_make_canonical_tree2(deark
*c
, struct fmtutil_huffman_tree
*ht
,
407 UI codes_count_total
= 0;
408 UI prev_code_bit_length
= 0;
413 // For each possible symbol length...
414 for(symlen
=max_sym_len_used
; symlen
>=1; symlen
--) {
417 // Find all the codes that use this symbol length, in order
418 for(k
=0; k
<(UI
)ht
->lengths_arr_numused
; k
++) {
422 if(ht
->lengths_arr
[k
].len
!= symlen
) continue;
423 // Found a code of the length we're looking for.
425 if(codes_count_total
==0) {
429 this_code
= (prev_code
>>(prev_code_bit_length
-symlen
)) + 1;
432 prev_code
= this_code
;
433 prev_code_bit_length
= symlen
;
436 if(c
->debug_level
>=3) {
437 de_dbg3(c
, "code: \"%s\" = %d",
438 de_print_base2_fixed(b2buf
, sizeof(b2buf
), this_code
, symlen
),
439 (int)ht
->lengths_arr
[k
].val
);
441 ret
= fmtutil_huffman_add_code(c
, ht
, this_code
, symlen
, ht
->lengths_arr
[k
].val
);
453 // Call this after calling huffman_record_item_length() (usually many times).
454 // Creates a canonical Huffman tree derived from the known code lengths.
455 int fmtutil_huffman_make_canonical_tree(deark
*c
, struct fmtutil_huffman_tree
*ht
, UI flags
)
459 int saved_indent_level
;
462 de_dbg_indent_save(c
, &saved_indent_level
);
463 de_dbg3(c
, "derived huffman codebook:");
466 if(!ht
->lengths_arr
) {
471 // Find the maximum length
472 max_sym_len_used
= 0;
473 for(i
=0; i
<(UI
)ht
->lengths_arr_numused
; i
++) {
474 if(ht
->lengths_arr
[i
].len
> max_sym_len_used
) {
475 max_sym_len_used
= ht
->lengths_arr
[i
].len
;
478 if(max_sym_len_used
>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) {
482 if(flags
& FMTUTIL_MCTFLAG_LEFT_ALIGN_BRANCHES
) {
483 retval
= fmtutil_huffman_make_canonical_tree2(c
, ht
, max_sym_len_used
);
486 retval
= fmtutil_huffman_make_canonical_tree1(c
, ht
, max_sym_len_used
);
490 de_dbg_indent_restore(c
, saved_indent_level
);
494 // initial_codes: If not 0, pre-allocate enough nodes for this many codes.
495 // max_codes: If not 0, attempting to add substantially more codes than this will fail.
496 struct fmtutil_huffman_tree
*fmtutil_huffman_create_tree(deark
*c
, i64 initial_codes
, i64 max_codes
)
499 struct fmtutil_huffman_tree
*ht
= NULL
;
501 ht
= de_malloc(c
, sizeof(struct fmtutil_huffman_tree
));
504 ht
->max_nodes
= max_codes
;
507 ht
->max_nodes
= MAX_MAX_NODES
;
509 if(ht
->max_nodes
> MAX_MAX_NODES
) {
510 ht
->max_nodes
= MAX_MAX_NODES
;
513 if(initial_codes
>0) {
514 initial_nodes
= initial_codes
;
519 if(initial_nodes
> MAX_MAX_NODES
) {
520 initial_nodes
= MAX_MAX_NODES
;
523 huffman_ensure_alloc(c
, ht
, (NODE_REF_TYPE
)initial_nodes
);
524 ht
->next_avail_node
= 0;
531 void fmtutil_huffman_destroy_tree(deark
*c
, struct fmtutil_huffman_tree
*ht
)
534 de_free(c
, ht
->lengths_arr
);