1 // This file is part of Deark.
2 // Copyright (C) 2020 Jason Summers
3 // See the file COPYING for terms of use.
5 // Functions related to Huffman coding decompression
7 #define DE_NOT_IN_MODULE
8 #include "deark-private.h"
9 #include "deark-fmtutil.h"
11 #define NODE_REF_TYPE u32
12 #define MAX_MAX_NODES 66000
14 struct huffman_nval_pointer_data
{
15 NODE_REF_TYPE noderef
;
17 struct huffman_nval_value_data
{
18 fmtutil_huffman_valtype value
;
21 union huffman_nval_data
{
22 struct huffman_nval_pointer_data hnpd
;
23 struct huffman_nval_value_data hnvd
;
27 #define CHILDSTATUS_UNUSED 0
28 #define CHILDSTATUS_POINTER 1
29 #define CHILDSTATUS_VALUE 2
32 union huffman_nval_data child
[2];
35 struct huffman_lengths_arr_item
{
36 fmtutil_huffman_valtype val
;
40 struct fmtutil_huffman_code_builder
{
41 i64 lengths_arr_numalloc
;
42 i64 lengths_arr_numused
;
43 struct huffman_lengths_arr_item
*lengths_arr
; // array[lengths_arr_numalloc]
46 struct fmtutil_huffman_cursor
{
47 NODE_REF_TYPE curr_noderef
;
50 struct fmtutil_huffman_codebook
{
52 NODE_REF_TYPE next_avail_node
;
53 NODE_REF_TYPE nodes_alloc
;
54 struct huffman_node
*nodes
; // array[nodes_alloc]
56 fmtutil_huffman_valtype value_of_null_code
;
62 // Ensure that at least n nodes are allocated (0 through n-1)
63 static int huffman_ensure_alloc(deark
*c
, struct fmtutil_huffman_codebook
*bk
, NODE_REF_TYPE n
)
67 if(n
<= bk
->nodes_alloc
) return 1;
68 if((i64
)n
> bk
->max_nodes
) return 0;
70 new_nodes_alloc
= (i64
)bk
->nodes_alloc
* 2;
71 if(new_nodes_alloc
> bk
->max_nodes
) new_nodes_alloc
= bk
->max_nodes
;
72 if(new_nodes_alloc
< (i64
)n
) new_nodes_alloc
= (i64
)n
;
73 if(new_nodes_alloc
< 16) new_nodes_alloc
= 16;
75 bk
->nodes
= de_reallocarray(c
, bk
->nodes
, bk
->nodes_alloc
, sizeof(struct huffman_node
),
77 bk
->nodes_alloc
= (NODE_REF_TYPE
)new_nodes_alloc
;
81 // Tracks the number of items with VALUE status ("codes").
82 static void huffman_setchildstatus(struct fmtutil_huffman_codebook
*bk
, NODE_REF_TYPE n
,
83 u8 child_idx
, u8 newstatus
)
85 if(n
>=bk
->nodes_alloc
) return;
86 if(child_idx
>1) return;
88 if(bk
->nodes
[n
].child_status
[child_idx
]==newstatus
) return;
89 if(bk
->nodes
[n
].child_status
[child_idx
]==CHILDSTATUS_VALUE
) {
92 if(newstatus
==CHILDSTATUS_VALUE
) {
95 bk
->nodes
[n
].child_status
[child_idx
] = newstatus
;
98 // The size of the longest current code.
99 // This is mainly for debugging info -- it is not guaranteed to be correct if
100 // the tree was constructed improperly.
101 UI
fmtutil_huffman_get_max_bits(struct fmtutil_huffman_codebook
*bk
)
106 // The number of codes (symbols) in the the tree.
107 // This is mainly for debugging info -- it is not guaranteed to be correct if
108 // the tree was constructed improperly.
109 i64
fmtutil_huffman_get_num_codes(struct fmtutil_huffman_codebook
*bk
)
111 if(bk
->num_codes
>=0) return bk
->num_codes
;
115 void fmtutil_huffman_reset_cursor(struct fmtutil_huffman_cursor
*cursor
)
117 cursor
->curr_noderef
= 0;
120 // Add a code, adding to the current tree structure as needed. Codes can be
121 // added in any order.
123 // If inconsistent codes are added (i.e. a code is a prefix of another code, or
124 // the tree is left incomplete), we only promise that it will be safe to use
125 // the decoding functions. Such errors will not necessarily be detected.
127 // Note that adding the 0-length code is allowed.
128 int fmtutil_huffman_add_code(deark
*c
, struct fmtutil_huffman_codebook
*bk
,
129 u64 code
, UI code_nbits
, fmtutil_huffman_valtype val
)
132 NODE_REF_TYPE curr_noderef
= 0; // Note that this may temporarily point to an unallocated node
135 if(code_nbits
>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) goto done
;
138 bk
->value_of_null_code
= val
;
139 bk
->has_null_code
= 1;
143 bk
->has_null_code
= 0;
145 if(code_nbits
> bk
->max_bits
) {
146 bk
->max_bits
= code_nbits
;
149 // Iterate through the bits, high bit first.
150 for(k
=0; k
<code_nbits
; k
++) {
151 UI child_idx
; // 0 or 1
153 // Make sure the current node exists
154 if(curr_noderef
>= bk
->nodes_alloc
) {
155 if(!huffman_ensure_alloc(c
, bk
, curr_noderef
+1)) goto done
;
157 // Claim the current node, if necessary
158 if(curr_noderef
>= bk
->next_avail_node
) {
159 bk
->next_avail_node
= curr_noderef
+1;
160 bk
->nodes
[curr_noderef
].depth
= (u8
)k
;
163 child_idx
= (code
>>(code_nbits
-1-k
))&0x1;
165 if(k
==code_nbits
-1) {
166 // Reached the "leaf" node. Set the value for this child_idx.
167 huffman_setchildstatus(bk
, curr_noderef
, child_idx
, CHILDSTATUS_VALUE
);
168 bk
->nodes
[curr_noderef
].child
[child_idx
].hnvd
.value
= val
;
171 // Not at the leaf node yet.
172 if(bk
->nodes
[curr_noderef
].child_status
[child_idx
]==CHILDSTATUS_POINTER
) {
173 // It's already a pointer.
174 curr_noderef
= bk
->nodes
[curr_noderef
].child
[child_idx
].hnpd
.noderef
;
177 NODE_REF_TYPE next_noderef
;
179 // It's not a pointer -- make it one.
180 if(bk
->next_avail_node
>= bk
->max_nodes
) goto done
;
181 next_noderef
= bk
->next_avail_node
;
182 huffman_setchildstatus(bk
, curr_noderef
, child_idx
, CHILDSTATUS_POINTER
);
183 bk
->nodes
[curr_noderef
].child
[child_idx
].hnpd
.noderef
= next_noderef
;
184 curr_noderef
= next_noderef
;
195 // Caller supplies one bit of data to the decoder (the low bit of bitval).
197 // 1 = This was the last bit of a code; value returned in *pval
198 // 2 = Need more bits (*pval unchanged)
199 // 0 = Error (*pval unchanged)
200 // If return value is not 2, resets the cursor before returning.
201 // Note that, by itself, this function cannot read the zero-length code.
202 int fmtutil_huffman_decode_bit(struct fmtutil_huffman_codebook
*bk
, struct fmtutil_huffman_cursor
*cursor
,
203 u8 bitval
, fmtutil_huffman_valtype
*pval
)
207 NODE_REF_TYPE curr_noderef
= cursor
->curr_noderef
;
209 if(curr_noderef
>= bk
->nodes_alloc
) goto done
;
210 if(curr_noderef
>= bk
->next_avail_node
) goto done
;
211 child_idx
= bitval
& 0x01;
213 if(bk
->nodes
[curr_noderef
].child_status
[child_idx
]==CHILDSTATUS_VALUE
) {
214 *pval
= bk
->nodes
[curr_noderef
].child
[child_idx
].hnvd
.value
;
218 else if(bk
->nodes
[curr_noderef
].child_status
[child_idx
]==CHILDSTATUS_POINTER
) {
219 cursor
->curr_noderef
= bk
->nodes
[curr_noderef
].child
[child_idx
].hnpd
.noderef
;
226 fmtutil_huffman_reset_cursor(cursor
);
231 // Read the next Huffman code from a bitreader, and decode it.
232 // *pval will always be written to. On error, it will be set to 0.
233 // pnbits returns the number of bits read. Can be NULL.
235 // nonzero on success
236 // 0 on error - Can happen if the tree was not constructed properly, or on EOF
237 // (bitrd->eof_flag can distinguish these cases).
238 int fmtutil_huffman_read_next_value(struct fmtutil_huffman_codebook
*bk
,
239 struct de_bitreader
*bitrd
, fmtutil_huffman_valtype
*pval
, UI
*pnbits
)
243 struct fmtutil_huffman_cursor tmpcursor
;
245 if(bitrd
->eof_flag
) goto done
;
246 tmpcursor
.curr_noderef
= 0;
248 if(bk
->has_null_code
) {
249 *pval
= bk
->value_of_null_code
;
258 b
= (u8
)de_bitreader_getbits(bitrd
, 1);
259 if(bitrd
->eof_flag
) goto done
;
261 if(bitcount
>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) goto done
; // Should be impossible
263 ret
= fmtutil_huffman_decode_bit(bk
, &tmpcursor
, b
, pval
);
264 if(ret
==1) { // finished the code
268 else if(ret
!=2) { // decoding error
277 *pnbits
= retval
? bitcount
: 0;
283 void fmtutil_huffman_dump(deark
*c
, struct fmtutil_huffman_decoder
*ht
)
286 de_ucstring
*tmps
= NULL
;
288 de_dbg(c
, "internal huffman table:");
291 de_dbg(c
, "number of codes: %"I64_FMT
, fmtutil_huffman_get_num_codes(ht
->bk
));
292 de_dbg(c
, "max code size: %u bits", fmtutil_huffman_get_max_bits(ht
->bk
));
293 tmps
= ucstring_create(c
);
294 for(k
=0; k
<ht
->bk
->next_avail_node
&& k
<ht
->bk
->nodes_alloc
; k
++) {
296 struct huffman_node
*nd
= &ht
->bk
->nodes
[k
];
298 ucstring_empty(tmps
);
299 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, "node[%u]: depth=%u (", (UI
)k
, (UI
)nd
->depth
);
301 for(child_idx
=0; child_idx
<=1; child_idx
++) {
303 ucstring_append_sz(tmps
, " ", DE_ENCODING_LATIN1
);
305 if(nd
->child_status
[child_idx
]==CHILDSTATUS_POINTER
) {
306 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, "next=%u",
307 (UI
)nd
->child
[child_idx
].hnpd
.noderef
);
309 else if(nd
->child_status
[child_idx
]==CHILDSTATUS_VALUE
) {
310 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, "value=%d",
311 (int)nd
->child
[child_idx
].hnvd
.value
);
314 ucstring_append_sz(tmps
, "unused", DE_ENCODING_LATIN1
);
317 ucstring_printf(tmps
, DE_ENCODING_LATIN1
, ")");
318 de_dbg(c
, "%s", ucstring_getpsz_d(tmps
));
320 ucstring_destroy(tmps
);
321 de_dbg_indent(c
, -1);
324 // This is only used with fmtutil_huffman_make_canonical_tree().
325 // Call this first, once per item.
326 // The order that you supply the items matters, at least within the set of items
327 // having the same length.
328 // Cannot be used for zero-length items. If len==0, it's a successful no-op.
329 int fmtutil_huffman_record_a_code_length(deark
*c
, struct fmtutil_huffman_code_builder
*builder
,
330 fmtutil_huffman_valtype val
, UI len
)
333 if(len
> FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) return 0;
334 if(builder
->lengths_arr_numused
> MAX_MAX_NODES
) return 0;
336 if(builder
->lengths_arr_numused
>= builder
->lengths_arr_numalloc
) {
339 new_numalloc
= builder
->lengths_arr_numused
+ 128;
340 builder
->lengths_arr
= de_reallocarray(c
, builder
->lengths_arr
, builder
->lengths_arr_numalloc
,
341 sizeof(struct huffman_lengths_arr_item
), new_numalloc
);
342 builder
->lengths_arr_numalloc
= new_numalloc
;
344 builder
->lengths_arr
[builder
->lengths_arr_numused
].val
= val
;
345 builder
->lengths_arr
[builder
->lengths_arr_numused
++].len
= len
;
349 // The usual canonical format - leaves are left-aligned
350 static int fmtutil_huffman_make_canonical_tree1(deark
*c
, struct fmtutil_huffman_codebook
*bk
,
351 struct fmtutil_huffman_code_builder
*builder
, UI max_sym_len_used
)
354 UI codes_count_total
= 0;
355 UI prev_code_bit_length
= 0;
360 // For each possible symbol length...
361 for(symlen
=1; symlen
<=max_sym_len_used
; symlen
++) {
364 // Find all the codes that use this symbol length, in order
365 for(k
=0; k
<(UI
)builder
->lengths_arr_numused
; k
++) {
369 if(builder
->lengths_arr
[k
].len
!= symlen
) continue;
370 // Found a code of the length we're looking for.
372 if(codes_count_total
==0) {
376 thiscode
= prev_code
+ 1;
377 if(symlen
> prev_code_bit_length
) {
378 thiscode
<<= (symlen
- prev_code_bit_length
);
382 prev_code
= thiscode
;
383 prev_code_bit_length
= symlen
;
386 if(c
->debug_level
>=3) {
387 de_dbg3(c
, "code: \"%s\" = %d",
388 de_print_base2_fixed(b2buf
, sizeof(b2buf
), thiscode
, symlen
),
389 (int)builder
->lengths_arr
[k
].val
);
391 ret
= fmtutil_huffman_add_code(c
, bk
, thiscode
, symlen
, builder
->lengths_arr
[k
].val
);
403 // "pack" style - branches are left-aligned
404 static int fmtutil_huffman_make_canonical_tree2(deark
*c
, struct fmtutil_huffman_codebook
*bk
,
405 struct fmtutil_huffman_code_builder
*builder
, UI max_sym_len_used
)
408 UI codes_count_total
= 0;
409 UI prev_code_bit_length
= 0;
414 // For each possible symbol length...
415 for(symlen
=max_sym_len_used
; symlen
>=1; symlen
--) {
418 // Find all the codes that use this symbol length, in order
419 for(k
=0; k
<(UI
)builder
->lengths_arr_numused
; k
++) {
423 if(builder
->lengths_arr
[k
].len
!= symlen
) continue;
424 // Found a code of the length we're looking for.
426 if(codes_count_total
==0) {
430 this_code
= (prev_code
>>(prev_code_bit_length
-symlen
)) + 1;
433 prev_code
= this_code
;
434 prev_code_bit_length
= symlen
;
437 if(c
->debug_level
>=3) {
438 de_dbg3(c
, "code: \"%s\" = %d",
439 de_print_base2_fixed(b2buf
, sizeof(b2buf
), this_code
, symlen
),
440 (int)builder
->lengths_arr
[k
].val
);
442 ret
= fmtutil_huffman_add_code(c
, bk
, this_code
, symlen
, builder
->lengths_arr
[k
].val
);
454 static void reverse_lengths_array(struct fmtutil_huffman_code_builder
*builder
)
457 i64 num_swaps
= builder
->lengths_arr_numused
/ 2;
458 struct huffman_lengths_arr_item tmpitem
;
460 for(i
=0; i
<num_swaps
; i
++) {
461 tmpitem
= builder
->lengths_arr
[i
]; // struct copy
462 builder
->lengths_arr
[i
] = builder
->lengths_arr
[builder
->lengths_arr_numused
-1-i
];
463 builder
->lengths_arr
[builder
->lengths_arr_numused
-1-i
] = tmpitem
;
467 // Call this after calling huffman_record_item_length() (usually many times).
468 // Creates a canonical Huffman tree derived from the known code lengths.
469 int fmtutil_huffman_make_canonical_code(deark
*c
, struct fmtutil_huffman_codebook
*bk
,
470 struct fmtutil_huffman_code_builder
*builder
, UI flags
)
474 int saved_indent_level
;
477 de_dbg_indent_save(c
, &saved_indent_level
);
478 de_dbg3(c
, "derived huffman codebook:");
481 if(!builder
->lengths_arr
) {
486 if(flags
& FMTUTIL_MCTFLAG_LAST_CODE_FIRST
) {
487 // Bit of a hack. Instead of each worker function having to have a way
488 // to read the array from back to front, reverse the order of items in
489 // the array, so we don't have to deal with it later.
490 reverse_lengths_array(builder
);
493 // Find the maximum length
494 max_sym_len_used
= 0;
495 for(i
=0; i
<(UI
)builder
->lengths_arr_numused
; i
++) {
496 if(builder
->lengths_arr
[i
].len
> max_sym_len_used
) {
497 max_sym_len_used
= builder
->lengths_arr
[i
].len
;
500 if(max_sym_len_used
>FMTUTIL_HUFFMAN_MAX_CODE_LENGTH
) {
504 if(flags
& FMTUTIL_MCTFLAG_LEFT_ALIGN_BRANCHES
) {
505 retval
= fmtutil_huffman_make_canonical_tree2(c
, bk
, builder
, max_sym_len_used
);
508 retval
= fmtutil_huffman_make_canonical_tree1(c
, bk
, builder
, max_sym_len_used
);
512 de_dbg_indent_restore(c
, saved_indent_level
);
516 static struct fmtutil_huffman_codebook
*huffman_create_codebook(deark
*c
,
517 i64 initial_codes
, i64 max_codes
)
519 struct fmtutil_huffman_codebook
*bk
;
522 bk
= de_malloc(c
, sizeof(struct fmtutil_huffman_codebook
));
524 bk
->max_nodes
= max_codes
;
527 bk
->max_nodes
= MAX_MAX_NODES
;
529 if(bk
->max_nodes
> MAX_MAX_NODES
) {
530 bk
->max_nodes
= MAX_MAX_NODES
;
533 if(initial_codes
>0) {
534 initial_nodes
= initial_codes
;
539 if(initial_nodes
> MAX_MAX_NODES
) {
540 initial_nodes
= MAX_MAX_NODES
;
543 huffman_ensure_alloc(c
, bk
, (NODE_REF_TYPE
)initial_nodes
);
544 bk
->next_avail_node
= 0;
550 static void huffman_destroy_codebook(deark
*c
, struct fmtutil_huffman_codebook
*bk
)
553 de_free(c
, bk
->nodes
);
557 // initial_codes: If not 0, pre-allocate enough nodes for this many codes.
558 // max_codes: If not 0, attempting to add substantially more codes than this will fail.
559 struct fmtutil_huffman_decoder
*fmtutil_huffman_create_decoder(deark
*c
, i64 initial_codes
, i64 max_codes
)
561 struct fmtutil_huffman_decoder
*ht
= NULL
;
563 ht
= de_malloc(c
, sizeof(struct fmtutil_huffman_decoder
));
564 ht
->cursor
= de_malloc(c
, sizeof(struct fmtutil_huffman_cursor
));
565 ht
->bk
= huffman_create_codebook(c
, initial_codes
, max_codes
);
566 ht
->builder
= de_malloc(c
, sizeof(struct fmtutil_huffman_code_builder
));
567 fmtutil_huffman_reset_cursor(ht
->cursor
);
571 void fmtutil_huffman_destroy_decoder(deark
*c
, struct fmtutil_huffman_decoder
*ht
)
574 huffman_destroy_codebook(c
, ht
->bk
);
576 de_free(c
, ht
->builder
->lengths_arr
);
577 de_free(c
, ht
->builder
);
578 de_free(c
, ht
->cursor
);