Minor refactoring of the IFF and box-format parsers
[deark.git] / modules / hlp.c
blob90d761a64be2f0065fe224770851820ec8caca15
1 // This file is part of Deark.
2 // Copyright (C) 2017 Jason Summers
3 // See the file COPYING for terms of use.
5 // Windows HLP
7 // This module was developed with the help of information from the helpfile.txt
8 // document included with the helpdeco software by M. Winterhoff.
10 #include <deark-config.h>
11 #include <deark-private.h>
12 #include <deark-fmtutil.h>
13 DE_DECLARE_MODULE(de_module_hlp);
15 #define TOPICBLOCKHDRSIZE 12
16 #define INVALIDPOS 0xffffffffU
// Classification assigned to a stream ("internal file") of the HLP container.
// filename_to_filetype() derives most of these codes from the stream's name.
enum hlp_filetype {
	FILETYPE_UNKNOWN = 0,
	FILETYPE_OTHERSPECIAL,  // name starts with "|", but is not one of the known names
	FILETYPE_EXTRACTABLE,   // anything not otherwise identified
	FILETYPE_INTERNALDIR,
	FILETYPE_SYSTEM,        // "|SYSTEM"
	FILETYPE_TOPIC,         // "|TOPIC"
	FILETYPE_SHG,           // "|bm<digit>..." or "bm<digit>..." with no extension
	FILETYPE_PHRASES,       // "|Phrases"
	FILETYPE_PHRINDEX,      // "|PhrIndex"
	FILETYPE_PHRIMAGE       // "|PhrImage"
};
31 struct bptree {
32 unsigned int flags;
33 i64 pagesize;
34 i64 root_page;
35 i64 num_levels;
36 i64 num_pages;
37 i64 num_entries;
38 i64 pagesdata_pos;
39 i64 first_leaf_page;
42 struct phrase_item {
43 u32 pos; // pos in ->phrases_data
44 u32 len;
47 typedef struct localctx_struct {
48 int input_encoding;
49 int output_is_utf8;
50 int extract_text;
51 u8 extract_raw_streams;
52 i64 internal_dir_FILEHEADER_offs;
53 struct bptree bpt;
54 u8 found_system_file;
55 u8 found_Phrases_file;
56 u8 found_PhrIndex_file;
57 u8 found_PhrImage_file;
58 u8 found_TOPIC_file;
59 u8 valid_Phrases_file;
60 i64 offset_of_system_file;
61 i64 offset_of_Phrases;
62 i64 offset_of_PhrIndex;
63 i64 offset_of_PhrImage;
64 i64 offset_of_TOPIC;
65 int ver_major;
66 int ver_minor;
67 i64 topic_block_size;
68 int is_lz77_compressed;
69 int uses_old_phrase_compression;
70 int uses_hall_compression;
71 int pass;
72 int has_shg, has_ico, has_bmp;
73 i64 internal_dir_num_levels;
74 dbuf *outf_text;
75 i64 num_topic_blocks;
76 dbuf *tmpdbuf1;
77 dbuf *unc_linkdata2_dbuf;
78 i64 PhrImageUncSize;
79 i64 PhrImageCmprSize;
80 dbuf *phrases_data;
81 unsigned int num_phrases;
82 struct phrase_item *phrase_info; // array [num_phrases]
83 de_ucstring *tmpucstring1;
84 de_ucstring *help_file_title;
85 de_ucstring *help_file_copyright;
86 struct de_timestamp gendate;
87 } lctx;
89 static void do_file(deark *c, lctx *d, i64 pos1, enum hlp_filetype file_fmt, int extract_only,
90 struct de_stringreaderdata *fn);
// Describes one known SYSTEMREC record type.
struct systemrec_info {
	unsigned int rectype;

	// Bit layout:
	//   low 8 bits = version info
	//   0x0010     = STRINGZ type
	unsigned int flags;

	const char *name; // human-readable name, used in debug output
	void *reserved;
};
102 static const struct systemrec_info systemrec_info_arr[] = {
103 { 1, 0x0010, "Title", NULL },
104 { 2, 0x0010, "Copyright", NULL },
105 { 3, 0x0000, "Contents", NULL },
106 { 4, 0x0010, "Macro", NULL },
107 { 5, 0x0000, "Icon", NULL },
108 { 6, 0x0000, "Window", NULL },
109 { 8, 0x0010, "Citation", NULL },
110 { 9, 0x0000, "Language ID", NULL },
111 { 10, 0x0010, "CNT file name", NULL },
112 { 11, 0x0000, "Charset", NULL },
113 { 12, 0x0000, "Default dialog font", NULL },
114 { 13, 0x0010, "Defined GROUPs", NULL },
115 { 14, 0x0011, "IndexSeparators", NULL },
116 { 14, 0x0002, "Multimedia Help Files", NULL },
117 { 18, 0x0010, "Defined language", NULL },
118 { 19, 0x0000, "Defined DLLMAPS", NULL }
120 static const struct systemrec_info systemrec_info_default =
121 { 0, 0x0000, "?", NULL };
123 static void format_topiclink(deark *c, lctx *d, u32 n, char *buf, size_t buf_len)
125 if(d->ver_minor<=16) {
126 de_snprintf(buf, buf_len, "%u", (UI)n);
128 else {
129 if(n==INVALIDPOS) {
130 de_strlcpy(buf, "-1", buf_len);
132 else {
133 de_snprintf(buf, buf_len, "Blk%u:%u", (UI)(n/16384),
134 (UI)(n%16384));
139 static void hlptime_to_timestamp(i64 ht, struct de_timestamp *ts)
141 if(ht!=0) {
142 de_unix_time_to_timestamp(ht, ts, 0);
144 else {
145 ts->is_valid = 0;
149 // s can be NULL, or it can be a string to save the value in.
150 static void do_display_and_store_STRINGZ(deark *c, lctx *d, i64 pos1, i64 len,
151 const char *name, de_ucstring *s1)
153 de_ucstring *s_tmp = NULL;
154 de_ucstring *s;
156 if(s1) {
157 s = s1;
159 else {
160 s_tmp = ucstring_create(c);
161 s = s_tmp;
164 ucstring_empty(s);
165 if(len<1) return;
167 dbuf_read_to_ucstring_n(c->infile,
168 pos1, len, DE_DBG_MAX_STRLEN,
169 s, DE_CONVFLAG_STOP_AT_NUL, d->input_encoding);
170 de_dbg(c, "%s: \"%s\"", name, ucstring_getpsz(s));
172 if(s_tmp) ucstring_destroy(s_tmp);
175 static void do_SYSTEMREC_STRINGZ(deark *c, lctx *d, unsigned int recordtype,
176 i64 pos1, i64 len, const struct systemrec_info *sti, de_ucstring *s)
178 do_display_and_store_STRINGZ(c, d, pos1, len, sti->name, s);
181 static void do_SYSTEMREC_uint32_hex(deark *c, lctx *d, unsigned int recordtype,
182 i64 pos1, i64 len)
184 unsigned int n;
186 if(len!=4) return;
187 n = (unsigned int)de_getu32le(pos1);
188 de_dbg(c, "value: 0x%08x", n);
191 static void extract_system_icon(deark *c, lctx *d, i64 pos, i64 len)
193 de_finfo *fi = NULL;
195 fi = de_finfo_create(c);
196 fi->timestamp[DE_TIMESTAMPIDX_MODIFY] = d->gendate;
197 dbuf_create_file_from_slice(c->infile, pos, len, "ico", fi, DE_CREATEFLAG_IS_AUX);
198 de_finfo_destroy(c, fi);
201 static void do_SYSTEMREC(deark *c, lctx *d, unsigned int recordtype,
202 i64 pos1, i64 len, const struct systemrec_info *sti)
204 if(recordtype==1) { // title
205 if(!d->help_file_title) {
206 d->help_file_title = ucstring_create(c);
208 do_SYSTEMREC_STRINGZ(c, d, recordtype, pos1, len, sti, d->help_file_title);
210 else if(recordtype==2) { // copyright
211 if(!d->help_file_copyright) {
212 d->help_file_copyright = ucstring_create(c);
214 do_SYSTEMREC_STRINGZ(c, d, recordtype, pos1, len, sti, d->help_file_copyright);
216 else if(recordtype==3 && len==4) { // contents
217 do_SYSTEMREC_uint32_hex(c, d, recordtype, pos1, len);
219 else if(recordtype==5) { // Icon
220 d->has_ico = 1;
221 extract_system_icon(c, d, pos1, len);
223 else if(sti->flags&0x10) {
224 do_SYSTEMREC_STRINGZ(c, d, recordtype, pos1, len, sti, NULL);
226 else {
227 if(c->debug_level>=2) {
228 de_dbg_hexdump(c, c->infile, pos1, len, 256, NULL, 0x1);
233 static const struct systemrec_info *find_sysrec_info(deark *c, lctx *d, unsigned int t)
235 size_t i;
237 for(i=0; i<DE_ARRAYCOUNT(systemrec_info_arr); i++) {
238 const struct systemrec_info *sti;
239 sti = &systemrec_info_arr[i];
240 if(sti->rectype==t &&
241 (sti->flags&0x0f)==0)
243 return sti;
246 return &systemrec_info_default;
249 static int do_file_SYSTEM_header(deark *c, lctx *d, i64 pos1)
251 i64 pos = pos1;
252 i64 magic;
253 i64 gen_date;
254 unsigned int flags;
255 char timestamp_buf[64];
256 int retval = 0;
258 magic = de_getu16le_p(&pos);
259 if(magic!=0x036c) {
260 de_err(c, "Expected SYSTEM data at %d not found", (int)pos1);
261 goto done;
264 de_dbg(c, "SYSTEM file data at %d", (int)pos1);
265 de_dbg_indent(c, 1);
267 d->ver_minor = (int)de_getu16le_p(&pos);
268 d->ver_major = (int)de_getu16le_p(&pos);
269 de_dbg(c, "help format version: %d.%d", d->ver_major, d->ver_minor);
271 if(d->ver_major!=1) {
272 de_err(c, "Unsupported file version: %d.%d", d->ver_major, d->ver_minor);
273 goto done;
276 gen_date = de_geti32le_p(&pos);
277 hlptime_to_timestamp(gen_date, &d->gendate);
278 de_timestamp_to_string(&d->gendate, timestamp_buf, sizeof(timestamp_buf), 0);
279 de_dbg(c, "GenDate: %d (%s)", (int)gen_date, timestamp_buf);
281 flags = (unsigned int)de_getu16le_p(&pos);
282 de_dbg(c, "system flags: 0x%04x", flags);
284 if(d->ver_minor>16) {
285 if(flags==8) {
286 d->is_lz77_compressed = 1;
287 d->topic_block_size = 2048;
289 else if(flags==4) {
290 d->is_lz77_compressed = 1;
291 d->topic_block_size = 4096;
293 else {
294 d->is_lz77_compressed = 0;
295 d->topic_block_size = 4096;
298 else {
299 d->is_lz77_compressed = 0;
300 d->topic_block_size = 2048;
302 de_dbg(c, "lz77 compression: %d", d->is_lz77_compressed);
303 de_dbg(c, "topic block size: %d", (int)d->topic_block_size);
305 retval = 1;
306 done:
307 return retval;
310 static void do_file_SYSTEM_SYSTEMRECS(deark *c, lctx *d, i64 pos1, i64 len,
311 int systemrecs_pass)
313 i64 pos = pos1;
315 while((pos1+len)-pos >=4) {
316 unsigned int recordtype;
317 i64 datasize;
318 i64 systemrec_startpos;
319 const struct systemrec_info *sti;
321 systemrec_startpos = pos;
323 recordtype = (unsigned int)de_getu16le_p(&pos);
324 datasize = de_getu16le_p(&pos);
326 sti = find_sysrec_info(c, d, recordtype);
327 de_dbg(c, "SYSTEMREC type %u (%s) at %d, dpos=%d, dlen=%d",
328 recordtype, sti->name,
329 (int)systemrec_startpos, (int)pos, (int)datasize);
331 if(pos+datasize > pos1+len) break; // bad data
332 de_dbg_indent(c, 1);
333 do_SYSTEMREC(c, d, recordtype, pos, datasize, sti);
334 de_dbg_indent(c, -1);
335 pos += datasize;
339 static void do_file_SYSTEM(deark *c, lctx *d, i64 pos1, i64 len)
341 i64 pos = pos1;
342 int saved_indent_level;
344 de_dbg_indent_save(c, &saved_indent_level);
346 // We'll read the SYSTEM "file" before pass 2, most importantly to record
347 // the format version information.
349 // The SYSTEM file may contain a series of SYSTEMREC records that we want
350 // to parse.
351 // Note: It seems like we might have to make two (sub)passes over the
352 // SYSTEMREC records, the first pass to collect the "charset" setting, so it
353 // can be used to interpret records that appear before it. But I've never
354 // seen a charset record that I can make sense of -- it's usually just a
355 // random number of NUL bytes.
357 if(d->pass!=1) goto done;
359 if(!do_file_SYSTEM_header(c, d, pos)) goto done;
360 pos += 12;
362 if(d->ver_minor<=16) {
363 d->help_file_title = ucstring_create(c);
364 do_display_and_store_STRINGZ(c, d, pos, (pos1+len)-pos, "HelpFileTitle", d->help_file_title);
366 else {
367 // A sequence of variable-sized SYSTEMRECs
368 do_file_SYSTEM_SYSTEMRECS(c, d, pos, (pos1+len)-pos, 1);
371 done:
372 de_dbg_indent_restore(c, saved_indent_level);
375 static void do_file_SHG(deark *c, lctx *d, i64 pos1, i64 used_space)
377 i64 oldsig;
378 const char *ext;
379 dbuf *outf = NULL;
380 de_finfo *fi = NULL;
382 // Reportedly, 0x506c = SHG = 1 image, and 0x706c = MRB = >1 image.
383 // But I'm not convinced that's correct.
384 // I'm to sure what to do, as far as selecting a file extension, and potentially
385 // correcting the signature. Current behavior is to leave the signature the same,
386 // and derive the file extension from the number of images.
387 oldsig = de_getu16le(pos1);
389 if(oldsig==0x506c || oldsig==0x706c) {
390 i64 num_images;
392 num_images = de_getu16le(pos1+2);
393 if(num_images>1) {
394 ext="mrb";
396 else {
397 ext="shg";
400 else {
401 ext="bin";
404 fi = de_finfo_create(c);
405 // (Note that if we were to correct the signature, we probably should not copy
406 // the mod time.)
407 fi->timestamp[DE_TIMESTAMPIDX_MODIFY] = d->gendate;
408 outf = dbuf_create_output_file(c, ext, fi, 0);
409 dbuf_copy(c->infile, pos1, used_space, outf);
410 dbuf_close(outf);
411 de_finfo_destroy(c, fi);
414 static void do_extract_raw_file(deark *c, lctx *d, i64 pos1, i64 used_space,
415 struct de_stringreaderdata *fn)
417 de_finfo *fi = NULL;
418 const char *ext = NULL;
420 fi = de_finfo_create(c);
421 fi->timestamp[DE_TIMESTAMPIDX_MODIFY] = d->gendate;
422 if(fn && ucstring_isnonempty(fn->str)) {
423 de_finfo_set_name_from_ucstring(c, fi, fn->str, 0);
424 fi->original_filename_flag = 1;
426 else {
427 ext = "bin";
429 dbuf_create_file_from_slice(c->infile, pos1, used_space, ext, fi, 0);
431 de_finfo_destroy(c, fi);
434 struct topiclink_data {
435 i64 blocksize;
436 i64 datalen2;
437 u32 prevblock;
438 u32 nextblock;
439 i64 datalen1;
440 u8 recordtype;
442 i64 linkdata1_pos;
443 i64 linkdata1_len;
444 i64 linkdata2_pos;
445 i64 linkdata2_cmprlen;
446 i64 linkdata2_uncmprlen;
448 i64 paragraphinfo_pos;
449 u8 seems_compressed;
452 struct topic_block_info_item {
453 i64 pos; // position in ->unc_topicdata
454 i64 len;
457 struct topic_ctx {
458 i64 num_topic_blocks;
459 struct topic_block_info_item *topic_block_info; // array [num_topic_blocks]
460 dbuf *unc_topicdata;
461 u32 pos_of_first_topiclink;
464 static void ensure_text_output_file_open(deark *c, lctx *d)
466 if(d->outf_text) return;
467 d->outf_text = dbuf_create_output_file(c, "dump.txt", NULL, 0);
468 if(d->output_is_utf8 && c->write_bom) {
469 dbuf_write_uchar_as_utf8(d->outf_text, 0xfeff);
471 if(ucstring_isnonempty(d->help_file_title)) {
472 dbuf_puts(d->outf_text, "Title: ");
473 // TODO: This doesn't do the right thing if !d->output_is_utf8.
474 ucstring_write_as_utf8(c, d->help_file_title, d->outf_text, 0);
475 dbuf_puts(d->outf_text, "\n");
477 if(ucstring_isnonempty(d->help_file_copyright)) {
478 dbuf_puts(d->outf_text, "Copyright: ");
479 ucstring_write_as_utf8(c, d->help_file_copyright, d->outf_text, 0);
480 dbuf_puts(d->outf_text, "\n");
484 // Emit a string that needs no conversion.
485 static void emit_raw_sz(deark *c, lctx *d, const char *sz)
487 if(!d->outf_text) return;
488 dbuf_puts(d->outf_text, sz);
491 // Emit a content string, i.e. one that might need character encoding conversion
492 // or escaping.
493 static void emit_slice(deark *c, lctx *d, dbuf *inf, i64 pos, i64 len)
495 if(!d->outf_text) return;
497 if(!d->output_is_utf8) {
498 dbuf_copy(inf, pos, len, d->outf_text);
499 return;
502 ucstring_empty(d->tmpucstring1);
503 dbuf_read_to_ucstring(inf, pos, len, d->tmpucstring1, 0, d->input_encoding);
504 ucstring_write_as_utf8(c, d->tmpucstring1, d->outf_text, 0);
507 static void emit_phrase(deark *c, lctx *d, dbuf *outf, unsigned int phrasenum)
509 if(phrasenum < d->num_phrases) {
510 dbuf_copy(d->phrases_data, (i64)d->phrase_info[phrasenum].pos,
511 (i64)d->phrase_info[phrasenum].len, outf);
515 static void do_phrase_decompression(deark *c, lctx *d, dbuf *inf, i64 pos1, i64 len,
516 dbuf *outf, i64 unc_len_expected)
518 i64 pos = pos1;
519 i64 endpos = pos1+len;
521 if(!d->phrases_data) return;
523 while(1) {
524 u8 b;
526 if(pos >= endpos) break;
527 b = dbuf_getbyte_p(inf, &pos);
528 if(b==0 || b>=0x10) {
529 dbuf_writebyte(outf, b);
531 else {
532 u8 b2;
533 unsigned int n;
535 if(pos >= endpos) break;
536 b2 = dbuf_getbyte_p(inf, &pos);
537 n = ((unsigned int)(b-1)<<8) | b2;
538 emit_phrase(c, d, outf, n>>1);
539 if(n & 0x1) {
540 dbuf_writebyte(outf, ' ');
546 static void do_hall_decompression(deark *c, lctx *d, dbuf *inf, i64 pos1, i64 len,
547 dbuf *outf, i64 unc_len_expected)
549 static const u8 action[16] = {0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4};
550 i64 pos = pos1;
551 i64 endpos = pos1+len;
552 unsigned int n;
553 i64 outf_expected_endpos;
555 outf_expected_endpos = outf->len + unc_len_expected;
557 while(1) {
558 u8 b;
560 if(outf->len >= outf_expected_endpos) goto unc_done;
561 if(pos >= endpos) goto unc_done;
562 b = dbuf_getbyte_p(inf, &pos);
563 switch(action[b&0x0f]) {
564 case 0:
565 emit_phrase(c, d, outf, b>>1);
566 break;
567 case 1:
568 if(pos >= endpos) goto unc_done;
569 n = (((unsigned int)b & 0xfc) << 6) | 0x80;
570 n += dbuf_getbyte_p(inf, &pos);
571 emit_phrase(c, d, outf, n);
572 break;
573 case 2:
574 n = (b >> 3) +1;
575 if(pos + (i64)n > endpos) goto unc_done;
576 dbuf_copy(inf, pos, (i64)n, outf);
577 pos += (i64)n;
578 break;
579 case 3:
580 dbuf_write_run(outf, ' ', (i64)(b>>4)+1);
581 break;
582 default: // 4
583 dbuf_write_zeroes(outf, (i64)(b>>4)+1);
584 break;
587 unc_done:
591 static int do_topiclink_rectype_32_linkdata1(deark *c, lctx *d,
592 struct topic_ctx *tctx, struct topiclink_data *tld)
594 i64 pos = tld->linkdata1_pos;
595 dbuf *inf = tctx->unc_topicdata;
596 i64 topicsize;
597 i64 topiclength;
598 unsigned int id;
599 unsigned int bits;
600 int retval = 0;
602 // TODO: type 33 (table)
603 if(tld->recordtype!=1 && tld->recordtype!=32) goto done;
605 topicsize = fmtutil_hlp_get_csl_p(inf, &pos);
606 de_dbg(c, "topic size: %"I64_FMT, topicsize);
608 if(tld->recordtype==32) {
609 topiclength = fmtutil_hlp_get_cus_p(inf, &pos);
610 de_dbg(c, "topic length: %"I64_FMT, topiclength);
613 pos++; // unknownUnsignedChar
614 pos++; // unknownBiasedChar
615 id = (unsigned int)dbuf_getu16le_p(inf, &pos);
616 de_dbg(c, "id: %u", id);
617 bits = (unsigned int)dbuf_getu16le_p(inf, &pos);
618 de_dbg(c, "bits: 0x%04x", bits);
620 if(bits & 0x0001) { // Unknown
621 (void)fmtutil_hlp_get_csl_p(inf, &pos);
623 if(bits & 0x0002) { // SpacingAbove
624 (void)fmtutil_hlp_get_css_p(inf, &pos);
626 if(bits & 0x0004) { // SpacingBelow
627 (void)fmtutil_hlp_get_css_p(inf, &pos);
629 if(bits & 0x0008) { // SpacingLines
630 (void)fmtutil_hlp_get_css_p(inf, &pos);
632 if(bits & 0x0010) { // LeftIndent
633 (void)fmtutil_hlp_get_css_p(inf, &pos);
635 if(bits & 0x0020) { // RightIndent
636 (void)fmtutil_hlp_get_css_p(inf, &pos);
638 if(bits & 0x0040) { // FirstlineIndent
639 (void)fmtutil_hlp_get_css_p(inf, &pos);
641 // 0x0080 = unused
642 if(bits & 0x0100) { // Borderinfo
643 goto done; // TODO
645 if(bits & 0x0200) { // Tabinfo
646 goto done; // TODO
648 // 0x0400 = RightAlignedParagraph
649 // 0x0800 = CenterAlignedParagraph
651 tld->paragraphinfo_pos = pos;
652 retval = 1;
653 done:
654 return retval;
657 // linkdata2 = after Phrase/Hall decompression
658 static void do_topiclink_rectype_1_32(deark *c, lctx *d,
659 struct topic_ctx *tctx, struct topiclink_data *tld, dbuf *linkdata2)
661 i64 pos;
662 int in_string = 0;
663 int string_count = 0;
664 int byte_count = 0;
666 if(!d->extract_text) goto done;
667 ensure_text_output_file_open(c, d);
669 do_topiclink_rectype_32_linkdata1(c, d, tctx, tld);
671 // TODO: This is very quick & dirty.
672 // The linkdata2 is a collection of NUL-terminated strings. We'd have to
673 // interpret the command bytes from linkdata1 to know how to format them.
675 pos = 0;
676 dbuf_truncate(d->tmpdbuf1, 0); // A place to collect the current output string
678 while(1) {
679 u8 b;
681 if(pos >= linkdata2->len) break;
682 b = dbuf_getbyte_p(linkdata2, &pos);
683 if(b==0x00) {
684 if(in_string) {
685 emit_slice(c, d, d->tmpdbuf1, 0, d->tmpdbuf1->len);
686 dbuf_truncate(d->tmpdbuf1, 0);
687 emit_raw_sz(c, d, "\n");
688 string_count++;
689 in_string = 0;
692 else {
693 dbuf_writebyte(d->tmpdbuf1, b);
694 byte_count++;
695 in_string = 1;
698 if(in_string) {
699 emit_slice(c, d, d->tmpdbuf1, 0, d->tmpdbuf1->len);
700 dbuf_truncate(d->tmpdbuf1, 0);
701 emit_raw_sz(c, d, "\n");
702 string_count++;
704 de_dbg2(c, "[emitted %d strings, totaling %d bytes]", string_count, byte_count);
706 done:
710 // TOPIC header
711 static void do_topiclink_rectype_2_linkdata2(deark *c, lctx *d,
712 struct topic_ctx *tctx, struct topiclink_data *tld, dbuf *linkdata2)
714 i64 k;
715 int bytecount = 0;
717 dbuf_truncate(d->tmpdbuf1, 0);
718 emit_raw_sz(c, d, "# ");
720 for(k=0; k<linkdata2->len; k++) {
721 u8 b;
723 b = dbuf_getbyte(linkdata2, k);
724 if(b==0) break;
725 dbuf_writebyte(d->tmpdbuf1, b);
726 bytecount++;
729 if(bytecount>0) {
730 emit_slice(c, d, d->tmpdbuf1, 0, d->tmpdbuf1->len);
732 else {
733 emit_raw_sz(c, d, "(untitled topic)");
736 emit_raw_sz(c, d, " #\n");
739 // topic header and title
740 static void do_topiclink_rectype_2(deark *c, lctx *d,
741 struct topic_ctx *tctx, struct topiclink_data *tld, dbuf *linkdata2)
743 if(!d->extract_text) goto done;
744 ensure_text_output_file_open(c, d);
746 do_topiclink_rectype_2_linkdata2(c, d, tctx, tld, linkdata2);
747 done:
751 // Returns 1 if we set next_pos_code
752 static int do_topiclink(deark *c, lctx *d, struct topic_ctx *tctx, i64 pos1, u32 *next_pos_code)
754 struct topiclink_data *tld = NULL;
755 i64 pos = pos1;
756 i64 linkdata2_nbytes_avail;
757 int retval = 0;
758 dbuf *inf = tctx->unc_topicdata;
759 char tmpbuf[24];
761 tld = de_malloc(c, sizeof(struct topiclink_data));
763 tld->blocksize = dbuf_geti32le_p(inf, &pos);
764 de_dbg(c, "blocksize: %d", (int)tld->blocksize);
765 if((tld->blocksize<21) || (pos1 + tld->blocksize > inf->len)) {
766 de_dbg(c, "bad topiclink blocksize");
767 goto done;
769 tld->datalen2 = dbuf_geti32le_p(inf, &pos);
770 de_dbg(c, "datalen2 (after any decompression): %d", (int)tld->datalen2);
772 tld->prevblock = (u32)dbuf_getu32le_p(inf, &pos);
773 format_topiclink(c, d, tld->prevblock, tmpbuf, sizeof(tmpbuf));
774 de_dbg(c, "prevblock: %s", tmpbuf);
776 tld->nextblock = (u32)dbuf_getu32le_p(inf, &pos);
777 format_topiclink(c, d, tld->nextblock, tmpbuf, sizeof(tmpbuf));
778 de_dbg(c, "nextblock: %s", tmpbuf);
779 *next_pos_code = (u32)tld->nextblock;
780 retval = 1;
782 tld->datalen1 = dbuf_geti32le_p(inf, &pos);
783 de_dbg(c, "datalen1: %d", (int)tld->datalen1);
784 tld->recordtype = dbuf_getbyte_p(inf, &pos);
785 de_dbg(c, "record type: %d", (int)tld->recordtype);
787 tld->linkdata1_pos = pos1 + 21;
788 tld->linkdata1_len = tld->datalen1 - 21;
789 de_dbg(c, "linkdata1: pos=[%"I64_FMT"], len=%"I64_FMT, tld->linkdata1_pos, tld->linkdata1_len);
791 tld->linkdata2_pos = tld->linkdata1_pos + tld->linkdata1_len;
792 linkdata2_nbytes_avail = tld->blocksize - tld->datalen1;
794 if(d->uses_old_phrase_compression || d->uses_hall_compression) {
795 if(tld->datalen2 > linkdata2_nbytes_avail) {
796 // Phrase/Hall compression used in this topiclink
797 tld->seems_compressed = 1;
798 tld->linkdata2_cmprlen = linkdata2_nbytes_avail;
799 tld->linkdata2_uncmprlen = tld->datalen2;
801 else {
802 // Phrase/Hall compression used in this file, but not in this topiclink
803 tld->linkdata2_cmprlen = tld->datalen2;
804 tld->linkdata2_uncmprlen = tld->datalen2;
807 else {
808 // Phrase/Hall compression not used in this file
809 tld->linkdata2_cmprlen = de_min_int(tld->datalen2, linkdata2_nbytes_avail);
810 tld->linkdata2_uncmprlen = tld->linkdata2_cmprlen;
813 if((tld->linkdata1_pos<pos1) || (tld->linkdata2_pos<pos1) ||
814 (tld->linkdata1_len<0) || (tld->linkdata2_cmprlen<0) ||
815 (tld->linkdata1_pos + tld->linkdata1_len > pos1+tld->blocksize) ||
816 (tld->linkdata2_pos + tld->linkdata2_cmprlen > pos1+tld->blocksize))
818 de_dbg(c, "bad linkdata");
819 goto done;
822 de_dbg(c, "linkdata2: pos=[%"I64_FMT"], cmprlen=%"I64_FMT", uncmprlen=%"I64_FMT,
823 tld->linkdata2_pos, tld->linkdata2_cmprlen, tld->linkdata2_uncmprlen);
825 // Decompress linkdata2 if necessary
826 dbuf_truncate(d->unc_linkdata2_dbuf, 0);
827 if(tld->seems_compressed && d->uses_old_phrase_compression) {
828 do_phrase_decompression(c, d, inf, tld->linkdata2_pos, tld->linkdata2_cmprlen,
829 d->unc_linkdata2_dbuf, tld->linkdata2_uncmprlen);
831 else if(tld->seems_compressed && d->uses_hall_compression) {
832 do_hall_decompression(c, d, inf, tld->linkdata2_pos, tld->linkdata2_cmprlen,
833 d->unc_linkdata2_dbuf,tld->linkdata2_uncmprlen);
835 else {
836 dbuf_copy(inf, tld->linkdata2_pos, tld->linkdata2_cmprlen, d->unc_linkdata2_dbuf);
839 switch(tld->recordtype) {
840 case 1:
841 case 32:
842 case 35:
843 do_topiclink_rectype_1_32(c, d, tctx, tld, d->unc_linkdata2_dbuf);
844 break;
845 case 2:
846 do_topiclink_rectype_2(c, d, tctx, tld, d->unc_linkdata2_dbuf);
847 break;
848 default:
849 de_warn(c, "Unsupported record type: %d", (int)tld->recordtype);
852 done:
853 de_free(c, tld);
854 return retval;
857 static int topicpos_to_abspos(deark *c, lctx *d, struct topic_ctx *tctx, i64 topicpos,
858 i64 *pabspos)
860 i64 blknum, blkoffs;
862 if(!d->topic_block_size) return 0;
863 blkoffs = topicpos % 16384;
864 if(blkoffs<TOPICBLOCKHDRSIZE) return 0;
865 blknum = topicpos / 16384;
866 if(blknum<0 || blknum>=tctx->num_topic_blocks) return 0;
867 *pabspos = tctx->topic_block_info[blknum].pos + (blkoffs-TOPICBLOCKHDRSIZE);
868 return 1;
871 static i64 hc30_abspos_plus_offset_to_abspos(deark *c, lctx *d, i64 pos, i64 offset)
873 i64 blksize = d->topic_block_size-TOPICBLOCKHDRSIZE;
874 i64 start_of_curr_block;
875 i64 end_of_curr_block;
876 i64 n;
878 // We're at a position in blocks of size (d->topic_block_size-12). We need to add
879 // 'offset', but subtract 12 every time we cross a block boundary.
880 start_of_curr_block = (pos/blksize)*blksize;
881 end_of_curr_block = start_of_curr_block + blksize;
882 if(pos+offset <= end_of_curr_block) {
883 return pos + offset;
886 n = pos+offset - end_of_curr_block;
887 return pos + offset - TOPICBLOCKHDRSIZE*(1+(n / blksize));
890 static int calc_next_topiclink_pos(deark *c, lctx *d, struct topic_ctx *tctx, i64 curpos,
891 u32 next_pos_code, i64 *pnextpos)
893 *pnextpos = 0;
895 if(next_pos_code==INVALIDPOS) {
896 de_dbg(c, "[stopping TOPIC parsing, end-of-links marker found]");
897 return 0;
900 if(d->ver_minor<=16) {
901 if(next_pos_code < 21) {
902 de_dbg(c, "[stopping TOPIC parsing, no nextblock available]");
903 return 0;
905 *pnextpos = hc30_abspos_plus_offset_to_abspos(c, d, curpos, next_pos_code);
907 else {
908 if(!topicpos_to_abspos(c, d, tctx, next_pos_code, pnextpos)) {
909 de_dbg(c, "[stopping TOPIC parsing, no nextblock available]");
910 return 0;
913 if((*pnextpos) <= curpos) {
914 de_dbg(c, "[stopping TOPIC parsing, blocks not in order]");
915 return 0;
919 return 1;
922 static void do_topicdata(deark *c, lctx *d, struct topic_ctx *tctx)
924 i64 pos;
925 int saved_indent_level, saved_indent_level2;
926 dbuf *inf = tctx->unc_topicdata;
928 de_dbg_indent_save(c, &saved_indent_level);
930 de_dbg(c, "topic data");
932 de_dbg_indent(c, 1);
933 if(!inf) {
934 goto done;
937 de_dbg_indent_save(c, &saved_indent_level2);
939 if(tctx->pos_of_first_topiclink==INVALIDPOS || tctx->pos_of_first_topiclink<TOPICBLOCKHDRSIZE) {
940 de_warn(c, "Bad first topic link");
941 pos = 0;
943 else {
944 // Maybe we should call topicpos_to_abspos(), etc., but this should be
945 // correct since an offset in the first topicblock.
946 pos = tctx->pos_of_first_topiclink - TOPICBLOCKHDRSIZE;
949 while(1) {
950 u32 next_pos_code;
951 i64 next_pos = 0;
952 int ret;
954 if(pos > inf->len) {
955 de_dbg(c, "[stopping TOPIC parsing, exceeded end of data]");
956 break;
958 if(pos == inf->len) {
959 de_dbg(c, "[stopping TOPIC parsing, reached end of data]");
960 break;
962 if(pos + 21 > inf->len) {
963 de_warn(c, "Error parsing TOPIC, not enough room for another TOPICLINK (%"I64_FMT
964 ", %"I64_FMT")", pos, inf->len);
965 break;
968 // TODO: Display as "Blk#:offs"
969 de_dbg(c, "topiclink at [%"I64_FMT"]", pos);
970 de_dbg_indent(c, 1);
971 next_pos_code = 0;
972 if(!do_topiclink(c, d, tctx, pos, &next_pos_code)) goto done;
973 de_dbg_indent(c, -1);
975 ret = calc_next_topiclink_pos(c, d, tctx, pos, next_pos_code, &next_pos);
976 if(!ret) break;
977 pos = next_pos;
980 done:
981 de_dbg_indent_restore(c, saved_indent_level);
984 static void decompress_topic_block(deark *c, lctx *d, struct topic_ctx *tctx,
985 i64 blknum, i64 blk_dpos, i64 blk_dlen,
986 dbuf *outf)
988 struct de_dfilter_in_params dcmpri;
989 struct de_dfilter_out_params dcmpro;
990 struct de_dfilter_results dres;
991 i64 len_before;
993 de_dfilter_init_objects(c, &dcmpri, &dcmpro, &dres);
995 dcmpri.f = c->infile;
996 dcmpri.pos = blk_dpos;
997 dcmpri.len = blk_dlen;
998 dcmpro.f = outf;
999 dcmpro.len_known = 1;
1000 // TODO: Confirm what happens if a block decompresses to more than 16384-12 bytes.
1001 dcmpro.expected_len = 16384-TOPICBLOCKHDRSIZE;
1002 len_before = outf->len;
1003 fmtutil_hlp_lz77_codectype1(c, &dcmpri, &dcmpro, &dres, NULL);
1004 de_dbg(c, "decompressed %"I64_FMT" to %"I64_FMT" bytes", blk_dlen,
1005 outf->len - len_before);
1008 static void do_file_TOPIC(deark *c, lctx *d, i64 pos1, i64 len)
1010 i64 pos = pos1;
1011 int saved_indent_level;
1012 struct topic_ctx *tctx = NULL;
1013 i64 blknum;
1015 de_dbg_indent_save(c, &saved_indent_level);
1016 tctx = de_malloc(c, sizeof(struct topic_ctx));
1017 de_dbg(c, "TOPIC at %"I64_FMT", len=%"I64_FMT, pos1, len);
1018 de_dbg_indent(c, 1);
1020 if(!d->found_system_file || d->topic_block_size<2048) {
1021 de_err(c, "SYSTEM file not found");
1022 goto done;
1025 if(d->extract_text) {
1026 tctx->unc_topicdata = dbuf_create_membuf(c, 0, 0);
1028 tctx->num_topic_blocks = (len + (d->topic_block_size - TOPICBLOCKHDRSIZE)) % d->topic_block_size;
1029 tctx->topic_block_info = de_mallocarray(c, tctx->num_topic_blocks, sizeof(struct topic_block_info_item));
1031 tctx->pos_of_first_topiclink = INVALIDPOS;
1033 // A series of blocks, each with a 12-byte header
1034 for(blknum=0; blknum<tctx->num_topic_blocks; blknum++) {
1035 u32 lastlink, firstlink, lastheader;
1036 i64 blklen;
1037 i64 blk_dpos;
1038 i64 blk_dlen;
1039 char tbuf1[24], tbuf2[24], tbuf3[24];
1041 blklen = d->topic_block_size;
1042 if(blklen > (pos1+len)-pos) {
1043 blklen = (pos1+len)-pos;
1045 if(blklen<TOPICBLOCKHDRSIZE) break;
1046 blk_dpos = pos+TOPICBLOCKHDRSIZE;
1047 blk_dlen = blklen-TOPICBLOCKHDRSIZE;
1049 de_dbg(c, "TOPIC block #%d at %d, dpos=%d, dlen=%d", (int)blknum, (int)pos,
1050 (int)blk_dpos, (int)blk_dlen);
1051 de_dbg_indent(c, 1);
1052 lastlink = (u32)de_getu32le(pos);
1053 firstlink = (u32)de_getu32le(pos+4);
1054 lastheader = (u32)de_getu32le(pos+8);
1055 format_topiclink(c, d, lastlink, tbuf1, sizeof(tbuf1));
1056 format_topiclink(c, d, firstlink, tbuf2, sizeof(tbuf2));
1057 format_topiclink(c, d, lastheader, tbuf3, sizeof(tbuf3));
1058 de_dbg(c, "LastLink=%s, FirstLink=%s, LastHeader=%s",
1059 tbuf1, tbuf2, tbuf3);
1060 if(tctx->pos_of_first_topiclink==INVALIDPOS && firstlink!=INVALIDPOS) {
1061 tctx->pos_of_first_topiclink = firstlink;
1064 if(d->extract_text && tctx->unc_topicdata) {
1065 // Record the position for later reference.
1066 tctx->topic_block_info[blknum].pos = tctx->unc_topicdata->len;
1068 if(d->is_lz77_compressed) {
1069 decompress_topic_block(c, d, tctx, blknum, blk_dpos, blk_dlen, tctx->unc_topicdata);
1071 else {
1072 dbuf_copy(c->infile, blk_dpos, blk_dlen, tctx->unc_topicdata);
1075 tctx->topic_block_info[blknum].len = tctx->unc_topicdata->len - tctx->topic_block_info[blknum].pos;
1077 de_dbg2(c, "[current decompressed size: %"I64_FMT"]", tctx->unc_topicdata->len);
1080 de_dbg_indent(c, -1);
1081 pos += blklen;
1084 if(tctx->unc_topicdata && tctx->unc_topicdata->len>0) {
1085 do_topicdata(c, d, tctx);
1088 done:
1089 if(tctx) {
1090 dbuf_close(tctx->unc_topicdata);
1091 de_free(c, tctx->topic_block_info);
1092 de_free(c, tctx);
1094 de_dbg_indent_restore(c, saved_indent_level);
1097 static void do_index_page(deark *c, lctx *d, i64 pos1, i64 *prev_page)
1099 *prev_page = de_geti16le(pos1+4);
1100 de_dbg(c, "PreviousPage: %d", (int)*prev_page);
// Returns 1 if x is an ASCII decimal digit ('0' through '9'), otherwise 0.
static int de_is_digit(char x)
{
	if(x<'0') return 0;
	if(x>'9') return 0;
	return 1;
}
1108 static enum hlp_filetype filename_to_filetype(deark *c, lctx *d, const char *fn)
1110 const char *ext;
1111 size_t extlen;
1113 if(fn[0]=='|') {
1114 if(!de_strcmp(fn, "|TOPIC")) return FILETYPE_TOPIC;
1115 if(!de_strcmp(fn, "|SYSTEM")) return FILETYPE_SYSTEM;
1116 if(!de_strncmp(fn, "|bm", 3) && de_is_digit(fn[3])) return FILETYPE_SHG;
1117 if(!de_strcmp(fn, "|Phrases")) return FILETYPE_PHRASES;
1118 if(!de_strcmp(fn, "|PhrIndex")) return FILETYPE_PHRINDEX;
1119 if(!de_strcmp(fn, "|PhrImage")) return FILETYPE_PHRIMAGE;
1120 return FILETYPE_OTHERSPECIAL;
1123 ext = de_get_sz_ext(fn);
1124 extlen = de_strlen(ext);
1126 // Some SHG streams' names don't start with "|". Assume it is SHG if it
1127 // starts with "bm" and a digit, and doesn't have a ".".
1128 if(extlen==0) {
1129 if(!de_strncmp(fn, "bm", 2) && de_is_digit(fn[2])) return FILETYPE_SHG;
1132 // Not sure how bold to be here. Should we extract every file that we can't
1133 // identify? Or maybe only those that have a filename extension?
1134 return FILETYPE_EXTRACTABLE;
1137 static void do_leaf_page(deark *c, lctx *d, i64 pos1, i64 *pnext_page)
1139 i64 n;
1140 i64 pos = pos1;
1141 i64 foundpos;
1142 i64 num_entries;
1143 i64 file_offset;
1144 i64 k;
1145 struct de_stringreaderdata *fn_srd = NULL;
1146 enum hlp_filetype file_type;
1147 int saved_indent_level;
1149 de_dbg_indent_save(c, &saved_indent_level);
1150 n = de_getu16le_p(&pos); // "Unused"
1151 de_dbg(c, "free bytes at end of this page: %d", (int)n);
1153 num_entries = de_geti16le_p(&pos);
1154 de_dbg(c, "NEntries: %d", (int)num_entries);
1156 n = de_geti16le_p(&pos);
1157 de_dbg(c, "PreviousPage: %d", (int)n);
1159 n = de_geti16le_p(&pos);
1160 de_dbg(c, "NextPage: %d", (int)n);
1161 if(pnext_page) *pnext_page = n;
1163 for(k=0; k<num_entries; k++) {
1164 int pass_for_this_file;
1166 de_dbg(c, "entry[%d]", (int)k);
1167 de_dbg_indent(c, 1);
1169 if(!dbuf_search_byte(c->infile, 0x00, pos, 260, &foundpos)) {
1170 de_err(c, "Malformed leaf page at %d", (int)pos1);
1171 goto done;
1174 if(fn_srd) {
1175 de_destroy_stringreaderdata(c, fn_srd);
1177 fn_srd = dbuf_read_string(c->infile, pos, foundpos-pos, foundpos-pos, 0, d->input_encoding);
1178 de_dbg(c, "FileName: \"%s\"", ucstring_getpsz_d(fn_srd->str));
1179 pos = foundpos + 1;
1181 file_offset = de_geti32le_p(&pos);
1182 de_dbg(c, "FileOffset: %d", (int)file_offset);
1184 file_type = filename_to_filetype(c, d, fn_srd->sz);
1186 switch(file_type) {
1187 case FILETYPE_SYSTEM:
1188 d->found_system_file = 1;
1189 d->offset_of_system_file = file_offset;
1190 pass_for_this_file = 1;
1191 break;
1192 case FILETYPE_TOPIC:
1193 d->found_TOPIC_file = 1;
1194 d->offset_of_TOPIC = file_offset;
1195 pass_for_this_file = 1;
1196 break;
1197 case FILETYPE_PHRASES:
1198 d->found_Phrases_file = 1;
1199 d->offset_of_Phrases = file_offset;
1200 pass_for_this_file = 1;
1201 break;
1202 case FILETYPE_PHRINDEX:
1203 d->found_PhrIndex_file = 1;
1204 d->offset_of_PhrIndex = file_offset;
1205 pass_for_this_file = 1;
1206 break;
1207 case FILETYPE_PHRIMAGE:
1208 d->found_PhrImage_file = 1;
1209 d->offset_of_PhrImage = file_offset;
1210 pass_for_this_file = 1;
1211 break;
1212 default:
1213 pass_for_this_file = 2;
1216 if(d->pass==2 && d->extract_raw_streams) {
1217 do_file(c, d, file_offset, file_type, 1, fn_srd);
1219 else if(d->pass==2 && pass_for_this_file==2) {
1220 do_file(c, d, file_offset, file_type, 0, fn_srd);
1223 de_dbg_indent(c, -1);
1226 done:
1227 de_destroy_stringreaderdata(c, fn_srd);
1228 de_dbg_indent_restore(c, saved_indent_level);
1231 // Sets d->bpt.first_leaf_page
1232 static int find_first_leaf_page(deark *c, lctx *d)
1234 i64 curr_page;
1235 i64 curr_level;
1236 int saved_indent_level;
1237 int retval = 0;
1239 de_dbg_indent_save(c, &saved_indent_level);
1240 curr_page = d->bpt.root_page;
1241 curr_level = d->bpt.num_levels;
1243 de_dbg(c, "looking for first leaf page");
1244 de_dbg_indent(c, 1);
1246 while(curr_level>1) {
1247 i64 prev_page;
1248 i64 page_pos;
1250 if(curr_page<0) goto done;
1251 page_pos = d->bpt.pagesdata_pos + curr_page*d->bpt.pagesize;
1253 de_dbg(c, "page %d is an index page, level=%d", (int)curr_page, (int)curr_level);
1255 prev_page = -1;
1256 de_dbg_indent(c, 1);
1257 do_index_page(c, d, page_pos, &prev_page);
1258 de_dbg_indent(c, -1);
1260 curr_page = prev_page;
1261 curr_level--;
1264 de_dbg(c, "page %d is the first leaf page", (int)curr_page);
1265 d->bpt.first_leaf_page = curr_page;
1267 retval = 1;
1269 done:
1270 de_dbg_indent_restore(c, saved_indent_level);
1271 return retval;
1274 static void sanitize_phrase_info(deark *c, lctx *d)
1276 unsigned int k;
1278 if(!d->phrases_data) {
1279 d->num_phrases = 0;
1280 return;
1283 for(k=0; k<d->num_phrases; k++) {
1284 if((i64)d->phrase_info[k].pos + (i64)d->phrase_info[k].len > d->phrases_data->len) {
1285 d->phrase_info[k].pos = 0;
1286 d->phrase_info[k].len = 0;
1291 static void do_after_pass_1(deark *c, lctx *d)
1293 // Read the SYSTEM file first -- lots of other things depend on it.
1294 if(d->found_system_file) {
1295 do_file(c, d, d->offset_of_system_file, FILETYPE_SYSTEM, 0, NULL);
1298 if(d->found_Phrases_file) {
1299 d->uses_old_phrase_compression = 1;
1301 else if(d->found_PhrIndex_file && d->found_PhrImage_file) {
1302 d->uses_hall_compression = 1;
1305 if(d->input_encoding==DE_ENCODING_UNKNOWN || d->input_encoding==DE_ENCODING_ASCII) {
1306 d->output_is_utf8 = 0;
1308 else {
1309 d->output_is_utf8 = 1;
1312 // Read other special files, in a suitable order.
1314 if(d->found_Phrases_file && d->uses_old_phrase_compression) {
1315 do_file(c, d, d->offset_of_Phrases, FILETYPE_PHRASES, 0, NULL);
1318 if(d->found_PhrIndex_file && d->uses_hall_compression) {
1319 do_file(c, d, d->offset_of_PhrIndex, FILETYPE_PHRINDEX, 0, NULL);
1321 if(d->found_PhrImage_file && d->uses_hall_compression) {
1322 do_file(c, d, d->offset_of_PhrImage, FILETYPE_PHRIMAGE, 0, NULL);
1324 sanitize_phrase_info(c, d);
1326 if(d->found_TOPIC_file) {
1327 do_file(c, d, d->offset_of_TOPIC, FILETYPE_TOPIC, 0, NULL);
1331 // This function is only for the "internal directory" tree.
1332 // There are other data objects in HLP files that use the same kind of data
1333 // structure. If we ever want to parse them, this function will have to be
1334 // genericized.
1335 static void do_bplustree(deark *c, lctx *d, i64 pos1, i64 len,
1336 int is_internaldir)
1338 i64 pos = pos1;
1339 i64 n;
1340 int saved_indent_level;
1341 u8 *page_seen = NULL;
1342 struct de_stringreaderdata *structure = NULL;
1344 if(!is_internaldir) return;
1346 de_dbg_indent_save(c, &saved_indent_level);
1348 n = de_getu16le_p(&pos);
1349 if(n != 0x293b) {
1350 de_err(c, "Expected B+ tree structure at %d not found", (int)pos1);
1351 goto done;
1354 //de_dbg(c, "B+ tree at %d", (int)pos1);
1355 de_dbg_indent(c, 1);
1357 d->bpt.flags = (unsigned int)de_getu16le_p(&pos);
1358 de_dbg(c, "Btree flags: 0x%04x", d->bpt.flags);
1360 d->bpt.pagesize = de_getu16le_p(&pos);
1361 de_dbg(c, "PageSize: %d", (int)d->bpt.pagesize);
1363 // TODO: Understand the Structure field
1364 structure = dbuf_read_string(c->infile, pos, 16, 16, DE_CONVFLAG_STOP_AT_NUL,
1365 DE_ENCODING_ASCII);
1366 de_dbg(c, "Structure: \"%s\"", ucstring_getpsz_d(structure->str));
1367 de_destroy_stringreaderdata(c, structure);
1368 structure = NULL;
1369 pos += 16;
1371 pos += 2; // MustBeZero
1372 pos += 2; // PageSplits
1374 d->bpt.root_page = de_geti16le_p(&pos);
1375 de_dbg(c, "RootPage: %d", (int)d->bpt.root_page);
1377 pos += 2; // MustBeNegOne
1379 d->bpt.num_pages = de_geti16le_p(&pos);
1380 de_dbg(c, "TotalPages: %d", (int)d->bpt.num_pages);
1382 d->bpt.num_levels = de_geti16le_p(&pos);
1383 de_dbg(c, "NLevels: %d", (int)d->bpt.num_levels);
1384 if(is_internaldir) d->internal_dir_num_levels = d->bpt.num_levels;
1386 d->bpt.num_entries = de_geti32le_p(&pos);
1387 de_dbg(c, "TotalBtreeEntries: %d", (int)d->bpt.num_entries);
1389 d->bpt.pagesdata_pos = pos;
1390 de_dbg(c, "num pages: %d, %d bytes each, at %d (total size=%d)",
1391 (int)d->bpt.num_pages, (int)d->bpt.pagesize, (int)d->bpt.pagesdata_pos,
1392 (int)(d->bpt.num_pages * d->bpt.pagesize));
1394 if(!find_first_leaf_page(c, d)) goto done;
1396 page_seen = de_malloc(c, d->bpt.num_pages); // For loop detection
1398 for(d->pass=1; d->pass<=2; d->pass++) {
1399 i64 curr_page;
1401 de_zeromem(page_seen, (size_t)d->bpt.num_pages);
1403 de_dbg(c, "pass %d", d->pass);
1404 de_dbg_indent(c, 1);
1406 curr_page = d->bpt.first_leaf_page;
1408 while(1) {
1409 i64 page_pos;
1410 i64 next_page;
1412 if(curr_page<0) break;
1413 if(curr_page>d->bpt.num_pages) goto done;
1415 if(d->pass==1 && page_seen[curr_page]) {
1416 de_err(c, "Page loop detected");
1417 goto done;
1419 page_seen[curr_page] = 1;
1421 page_pos = d->bpt.pagesdata_pos + curr_page*d->bpt.pagesize;
1423 de_dbg(c, "page[%d] at %d (leaf page)", (int)curr_page, (int)page_pos);
1425 next_page = -1;
1426 de_dbg_indent(c, 1);
1427 do_leaf_page(c, d, page_pos, &next_page);
1428 de_dbg_indent(c, -1);
1430 curr_page = next_page;
1433 de_dbg_indent(c, -1);
1435 if(d->pass==1) {
1436 de_dbg(c, "reading items after pass 1");
1437 de_dbg_indent(c, 1);
1438 do_after_pass_1(c, d);
1439 de_dbg_indent(c, -1);
1443 done:
1444 de_free(c, page_seen);
1445 if(structure) de_destroy_stringreaderdata(c, structure);
1446 de_dbg_indent_restore(c, saved_indent_level);
1449 static void do_file_INTERNALDIR(deark *c, lctx *d, i64 pos1, i64 len)
1451 de_dbg(c, "internal dir data at %d", (int)pos1);
1452 do_bplustree(c, d, pos1, len, 1);
1455 static void dump_phrase_offset_table(deark *c, lctx *d)
1457 unsigned int k;
1459 for(k=0; k<d->num_phrases; k++) {
1460 de_dbg2(c, "phrase[%u]: offs=%u, len=%u", k, d->phrase_info[k].pos,
1461 d->phrase_info[k].len);
1465 static void decompress_Phrases(deark *c, lctx *d, i64 pos, i64 cmpr_len, i64 uncmpr_len)
1467 struct de_dfilter_in_params dcmpri;
1468 struct de_dfilter_out_params dcmpro;
1469 struct de_dfilter_results dres;
1471 de_dfilter_init_objects(c, &dcmpri, &dcmpro, &dres);
1472 dcmpri.f = c->infile;
1473 dcmpri.pos = pos;
1474 dcmpri.len = cmpr_len;
1475 dcmpro.f = d->phrases_data;
1476 dcmpro.len_known = 1;
1477 dcmpro.expected_len = uncmpr_len;
1478 fmtutil_hlp_lz77_codectype1(c, &dcmpri, &dcmpro, &dres, NULL);
1479 if(dres.errcode || (d->phrases_data->len!=uncmpr_len)) {
1480 de_warn(c, "Phrases decompression may have failed");
1484 static void do_file_Phrases(deark *c, lctx *d, i64 pos1, i64 len)
1486 i64 pos;
1487 i64 s0, s1, s2;
1488 i64 phrase_data_pos;
1489 i64 phrase_data_cmpr_len;
1490 i64 phrase_data_uncmpr_len = 0;
1491 i64 phrase_offset_table_pos;
1492 i64 phrase_offset_table_len;
1493 unsigned int k;
1494 int is_MVB_format = 0;
1495 int is_compressed = 0;
1496 int saved_indent_level;
1498 de_dbg_indent_save(c, &saved_indent_level);
1499 if(!d->extract_text) goto done;
1501 de_dbg(c, "Phrases data at %"I64_FMT", len=%"I64_FMT, pos1, len);
1502 de_dbg_indent(c, 1);
1503 if(len<6) goto done;
1505 s0 = de_getu16le(pos1);
1506 s1 = de_getu16le(pos1+2);
1507 s2 = de_getu16le(pos1+4);
1509 if(s0==0x0800 && s1!=0x0100 && s2==0x0100) {
1510 is_MVB_format = 1;
1512 else if(s1==0x0100) {
1515 else {
1516 de_err(c, "Unknown Phrases format");
1517 goto done;
1519 de_dbg(c, "MVB format: %d", is_MVB_format);
1520 if(is_MVB_format) {
1521 de_err(c, "Unsupported Phrases format");
1522 goto done;
1525 d->num_phrases = (unsigned int)s0;
1526 de_dbg(c, "num phrases: %u", d->num_phrases);
1527 pos = pos1+4;
1529 if(d->ver_minor>16) {
1530 is_compressed = 1;
1532 de_dbg(c, "Phrases are lzw-compressed: %d", is_compressed);
1534 if(is_compressed) {
1535 phrase_data_uncmpr_len = de_getu32le_p(&pos);
1536 de_dbg(c, "decompressed len (reported): %"I64_FMT, phrase_data_uncmpr_len);
1539 // Phrase offsets are measured from the start of the offset table.
1540 phrase_offset_table_pos = pos;
1541 phrase_offset_table_len = ((i64)d->num_phrases+1)*2;
1542 de_dbg(c, "offset table at %"I64_FMT", len=%"I64_FMT, phrase_offset_table_pos,
1543 phrase_offset_table_len);
1545 phrase_data_pos = phrase_offset_table_pos + phrase_offset_table_len;
1546 phrase_data_cmpr_len = pos1+len - phrase_data_pos; // (before any decompression)
1547 if(phrase_data_cmpr_len<0) goto done;
1548 if(!is_compressed) {
1549 phrase_data_uncmpr_len = phrase_data_cmpr_len;
1552 d->phrase_info = de_mallocarray(c, (i64)d->num_phrases, sizeof(struct phrase_item));
1553 for(k=0; k<d->num_phrases+1; k++) {
1554 u32 offs;
1556 offs = (u32)de_getu16le_p(&pos);
1557 offs -= (u32)phrase_offset_table_len;
1559 if(k<d->num_phrases) {
1560 d->phrase_info[k].pos = offs;
1562 if(k>=1) {
1563 d->phrase_info[k-1].len = offs - d->phrase_info[k-1].pos;
1566 if(c->debug_level>=2) {
1567 dump_phrase_offset_table(c, d);
1570 de_dbg(c, "phrase data at %"I64_FMT", len=%"I64_FMT, phrase_data_pos, phrase_data_cmpr_len);
1572 if(is_compressed) {
1573 decompress_Phrases(c, d, phrase_data_pos, phrase_data_cmpr_len, phrase_data_uncmpr_len);
1575 else {
1576 dbuf_copy(c->infile, phrase_data_pos, phrase_data_cmpr_len, d->phrases_data);
1579 d->valid_Phrases_file = 1;
1581 done:
1582 de_dbg_indent_restore(c, saved_indent_level);
1585 static void phrdecompress(deark *c, lctx *d, i64 pos1, i64 len, unsigned int BitCount)
1587 unsigned int n;
1588 unsigned int i;
1589 struct de_bitreader bitrd;
1591 de_zeromem(&bitrd, sizeof(struct de_bitreader));
1592 bitrd.f = c->infile;
1593 bitrd.curpos = pos1;
1594 bitrd.endpos = pos1 + len;
1595 bitrd.bbll.is_lsb = 1;
1597 d->phrase_info[0].pos = 0;
1599 for(i=0; i<d->num_phrases; i++) {
1600 unsigned int num1bits = 0;
1602 while(de_bitreader_getbits(&bitrd, 1)) {
1603 if(bitrd.eof_flag) goto done;
1604 num1bits++;
1606 n = num1bits<<BitCount;
1607 n += (UI)de_bitreader_getbits(&bitrd, BitCount) + 1;
1608 if(bitrd.eof_flag) goto done;
1610 d->phrase_info[i].len = n;
1611 if(i+1<d->num_phrases) {
1612 d->phrase_info[i+1].pos = d->phrase_info[i].pos+n;
1616 done:
1617 if(c->debug_level>=2) {
1618 dump_phrase_offset_table(c, d);
1622 static void do_file_PhrIndex(deark *c, lctx *d, i64 pos1, i64 len)
1624 i64 pos = pos1;
1625 i64 cmprsize;
1626 i64 n;
1627 unsigned int bits;
1628 unsigned int bitcount;
1629 int saved_indent_level;
1631 de_dbg_indent_save(c, &saved_indent_level);
1632 if(len<30) goto done;
1633 n = de_getu32le_p(&pos);
1634 if(n!=1) goto done;
1635 d->num_phrases = (unsigned int)de_getu32le_p(&pos);
1636 de_dbg(c, "num phrases: %u", d->num_phrases);
1637 d->phrase_info = de_mallocarray(c, (i64)d->num_phrases, sizeof(struct phrase_item));
1639 cmprsize = de_getu32le_p(&pos);
1640 de_dbg(c, "index cmpr size: %"I64_FMT, cmprsize);
1641 d->PhrImageUncSize = de_getu32le_p(&pos);
1642 de_dbg(c, "PhrImage uncmpr size: %"I64_FMT, d->PhrImageUncSize);
1643 d->PhrImageCmprSize = de_getu32le_p(&pos);
1644 de_dbg(c, "PhrImage cmpr size: %"I64_FMT, d->PhrImageCmprSize);
1645 pos += 4;
1646 bits = (unsigned int)de_getu16le_p(&pos);
1647 de_dbg(c, "bits: 0x%04x", bits);
1648 de_dbg_indent(c, 1);
1649 bitcount = bits & 0xf;
1650 de_dbg(c, "bit count: %u", bitcount);
1651 de_dbg_indent(c, -1);
1652 pos += 2;
1654 de_dbg(c, "avail size: %d", (int)(pos1+len-pos));
1655 phrdecompress(c, d, pos, pos1+len-pos, bitcount);
1656 done:
1657 de_dbg_indent_restore(c, saved_indent_level);
1660 static void do_file_PhrImage(deark *c, lctx *d, i64 pos1, i64 len)
1662 de_dbg(c, "PhrImage data at %"I64_FMT", len=%"I64_FMT, pos1, len);
1663 if(len < d->PhrImageCmprSize) {
1664 return;
1667 if(d->PhrImageCmprSize == d->PhrImageUncSize) {
1668 dbuf_copy(c->infile, pos1, d->PhrImageCmprSize, d->phrases_data);
1670 else {
1671 decompress_Phrases(c, d, pos1, d->PhrImageCmprSize, d->PhrImageUncSize);
1672 de_dbg(c, "decompressed to %"I64_FMT" bytes", d->phrases_data->len);
1676 static const char* file_type_to_type_name(enum hlp_filetype file_fmt)
1678 const char *name = "unspecified";
1679 switch(file_fmt) {
1680 case FILETYPE_SYSTEM: name="system"; break;
1681 case FILETYPE_TOPIC: name="topic"; break;
1682 case FILETYPE_SHG: name="SHG/MRB"; break;
1683 case FILETYPE_INTERNALDIR: name="directory"; break;
1684 case FILETYPE_PHRASES: name="Phrases"; break;
1685 case FILETYPE_PHRINDEX: name="PhrIndex"; break;
1686 case FILETYPE_PHRIMAGE: name="PhrImage"; break;
1687 case FILETYPE_OTHERSPECIAL: name="other special stream"; break;
1688 case FILETYPE_EXTRACTABLE: name="other extractable file"; break;
1689 default: ;
1691 return name;
1694 static void do_file(deark *c, lctx *d, i64 pos1, enum hlp_filetype file_fmt, int force_extract,
1695 struct de_stringreaderdata *fn)
1697 i64 reserved_space;
1698 i64 used_space;
1699 i64 pos = pos1;
1700 unsigned int fileflags;
1702 de_dbg(c, "file at %d, type=%s", (int)pos1, file_type_to_type_name(file_fmt));
1703 de_dbg_indent(c, 1);
1705 // FILEHEADER
1706 reserved_space = de_getu32le_p(&pos);
1707 de_dbg(c, "ReservedSpace: %d", (int)reserved_space);
1709 used_space = de_getu32le_p(&pos);
1710 de_dbg(c, "UsedSpace: %d", (int)used_space);
1712 fileflags = (unsigned int)de_getbyte_p(&pos);
1713 de_dbg(c, "FileFlags: 0x%02x", fileflags);
1715 if(pos+used_space > c->infile->len) {
1716 de_err(c, "Bad file size");
1717 goto done;
1720 if(force_extract) {
1721 do_extract_raw_file(c, d, pos, used_space, fn);
1722 goto done;
1725 switch(file_fmt) {
1726 case FILETYPE_INTERNALDIR:
1727 do_file_INTERNALDIR(c, d, pos, used_space);
1728 break;
1729 case FILETYPE_TOPIC:
1730 do_file_TOPIC(c, d, pos, used_space);
1731 break;
1732 case FILETYPE_PHRASES:
1733 do_file_Phrases(c, d, pos, used_space);
1734 break;
1735 case FILETYPE_PHRINDEX:
1736 do_file_PhrIndex(c, d, pos, used_space);
1737 break;
1738 case FILETYPE_PHRIMAGE:
1739 do_file_PhrImage(c, d, pos, used_space);
1740 break;
1741 case FILETYPE_SYSTEM:
1742 do_file_SYSTEM(c, d, pos, used_space);
1743 break;
1744 case FILETYPE_SHG:
1745 d->has_shg = 1;
1746 do_file_SHG(c, d, pos, used_space);
1747 break;
1748 case FILETYPE_EXTRACTABLE:
1749 do_extract_raw_file(c, d, pos, used_space, fn);
1750 break;
1751 default: ;
1754 done:
1755 de_dbg_indent(c, -1);
1758 static void do_header(deark *c, lctx *d, i64 pos)
1760 i64 n;
1762 de_dbg(c, "header at %d", (int)pos);
1763 de_dbg_indent(c, 1);
1765 d->internal_dir_FILEHEADER_offs = de_geti32le(4);
1766 de_dbg(c, "internal dir FILEHEADER pos: %d", (int)d->internal_dir_FILEHEADER_offs);
1768 n = de_geti32le(8);
1769 de_dbg(c, "FREEHEADER pos: %d", (int)n);
1771 n = de_geti32le(12);
1772 de_dbg(c, "reported file size: %d", (int)n);
1774 de_dbg_indent(c, -1);
1777 static void de_run_hlp(deark *c, de_module_params *mparams)
1779 lctx *d = NULL;
1780 i64 pos;
1782 d = de_malloc(c, sizeof(lctx));
1784 d->input_encoding = de_get_input_encoding(c, NULL, DE_ENCODING_WINDOWS1252);
1785 d->extract_text = de_get_ext_option_bool(c, "hlp:extracttext",
1786 ((c->extract_level>=2)?1:0));
1787 d->extract_raw_streams = (u8)de_get_ext_option_bool(c, "hlp:extractstreams", 0);
1788 d->tmpdbuf1 = dbuf_create_membuf(c, 0, 0);
1789 d->unc_linkdata2_dbuf = dbuf_create_membuf(c, 0, 0);
1790 d->tmpucstring1 = ucstring_create(c);
1791 d->phrases_data = dbuf_create_membuf(c, 0, 0);
1793 pos = 0;
1794 do_header(c, d, pos);
1796 do_file(c, d, d->internal_dir_FILEHEADER_offs, FILETYPE_INTERNALDIR, 0, NULL);
1798 de_dbg(c, "summary: v%d.%d %s%s%s blksize=%d levels=%d%s%s%s",
1799 d->ver_major, d->ver_minor,
1800 d->is_lz77_compressed?"lz77":"no_lz77",
1801 d->uses_old_phrase_compression?" phrase_compression":"",
1802 d->uses_hall_compression?" Hall_compression":"",
1803 (int)d->topic_block_size,
1804 (int)d->internal_dir_num_levels,
1805 d->has_shg?" has-shg":"",
1806 d->has_ico?" has-ico":"",
1807 d->has_bmp?" has-bmp":"");
1809 if(d) {
1810 dbuf_close(d->tmpdbuf1);
1811 dbuf_close(d->unc_linkdata2_dbuf);
1812 dbuf_close(d->phrases_data);
1813 ucstring_destroy(d->tmpucstring1);
1814 ucstring_destroy(d->help_file_title);
1815 ucstring_destroy(d->help_file_copyright);
1816 dbuf_close(d->outf_text);
1817 de_free(c, d->phrase_info);
1818 de_free(c, d);
1822 static int de_identify_hlp(deark *c)
1824 if(!dbuf_memcmp(c->infile, 0, "\x3f\x5f\x03\x00", 4))
1825 return 100;
1826 return 0;
1829 static void de_help_hlp(deark *c)
1831 de_msg(c, "-opt hlp:extracttext : Write the text (unformatted) to a file");
1832 de_msg(c, "-opt hlp:extractstreams : Extract raw files, instead of decoding");
1835 void de_module_hlp(deark *c, struct deark_module_info *mi)
1837 mi->id = "hlp";
1838 mi->desc = "HLP";
1839 mi->run_fn = de_run_hlp;
1840 mi->identify_fn = de_identify_hlp;
1841 mi->help_fn = de_help_hlp;