1 // This file is part of Deark.
2 // Copyright (C) 2017 Jason Summers
3 // See the file COPYING for terms of use.
7 // This module was developed with the help of information from the helpfile.txt
8 // document included with the helpdeco software by M. Winterhoff.
10 #include <deark-config.h>
11 #include <deark-private.h>
12 #include <deark-fmtutil.h>
13 DE_DECLARE_MODULE(de_module_hlp
);
15 #define TOPICBLOCKHDRSIZE 12
16 #define INVALIDPOS 0xffffffffU
20 FILETYPE_OTHERSPECIAL
,
43 u32 pos
; // pos in ->phrases_data
47 typedef struct localctx_struct
{
51 u8 extract_raw_streams
;
52 i64 internal_dir_FILEHEADER_offs
;
55 u8 found_Phrases_file
;
56 u8 found_PhrIndex_file
;
57 u8 found_PhrImage_file
;
59 u8 valid_Phrases_file
;
60 i64 offset_of_system_file
;
61 i64 offset_of_Phrases
;
62 i64 offset_of_PhrIndex
;
63 i64 offset_of_PhrImage
;
68 int is_lz77_compressed
;
69 int uses_old_phrase_compression
;
70 int uses_hall_compression
;
72 int has_shg
, has_ico
, has_bmp
;
73 i64 internal_dir_num_levels
;
77 dbuf
*unc_linkdata2_dbuf
;
81 unsigned int num_phrases
;
82 struct phrase_item
*phrase_info
; // array [num_phrases]
83 de_ucstring
*tmpucstring1
;
84 de_ucstring
*help_file_title
;
85 de_ucstring
*help_file_copyright
;
86 struct de_timestamp gendate
;
89 static void do_file(deark
*c
, lctx
*d
, i64 pos1
, enum hlp_filetype file_fmt
, int extract_only
,
90 struct de_stringreaderdata
*fn
);
92 struct systemrec_info
{
95 // low 8 bits = version info
96 // 0x0010 = STRINGZ type
102 static const struct systemrec_info systemrec_info_arr
[] = {
103 { 1, 0x0010, "Title", NULL
},
104 { 2, 0x0010, "Copyright", NULL
},
105 { 3, 0x0000, "Contents", NULL
},
106 { 4, 0x0010, "Macro", NULL
},
107 { 5, 0x0000, "Icon", NULL
},
108 { 6, 0x0000, "Window", NULL
},
109 { 8, 0x0010, "Citation", NULL
},
110 { 9, 0x0000, "Language ID", NULL
},
111 { 10, 0x0010, "CNT file name", NULL
},
112 { 11, 0x0000, "Charset", NULL
},
113 { 12, 0x0000, "Default dialog font", NULL
},
114 { 13, 0x0010, "Defined GROUPs", NULL
},
115 { 14, 0x0011, "IndexSeparators", NULL
},
116 { 14, 0x0002, "Multimedia Help Files", NULL
},
117 { 18, 0x0010, "Defined language", NULL
},
118 { 19, 0x0000, "Defined DLLMAPS", NULL
}
120 static const struct systemrec_info systemrec_info_default
=
121 { 0, 0x0000, "?", NULL
};
123 static void format_topiclink(deark
*c
, lctx
*d
, u32 n
, char *buf
, size_t buf_len
)
125 if(d
->ver_minor
<=16) {
126 de_snprintf(buf
, buf_len
, "%u", (UI
)n
);
130 de_strlcpy(buf
, "-1", buf_len
);
133 de_snprintf(buf
, buf_len
, "Blk%u:%u", (UI
)(n
/16384),
139 static void hlptime_to_timestamp(i64 ht
, struct de_timestamp
*ts
)
142 de_unix_time_to_timestamp(ht
, ts
, 0);
149 // s can be NULL, or it can be a string to save the value in.
150 static void do_display_and_store_STRINGZ(deark
*c
, lctx
*d
, i64 pos1
, i64 len
,
151 const char *name
, de_ucstring
*s1
)
153 de_ucstring
*s_tmp
= NULL
;
160 s_tmp
= ucstring_create(c
);
167 dbuf_read_to_ucstring_n(c
->infile
,
168 pos1
, len
, DE_DBG_MAX_STRLEN
,
169 s
, DE_CONVFLAG_STOP_AT_NUL
, d
->input_encoding
);
170 de_dbg(c
, "%s: \"%s\"", name
, ucstring_getpsz(s
));
172 if(s_tmp
) ucstring_destroy(s_tmp
);
175 static void do_SYSTEMREC_STRINGZ(deark
*c
, lctx
*d
, unsigned int recordtype
,
176 i64 pos1
, i64 len
, const struct systemrec_info
*sti
, de_ucstring
*s
)
178 do_display_and_store_STRINGZ(c
, d
, pos1
, len
, sti
->name
, s
);
181 static void do_SYSTEMREC_uint32_hex(deark
*c
, lctx
*d
, unsigned int recordtype
,
187 n
= (unsigned int)de_getu32le(pos1
);
188 de_dbg(c
, "value: 0x%08x", n
);
191 static void extract_system_icon(deark
*c
, lctx
*d
, i64 pos
, i64 len
)
195 fi
= de_finfo_create(c
);
196 fi
->timestamp
[DE_TIMESTAMPIDX_MODIFY
] = d
->gendate
;
197 dbuf_create_file_from_slice(c
->infile
, pos
, len
, "ico", fi
, DE_CREATEFLAG_IS_AUX
);
198 de_finfo_destroy(c
, fi
);
201 static void do_SYSTEMREC(deark
*c
, lctx
*d
, unsigned int recordtype
,
202 i64 pos1
, i64 len
, const struct systemrec_info
*sti
)
204 if(recordtype
==1) { // title
205 if(!d
->help_file_title
) {
206 d
->help_file_title
= ucstring_create(c
);
208 do_SYSTEMREC_STRINGZ(c
, d
, recordtype
, pos1
, len
, sti
, d
->help_file_title
);
210 else if(recordtype
==2) { // copyright
211 if(!d
->help_file_copyright
) {
212 d
->help_file_copyright
= ucstring_create(c
);
214 do_SYSTEMREC_STRINGZ(c
, d
, recordtype
, pos1
, len
, sti
, d
->help_file_copyright
);
216 else if(recordtype
==3 && len
==4) { // contents
217 do_SYSTEMREC_uint32_hex(c
, d
, recordtype
, pos1
, len
);
219 else if(recordtype
==5) { // Icon
221 extract_system_icon(c
, d
, pos1
, len
);
223 else if(sti
->flags
&0x10) {
224 do_SYSTEMREC_STRINGZ(c
, d
, recordtype
, pos1
, len
, sti
, NULL
);
227 if(c
->debug_level
>=2) {
228 de_dbg_hexdump(c
, c
->infile
, pos1
, len
, 256, NULL
, 0x1);
233 static const struct systemrec_info
*find_sysrec_info(deark
*c
, lctx
*d
, unsigned int t
)
237 for(i
=0; i
<DE_ARRAYCOUNT(systemrec_info_arr
); i
++) {
238 const struct systemrec_info
*sti
;
239 sti
= &systemrec_info_arr
[i
];
240 if(sti
->rectype
==t
&&
241 (sti
->flags
&0x0f)==0)
246 return &systemrec_info_default
;
249 static int do_file_SYSTEM_header(deark
*c
, lctx
*d
, i64 pos1
)
255 char timestamp_buf
[64];
258 magic
= de_getu16le_p(&pos
);
260 de_err(c
, "Expected SYSTEM data at %d not found", (int)pos1
);
264 de_dbg(c
, "SYSTEM file data at %d", (int)pos1
);
267 d
->ver_minor
= (int)de_getu16le_p(&pos
);
268 d
->ver_major
= (int)de_getu16le_p(&pos
);
269 de_dbg(c
, "help format version: %d.%d", d
->ver_major
, d
->ver_minor
);
271 if(d
->ver_major
!=1) {
272 de_err(c
, "Unsupported file version: %d.%d", d
->ver_major
, d
->ver_minor
);
276 gen_date
= de_geti32le_p(&pos
);
277 hlptime_to_timestamp(gen_date
, &d
->gendate
);
278 de_timestamp_to_string(&d
->gendate
, timestamp_buf
, sizeof(timestamp_buf
), 0);
279 de_dbg(c
, "GenDate: %d (%s)", (int)gen_date
, timestamp_buf
);
281 flags
= (unsigned int)de_getu16le_p(&pos
);
282 de_dbg(c
, "system flags: 0x%04x", flags
);
284 if(d
->ver_minor
>16) {
286 d
->is_lz77_compressed
= 1;
287 d
->topic_block_size
= 2048;
290 d
->is_lz77_compressed
= 1;
291 d
->topic_block_size
= 4096;
294 d
->is_lz77_compressed
= 0;
295 d
->topic_block_size
= 4096;
299 d
->is_lz77_compressed
= 0;
300 d
->topic_block_size
= 2048;
302 de_dbg(c
, "lz77 compression: %d", d
->is_lz77_compressed
);
303 de_dbg(c
, "topic block size: %d", (int)d
->topic_block_size
);
310 static void do_file_SYSTEM_SYSTEMRECS(deark
*c
, lctx
*d
, i64 pos1
, i64 len
,
315 while((pos1
+len
)-pos
>=4) {
316 unsigned int recordtype
;
318 i64 systemrec_startpos
;
319 const struct systemrec_info
*sti
;
321 systemrec_startpos
= pos
;
323 recordtype
= (unsigned int)de_getu16le_p(&pos
);
324 datasize
= de_getu16le_p(&pos
);
326 sti
= find_sysrec_info(c
, d
, recordtype
);
327 de_dbg(c
, "SYSTEMREC type %u (%s) at %d, dpos=%d, dlen=%d",
328 recordtype
, sti
->name
,
329 (int)systemrec_startpos
, (int)pos
, (int)datasize
);
331 if(pos
+datasize
> pos1
+len
) break; // bad data
333 do_SYSTEMREC(c
, d
, recordtype
, pos
, datasize
, sti
);
334 de_dbg_indent(c
, -1);
339 static void do_file_SYSTEM(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
342 int saved_indent_level
;
344 de_dbg_indent_save(c
, &saved_indent_level
);
346 // We'll read the SYSTEM "file" before pass 2, most importantly to record
347 // the format version information.
349 // The SYSTEM file may contain a series of SYSTEMREC records that we want
351 // Note: It seems like we might have to make two (sub)passes over the
352 // SYSTEMREC records, the first pass to collect the "charset" setting, so it
353 // can be used to interpret records that appear before it. But I've never
354 // seen a charset record that I can make sense of -- it's usually just a
355 // random number of NUL bytes.
357 if(d
->pass
!=1) goto done
;
359 if(!do_file_SYSTEM_header(c
, d
, pos
)) goto done
;
362 if(d
->ver_minor
<=16) {
363 d
->help_file_title
= ucstring_create(c
);
364 do_display_and_store_STRINGZ(c
, d
, pos
, (pos1
+len
)-pos
, "HelpFileTitle", d
->help_file_title
);
367 // A sequence of variable-sized SYSTEMRECs
368 do_file_SYSTEM_SYSTEMRECS(c
, d
, pos
, (pos1
+len
)-pos
, 1);
372 de_dbg_indent_restore(c
, saved_indent_level
);
375 static void do_file_SHG(deark
*c
, lctx
*d
, i64 pos1
, i64 used_space
)
382 // Reportedly, 0x506c = SHG = 1 image, and 0x706c = MRB = >1 image.
383 // But I'm not convinced that's correct.
384 // I'm not sure what to do, as far as selecting a file extension, and potentially
385 // correcting the signature. Current behavior is to leave the signature the same,
386 // and derive the file extension from the number of images.
387 oldsig
= de_getu16le(pos1
);
389 if(oldsig
==0x506c || oldsig
==0x706c) {
392 num_images
= de_getu16le(pos1
+2);
404 fi
= de_finfo_create(c
);
405 // (Note that if we were to correct the signature, we probably should not copy
407 fi
->timestamp
[DE_TIMESTAMPIDX_MODIFY
] = d
->gendate
;
408 outf
= dbuf_create_output_file(c
, ext
, fi
, 0);
409 dbuf_copy(c
->infile
, pos1
, used_space
, outf
);
411 de_finfo_destroy(c
, fi
);
414 static void do_extract_raw_file(deark
*c
, lctx
*d
, i64 pos1
, i64 used_space
,
415 struct de_stringreaderdata
*fn
)
418 const char *ext
= NULL
;
420 fi
= de_finfo_create(c
);
421 fi
->timestamp
[DE_TIMESTAMPIDX_MODIFY
] = d
->gendate
;
422 if(fn
&& ucstring_isnonempty(fn
->str
)) {
423 de_finfo_set_name_from_ucstring(c
, fi
, fn
->str
, 0);
424 fi
->original_filename_flag
= 1;
429 dbuf_create_file_from_slice(c
->infile
, pos1
, used_space
, ext
, fi
, 0);
431 de_finfo_destroy(c
, fi
);
434 struct topiclink_data
{
445 i64 linkdata2_cmprlen
;
446 i64 linkdata2_uncmprlen
;
448 i64 paragraphinfo_pos
;
452 struct topic_block_info_item
{
453 i64 pos
; // position in ->unc_topicdata
458 i64 num_topic_blocks
;
459 struct topic_block_info_item
*topic_block_info
; // array [num_topic_blocks]
461 u32 pos_of_first_topiclink
;
464 static void ensure_text_output_file_open(deark
*c
, lctx
*d
)
466 if(d
->outf_text
) return;
467 d
->outf_text
= dbuf_create_output_file(c
, "dump.txt", NULL
, 0);
468 if(d
->output_is_utf8
&& c
->write_bom
) {
469 dbuf_write_uchar_as_utf8(d
->outf_text
, 0xfeff);
471 if(ucstring_isnonempty(d
->help_file_title
)) {
472 dbuf_puts(d
->outf_text
, "Title: ");
473 // TODO: This doesn't do the right thing if !d->output_is_utf8.
474 ucstring_write_as_utf8(c
, d
->help_file_title
, d
->outf_text
, 0);
475 dbuf_puts(d
->outf_text
, "\n");
477 if(ucstring_isnonempty(d
->help_file_copyright
)) {
478 dbuf_puts(d
->outf_text
, "Copyright: ");
479 ucstring_write_as_utf8(c
, d
->help_file_copyright
, d
->outf_text
, 0);
480 dbuf_puts(d
->outf_text
, "\n");
484 // Emit a string that needs no conversion.
485 static void emit_raw_sz(deark
*c
, lctx
*d
, const char *sz
)
487 if(!d
->outf_text
) return;
488 dbuf_puts(d
->outf_text
, sz
);
491 // Emit a content string, i.e. one that might need character encoding conversion
493 static void emit_slice(deark
*c
, lctx
*d
, dbuf
*inf
, i64 pos
, i64 len
)
495 if(!d
->outf_text
) return;
497 if(!d
->output_is_utf8
) {
498 dbuf_copy(inf
, pos
, len
, d
->outf_text
);
502 ucstring_empty(d
->tmpucstring1
);
503 dbuf_read_to_ucstring(inf
, pos
, len
, d
->tmpucstring1
, 0, d
->input_encoding
);
504 ucstring_write_as_utf8(c
, d
->tmpucstring1
, d
->outf_text
, 0);
507 static void emit_phrase(deark
*c
, lctx
*d
, dbuf
*outf
, unsigned int phrasenum
)
509 if(phrasenum
< d
->num_phrases
) {
510 dbuf_copy(d
->phrases_data
, (i64
)d
->phrase_info
[phrasenum
].pos
,
511 (i64
)d
->phrase_info
[phrasenum
].len
, outf
);
515 static void do_phrase_decompression(deark
*c
, lctx
*d
, dbuf
*inf
, i64 pos1
, i64 len
,
516 dbuf
*outf
, i64 unc_len_expected
)
519 i64 endpos
= pos1
+len
;
521 if(!d
->phrases_data
) return;
526 if(pos
>= endpos
) break;
527 b
= dbuf_getbyte_p(inf
, &pos
);
528 if(b
==0 || b
>=0x10) {
529 dbuf_writebyte(outf
, b
);
535 if(pos
>= endpos
) break;
536 b2
= dbuf_getbyte_p(inf
, &pos
);
537 n
= ((unsigned int)(b
-1)<<8) | b2
;
538 emit_phrase(c
, d
, outf
, n
>>1);
540 dbuf_writebyte(outf
, ' ');
546 static void do_hall_decompression(deark
*c
, lctx
*d
, dbuf
*inf
, i64 pos1
, i64 len
,
547 dbuf
*outf
, i64 unc_len_expected
)
549 static const u8 action
[16] = {0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4};
551 i64 endpos
= pos1
+len
;
553 i64 outf_expected_endpos
;
555 outf_expected_endpos
= outf
->len
+ unc_len_expected
;
560 if(outf
->len
>= outf_expected_endpos
) goto unc_done
;
561 if(pos
>= endpos
) goto unc_done
;
562 b
= dbuf_getbyte_p(inf
, &pos
);
563 switch(action
[b
&0x0f]) {
565 emit_phrase(c
, d
, outf
, b
>>1);
568 if(pos
>= endpos
) goto unc_done
;
569 n
= (((unsigned int)b
& 0xfc) << 6) | 0x80;
570 n
+= dbuf_getbyte_p(inf
, &pos
);
571 emit_phrase(c
, d
, outf
, n
);
575 if(pos
+ (i64
)n
> endpos
) goto unc_done
;
576 dbuf_copy(inf
, pos
, (i64
)n
, outf
);
580 dbuf_write_run(outf
, ' ', (i64
)(b
>>4)+1);
583 dbuf_write_zeroes(outf
, (i64
)(b
>>4)+1);
591 static int do_topiclink_rectype_32_linkdata1(deark
*c
, lctx
*d
,
592 struct topic_ctx
*tctx
, struct topiclink_data
*tld
)
594 i64 pos
= tld
->linkdata1_pos
;
595 dbuf
*inf
= tctx
->unc_topicdata
;
602 // TODO: type 33 (table)
603 if(tld
->recordtype
!=1 && tld
->recordtype
!=32) goto done
;
605 topicsize
= fmtutil_hlp_get_csl_p(inf
, &pos
);
606 de_dbg(c
, "topic size: %"I64_FMT
, topicsize
);
608 if(tld
->recordtype
==32) {
609 topiclength
= fmtutil_hlp_get_cus_p(inf
, &pos
);
610 de_dbg(c
, "topic length: %"I64_FMT
, topiclength
);
613 pos
++; // unknownUnsignedChar
614 pos
++; // unknownBiasedChar
615 id
= (unsigned int)dbuf_getu16le_p(inf
, &pos
);
616 de_dbg(c
, "id: %u", id
);
617 bits
= (unsigned int)dbuf_getu16le_p(inf
, &pos
);
618 de_dbg(c
, "bits: 0x%04x", bits
);
620 if(bits
& 0x0001) { // Unknown
621 (void)fmtutil_hlp_get_csl_p(inf
, &pos
);
623 if(bits
& 0x0002) { // SpacingAbove
624 (void)fmtutil_hlp_get_css_p(inf
, &pos
);
626 if(bits
& 0x0004) { // SpacingBelow
627 (void)fmtutil_hlp_get_css_p(inf
, &pos
);
629 if(bits
& 0x0008) { // SpacingLines
630 (void)fmtutil_hlp_get_css_p(inf
, &pos
);
632 if(bits
& 0x0010) { // LeftIndent
633 (void)fmtutil_hlp_get_css_p(inf
, &pos
);
635 if(bits
& 0x0020) { // RightIndent
636 (void)fmtutil_hlp_get_css_p(inf
, &pos
);
638 if(bits
& 0x0040) { // FirstlineIndent
639 (void)fmtutil_hlp_get_css_p(inf
, &pos
);
642 if(bits
& 0x0100) { // Borderinfo
645 if(bits
& 0x0200) { // Tabinfo
648 // 0x0400 = RightAlignedParagraph
649 // 0x0800 = CenterAlignedParagraph
651 tld
->paragraphinfo_pos
= pos
;
657 // linkdata2 = after Phrase/Hall decompression
658 static void do_topiclink_rectype_1_32(deark
*c
, lctx
*d
,
659 struct topic_ctx
*tctx
, struct topiclink_data
*tld
, dbuf
*linkdata2
)
663 int string_count
= 0;
666 if(!d
->extract_text
) goto done
;
667 ensure_text_output_file_open(c
, d
);
669 do_topiclink_rectype_32_linkdata1(c
, d
, tctx
, tld
);
671 // TODO: This is very quick & dirty.
672 // The linkdata2 is a collection of NUL-terminated strings. We'd have to
673 // interpret the command bytes from linkdata1 to know how to format them.
676 dbuf_truncate(d
->tmpdbuf1
, 0); // A place to collect the current output string
681 if(pos
>= linkdata2
->len
) break;
682 b
= dbuf_getbyte_p(linkdata2
, &pos
);
685 emit_slice(c
, d
, d
->tmpdbuf1
, 0, d
->tmpdbuf1
->len
);
686 dbuf_truncate(d
->tmpdbuf1
, 0);
687 emit_raw_sz(c
, d
, "\n");
693 dbuf_writebyte(d
->tmpdbuf1
, b
);
699 emit_slice(c
, d
, d
->tmpdbuf1
, 0, d
->tmpdbuf1
->len
);
700 dbuf_truncate(d
->tmpdbuf1
, 0);
701 emit_raw_sz(c
, d
, "\n");
704 de_dbg2(c
, "[emitted %d strings, totaling %d bytes]", string_count
, byte_count
);
711 static void do_topiclink_rectype_2_linkdata2(deark
*c
, lctx
*d
,
712 struct topic_ctx
*tctx
, struct topiclink_data
*tld
, dbuf
*linkdata2
)
717 dbuf_truncate(d
->tmpdbuf1
, 0);
718 emit_raw_sz(c
, d
, "# ");
720 for(k
=0; k
<linkdata2
->len
; k
++) {
723 b
= dbuf_getbyte(linkdata2
, k
);
725 dbuf_writebyte(d
->tmpdbuf1
, b
);
730 emit_slice(c
, d
, d
->tmpdbuf1
, 0, d
->tmpdbuf1
->len
);
733 emit_raw_sz(c
, d
, "(untitled topic)");
736 emit_raw_sz(c
, d
, " #\n");
739 // topic header and title
740 static void do_topiclink_rectype_2(deark
*c
, lctx
*d
,
741 struct topic_ctx
*tctx
, struct topiclink_data
*tld
, dbuf
*linkdata2
)
743 if(!d
->extract_text
) goto done
;
744 ensure_text_output_file_open(c
, d
);
746 do_topiclink_rectype_2_linkdata2(c
, d
, tctx
, tld
, linkdata2
);
751 // Returns 1 if we set next_pos_code
752 static int do_topiclink(deark
*c
, lctx
*d
, struct topic_ctx
*tctx
, i64 pos1
, u32
*next_pos_code
)
754 struct topiclink_data
*tld
= NULL
;
756 i64 linkdata2_nbytes_avail
;
758 dbuf
*inf
= tctx
->unc_topicdata
;
761 tld
= de_malloc(c
, sizeof(struct topiclink_data
));
763 tld
->blocksize
= dbuf_geti32le_p(inf
, &pos
);
764 de_dbg(c
, "blocksize: %d", (int)tld
->blocksize
);
765 if((tld
->blocksize
<21) || (pos1
+ tld
->blocksize
> inf
->len
)) {
766 de_dbg(c
, "bad topiclink blocksize");
769 tld
->datalen2
= dbuf_geti32le_p(inf
, &pos
);
770 de_dbg(c
, "datalen2 (after any decompression): %d", (int)tld
->datalen2
);
772 tld
->prevblock
= (u32
)dbuf_getu32le_p(inf
, &pos
);
773 format_topiclink(c
, d
, tld
->prevblock
, tmpbuf
, sizeof(tmpbuf
));
774 de_dbg(c
, "prevblock: %s", tmpbuf
);
776 tld
->nextblock
= (u32
)dbuf_getu32le_p(inf
, &pos
);
777 format_topiclink(c
, d
, tld
->nextblock
, tmpbuf
, sizeof(tmpbuf
));
778 de_dbg(c
, "nextblock: %s", tmpbuf
);
779 *next_pos_code
= (u32
)tld
->nextblock
;
782 tld
->datalen1
= dbuf_geti32le_p(inf
, &pos
);
783 de_dbg(c
, "datalen1: %d", (int)tld
->datalen1
);
784 tld
->recordtype
= dbuf_getbyte_p(inf
, &pos
);
785 de_dbg(c
, "record type: %d", (int)tld
->recordtype
);
787 tld
->linkdata1_pos
= pos1
+ 21;
788 tld
->linkdata1_len
= tld
->datalen1
- 21;
789 de_dbg(c
, "linkdata1: pos=[%"I64_FMT
"], len=%"I64_FMT
, tld
->linkdata1_pos
, tld
->linkdata1_len
);
791 tld
->linkdata2_pos
= tld
->linkdata1_pos
+ tld
->linkdata1_len
;
792 linkdata2_nbytes_avail
= tld
->blocksize
- tld
->datalen1
;
794 if(d
->uses_old_phrase_compression
|| d
->uses_hall_compression
) {
795 if(tld
->datalen2
> linkdata2_nbytes_avail
) {
796 // Phrase/Hall compression used in this topiclink
797 tld
->seems_compressed
= 1;
798 tld
->linkdata2_cmprlen
= linkdata2_nbytes_avail
;
799 tld
->linkdata2_uncmprlen
= tld
->datalen2
;
802 // Phrase/Hall compression used in this file, but not in this topiclink
803 tld
->linkdata2_cmprlen
= tld
->datalen2
;
804 tld
->linkdata2_uncmprlen
= tld
->datalen2
;
808 // Phrase/Hall compression not used in this file
809 tld
->linkdata2_cmprlen
= de_min_int(tld
->datalen2
, linkdata2_nbytes_avail
);
810 tld
->linkdata2_uncmprlen
= tld
->linkdata2_cmprlen
;
813 if((tld
->linkdata1_pos
<pos1
) || (tld
->linkdata2_pos
<pos1
) ||
814 (tld
->linkdata1_len
<0) || (tld
->linkdata2_cmprlen
<0) ||
815 (tld
->linkdata1_pos
+ tld
->linkdata1_len
> pos1
+tld
->blocksize
) ||
816 (tld
->linkdata2_pos
+ tld
->linkdata2_cmprlen
> pos1
+tld
->blocksize
))
818 de_dbg(c
, "bad linkdata");
822 de_dbg(c
, "linkdata2: pos=[%"I64_FMT
"], cmprlen=%"I64_FMT
", uncmprlen=%"I64_FMT
,
823 tld
->linkdata2_pos
, tld
->linkdata2_cmprlen
, tld
->linkdata2_uncmprlen
);
825 // Decompress linkdata2 if necessary
826 dbuf_truncate(d
->unc_linkdata2_dbuf
, 0);
827 if(tld
->seems_compressed
&& d
->uses_old_phrase_compression
) {
828 do_phrase_decompression(c
, d
, inf
, tld
->linkdata2_pos
, tld
->linkdata2_cmprlen
,
829 d
->unc_linkdata2_dbuf
, tld
->linkdata2_uncmprlen
);
831 else if(tld
->seems_compressed
&& d
->uses_hall_compression
) {
832 do_hall_decompression(c
, d
, inf
, tld
->linkdata2_pos
, tld
->linkdata2_cmprlen
,
833 d
->unc_linkdata2_dbuf
,tld
->linkdata2_uncmprlen
);
836 dbuf_copy(inf
, tld
->linkdata2_pos
, tld
->linkdata2_cmprlen
, d
->unc_linkdata2_dbuf
);
839 switch(tld
->recordtype
) {
843 do_topiclink_rectype_1_32(c
, d
, tctx
, tld
, d
->unc_linkdata2_dbuf
);
846 do_topiclink_rectype_2(c
, d
, tctx
, tld
, d
->unc_linkdata2_dbuf
);
849 de_warn(c
, "Unsupported record type: %d", (int)tld
->recordtype
);
857 static int topicpos_to_abspos(deark
*c
, lctx
*d
, struct topic_ctx
*tctx
, i64 topicpos
,
862 if(!d
->topic_block_size
) return 0;
863 blkoffs
= topicpos
% 16384;
864 if(blkoffs
<TOPICBLOCKHDRSIZE
) return 0;
865 blknum
= topicpos
/ 16384;
866 if(blknum
<0 || blknum
>=tctx
->num_topic_blocks
) return 0;
867 *pabspos
= tctx
->topic_block_info
[blknum
].pos
+ (blkoffs
-TOPICBLOCKHDRSIZE
);
871 static i64
hc30_abspos_plus_offset_to_abspos(deark
*c
, lctx
*d
, i64 pos
, i64 offset
)
873 i64 blksize
= d
->topic_block_size
-TOPICBLOCKHDRSIZE
;
874 i64 start_of_curr_block
;
875 i64 end_of_curr_block
;
878 // We're at a position in blocks of size (d->topic_block_size-12). We need to add
879 // 'offset', but subtract 12 every time we cross a block boundary.
880 start_of_curr_block
= (pos
/blksize
)*blksize
;
881 end_of_curr_block
= start_of_curr_block
+ blksize
;
882 if(pos
+offset
<= end_of_curr_block
) {
886 n
= pos
+offset
- end_of_curr_block
;
887 return pos
+ offset
- TOPICBLOCKHDRSIZE
*(1+(n
/ blksize
));
890 static int calc_next_topiclink_pos(deark
*c
, lctx
*d
, struct topic_ctx
*tctx
, i64 curpos
,
891 u32 next_pos_code
, i64
*pnextpos
)
895 if(next_pos_code
==INVALIDPOS
) {
896 de_dbg(c
, "[stopping TOPIC parsing, end-of-links marker found]");
900 if(d
->ver_minor
<=16) {
901 if(next_pos_code
< 21) {
902 de_dbg(c
, "[stopping TOPIC parsing, no nextblock available]");
905 *pnextpos
= hc30_abspos_plus_offset_to_abspos(c
, d
, curpos
, next_pos_code
);
908 if(!topicpos_to_abspos(c
, d
, tctx
, next_pos_code
, pnextpos
)) {
909 de_dbg(c
, "[stopping TOPIC parsing, no nextblock available]");
913 if((*pnextpos
) <= curpos
) {
914 de_dbg(c
, "[stopping TOPIC parsing, blocks not in order]");
922 static void do_topicdata(deark
*c
, lctx
*d
, struct topic_ctx
*tctx
)
925 int saved_indent_level
, saved_indent_level2
;
926 dbuf
*inf
= tctx
->unc_topicdata
;
928 de_dbg_indent_save(c
, &saved_indent_level
);
930 de_dbg(c
, "topic data");
937 de_dbg_indent_save(c
, &saved_indent_level2
);
939 if(tctx
->pos_of_first_topiclink
==INVALIDPOS
|| tctx
->pos_of_first_topiclink
<TOPICBLOCKHDRSIZE
) {
940 de_warn(c
, "Bad first topic link");
944 // Maybe we should call topicpos_to_abspos(), etc., but this should be
945 // correct, since it is an offset in the first topicblock.
946 pos
= tctx
->pos_of_first_topiclink
- TOPICBLOCKHDRSIZE
;
955 de_dbg(c
, "[stopping TOPIC parsing, exceeded end of data]");
958 if(pos
== inf
->len
) {
959 de_dbg(c
, "[stopping TOPIC parsing, reached end of data]");
962 if(pos
+ 21 > inf
->len
) {
963 de_warn(c
, "Error parsing TOPIC, not enough room for another TOPICLINK (%"I64_FMT
964 ", %"I64_FMT
")", pos
, inf
->len
);
968 // TODO: Display as "Blk#:offs"
969 de_dbg(c
, "topiclink at [%"I64_FMT
"]", pos
);
972 if(!do_topiclink(c
, d
, tctx
, pos
, &next_pos_code
)) goto done
;
973 de_dbg_indent(c
, -1);
975 ret
= calc_next_topiclink_pos(c
, d
, tctx
, pos
, next_pos_code
, &next_pos
);
981 de_dbg_indent_restore(c
, saved_indent_level
);
984 static void decompress_topic_block(deark
*c
, lctx
*d
, struct topic_ctx
*tctx
,
985 i64 blknum
, i64 blk_dpos
, i64 blk_dlen
,
988 struct de_dfilter_in_params dcmpri
;
989 struct de_dfilter_out_params dcmpro
;
990 struct de_dfilter_results dres
;
993 de_dfilter_init_objects(c
, &dcmpri
, &dcmpro
, &dres
);
995 dcmpri
.f
= c
->infile
;
996 dcmpri
.pos
= blk_dpos
;
997 dcmpri
.len
= blk_dlen
;
999 dcmpro
.len_known
= 1;
1000 // TODO: Confirm what happens if a block decompresses to more than 16384-12 bytes.
1001 dcmpro
.expected_len
= 16384-TOPICBLOCKHDRSIZE
;
1002 len_before
= outf
->len
;
1003 fmtutil_hlp_lz77_codectype1(c
, &dcmpri
, &dcmpro
, &dres
, NULL
);
1004 de_dbg(c
, "decompressed %"I64_FMT
" to %"I64_FMT
" bytes", blk_dlen
,
1005 outf
->len
- len_before
);
1008 static void do_file_TOPIC(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
1011 int saved_indent_level
;
1012 struct topic_ctx
*tctx
= NULL
;
1015 de_dbg_indent_save(c
, &saved_indent_level
);
1016 tctx
= de_malloc(c
, sizeof(struct topic_ctx
));
1017 de_dbg(c
, "TOPIC at %"I64_FMT
", len=%"I64_FMT
, pos1
, len
);
1018 de_dbg_indent(c
, 1);
1020 if(!d
->found_system_file
|| d
->topic_block_size
<2048) {
1021 de_err(c
, "SYSTEM file not found");
1025 if(d
->extract_text
) {
1026 tctx
->unc_topicdata
= dbuf_create_membuf(c
, 0, 0);
1028 tctx
->num_topic_blocks
= (len
+ (d
->topic_block_size
- TOPICBLOCKHDRSIZE
)) % d
->topic_block_size
;
1029 tctx
->topic_block_info
= de_mallocarray(c
, tctx
->num_topic_blocks
, sizeof(struct topic_block_info_item
));
1031 tctx
->pos_of_first_topiclink
= INVALIDPOS
;
1033 // A series of blocks, each with a 12-byte header
1034 for(blknum
=0; blknum
<tctx
->num_topic_blocks
; blknum
++) {
1035 u32 lastlink
, firstlink
, lastheader
;
1039 char tbuf1
[24], tbuf2
[24], tbuf3
[24];
1041 blklen
= d
->topic_block_size
;
1042 if(blklen
> (pos1
+len
)-pos
) {
1043 blklen
= (pos1
+len
)-pos
;
1045 if(blklen
<TOPICBLOCKHDRSIZE
) break;
1046 blk_dpos
= pos
+TOPICBLOCKHDRSIZE
;
1047 blk_dlen
= blklen
-TOPICBLOCKHDRSIZE
;
1049 de_dbg(c
, "TOPIC block #%d at %d, dpos=%d, dlen=%d", (int)blknum
, (int)pos
,
1050 (int)blk_dpos
, (int)blk_dlen
);
1051 de_dbg_indent(c
, 1);
1052 lastlink
= (u32
)de_getu32le(pos
);
1053 firstlink
= (u32
)de_getu32le(pos
+4);
1054 lastheader
= (u32
)de_getu32le(pos
+8);
1055 format_topiclink(c
, d
, lastlink
, tbuf1
, sizeof(tbuf1
));
1056 format_topiclink(c
, d
, firstlink
, tbuf2
, sizeof(tbuf2
));
1057 format_topiclink(c
, d
, lastheader
, tbuf3
, sizeof(tbuf3
));
1058 de_dbg(c
, "LastLink=%s, FirstLink=%s, LastHeader=%s",
1059 tbuf1
, tbuf2
, tbuf3
);
1060 if(tctx
->pos_of_first_topiclink
==INVALIDPOS
&& firstlink
!=INVALIDPOS
) {
1061 tctx
->pos_of_first_topiclink
= firstlink
;
1064 if(d
->extract_text
&& tctx
->unc_topicdata
) {
1065 // Record the position for later reference.
1066 tctx
->topic_block_info
[blknum
].pos
= tctx
->unc_topicdata
->len
;
1068 if(d
->is_lz77_compressed
) {
1069 decompress_topic_block(c
, d
, tctx
, blknum
, blk_dpos
, blk_dlen
, tctx
->unc_topicdata
);
1072 dbuf_copy(c
->infile
, blk_dpos
, blk_dlen
, tctx
->unc_topicdata
);
1075 tctx
->topic_block_info
[blknum
].len
= tctx
->unc_topicdata
->len
- tctx
->topic_block_info
[blknum
].pos
;
1077 de_dbg2(c
, "[current decompressed size: %"I64_FMT
"]", tctx
->unc_topicdata
->len
);
1080 de_dbg_indent(c
, -1);
1084 if(tctx
->unc_topicdata
&& tctx
->unc_topicdata
->len
>0) {
1085 do_topicdata(c
, d
, tctx
);
1090 dbuf_close(tctx
->unc_topicdata
);
1091 de_free(c
, tctx
->topic_block_info
);
1094 de_dbg_indent_restore(c
, saved_indent_level
);
1097 static void do_index_page(deark
*c
, lctx
*d
, i64 pos1
, i64
*prev_page
)
1099 *prev_page
= de_geti16le(pos1
+4);
1100 de_dbg(c
, "PreviousPage: %d", (int)*prev_page
);
// Returns 1 if x is an ASCII decimal digit ('0' through '9'), otherwise 0.
static int de_is_digit(char x)
{
	if(x < '0') return 0;
	if(x > '9') return 0;
	return 1;
}
1108 static enum hlp_filetype
filename_to_filetype(deark
*c
, lctx
*d
, const char *fn
)
1114 if(!de_strcmp(fn
, "|TOPIC")) return FILETYPE_TOPIC
;
1115 if(!de_strcmp(fn
, "|SYSTEM")) return FILETYPE_SYSTEM
;
1116 if(!de_strncmp(fn
, "|bm", 3) && de_is_digit(fn
[3])) return FILETYPE_SHG
;
1117 if(!de_strcmp(fn
, "|Phrases")) return FILETYPE_PHRASES
;
1118 if(!de_strcmp(fn
, "|PhrIndex")) return FILETYPE_PHRINDEX
;
1119 if(!de_strcmp(fn
, "|PhrImage")) return FILETYPE_PHRIMAGE
;
1120 return FILETYPE_OTHERSPECIAL
;
1123 ext
= de_get_sz_ext(fn
);
1124 extlen
= de_strlen(ext
);
1126 // Some SHG streams' names don't start with "|". Assume it is SHG if it
1127 // starts with "bm" and a digit, and doesn't have a ".".
1129 if(!de_strncmp(fn
, "bm", 2) && de_is_digit(fn
[2])) return FILETYPE_SHG
;
1132 // Not sure how bold to be here. Should we extract every file that we can't
1133 // identify? Or maybe only those that have a filename extension?
1134 return FILETYPE_EXTRACTABLE
;
1137 static void do_leaf_page(deark
*c
, lctx
*d
, i64 pos1
, i64
*pnext_page
)
1145 struct de_stringreaderdata
*fn_srd
= NULL
;
1146 enum hlp_filetype file_type
;
1147 int saved_indent_level
;
1149 de_dbg_indent_save(c
, &saved_indent_level
);
1150 n
= de_getu16le_p(&pos
); // "Unused"
1151 de_dbg(c
, "free bytes at end of this page: %d", (int)n
);
1153 num_entries
= de_geti16le_p(&pos
);
1154 de_dbg(c
, "NEntries: %d", (int)num_entries
);
1156 n
= de_geti16le_p(&pos
);
1157 de_dbg(c
, "PreviousPage: %d", (int)n
);
1159 n
= de_geti16le_p(&pos
);
1160 de_dbg(c
, "NextPage: %d", (int)n
);
1161 if(pnext_page
) *pnext_page
= n
;
1163 for(k
=0; k
<num_entries
; k
++) {
1164 int pass_for_this_file
;
1166 de_dbg(c
, "entry[%d]", (int)k
);
1167 de_dbg_indent(c
, 1);
1169 if(!dbuf_search_byte(c
->infile
, 0x00, pos
, 260, &foundpos
)) {
1170 de_err(c
, "Malformed leaf page at %d", (int)pos1
);
1175 de_destroy_stringreaderdata(c
, fn_srd
);
1177 fn_srd
= dbuf_read_string(c
->infile
, pos
, foundpos
-pos
, foundpos
-pos
, 0, d
->input_encoding
);
1178 de_dbg(c
, "FileName: \"%s\"", ucstring_getpsz_d(fn_srd
->str
));
1181 file_offset
= de_geti32le_p(&pos
);
1182 de_dbg(c
, "FileOffset: %d", (int)file_offset
);
1184 file_type
= filename_to_filetype(c
, d
, fn_srd
->sz
);
1187 case FILETYPE_SYSTEM
:
1188 d
->found_system_file
= 1;
1189 d
->offset_of_system_file
= file_offset
;
1190 pass_for_this_file
= 1;
1192 case FILETYPE_TOPIC
:
1193 d
->found_TOPIC_file
= 1;
1194 d
->offset_of_TOPIC
= file_offset
;
1195 pass_for_this_file
= 1;
1197 case FILETYPE_PHRASES
:
1198 d
->found_Phrases_file
= 1;
1199 d
->offset_of_Phrases
= file_offset
;
1200 pass_for_this_file
= 1;
1202 case FILETYPE_PHRINDEX
:
1203 d
->found_PhrIndex_file
= 1;
1204 d
->offset_of_PhrIndex
= file_offset
;
1205 pass_for_this_file
= 1;
1207 case FILETYPE_PHRIMAGE
:
1208 d
->found_PhrImage_file
= 1;
1209 d
->offset_of_PhrImage
= file_offset
;
1210 pass_for_this_file
= 1;
1213 pass_for_this_file
= 2;
1216 if(d
->pass
==2 && d
->extract_raw_streams
) {
1217 do_file(c
, d
, file_offset
, file_type
, 1, fn_srd
);
1219 else if(d
->pass
==2 && pass_for_this_file
==2) {
1220 do_file(c
, d
, file_offset
, file_type
, 0, fn_srd
);
1223 de_dbg_indent(c
, -1);
1227 de_destroy_stringreaderdata(c
, fn_srd
);
1228 de_dbg_indent_restore(c
, saved_indent_level
);
1231 // Sets d->bpt.first_leaf_page
1232 static int find_first_leaf_page(deark
*c
, lctx
*d
)
1236 int saved_indent_level
;
1239 de_dbg_indent_save(c
, &saved_indent_level
);
1240 curr_page
= d
->bpt
.root_page
;
1241 curr_level
= d
->bpt
.num_levels
;
1243 de_dbg(c
, "looking for first leaf page");
1244 de_dbg_indent(c
, 1);
1246 while(curr_level
>1) {
1250 if(curr_page
<0) goto done
;
1251 page_pos
= d
->bpt
.pagesdata_pos
+ curr_page
*d
->bpt
.pagesize
;
1253 de_dbg(c
, "page %d is an index page, level=%d", (int)curr_page
, (int)curr_level
);
1256 de_dbg_indent(c
, 1);
1257 do_index_page(c
, d
, page_pos
, &prev_page
);
1258 de_dbg_indent(c
, -1);
1260 curr_page
= prev_page
;
1264 de_dbg(c
, "page %d is the first leaf page", (int)curr_page
);
1265 d
->bpt
.first_leaf_page
= curr_page
;
1270 de_dbg_indent_restore(c
, saved_indent_level
);
1274 static void sanitize_phrase_info(deark
*c
, lctx
*d
)
1278 if(!d
->phrases_data
) {
1283 for(k
=0; k
<d
->num_phrases
; k
++) {
1284 if((i64
)d
->phrase_info
[k
].pos
+ (i64
)d
->phrase_info
[k
].len
> d
->phrases_data
->len
) {
1285 d
->phrase_info
[k
].pos
= 0;
1286 d
->phrase_info
[k
].len
= 0;
1291 static void do_after_pass_1(deark
*c
, lctx
*d
)
1293 // Read the SYSTEM file first -- lots of other things depend on it.
1294 if(d
->found_system_file
) {
1295 do_file(c
, d
, d
->offset_of_system_file
, FILETYPE_SYSTEM
, 0, NULL
);
1298 if(d
->found_Phrases_file
) {
1299 d
->uses_old_phrase_compression
= 1;
1301 else if(d
->found_PhrIndex_file
&& d
->found_PhrImage_file
) {
1302 d
->uses_hall_compression
= 1;
1305 if(d
->input_encoding
==DE_ENCODING_UNKNOWN
|| d
->input_encoding
==DE_ENCODING_ASCII
) {
1306 d
->output_is_utf8
= 0;
1309 d
->output_is_utf8
= 1;
1312 // Read other special files, in a suitable order.
1314 if(d
->found_Phrases_file
&& d
->uses_old_phrase_compression
) {
1315 do_file(c
, d
, d
->offset_of_Phrases
, FILETYPE_PHRASES
, 0, NULL
);
1318 if(d
->found_PhrIndex_file
&& d
->uses_hall_compression
) {
1319 do_file(c
, d
, d
->offset_of_PhrIndex
, FILETYPE_PHRINDEX
, 0, NULL
);
1321 if(d
->found_PhrImage_file
&& d
->uses_hall_compression
) {
1322 do_file(c
, d
, d
->offset_of_PhrImage
, FILETYPE_PHRIMAGE
, 0, NULL
);
1324 sanitize_phrase_info(c
, d
);
1326 if(d
->found_TOPIC_file
) {
1327 do_file(c
, d
, d
->offset_of_TOPIC
, FILETYPE_TOPIC
, 0, NULL
);
1331 // This function is only for the "internal directory" tree.
1332 // There are other data objects in HLP files that use the same kind of data
1333 // structure. If we ever want to parse them, this function will have to be
//
// Reads the B+ tree header at pos1 into d->bpt (flags, pagesize, root_page,
// num_pages, num_levels, num_entries, pagesdata_pos), locates the first leaf
// page, and then follows the leaf pages twice (d->pass = 1, then 2), handing
// each one to do_leaf_page(). do_after_pass_1() is called from within the
// pass loop.
// NOTE(review): several short lines of this function (the header check, the
// inner loop header, the guard around do_after_pass_1) are not visible in
// this view.
1335 static void do_bplustree(deark
*c
, lctx
*d
, i64 pos1
, i64 len
,
1340 int saved_indent_level
;
1341 u8
*page_seen
= NULL
;
1342 struct de_stringreaderdata
*structure
= NULL
;
// Only the internal directory is handled.
1344 if(!is_internaldir
) return;
1346 de_dbg_indent_save(c
, &saved_indent_level
);
// First 16-bit field of the header. The error below is reported when it is
// not what a B+ tree header should start with (the comparison is not
// visible here).
1348 n
= de_getu16le_p(&pos
);
1350 de_err(c
, "Expected B+ tree structure at %d not found", (int)pos1
);
1354 //de_dbg(c, "B+ tree at %d", (int)pos1);
1355 de_dbg_indent(c
, 1);
1357 d
->bpt
.flags
= (unsigned int)de_getu16le_p(&pos
);
1358 de_dbg(c
, "Btree flags: 0x%04x", d
->bpt
.flags
);
1360 d
->bpt
.pagesize
= de_getu16le_p(&pos
);
1361 de_dbg(c
, "PageSize: %d", (int)d
->bpt
.pagesize
);
1363 // TODO: Understand the Structure field
// The 16-byte Structure field is read only so it can be printed.
1364 structure
= dbuf_read_string(c
->infile
, pos
, 16, 16, DE_CONVFLAG_STOP_AT_NUL
,
1366 de_dbg(c
, "Structure: \"%s\"", ucstring_getpsz_d(structure
->str
));
1367 de_destroy_stringreaderdata(c
, structure
);
1371 pos
+= 2; // MustBeZero
1372 pos
+= 2; // PageSplits
1374 d
->bpt
.root_page
= de_geti16le_p(&pos
);
1375 de_dbg(c
, "RootPage: %d", (int)d
->bpt
.root_page
);
1377 pos
+= 2; // MustBeNegOne
1379 d
->bpt
.num_pages
= de_geti16le_p(&pos
);
1380 de_dbg(c
, "TotalPages: %d", (int)d
->bpt
.num_pages
);
1382 d
->bpt
.num_levels
= de_geti16le_p(&pos
);
1383 de_dbg(c
, "NLevels: %d", (int)d
->bpt
.num_levels
);
// Saved separately for the summary line printed by de_run_hlp().
1384 if(is_internaldir
) d
->internal_dir_num_levels
= d
->bpt
.num_levels
;
1386 d
->bpt
.num_entries
= de_geti32le_p(&pos
);
1387 de_dbg(c
, "TotalBtreeEntries: %d", (int)d
->bpt
.num_entries
);
// The page data begins right after the header.
1389 d
->bpt
.pagesdata_pos
= pos
;
1390 de_dbg(c
, "num pages: %d, %d bytes each, at %d (total size=%d)",
1391 (int)d
->bpt
.num_pages
, (int)d
->bpt
.pagesize
, (int)d
->bpt
.pagesdata_pos
,
1392 (int)(d
->bpt
.num_pages
* d
->bpt
.pagesize
));
1394 if(!find_first_leaf_page(c
, d
)) goto done
;
// One flag byte per page, so valid indexes are 0 .. num_pages-1.
// NOTE(review): num_pages is read as a signed 16-bit value and is not
// range-checked before this allocation -- confirm how de_malloc treats a
// zero or negative size.
1396 page_seen
= de_malloc(c
, d
->bpt
.num_pages
); // For loop detection
1398 for(d
->pass
=1; d
->pass
<=2; d
->pass
++) {
1401 de_zeromem(page_seen
, (size_t)d
->bpt
.num_pages
);
1403 de_dbg(c
, "pass %d", d
->pass
);
1404 de_dbg_indent(c
, 1);
1406 curr_page
= d
->bpt
.first_leaf_page
;
// A negative page number ends the chain of leaf pages.
1412 if(curr_page
<0) break;
// NOTE(review): likely off-by-one. curr_page == num_pages passes this test
// and is then used as page_seen[curr_page], one element past the
// num_pages-byte allocation above. ">=" looks like the intended comparison
// -- confirm and fix.
1413 if(curr_page
>d
->bpt
.num_pages
) goto done
;
// The loop check is only made during pass 1.
1415 if(d
->pass
==1 && page_seen
[curr_page
]) {
1416 de_err(c
, "Page loop detected");
1419 page_seen
[curr_page
] = 1;
1421 page_pos
= d
->bpt
.pagesdata_pos
+ curr_page
*d
->bpt
.pagesize
;
1423 de_dbg(c
, "page[%d] at %d (leaf page)", (int)curr_page
, (int)page_pos
);
1426 de_dbg_indent(c
, 1);
// do_leaf_page() reports the page to visit next via next_page.
1427 do_leaf_page(c
, d
, page_pos
, &next_page
);
1428 de_dbg_indent(c
, -1);
1430 curr_page
= next_page
;
1433 de_dbg_indent(c
, -1);
// NOTE(review): presumably guarded so that it runs only when d->pass is 1;
// the guard is not visible in this view.
1436 de_dbg(c
, "reading items after pass 1");
1437 de_dbg_indent(c
, 1);
1438 do_after_pass_1(c
, d
);
1439 de_dbg_indent(c
, -1);
// Cleanup.
1444 de_free(c
, page_seen
);
1445 if(structure
) de_destroy_stringreaderdata(c
, structure
);
1446 de_dbg_indent_restore(c
, saved_indent_level
);
1449 static void do_file_INTERNALDIR(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
1451 de_dbg(c
, "internal dir data at %d", (int)pos1
);
1452 do_bplustree(c
, d
, pos1
, len
, 1);
1455 static void dump_phrase_offset_table(deark
*c
, lctx
*d
)
1459 for(k
=0; k
<d
->num_phrases
; k
++) {
1460 de_dbg2(c
, "phrase[%u]: offs=%u, len=%u", k
, d
->phrase_info
[k
].pos
,
1461 d
->phrase_info
[k
].len
);
1465 static void decompress_Phrases(deark
*c
, lctx
*d
, i64 pos
, i64 cmpr_len
, i64 uncmpr_len
)
1467 struct de_dfilter_in_params dcmpri
;
1468 struct de_dfilter_out_params dcmpro
;
1469 struct de_dfilter_results dres
;
1471 de_dfilter_init_objects(c
, &dcmpri
, &dcmpro
, &dres
);
1472 dcmpri
.f
= c
->infile
;
1474 dcmpri
.len
= cmpr_len
;
1475 dcmpro
.f
= d
->phrases_data
;
1476 dcmpro
.len_known
= 1;
1477 dcmpro
.expected_len
= uncmpr_len
;
1478 fmtutil_hlp_lz77_codectype1(c
, &dcmpri
, &dcmpro
, &dres
, NULL
);
1479 if(dres
.errcode
|| (d
->phrases_data
->len
!=uncmpr_len
)) {
1480 de_warn(c
, "Phrases decompression may have failed");
// Parses a "Phrases" internal file located at pos1 (len bytes): a small
// header, a table of 16-bit phrase offsets, and the phrase text. Fills
// d->num_phrases and d->phrase_info, loads the phrase text into
// d->phrases_data (via decompress_Phrases() or a plain dbuf_copy()), and
// sets d->valid_Phrases_file. Does nothing unless d->extract_text is set.
// NOTE(review): several short lines (declarations, flag assignments, branch
// headers, the done label) are not visible in this view.
1484 static void do_file_Phrases(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
1488 i64 phrase_data_pos
;
1489 i64 phrase_data_cmpr_len
;
1490 i64 phrase_data_uncmpr_len
= 0;
1491 i64 phrase_offset_table_pos
;
1492 i64 phrase_offset_table_len
;
1494 int is_MVB_format
= 0;
1495 int is_compressed
= 0;
1496 int saved_indent_level
;
1498 de_dbg_indent_save(c
, &saved_indent_level
);
1499 if(!d
->extract_text
) goto done
;
1501 de_dbg(c
, "Phrases data at %"I64_FMT
", len=%"I64_FMT
, pos1
, len
);
1502 de_dbg_indent(c
, 1);
// The three 16-bit words inspected below need 6 bytes.
1503 if(len
<6) goto done
;
1505 s0
= de_getu16le(pos1
);
1506 s1
= de_getu16le(pos1
+2);
1507 s2
= de_getu16le(pos1
+4);
// The first three words decide which header variant this is.
1509 if(s0
==0x0800 && s1
!=0x0100 && s2
==0x0100) {
1512 else if(s1
==0x0100) {
1516 de_err(c
, "Unknown Phrases format");
1519 de_dbg(c
, "MVB format: %d", is_MVB_format
);
// NOTE(review): presumably reported when is_MVB_format is set -- the
// condition is not visible here.
1521 de_err(c
, "Unsupported Phrases format");
// The first header word is the phrase count.
1525 d
->num_phrases
= (unsigned int)s0
;
1526 de_dbg(c
, "num phrases: %u", d
->num_phrases
);
1529 if(d
->ver_minor
>16) {
// NOTE(review): the message says "lzw", but decompress_Phrases() calls
// fmtutil_hlp_lz77_codectype1(), i.e. an LZ77 codec. The wording looks
// wrong -- confirm before changing the string.
1532 de_dbg(c
, "Phrases are lzw-compressed: %d", is_compressed
);
1535 phrase_data_uncmpr_len
= de_getu32le_p(&pos
);
1536 de_dbg(c
, "decompressed len (reported): %"I64_FMT
, phrase_data_uncmpr_len
);
1539 // Phrase offsets are measured from the start of the offset table.
1540 phrase_offset_table_pos
= pos
;
// The table has num_phrases+1 two-byte entries; the extra one is used below
// to compute the length of the last phrase.
1541 phrase_offset_table_len
= ((i64
)d
->num_phrases
+1)*2;
1542 de_dbg(c
, "offset table at %"I64_FMT
", len=%"I64_FMT
, phrase_offset_table_pos
,
1543 phrase_offset_table_len
);
// The phrase text follows the offset table and runs to the end of the file.
1545 phrase_data_pos
= phrase_offset_table_pos
+ phrase_offset_table_len
;
1546 phrase_data_cmpr_len
= pos1
+len
- phrase_data_pos
; // (before any decompression)
1547 if(phrase_data_cmpr_len
<0) goto done
;
1548 if(!is_compressed
) {
1549 phrase_data_uncmpr_len
= phrase_data_cmpr_len
;
1552 d
->phrase_info
= de_mallocarray(c
, (i64
)d
->num_phrases
, sizeof(struct phrase_item
));
1553 for(k
=0; k
<d
->num_phrases
+1; k
++) {
1556 offs
= (u32
)de_getu16le_p(&pos
);
// Rebase the offset so it is relative to the start of the phrase text
// instead of the start of the offset table. The subtraction is unsigned, so
// an offset smaller than the table length wraps around.
1557 offs
-= (u32
)phrase_offset_table_len
;
1559 if(k
<d
->num_phrases
) {
1560 d
->phrase_info
[k
].pos
= offs
;
// The length of phrase k-1 is the distance from its start to this offset.
// NOTE(review): presumably guarded so it is skipped when k is 0 -- the guard
// is not visible here.
1563 d
->phrase_info
[k
-1].len
= offs
- d
->phrase_info
[k
-1].pos
;
1566 if(c
->debug_level
>=2) {
1567 dump_phrase_offset_table(c
, d
);
1570 de_dbg(c
, "phrase data at %"I64_FMT
", len=%"I64_FMT
, phrase_data_pos
, phrase_data_cmpr_len
);
// NOTE(review): presumably the decompress call is taken when is_compressed
// is set and the plain copy otherwise; the branch itself is not visible.
1573 decompress_Phrases(c
, d
, phrase_data_pos
, phrase_data_cmpr_len
, phrase_data_uncmpr_len
);
1576 dbuf_copy(c
->infile
, phrase_data_pos
, phrase_data_cmpr_len
, d
->phrases_data
);
1579 d
->valid_Phrases_file
= 1;
1582 de_dbg_indent_restore(c
, saved_indent_level
);
// Decodes the bit-packed phrase lengths stored in the len bytes at pos1 of
// c->infile and fills d->phrase_info: each entry gets its decoded length,
// and each entry after the first gets a start position equal to the
// previous entry's position plus its length.
// For every phrase the reader consumes 1-bits until a 0-bit is seen, then
// BitCount more bits; the length is (num1bits<<BitCount) + those bits + 1.
// NOTE(review): the declarations, the num1bits update and the done label are
// not visible in this view.
1585 static void phrdecompress(deark
*c
, lctx
*d
, i64 pos1
, i64 len
, unsigned int BitCount
)
1589 struct de_bitreader bitrd
;
// Set the bit reader up over [pos1, pos1+len) of the input file.
1591 de_zeromem(&bitrd
, sizeof(struct de_bitreader
));
1592 bitrd
.f
= c
->infile
;
1593 bitrd
.curpos
= pos1
;
1594 bitrd
.endpos
= pos1
+ len
;
1595 bitrd
.bbll
.is_lsb
= 1;
// NOTE(review): written unconditionally, even when d->num_phrases is 0 --
// confirm that d->phrase_info has room for one item in that case.
1597 d
->phrase_info
[0].pos
= 0;
1599 for(i
=0; i
<d
->num_phrases
; i
++) {
1600 unsigned int num1bits
= 0;
// Unary prefix: keep reading while 1-bits arrive; stop everything at EOF.
1602 while(de_bitreader_getbits(&bitrd
, 1)) {
1603 if(bitrd
.eof_flag
) goto done
;
1606 n
= num1bits
<<BitCount
;
1607 n
+= (UI
)de_bitreader_getbits(&bitrd
, BitCount
) + 1;
1608 if(bitrd
.eof_flag
) goto done
;
1610 d
->phrase_info
[i
].len
= n
;
// The next phrase starts where this one ends.
1611 if(i
+1<d
->num_phrases
) {
1612 d
->phrase_info
[i
+1].pos
= d
->phrase_info
[i
].pos
+n
;
1617 if(c
->debug_level
>=2) {
1618 dump_phrase_offset_table(c
, d
);
// Parses a "PhrIndex" internal file at pos1 (len bytes). Reads the phrase
// count and the compressed/uncompressed sizes of the companion PhrImage data
// into d, allocates d->phrase_info, and passes the rest of the file to
// phrdecompress() to decode the phrase lengths.
// NOTE(review): the declarations and a few short statements are not visible
// in this view.
1622 static void do_file_PhrIndex(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
1628 unsigned int bitcount
;
1629 int saved_indent_level
;
1631 de_dbg_indent_save(c
, &saved_indent_level
);
1632 if(len
<30) goto done
;
1633 n
= de_getu32le_p(&pos
);
// NOTE(review): the count is a raw 32-bit value from the file and is used
// directly as the allocation size below; no upper bound is visible here.
1635 d
->num_phrases
= (unsigned int)de_getu32le_p(&pos
);
1636 de_dbg(c
, "num phrases: %u", d
->num_phrases
);
1637 d
->phrase_info
= de_mallocarray(c
, (i64
)d
->num_phrases
, sizeof(struct phrase_item
));
1639 cmprsize
= de_getu32le_p(&pos
);
1640 de_dbg(c
, "index cmpr size: %"I64_FMT
, cmprsize
);
// These two sizes are used later by do_file_PhrImage().
1641 d
->PhrImageUncSize
= de_getu32le_p(&pos
);
1642 de_dbg(c
, "PhrImage uncmpr size: %"I64_FMT
, d
->PhrImageUncSize
);
1643 d
->PhrImageCmprSize
= de_getu32le_p(&pos
);
1644 de_dbg(c
, "PhrImage cmpr size: %"I64_FMT
, d
->PhrImageCmprSize
);
1646 bits
= (unsigned int)de_getu16le_p(&pos
);
1647 de_dbg(c
, "bits: 0x%04x", bits
);
1648 de_dbg_indent(c
, 1);
// Only the low 4 bits are used; they become phrdecompress()'s BitCount.
1649 bitcount
= bits
& 0xf;
1650 de_dbg(c
, "bit count: %u", bitcount
);
1651 de_dbg_indent(c
, -1);
// Everything from pos to the end of this file is handed to phrdecompress().
1654 de_dbg(c
, "avail size: %d", (int)(pos1
+len
-pos
));
1655 phrdecompress(c
, d
, pos
, pos1
+len
-pos
, bitcount
);
1657 de_dbg_indent_restore(c
, saved_indent_level
);
1660 static void do_file_PhrImage(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
1662 de_dbg(c
, "PhrImage data at %"I64_FMT
", len=%"I64_FMT
, pos1
, len
);
1663 if(len
< d
->PhrImageCmprSize
) {
1667 if(d
->PhrImageCmprSize
== d
->PhrImageUncSize
) {
1668 dbuf_copy(c
->infile
, pos1
, d
->PhrImageCmprSize
, d
->phrases_data
);
1671 decompress_Phrases(c
, d
, pos1
, d
->PhrImageCmprSize
, d
->PhrImageUncSize
);
1672 de_dbg(c
, "decompressed to %"I64_FMT
" bytes", d
->phrases_data
->len
);
1676 static const char* file_type_to_type_name(enum hlp_filetype file_fmt
)
1678 const char *name
= "unspecified";
1680 case FILETYPE_SYSTEM
: name
="system"; break;
1681 case FILETYPE_TOPIC
: name
="topic"; break;
1682 case FILETYPE_SHG
: name
="SHG/MRB"; break;
1683 case FILETYPE_INTERNALDIR
: name
="directory"; break;
1684 case FILETYPE_PHRASES
: name
="Phrases"; break;
1685 case FILETYPE_PHRINDEX
: name
="PhrIndex"; break;
1686 case FILETYPE_PHRIMAGE
: name
="PhrImage"; break;
1687 case FILETYPE_OTHERSPECIAL
: name
="other special stream"; break;
1688 case FILETYPE_EXTRACTABLE
: name
="other extractable file"; break;
// Processes one internal file whose file header is at pos1. Reads the
// 9-byte header (32-bit ReservedSpace, 32-bit UsedSpace, 8-bit FileFlags),
// rejects a UsedSpace that runs past the end of the input file, and then
// hands the UsedSpace bytes that follow the header to the handler for
// file_fmt, or to do_extract_raw_file().
// fn is passed through to do_extract_raw_file(); other callers in this file
// pass NULL for it.
// NOTE(review): the fifth parameter is named force_extract here but
// extract_only in the forward declaration near the top of the file -- the
// names should agree.
// NOTE(review): the switch scaffolding and the conditions that select the
// raw-extraction paths are not visible in this view.
1694 static void do_file(deark
*c
, lctx
*d
, i64 pos1
, enum hlp_filetype file_fmt
, int force_extract
,
1695 struct de_stringreaderdata
*fn
)
1700 unsigned int fileflags
;
1702 de_dbg(c
, "file at %d, type=%s", (int)pos1
, file_type_to_type_name(file_fmt
));
1703 de_dbg_indent(c
, 1);
1706 reserved_space
= de_getu32le_p(&pos
);
1707 de_dbg(c
, "ReservedSpace: %d", (int)reserved_space
);
1709 used_space
= de_getu32le_p(&pos
);
1710 de_dbg(c
, "UsedSpace: %d", (int)used_space
);
1712 fileflags
= (unsigned int)de_getbyte_p(&pos
);
1713 de_dbg(c
, "FileFlags: 0x%02x", fileflags
);
// pos now points just past the header, at the start of the file's data.
1715 if(pos
+used_space
> c
->infile
->len
) {
1716 de_err(c
, "Bad file size");
1721 do_extract_raw_file(c
, d
, pos
, used_space
, fn
);
// Per-type handlers; each receives the data position and UsedSpace.
1726 case FILETYPE_INTERNALDIR
:
1727 do_file_INTERNALDIR(c
, d
, pos
, used_space
);
1729 case FILETYPE_TOPIC
:
1730 do_file_TOPIC(c
, d
, pos
, used_space
);
1732 case FILETYPE_PHRASES
:
1733 do_file_Phrases(c
, d
, pos
, used_space
);
1735 case FILETYPE_PHRINDEX
:
1736 do_file_PhrIndex(c
, d
, pos
, used_space
);
1738 case FILETYPE_PHRIMAGE
:
1739 do_file_PhrImage(c
, d
, pos
, used_space
);
1741 case FILETYPE_SYSTEM
:
1742 do_file_SYSTEM(c
, d
, pos
, used_space
);
1746 do_file_SHG(c
, d
, pos
, used_space
);
1748 case FILETYPE_EXTRACTABLE
:
1749 do_extract_raw_file(c
, d
, pos
, used_space
, fn
);
1755 de_dbg_indent(c
, -1);
// Reads the HLP file header: stores the position of the internal
// directory's file header in d->internal_dir_FILEHEADER_offs and prints the
// FREEHEADER position and the reported file size.
// NOTE(review): pos is used only in the debug message; the fields are read
// from fixed absolute offsets (4 and 12 are visible here), so a nonzero pos
// would not be honored.
1758 static void do_header(deark
*c
, lctx
*d
, i64 pos
)
1762 de_dbg(c
, "header at %d", (int)pos
);
1763 de_dbg_indent(c
, 1);
1765 d
->internal_dir_FILEHEADER_offs
= de_geti32le(4);
1766 de_dbg(c
, "internal dir FILEHEADER pos: %d", (int)d
->internal_dir_FILEHEADER_offs
);
// NOTE(review): the statement that reads n for this message is not visible
// in this view.
1769 de_dbg(c
, "FREEHEADER pos: %d", (int)n
);
1771 n
= de_geti32le(12);
1772 de_dbg(c
, "reported file size: %d", (int)n
);
1774 de_dbg_indent(c
, -1);
// Module entry point. Allocates the local context, reads the options
// (input encoding, hlp:extracttext, hlp:extractstreams), creates the working
// buffers, parses the file header, processes the internal directory (which
// drives everything else), prints a one-line summary, and releases the
// resources held by the context.
// NOTE(review): the declarations, the initial value of pos and the guards
// around the cleanup are not visible in this view.
1777 static void de_run_hlp(deark
*c
, de_module_params
*mparams
)
1782 d
= de_malloc(c
, sizeof(lctx
));
// Windows-1252 is the encoding passed as the default.
1784 d
->input_encoding
= de_get_input_encoding(c
, NULL
, DE_ENCODING_WINDOWS1252
);
// Text extraction defaults to on only at extract level 2 or higher.
1785 d
->extract_text
= de_get_ext_option_bool(c
, "hlp:extracttext",
1786 ((c
->extract_level
>=2)?1:0));
1787 d
->extract_raw_streams
= (u8
)de_get_ext_option_bool(c
, "hlp:extractstreams", 0);
1788 d
->tmpdbuf1
= dbuf_create_membuf(c
, 0, 0);
1789 d
->unc_linkdata2_dbuf
= dbuf_create_membuf(c
, 0, 0);
1790 d
->tmpucstring1
= ucstring_create(c
);
1791 d
->phrases_data
= dbuf_create_membuf(c
, 0, 0);
1794 do_header(c
, d
, pos
);
// The header supplied the position of the internal directory.
1796 do_file(c
, d
, d
->internal_dir_FILEHEADER_offs
, FILETYPE_INTERNALDIR
, 0, NULL
);
1798 de_dbg(c
, "summary: v%d.%d %s%s%s blksize=%d levels=%d%s%s%s",
1799 d
->ver_major
, d
->ver_minor
,
1800 d
->is_lz77_compressed
?"lz77":"no_lz77",
1801 d
->uses_old_phrase_compression
?" phrase_compression":"",
1802 d
->uses_hall_compression
?" Hall_compression":"",
1803 (int)d
->topic_block_size
,
1804 (int)d
->internal_dir_num_levels
,
1805 d
->has_shg
?" has-shg":"",
1806 d
->has_ico
?" has-ico":"",
1807 d
->has_bmp
?" has-bmp":"");
// Release everything the context owns.
1810 dbuf_close(d
->tmpdbuf1
);
1811 dbuf_close(d
->unc_linkdata2_dbuf
);
1812 dbuf_close(d
->phrases_data
);
1813 ucstring_destroy(d
->tmpucstring1
);
1814 ucstring_destroy(d
->help_file_title
);
1815 ucstring_destroy(d
->help_file_copyright
);
1816 dbuf_close(d
->outf_text
);
1817 de_free(c
, d
->phrase_info
);
// Format detection: compares the first four bytes of the input file with
// the signature 3F 5F 03 00.
// NOTE(review): the values returned for a match and for a mismatch are not
// visible in this view.
1822 static int de_identify_hlp(deark
*c
)
1824 if(!dbuf_memcmp(c
->infile
, 0, "\x3f\x5f\x03\x00", 4))
1829 static void de_help_hlp(deark
*c
)
1831 de_msg(c
, "-opt hlp:extracttext : Write the text (unformatted) to a file");
1832 de_msg(c
, "-opt hlp:extractstreams : Extract raw files, instead of decoding");
1835 void de_module_hlp(deark
*c
, struct deark_module_info
*mi
)
1839 mi
->run_fn
= de_run_hlp
;
1840 mi
->identify_fn
= de_identify_hlp
;
1841 mi
->help_fn
= de_help_hlp
;