Refactoring the iff decoder
[deark.git] / modules / lha.c
bloba5ef13980aab4a1d254e1415d2ee2ae4afcb157e
1 // This file is part of Deark.
2 // Copyright (C) 2017 Jason Summers
3 // See the file COPYING for terms of use.
5 // LHA/LZH compressed archive format
7 #include <deark-config.h>
8 #include <deark-private.h>
9 #include <deark-fmtutil.h>
10 DE_DECLARE_MODULE(de_module_lha);
11 DE_DECLARE_MODULE(de_module_swg);
12 DE_DECLARE_MODULE(de_module_car_lha);
13 DE_DECLARE_MODULE(de_module_arx);
// Capacity hint for the string arrays that collect the components of a
// member's path (the arrays are created with room for this many, plus 2).
#define MAX_SUBDIR_LEVEL 32

// Compression method identifiers.
// Each value is the first four bytes of the five-byte method ID field,
// interpreted as a big-endian integer. For example, CODE_lh5 is the bytes
// "-lh5", from the method ID "-lh5-". All are unsigned constants, so they
// have the same type no matter how large the first byte is.
#define CODE_S_LH0 0x204c4830U // SAR
#define CODE_S_LH5 0x204c4835U // SAR
#define CODE_ah0 0x2d616830U // MAR
#define CODE_ari 0x2d617269U // MAR
#define CODE_hf0 0x2d686630U // MAR
#define CODE_lZ0 0x2d6c5a30U // PUT
#define CODE_lZ1 0x2d6c5a31U // PUT
#define CODE_lZ5 0x2d6c5a35U // PUT
#define CODE_lh0 0x2d6c6830U
#define CODE_lh1 0x2d6c6831U
#define CODE_lh2 0x2d6c6832U
#define CODE_lh3 0x2d6c6833U
#define CODE_lh4 0x2d6c6834U
#define CODE_lh5 0x2d6c6835U
#define CODE_lh6 0x2d6c6836U
#define CODE_lh7 0x2d6c6837U // standard, or LHARK
#define CODE_lh8 0x2d6c6838U
#define CODE_lh9 0x2d6c6839U
#define CODE_lha 0x2d6c6861U
#define CODE_lhb 0x2d6c6862U
#define CODE_lhc 0x2d6c6863U
#define CODE_lhd 0x2d6c6864U
#define CODE_lhe 0x2d6c6865U
#define CODE_lhx 0x2d6c6878U
#define CODE_lx1 0x2d6c7831U
#define CODE_lz2 0x2d6c7a32U
#define CODE_lz3 0x2d6c7a33U
#define CODE_lz4 0x2d6c7a34U
#define CODE_lz5 0x2d6c7a35U
#define CODE_lz7 0x2d6c7a37U
#define CODE_lz8 0x2d6c7a38U
#define CODE_lzs 0x2d6c7a73U
#define CODE_pc1 0x2d706331U
#define CODE_pm0 0x2d706d30U
#define CODE_pm1 0x2d706d31U
#define CODE_pm2 0x2d706d32U
#define CODE_sw0 0x2d737730U
#define CODE_sw1 0x2d737731U

// A timestamp index that is never valid. Passing it to apply_timestamp()
// (directly or via a read_*_timestamp function) means "decode and report
// the timestamp, but do not record it".
#define TIMESTAMPIDX_INVALID (-1)
57 struct timestamp_data {
58 struct de_timestamp ts; // The best timestamp of this type found so far
59 int quality;
62 struct cmpr_meth_info;
64 struct member_data {
65 u8 hlev; // header level
66 de_encoding encoding;
67 i64 member_pos;
68 i64 total_size;
69 struct cmpr_meth_info *cmi;
70 u8 is_dir;
71 u8 is_special;
72 u8 is_nonexecutable;
73 u8 is_executable;
74 i64 orig_size;
75 UI hdr_checksum_calc;
77 u8 have_hdr_crc_reported;
78 u32 hdr_crc_reported;
79 u32 hdr_crc_calc;
80 i64 hdr_crc_field_pos;
82 u32 crc16;
83 u8 os_id;
84 i64 compressed_data_pos; // relative to beginning of file
85 i64 compressed_data_len;
86 de_ucstring *dirname;
87 de_ucstring *filename;
88 de_ucstring *fullfilename;
89 struct timestamp_data tsdata[DE_TIMESTAMPIDX_COUNT];
92 typedef struct localctx_struct {
93 de_encoding input_encoding;
94 u8 hlev_of_first_member;
95 u8 swg_fmt;
96 u8 lhark_fmt;
97 u8 lh7_success_flag;
98 u8 lh7_failed_flag;
99 u8 trailer_found;
100 int member_count;
101 i64 trailer_pos;
102 struct de_crcobj *crco;
103 } lctx;
105 typedef void (*decompressor_fn)(deark *c, lctx *d, struct member_data *md,
106 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
107 struct de_dfilter_results *dres);
109 struct cmpr_meth_info {
110 u8 is_recognized;
111 u32 uniq_id;
112 decompressor_fn decompressor;
113 u8 id_raw[5];
114 char id_printable_sz[6];
115 char descr[80];
118 struct exthdr_type_info_struct;
120 typedef void (*exthdr_decoder_fn)(deark *c, lctx *d, struct member_data *md,
121 u8 id, const struct exthdr_type_info_struct *e,
122 i64 pos, i64 dlen);
124 struct exthdr_type_info_struct {
125 u8 id;
126 u8 flags;
127 const char *name;
128 exthdr_decoder_fn decoder_fn;
// Returns 1 if x is an ASCII decimal digit, otherwise 0.
static int lha_isdigit(u8 x)
{
	if(x<'0') return 0;
	if(x>'9') return 0;
	return 1;
}
// Returns 1 if x is an ASCII letter (either case), otherwise 0.
static int lha_isalpha(u8 x)
{
	if(x>='a' && x<='z') return 1;
	if(x>='A' && x<='Z') return 1;
	return 0;
}
141 static int lha_isalnum(u8 x)
143 return (lha_isdigit(x) || lha_isalpha(x));
146 static int is_possible_cmpr_meth(const u8 m[5])
148 if(m[0]!=m[4]) return 0;
149 if(m[0]==' ' && m[1]=='L' && m[2]=='H' && lha_isdigit(m[3])) return 1;
150 if(m[0]!='-') return 0;
151 if(!lha_isalpha(m[1]) ||
152 !lha_isalnum(m[2]) ||
153 !lha_isalnum(m[3]))
155 return 0;
157 return 1;
160 static void apply_timestamp(deark *c, lctx *d, struct member_data *md,
161 int tsidx, const struct de_timestamp *ts, int quality)
163 if(!ts->is_valid) return;
164 if(tsidx<0 || tsidx>=DE_TIMESTAMPIDX_COUNT) return;
165 if(quality < md->tsdata[tsidx].quality) return;
166 md->tsdata[tsidx].ts = *ts;
167 md->tsdata[tsidx].quality = quality;
170 static void read_msdos_modtime(deark *c, lctx *d, struct member_data *md,
171 i64 pos, const char *name)
173 i64 mod_time_raw, mod_date_raw;
174 char timestamp_buf[64];
175 struct de_timestamp tmp_timestamp;
177 mod_time_raw = de_getu16le(pos);
178 mod_date_raw = de_getu16le(pos+2);
179 if(mod_time_raw==0 && mod_date_raw==0) {
180 de_dbg(c, "%s: (not set)", name);
181 return;
183 de_dos_datetime_to_timestamp(&tmp_timestamp, mod_date_raw, mod_time_raw);
184 tmp_timestamp.tzcode = DE_TZCODE_LOCAL;
185 de_timestamp_to_string(&tmp_timestamp, timestamp_buf, sizeof(timestamp_buf), 0);
186 de_dbg(c, "%s: %s", name, timestamp_buf);
187 apply_timestamp(c, d, md, DE_TIMESTAMPIDX_MODIFY, &tmp_timestamp, 10);
190 static void read_windows_FILETIME(deark *c, lctx *d, struct member_data *md,
191 i64 pos, int tsidx, const char *name)
193 i64 t_FILETIME;
194 char timestamp_buf[64];
195 struct de_timestamp tmp_timestamp;
197 t_FILETIME = de_geti64le(pos);
198 de_FILETIME_to_timestamp(t_FILETIME, &tmp_timestamp, 0x1);
199 if(t_FILETIME<=0) tmp_timestamp.is_valid = 0;
200 de_timestamp_to_string(&tmp_timestamp, timestamp_buf, sizeof(timestamp_buf), 0);
201 de_dbg(c, "%s: %"I64_FMT" (%s)", name, t_FILETIME, timestamp_buf);
202 apply_timestamp(c, d, md, tsidx, &tmp_timestamp, 90);
205 static void read_unix_timestamp(deark *c, lctx *d, struct member_data *md,
206 i64 pos, int tsidx, const char *name)
208 i64 t;
209 char timestamp_buf[64];
210 struct de_timestamp tmp_timestamp;
212 t = de_geti32le(pos);
213 de_unix_time_to_timestamp(t, &tmp_timestamp, 0x1);
214 de_timestamp_to_string(&tmp_timestamp, timestamp_buf, sizeof(timestamp_buf), 0);
215 de_dbg(c, "%s: %d (%s)", name, (int)t, timestamp_buf);
216 apply_timestamp(c, d, md, tsidx, &tmp_timestamp, 50);
219 static void rp_add_component(deark *c, lctx *d, struct member_data *md,
220 dbuf *f, i64 pos, i64 len, struct de_strarray *sa, de_ucstring *tmpstr)
222 if(len<1) return;
223 ucstring_empty(tmpstr);
224 dbuf_read_to_ucstring(f, pos, len, tmpstr, 0, md->encoding);
225 de_strarray_push(sa, tmpstr);
228 static void read_path_to_strarray(deark *c, lctx *d, struct member_data *md,
229 dbuf *inf, i64 pos, i64 len, struct de_strarray *sa, int is_exthdr_dirname)
231 dbuf *tmpdbuf = NULL;
232 de_ucstring *tmpstr = NULL;
233 i64 component_startpos;
234 i64 component_len;
235 i64 i;
237 tmpstr = ucstring_create(c);
239 tmpdbuf = dbuf_create_membuf(c, len, 0);
240 dbuf_copy(inf, pos, len, tmpdbuf);
242 component_startpos = 0;
243 component_len = 0;
245 for(i=0; i<len; i++) {
246 u8 ch;
248 ch = dbuf_getbyte(tmpdbuf, i);
249 if(ch==0x00) break; // Tolerate NUL termination
250 if((is_exthdr_dirname && ch==0xff) ||
251 (!is_exthdr_dirname && (ch=='\\' || ch=='/')))
253 component_len = i - component_startpos;
254 rp_add_component(c, d, md, tmpdbuf, component_startpos, component_len, sa, tmpstr);
255 component_startpos = i+1;
256 component_len = 0;
258 else {
259 component_len++;
262 rp_add_component(c, d, md, tmpdbuf, component_startpos, component_len, sa, tmpstr);
264 dbuf_close(tmpdbuf);
265 ucstring_destroy(tmpstr);
268 static void read_filename_hlev0(deark *c, lctx *d, struct member_data *md,
269 i64 pos, i64 len)
271 struct de_strarray *sa = NULL;
273 if(md->filename) {
274 ucstring_empty(md->filename);
276 else {
277 md->filename = ucstring_create(c);
280 sa = de_strarray_create(c, MAX_SUBDIR_LEVEL+2);
281 read_path_to_strarray(c, d, md, c->infile, pos, len, sa, 0);
283 de_strarray_make_path(sa, md->filename, DE_MPFLAG_NOTRAILINGSLASH);
284 de_dbg(c, "filename (parsed): \"%s\"", ucstring_getpsz_d(md->filename));
286 de_strarray_destroy(sa);
289 static void read_filename_hlev1_or_exthdr(deark *c, lctx *d, struct member_data *md,
290 i64 pos, i64 len)
292 i64 i;
294 if(md->filename) {
295 ucstring_empty(md->filename);
297 else {
298 md->filename = ucstring_create(c);
301 // Some files seem to assume NUL termination is allowed.
302 dbuf_read_to_ucstring(c->infile, pos, len,
303 md->filename, DE_CONVFLAG_STOP_AT_NUL, md->encoding);
304 de_dbg(c, "filename: \"%s\"", ucstring_getpsz_d(md->filename));
306 // I don't think slashes are allowed
307 for(i=0; i<md->filename->len; i++) {
308 if(md->filename->str[i]=='/') {
309 md->filename->str[i]='_';
314 static void exthdr_common(deark *c, lctx *d, struct member_data *md,
315 u8 id, const struct exthdr_type_info_struct *e,
316 i64 pos, i64 dlen)
318 if(dlen<2) return;
319 md->hdr_crc_reported = (u32)de_getu16le(pos);
320 md->have_hdr_crc_reported = 1;
321 md->hdr_crc_field_pos = pos;
322 de_dbg(c, "header crc (reported): 0x%04x", (unsigned int)md->hdr_crc_reported);
323 // TODO: Additional information
326 static void exthdr_filename(deark *c, lctx *d, struct member_data *md,
327 u8 id, const struct exthdr_type_info_struct *e,
328 i64 pos, i64 dlen)
330 read_filename_hlev1_or_exthdr(c, d, md, pos, dlen);
333 static void exthdr_dirname(deark *c, lctx *d, struct member_data *md,
334 u8 id, const struct exthdr_type_info_struct *e,
335 i64 pos, i64 dlen)
337 struct de_strarray *dirname_sa = NULL;
339 if(md->dirname) {
340 ucstring_empty(md->dirname);
342 else {
343 md->dirname = ucstring_create(c);
346 dirname_sa = de_strarray_create(c, MAX_SUBDIR_LEVEL+2);
347 // 0xff is used as the path separator. Don't know what happens if a directory
348 // name contains an actual 0xff byte.
349 read_path_to_strarray(c, d, md, c->infile, pos, dlen, dirname_sa, 1);
350 de_strarray_make_path(dirname_sa, md->dirname, DE_MPFLAG_NOTRAILINGSLASH);
351 de_dbg(c, "%s (parsed): \"%s\"", e->name, ucstring_getpsz_d(md->dirname));
353 de_strarray_destroy(dirname_sa);
356 static void exthdr_msdosattribs(deark *c, lctx *d, struct member_data *md,
357 u8 id, const struct exthdr_type_info_struct *e,
358 i64 pos, i64 dlen)
360 u32 attribs;
361 de_ucstring *descr = NULL;
363 if(dlen<2) goto done;
364 attribs = (u32)de_getu16le(pos);
365 descr = ucstring_create(c);
366 de_describe_dos_attribs(c, (UI)attribs, descr, 0);
367 de_dbg(c, "%s: 0x%04x (%s)", e->name, (UI)attribs, ucstring_getpsz_d(descr));
368 done:
369 ucstring_destroy(descr);
372 static void exthdr_filesize(deark *c, lctx *d, struct member_data *md,
373 u8 id, const struct exthdr_type_info_struct *e,
374 i64 pos, i64 dlen)
376 // TODO: Support this
377 de_warn(c, "Unsupported \"file size\" extended header found. This may prevent "
378 "the rest of the file from being processed correctly.");
381 static void exthdr_windowstimestamp(deark *c, lctx *d, struct member_data *md,
382 u8 id, const struct exthdr_type_info_struct *e,
383 i64 pos, i64 dlen)
385 if(dlen<24) return;
386 read_windows_FILETIME(c, d, md, pos, DE_TIMESTAMPIDX_CREATE, "create time");
387 read_windows_FILETIME(c, d, md, pos+8, DE_TIMESTAMPIDX_MODIFY, "mod time ");
388 read_windows_FILETIME(c, d, md, pos+16, DE_TIMESTAMPIDX_ACCESS, "access time");
391 static void interpret_unix_perms(deark *c, lctx *d, struct member_data *md, unsigned int mode)
393 if(mode & 0100000) { // regular file
394 if(mode & 0111) { // executable
395 md->is_executable = 1;
397 else {
398 md->is_nonexecutable = 1;
402 if((mode & 0170000) == 0120000) {
403 md->is_special = 1; // symlink
407 static void exthdr_unixperms(deark *c, lctx *d, struct member_data *md,
408 u8 id, const struct exthdr_type_info_struct *e,
409 i64 pos, i64 dlen)
411 unsigned int mode;
413 if(dlen<2) return;
414 mode = (unsigned int)de_getu16le(pos);
415 de_dbg(c, "mode: octal(%06o)", mode);
416 interpret_unix_perms(c, d, md, mode);
419 static void exthdr_unixuidgid(deark *c, lctx *d, struct member_data *md,
420 u8 id, const struct exthdr_type_info_struct *e,
421 i64 pos, i64 dlen)
423 i64 uid, gid;
424 if(dlen<4) return;
426 // It's strange that the GID comes first, while the UID comes first in the
427 // level-0 "extended area".
428 gid = de_getu16le(pos);
429 de_dbg(c, "gid: %d", (int)gid);
430 uid = de_getu16le(pos+2);
431 de_dbg(c, "uid: %d", (int)uid);
434 static void exthdr_unixtimestamp(deark *c, lctx *d, struct member_data *md,
435 u8 id, const struct exthdr_type_info_struct *e,
436 i64 pos, i64 dlen)
438 if(dlen<4) return;
439 read_unix_timestamp(c, d, md, pos, DE_TIMESTAMPIDX_MODIFY, "last-modified");
442 static void exthdr_lev3newattribs2(deark *c, lctx *d, struct member_data *md,
443 u8 id, const struct exthdr_type_info_struct *e,
444 i64 pos, i64 dlen)
446 if(dlen<20) return;
448 // TODO: Permission
449 // TODO: GID/UID
451 // [Documented as "creation time", but this is a Unix-style header, so I
452 // wonder if someone mistranslated "ctime" (=change time).]
453 read_unix_timestamp(c, d, md, pos+12, TIMESTAMPIDX_INVALID, "create(?) time");
455 read_unix_timestamp(c, d, md, pos+16, DE_TIMESTAMPIDX_ACCESS, "access time ");
458 static void exthdr_codepage(deark *c, lctx *d, struct member_data *md,
459 u8 id, const struct exthdr_type_info_struct *e,
460 i64 pos, i64 dlen)
462 int n_codepage;
463 de_encoding n_encoding;
464 char descr[100];
466 if(dlen!=4) return;
467 n_codepage = (int)de_geti32le(pos);
468 n_encoding = de_windows_codepage_to_encoding(c, n_codepage, descr, sizeof(descr), 0);
469 de_dbg(c, "codepage: %d (%s)", n_codepage, descr);
470 if(n_encoding != DE_ENCODING_UNKNOWN) {
471 md->encoding = n_encoding;
475 static const struct exthdr_type_info_struct exthdr_type_info_arr[] = {
476 { 0x00, 0, "common", exthdr_common },
477 { 0x01, 0, "filename", exthdr_filename },
478 { 0x02, 0, "dir name", exthdr_dirname },
479 { 0x39, 0, "multi-disc", NULL },
480 { 0x3f, 0, "comment", NULL },
481 { 0x40, 0, "MS-DOS file attribs", exthdr_msdosattribs },
482 { 0x41, 0, "Windows timestamp", exthdr_windowstimestamp },
483 { 0x42, 0, "MS-DOS file size", exthdr_filesize },
484 { 0x43, 0, "time zone", NULL },
485 { 0x44, 0, "UTF-16 filename", NULL },
486 { 0x45, 0, "UTF-16 dir name", NULL },
487 { 0x46, 0, "codepage", exthdr_codepage },
488 { 0x50, 0, "Unix perms", exthdr_unixperms },
489 { 0x51, 0, "Unix UID/GID", exthdr_unixuidgid },
490 { 0x52, 0, "Unix group name", NULL },
491 { 0x53, 0, "Unix username", NULL },
492 { 0x54, 0, "Unix timestamp", exthdr_unixtimestamp },
493 { 0x7d, 0, "capsule", NULL },
494 { 0x7e, 0, "OS/2 extended attribs", NULL },
495 { 0x7f, 0, "level 3 new attribs type-1", NULL }, // (OS/2 only)
496 { 0xff, 0, "level 3 new attribs type-2", exthdr_lev3newattribs2 }
499 static void destroy_member_data(deark *c, struct member_data *md)
501 if(!md) return;
502 ucstring_destroy(md->dirname);
503 ucstring_destroy(md->filename);
504 ucstring_destroy(md->fullfilename);
505 de_free(c, md->cmi);
506 de_free(c, md);
509 static const struct exthdr_type_info_struct *get_exthdr_type_info(u8 id)
511 size_t i;
513 for(i=0; i<DE_ARRAYCOUNT(exthdr_type_info_arr); i++) {
514 if(id == exthdr_type_info_arr[i].id) {
515 return &exthdr_type_info_arr[i];
518 return NULL;
521 static void do_read_ext_header(deark *c, lctx *d, struct member_data *md,
522 i64 pos1, i64 len, i64 dlen)
524 u8 id = 0;
525 const char *name;
526 const struct exthdr_type_info_struct *e = NULL;
528 if(dlen>=1) {
529 id = de_getbyte(pos1);
530 e = get_exthdr_type_info(id);
532 name = e ? e->name : "?";
534 de_dbg(c, "ext header at %d, len=%d (1+%d+%d), id=0x%02x (%s)", (int)pos1, (int)len,
535 (int)(dlen-1), (int)(len-dlen), (unsigned int)id, name);
537 if(dlen<1) return; // Invalid header, too short to even have an id field
539 if(e && e->decoder_fn) {
540 de_dbg_indent(c, 1);
541 e->decoder_fn(c, d, md, id, e, pos1+1, dlen-1);
542 de_dbg_indent(c, -1);
544 else {
545 if(c->debug_level>=2) {
546 de_dbg_hexdump(c, c->infile, pos1+1, dlen-1, 256, NULL, 0x1);
// Returns a printable name for an "OS id" byte, or "?" if the id is not one
// we know. The returned string is a constant; the caller must not free it.
static const char *get_os_name(u8 id)
{
	static const struct {
		u8 id;
		const char *name;
	} known_os[] = {
		{ ' ', "unspecified" },
		{ '2', "OS/2" },
		{ '3', "OS/386?" },
		{ '9', "OS-9" },
		{ 'A', "Amiga" },
		{ 'C', "CP/M" },
		{ 'F', "FLEX" },
		{ 'H', "Human68K" },
		{ 'J', "JVM" },
		{ 'K', "OS-9/68K" },
		{ 'M', "DOS" },
		{ 'R', "RUNser" },
		{ 'T', "TownsOS" },
		{ 'U', "Unix" },
		{ 'W', "Windows NT" },
		{ 'a', "Atari ST?" },
		{ 'm', "Macintosh" },
		{ 'w', "Windows" }
	};
	size_t k;

	for(k=0; k<sizeof(known_os)/sizeof(known_os[0]); k++) {
		if(known_os[k].id==id) {
			return known_os[k].name;
		}
	}
	return "?";
}
577 static void do_lev0_ext_area(deark *c, lctx *d, struct member_data *md,
578 i64 pos1, i64 len)
580 if(len<1) return;
581 md->os_id = de_getbyte(pos1);
582 de_dbg(c, "OS id: %d ('%c') (%s)", (int)md->os_id,
583 de_byte_to_printable_char(md->os_id), get_os_name(md->os_id));
585 // TODO: Finish this
586 if(md->os_id=='U') {
587 unsigned int mode;
588 i64 uid, gid;
590 if(len<12) goto done;
592 read_unix_timestamp(c, d, md, pos1+2, DE_TIMESTAMPIDX_MODIFY, "last-modified");
594 mode = (unsigned int)de_getu16le(pos1+6);
595 de_dbg(c, "mode: octal(%06o)", mode);
596 interpret_unix_perms(c, d, md, mode);
598 uid = de_getu16le(pos1+8);
599 de_dbg(c, "uid: %d", (int)uid);
600 gid = de_getu16le(pos1+10);
601 de_dbg(c, "gid: %d", (int)gid);
604 done: ;
607 // AFAICT, we're expected to think of the extended headers as a kind of linked
608 // list. The last field in each node is the "size of next node" (instead of
609 // "pointer to next node", as a real linked list would have). A size of 0 is
610 // like a "nil" pointer, and marks the end of the list.
611 // The "size of the first node" field (analogous to the "head" pointer) is
612 // conceptually not part of the extended headers section.
614 // Note that if we simply shift our frame of reference, this format is identical
615 // to a more typical length-prefixed format. But our code follows the
616 // linked-list model, to make it more consistent with most LHA documentation,
617 // and the various "size" fields.
619 // A return value of 0 means we failed to calculate the size of the
620 // extended headers segment.
621 static int do_read_ext_headers(deark *c, lctx *d, struct member_data *md,
622 i64 pos1, i64 len, i64 first_ext_hdr_size, i64 *tot_bytes_consumed)
624 i64 pos = pos1;
625 i64 this_ext_hdr_size, next_ext_hdr_size;
626 int retval = 0;
627 i64 size_of_size_field;
629 *tot_bytes_consumed = 0;
631 if(first_ext_hdr_size==0) {
632 return 1;
635 de_dbg(c, "ext headers section at %d", (int)pos);
636 de_dbg_indent(c, 1);
638 size_of_size_field = (md->hlev==3) ? 4 : 2;
640 next_ext_hdr_size = first_ext_hdr_size;
641 while(1) {
642 this_ext_hdr_size = next_ext_hdr_size;
643 if(this_ext_hdr_size==0) {
644 retval = 1;
645 *tot_bytes_consumed = pos - pos1;
646 goto done;
648 if(this_ext_hdr_size<size_of_size_field) goto done;
649 if(pos+this_ext_hdr_size > pos1+len) goto done;
651 do_read_ext_header(c, d, md, pos, this_ext_hdr_size, this_ext_hdr_size-size_of_size_field);
653 // Each ext header ends with a "size of next header" field.
654 // We'll read it at this level, instead of in do_read_ext_header().
655 pos += this_ext_hdr_size-size_of_size_field;
656 if(size_of_size_field==2) {
657 next_ext_hdr_size = de_getu16le(pos);
659 else {
660 next_ext_hdr_size = de_getu32le(pos);
662 pos += size_of_size_field;
665 done:
666 if(retval) {
667 de_dbg(c, "size of ext headers section: %d", (int)*tot_bytes_consumed);
669 else {
670 de_dbg(c, "failed to parse all extended headers");
672 de_dbg_indent(c, -1);
673 return retval;
676 static void make_fullfilename(deark *c, lctx *d, struct member_data *md)
678 if(md->fullfilename) return;
680 if(!md->filename) {
681 md->filename = ucstring_create(c);
683 md->fullfilename = ucstring_create(c);
685 if(md->hlev==0) {
686 ucstring_append_ucstring(md->fullfilename, md->filename);
688 else {
689 if(md->is_dir) {
690 ucstring_append_ucstring(md->fullfilename, md->dirname);
692 else {
693 if(ucstring_isnonempty(md->dirname)) {
694 ucstring_append_ucstring(md->fullfilename, md->dirname);
695 ucstring_append_sz(md->fullfilename, "/", DE_ENCODING_LATIN1);
697 if(ucstring_isnonempty(md->filename)) {
698 ucstring_append_ucstring(md->fullfilename, md->filename);
700 else {
701 ucstring_append_char(md->fullfilename, '_');
707 static void decompress_uncompressed(deark *c, lctx *d, struct member_data *md,
708 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
709 struct de_dfilter_results *dres)
711 fmtutil_decompress_uncompressed(c, dcmpri, dcmpro, dres, 0);
714 static void decompress_lh1(deark *c, lctx *d, struct member_data *md,
715 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
716 struct de_dfilter_results *dres)
718 fmtutil_lh1_codectype1(c, dcmpri, dcmpro, dres, NULL);
721 // Caller supplies fmt (DE_LH5X_FMT_*).
722 static void decompress_lh5x_internal(deark *c, lctx *d,
723 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
724 struct de_dfilter_results *dres, int fmt)
726 struct de_lh5x_params lzhparams;
728 de_zeromem(&lzhparams, sizeof(struct de_lh5x_params));
729 lzhparams.fmt = fmt;
730 lzhparams.zero_codes_block_behavior = DE_LH5X_ZCB_65536;
731 lzhparams.warn_about_zero_codes_block = 1;
732 lzhparams.history_fill_val = 0x20;
733 fmtutil_decompress_lh5x(c, dcmpri, dcmpro, dres, &lzhparams);
736 // Compression method will be selected based on id_raw[3] (which
737 // should be '4'...'8'), etc.
738 static void decompress_lh5x_auto(deark *c, lctx *d, struct member_data *md,
739 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
740 struct de_dfilter_results *dres)
742 int fmt;
744 switch(md->cmi->id_raw[3]) {
745 case '4': case '5':
746 fmt = DE_LH5X_FMT_LH5;
747 break;
748 case '6':
749 fmt = DE_LH5X_FMT_LH6;
750 break;
751 case '7':
752 if(d->lhark_fmt) {
753 fmt = DE_LH5X_FMT_LHARK;
755 else {
756 fmt = DE_LH5X_FMT_LH7;
758 break;
759 case '8':
760 fmt = DE_LH5X_FMT_LH7;
761 break;
762 default:
763 return;
766 decompress_lh5x_internal(c, d, dcmpri, dcmpro, dres, fmt);
769 static void decompress_lh5(deark *c, lctx *d, struct member_data *md,
770 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
771 struct de_dfilter_results *dres)
773 decompress_lh5x_internal(c, d, dcmpri, dcmpro, dres, DE_LH5X_FMT_LH5);
776 static void decompress_lz5(deark *c, lctx *d, struct member_data *md,
777 struct de_dfilter_in_params *dcmpri, struct de_dfilter_out_params *dcmpro,
778 struct de_dfilter_results *dres)
780 fmtutil_decompress_szdd(c, dcmpri, dcmpro, dres, 0x1);
783 struct cmpr_meth_array_item {
784 unsigned int flags;
785 u32 uniq_id;
786 const char *descr;
787 decompressor_fn decompressor;
790 // Compression methods with a decompressor or a description are usually
791 // listed here, but note that it is also possible for get_cmpr_meth_info()
792 // to handle them procedurally.
793 static const struct cmpr_meth_array_item cmpr_meth_arr[] = {
794 { 0x00, CODE_lhd, "directory", NULL },
795 { 0x00, CODE_lh0, "uncompressed", decompress_uncompressed },
796 { 0x00, CODE_lh1, "LZ77-4K, adaptive Huffman", decompress_lh1 },
797 { 0x00, CODE_lh4, NULL, decompress_lh5x_auto },
798 { 0x00, CODE_lh5, "LZ77-8K, static Huffman", decompress_lh5 },
799 { 0x00, CODE_lh6, "LZ77-32K, static Huffman", decompress_lh5x_auto },
800 { 0x00, CODE_lh7, NULL, decompress_lh5x_auto },
801 { 0x00, CODE_lh8, NULL, decompress_lh5x_auto },
802 { 0x00, CODE_lz4, "uncompressed (LArc)", decompress_uncompressed },
803 { 0x00, CODE_lz5, "LZSS-4K (LArc)", decompress_lz5 },
804 { 0x00, CODE_pm0, "uncompressed (PMArc)", decompress_uncompressed },
805 { 0x00, CODE_lZ0, "uncompressed (MicroFox PUT)", decompress_uncompressed },
806 { 0x00, CODE_lZ1, "MicroFox PUT lZ1", decompress_lh1 },
807 { 0x00, CODE_lZ5, "MicroFox PUT lZ5", decompress_lh5 },
808 { 0x00, CODE_S_LH0, "uncompressed (SAR)", decompress_uncompressed },
809 { 0x00, CODE_S_LH5, "SAR LH5", decompress_lh5 },
810 { 0x00, CODE_sw0, NULL, decompress_uncompressed },
811 { 0x00, CODE_sw1, NULL, NULL }
814 static const u32 other_known_cmpr_methods[] = {
815 CODE_ah0, CODE_ari, CODE_hf0,
816 CODE_lh2, CODE_lh3, CODE_lh4, CODE_lh7, CODE_lh8, CODE_lh9,
817 CODE_lha, CODE_lhb, CODE_lhc, CODE_lhe, CODE_lhx, CODE_lx1,
818 CODE_lz2, CODE_lz3, CODE_lz7, CODE_lz8, CODE_lzs,
819 CODE_pc1, CODE_pm1, CODE_pm2 };
821 // Only call this after is_possible_cmpr_meth() return nonzero.
822 // Caller allocates cmi, and initializes to zeroes.
823 static void get_cmpr_meth_info(const u8 idbuf[5], struct cmpr_meth_info *cmi)
825 size_t k;
826 const struct cmpr_meth_array_item *cmai = NULL;
828 // The first 4 bytes are unique for all known methods.
829 cmi->uniq_id = (u32)de_getu32be_direct(idbuf);
831 de_memcpy(cmi->id_raw, idbuf, 5);
833 // All "possible" methods only use printable characters.
834 de_memcpy(cmi->id_printable_sz, idbuf, 5);
835 cmi->id_printable_sz[5] = '\0';
837 for(k=0; k<DE_ARRAYCOUNT(cmpr_meth_arr); k++) {
838 if(cmpr_meth_arr[k].uniq_id == cmi->uniq_id) {
839 cmai = &cmpr_meth_arr[k];
840 break;
844 if(cmai) {
845 cmi->is_recognized = 1;
846 cmi->decompressor = cmai->decompressor;
848 else {
849 for(k=0; k<DE_ARRAYCOUNT(other_known_cmpr_methods); k++) {
850 if(other_known_cmpr_methods[k] == cmi->uniq_id) {
851 cmi->is_recognized = 1;
852 break;
857 if(cmai && cmai->descr) {
858 de_strlcpy(cmi->descr, cmai->descr, sizeof(cmi->descr));
860 else if(cmi->is_recognized) {
861 de_strlcpy(cmi->descr, "recognized, but no info avail.", sizeof(cmi->descr));
863 else {
864 de_strlcpy(cmi->descr, "?", sizeof(cmi->descr));
868 static void our_writelistener_cb(dbuf *f, void *userdata, const u8 *buf, i64 buf_len)
870 struct de_crcobj *crco = (struct de_crcobj*)userdata;
871 de_crcobj_addbuf(crco, buf, buf_len);
874 static void do_extract_file(deark *c, lctx *d, struct member_data *md)
876 de_finfo *fi = NULL;
877 dbuf *outf = NULL;
878 u32 crc_calc;
879 int tsidx;
880 u8 dcmpr_disabled = 0;
881 u8 dcmpr_attempted = 0;
882 u8 dcmpr_ok = 0;
883 struct de_dfilter_in_params dcmpri;
884 struct de_dfilter_out_params dcmpro;
885 struct de_dfilter_results dres;
887 if(!md->cmi) goto done;
889 if(md->is_special) {
890 de_dbg(c, "[not extracting special file]");
891 goto done;
893 else if(md->is_dir) {
896 else if((!md->cmi->decompressor) || dcmpr_disabled) {
897 de_err(c, "%s: Unsupported compression method '%s'",
898 ucstring_getpsz_d(md->fullfilename), md->cmi->id_printable_sz);
899 goto done;
902 fi = de_finfo_create(c);
904 for(tsidx=0; tsidx<DE_TIMESTAMPIDX_COUNT; tsidx++) {
905 if(md->tsdata[tsidx].ts.is_valid) {
906 fi->timestamp[tsidx] = md->tsdata[tsidx].ts;
910 if(md->is_dir) {
911 fi->is_directory = 1;
913 else if(md->is_executable) {
914 fi->mode_flags |= DE_MODEFLAG_EXE;
916 else if(md->is_nonexecutable) {
917 fi->mode_flags |= DE_MODEFLAG_NONEXE;
920 de_finfo_set_name_from_ucstring(c, fi, md->fullfilename, DE_SNFLAG_FULLPATH);
921 fi->original_filename_flag = 1;
923 outf = dbuf_create_output_file(c, NULL, fi, 0x0);
924 de_crcobj_reset(d->crco);
925 dbuf_set_writelistener(outf, our_writelistener_cb, (void*)d->crco);
927 de_dfilter_init_objects(c, &dcmpri, &dcmpro, &dres);
928 dcmpri.f = c->infile;
929 dcmpri.pos = md->compressed_data_pos;
930 dcmpri.len = md->compressed_data_len;
931 dcmpro.f = outf;
932 dcmpro.expected_len = md->orig_size;
933 dcmpro.len_known = 1;
935 if(md->is_dir) goto done; // For directories, we're done.
937 dcmpr_attempted = 1;
938 if(md->cmi->decompressor) {
939 md->cmi->decompressor(c, d, md, &dcmpri, &dcmpro, &dres);
942 if(dres.errcode) {
943 de_err(c, "%s: Decompression failed: %s", ucstring_getpsz_d(md->fullfilename),
944 de_dfilter_get_errmsg(c, &dres));
945 goto done;
948 crc_calc = de_crcobj_getval(d->crco);
949 de_dbg(c, "crc (calculated): 0x%04x", (unsigned int)crc_calc);
950 if(crc_calc != md->crc16) {
951 de_err(c, "%s: CRC check failed", ucstring_getpsz_d(md->fullfilename));
952 goto done;
955 dcmpr_ok = 1;
957 done:
958 if(dcmpr_attempted && md->cmi && md->cmi->uniq_id==CODE_lh7) {
959 if(dcmpr_ok)
960 d->lh7_success_flag = 1;
961 else
962 d->lh7_failed_flag = 1;
964 dbuf_close(outf);
965 de_finfo_destroy(c, fi);
968 static int cksum_cbfn(struct de_bufferedreadctx *brctx, const u8 *buf, i64 buf_len)
970 UI *pcksum = (UI*)brctx->userdata;
971 i64 i;
973 for(i=0; i<buf_len; i++) {
974 *pcksum = (*pcksum + buf[i]) & 0xff;
976 return 1;
979 static void do_check_header_crc(deark *c, lctx *d, struct member_data *md)
981 // LHA members don't have to have a header CRC field, though it's probably
982 // considered best practice to have one when the checksum field doesn't
983 // exist, or there are any extended headers.
984 if(!md->have_hdr_crc_reported) return;
985 de_crcobj_reset(d->crco);
987 // Everything before the CRC field:
988 de_crcobj_addslice(d->crco, c->infile, md->member_pos,
989 md->hdr_crc_field_pos - md->member_pos);
991 // The zeroed-out CRC field:
992 de_crcobj_addzeroes(d->crco, 2);
994 // Everything after the CRC field:
995 de_crcobj_addslice(d->crco, c->infile, md->hdr_crc_field_pos+2,
996 md->compressed_data_pos - (md->hdr_crc_field_pos+2));
998 md->hdr_crc_calc = de_crcobj_getval(d->crco);
999 de_dbg(c, "header crc (calculated): 0x%04x", (UI)md->hdr_crc_calc);
1000 if(md->hdr_crc_calc != md->hdr_crc_reported) {
1001 de_err(c, "Wrong header CRC: reported=0x%04x, calculated=0x%04x",
1002 (UI)md->hdr_crc_reported, (UI)md->hdr_crc_calc);
// The possible results of lha_classify_whats_next(): what kind of thing
// seems to be at a given position in the file.
enum lha_whats_next_enum {
	LHA_WN_MEMBER, // Looks like the start of a member
	LHA_WN_TRAILER, // A zero byte, with at most one more byte after it
	LHA_WN_TRAILER_AND_JUNK, // Note: No longer handled differently from TRAILER
	LHA_WN_JUNK, // Something that is neither a member nor a trailer
	LHA_WN_NOTHING // No data at all
};
1014 static enum lha_whats_next_enum lha_classify_whats_next(deark *c, lctx *d, i64 pos, i64 len)
1016 u8 b[21];
1017 u8 hlev;
1019 if(len<=0) return LHA_WN_NOTHING;
1020 b[0] = de_getbyte(pos);
1021 if(b[0]==0 && len<=2) return LHA_WN_TRAILER;
1022 if(b[0]==0 && len<21) return LHA_WN_TRAILER_AND_JUNK;
1023 de_read(&b[1], pos+1, sizeof(b)-1);
1024 if(d->swg_fmt) hlev = 0;
1025 else hlev = b[20];
1026 if(b[0]==0 && b[1]==0) return LHA_WN_TRAILER_AND_JUNK;
1027 if(b[0]==0 && hlev!=2) return LHA_WN_TRAILER_AND_JUNK;
1028 if(hlev>3) return LHA_WN_JUNK;
1029 if(is_possible_cmpr_meth(&b[2])) return LHA_WN_MEMBER;
1030 return LHA_WN_JUNK;
1033 static void do_swg_string_field(deark *c, lctx *d,
1034 de_ucstring *s, i64 pos, i64 fldlen, const char *name)
1036 i64 dlen = (i64)de_getbyte(pos);
1037 if(dlen>fldlen-1) dlen = fldlen-1;
1038 ucstring_empty(s);
1039 dbuf_read_to_ucstring(c->infile, pos+1, dlen, s, 0, d->input_encoding);
1040 ucstring_strip_trailing_spaces(s);
1041 de_dbg(c, "SWG %s: \"%s\"", name, ucstring_getpsz_d(s));
1044 static void do_special_swg_fields(deark *c, lctx *d, struct member_data *md, i64 pos1)
1046 i64 pos = pos1;
1047 u32 crc32;
1048 de_ucstring *s = NULL;
1050 crc32 = (u32)de_getu32le_p(&pos);
1051 de_dbg(c, "SWG crc32 (reported): 0x%08x", (UI)crc32);
1052 s = ucstring_create(c);
1053 do_swg_string_field(c, d, s, pos, 13, "stored filename");
1054 pos += 13;
1055 do_swg_string_field(c, d, s, pos, 41, "subject");
1056 pos += 41;
1057 do_swg_string_field(c, d, s, pos, 36, "contributor");
1058 pos += 36;
1059 do_swg_string_field(c, d, s, pos, 71, "search keys");
1060 ucstring_destroy(s);
1063 // This single function parses all the different header formats, using lots of
1064 // "if" statements. It is messy, but it's a no-win situation.
1065 // The alternative of four separate functions would have a lot of redundant
1066 // code, and be harder to maintain.
1068 // Caller allocates and initializes md.
1069 // If the member was successfully parsed, sets md->total_size and returns nonzero.
1070 static int do_read_member(deark *c, lctx *d, struct member_data *md)
1072 int retval = 0;
1073 i64 lev0_header_size = 0;
1074 i64 lev1_base_header_size = 0;
1075 i64 lev1_skip_size = 0;
1076 i64 lev2_total_header_size = 0;
1077 i64 lev3_header_size = 0;
1078 i64 pos1 = md->member_pos;
1079 i64 pos = pos1;
1080 i64 nbytes_avail;
1081 i64 exthdr_bytes_consumed = 0;
1082 i64 fnlen = 0;
1083 UI attribs;
1084 UI hdr_checksum_reported = 0;
1085 u8 has_hdr_checksum = 0;
1086 int is_compressed;
1087 int ret;
1088 enum lha_whats_next_enum wn;
1089 u8 cmpr_meth_raw[5];
1090 int saved_indent_level;
1092 de_dbg_indent_save(c, &saved_indent_level);
1094 nbytes_avail = c->infile->len - pos1;
1095 wn = lha_classify_whats_next(c, d, pos1, nbytes_avail);
1096 if(wn!=LHA_WN_MEMBER) {
1097 if(d->member_count==0) {
1098 de_err(c, "Not an LHA file");
1100 else if(wn==LHA_WN_TRAILER || wn==LHA_WN_TRAILER_AND_JUNK) {
1101 d->trailer_found = 1;
1102 d->trailer_pos = pos1;
1103 de_dbg(c, "trailer at %"I64_FMT, d->trailer_pos);
1105 else if(wn==LHA_WN_JUNK) {
1106 de_warn(c, "%"I64_FMT" bytes of non-LHA data found at end of file (offset %"I64_FMT")",
1107 nbytes_avail, pos1);
1109 goto done;
1112 de_dbg(c, "member at %"I64_FMT, pos1);
1113 de_dbg_indent(c, 1);
1115 // Look ahead to figure out the header format version.
1116 // This byte was originally the high byte of the "MS-DOS file attribute" field,
1117 // which happened to always be zero.
1118 // In later LHA versions, it is overloaded to identify the header format
1119 // version (called "header level" in LHA jargon).
1120 if(d->swg_fmt) {
1121 md->hlev = 0; // SWG is most similar to header level 0
1123 else {
1124 md->hlev = de_getbyte(pos1+20);
1126 de_dbg(c, "header level: %d", (int)md->hlev);
1127 if(md->hlev>3) {
1128 goto done; // Shouldn't be possible; checked in lha_classify_whats_next().
1131 if(d->member_count==0) {
1132 d->hlev_of_first_member = md->hlev;
1135 if(md->hlev==0) {
1136 lev0_header_size = (i64)de_getbyte_p(&pos);
1137 de_dbg(c, "header size: (2+)%d", (int)lev0_header_size);
1138 hdr_checksum_reported = (UI)de_getbyte_p(&pos);
1139 has_hdr_checksum = 1;
1140 dbuf_buffered_read(c->infile, pos, lev0_header_size, cksum_cbfn, (void*)&md->hdr_checksum_calc);
1142 else if(md->hlev==1) {
1143 lev1_base_header_size = (i64)de_getbyte_p(&pos);
1144 de_dbg(c, "base header size: %d", (int)lev1_base_header_size);
1145 hdr_checksum_reported = (UI)de_getbyte_p(&pos);
1146 has_hdr_checksum = 1;
1147 dbuf_buffered_read(c->infile, pos, lev1_base_header_size, cksum_cbfn, (void*)&md->hdr_checksum_calc);
1149 else if(md->hlev==2) {
1150 lev2_total_header_size = de_getu16le_p(&pos);
1151 de_dbg(c, "total header size: %d", (int)lev2_total_header_size);
1153 else if(md->hlev==3) {
1154 i64 lev3_word_size;
1155 lev3_word_size = de_getu16le_p(&pos);
1156 de_dbg(c, "word size: %d", (int)lev3_word_size);
1157 if(lev3_word_size!=4) {
1158 de_err(c, "Unsupported word size: %d", (int)lev3_word_size);
1159 goto done;
1163 if(has_hdr_checksum) {
1164 de_dbg(c, "header checksum (reported): 0x%02x", hdr_checksum_reported);
1165 de_dbg(c, "header checksum (calculated): 0x%02x", md->hdr_checksum_calc);
1166 if(md->hdr_checksum_calc != hdr_checksum_reported) {
1167 de_err(c, "Wrong header checksum: reported=0x%02x, calculated=0x%02x",
1168 hdr_checksum_reported, md->hdr_checksum_calc);
1172 de_read(cmpr_meth_raw, pos, 5);
1173 md->cmi = de_malloc(c, sizeof(struct cmpr_meth_info));
1174 get_cmpr_meth_info(cmpr_meth_raw, md->cmi);
1175 de_dbg(c, "cmpr method: '%s' (%s)", md->cmi->id_printable_sz, md->cmi->descr);
1176 pos+=5;
1178 if(md->cmi->uniq_id == CODE_lhd) {
1179 is_compressed = 0;
1180 md->is_dir = 1;
1182 else if(md->cmi->decompressor == decompress_uncompressed) {
1183 is_compressed = 0;
1185 else {
1186 is_compressed = 1;
1189 if(md->hlev==1) {
1190 // lev1_skip_size is the distance from the third byte of the extended
1191 // header section, to the end of the compressed data.
1192 lev1_skip_size = de_getu32le_p(&pos);
1193 de_dbg(c, "skip size: %u", (unsigned int)lev1_skip_size);
1194 md->total_size = 2 + lev1_base_header_size + lev1_skip_size;
1196 else {
1197 md->compressed_data_len = de_getu32le(pos);
1198 de_dbg(c, "compressed size: %"I64_FMT, md->compressed_data_len);
1199 pos += 4;
1201 if(md->hlev==0) {
1202 md->total_size = 2 + lev0_header_size + md->compressed_data_len;
1204 else if(md->hlev==2) {
1205 md->total_size = lev2_total_header_size + md->compressed_data_len;
1209 md->orig_size = de_getu32le(pos);
1210 de_dbg(c, "original size: %u", (unsigned int)md->orig_size);
1211 pos += 4;
1213 if(md->hlev==0 || md->hlev==1) {
1214 read_msdos_modtime(c, d, md, pos, "last-modified");
1215 pos += 4; // modification time/date (MS-DOS)
1217 else if(md->hlev==2 || md->hlev==3) {
1218 read_unix_timestamp(c, d, md, pos, DE_TIMESTAMPIDX_MODIFY, "last-modified");
1219 pos += 4; // Unix time
1222 if(md->hlev==0) {
1223 de_ucstring *attr_descr;
1225 // Normally, the high byte can only be 0 here, because it's
1226 // also the header level.
1227 attribs = (UI)de_getu16le_p(&pos);
1229 attr_descr = ucstring_create(c);
1230 de_describe_dos_attribs(c, attribs, attr_descr, 0);
1231 de_dbg(c, "attribs: 0x%04x (%s)", attribs, ucstring_getpsz_d(attr_descr));
1232 ucstring_destroy(attr_descr);
1234 else {
1235 attribs = (UI)de_getbyte_p(&pos);
1236 de_dbg(c, "obsolete attribs low byte: 0x%02x", attribs);
1237 pos++; // header level, already handled
1240 if(d->swg_fmt) {
1241 do_special_swg_fields(c, d, md, pos);
1242 pos += 165;
1245 if(md->hlev<=1) {
1246 fnlen = de_getbyte(pos++);
1247 de_dbg(c, "filename len: %d", (int)fnlen);
1248 if(md->hlev==0) {
1249 read_filename_hlev0(c, d, md, pos, fnlen);
1251 else {
1252 read_filename_hlev1_or_exthdr(c, d, md, pos, fnlen);
1254 pos += fnlen;
1257 md->crc16 = (u32)de_getu16le_p(&pos);
1258 de_dbg(c, "crc16 (reported): 0x%04x", (unsigned int)md->crc16);
1260 if(md->hlev==1 || md->hlev==2 || md->hlev==3) {
1261 md->os_id = de_getbyte_p(&pos);
1262 de_dbg(c, "OS id: %d ('%c') (%s)", (int)md->os_id,
1263 de_byte_to_printable_char(md->os_id), get_os_name(md->os_id));
1266 if(md->hlev==3) {
1267 lev3_header_size = de_getu32le_p(&pos);
1268 md->total_size = lev3_header_size + md->compressed_data_len;
1271 if(md->hlev==0) {
1272 i64 ext_headers_size = (2+lev0_header_size) - (pos-pos1);
1273 md->compressed_data_pos = pos1 + 2 + lev0_header_size;
1274 if(ext_headers_size>0) {
1275 de_dbg(c, "extended header area at %d, len=%d", (int)pos, (int)ext_headers_size);
1276 de_dbg_indent(c, 1);
1277 do_lev0_ext_area(c, d, md, pos, ext_headers_size);
1278 de_dbg_indent(c, -1);
1281 else if(md->hlev==1) {
1282 i64 first_ext_hdr_size;
1284 // The last two bytes of the base header are the size of the first ext. header.
1285 pos = pos1 + 2 + lev1_base_header_size - 2;
1286 // TODO: sanitize pos?
1287 first_ext_hdr_size = de_getu16le_p(&pos);
1288 de_dbg(c, "first ext hdr size: %d", (int)first_ext_hdr_size);
1290 ret = do_read_ext_headers(c, d, md, pos, lev1_skip_size, first_ext_hdr_size,
1291 &exthdr_bytes_consumed);
1293 if(!ret) {
1294 de_err(c, "Error parsing extended headers at %d. Cannot extract this file.",
1295 (int)pos);
1296 retval = 1;
1297 goto done;
1300 pos += exthdr_bytes_consumed;
1301 md->compressed_data_pos = pos;
1302 md->compressed_data_len = lev1_skip_size - exthdr_bytes_consumed;
1304 else if(md->hlev==2) {
1305 i64 first_ext_hdr_size;
1307 if(md->os_id=='K') {
1308 // So that some lhasa test files will work.
1309 // TODO: The extended headers section is (usually?) self-terminating, so we
1310 // should be able to parse it and figure out if this bug is present. That
1311 // would be better than just guessing.
1312 lev2_total_header_size += 2;
1313 md->total_size = lev2_total_header_size + md->compressed_data_len;
1314 de_dbg(c, "attempting bug workaround: changing total header size to %d",
1315 (int)lev2_total_header_size);
1318 md->compressed_data_pos = pos1+lev2_total_header_size;
1320 first_ext_hdr_size = de_getu16le_p(&pos);
1321 de_dbg(c, "first ext hdr size: %d", (int)first_ext_hdr_size);
1323 do_read_ext_headers(c, d, md, pos, pos1+lev2_total_header_size-pos,
1324 first_ext_hdr_size, &exthdr_bytes_consumed);
1326 else if(md->hlev==3) {
1327 i64 first_ext_hdr_size;
1329 md->compressed_data_pos = pos1+lev3_header_size;
1331 first_ext_hdr_size = de_getu32le_p(&pos);
1332 de_dbg(c, "first ext hdr size: %d", (int)first_ext_hdr_size);
1334 do_read_ext_headers(c, d, md, pos, pos1+lev3_header_size-pos,
1335 first_ext_hdr_size, &exthdr_bytes_consumed);
1338 do_check_header_crc(c, d, md);
1340 de_dbg(c, "member data (%scompressed) at %"I64_FMT", len=%"I64_FMT,
1341 is_compressed?"":"un",
1342 md->compressed_data_pos, md->compressed_data_len);
1344 make_fullfilename(c, d, md);
1346 de_dbg_indent(c, 1);
1347 do_extract_file(c, d, md);
1348 de_dbg_indent(c, -1);
1350 retval = 1;
1351 done:
1352 de_dbg_indent_restore(c, saved_indent_level);
1353 return retval;
1356 static void do_swg_footer(deark *c, lctx *d, i64 pos1)
1358 i64 pos = pos1;
1359 i64 n;
1360 de_ucstring *s = NULL;
1362 de_dbg(c, "SWG footer at %"I64_FMT, pos1);
1363 de_dbg_indent(c, 1);
1364 s = ucstring_create(c);
1365 do_swg_string_field(c, d, s, pos, 61, "message");
1366 pos += 61;
1367 do_swg_string_field(c, d, s, pos, 66, "title");
1368 pos += 66;
1369 n = de_getu16le_p(&pos);
1370 de_dbg(c, "SWG number of items: %d", (int)n);
1371 de_dbg_indent(c, -1);
1372 ucstring_destroy(s);
1375 static void do_lha_footer(deark *c, lctx *d)
1377 i64 extra_bytes_pos, extra_bytes_len;
1379 if(!d->trailer_found) goto done;
1380 extra_bytes_pos = d->trailer_pos+1;
1381 extra_bytes_len = c->infile->len - extra_bytes_pos;
1382 if(extra_bytes_len<=1) goto done;
1384 if(d->swg_fmt && extra_bytes_len==129) {
1385 do_swg_footer(c, d, extra_bytes_pos);
1386 goto done;
1389 de_info(c, "Note: %"I64_FMT" extra bytes at end of file (offset %"I64_FMT")",
1390 extra_bytes_len, extra_bytes_pos);
1391 done:
1395 static void do_run_lha_internal(deark *c, de_module_params *mparams, int is_swg)
1397 lctx *d = NULL;
1398 i64 pos;
1399 struct member_data *md = NULL;
1401 d = de_malloc(c, sizeof(lctx));
1402 if(is_swg) d->swg_fmt = 1;
1404 d->lhark_fmt = (u8)de_get_ext_option_bool(c, "lha:lhark", 0);
1406 // It's not really safe to guess CP437, because Japanese-encoded (CP932?)
1407 // filenames are common.
1408 d->input_encoding = de_get_input_encoding(c, NULL, DE_ENCODING_ASCII);
1410 d->hlev_of_first_member = 0xff;
1411 d->crco = de_crcobj_create(c, DE_CRCOBJ_CRC16_ARC);
1413 pos = 0;
1414 while(1) {
1415 if(pos >= c->infile->len) break;
1417 md = de_malloc(c, sizeof(struct member_data));
1418 md->encoding = d->input_encoding;
1419 md->member_pos = pos;
1420 if(!do_read_member(c, d, md)) goto done;
1421 if(md->total_size<1) goto done;
1423 d->member_count++;
1424 pos += md->total_size;
1426 destroy_member_data(c, md);
1427 md = NULL;
1430 done:
1431 do_lha_footer(c, d);
1433 destroy_member_data(c, md);
1434 if(d) {
1435 if(!d->lhark_fmt && d->hlev_of_first_member==1 && d->lh7_failed_flag &&
1436 !d->lh7_success_flag)
1438 de_info(c, "Note: 'lh7' decompression failed. Maybe this file uses "
1439 "LHARK compression. Try \"-opt lha:lhark\".");
1442 de_crcobj_destroy(d->crco);
1443 de_free(c, d);
1447 static void de_run_lha(deark *c, de_module_params *mparams)
1449 do_run_lha_internal(c, mparams, 0);
1452 static int is_swg_sig(const u8 *b)
1454 return b[0]=='-' && b[1]=='s' && b[2]=='w' &&
1455 (b[3]=='0' || b[3]=='1') && b[4]=='-';
1458 static int de_identify_lha(deark *c)
1460 int has_ext = 0;
1461 u8 b[22];
1462 struct cmpr_meth_info cmi;
1464 de_read(b, 0, sizeof(b));
1465 if(b[20]>3) return 0; // header level
1467 if(!is_possible_cmpr_meth(&b[2])) return 0;
1468 if(is_swg_sig(&b[2])) return 0; // Handled by the swg module
1470 if(b[20]==0) {
1471 if(b[0]<22) return 0;
1472 if(22 + (int)b[21] + 2 > 2 + (int)b[0]) return 0;
1474 else if(b[20]==1) {
1475 if(b[0]<25) return 0;
1476 if(22 + (int)b[21] + 5 > 2 + (int)b[0]) return 0;
1478 else if(b[20]==2) {
1479 i64 hsize = de_getu16le_direct(&b[0]);
1480 if(hsize < 26) return 0;
1482 else if(b[20]==3) {
1483 if((b[0]!=4 && b[0]!=8) || b[1]!=0) return 0;
1486 de_zeromem(&cmi, sizeof(struct cmpr_meth_info));
1487 get_cmpr_meth_info(&b[2], &cmi);
1488 if(!cmi.is_recognized) {
1489 return 0;
1492 if(de_input_file_has_ext(c, "lzh") ||
1493 de_input_file_has_ext(c, "lha"))
1495 has_ext = 1;
1498 if(has_ext) return 100;
1499 return 80; // Must be less than car_lha
1502 static void de_help_lha(deark *c)
1504 de_msg(c, "-opt lha:lhark : Enable LHARK mode (for 'lh7' compression)");
1507 void de_module_lha(deark *c, struct deark_module_info *mi)
1509 mi->id = "lha";
1510 mi->desc = "LHA/LZH/PMA archive";
1511 mi->run_fn = de_run_lha;
1512 mi->identify_fn = de_identify_lha;
1513 mi->help_fn = de_help_lha;
1516 /////////////////////// SWG / SWAG
1518 // This module works almost just like lha, except that all members are assumed
1519 // to use the SWG header format. (For lha, the SWG header format is never used.)
1521 static void de_run_swg(deark *c, de_module_params *mparams)
1523 de_declare_fmt(c, "SWAG packet (LHA-like)");
1524 // TODO?: Some diagnostic messages in the LHA module are hardcoded to
1525 // say "LHA", which is likely to be confusing to uses who know this as
1526 // SWG format.
1527 do_run_lha_internal(c, mparams, 1);
1530 static int de_identify_swg(deark *c)
1532 u8 b[5];
1534 de_read(b, 2, sizeof(b));
1535 if(is_swg_sig(b)) {
1536 if(de_input_file_has_ext(c, "swg")) return 100;
1537 return 90;
1539 return 0;
1542 void de_module_swg(deark *c, struct deark_module_info *mi)
1544 mi->id = "swg";
1545 mi->desc = "SWAG packet";
1546 mi->run_fn = de_run_swg;
1547 mi->identify_fn = de_identify_swg;
1550 /////////////////////// CAR (MylesHi!)
1552 struct car_member_data {
1553 i64 member_pos;
1554 i64 total_size;
1555 UI hdr_checksum_calc;
1558 struct car_ctx {
1559 dbuf *hdr_tmp;
1560 dbuf *lha_outf;
1563 static int looks_like_car_member(deark *c, i64 pos)
1565 u8 b[16];
1567 de_read(b, pos, 16);
1568 if(b[2]!='-' || b[3]!='l'|| b[4]!='h' || b[6]!='-') return 0;
1569 if(b[5]!='0' && b[5]!='5') return 0;
1570 if((int)b[0] != (int)b[15] + 25) return 0;
1571 if(dbuf_memcmp(c->infile, pos + (i64)b[15] + 24, (const u8*)"\x20\x00\x00", 3)) return 0;
1572 return 1;
1575 static int do_car_member(deark *c, struct car_ctx *d, struct car_member_data *md)
1577 i64 lev1_base_header_size;
1578 i64 fnlen;
1579 i64 hdr_endpos;
1580 i64 compressed_data_len;
1581 i64 pos1 = md->member_pos;
1582 int retval = 0;
1583 int saved_indent_level;
1585 de_dbg_indent_save(c, &saved_indent_level);
1586 de_dbg(c, "member at %"I64_FMT, pos1);
1587 de_dbg_indent(c, 1);
1589 // Figure out where everything is...
1590 lev1_base_header_size = (i64)de_getbyte(pos1);
1591 de_dbg(c, "base header size: %d", (int)lev1_base_header_size);
1592 hdr_endpos = pos1 + 2 + lev1_base_header_size;
1593 fnlen = lev1_base_header_size - 25;
1594 de_dbg(c, "implied filename len: %d", (int)fnlen);
1595 if(fnlen<0) goto done;
1597 compressed_data_len = de_getu32le(pos1 + 7);
1598 de_dbg(c, "compressed size: %"I64_FMT, compressed_data_len);
1599 if(hdr_endpos + compressed_data_len > c->infile->len) goto done;
1601 // Convert to an LHA level-1 header
1602 dbuf_empty(d->hdr_tmp);
1604 // Fields through uncmpr_size are the same (we'll patch the checksum later)
1605 dbuf_copy(c->infile, pos1, 15, d->hdr_tmp);
1607 dbuf_copy(c->infile, hdr_endpos-7, 4, d->hdr_tmp); // timestamp
1609 // attribute (low byte)
1610 dbuf_copy(c->infile, hdr_endpos-9, 1, d->hdr_tmp);
1611 dbuf_writebyte(d->hdr_tmp, 0x01); // level identifier
1613 // Fields starting with filename length, through crc
1614 dbuf_copy(c->infile, pos1+15, 1+fnlen+2, d->hdr_tmp);
1616 dbuf_writebyte(d->hdr_tmp, 77); // OS ID = 'M' = MS-DOS
1618 // Recalculate checksum
1619 dbuf_buffered_read(d->hdr_tmp, 2, lev1_base_header_size, cksum_cbfn,
1620 (void*)&md->hdr_checksum_calc);
1621 de_dbg(c, "header checksum (calculated): 0x%02x", md->hdr_checksum_calc);
1622 dbuf_writebyte_at(d->hdr_tmp, 1, (u8)md->hdr_checksum_calc);
1623 dbuf_truncate(d->hdr_tmp, 2+lev1_base_header_size);
1625 // Write everything out
1626 dbuf_copy(d->hdr_tmp, 0, d->hdr_tmp->len, d->lha_outf);
1627 de_dbg(c, "member data at %"I64_FMT", len=%"I64_FMT, hdr_endpos, compressed_data_len);
1628 dbuf_copy(c->infile, hdr_endpos, compressed_data_len, d->lha_outf);
1629 md->total_size = (hdr_endpos-md->member_pos) + compressed_data_len;
1630 retval = 1;
1632 done:
1633 de_dbg_indent_restore(c, saved_indent_level);
1634 return retval;
1637 static void de_run_car_lha(deark *c, de_module_params *mparams)
1639 struct car_ctx *d = NULL;
1640 struct car_member_data *md = NULL;
1641 int ok = 0;
1642 i64 pos = 0;
1644 d = de_malloc(c, sizeof(struct car_ctx));
1646 if(!looks_like_car_member(c, 0)) {
1647 de_err(c, "Not a CAR file");
1648 goto done;
1651 d->lha_outf = dbuf_create_output_file(c, "lha", NULL, 0);
1652 d->hdr_tmp = dbuf_create_membuf(c, 0, 0);
1654 md = de_malloc(c, sizeof(struct car_member_data));
1655 while(1) {
1656 if(de_getbyte(pos)==0) {
1657 de_dbg(c, "trailer at %"I64_FMT, pos);
1658 dbuf_writebyte(d->lha_outf, 0);
1659 ok = 1;
1660 break;
1662 if(pos+27 > c->infile->len) goto done;
1663 if(!looks_like_car_member(c, pos)) goto done;
1665 de_zeromem(md, sizeof(struct car_member_data));
1666 md->member_pos = pos;
1667 if(!do_car_member(c, d, md)) goto done;
1668 pos += md->total_size;
1671 done:
1672 de_free(c, md);
1673 if(d) {
1674 if(d->lha_outf) {
1675 dbuf_close(d->lha_outf);
1676 if(!ok) {
1677 de_err(c, "Conversion to LHA format failed");
1680 dbuf_close(d->hdr_tmp);
1681 de_free(c, d);
1685 static int de_identify_car_lha(deark *c)
1687 if(!de_input_file_has_ext(c, "car")) return 0;
1688 if(looks_like_car_member(c, 0)) {
1689 return 95;
1691 return 0;
1694 void de_module_car_lha(deark *c, struct deark_module_info *mi)
1696 mi->id = "car_lha";
1697 mi->desc = "CAR (MylesHi!) LHA-like archive";
1698 mi->run_fn = de_run_car_lha;
1699 mi->identify_fn = de_identify_car_lha;
1702 /////////////////////// ARX
1704 struct arx_member_data {
1705 i64 member_pos;
1706 i64 total_size;
1707 i64 hdr_endpos;
1708 i64 compressed_data_len;
1709 i64 unc_data_len;
1710 int is_uncompressed;
1711 u32 crc_calc;
1714 struct arx_ctx {
1715 dbuf *hdr_tmp;
1716 dbuf *lha_outf;
1717 struct de_crcobj *crco;
1720 static int looks_like_arx_member(deark *c, i64 pos)
1722 u8 b[22];
1724 de_read(b, pos, sizeof(b));
1725 if(b[2]!='-' || b[3]!='l'|| b[4]!='h' || b[6]!='-') return 0;
1726 if(b[21]!=0) return 0;
1727 return 1;
1730 // Decompress the file, discarding the output, just to figure out the CRC.
1731 static void arx_recalc_lh1(deark *c, struct arx_ctx *d, struct arx_member_data *md)
1733 dbuf *outf = NULL;
1734 struct de_dfilter_in_params dcmpri;
1735 struct de_dfilter_out_params dcmpro;
1736 struct de_dfilter_results dres;
1738 outf = dbuf_create_custom_dbuf(c, md->unc_data_len, 0);
1739 dbuf_set_writelistener(outf, our_writelistener_cb, (void*)d->crco);
1741 de_dfilter_init_objects(c, &dcmpri, &dcmpro, &dres);
1742 dcmpri.f = c->infile;
1743 dcmpri.pos = md->hdr_endpos;
1744 dcmpri.len = md->compressed_data_len;
1745 dcmpro.f = outf;
1746 dcmpro.expected_len = md->unc_data_len;
1747 dcmpro.len_known = 1;
1749 fmtutil_lh1_codectype1(c, &dcmpri, &dcmpro, &dres, NULL);
1751 dbuf_close(outf);
1754 static void arx_recalc_crc(deark *c, struct arx_ctx *d, struct arx_member_data *md)
1756 de_crcobj_reset(d->crco);
1757 if(md->is_uncompressed) {
1758 de_crcobj_addslice(d->crco, c->infile, md->hdr_endpos, md->compressed_data_len);
1760 else {
1761 arx_recalc_lh1(c, d, md);
1763 md->crc_calc = de_crcobj_getval(d->crco);
1766 static int do_arx_member(deark *c, struct arx_ctx *d, struct arx_member_data *md)
1768 i64 lev0_header_size;
1769 i64 pos1 = md->member_pos;
1770 UI hdr_checksum_calc = 0;
1771 u8 extra_crc_byte;
1772 int retval = 0;
1773 int saved_indent_level;
1775 de_dbg_indent_save(c, &saved_indent_level);
1776 de_dbg(c, "member at %"I64_FMT, pos1);
1777 de_dbg_indent(c, 1);
1779 lev0_header_size = (i64)de_getbyte(pos1);
1780 de_dbg(c, "header size: %d", (int)lev0_header_size);
1781 if(lev0_header_size<22) goto done;
1782 md->hdr_endpos = pos1 + 2 + lev0_header_size;
1784 md->compressed_data_len = de_getu32le(pos1+8);
1785 de_dbg(c, "compressed size: %"I64_FMT, md->compressed_data_len);
1787 md->unc_data_len = de_getu32le(pos1+12);
1788 de_dbg(c, "uncmpr. size: %"I64_FMT, md->unc_data_len);
1790 if(md->compressed_data_len==0) {
1791 md->is_uncompressed = 1;
1792 md->compressed_data_len = md->unc_data_len;
1795 if(md->hdr_endpos + md->compressed_data_len > c->infile->len) goto done;
1797 // Just the first byte of the CRC is present, and apparently not even
1798 // that for non-compressed files.
1799 extra_crc_byte = de_getbyte(md->hdr_endpos-1);
1800 de_dbg(c, "crc (reported): 0x??%02x", (UI)extra_crc_byte);
1802 // Find the correct CRC of the file data.
1803 arx_recalc_crc(c, d, md);
1804 de_dbg(c, "crc (calculated): 0x%04x", (UI)md->crc_calc);
1805 if(!md->is_uncompressed) {
1806 if((u8)(md->crc_calc & 0xff) != extra_crc_byte) {
1807 de_warn(c, "CRC mismatch. Conversion to LHA may have failed.");
1811 // Convert to an LHA header
1812 dbuf_empty(d->hdr_tmp);
1814 // Fields through cmpr meth. (We'll patch the checksum, and
1815 // compression method if necessary, later.)
1816 dbuf_copy(c->infile, pos1, 7, d->hdr_tmp);
1818 dbuf_writeu32le(d->hdr_tmp, md->compressed_data_len);
1819 dbuf_writeu32le(d->hdr_tmp, md->unc_data_len);
1821 /// This part of the header can be copied as-is.
1822 dbuf_copy(c->infile, pos1+8+8, lev0_header_size-1-6-8, d->hdr_tmp);
1824 // CRC
1825 dbuf_writebyte(d->hdr_tmp, (u8)(md->crc_calc & 0xff));
1826 dbuf_writebyte(d->hdr_tmp, (u8)((md->crc_calc & 0xff00)>>8));
1828 if(md->is_uncompressed) {
1829 dbuf_writebyte_at(d->hdr_tmp, 5, '0'); // lh1 -> lh0
1832 // Recalculate header checksum
1833 dbuf_buffered_read(d->hdr_tmp, 2, lev0_header_size, cksum_cbfn,
1834 (void*)&hdr_checksum_calc);
1835 de_dbg(c, "header checksum (calculated): 0x%02x", hdr_checksum_calc);
1836 dbuf_writebyte_at(d->hdr_tmp, 1, (u8)hdr_checksum_calc);
1837 dbuf_truncate(d->hdr_tmp, 2+lev0_header_size);
1839 // Write everything out
1840 dbuf_copy(d->hdr_tmp, 0, d->hdr_tmp->len, d->lha_outf);
1841 de_dbg(c, "member data at %"I64_FMT", len=%"I64_FMT, md->hdr_endpos, md->compressed_data_len);
1842 dbuf_copy(c->infile, md->hdr_endpos, md->compressed_data_len, d->lha_outf);
1843 md->total_size = 2 + lev0_header_size + md->compressed_data_len;
1844 retval = 1;
1846 done:
1847 de_dbg_indent_restore(c, saved_indent_level);
1848 return retval;
1851 static void de_run_arx(deark *c, de_module_params *mparams)
1853 struct arx_ctx *d = NULL;
1854 struct arx_member_data *md = NULL;
1855 int ok = 0;
1856 i64 pos = 0;
1858 d = de_malloc(c, sizeof(struct arx_ctx));
1860 if(!looks_like_arx_member(c, 0)) {
1861 de_err(c, "Not an ARX file");
1862 goto done;
1865 d->crco = de_crcobj_create(c, DE_CRCOBJ_CRC16_ARC);
1866 d->lha_outf = dbuf_create_output_file(c, "lha", NULL, 0);
1867 d->hdr_tmp = dbuf_create_membuf(c, 0, 0);
1869 md = de_malloc(c, sizeof(struct arx_member_data));
1870 while(1) {
1871 if(de_getbyte(pos)==0) {
1872 de_dbg(c, "trailer at %"I64_FMT, pos);
1873 dbuf_writebyte(d->lha_outf, 0);
1874 ok = 1;
1875 break;
1877 if(pos+27 > c->infile->len) goto done;
1878 if(!looks_like_arx_member(c, pos)) goto done;
1880 de_zeromem(md, sizeof(struct arx_member_data));
1881 md->member_pos = pos;
1882 if(!do_arx_member(c, d, md)) goto done;
1883 pos += md->total_size;
1886 done:
1887 de_free(c, md);
1888 if(d) {
1889 if(d->lha_outf) {
1890 dbuf_close(d->lha_outf);
1891 if(!ok) {
1892 de_err(c, "Conversion to LHA format failed");
1895 dbuf_close(d->hdr_tmp);
1896 de_crcobj_destroy(d->crco);
1897 de_free(c, d);
1901 static int de_identify_arx(deark *c)
1903 if(dbuf_memcmp(c->infile, 2, "-lh1-", 5)) return 0;
1904 if(de_getbyte(20)!=0x20 || de_getbyte(21)!=0x00) return 0;
1905 if(de_input_file_has_ext(c, "arx")) return 100;
1906 return 70;
1909 void de_module_arx(deark *c, struct deark_module_info *mi)
1911 mi->id = "arx";
1912 mi->desc = "ARX LHA-like archive";
1913 mi->run_fn = de_run_arx;
1914 mi->identify_fn = de_identify_arx;