lzhuf: Refactored to avoid direct array access
[deark.git] / modules / tar.c
blob52093d2c166da8366e59058ac65507c9399f6ccd
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // Tar archive format
7 #include <deark-config.h>
8 #include <deark-private.h>
9 DE_DECLARE_MODULE(de_module_tar);
11 // Represents a single physical header block, and the associated data that
12 // follows it.
13 // Sometimes, a logical member file is composed of multiple physical members.
14 struct phys_member_data {
15 #define TARFMT_UNKNOWN 0
16 #define TARFMT_POSIX 1
17 #define TARFMT_GNU 2
18 #define TARFMT_STAR 3
19 int fmt;
20 u8 linkflag;
21 i64 mode;
22 i64 file_data_pos;
23 i64 filesize;
24 i64 checksum;
25 i64 checksum_calc;
26 de_ucstring *name;
27 struct de_stringreaderdata *linkname;
28 de_ucstring *prefix;
29 struct de_timestamp timestamps[DE_TIMESTAMPIDX_COUNT];
32 // A struct to collect various extended attributes for a logical member
33 // (or for global attributes).
34 struct extattr_data {
35 de_ucstring *alt_name;
36 de_ucstring *linkname;
37 u8 main_file_is_special;
38 u8 has_alt_size;
39 i64 alt_size;
40 struct de_timestamp alt_timestamps[DE_TIMESTAMPIDX_COUNT];
43 struct member_data {
44 de_ucstring *filename;
45 de_finfo *fi;
46 int is_dir, is_regular_file, is_symlink;
// Module-wide state for one run of the tar module.
typedef struct localctx_struct {
	int input_encoding; // Encoding used for names read from header blocks
	int found_trailer;  // Set once a trailer (zero) record has been seen
	struct extattr_data *global_ea; // Receives attributes from 'g' headers
} lctx;
55 static const char* get_fmt_name(int fmt)
57 const char *n = "unknown or old-style";
58 switch(fmt) {
59 case TARFMT_POSIX: n = "POSIX"; break;
60 case TARFMT_GNU: n = "GNU"; break;
61 case TARFMT_STAR: n = "star"; break;
63 return n;
66 static int read_ascii_octal_number(dbuf *f, i64 pos, i64 fieldsize,
67 i64 *value)
69 u8 b1;
70 b1 = dbuf_getbyte(f, pos);
72 if(b1<0x80) {
73 // The usual ASCII-octal format
74 return dbuf_read_ascii_number(f, pos, fieldsize, 8, value);
77 // "base-256" or some other special format
78 if(b1==0x80) { // positive base-256 number
79 *value = dbuf_getint_ext(f, pos+1, (unsigned int)(fieldsize-1), 0, 0);
80 return 1;
82 else if(b1==0xff) { // negative base-256 number
83 *value = dbuf_getint_ext(f, pos+1, (unsigned int)(fieldsize-1), 0, 1);
84 return 1;
87 *value = 0;
88 return 0;
91 static void read_12char_timestamp(deark *c, struct phys_member_data *pmd, i64 pos,
92 int tsidx, const char *name)
94 int ret;
95 i64 timestamp_unix;
96 char timestamp_buf[64];
98 ret = read_ascii_octal_number(c->infile, pos, 12, &timestamp_unix);
99 if(ret) {
100 de_unix_time_to_timestamp(timestamp_unix, &pmd->timestamps[tsidx], 0x1);
101 de_dbg_timestamp_to_string(c, &pmd->timestamps[tsidx],
102 timestamp_buf, sizeof(timestamp_buf), 0);
103 de_dbg(c, "%s: %"I64_FMT" (%s)", name, timestamp_unix, timestamp_buf);
107 // Sets md->checksum_calc
108 static void calc_checksum(deark *c, lctx *d, struct phys_member_data *pmd,
109 const u8 *hdrblock)
111 size_t i;
113 pmd->checksum_calc = 0;
114 for(i=0; i<512; i++) {
115 if(i>=148 && i<156)
116 pmd->checksum_calc += 32; // (The checksum field itself)
117 else
118 pmd->checksum_calc += (i64)hdrblock[i];
122 // Returns 1 if it was parsed successfully, and is not a trailer.
123 static int read_phys_member_header(deark *c, lctx *d,
124 struct phys_member_data *pmd, i64 pos1)
126 i64 n;
127 int ret;
128 i64 pos = pos1;
129 de_ucstring *tmpstr = NULL;
130 int retval = 0;
131 int saved_indent_level;
132 u8 hdrblock[512];
134 de_dbg_indent_save(c, &saved_indent_level);
136 de_dbg(c, "physical archive member header at %"I64_FMT, pos1);
137 de_dbg_indent(c, 1);
139 // Look ahead to try to figure out some things about the format of this member.
141 de_read(hdrblock, pos1, 512);
142 calc_checksum(c, d, pmd, hdrblock);
144 if(pmd->checksum_calc==8*32 && de_is_all_zeroes(&hdrblock[148], 8)) {
145 // "The end of the archive is indicated by two records consisting
146 // entirely of zero bytes."
147 // Most tar programs seem to stop at the first "zero block", so that's
148 // what we'll do.
149 de_dbg(c, "[trailer record]");
150 d->found_trailer = 1;
151 goto done;
154 pmd->linkflag = hdrblock[156];
156 if(!de_memcmp(&hdrblock[257], (const void*)"ustar \0", 8)) {
157 pmd->fmt = TARFMT_GNU;
159 else if(!de_memcmp(&hdrblock[257], (const void*)"ustar\0", 6)) {
160 pmd->fmt = TARFMT_POSIX;
162 else if(!de_memcmp(&hdrblock[508], (const void*)"tar\0", 4)) {
163 pmd->fmt = TARFMT_STAR;
166 de_dbg(c, "tar format: %s", get_fmt_name(pmd->fmt));
168 pmd->name = ucstring_create(c);
169 dbuf_read_to_ucstring(c->infile, pos, 100, pmd->name, DE_CONVFLAG_STOP_AT_NUL,
170 d->input_encoding);
171 pos += 100;
172 de_dbg(c, "name: \"%s\"", ucstring_getpsz_d(pmd->name));
174 ret = read_ascii_octal_number(c->infile, pos, 8, &pmd->mode);
175 if(ret) {
176 de_dbg(c, "mode: octal(%06o)", (unsigned int)pmd->mode);
178 pos += 8;
180 ret = read_ascii_octal_number(c->infile, pos, 8, &n);
181 if(ret) {
182 de_dbg(c, "uid: %"I64_FMT, n);
184 pos += 8;
185 ret = read_ascii_octal_number(c->infile, pos, 8, &n);
186 if(ret) {
187 de_dbg(c, "gid: %"I64_FMT, n);
189 pos += 8;
191 ret = read_ascii_octal_number(c->infile, pos, 12, &pmd->filesize);
192 if(!ret) goto done;
193 pos += 12;
194 de_dbg(c, "size: %"I64_FMT, pmd->filesize);
195 pmd->file_data_pos = pos1 + 512;
197 read_12char_timestamp(c, pmd, pos, DE_TIMESTAMPIDX_MODIFY, "mtime");
198 pos += 12;
200 (void)read_ascii_octal_number(c->infile, pos, 8, &pmd->checksum);
201 de_dbg(c, "header checksum (reported): %"I64_FMT, pmd->checksum);
202 de_dbg(c, "header checksum (calculated): %"I64_FMT, pmd->checksum_calc);
203 if(pmd->checksum != pmd->checksum_calc) {
204 de_err(c, "%s: Header checksum failed: reported=%"I64_FMT", calculated=%"I64_FMT,
205 ucstring_getpsz_d(pmd->name), pmd->checksum, pmd->checksum_calc);
207 pos += 8;
209 // linkflag already set, above
210 de_dbg(c, "linkflag/typeflag: 0x%02x ('%c')", (unsigned int)pmd->linkflag,
211 de_byte_to_printable_char(pmd->linkflag));
212 pos += 1;
214 if(de_getbyte(pos)!=0) {
215 pmd->linkname = dbuf_read_string(c->infile, pos, 100, 100, DE_CONVFLAG_STOP_AT_NUL,
216 d->input_encoding);
217 de_dbg(c, "linkname: \"%s\"", ucstring_getpsz_d(pmd->linkname->str));
219 pos += 100;
221 tmpstr = ucstring_create(c);
223 if(c->debug_level>=2) {
224 ucstring_empty(tmpstr);
225 dbuf_read_to_ucstring(c->infile, pos, 8, tmpstr, 0,
226 DE_EXTENC_MAKE(DE_ENCODING_ASCII, DE_ENCSUBTYPE_PRINTABLE));
227 de_dbg2(c, "magic/version: \"%s\"", ucstring_getpsz_d(tmpstr));
229 pos += 6; // magic
230 pos += 2; // version
232 if(pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_GNU) {
233 ucstring_empty(tmpstr);
234 dbuf_read_to_ucstring(c->infile, pos, 32, tmpstr, DE_CONVFLAG_STOP_AT_NUL,
235 DE_ENCODING_ASCII);
236 de_dbg(c, "uname: \"%s\"", ucstring_getpsz(tmpstr));
238 pos += 32;
240 if(pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_GNU) {
241 ucstring_empty(tmpstr);
242 dbuf_read_to_ucstring(c->infile, pos, 32, tmpstr, DE_CONVFLAG_STOP_AT_NUL,
243 DE_ENCODING_ASCII);
244 de_dbg(c, "gname: \"%s\"", ucstring_getpsz(tmpstr));
246 pos += 32;
248 pos += 8; // devmajor
249 pos += 8; // devminor
251 // TODO?: There are various dialect-specific fields after this point, more of
252 // which might be worth supporting.
254 // TODO?: Some (rare?) GNU files have atime/ctime fields here, but is it
255 // worth trying to detect them?
256 // And "star" files can have atime/ctime fields at offset 476.
258 if((pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_STAR) && (de_getbyte(pos)!=0)) {
259 // This field might only be 131 bytes, instead of 155. Let's hope that
260 // it's NUL terminated in that case.
261 pmd->prefix = ucstring_create(c);
262 dbuf_read_to_ucstring(c->infile, pos, 155, pmd->prefix,
263 DE_CONVFLAG_STOP_AT_NUL, d->input_encoding);
264 de_dbg(c, "prefix: \"%s\"", ucstring_getpsz_d(pmd->prefix));
265 //pos += 155;
268 retval = 1;
270 done:
271 ucstring_destroy(tmpstr);
273 de_dbg_indent_restore(c, saved_indent_level);
274 return retval;
277 static void destroy_pmd(deark *c, struct phys_member_data *pmd)
279 if(!pmd) return;
280 ucstring_destroy(pmd->name);
281 de_destroy_stringreaderdata(c, pmd->linkname);
282 ucstring_destroy(pmd->prefix);
283 de_free(c, pmd);
286 static void destroy_extattr_data(deark *c, struct extattr_data *ea)
288 if(!ea) return;
289 ucstring_destroy(ea->alt_name);
290 ucstring_destroy(ea->linkname);
293 static void read_gnu_longpath(deark *c, lctx *d, struct phys_member_data *pmd,
294 struct extattr_data *ea)
296 i64 pos = pmd->file_data_pos;
297 i64 ext_name_len = pmd->filesize;
299 de_dbg(c, "LongPath data at %"I64_FMT, pos);
300 de_dbg_indent(c, 1);
301 if(ext_name_len<1) goto done;
303 if(pmd->linkflag=='K') {
304 if(!ea->linkname) {
305 ea->linkname = ucstring_create(c);
307 ucstring_empty(ea->linkname);
308 // TODO: It's a little inconsistent that we convert a GNU extended linkname
309 // to a ucstring, while we keep the original bytes of old-style linknames.
310 dbuf_read_to_ucstring_n(c->infile, pos, ext_name_len-1, 32767, ea->linkname, 0,
311 d->input_encoding);
312 de_dbg(c, "ext. linkname: \"%s\"", ucstring_getpsz_d(ea->linkname));
314 else { // 'L', presumably
315 if(!ea->alt_name) {
316 ea->alt_name = ucstring_create(c);
318 ucstring_empty(ea->alt_name);
319 dbuf_read_to_ucstring_n(c->infile, pos, ext_name_len-1, 32767, ea->alt_name, 0,
320 d->input_encoding);
321 de_dbg(c, "ext. filename: \"%s\"", ucstring_getpsz_d(ea->alt_name));
324 done:
325 de_dbg_indent(c, -1);
328 struct exthdr_item {
329 i64 base_pos;
330 i64 fieldlen;
331 i64 fieldlen_offs;
332 i64 fieldlen_len;
333 i64 name_offs;
334 i64 name_len;
335 i64 val_offs;
336 i64 val_len;
337 struct de_stringreaderdata *name;
338 struct de_stringreaderdata *value;
341 static void do_exthdr_timestamp(deark *c, lctx *d, struct phys_member_data *pmd,
342 struct exthdr_item *ehi, struct extattr_data *ea, int tsidx, const char *name)
344 double val_dbl;
345 double val_frac;
346 i64 val_int;
347 char timestamp_buf[64];
349 if(ehi->val_len<1) return;
351 // TODO: There is probably more roundoff error here than there needs to be.
352 val_dbl = de_strtod(ehi->value->sz, NULL);
353 if(val_dbl > 0.0) {
354 val_int = (i64)val_dbl;
355 val_frac = val_dbl - (double)val_int;
357 else {
358 val_int = (i64)val_dbl;
359 val_frac = 0.0;
362 de_unix_time_to_timestamp(val_int, &ea->alt_timestamps[tsidx], 0x1);
363 if(val_frac > 0.0) {
364 de_timestamp_set_subsec(&ea->alt_timestamps[tsidx], val_frac);
367 de_dbg_timestamp_to_string(c, &ea->alt_timestamps[tsidx],
368 timestamp_buf, sizeof(timestamp_buf), 0);
369 de_dbg(c, "%s: %s", name, timestamp_buf);
372 static int read_exthdr_item(deark *c, lctx *d, struct phys_member_data *pmd,
373 struct extattr_data *ea,
374 i64 pos1, i64 max_len, i64 *bytes_consumed)
376 struct exthdr_item *ehi = NULL;
377 int retval = 0;
378 int ret;
379 int saved_indent_level;
380 i64 offs;
381 i64 n;
382 enum {
383 STATE_LOOKING_FOR_LEN, STATE_READING_LEN,
384 STATE_LOOKING_FOR_NAME, STATE_READING_NAME, STATE_DONE
385 } state;
387 state = STATE_LOOKING_FOR_LEN;
388 de_dbg_indent_save(c, &saved_indent_level);
389 de_dbg(c, "extended header field at %"I64_FMT, pos1);
390 de_dbg_indent(c, 1);
392 ehi = de_malloc(c, sizeof(struct exthdr_item));
393 ehi->base_pos = pos1;
395 if(max_len<1) {
396 goto done;
399 // Parse one header item. We will read the initial "length" field
400 // immediately, because it is needed for proper parsing.
401 // For the name and value fields, we only record their location and size.
402 for(offs=0; ; offs++) {
403 u8 ch;
404 int is_whitespace;
406 if(state==STATE_DONE) break;
408 if(offs>=max_len) goto done;
410 // If we know the reported length of this item, enforce it.
411 if(state>STATE_READING_LEN && offs>=ehi->fieldlen) goto done;
413 ch = de_getbyte(pos1+offs);
414 is_whitespace = (ch==' ' || ch==0x09);
416 if(state==STATE_LOOKING_FOR_LEN) {
417 if(is_whitespace) continue;
418 ehi->fieldlen_offs = offs;
419 state = STATE_READING_LEN;
421 else if(state==STATE_READING_LEN) {
422 if(is_whitespace) {
423 ehi->fieldlen_len = offs - ehi->fieldlen_offs;
424 ret = dbuf_read_ascii_number(c->infile,
425 pos1+ehi->fieldlen_offs, ehi->fieldlen_len, 10, &ehi->fieldlen);
426 if(!ret) {
427 goto done;
429 de_dbg(c, "length: %d", (int)ehi->fieldlen);
430 if(ehi->fieldlen > max_len) {
431 goto done;
433 state = STATE_LOOKING_FOR_NAME;
436 else if(state==STATE_LOOKING_FOR_NAME) {
437 if(is_whitespace) continue;
438 ehi->name_offs = offs;
439 state = STATE_READING_NAME;
441 else if(state==STATE_READING_NAME) {
442 if(ch=='=') {
443 ehi->name_len = offs - ehi->name_offs;
444 ehi->val_offs = offs+1;
445 ehi->val_len = ehi->fieldlen - offs - 2;
446 if(ehi->val_len<0) goto done;
447 state = STATE_DONE;
452 // Sanity check: The item must end with a newline
453 if(de_getbyte(pos1+ehi->fieldlen-1) != 0x0a) {
454 goto done;
457 n = de_min_int(ehi->name_len, 256);
458 ehi->name = dbuf_read_string(c->infile, pos1+ehi->name_offs,
459 n, n, 0, DE_ENCODING_UTF8);
460 de_dbg(c, "keyword: \"%s\"", ucstring_getpsz_d(ehi->name->str));
462 n = de_min_int(ehi->val_len, 65536);
463 ehi->value = dbuf_read_string(c->infile, pos1+ehi->val_offs,
464 n, n, 0, DE_ENCODING_UTF8);de_dbg(c, "value: \"%s\"", ucstring_getpsz_d(ehi->value->str));
466 if(!de_strncmp(ehi->name->sz, "GNU.sparse.", 11)) {
467 ea->main_file_is_special = 1;
470 if(!de_strcmp(ehi->name->sz, "path") ||
471 !de_strcmp(ehi->name->sz, "GNU.sparse.name"))
473 if(!ea->alt_name) ea->alt_name = ucstring_create(c);
474 ucstring_empty(ea->alt_name);
475 ucstring_append_ucstring(ea->alt_name, ehi->value->str);
477 else if(!de_strcmp(ehi->name->sz, "linkpath")) {
478 if(!ea->linkname) ea->linkname = ucstring_create(c);
479 ucstring_empty(ea->linkname);
480 ucstring_append_ucstring(ea->linkname, ehi->value->str);
482 else if(!de_strcmp(ehi->name->sz, "mtime")) {
483 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_MODIFY, "mod time");
485 else if(!de_strcmp(ehi->name->sz, "atime")) {
486 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_ACCESS, "access time");
488 else if(!de_strcmp(ehi->name->sz, "ctime")) {
489 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_ATTRCHANGE, "attrib-change time");
491 else if(!de_strcmp(ehi->name->sz, "LIBARCHIVE.creationtime")) {
492 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_CREATE, "create time");
494 else if(!de_strcmp(ehi->name->sz, "size")) {
495 if(ehi->val_len==0) {
496 ea->has_alt_size = 0;
498 else {
499 ea->has_alt_size = 1;
500 ea->alt_size = de_strtoll(ehi->value->sz, NULL, 10);
503 // TODO: "hdrcharset"
505 *bytes_consumed = ehi->fieldlen;
506 retval = 1;
508 done:
509 if(!retval) {
510 de_warn(c, "Failed to parse extended header at %"I64_FMT, pos1);
512 if(ehi) {
513 de_destroy_stringreaderdata(c, ehi->name);
514 de_destroy_stringreaderdata(c, ehi->value);
515 de_free(c, ehi);
517 de_dbg_indent_restore(c, saved_indent_level);
518 return retval;
521 static void read_exthdr(deark *c, lctx *d, struct phys_member_data *pmd,
522 struct extattr_data *ea)
524 int saved_indent_level;
525 i64 pos = pmd->file_data_pos;
527 de_dbg_indent_save(c, &saved_indent_level);
528 de_dbg(c, "POSIX extended header data at %"I64_FMT, pmd->file_data_pos);
529 de_dbg_indent(c, 1);
531 if(c->debug_level>=2) {
532 de_ucstring *tmps;
533 tmps = ucstring_create(c);
534 dbuf_read_to_ucstring_n(c->infile, pmd->file_data_pos, pmd->filesize,
535 32768, tmps, 0, DE_ENCODING_UTF8);
536 de_dbg(c, "data: \"%s\"", ucstring_getpsz_d(tmps));
537 ucstring_destroy(tmps);
540 while(pos < pmd->file_data_pos + pmd->filesize) {
541 i64 bytes_consumed = 0;
543 if(!read_exthdr_item(c, d, pmd, ea, pos,
544 pmd->file_data_pos+pmd->filesize-pos, &bytes_consumed))
546 break;
548 if(bytes_consumed<1) break;
549 pos += bytes_consumed;
552 de_dbg_indent_restore(c, saved_indent_level);
555 static int read_member(deark *c, lctx *d, i64 pos1, i64 *bytes_consumed_member)
557 int saved_indent_level;
558 int retval = 0;
559 struct member_data *md = NULL;
560 struct phys_member_data *pmd = NULL;
561 struct extattr_data *ea = NULL;
562 dbuf *outf = NULL;
563 unsigned int snflags;
564 int tsidx;
565 i64 pos = pos1;
567 de_dbg_indent_save(c, &saved_indent_level);
569 de_dbg(c, "logical archive member at %"I64_FMT, pos1);
570 de_dbg_indent(c, 1);
572 md = de_malloc(c, sizeof(struct member_data));
573 md->fi = de_finfo_create(c);
574 md->fi->detect_root_dot_dir = 1;
575 md->filename = ucstring_create(c);
577 ea = de_malloc(c, sizeof(struct extattr_data));
579 while(1) {
580 int is_supplemental_item = 0;
582 if(pos >= c->infile->len) goto done;
583 if(pmd) {
584 destroy_pmd(c, pmd);
586 pmd = de_malloc(c, sizeof(struct phys_member_data));
588 if(!read_phys_member_header(c, d, pmd, pos)) {
589 goto done;
591 pos += 512;
593 if(pmd->linkflag=='L' || pmd->linkflag=='K') {
594 is_supplemental_item = 1;
595 read_gnu_longpath(c, d, pmd, ea);
597 else if(pmd->linkflag == 'x' || pmd->linkflag == 'X') {
598 is_supplemental_item = 1;
599 read_exthdr(c, d, pmd, ea);
601 else if(pmd->linkflag == 'g') {
602 read_exthdr(c, d, pmd, d->global_ea);
604 // TODO: linkflag 'K'
606 if(!is_supplemental_item) {
607 break;
610 // Prepare to read the next physical member
611 pos += de_pad_to_n(pmd->filesize, 512);
613 if(!pmd) goto done;
615 // At this point, pmd is the main physical member for this logical file.
616 // Any other 'pmd's have been discarded, other than extended attributes
617 // that were recorded in ea.
619 if(ea->has_alt_size) {
620 pmd->filesize = ea->alt_size;
622 pos += de_pad_to_n(pmd->filesize, 512);
624 retval = 1;
626 if((pmd->checksum != pmd->checksum_calc) && c->extract_level<2) {
627 // TODO: This little more than a hack, so that we don't extract so
628 // much garbage if the file is corrupt, or we go off the rails.
629 // There are more robust ways to deal with such issues.
630 de_dbg(c, "[not extracting, due to bad checksum]");
631 goto done;
634 // Decide on a filename
635 if(ucstring_isnonempty(ea->alt_name)) {
636 ucstring_append_ucstring(md->filename, ea->alt_name);
638 else {
639 if(ucstring_isnonempty(pmd->prefix)) {
640 ucstring_append_ucstring(md->filename, pmd->prefix);
641 ucstring_append_char(md->filename, '/');
643 if(ucstring_isnonempty(pmd->name)) {
644 ucstring_append_ucstring(md->filename, pmd->name);
648 // Try to figure out what kind of "file" this is.
650 if(pmd->linkflag=='2') {
651 md->is_symlink = 1;
653 else if(pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_STAR) {
654 if(pmd->linkflag=='0' || pmd->linkflag==0) {
655 md->is_regular_file = 1;
657 else if(pmd->linkflag=='5') {
658 md->is_dir = 1;
661 else if(pmd->fmt==TARFMT_GNU) {
662 if(pmd->linkflag=='0' || pmd->linkflag=='7' || pmd->linkflag==0) {
663 md->is_regular_file = 1;
665 else if(pmd->linkflag=='5') {
666 md->is_dir = 1;
669 else {
670 if(pmd->name->len>=1 && pmd->name->str[pmd->name->len-1]=='/') {
671 md->is_dir = 1;
673 else if(pmd->linkflag==0 || pmd->linkflag=='0') {
674 md->is_regular_file = 1;
678 if(ea->main_file_is_special) {
679 md->is_regular_file = 0;
682 de_dbg(c, "file data at %"I64_FMT", len=%"I64_FMT, pmd->file_data_pos,
683 pmd->filesize);
685 for(tsidx=0; tsidx<DE_TIMESTAMPIDX_COUNT; tsidx++) {
686 if(ea->alt_timestamps[tsidx].is_valid) {
687 md->fi->timestamp[tsidx] = ea->alt_timestamps[tsidx];
689 else if(pmd->timestamps[tsidx].is_valid) {
690 md->fi->timestamp[tsidx] = pmd->timestamps[tsidx];
694 if(!md->is_regular_file && !md->is_dir) {
695 de_warn(c, "\"%s\" is a %s. It will not be extracted as such.",
696 ucstring_getpsz(md->filename),
697 md->is_symlink?"symlink":"special file");
700 snflags = DE_SNFLAG_FULLPATH;
701 if(md->is_dir) {
702 md->fi->is_directory = 1;
703 snflags |= DE_SNFLAG_STRIPTRAILINGSLASH;
705 else if(md->is_regular_file) {
706 if((pmd->mode & 0111)!=0) {
707 md->fi->mode_flags |= DE_MODEFLAG_EXE;
709 else {
710 md->fi->mode_flags |= DE_MODEFLAG_NONEXE;
713 de_finfo_set_name_from_ucstring(c, md->fi, md->filename, snflags);
714 md->fi->original_filename_flag = 1;
716 if(pmd->file_data_pos + pmd->filesize > c->infile->len) goto done;
718 outf = dbuf_create_output_file(c, NULL, md->fi, 0);
720 // If a symlink has no data, write the 'linkname' field instead.
721 if(md->is_symlink && pmd->filesize==0) {
722 if(ucstring_isnonempty(ea->linkname)) {
723 ucstring_write_as_utf8(c, ea->linkname, outf, 0);
724 goto done;
726 else if(pmd->linkname) {
727 dbuf_write(outf, (const u8*)pmd->linkname->sz,
728 (i64)pmd->linkname->sz_strlen);
729 goto done;
733 dbuf_copy(c->infile, pmd->file_data_pos, pmd->filesize, outf);
735 done:
736 dbuf_close(outf);
737 *bytes_consumed_member = pos - pos1;
738 destroy_pmd(c, pmd);
739 destroy_extattr_data(c, ea);
740 if(md) {
741 ucstring_destroy(md->filename);
742 de_finfo_destroy(c, md->fi);
743 de_free(c, md);
745 de_dbg_indent_restore(c, saved_indent_level);
746 return retval;
749 static void de_run_tar(deark *c, de_module_params *mparams)
751 lctx *d = NULL;
752 i64 pos;
753 i64 item_len;
754 int ret;
756 d = de_malloc(c, sizeof(lctx));
758 d->global_ea = de_malloc(c, sizeof(struct extattr_data));
760 d->input_encoding = de_get_input_encoding(c, NULL, DE_ENCODING_UTF8);
762 pos = 0;
763 while(1) {
764 if(d->found_trailer) break;
765 if(pos >= c->infile->len) break;
766 if(pos+512 > c->infile->len) {
767 de_warn(c, "Ignoring %d extra bytes at end of file", (int)(c->infile->len - pos));
768 break;
771 ret = read_member(c, d, pos, &item_len);
772 if(!ret || item_len<1) break;
773 pos += item_len;
776 if(d) {
777 destroy_extattr_data(c, d->global_ea);
778 de_free(c, d);
782 static int de_identify_tar(deark *c)
784 int has_ext;
785 u8 buf[8];
786 i64 k;
787 i64 digit_count;
789 has_ext = de_input_file_has_ext(c, "tar");;
790 if(!dbuf_memcmp(c->infile, 257, "ustar", 5)) {
791 return has_ext ? 100 : 90;
794 if(has_ext) {
795 if(!dbuf_memcmp(c->infile, 508, "tar\0", 4)) {
796 return 90;
800 // Try to detect tar formats that don't have the "ustar" identifier.
801 if(!has_ext) return 0;
803 // The 'checksum' field has a fairly distinctive format.
804 // "This field should be stored as six octal digits followed by a null and
805 // a space character."
807 de_read(buf, 148, 8);
808 digit_count = 0;
809 for(k=0; k<6; k++) {
810 if(buf[k]>='0' && buf[k]<='7') {
811 digit_count++;
813 else if(buf[k]!=' ') {
814 return 0;
817 if(digit_count<1) return 0;
818 if(buf[6]==0x00 && buf[7]==' ') return 60;
819 if(buf[6]==' ' && buf[7]==0x00) return 15;
820 return 0;
823 void de_module_tar(deark *c, struct deark_module_info *mi)
825 mi->id = "tar";
826 mi->desc = "tar archive";
827 mi->run_fn = de_run_tar;
828 mi->identify_fn = de_identify_tar;