fnt: Improved error handling, etc.
[deark.git] / modules / tar.c
blobc3b975c6f3612b2c8c555a07068393aa233113f0
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // Tar archive format
7 #include <deark-config.h>
8 #include <deark-private.h>
9 DE_DECLARE_MODULE(de_module_tar);
11 // Represents a single physical header block, and the associated data that
12 // follows it.
13 // Sometimes, a logical member file is composed of multiple physical members.
14 struct phys_member_data {
15 #define TARFMT_UNKNOWN 0
16 #define TARFMT_POSIX 1
17 #define TARFMT_GNU 2
18 #define TARFMT_STAR 3
19 int fmt;
20 u8 linkflag;
21 i64 mode;
22 i64 file_data_pos;
23 i64 filesize;
24 i64 checksum;
25 i64 checksum_calc;
26 de_ucstring *name;
27 struct de_stringreaderdata *linkname;
28 de_ucstring *prefix;
29 struct de_timestamp timestamps[DE_TIMESTAMPIDX_COUNT];
32 // A struct to collect various extended attributes for a logical member
33 // (or for global attributes).
34 struct extattr_data {
35 de_ucstring *alt_name;
36 de_ucstring *linkname;
37 u8 main_file_is_special;
38 u8 has_alt_size;
39 i64 alt_size;
40 struct de_timestamp alt_timestamps[DE_TIMESTAMPIDX_COUNT];
43 struct member_data {
44 de_ucstring *filename;
45 de_finfo *fi;
46 int is_dir, is_regular_file, is_symlink;
// Module-wide state for one run of the tar module.
typedef struct localctx_struct {
	int input_encoding; // Encoding used for names read from header blocks
	int found_trailer; // Set when an all-zero header (trailer) block is seen
	struct extattr_data *global_ea; // Receives attributes from 'g' headers
	struct de_crcobj *crco_cksum; // Created on first use by calc_checksum()
} lctx;
56 static const char* get_fmt_name(int fmt)
58 const char *n = "unknown or old-style";
59 switch(fmt) {
60 case TARFMT_POSIX: n = "POSIX"; break;
61 case TARFMT_GNU: n = "GNU"; break;
62 case TARFMT_STAR: n = "star"; break;
64 return n;
67 static int read_ascii_octal_number(dbuf *f, i64 pos, i64 fieldsize,
68 i64 *value)
70 u8 b1;
71 b1 = dbuf_getbyte(f, pos);
73 if(b1<0x80) {
74 // The usual ASCII-octal format
75 return dbuf_read_ascii_number(f, pos, fieldsize, 8, value);
78 // "base-256" or some other special format
79 if(b1==0x80) { // positive base-256 number
80 *value = dbuf_getint_ext(f, pos+1, (unsigned int)(fieldsize-1), 0, 0);
81 return 1;
83 else if(b1==0xff) { // negative base-256 number
84 *value = dbuf_getint_ext(f, pos+1, (unsigned int)(fieldsize-1), 0, 1);
85 return 1;
88 *value = 0;
89 return 0;
92 static void read_12char_timestamp(deark *c, struct phys_member_data *pmd, i64 pos,
93 int tsidx, const char *name)
95 int ret;
96 i64 timestamp_unix;
97 char timestamp_buf[64];
99 ret = read_ascii_octal_number(c->infile, pos, 12, &timestamp_unix);
100 if(ret) {
101 de_unix_time_to_timestamp(timestamp_unix, &pmd->timestamps[tsidx], 0x1);
102 de_dbg_timestamp_to_string(c, &pmd->timestamps[tsidx],
103 timestamp_buf, sizeof(timestamp_buf), 0);
104 de_dbg(c, "%s: %"I64_FMT" (%s)", name, timestamp_unix, timestamp_buf);
108 // Sets md->checksum_calc
109 static void calc_checksum(deark *c, lctx *d, struct phys_member_data *pmd,
110 const u8 *hdrblock)
112 if(!d->crco_cksum) {
113 d->crco_cksum = de_crcobj_create(c, DE_CRCOBJ_SUM_BYTES);
115 else {
116 de_crcobj_reset(d->crco_cksum);
119 // A loose upper bound for the max possible checksum value is 512*255
120 // = 130560.
121 de_crcobj_addbuf(d->crco_cksum, &hdrblock[0], 148);
122 de_crcobj_addrun(d->crco_cksum, 32, 8); // (The checksum field itself)
123 de_crcobj_addbuf(d->crco_cksum, &hdrblock[156], 512-156);
124 pmd->checksum_calc = (i64)de_crcobj_getval(d->crco_cksum);
127 // Returns 1 if it was parsed successfully, and is not a trailer.
128 static int read_phys_member_header(deark *c, lctx *d,
129 struct phys_member_data *pmd, i64 pos1)
131 i64 n;
132 int ret;
133 i64 pos = pos1;
134 de_ucstring *tmpstr = NULL;
135 int retval = 0;
136 int saved_indent_level;
137 u8 hdrblock[512];
139 de_dbg_indent_save(c, &saved_indent_level);
141 de_dbg(c, "physical archive member header at %"I64_FMT, pos1);
142 de_dbg_indent(c, 1);
144 // Look ahead to try to figure out some things about the format of this member.
146 de_read(hdrblock, pos1, 512);
147 calc_checksum(c, d, pmd, hdrblock);
149 if(pmd->checksum_calc==8*32 && de_is_all_zeroes(&hdrblock[148], 8)) {
150 // "The end of the archive is indicated by two records consisting
151 // entirely of zero bytes."
152 // Most tar programs seem to stop at the first "zero block", so that's
153 // what we'll do.
154 de_dbg(c, "[trailer record]");
155 d->found_trailer = 1;
156 goto done;
159 pmd->linkflag = hdrblock[156];
161 if(!de_memcmp(&hdrblock[257], (const void*)"ustar \0", 8)) {
162 pmd->fmt = TARFMT_GNU;
164 else if(!de_memcmp(&hdrblock[257], (const void*)"ustar\0", 6)) {
165 pmd->fmt = TARFMT_POSIX;
167 else if(!de_memcmp(&hdrblock[508], (const void*)"tar\0", 4)) {
168 pmd->fmt = TARFMT_STAR;
171 de_dbg(c, "tar format: %s", get_fmt_name(pmd->fmt));
173 pmd->name = ucstring_create(c);
174 dbuf_read_to_ucstring(c->infile, pos, 100, pmd->name, DE_CONVFLAG_STOP_AT_NUL,
175 d->input_encoding);
176 pos += 100;
177 de_dbg(c, "name: \"%s\"", ucstring_getpsz_d(pmd->name));
179 ret = read_ascii_octal_number(c->infile, pos, 8, &pmd->mode);
180 if(ret) {
181 de_dbg(c, "mode: octal(%06o)", (unsigned int)pmd->mode);
183 pos += 8;
185 ret = read_ascii_octal_number(c->infile, pos, 8, &n);
186 if(ret) {
187 de_dbg(c, "uid: %"I64_FMT, n);
189 pos += 8;
190 ret = read_ascii_octal_number(c->infile, pos, 8, &n);
191 if(ret) {
192 de_dbg(c, "gid: %"I64_FMT, n);
194 pos += 8;
196 ret = read_ascii_octal_number(c->infile, pos, 12, &pmd->filesize);
197 de_sanitize_length(&pmd->filesize);
198 if(!ret) goto done;
199 pos += 12;
200 de_dbg(c, "size: %"I64_FMT, pmd->filesize);
201 pmd->file_data_pos = pos1 + 512;
203 read_12char_timestamp(c, pmd, pos, DE_TIMESTAMPIDX_MODIFY, "mtime");
204 pos += 12;
206 (void)read_ascii_octal_number(c->infile, pos, 8, &pmd->checksum);
207 de_dbg(c, "header checksum (reported): %"I64_FMT, pmd->checksum);
208 de_dbg(c, "header checksum (calculated): %"I64_FMT, pmd->checksum_calc);
209 if(pmd->checksum != pmd->checksum_calc) {
210 de_err(c, "%s: Header checksum failed: reported=%"I64_FMT", calculated=%"I64_FMT,
211 ucstring_getpsz_d(pmd->name), pmd->checksum, pmd->checksum_calc);
213 pos += 8;
215 // linkflag already set, above
216 de_dbg(c, "linkflag/typeflag: 0x%02x ('%c')", (unsigned int)pmd->linkflag,
217 de_byte_to_printable_char(pmd->linkflag));
218 pos += 1;
220 if(de_getbyte(pos)!=0) {
221 pmd->linkname = dbuf_read_string(c->infile, pos, 100, 100, DE_CONVFLAG_STOP_AT_NUL,
222 d->input_encoding);
223 de_dbg(c, "linkname: \"%s\"", ucstring_getpsz_d(pmd->linkname->str));
225 pos += 100;
227 tmpstr = ucstring_create(c);
229 if(c->debug_level>=2) {
230 ucstring_empty(tmpstr);
231 dbuf_read_to_ucstring(c->infile, pos, 8, tmpstr, 0,
232 DE_EXTENC_MAKE(DE_ENCODING_ASCII, DE_ENCSUBTYPE_PRINTABLE));
233 de_dbg2(c, "magic/version: \"%s\"", ucstring_getpsz_d(tmpstr));
235 pos += 6; // magic
236 pos += 2; // version
238 if(pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_GNU) {
239 ucstring_empty(tmpstr);
240 dbuf_read_to_ucstring(c->infile, pos, 32, tmpstr, DE_CONVFLAG_STOP_AT_NUL,
241 DE_ENCODING_ASCII);
242 de_dbg(c, "uname: \"%s\"", ucstring_getpsz(tmpstr));
244 pos += 32;
246 if(pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_GNU) {
247 ucstring_empty(tmpstr);
248 dbuf_read_to_ucstring(c->infile, pos, 32, tmpstr, DE_CONVFLAG_STOP_AT_NUL,
249 DE_ENCODING_ASCII);
250 de_dbg(c, "gname: \"%s\"", ucstring_getpsz(tmpstr));
252 pos += 32;
254 pos += 8; // devmajor
255 pos += 8; // devminor
257 // TODO?: There are various dialect-specific fields after this point, more of
258 // which might be worth supporting.
260 // TODO?: Some (rare?) GNU files have atime/ctime fields here, but is it
261 // worth trying to detect them?
262 // And "star" files can have atime/ctime fields at offset 476.
264 if((pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_STAR) && (de_getbyte(pos)!=0)) {
265 // This field might only be 131 bytes, instead of 155. Let's hope that
266 // it's NUL terminated in that case.
267 pmd->prefix = ucstring_create(c);
268 dbuf_read_to_ucstring(c->infile, pos, 155, pmd->prefix,
269 DE_CONVFLAG_STOP_AT_NUL, d->input_encoding);
270 de_dbg(c, "prefix: \"%s\"", ucstring_getpsz_d(pmd->prefix));
271 //pos += 155;
274 retval = 1;
276 done:
277 ucstring_destroy(tmpstr);
279 de_dbg_indent_restore(c, saved_indent_level);
280 return retval;
283 static void destroy_pmd(deark *c, struct phys_member_data *pmd)
285 if(!pmd) return;
286 ucstring_destroy(pmd->name);
287 de_destroy_stringreaderdata(c, pmd->linkname);
288 ucstring_destroy(pmd->prefix);
289 de_free(c, pmd);
292 static void destroy_extattr_data(deark *c, struct extattr_data *ea)
294 if(!ea) return;
295 ucstring_destroy(ea->alt_name);
296 ucstring_destroy(ea->linkname);
299 static void read_gnu_longpath(deark *c, lctx *d, struct phys_member_data *pmd,
300 struct extattr_data *ea)
302 i64 pos = pmd->file_data_pos;
303 i64 ext_name_len = pmd->filesize;
305 de_dbg(c, "LongPath data at %"I64_FMT, pos);
306 de_dbg_indent(c, 1);
307 if(ext_name_len<1) goto done;
309 if(pmd->linkflag=='K') {
310 if(!ea->linkname) {
311 ea->linkname = ucstring_create(c);
313 ucstring_empty(ea->linkname);
314 // TODO: It's a little inconsistent that we convert a GNU extended linkname
315 // to a ucstring, while we keep the original bytes of old-style linknames.
316 dbuf_read_to_ucstring_n(c->infile, pos, ext_name_len-1, 32767, ea->linkname, 0,
317 d->input_encoding);
318 de_dbg(c, "ext. linkname: \"%s\"", ucstring_getpsz_d(ea->linkname));
320 else { // 'L', presumably
321 if(!ea->alt_name) {
322 ea->alt_name = ucstring_create(c);
324 ucstring_empty(ea->alt_name);
325 dbuf_read_to_ucstring_n(c->infile, pos, ext_name_len-1, 32767, ea->alt_name, 0,
326 d->input_encoding);
327 de_dbg(c, "ext. filename: \"%s\"", ucstring_getpsz_d(ea->alt_name));
330 done:
331 de_dbg_indent(c, -1);
334 struct exthdr_item {
335 i64 base_pos;
336 i64 fieldlen;
337 i64 fieldlen_offs;
338 i64 fieldlen_len;
339 i64 name_offs;
340 i64 name_len;
341 i64 val_offs;
342 i64 val_len;
343 struct de_stringreaderdata *name;
344 struct de_stringreaderdata *value;
347 static void do_exthdr_timestamp(deark *c, lctx *d, struct phys_member_data *pmd,
348 struct exthdr_item *ehi, struct extattr_data *ea, int tsidx, const char *name)
350 double val_dbl;
351 double val_frac;
352 i64 val_int;
353 char timestamp_buf[64];
355 if(ehi->val_len<1) return;
357 // TODO: There is probably more roundoff error here than there needs to be.
358 val_dbl = de_strtod(ehi->value->sz, NULL);
359 if(val_dbl > 0.0) {
360 val_int = (i64)val_dbl;
361 val_frac = val_dbl - (double)val_int;
363 else {
364 val_int = (i64)val_dbl;
365 val_frac = 0.0;
368 de_unix_time_to_timestamp(val_int, &ea->alt_timestamps[tsidx], 0x1);
369 if(val_frac > 0.0) {
370 de_timestamp_set_subsec(&ea->alt_timestamps[tsidx], val_frac);
373 de_dbg_timestamp_to_string(c, &ea->alt_timestamps[tsidx],
374 timestamp_buf, sizeof(timestamp_buf), 0);
375 de_dbg(c, "%s: %s", name, timestamp_buf);
378 static int read_exthdr_item(deark *c, lctx *d, struct phys_member_data *pmd,
379 struct extattr_data *ea,
380 i64 pos1, i64 max_len, i64 *bytes_consumed)
382 struct exthdr_item *ehi = NULL;
383 int retval = 0;
384 int ret;
385 int saved_indent_level;
386 i64 offs;
387 i64 n;
388 enum {
389 STATE_LOOKING_FOR_LEN, STATE_READING_LEN,
390 STATE_LOOKING_FOR_NAME, STATE_READING_NAME, STATE_DONE
391 } state;
393 state = STATE_LOOKING_FOR_LEN;
394 de_dbg_indent_save(c, &saved_indent_level);
395 de_dbg(c, "extended header field at %"I64_FMT, pos1);
396 de_dbg_indent(c, 1);
398 ehi = de_malloc(c, sizeof(struct exthdr_item));
399 ehi->base_pos = pos1;
401 if(max_len<1) {
402 goto done;
405 // Parse one header item. We will read the initial "length" field
406 // immediately, because it is needed for proper parsing.
407 // For the name and value fields, we only record their location and size.
408 for(offs=0; ; offs++) {
409 u8 ch;
410 int is_whitespace;
412 if(state==STATE_DONE) break;
414 if(offs>=max_len) goto done;
416 // If we know the reported length of this item, enforce it.
417 if(state>STATE_READING_LEN && offs>=ehi->fieldlen) goto done;
419 ch = de_getbyte(pos1+offs);
420 is_whitespace = (ch==' ' || ch==0x09);
422 if(state==STATE_LOOKING_FOR_LEN) {
423 if(is_whitespace) continue;
424 ehi->fieldlen_offs = offs;
425 state = STATE_READING_LEN;
427 else if(state==STATE_READING_LEN) {
428 if(is_whitespace) {
429 ehi->fieldlen_len = offs - ehi->fieldlen_offs;
430 ret = dbuf_read_ascii_number(c->infile,
431 pos1+ehi->fieldlen_offs, ehi->fieldlen_len, 10, &ehi->fieldlen);
432 if(!ret) {
433 goto done;
435 de_dbg(c, "length: %d", (int)ehi->fieldlen);
436 if(ehi->fieldlen > max_len) {
437 goto done;
439 state = STATE_LOOKING_FOR_NAME;
442 else if(state==STATE_LOOKING_FOR_NAME) {
443 if(is_whitespace) continue;
444 ehi->name_offs = offs;
445 state = STATE_READING_NAME;
447 else if(state==STATE_READING_NAME) {
448 if(ch=='=') {
449 ehi->name_len = offs - ehi->name_offs;
450 ehi->val_offs = offs+1;
451 ehi->val_len = ehi->fieldlen - offs - 2;
452 if(ehi->val_len<0) goto done;
453 state = STATE_DONE;
458 // Sanity check: The item must end with a newline
459 if(de_getbyte(pos1+ehi->fieldlen-1) != 0x0a) {
460 goto done;
463 n = de_min_int(ehi->name_len, 256);
464 ehi->name = dbuf_read_string(c->infile, pos1+ehi->name_offs,
465 n, n, 0, DE_ENCODING_UTF8);
466 de_dbg(c, "keyword: \"%s\"", ucstring_getpsz_d(ehi->name->str));
468 n = de_min_int(ehi->val_len, 65536);
469 ehi->value = dbuf_read_string(c->infile, pos1+ehi->val_offs,
470 n, n, 0, DE_ENCODING_UTF8);de_dbg(c, "value: \"%s\"", ucstring_getpsz_d(ehi->value->str));
472 if(!de_strncmp(ehi->name->sz, "GNU.sparse.", 11)) {
473 ea->main_file_is_special = 1;
476 if(!de_strcmp(ehi->name->sz, "path") ||
477 !de_strcmp(ehi->name->sz, "GNU.sparse.name"))
479 if(!ea->alt_name) ea->alt_name = ucstring_create(c);
480 ucstring_empty(ea->alt_name);
481 ucstring_append_ucstring(ea->alt_name, ehi->value->str);
483 else if(!de_strcmp(ehi->name->sz, "linkpath")) {
484 if(!ea->linkname) ea->linkname = ucstring_create(c);
485 ucstring_empty(ea->linkname);
486 ucstring_append_ucstring(ea->linkname, ehi->value->str);
488 else if(!de_strcmp(ehi->name->sz, "mtime")) {
489 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_MODIFY, "mod time");
491 else if(!de_strcmp(ehi->name->sz, "atime")) {
492 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_ACCESS, "access time");
494 else if(!de_strcmp(ehi->name->sz, "ctime")) {
495 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_ATTRCHANGE, "attrib-change time");
497 else if(!de_strcmp(ehi->name->sz, "LIBARCHIVE.creationtime")) {
498 do_exthdr_timestamp(c, d, pmd, ehi, ea, DE_TIMESTAMPIDX_CREATE, "create time");
500 else if(!de_strcmp(ehi->name->sz, "size")) {
501 if(ehi->val_len==0) {
502 ea->has_alt_size = 0;
504 else {
505 ea->has_alt_size = 1;
506 ea->alt_size = de_strtoll(ehi->value->sz, NULL, 10);
509 // TODO: "hdrcharset"
511 *bytes_consumed = ehi->fieldlen;
512 retval = 1;
514 done:
515 if(!retval) {
516 de_warn(c, "Failed to parse extended header at %"I64_FMT, pos1);
518 if(ehi) {
519 de_destroy_stringreaderdata(c, ehi->name);
520 de_destroy_stringreaderdata(c, ehi->value);
521 de_free(c, ehi);
523 de_dbg_indent_restore(c, saved_indent_level);
524 return retval;
527 static void read_exthdr(deark *c, lctx *d, struct phys_member_data *pmd,
528 struct extattr_data *ea)
530 int saved_indent_level;
531 i64 pos = pmd->file_data_pos;
533 de_dbg_indent_save(c, &saved_indent_level);
534 de_dbg(c, "POSIX extended header data at %"I64_FMT, pmd->file_data_pos);
535 de_dbg_indent(c, 1);
537 if(c->debug_level>=2) {
538 de_ucstring *tmps;
539 tmps = ucstring_create(c);
540 dbuf_read_to_ucstring_n(c->infile, pmd->file_data_pos, pmd->filesize,
541 32768, tmps, 0, DE_ENCODING_UTF8);
542 de_dbg(c, "data: \"%s\"", ucstring_getpsz_d(tmps));
543 ucstring_destroy(tmps);
546 while(pos < pmd->file_data_pos + pmd->filesize) {
547 i64 bytes_consumed = 0;
549 if(!read_exthdr_item(c, d, pmd, ea, pos,
550 pmd->file_data_pos+pmd->filesize-pos, &bytes_consumed))
552 break;
554 if(bytes_consumed<1) break;
555 pos += bytes_consumed;
558 de_dbg_indent_restore(c, saved_indent_level);
561 static int read_member(deark *c, lctx *d, i64 pos1, i64 *bytes_consumed_member)
563 int saved_indent_level;
564 int retval = 0;
565 struct member_data *md = NULL;
566 struct phys_member_data *pmd = NULL;
567 struct extattr_data *ea = NULL;
568 dbuf *outf = NULL;
569 unsigned int snflags;
570 int tsidx;
571 i64 pos = pos1;
573 de_dbg_indent_save(c, &saved_indent_level);
575 de_dbg(c, "logical archive member at %"I64_FMT, pos1);
576 de_dbg_indent(c, 1);
578 md = de_malloc(c, sizeof(struct member_data));
579 md->fi = de_finfo_create(c);
580 md->fi->detect_root_dot_dir = 1;
581 md->filename = ucstring_create(c);
583 ea = de_malloc(c, sizeof(struct extattr_data));
585 while(1) {
586 int is_supplemental_item = 0;
588 if(pos >= c->infile->len) goto done;
589 if(pmd) {
590 destroy_pmd(c, pmd);
592 pmd = de_malloc(c, sizeof(struct phys_member_data));
594 if(!read_phys_member_header(c, d, pmd, pos)) {
595 goto done;
597 pos += 512;
599 if(pmd->linkflag=='L' || pmd->linkflag=='K') {
600 is_supplemental_item = 1;
601 read_gnu_longpath(c, d, pmd, ea);
603 else if(pmd->linkflag == 'x' || pmd->linkflag == 'X') {
604 is_supplemental_item = 1;
605 read_exthdr(c, d, pmd, ea);
607 else if(pmd->linkflag == 'g') {
608 read_exthdr(c, d, pmd, d->global_ea);
610 // TODO: linkflag 'K'
612 if(!is_supplemental_item) {
613 break;
616 // Prepare to read the next physical member
617 pos += de_pad_to_n(pmd->filesize, 512);
619 if(!pmd) goto done;
621 // At this point, pmd is the main physical member for this logical file.
622 // Any other 'pmd's have been discarded, other than extended attributes
623 // that were recorded in ea.
625 if(ea->has_alt_size) {
626 pmd->filesize = ea->alt_size;
628 pos += de_pad_to_n(pmd->filesize, 512);
630 retval = 1;
632 if((pmd->checksum != pmd->checksum_calc) && c->extract_level<2) {
633 // TODO: This little more than a hack, so that we don't extract so
634 // much garbage if the file is corrupt, or we go off the rails.
635 // There are more robust ways to deal with such issues.
636 de_dbg(c, "[not extracting, due to bad checksum]");
637 goto done;
640 // Decide on a filename
641 if(ucstring_isnonempty(ea->alt_name)) {
642 ucstring_append_ucstring(md->filename, ea->alt_name);
644 else {
645 if(ucstring_isnonempty(pmd->prefix)) {
646 ucstring_append_ucstring(md->filename, pmd->prefix);
647 ucstring_append_char(md->filename, '/');
649 if(ucstring_isnonempty(pmd->name)) {
650 ucstring_append_ucstring(md->filename, pmd->name);
654 // Try to figure out what kind of "file" this is.
656 if(pmd->linkflag=='2') {
657 md->is_symlink = 1;
659 else if(pmd->fmt==TARFMT_POSIX || pmd->fmt==TARFMT_STAR) {
660 if(pmd->linkflag=='0' || pmd->linkflag==0) {
661 md->is_regular_file = 1;
663 else if(pmd->linkflag=='5') {
664 md->is_dir = 1;
667 else if(pmd->fmt==TARFMT_GNU) {
668 if(pmd->linkflag=='0' || pmd->linkflag=='7' || pmd->linkflag==0) {
669 md->is_regular_file = 1;
671 else if(pmd->linkflag=='5') {
672 md->is_dir = 1;
675 else {
676 if(pmd->name->len>=1 && pmd->name->str[pmd->name->len-1]=='/') {
677 md->is_dir = 1;
679 else if(pmd->linkflag==0 || pmd->linkflag=='0') {
680 md->is_regular_file = 1;
684 if(ea->main_file_is_special) {
685 md->is_regular_file = 0;
688 de_dbg(c, "file data at %"I64_FMT", len=%"I64_FMT, pmd->file_data_pos,
689 pmd->filesize);
691 for(tsidx=0; tsidx<DE_TIMESTAMPIDX_COUNT; tsidx++) {
692 if(ea->alt_timestamps[tsidx].is_valid) {
693 md->fi->timestamp[tsidx] = ea->alt_timestamps[tsidx];
695 else if(pmd->timestamps[tsidx].is_valid) {
696 md->fi->timestamp[tsidx] = pmd->timestamps[tsidx];
700 if(!md->is_regular_file && !md->is_dir) {
701 de_warn(c, "\"%s\" is a %s. It will not be extracted as such.",
702 ucstring_getpsz(md->filename),
703 md->is_symlink?"symlink":"special file");
706 snflags = DE_SNFLAG_FULLPATH;
707 if(md->is_dir) {
708 md->fi->is_directory = 1;
709 snflags |= DE_SNFLAG_STRIPTRAILINGSLASH;
711 else if(md->is_regular_file) {
712 if((pmd->mode & 0111)!=0) {
713 md->fi->mode_flags |= DE_MODEFLAG_EXE;
715 else {
716 md->fi->mode_flags |= DE_MODEFLAG_NONEXE;
719 de_finfo_set_name_from_ucstring(c, md->fi, md->filename, snflags);
720 md->fi->original_filename_flag = 1;
722 if(pmd->file_data_pos + pmd->filesize > c->infile->len) goto done;
724 outf = dbuf_create_output_file(c, NULL, md->fi, 0);
726 // If a symlink has no data, write the 'linkname' field instead.
727 if(md->is_symlink && pmd->filesize==0) {
728 if(ucstring_isnonempty(ea->linkname)) {
729 ucstring_write_as_utf8(c, ea->linkname, outf, 0);
730 goto done;
732 else if(pmd->linkname) {
733 dbuf_write(outf, (const u8*)pmd->linkname->sz,
734 (i64)pmd->linkname->sz_strlen);
735 goto done;
739 dbuf_copy(c->infile, pmd->file_data_pos, pmd->filesize, outf);
741 done:
742 dbuf_close(outf);
743 *bytes_consumed_member = pos - pos1;
744 destroy_pmd(c, pmd);
745 destroy_extattr_data(c, ea);
746 if(md) {
747 ucstring_destroy(md->filename);
748 de_finfo_destroy(c, md->fi);
749 de_free(c, md);
751 de_dbg_indent_restore(c, saved_indent_level);
752 return retval;
755 static void de_run_tar(deark *c, de_module_params *mparams)
757 lctx *d = NULL;
758 i64 pos;
759 i64 item_len;
760 int ret;
762 d = de_malloc(c, sizeof(lctx));
764 d->global_ea = de_malloc(c, sizeof(struct extattr_data));
766 d->input_encoding = de_get_input_encoding(c, NULL, DE_ENCODING_UTF8);
768 pos = 0;
769 while(1) {
770 if(d->found_trailer) break;
771 if(pos >= c->infile->len) break;
772 if(pos+512 > c->infile->len) {
773 de_warn(c, "Ignoring %d extra bytes at end of file", (int)(c->infile->len - pos));
774 break;
777 ret = read_member(c, d, pos, &item_len);
778 if(!ret || item_len<1) break;
779 pos += item_len;
782 if(d) {
783 de_crcobj_destroy(d->crco_cksum);
784 destroy_extattr_data(c, d->global_ea);
785 de_free(c, d);
789 static int de_identify_tar(deark *c)
791 int has_ext;
792 u8 buf[8];
793 i64 k;
794 i64 digit_count;
796 has_ext = de_input_file_has_ext(c, "tar");;
797 if(!dbuf_memcmp(c->infile, 257, "ustar", 5)) {
798 return has_ext ? 100 : 90;
801 if(has_ext) {
802 if(!dbuf_memcmp(c->infile, 508, "tar\0", 4)) {
803 return 90;
807 // Try to detect tar formats that don't have the "ustar" identifier.
808 if(!has_ext) return 0;
810 // The 'checksum' field has a fairly distinctive format.
811 // "This field should be stored as six octal digits followed by a null and
812 // a space character."
814 de_read(buf, 148, 8);
815 digit_count = 0;
816 for(k=0; k<6; k++) {
817 if(buf[k]>='0' && buf[k]<='7') {
818 digit_count++;
820 else if(buf[k]!=' ') {
821 return 0;
824 if(digit_count<1) return 0;
825 if(buf[6]==0x00 && buf[7]==' ') return 60;
826 if(buf[6]==' ' && buf[7]==0x00) return 15;
827 return 0;
830 void de_module_tar(deark *c, struct deark_module_info *mi)
832 mi->id = "tar";
833 mi->desc = "tar archive";
834 mi->run_fn = de_run_tar;
835 mi->identify_fn = de_identify_tar;