1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
8 #include <deark-config.h>
9 #include <deark-private.h>
// Declarations (via the project's DE_DECLARE_MODULE macro) of the five module
// registration functions that are defined further down in this file:
// base16, base64, uuencode, xxencode, and ascii85.
10 DE_DECLARE_MODULE(de_module_base16
);
11 DE_DECLARE_MODULE(de_module_base64
);
12 DE_DECLARE_MODULE(de_module_uuencode
);
13 DE_DECLARE_MODULE(de_module_xxencode
);
14 DE_DECLARE_MODULE(de_module_ascii85
);
// State used while locating and classifying a uuencode-style header line.
// NOTE(review): only part of this struct is visible here; other members
// (e.g. hdr_line_type, hdr_line_len, data_startpos) are used elsewhere in the
// file but their declarations are not shown.
16 struct uu_hdr_parser
{
// Values stored in hdr_line_type: a plain "begin " line (uuencode or
// xxencode), or a "begin-base64 " line.
17 #define HDR_UUENCODE_OR_XXENCODE 11
18 #define HDR_UUENCODE_BASE64 12
// Offset in the input file of the line currently being examined as a
// potential header line.
20 i64 hdr_line_startpos
;
// Per-run decoder state, referred to as "lctx" by the functions in this file.
// NOTE(review): only part of this struct is visible here; members such as
// cbuf, cbuf_count, data_fmt, ascii85_fmt, bytes_written, output_filesize and
// fi are used elsewhere in the file but their declarations are not shown.
25 typedef struct localctx_struct
{
// Values for data_fmt (FMT_BASE64 is also used in this file, but its
// definition is not visible here).
30 #define FMT_UUENCODE 2
31 #define FMT_XXENCODE 3
// Values for ascii85_fmt, as returned by ascii85_detect_fmt().
34 #define ASCII85_FMT_BTOA_OLD 21
35 #define ASCII85_FMT_BTOA_NEW 22
36 #define ASCII85_FMT_STANDARD 23
// Set to 1 once a btoa "End" line has supplied output_filesize; tested
// before limiting or verifying the number of bytes written.
41 int output_filesize_known
;
// Returns 3 (the length of a UTF-8 byte order mark) when
// dbuf_has_utf8_bom() reports a BOM at offset 0 of the input file.
// NOTE(review): the return for the no-BOM case is not visible here;
// presumably 0 -- confirm.
45 static i64
bom_length(deark
*c
)
47 if(dbuf_has_utf8_bom(c
->infile
, 0)) return 3;
51 // **************************************************************************
52 // Base16 / Hex encoding
53 // **************************************************************************
// "Run" function for the Base16 module.
// Creates an output file with the "bin" extension and passes the whole input
// file (offset 0, length c->infile->len) to de_decode_base16().
// The mparams argument is not used by the visible code.
55 static void de_run_base16(deark
*c
, de_module_params
*mparams
)
59 f
= dbuf_create_output_file(c
, "bin", NULL
, 0);
60 de_decode_base16(c
, c
->infile
, 0, c
->infile
->len
, f
, 0);
// Fills in the module-info record for the Base16 module: registers "hex" as
// the first id alias and installs de_run_base16 as the run function.
64 void de_module_base16(deark
*c
, struct deark_module_info
*mi
)
67 mi
->id_alias
[0] = "hex";
69 mi
->run_fn
= de_run_base16
;
72 // **************************************************************************
74 // **************************************************************************
// Converts the 6-bit values buffered in d->cbuf into output bytes and writes
// them to f. Up to three bytes are produced:
//   byte 0 needs at least 2 buffered values, byte 1 needs 3, byte 2 needs 4.
// A byte is written only while bytes_written is still below max_to_write, so
// the caller can cap the output (e.g. at the end of a uuencoded line).
76 // Returns number of bytes written
77 static i64
do_base64_flush(deark
*c
, lctx
*d
, dbuf
*f
, i64 max_to_write
)
79 i64 bytes_written
= 0;
// First output byte: all 6 bits of value 0 plus the top 2 bits of value 1.
81 if(d
->cbuf_count
>=2 && bytes_written
<max_to_write
) {
82 dbuf_writebyte(f
, (u8
)((d
->cbuf
[0]<<2)|(d
->cbuf
[1]>>4)));
// Second output byte: low 4 bits of value 1 plus the top 4 bits of value 2.
85 if(d
->cbuf_count
>=3 && bytes_written
<max_to_write
) {
86 dbuf_writebyte(f
, (u8
)((d
->cbuf
[1]<<4)|(d
->cbuf
[2]>>2)));
// Third output byte: low 2 bits of value 2 plus all 6 bits of value 3.
89 if(d
->cbuf_count
>=4 && bytes_written
<max_to_write
) {
90 dbuf_writebyte(f
, (u8
)((d
->cbuf
[2]<<6)|d
->cbuf
[3]));
// Decodes Base64 text to outf.
// Each input byte is mapped to a 6-bit value and appended to d->cbuf:
//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61.
// Values 62 and 63, and the terminator flag, are set in branches whose
// conditions are not visible here (presumably '+', '/' and '=' -- confirm).
// Tab, LF, CR and space are ignored; any other byte triggers a warning.
// Every time 4 values are buffered they are flushed as up to 3 bytes. After
// the loop, leftover values are flushed too, with a warning if no terminator
// was seen or if only a single value remains.
97 // Read base64 from c->infile starting at offset 'pos'.
98 static void do_base64_internal(deark
*c
, lctx
*d
, i64 pos
, dbuf
*outf
)
101 int found_terminator
= 0;
105 while(pos
<c
->infile
->len
) {
106 b
= de_getbyte(pos
++);
107 if(b
>='A' && b
<='Z') {
108 d
->cbuf
[d
->cbuf_count
++] = b
-65;
110 else if(b
>='a' && b
<='z') {
111 d
->cbuf
[d
->cbuf_count
++] = b
-71;
113 else if(b
>='0' && b
<='9') {
114 d
->cbuf
[d
->cbuf_count
++] = b
+4;
117 d
->cbuf
[d
->cbuf_count
++] = 62;
120 d
->cbuf
[d
->cbuf_count
++] = 63;
123 found_terminator
= 1;
126 else if(b
==9 || b
==10 || b
==13 || b
==32) {
127 ; // ignore whitespace
// NOTE(review): pos was already advanced by the pos++ above, so the offset
// printed here is one past the offending byte.
131 de_warn(c
, "Bad Base64 character(s) found (offset %d)", (int)pos
);
// A full group of four 6-bit values yields three output bytes.
136 if(d
->cbuf_count
>=4) {
137 do_base64_flush(c
, d
, outf
, 3);
// End of input: handle a final, partial group.
141 if(d
->cbuf_count
>0) {
142 if(!found_terminator
|| d
->cbuf_count
==1) {
143 de_warn(c
, "Unexpected end of Base64 data");
145 do_base64_flush(c
, d
, outf
, 3);
// "Run" function for the Base64 module.
// Allocates the decoder context, creates an output file with the "bin"
// extension, and decodes the input file as Base64 starting at offset 0.
// The mparams argument is not used by the visible code.
149 static void de_run_base64(deark
*c
, de_module_params
*mparams
)
154 d
= de_malloc(c
, sizeof(lctx
));
156 f
= dbuf_create_output_file(c
, "bin", NULL
, 0);
157 do_base64_internal(c
, d
, 0, f
);
// Fills in the module-info record for the Base64 module by installing
// de_run_base64 as the run function.
163 void de_module_base64(deark
*c
, struct deark_module_info
*mi
)
167 mi
->run_fn
= de_run_base64
;
170 // **************************************************************************
173 // **************************************************************************
// Parses a header line of the form
//   "begin <mode> <filename>"   or   "begin-base64 <mode> <filename>"
// and records the results in fi.
//   - beginsize is the length of the leading keyword (5 or 12), chosen from
//     uuhp->hdr_line_type.
//   - The line must be at least beginsize+6 bytes long, with a space right
//     after the keyword and another one 4 bytes later (i.e. after a 3-character
//     mode field).
//   - The mode is parsed as octal; if any execute bit (0111) is set the file
//     is flagged executable, and the non-executable flag is set on the other
//     visible path.
//   - Everything from offset beginsize+5 to the end of the line is taken as
//     the filename (interpreted as ASCII) and stored as the original filename.
// NOTE(review): what happens when the format check fails is not visible here.
175 // Caller passes buf (not NUL terminated) to us.
176 static void parse_begin_line(deark
*c
,
177 struct uu_hdr_parser
*uuhp
, de_finfo
*fi
, const u8
*buf
, i64 buf_len
)
180 de_ucstring
*fn
= NULL
;
182 size_t nbytes_to_copy
;
187 if(uuhp
->hdr_line_type
==HDR_UUENCODE_OR_XXENCODE
) {
188 beginsize
= 5; // "begin" has 5 letters
190 else if(uuhp
->hdr_line_type
==HDR_UUENCODE_BASE64
) {
191 beginsize
= 12; // "begin-base64"
197 if(buf_len
<beginsize
+6 || buf
[beginsize
]!=' ' || buf
[beginsize
+4]!=' ') {
// The copy length is the rest of the line after "<keyword> ", clamped to the
// size of tmpbuf.
// NOTE(review): buf is not NUL-terminated; confirm that de_strlcpy() never
// reads past nbytes_to_copy bytes of the source.
201 // Make a NUL-terminated copy of the file permissions mode.
202 nbytes_to_copy
= (size_t)(buf_len
- (beginsize
+1));
203 if(nbytes_to_copy
>sizeof(tmpbuf
)) nbytes_to_copy
= sizeof(tmpbuf
);
204 de_strlcpy(tmpbuf
, (const char*)&buf
[beginsize
+1], nbytes_to_copy
);
205 mode
= de_strtoll(tmpbuf
, NULL
, 8);
206 de_dbg(c
, "mode: %03o", (unsigned int)mode
);
// 0111 = owner/group/other execute bits.
207 if((mode
& 0111)!=0) {
208 fi
->mode_flags
|= DE_MODEFLAG_EXE
;
211 fi
->mode_flags
|= DE_MODEFLAG_NONEXE
;
// The filename starts after "<keyword> <3-char mode> ".
214 fn
= ucstring_create(c
);
215 ucstring_append_bytes(fn
, &buf
[beginsize
+5], buf_len
-(beginsize
+5), 0, DE_ENCODING_ASCII
);
216 de_dbg(c
, "filename: \"%s\"", ucstring_getpsz_d(fn
));
217 de_finfo_set_name_from_ucstring(c
, fi
, fn
, 0);
218 fi
->original_filename_flag
= 1;
221 ucstring_destroy(fn
);
// Searches the start of the input file for a uuencode-style header line.
// Scanning begins just after any UTF-8 BOM and looks at up to 100 lines.
// For each line:
//   - a line longer than 1000 bytes makes the function return 0;
//   - at most sizeof(linebuf) bytes of the line are read into linebuf;
//   - uuhp->data_startpos is set to the start of the following line;
//   - a line of >=9 bytes starting with "begin " sets hdr_line_type to
//     HDR_UUENCODE_OR_XXENCODE, and a line of >=16 bytes starting with
//     "begin-base64 " sets it to HDR_UUENCODE_BASE64; parse_begin_line() is
//     called for a matching line (any guard on that call is not visible here);
//   - otherwise the scan moves on to the next line.
// If no header is found, an error is reported (unless id_mode is set) and
// hdr_line_type is reset to 0.
// NOTE(review): the tail of the parameter list and the return statements for
// the success/failure paths are not visible in this listing.
224 // If id_mode==1, just find the header line and identify the format.
225 static int uuencode_read_header(deark
*c
, struct uu_hdr_parser
*uuhp
, de_finfo
*fi
,
232 i64 nbytes_in_linebuf
;
234 uuhp
->hdr_line_startpos
= bom_length(c
);
236 while(line_count
<100) {
237 ret
= dbuf_find_line(c
->infile
, uuhp
->hdr_line_startpos
,
238 &uuhp
->hdr_line_len
, &total_len
);
240 if(uuhp
->hdr_line_len
> 1000) return 0;
// Read min(sizeof(linebuf), line length) bytes of the line.
242 nbytes_in_linebuf
= (i64
)sizeof(linebuf
);
243 if(uuhp
->hdr_line_len
< nbytes_in_linebuf
)
244 nbytes_in_linebuf
= uuhp
->hdr_line_len
;
245 de_read(linebuf
, uuhp
->hdr_line_startpos
, nbytes_in_linebuf
);
// total_len comes from dbuf_find_line(); adding it to the line start gives
// the position of the next line, where the encoded data would begin.
247 uuhp
->data_startpos
= uuhp
->hdr_line_startpos
+ total_len
;
249 if(nbytes_in_linebuf
>=9 && !de_memcmp(linebuf
, "begin ", 6)) {
250 uuhp
->hdr_line_type
= HDR_UUENCODE_OR_XXENCODE
;
252 parse_begin_line(c
, uuhp
, fi
, linebuf
, nbytes_in_linebuf
);
257 if(nbytes_in_linebuf
>=16 && !de_memcmp(linebuf
, "begin-base64 ", 13)) {
258 uuhp
->hdr_line_type
= HDR_UUENCODE_BASE64
;
260 parse_begin_line(c
, uuhp
, fi
, linebuf
, nbytes_in_linebuf
);
// Not a header line: advance to the next line.
265 uuhp
->hdr_line_startpos
+= total_len
;
269 if(!id_mode
) de_err(c
, "Unrecognized file format");
270 uuhp
->hdr_line_type
= 0;
// Maps one encoded character b to its 6-bit value, stored in *val.
// For XXEncode (d->data_fmt==FMT_XXENCODE) the alphabet is:
//   '+' -> 0, '-' -> 1, '0'..'9' -> 2..11, 'A'..'Z' -> 12..37,
//   'a'..'z' -> 38..63.
// NOTE(review): the standard-uuencode mapping and the return values are not
// visible in this listing; callers store the result in "ret", presumably a
// success flag -- confirm.
274 static int get_uu_byte_value(deark
*c
, lctx
*d
, u8 b
, u8
*val
)
276 if(d
->data_fmt
==FMT_XXENCODE
) {
277 if(b
>='0' && b
<='9') *val
= b
-46;
278 else if(b
>='A' && b
<='Z') *val
= b
-53;
279 else if(b
>='a' && b
<='z') *val
= b
-59;
280 else if(b
=='+') *val
= 0;
281 else if(b
=='-') *val
= 1;
289 // Standard UUEncoding
// Decodes uuencoded or xxencoded data (selected by d->data_fmt) to outf.
// The input is processed byte by byte as a sequence of lines:
//   - CR/LF bytes at the start of a line are treated as the remainder of a
//     multi-byte EOL sequence or as a blank line;
//   - the first real character of a line is decoded to give the number of
//     bytes that line is expected to produce;
//   - a leading 'e' that fails to decode in uuencode format is assumed to be
//     the "end" footer; any other undecodable length character is an error;
//   - at the end of a line, buffered values are flushed (capped at the
//     remaining expected count), a mismatch between expected and actual
//     counts produces a warning, and a complete line shorter than 45 bytes is
//     assumed to mark the end of the data;
//   - other characters are decoded to 6-bit values, buffered in d->cbuf, and
//     flushed in groups of four.
// NOTE(review): the tail of the parameter list and several branch bodies are
// not visible in this listing.
298 // Data is decoded from c->infile, starting at uuhp->data_startpos.
299 static void do_uudecode_internal(deark
*c
, lctx
*d
, struct uu_hdr_parser
*uuhp
,
307 int start_of_line_flag
;
308 i64 decoded_bytes_this_line
;
309 i64 expected_decoded_bytes_this_line
;
311 pos
= uuhp
->data_startpos
;
313 decoded_bytes_this_line
= 0;
314 expected_decoded_bytes_this_line
= 0;
315 start_of_line_flag
= 1;
318 while(pos
<c
->infile
->len
) {
319 b
= de_getbyte(pos
++);
321 if(start_of_line_flag
&& (b
==10 || b
==13)) {
322 // Multi-byte EOL sequence, or blank line
// First character of a line: it encodes the line's decoded byte count.
326 if(start_of_line_flag
) {
327 start_of_line_flag
= 0;
329 ret
= get_uu_byte_value(c
, d
, b
, &x
);
331 if(b
=='e' && d
->data_fmt
==FMT_UUENCODE
) {
332 // Assume this is the "end" footer line.
// NOTE(review): pos was already advanced by the pos++ above, so the offset
// printed here is one past the offending byte.
335 de_err(c
, "Bad uuencoded data (offset %d)", (int)pos
);
339 expected_decoded_bytes_this_line
= x
;
340 if(expected_decoded_bytes_this_line
==0) {
// End of line: write whatever is still buffered, but never more than the
// line's declared byte count.
348 decoded_bytes_this_line
+= do_base64_flush(c
, d
, outf
,
349 expected_decoded_bytes_this_line
-decoded_bytes_this_line
);
351 if(decoded_bytes_this_line
!= expected_decoded_bytes_this_line
) {
352 de_warn(c
, "Expected %d bytes on line, got %d",
353 (int)expected_decoded_bytes_this_line
, (int)decoded_bytes_this_line
);
356 if(decoded_bytes_this_line
<45 &&
357 decoded_bytes_this_line
==expected_decoded_bytes_this_line
)
359 // Assume a short line means end of data.
// Reset the per-line state for the next line.
363 decoded_bytes_this_line
= 0;
364 start_of_line_flag
= 1;
368 // Expecting a regular data byte
369 ret
= get_uu_byte_value(c
, d
, b
, &x
);
371 d
->cbuf
[d
->cbuf_count
++] = x
;
372 if(d
->cbuf_count
>=4) {
373 decoded_bytes_this_line
+= do_base64_flush(c
, d
, outf
,
374 expected_decoded_bytes_this_line
-decoded_bytes_this_line
);
// NOTE(review): as above, the reported offset is one past the bad character.
379 de_warn(c
, "Bad uuencode character (offset %d)", (int)pos
);
// "Run" function for the UUEncode module.
// Allocates the header parser and decoder context, reads the header line
// (which also fills in d->fi), and then decodes according to the header type:
//   - "begin-base64" header: the data is decoded as Base64;
//   - otherwise (on the other visible path): the data is decoded as
//     standard uuencode.
// In both cases the output file is created from d->fi, which is destroyed at
// the end.
// NOTE(review): handling of a failed header read ("ret") is not visible here.
389 static void de_run_uuencode(deark
*c
, de_module_params
*mparams
)
393 struct uu_hdr_parser
*uuhp
= NULL
;
396 uuhp
= de_malloc(c
, sizeof(struct uu_hdr_parser
));
397 d
= de_malloc(c
, sizeof(lctx
));
399 d
->fi
= de_finfo_create(c
);
400 ret
= uuencode_read_header(c
, uuhp
, d
->fi
, 0);
403 if(uuhp
->hdr_line_type
==HDR_UUENCODE_BASE64
) {
404 de_declare_fmt(c
, "Base64 with uuencode wrapper");
405 d
->data_fmt
= FMT_BASE64
;
406 f
= dbuf_create_output_file(c
, NULL
, d
->fi
, 0);
407 do_base64_internal(c
, d
, uuhp
->data_startpos
, f
);
410 de_declare_fmt(c
, "Uuencoded");
411 d
->data_fmt
= FMT_UUENCODE
;
412 f
= dbuf_create_output_file(c
, NULL
, d
->fi
, 0);
413 do_uudecode_internal(c
, d
, uuhp
, f
);
422 de_finfo_destroy(c
, d
->fi
);
// Returns nonzero if x is an ASCII decimal digit ('0'..'9'), else 0.
427 static int de_is_digit(u8 x
)
429 return (x
>='0' && x
<='9');
// Checks the first len bytes of s; returns 0 as soon as one of them is not an
// ASCII decimal digit.
// NOTE(review): the final return is not visible here; presumably 1 when every
// byte is a digit -- confirm.
432 static int de_is_digit_string(const u8
*s
, i64 len
)
435 for(i
=0; i
<len
; i
++) {
436 if(!de_is_digit(s
[i
])) return 0;
// Identification function for the UUEncode module.
// Fast path: reads the first bytes of the file into b and checks for a
// "begin-base64 " prefix, or for "begin " followed by three digits and a space.
// Slow path: the file extension is tested against "uue"/"uu" (the action
// taken when neither matches is not visible here); the header is then
// searched for with uuencode_read_header() in id_mode and its type inspected.
// NOTE(review): the confidence values returned on each path are not visible
// in this listing.
441 static int de_identify_uuencode(deark
*c
)
446 struct uu_hdr_parser
*uuhp
;
449 de_read(b
, pos
, sizeof(b
));
451 if(!de_memcmp(b
, "begin-base64 ", 13)) {
454 if(!de_memcmp(b
, "begin ", 6)) {
// b[6..8] is the 3-digit mode field; b[9] is the space that follows it.
455 if(b
[9]==' ' && de_is_digit_string(&b
[6], 3)) {
456 // This needs to be lower than XXEncode.
461 if(!de_input_file_has_ext(c
, "uue") &&
462 !de_input_file_has_ext(c
, "uu"))
467 // For some extensions, do detection the slow way.
468 uuhp
= de_malloc(c
, sizeof(struct uu_hdr_parser
));
// fi is NULL and id_mode is 1: only the header type is wanted here.
469 uuencode_read_header(c
, uuhp
, NULL
, 1);
470 if(uuhp
->hdr_line_type
==HDR_UUENCODE_OR_XXENCODE
) {
473 else if(uuhp
->hdr_line_type
==HDR_UUENCODE_BASE64
) {
// Fills in the module-info record for the UUEncode module: description
// string, run function, and identify function.
481 void de_module_uuencode(deark
*c
, struct deark_module_info
*mi
)
484 mi
->desc
= "UUEncode";
485 mi
->run_fn
= de_run_uuencode
;
486 mi
->identify_fn
= de_identify_uuencode
;
489 // **************************************************************************
491 // **************************************************************************
// "Run" function for the XXEncode module.
// Reads the header line the same way the uuencode module does, but only
// proceeds when it is a plain "begin " header (HDR_UUENCODE_OR_XXENCODE);
// any other result jumps to the cleanup label. The data is then decoded with
// do_uudecode_internal() using the XXEncode alphabet, into an output file
// created from d->fi. d->fi is destroyed at the end.
493 static void de_run_xxencode(deark
*c
, de_module_params
*mparams
)
497 struct uu_hdr_parser
*uuhp
= NULL
;
500 uuhp
= de_malloc(c
, sizeof(struct uu_hdr_parser
));
501 d
= de_malloc(c
, sizeof(lctx
));
503 d
->fi
= de_finfo_create(c
);
504 ret
= uuencode_read_header(c
, uuhp
, d
->fi
, 0);
506 if(uuhp
->hdr_line_type
!=HDR_UUENCODE_OR_XXENCODE
) goto done
;
508 de_declare_fmt(c
, "XXEncoded");
509 f
= dbuf_create_output_file(c
, NULL
, d
->fi
, 0);
510 d
->data_fmt
= FMT_XXENCODE
;
511 do_uudecode_internal(c
, d
, uuhp
, f
);
519 de_finfo_destroy(c
, d
->fi
);
// Identification function for the XXEncode module.
// The file extension is tested against "xxe"/"xx" (the action taken when
// neither matches is not visible here). The header is then searched for with
// uuencode_read_header() in id_mode; if a plain "begin " header is found, the
// confidence is 90 when the header line starts within the first 3 bytes of
// the file (i.e. at the very start, allowing for a BOM) and 20 otherwise.
524 static int de_identify_xxencode(deark
*c
)
527 struct uu_hdr_parser
*uuhp
;
529 // XXEncode is hard to distinguish from UUEncode, so we rely on the
531 if(!de_input_file_has_ext(c
, "xxe") &&
532 !de_input_file_has_ext(c
, "xx"))
537 uuhp
= de_malloc(c
, sizeof(struct uu_hdr_parser
));
538 uuencode_read_header(c
, uuhp
, NULL
, 1);
539 if(uuhp
->hdr_line_type
==HDR_UUENCODE_OR_XXENCODE
) {
540 retval
= (uuhp
->hdr_line_startpos
<=3) ? 90 : 20;
// Fills in the module-info record for the XXEncode module: description
// string, run function, and identify function.
547 void de_module_xxencode(deark
*c
, struct deark_module_info
*mi
)
550 mi
->desc
= "XXEncode";
551 mi
->run_fn
= de_run_xxencode
;
552 mi
->identify_fn
= de_identify_xxencode
;
555 // **************************************************************************
557 // **************************************************************************
// Converts the base-85 digits buffered in d->cbuf into output bytes.
// The digits are combined most-significant-first into a 32-bit value
// (code = code*85 + digit); positions beyond the buffered digits are filled
// with the maximum digit value, 84. The value is then written big-endian,
// one byte per buffered digit after the first (2 digits -> 1 byte, ...,
// 5 digits -> 4 bytes). When the output file size is known, writing stops as
// soon as d->bytes_written reaches d->output_filesize.
// Does nothing if the buffer is empty.
// NOTE(review): the loop headers, the bytes_written updates and the reset of
// cbuf_count are not visible in this listing.
559 static void do_ascii85_flush(deark
*c
, lctx
*d
, dbuf
*f
)
564 if(d
->cbuf_count
<1) return;
566 code
= (u32
)d
->cbuf
[0];
569 code
= code
*85 + (u32
)d
->cbuf
[i
];
571 code
= code
*85 + 84; // (This shouldn't happen with btoa format)
574 // TODO: Simplify this code
575 if(d
->cbuf_count
>=2) {
576 dbuf_writebyte(f
, (u8
)((code
>>24)&0xff));
578 if(d
->output_filesize_known
&& d
->bytes_written
>=d
->output_filesize
) goto done
;
580 if(d
->cbuf_count
>=3) {
581 dbuf_writebyte(f
, (u8
)((code
>>16)&0xff));
583 if(d
->output_filesize_known
&& d
->bytes_written
>=d
->output_filesize
) goto done
;
585 if(d
->cbuf_count
>=4) {
586 dbuf_writebyte(f
, (u8
)((code
>>8)&0xff));
588 if(d
->output_filesize_known
&& d
->bytes_written
>=d
->output_filesize
) goto done
;
590 if(d
->cbuf_count
>=5) {
591 dbuf_writebyte(f
, (u8
)(code
&0xff));
593 if(d
->output_filesize_known
&& d
->bytes_written
>=d
->output_filesize
) goto done
;
// Adds one already-unbiased base-85 digit (x, in the range 0..84) to d->cbuf.
// If the buffer already holds a full group of 5 digits, that group is flushed
// to f first. The digit is stored only if the buffer has room for it.
// NOTE(review): the tail of the parameter list and the cbuf_count increment
// are not visible in this listing.
600 static void do_ascii85_data_char_processed(deark
*c
, lctx
*d
, dbuf
*f
, i64 linenum
,
603 // Write to the output file immediately before we empty cbuf, instead of
604 // immediately after we fill it.
605 // This is necessary because (in old btoa format at least) we don't yet
606 // know the file size.
607 // Until we know that, we don't know how many bytes need to be written for
608 // the very last group.
609 if(d
->cbuf_count
>=5) {
610 do_ascii85_flush(c
, d
, f
);
// Bounds check against the capacity of cbuf before storing.
613 if(d
->cbuf_count
<(UI
)sizeof(d
->cbuf
)) {
614 d
->cbuf
[d
->cbuf_count
] = x
;
// Handles one raw Ascii85 input character x:
//   - '!'..'u' is an ordinary digit; the +33 bias is removed and the digit
//     is passed on;
//   - the 'z' shorthand stands for four 0x00 bytes and is expanded to zero
//     digits (the test and loop around the call are not visible here);
//   - 'y', in new-style btoa format only, stands for four spaces and is
//     expanded to the five digits that encode 0x20202020.
// NOTE(review): the tail of the parameter list and the handling of any other
// character are not visible in this listing.
619 static void do_ascii85_data_char_raw(deark
*c
, lctx
*d
, dbuf
*f
, i64 linenum
,
624 if(x
>='!' && x
<='u') {
625 do_ascii85_data_char_processed(c
, d
, f
, linenum
, x
-33);
628 // 'z' represents four 0x00 bytes, which encodes to five 0 values
629 // (not including the +33 bias).
631 do_ascii85_data_char_processed(c
, d
, f
, linenum
, 0);
633 else if(x
=='y' && d
->ascii85_fmt
==ASCII85_FMT_BTOA_NEW
) {
634 // This is what four spaces encodes to (not including the +33 bias).
635 do_ascii85_data_char_processed(c
, d
, f
, linenum
, 0x0a);
636 do_ascii85_data_char_processed(c
, d
, f
, linenum
, 0x1b);
637 do_ascii85_data_char_processed(c
, d
, f
, linenum
, 0x35);
638 do_ascii85_data_char_processed(c
, d
, f
, linenum
, 0x43);
639 do_ascii85_data_char_processed(c
, d
, f
, linenum
, 0x2b);
// Decodes one line of btoa data. Empty lines are ignored. In new-style btoa
// format the last character of the line is a checksum and is excluded from
// decoding; on the other visible path every character is data. Each data
// character is passed to do_ascii85_data_char_raw().
643 static void do_ascii85_data_line(deark
*c
, lctx
*d
, dbuf
*f
, i64 linenum
,
644 const u8
*linebuf
, i64 line_len
)
649 if(line_len
<1) return;
651 if(d
->ascii85_fmt
==ASCII85_FMT_BTOA_NEW
)
652 num_data_chars
= line_len
-1; // The last character is a checksum
654 num_data_chars
= line_len
;
656 for(i
=0; i
<num_data_chars
; i
++) {
657 do_ascii85_data_char_raw(c
, d
, f
, linenum
, linebuf
[i
]);
660 // TODO: Verify the checksum character, if present.
// Parses a btoa footer line of the form "xbtoa End N <size> ..." and records
// the reported size in d->output_filesize, marking it as known.
// Reports an error if the size cannot be parsed.
// linebuf must be NUL-terminated, since it is handed to de_sscanf().
// NOTE(review): the return statements are not visible here; the caller treats
// a zero return as failure. Also assumes filesize1 is declared as long to
// match the %ld conversion -- confirm.
663 static int do_ascii85_read_btoa_end_line(deark
*c
, lctx
*d
, i64 linenum
,
664 const u8
*linebuf
, i64 line_len
)
668 de_dbg(c
, "btoa footer at line %d", (int)linenum
);
669 if(de_sscanf((const char *)linebuf
, "xbtoa End N %ld ", &filesize1
) != 1) {
670 de_err(c
, "Bad btoa End line");
674 d
->output_filesize
= (i64
)filesize1
;
675 d
->output_filesize_known
= 1;
676 de_dbg(c
, "reported file size: %d", (int)d
->output_filesize
);
// Decodes a btoa-format file (old or new style) line by line.
// For each line:
//   - failure to find a line, or a line too long for linebuf, is an error;
//   - the line is read into linebuf and NUL-terminated;
//   - empty lines are skipped;
//   - lines starting with 'x' are checked for the "xbtoa5 " (new format)
//     header, the "xbtoa Begin" (old format) header, and the "xbtoa End"
//     footer, which supplies the output file size;
//   - remaining lines are passed to do_ascii85_data_line().
// After the loop the final group is flushed, and an error is reported if the
// number of bytes written differs from the size given in the footer.
// NOTE(review): the loop header, position/line-number updates and the exits
// taken after errors or after the footer are not visible in this listing.
680 static void do_ascii85_btoa(deark
*c
, lctx
*d
, dbuf
*f
)
693 if(!dbuf_find_line(c
->infile
, pos
, &content_len
, &total_len
)) {
694 de_err(c
, "Bad Ascii85 format at line %d", (int)linenum
);
// Leave room for the NUL terminator added below.
699 if(content_len
> (i64
)(sizeof(linebuf
)-1)) {
700 de_err(c
, "Line %d too long", (int)linenum
);
703 de_read(linebuf
, pos
, content_len
);
704 linebuf
[content_len
] = '\0'; // NUL terminate, in case we run sscanf
707 if(content_len
<1) continue;
// Lines beginning with 'x' may be btoa header/footer lines.
709 if(linebuf
[0]=='x') {
710 if(content_len
>=7 && !de_memcmp(linebuf
, "xbtoa5 ", 7)) {
711 de_dbg(c
, "btoa new format header at line %d", (int)linenum
);
712 d
->ascii85_fmt
= ASCII85_FMT_BTOA_NEW
;
714 else if(content_len
>=11 && !de_memcmp(linebuf
, "xbtoa Begin", 11)) {
715 de_dbg(c
, "btoa old format header at line %d", (int)linenum
);
717 else if(content_len
>=9 && !de_memcmp(linebuf
, "xbtoa End", 9)) {
718 if(!do_ascii85_read_btoa_end_line(c
, d
, linenum
, linebuf
, content_len
)) {
727 do_ascii85_data_line(c
, d
, f
, linenum
, linebuf
, content_len
);
// Write out the last (possibly partial) group.
730 do_ascii85_flush(c
, d
, f
);
732 if(d
->output_filesize_known
&& (d
->bytes_written
!= d
->output_filesize
)) {
733 de_err(c
, "Expected output file size=%d, actual size=%d", (int)d
->output_filesize
,
734 (int)d
->bytes_written
);
// Decodes standard ("<~"-prefixed) Ascii85 data starting at offset pos.
// Bytes are read one at a time until the end of the input file and passed to
// do_ascii85_data_char_raw() with a line number of 0; the final group is then
// flushed.
// NOTE(review): the loop header and any other exit or skip conditions (e.g.
// for a terminator or whitespace) are not visible in this listing.
741 static void do_ascii85_standard(deark
*c
, lctx
*d
, dbuf
*f
, i64 pos
)
748 if(pos
>= c
->infile
->len
) break;
749 x
= de_getbyte(pos
++);
753 do_ascii85_data_char_raw(c
, d
, f
, 0, x
);
756 do_ascii85_flush(c
, d
, f
);
// Detects the Ascii85 variant from the signature at the start of the input:
//   "xbtoa Begin" -> ASCII85_FMT_BTOA_OLD
//   "xbtoa5 "     -> ASCII85_FMT_BTOA_NEW
//   "<~"          -> ASCII85_FMT_STANDARD
// NOTE(review): how buf is filled and the fall-through return are not visible
// here; callers treat 0 as "unknown format".
759 static int ascii85_detect_fmt(deark
*c
)
765 if(!de_memcmp(buf
, "xbtoa Begin", 11)) {
766 return ASCII85_FMT_BTOA_OLD
;
768 else if(!dbuf_memcmp(c
->infile
, 0, "xbtoa5 ", 7)) {
769 return ASCII85_FMT_BTOA_NEW
;
771 else if(!dbuf_memcmp(c
->infile
, 0, "<~", 2)) {
772 return ASCII85_FMT_STANDARD
;
// "Run" function for the Ascii85 module.
// Allocates the decoder context and detects the format variant; an unknown
// format is reported as an error. Otherwise an output file with the "bin"
// extension is created and the matching decoder is run: the btoa decoder for
// both btoa variants, or the standard decoder starting at offset 2 (just past
// the "<~" signature).
777 static void de_run_ascii85(deark
*c
, de_module_params
*mparams
)
782 d
= de_malloc(c
, sizeof(lctx
));
784 d
->ascii85_fmt
= ascii85_detect_fmt(c
);
785 if(d
->ascii85_fmt
==0) {
786 // TODO: Scan the file to try to detect the format.
787 de_err(c
, "Unknown Ascii85 format");
791 f
= dbuf_create_output_file(c
, "bin", NULL
, 0);
793 if(d
->ascii85_fmt
==ASCII85_FMT_BTOA_OLD
||
794 d
->ascii85_fmt
==ASCII85_FMT_BTOA_NEW
)
796 do_ascii85_btoa(c
, d
, f
);
798 else if(d
->ascii85_fmt
==ASCII85_FMT_STANDARD
) {
799 do_ascii85_standard(c
, d
, f
, 2);
// Identification function for the Ascii85 module: detects the format variant
// with ascii85_detect_fmt() and branches on the result.
// NOTE(review): the confidence value returned for each variant is not visible
// in this listing.
807 static int de_identify_ascii85(deark
*c
)
811 fmt
= ascii85_detect_fmt(c
);
813 if(fmt
==ASCII85_FMT_BTOA_OLD
) {
816 else if(fmt
==ASCII85_FMT_BTOA_NEW
) {
819 else if(fmt
==ASCII85_FMT_STANDARD
) {
// Fills in the module-info record for the Ascii85 module: description
// string, run function, and identify function.
826 void de_module_ascii85(deark
*c
, struct deark_module_info
*mi
)
829 mi
->desc
= "Ascii85";
830 mi
->run_fn
= de_run_ascii85
;
831 mi
->identify_fn
= de_identify_ascii85
;