zip: Better parsing of Info-ZIP type 1 extra field
[deark.git] / modules / gzip.c
blob3d72f1e501cd3aa44cb951bd0059ba5f8bb155da
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // gzip compressed file format
7 #include <deark-config.h>
8 #include <deark-private.h>
9 #include <deark-fmtutil.h>
10 DE_DECLARE_MODULE(de_module_gzip);
12 struct member_data {
13 #define GZIPFLAG_FTEXT 0x01
14 #define GZIPFLAG_FHCRC 0x02
15 #define GZIPFLAG_FEXTRA 0x04
16 #define GZIPFLAG_FNAME 0x08
17 #define GZIPFLAG_FCOMMENT 0x10
18 u8 flags;
19 u8 cmpr_code;
20 u32 crc16_reported;
21 u32 crc32_reported;
22 i64 isize;
23 struct de_timestamp mod_time_ts;
25 struct de_crcobj *crco; // A copy of lctx->crco
28 typedef struct lctx_struct {
29 dbuf *output_file;
30 struct de_crcobj *crco;
31 } lctx;
33 static const char *get_os_name(u8 n)
35 const char *names[14] = { "FAT", "Amiga", "VMS", "Unix",
36 "VM/CMS", "Atari", "HPFS", "Mac", "Z-System", "CP/M",
37 "TOPS-20", "NTFS", "QDOS", "RISCOS" };
38 const char *name = "?";
40 if((unsigned int)n<=13) {
41 name = names[(unsigned int)n];
43 return name;
46 static void our_writelistener_cb(dbuf *f, void *userdata, const u8 *buf, i64 buf_len)
48 struct member_data *md = (struct member_data *)userdata;
49 de_crcobj_addbuf(md->crco, buf, buf_len);
52 static int do_gzip_read_member(deark *c, lctx *d, i64 pos1, i64 *member_size)
54 u8 b0, b1;
55 i64 pos;
56 i64 n;
57 i64 foundpos;
58 i64 string_len;
59 i64 cmpr_data_len;
60 i64 mod_time_unix;
61 u32 crc_calculated;
62 de_ucstring *member_name = NULL;
63 int saved_indent_level;
64 int ret;
65 struct member_data *md = NULL;
66 int retval = 0;
68 md = de_malloc(c, sizeof(struct member_data));
70 de_dbg_indent_save(c, &saved_indent_level);
72 de_dbg(c, "gzip member at %d", (int)pos1);
73 de_dbg_indent(c, 1);
74 pos = pos1;
76 b0 = de_getbyte(pos+0);
77 b1 = de_getbyte(pos+1);
78 if(b0!=0x1f || b1!=0x8b) {
79 de_err(c, "Invalid gzip signature at %d. This is not a valid gzip file.",
80 (int)pos1);
81 goto done;
84 md->cmpr_code = de_getbyte(pos+2);
85 if(md->cmpr_code!=0x08) {
86 de_err(c, "Unsupported compression type (%d)", (int)md->cmpr_code);
87 goto done;
90 md->flags = de_getbyte(pos+3);
91 de_dbg(c, "flags: 0x%02x", (unsigned int)md->flags);
92 pos += 4;
94 mod_time_unix = de_getu32le(pos);
95 de_unix_time_to_timestamp(mod_time_unix, &md->mod_time_ts, 0x1);
96 if(md->mod_time_ts.is_valid) {
97 char timestamp_buf[64];
98 de_timestamp_to_string(&md->mod_time_ts, timestamp_buf, sizeof(timestamp_buf), 0);
99 de_dbg(c, "mod time: %" I64_FMT " (%s)", mod_time_unix, timestamp_buf);
101 pos += 4;
103 b0 = de_getbyte(pos++);
104 de_dbg(c, "extra flags: 0x%02x", (unsigned int)b0);
106 b0 = de_getbyte(pos++);
107 de_dbg(c, "OS or filesystem: %d (%s)", (int)b0, get_os_name(b0));
109 if(md->flags & GZIPFLAG_FEXTRA) {
110 n = de_getu16le(pos); // XLEN
111 // TODO: It might be interesting to dissect these extra fields, but it's
112 // hard to find even a single file that uses them.
113 de_dbg(c, "[extra fields at %d, dpos=%d, dlen=%d]",
114 (int)pos, (int)(pos+2), (int)n);
115 pos += 2;
116 pos += n;
119 if(md->flags & GZIPFLAG_FNAME) {
120 ret = dbuf_search_byte(c->infile, 0x00, pos, c->infile->len - pos,
121 &foundpos);
122 if(!ret) {
123 de_err(c, "Invalid NAME field");
124 goto done;
127 string_len = foundpos - pos;
129 member_name = ucstring_create(c);
130 #define DE_GZIP_MAX_FNLEN 300
131 dbuf_read_to_ucstring_n(c->infile, pos, string_len, DE_GZIP_MAX_FNLEN,
132 member_name, 0, DE_ENCODING_LATIN1);
133 de_dbg(c, "file name at %d, len=%d: \"%s\"", (int)pos, (int)string_len,
134 ucstring_getpsz_d(member_name));
135 pos = foundpos + 1;
138 if(md->flags & GZIPFLAG_FCOMMENT) {
139 ret = dbuf_search_byte(c->infile, 0x00, pos, c->infile->len - pos,
140 &foundpos);
141 if(!ret) {
142 de_err(c, "Invalid COMMENT field");
143 goto done;
145 pos = foundpos + 1;
148 if(md->flags & GZIPFLAG_FHCRC) {
149 md->crc16_reported = (u32)de_getu16le(pos);
150 de_dbg(c, "crc16 (reported): 0x%04x", (unsigned int)md->crc16_reported);
151 pos += 2;
154 de_dbg(c, "compressed blocks at %d", (int)pos);
156 if(!d->output_file) {
157 // Although any member can have a name and mod time, this metadata
158 // is ignored for members after the first one.
159 de_finfo *fi = NULL;
161 fi = de_finfo_create(c);
163 if(member_name && c->filenames_from_file) {
164 de_finfo_set_name_from_ucstring(c, fi, member_name, 0);
165 fi->original_filename_flag = 1;
168 if(md->mod_time_ts.is_valid) {
169 fi->timestamp[DE_TIMESTAMPIDX_MODIFY] = md->mod_time_ts;
172 d->output_file = dbuf_create_output_file(c, member_name?NULL:"bin", fi, 0);
174 de_finfo_destroy(c, fi);
177 dbuf_set_writelistener(d->output_file, our_writelistener_cb, (void*)md);
178 md->crco = d->crco;
179 de_crcobj_reset(md->crco);
181 ret = fmtutil_decompress_deflate(c->infile, pos, c->infile->len - pos, d->output_file,
182 0, &cmpr_data_len, 0);
184 crc_calculated = de_crcobj_getval(md->crco);
185 dbuf_set_writelistener(d->output_file, NULL, NULL);
187 if(!ret) goto done;
188 pos += cmpr_data_len;
190 de_dbg(c, "crc32 (calculated): 0x%08x", (unsigned int)crc_calculated);
192 md->crc32_reported = (u32)de_getu32le(pos);
193 de_dbg(c, "crc32 (reported) : 0x%08x", (unsigned int)md->crc32_reported);
194 pos += 4;
196 if(crc_calculated != md->crc32_reported) {
197 de_warn(c, "CRC check failed: Expected 0x%08x, got 0x%08x",
198 (unsigned int)md->crc32_reported, (unsigned int)crc_calculated);
201 md->isize = de_getu32le(pos);
202 de_dbg(c, "uncompressed size (mod 2^32): %u", (unsigned int)md->isize);
203 pos += 4;
205 retval = 1;
207 done:
208 if(retval)
209 *member_size = pos - pos1;
210 else
211 *member_size = 0;
212 ucstring_destroy(member_name);
213 de_free(c, md);
214 de_dbg_indent_restore(c, saved_indent_level);
215 return retval;
218 static void de_run_gzip(deark *c, de_module_params *mparams)
220 lctx *d = NULL;
221 i64 pos;
222 i64 member_size;
224 d = de_malloc(c, sizeof(lctx));
225 d->crco = de_crcobj_create(c, DE_CRCOBJ_CRC32_IEEE);
227 pos = 0;
228 while(1) {
229 if(pos >= c->infile->len) break;
230 if(!do_gzip_read_member(c, d, pos, &member_size)) {
231 break;
233 if(member_size<=0) break;
235 pos += member_size;
237 dbuf_close(d->output_file);
239 if(d) {
240 de_crcobj_destroy(d->crco);
241 de_free(c, d);
245 static int de_identify_gzip(deark *c)
247 u8 buf[3];
249 de_read(buf, 0, 3);
250 if(buf[0]==0x1f && buf[1]==0x8b) {
251 if(buf[2]==0x08) return 100;
252 return 10;
254 return 0;
257 void de_module_gzip(deark *c, struct deark_module_info *mi)
259 mi->id = "gzip";
260 mi->desc = "gzip compressed file";
261 mi->run_fn = de_run_gzip;
262 mi->identify_fn = de_identify_gzip;