Added support for cp932 (Shift-JIS) conversion
[deark.git] / modules / gzip.c
blob9505d4848288373a70c7fceb17acf67cccea3eca
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // gzip compressed file format
7 #include <deark-config.h>
8 #include <deark-private.h>
9 #include <deark-fmtutil.h>
10 DE_DECLARE_MODULE(de_module_gzip);
12 struct member_data {
13 #define GZIPFLAG_FTEXT 0x01
14 #define GZIPFLAG_FHCRC 0x02
15 #define GZIPFLAG_FEXTRA 0x04
16 #define GZIPFLAG_FNAME 0x08
17 #define GZIPFLAG_FCOMMENT 0x10
18 u8 flags;
19 u8 cmpr_code;
20 u32 crc16_reported;
21 u32 crc32_reported;
22 i64 isize;
23 struct de_timestamp mod_time_ts;
26 typedef struct lctx_struct {
27 dbuf *output_file;
28 struct de_crcobj *crco;
29 } lctx;
31 static const char *get_os_name(u8 n)
33 const char *names[14] = { "FAT", "Amiga", "VMS", "Unix",
34 "VM/CMS", "Atari", "HPFS", "Mac", "Z-System", "CP/M",
35 "TOPS-20", "NTFS", "QDOS", "RISCOS" };
36 const char *name = "?";
38 if((unsigned int)n<=13) {
39 name = names[(unsigned int)n];
41 return name;
44 static int do_gzip_read_member(deark *c, lctx *d, i64 pos1, i64 *member_size)
46 u8 b0, b1;
47 i64 pos;
48 i64 n;
49 i64 foundpos;
50 i64 string_len;
51 i64 cmpr_data_len;
52 i64 mod_time_unix;
53 u32 crc_calculated;
54 de_ucstring *member_name = NULL;
55 int saved_indent_level;
56 int ret;
57 struct member_data *md = NULL;
58 int retval = 0;
60 md = de_malloc(c, sizeof(struct member_data));
62 de_dbg_indent_save(c, &saved_indent_level);
64 de_dbg(c, "gzip member at %d", (int)pos1);
65 de_dbg_indent(c, 1);
66 pos = pos1;
68 b0 = de_getbyte(pos+0);
69 b1 = de_getbyte(pos+1);
70 if(b0!=0x1f || b1!=0x8b) {
71 de_err(c, "Invalid gzip signature at %d. This is not a valid gzip file.",
72 (int)pos1);
73 goto done;
76 md->cmpr_code = de_getbyte(pos+2);
77 if(md->cmpr_code!=0x08) {
78 de_err(c, "Unsupported compression type (%d)", (int)md->cmpr_code);
79 goto done;
82 md->flags = de_getbyte(pos+3);
83 de_dbg(c, "flags: 0x%02x", (unsigned int)md->flags);
84 pos += 4;
86 mod_time_unix = de_getu32le(pos);
87 de_unix_time_to_timestamp(mod_time_unix, &md->mod_time_ts, 0x1);
88 if(md->mod_time_ts.is_valid) {
89 char timestamp_buf[64];
90 de_timestamp_to_string(&md->mod_time_ts, timestamp_buf, sizeof(timestamp_buf), 0);
91 de_dbg(c, "mod time: %" I64_FMT " (%s)", mod_time_unix, timestamp_buf);
93 pos += 4;
95 b0 = de_getbyte(pos++);
96 de_dbg(c, "extra flags: 0x%02x", (unsigned int)b0);
98 b0 = de_getbyte(pos++);
99 de_dbg(c, "OS or filesystem: %d (%s)", (int)b0, get_os_name(b0));
101 if(md->flags & GZIPFLAG_FEXTRA) {
102 n = de_getu16le(pos); // XLEN
103 // TODO: It might be interesting to dissect these extra fields, but it's
104 // hard to find even a single file that uses them.
105 de_dbg(c, "[extra fields at %d, dpos=%d, dlen=%d]",
106 (int)pos, (int)(pos+2), (int)n);
107 pos += 2;
108 pos += n;
111 if(md->flags & GZIPFLAG_FNAME) {
112 ret = dbuf_search_byte(c->infile, 0x00, pos, c->infile->len - pos,
113 &foundpos);
114 if(!ret) {
115 de_err(c, "Invalid NAME field");
116 goto done;
119 string_len = foundpos - pos;
121 member_name = ucstring_create(c);
122 #define DE_GZIP_MAX_FNLEN 300
123 dbuf_read_to_ucstring_n(c->infile, pos, string_len, DE_GZIP_MAX_FNLEN,
124 member_name, 0, DE_ENCODING_LATIN1);
125 de_dbg(c, "file name at %d, len=%d: \"%s\"", (int)pos, (int)string_len,
126 ucstring_getpsz_d(member_name));
127 pos = foundpos + 1;
130 if(md->flags & GZIPFLAG_FCOMMENT) {
131 ret = dbuf_search_byte(c->infile, 0x00, pos, c->infile->len - pos,
132 &foundpos);
133 if(!ret) {
134 de_err(c, "Invalid COMMENT field");
135 goto done;
137 pos = foundpos + 1;
140 if(md->flags & GZIPFLAG_FHCRC) {
141 md->crc16_reported = (u32)de_getu16le(pos);
142 de_dbg(c, "crc16 (reported): 0x%04x", (unsigned int)md->crc16_reported);
143 pos += 2;
146 de_dbg(c, "compressed blocks at %d", (int)pos);
148 if(!d->output_file) {
149 // Although any member can have a name and mod time, this metadata
150 // is ignored for members after the first one.
151 de_finfo *fi = NULL;
153 fi = de_finfo_create(c);
155 if(member_name && c->filenames_from_file) {
156 de_finfo_set_name_from_ucstring(c, fi, member_name, 0);
157 fi->original_filename_flag = 1;
160 if(md->mod_time_ts.is_valid) {
161 fi->timestamp[DE_TIMESTAMPIDX_MODIFY] = md->mod_time_ts;
164 d->output_file = dbuf_create_output_file(c, member_name?NULL:"bin", fi, 0);
165 dbuf_enable_wbuffer(d->output_file);
167 de_finfo_destroy(c, fi);
170 dbuf_set_writelistener(d->output_file, de_writelistener_for_crc, (void*)d->crco);
171 de_crcobj_reset(d->crco);
173 ret = fmtutil_decompress_deflate(c->infile, pos, c->infile->len - pos, d->output_file,
174 0, &cmpr_data_len, 0);
175 dbuf_flush(d->output_file);
177 crc_calculated = de_crcobj_getval(d->crco);
178 dbuf_set_writelistener(d->output_file, NULL, NULL);
180 if(!ret) goto done;
181 pos += cmpr_data_len;
183 de_dbg(c, "crc32 (calculated): 0x%08x", (unsigned int)crc_calculated);
185 md->crc32_reported = (u32)de_getu32le(pos);
186 de_dbg(c, "crc32 (reported) : 0x%08x", (unsigned int)md->crc32_reported);
187 pos += 4;
189 if(crc_calculated != md->crc32_reported) {
190 de_warn(c, "CRC check failed: Expected 0x%08x, got 0x%08x",
191 (unsigned int)md->crc32_reported, (unsigned int)crc_calculated);
194 md->isize = de_getu32le(pos);
195 de_dbg(c, "uncompressed size (mod 2^32): %u", (unsigned int)md->isize);
196 pos += 4;
198 retval = 1;
200 done:
201 if(retval)
202 *member_size = pos - pos1;
203 else
204 *member_size = 0;
205 ucstring_destroy(member_name);
206 de_free(c, md);
207 de_dbg_indent_restore(c, saved_indent_level);
208 return retval;
211 static void de_run_gzip(deark *c, de_module_params *mparams)
213 lctx *d = NULL;
214 i64 pos;
215 i64 member_size;
217 d = de_malloc(c, sizeof(lctx));
218 d->crco = de_crcobj_create(c, DE_CRCOBJ_CRC32_IEEE);
220 pos = 0;
221 while(1) {
222 if(pos >= c->infile->len) break;
223 if(!do_gzip_read_member(c, d, pos, &member_size)) {
224 break;
226 if(member_size<=0) break;
228 pos += member_size;
230 dbuf_close(d->output_file);
232 if(d) {
233 de_crcobj_destroy(d->crco);
234 de_free(c, d);
238 static int de_identify_gzip(deark *c)
240 u8 buf[3];
242 de_read(buf, 0, 3);
243 if(buf[0]==0x1f && buf[1]==0x8b) {
244 if(buf[2]==0x08) return 100;
245 return 10;
247 return 0;
250 void de_module_gzip(deark *c, struct deark_module_info *mi)
252 mi->id = "gzip";
253 mi->desc = "gzip compressed file";
254 mi->run_fn = de_run_gzip;
255 mi->identify_fn = de_identify_gzip;