New "ea_data" module
[deark.git] / modules / riff.c
blob392f2e6b97013f42b19e781185b82bb528361812
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // Generic RIFF format
6 // Windows animated cursor format
8 #include <deark-config.h>
9 #include <deark-private.h>
10 #include <deark-fmtutil.h>
// NOTE(review): presumably declares this module's registration function
// (de_module_riff, defined at the end of the file) -- confirm against the
// DE_DECLARE_MODULE macro definition.
11 DE_DECLARE_MODULE(de_module_riff);
// Form types and LIST types. Each value is the four ASCII bytes of the
// identifier packed most-significant-byte first (e.g. 0x41434f4e is "ACON").
// Identifiers shorter than four characters are padded with a trailing
// space (0x20), e.g. CODE_AVI is "AVI " and CODE_PAL is "PAL ".
13 #define CODE_ACON 0x41434f4eU
14 #define CODE_AVI 0x41564920U
15 #define CODE_CDRX 0x43445258U
16 #define CODE_CMX1 0x434d5831U
17 #define CODE_INFO 0x494e464fU
18 #define CODE_PAL 0x50414c20U
19 #define CODE_RMID 0x524d4944U
20 #define CODE_WAVE 0x57415645U
21 #define CODE_WEBP 0x57454250U
22 #define CODE_auds 0x61756473U
23 #define CODE_bmpt 0x626d7074U
24 #define CODE_cmpr 0x636d7072U
25 #define CODE_movi 0x6d6f7669U
26 #define CODE_vids 0x76696473U
// Chunk identifiers, encoded the same way as the CODE_* values above.
// The identifier is case-sensitive: upper- and lower-case names are
// distinct codes.
28 #define CHUNK_DISP 0x44495350U
29 #define CHUNK_EXIF 0x45584946U
30 #define CHUNK_IART 0x49415254U
31 #define CHUNK_ICOP 0x49434f50U
32 #define CHUNK_ICCP 0x49434350U
33 #define CHUNK_ICMT 0x49434d54U
34 #define CHUNK_IKEY 0x494b4559U
35 #define CHUNK_ISBJ 0x4953424aU
36 #define CHUNK_JUNK 0x4a554e4bU
37 #define CHUNK_LIST 0x4c495354U
38 #define CHUNK_RIFF 0x52494646U
39 #define CHUNK_RIFX 0x52494658U
// "XMP " (with trailing space).
40 #define CHUNK_XMP 0x584d5020U
// "_PMX"; handled the same way as CHUNK_XMP by the chunk handler.
41 #define CHUNK__PMX 0x5f504d58U
42 #define CHUNK_avih 0x61766968U
// "bmp " (with trailing space).
43 #define CHUNK_bmp 0x626d7020U
44 #define CHUNK_data 0x64617461U
45 #define CHUNK_fact 0x66616374U
// "fmt " (with trailing space).
46 #define CHUNK_fmt 0x666d7420U
47 #define CHUNK_icon 0x69636f6eU
48 #define CHUNK_strf 0x73747266U
49 #define CHUNK_strh 0x73747268U
// Per-run state for the RIFF module. It is attached to the IFF parser
// context as 'userdata' and retrieved again inside each callback.
51 typedef struct localctx_struct {
// Set to 1 when the top-level contents type starts with "CDR" and no more
// specific format was recognized.
52 int is_cdr;
// Stream type fourcc taken from the most recently decoded 'strh' chunk.
// The 'strf' handler uses it to decide how to interpret its data.
53 u32 curr_avi_stream_type;
// Set to 1 for files whose top-level contents type is CMX1; enables the
// workaround for non-chunk data between chunks.
54 u8 cmx_parse_hack;
// Nonzero while the parser is inside an AVI 'movi' container.
55 u8 in_movi;
// Container nesting level at which in_movi was set; used to recognize the
// matching container end.
56 int in_movi_level;
57 } lctx;
59 static void do_extract_raw(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len, const char *ext,
60 unsigned int createflags)
62 dbuf_create_file_from_slice(ictx->f, pos, len, ext, NULL, createflags);
65 static void do_INFO_item(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len, u32 chunk_id)
67 de_ucstring *s = NULL;
69 s = ucstring_create(c);
71 // TODO: Decode the chunk_id (e.g. ICRD = Creation date).
73 // TODO: Support the CSET chunk
74 dbuf_read_to_ucstring_n(ictx->f, pos, len, DE_DBG_MAX_STRLEN, s,
75 DE_CONVFLAG_STOP_AT_NUL, DE_ENCODING_LATIN1);
76 de_dbg(c, "value: \"%s\"", ucstring_getpsz(s));
78 ucstring_destroy(s);
81 static void extract_ani_frame(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
83 u8 buf[4];
84 const char *ext;
86 de_dbg(c, "frame at %d, len=%d", (int)pos, (int)len);
88 de_read(buf, pos, 4);
90 // Try to identify the format of this frame.
91 if(!de_memcmp(buf, "\x00\x00\x01\x00", 4)) {
92 ext = "ico";
94 else if(!de_memcmp(buf, "\x00\x00\x02\x00", 4)) {
95 ext = "cur";
97 else {
98 ext = "bin";
101 dbuf_create_file_from_slice(ictx->f, pos, len, ext, NULL, 0);
// Return a short printable name for a WAVE format tag (the FormatTag field
// of a WAVEFORMATEX structure).
// The returned pointer refers to a string literal and must not be freed.
// Tags that are not in the table yield "?".
static const char *get_wav_fmt_name(unsigned int n)
{
	switch(n) {
	case 0x0001: return "PCM";
	case 0x0002: return "ADPCM";
	case 0x0003: return "IEEE_FLOAT";
	case 0x0006: return "ALAW";
	case 0x0007: return "MULAW";
	case 0x0011: return "DVI_ADPCM";
	case 0x0031: return "GSM610";
	case 0x0050: return "MPEG";
	case 0x0055: return "MPEGLAYER3";
	case 0xFFFE: return "EXTENSIBLE";
	// TODO: There are lots more formats.
	default: break;
	}
	return "?";
}
119 static void decode_WAVEFORMATEX(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos1, i64 len)
121 unsigned int formattag;
122 i64 n;
123 i64 pos = pos1;
125 if(!ictx->is_le) goto done;
126 if(len<14) goto done;
128 formattag = (unsigned int)dbuf_getu16le_p(ictx->f, &pos);
129 de_dbg(c, "FormatTag: 0x%04x (%s)", formattag, get_wav_fmt_name(formattag));
130 n = dbuf_getu16le_p(ictx->f, &pos);
131 de_dbg(c, "Channels: %u", (unsigned int)n);
132 n = dbuf_getu32le_p(ictx->f, &pos);
133 de_dbg(c, "SamplesPerSec: %u", (unsigned int)n);
134 n = dbuf_getu32le_p(ictx->f, &pos);
135 de_dbg(c, "AvgBytesPerSec: %u", (unsigned int)n);
136 n = dbuf_getu16le_p(ictx->f, &pos);
137 de_dbg(c, "BlockAlign: %u", (unsigned int)n);
138 if(len<16) goto done;
139 n = dbuf_getu16le_p(ictx->f, &pos);
140 de_dbg(c, "BitsPerSample: %u", (unsigned int)n);
141 if(len<18) goto done;
142 n = dbuf_getu16le_p(ictx->f, &pos);
143 de_dbg(c, "cbSize: %u", (unsigned int)n);
145 done:
149 static void do_wav_fmt(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
151 decode_WAVEFORMATEX(c, d, ictx, pos, len);
154 static void do_wav_fact(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
156 i64 n;
158 if(!ictx->is_le) return;
159 if(len<4) return;
160 n = de_getu32le(pos);
161 de_dbg(c, "number of samples: %u", (unsigned int)n);
164 static void do_avi_avih(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
166 i64 n, n2;
168 if(len<40) return;
169 n = de_getu32le(pos);
170 de_dbg(c, "microseconds/frame: %u", (unsigned int)n);
171 n = de_getu32le(pos+12);
172 de_dbg(c, "flags: 0x%08x", (unsigned int)n);
173 n = de_getu32le(pos+16);
174 de_dbg(c, "number of frames: %u", (unsigned int)n);
175 n = de_getu32le(pos+24);
176 de_dbg(c, "number of streams: %u", (unsigned int)n);
177 n = de_getu32le(pos+32);
178 n2 = de_getu32le(pos+36);
179 de_dbg_dimensions(c, n, n2);
180 // TODO: There are more fields in this chunk.
183 static void do_avi_strh(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
185 struct de_fourcc type4cc;
186 struct de_fourcc codec4cc;
188 if(len<8) return;
190 dbuf_read_fourcc(ictx->f, pos, &type4cc, 4, 0x0);
191 de_dbg(c, "stream type: '%s'", type4cc.id_dbgstr);
192 // Hack. TODO: Need a better way to track state.
193 d->curr_avi_stream_type = type4cc.id;
195 dbuf_read_fourcc(ictx->f, pos+4, &codec4cc, 4, 0x0);
196 de_dbg(c, "codec: '%s'", codec4cc.id_dbgstr);
198 // TODO: There are more fields here.
201 static void do_avi_strf(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
203 if(d->curr_avi_stream_type==CODE_vids) {
204 struct de_bmpinfo bi;
205 // For video streams, this is a BITMAPINFO.
206 fmtutil_get_bmpinfo(c, ictx->f, &bi, pos, len, DE_BMPINFO_CMPR_IS_4CC);
207 // This chunk contains just a bitmap header, so we can't extract a bitmap.
209 else if(d->curr_avi_stream_type==CODE_auds) {
210 // For audio streams, this is a WAVEFORMATEX.
211 decode_WAVEFORMATEX(c, d, ictx, pos, len);
215 static void do_cdr_bmp(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
217 if(len<20) return;
218 // The first 2 bytes are an index, or something. BMP starts at offset 2.
219 dbuf_create_file_from_slice(ictx->f, pos+2, len-2, "bmp", NULL, 0);
222 static void do_palette(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
224 i64 ver;
225 i64 n;
226 i64 i;
227 u8 r,g,b,flags;
228 u32 clr;
229 char tmps[32];
231 if(!ictx->is_le) return;
232 ver = de_getu16le(pos);
233 de_dbg(c, "version: 0x%04x", (unsigned int)ver);
234 pos += 2;
235 n = de_getu16le(pos);
236 de_dbg(c, "number of entries: %d", (int)n);
237 pos += 2;
238 if(n>(len-4)/4) n=(len-4)/4;
239 if(n>1024) n=1024;
240 if(n<1) return;
242 de_dbg(c, "palette entries at %d", (int)pos);
243 de_dbg_indent(c, 1);
244 for(i=0; i<n; i++) {
245 r = de_getbyte(pos);
246 g = de_getbyte(pos+1);
247 b = de_getbyte(pos+2);
248 flags = de_getbyte(pos+3);
249 pos += 4;
250 clr = DE_MAKE_RGB(r, g, b);
251 de_snprintf(tmps, sizeof(tmps), " flags=0x%02x", (unsigned int)flags);
252 de_dbg_pal_entry2(c, i, clr, NULL, NULL, tmps);
254 de_dbg_indent(c, -1);
257 static void do_DISP_DIB(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
259 if(len<12) return;
260 // "X" = Tell the dib module to mark the output file as "auxiliary".
261 de_run_module_by_id_on_slice2(c, "dib", "X", ictx->f, pos, len);
264 static void do_DISP_TEXT(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len1)
266 i64 foundpos;
267 i64 len = len1;
269 // Stop at NUL
270 if(dbuf_search_byte(ictx->f, 0x00, pos, len1, &foundpos)) {
271 len = foundpos - pos;
273 de_dbg(c, "text length: %d", (int)len);
274 if(len<1) return;
276 do_extract_raw(c, d, ictx, pos, len, "disp.txt", DE_CREATEFLAG_IS_AUX);
279 static void do_ICCP(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
281 dbuf_create_file_from_slice(ictx->f, pos, len, "icc", NULL, DE_CREATEFLAG_IS_AUX);
284 static void do_EXIF(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
286 fmtutil_handle_exif(c, pos, len);
289 static void do_XMP(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
291 dbuf_create_file_from_slice(ictx->f, pos, len, "xmp", NULL, DE_CREATEFLAG_IS_AUX);
294 static void do_DISP(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
296 unsigned int ty;
297 i64 dpos, dlen;
299 if(!ictx->is_le) return;
300 if(len<4) return;
301 ty = (unsigned int)de_getu32le(pos);
302 de_dbg(c, "data type: %u (%s)", ty,
303 fmtutil_get_windows_cb_data_type_name(ty));
305 dpos = pos+4;
306 dlen = len-4;
307 switch(ty) {
308 case 1:
309 case 7:
310 do_DISP_TEXT(c, d, ictx, dpos, dlen);
311 break;
312 case 8:
313 case 17:
314 do_DISP_DIB(c, d, ictx, dpos, dlen);
315 break;
319 static int is_fourcc_at(deark *c, struct de_iffctx *ictx, i64 pos)
321 u8 b[4];
322 size_t i;
324 dbuf_read(ictx->f, b, pos, 4);
325 for(i=0; i<4; i++) {
326 if(b[i]<32 || b[i]>126) return 0;
328 return 1;
331 static int do_cmx_parse_hack(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 *plen)
333 i64 n, n_padded;
335 // Some CMX chunks seem to be followed by a non-RIFF segment starting with either
336 // 04 00 (4 bytes) or 10 00 (16 bytes). I'm just guessing how to parse them.
337 n = dbuf_getu16le(ictx->f, pos);
338 if(n>256 || n==0) return 0;
340 n_padded = de_pad_to_2(n);
341 if(is_fourcc_at(c, ictx, pos + n_padded)) {
342 de_dbg(c, "[%d non-RIFF bytes at %"I64_FMT"]", (int)n_padded, pos);
343 *plen = n_padded;
344 return 1;
346 return 0;
349 static int my_handle_nonchunk_riff_data_fn(deark *c, struct de_iffctx *ictx,
350 i64 pos, i64 *plen)
352 lctx *d = (lctx*)ictx->userdata;
354 if(d->cmx_parse_hack) {
355 return do_cmx_parse_hack(c, d, ictx, pos, plen);
357 return 0;
360 static int my_on_std_container_start_fn(deark *c, struct de_iffctx *ictx)
362 lctx *d = (lctx*)ictx->userdata;
364 if(ictx->level==0) {
365 const char *fmtname = NULL;
367 switch(ictx->main_contentstype4cc.id) {
368 case CODE_ACON: fmtname = "Windows animated cursor"; break;
369 case CODE_AVI: fmtname = "AVI"; break;
370 case CODE_CDRX: fmtname = "Corel CCX"; break;
371 case CODE_CMX1:
372 fmtname = "Corel CMX";
373 ictx->handle_nonchunk_data_fn = my_handle_nonchunk_riff_data_fn;
374 d->cmx_parse_hack = 1;
375 break;
376 case CODE_WAVE: fmtname = "WAVE"; break;
377 case CODE_WEBP: fmtname = "WebP"; break;
380 // Special check for CorelDraw formats.
381 if(!fmtname && !de_memcmp(ictx->main_contentstype4cc.bytes, (const void*)"CDR", 3)) {
382 d->is_cdr = 1;
383 fmtname = "CorelDRAW (RIFF-based)";
386 if(fmtname) {
387 de_declare_fmt(c, fmtname);
391 if(d->is_cdr && ictx->curr_container_fmt4cc.id==CHUNK_LIST) {
392 // 'cmpr' LISTs in CorelDraw files are not correctly formed.
393 // Tell the parser not to process them.
394 if(ictx->curr_container_contentstype4cc.id==CODE_cmpr) {
395 de_dbg(c, "[not decoding CDR cmpr list]");
396 return 0;
400 if(ictx->main_contentstype4cc.id==CODE_AVI &&
401 ictx->curr_container_contentstype4cc.id==CODE_movi &&
402 c->debug_level<2)
404 // There are often a huge number of these chunks, and we can't do
405 // anything interesting with them, so skip them by default.
406 de_dbg(c, "[not decoding movi chunks]");
407 return 0;
410 if(ictx->main_contentstype4cc.id==CODE_AVI &&
411 ictx->curr_container_contentstype4cc.id==CODE_movi && !d->in_movi)
413 // Keep track of when we are inside a 'movi' container.
414 d->in_movi = 1;
415 d->in_movi_level = ictx->level;
418 return 1;
421 static int my_on_container_end_fn(deark *c, struct de_iffctx *ictx)
423 lctx *d = (lctx*)ictx->userdata;
425 if(ictx->curr_container_contentstype4cc.id==CODE_movi &&
426 d->in_movi && ictx->level==d->in_movi_level)
428 d->in_movi = 0;
431 return 1;
434 static int my_preprocess_riff_chunk_fn(deark *c, struct de_iffctx *ictx)
436 const char *name = NULL;
438 // TODO: Need a better way to do this.
439 switch(ictx->chunkctx->chunk4cc.id) {
440 case CHUNK_DISP: name="display"; break;
441 case CHUNK_IART: name="artist"; break;
442 case CHUNK_ICOP: name="copyright"; break;
443 case CHUNK_ICMT: name="comments"; break;
444 case CHUNK_IKEY: name="keywords"; break;
445 case CHUNK_ISBJ: name="subject"; break;
446 case CHUNK_JUNK: name="filler"; break;
447 case CHUNK_LIST: name="subchunk container"; break;
450 if(name) {
451 ictx->chunkctx->chunk_name = name;
453 return 1;
456 static int my_riff_chunk_handler(deark *c, struct de_iffctx *ictx)
458 i64 dpos, dlen;
459 u32 list_type;
460 lctx *d = (lctx*)ictx->userdata;
462 // We should always set this flag for formats (like RIFF) that aren't standard IFF.
463 ictx->handled = 1;
465 list_type = ictx->curr_container_contentstype4cc.id;
466 dpos = ictx->chunkctx->dpos;
467 dlen = ictx->chunkctx->dlen;
469 switch(ictx->chunkctx->chunk4cc.id) {
470 case CHUNK_RIFF:
471 case CHUNK_RIFX:
472 case CHUNK_LIST:
473 ictx->is_std_container = 1;
474 return 1;
477 if(list_type==CODE_INFO) {
478 do_INFO_item(c, d, ictx, dpos, dlen, ictx->chunkctx->chunk4cc.id);
479 goto chunk_handled;
482 switch(ictx->chunkctx->chunk4cc.id) {
484 case CHUNK_DISP:
485 do_DISP(c, d, ictx, dpos, dlen);
486 break;
488 case CHUNK_JUNK:
489 break;
491 case CHUNK_ICCP: // Used by WebP
492 do_ICCP(c, d, ictx, dpos, dlen);
493 break;
495 case CHUNK_EXIF: // Used by WebP
496 do_EXIF(c, d, ictx, dpos, dlen);
497 break;
499 case CHUNK_XMP: // Used by WebP
500 case CHUNK__PMX: // Used by WAVE, AVI
501 do_XMP(c, d, ictx, dpos, dlen);
502 break;
504 case CHUNK_icon:
505 if(ictx->main_contentstype4cc.id==CODE_ACON) {
506 extract_ani_frame(c, d, ictx, dpos, dlen);
508 break;
510 case CHUNK_data:
511 if(list_type==CODE_RMID) {
512 do_extract_raw(c, d, ictx, dpos, dlen, "mid", 0);
514 else if(list_type==CODE_PAL) {
515 do_palette(c, d, ictx, dpos, dlen);
517 break;
519 case CHUNK_fmt:
520 if(ictx->main_contentstype4cc.id==CODE_WAVE) {
521 do_wav_fmt(c, d, ictx, dpos, dlen);
523 break;
525 case CHUNK_fact:
526 if(ictx->main_contentstype4cc.id==CODE_WAVE) {
527 do_wav_fact(c, d, ictx, dpos, dlen);
529 break;
531 case CHUNK_avih:
532 if(ictx->main_contentstype4cc.id==CODE_AVI) {
533 do_avi_avih(c, d, ictx, dpos, dlen);
535 break;
537 case CHUNK_strh:
538 if(ictx->main_contentstype4cc.id==CODE_AVI) {
539 do_avi_strh(c, d, ictx, dpos, dlen);
541 break;
543 case CHUNK_strf:
544 if(ictx->main_contentstype4cc.id==CODE_AVI) {
545 do_avi_strf(c, d, ictx, dpos, dlen);
547 break;
549 case CHUNK_bmp:
550 if(d->is_cdr && ictx->curr_container_contentstype4cc.id==CODE_bmpt) {
551 do_cdr_bmp(c, d, ictx, dpos, dlen);
553 break;
555 default:
556 if(c->debug_level>=2 &&
557 ictx->main_contentstype4cc.id==CODE_AVI && !d->in_movi)
559 de_dbg_hexdump(c, ictx->f, dpos, dlen, 256, NULL, 0x1);
563 chunk_handled:
564 return 1;
567 static void de_run_riff(deark *c, de_module_params *mparams)
569 lctx *d = NULL;
570 struct de_iffctx *ictx = NULL;
571 u8 buf[4];
573 d = de_malloc(c, sizeof(lctx));
574 ictx = de_malloc(c, sizeof(struct de_iffctx));
576 ictx->userdata = (void*)d;
577 ictx->preprocess_chunk_fn = my_preprocess_riff_chunk_fn;
578 ictx->handle_chunk_fn = my_riff_chunk_handler;
579 ictx->on_std_container_start_fn = my_on_std_container_start_fn;
580 ictx->on_container_end_fn = my_on_container_end_fn;
581 ictx->f = c->infile;
583 de_read(buf, 0, 4);
585 if(!de_memcmp(buf, "RIFF", 4)) {
586 ictx->is_le = 1;
587 ictx->reversed_4cc = 0;
589 else if(!de_memcmp(buf, "RIFX", 4)) {
590 ictx->is_le = 0;
591 ictx->reversed_4cc = 0;
593 else if(!de_memcmp(buf, "XFIR", 4)) {
594 ictx->is_le = 1;
595 ictx->reversed_4cc = 1;
597 else {
598 de_warn(c, "This is probably not a RIFF file.");
599 ictx->is_le = 1;
600 ictx->reversed_4cc = 0;
603 fmtutil_read_iff_format(c, ictx, 0, ictx->f->len);
605 de_free(c, ictx);
606 de_free(c, d);
609 static int de_identify_riff(deark *c)
611 u8 buf[4];
612 int has_sig;
613 i64 dlen;
615 de_read(buf, 0, 4);
616 has_sig = (!de_memcmp(buf, "RIFF", 4)) ||
617 (!de_memcmp(buf, "XFIR", 4)) ||
618 (!de_memcmp(buf, "RIFX", 4));
619 if(!has_sig) return 0;
621 dlen = de_getu32le(4);
622 // This check screens out .AMV format, for example.
623 if(dlen==0 && c->infile->len!=8) return 0;
625 return 50;
628 void de_module_riff(deark *c, struct deark_module_info *mi)
630 mi->id = "riff";
631 mi->desc = "RIFF-based formats";
632 mi->run_fn = de_run_riff;
633 mi->identify_fn = de_identify_riff;