Minor refactoring of the IFF and box-format parsers
[deark.git] / modules / riff.c
blob77ea94d6c6f669c44e173412ba21ab882097fce5
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // Generic RIFF format
6 // Windows animated cursor format
8 #include <deark-config.h>
9 #include <deark-private.h>
10 #include <deark-fmtutil.h>
11 DE_DECLARE_MODULE(de_module_riff);
// Four-character codes compared against container content types (RIFF form
// types and LIST types) and, for 'vids'/'auds', against the AVI stream type.
// Each value is the four ASCII characters packed with the first character in
// the most significant byte (e.g. 0x41434f4e is "ACON"). Codes shorter than
// four characters are padded with a trailing space (0x20), e.g. "AVI ", "PAL ".
13 #define CODE_ACON 0x41434f4eU
14 #define CODE_AVI 0x41564920U
15 #define CODE_CDRX 0x43445258U
16 #define CODE_CMX1 0x434d5831U
17 #define CODE_INFO 0x494e464fU
18 #define CODE_PAL 0x50414c20U
19 #define CODE_RMID 0x524d4944U
20 #define CODE_WAVE 0x57415645U
21 #define CODE_WEBP 0x57454250U
22 #define CODE_auds 0x61756473U
23 #define CODE_bmpt 0x626d7074U
24 #define CODE_cmov 0x636d6f76U
25 #define CODE_cmpr 0x636d7072U
26 #define CODE_movi 0x6d6f7669U
27 #define CODE_vids 0x76696473U
// Four-character chunk identifiers, compared against a chunk's 4cc id.
// Same packing as the CODE_* values: first character in the most significant
// byte, trailing space padding for three-letter ids ("XMP ", "bmp ", "fmt ").
// CHUNK__PMX is the literal id "_PMX".
29 #define CHUNK_DISP 0x44495350U
30 #define CHUNK_EXIF 0x45584946U
31 #define CHUNK_IART 0x49415254U
32 #define CHUNK_ICOP 0x49434f50U
33 #define CHUNK_ICCP 0x49434350U
34 #define CHUNK_ICMT 0x49434d54U
35 #define CHUNK_IKEY 0x494b4559U
36 #define CHUNK_ISBJ 0x4953424aU
37 #define CHUNK_JUNK 0x4a554e4bU
38 #define CHUNK_LIST 0x4c495354U
39 #define CHUNK_RIFF 0x52494646U
40 #define CHUNK_RIFX 0x52494658U
41 #define CHUNK_XMP 0x584d5020U
42 #define CHUNK__PMX 0x5f504d58U
43 #define CHUNK_avih 0x61766968U
44 #define CHUNK_bmp 0x626d7020U
45 #define CHUNK_data 0x64617461U
46 #define CHUNK_fact 0x66616374U
47 #define CHUNK_fmt 0x666d7420U
48 #define CHUNK_icon 0x69636f6eU
49 #define CHUNK_strf 0x73747266U
50 #define CHUNK_strh 0x73747268U
// Module-private state. One instance is allocated per run and attached to the
// parser context through ictx->userdata.
52 typedef struct localctx_struct {
53 UI top_level_chunk_count; // Number of standard containers started at level 0 so far
54 int is_cdr; // Set when the first top-level form type starts with "CDR"
55 u32 curr_avi_stream_type; // Stream type 4cc from the most recent AVI 'strh' chunk
56 u8 cmx_parse_hack; // Set for form type CMX1: enables the CMX non-chunk-data handler
57 u8 cmv_parse_hack; // Set for form type cmov: enables the CMV non-chunk-data handler
58 u8 in_movi; // Nonzero while inside an AVI 'movi' LIST that is being decoded
59 int in_movi_level; // ictx->level recorded when in_movi was set
60 } lctx;
62 static void do_extract_raw(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len, const char *ext,
63 unsigned int createflags)
65 dbuf_create_file_from_slice(ictx->f, pos, len, ext, NULL, createflags);
68 static void do_INFO_item(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len, u32 chunk_id)
70 de_ucstring *s = NULL;
72 s = ucstring_create(c);
74 // TODO: Decode the chunk_id (e.g. ICRD = Creation date).
76 // TODO: Support the CSET chunk
77 dbuf_read_to_ucstring_n(ictx->f, pos, len, DE_DBG_MAX_STRLEN, s,
78 DE_CONVFLAG_STOP_AT_NUL, DE_ENCODING_LATIN1);
79 de_dbg(c, "value: \"%s\"", ucstring_getpsz(s));
81 ucstring_destroy(s);
84 static void extract_ani_frame(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
86 u8 buf[4];
87 const char *ext;
89 de_dbg(c, "frame at %d, len=%d", (int)pos, (int)len);
91 de_read(buf, pos, 4);
93 // Try to identify the format of this frame.
94 if(!de_memcmp(buf, "\x00\x00\x01\x00", 4)) {
95 ext = "ico";
97 else if(!de_memcmp(buf, "\x00\x00\x02\x00", 4)) {
98 ext = "cur";
100 else {
101 ext = "bin";
104 dbuf_create_file_from_slice(ictx->f, pos, len, ext, NULL, 0);
// Returns a short printable name for a WAVE format tag (the FormatTag field
// of a WAVEFORMATEX header), or "?" if the tag is not in the table.
// The returned pointer refers to a string literal and is never NULL.
static const char *get_wav_fmt_name(unsigned int n)
{
	const char *name = NULL;

	switch(n) {
	case 0x0001: name="PCM"; break;
	case 0x0002: name="ADPCM"; break;
	case 0x0003: name="IEEE_FLOAT"; break;
	case 0x0006: name="ALAW"; break;
	case 0x0007: name="MULAW"; break;
	case 0x0011: name="DVI_ADPCM"; break;
	case 0x0031: name="GSM610"; break;
	case 0x0050: name="MPEG"; break;
	case 0x0055: name="MPEGLAYER3"; break;
	case 0xFFFE: name="EXTENSIBLE"; break;
		// TODO: There are lots more formats.
	default: break;
	}

	return name?name:"?";
}
122 static void decode_WAVEFORMATEX(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos1, i64 len)
124 unsigned int formattag;
125 i64 n;
126 i64 pos = pos1;
128 if(!ictx->is_le) goto done;
129 if(len<14) goto done;
131 formattag = (unsigned int)dbuf_getu16le_p(ictx->f, &pos);
132 de_dbg(c, "FormatTag: 0x%04x (%s)", formattag, get_wav_fmt_name(formattag));
133 n = dbuf_getu16le_p(ictx->f, &pos);
134 de_dbg(c, "Channels: %u", (unsigned int)n);
135 n = dbuf_getu32le_p(ictx->f, &pos);
136 de_dbg(c, "SamplesPerSec: %u", (unsigned int)n);
137 n = dbuf_getu32le_p(ictx->f, &pos);
138 de_dbg(c, "AvgBytesPerSec: %u", (unsigned int)n);
139 n = dbuf_getu16le_p(ictx->f, &pos);
140 de_dbg(c, "BlockAlign: %u", (unsigned int)n);
141 if(len<16) goto done;
142 n = dbuf_getu16le_p(ictx->f, &pos);
143 de_dbg(c, "BitsPerSample: %u", (unsigned int)n);
144 if(len<18) goto done;
145 n = dbuf_getu16le_p(ictx->f, &pos);
146 de_dbg(c, "cbSize: %u", (unsigned int)n);
148 done:
152 static void do_wav_fmt(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
154 decode_WAVEFORMATEX(c, d, ictx, pos, len);
157 static void do_wav_fact(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
159 i64 n;
161 if(!ictx->is_le) return;
162 if(len<4) return;
163 n = de_getu32le(pos);
164 de_dbg(c, "number of samples: %u", (unsigned int)n);
167 static void do_avi_avih(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
169 i64 n, n2;
171 if(len<40) return;
172 n = de_getu32le(pos);
173 de_dbg(c, "microseconds/frame: %u", (unsigned int)n);
174 n = de_getu32le(pos+12);
175 de_dbg(c, "flags: 0x%08x", (unsigned int)n);
176 n = de_getu32le(pos+16);
177 de_dbg(c, "number of frames: %u", (unsigned int)n);
178 n = de_getu32le(pos+24);
179 de_dbg(c, "number of streams: %u", (unsigned int)n);
180 n = de_getu32le(pos+32);
181 n2 = de_getu32le(pos+36);
182 de_dbg_dimensions(c, n, n2);
183 // TODO: There are more fields in this chunk.
186 static void do_avi_strh(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
188 struct de_fourcc type4cc;
189 struct de_fourcc codec4cc;
191 if(len<8) return;
193 dbuf_read_fourcc(ictx->f, pos, &type4cc, 4, 0x0);
194 de_dbg(c, "stream type: '%s'", type4cc.id_dbgstr);
195 // Hack. TODO: Need a better way to track state.
196 d->curr_avi_stream_type = type4cc.id;
198 dbuf_read_fourcc(ictx->f, pos+4, &codec4cc, 4, 0x0);
199 de_dbg(c, "codec: '%s'", codec4cc.id_dbgstr);
201 // TODO: There are more fields here.
204 static void do_avi_strf(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
206 if(d->curr_avi_stream_type==CODE_vids) {
207 struct de_bmpinfo bi;
208 // For video streams, this is a BITMAPINFO.
209 fmtutil_get_bmpinfo(c, ictx->f, &bi, pos, len, DE_BMPINFO_CMPR_IS_4CC);
210 // This chunk contains just a bitmap header, so we can't extract a bitmap.
212 else if(d->curr_avi_stream_type==CODE_auds) {
213 // For audio streams, this is a WAVEFORMATEX.
214 decode_WAVEFORMATEX(c, d, ictx, pos, len);
218 static void do_cdr_bmp(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
220 if(len<20) return;
221 // The first 2 bytes are an index, or something. BMP starts at offset 2.
222 dbuf_create_file_from_slice(ictx->f, pos+2, len-2, "bmp", NULL, 0);
225 static void do_palette(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
227 i64 ver;
228 i64 n;
229 i64 i;
230 u8 r,g,b,flags;
231 u32 clr;
232 char tmps[32];
234 if(!ictx->is_le) return;
235 ver = de_getu16le(pos);
236 de_dbg(c, "version: 0x%04x", (unsigned int)ver);
237 pos += 2;
238 n = de_getu16le(pos);
239 de_dbg(c, "number of entries: %d", (int)n);
240 pos += 2;
241 if(n>(len-4)/4) n=(len-4)/4;
242 if(n>1024) n=1024;
243 if(n<1) return;
245 de_dbg(c, "palette entries at %d", (int)pos);
246 de_dbg_indent(c, 1);
247 for(i=0; i<n; i++) {
248 r = de_getbyte(pos);
249 g = de_getbyte(pos+1);
250 b = de_getbyte(pos+2);
251 flags = de_getbyte(pos+3);
252 pos += 4;
253 clr = DE_MAKE_RGB(r, g, b);
254 de_snprintf(tmps, sizeof(tmps), " flags=0x%02x", (unsigned int)flags);
255 de_dbg_pal_entry2(c, i, clr, NULL, NULL, tmps);
257 de_dbg_indent(c, -1);
260 static void do_DISP_DIB(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
262 de_module_params *mparams = NULL;
264 if(len<12) return;
266 mparams = de_malloc(c, sizeof(de_module_params));
267 mparams->in_params.codes = "X"; // "auxiliary"
268 mparams->in_params.flags = 0x80; // ".preview.bmp"
269 de_run_module_by_id_on_slice(c, "dib", mparams, ictx->f, pos, len);
270 de_free(c, mparams);
273 static void do_DISP_TEXT(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len1)
275 i64 foundpos;
276 i64 len = len1;
278 // Stop at NUL
279 if(dbuf_search_byte(ictx->f, 0x00, pos, len1, &foundpos)) {
280 len = foundpos - pos;
282 de_dbg(c, "text length: %d", (int)len);
283 if(len<1) return;
285 do_extract_raw(c, d, ictx, pos, len, "disp.txt", DE_CREATEFLAG_IS_AUX);
288 static void do_ICCP(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
290 dbuf_create_file_from_slice(ictx->f, pos, len, "icc", NULL, DE_CREATEFLAG_IS_AUX);
293 static void do_EXIF(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
295 fmtutil_handle_exif(c, pos, len);
298 static void do_XMP(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
300 dbuf_create_file_from_slice(ictx->f, pos, len, "xmp", NULL, DE_CREATEFLAG_IS_AUX);
303 static void do_DISP(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 len)
305 unsigned int ty;
306 i64 dpos, dlen;
308 if(!ictx->is_le) return;
309 if(len<4) return;
310 ty = (unsigned int)de_getu32le(pos);
311 de_dbg(c, "data type: %u (%s)", ty,
312 fmtutil_get_windows_cb_data_type_name(ty));
314 dpos = pos+4;
315 dlen = len-4;
316 switch(ty) {
317 case 1:
318 case 7:
319 do_DISP_TEXT(c, d, ictx, dpos, dlen);
320 break;
321 case 8:
322 case 17:
323 do_DISP_DIB(c, d, ictx, dpos, dlen);
324 break;
328 static int is_fourcc_at(deark *c, struct de_iffctx *ictx, i64 pos)
330 u8 b[4];
331 size_t i;
333 dbuf_read(ictx->f, b, pos, 4);
334 for(i=0; i<4; i++) {
335 if(b[i]<32 || b[i]>126) return 0;
337 return 1;
340 static int do_cmx_parse_hack(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 *plen)
342 i64 n, n_padded;
344 // Some CMX chunks seem to be followed by a non-RIFF segment starting with either
345 // 04 00 (4 bytes) or 10 00 (16 bytes). I'm just guessing how to parse them.
346 n = dbuf_getu16le(ictx->f, pos);
347 if(n>256 || n==0) return 0;
349 n_padded = de_pad_to_2(n);
350 if(is_fourcc_at(c, ictx, pos + n_padded)) {
351 de_dbg(c, "[%d non-RIFF bytes at %"I64_FMT"]", (int)n_padded, pos);
352 *plen = n_padded;
353 return 1;
355 return 0;
358 // CMV files seem to consist of two RIFF chunks, separated by four 0x00 bytes.
359 // (Maybe some sort of scan-for-the-next-RIFF-chunk logic should happen by
360 // default, but it's hard to be sure we won't break something.)
361 static int do_cmv_parse_hack(deark *c, lctx *d, struct de_iffctx *ictx, i64 pos, i64 *plen)
363 if(ictx->level!=0) return 0;
364 if(dbuf_getu32be(ictx->f, pos)!=0) return 0;
365 if(dbuf_getu32be(ictx->f, pos+4)!=CHUNK_RIFF) return 0;
366 de_dbg(c, "[%d non-RIFF bytes at %"I64_FMT"]", 4, pos);
367 *plen = 4;
368 return 1;
371 static int my_handle_nonchunk_riff_data_fn(deark *c, struct de_iffctx *ictx,
372 i64 pos, i64 *plen)
374 lctx *d = (lctx*)ictx->userdata;
376 if(d->cmx_parse_hack) {
377 return do_cmx_parse_hack(c, d, ictx, pos, plen);
379 else if(d->cmv_parse_hack) {
380 return do_cmv_parse_hack(c, d, ictx, pos, plen);
382 return 0;
385 static int my_on_std_container_start_fn(deark *c, struct de_iffctx *ictx)
387 lctx *d = (lctx*)ictx->userdata;
388 u32 chunktype = ictx->curr_container_fmt4cc.id;
389 u32 formtype = ictx->curr_container_contentstype4cc.id;
390 int suppress_decoding = 0;
392 if(ictx->level==0 && (chunktype==CHUNK_RIFF || chunktype==CHUNK_RIFX) &&
393 d->top_level_chunk_count==0)
395 const char *fmtname = NULL;
397 switch(formtype) {
398 case CODE_ACON: fmtname = "Windows animated cursor"; break;
399 case CODE_AVI: fmtname = "AVI"; break;
400 case CODE_CDRX: fmtname = "Corel CCX"; break;
401 case CODE_CMX1:
402 fmtname = "Corel CMX";
403 ictx->handle_nonchunk_data_fn = my_handle_nonchunk_riff_data_fn;
404 d->cmx_parse_hack = 1;
405 break;
406 case CODE_cmov:
407 fmtname = "CorelMOVE";
408 ictx->handle_nonchunk_data_fn = my_handle_nonchunk_riff_data_fn;
409 d->cmv_parse_hack = 1;
410 break;
411 case CODE_WAVE: fmtname = "WAVE"; break;
412 case CODE_WEBP: fmtname = "WebP"; break;
415 // Special check for CorelDraw formats.
416 if(!fmtname && (formtype>>8 == 0x434452U) /* "CDR" */) {
417 d->is_cdr = 1;
418 fmtname = "CorelDRAW (RIFF-based)";
421 if(fmtname) {
422 de_declare_fmt(c, fmtname);
426 if(d->is_cdr && chunktype==CHUNK_LIST) {
427 // 'cmpr' LISTs in CorelDraw files are not correctly formed.
428 // Tell the parser not to process them.
429 if(formtype==CODE_cmpr) {
430 de_dbg(c, "[not decoding CDR cmpr list]");
431 suppress_decoding = 1;
432 goto done;
436 if(ictx->main_contentstype4cc.id==CODE_AVI && chunktype==CHUNK_LIST &&
437 formtype==CODE_movi)
439 // There are often a huge number of these chunks, and we can't do
440 // anything interesting with them, so skip them by default.
441 if(c->debug_level<2) {
442 de_dbg(c, "[not decoding movi chunks]");
443 suppress_decoding = 1;
444 goto done;
447 if(!d->in_movi) {
448 // Keep track of when we are inside a 'movi' container.
449 d->in_movi = 1;
450 d->in_movi_level = ictx->level;
454 done:
455 if(ictx->level==0) {
456 d->top_level_chunk_count++;
458 return !suppress_decoding;
461 static int my_on_container_end_fn(deark *c, struct de_iffctx *ictx)
463 lctx *d = (lctx*)ictx->userdata;
465 if(ictx->curr_container_contentstype4cc.id==CODE_movi &&
466 d->in_movi && ictx->level==d->in_movi_level)
468 d->in_movi = 0;
471 return 1;
474 static int my_preprocess_riff_chunk_fn(deark *c, struct de_iffctx *ictx)
476 const char *name = NULL;
478 // TODO: Need a better way to do this.
479 switch(ictx->chunkctx->chunk4cc.id) {
480 case CHUNK_DISP: name="display"; break;
481 case CHUNK_IART: name="artist"; break;
482 case CHUNK_ICOP: name="copyright"; break;
483 case CHUNK_ICMT: name="comments"; break;
484 case CHUNK_IKEY: name="keywords"; break;
485 case CHUNK_ISBJ: name="subject"; break;
486 case CHUNK_JUNK: name="filler"; break;
487 case CHUNK_LIST: name="subchunk container"; break;
490 if(name) {
491 ictx->chunkctx->chunk_name = name;
493 return 1;
496 static int my_riff_chunk_handler(deark *c, struct de_iffctx *ictx)
498 i64 dpos, dlen;
499 u32 list_type;
500 lctx *d = (lctx*)ictx->userdata;
502 // We should always set this flag for formats (like RIFF) that aren't standard IFF.
503 ictx->handled = 1;
505 list_type = ictx->curr_container_contentstype4cc.id;
506 dpos = ictx->chunkctx->dpos;
507 dlen = ictx->chunkctx->dlen;
509 switch(ictx->chunkctx->chunk4cc.id) {
510 case CHUNK_RIFF:
511 case CHUNK_RIFX:
512 case CHUNK_LIST:
513 ictx->is_std_container = 1;
514 return 1;
517 if(list_type==CODE_INFO) {
518 do_INFO_item(c, d, ictx, dpos, dlen, ictx->chunkctx->chunk4cc.id);
519 goto chunk_handled;
522 switch(ictx->chunkctx->chunk4cc.id) {
524 case CHUNK_DISP:
525 do_DISP(c, d, ictx, dpos, dlen);
526 break;
528 case CHUNK_JUNK:
529 break;
531 case CHUNK_ICCP: // Used by WebP
532 do_ICCP(c, d, ictx, dpos, dlen);
533 break;
535 case CHUNK_EXIF: // Used by WebP
536 do_EXIF(c, d, ictx, dpos, dlen);
537 break;
539 case CHUNK_XMP: // Used by WebP
540 case CHUNK__PMX: // Used by WAVE, AVI
541 do_XMP(c, d, ictx, dpos, dlen);
542 break;
544 case CHUNK_icon:
545 if(ictx->main_contentstype4cc.id==CODE_ACON) {
546 extract_ani_frame(c, d, ictx, dpos, dlen);
548 break;
550 case CHUNK_data:
551 if(list_type==CODE_RMID) {
552 do_extract_raw(c, d, ictx, dpos, dlen, "mid", 0);
554 else if(list_type==CODE_PAL) {
555 do_palette(c, d, ictx, dpos, dlen);
557 break;
559 case CHUNK_fmt:
560 if(ictx->main_contentstype4cc.id==CODE_WAVE) {
561 do_wav_fmt(c, d, ictx, dpos, dlen);
563 break;
565 case CHUNK_fact:
566 if(ictx->main_contentstype4cc.id==CODE_WAVE) {
567 do_wav_fact(c, d, ictx, dpos, dlen);
569 break;
571 case CHUNK_avih:
572 if(ictx->main_contentstype4cc.id==CODE_AVI) {
573 do_avi_avih(c, d, ictx, dpos, dlen);
575 break;
577 case CHUNK_strh:
578 if(ictx->main_contentstype4cc.id==CODE_AVI) {
579 do_avi_strh(c, d, ictx, dpos, dlen);
581 break;
583 case CHUNK_strf:
584 if(ictx->main_contentstype4cc.id==CODE_AVI) {
585 do_avi_strf(c, d, ictx, dpos, dlen);
587 break;
589 case CHUNK_bmp:
590 if(d->is_cdr && ictx->curr_container_contentstype4cc.id==CODE_bmpt) {
591 do_cdr_bmp(c, d, ictx, dpos, dlen);
593 break;
595 default:
596 if(c->debug_level>=2 &&
597 ictx->main_contentstype4cc.id==CODE_AVI && !d->in_movi)
599 de_dbg_hexdump(c, ictx->f, dpos, dlen, 256, NULL, 0x1);
603 chunk_handled:
604 return 1;
607 static void de_run_riff(deark *c, de_module_params *mparams)
609 lctx *d = NULL;
610 struct de_iffctx *ictx = NULL;
611 u8 buf[4];
613 d = de_malloc(c, sizeof(lctx));
614 ictx = de_malloc(c, sizeof(struct de_iffctx));
616 ictx->userdata = (void*)d;
617 ictx->preprocess_chunk_fn = my_preprocess_riff_chunk_fn;
618 ictx->handle_chunk_fn = my_riff_chunk_handler;
619 ictx->on_std_container_start_fn = my_on_std_container_start_fn;
620 ictx->on_container_end_fn = my_on_container_end_fn;
621 ictx->f = c->infile;
623 de_read(buf, 0, 4);
625 if(!de_memcmp(buf, "RIFF", 4)) {
626 ictx->is_le = 1;
627 ictx->reversed_4cc = 0;
629 else if(!de_memcmp(buf, "RIFX", 4)) {
630 ictx->is_le = 0;
631 ictx->reversed_4cc = 0;
633 else if(!de_memcmp(buf, "XFIR", 4)) {
634 ictx->is_le = 1;
635 ictx->reversed_4cc = 1;
637 else {
638 de_warn(c, "This is probably not a RIFF file.");
639 ictx->is_le = 1;
640 ictx->reversed_4cc = 0;
643 fmtutil_read_iff_format(c, ictx, 0, ictx->f->len);
645 de_free(c, ictx);
646 de_free(c, d);
649 static int de_identify_riff(deark *c)
651 u8 buf[4];
652 int has_sig;
653 i64 dlen;
655 de_read(buf, 0, 4);
656 has_sig = (!de_memcmp(buf, "RIFF", 4)) ||
657 (!de_memcmp(buf, "XFIR", 4)) ||
658 (!de_memcmp(buf, "RIFX", 4));
659 if(!has_sig) return 0;
661 dlen = de_getu32le(4);
662 // This check screens out .AMV format, for example.
663 if(dlen==0 && c->infile->len!=8) return 0;
665 return 50;
668 void de_module_riff(deark *c, struct deark_module_info *mi)
670 mi->id = "riff";
671 mi->desc = "RIFF-based formats";
672 mi->run_fn = de_run_riff;
673 mi->identify_fn = de_identify_riff;