1 // This file is part of Deark.
2 // Copyright (C) 2018 Jason Summers
3 // See the file COPYING for terms of use.
7 #include <deark-config.h>
8 #include <deark-private.h>
// Declares this module's registration function, de_module_ebml, which is
// defined at the end of this file.
9 DE_DECLARE_MODULE(de_module_ebml
);
// Information collected about the attachment currently being processed.
// The members are filled in by handler_filename and handler_filedata, used
// by handler_attachedfile_end, and released by destroy_attachment_data.
11 struct attachmentctx_struct
{
// Name taken from the FileName element. Freed with ucstring_destroy.
12 de_ucstring
*filename
;
// Position (in the input file) of the FileData element's payload.
13 i64 data_pos
; // 0 = no info
// Length of the FileData element's payload.
14 i64 data_len
; // valid if data_pos!=0
// Module-local decoder state (referred to as "lctx" throughout this file).
// NOTE(review): other members that the code in this file reads
// (e.g. show_encoded_id, level) are not visible in this excerpt.
17 typedef struct localctx_struct
{
// The attachment currently being collected, or NULL if there is none.
20 struct attachmentctx_struct
*attachmentctx
;
// Parameters passed to an element handler function (handler_fn_type).
// The handlers in this file read its dpos and dlen members, which they
// treat as the position and length of the element's data.
// NOTE(review): the member declarations are not visible in this excerpt.
23 struct handler_params
{
27 // Set if handler is being called (again) at the *end* of the element
30 typedef void (*handler_fn_type
)(deark
*c
, lctx
*d
, struct handler_params
*hp
);
// Type codes and flag bits used in the type/flags value of each
// ele_id_info_arr entry. do_element extracts the type code with
// (flags & 0xff) and tests the higher bits individually.
34 // The low byte is a code for the data type:
35 #define TY_m 0x01 // Master
36 #define TY_u 0x02 // unsigned int
37 #define TY_i 0x03 // signed int
38 #define TY_s 0x04 // string
39 #define TY_8 0x05 // UTF-8 string
40 #define TY_b 0x06 // binary
41 #define TY_f 0x07 // float
42 #define TY_d 0x08 // date
43 // 0x0100 = Don't decode this element by default, because it's too noisy
44 // 0x0200 = Don't decode this element, because it has special handling
45 // 0x0800 = Also call the handler function at the *end* of the element
46 // (useful for TY_m only)
// Maps a data type code (one of the TY_* values) to a human-readable name.
// do_element prints the result in its "type=" debug output.
// NOTE(review): the switch header, the handling of unrecognized codes, and
// the return statement are not visible in this excerpt.
54 static const char *get_type_name(unsigned int t
)
58 case TY_m
: name
="master"; break;
59 case TY_u
: name
="unsigned int"; break;
60 case TY_i
: name
="signed int"; break;
61 case TY_s
: name
="string"; break;
62 case TY_8
: name
="UTF-8 string"; break;
63 case TY_b
: name
="binary"; break;
64 case TY_f
: name
="float"; break;
65 case TY_d
: name
="date"; break;
// Decodes one EBML variable-size integer from f, starting at *pos, and
// stores the decoded value in *val.
// The number of leading 0 bits in the first byte gives the number of
// additional bytes that follow; the marker bit is masked off and the
// remaining bits are accumulated most-significant byte first.
// Callers in this file treat a return value of 1 as success, and (when
// reading an element length) a return value of 2 as an unknown length.
// NOTE(review): parts of this function, including the rest of its parameter
// list and its return statements, are not visible in this excerpt.
71 // Read a "Variable Size Integer".
76 // 2 for a special "reserved" value
77 static int get_var_size_int(dbuf
*f
, i64
*val
, i64
*pos
,
86 unsigned int initial_zero_bits
;
// Nothing can be read if no bytes are available.
89 if(nbytes_avail
<1) goto done
;
91 // This is an unsigned int. In a i64, we can support up to 63
93 // For now we'll hope that 8 octets is the most we'll have to support,
94 // but it's possible we'll have to support 9 or even more, which will
95 // require additional logic.
97 // 1xxxxxxx width_nbits=0, octets=1, data_bits=7
98 // 01xxxxxx width_nbits=1, octets=2, data_bits=14
100 // 00000001 width_nbits=7, octets=8, data_bits=56
101 // 00000000 1xxxxxxx width_nbits=8, octets=9, data_bits=63
// First byte: carries the length marker plus the high-order data bits.
103 b
= dbuf_getbyte(f
, *pos
);
107 initial_zero_bits
= 0;
113 // "Not it". Try the next-larger number of initial 0 bits.
// Keep only the data bits of the first byte (drops the zero bits and the
// marker bit).
121 mask
= 0x7f >> initial_zero_bits
;
123 *val
= (i64
)(b
& mask
);
125 // Read remaining bytes, if any.
126 for(k
=0; k
<initial_zero_bits
; k
++) {
// Stop if the integer would extend past the available bytes.
127 if(*pos
>= pos1
+nbytes_avail
) goto done
;
128 b
= dbuf_getbyte(f
, *pos
);
// Guard evaluated before the 8-bit left shift below; values above this
// threshold take a separate branch whose body is not visible here.
130 if(*val
> 0x07ffffffffffffffLL
) {
133 *val
= ((*val
)<<8) | ((i64
)b
);
// A one-byte integer whose data bits are all 1 is the "reserved" value.
136 if(initial_zero_bits
==0 && (*val
)==0x7f) {
137 // TODO: Fully handle "reserved" element value of all 1 bits.
151 static void handler_filename(deark
*c
, lctx
*d
, struct handler_params
*hp
)
153 if(!d
->attachmentctx
) return;
154 if(d
->attachmentctx
->filename
) return;
155 if(hp
->dlen
<1) return;
157 d
->attachmentctx
->filename
= ucstring_create(c
);
158 dbuf_read_to_ucstring_n(c
->infile
, hp
->dpos
, hp
->dlen
, 300,
159 d
->attachmentctx
->filename
,
160 DE_CONVFLAG_STOP_AT_NUL
, DE_ENCODING_UTF8
);
163 static void handler_filedata(deark
*c
, lctx
*d
, struct handler_params
*hp
)
165 if(!d
->attachmentctx
) return;
166 d
->attachmentctx
->data_pos
= hp
->dpos
;
167 d
->attachmentctx
->data_len
= hp
->dlen
;
170 static void destroy_attachment_data(deark
*c
, lctx
*d
)
172 if(!d
->attachmentctx
) return;
173 ucstring_destroy(d
->attachmentctx
->filename
);
174 de_free(c
, d
->attachmentctx
);
175 d
->attachmentctx
= NULL
;
178 static void handler_attachedfile_start(deark
*c
, lctx
*d
, struct handler_params
*hp
)
180 if(d
->attachmentctx
) {
181 destroy_attachment_data(c
, d
);
184 d
->attachmentctx
= de_malloc(c
, sizeof(struct attachmentctx_struct
));
// Called at the end of an AttachedFile element: if a FileData payload was
// recorded, writes it to an output file, then releases the attachment
// context.
// NOTE(review): the local declarations, the else-branch structure, and the
// "done" label are not visible in this excerpt.
187 static void handler_attachedfile_end(deark
*c
, lctx
*d
)
// Nothing to extract if there is no attachment, or no data was recorded.
191 if(!d
->attachmentctx
) goto done
;
192 if(d
->attachmentctx
->data_pos
==0) goto done
;
194 fi
= de_finfo_create(c
);
196 // TODO: We could do a better job of constructing filenames in various
// Use the name stored in the file only if one was found, it is nonempty,
// and c->filenames_from_file is set.
198 if(d
->attachmentctx
->filename
&&
199 (d
->attachmentctx
->filename
->len
> 0) &&
200 c
->filenames_from_file
)
202 de_finfo_set_name_from_ucstring(c
, fi
, d
->attachmentctx
->filename
, 0);
// Generic name "bin" (presumably the fallback branch — confirm).
206 de_finfo_set_name_from_sz(c
, fi
, "bin", 0, DE_ENCODING_UTF8
);
// Write the recorded slice of the input file to a new output file.
209 dbuf_create_file_from_slice(c
->infile
, d
->attachmentctx
->data_pos
,
210 d
->attachmentctx
->data_len
, NULL
, fi
, DE_CREATEFLAG_IS_AUX
);
213 de_finfo_destroy(c
, fi
);
214 destroy_attachment_data(c
, d
);
// Handler for the AttachedFile master element. Its table entry has the
// 0x0800 flag, so do_element calls it both before and after the element's
// children; it dispatches to handler_attachedfile_end or
// handler_attachedfile_start.
// NOTE(review): the condition that selects between the two calls is not
// visible in this excerpt.
217 static void handler_attachedfile(deark
*c
, lctx
*d
, struct handler_params
*hp
)
220 handler_attachedfile_end(c
, d
);
223 handler_attachedfile_start(c
, d
, hp
);
227 static void handler_hexdumpa(deark
*c
, lctx
*d
, struct handler_params
*hp
)
229 de_dbg_hexdump(c
, c
->infile
, hp
->dpos
, hp
->dlen
, 256, NULL
, 0x1);
232 static void handler_hexdumpb(deark
*c
, lctx
*d
, struct handler_params
*hp
)
234 de_dbg_hexdump(c
, c
->infile
, hp
->dpos
, hp
->dlen
, 256, NULL
, 0x0);
// Table of known element IDs. Each entry gives, in order: the data type
// code (TY_*) optionally OR'ed with flag bits, the decoded element ID, the
// element's name, and an optional handler function (NULL if none).
// The table is searched linearly by find_ele_id_info.
237 static const struct ele_id_info ele_id_info_arr
[] = {
238 // Note that the Matroska spec may conflate encoded IDs with decoded IDs.
239 // This table lists decoded IDs. Encoded IDs have an extra 1 bit in a
240 // position that makes it more significant than any of the other 1 bits.
241 {TY_m
, 0x0, "ChapterDisplay", NULL
},
242 {TY_u
, 0x3, "TrackType", NULL
},
243 {TY_8
, 0x5, "ChapString", NULL
},
244 {TY_s
, 0x6, "CodecID", NULL
},
245 {TY_u
, 0x8, "FlagDefault", NULL
},
246 {TY_u
, 0x11, "ChapterTimeStart", NULL
},
247 {TY_u
, 0x12, "ChapterTimeEnd", NULL
},
248 {TY_u
, 0x18, "ChapterFlagHidden", NULL
},
249 {TY_u
, 0x1a, "FlagInterlaced", NULL
},
250 {TY_u
, 0x1b, "BlockDuration", NULL
},
251 {TY_u
, 0x1c, "FlagLacing", NULL
},
252 {TY_u
, 0x1f, "Channels", NULL
},
253 {TY_m
, 0x20, "BlockGroup", NULL
},
254 {TY_b
, 0x21, "Block", NULL
},
255 {TY_b
, 0x23, "SimpleBlock", NULL
},
256 {TY_u
, 0x27, "Position", NULL
},
257 {TY_u
, 0x2a, "CodecDecodeAll", NULL
},
258 {TY_u
, 0x2b, "PrevSize", NULL
},
259 {TY_m
, 0x2e, "TrackEntry", NULL
},
260 {TY_u
, 0x30, "PixelWidth", NULL
},
261 {TY_u
, 0x33, "CueTime", NULL
},
262 {TY_f
, 0x35, "SamplingFrequency", NULL
},
263 {TY_m
, 0x36, "ChapterAtom", NULL
},
264 {TY_m
, 0x37, "CueTrackPositions", NULL
},
265 {TY_u
, 0x39, "FlagEnabled", NULL
},
266 {TY_u
, 0x3a, "PixelHeight", NULL
},
267 {TY_m
, 0x3b, "CuePoint", NULL
},
268 {TY_b
, 0x3f, "CRC-32", handler_hexdumpb
},
269 {TY_u
, 0x57, "TrackNumber", NULL
},
270 {TY_m
, 0x60, "Video", NULL
},
271 {TY_m
, 0x61, "Audio", NULL
},
272 {TY_u
, 0x67, "Timecode", NULL
},
273 {TY_b
|0x0100, 0x6c, "Void", handler_hexdumpb
},
274 {TY_u
, 0x70, "CueRelativePosition", NULL
},
275 {TY_u
, 0x71, "CueClusterPosition", NULL
},
276 {TY_u
, 0x77, "CueTrack", NULL
},
277 {TY_i
, 0x7b, "ReferenceBlock", NULL
},
278 {TY_u
, 0x254, "ContentCompAlgo", NULL
},
279 {TY_b
, 0x255, "ContentCompSettings", handler_hexdumpb
},
280 {TY_s
, 0x282, "DocType", NULL
},
281 {TY_u
, 0x285, "DocTypeReadVersion", NULL
},
282 {TY_u
, 0x286, "EBMLVersion", NULL
},
283 {TY_u
, 0x287, "DocTypeVersion", NULL
},
284 {TY_u
, 0x2f2, "EBMLMaxIDLength", NULL
},
285 {TY_u
, 0x2f3, "EBMLMaxSizeLength", NULL
},
286 {TY_u
, 0x2f7, "EBMLReadVersion", NULL
},
287 {TY_s
, 0x37c, "ChapLanguage", NULL
},
288 {TY_d
, 0x461, "DateUTC", NULL
},
289 {TY_s
, 0x47a, "TagLanguage", NULL
},
290 {TY_u
, 0x484, "TagDefault", NULL
},
291 {TY_8
, 0x487, "TagString", NULL
},
292 {TY_f
, 0x489, "Duration", NULL
},
293 // 0x4b4, "TagDefault?" // Some buggy software does this
294 {TY_u
, 0x598, "ChapterFlagEnabled", NULL
},
295 {TY_8
, 0x5a3, "TagName", NULL
},
296 {TY_m
, 0x5b9, "EditionEntry", NULL
},
297 {TY_u
, 0x5bc, "EditionUID", NULL
},
298 {TY_u
, 0x5bd, "EditionFlagHidden", NULL
},
299 {TY_u
, 0x5db, "EditionFlagDefault", NULL
},
300 {TY_u
, 0x5dd, "EditionFlagOrdered", NULL
},
301 {TY_b
, 0x65c, "FileData", handler_filedata
},
302 {TY_s
, 0x660, "FileMimeType", NULL
},
303 {TY_8
, 0x66e, "FileName", handler_filename
},
304 {TY_u
, 0x6ae, "FileUID", NULL
},
305 {TY_8
, 0xd80, "MuxingApp", NULL
},
306 {TY_m
, 0xdbb, "Seek", NULL
},
307 {TY_m
, 0x1034, "ContentCompression", NULL
},
308 {TY_8
, 0x136e, "Name", NULL
},
309 {TY_b
, 0x13ab, "SeekID", handler_hexdumpb
},
310 {TY_u
, 0x13ac, "SeekPosition", NULL
},
311 {TY_u
, 0x13b8, "StereoMode", NULL
},
312 {TY_u
, 0x14b0, "DisplayWidth", NULL
},
313 {TY_u
, 0x14b2, "DisplayUnit", NULL
},
314 {TY_u
, 0x14ba, "DisplayHeight", NULL
},
315 {TY_u
, 0x15aa, "FlagForced", NULL
},
316 {TY_u
, 0x15ee, "MaxBlockAdditionID", NULL
},
317 {TY_8
, 0x1741, "WritingApp", NULL
},
318 {TY_m
|0x0800, 0x21a7, "AttachedFile", handler_attachedfile
},
319 {TY_m
, 0x2240, "ContentEncoding", NULL
},
320 {TY_u
, 0x2264, "BitDepth", NULL
},
321 {TY_b
, 0x23a2, "CodecPrivate", handler_hexdumpa
},
322 {TY_m
, 0x23c0, "Targets", NULL
},
323 {TY_u
, 0x23c5, "TagTrackUID", NULL
},
324 {TY_s
, 0x23ca, "TargetType", NULL
},
325 {TY_m
, 0x27c8, "SimpleTag", NULL
},
326 {TY_u
, 0x28ca, "TargetTypeValue", NULL
},
327 {TY_m
, 0x2d80, "ContentEncodings", NULL
},
328 {TY_u
, 0x2de7, "MinCache", NULL
},
329 {TY_u
, 0x2df8, "MaxCache", NULL
},
330 {TY_m
, 0x3373, "Tag", NULL
},
331 {TY_b
, 0x33a4, "SegmentUID", handler_hexdumpb
},
332 {TY_u
, 0x33c4, "ChapterUID", NULL
},
333 {TY_u
, 0x33c5, "TrackUID", NULL
},
334 {TY_f
, 0x38b5, "OutputSamplingFrequency", NULL
},
335 {TY_8
, 0x3ba9, "Title", NULL
},
336 {TY_s
, 0x2b59c, "Language", NULL
},
337 {TY_f
, 0x3314f, "TrackTimecodeScale (deprecated)", NULL
},
338 {TY_u
, 0x3e383, "DefaultDuration", NULL
},
339 {TY_u
, 0xad7b1, "TimecodeScale", NULL
},
340 {TY_m
, 0x43a770, "Chapters", NULL
},
341 {TY_m
|0x0100, 0x14d9b74, "SeekHead", NULL
},
342 {TY_m
, 0x254c367, "Tags", NULL
},
343 {TY_m
, 0x549a966, "Info", NULL
},
344 {TY_m
, 0x654ae6b, "Tracks", NULL
},
345 {TY_m
, 0x8538067, "Segment", NULL
},
346 {TY_m
, 0x941a469, "Attachments", NULL
},
347 {TY_m
, 0xa45dfa3, "EBML", NULL
},
348 {TY_m
, 0xc53bb6b, "Cues", NULL
},
349 {TY_m
|0x0100, 0xf43b675, "Cluster", NULL
}
// Looks up a decoded element ID in ele_id_info_arr with a linear search and
// returns a pointer to the matching entry.
// NOTE(review): the result for an unknown ID is not visible in this
// excerpt; do_element checks the returned pointer before using it, so it is
// presumably NULL — confirm.
352 static const struct ele_id_info
*find_ele_id_info(i64 ele_id
)
355 for(k
=0; k
<DE_ARRAYCOUNT(ele_id_info_arr
); k
++) {
356 if(ele_id_info_arr
[k
].ele_id
== ele_id
) {
357 return &ele_id_info_arr
[k
];
// Decodes an unsigned integer element and prints its value to the debug
// output. The element's data is read as a big-endian integer of len1 bytes
// (the first byte is the most significant).
// NOTE(review): the rest of the parameter list and the local declarations
// (including the initial value of v) are not visible in this excerpt.
363 // This is a variable size integer, but it's different from the one named
364 // "Variable Size Integer".
365 static void decode_uint(deark
*c
, lctx
*d
, const struct ele_id_info
*ele_id
,
// A zero-length element skips the byte loop; the accumulated value is up to 8 bytes.
372 if(len1
==0) goto done
;
// Lengths outside 1..8 bytes are silently ignored.
373 if(len1
<1 || len1
>8) return;
374 len
= (unsigned int)len1
;
// Accumulate the bytes, shifting each into place by its significance.
377 for(k
=0; k
<len
; k
++) {
379 x
= (u64
)de_getbyte(pos
+(i64
)k
);
380 v
|= x
<<((len
-1-k
)*8);
384 de_dbg(c
, "value: %"U64_FMT
, v
);
// Decodes a floating point element and prints its value to the debug
// output. The value is read either as a 32-bit or as a 64-bit float.
// NOTE(review): the condition selecting between the two reads is not
// visible in this excerpt (presumably the element length — confirm), nor is
// the meaning of the final 0 argument to the dbuf_getfloat*x calls.
387 static void decode_float(deark
*c
, lctx
*d
, const struct ele_id_info
*ele_id
,
393 v
= dbuf_getfloat32x(c
->infile
, pos
, 0);
396 v
= dbuf_getfloat64x(c
->infile
, pos
, 0);
401 de_dbg(c
, "value: %f", v
);
// Converts an EBML date (ed) to a de_timestamp, by rebasing it to the Unix
// epoch and then attaching the sub-second part.
// NOTE(review): the statement that derives t from ed is not visible in this
// excerpt.
// NOTE(review): for a negative ed (a date before 2001), ed%1000000000 is
// negative in C, so a negative fraction is passed to
// de_timestamp_set_subsec — confirm that this is handled as intended.
404 static void EBMLdate_to_timestamp(i64 ed
, struct de_timestamp
*ts
)
408 // ed is the number of nanoseconds since the beginning of 2001.
410 // Now t is seconds since the beginning of 2001.
411 // We want seconds since the beginning of 1970.
412 // So, add the number of seconds in the years from 1970 through 2000. This
414 // There are 86400 seconds in a day.
415 // There are 8 leap days in this range ('72, 76, 80, 84, 88, 92, 96, 00).
416 t
+= 86400LL * (31*365 + 8);
417 de_unix_time_to_timestamp(t
, ts
, 0x1);
// Fractional second: the nanosecond remainder, scaled to seconds.
418 de_timestamp_set_subsec(ts
, ((double)(ed
%1000000000))/1000000000.0);
// Decodes a date element: reads a big-endian 64-bit signed integer at pos,
// converts it with EBMLdate_to_timestamp, and prints both the raw value and
// the formatted timestamp to the debug output.
// NOTE(review): the rest of the parameter list, the remaining local
// declarations, and any length check are not visible in this excerpt.
421 static void decode_date(deark
*c
, lctx
*d
, const struct ele_id_info
*ele_id
,
425 struct de_timestamp ts
;
429 dt_int
= de_geti64be(pos
);
430 EBMLdate_to_timestamp(dt_int
, &ts
);
431 de_timestamp_to_string(&ts
, buf
, sizeof(buf
), 0);
432 de_dbg(c
, "value: %"I64_FMT
" (%s)", dt_int
, buf
);
// Decodes a string element and prints it to the debug output.
// Up to DE_DBG_MAX_STRLEN bytes are read from the element's data, stopping
// at the first NUL, and converted using the encoding given by ee.
// NOTE(review): the end of this function (presumably the release of s) is
// not visible in this excerpt.
435 static void decode_string(deark
*c
, lctx
*d
, const struct ele_id_info
*ele_id
,
436 i64 pos
, i64 len
, de_ext_encoding ee
)
438 de_ucstring
*s
= NULL
;
440 s
= ucstring_create(c
);
441 dbuf_read_to_ucstring_n(c
->infile
, pos
, len
, DE_DBG_MAX_STRLEN
, s
,
442 DE_CONVFLAG_STOP_AT_NUL
, ee
);
443 de_dbg(c
, "value: \"%s\"", ucstring_getpsz_d(s
));
// Prints the raw (encoded) bytes of an element ID to the debug output, each
// byte formatted as two hex digits in square brackets, e.g. "[1a][45]".
// pos, len: position and length of the encoded ID in the input file.
// NOTE(review): the remaining local declarations and the end of this
// function are not visible in this excerpt.
448 // Print an element ID number, in the format used by the Matroska spec.
449 static void print_encoded_id(deark
*c
, lctx
*d
, i64 pos
, i64 len
)
451 de_ucstring
*s
= NULL
;
455 s
= ucstring_create(c
);
// Append one "[xx]" group per byte of the encoded ID.
456 for(i
=0; i
<len
; i
++) {
457 ucstring_printf(s
, DE_ENCODING_LATIN1
, "[%02x]",
458 (unsigned int)de_getbyte(pos
+i
));
460 de_dbg(c
, "encoded id: %s", ucstring_getpsz_d(s
));
464 static int do_element_sequence(deark
*c
, lctx
*d
, i64 pos1
, i64 len
);
// Parses a single element that starts at pos1.
// Reads the element's ID and data length (both variable-size integers),
// looks the ID up in ele_id_info_arr, prints debug information, and then,
// depending on the entry's type and flags, decodes the data and/or calls
// the entry's handler function.
//  nbytes_avail: number of bytes available, starting at pos1.
//  bytes_used: receives the number of bytes consumed (pos - pos1).
// NOTE(review): many lines of this function, including some declarations,
// the conditions guarding several statements, and the return statements,
// are not visible in this excerpt.
466 static int do_element(deark
*c
, lctx
*d
, i64 pos1
,
467 i64 nbytes_avail
, i64
*bytes_used
)
473 const struct ele_id_info
*einfo
;
474 const char *ele_name
;
475 int saved_indent_level
;
477 int should_call_start_handler
= 0;
478 int should_decode_default
= 0;
479 int should_print_NOT_DECODING_msg
= 0;
480 int should_call_end_handler
= 0;
484 de_dbg_indent_save(c
, &saved_indent_level
);
486 de_dbg(c
, "element at %"I64_FMT
", max_len=%"I64_FMT
, pos1
, nbytes_avail
);
// Read the element ID. Only a return value of 1 is accepted here.
489 if(1!=get_var_size_int(c
->infile
, &ele_id
, &pos
, nbytes_avail
)) {
490 de_err(c
, "Failed to read ID of element at %"I64_FMT
, pos1
);
// Look up what we know about this ID.
494 einfo
= find_ele_id_info(ele_id
);
495 if(einfo
&& einfo
->name
)
496 ele_name
= einfo
->name
;
// The low byte of the flags is the data type code (TY_*).
501 dtype
= einfo
->flags
& 0xff;
505 de_dbg(c
, "id: 0x%"U64_FMTx
" (%s)", (u64
)ele_id
, ele_name
);
506 if(d
->show_encoded_id
) {
507 print_encoded_id(c
, d
, pos1
, pos
-pos1
);
// Read the data length, limited to the bytes remaining for this element.
510 len_ret
= get_var_size_int(c
->infile
, &ele_dlen
, &pos
, pos1
+nbytes_avail
-pos
);
512 de_snprintf(tmpbuf
, sizeof(tmpbuf
), "%"I64_FMT
, ele_dlen
);
// A return value of 2 means the length is the special "unknown" value;
// provisionally treat the element as extending to the end of the file.
514 else if(len_ret
==2) {
515 ele_dlen
= c
->infile
->len
- pos
;
516 de_strlcpy(tmpbuf
, "unknown", sizeof(tmpbuf
));
519 de_err(c
, "Failed to read length of element at %"I64_FMT
, pos1
);
522 de_dbg(c
, "data at %"I64_FMT
", dlen=%s, type=%s", pos
, tmpbuf
,
523 get_type_name(dtype
));
526 // EBML does not have any sort of end-of-master-element marker, which
527 // presents a problem when a master element has an unknown length.
529 // EBML's "solution" is this:
530 // "The end of an Unknown-Sized Element is determined by whichever
531 // comes first: the end of the file or the beginning of the next EBML
532 // Element, defined by this document or the corresponding EBML Schema,
533 // that is not independently valid as Descendant Element of the
534 // Unknown-Sized Element."
536 // This would appear to require a sophisticated, high-level algorithm
537 // with 100% complete knowledge of the latest version of the specific
538 // application format. We do not have such an algorithm.
540 de_err(c
, "EBML files with unknown-length elements are not supported");
// Reject an element whose data would extend past the end of the file.
544 if(pos
+ ele_dlen
> c
->infile
->len
) {
545 de_err(c
, "Element at %"I64_FMT
" goes beyond end of file", pos1
);
// Decide whether to decode the element, based on its flags:
// 0x0200 suppresses decoding entirely; 0x0100 suppresses it unless the
// debug level is at least 2, and prints a note instead.
550 should_decode_default
= 1;
552 if(einfo
->flags
& 0x0200) {
553 should_decode_default
= 0;
555 else if((einfo
->flags
& 0x0100) && c
->debug_level
<2) {
556 should_decode_default
= 0;
557 should_print_NOT_DECODING_msg
= 1;
// The handler (if any) is called only for elements that are being decoded.
561 if(should_decode_default
&& einfo
&& einfo
->hfn
) {
562 should_call_start_handler
= 1;
// With flag 0x0800, the handler is called a second time after the element.
565 if(should_decode_default
&& einfo
&& einfo
->hfn
&& (einfo
->flags
& 0x0800)) {
566 should_call_end_handler
= 1;
569 if(should_call_start_handler
) {
570 struct handler_params hp
;
571 de_zeromem(&hp
, sizeof(struct handler_params
));
574 einfo
->hfn(c
, d
, &hp
);
// Default decoding, selected by data type (the selecting statements are not
// visible in this excerpt).
577 if(should_decode_default
) {
580 do_element_sequence(c
, d
, pos
, ele_dlen
);
583 decode_uint(c
, d
, einfo
, pos
, ele_dlen
);
586 decode_float(c
, d
, einfo
, pos
, ele_dlen
);
589 decode_string(c
, d
, einfo
, pos
, ele_dlen
, DE_ENCODING_UTF8
);
592 decode_string(c
, d
, einfo
, pos
, ele_dlen
,
593 DE_EXTENC_MAKE(DE_ENCODING_ASCII
, DE_ENCSUBTYPE_PRINTABLE
));
596 decode_date(c
, d
, einfo
, pos
, ele_dlen
);
601 if(should_print_NOT_DECODING_msg
) {
602 de_dbg(c
, "[not decoding this element]");
606 if(should_call_end_handler
) {
607 struct handler_params hp
;
608 de_zeromem(&hp
, sizeof(struct handler_params
));
612 einfo
->hfn(c
, d
, &hp
);
// Report how many bytes this element occupied.
617 *bytes_used
= pos
- pos1
;
621 de_dbg_indent_restore(c
, saved_indent_level
);
// Parses a sequence of consecutive elements occupying len bytes starting at
// pos1, calling do_element for each one.
// Processing is abandoned when d->level exceeds 16, which limits how deeply
// nested elements are followed. An empty sequence (len==0) is treated as
// success (retval = 1).
// NOTE(review): the local declarations, the loop structure, the updates to
// d->level and pos, and the return statement are not visible in this
// excerpt.
625 static int do_element_sequence(deark
*c
, lctx
*d
, i64 pos1
, i64 len
)
630 int saved_indent_level
;
633 // From the EBML spec:
634 // "data that is not part of an EBML Element is permitted to be present
635 // within a Master Element if unknownsizeallowed is enabled within the
636 // definition for that Master Element. In this case, the EBML Reader
637 // should skip data until a valid Element ID of the same EBMLParentPath or
638 // the next upper level Element Path of the Master Element is found."
640 // We do not support this. We can't even detect it, so our parser will go
641 // off the rails. How do you even support it efficiently? What kind of
642 // psychopath designs a format like this? It's incredibly fragile (a new
643 // format version that defines a new optional element will completely
644 // break backward compatibility), and its abstractions are leaking all
648 de_dbg_indent_save(c
, &saved_indent_level
);
// Nesting depth limit.
649 if(d
->level
> 16) goto done
;
650 if(len
==0) { retval
= 1; goto done
; }
652 de_dbg(c
, "element sequence at %"I64_FMT
", max_len=%"I64_FMT
, pos1
, len
);
// Reached the end of the sequence.
657 if(pos
>= pos1
+len
) {
// Parse the next element, offering it all bytes remaining in the sequence.
660 ret
= do_element(c
, d
, pos
, pos1
+len
-pos
, &ele_len
);
// An element that consumed no bytes would never advance; give up.
662 if(ele_len
<1) goto done
;
669 de_dbg_indent_restore(c
, saved_indent_level
);
// Module entry point: allocates the module state, reads the
// "ebml:encodedid" option, parses the file as one top-level element
// sequence, and releases any leftover attachment data.
// NOTE(review): the local declarations, the initialization of pos, and the
// release of d are not visible in this excerpt.
674 static void de_run_ebml(deark
*c
, de_module_params
*mparams
)
679 d
= de_malloc(c
, sizeof(lctx
));
// Optional: also print element IDs in their raw encoded form.
681 if(de_get_ext_option(c
, "ebml:encodedid")) {
682 d
->show_encoded_id
= 1;
686 do_element_sequence(c
, d
, pos
, c
->infile
->len
);
// Free any attachment context left over from parsing.
689 destroy_attachment_data(c
, d
);
// Format identification: checks whether the file begins with the 4-byte
// signature 1A 45 DF A3.
// NOTE(review): the values returned for a match and a non-match are not
// visible in this excerpt.
694 static int de_identify_ebml(deark
*c
)
696 if(!dbuf_memcmp(c
->infile
, 0, "\x1a\x45\xdf\xa3", 4))
701 static void de_help_ebml(deark
*c
)
703 de_msg(c
, "-opt ebml:encodedid : Also print element ID numbers in raw form");
// Module registration: fills in the module info structure with this
// module's run, identify, and help callbacks.
// NOTE(review): other assignments in this function are not visible in this
// excerpt.
706 void de_module_ebml(deark
*c
, struct deark_module_info
*mi
)
710 mi
->run_fn
= de_run_ebml
;
711 mi
->identify_fn
= de_identify_ebml
;
712 mi
->help_fn
= de_help_ebml
;