2 * native ebml reader for the Matroska demuxer
3 * new parser copyright (c) 2010 Uoti Urpala
4 * copyright (c) 2004 Aurelien Jacobs <aurel@gnuage.org>
5 * based on the one written by Ronald Bultje for gstreamer
7 * This file is part of MPlayer.
9 * MPlayer is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * MPlayer is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
32 #include <libavutil/intfloat.h>
33 #include <libavutil/common.h>
36 #include "stream/stream.h"
41 #define SIZE_MAX ((size_t)-1)
45 * Read: the element content data ID.
48 uint32_t ebml_read_id(stream_t
*s
, int *length
)
50 int i
, len_mask
= 0x80;
53 for (i
= 0, id
= stream_read_char(s
); i
< 4 && !(id
& len_mask
); i
++)
56 return EBML_ID_INVALID
;
60 id
= (id
<< 8) | stream_read_char(s
);
65 * Read a variable length unsigned int.
67 uint64_t ebml_read_vlen_uint(uint8_t *buffer
, int *length
)
69 int i
, j
, num_ffs
= 0, len_mask
= 0x80;
72 for (i
= 0, num
= *buffer
++; i
< 8 && !(num
& len_mask
); i
++)
75 return EBML_UINT_INVALID
;
79 if ((int) (num
&= (len_mask
- 1)) == len_mask
- 1)
82 num
= (num
<< 8) | *buffer
++;
83 if ((num
& 0xFF) == 0xFF)
87 return EBML_UINT_INVALID
;
92 * Read a variable length signed int.
94 int64_t ebml_read_vlen_int(uint8_t *buffer
, int *length
)
99 /* read as unsigned number first */
100 unum
= ebml_read_vlen_uint(buffer
, &l
);
101 if (unum
== EBML_UINT_INVALID
)
102 return EBML_INT_INVALID
;
106 return unum
- ((1 << ((7 * l
) - 1)) - 1);
110 * Read: element content length.
112 uint64_t ebml_read_length(stream_t
*s
, int *length
)
114 int i
, j
, num_ffs
= 0, len_mask
= 0x80;
117 for (i
= 0, len
= stream_read_char(s
); i
< 8 && !(len
& len_mask
); i
++)
120 return EBML_UINT_INVALID
;
124 if ((int) (len
&= (len_mask
- 1)) == len_mask
- 1)
127 len
= (len
<< 8) | stream_read_char(s
);
128 if ((len
& 0xFF) == 0xFF)
132 return EBML_UINT_INVALID
;
133 if (len
>= 1ULL<<63) // Can happen if stream_read_char returns EOF
134 return EBML_UINT_INVALID
;
139 * Read the next element as an unsigned int.
141 uint64_t ebml_read_uint(stream_t
*s
, uint64_t *length
)
143 uint64_t len
, value
= 0;
146 len
= ebml_read_length(s
, &l
);
147 if (len
== EBML_UINT_INVALID
|| len
< 1 || len
> 8)
148 return EBML_UINT_INVALID
;
153 value
= (value
<< 8) | stream_read_char(s
);
159 * Read the next element as a signed int.
161 int64_t ebml_read_int(stream_t
*s
, uint64_t *length
)
167 len
= ebml_read_length(s
, &l
);
168 if (len
== EBML_UINT_INVALID
|| len
< 1 || len
> 8)
169 return EBML_INT_INVALID
;
174 l
= stream_read_char(s
);
177 value
= (value
<< 8) | l
;
179 value
= (value
<< 8) | stream_read_char(s
);
185 * Read the next element as a float.
187 double ebml_read_float(stream_t
*s
, uint64_t *length
)
193 len
= ebml_read_length(s
, &l
);
196 value
= av_int2float(stream_read_dword(s
));
200 value
= av_int2double(stream_read_qword(s
));
204 return EBML_FLOAT_INVALID
;
214 * Read the next element as an ASCII string.
216 char *ebml_read_ascii(stream_t
*s
, uint64_t *length
)
222 len
= ebml_read_length(s
, &l
);
223 if (len
== EBML_UINT_INVALID
)
225 if (len
> SIZE_MAX
- 1)
230 str
= malloc(len
+ 1);
231 if (stream_read(s
, str
, len
) != (int) len
) {
241 * Read the next element as a UTF-8 string.
243 char *ebml_read_utf8(stream_t
*s
, uint64_t *length
)
245 return ebml_read_ascii(s
, length
);
249 * Skip the next element.
251 int ebml_read_skip(stream_t
*s
, uint64_t *length
)
256 len
= ebml_read_length(s
, &l
);
257 if (len
== EBML_UINT_INVALID
)
268 * Read the next element, but only the header. The contents
269 * are supposed to be sub-elements which can be read separately.
271 uint32_t ebml_read_master(stream_t
*s
, uint64_t *length
)
276 id
= ebml_read_id(s
, NULL
);
277 if (id
== EBML_ID_INVALID
)
280 len
= ebml_read_length(s
, NULL
);
281 if (len
== EBML_UINT_INVALID
)
282 return EBML_ID_INVALID
;
291 #define EVALARGS(F, ...) F(__VA_ARGS__)
292 #define E(str, N, type) const struct ebml_elem_desc ebml_ ## N ## _desc = { str, type };
293 #define E_SN(str, count, N) const struct ebml_elem_desc ebml_ ## N ## _desc = { str, EBML_TYPE_SUBELEMENTS, sizeof(struct ebml_ ## N), count, (const struct ebml_field_desc[]){
294 #define E_S(str, count) EVALARGS(E_SN, str, count, N)
295 #define FN(id, name, multiple, N) { id, multiple, offsetof(struct ebml_ ## N, name), offsetof(struct ebml_ ## N, n_ ## name), &ebml_##name##_desc},
296 #define F(id, name, multiple) EVALARGS(FN, id, name, multiple, N)
297 #include "ebml_defs.c"
304 // Used to read/write pointers to different struct types
306 #define generic_struct struct generic
308 static uint32_t ebml_parse_id(uint8_t *data
, int *length
)
311 uint32_t id
= *data
++;
312 for (int len_mask
= 0x80; !(id
& len_mask
); len_mask
>>= 1) {
316 return EBML_ID_INVALID
;
321 id
= (id
<< 8) | *data
++;
325 static uint64_t parse_vlen(uint8_t *data
, int *length
, bool is_length
)
327 uint64_t r
= *data
++;
330 for (len_mask
= 0x80; !(r
& len_mask
); len_mask
>>= 1) {
340 if (r
== len_mask
- 1)
342 for (int i
= 1; i
< len
; i
++) {
345 r
= (r
<< 8) | *data
++;
347 if (is_length
&& num_allones
== len
) {
348 // According to Matroska specs this means "unknown length"
349 // Could be supported if there are any actual files using it
357 static uint64_t ebml_parse_length(uint8_t *data
, int *length
)
359 return parse_vlen(data
, length
, true);
362 static uint64_t ebml_parse_uint(uint8_t *data
, int length
)
364 assert(length
>= 1 && length
<= 8);
367 r
= (r
<< 8) + *data
++;
371 static int64_t ebml_parse_sint(uint8_t *data
, int length
)
373 assert(length
>=1 && length
<= 8);
378 r
= (r
<< 8) | *data
++;
382 static double ebml_parse_float(uint8_t *data
, int length
)
384 assert(length
== 4 || length
== 8);
385 uint64_t i
= ebml_parse_uint(data
, length
);
387 return av_int2float(i
);
389 return av_int2double(i
);
393 // target must be initialized to zero
394 static void ebml_parse_element(struct ebml_parse_ctx
*ctx
, void *target
,
395 uint8_t *data
, int size
,
396 const struct ebml_elem_desc
*type
, int level
)
398 assert(type
->type
== EBML_TYPE_SUBELEMENTS
);
400 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "%.*s[mkv] Parsing element %s\n",
401 level
, " ", type
->name
);
405 uint8_t *end
= data
+ size
;
407 int num_elems
[MAX_EBML_SUBELEMENTS
] = {};
410 uint32_t id
= ebml_parse_id(p
, &len
);
414 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Error parsing subelement "
419 uint64_t length
= ebml_parse_length(p
, &len
);
423 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Error parsing subelement "
430 for (int i
= 0; i
< type
->field_count
; i
++)
431 if (type
->fields
[i
].id
== id
) {
437 if (length
> end
- p
) {
438 if (field_idx
>= 0 && type
->fields
[field_idx
].desc
->type
439 != EBML_TYPE_SUBELEMENTS
) {
440 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Subelement content goes "
441 "past end of containing element\n");
444 // Try to parse what is possible from inside this partial element
445 ctx
->has_errors
= true;
453 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Subelement headers go "
454 "past end of containing element\n");
456 ctx
->has_errors
= true;
461 for (int i
= 0; i
< type
->field_count
; i
++)
462 if (num_elems
[i
] && type
->fields
[i
].multiple
) {
463 char *ptr
= s
+ type
->fields
[i
].offset
;
464 switch (type
->fields
[i
].desc
->type
) {
465 case EBML_TYPE_SUBELEMENTS
:
466 num_elems
[i
] = FFMIN(num_elems
[i
],
467 1000000000 / type
->fields
[i
].desc
->size
);
468 int size
= num_elems
[i
] * type
->fields
[i
].desc
->size
;
469 *(generic_struct
**) ptr
= talloc_zero_size(ctx
->talloc_ctx
,
473 *(uint64_t **) ptr
= talloc_zero_array(ctx
->talloc_ctx
,
474 uint64_t, num_elems
[i
]);
477 *(int64_t **) ptr
= talloc_zero_array(ctx
->talloc_ctx
,
478 int64_t, num_elems
[i
]);
480 case EBML_TYPE_FLOAT
:
481 *(double **) ptr
= talloc_zero_array(ctx
->talloc_ctx
,
482 double, num_elems
[i
]);
485 case EBML_TYPE_BINARY
:
486 *(struct bstr
**) ptr
= talloc_zero_array(ctx
->talloc_ctx
,
490 case EBML_TYPE_EBML_ID
:
491 *(int32_t **) ptr
= talloc_zero_array(ctx
->talloc_ctx
,
492 uint32_t, num_elems
[i
]);
501 uint32_t id
= ebml_parse_id(data
, &len
);
502 assert(len
>= 0 && len
<= end
- data
);
504 uint64_t length
= ebml_parse_length(data
, &len
);
505 assert(len
>= 0 && len
<= end
- data
);
507 if (length
> end
- data
) {
508 // Try to parse what is possible from inside this partial element
510 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Next subelement content goes "
511 "past end of containing element, will be truncated\n");
514 for (int i
= 0; i
< type
->field_count
; i
++)
515 if (type
->fields
[i
].id
== id
) {
521 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "%.*s[mkv] Ignoring Void element "
522 "size: %"PRIu64
"\n", level
+1, " ", length
);
524 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "%.*s[mkv] Ignoring CRC-32 "
525 "element size: %"PRIu64
"\n", level
+1, " ",
528 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Ignoring unrecognized "
529 "subelement. ID: %x size: %"PRIu64
"\n", id
, length
);
533 const struct ebml_field_desc
*fd
= &type
->fields
[field_idx
];
534 const struct ebml_elem_desc
*ed
= fd
->desc
;
535 bool multiple
= fd
->multiple
;
536 int *countptr
= (int *) (s
+ fd
->count_offset
);
537 if (*countptr
>= num_elems
[field_idx
]) {
538 // Shouldn't happen with on any sane file without bugs
539 mp_msg(MSGT_DEMUX
, MSGL_ERR
, "[mkv] Too many subelems?\n");
540 ctx
->has_errors
= true;
544 if (*countptr
> 0 && !multiple
) {
545 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "[mkv] Another subelement of type "
546 "%x %s (size: %"PRIu64
"). Only one allowed. Ignoring.\n",
547 id
, ed
->name
, length
);
548 ctx
->has_errors
= true;
552 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "%.*s[mkv] Parsing %x %s size: %"PRIu64
553 " value: ", level
+1, " ", id
, ed
->name
, length
);
555 char *fieldptr
= s
+ fd
->offset
;
557 case EBML_TYPE_SUBELEMENTS
:
558 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "subelements\n");
561 char *array_start
= (char *) *(generic_struct
**) fieldptr
;
562 subelptr
= array_start
+ *countptr
* ed
->size
;
565 ebml_parse_element(ctx
, subelptr
, data
, length
, ed
, level
+ 1);
568 case EBML_TYPE_UINT
:;
570 #define GETPTR(subelptr, fieldtype) \
572 subelptr = *(fieldtype **) fieldptr + *countptr; \
574 subelptr = (fieldtype *) fieldptr
575 GETPTR(uintptr
, uint64_t);
576 if (length
< 1 || length
> 8) {
577 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "uint invalid length %"PRIu64
581 *uintptr
= ebml_parse_uint(data
, length
);
582 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "uint %"PRIu64
"\n", *uintptr
);
585 case EBML_TYPE_SINT
:;
587 GETPTR(sintptr
, int64_t);
588 if (length
< 1 || length
> 8) {
589 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "sint invalid length %"PRIu64
593 *sintptr
= ebml_parse_sint(data
, length
);
594 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "sint %"PRId64
"\n", *sintptr
);
597 case EBML_TYPE_FLOAT
:;
599 GETPTR(floatptr
, double);
600 if (length
!= 4 && length
!= 8) {
601 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "float invalid length %"PRIu64
605 *floatptr
= ebml_parse_float(data
, length
);
606 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "float %f\n", *floatptr
);
610 case EBML_TYPE_BINARY
:;
612 GETPTR(strptr
, struct bstr
);
613 strptr
->start
= data
;
614 strptr
->len
= length
;
615 if (ed
->type
== EBML_TYPE_STR
)
616 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "string \"%.*s\"\n",
619 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "binary %zd bytes\n",
623 case EBML_TYPE_EBML_ID
:;
625 GETPTR(idptr
, uint32_t);
626 *idptr
= ebml_parse_id(data
, &len
);
628 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "ebml_id broken value\n");
631 mp_msg(MSGT_DEMUX
, MSGL_DBG2
, "ebml_id %x\n", (unsigned)*idptr
);
642 // target must be initialized to zero
643 int ebml_read_element(struct stream
*s
, struct ebml_parse_ctx
*ctx
,
644 void *target
, const struct ebml_elem_desc
*desc
)
646 ctx
->has_errors
= false;
647 int msglevel
= ctx
->no_error_messages
? MSGL_DBG2
: MSGL_WARN
;
648 uint64_t length
= ebml_read_length(s
, &ctx
->bytes_read
);
650 mp_msg(MSGT_DEMUX
, msglevel
, "[mkv] Unexpected end of file "
651 "- partial or corrupt file?\n");
654 if (length
> 1000000000) {
655 mp_msg(MSGT_DEMUX
, msglevel
, "[mkv] Refusing to read element over "
659 ctx
->talloc_ctx
= talloc_size(NULL
, length
+ 8);
660 int read_len
= stream_read(s
, ctx
->talloc_ctx
, length
);
661 ctx
->bytes_read
+= read_len
;
662 if (read_len
< length
)
663 mp_msg(MSGT_DEMUX
, msglevel
, "[mkv] Unexpected end of file "
664 "- partial or corrupt file?\n");
665 ebml_parse_element(ctx
, target
, ctx
->talloc_ctx
, read_len
, desc
, 0);
667 mp_msg(MSGT_DEMUX
, msglevel
, "[mkv] Error parsing element %s\n",