1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "media/formats/mp4/mp4_stream_parser.h"
7 #include "base/callback_helpers.h"
8 #include "base/logging.h"
9 #include "base/time/time.h"
10 #include "media/base/audio_decoder_config.h"
11 #include "media/base/stream_parser_buffer.h"
12 #include "media/base/text_track_config.h"
13 #include "media/base/video_decoder_config.h"
14 #include "media/base/video_util.h"
15 #include "media/formats/mp4/box_definitions.h"
16 #include "media/formats/mp4/box_reader.h"
17 #include "media/formats/mp4/es_descriptor.h"
18 #include "media/formats/mp4/rcheck.h"
19 #include "media/formats/mpeg/adts_constants.h"
// Constructs a parser that starts in the kWaitingForInit state.
// |audio_object_types| is copied into audio_object_types_, which ParseMoov()
// searches to decide whether an audio track's object type is acceptable.
// highest_end_offset_ and num_top_level_box_skipped_ start at 0 and both
// tracks start out marked as not encrypted.
// NOTE(review): members used elsewhere in this file (moof_head_, mdat_tail_,
// has_audio_, has_video_, audio_track_id_, video_track_id_, has_sbr_) are not
// visibly initialized in this initializer list -- confirm against the full
// constructor.
24 MP4StreamParser::MP4StreamParser(const std::set
<int>& audio_object_types
,
26 : state_(kWaitingForInit
),
29 highest_end_offset_(0),
34 audio_object_types_(audio_object_types
),
36 is_audio_track_encrypted_(false),
37 is_video_track_encrypted_(false),
38 num_top_level_box_skipped_(0) {
41 MP4StreamParser::~MP4StreamParser() {}
43 void MP4StreamParser::Init(
44 const InitCB
& init_cb
,
45 const NewConfigCB
& config_cb
,
46 const NewBuffersCB
& new_buffers_cb
,
47 bool /* ignore_text_tracks */,
48 const EncryptedMediaInitDataCB
& encrypted_media_init_data_cb
,
49 const NewMediaSegmentCB
& new_segment_cb
,
50 const base::Closure
& end_of_segment_cb
,
51 const scoped_refptr
<MediaLog
>& media_log
) {
52 DCHECK_EQ(state_
, kWaitingForInit
);
53 DCHECK(init_cb_
.is_null());
54 DCHECK(!init_cb
.is_null());
55 DCHECK(!config_cb
.is_null());
56 DCHECK(!new_buffers_cb
.is_null());
57 DCHECK(!encrypted_media_init_data_cb
.is_null());
58 DCHECK(!end_of_segment_cb
.is_null());
60 ChangeState(kParsingBoxes
);
62 config_cb_
= config_cb
;
63 new_buffers_cb_
= new_buffers_cb
;
64 encrypted_media_init_data_cb_
= encrypted_media_init_data_cb
;
65 new_segment_cb_
= new_segment_cb
;
66 end_of_segment_cb_
= end_of_segment_cb
;
67 media_log_
= media_log
;
// NOTE(review): only the opening line of Reset() is visible here; the
// statements of its body are not shown, so which members it clears has to be
// confirmed against the full definition.
70 void MP4StreamParser::Reset() {
// Returns the parser to the kParsingBoxes state. Init() must already have
// been called: the parser may not be in kWaitingForInit (DCHECKed).
77 void MP4StreamParser::Flush() {
78 DCHECK_NE(state_
, kWaitingForInit
);
80 ChangeState(kParsingBoxes
);
// Appends |size| bytes starting at |buf| to queue_ and then drives the parser
// state machine for as long as a step makes progress (|result|) and no error
// is flagged (|err|):
//   - boxes are parsed with ParseBox();
//   - kWaitingForSampleData polls HaveEnoughDataToEnqueueSamples() before
//     moving on to kEmittingSamples;
//   - kEmittingSamples collects samples with EnqueueSample() into the local
//     audio/video queues.
// Buffers collected during the call are handed off through
// SendAndFlushSamples(). Must not be called before Init() (DCHECKed).
83 bool MP4StreamParser::Parse(const uint8
* buf
, int size
) {
84 DCHECK_NE(state_
, kWaitingForInit
);
89 queue_
.Push(buf
, size
);
91 BufferQueue audio_buffers
;
92 BufferQueue video_buffers
;
105 result
= ParseBox(&err
);
108 case kWaitingForSampleData
:
109 result
= HaveEnoughDataToEnqueueSamples();
111 ChangeState(kEmittingSamples
);
114 case kEmittingSamples
:
115 result
= EnqueueSample(&audio_buffers
, &video_buffers
, &err
);
// The offset reported by runs_ is added to moof_head_ to obtain a position in
// queue_; data below that position is discarded via
// ReadAndDiscardMDATsUntil().
117 int64 max_clear
= runs_
->GetMaxClearOffset() + moof_head_
;
118 err
= !ReadAndDiscardMDATsUntil(max_clear
);
122 } while (result
&& !err
);
// Deliver whatever was collected during this call.
125 err
= !SendAndFlushSamples(&audio_buffers
, &video_buffers
);
128 DLOG(ERROR
) << "Error while parsing MP4";
// Tries to read one top-level box from the front of queue_.
// Returns false when queue_ currently holds no data or when
// BoxReader::ReadTopLevelBox() yields no reader. For 'moov' and 'moof' boxes
// |*err| is set to the negated result of ParseMoov() / ParseMoof(); other box
// types are only logged as skipped. The bytes of a consumed box are popped
// from queue_.
138 bool MP4StreamParser::ParseBox(bool* err
) {
141 queue_
.Peek(&buf
, &size
);
142 if (!size
) return false;
144 scoped_ptr
<BoxReader
> reader(
145 BoxReader::ReadTopLevelBox(buf
, size
, media_log_
, err
));
146 if (reader
.get() == NULL
) return false;
148 if (reader
->type() == FOURCC_MOOV
) {
149 *err
= !ParseMoov(reader
.get());
150 } else if (reader
->type() == FOURCC_MOOF
) {
// Remember where this 'moof' starts in the queue: offsets reported by runs_
// are later added to moof_head_ when peeking at aux info and sample data.
151 moof_head_
= queue_
.head();
152 *err
= !ParseMoof(reader
.get());
154 // Set up first mdat offset for ReadMDATsUntil().
155 mdat_tail_
= queue_
.head() + reader
->size();
157 // Return early to avoid evicting 'moof' data from queue. Auxiliary info may
158 // be located anywhere in the file, including inside the 'moof' itself.
159 // (Since 'default-base-is-moof' is mandated, no data references can come
160 // before the head of the 'moof', so keeping this box around is sufficient.)
163 // TODO(wolenetz,chcunningham): Enforce more strict adherence to MSE byte
164 // stream spec for ftyp and styp. See http://crbug.com/504514.
165 DVLOG(2) << "Skipping unrecognized top-level box: "
166 << FourCCToString(reader
->type());
// Drop the bytes of the box that was just handled.
169 queue_
.Pop(reader
->size());
// Parses a 'moov' box into moov_ and derives the audio and video decoder
// configurations from its tracks. The configurations are reported through
// config_cb_, the stream's duration/liveness through init_cb_ (while it is
// still set), and any PSSH boxes in the moov through
// OnEncryptedMediaInitData(). RCHECK() failures abort the parse.
173 bool MP4StreamParser::ParseMoov(BoxReader
* reader
) {
174 moov_
.reset(new Movie
);
175 RCHECK(moov_
->Parse(reader
));
181 AudioDecoderConfig audio_config
;
182 VideoDecoderConfig video_config
;
// Walk every track. The IsValidConfig() checks further down mean that a track
// is only considered while no valid config of its kind has been built yet.
184 for (std::vector
<Track
>::const_iterator track
= moov_
->tracks
.begin();
185 track
!= moov_
->tracks
.end(); ++track
) {
186 // TODO(strobe): Only the first audio and video track present in a file are
187 // used. (Track selection is better accomplished via Source IDs, though, so
188 // adding support for track selection within a stream is low-priority.)
189 const SampleDescription
& samp_descr
=
190 track
->media
.information
.sample_table
.description
;
192 // TODO(strobe): When codec reconfigurations are supported, detect and send
193 // a codec reconfiguration for fragments using a sample description index
194 // different from the previous one
// Take the default sample description index from the track-extends entry
// whose track_id matches this track.
196 for (size_t t
= 0; t
< moov_
->extends
.tracks
.size(); t
++) {
197 const TrackExtends
& trex
= moov_
->extends
.tracks
[t
];
198 if (trex
.track_id
== track
->header
.track_id
) {
199 desc_idx
= trex
.default_sample_description_index
;
203 RCHECK(desc_idx
> 0);
204 desc_idx
-= 1; // BMFF descriptor index is one-based
206 if (track
->media
.handler
.type
== kAudio
&& !audio_config
.IsValidConfig()) {
207 RCHECK(!samp_descr
.audio_entries
.empty());
209 // It is not uncommon to find otherwise-valid files with incorrect sample
210 // description indices, so we fail gracefully in that case.
211 if (desc_idx
>= samp_descr
.audio_entries
.size())
213 const AudioSampleEntry
& entry
= samp_descr
.audio_entries
[desc_idx
];
214 const AAC
& aac
= entry
.esds
.aac
;
// Only 'mp4a' entries, or 'enca' entries whose original format is 'mp4a', are
// accepted.
216 if (!(entry
.format
== FOURCC_MP4A
||
217 (entry
.format
== FOURCC_ENCA
&&
218 entry
.sinf
.format
.format
== FOURCC_MP4A
))) {
219 MEDIA_LOG(ERROR
, media_log_
) << "Unsupported audio format 0x"
220 << std::hex
<< entry
.format
225 uint8 audio_type
= entry
.esds
.object_type
;
226 DVLOG(1) << "audio_type " << std::hex
<< static_cast<int>(audio_type
);
// The object type has to be one of the types this parser was constructed
// with.
227 if (audio_object_types_
.find(audio_type
) == audio_object_types_
.end()) {
228 MEDIA_LOG(ERROR
, media_log_
)
229 << "audio object type 0x" << std::hex
<< audio_type
230 << " does not match what is specified in the"
235 AudioCodec codec
= kUnknownAudioCodec
;
236 ChannelLayout channel_layout
= CHANNEL_LAYOUT_NONE
;
237 int sample_per_second
= 0;
238 std::vector
<uint8
> extra_data
;
239 // Check if it is MPEG4 AAC defined in ISO 14496 Part 3 or
240 // supported MPEG2 AAC variants.
241 if (ESDescriptor::IsAAC(audio_type
)) {
243 channel_layout
= aac
.GetChannelLayout(has_sbr_
);
244 sample_per_second
= aac
.GetOutputSamplesPerSecond(has_sbr_
);
245 #if defined(OS_ANDROID)
246 extra_data
= aac
.codec_specific_data();
249 MEDIA_LOG(ERROR
, media_log_
) << "Unsupported audio object type 0x"
250 << std::hex
<< audio_type
<< " in esds.";
// Map the sample entry's sample size onto a SampleFormat; only 8, 16 and 32
// are handled.
254 SampleFormat sample_format
;
255 if (entry
.samplesize
== 8) {
256 sample_format
= kSampleFormatU8
;
257 } else if (entry
.samplesize
== 16) {
258 sample_format
= kSampleFormatS16
;
259 } else if (entry
.samplesize
== 32) {
260 sample_format
= kSampleFormatS32
;
262 LOG(ERROR
) << "Unsupported sample size.";
266 is_audio_track_encrypted_
= entry
.sinf
.info
.track_encryption
.is_encrypted
;
267 DVLOG(1) << "is_audio_track_encrypted_: " << is_audio_track_encrypted_
;
268 audio_config
.Initialize(
269 codec
, sample_format
, channel_layout
, sample_per_second
,
270 extra_data
.size() ? &extra_data
[0] : NULL
, extra_data
.size(),
271 is_audio_track_encrypted_
, false, base::TimeDelta(),
274 audio_track_id_
= track
->header
.track_id
;
276 if (track
->media
.handler
.type
== kVideo
&& !video_config
.IsValidConfig()) {
277 RCHECK(!samp_descr
.video_entries
.empty());
278 if (desc_idx
>= samp_descr
.video_entries
.size())
280 const VideoSampleEntry
& entry
= samp_descr
.video_entries
[desc_idx
];
282 if (!entry
.IsFormatValid()) {
283 MEDIA_LOG(ERROR
, media_log_
) << "Unsupported video format 0x"
284 << std::hex
<< entry
.format
289 // TODO(strobe): Recover correct crop box
290 gfx::Size
coded_size(entry
.width
, entry
.height
);
291 gfx::Rect
visible_rect(coded_size
);
293 // If PASP is available, use the coded size and PASP to calculate the
294 // natural size. Otherwise, use the size in track header for natural size.
295 gfx::Size
natural_size(visible_rect
.size());
296 if (entry
.pixel_aspect
.h_spacing
!= 1 ||
297 entry
.pixel_aspect
.v_spacing
!= 1) {
299 GetNaturalSize(visible_rect
.size(), entry
.pixel_aspect
.h_spacing
,
300 entry
.pixel_aspect
.v_spacing
);
301 } else if (track
->header
.width
&& track
->header
.height
) {
302 // An even width makes things easier for YV12 and appears to be the
303 // behavior expected by WebKit layout tests. See GetNaturalSize().
305 gfx::Size(track
->header
.width
& ~1, track
->header
.height
);
308 is_video_track_encrypted_
= entry
.sinf
.info
.track_encryption
.is_encrypted
;
309 DVLOG(1) << "is_video_track_encrypted_: " << is_video_track_encrypted_
;
310 video_config
.Initialize(kCodecH264
, H264PROFILE_MAIN
, PIXEL_FORMAT_YV12
,
311 COLOR_SPACE_HD_REC709
, coded_size
, visible_rect
,
313 // No decoder-specific buffer needed for AVC;
314 // SPS/PPS are embedded in the video stream
315 NULL
, 0, is_video_track_encrypted_
, false);
317 video_track_id_
= track
->header
.track_id
;
321 RCHECK(config_cb_
.Run(audio_config
, video_config
, TextTrackConfigMap()));
// Work out duration and liveness for the init callback: a positive fragment
// duration from moov_->extends.header is checked first, then a positive movie
// header duration other than kuint64max; otherwise the stream is reported as
// live.
323 StreamParser::InitParameters
params(kInfiniteDuration());
324 if (moov_
->extends
.header
.fragment_duration
> 0) {
325 params
.duration
= TimeDeltaFromRational(
326 moov_
->extends
.header
.fragment_duration
, moov_
->header
.timescale
);
327 params
.liveness
= DemuxerStream::LIVENESS_RECORDED
;
328 } else if (moov_
->header
.duration
> 0 &&
329 moov_
->header
.duration
!= kuint64max
) {
331 TimeDeltaFromRational(moov_
->header
.duration
, moov_
->header
.timescale
);
332 params
.liveness
= DemuxerStream::LIVENESS_RECORDED
;
334 // In ISO/IEC 14496-12:2005(E), 8.30.2: ".. If an MP4 file is created in
335 // real-time, such as used in live streaming, it is not likely that the
336 // fragment_duration is known in advance and this (mehd) box may be
338 // TODO(wolenetz): Investigate gating liveness detection on timeline_offset
339 // when it's populated. See http://crbug.com/312699
340 params
.liveness
= DemuxerStream::LIVENESS_LIVE
;
343 DVLOG(1) << "liveness: " << params
.liveness
;
// init_cb_ is only run while it is still set; it is passed through
// base::ResetAndReturn() when invoked.
345 if (!init_cb_
.is_null())
346 base::ResetAndReturn(&init_cb_
).Run(params
);
348 if (!moov_
->pssh
.empty())
349 OnEncryptedMediaInitData(moov_
->pssh
);
// Parses a 'moof' (movie fragment) box. Requires that a 'moov' has already
// been parsed into moov_. Prepares runs_ for iterating the fragment's track
// runs, recomputes highest_end_offset_, forwards any PSSH boxes found in the
// fragment, notifies new_segment_cb_ and switches to kWaitingForSampleData.
// RCHECK() failures abort the parse.
// NOTE(review): the declaration of |moof| is not visible here -- presumably a
// MovieFragment, matching the parameter of ComputeHighestEndOffset(); confirm.
354 bool MP4StreamParser::ParseMoof(BoxReader
* reader
) {
355 RCHECK(moov_
.get()); // Must already have initialization segment
357 RCHECK(moof
.Parse(reader
));
359 runs_
.reset(new TrackRunIterator(moov_
.get(), media_log_
));
360 RCHECK(runs_
->Init(moof
));
361 RCHECK(ComputeHighestEndOffset(moof
));
363 if (!moof
.pssh
.empty())
364 OnEncryptedMediaInitData(moof
.pssh
);
366 new_segment_cb_
.Run();
367 ChangeState(kWaitingForSampleData
);
371 void MP4StreamParser::OnEncryptedMediaInitData(
372 const std::vector
<ProtectionSystemSpecificHeader
>& headers
) {
373 // TODO(strobe): ensure that the value of init_data (all PSSH headers
374 // concatenated in arbitrary order) matches the EME spec.
375 // See https://www.w3.org/Bugs/Public/show_bug.cgi?id=17673.
376 size_t total_size
= 0;
377 for (size_t i
= 0; i
< headers
.size(); i
++)
378 total_size
+= headers
[i
].raw_box
.size();
380 std::vector
<uint8
> init_data(total_size
);
382 for (size_t i
= 0; i
< headers
.size(); i
++) {
383 memcpy(&init_data
[pos
], &headers
[i
].raw_box
[0],
384 headers
[i
].raw_box
.size());
385 pos
+= headers
[i
].raw_box
.size();
387 encrypted_media_init_data_cb_
.Run(EmeInitDataType::CENC
, init_data
);
390 bool MP4StreamParser::PrepareAVCBuffer(
391 const AVCDecoderConfigurationRecord
& avc_config
,
392 std::vector
<uint8
>* frame_buf
,
393 std::vector
<SubsampleEntry
>* subsamples
) const {
394 // Convert the AVC NALU length fields to Annex B headers, as expected by
395 // decoding libraries. Since this may enlarge the size of the buffer, we also
396 // update the clear byte count for each subsample if encryption is used to
397 // account for the difference in size between the length prefix and Annex B
399 RCHECK(AVC::ConvertFrameToAnnexB(avc_config
.length_size
, frame_buf
));
400 if (!subsamples
->empty()) {
401 const int nalu_size_diff
= 4 - avc_config
.length_size
;
402 size_t expected_size
= runs_
->sample_size() +
403 subsamples
->size() * nalu_size_diff
;
404 RCHECK(frame_buf
->size() == expected_size
);
405 for (size_t i
= 0; i
< subsamples
->size(); i
++)
406 (*subsamples
)[i
].clear_bytes
+= nalu_size_diff
;
409 if (runs_
->is_keyframe()) {
410 // If this is a keyframe, we (re-)inject SPS and PPS headers at the start of
411 // a frame. If subsample info is present, we also update the clear byte
412 // count for that first subsample.
413 RCHECK(AVC::InsertParamSetsAnnexB(avc_config
, frame_buf
, subsamples
));
416 DCHECK(AVC::IsValidAnnexB(*frame_buf
, *subsamples
));
420 bool MP4StreamParser::PrepareAACBuffer(
421 const AAC
& aac_config
, std::vector
<uint8
>* frame_buf
,
422 std::vector
<SubsampleEntry
>* subsamples
) const {
423 // Append an ADTS header to every audio sample.
424 RCHECK(aac_config
.ConvertEsdsToADTS(frame_buf
));
426 // As above, adjust subsample information to account for the headers. AAC is
427 // not required to use subsample encryption, so we may need to add an entry.
428 if (subsamples
->empty()) {
429 subsamples
->push_back(SubsampleEntry(
430 kADTSHeaderMinSize
, frame_buf
->size() - kADTSHeaderMinSize
));
432 (*subsamples
)[0].clear_bytes
+= kADTSHeaderMinSize
;
// Emits at most one sample from the current track run into |audio_buffers|
// or |video_buffers|. Only valid in the kEmittingSamples state (DCHECKed).
//
// When runs_ has no valid run left, pending buffers are flushed, queue_ is
// trimmed up to mdat_tail_, and once that succeeds the parser returns to
// kParsingBoxes and end_of_segment_cb_ is run. Otherwise the sample's bytes
// (located at sample_offset() + moof_head_ in queue_) are copied, rewritten
// via PrepareAVCBuffer()/PrepareAACBuffer(), wrapped in a StreamParserBuffer
// with duration, timestamps and decrypt config attached, and pushed onto the
// matching queue before runs_ advances to the next sample.
// Returns false when queue_ does not yet hold enough data.
437 bool MP4StreamParser::EnqueueSample(BufferQueue
* audio_buffers
,
438 BufferQueue
* video_buffers
,
440 DCHECK_EQ(state_
, kEmittingSamples
);
442 if (!runs_
->IsRunValid()) {
443 // Flush any buffers we've gotten in this chunk so that buffers don't
444 // cross NewSegment() calls
445 *err
= !SendAndFlushSamples(audio_buffers
, video_buffers
);
449 // Remain in kEmittingSamples state, discarding data, until the end of
450 // the current 'mdat' box has been appended to the queue.
451 if (!queue_
.Trim(mdat_tail_
))
454 ChangeState(kParsingBoxes
);
455 end_of_segment_cb_
.Run();
459 if (!runs_
->IsSampleValid()) {
468 queue_
.Peek(&buf
, &buf_size
);
469 if (!buf_size
) return false;
// A run belongs to the audio or video stream only if that stream is present
// and the run's track id matches the id recorded for it.
471 bool audio
= has_audio_
&& audio_track_id_
== runs_
->track_id();
472 bool video
= has_video_
&& video_track_id_
== runs_
->track_id();
474 // Skip this entire track if it's not one we're interested in
475 if (!audio
&& !video
) {
480 // Attempt to cache the auxiliary information first. Aux info is usually
481 // placed in a contiguous block before the sample data, rather than being
482 // interleaved. If we didn't cache it, this would require that we retain the
483 // start of the segment buffer while reading samples. Aux info is typically
484 // quite small compared to sample data, so this pattern is useful on
485 // memory-constrained devices where the source buffer consumes a substantial
486 // portion of the total system memory.
487 if (runs_
->AuxInfoNeedsToBeCached()) {
488 queue_
.PeekAt(runs_
->aux_info_offset() + moof_head_
, &buf
, &buf_size
);
489 if (buf_size
< runs_
->aux_info_size()) return false;
490 *err
= !runs_
->CacheAuxInfo(buf
, buf_size
);
// Locate the sample's bytes; bail out until the whole sample is queued.
494 queue_
.PeekAt(runs_
->sample_offset() + moof_head_
, &buf
, &buf_size
);
495 if (buf_size
< runs_
->sample_size()) return false;
497 scoped_ptr
<DecryptConfig
> decrypt_config
;
498 std::vector
<SubsampleEntry
> subsamples
;
499 if (runs_
->is_encrypted()) {
500 decrypt_config
= runs_
->GetDecryptConfig();
501 if (!decrypt_config
) {
505 subsamples
= decrypt_config
->subsamples();
// Work on a private copy of the sample bytes: the Prepare*Buffer() helpers
// below rewrite |frame_buf| and |subsamples| in place.
508 std::vector
<uint8
> frame_buf(buf
, buf
+ runs_
->sample_size());
510 if (!PrepareAVCBuffer(runs_
->video_description().avcc
,
511 &frame_buf
, &subsamples
)) {
512 MEDIA_LOG(ERROR
, media_log_
) << "Failed to prepare AVC sample for decode";
519 if (ESDescriptor::IsAAC(runs_
->audio_description().esds
.object_type
) &&
520 !PrepareAACBuffer(runs_
->audio_description().esds
.aac
,
521 &frame_buf
, &subsamples
)) {
522 MEDIA_LOG(ERROR
, media_log_
) << "Failed to prepare AAC sample for decode";
528 if (decrypt_config
) {
529 if (!subsamples
.empty()) {
530 // Create a new config with the updated subsamples.
531 decrypt_config
.reset(new DecryptConfig(
532 decrypt_config
->key_id(),
533 decrypt_config
->iv(),
536 // else, use the existing config.
537 } else if ((audio
&& is_audio_track_encrypted_
) ||
538 (video
&& is_video_track_encrypted_
)) {
539 // The media pipeline requires a DecryptConfig with an empty |iv|.
540 // TODO(ddorwin): Refactor so we do not need a fake key ID ("1");
541 decrypt_config
.reset(
542 new DecryptConfig("1", "", std::vector
<SubsampleEntry
>()));
545 StreamParserBuffer::Type buffer_type
= audio
? DemuxerStream::AUDIO
:
546 DemuxerStream::VIDEO
;
548 // TODO(wolenetz/acolwell): Validate and use a common cross-parser TrackId
549 // type and allow multiple tracks for same media type, if applicable. See
550 // https://crbug.com/341581.
552 // NOTE: MPEG's "random access point" concept is equivalent to the
553 // downstream code's "is keyframe" concept.
554 scoped_refptr
<StreamParserBuffer
> stream_buf
=
555 StreamParserBuffer::CopyFrom(&frame_buf
[0], frame_buf
.size(),
556 runs_
->is_random_access_point(),
560 stream_buf
->set_decrypt_config(decrypt_config
.Pass());
562 stream_buf
->set_duration(runs_
->duration());
563 stream_buf
->set_timestamp(runs_
->cts());
564 stream_buf
->SetDecodeTimestamp(runs_
->dts());
566 DVLOG(3) << "Pushing frame: aud=" << audio
567 << ", key=" << runs_
->is_keyframe()
568 << ", rap=" << runs_
->is_random_access_point()
569 << ", dur=" << runs_
->duration().InMilliseconds()
570 << ", dts=" << runs_
->dts().InMilliseconds()
571 << ", cts=" << runs_
->cts().InMilliseconds()
572 << ", size=" << runs_
->sample_size();
575 audio_buffers
->push_back(stream_buf
);
577 video_buffers
->push_back(stream_buf
);
// Move on to the next sample of the run.
580 runs_
->AdvanceSample();
// Hands the buffers accumulated in |audio_buffers| and |video_buffers| to
// new_buffers_cb_ and then empties both queues. The callback's result is
// captured in |success|.
// NOTE(review): the statement guarded by the "both queues empty" check is not
// visible here -- presumably an early return that skips the callback; confirm.
584 bool MP4StreamParser::SendAndFlushSamples(BufferQueue
* audio_buffers
,
585 BufferQueue
* video_buffers
) {
586 if (audio_buffers
->empty() && video_buffers
->empty())
589 TextBufferQueueMap empty_text_map
;
590 bool success
= new_buffers_cb_
.Run(*audio_buffers
,
593 audio_buffers
->clear();
594 video_buffers
->clear();
// Advances mdat_tail_ across top-level boxes for as long as it lies below
// min(|max_clear_offset|, queue_.tail()), then trims queue_ up to
// min(mdat_tail_, upper_bound). Each box header is read with
// BoxReader::StartTopLevelBox(); a box that is not an 'mdat' is logged but
// still stepped over.
598 bool MP4StreamParser::ReadAndDiscardMDATsUntil(int64 max_clear_offset
) {
// Never look beyond the data that has actually been queued.
600 int64 upper_bound
= std::min(max_clear_offset
, queue_
.tail());
601 while (mdat_tail_
< upper_bound
) {
602 const uint8
* buf
= NULL
;
604 queue_
.PeekAt(mdat_tail_
, &buf
, &size
);
608 if (!BoxReader::StartTopLevelBox(buf
, size
, media_log_
, &type
, &box_sz
,
612 if (type
!= FOURCC_MDAT
) {
613 MEDIA_LOG(DEBUG
, media_log_
)
614 << "Unexpected box type while parsing MDATs: "
615 << FourCCToString(type
);
// Step to the start of the next top-level box.
617 mdat_tail_
+= box_sz
;
619 queue_
.Trim(std::min(mdat_tail_
, upper_bound
));
623 void MP4StreamParser::ChangeState(State new_state
) {
624 DVLOG(2) << "Changing state: " << new_state
;
628 bool MP4StreamParser::HaveEnoughDataToEnqueueSamples() {
629 DCHECK_EQ(state_
, kWaitingForSampleData
);
630 // For muxed content, make sure we have data up to |highest_end_offset_|
631 // so we can ensure proper enqueuing behavior. Otherwise assume we have enough
632 // data and allow per sample offset checks to meter sample enqueuing.
633 // TODO(acolwell): Fix trun box handling so we don't have to special case
635 return !(has_audio_
&& has_video_
&&
636 queue_
.tail() < highest_end_offset_
+ moof_head_
);
639 bool MP4StreamParser::ComputeHighestEndOffset(const MovieFragment
& moof
) {
640 highest_end_offset_
= 0;
642 TrackRunIterator
runs(moov_
.get(), media_log_
);
643 RCHECK(runs
.Init(moof
));
645 while (runs
.IsRunValid()) {
646 int64 aux_info_end_offset
= runs
.aux_info_offset() + runs
.aux_info_size();
647 if (aux_info_end_offset
> highest_end_offset_
)
648 highest_end_offset_
= aux_info_end_offset
;
650 while (runs
.IsSampleValid()) {
651 int64 sample_end_offset
= runs
.sample_offset() + runs
.sample_size();
652 if (sample_end_offset
> highest_end_offset_
)
653 highest_end_offset_
= sample_end_offset
;
655 runs
.AdvanceSample();