[ie/facebook] Support more groups URLs (#11576)
[yt-dlp3.git] / yt_dlp / extractor / nbc.py
blob8f6fb22b173e703766c75c930149477384b61814
1 import base64
2 import json
3 import re
4 import urllib.parse
5 import xml.etree.ElementTree
7 from .adobepass import AdobePassIE
8 from .common import InfoExtractor
9 from .theplatform import ThePlatformIE, default_ns
10 from ..networking import HEADRequest
11 from ..utils import (
12 ExtractorError,
13 RegexNotFoundError,
14 UserNotLive,
15 clean_html,
16 determine_ext,
17 float_or_none,
18 int_or_none,
19 join_nonempty,
20 mimetype2ext,
21 parse_age_limit,
22 parse_duration,
23 remove_end,
24 smuggle_url,
25 traverse_obj,
26 try_get,
27 unescapeHTML,
28 unified_timestamp,
29 update_url_query,
30 url_basename,
class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
    # Matches nbc.com video pages; `permalink` captures everything after the
    # scheme so it can be replayed to the GraphQL API, `id` is the trailing
    # numeric / NBCE-prefixed / n-prefixed identifier.
    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))'

    _TESTS = [
        {
            'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237',
            'info_dict': {
                'id': '2848237',
                'ext': 'mp4',
                'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
                'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
                'timestamp': 1424246400,
                'upload_date': '20150218',
                'uploader': 'NBCU-COM',
                'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
                'episode_number': 86,
                'season': 'Season 2',
                'season_number': 2,
                'series': 'Tonight Show: Jimmy Fallon',
                'duration': 237.0,
                'chapters': 'count:1',
                'tags': 'count:4',
                'thumbnail': r're:https?://.+\.jpg',
                'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
                'media_type': 'Full Episode',
            },
            'params': {
                'skip_download': 'm3u8',
            },
        },
        {
            'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
            'info_dict': {
                'id': '2832821',
                'ext': 'mp4',
                'title': 'Star Wars Teaser',
                'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
                'timestamp': 1417852800,
                'upload_date': '20141206',
                'uploader': 'NBCU-COM',
            },
            'skip': 'page not found',
        },
        {
            # HLS streams requires the 'hdnea3' cookie
            'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
            'info_dict': {
                'id': '101528f5a9e8127b107e98c5e6ce4638',
                'ext': 'mp4',
                'title': 'Goliath',
                'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
                'timestamp': 1237100400,
                'upload_date': '20090315',
                'uploader': 'NBCU-COM',
            },
            'skip': 'page not found',
        },
        {
            # manifest url does not have extension
            'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439',
            'info_dict': {
                'id': '3646439',
                'ext': 'mp4',
                'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
                'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
                'episode_number': 1,
                'season': 'Season 75',
                'season_number': 75,
                'series': 'The Golden Globe Awards',
                'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.',
                'uploader': 'NBCU-COM',
                'upload_date': '20180107',
                'timestamp': 1515312000,
                'duration': 570.0,
                'tags': 'count:8',
                'thumbnail': r're:https?://.+\.jpg',
                'chapters': 'count:1',
            },
            'params': {
                'skip_download': 'm3u8',
            },
        },
        {
            # new video_id format
            'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978',
            'info_dict': {
                'id': 'NBCE125189978',
                'ext': 'mp4',
                'title': 'Ben\'s First Leap | NBC\'s Quantum Leap',
                'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e',
                'uploader': 'NBCU-COM',
                'series': 'Quantum Leap',
                'season': 'Season 1',
                'season_number': 1,
                'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap',
                'episode_number': 1,
                'duration': 170.171,
                'chapters': [],
                'timestamp': 1663956155,
                'upload_date': '20220923',
                'tags': 'count:10',
                'age_limit': 0,
                'thumbnail': r're:https?://.+\.jpg',
                'categories': ['Series/Quantum Leap 2022'],
                'media_type': 'Highlight',
            },
            'params': {
                'skip_download': 'm3u8',
            },
        },
        {
            'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310',
            'only_matching': True,
        },
        {
            # Percent escaped url
            'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Resolve an nbc.com video page to a ThePlatform `url_transparent` result.

        Page metadata is fetched from the NBC GraphQL endpoint and merged with
        ThePlatform metadata; GraphQL values win unless they are ``None``.
        For entries flagged ``locked`` an MVPD auth token is added to the
        ThePlatform query.
        """
        permalink, video_id = self._match_valid_url(url).groups()
        # The API is queried with an unescaped, plain-http form of the page URL
        permalink = 'http' + urllib.parse.unquote(permalink)
        video_data = self._download_json(
            'https://friendship.nbc.co/v2/graphql', video_id, query={
                'query': '''query bonanzaPage(
  $app: NBCUBrands! = nbc
  $name: String!
  $oneApp: Boolean
  $platform: SupportedPlatforms! = web
  $type: EntityPageType! = VIDEO
  $userId: String!
) {
  bonanzaPage(
    app: $app
    name: $name
    oneApp: $oneApp
    platform: $platform
    type: $type
    userId: $userId
  ) {
    metadata {
      ... on VideoPageData {
        description
        episodeNumber
        keywords
        locked
        mpxAccountId
        mpxGuid
        rating
        resourceId
        seasonNumber
        secondaryTitle
        seriesShortTitle
      }
    }
  }
}''',
                'variables': json.dumps({
                    'name': permalink,
                    'oneApp': True,
                    'userId': '0',
                }),
            })['data']['bonanzaPage']['metadata']
        query = {
            'mbr': 'true',
            'manifest': 'm3u',
            'switch': 'HLSServiceSecure',
        }
        # From here on the MPX GUID replaces the id taken from the URL
        video_id = video_data['mpxGuid']
        tp_path = 'NnzsPC/media/guid/{}/{}'.format(video_data.get('mpxAccountId') or '2410887629', video_id)
        tpm = self._download_theplatform_metadata(tp_path, video_id)
        title = tpm.get('title') or video_data.get('secondaryTitle')
        if video_data.get('locked'):
            resource = self._get_mvpd_resource(
                video_data.get('resourceId') or 'nbcentertainment',
                title, video_id, video_data.get('rating'))
            query['auth'] = self._extract_mvpd_auth(
                url, video_id, 'nbcentertainment', resource)
        # Same path as the metadata request above, so reuse tp_path
        theplatform_url = smuggle_url(update_url_query(
            f'http://link.theplatform.com/s/{tp_path}',
            query), {'force_smil_url': True})

        # Empty string or 0 can be valid values for these. So the check must be `is None`
        description = video_data.get('description')
        if description is None:
            description = tpm.get('description')
        episode_number = int_or_none(video_data.get('episodeNumber'))
        if episode_number is None:
            episode_number = int_or_none(tpm.get('nbcu$airOrder'))
        rating = video_data.get('rating')
        if rating is None:
            # The fallback result was previously computed but never assigned
            rating = try_get(tpm, lambda x: x['ratings'][0]['rating'])
        season_number = int_or_none(video_data.get('seasonNumber'))
        if season_number is None:
            season_number = int_or_none(tpm.get('nbcu$seasonNumber'))
        series = video_data.get('seriesShortTitle')
        if series is None:
            series = tpm.get('nbcu$seriesShortTitle')
        # Unlike the fields above, an empty keyword list also triggers the fallback
        tags = video_data.get('keywords')
        if not tags:
            tags = tpm.get('keywords')

        return {
            '_type': 'url_transparent',
            'age_limit': parse_age_limit(rating),
            'description': description,
            'episode': title,
            'episode_number': episode_number,
            'id': video_id,
            'ie_key': 'ThePlatform',
            'season_number': season_number,
            'series': series,
            'tags': tags,
            'title': title,
            'url': theplatform_url,
        }
class NBCSportsVPlayerIE(InfoExtractor):
    # Shared prefix for both player hosts; also interpolated into _EMBED_REGEX
    _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/'
    _VALID_URL = rf'{_VALID_URL_BASE}(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
    # Player URLs referenced from iframe src, `var video...src`, or div data-src / data-mpx-src
    _EMBED_REGEX = [rf'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>{_VALID_URL_BASE}[^\"]+)']

    _TESTS = [{
        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
        'info_dict': {
            'id': '9CsDKds0kvHI',
            'ext': 'mp4',
            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
            'timestamp': 1426270238,
            'upload_date': '20150313',
            'uploader': 'NBCU-SPORTS',
            'duration': 72.818,
            'chapters': [],
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2',
        'only_matching': True,
    }, {
        'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Read the `tp:releaseUrl` attribute from the player page and delegate to ThePlatform."""
        player_id = self._match_id(url)
        player_page = self._download_webpage(url, player_id)
        release_url = self._html_search_regex(
            r'tp:releaseUrl="(.+?)"', player_page, 'url')
        return self.url_result(release_url, 'ThePlatform')
class NBCSportsIE(InfoExtractor):
    # Any nbcsports.com page except the /vplayer/ paths
    _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P<id>[0-9a-z-]+)'

    _TESTS = [{
        # iframe src
        'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation',
        'info_dict': {
            'id': 'PHJSaFWbrTY9',
            'ext': 'mp4',
            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
            'uploader': 'NBCU-SPORTS',
            'upload_date': '20150330',
            'timestamp': 1427726529,
            'chapters': [],
            'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg',
            'duration': 528.395,
        },
    }, {
        # data-mpx-src
        'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot',
        'only_matching': True,
    }, {
        # data-src
        'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Locate the embedded vplayer URL in the page and hand it to NBCSportsVPlayer."""
        page_id = self._match_id(url)
        page = self._download_webpage(url, page_id)
        embed_url = NBCSportsVPlayerIE._extract_url(page)
        return self.url_result(embed_url, 'NBCSportsVPlayer')
class NBCSportsStreamIE(AdobePassIE):
    # `id` is the numeric `pid` query parameter of the stream page
    _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P<id>\d+)'
    _TEST = {
        'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559',
        'info_dict': {
            'id': '206559',
            'ext': 'mp4',
            'title': 'Amgen Tour of California Women\'s Recap',
            'description': 'md5:66520066b3b5281ada7698d0ea2aa894',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Requires Adobe Pass Authentication',
    }

    def _real_extract(self, url):
        """Extract an Adobe Pass protected NBC Sports stream.

        Downloads the live-sources JSON for the pid, picks a source URL,
        obtains an MVPD token, exchanges it for a tokenized URL and extracts
        the HLS formats from that.
        """
        video_id = self._match_id(url)
        live_source = self._download_json(
            f'http://stream.nbcsports.com/data/live_sources_{video_id}.json',
            video_id)
        video_source = live_source['videoSources'][0]
        title = video_source['title']

        # Try each known source key (and its "Alt" variant) in priority order,
        # falling back to the mandatory ottStreamUrl when none is set
        source_url = None
        for k in ('source', 'msl4source', 'iossource', 'hlsv4'):
            sk = k + 'Url'
            source_url = video_source.get(sk) or video_source.get(sk + 'Alt')
            if source_url:
                break
        else:
            source_url = video_source['ottStreamUrl']

        is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live'
        resource = self._get_mvpd_resource('nbcsports', title, video_id, '')
        token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource)
        tokenized_url = self._download_json(
            'https://token.playmakerservices.com/cdn',
            video_id, data=json.dumps({
                'requestorId': 'nbcsports',
                'pid': video_id,
                'application': 'NBCSports',
                'version': 'v1',
                'platform': 'desktop',
                'cdn': 'akamai',
                # Send the URL selected above. This used to read
                # video_source['sourceUrl'] directly, which ignored the
                # fallbacks and raised KeyError when that key was absent.
                'url': source_url,
                'token': base64.b64encode(token.encode()).decode(),
                'resourceId': base64.b64encode(resource.encode()).decode(),
            }).encode())['tokenizedUrl']
        formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4')
        return {
            'id': video_id,
            'title': title,
            'description': live_source.get('description'),
            'formats': formats,
            'is_live': is_live,
        }
class NBCNewsIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
    # Covers nbcnews.com, today.com and msnbc.com; `id` is the last path
    # component after its final hyphen (if any)
    _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1']

    _TESTS = [
        {
            'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
            'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610',  # md5 tends to fluctuate
            'info_dict': {
                'id': '269389891880',
                'ext': 'mp4',
                'title': 'How Twitter Reacted To The Snowden Interview',
                'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
                'timestamp': 1401363060,
                'upload_date': '20140529',
                'duration': 46.0,
                'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg',
            },
        },
        {
            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
            'info_dict': {
                'id': '529953347624',
                'ext': 'mp4',
                'title': 'FULL EPISODE: Family Business',
                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
            },
            'skip': 'This page is unavailable.',
        },
        {
            'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
            'md5': '40d0e48c68896359c80372306ece0fc3',
            'info_dict': {
                'id': '394064451844',
                'ext': 'mp4',
                'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
                'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
                'timestamp': 1423104900,
                'upload_date': '20150205',
                'duration': 1236.0,
                'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg',
            },
        },
        {
            'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
            'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939',
            'info_dict': {
                'id': 'n431456',
                'ext': 'mp4',
                'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'",
                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
                'upload_date': '20150922',
                'timestamp': 1442917800,
                'duration': 37.0,
                'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg',
            },
        },
        {
            'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
            'md5': '693d1fa21d23afcc9b04c66b227ed9ff',
            'info_dict': {
                'id': '669831235788',
                'ext': 'mp4',
                'title': 'See the aurora borealis from space in stunning new NASA video',
                'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
                'upload_date': '20160420',
                'timestamp': 1461152093,
                'duration': 69.0,
                'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg',
            },
        },
        {
            'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
            'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
            'info_dict': {
                'id': '314487875924',
                'ext': 'mp4',
                'title': 'The chaotic GOP immigration vote',
                'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
                'thumbnail': r're:^https?://.*\.jpg$',
                'timestamp': 1406937606,
                'upload_date': '20140802',
                'duration': 940.0,
            },
        },
        {
            'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
            'only_matching': True,
        },
        {
            # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html
            'url': 'http://www.nbcnews.com/widget/video-embed/701714499682',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Extract formats, captions and metadata from the page's Next.js state."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        initial_state = self._search_nextjs_data(webpage, video_id)['props']['initialState']
        # Prefer the standalone video entry; otherwise use the article's primary media
        video_data = (
            try_get(initial_state, lambda x: x['video']['current'], dict)
            or initial_state['article']['content'][0]['primaryMedia']['video'])
        title = video_data['headline']['primary']

        formats = []
        for asset in video_data.get('videoAssets', []):
            asset_url = asset.get('publicUrl')
            if not asset_url:
                continue
            if '://link.theplatform.com/' in asset_url:
                asset_url = update_url_query(asset_url, {'format': 'redirect'})
            format_id = asset.get('format')
            if format_id != 'M3U':
                # Direct asset: bitrate is divided by 1000 before use as tbr
                tbr = int_or_none(asset.get('bitrate'), 1000)
                formats.append({
                    'format_id': join_nonempty(format_id, tbr),
                    'url': asset_url,
                    'width': int_or_none(asset.get('width')),
                    'height': int_or_none(asset.get('height')),
                    'tbr': tbr,
                    'ext': 'mp4',
                })
                continue
            formats.extend(self._extract_m3u8_formats(
                asset_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id=format_id, fatal=False))

        # Every non-empty caption URL is listed under 'en'
        subtitles = {}
        closed_captioning = video_data.get('closedCaptioning')
        if closed_captioning:
            caption_urls = [cc_url for cc_url in closed_captioning.values() if cc_url]
            if caption_urls:
                subtitles['en'] = [{'url': cc_url} for cc_url in caption_urls]

        return {
            'id': video_id,
            'title': title,
            'description': try_get(video_data, lambda x: x['description']['primary']),
            'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']),
            'duration': parse_duration(video_data.get('duration')),
            'timestamp': unified_timestamp(video_data.get('datePublished')),
            'formats': formats,
            'subtitles': subtitles,
        }
class NBCOlympicsIE(InfoExtractor):
    IE_NAME = 'nbcolympics'
    _VALID_URL = r'https?://www\.nbcolympics\.com/videos?/(?P<id>[0-9a-z-]+)'

    _TEST = {
        # Geo-restricted to US
        'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold',
        'md5': '54fecf846d05429fbaa18af557ee523a',
        'info_dict': {
            'id': 'WjTBzDXx5AUq',
            'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold',
            'ext': 'mp4',
            'title': 'Rose\'s son Leo was in tears after his dad won gold',
            'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.',
            'timestamp': 1471274964,
            'upload_date': '20160815',
            'uploader': 'NBCU-SPORTS',
        },
        'skip': '404 Not Found',
    }

    def _real_extract(self, url):
        """Find the ThePlatform player URL on the page and return a `url_transparent` result.

        The URL comes from the Drupal settings blob when present; if that
        blob cannot be found, the page's `embedUrl` value is used instead.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        try:
            settings_json = self._search_regex(
                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
                webpage, 'drupal settings')
            drupal_settings = self._parse_json(settings_json, display_id)
            # Rewrite the vplayer iframe host to the ThePlatform player host
            theplatform_url = drupal_settings['vod']['iframe_url'].replace(
                'vplayer.nbcolympics.com', 'player.theplatform.com')
        except RegexNotFoundError:
            theplatform_url = self._search_regex(
                r"([\"'])embedUrl\1: *([\"'])(?P<embedUrl>.+)\2",
                webpage, 'embedding URL', group='embedUrl')

        return {
            '_type': 'url_transparent',
            'url': theplatform_url,
            'ie_key': ThePlatformIE.ie_key(),
            'display_id': display_id,
        }
class NBCOlympicsStreamIE(AdobePassIE):
    IE_NAME = 'nbcolympics:stream'
    _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)'
    _TESTS = [
        {
            'note': 'Tokenized m3u8 source URL',
            'url': 'https://stream.nbcolympics.com/womens-soccer-group-round-11',
            'info_dict': {
                'id': '2019740',
                'ext': 'mp4',
                'title': r"re:Women's Group Stage - Netherlands vs\. Brazil [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$",
            },
            'params': {
                'skip_download': 'm3u8',
            },
            'skip': 'Livestream',
        }, {
            'note': 'Plain m3u8 source URL',
            'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars',
            'info_dict': {
                'id': '2021729',
                'ext': 'mp4',
                'title': r're:Event Finals: M Floor, W Vault, M Pommel, W Uneven Bars [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            },
            'params': {
                'skip_download': 'm3u8',
            },
            'skip': 'Livestream',
        },
    ]

    def _real_extract(self, url):
        """Extract an NBC Olympics stream, tokenizing the source URL when the event requires it."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid')

        config_response = self._download_json(
            f'http://stream.nbcolympics.com/data/event_config_{pid}.json',
            pid, 'Downloading event config')
        event_config = config_response['eventConfig']

        title = event_config['eventTitle']
        # None for any status other than 'live' / 'replay'
        is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus'))

        leap_config = self._download_json(
            f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging',
            pid, 'Downloading leap config',
        )
        source_url = leap_config['videoSources'][0]['cdnSources']['primary'][0]['sourceUrl']

        if event_config.get('cdnToken'):
            # The event title is stripped of everything but word characters
            # and spaces before being used in the MVPD resource
            resource_title = re.sub(r'[^\w\d ]+', '', event_config['eventTitle'])
            ap_resource = self._get_mvpd_resource(
                event_config.get('resourceId', 'NBCOlympics'),
                resource_title, pid,
                event_config.get('ratingId', 'NO VALUE'))
            media_token = self._extract_mvpd_auth(
                url, pid, event_config.get('requestorId', 'NBCOlympics'), ap_resource)

            token_payload = {
                'application': 'NBCSports',
                'authentication-type': 'adobe-pass',
                'cdn': 'akamai',
                'pid': pid,
                'platform': 'desktop',
                'requestorId': 'NBCOlympics',
                'resourceId': base64.b64encode(ap_resource.encode()).decode(),
                'token': base64.b64encode(media_token.encode()).decode(),
                'url': source_url,
                'version': 'v1',
            }
            source_url = self._download_json(
                'https://tokens.playmakerservices.com/', pid, 'Retrieving tokenized URL',
                data=json.dumps(token_payload).encode(),
            )['akamai'][0]['tokenizedUrl']

        formats = self._extract_m3u8_formats(source_url, pid, 'mp4', live=is_live)
        for fmt in formats:
            # -http_seekable requires ffmpeg 4.3+ but it doesnt seem possible to
            # download with ffmpeg without this option
            fmt['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']}

        return {
            'id': pid,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'is_live': is_live,
        }
class NBCStationsIE(InfoExtractor):
    # Alternation of the supported station domain names (without the `.com`)
    _DOMAIN_RE = '|'.join(map(re.escape, (
        'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles',
        'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington',
        'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra',
    )))
    _VALID_URL = rf'https?://(?:www\.)?(?P<site>{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P<id>[^/?#]+)/?(?:$|[#?])'

    _TESTS = [{
        'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/',
        'info_dict': {
            'id': '2968618',
            'ext': 'mp4',
            'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory',
            'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182',
            'duration': 112.513,
            'timestamp': 1661135892,
            'upload_date': '20220822',
            'uploader': 'NBC 4',
            'channel_id': 'KNBC',
            'channel': 'nbclosangeles',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/',
        'info_dict': {
            'id': '2247002',
            'ext': 'mp4',
            'title': 'Huracán complica que televidente de Tucson reciba reembolso',
            'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf',
            'duration': 172.406,
            'timestamp': 1660886507,
            'upload_date': '20220819',
            'uploader': 'Telemundo Arizona',
            'channel_id': 'KTAZ',
            'channel': 'telemundoarizona',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # direct mp4 link
        'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/',
        'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85',
        'info_dict': {
            'id': '2961135',
            'ext': 'mp4',
            'title': 'Highs Near Freezing in Boston on Wednesday',
            'description': 'md5:3ec486609a926c99f00a3512e6c0e85b',
            'duration': 235.669,
            'timestamp': 1675268656,
            'upload_date': '20230201',
            'uploader': '',
            'channel_id': 'WBTS',
            'channel': 'nbcboston',
        },
    }]

    # Height -> width lookup for direct mp4 links whose file name carries only the height
    _RESOLUTIONS = {
        '1080': '1920',
        '720': '1280',
        '540': '960',
        '360': '640',
        '234': '416',
    }

    def _real_extract(self, url):
        """Extract a station-site video or livestream.

        Metadata is read from the page's `var nbc = ...` JSON and the
        `data-videos` / `data-meta` attributes. Formats come from an optional
        direct mp4 link (on-demand only) plus the ThePlatform SMIL document,
        which is requested only when both a player id and `fwSSID` are known.
        """
        channel, video_id = self._match_valid_url(url).group('site', 'id')
        webpage = self._download_webpage(url, video_id)

        nbc_data = self._search_json(
            r'<script>\s*var\s+nbc\s*=', webpage, 'NBC JSON data', video_id)
        pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC'
        fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID'))

        # data-meta values override data-videos values on key collisions
        video_data = self._search_json(
            r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML)
        video_data.update(self._search_json(
            r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML))
        if not video_data:
            raise ExtractorError('No video metadata found in webpage', expected=True)

        formats = []
        is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1
        query = {
            'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3',
            'format': 'SMIL',
            'fwsitesection': fw_ssid,
            'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'),
            'pprofile': 'ots_desktop_html',
            'sensitive': 'false',
            'w': '1920',
            'h': '1080',
            'mode': 'LIVE' if is_live else 'on-demand',
            'vpaid': 'script',
            'schema': '2.0',
            'sdk': 'PDK 6.1.3',
        }

        # Keys are looked up both at the top level of video_data and under video.meta
        meta_roots = (None, ('video', 'meta'))

        if is_live:
            player_id = traverse_obj(video_data, (meta_roots, (
                'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False)
            info = {'title': f'{channel} livestream'}
        else:
            player_id = traverse_obj(video_data, (
                meta_roots, ('pid_streaming_web_high', 'mpx_pid')), get_all=False)

            date_string = traverse_obj(video_data, 'date_string', 'date_gmt')
            if not date_string:
                date_string = traverse_obj(
                    nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False)
            else:
                # The page value is markup; only its datetime attribute is used
                date_string = self._search_regex(
                    r'datetime="([^"]+)"', date_string, 'date string', fatal=False)

            direct_url = traverse_obj(video_data, (meta_roots, 'mp4_url'), get_all=False)
            if direct_url:
                direct_ext = determine_ext(direct_url)
                height = self._search_regex(
                    r'\d+-(\d+)p', url_basename(direct_url), 'height', default=None)
                formats.append({
                    'url': direct_url,
                    'ext': direct_ext,
                    'width': int_or_none(self._RESOLUTIONS.get(height)),
                    'height': int_or_none(height),
                    'format_id': f'http-{direct_ext}',
                })

            info = {
                'title': video_data.get('title') or traverse_obj(nbc_data, (
                    'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False),
                'description':
                    traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text')
                    or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))),
                'timestamp': unified_timestamp(date_string),
            }

        smil = None
        if player_id and fw_ssid:
            # A failed SMIL download is only fatal for livestreams
            smil = self._download_xml(
                f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
                note='Downloading SMIL data', query=query, fatal=is_live)
            if not isinstance(smil, xml.etree.ElementTree.Element):
                smil = None

        subtitles = {}
        if smil is not None:
            subtitles = self._parse_smil_subtitles(smil, default_ns)
            for video in smil.findall(self._xpath_ns('.//video', default_ns)):
                # `dur` has its 'ms' suffix removed and is divided by 1000
                info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000)
                smil_src = video.get('src')
                smil_ext = mimetype2ext(video.get('type'), default=determine_ext(smil_src))
                if smil_ext == 'm3u8':
                    fmts, subs = self._extract_m3u8_formats_and_subtitles(
                        smil_src, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
                        live=is_live, errnote='No HLS formats found')
                    formats.extend(fmts)
                    self._merge_subtitles(subs, target=subtitles)
                elif smil_src:
                    formats.append({
                        'url': smil_src,
                        'format_id': f'https-{smil_ext}',
                        'ext': smil_ext,
                        'width': int_or_none(video.get('width')),
                        'height': int_or_none(video.get('height')),
                    })

        if not formats:
            self.raise_no_formats('No video content found in webpage', expected=True)
        elif is_live:
            # A failing HEAD request on the first format is reported as "not live"
            try:
                self._request_webpage(
                    HEADRequest(formats[0]['url']), video_id, note='Checking live status')
            except ExtractorError:
                raise UserNotLive(video_id=channel)

        return {
            'id': video_id,
            'channel': channel,
            'channel_id': nbc_data.get('callLetters'),
            'uploader': nbc_data.get('on_air_name'),
            'formats': formats,
            'subtitles': subtitles,
            'is_live': is_live,
            **info,
        }