6 import xml
.etree
.ElementTree
8 from .common
import InfoExtractor
9 from ..networking
.exceptions
import HTTPError
34 class BBCCoUkIE(InfoExtractor
):
36 IE_DESC
= 'BBC iPlayer'
37 _ID_REGEX
= r
'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
38 _VALID_URL
= rf
'''(?x)
40 (?:www\.)?bbc\.co\.uk/
42 programmes/(?!articles/)|
43 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
44 music/(?:clips|audiovideo/popular)[/#]|
46 events/[^/]+/play/[^/]+/
48 (?P<id>{_ID_REGEX})(?!/(?:episodes|broadcasts|clips))
50 _EMBED_REGEX
= [r
'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
52 _LOGIN_URL
= 'https://account.bbc.com/signin'
53 _NETRC_MACHINE
= 'bbc'
55 _MEDIA_SELECTOR_URL_TEMPL
= 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
57 # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
58 # with geolocation in some cases even when it is not geo-restricted at all (e.g.
59 # http://www.bbc.co.uk/programmes/b06bp7lf). May also fail with selectionunavailable.
64 _EMP_PLAYLIST_NS
= 'http://bbc.co.uk/2008/emp/playlist'
68 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
72 'title': 'Kaleidoscope, Leonard Cohen',
73 'description': 'The Canadian poet and songwriter reflects on his musical career.',
77 'skip_download': True,
81 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
85 'title': 'The Man in Black: Series 3: The Printed Name',
86 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
91 'skip_download': True,
93 'skip': 'Episode is no longer available on BBC iPlayer Radio',
96 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
100 'title': 'The Voice UK: Series 3: Blind Auditions 5',
101 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
106 'skip_download': True,
108 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
111 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
115 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
116 'description': '2. Invasion',
121 'skip_download': True,
123 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
125 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
129 'title': 'Pete Tong, The Essential New Tune Special',
130 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
135 'skip_download': True,
137 'skip': 'Episode is no longer available on BBC iPlayer Radio',
139 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
144 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
145 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
150 'skip_download': True,
153 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
158 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
159 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
164 'skip_download': True,
167 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
171 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
172 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
177 'skip_download': True,
179 'skip': 'geolocation',
181 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
185 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
186 'title': 'Royal Academy Summer Exhibition',
191 'skip_download': True,
193 'skip': 'geolocation',
195 # iptv-all mediaset fails with geolocation however there is no geo restriction
196 # for this programme at all
197 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
201 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
202 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
206 'skip_download': True,
208 'skip': 'Now it\'s really geo-restricted',
210 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
211 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
215 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
220 'skip_download': True,
223 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
224 'only_matching': True,
226 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
227 'only_matching': True,
229 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
230 'only_matching': True,
232 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
233 'only_matching': True,
235 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
236 'only_matching': True,
238 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
239 'only_matching': True,
241 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
242 'only_matching': True,
244 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
245 'only_matching': True,
def _perform_login(self, username, password):
    """Sign in to the BBC account service with the given credentials.

    Downloads the sign-in page, posts the values returned by
    ``_hidden_inputs`` together with the credentials, and checks where the
    response ended up.

    Raises:
        ExtractorError: if the final response URL still contains the sign-in
            URL. The message carries the page's ``form-message`` text when
            one is found.
    """
    login_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    # Start from the inputs found on the sign-in page and add the credentials.
    login_form = self._hidden_inputs(login_page)
    login_form.update({
        'username': username,
        'password': password,
    })

    # Post to the form's action, falling back to the sign-in URL itself.
    post_url = urljoin(self._LOGIN_URL, self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
        'post url', default=self._LOGIN_URL, group='url'))

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(login_form),
        headers={'Referer': self._LOGIN_URL})

    # Still being on the sign-in URL after the POST is treated as a failure.
    if self._LOGIN_URL in urlh.url:
        error = clean_html(get_element_by_class('form-message', response))
        if error:
            raise ExtractorError(
                f'Unable to login: {error}', expected=True)
        raise ExtractorError('Unable to log in')
class MediaSelectionError(Exception):
    """Error reported by the media selector, identified by its ``id``."""

    def __init__(self, error_id):
        # Forward the id to Exception so it shows up in ``args`` / ``str()``.
        super().__init__(error_id)
        # ``id`` is the attribute the extractor methods read when handling
        # or reporting this error.
        self.id = error_id
278 def _extract_asx_playlist(self, connection, programme_id):
279 asx = self._download_xml(connection.get('href
'), programme_id, 'Downloading ASX playlist
')
280 return [ref.get('href
') for ref in asx.findall('./Entry
/ref
')]
282 def _extract_items(self, playlist):
283 return playlist.findall(f'./{{{self
._EMP
_PLAYLIST
_NS
}}}item
')
285 def _extract_medias(self, media_selection):
286 error = media_selection.get('result
')
288 raise BBCCoUkIE.MediaSelectionError(error)
289 return media_selection.get('media
') or []
291 def _extract_connections(self, media):
292 return media.get('connection
') or []
294 def _get_subtitles(self, media, programme_id):
296 for connection in self._extract_connections(media):
297 cc_url = url_or_none(connection.get('href
'))
300 captions = self._download_xml(
301 cc_url, programme_id, 'Downloading captions
', fatal=False)
302 if not isinstance(captions, xml.etree.ElementTree.Element):
306 'url
': connection.get('href
'),
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an ``ExtractorError``.

    The message names this extractor (``IE_NAME``) and the error's ``id``.
    Never returns.
    """
    # NOTE(review): expected=True mirrors the other user-facing raises in
    # this extractor - confirm against the upstream file.
    raise ExtractorError(
        f'{self.IE_NAME} returned error: {media_selection_error.id}',
        expected=True)
318 def _download_media_selector(self, programme_id):
319 last_exception = None
320 formats, subtitles = [], {}
321 for media_set in self._MEDIA_SETS:
323 fmts, subs = self._download_media_selector_url(
324 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
327 self._merge_subtitles(subs, target=subtitles)
328 except BBCCoUkIE.MediaSelectionError as e:
329 if e.id in ('notukerror
', 'geolocation
', 'selectionunavailable
'):
332 self._raise_extractor_error(e)
334 if formats or subtitles:
335 self.report_warning(f'{self
.IE_NAME
} returned error
: {last_exception
.id}')
337 self._raise_extractor_error(last_exception)
338 return formats, subtitles
340 def _download_media_selector_url(self, url, programme_id=None):
341 media_selection = self._download_json(
342 url, programme_id, 'Downloading media selection JSON
',
343 expected_status=(403, 404))
344 return self._process_media_selector(media_selection, programme_id)
346 def _process_media_selector(self, media_selection, programme_id):
351 for media in self._extract_medias(media_selection):
352 kind = media.get('kind
')
353 if kind in ('video
', 'audio
'):
354 bitrate = int_or_none(media.get('bitrate
'))
355 encoding = media.get('encoding
')
356 width = int_or_none(media.get('width
'))
357 height = int_or_none(media.get('height
'))
358 file_size = int_or_none(media.get('media_file_size
'))
359 for connection in self._extract_connections(media):
360 href = connection.get('href
')
365 conn_kind = connection.get('kind
')
366 protocol = connection.get('protocol
')
367 supplier = connection.get('supplier
')
368 transfer_format = connection.get('transferFormat
')
369 format_id = supplier or conn_kind or protocol
371 if supplier == 'asx
':
372 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
375 'format_id
': f'ref{i}_{format_id}
',
377 elif transfer_format == 'dash
':
378 formats.extend(self._extract_mpd_formats(
379 href, programme_id, mpd_id=format_id, fatal=False))
380 elif transfer_format == 'hls
':
381 # TODO: let expected_status be passed into _extract_xxx_formats() instead
383 fmts = self._extract_m3u8_formats(
384 href, programme_id, ext='mp4
', entry_protocol='m3u8_native
',
385 m3u8_id=format_id, fatal=False)
386 except ExtractorError as e:
387 if not (isinstance(e.exc_info[1], HTTPError)
388 and e.exc_info[1].status in (403, 404)):
392 elif transfer_format == 'hds
':
393 formats.extend(self._extract_f4m_formats(
394 href, programme_id, f4m_id=format_id, fatal=False))
396 if not supplier and bitrate:
397 format_id += f'-{bitrate}
'
399 'format_id
': format_id,
400 'filesize
': file_size,
415 if protocol in ('http
', 'https
'):
420 elif protocol == 'rtmp
':
421 application = connection.get('application
', 'ondemand
')
422 auth_string = connection.get('authString
')
423 identifier = connection.get('identifier
')
424 server = connection.get('server
')
426 'url
': f'{protocol}
://{server}
/{application}?{auth_string}
',
427 'play_path
': identifier,
428 'app
': f'{application}?{auth_string}
',
429 'page_url
': 'http
://www
.bbc
.co
.uk
',
430 'player_url
': 'http
://www
.bbc
.co
.uk
/emp
/releases
/iplayer
/revisions
/617463_618125_4/617463_618125_4_emp
.swf
',
437 elif kind == 'captions
':
438 subtitles = self.extract_subtitles(media, programme_id)
439 return formats, subtitles
441 def _download_playlist(self, playlist_id):
443 playlist = self._download_json(
444 f'http
://www
.bbc
.co
.uk
/programmes
/{playlist_id}
/playlist
.json
',
445 playlist_id, 'Downloading playlist JSON
')
449 for version in playlist.get('allAvailableVersions
', []):
450 smp_config = version['smpConfig
']
451 title = smp_config['title
']
452 description = smp_config['summary
']
453 for item in smp_config['items
']:
455 if kind not in ('programme
', 'radioProgramme
'):
457 programme_id = item.get('vpid
')
458 duration = int_or_none(item.get('duration
'))
459 version_formats, version_subtitles = self._download_media_selector(programme_id)
460 types = version['types
']
461 for f in version_formats:
462 f['format_note
'] = ', '.join(types)
463 if any('AudioDescribed
' in x for x in types):
464 f['language_preference
'] = -10
465 formats += version_formats
466 for tag, subformats in (version_subtitles or {}).items():
467 subtitles.setdefault(tag, []).extend(subformats)
469 return programme_id, title, description, duration, formats, subtitles
470 except ExtractorError as ee:
471 if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
474 # fallback to legacy playlist
475 return self._process_legacy_playlist(playlist_id)
477 def _process_legacy_playlist_url(self, url, display_id):
478 playlist = self._download_legacy_playlist_url(url, display_id)
479 return self._extract_from_legacy_playlist(playlist, display_id)
481 def _process_legacy_playlist(self, playlist_id):
482 return self._process_legacy_playlist_url(
483 f'http
://www
.bbc
.co
.uk
/iplayer
/playlist
/{playlist_id}
', playlist_id)
485 def _download_legacy_playlist_url(self, url, playlist_id=None):
486 return self._download_xml(
487 url, playlist_id, 'Downloading legacy playlist XML
')
489 def _extract_from_legacy_playlist(self, playlist, playlist_id):
490 no_items = playlist.find(f'./{{{self
._EMP
_PLAYLIST
_NS
}}}noItems
')
491 if no_items is not None:
492 reason = no_items.get('reason
')
493 if reason == 'preAvailability
':
494 msg = f'Episode {playlist_id}
is not yet available
'
495 elif reason == 'postAvailability
':
496 msg = f'Episode {playlist_id}
is no longer available
'
497 elif reason == 'noMedia
':
498 msg = f'Episode {playlist_id}
is not currently available
'
500 msg = f'Episode {playlist_id}
is not available
: {reason}
'
501 raise ExtractorError(msg, expected=True)
503 for item in self._extract_items(playlist):
504 kind = item.get('kind
')
505 if kind not in ('programme
', 'radioProgramme
'):
507 title = playlist.find(f'./{{{self
._EMP
_PLAYLIST
_NS
}}}title
').text
508 description_el = playlist.find(f'./{{{self
._EMP
_PLAYLIST
_NS
}}}summary
')
509 description = description_el.text if description_el is not None else None
511 def get_programme_id(item):
512 def get_from_attributes(item):
513 for p in ('identifier
', 'group
'):
515 if value and re.match(r'^
[pb
][\da
-z
]{7}$
', value):
517 get_from_attributes(item)
518 mediator = item.find(f'./{{{self
._EMP
_PLAYLIST
_NS
}}}mediator
')
519 if mediator is not None:
520 return get_from_attributes(mediator)
522 programme_id = get_programme_id(item)
523 duration = int_or_none(item.get('duration
'))
526 formats, subtitles = self._download_media_selector(programme_id)
528 formats, subtitles = self._process_media_selector(item, playlist_id)
529 programme_id = playlist_id
531 return programme_id, title, description, duration, formats, subtitles
533 def _real_extract(self, url):
534 group_id = self._match_id(url)
536 webpage = self._download_webpage(url, group_id, 'Downloading video page
')
538 error = self._search_regex(
539 r'<div
\b[^
>]+\bclass
=["\'](?:smp|playout)__message delta["\'][^
>]*>\s
*([^
<]+?
)\s
*<',
540 webpage, 'error
', default=None)
542 raise ExtractorError(error, expected=True)
547 tviplayer = self._search_regex(
548 r'mediator\
.bind\
(({.+?
})\s
*,\s
*document\
.getElementById
',
549 webpage, 'player
', default=None)
552 player = self._parse_json(tviplayer, group_id).get('player
', {})
553 duration = int_or_none(player.get('duration
'))
554 programme_id = player.get('vpid
')
557 programme_id = self._search_regex(
558 rf'"vpid"\s
*:\s
*"({self._ID_REGEX})"', webpage, 'vpid
', fatal=False, default=None)
561 formats, subtitles = self._download_media_selector(programme_id)
562 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
563 (r'<h2
[^
>]+id="parent-title"[^
>]*>(.+?
)</h2
>',
564 r'<div
[^
>]+class="info"[^
>]*>\s
*<h1
>(.+?
)</h1
>'), webpage, 'title
')
565 description = self._search_regex(
566 (r'<p
class="[^"]*medium
-description
[^
"]*">([^
<]+)</p
>',
567 r'<div
[^
>]+class="info_+synopsis"[^
>]*>([^
<]+)</div
>'),
568 webpage, 'description
', default=None)
570 description = self._html_search_meta('description
', webpage)
572 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
577 'description
': description,
578 'thumbnail
': self._og_search_thumbnail(webpage, default=None),
579 'duration
': duration,
581 'subtitles
': subtitles,
585 class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
588 _VALID_URL = r'''(?x)
589 https?://(?:www\.)?(?:
591 bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
592 bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
593 )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
597 'mobile
-tablet
-main
',
601 # article with multiple videos embedded with data-playable containing vpids
602 'url
': 'http
://www
.bbc
.com
/news
/world
-europe
-32668511',
604 'id': 'world
-europe
-32668511',
605 'title
': 'Russia stages massive WW2 parade despite Western boycott
',
606 'description
': 'md5
:00ff61976f6081841f759a08bf78cc9c
',
610 # article with multiple videos embedded with data-playable (more videos)
611 'url
': 'http
://www
.bbc
.com
/news
/business
-28299555',
613 'id': 'business
-28299555',
614 'title
': 'Farnborough Airshow
: Video highlights
',
615 'description
': 'BBC reports
and video highlights at the Farnborough Airshow
.',
620 # article with multiple videos embedded with `new SMP()`
622 'url
': 'http
://www
.bbc
.co
.uk
/blogs
/adamcurtis
/entries
/3662a707
-0af9
-3149-963f
-47bea720b460
',
624 'id': '3662a707
-0af9
-3149-963f
-47bea720b460
',
626 'description
': r're
:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating
. .{211}\
.{3}$
',
628 'playlist_count
': 18,
630 # single video embedded with data-playable containing vpid
631 'url
': 'http
://www
.bbc
.com
/news
/world
-europe
-32041533',
635 'title
': 'Germanwings crash site aerial video
',
636 'description
': r're
:(?s
)Aerial video showed the site where the Germanwings flight
4U 9525, .{156} BFM TV\
.$
',
638 'timestamp
': 1427219242,
639 'upload_date
': '20150324',
640 'thumbnail
': 'https
://ichef
.bbci
.co
.uk
/news
/1024/media
/images
/81879000/jpg
/_81879090_81879089
.jpg
',
643 'skip_download
': True,
646 # article with single video embedded with data-playable containing XML playlist
647 # with direct video links as progressiveDownloadUrl (for now these are extracted)
648 # and playlist with f4m and m3u8 as streamingUrl
649 'url
': 'http
://www
.bbc
.com
/turkce
/haberler
/2015/06/150615_telabyad
_kentin
_cogu
',
651 'id': '150615_telabyad
_kentin
_cogu
',
653 'title
': "YPG: Tel Abyad'ın tamamı kontrolümüzde
",
654 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
655 'timestamp': 1434397334,
656 'upload_date': '20150615',
659 'skip_download': True,
661 'skip': 'now SIMORGH_DATA with no video',
663 # single video embedded with data-playable containing XML playlists (regional section)
664 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
667 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
669 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
670 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
671 'timestamp': 1434713142,
672 'upload_date': '20150619',
673 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
676 'skip_download': True,
679 # single video from video playlist embedded with vxp-playlist-data JSON
680 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
684 'title': '''Judge Mindy Glazer: "I
'm sorry to see you here... I always wondered what happened to you"''',
686 'description
': '''Judge Mindy Glazer: "I'm sorry to see you here
... I always wondered what happened to you
"''',
689 'skip_download': True,
691 'skip': '404 Not Found',
693 # single video story with __PWA_PRELOADED_STATE__
694 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
698 'title': 'Tasting the spice of life in Jaffna',
699 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
700 'timestamp': 1646058397,
701 'upload_date': '20220228',
703 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
706 # single video story without digitalData
707 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
711 'title': 'Hyundai Santa Fe Sport: Rock star',
712 'description': 'md5:b042a26142c4154a6e472933cf20793d',
713 'timestamp': 1415867444,
714 'upload_date': '20141113',
716 'skip': 'redirects to TopGear home page',
718 # single video embedded with Morph
719 # TODO: replacement test page
720 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
724 'title': "Nigeria v Japan
- Men
's First Round",
725 'description
': 'Live coverage of the first
round from Group B at the Amazonia Arena
.',
727 'uploader
': 'BBC Sport
',
728 'uploader_id
': 'bbc_sport
',
730 'skip
': 'Video no longer
in page
',
732 # single video in __INITIAL_DATA__
733 'url
': 'http
://www
.bbc
.com
/sport
/0/football
/33653409',
737 'title
': 'Ronaldo to Man Utd
, Arsenal to spend?
',
738 'description
': r're
:(?s
)BBC Sport
\'s David Ornstein rounds up the latest transfer reports
, .{359} here\
.$
',
739 'timestamp
': 1437750175,
740 'upload_date
': '20150724',
741 'thumbnail
': r're
:https?
://.+/.+media
/images
/69320000/png
/_69320754_mmgossipcolumnextraaugust18
.png
',
745 # article with multiple videos embedded with Morph.setPayload
746 'url
': 'http
://www
.bbc
.com
/sport
/0/football
/34475836',
749 'title
': 'Jurgen Klopp
: Furious football
from a witty
and winning coach
',
750 'description
': 'Fast
-paced football
, wit
, wisdom
and a ready smile
- why Liverpool fans should come to love new boss Jurgen Klopp
.',
755 'url
': 'http
://www
.bbc
.com
/sport
/0/football
/34475836',
759 'title
': 'All you need to know about Jurgen Klopp
',
760 'timestamp
': 1444335081,
761 'upload_date
': '20151008',
763 'thumbnail
': 'https
://ichef
.bbci
.co
.uk
/onesport
/cps
/976/cpsprodpb
/7542/production
/_85981003_klopp
.jpg
',
769 # school report article with single video
770 'url
': 'http
://www
.bbc
.co
.uk
/schoolreport
/35744779',
773 'title
': 'School which breaks down barriers
in Jerusalem
',
776 'skip
': 'redirects to Young Reporter home page https
://www
.bbc
.co
.uk
/news
/topics
/cg41ylwv43pt
',
778 # single video with playlist URL from weather section
779 'url
': 'http
://www
.bbc
.com
/weather
/features
/33601775',
780 'only_matching
': True,
782 # custom redirection to www.bbc.com
783 # also, video with window.__INITIAL_DATA__
784 'url
': 'http
://www
.bbc
.co
.uk
/news
/science
-environment
-33661876',
788 'title
': "Pluto may have 'nitrogen glaciers
'",
789 'description
': 'md5
:6a95b593f528d7a5f2605221bc56912f
',
790 'thumbnail
': r're
:https?
://.+/.+\
.jpg
',
791 'timestamp
': 1437785037,
792 'upload_date
': '20150725',
796 # video with window.__INITIAL_DATA__ and value as JSON string
797 'url
': 'https
://www
.bbc
.com
/news
/av
/world
-europe
-59468682',
801 'title
': 'Why France
is making this woman a national hero
',
802 'description
': r're
:(?s
)France
is honouring the US
-born
20th Century singer
and activist Josephine
.{208} Second World War
.',
803 'thumbnail
': r're
:https?
://.+/.+\
.jpg
',
804 'timestamp
': 1638215626,
805 'upload_date
': '20211129',
809 # video with script id __NEXT_DATA__ and value as JSON string
810 'url
': 'https
://www
.bbc
.com
/news
/uk
-68546268',
814 'title
': 'Nasser Hospital doctor describes his treatment by IDF
',
815 'description
': r're
:(?s
)Doctor Abu Sabha said he was detained by Israeli forces after
.{276} hostages\
."$',
816 'thumbnail': r're:https?://.+/.+\.jpg',
817 'timestamp': 1710188248,
818 'upload_date': '20240311',
822 # single video article embedded with data-media-vpid
823 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
824 'only_matching': True,
827 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
831 'title': 'Things Not To Say to people that live on council estates',
832 'description': "From being labelled a
'chav', to the presumption that they
're 'scroungers
', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing
.",
834 'thumbnail': r're:https?://.+/.+\.jpg',
837 # window.__PRELOADED_STATE__
838 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
842 'title': 'Prom 6: An American in Paris and Turangalila',
843 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
844 'uploader': 'Radio 3',
845 'uploader_id': 'bbc_radio_three',
847 'skip': '404 Not Found',
849 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
853 'title': 'md5:2fabf12a726603193a2879a055f72514',
854 'description': 'Learn English words and phrases from this story',
855 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
857 'add_ie': [BBCCoUkIE.ie_key()],
860 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
864 'title': 'The downsides of positive thinking',
865 'description': 'The downsides of positive thinking',
867 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
868 'upload_date': '20220223',
869 'timestamp': 1645632746,
873 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
877 'title': 'Are our coastlines being washed away?',
878 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
879 'timestamp': 1713556800,
880 'upload_date': '20240419',
882 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
883 'uploader': 'World Service',
884 'uploader_id': 'bbc_world_service',
885 'series': 'CrowdScience',
889 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
890 'only_matching': True,
892 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
893 'only_matching': True,
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    Returns False as soon as one of the more specific BBC extractors listed
    below reports the URL as suitable; otherwise defers to the parent
    class's check.
    """
    excluded_extractors = (
        BBCCoUkIE,
        BBCCoUkArticleIE,
        BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE,
        BBCCoUkPlaylistIE,
    )
    for extractor in excluded_extractors:
        if extractor.suitable(url):
            return False
    return super().suitable(url)
902 def _extract_from_media_meta(self, media_meta, video_id):
903 # Direct links to media in media metadata (e.g.
904 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
905 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
906 source_files = media_meta.get('sourceFiles')
910 'format_id': format_id,
911 'ext': f.get('encoding'),
912 'tbr': float_or_none(f.get('bitrate'), 1000),
913 'filesize': int_or_none(f.get('filesize')),
914 } for format_id, f in source_files.items() if f.get('url')], []
916 programme_id = media_meta.get('externalId')
918 return self._download_media_selector(programme_id)
920 # Process playlist.sxml as legacy playlist
921 href = media_meta.get('href')
923 playlist = self._download_legacy_playlist_url(href)
924 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
925 return formats, subtitles
929 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
930 programme_id, title, description, duration, formats, subtitles = \
931 self._process_legacy_playlist_url(url, playlist_id)
935 'description': description,
936 'duration': duration,
937 'timestamp': timestamp,
939 'subtitles': subtitles,
942 def _real_extract(self, url):
943 playlist_id = self._match_id(url)
945 webpage = self._download_webpage(url, playlist_id)
947 json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
948 timestamp = json_ld_info.get('timestamp')
950 playlist_title = json_ld_info.get('title') or re.sub(
951 r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
953 playlist_description = json_ld_info.get(
954 'description') or self._og_search_description(webpage, default=None)
957 timestamp = parse_iso8601(self._search_regex(
958 [r'<meta[^>]+property="article
:published_time
"[^>]+content="([^
"]+)"',
959 r'itemprop
="datePublished"[^
>]+datetime
="([^"]+)"',
960 r'"datePublished
":\s*"([^
"]+)'],
961 webpage, 'date', default=None))
965 # article with multiple videos embedded with playlist.sxml (e.g.
966 # http://www.bbc.com/sport/0/football/34475836)
967 playlists = re.findall(r'<param[^>]+name="playlist
"[^>]+value="([^
"]+)"', webpage)
968 playlists.extend(re.findall(r'data
-media
-id="([^"]+/playlist\
.sxml
)"', webpage))
971 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
972 for playlist_url in playlists]
974 # news article with multiple videos embedded with data-playable
975 data_playables = re.findall(r'data-playable=(["\'])({.+?
})\
1', webpage)
977 for _, data_playable_json in data_playables:
978 data_playable = self._parse_json(
979 unescapeHTML(data_playable_json), playlist_id, fatal=False)
980 if not data_playable:
982 settings = data_playable.get('settings
', {})
984 # data-playable with video vpid in settings.playlistObject.items (e.g.
985 # http://www.bbc.com/news/world-us-canada-34473351)
986 playlist_object = settings.get('playlistObject
', {})
988 items = playlist_object.get('items
')
989 if items and isinstance(items, list):
990 title = playlist_object['title
']
991 description = playlist_object.get('summary
')
992 duration = int_or_none(items[0].get('duration
'))
993 programme_id = items[0].get('vpid
')
994 formats, subtitles = self._download_media_selector(programme_id)
998 'description
': description,
999 'timestamp
': timestamp,
1000 'duration
': duration,
1002 'subtitles
': subtitles,
1005 # data-playable without vpid but with playlist.sxml URLs
1006 # in otherSettings.playlist (e.g.
1007 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
1008 playlist = data_playable.get('otherSettings
', {}).get('playlist
', {})
1011 for key in ('streaming
', 'progressiveDownload
'):
1012 playlist_url = playlist.get(f'{key}Url
')
1013 if not playlist_url:
1016 info = self._extract_from_playlist_sxml(
1017 playlist_url, playlist_id, timestamp)
1021 entry['title
'] = info['title
']
1022 entry['formats
'].extend(info['formats
'])
1023 except ExtractorError as e:
1024 # Some playlist URLs may fail with 500 while
1025 # the other one may work fine (e.g.
1026 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
1027 if isinstance(e.cause, HTTPError) and e.cause.status == 500:
1031 entries.append(entry)
1034 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1036 # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
1037 group_id = self._search_regex(
1038 rf'<div
[^
>]+\bclass
=["\']video["\'][^
>]+\bdata
-pid
=["\']({self._ID_REGEX})',
1039 webpage, 'group id', default=None)
1041 return self.url_result(
1042 f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
1044 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1045 programme_id = self._search_regex(
1046 [rf'data-(?:video-player|media)-vpid="({self
._ID
_REGEX
})"',
1047 rf'<param[^>]+name="externalIdentifier
"[^>]+value="({self
._ID
_REGEX
})"',
1048 rf'videoId\s*:\s*["\']({self
._ID
_REGEX
})["\']'],
1049 webpage, 'vpid', default=None)
1052 formats, subtitles = self._download_media_selector(programme_id)
1053 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1054 digital_data = self._parse_json(
1056 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1057 programme_id, fatal=False)
1058 page_info = digital_data.get('page', {}).get('pageInfo', {})
1059 title = page_info.get('pageName') or self._og_search_title(webpage)
1060 description = page_info.get('description') or self._og_search_description(webpage)
1061 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1065 'description': description,
1066 'timestamp': timestamp,
1068 'subtitles': subtitles,
1071 # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1072 initial_data = self._parse_json(self._html_search_regex(
1073 r'<script[^>]+id=(["\'])initial
-data\
1[^
>]+data
-json
=(["\'])(?P<json>(?:(?!\2).)+)',
1074 webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1076 init_data = try_get(
1077 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1078 smp_data = init_data.get('smpData') or {}
1079 clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1080 version_id = clip_data.get('versionID')
1082 title = smp_data['title']
1083 formats, subtitles = self._download_media_selector(version_id)
1084 image_url = smp_data.get('holdingImageURL')
1085 display_date = init_data.get('displayDate')
1086 topic_title = init_data.get('topicTitle')
1092 'alt_title': init_data.get('shortTitle'),
1093 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1094 'description': smp_data.get('summary') or init_data.get('shortSummary'),
1095 'upload_date': display_date.replace('-', '') if display_date else None,
1096 'subtitles': subtitles,
1097 'duration': int_or_none(clip_data.get('duration')),
1098 'categories': [topic_title] if topic_title else None,
1101 # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1102 # Several setPayload calls may be present but the video(s)
1103 # should be in one that mentions leadMedia or videoData
1104 morph_payload = self._search_json(
1105 r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
1106 contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia
"|\\"videoData
\\")\s*:.+)}',
1109 for lead_media in traverse_obj(morph_payload, (
1110 'body', 'components', ..., 'props', 'leadMedia', {dict})):
1111 programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
1112 if not programme_id:
1114 formats, subtitles = self._download_media_selector(programme_id)
1117 'title': lead_media.get('title') or self._og_search_title(webpage),
1118 **traverse_obj(lead_media, {
1119 'description': ('summary', {str}),
1120 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
1121 'uploader': ('masterBrand', {str}),
1122 'uploader_id': ('mid', {str}),
1125 'subtitles': subtitles,
1127 body = self._parse_json(traverse_obj(morph_payload, (
1128 'body', 'content', 'article', 'body')), playlist_id, fatal=False)
1129 for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
1130 if video_data.get('vpid'):
1131 video_id = video_data['vpid']
1132 formats, subtitles = self._download_media_selector(video_id)
1136 'subtitles': subtitles,
1139 video_id = video_data['pid']
1140 entry = self.url_result(
1141 f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
1142 video_id, url_transparent=True)
1144 'timestamp': traverse_obj(morph_payload, (
1145 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}),
1147 **traverse_obj(video_data, {
1148 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
1149 'title': (('title', 'caption'), {str}, any),
1150 'duration': ('duration', {parse_duration}),
1153 if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
1155 entries.append(entry)
1157 playlist_title = traverse_obj(morph_payload, (
1158 'body', 'content', 'article', 'headline', {str})) or playlist_title
1159 return self.playlist_result(
1160 entries, playlist_id, playlist_title, playlist_description)
1162 # various PRELOADED_STATE JSON
1163 preload_state = self._search_json(
1164 r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
1165 'preload state', playlist_id, transform_source=js_to_json, default={})
1166 # PRELOADED_STATE with current programme
1167 current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
1168 programme_id = traverse_obj(current_programme, ('id', {str}))
1169 if programme_id and current_programme.get('type') == 'playable_item':
1170 title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
1171 formats, subtitles = self._download_media_selector(programme_id)
1176 **traverse_obj(current_programme, {
1177 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
1178 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
1179 'duration': ('duration', 'value', {int_or_none}),
1180 'uploader': ('network', 'short_title', {str}),
1181 'uploader_id': ('network', 'id', {str}),
1182 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
1183 'series': ('titles', 'primary', {str}),
1185 'subtitles': subtitles,
1186 'chapters': traverse_obj(preload_state, (
1187 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
1188 'title': ('titles', {lambda x: join_nonempty(
1189 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
1190 'start_time': ('offset', 'start', {float_or_none}),
1191 'end_time': ('offset', 'end', {float_or_none}),
1196 # PWA_PRELOADED_STATE with article video asset
1197 asset_id = traverse_obj(preload_state, (
1198 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
1199 'assetVideo', 0, {str}, any))
1201 video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
1203 article = traverse_obj(preload_state, (
1204 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
1206 def image_url(image_id):
1207 return traverse_obj(preload_state, (
1208 'entities', 'images', image_id, 'url',
1209 {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
1211 formats, subtitles = self._download_media_selector(video_id)
1214 **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
1215 'title': ('title', {str}),
1216 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
1217 'thumbnail': (0, {image_url}),
1218 'duration': ('duration', {int_or_none}),
1221 'subtitles': subtitles,
1222 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
1225 return self.url_result(
1226 f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
1227 asset_id, playlist_title, display_id=playlist_id,
1228 description=playlist_description)
1230 bbc3_config = self._parse_json(
1232 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1233 'bbcthree config', default='{}'),
1234 playlist_id, transform_source=js_to_json, fatal=False) or {}
1235 payload = bbc3_config.get('payload') or {}
1237 clip = payload.get('currentClip') or {}
1238 clip_vpid = clip.get('vpid')
1239 clip_title = clip.get('title')
1240 if clip_vpid and clip_title:
1241 formats, subtitles = self._download_media_selector(clip_vpid)
1244 'title': clip_title,
1245 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1246 'description': clip.get('description'),
1247 'duration': parse_duration(clip.get('duration')),
1249 'subtitles': subtitles,
1251 bbc3_playlist = try_get(
1252 payload, lambda x: x['content']['bbcMedia']['playlist'],
1255 playlist_title = bbc3_playlist.get('title') or playlist_title
1256 thumbnail = bbc3_playlist.get('holdingImageURL')
1258 for bbc3_item in bbc3_playlist['items']:
1259 programme_id = bbc3_item.get('versionID')
1260 if not programme_id:
1262 formats, subtitles = self._download_media_selector(programme_id)
1265 'title': playlist_title,
1266 'thumbnail': thumbnail,
1267 'timestamp': timestamp,
1269 'subtitles': subtitles,
1271 return self.playlist_result(
1272 entries, playlist_id, playlist_title, playlist_description)
1274 def parse_model(model):
1275 """Extract single video from model structure"""
1276 item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
1279 formats, subtitles = self._download_media_selector(item_id)
1283 'subtitles': subtitles,
1284 **traverse_obj(model, {
1285 'title': ('title', {str}),
1286 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
1287 'description': ('synopses', ('long', 'medium', 'short'), {str}, filter, any),
1288 'duration': ('versions', 0, 'duration', {int}),
1289 'timestamp': ('versions', 0, 'availableFrom', {int_or_none(scale=1000)}),
1293 def is_type(*types):
1294 return lambda _, v: v['type'] in types
1296 initial_data = self._search_regex(
1297 r'window\.__INITIAL_DATA__\s*=\s*("{.+?
}")\s*;', webpage,
1298 'quoted preload state', default=None)
1299 if initial_data is None:
1300 initial_data = self._search_regex(
1301 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1302 'preload state', default='{}')
1304 initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1305 initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1307 for video_data in traverse_obj(initial_data, (
1308 'stores', 'article', 'articleBodyContent', is_type('video'))):
1309 model = traverse_obj(video_data, (
1310 'model', 'blocks', is_type('aresMedia'),
1311 'model', 'blocks', is_type('aresMediaMetadata'),
1312 'model', {dict}, any))
1313 entry = parse_model(model)
1315 entries.append(entry)
1317 return self.playlist_result(
1318 entries, playlist_id, playlist_title, playlist_description)
1320 def parse_media(media):
1323 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1324 item_id = item.get('id')
1325 item_title = item.get('title')
1326 if not (item_id and item_title):
1328 formats, subtitles = self._download_media_selector(item_id)
1330 blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1333 for block in blocks:
1334 text = try_get(block, lambda x: x['model']['text'], str)
1336 summary.append(text)
1338 item_desc = '\n\n'.join(summary)
1340 for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1341 if try_get(meta, lambda x: x['label']) == 'Published':
1342 item_time = unified_timestamp(meta.get('timestamp'))
1346 'title': item_title,
1347 'thumbnail': item.get('holdingImageUrl'),
1349 'subtitles': subtitles,
1350 'timestamp': item_time,
1351 'description': strip_or_none(item_desc),
1352 'duration': int_or_none(item.get('duration')),
1355 for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
1357 if name == 'media-experience':
1358 parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1359 elif name == 'article':
1360 for block in traverse_obj(resp, (
1361 'data', (None, ('content', 'model')), 'blocks',
1362 is_type('media', 'video'), 'model', {dict})):
1364 return self.playlist_result(
1365 entries, playlist_id, playlist_title, playlist_description)
1367 # extract from SIMORGH_DATA hydration JSON
1368 simorgh_data = self._search_json(
1369 r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
1370 'simorgh data', playlist_id, default={})
1373 for video_data in traverse_obj(simorgh_data, (
1374 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
1375 model = traverse_obj(video_data, (
1376 'model', 'blocks', is_type('aresMedia'),
1377 'model', 'blocks', is_type('aresMediaMetadata'),
1378 'model', {dict}, any))
1379 if video_data['type'] == 'video':
1380 entry = parse_model(model)
1381 else: # legacyMedia: no duration, subtitles
1382 block_id, entry = traverse_obj(model, ('blockId', {str})), None
1383 media_data = traverse_obj(simorgh_data, (
1384 'pageData', 'promo', 'media',
1385 {lambda x: x if x['id'] == block_id else None}))
1386 formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
1387 'url': ('url', {url_or_none}),
1388 'ext': ('format', {str}),
1389 'tbr': ('bitrate', {int_or_none(scale=1000)}),
1394 'display_id': playlist_id,
1396 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
1397 **traverse_obj(model, {
1398 'title': ('title', {str}),
1399 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
1400 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
1401 'timestamp': ('firstPublished', {int_or_none(scale=1000)}),
1406 entries.append(entry)
1410 return self.playlist_result(
1411 entries, playlist_id, playlist_title, playlist_description)
1413 def extract_all(pattern):
1414 return list(filter(None, (
1415 self._parse_json(s, playlist_id, fatal=False)
1416 for s in re.findall(pattern, webpage))))
1418 # US accessed article with single embedded video (e.g.
1419 # https://www.bbc.com/news/uk-68546268)
1420 next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
1421 ('props', 'pageProps', 'page'))
1422 model = traverse_obj(next_data, (
1423 ..., 'contents', is_type('video'),
1424 'model', 'blocks', is_type('media'),
1425 'model', 'blocks', is_type('mediaMetadata'),
1426 'model', {dict}, any))
1427 if model and (entry := parse_model(model)):
1428 if not entry.get('timestamp'):
1429 entry['timestamp'] = traverse_obj(next_data, (
1430 ..., 'contents', is_type('timestamp'), 'model',
1431 'timestamp', {int_or_none(scale=1000)}, any))
1432 entries.append(entry)
1433 return self.playlist_result(
1434 entries, playlist_id, playlist_title, playlist_description)
1436 # Multiple video article (e.g.
1437 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1438 EMBED_URL = rf'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+{self._ID_REGEX}(?:\b[^"]+)?
'
1440 for match in extract_all(r'new\s
+SMP\
(({.+?
})\
)'):
1441 embed_url = match.get('playerSettings
', {}).get('externalEmbedUrl
')
1442 if embed_url and re.match(EMBED_URL, embed_url):
1443 entries.append(embed_url)
1444 entries.extend(re.findall(
1445 rf'setPlaylist\
("({EMBED_URL})"\
)', webpage))
1447 return self.playlist_result(
1448 [self.url_result(entry_, 'BBCCoUk
') for entry_ in entries],
1449 playlist_id, playlist_title, playlist_description)
1451 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1452 medias = extract_all(r"data-media-meta='({[^
']+})'")
1455 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1456 media_asset = self._search_regex(
1457 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1458 webpage, 'media asset
', default=None)
1460 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1462 for video in media_asset_page.get('videos
', {}).values():
1463 medias.extend(video.values())
1466 # Multiple video playlist with single `now playing` entry (e.g.
1467 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1468 vxp_playlist = self._parse_json(
1470 r'<script
[^
>]+class="vxp-playlist-data"[^
>]+type="application/json"[^
>]*>([^
<]+)</script
>',
1471 webpage, 'playlist data
'),
1473 playlist_medias = []
1474 for item in vxp_playlist:
1475 media = item.get('media
')
1478 playlist_medias.append(media)
1479 # Download single video if found media with asset id matching the video id from URL
1480 if item.get('advert
', {}).get('assetId
') == playlist_id:
1483 # Fallback to the whole playlist
1485 medias = playlist_medias
1488 for num, media_meta in enumerate(medias, start=1):
1489 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1490 if not formats and not self.get_param('ignore_no_formats
'):
1493 video_id = media_meta.get('externalId
')
1495 video_id = playlist_id if len(medias) == 1 else f'{playlist_id}
-{num}
'
1497 title = media_meta.get('caption
')
1499 title = playlist_title if len(medias) == 1 else f'{playlist_title}
- Video {num}
'
1501 duration = int_or_none(media_meta.get('durationInSeconds
')) or parse_duration(media_meta.get('duration
'))
1504 for image in media_meta.get('images
', {}).values():
1505 images.extend(image.values())
1506 if 'image
' in media_meta:
1507 images.append(media_meta['image
'])
1510 'url
': image.get('href
'),
1511 'width
': int_or_none(image.get('width
')),
1512 'height
': int_or_none(image.get('height
')),
1513 } for image in images]
1518 'thumbnails
': thumbnails,
1519 'duration
': duration,
1520 'timestamp
': timestamp,
1522 'subtitles
': subtitles,
1525 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for bbc.co.uk/programmes/articles/<id> pages.

    Every ``<div typeof="Clip" resource="...">`` found in the page becomes
    one playlist entry pointing at the clip's own URL.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }]

    def _real_extract(self, url):
        """Download the article page and return its clips as a playlist result."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # Tolerate a page without og:description: strip_or_none passes None
        # through instead of raising AttributeError on `.strip()`.
        description = strip_or_none(self._og_search_description(webpage))

        clip_urls = re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)
        entries = [self.url_result(clip_url) for clip_url in clip_urls]

        return self.playlist_result(entries, playlist_id, title, description)
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Base class for paginated bbc.co.uk programme listings.

    Subclasses supply ``_VIDEO_ID_TEMPLATE`` (a regex template taking the
    programme-id pattern), ``_URL_TEMPLATE`` (a URL template taking a video
    id) and ``_extract_title_and_description(webpage)``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found, following pagination.

        When the requested URL carries an explicit ``page`` query parameter
        only that page is processed; otherwise "next page" links are followed
        until none is left.
        """
        single_page = 'page' in urllib.parse.parse_qs(
            urllib.parse.urlparse(url).query)
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX

        # The first page is already downloaded, so numbering starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())

            if single_page:
                return

            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            # Without this exit the unbounded counter would loop forever
            if not next_page:
                return

            webpage = self._download_webpage(
                urllib.parse.urljoin(url, next_page), playlist_id,
                f'Downloading page {page_num}', page_num)

    def _real_extract(self, url):
        """Return the listing at *url* as a playlist result."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Shared paging logic for iPlayer listing extractors.

    Subclasses provide ``_PAGE_SIZE``, ``_DESCRIPTION_KEY`` and the hooks
    ``_call_api``, ``_get_elements``, ``_get_episode``, ``_get_episode_image``,
    ``_get_episode_field``, ``_get_playlist_data`` and ``_get_playlist_title``.
    """

    _VALID_URL_TMPL = rf'https?://(?:www\.)?bbc\.co\.uk/iplayer/%s/(?P<id>{BBCCoUkIE._ID_REGEX})'

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]``, or None if that path is missing."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Return the first available synopsis among large/medium/small."""
        synopsis = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopsis, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield URL-type entries for one listing page.

        *page* is zero-based; the API is called with ``page + 1``.
        """
        elements = self._get_elements(self._call_api(
            programme_id, per_page, page + 1, series_id))
        for element in elements:
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            # No id means no usable episode URL can be built
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the listing as a playlist: one page if ``?page=N`` is given, else all pages."""
        pid = self._match_id(url)
        qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        series_id = qs.get('seriesId', [None])[0]
        page = qs.get('page', [None])[0]
        # An explicit page is fetched with 36 items per page
        # (presumably the website's own page size -- TODO confirm)
        per_page = 36 if page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is enough to read the playlist-level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        return self.playlist_result(
            entries, pid, self._get_playlist_title(playlist_data),
            self._get_description(playlist_data))
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/episodes/<pid>`` listings, served by the graph.ibl.api.bbc.co.uk endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    # NOTE(review): page size is not visible in the extracted text -- confirm value
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return ``episode['image']['default']`` if present."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return ``episode[field]['default']`` if present."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the list of result elements from an API response."""
        return data['entities']['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode dict nested in a result element (empty dict if absent)."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the persisted query and return the ``data.programme`` object."""
        # NOTE(review): 'id' and 'page' variable names are assumed -- confirm against the API
        variables = {
            'id': pid,
            'page': page,
            'perPage': per_page,
        }
        if series_id:
            variables['sliceId'] = series_id
        return self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid, headers={
                'Content-Type': 'application/json',
            }, data=json.dumps({
                'id': '5692d93d5aac8d796a0305e895e61551',
                'variables': variables,
            }).encode())['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme object itself carries the playlist-level metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return ``data['title']['default']`` if present."""
        return self._get_default(data, 'title')
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/group/<pid>`` listings, served by the ibl.api.bbc.co.uk groups endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    # NOTE(review): page size is not visible in the extracted text -- confirm value
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return ``episode['images']['standard']`` if present."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Return the plain top-level *field* of the episode."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list of elements from an API response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Group elements are used directly as episode dicts."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes; *series_id* is not used here."""
        # NOTE(review): 'page' query key is assumed -- confirm against the API
        return self._download_json(
            f'http://ibl.api.bbc.co.uk/ibl/v1/groups/{pid}/episodes',
            pid, query={
                'page': page,
                'per_page': per_page,
            })['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` object holding the playlist-level metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title, if any."""
        return data.get('title')
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """bbc.co.uk/programmes/<pid>/(episodes|broadcasts|clips) listings."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = rf'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>{BBCCoUkIE._ID_REGEX})/(?:episodes|broadcasts|clips)'
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the page's Open Graph tags.

        The title lookup is explicitly non-fatal, so it may be None.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )