4 from .common
import InfoExtractor
class TedBaseIE(InfoExtractor):
    """Shared base for the TED extractors: a URL template and playlist parsing."""

    # Template for www.ted.com page URLs. Subclasses fill ``{type}`` through
    # ``str.format`` with the regex for their path segment. An optional
    # ``/lang/<code>`` segment may precede the final slug, which is captured
    # as the ``id`` group.
    _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'

    def _parse_playlist(self, playlist):
        """Yield one URL result per video node found in *playlist*.

        Nodes are read from ``playlist['videos']['nodes']``. A node is used
        only when its ``__typename`` is ``'Video'`` and it has a truthy
        ``canonicalUrl``; each such URL is wrapped with ``url_result`` and
        tagged with ``TedTalkIE``'s extractor key.

        Yields nothing when the nodes path is missing or is not a list.
        """
        # try_get() returns None when the lookup fails or the value is not a
        # list; fall back to an empty list so iteration does not raise.
        nodes = try_get(playlist, lambda x: x['videos']['nodes'], list) or []
        for entry in nodes:
            if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
                yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
24 class TedTalkIE(TedBaseIE
):
25 _VALID_URL
= TedBaseIE
._VALID
_URL
_BASE
.format(type='talks')
27 'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
28 'md5': '47e82c666d9c3261d4fe74748a90aada',
32 'title': 'How to break down barriers and not accept limits',
33 'description': 'md5:000707cece219d1e165b11550d612331',
35 'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
36 'uploader': 'Candace Parker',
38 'upload_date': '20220114',
39 'release_date': '20211201',
40 'thumbnail': r
're:http.*\.jpg',
# Extract a single TED talk page.
# NOTE(review): this view of the method is missing many statements (the
# embedded original line numbers skip, e.g. 50-51, 56-57, 62-63, 65, 69-70).
# The comments below describe only what the visible fragments show.
44 def _real_extract(self
, url
):
# Resolve the display id from the URL, then download the talk page.
45 display_id
= self
._match
_id
(url
)
46 webpage
= self
._download
_webpage
(url
, display_id
)
# The talk record is read from props.pageProps.videoData of the page's
# Next.js data; a missing key raises here.
47 talk_info
= self
._search
_nextjs
_data
(webpage
, display_id
)['props']['pageProps']['videoData']
48 video_id
= talk_info
['id']
# 'playerData' is passed to _parse_json, i.e. it is expected to be a JSON string.
# NOTE(review): talk_info.get() may return None here — confirm _parse_json copes.
49 player_data
= self
._parse
_json
(talk_info
.get('playerData'), video_id
)
# Accumulators filled by the per-format branches below.
52 formats
, subtitles
= [], {}
# Walk the (format_id, resources) pairs; a missing or empty 'resources' is treated as {}.
53 for format_id
, resources
in (player_data
.get('resources') or {}).items():
# HLS branch: resources['stream'] is taken as the manifest URL (validated via url_or_none).
54 if format_id
== 'hls':
55 stream_url
= url_or_none(try_get(resources
, lambda x
: x
['stream']))
# NOTE(review): the handling of an empty stream_url is not visible in this view.
# m3u8 extraction is called with fatal=False; its formats and subtitles are merged in.
58 m3u8_formats
, m3u8_subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
59 stream_url
, video_id
, 'mp4', m3u8_id
=format_id
, fatal
=False)
60 formats
.extend(m3u8_formats
)
61 subtitles
= self
._merge
_subtitles
(subtitles
, m3u8_subs
)
# The remaining branches iterate `resources`, so non-list values are checked for first.
# NOTE(review): the statement guarded by this check is not visible — presumably `continue`.
64 if not isinstance(resources
, list):
# h264 branch: each resource supplies a 'file' URL and a 'bitrate' (coerced with int_or_none).
66 if format_id
== 'h264':
67 for resource
in resources
:
68 h264_url
= resource
.get('file')
71 bitrate
= int_or_none(resource
.get('bitrate'))
74 'format_id': f
'{format_id}-{bitrate}k',
# NOTE(review): the body of this bitrate-token check is not visible in this view.
77 if re
.search(r
'\d+k', h264_url
):
# rtmp branch: the streamer comes from talk_info; entries are built only for resources with a 'file'.
79 elif format_id
== 'rtmp':
80 streamer
= talk_info
.get('streamer')
84 'format_id': '{}-{}'.format(format_id
, resource
.get('name')),
86 'play_path': resource
['file'],
88 'width': int_or_none(resource
.get('width')),
89 'height': int_or_none(resource
.get('height')),
90 'tbr': int_or_none(resource
.get('bitrate')),
91 } for resource
in resources
if resource
.get('file'))
# Select the HLS formats that carry video (protocol 'm3u8', vcodec not 'none').
94 m3u8_formats
= [f
for f
in formats
if f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none']
# For each one, take the '<digits>k' token from its URL and substitute it into http_url.
# NOTE(review): http_url is assigned outside the visible lines, as is the
# handling of a missing bitrate (default=None).
95 for m3u8_format
in m3u8_formats
:
96 bitrate
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None)
99 bitrate_url
= re
.sub(r
'\d+k', bitrate
, http_url
)
# The derived URL is checked with _is_valid_url before being used.
100 if not self
._is
_valid
_url
(
101 bitrate_url
, video_id
, f
'{bitrate} bitrate'):
# The HLS format dict is copied and its format_id renamed from 'hls' to 'http'.
103 f
= m3u8_format
.copy()
106 'format_id': m3u8_format
['format_id'].replace('hls', 'http'),
# NOTE(review): the body of this audio-less check is not visible in this view.
109 if f
.get('acodec') == 'none':
# A separate audio download URL, if any, is exposed with format_id 'audio'.
113 audio_download
= talk_info
.get('audioDownload')
116 'url': audio_download
,
117 'format_id': 'audio',
# Externally hosted talks: use external['code'] when the service is YouTube
# (case-insensitive), otherwise external['uri'], and delegate via url_result.
# NOTE(review): the condition guarding this return is not visible in this view.
122 external
= player_data
.get('external') or {}
123 service
= external
.get('service') or ''
124 ext_url
= external
.get('code') if service
.lower() == 'youtube' else None
125 return self
.url_result(ext_url
or external
['uri'])
# Thumbnail: player_data['thumb'], falling back to the page's og:image.
127 thumbnail
= player_data
.get('thumb') or self
._og
_search
_property
('image', webpage
)
129 # trim thumbnail resize parameters
130 thumbnail
= thumbnail
.split('?')[0]
# Result fields: talk_info values are preferred, with OpenGraph page metadata
# as the fallback for title, description and duration.
134 'title': talk_info
.get('title') or self
._og
_search
_title
(webpage
),
135 'uploader': talk_info
.get('presenterDisplayName'),
136 'thumbnail': thumbnail
,
137 'description': talk_info
.get('description') or self
._og
_search
_description
(webpage
),
138 'subtitles': subtitles
,
140 'duration': talk_info
.get('duration') or parse_duration(self
._og
_search
_property
('video:duration', webpage
)),
141 'view_count': str_to_int(talk_info
.get('viewedCount')),
142 'upload_date': unified_strdate(talk_info
.get('publishedAt')),
143 'release_date': unified_strdate(talk_info
.get('recordedOn')),
144 'tags': try_get(player_data
, lambda x
: x
['targeting']['tag'].split(',')),
148 class TedSeriesIE(TedBaseIE
):
149 _VALID_URL
= fr
'{TedBaseIE._VALID_URL_BASE.format(type=r"series")}(?:#season_(?P<season>\d+))?'
151 'url': 'https://www.ted.com/series/small_thing_big_idea',
154 'title': 'Small Thing Big Idea',
155 'series': 'Small Thing Big Idea',
156 'description': 'md5:6869ca52cec661aef72b3e9f7441c55c',
158 'playlist_mincount': 16,
160 'url': 'https://www.ted.com/series/the_way_we_work#season_2',
163 'title': 'The Way We Work Season 2',
164 'series': 'The Way We Work',
165 'description': 'md5:59469256e533e1a48c4aa926a382234c',
168 'playlist_mincount': 8,
# Extract a TED series page (optionally narrowed to one season) as a playlist.
171 def _real_extract(self
, url
):
# 'id' is the series slug; 'season' is the digits of an optional '#season_<n>'
# URL fragment, so it is a string or None.
172 display_id
, season
= self
._match
_valid
_url
(url
).group('id', 'season')
173 webpage
= self
._download
_webpage
(url
, display_id
, 'Downloading series webpage')
# Series data is read from props.pageProps of the page's Next.js data.
174 info
= self
._search
_nextjs
_data
(webpage
, display_id
)['props']['pageProps']
# Lazily chain the video entries of every season, or only of the season whose
# 'seasonNumber' equals the requested one.
# NOTE(review): `season` is a regex-captured string; if 'seasonNumber' is an
# int in the payload this equality never matches — verify against real data.
176 entries
= itertools
.chain
.from_iterable(
177 self
._parse
_playlist
(s
) for s
in info
['seasons'] if season
in [None, s
.get('seasonNumber')])
# The series id and name come from info['series']; the name falls back to the
# page's OpenGraph title (non-fatal).
179 series_id
= try_get(info
, lambda x
: x
['series']['id'])
180 series_name
= try_get(info
, lambda x
: x
['series']['name']) or self
._og
_search
_title
(webpage
, fatal
=False)
# The playlist id and title get a season suffix when a season was requested.
# NOTE(review): the first argument to playlist_result is not visible in this
# view — presumably `entries`; confirm against the full file.
182 return self
.playlist_result(
184 f
'{series_id}_{season}' if season
and series_id
else series_id
,
185 f
'{series_name} Season {season}' if season
else series_name
,
186 self
._og
_search
_description
(webpage
),
187 series
=series_name
, season_number
=int_or_none(season
))
190 class TedPlaylistIE(TedBaseIE
):
191 _VALID_URL
= TedBaseIE
._VALID
_URL
_BASE
.format(type=r
'playlists(?:/\d+)?')
193 'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
196 'title': 'The most popular talks of all time',
197 'description': 'md5:d2f22831dc86c7040e733a3cb3993d78',
199 'playlist_mincount': 25,
def _real_extract(self, url):
    """Extract a TED playlist page as a playlist result.

    The playlist record is read from ``props.pageProps.playlist`` of the
    page's Next.js data. Its ``id`` and ``title`` are used directly. When
    the title is missing, the OpenGraph title is used with the
    ``' | TED Talks'`` suffix removed, or ``None`` if that leaves nothing.
    The description comes from the page's OpenGraph metadata.
    """
    slug = self._match_id(url)
    page = self._download_webpage(url, slug)
    next_data = self._search_nextjs_data(page, slug)
    playlist = next_data['props']['pageProps']['playlist']

    entries = self._parse_playlist(playlist)
    playlist_id = playlist.get('id')

    title = playlist.get('title')
    if not title:
        # Fall back to the page title, without the site-wide suffix.
        title = self._og_search_title(page, default='').replace(' | TED Talks', '') or None

    description = self._og_search_description(page)
    return self.playlist_result(entries, playlist_id, title, description)
213 class TedEmbedIE(InfoExtractor
):
214 _VALID_URL
= r
'https?://embed(?:-ssl)?\.ted\.com/'
215 _EMBED_REGEX
= [rf
'<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL}.+?)\1']
218 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
222 'title': 'How to get serious about diversity and inclusion in the workplace',
223 'description': 'md5:0978aafe396e05341f8ecc795d22189d',
226 'uploader': 'Janet Stovall',
228 'upload_date': '20180822',
229 'release_date': '20180719',
230 'thumbnail': r
're:http.*\.jpg',
def _real_extract(self, url):
    """Hand an embed URL over to the talk extractor.

    The ``://embed`` or ``://embed-ssl`` host prefix is rewritten to
    ``://www``. The resulting URL is returned as a URL result tagged with
    ``TedTalkIE``'s extractor key.
    """
    talk_url = re.sub(r'://embed(-ssl)?', '://www', url)
    return self.url_result(talk_url, TedTalkIE.ie_key())