[ie/youtube] Add age-gate workaround for some embeddable videos (#11821)
[yt-dlp.git] / yt_dlp / extractor / radiofrance.py
blob9d904398418eb459b5db4ef6e1c23b071d02a670
1 import itertools
2 import re
3 import urllib.parse
5 from .common import InfoExtractor
6 from ..utils import (
7 int_or_none,
8 join_nonempty,
9 js_to_json,
10 parse_duration,
11 strftime_or_none,
12 traverse_obj,
13 unified_strdate,
14 urljoin,
class RadioFranceIE(InfoExtractor):
    """Extractor for audio pages under maison.radiofrance.fr/radiovisions."""

    _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    _TEST = {
        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
        'info_dict': {
            'id': 'one-one',
            'ext': 'ogg',
            'title': 'One to one',
            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            'uploader': 'Thomas Hercouët',
        },
    }

    def _real_extract(self, url):
        """Scrape title, description, uploader and audio formats from the page HTML.

        The title and the player's ``data-source`` attribute are required;
        description and uploader are looked up with ``fatal=False``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            webpage, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            webpage, 'uploader', fatal=False)

        # The jPlayer element lists its sources as "name: 'url'" pairs
        formats_str = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            webpage, 'audio URLs')
        source_pairs = re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)

        # The position in the source list doubles as the quality rank
        formats = [{
            'format_id': format_id,
            'url': format_url,
            'vcodec': 'none',
            'quality': rank,
        } for rank, (format_id, format_url) in enumerate(source_pairs)]

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }
class RadioFranceBaseIE(InfoExtractor):
    """Shared URL fragments and page-data helper for the radiofrance.fr extractors."""

    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'

    # Regex alternation of the station slugs, each escaped for use in _VALID_URL patterns
    _STATIONS_RE = '|'.join(re.escape(station) for station in (
        'franceculture',
        'franceinfo',
        'franceinter',
        'francemusique',
        'fip',
        'mouv',
    ))

    def _extract_data_from_webpage(self, webpage, display_id, key):
        """Return the first dict stored under ``data -> key`` in the page's ``const data = [...]`` JS.

        The JS array literal is converted with ``js_to_json`` before parsing.
        An empty dict is returned when no matching dict is found.
        """
        page_data = self._search_json(
            r'\bconst\s+data\s*=', webpage, key, display_id,
            contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json)
        found = traverse_obj(page_data, (..., 'data', key, {dict}), get_all=False)
        return found or {}
class FranceCultureIE(RadioFranceBaseIE):
    """Extractor for single podcast episode pages of the radiofrance.fr stations."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
            'info_dict': {
                'id': '2107675',
                'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
                'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
                'description': 'md5:36ee74351ede77a314fdebb94026b916',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20230310',
                'duration': 8977,
                'ext': 'mp3',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
            'only_matching': True,
        }, {
            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict from the page's ``AudioObject`` JSON plus HTML metadata.

        ``contentUrl`` is required in the JSON object; the other fields are
        optional lookups in the JSON or the page markup.
        """
        mobj = self._match_valid_url(url)
        video_id, display_id = mobj.group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')

        # Fields are resolved in the same order as they appear in the result
        audio_url = video_data['contentUrl']
        vcodec = 'none' if video_data.get('encodingFormat') == 'mp3' else None
        duration = parse_duration(video_data.get('duration'))
        title = self._html_search_regex(
            r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
            webpage, 'title', default=self._og_search_title(webpage))
        description = self._html_search_regex(
            r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = self._html_search_regex(
            r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None)
        date_published = self._search_regex(
            r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)

        return {
            'id': video_id,
            'display_id': display_id,
            'url': audio_url,
            'vcodec': vcodec,
            'duration': duration,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': unified_strdate(date_published),
        }
class RadioFranceLiveIE(RadioFranceBaseIE):
    """Extractor for the live streams of radiofrance.fr stations and their web radios."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/',
        'info_dict': {
            'id': 'franceinter',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/franceculture',
        'info_dict': {
            'id': 'franceculture',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
        'info_dict': {
            'id': 'mouv-radio-musique-kids-family',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
        'info_dict': {
            'id': 'mouv-radio-rnb-soul',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
        'info_dict': {
            'id': 'mouv-radio-musique-mix',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/fip/radio-rock',
        'info_dict': {
            'id': 'fip-radio-rock',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect the live formats for a station, or for one of its web radios.

        A plain station URL is resolved through the station's ``/api/live``
        endpoint; a ``radio-*`` sub-station URL reads the ``webRadioData``
        object embedded in the page instead.
        """
        station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')

        if not substation_id:
            api_response = self._download_json(
                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
        else:
            webpage = self._download_webpage(url, station_id)
            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')

        formats, subtitles = [], {}
        # Sources are looked up both under 'now' and at the top level of the response;
        # only entries carrying a 'url' are kept
        sources_path = (('now', None), 'media', 'sources', lambda _, v: v['url'])
        for media_source in traverse_obj(api_response, sources_path):
            source_url = media_source['url']
            if media_source.get('format') != 'hls':
                formats.append({
                    'url': source_url,
                    'abr': media_source.get('bitrate'),
                })
                continue
            fmts, subs = self._extract_m3u8_formats_and_subtitles(source_url, station_id, fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        # Prefer the visual legend; otherwise join the two "now playing" lines
        title = traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
            ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'),
            from_dict=api_response, delim=' - ')

        return {
            'id': join_nonempty(station_id, substation_id),
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
    """Subclasses must set _METADATA_KEY"""

    def _call_api(self, content_id, cursor, page_num):
        """Fetch the page of items identified by ``cursor``; implemented by subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _generate_playlist_entries(self, content_id, content_response):
        """Yield a url_transparent result per item, following pagination cursors.

        ``content_response`` is the first page; further pages are requested
        through ``_call_api`` (numbered from 2) until no ``next`` cursor is found.
        """
        for page_num in itertools.count(2):
            for item in content_response['items']:
                item_url = f'https://www.radiofrance.fr/{item["path"]}'
                item_fields = traverse_obj(item, {
                    'title': 'title',
                    'description': 'standFirst',
                    'timestamp': ('publishedDate', {int_or_none}),
                    'thumbnail': ('visual', 'src'),
                })
                yield self.url_result(item_url, url_transparent=True, **item_fields)

            # The cursor may sit under 'pagination' or at the top level of the response
            next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
            if not next_cursor:
                return

            content_response = self._call_api(content_id, next_cursor, page_num)

    def _real_extract(self, url):
        """Resolve the URL path through the ``path`` API and return its items as a playlist."""
        display_id = self._match_id(url)

        metadata = self._download_json(
            'https://www.radiofrance.fr/api/v2.1/path', display_id,
            query={'value': urllib.parse.urlparse(url).path})['content']

        content_id = metadata['id']
        entries = self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY])

        playlist_fields = traverse_obj(metadata, {
            'title': 'title',
            'description': 'standFirst',
            'thumbnail': ('visual', 'src'),
        })
        # 'name' / 'role' take precedence over 'title' / 'standFirst' when present
        playlist_fields.update(traverse_obj(metadata, {
            'title': 'name',
            'description': 'role',
        }))

        return self.playlist_result(entries, content_id, display_id=display_id, **playlist_fields)
class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
    """Playlist extractor for podcast overview pages of the radiofrance.fr stations."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
        'info_dict': {
            'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
            'display_id': 'le-billet-vert',
            'title': 'Le billet sciences',
            'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
        'info_dict': {
            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_count': 7,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
        'info_dict': {
            'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
            'display_id': 'serie-thomas-grjebine',
            'title': 'Thomas Grjebine',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
        'info_dict': {
            'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
            'display_id': 'certains-l-aiment-fip',
            'title': 'Certains l’aiment Fip',
            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 321,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
        'only_matching': True,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
        'only_matching': True,
    }]

    _METADATA_KEY = 'expressions'

    def _call_api(self, podcast_id, cursor, page_num):
        """Download the page of the podcast's ``expressions`` listing selected by ``cursor``."""
        endpoint = f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions'
        return self._download_json(
            endpoint, podcast_id,
            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
    """Playlist extractor for ``/personnes/<slug>`` profile pages on radiofrance.fr."""

    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
        'info_dict': {
            'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
            'display_id': 'thomas-pesquet',
            'title': 'Thomas Pesquet',
            'description': 'Astronaute à l\'agence spatiale européenne',
        },
        'playlist_mincount': 212,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
        'info_dict': {
            'id': '9593050b-0183-4972-a0b5-d8f699079e02',
            'display_id': 'eugenie-bastie',
            'title': 'Eugénie Bastié',
            'description': 'Journaliste et essayiste',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 39,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/lea-salame',
        'only_matching': True,
    }]

    _METADATA_KEY = 'documents'

    def _call_api(self, profile_id, cursor, page_num):
        """Download the page of the profile's ``documents`` listing selected by ``cursor``.

        The ``pagination -> next`` value is also copied to a top-level ``next``
        key of the returned response.
        """
        endpoint = f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents'
        request_query = {
            'relation': 'personality',
            'cursor': cursor,
        }
        page = self._download_json(
            endpoint, profile_id, note=f'Downloading page {page_num}', query=request_query)

        page['next'] = traverse_obj(page, ('pagination', 'next'))
        return page
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
    """Playlist extractor for a station's ``grille-programmes`` (programme grid) page."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
        /grille-programmes(?:\?date=(?P<date>[\d-]+))?
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
        'info_dict': {
            'id': 'franceinter-program-20230217',
            'upload_date': '20230217',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
        'info_dict': {
            'id': 'franceculture-program-20230201',
            'upload_date': '20230201',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
        'info_dict': {
            'id': 'mouv-program-20230319',
            'upload_date': '20230319',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
        'info_dict': {
            'id': 'francemusique-program-20230318',
            'upload_date': '20230318',
        },
        'playlist_count': 15,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
        'only_matching': True,
    }]

    def _generate_playlist_entries(self, webpage_url, api_response):
        """Yield a url_transparent FranceCultureIE result per grid step that has an expression path."""
        playable_steps = traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path']))
        for step in playable_steps:
            episode_url = urljoin(webpage_url, f'/{step["expression"]["path"]}')
            step_fields = traverse_obj(step, {
                'title': ('expression', 'title'),
                'thumbnail': ('expression', 'visual', 'src'),
                'timestamp': ('startTime', {int_or_none}),
                'series_id': ('concept', 'id'),
                'series': ('concept', 'title'),
            })
            yield self.url_result(
                episode_url, ie=FranceCultureIE, url_transparent=True, **step_fields)

    def _real_extract(self, url):
        """Read the ``grid`` object embedded in the page and return its steps as a playlist.

        The playlist id is built from the station, the literal ``program`` and
        the grid's date formatted as ``%Y%m%d``.
        """
        station = self._match_valid_url(url).group('station')
        webpage = self._download_webpage(url, station)
        grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
        upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')

        entries = self._generate_playlist_entries(url, grid_data)
        playlist_id = join_nonempty(station, 'program', upload_date)
        return self.playlist_result(entries, playlist_id, upload_date=upload_date)