5 from .common
import InfoExtractor
18 class RadioFranceIE(InfoExtractor
):
19 _VALID_URL
= r
'https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
20 IE_NAME
= 'radiofrance'
23 'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
24 'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
28 'title': 'One to one',
29 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
30 'uploader': 'Thomas Hercouët',
34 def _real_extract(self
, url
):
35 m
= self
._match
_valid
_url
(url
)
36 video_id
= m
.group('id')
38 webpage
= self
._download
_webpage
(url
, video_id
)
39 title
= self
._html
_search
_regex
(r
'<h1>(.*?)</h1>', webpage
, 'title')
40 description
= self
._html
_search
_regex
(
41 r
'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
42 webpage
, 'description', fatal
=False)
43 uploader
= self
._html
_search
_regex
(
44 r
'<div class="credit"> © (.*?)</div>',
45 webpage
, 'uploader', fatal
=False)
47 formats_str
= self
._html
_search
_regex
(
48 r
'class="jp-jplayer[^"]*" data-source="([^"]+)">',
49 webpage
, 'audio URLs')
58 enumerate(re
.findall(r
"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str
))
65 'description': description
,
70 class RadioFranceBaseIE(InfoExtractor
):
71 _VALID_URL_BASE
= r
'https?://(?:www\.)?radiofrance\.fr'
73 _STATIONS_RE
= '|'.join(map(re
.escape
, (
def _extract_data_from_webpage(self, webpage, display_id, key):
    """Return the ``key`` payload from the ``const data = [...]`` script in ``webpage``.

    The page text is handed to ``self._search_json`` with the start pattern
    ``const data =`` and a ``[{...}]`` content pattern; ``js_to_json`` is
    supplied as the source transform, so the literal is presumably a
    JavaScript object rather than strict JSON (NOTE(review): confirm).
    ``key`` is also passed as the name argument of the search and
    ``display_id`` as its video id.

    From the parsed value, ``traverse_obj`` is asked for a dict located at
    ``<any element> -> 'data' -> key`` with ``get_all=False``.

    Returns that dict, or an empty dict when the lookup result is falsy.
    """
    page_data = self._search_json(
        r'\bconst\s+data\s*=', webpage, key, display_id,
        contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json)
    payload = traverse_obj(page_data, (..., 'data', key, {dict}), get_all=False)
    # Normalise "nothing found" to an empty dict so callers can use .get().
    return payload or {}
89 class FranceCultureIE(RadioFranceBaseIE
):
90 _VALID_URL
= rf
'''(?x)
91 {RadioFranceBaseIE._VALID_URL_BASE}
92 /(?:{RadioFranceBaseIE._STATIONS_RE})
93 /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
98 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
101 'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
103 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
104 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
105 'thumbnail': r
're:^https?://.*\.(?:jpg|png)',
106 'upload_date': '20220514',
111 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
114 'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
115 'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
116 'description': 'md5:36ee74351ede77a314fdebb94026b916',
117 'thumbnail': r
're:^https?://.*\.(?:jpg|png)',
118 'upload_date': '20230310',
124 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
125 'only_matching': True,
127 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
128 'only_matching': True,
132 def _real_extract(self
, url
):
133 video_id
, display_id
= self
._match
_valid
_url
(url
).group('id', 'display_id')
134 webpage
= self
._download
_webpage
(url
, display_id
)
136 # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
137 video_data
= self
._search
_json
('', webpage
, 'audio data', display_id
, contains_pattern
=r
'{\s*"@type"\s*:\s*"AudioObject".+}')
141 'display_id': display_id
,
142 'url': video_data
['contentUrl'],
143 'vcodec': 'none' if video_data
.get('encodingFormat') == 'mp3' else None,
144 'duration': parse_duration(video_data
.get('duration')),
145 'title': self
._html
_search
_regex
(r
'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
146 webpage
, 'title', default
=self
._og
_search
_title
(webpage
)),
147 'description': self
._html
_search
_regex
(
148 r
'(?s)<meta name="description"\s*content="([^"]+)', webpage
, 'description', default
=None),
149 'thumbnail': self
._og
_search
_thumbnail
(webpage
),
150 'uploader': self
._html
_search
_regex
(
151 r
'(?s)<span class="author">(.*?)</span>', webpage
, 'uploader', default
=None),
152 'upload_date': unified_strdate(self
._search
_regex
(
153 r
'"datePublished"\s*:\s*"([^"]+)', webpage
, 'timestamp', fatal
=False)),
157 class RadioFranceLiveIE(RadioFranceBaseIE
):
158 _VALID_URL
= rf
'''(?x)
159 https?://(?:www\.)?radiofrance\.fr
160 /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
161 /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
165 'url': 'https://www.radiofrance.fr/franceinter/',
169 'live_status': 'is_live',
173 'skip_download': 'Livestream',
176 'url': 'https://www.radiofrance.fr/franceculture',
178 'id': 'franceculture',
180 'live_status': 'is_live',
184 'skip_download': 'Livestream',
187 'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
189 'id': 'mouv-radio-musique-kids-family',
191 'live_status': 'is_live',
195 'skip_download': 'Livestream',
198 'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
200 'id': 'mouv-radio-rnb-soul',
202 'live_status': 'is_live',
206 'skip_download': 'Livestream',
209 'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
211 'id': 'mouv-radio-musique-mix',
213 'live_status': 'is_live',
217 'skip_download': 'Livestream',
220 'url': 'https://www.radiofrance.fr/fip/radio-rock',
222 'id': 'fip-radio-rock',
224 'live_status': 'is_live',
228 'skip_download': 'Livestream',
231 'url': 'https://www.radiofrance.fr/mouv',
232 'only_matching': True,
235 def _real_extract(self
, url
):
236 station_id
, substation_id
= self
._match
_valid
_url
(url
).group('id', 'substation_id')
239 webpage
= self
._download
_webpage
(url
, station_id
)
240 api_response
= self
._extract
_data
_from
_webpage
(webpage
, station_id
, 'webRadioData')
242 api_response
= self
._download
_json
(
243 f
'https://www.radiofrance.fr/{station_id}/api/live', station_id
)
245 formats
, subtitles
= [], {}
246 for media_source
in traverse_obj(api_response
, (('now', None), 'media', 'sources', lambda _
, v
: v
['url'])):
247 if media_source
.get('format') == 'hls':
248 fmts
, subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(media_source
['url'], station_id
, fatal
=False)
250 self
._merge
_subtitles
(subs
, target
=subtitles
)
253 'url': media_source
['url'],
254 'abr': media_source
.get('bitrate'),
258 'id': join_nonempty(station_id
, substation_id
),
259 'title': traverse_obj(api_response
, ('visual', 'legend')) or join_nonempty(
260 ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict
=api_response
, delim
=' - '),
262 'subtitles': subtitles
,
267 class RadioFrancePlaylistBaseIE(RadioFranceBaseIE
):
268 """Subclasses must set _METADATA_KEY"""
def _call_api(self, content_id, cursor, page_num):
    """Placeholder for the paginated API request; always raises.

    Concrete playlist extractors override this method. The base version
    ignores all three arguments.

    Raises:
        NotImplementedError: unconditionally.
    """
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)
273 def _generate_playlist_entries(self
, content_id
, content_response
):
274 for page_num
in itertools
.count(2):
275 for entry
in content_response
['items']:
276 yield self
.url_result(
277 f
'https://www.radiofrance.fr/{entry["path"]}', url_transparent
=True, **traverse_obj(entry
, {
279 'description': 'standFirst',
280 'timestamp': ('publishedDate', {int_or_none}
),
281 'thumbnail': ('visual', 'src'),
284 next_cursor
= traverse_obj(content_response
, (('pagination', None), 'next'), get_all
=False)
288 content_response
= self
._call
_api
(content_id
, next_cursor
, page_num
)
290 def _real_extract(self
, url
):
291 display_id
= self
._match
_id
(url
)
293 metadata
= self
._download
_json
(
294 'https://www.radiofrance.fr/api/v2.1/path', display_id
,
295 query
={'value': urllib
.parse
.urlparse(url
).path
})['content']
297 content_id
= metadata
['id']
299 return self
.playlist_result(
300 self
._generate
_playlist
_entries
(content_id
, metadata
[self
._METADATA
_KEY
]), content_id
,
301 display_id
=display_id
, **{**traverse_obj(metadata
, {
303 'description': 'standFirst',
304 'thumbnail': ('visual', 'src'),
305 }), **traverse_obj(metadata
, {
307 'description': 'role',
311 class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE
):
312 _VALID_URL
= rf
'''(?x)
313 {RadioFranceBaseIE._VALID_URL_BASE}
314 /(?:{RadioFranceBaseIE._STATIONS_RE})
315 /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
319 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
321 'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
322 'display_id': 'le-billet-vert',
323 'title': 'Le billet sciences',
324 'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
325 'thumbnail': r
're:^https?://.*\.(?:jpg|png)',
327 'playlist_mincount': 11,
329 'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
331 'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
332 'display_id': 'jean-marie-le-pen-l-obsession-nationale',
333 'title': 'Jean-Marie Le Pen, l\'obsession nationale',
334 'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
335 'thumbnail': r
're:^https?://.*\.(?:jpg|png)',
339 'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
341 'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
342 'display_id': 'serie-thomas-grjebine',
343 'title': 'Thomas Grjebine',
347 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
349 'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
350 'display_id': 'certains-l-aiment-fip',
351 'title': 'Certains l’aiment Fip',
352 'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
353 'thumbnail': r
're:^https?://.*\.(?:jpg|png)',
355 'playlist_mincount': 321,
357 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
358 'only_matching': True,
360 'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
361 'only_matching': True,
364 _METADATA_KEY
= 'expressions'
def _call_api(self, podcast_id, cursor, page_num):
    """Download one page of a podcast's expressions as JSON.

    ``podcast_id`` is interpolated into the API URL and also passed as the
    video id of the request. ``cursor`` is sent as the ``pageCursor`` query
    parameter. ``page_num`` only appears in the progress note.

    Returns whatever ``self._download_json`` returns for the request.
    """
    api_url = f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions'
    progress_note = f'Downloading page {page_num}'
    return self._download_json(
        api_url, podcast_id, note=progress_note, query={'pageCursor': cursor})
372 class RadioFranceProfileIE(RadioFrancePlaylistBaseIE
):
373 _VALID_URL
= rf
'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
376 'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
378 'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
379 'display_id': 'thomas-pesquet',
380 'title': 'Thomas Pesquet',
381 'description': 'Astronaute à l\'agence spatiale européenne',
383 'playlist_mincount': 212,
385 'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
387 'id': '9593050b-0183-4972-a0b5-d8f699079e02',
388 'display_id': 'eugenie-bastie',
389 'title': 'Eugénie Bastié',
390 'description': 'Journaliste et essayiste',
391 'thumbnail': r
're:^https?://.*\.(?:jpg|png)',
393 'playlist_mincount': 39,
395 'url': 'https://www.radiofrance.fr/personnes/lea-salame',
396 'only_matching': True,
399 _METADATA_KEY
= 'documents'
401 def _call_api(self
, profile_id
, cursor
, page_num
):
402 resp
= self
._download
_json
(
403 f
'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id
,
404 note
=f
'Downloading page {page_num}', query
={
405 'relation': 'personality',
409 resp
['next'] = traverse_obj(resp
, ('pagination', 'next'))
413 class RadioFranceProgramScheduleIE(RadioFranceBaseIE
):
414 _VALID_URL
= rf
'''(?x)
415 {RadioFranceBaseIE._VALID_URL_BASE}
416 /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
417 /grille-programmes(?:\?date=(?P<date>[\d-]+))?
421 'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
423 'id': 'franceinter-program-20230217',
424 'upload_date': '20230217',
426 'playlist_count': 25,
428 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
430 'id': 'franceculture-program-20230201',
431 'upload_date': '20230201',
433 'playlist_count': 25,
435 'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
437 'id': 'mouv-program-20230319',
438 'upload_date': '20230319',
442 'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
444 'id': 'francemusique-program-20230318',
445 'upload_date': '20230318',
447 'playlist_count': 15,
449 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
450 'only_matching': True,
def _generate_playlist_entries(self, webpage_url, api_response):
    """Yield one url-transparent result per schedule step that has an expression path.

    Steps are taken from ``api_response['steps']``; the filter indexes
    ``['expression']['path']`` on each step, so steps lacking that path are
    presumably dropped by ``traverse_obj`` (NOTE(review): confirm).

    Each entry URL is the step's expression path joined onto
    ``webpage_url`` and is routed to ``FranceCultureIE``. Title, thumbnail,
    timestamp, series id and series name are copied from the step as
    metadata for the result.
    """
    scheduled_steps = traverse_obj(
        api_response, ('steps', lambda _, v: v['expression']['path']))
    for step in scheduled_steps:
        episode_url = urljoin(webpage_url, f'/{step["expression"]["path"]}')
        step_metadata = traverse_obj(step, {
            'title': ('expression', 'title'),
            'thumbnail': ('expression', 'visual', 'src'),
            'timestamp': ('startTime', {int_or_none}),
            'series_id': ('concept', 'id'),
            'series': ('concept', 'title'),
        })
        yield self.url_result(
            episode_url, ie=FranceCultureIE, url_transparent=True, **step_metadata)
def _real_extract(self, url):
    """Build a playlist from the programme grid embedded in a schedule page.

    The station name is taken from the URL match and used as the id for
    the page download and the data search. The ``'grid'`` payload is read
    from the downloaded page and its ``'date'`` value is formatted as
    ``%Y%m%d`` through ``strftime_or_none``.

    The playlist id is ``join_nonempty(station, 'program', upload_date)``,
    and ``upload_date`` is also passed through to ``playlist_result``.
    """
    # Only the station group is needed here; the page itself is fetched
    # from the unmodified ``url``.
    station = self._match_valid_url(url).group('station')
    schedule_page = self._download_webpage(url, station)
    grid_data = self._extract_data_from_webpage(schedule_page, station, 'grid')
    upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')

    entries = self._generate_playlist_entries(url, grid_data)
    playlist_id = join_nonempty(station, 'program', upload_date)
    return self.playlist_result(entries, playlist_id, upload_date=upload_date)