[ie/youtube] Add age-gate workaround for some embeddable videos (#11821)
[yt-dlp.git] / yt_dlp / extractor / arte.py
blob142d4b066b76c0d576da94ebd57445659e9e761d
1 import re
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 join_nonempty,
9 parse_iso8601,
10 parse_qs,
11 strip_or_none,
12 traverse_obj,
13 url_or_none,
class ArteTVBaseIE(InfoExtractor):
    # Constants shared by the ARTE extractors defined in this module.

    # Regex alternation of the language codes accepted in ARTE URLs;
    # it is interpolated into the `_VALID_URL` patterns of the subclasses.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'

    # Root of the player API; the config and playlist endpoints are built from it.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
class ArteTVIE(ArteTVBaseIE):
    # Accepts arte.tv video pages, player-API config URLs and `arte://program` URLs.
    # The language is captured as `lang` (website URLs) or `lang_2` (API URLs).
    _VALID_URL = rf'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>{ArteTVBaseIE._ARTE_LANGUAGES})
                        )
                    |arte://program)
                        /(?P<id>\d{{6}}-\d{{3}}-[AF]|LIVE)
                    '''
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/',
        'info_dict': {
            'id': '109067-000-A',
            'ext': 'mp4',
            'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739',
            'timestamp': 1713927600,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530',
            'duration': 7599,
            'title': 'La loi de Téhéran',
            'upload_date': '20240424',
            'subtitles': {
                'fr': 'mincount:1',
                'fr-acc': 'mincount:1',
                'fr-forced': 'mincount:1',
            },
        },
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        },
        'skip': '404 Not Found',
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream version code (the `eStat.ml5` value of a stream version).
    # The subtitle part is optional: `_real_extract` relies on `has_sub` being
    # unset to rank streams without subtitles higher.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    @staticmethod
    def _fix_accessible_subs_locale(subs):
        """Regroup subtitle tracks under language keys that encode their variant.

        A track whose URL ends with '-MAL.m3u8' gets the suffix 'acc', a track whose
        URL does not contain '_VO' gets the suffix 'forced', and any other track keeps
        the plain language key. A new dict is returned; `subs` itself is not modified.
        """
        updated_subs = {}
        for lang, sub_formats in subs.items():
            for fmt in sub_formats:
                url = fmt.get('url') or ''
                suffix = ('acc' if url.endswith('-MAL.m3u8')
                          else 'forced' if '_VO' not in url
                          else None)
                updated_subs.setdefault(join_nonempty(lang, suffix), []).append(fmt)
        return updated_subs

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict from it.

        Raises GeoRestrictedError when the config marks the video as restricted to
        an area, and an expected ExtractorError when the config has no `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        # NOTE(review): the header presumably pre-confirms the age check for age-restricted videos - confirm
        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
            'x-validated-age': '18',
        })

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read the code defensively: a missing key must not hide the geo-restriction behind a KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(f'Video restricted to {geo_code!r}',
                                     countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # Formats of streams whose short label starts with 'cc' or 'OGsub'; appended after all other formats
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                # Each criterion becomes one 0/1 digit, most important first, so that
                # comparing the resulting integers compares the criteria in that order
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,      # we prefer voice in the requested language
                    not m.group('audio_desc'),              # and not the audio description version
                    bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            if 'HLS' in stream['protocol']:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                subs = self._fix_accessible_subs_locale(subs)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # Prefer the subtitle as title; the main title then moves to `alt_title`
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
class ArteTVEmbedIE(InfoExtractor):
    # Embedded player pages; the real config URL travels in the `json_url` query parameter.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to ArteTVIE using the config URL found in the `json_url` query parameter."""
        config_url = parse_qs(url)['json_url'][0]
        # The video ID is derived from the config URL with ArteTVIE's own URL pattern
        return self.url_result(
            config_url, ie=ArteTVIE.ie_key(), video_id=ArteTVIE._match_id(config_url))
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Collection pages, identified by an `RC-` prefixed six-digit ID.
    _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>RC-\d{{6}})'
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return one url_transparent entry per item."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        attributes = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # The filter only lets through items that carry a truthy config URL
        for item in traverse_obj(attributes, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': item['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': item.get('providerId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(item, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(item, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(attributes, ('metadata', 'title'))
        playlist_description = traverse_obj(attributes, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
class ArteTVCategoryIE(ArteTVBaseIE):
    # Any other path below /<lang>/videos/ is treated as a category listing page.
    _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$'
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the video or playlist extractor accepts; otherwise apply the normal check."""
        if any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE)):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Collect the video and playlist links found on the category page into a playlist."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchor tags linking to www.arte.tv video pages of the same language edition
        link_re = rf'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/{lang}/videos/[\w/-]+)(?P=q)'
        candidates = (m.group('url') for m in re.finditer(link_re, webpage))
        # Drop links back to this page and anything neither extractor accepts
        items = [
            link for link in candidates
            if link != url and any(ie.suitable(link) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # Keep only the part of the page title before the last '|'
        title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))