3 from .common
import InfoExtractor
class ArteTVBaseIE(InfoExtractor):
    """Common base holding the constants shared by the Arte extractors."""

    # Root of the player API; the extractors append '/config/...' or
    # '/playlist/...' paths to it.
    _API_BASE = 'https://api.arte.tv/api/player/v2'

    # Regex alternation of the language codes accepted in arte.tv URLs;
    # it is interpolated into the _VALID_URL patterns of the subclasses.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
22 class ArteTVIE(ArteTVBaseIE
):
23 _VALID_URL
= rf
'''(?x)
26 (?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos|
27 api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>{ArteTVBaseIE._ARTE_LANGUAGES})
30 /(?P<id>\d{{6}}-\d{{3}}-[AF]|LIVE)
33 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
34 'only_matching': True,
36 'note': 'No alt_title',
37 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
38 'only_matching': True,
40 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
41 'only_matching': True,
43 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
44 'only_matching': True,
46 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
47 'only_matching': True,
49 'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/',
53 'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739',
54 'timestamp': 1713927600,
55 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530',
57 'title': 'La loi de Téhéran',
58 'upload_date': '20240424',
61 'fr-acc': 'mincount:1',
62 'fr-forced': 'mincount:1',
66 'note': 'age-restricted',
67 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
70 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
71 'title': 'The Element of Crime',
72 'timestamp': 1696111200,
74 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
75 'upload_date': '20230930',
78 'skip': '404 Not Found',
83 _LANG_MAP
= { # ISO639 -> French abbreviations
90 # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
91 # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
95 _VERSION_CODE_RE
= re
.compile(r
'''(?x)
97 (?P<original_voice>O?)
98 (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
103 (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
107 # all obtained by exhaustive testing
110 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
111 'PF', 'PM', 'RE', 'WF', 'YT',
113 # with both of the below 'BE' sometimes works, sometimes doesn't
115 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
116 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
120 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
121 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
122 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
123 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
124 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
125 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
130 def _fix_accessible_subs_locale(subs
):
132 for lang
, sub_formats
in subs
.items():
133 for fmt
in sub_formats
:
134 url
= fmt
.get('url') or ''
135 suffix
= ('acc' if url
.endswith('-MAL.m3u8')
136 else 'forced' if '_VO' not in url
138 updated_subs
.setdefault(join_nonempty(lang
, suffix
), []).append(fmt
)
141 def _real_extract(self
, url
):
142 mobj
= self
._match
_valid
_url
(url
)
143 video_id
= mobj
.group('id')
144 lang
= mobj
.group('lang') or mobj
.group('lang_2')
145 language_code
= self
._LANG
_MAP
.get(lang
)
147 config
= self
._download
_json
(f
'{self._API_BASE}/config/{lang}/{video_id}', video_id
, headers
={
148 'x-validated-age': '18',
151 geoblocking
= traverse_obj(config
, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
152 if geoblocking
.get('restrictedArea'):
153 raise GeoRestrictedError(f
'Video restricted to {geoblocking["code"]!r}',
154 countries
=self
._COUNTRIES
_MAP
.get(geoblocking
['code'], ('DE', 'FR')))
156 if not traverse_obj(config
, ('data', 'attributes', 'rights')):
157 # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
158 # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
159 raise ExtractorError(
160 'Video is not available in this language edition of Arte or broadcast rights expired', expected
=True)
162 formats
, subtitles
= [], {}
163 secondary_formats
= []
164 for stream
in config
['data']['attributes']['streams']:
165 # official player contains code like `e.get("versions")[0].eStat.ml5`
166 stream_version
= stream
['versions'][0]
167 stream_version_code
= stream_version
['eStat']['ml5']
170 m
= self
._VERSION
_CODE
_RE
.match(stream_version_code
)
172 lang_pref
= int(''.join('01'[x
] for x
in (
173 m
.group('vlang') == language_code
, # we prefer voice in the requested language
174 not m
.group('audio_desc'), # and not the audio description version
175 bool(m
.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
176 m
.group('sub_lang') == language_code
, # if subtitles are present, we prefer them in the requested language
177 not m
.group('has_sub'), # but we prefer no subtitles otherwise
178 not m
.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
181 short_label
= traverse_obj(stream_version
, 'shortLabel', expected_type
=str, default
='?')
182 if 'HLS' in stream
['protocol']:
183 fmts
, subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
184 stream
['url'], video_id
=video_id
, ext
='mp4', m3u8_id
=stream_version_code
, fatal
=False)
187 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
188 'language_preference': lang_pref
,
190 if any(map(short_label
.startswith
, ('cc', 'OGsub'))):
191 secondary_formats
.extend(fmts
)
194 subs
= self
._fix
_accessible
_subs
_locale
(subs
)
195 self
._merge
_subtitles
(subs
, target
=subtitles
)
197 elif stream
['protocol'] in ('HTTPS', 'RTMP'):
199 'format_id': f
'{stream["protocol"]}-{stream_version_code}',
200 'url': stream
['url'],
201 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
202 'language_preference': lang_pref
,
203 # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
207 self
.report_warning(f
'Skipping stream with unknown protocol {stream["protocol"]}')
209 formats
.extend(secondary_formats
)
210 self
._remove
_duplicate
_formats
(formats
)
212 metadata
= config
['data']['attributes']['metadata']
215 'id': metadata
['providerId'],
216 'webpage_url': traverse_obj(metadata
, ('link', 'url')),
217 'title': traverse_obj(metadata
, 'subtitle', 'title'),
218 'alt_title': metadata
.get('subtitle') and metadata
.get('title'),
219 'description': metadata
.get('description'),
220 'duration': traverse_obj(metadata
, ('duration', 'seconds')),
221 'language': metadata
.get('language'),
222 'timestamp': traverse_obj(config
, ('data', 'attributes', 'rights', 'begin'), expected_type
=parse_iso8601
),
223 'is_live': config
['data']['attributes'].get('live', False),
225 'subtitles': subtitles
,
227 {'url': image
['url'], 'id': image
.get('caption')}
228 for image
in metadata
.get('images') or [] if url_or_none(image
.get('url'))
230 # TODO: chapters may also be in stream['segments']?
231 'chapters': traverse_obj(config
, ('data', 'attributes', 'chapters', 'elements', ..., {
232 'start_time': 'startTime',
238 class ArteTVEmbedIE(InfoExtractor
):
239 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
240 _EMBED_REGEX
= [r
'<(?:iframe|script)[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?arte\
.tv
/player
/v\d
+/index\
.php
\?.*?
\bjson
_url
=.+?
)\
1']
242 'url
': 'https
://www
.arte
.tv
/player
/v5
/index
.php?json_url
=https
%3A
%2F
%2Fapi
.arte
.tv
%2Fapi
%2Fplayer
%2Fv2
%2Fconfig
%2Fde
%2F100605
-013-A
&lang
=de
&autoplay
=true
&mute
=0100605-013-A
',
244 'id': '100605-013-A
',
246 'title
': 'United we Stream November Lockdown Edition
#13',
247 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
248 'upload_date': '20201116',
250 'skip': 'No video available',
252 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
253 'only_matching': True,
256 def _real_extract(self
, url
):
258 json_url
= qs
['json_url'][0]
259 video_id
= ArteTVIE
._match
_id
(json_url
)
260 return self
.url_result(
261 json_url
, ie
=ArteTVIE
.ie_key(), video_id
=video_id
)
264 class ArteTVPlaylistIE(ArteTVBaseIE
):
265 _VALID_URL
= rf
'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>RC-\d{{6}})'
267 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
268 'only_matching': True,
270 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
271 'playlist_mincount': 100,
273 'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
275 'title': 'ARTE Reportage - najlepsze reportaże',
def _real_extract(self, url):
    """Turn an Arte collection URL into a playlist result.

    The playlist document is fetched from the player API and every item is
    emitted as a ``url_transparent`` entry pointing at ArteTVIE, with the
    metadata available in the listing attached to the entry.
    """
    mobj = self._match_valid_url(url)
    lang = mobj.group('lang')
    playlist_id = mobj.group('id')

    api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
    response = self._download_json(api_url, playlist_id)
    playlist = response['data']['attributes']

    # The filter selects items whose ['config']['url'] is truthy, since that
    # URL is what each entry hands over to ArteTVIE.
    videos = traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))

    # NOTE(review): the line binding ``entries`` was not visible in the
    # reviewed text; it is reconstructed from its use below - confirm.
    entries = []
    for video in videos:
        thumbnail = traverse_obj(video, ('mainImage', 'url'))
        seconds = traverse_obj(video, ('duration', 'seconds'))
        entries.append({
            '_type': 'url_transparent',
            'url': video['config']['url'],
            'ie_key': ArteTVIE.ie_key(),
            'id': video.get('providerId'),
            'title': video.get('title'),
            'alt_title': video.get('subtitle'),
            'thumbnail': url_or_none(thumbnail),
            'duration': int_or_none(seconds),
        })

    playlist_title = traverse_obj(playlist, ('metadata', 'title'))
    playlist_description = traverse_obj(playlist, ('metadata', 'description'))
    return self.playlist_result(
        entries, playlist_id, playlist_title, playlist_description)
300 class ArteTVCategoryIE(ArteTVBaseIE
):
301 _VALID_URL
= rf
'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$'
303 'url': 'https://www.arte.tv/en/videos/politics-and-society/',
305 'id': 'politics-and-society',
306 'title': 'Politics and society',
307 'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
309 'playlist_mincount': 13,
def suitable(cls, url):
    """Return whether this extractor should handle *url*.

    The category pattern is broad enough to also match single-video and
    collection URLs, so anything ArteTVIE or ArteTVPlaylistIE accepts is
    rejected here before falling back to the regular check.
    """
    # NOTE(review): takes ``cls``, so presumably wrapped by a classmethod
    # decorator on the preceding line, which is not visible here - confirm.
    for ie in (ArteTVIE, ArteTVPlaylistIE):
        if ie.suitable(url):
            return False
    return super().suitable(url)
318 def _real_extract(self
, url
):
319 lang
, playlist_id
= self
._match
_valid
_url
(url
).groups()
320 webpage
= self
._download
_webpage
(url
, playlist_id
)
323 for video
in re
.finditer(
324 rf
'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/{lang}/videos/[\w/-]+)(?P=q)',
326 video
= video
.group('url')
329 if any(ie
.suitable(video
) for ie
in (ArteTVIE
, ArteTVPlaylistIE
)):
332 title
= strip_or_none(self
._generic
_title
('', webpage
, default
='').rsplit('|', 1)[0]) or None
334 return self
.playlist_from_matches(items
, playlist_id
=playlist_id
, playlist_title
=title
,
335 description
=self
._og
_search
_description
(webpage
, default
=None))