[ie/dropout] Fix extraction (#12102)
[yt-dlp.git] / yt_dlp / extractor / svt.py
blobb5df2e1a18d3d15db5e209c00477c9ec9aa8b1f9
1 import json
2 import re
4 from .common import InfoExtractor
5 from ..utils import (
6 determine_ext,
7 dict_get,
8 int_or_none,
9 traverse_obj,
10 try_get,
11 unified_timestamp,
class SVTBaseIE(InfoExtractor):
    """Common base for the SVT extractors; turns SVT video metadata into an info dict."""

    _GEO_COUNTRIES = ['SE']

    def _extract_video(self, video_info, video_id):
        """Build an info dict from an SVT video metadata dict.

        video_info -- metadata dict; the keys read here are 'videoReferences',
                      'live'/'simulcast', 'rights', 'subtitles'/'subtitleReferences',
                      'title', 'programTitle', 'season', 'episodeTitle',
                      'episodeNumber', 'materialLength'/'contentDuration' and
                      'inappropriateForChildren'/'blockedForChildren'
        video_id   -- ID stored in the result and passed to the download helpers

        Returns a dict with id, title, formats, subtitles, duration, timestamp,
        age_limit, series, season_number, episode, episode_number and is_live.
        Reports a geo restriction when no formats were found and
        rights['geoBlockedSweden'] is set.
        """
        is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
        m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
        formats = []
        subtitles = {}
        # Tolerate a missing/null reference list so that the geo-restriction
        # check below is still reached instead of failing with a KeyError
        for vr in video_info.get('videoReferences') or []:
            if not isinstance(vr, dict):
                continue
            vurl = vr.get('url')
            if not vurl:
                # A reference without a URL cannot yield any format
                continue
            player_type = vr.get('playerType') or vr.get('format')
            ext = determine_ext(vurl)
            if ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    vurl, video_id,
                    ext='mp4', entry_protocol=m3u8_protocol,
                    m3u8_id=player_type, fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    vurl + '?hdcore=3.3.0', video_id,
                    f4m_id=player_type, fatal=False))
            elif ext == 'mpd':
                fmts, subs = self._extract_mpd_formats_and_subtitles(
                    vurl, video_id, mpd_id=player_type, fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                # Anything else is treated as a direct media URL
                formats.append({
                    'format_id': player_type,
                    'url': vurl,
                })

        rights = try_get(video_info, lambda x: x['rights'], dict) or {}
        if not formats and rights.get('geoBlockedSweden'):
            self.raise_geo_restricted(
                'This video is only available in Sweden',
                countries=self._GEO_COUNTRIES, metadata_available=True)

        subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
        if isinstance(subtitle_references, list):
            for sr in subtitle_references:
                if not isinstance(sr, dict):
                    continue
                subtitle_url = sr.get('url')
                # `or` (rather than a .get() default) also covers an explicit null language
                subtitle_lang = sr.get('language') or 'sv'
                if subtitle_url:
                    sub = {
                        'url': subtitle_url,
                    }
                    if determine_ext(subtitle_url) == 'm3u8':
                        # XXX: no way of testing, is it ever hit?
                        sub['ext'] = 'vtt'
                    subtitles.setdefault(subtitle_lang, []).append(sub)

        title = video_info.get('title')

        series = video_info.get('programTitle')
        season_number = int_or_none(video_info.get('season'))
        episode = video_info.get('episodeTitle')
        episode_number = int_or_none(video_info.get('episodeNumber'))

        timestamp = unified_timestamp(rights.get('validFrom'))
        duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
        # age_limit stays None unless one of the child-suitability flags is
        # present; an explicit false value maps to 0, a true value to 18
        age_limit = None
        adult = dict_get(
            video_info, ('inappropriateForChildren', 'blockedForChildren'),
            skip_false_values=False)
        if adult is not None:
            age_limit = 18 if adult else 0

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'duration': duration,
            'timestamp': timestamp,
            'age_limit': age_limit,
            'series': series,
            'season_number': season_number,
            'episode': episode,
            'episode_number': episode_number,
            'is_live': is_live,
        }
class SVTIE(SVTBaseIE):
    """Extractor for svt.se widget URLs of the form /wd?widgetId=...&articleId=..."""

    _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
    _EMBED_REGEX = [rf'(?:<iframe src|href)="(?P<url>{_VALID_URL}[^"]*)"']
    _TEST = {
        'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
        'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
        'info_dict': {
            'id': '2900353',
            'ext': 'mp4',
            'title': 'Stjärnorna skojar till det - under SVT-intervjun',
            'duration': 27,
            'age_limit': 0,
        },
    }

    def _real_extract(self, url):
        """Fetch the widget JSON and return the video info titled after the widget context."""
        widget_id, article_id = self._match_valid_url(url).group('widget_id', 'id')

        widget = self._download_json(
            f'http://www.svt.se/wd?widgetId={widget_id}&articleId={article_id}&format=json&type=embed&output=json',
            article_id)

        # The context title always replaces whatever title the video metadata carried
        return {
            **self._extract_video(widget['video'], article_id),
            'title': widget['context']['title'],
        }
class SVTPlayBaseIE(SVTBaseIE):
    # Matches an assignment such as `root["__svtplay"] = {...};` (any number of
    # leading underscores, either quote style) that is followed by a newline,
    # and captures the object literal in the named group `json`.
    _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n'
class SVTPlayIE(SVTPlayBaseIE):
    """Extractor for single videos on svtplay.se / oppetarkiv.se, barnplay pages and `svt:<id>` URLs."""

    IE_DESC = 'SVT Play and Öppet arkiv'
    _VALID_URL = r'''(?x)
                    (?:
                        (?:
                            svt:|
                            https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/
                        )
                        (?P<svt_id>[^/?#&]+)|
                        https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
                        (?:.*?(?:modalId|id)=(?P<modal_id>[\da-zA-Z-]+))?
                    )
                    '''
    _TESTS = [{
        'url': 'https://www.svtplay.se/video/30479064',
        'md5': '2382036fd6f8c994856c323fe51c426e',
        'info_dict': {
            'id': '8zVbDPA',
            'ext': 'mp4',
            'title': 'Designdrömmar i Stenungsund',
            'timestamp': 1615770000,
            'upload_date': '20210315',
            'duration': 3519,
            'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
            'age_limit': 0,
            'subtitles': {
                'sv': [{
                    'ext': 'vtt',
                }],
            },
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'skip': 'Episode is no longer available',
    }, {
        'url': 'https://www.svtplay.se/video/emBxBQj',
        'md5': '2382036fd6f8c994856c323fe51c426e',
        'info_dict': {
            'id': 'eyBd9aj',
            'ext': 'mp4',
            'title': '1. Farlig kryssning',
            'timestamp': 1491019200,
            'upload_date': '20170401',
            'duration': 2566,
            'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
            'age_limit': 0,
            'episode': '1. Farlig kryssning',
            'series': 'Rederiet',
            'subtitles': {
                'sv': 'count:3',
            },
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://www.svtplay.se/video/jz2rYz7/anders-hansen-moter/james-fallon?info=visa',
        'info_dict': {
            'id': 'jvXAGVb',
            'ext': 'mp4',
            'title': 'James Fallon',
            'timestamp': 1673917200,
            'upload_date': '20230117',
            'duration': 1081,
            'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
            'age_limit': 0,
            'episode': 'James Fallon',
            'series': 'Anders Hansen möter...',
        },
        'params': {
            'skip_download': 'dash',
        },
    }, {
        'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA',
        'only_matching': True,
    }, {
        'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa',
        'only_matching': True,
    }, {
        # geo restricted to Sweden
        'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
        'only_matching': True,
    }, {
        'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
        'only_matching': True,
    }, {
        'url': 'https://www.svtplay.se/kanaler/svt1',
        'only_matching': True,
    }, {
        'url': 'svt:1376446-003A',
        'only_matching': True,
    }, {
        'url': 'svt:14278044',
        'only_matching': True,
    }, {
        'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/',
        'only_matching': True,
    }, {
        'url': 'svt:eWv5MLX',
        'only_matching': True,
    }]

    def _extract_by_video_id(self, video_id, webpage=None):
        """Fetch metadata for `video_id` from the videoplayer API and build the info dict.

        When the API metadata has no title, fall back in order to the episode
        or series name, the page's og:title with everything from the first
        " | " onwards stripped (only if `webpage` is given), and finally
        `video_id` itself.
        """
        data = self._download_json(
            f'https://api.svt.se/videoplayer-api/video/{video_id}',
            video_id, headers=self.geo_verification_headers())
        info_dict = self._extract_video(data, video_id)
        if not info_dict.get('title'):
            title = dict_get(info_dict, ('episode', 'series'))
            if not title and webpage:
                og_title = self._og_search_title(webpage)
                # re.sub() raises TypeError on a non-string, so only strip the
                # suffix when the og:title lookup actually produced a value
                if og_title:
                    title = re.sub(r'\s*\|\s*.+?$', '', og_title)
            if not title:
                title = video_id
            info_dict['title'] = title
        return info_dict

    def _real_extract(self, url):
        """Resolve `url` to an SVT video ID and return the info dict for it.

        IDs present in the URL itself (`svt:` / barnplay ID or the
        modalId/id query value) are used directly without downloading the page.
        Otherwise the page is downloaded and the ID is looked for in the
        embedded svtplay data, then the Next.js data, then the HTML markup.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        svt_id = mobj.group('svt_id') or mobj.group('modal_id')

        if svt_id:
            return self._extract_by_video_id(svt_id)

        webpage = self._download_webpage(url, video_id)

        # Non-fatal: the embedded blob is only one of several possible sources
        data = self._parse_json(
            self._search_regex(
                self._SVTPLAY_RE, webpage, 'embedded data', default='{}',
                group='json'),
            video_id, fatal=False)

        thumbnail = self._og_search_thumbnail(webpage)

        if data:
            video_info = try_get(
                data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
                dict)
            if video_info:
                # Full video metadata is embedded in the page; no API call needed
                info_dict = self._extract_video(video_info, video_id)
                info_dict.update({
                    'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
                    'thumbnail': thumbnail,
                })
                return info_dict

            svt_id = try_get(
                data, lambda x: x['statistics']['dataLake']['content']['id'],
                str)

        if not svt_id:
            nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False)
            # Each urqlState entry stores its payload as a JSON string under 'data'
            svt_id = traverse_obj(nextjs_data, (
                'props', 'urqlState', ..., 'data', {json.loads}, 'detailsPageByPath',
                'video', 'svtId', {str}), get_all=False)

        if not svt_id:
            # Last resort: scrape the ID from the markup
            svt_id = self._search_regex(
                (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
                 r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/[\w-]+/[^"\']*\b(?:modalId|id)=([\w-]+)'),
                webpage, 'video id')

        info_dict = self._extract_by_video_id(svt_id, webpage)
        info_dict['thumbnail'] = thumbnail

        return info_dict
class SVTSeriesIE(SVTPlayBaseIE):
    """Playlist extractor for svtplay.se series pages, optionally limited by the `tab` query value."""

    _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?'
    _TESTS = [{
        'url': 'https://www.svtplay.se/rederiet',
        'info_dict': {
            'id': '14445680',
            'title': 'Rederiet',
            'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
        },
        'playlist_mincount': 318,
    }, {
        'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680',
        'info_dict': {
            'id': 'season-2-14445680',
            'title': 'Rederiet - Säsong 2',
            'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
        },
        'playlist_mincount': 12,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that SVTIE or SVTPlayIE accept; otherwise apply the normal check."""
        if SVTIE.suitable(url) or SVTPlayIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Query the contento GraphQL API for the series and return its episodes as a playlist."""
        series_slug, season_id = self._match_valid_url(url).groups()

        graphql_query = '''{
  listablesBySlug(slugs: ["%s"]) {
    associatedContent(include: [productionPeriod, season]) {
      items {
        item {
          ... on Episode {
            videoSvtId
          }
        }
      }
      id
      name
    }
    id
    longDescription
    name
    shortDescription
  }
}''' % series_slug  # noqa: UP031
        response = self._download_json(
            'https://api.svt.se/contento/graphql', series_slug,
            'Downloading series page', query={'query': graphql_query})
        series = response['data']['listablesBySlug'][0]

        matched_season_name = None
        entries = []
        for season in series['associatedContent']:
            if not isinstance(season, dict):
                continue
            if season_id:
                # With a season requested, only the group with that id contributes
                if season.get('id') != season_id:
                    continue
                matched_season_name = season.get('name')
            episodes = season.get('items')
            if not isinstance(episodes, list):
                continue
            for episode in episodes:
                svt_id = (episode.get('item') or {}).get('videoSvtId')
                if svt_id and isinstance(svt_id, str):
                    entries.append(self.url_result(
                        f'svt:{svt_id}', SVTPlayIE.ie_key(), svt_id))

        # Title is "<series> - <season>" when both are known; with a season
        # requested but no series name, the season id alone is used
        playlist_title = series.get('name')
        season_label = matched_season_name or season_id
        if playlist_title and season_label:
            playlist_title = f'{playlist_title} - {season_label}'
        elif season_id:
            playlist_title = season_id

        description = dict_get(series, ('longDescription', 'shortDescription'))
        return self.playlist_result(
            entries, season_id or series.get('id'), playlist_title, description)
class SVTPageIE(SVTBaseIE):
    """Playlist extractor for svt.se article pages that embed one or more videos."""

    _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/?#]+/)*(?P<id>[^/?&#]+)'
    _TESTS = [{
        'url': 'https://www.svt.se/nyheter/lokalt/skane/viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
        'info_dict': {
            'title': 'Viktor, 18, förlorade armar och ben i sepsis – vill återuppta karaten och bli svetsare',
            'id': 'viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
        },
        'playlist_count': 2,
    }, {
        'url': 'https://www.svt.se/nyheter/lokalt/skane/forsvarsmakten-om-trafikkaoset-pa-e22-kunde-inte-varit-dar-snabbare',
        'info_dict': {
            'id': 'jXvk42E',
            'title': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
            'ext': 'mp4',
            'duration': 80,
            'age_limit': 0,
            'timestamp': 1704370009,
            'episode': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
            'series': 'Lokala Nyheter Skåne',
            'upload_date': '20240104',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.svt.se/nyheter/svtforum/2023-tungt-ar-for-svensk-media',
        'info_dict': {
            'title': '2023 tungt år för svensk media',
            'id': 'ewqAZv4',
            'ext': 'mp4',
            'duration': 3074,
            'age_limit': 0,
            'series': '',
            'timestamp': 1702980479,
            'upload_date': '20231219',
            'episode': 'Mediestudier',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa',
        'info_dict': {
            'id': '25298267',
            'title': 'Bakom masken – Lehners kamp mot mental ohälsa',
        },
        'playlist_count': 4,
        'skip': 'Video is gone',
    }, {
        'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien',
        'info_dict': {
            'id': '24243746',
            'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien',
        },
        'playlist_count': 2,
        'skip': 'Video is gone',
    }, {
        # only programTitle
        'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
        'info_dict': {
            'id': '8439V2K',
            'ext': 'mp4',
            'title': 'Stjärnorna skojar till det - under SVT-intervjun',
            'duration': 27,
            'age_limit': 0,
        },
        'skip': 'Video is gone',
    }, {
        'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
        'only_matching': True,
    }, {
        'url': 'https://www.svt.se/vader/manadskronikor/maj2018',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that SVTIE or SVTPlayIE accept; otherwise apply the normal check."""
        return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist of every video referenced by the page's urqlState data.

        Video IDs are read from the page's top media and from videos in its
        body; each entry is fetched from the SVT video API and given the
        page's og:title as its title.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)
        title = self._og_search_title(webpage)

        urql_state = self._search_json(
            r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id)

        # Each urqlState entry stores its payload as a JSON string under
        # 'data'; the first one that decodes is used
        data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {}

        video_ids = traverse_obj(data, (
            'page', (('topMedia', 'svtId'), ('body', ..., 'video', 'svtId')), {str},
        ))

        def entries():
            # dict.fromkeys() drops duplicate IDs while keeping the order in
            # which they appear on the page; a set would yield the entries in
            # an order that can differ from run to run
            for video_id in dict.fromkeys(video_ids):
                info = self._extract_video(
                    self._download_json(f'https://api.svt.se/video/{video_id}', video_id), video_id)
                info['title'] = title
                yield info

        return self.playlist_result(entries(), display_id, title)