yt_dlp/extractor/svt.py

   1 import json
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     determine_ext,
   7     dict_get,
   8     int_or_none,
   9     traverse_obj,
  10     try_get,
  11     unified_timestamp,
  12 )
  13
  14
  15 class SVTBaseIE(InfoExtractor):
  16     _GEO_COUNTRIES = ['SE']
  17
  18     def _extract_video(self, video_info, video_id):
  19         is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
  20         m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
  21         formats = []
  22         subtitles = {}
  23         for vr in video_info['videoReferences']:
  24             player_type = vr.get('playerType') or vr.get('format')
  25             vurl = vr['url']
  26             ext = determine_ext(vurl)
  27             if ext == 'm3u8':
  28                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
  29                     vurl, video_id,
  30                     ext='mp4', entry_protocol=m3u8_protocol,
  31                     m3u8_id=player_type, fatal=False)
  32                 formats.extend(fmts)
  33                 self._merge_subtitles(subs, target=subtitles)
  34             elif ext == 'f4m':
  35                 formats.extend(self._extract_f4m_formats(
  36                     vurl + '?hdcore=3.3.0', video_id,
  37                     f4m_id=player_type, fatal=False))
  38             elif ext == 'mpd':
  39                 fmts, subs = self._extract_mpd_formats_and_subtitles(
  40                     vurl, video_id, mpd_id=player_type, fatal=False)
  41                 formats.extend(fmts)
  42                 self._merge_subtitles(subs, target=subtitles)
  43             else:
  44                 formats.append({
  45                     'format_id': player_type,
  46                     'url': vurl,
  47                 })
  48         rights = try_get(video_info, lambda x: x['rights'], dict) or {}
  49         if not formats and rights.get('geoBlockedSweden'):
  50             self.raise_geo_restricted(
  51                 'This video is only available in Sweden',
  52                 countries=self._GEO_COUNTRIES, metadata_available=True)
  53
  54         subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
  55         if isinstance(subtitle_references, list):
  56             for sr in subtitle_references:
  57                 subtitle_url = sr.get('url')
  58                 subtitle_lang = sr.get('language', 'sv')
  59                 if subtitle_url:
  60                     sub = {
  61                         'url': subtitle_url,
  62                     }
  63                     if determine_ext(subtitle_url) == 'm3u8':
  64                         # XXX: no way of testing, is it ever hit?
  65                         sub['ext'] = 'vtt'
  66                     subtitles.setdefault(subtitle_lang, []).append(sub)
  67
  68         title = video_info.get('title')
  69
  70         series = video_info.get('programTitle')
  71         season_number = int_or_none(video_info.get('season'))
  72         episode = video_info.get('episodeTitle')
  73         episode_number = int_or_none(video_info.get('episodeNumber'))
  74
  75         timestamp = unified_timestamp(rights.get('validFrom'))
  76         duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
  77         age_limit = None
  78         adult = dict_get(
  79             video_info, ('inappropriateForChildren', 'blockedForChildren'),
  80             skip_false_values=False)
  81         if adult is not None:
  82             age_limit = 18 if adult else 0
  83
  84         return {
  85             'id': video_id,
  86             'title': title,
  87             'formats': formats,
  88             'subtitles': subtitles,
  89             'duration': duration,
  90             'timestamp': timestamp,
  91             'age_limit': age_limit,
  92             'series': series,
  93             'season_number': season_number,
  94             'episode': episode,
  95             'episode_number': episode_number,
  96             'is_live': is_live,
  97         }
  98
  99
 100 class SVTIE(SVTBaseIE):
 101     _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
 102     _EMBED_REGEX = [rf'(?:<iframe src|href)="(?P<url>{_VALID_URL}[^"]*)"']
 103     _TEST = {
 104         'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
 105         'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
 106         'info_dict': {
 107             'id': '2900353',
 108             'ext': 'mp4',
 109             'title': 'Stjärnorna skojar till det - under SVT-intervjun',
 110             'duration': 27,
 111             'age_limit': 0,
 112         },
 113     }
 114
 115     def _real_extract(self, url):
 116         mobj = self._match_valid_url(url)
 117         widget_id = mobj.group('widget_id')
 118         article_id = mobj.group('id')
 119
 120         info = self._download_json(
 121             f'http://www.svt.se/wd?widgetId={widget_id}&articleId={article_id}&format=json&type=embed&output=json',
 122             article_id)
 123
 124         info_dict = self._extract_video(info['video'], article_id)
 125         info_dict['title'] = info['context']['title']
 126         return info_dict
 127
 128
class SVTPlayBaseIE(SVTBaseIE):
    # Matches an assignment of the form `root["svtplay"] = {...};` followed by a
    # newline (the key may carry leading underscores and use either quote style).
    # The JSON object literal is captured in the named group `json`.
    _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n'
 131
 132
 133 class SVTPlayIE(SVTPlayBaseIE):
 134     IE_DESC = 'SVT Play and Öppet arkiv'
 135     _VALID_URL = r'''(?x)
 136                     (?:
 137                         (?:
 138                             svt:|
 139                             https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/
 140                         )
 141                         (?P<svt_id>[^/?#&]+)|
 142                         https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
 143                         (?:.*?(?:modalId|id)=(?P<modal_id>[\da-zA-Z-]+))?
 144                     )
 145                     '''
 146     _TESTS = [{
 147         'url': 'https://www.svtplay.se/video/30479064',
 148         'md5': '2382036fd6f8c994856c323fe51c426e',
 149         'info_dict': {
 150             'id': '8zVbDPA',
 151             'ext': 'mp4',
 152             'title': 'Designdrömmar i Stenungsund',
 153             'timestamp': 1615770000,
 154             'upload_date': '20210315',
 155             'duration': 3519,
 156             'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
 157             'age_limit': 0,
 158             'subtitles': {
 159                 'sv': [{
 160                     'ext': 'vtt',
 161                 }],
 162             },
 163         },
 164         'params': {
 165             'skip_download': 'm3u8',
 166         },
 167         'skip': 'Episode is no longer available',
 168     }, {
 169         'url': 'https://www.svtplay.se/video/emBxBQj',
 170         'md5': '2382036fd6f8c994856c323fe51c426e',
 171         'info_dict': {
 172             'id': 'eyBd9aj',
 173             'ext': 'mp4',
 174             'title': '1. Farlig kryssning',
 175             'timestamp': 1491019200,
 176             'upload_date': '20170401',
 177             'duration': 2566,
 178             'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
 179             'age_limit': 0,
 180             'episode': '1. Farlig kryssning',
 181             'series': 'Rederiet',
 182             'subtitles': {
 183                 'sv': 'count:3',
 184             },
 185         },
 186         'params': {
 187             'skip_download': 'm3u8',
 188         },
 189     }, {
 190         'url': 'https://www.svtplay.se/video/jz2rYz7/anders-hansen-moter/james-fallon?info=visa',
 191         'info_dict': {
 192             'id': 'jvXAGVb',
 193             'ext': 'mp4',
 194             'title': 'James Fallon',
 195             'timestamp': 1673917200,
 196             'upload_date': '20230117',
 197             'duration': 1081,
 198             'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
 199             'age_limit': 0,
 200             'episode': 'James Fallon',
 201             'series': 'Anders Hansen möter...',
 202         },
 203         'params': {
 204             'skip_download': 'dash',
 205         },
 206     }, {
 207         'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA',
 208         'only_matching': True,
 209     }, {
 210         'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa',
 211         'only_matching': True,
 212     }, {
 213         # geo restricted to Sweden
 214         'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
 215         'only_matching': True,
 216     }, {
 217         'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
 218         'only_matching': True,
 219     }, {
 220         'url': 'https://www.svtplay.se/kanaler/svt1',
 221         'only_matching': True,
 222     }, {
 223         'url': 'svt:1376446-003A',
 224         'only_matching': True,
 225     }, {
 226         'url': 'svt:14278044',
 227         'only_matching': True,
 228     }, {
 229         'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/',
 230         'only_matching': True,
 231     }, {
 232         'url': 'svt:eWv5MLX',
 233         'only_matching': True,
 234     }]
 235
 236     def _extract_by_video_id(self, video_id, webpage=None):
 237         data = self._download_json(
 238             f'https://api.svt.se/videoplayer-api/video/{video_id}',
 239             video_id, headers=self.geo_verification_headers())
 240         info_dict = self._extract_video(data, video_id)
 241         if not info_dict.get('title'):
 242             title = dict_get(info_dict, ('episode', 'series'))
 243             if not title and webpage:
 244                 title = re.sub(
 245                     r'\s*\|\s*.+?$', '', self._og_search_title(webpage))
 246             if not title:
 247                 title = video_id
 248             info_dict['title'] = title
 249         return info_dict
 250
 251     def _real_extract(self, url):
 252         mobj = self._match_valid_url(url)
 253         video_id = mobj.group('id')
 254         svt_id = mobj.group('svt_id') or mobj.group('modal_id')
 255
 256         if svt_id:
 257             return self._extract_by_video_id(svt_id)
 258
 259         webpage = self._download_webpage(url, video_id)
 260
 261         data = self._parse_json(
 262             self._search_regex(
 263                 self._SVTPLAY_RE, webpage, 'embedded data', default='{}',
 264                 group='json'),
 265             video_id, fatal=False)
 266
 267         thumbnail = self._og_search_thumbnail(webpage)
 268
 269         if data:
 270             video_info = try_get(
 271                 data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
 272                 dict)
 273             if video_info:
 274                 info_dict = self._extract_video(video_info, video_id)
 275                 info_dict.update({
 276                     'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
 277                     'thumbnail': thumbnail,
 278                 })
 279                 return info_dict
 280
 281             svt_id = try_get(
 282                 data, lambda x: x['statistics']['dataLake']['content']['id'],
 283                 str)
 284
 285         if not svt_id:
 286             nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False)
 287             svt_id = traverse_obj(nextjs_data, (
 288                 'props', 'urqlState', ..., 'data', {json.loads}, 'detailsPageByPath',
 289                 'video', 'svtId', {str}), get_all=False)
 290
 291         if not svt_id:
 292             svt_id = self._search_regex(
 293                 (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
 294                  r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/[\w-]+/[^"\']*\b(?:modalId|id)=([\w-]+)'),
 295                 webpage, 'video id')
 296
 297         info_dict = self._extract_by_video_id(svt_id, webpage)
 298         info_dict['thumbnail'] = thumbnail
 299
 300         return info_dict
 301
 302
 303 class SVTSeriesIE(SVTPlayBaseIE):
 304     _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?'
 305     _TESTS = [{
 306         'url': 'https://www.svtplay.se/rederiet',
 307         'info_dict': {
 308             'id': '14445680',
 309             'title': 'Rederiet',
 310             'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
 311         },
 312         'playlist_mincount': 318,
 313     }, {
 314         'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680',
 315         'info_dict': {
 316             'id': 'season-2-14445680',
 317             'title': 'Rederiet - Säsong 2',
 318             'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
 319         },
 320         'playlist_mincount': 12,
 321     }]
 322
 323     @classmethod
 324     def suitable(cls, url):
 325         return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super().suitable(url)
 326
 327     def _real_extract(self, url):
 328         series_slug, season_id = self._match_valid_url(url).groups()
 329
 330         series = self._download_json(
 331             'https://api.svt.se/contento/graphql', series_slug,
 332             'Downloading series page', query={
 333                 'query': '''{
 334   listablesBySlug(slugs: ["%s"]) {
 335     associatedContent(include: [productionPeriod, season]) {
 336       items {
 337         item {
 338           ... on Episode {
 339             videoSvtId
 340           }
 341         }
 342       }
 343       id
 344       name
 345     }
 346     id
 347     longDescription
 348     name
 349     shortDescription
 350   }
 351 }''' % series_slug,  # noqa: UP031
 352             })['data']['listablesBySlug'][0]
 353
 354         season_name = None
 355
 356         entries = []
 357         for season in series['associatedContent']:
 358             if not isinstance(season, dict):
 359                 continue
 360             if season_id:
 361                 if season.get('id') != season_id:
 362                     continue
 363                 season_name = season.get('name')
 364             items = season.get('items')
 365             if not isinstance(items, list):
 366                 continue
 367             for item in items:
 368                 video = item.get('item') or {}
 369                 content_id = video.get('videoSvtId')
 370                 if not content_id or not isinstance(content_id, str):
 371                     continue
 372                 entries.append(self.url_result(
 373                     'svt:' + content_id, SVTPlayIE.ie_key(), content_id))
 374
 375         title = series.get('name')
 376         season_name = season_name or season_id
 377
 378         if title and season_name:
 379             title = f'{title} - {season_name}'
 380         elif season_id:
 381             title = season_id
 382
 383         return self.playlist_result(
 384             entries, season_id or series.get('id'), title,
 385             dict_get(series, ('longDescription', 'shortDescription')))
 386
 387
 388 class SVTPageIE(SVTBaseIE):
 389     _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/?#]+/)*(?P<id>[^/?&#]+)'
 390     _TESTS = [{
 391         'url': 'https://www.svt.se/nyheter/lokalt/skane/viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
 392         'info_dict': {
 393             'title': 'Viktor, 18, förlorade armar och ben i sepsis – vill återuppta karaten och bli svetsare',
 394             'id': 'viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
 395         },
 396         'playlist_count': 2,
 397     }, {
 398         'url': 'https://www.svt.se/nyheter/lokalt/skane/forsvarsmakten-om-trafikkaoset-pa-e22-kunde-inte-varit-dar-snabbare',
 399         'info_dict': {
 400             'id': 'jXvk42E',
 401             'title': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
 402             'ext': 'mp4',
 403             'duration': 80,
 404             'age_limit': 0,
 405             'timestamp': 1704370009,
 406             'episode': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
 407             'series': 'Lokala Nyheter Skåne',
 408             'upload_date': '20240104',
 409         },
 410         'params': {
 411             'skip_download': True,
 412         },
 413     }, {
 414         'url': 'https://www.svt.se/nyheter/svtforum/2023-tungt-ar-for-svensk-media',
 415         'info_dict': {
 416             'title': '2023 tungt år för svensk media',
 417             'id': 'ewqAZv4',
 418             'ext': 'mp4',
 419             'duration': 3074,
 420             'age_limit': 0,
 421             'series': '',
 422             'timestamp': 1702980479,
 423             'upload_date': '20231219',
 424             'episode': 'Mediestudier',
 425         },
 426         'params': {
 427             'skip_download': True,
 428         },
 429     }, {
 430         'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa',
 431         'info_dict': {
 432             'id': '25298267',
 433             'title': 'Bakom masken – Lehners kamp mot mental ohälsa',
 434         },
 435         'playlist_count': 4,
 436         'skip': 'Video is gone',
 437     }, {
 438         'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien',
 439         'info_dict': {
 440             'id': '24243746',
 441             'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien',
 442         },
 443         'playlist_count': 2,
 444         'skip': 'Video is gone',
 445     }, {
 446         # only programTitle
 447         'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
 448         'info_dict': {
 449             'id': '8439V2K',
 450             'ext': 'mp4',
 451             'title': 'Stjärnorna skojar till det - under SVT-intervjun',
 452             'duration': 27,
 453             'age_limit': 0,
 454         },
 455         'skip': 'Video is gone',
 456     }, {
 457         'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
 458         'only_matching': True,
 459     }, {
 460         'url': 'https://www.svt.se/vader/manadskronikor/maj2018',
 461         'only_matching': True,
 462     }]
 463
 464     @classmethod
 465     def suitable(cls, url):
 466         return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super().suitable(url)
 467
 468     def _real_extract(self, url):
 469         display_id = self._match_id(url)
 470
 471         webpage = self._download_webpage(url, display_id)
 472         title = self._og_search_title(webpage)
 473
 474         urql_state = self._search_json(
 475             r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id)
 476
 477         data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {}
 478
 479         def entries():
 480             for video_id in set(traverse_obj(data, (
 481                 'page', (('topMedia', 'svtId'), ('body', ..., 'video', 'svtId')), {str},
 482             ))):
 483                 info = self._extract_video(
 484                     self._download_json(f'https://api.svt.se/video/{video_id}', video_id), video_id)
 485                 info['title'] = title
 486                 yield info
 487
 488         return self.playlist_result(entries(), display_id, title)