yt_dlp/extractor/slideslive.py

   1 import re
   2 import urllib.parse
   3 import xml.etree.ElementTree
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     int_or_none,
   9     parse_qs,
  10     smuggle_url,
  11     traverse_obj,
  12     unified_timestamp,
  13     update_url_query,
  14     url_or_none,
  15     xpath_text,
  16 )
  17
  18
class SlidesLiveIE(InfoExtractor):
    """Extractor for slideslive.com presentations, including their slide thumbnails and chapters."""

    # Accepts slideslive.com URLs with an optional `embed/` or `embed/presentation/`
    # prefix; the numeric presentation ID is captured as the `id` group.
    _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P<id>[0-9]+)'
    # Extraction test cases. The comment at the top of each entry notes which
    # service_name and/or slide format version (or other special case) it covers.
    _TESTS = [{
        # service_name = yoda, only XML slides info
        'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
        'info_dict': {
            'id': '38902413',
            'ext': 'mp4',
            'title': 'GCC IA16 backend',
            'timestamp': 1697793372,
            'upload_date': '20231020',
            'thumbnail': r're:^https?://.*\.jpg',
            'thumbnails': 'count:42',
            'chapters': 'count:41',
            'duration': 1638,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # service_name = yoda, /v7/ slides
        'url': 'https://slideslive.com/38935785',
        'info_dict': {
            'id': '38935785',
            'ext': 'mp4',
            'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
            'upload_date': '20231020',
            'timestamp': 1697807002,
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:640',
            'chapters': 'count:639',
            'duration': 9832,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # service_name = yoda, /v1/ slides
        'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics',
        'info_dict': {
            'id': '38973182',
            'ext': 'mp4',
            'title': 'How Should a Machine Learning Researcher Think About AI Ethics?',
            'upload_date': '20231020',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1697822521,
            'thumbnails': 'count:3',
            'chapters': 'count:2',
            'duration': 5889,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # formerly youtube, converted to native
        'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost',
        'md5': '8a79b5e3d700837f40bd2afca3c8fa01',
        'info_dict': {
            'id': '38897546',
            'ext': 'mp4',
            'title': 'SPECIÁL: Meta-přednáška Petra Ludwiga - Hodnoty pro lepší společnost',
            'thumbnail': r're:^https?://.*\.jpg',
            'upload_date': '20231029',
            'timestamp': 1698588144,
            'thumbnails': 'count:169',
            'chapters': 'count:168',
            'duration': 6827,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # embed-only presentation, only XML slides info
        'url': 'https://slideslive.com/embed/presentation/38925850',
        'info_dict': {
            'id': '38925850',
            'ext': 'mp4',
            'title': 'Towards a Deep Network Architecture for Structured Smoothness',
            'thumbnail': r're:^https?://.*\.jpg',
            'thumbnails': 'count:8',
            'timestamp': 1697803109,
            'upload_date': '20231020',
            'chapters': 'count:7',
            'duration': 326,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # embed-only presentation, only JSON slides info, /v5/ slides (.png)
        'url': 'https://slideslive.com/38979920/',
        'info_dict': {
            'id': '38979920',
            'ext': 'mp4',
            'title': 'MoReL: Multi-omics Relational Learning',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:7',
            'timestamp': 1697824939,
            'upload_date': '20231020',
            'chapters': 'count:6',
            'duration': 171,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v2/ slides (.jpg)
        'url': 'https://slideslive.com/38954074',
        'info_dict': {
            'id': '38954074',
            'ext': 'mp4',
            'title': 'Decentralized Attribution of Generative Models',
            'thumbnail': r're:^https?://.*\.jpg',
            'thumbnails': 'count:16',
            'timestamp': 1697814901,
            'upload_date': '20231020',
            'chapters': 'count:15',
            'duration': 306,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v4/ slides (.png)
        'url': 'https://slideslive.com/38979570/',
        'info_dict': {
            'id': '38979570',
            'ext': 'mp4',
            'title': 'Efficient Active Search for Combinatorial Optimization Problems',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:9',
            'timestamp': 1697824757,
            'upload_date': '20231020',
            'chapters': 'count:8',
            'duration': 295,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v10/ slides
        'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F',
        'info_dict': {
            'id': '38979880',
            'ext': 'mp4',
            'title': 'The Representation Power of Neural Networks',
            'timestamp': 1697824919,
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:22',
            'upload_date': '20231020',
            'chapters': 'count:21',
            'duration': 294,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v7/ slides, 2 video slides
        'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com',
        'playlist_count': 3,
        'info_dict': {
            'id': '38979682-playlist',
            'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
        },
        'playlist': [{
            'info_dict': {
                'id': '38979682',
                'ext': 'mp4',
                'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
                'timestamp': 1697824815,
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'thumbnails': 'count:30',
                'upload_date': '20231020',
                'chapters': 'count:31',
                'duration': 272,
            },
        }, {
            'info_dict': {
                'id': '38979682-021',
                'ext': 'mp4',
                'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021',
                'duration': 3,
                'timestamp': 1697824815,
                'upload_date': '20231020',
            },
        }, {
            'info_dict': {
                'id': '38979682-024',
                'ext': 'mp4',
                'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024',
                'duration': 4,
                'timestamp': 1697824815,
                'upload_date': '20231020',
            },
        }],
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v6/ slides, 1 video slide, edit.videoken.com embed
        'url': 'https://slideslive.com/38979481/',
        'playlist_count': 2,
        'info_dict': {
            'id': '38979481-playlist',
            'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
        },
        'playlist': [{
            'info_dict': {
                'id': '38979481',
                'ext': 'mp4',
                'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
                'timestamp': 1697824716,
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'thumbnails': 'count:43',
                'upload_date': '20231020',
                'chapters': 'count:43',
                'duration': 315,
            },
        }, {
            'info_dict': {
                'id': '38979481-013',
                'ext': 'mp4',
                'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013',
                'duration': 3,
                'timestamp': 1697824716,
                'upload_date': '20231020',
            },
        }],
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v3/ slides, .jpg and .png, service_name = youtube
        'url': 'https://slideslive.com/embed/38932460/',
        'info_dict': {
            'id': 'RTPdrgkyTiE',
            'display_id': '38932460',
            'ext': 'mp4',
            'title': 'Active Learning for Hierarchical Multi-Label Classification',
            'description': 'Watch full version of this video at https://slideslive.com/38932460.',
            'channel': 'SlidesLive Videos - A',
            'channel_id': 'UC62SdArr41t_-_fX40QCLRw',
            'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
            'uploader': 'SlidesLive Videos - A',
            'uploader_id': '@slideslivevideos-a6075',
            'uploader_url': 'https://www.youtube.com/@slideslivevideos-a6075',
            'upload_date': '20200903',
            'timestamp': 1697805922,
            'duration': 942,
            'age_limit': 0,
            'live_status': 'not_live',
            'playable_in_embed': True,
            'availability': 'unlisted',
            'categories': ['People & Blogs'],
            'tags': [],
            'channel_follower_count': int,
            'like_count': int,
            'view_count': int,
            'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)',
            'thumbnails': 'count:21',
            'chapters': 'count:20',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # /v3/ slides, .png only, service_name = yoda
        'url': 'https://slideslive.com/38983994',
        'info_dict': {
            'id': '38983994',
            'ext': 'mp4',
            'title': 'Zero-Shot AutoML with Pretrained Models',
            'timestamp': 1697826708,
            'upload_date': '20231020',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:23',
            'chapters': 'count:22',
            'duration': 295,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # service_name = yoda
        'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
        'only_matching': True,
    }, {
        # dead link, service_name = url
        'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
        'only_matching': True,
    }, {
        # dead link, service_name = vimeo
        'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
        'only_matching': True,
    }]
 314
    # Test cases whose URL is a page on another site; the expected result is the
    # SlidesLive presentation embedded in that page.
    # NOTE(review): presumably resolved through `_extract_embed_urls` - confirm
    # against the test harness.
    _WEBPAGE_TESTS = [{
        # only XML slides info
        'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html',
        'info_dict': {
            'id': '38925850',
            'ext': 'mp4',
            'title': 'Towards a Deep Network Architecture for Structured Smoothness',
            'thumbnail': r're:^https?://.*\.jpg',
            'thumbnails': 'count:8',
            'timestamp': 1697803109,
            'upload_date': '20231020',
            'chapters': 'count:7',
            'duration': 326,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }]
 333
 334     @classmethod
 335     def _extract_embed_urls(cls, url, webpage):
 336         # Reference: https://slideslive.com/embed_presentation.js
 337         for embed_id in re.findall(r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage):
 338             url_parsed = urllib.parse.urlparse(url)
 339             origin = f'{url_parsed.scheme}://{url_parsed.netloc}'
 340             yield update_url_query(
 341                 f'https://slideslive.com/embed/presentation/{embed_id}', {
 342                     'embed_parent_url': url,
 343                     'embed_container_origin': origin,
 344                 })
 345
 346     def _download_embed_webpage_handle(self, video_id, headers):
 347         return self._download_webpage_handle(
 348             f'https://slideslive.com/embed/presentation/{video_id}', video_id,
 349             headers=headers, query=traverse_obj(headers, {
 350                 'embed_parent_url': 'Referer',
 351                 'embed_container_origin': 'Origin',
 352             }))
 353
 354     def _extract_custom_m3u8_info(self, m3u8_data):
 355         m3u8_dict = {}
 356
 357         lookup = {
 358             'PRESENTATION-TITLE': 'title',
 359             'PRESENTATION-UPDATED-AT': 'timestamp',
 360             'PRESENTATION-THUMBNAIL': 'thumbnail',
 361             'PLAYLIST-TYPE': 'playlist_type',
 362             'VOD-VIDEO-SERVICE-NAME': 'service_name',
 363             'VOD-VIDEO-ID': 'service_id',
 364             'VOD-VIDEO-SERVERS': 'video_servers',
 365             'VOD-SUBTITLES': 'subtitles',
 366             'VOD-SLIDES-JSON-URL': 'slides_json_url',
 367             'VOD-SLIDES-XML-URL': 'slides_xml_url',
 368         }
 369
 370         for line in m3u8_data.splitlines():
 371             if not line.startswith('#EXT-SL-'):
 372                 continue
 373             tag, _, value = line.partition(':')
 374             key = lookup.get(tag[8:])
 375             if not key:
 376                 continue
 377             m3u8_dict[key] = value
 378
 379         # Some values are stringified JSON arrays
 380         for key in ('video_servers', 'subtitles'):
 381             if key in m3u8_dict:
 382                 m3u8_dict[key] = self._parse_json(m3u8_dict[key], None, fatal=False) or []
 383
 384         return m3u8_dict
 385
 386     def _extract_formats_and_duration(self, cdn_hostname, path, video_id, skip_duration=False):
 387         formats, duration = [], None
 388
 389         hls_formats = self._extract_m3u8_formats(
 390             f'https://{cdn_hostname}/{path}/master.m3u8',
 391             video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)
 392         if hls_formats:
 393             if not skip_duration:
 394                 duration = self._extract_m3u8_vod_duration(
 395                     hls_formats[0]['url'], video_id, note='Extracting duration from HLS manifest')
 396             formats.extend(hls_formats)
 397
 398         dash_formats = self._extract_mpd_formats(
 399             f'https://{cdn_hostname}/{path}/master.mpd', video_id, mpd_id='dash', fatal=False)
 400         if dash_formats:
 401             if not duration and not skip_duration:
 402                 duration = self._extract_mpd_vod_duration(
 403                     f'https://{cdn_hostname}/{path}/master.mpd', video_id,
 404                     note='Extracting duration from DASH manifest')
 405             formats.extend(dash_formats)
 406
 407         return formats, duration
 408
 409     def _real_extract(self, url):
 410         video_id = self._match_id(url)
 411         webpage, urlh = self._download_embed_webpage_handle(
 412             video_id, headers=traverse_obj(parse_qs(url), {
 413                 'Referer': ('embed_parent_url', -1),
 414                 'Origin': ('embed_container_origin', -1)}))
 415         redirect_url = urlh.url
 416         if 'domain_not_allowed' in redirect_url:
 417             domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False)
 418             if not domain:
 419                 raise ExtractorError(
 420                     'This is an embed-only presentation. Try passing --referer', expected=True)
 421             webpage, _ = self._download_embed_webpage_handle(video_id, headers={
 422                 'Referer': f'https://{domain}/',
 423                 'Origin': f'https://{domain}',
 424             })
 425
 426         player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token')
 427         player_data = self._download_webpage(
 428             f'https://ben.slideslive.com/player/{video_id}', video_id,
 429             note='Downloading player info', query={'player_token': player_token})
 430         player_info = self._extract_custom_m3u8_info(player_data)
 431
 432         service_name = player_info['service_name'].lower()
 433         assert service_name in ('url', 'yoda', 'vimeo', 'youtube')
 434         service_id = player_info['service_id']
 435
 436         slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s%s'
 437         slides, slides_info = {}, []
 438
 439         if player_info.get('slides_json_url'):
 440             slides = self._download_json(
 441                 player_info['slides_json_url'], video_id, fatal=False,
 442                 note='Downloading slides JSON', errnote=False) or {}
 443             slide_ext_default = '.png'
 444             slide_quality = traverse_obj(slides, ('slide_qualities', 0))
 445             if slide_quality:
 446                 slide_ext_default = '.jpg'
 447                 slide_url_template = f'https://cdn.slideslive.com/data/presentations/%s/slides/{slide_quality}/%s%s'
 448             for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...), expected_type=dict), 1):
 449                 slides_info.append((
 450                     slide_id, traverse_obj(slide, ('image', 'name')),
 451                     traverse_obj(slide, ('image', 'extname'), default=slide_ext_default),
 452                     int_or_none(slide.get('time'), scale=1000)))
 453
 454         if not slides and player_info.get('slides_xml_url'):
 455             slides = self._download_xml(
 456                 player_info['slides_xml_url'], video_id, fatal=False,
 457                 note='Downloading slides XML', errnote='Failed to download slides info')
 458             if isinstance(slides, xml.etree.ElementTree.Element):
 459                 slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s'
 460                 for slide_id, slide in enumerate(slides.findall('./slide')):
 461                     slides_info.append((
 462                         slide_id, xpath_text(slide, './slideName', 'name'), '.jpg',
 463                         int_or_none(xpath_text(slide, './timeSec', 'time'))))
 464
 465         chapters, thumbnails = [], []
 466         if url_or_none(player_info.get('thumbnail')):
 467             thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']})
 468         for slide_id, slide_path, slide_ext, start_time in slides_info:
 469             if slide_path:
 470                 thumbnails.append({
 471                     'id': f'{slide_id:03d}',
 472                     'url': slide_url_template % (video_id, slide_path, slide_ext),
 473                 })
 474             chapters.append({
 475                 'title': f'Slide {slide_id:03d}',
 476                 'start_time': start_time,
 477             })
 478
 479         subtitles = {}
 480         for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict):
 481             webvtt_url = url_or_none(sub.get('webvtt_url'))
 482             if not webvtt_url:
 483                 continue
 484             subtitles.setdefault(sub.get('language') or 'en', []).append({
 485                 'url': webvtt_url,
 486                 'ext': 'vtt',
 487             })
 488
 489         info = {
 490             'id': video_id,
 491             'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''),
 492             'timestamp': unified_timestamp(player_info.get('timestamp')),
 493             'is_live': player_info.get('playlist_type') != 'vod',
 494             'thumbnails': thumbnails,
 495             'chapters': chapters,
 496             'subtitles': subtitles,
 497         }
 498
 499         if service_name == 'url':
 500             info['url'] = service_id
 501         elif service_name == 'yoda':
 502             formats, duration = self._extract_formats_and_duration(
 503                 player_info['video_servers'][0], service_id, video_id)
 504             info.update({
 505                 'duration': duration,
 506                 'formats': formats,
 507             })
 508         else:
 509             info.update({
 510                 '_type': 'url_transparent',
 511                 'url': service_id,
 512                 'ie_key': service_name.capitalize(),
 513                 'display_id': video_id,
 514             })
 515             if service_name == 'vimeo':
 516                 info['url'] = smuggle_url(
 517                     f'https://player.vimeo.com/video/{service_id}',
 518                     {'referer': url})
 519
 520         video_slides = traverse_obj(slides, ('slides', ..., 'video', 'id'))
 521         if not video_slides:
 522             return info
 523
 524         def entries():
 525             yield info
 526
 527             service_data = self._download_json(
 528                 f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data',
 529                 video_id, fatal=False, query={
 530                     'player_token': player_token,
 531                     'videos': ','.join(video_slides),
 532                 }, note='Downloading video slides info', errnote='Failed to download video slides info') or {}
 533
 534             for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...)), 1):
 535                 if traverse_obj(slide, ('video', 'service')) != 'yoda':
 536                     continue
 537                 video_path = traverse_obj(slide, ('video', 'id'))
 538                 cdn_hostname = traverse_obj(service_data, (
 539                     video_path, 'video_servers', ...), get_all=False)
 540                 if not cdn_hostname or not video_path:
 541                     continue
 542                 formats, _ = self._extract_formats_and_duration(
 543                     cdn_hostname, video_path, video_id, skip_duration=True)
 544                 if not formats:
 545                     continue
 546                 yield {
 547                     'id': f'{video_id}-{slide_id:03d}',
 548                     'title': f'{info["title"]} - Slide {slide_id:03d}',
 549                     'timestamp': info['timestamp'],
 550                     'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000),
 551                     'formats': formats,
 552                 }
 553
 554         return self.playlist_result(entries(), f'{video_id}-playlist', info['title'])