yt_dlp/extractor/videoken.py

   1 import base64
   2 import functools
   3 import math
   4 import re
   5 import time
   6 import urllib.parse
   7
   8 from .common import InfoExtractor
   9 from .slideslive import SlidesLiveIE
  10 from ..utils import (
  11     ExtractorError,
  12     InAdvancePagedList,
  13     int_or_none,
  14     remove_start,
  15     traverse_obj,
  16     update_url_query,
  17     url_or_none,
  18 )
  19
  20
  21 class VideoKenBaseIE(InfoExtractor):
  22     _ORGANIZATIONS = {
  23         'videos.icts.res.in': 'icts',
  24         'videos.cncf.io': 'cncf',
  25         'videos.neurips.cc': 'neurips',
  26     }
  27     _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/'
  28
  29     _PAGE_SIZE = 12
  30
  31     def _get_org_id_and_api_key(self, org, video_id):
  32         details = self._download_json(
  33             f'https://analytics.videoken.com/api/videolake/{org}/details', video_id,
  34             note='Downloading organization ID and API key', headers={
  35                 'Accept': 'application/json',
  36             })
  37         return details['id'], details['apikey']
  38
  39     def _create_slideslive_url(self, video_url, video_id, referer):
  40         if not video_url and not video_id:
  41             return
  42         elif not video_url or 'embed/sign-in' in video_url:
  43             video_url = f'https://slideslive.com/embed/{remove_start(video_id, "slideslive-")}'
  44         if url_or_none(referer):
  45             return update_url_query(video_url, {
  46                 'embed_parent_url': referer,
  47                 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).hostname}',
  48             })
  49         return video_url
  50
  51     def _extract_videos(self, videos, url):
  52         for video in traverse_obj(videos, (('videos', 'results'), ...)):
  53             video_id = traverse_obj(video, 'youtube_id', 'videoid')
  54             if not video_id:
  55                 continue
  56             ie_key = None
  57             if traverse_obj(video, 'type', 'source') == 'youtube':
  58                 video_url = video_id
  59                 ie_key = 'Youtube'
  60             else:
  61                 video_url = traverse_obj(video, 'embed_url', 'embeddableurl', expected_type=url_or_none)
  62                 if not video_url:
  63                     continue
  64                 elif urllib.parse.urlparse(video_url).hostname == 'slideslive.com':
  65                     ie_key = SlidesLiveIE
  66                     video_url = self._create_slideslive_url(video_url, video_id, url)
  67             yield self.url_result(video_url, ie_key, video_id)
  68
  69
  70 class VideoKenIE(VideoKenBaseIE):
  71     _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)'
  72     _TESTS = [{
  73         # neurips -> videoken -> slideslive
  74         'url': 'https://videos.neurips.cc/video/slideslive-38922815',
  75         'info_dict': {
  76             'id': '38922815',
  77             'ext': 'mp4',
  78             'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures',
  79             'timestamp': 1630939331,
  80             'upload_date': '20210906',
  81             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  82             'thumbnails': 'count:330',
  83             'chapters': 'count:329',
  84         },
  85         'params': {
  86             'skip_download': 'm3u8',
  87         },
  88         'expected_warnings': ['Failed to download VideoKen API JSON'],
  89     }, {
  90         # neurips -> videoken -> slideslive -> youtube
  91         'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348',
  92         'info_dict': {
  93             'id': '2Xa_dt78rJE',
  94             'ext': 'mp4',
  95             'display_id': '38923348',
  96             'title': 'Machine Education',
  97             'description': 'Watch full version of this video at https://slideslive.com/38923348.',
  98             'channel': 'SlidesLive Videos - G2',
  99             'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w',
 100             'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
 101             'uploader': 'SlidesLive Videos - G2',
 102             'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w',
 103             'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
 104             'duration': 2504,
 105             'timestamp': 1618922125,
 106             'upload_date': '20200131',
 107             'age_limit': 0,
 108             'channel_follower_count': int,
 109             'view_count': int,
 110             'availability': 'unlisted',
 111             'live_status': 'not_live',
 112             'playable_in_embed': True,
 113             'categories': ['People & Blogs'],
 114             'tags': [],
 115             'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
 116             'thumbnails': 'count:78',
 117             'chapters': 'count:77',
 118         },
 119         'params': {
 120             'skip_download': 'm3u8',
 121         },
 122         'expected_warnings': ['Failed to download VideoKen API JSON'],
 123     }, {
 124         # icts -> videoken -> youtube
 125         'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc',
 126         'info_dict': {
 127             'id': 'zysIsojYdvc',
 128             'ext': 'mp4',
 129             'title': 'Small-worlds, complex networks and random graphs (Lecture 3)  by Remco van der Hofstad',
 130             'description': 'md5:87433069d79719eeadc1962cc2ace00b',
 131             'channel': 'International Centre for Theoretical Sciences',
 132             'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ',
 133             'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ',
 134             'uploader': 'International Centre for Theoretical Sciences',
 135             'uploader_id': 'ICTStalks',
 136             'uploader_url': 'http://www.youtube.com/user/ICTStalks',
 137             'duration': 3372,
 138             'upload_date': '20191004',
 139             'age_limit': 0,
 140             'live_status': 'not_live',
 141             'availability': 'public',
 142             'playable_in_embed': True,
 143             'channel_follower_count': int,
 144             'like_count': int,
 145             'view_count': int,
 146             'categories': ['Science & Technology'],
 147             'tags': [],
 148             'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
 149             'thumbnails': 'count:42',
 150             'chapters': 'count:20',
 151         },
 152         'params': {
 153             'skip_download': 'm3u8',
 154         },
 155     }, {
 156         'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8',
 157         'only_matching': True,
 158     }, {
 159         'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI',
 160         'only_matching': True,
 161     }, {
 162         'url': 'https://videos.icts.res.in/video/d7HuP_abpKU',
 163         'only_matching': True,
 164     }]
 165
 166     def _real_extract(self, url):
 167         hostname, video_id = self._match_valid_url(url).group('host', 'id')
 168         org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id)
 169         details = self._download_json(
 170             'https://analytics.videoken.com/api/videoinfo_private', video_id, query={
 171                 'videoid': video_id,
 172                 'org_id': org_id,
 173             }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON',
 174             errnote='Failed to download VideoKen API JSON', fatal=False)
 175         if details:
 176             return next(self._extract_videos({'videos': [details]}, url))
 177         # fallback for API error 400 response
 178         elif video_id.startswith('slideslive-'):
 179             return self.url_result(
 180                 self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
 181         elif re.match(r'^[\w-]{11}$', video_id):
 182             return self.url_result(video_id, 'Youtube', video_id)
 183         else:
 184             raise ExtractorError('Unable to extract without VideoKen API response')
 185
 186
 187 class VideoKenPlayerIE(VideoKenBaseIE):
 188     _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)'
 189     _TESTS = [{
 190         'url': 'https://player.videoken.com/embed/slideslive-38968434',
 191         'info_dict': {
 192             'id': '38968434',
 193             'ext': 'mp4',
 194             'title': 'Deep Learning with Label Differential Privacy',
 195             'timestamp': 1643377020,
 196             'upload_date': '20220128',
 197             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
 198             'thumbnails': 'count:30',
 199             'chapters': 'count:29',
 200         },
 201         'params': {
 202             'skip_download': 'm3u8',
 203         },
 204     }]
 205
 206     def _real_extract(self, url):
 207         video_id = self._match_id(url)
 208         return self.url_result(
 209             self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
 210
 211
 212 class VideoKenPlaylistIE(VideoKenBaseIE):
 213     _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)'
 214     _TESTS = [{
 215         'url': 'https://videos.icts.res.in/category/1822/playlist/381',
 216         'playlist_mincount': 117,
 217         'info_dict': {
 218             'id': '381',
 219             'title': 'Cosmology - The Next Decade',
 220         },
 221     }]
 222
 223     def _real_extract(self, url):
 224         hostname, playlist_id = self._match_valid_url(url).group('host', 'id')
 225         org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id)
 226         videos = self._download_json(
 227             f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/',
 228             playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON')
 229         return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title'))
 230
 231
 232 class VideoKenCategoryIE(VideoKenBaseIE):
 233     _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])'
 234     _TESTS = [{
 235         'url': 'https://videos.icts.res.in/category/1822/',
 236         'playlist_mincount': 500,
 237         'info_dict': {
 238             'id': '1822',
 239             'title': 'Programs',
 240         },
 241     }, {
 242         'url': 'https://videos.neurips.cc/category/350/',
 243         'playlist_mincount': 34,
 244         'info_dict': {
 245             'id': '350',
 246             'title': 'NeurIPS 2018',
 247         },
 248     }, {
 249         'url': 'https://videos.cncf.io/category/479/',
 250         'playlist_mincount': 328,
 251         'info_dict': {
 252             'id': '479',
 253             'title': 'KubeCon + CloudNativeCon Europe\'19',
 254         },
 255     }]
 256
 257     def _get_category_page(self, category_id, org_id, page=1, note=None):
 258         return self._download_json(
 259             f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id,
 260             fatal=False, note=note if note else f'Downloading category page {page}',
 261             query={
 262                 'category_id': category_id,
 263                 'page_number': page,
 264                 'length': self._PAGE_SIZE,
 265             }, headers={'Accept': 'application/json'}) or {}
 266
 267     def _entries(self, category_id, org_id, url, page):
 268         videos = self._get_category_page(category_id, org_id, page + 1)
 269         yield from self._extract_videos(videos, url)
 270
 271     def _real_extract(self, url):
 272         hostname, category_id = self._match_valid_url(url).group('host', 'id')
 273         org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id)
 274         category_info = self._get_category_page(category_id, org_id, note='Downloading category info')
 275         category = category_info['category_name']
 276         total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE)
 277         return self.playlist_result(InAdvancePagedList(
 278             functools.partial(self._entries, category_id, org_id, url),
 279             total_pages, self._PAGE_SIZE), category_id, category)
 280
 281
 282 class VideoKenTopicIE(VideoKenBaseIE):
 283     _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])'
 284     _TESTS = [{
 285         'url': 'https://videos.neurips.cc/topic/machine%20learning/',
 286         'playlist_mincount': 500,
 287         'info_dict': {
 288             'id': 'machine_learning',
 289             'title': 'machine learning',
 290         },
 291     }, {
 292         'url': 'https://videos.icts.res.in/topic/gravitational%20waves/',
 293         'playlist_mincount': 77,
 294         'info_dict': {
 295             'id': 'gravitational_waves',
 296             'title': 'gravitational waves',
 297         },
 298     }, {
 299         'url': 'https://videos.cncf.io/topic/prometheus/',
 300         'playlist_mincount': 134,
 301         'info_dict': {
 302             'id': 'prometheus',
 303             'title': 'prometheus',
 304         },
 305     }]
 306
 307     def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None):
 308         return self._download_json(
 309             'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={
 310                 'orgid': org_id,
 311                 'size': self._PAGE_SIZE,
 312                 'query': topic,
 313                 'page': page,
 314                 'sort': 'upload_desc',
 315                 'filter': 'all',
 316                 'token': api_key,
 317                 'is_topic': 'true',
 318                 'category': '',
 319                 'searchid': search_id,
 320             }, headers={'Accept': 'application/json'},
 321             note=note if note else f'Downloading topic page {page}') or {}
 322
 323     def _entries(self, topic, org_id, search_id, api_key, url, page):
 324         videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1)
 325         yield from self._extract_videos(videos, url)
 326
 327     def _real_extract(self, url):
 328         hostname, topic_id = self._match_valid_url(url).group('host', 'id')
 329         topic = urllib.parse.unquote(topic_id)
 330         topic_id = topic.replace(' ', '_')
 331         org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic)
 332         search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode()
 333         total_pages = int_or_none(self._get_topic_page(
 334             topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages'])
 335         return self.playlist_result(InAdvancePagedList(
 336             functools.partial(self._entries, topic, org_id, search_id, api_key, url),
 337             total_pages, self._PAGE_SIZE), topic_id, topic)