[cleanup] Make more playlist entries lazy (#11763)
[yt-dlp.git] / yt_dlp / extractor / videoken.py
blobdc1dcf12bd86867d1056762c9f35a4019ff9163c
1 import base64
2 import functools
3 import math
4 import re
5 import time
6 import urllib.parse
8 from .common import InfoExtractor
9 from .slideslive import SlidesLiveIE
10 from ..utils import (
11 ExtractorError,
12 InAdvancePagedList,
13 int_or_none,
14 remove_start,
15 traverse_obj,
16 update_url_query,
17 url_or_none,
class VideoKenBaseIE(InfoExtractor):
    # Shared helpers for the VideoKen-backed video portals handled in this module.

    # Portal hostname -> organization slug used in the "details" API URL
    _ORGANIZATIONS = {
        'videos.icts.res.in': 'icts',
        'videos.cncf.io': 'cncf',
        'videos.neurips.cc': 'neurips',
    }
    # Common URL prefix; the matched hostname is exposed as the "host" group
    _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/'

    # Number of entries requested per page from the paginated API endpoints
    _PAGE_SIZE = 12

    def _get_org_id_and_api_key(self, org, video_id):
        """Download the details JSON of organization `org` and return its (id, apikey) pair.

        `video_id` is only passed through to the download call.
        """
        org_details = self._download_json(
            f'https://analytics.videoken.com/api/videolake/{org}/details', video_id,
            headers={'Accept': 'application/json'},
            note='Downloading organization ID and API key')
        return org_details['id'], org_details['apikey']

    def _create_slideslive_url(self, video_url, video_id, referer):
        """Build the SlidesLive embed URL for a video.

        Returns None when neither `video_url` nor `video_id` is given.
        When `video_url` is missing or is an "embed/sign-in" URL, the embed URL
        is rebuilt from `video_id` (with a leading "slideslive-" removed).
        If `referer` passes `url_or_none`, the embed parent/origin query
        parameters derived from it are added to the result.
        """
        if not (video_url or video_id):
            return None

        if not video_url or 'embed/sign-in' in video_url:
            slideslive_id = remove_start(video_id, 'slideslive-')
            video_url = f'https://slideslive.com/embed/{slideslive_id}'

        if not url_or_none(referer):
            return video_url

        referer_host = urllib.parse.urlparse(referer).hostname
        return update_url_query(video_url, {
            'embed_parent_url': referer,
            'embed_container_origin': f'https://{referer_host}',
        })

    def _extract_videos(self, videos, url):
        """Yield a url_result for each usable entry under the "videos"/"results" keys of `videos`.

        Entries without an ID, and non-YouTube entries without a valid embed
        URL, are skipped. `url` is used as the referer for SlidesLive embeds.
        """
        for entry in traverse_obj(videos, (('videos', 'results'), ...)):
            entry_id = traverse_obj(entry, 'youtube_id', 'videoid')
            if not entry_id:
                continue

            # YouTube entries are delegated using the bare video ID as URL
            if traverse_obj(entry, 'type', 'source') == 'youtube':
                yield self.url_result(entry_id, 'Youtube', entry_id)
                continue

            embed_url = traverse_obj(entry, 'embed_url', 'embeddableurl', expected_type=url_or_none)
            if not embed_url:
                continue

            extractor = None
            if urllib.parse.urlparse(embed_url).hostname == 'slideslive.com':
                extractor = SlidesLiveIE
                embed_url = self._create_slideslive_url(embed_url, entry_id, url)
            yield self.url_result(embed_url, extractor, entry_id)
class VideoKenIE(VideoKenBaseIE):
    # Single video pages, optionally nested under a topic/ or category/ path
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)'
    _TESTS = [{
        # neurips -> videoken -> slideslive
        'url': 'https://videos.neurips.cc/video/slideslive-38922815',
        'info_dict': {
            'id': '38922815',
            'ext': 'mp4',
            'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures',
            'timestamp': 1630939331,
            'upload_date': '20210906',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:330',
            'chapters': 'count:329',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'expected_warnings': ['Failed to download VideoKen API JSON'],
    }, {
        # neurips -> videoken -> slideslive -> youtube
        'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348',
        'info_dict': {
            'id': '2Xa_dt78rJE',
            'ext': 'mp4',
            'display_id': '38923348',
            'title': 'Machine Education',
            'description': 'Watch full version of this video at https://slideslive.com/38923348.',
            'channel': 'SlidesLive Videos - G2',
            'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w',
            'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
            'uploader': 'SlidesLive Videos - G2',
            'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w',
            'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
            'duration': 2504,
            'timestamp': 1618922125,
            'upload_date': '20200131',
            'age_limit': 0,
            'channel_follower_count': int,
            'view_count': int,
            'availability': 'unlisted',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'categories': ['People & Blogs'],
            'tags': [],
            'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
            'thumbnails': 'count:78',
            'chapters': 'count:77',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'expected_warnings': ['Failed to download VideoKen API JSON'],
    }, {
        # icts -> videoken -> youtube
        'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc',
        'info_dict': {
            'id': 'zysIsojYdvc',
            'ext': 'mp4',
            'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad',
            'description': 'md5:87433069d79719eeadc1962cc2ace00b',
            'channel': 'International Centre for Theoretical Sciences',
            'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ',
            'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ',
            'uploader': 'International Centre for Theoretical Sciences',
            'uploader_id': 'ICTStalks',
            'uploader_url': 'http://www.youtube.com/user/ICTStalks',
            'duration': 3372,
            'upload_date': '20191004',
            'age_limit': 0,
            'live_status': 'not_live',
            'availability': 'public',
            'playable_in_embed': True,
            'channel_follower_count': int,
            'like_count': int,
            'view_count': int,
            'categories': ['Science & Technology'],
            'tags': [],
            'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
            'thumbnails': 'count:42',
            'chapters': 'count:20',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8',
        'only_matching': True,
    }, {
        'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI',
        'only_matching': True,
    }, {
        'url': 'https://videos.icts.res.in/video/d7HuP_abpKU',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a VideoKen video page to a url_result for the hosting extractor.

        The video info API is queried non-fatally. When it yields no usable
        entry, the video ID itself is used to guess the host: a "slideslive-"
        prefix maps to SlidesLive, an 11-character ID is treated as YouTube.

        Raises ExtractorError when none of these strategies apply.
        """
        hostname, video_id = self._match_valid_url(url).group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id)
        details = self._download_json(
            'https://analytics.videoken.com/api/videoinfo_private', video_id, query={
                'videoid': video_id,
                'org_id': org_id,
            }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON',
            errnote='Failed to download VideoKen API JSON', fatal=False)

        if details:
            # _extract_videos() skips entries lacking an ID or embed URL, so the
            # generator may be empty; use a default instead of leaking StopIteration
            result = next(self._extract_videos({'videos': [details]}, url), None)
            if result:
                return result

        # fallback for API error 400 response (or a response without a usable entry)
        if video_id.startswith('slideslive-'):
            return self.url_result(
                self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
        if re.match(r'^[\w-]{11}$', video_id):
            return self.url_result(video_id, 'Youtube', video_id)

        if details:
            raise ExtractorError('Unable to extract video from VideoKen API response')
        raise ExtractorError('Unable to extract without VideoKen API response')
class VideoKenPlayerIE(VideoKenBaseIE):
    # Handles player.videoken.com embeds of SlidesLive videos
    _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://player.videoken.com/embed/slideslive-38968434',
        'info_dict': {
            'id': '38968434',
            'ext': 'mp4',
            'title': 'Deep Learning with Label Differential Privacy',
            'timestamp': 1643377020,
            'upload_date': '20220128',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:30',
            'chapters': 'count:29',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }]

    def _real_extract(self, url):
        """Delegate the embedded player URL to SlidesLiveIE, using `url` as referer."""
        slideslive_id = self._match_id(url)
        embed_url = self._create_slideslive_url(None, slideslive_id, url)
        return self.url_result(embed_url, SlidesLiveIE, slideslive_id)
class VideoKenPlaylistIE(VideoKenBaseIE):
    # Playlist pages, optionally nested under a category/ path
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://videos.icts.res.in/category/1822/playlist/381',
        'playlist_mincount': 117,
        'info_dict': {
            'id': '381',
            'title': 'Cosmology - The Next Decade',
        },
    }]

    def _real_extract(self, url):
        """Download the playlist items JSON and return its entries as a playlist."""
        match = self._match_valid_url(url)
        hostname, playlist_id = match.group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id)

        playlist_json = self._download_json(
            f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/',
            playlist_id, note='Downloading API JSON', headers={'Accept': 'application/json'})

        entries = self._extract_videos(playlist_json, url)
        return self.playlist_result(entries, playlist_id, playlist_json.get('title'))
class VideoKenCategoryIE(VideoKenBaseIE):
    # Category listing pages, exposed as a lazily paged playlist
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://videos.icts.res.in/category/1822/',
        'playlist_mincount': 500,
        'info_dict': {
            'id': '1822',
            'title': 'Programs',
        },
    }, {
        'url': 'https://videos.neurips.cc/category/350/',
        'playlist_mincount': 34,
        'info_dict': {
            'id': '350',
            'title': 'NeurIPS 2018',
        },
    }, {
        'url': 'https://videos.cncf.io/category/479/',
        'playlist_mincount': 328,
        'info_dict': {
            'id': '479',
            'title': 'KubeCon + CloudNativeCon Europe\'19',
        },
    }]

    def _get_category_page(self, category_id, org_id, page=1, note=None):
        """Download one page of category videos.

        `page` is the page number sent to the API (default 1). The download is
        non-fatal; an empty dict is returned when it yields nothing.
        """
        return self._download_json(
            f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id,
            fatal=False, note=note if note else f'Downloading category page {page}',
            query={
                'category_id': category_id,
                'page_number': page,
                'length': self._PAGE_SIZE,
            }, headers={'Accept': 'application/json'}) or {}

    def _entries(self, category_id, org_id, url, page):
        """Yield the entries of one page; the API is queried with `page + 1`."""
        videos = self._get_category_page(category_id, org_id, page + 1)
        yield from self._extract_videos(videos, url)

    def _real_extract(self, url):
        """Return the category as a paged playlist.

        Raises ExtractorError when the total number of videos cannot be
        determined from the category info response.
        """
        hostname, category_id = self._match_valid_url(url).group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id)
        category_info = self._get_category_page(category_id, org_id, note='Downloading category info')

        # _get_category_page() is non-fatal and may return {}, so do not
        # subscript blindly: the page count is required, the title is not
        total_videos = int_or_none(category_info.get('recordsTotal'))
        if total_videos is None:
            raise ExtractorError('Unable to extract category info')
        total_pages = math.ceil(total_videos / self._PAGE_SIZE)

        return self.playlist_result(InAdvancePagedList(
            functools.partial(self._entries, category_id, org_id, url),
            total_pages, self._PAGE_SIZE), category_id, category_info.get('category_name'))
class VideoKenTopicIE(VideoKenBaseIE):
    # Topic listing pages, exposed as a lazily paged playlist
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://videos.neurips.cc/topic/machine%20learning/',
        'playlist_mincount': 500,
        'info_dict': {
            'id': 'machine_learning',
            'title': 'machine learning',
        },
    }, {
        'url': 'https://videos.icts.res.in/topic/gravitational%20waves/',
        'playlist_mincount': 77,
        'info_dict': {
            'id': 'gravitational_waves',
            'title': 'gravitational waves',
        },
    }, {
        'url': 'https://videos.cncf.io/topic/prometheus/',
        'playlist_mincount': 134,
        'info_dict': {
            'id': 'prometheus',
            'title': 'prometheus',
        },
    }]

    def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None):
        """Download one page of search results for `topic`.

        `page` is the page number sent to the API (default 1). The download is
        non-fatal; an empty dict is returned when it yields nothing.
        """
        return self._download_json(
            'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={
                'orgid': org_id,
                'size': self._PAGE_SIZE,
                'query': topic,
                'page': page,
                'sort': 'upload_desc',
                'filter': 'all',
                'token': api_key,
                'is_topic': 'true',
                'category': '',
                'searchid': search_id,
            }, headers={'Accept': 'application/json'},
            note=note if note else f'Downloading topic page {page}') or {}

    def _entries(self, topic, org_id, search_id, api_key, url, page):
        """Yield the entries of one page; the API is queried with `page + 1`."""
        videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1)
        yield from self._extract_videos(videos, url)

    def _real_extract(self, url):
        """Return the topic as a paged playlist.

        The playlist title is the URL-decoded topic; the playlist ID is the
        same string with spaces replaced by underscores.

        Raises ExtractorError when the number of result pages cannot be
        determined from the topic info response.
        """
        hostname, topic_id = self._match_valid_url(url).group('host', 'id')
        topic = urllib.parse.unquote(topic_id)
        topic_id = topic.replace(' ', '_')
        org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic)
        # Search ID sent as "searchid": base64 of ":<topic>:<unix time>:transient"
        search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode()

        # _get_topic_page() is non-fatal and may return {}; int_or_none() may
        # return None. Neither may reach InAdvancePagedList as the page count
        topic_info = self._get_topic_page(
            topic, org_id, search_id, api_key, note='Downloading topic info')
        total_pages = int_or_none(topic_info.get('total_no_of_pages'))
        if total_pages is None:
            raise ExtractorError('Unable to extract topic info')

        return self.playlist_result(InAdvancePagedList(
            functools.partial(self._entries, topic, org_id, search_id, api_key, url),
            total_pages, self._PAGE_SIZE), topic_id, topic)