[ie/soundcloud] Various fixes (#11820)
[yt-dlp.git] / yt_dlp / extractor / odnoklassniki.py
blobd27d1c3f024283652a5ecd7485a85c2d399d7871
1 import urllib.parse
3 from .common import InfoExtractor
4 from ..compat import compat_etree_fromstring
5 from ..networking import HEADRequest
6 from ..utils import (
7 ExtractorError,
8 float_or_none,
9 int_or_none,
10 qualities,
11 smuggle_url,
12 traverse_obj,
13 unescapeHTML,
14 unified_strdate,
15 unsmuggle_url,
16 url_or_none,
17 urlencode_postdata,
21 class OdnoklassnikiIE(InfoExtractor):
# Accepted URL shapes on (www.|m.|mobile.)ok.ru / odnoklassniki.ru:
#   /video/<id>, /videoembed/<id>, /web-api/video/moviePlayer/<id>,
#   /live/<id> and the mobile "dk?...st.mvId=<id>" layer links.
# The "embed" group is set only for /videoembed/ URLs; "id" is made of
# digits and dashes (e.g. "63567059965189-0").
_VALID_URL = r'''(?x)
    https?://
        (?:(?:www|m|mobile)\.)?
        (?:odnoklassniki|ok)\.ru/
        (?:
            video(?P<embed>embed)?/|
            web-api/video/moviePlayer/|
            live/|
            dk\?.*?st\.mvId=
        )
        (?P<id>[\d-]+)
    '''
# Locates ok.ru /videoembed/ iframes inside third-party webpages
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
# Extractor test cases: entries with 'info_dict' describe the expected
# extraction result, entries with 'only_matching' only exercise _VALID_URL.
_TESTS = [{
    'note': 'Coub embedded',
    'url': 'http://ok.ru/video/1484130554189',
    'info_dict': {
        'id': '1keok9',
        'ext': 'mp4',
        'timestamp': 1545580896,
        'view_count': int,
        'thumbnail': r're:^https?://.*\.jpg$',
        'title': 'Народная забава',
        'uploader': 'Nevata',
        'upload_date': '20181223',
        'age_limit': 0,
        'uploader_id': 'nevata.s',
        'like_count': int,
        'duration': 8.08,
        'repost_count': int,
    },
}, {
    'note': 'vk.com embedded',
    'url': 'https://ok.ru/video/3568183087575',
    'info_dict': {
        'id': '-165101755_456243749',
        'ext': 'mp4',
        'uploader_id': '-165101755',
        'duration': 132,
        'timestamp': 1642869935,
        'upload_date': '20220122',
        'thumbnail': str,
        'title': str,
        'uploader': str,
    },
    'skip': 'vk extractor error',
}, {
    # metadata in JSON, webm_dash with Firefox UA
    'url': 'http://ok.ru/video/20079905452',
    'md5': '8f477d8931c531374a3e36daec617b2c',
    'info_dict': {
        'id': '20079905452',
        'ext': 'webm',
        'title': 'Культура меняет нас (прекрасный ролик!))',
        'thumbnail': str,
        'duration': 100,
        'upload_date': '20141207',
        'uploader_id': '330537914540',
        'uploader': 'Виталий Добровольский',
        'like_count': int,
        'age_limit': 0,
    },
    'params': {
        'format': 'bv[ext=webm]',
        'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'},
    },
}, {
    # metadataUrl
    'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
    'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3',
    'info_dict': {
        'id': '63567059965189-0',
        'ext': 'mp4',
        'title': 'Девушка без комплексов ...',
        'thumbnail': str,
        'duration': 191,
        'upload_date': '20150518',
        'uploader_id': '534380003155',
        'uploader': '☭ Андрей Мещанинов ☭',
        'like_count': int,
        'age_limit': 0,
        'start_time': 5,
    },
    'params': {'skip_download': 'm3u8'},
}, {
    # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
    'url': 'https://ok.ru/video/3952212382174',
    'md5': '5fb5f83ce16cb212d6bf887282b5da53',
    'info_dict': {
        'id': '5axVgHHDBvU',
        'ext': 'mp4',
        'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
        'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
        'uploader': 'Lod Mer',
        'uploader_id': '575186401502',
        'duration': 1529,
        'age_limit': 0,
        'upload_date': '20210405',
        'comment_count': int,
        'live_status': 'not_live',
        'view_count': int,
        'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
        'uploader_url': 'https://www.youtube.com/@MrKewlkid94',
        'channel_follower_count': int,
        'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
        'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
        'like_count': int,
        'availability': 'public',
        'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
        'categories': ['Education'],
        'playable_in_embed': True,
        'channel': 'BornToReact',
    },
}, {
    # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
    'url': 'http://ok.ru/video/62036049272859-0',
    'info_dict': {
        'id': '62036049272859-0',
        'ext': 'mp4',
        'title': 'МУЗЫКА ДОЖДЯ .',
        'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
        'upload_date': '20120106',
        'uploader_id': '473534735899',
        'uploader': 'МARINA D',
        'age_limit': 0,
    },
    'params': {
        'skip_download': True,
    },
    'skip': 'Video has not been found',
}, {
    'note': 'Only available in mobile webpage',
    'url': 'https://m.ok.ru/video/2361249957145',
    'info_dict': {
        'id': '2361249957145',
        'ext': 'mp4',
        'title': 'Быковское крещение',
        'duration': 3038.181,
        'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
    },
}, {
    'note': 'subtitles',
    'url': 'https://ok.ru/video/4249587550747',
    'info_dict': {
        'id': '4249587550747',
        'ext': 'mp4',
        'title': 'Small Country An African Childhood (2020) (1080p) +subtitle',
        'uploader': 'Sunflower Movies',
        'uploader_id': '595802161179',
        'upload_date': '20220816',
        'duration': 6728,
        'age_limit': 0,
        'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
        'like_count': int,
        'subtitles': dict,
    },
    'params': {
        'skip_download': True,
    },
}, {
    'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
    'only_matching': True,
}, {
    'url': 'http://www.ok.ru/video/20648036891',
    'only_matching': True,
}, {
    'url': 'http://www.ok.ru/videoembed/20648036891',
    'only_matching': True,
}, {
    'url': 'http://m.ok.ru/video/20079905452',
    'only_matching': True,
}, {
    'url': 'http://mobile.ok.ru/video/20079905452',
    'only_matching': True,
}, {
    'url': 'https://www.ok.ru/live/484531969818',
    'only_matching': True,
}, {
    'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
    'only_matching': True,
}, {
    # Paid video
    'url': 'https://ok.ru/video/954886983203',
    'only_matching': True,
}, {
    'url': 'https://ok.ru/videoembed/2932705602075',
    'info_dict': {
        'id': '2932705602075',
        'ext': 'mp4',
        'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
        'title': 'Boosty для тебя!',
        'uploader_id': '597811038747',
        'like_count': 0,
        'duration': 35,
    },
}]
# Test for discovering an ok.ru embed inside a third-party page
_WEBPAGE_TESTS = [{
    'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
    'info_dict': {
        'id': '3950343629563',
        'ext': 'mp4',
        'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
        'title': 'Заяц Бусти.mp4',
        'uploader_id': '571368965883',
        'like_count': 0,
        'duration': 10444,
    },
    'skip': 'Site no longer embeds',
}]
def _clear_cookies(self, cdn_url):
    """Drop CDN cookies that would break direct HTTP downloads.

    Clears cookies for the shared ``.mycdn.me`` domain and for the host of
    *cdn_url*.
    """
    # Direct http downloads will fail if CDN cookies are set
    # so we need to reset them after each format extraction
    self.cookiejar.clear(domain='.mycdn.me')
    hostname = urllib.parse.urlparse(cdn_url).hostname
    # A URL without a host yields None; for http.cookiejar.CookieJar,
    # clear(domain=None) means "clear everything", so skip the call
    # instead of wiping unrelated cookies.
    if hostname:
        self.cookiejar.clear(domain=hostname)
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Yield each embed URL found by the parent class, smuggled with the
    embedding page *url* under the ``referrer`` key."""
    embed_urls = super()._extract_embed_urls(url, webpage)
    yield from (
        smuggle_url(embed_url, {'referrer': url})
        for embed_url in embed_urls)
def _real_extract(self, url):
    """Extract via the desktop page, falling back to the mobile page.

    If both attempts raise ExtractorError, the desktop error is the one
    re-raised.
    """
    try:
        result = self._extract_desktop(url)
    except ExtractorError as desktop_error:
        try:
            result = self._extract_mobile(url)
        except ExtractorError:
            # error message of desktop webpage is in English
            raise desktop_error
    return result
def _extract_desktop(self, url):
    """Extract a video through the desktop (or /videoembed/) ok.ru page.

    Returns an info dict with formats for videos hosted on the site, a
    ``url_transparent`` result for OPEN_GRAPH / USER_YOUTUBE providers, or
    a plain ``url_result`` when the page embeds an external player.

    Raises ExtractorError when the page shows an error stub or when a
    non-delegated video has no title.
    """
    start_time = int_or_none(urllib.parse.parse_qs(
        urllib.parse.urlparse(url).query).get('fromTime', [None])[0])

    url, smuggled = unsmuggle_url(url, {})
    video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
    mode = 'videoembed' if is_embed else 'video'

    webpage = self._download_webpage(
        f'https://ok.ru/{mode}/{video_id}', video_id,
        note='Downloading desktop webpage',
        headers={'Referer': smuggled['referrer']} if smuggled.get('referrer') else {})

    error = self._search_regex(
        r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
        webpage, 'error', default=None)
    # Direct link from boosty: retry once with a boosty referrer
    if (error == 'The author of this video has not been found or is blocked'
            and not smuggled.get('referrer') and mode == 'videoembed'):
        return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
    elif error:
        raise ExtractorError(error, expected=True)

    player = self._parse_json(
        unescapeHTML(self._search_regex(
            rf'data-options=(?P<quote>["\'])(?P<player>{{.+?{video_id}.+?}})(?P=quote)',
            webpage, 'player', group='player')),
        video_id)

    # embedded external player
    if player.get('isExternalPlayer') and player.get('url'):
        return self.url_result(player['url'])

    flashvars = player['flashvars']

    # Metadata is either inlined as a JSON string or fetched from metadataUrl
    metadata = flashvars.get('metadata')
    if metadata:
        metadata = self._parse_json(metadata, video_id)
    else:
        data = {}
        st_location = flashvars.get('location')
        if st_location:
            data['st.location'] = st_location
        metadata = self._download_json(
            urllib.parse.unquote(flashvars['metadataUrl']),
            video_id, 'Downloading metadata JSON',
            data=urlencode_postdata(data))

    movie = metadata['movie']

    # Some embedded videos may not contain title in movie dict (e.g.
    # http://ok.ru/video/62036049272859-0) thus we allow missing title
    # here and it's going to be extracted later by an extractor that
    # will process the actual embed.
    provider = metadata.get('provider')
    title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')

    thumbnail = movie.get('poster')
    duration = int_or_none(movie.get('duration'))

    # `or {}` also covers an explicit JSON null for "author"
    author = metadata.get('author') or {}
    uploader_id = author.get('id')
    uploader = author.get('name')

    upload_date = unified_strdate(self._html_search_meta(
        'ya:ovs:upload_date', webpage, 'upload date', default=None))

    age_limit = None
    adult = self._html_search_meta(
        'ya:ovs:adult', webpage, 'age limit', default=None)
    if adult:
        age_limit = 18 if adult == 'true' else 0

    like_count = int_or_none(metadata.get('likeCount'))

    subtitles = {}
    for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict):
        sub_url = sub.get('url')
        if not sub_url:
            continue
        subtitles.setdefault(sub.get('language') or 'en', []).append({
            'url': sub_url,
            'ext': 'vtt',
        })

    info = {
        'id': video_id,
        'title': title,
        'thumbnail': thumbnail,
        'duration': duration,
        'upload_date': upload_date,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'like_count': like_count,
        'age_limit': age_limit,
        'start_time': start_time,
        'subtitles': subtitles,
    }

    # pladform (OPEN_GRAPH) and YouTube embeds are delegated to whichever
    # extractor handles movie['contentId']
    if provider in ('OPEN_GRAPH', 'USER_YOUTUBE'):
        info.update({
            '_type': 'url_transparent',
            'url': movie['contentId'],
        })
        return info

    # Explicit check instead of `assert`, which is stripped under -O and
    # would bypass the ExtractorError-based mobile fallback
    if not title:
        raise ExtractorError('Unable to extract video title', video_id=video_id)

    quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))

    formats = [{
        'url': f['url'],
        'ext': 'mp4',
        'format_id': f.get('name'),
    } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))]

    m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls')
    if m3u8_url:
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
        self._clear_cookies(m3u8_url)

    for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]:
        mpd_url = metadata.get(mpd_key)
        if mpd_url:
            formats.extend(self._extract_mpd_formats(
                mpd_url, video_id, mpd_id=mpd_id, fatal=False))
            self._clear_cookies(mpd_url)

    dash_manifest = metadata.get('metadataEmbedded')
    if dash_manifest:
        formats.extend(self._parse_mpd_formats(
            compat_etree_fromstring(dash_manifest), 'mpd'))

    # Rank the formats collected so far by the "type" digit in their URL;
    # the live formats appended below are not ranked
    for fmt in formats:
        fmt_type = self._search_regex(
            r'\btype[/=](\d)', fmt['url'],
            'format type', default=None)
        if fmt_type:
            fmt['quality'] = quality(fmt_type)

    # Live formats
    m3u8_url = metadata.get('hlsMasterPlaylistUrl')
    if m3u8_url:
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
        self._clear_cookies(m3u8_url)
    rtmp_url = metadata.get('rtmpUrl')
    if rtmp_url:
        formats.append({
            'url': rtmp_url,
            'format_id': 'rtmp',
            'ext': 'flv',
        })

    if not formats:
        payment_info = metadata.get('paymentInfo')
        if payment_info:
            self.raise_no_formats('This video is paid, subscribe to download it', expected=True)

    info['formats'] = formats
    return info
def _extract_mobile(self, url):
    """Extract a video through the mobile (m.ok.ru) page.

    Returns an info dict with a single ``mobile`` MP4 format whose URL is
    the final URL of a HEAD request to the page's ``videoSrc``.

    Raises ExtractorError when the page shows an error block or carries
    no ``videoSrc``.
    """
    video_id = self._match_id(url)

    webpage = self._download_webpage(
        f'http://m.ok.ru/video/{video_id}', video_id,
        note='Downloading mobile webpage')

    error = self._search_regex(
        r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
        webpage, 'error', default=None)
    if error:
        raise ExtractorError(error, expected=True)

    json_data = self._search_regex(
        r'data-video="(.+?)"', webpage, 'json data')
    json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}

    # Raise ExtractorError (not a bare KeyError) so that _real_extract can
    # fall back to reporting the desktop error
    video_src = json_data.get('videoSrc')
    if not video_src:
        raise ExtractorError('Unable to extract mobile video URL', video_id=video_id)

    redirect_url = self._request_webpage(HEADRequest(
        video_src), video_id, 'Requesting download URL').url
    self._clear_cookies(redirect_url)

    return {
        'id': video_id,
        'title': json_data.get('videoName'),
        'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
        'thumbnail': json_data.get('videoPosterSrc'),
        'formats': [{
            'format_id': 'mobile',
            'url': redirect_url,
            'ext': 'mp4',
        }],
    }