yt_dlp/extractor/cda.py

   1 import base64
   2 import codecs
   3 import datetime as dt
   4 import hashlib
   5 import hmac
   6 import json
   7 import random
   8 import re
   9 import urllib.parse
  10
  11 from .common import InfoExtractor
  12 from ..compat import compat_ord
  13 from ..utils import (
  14     ExtractorError,
  15     OnDemandPagedList,
  16     float_or_none,
  17     int_or_none,
  18     merge_dicts,
  19     multipart_encode,
  20     parse_duration,
  21     traverse_obj,
  22     try_call,
  23     try_get,
  24     urljoin,
  25 )
  26
  27
class CDAIE(InfoExtractor):
    # Extractor for single cda.pl videos.
    # The pattern accepts regular watch pages (cda.pl/video/<id>) as well as
    # embedded players (ebd.cda.pl/<width>x<height>/<id>).
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    # Root of the public website; used when scraping watch pages
    _BASE_URL = 'https://www.cda.pl'
    # Root of the JSON API; used for the OAuth token and video metadata requests
    _BASE_API_URL = 'https://api.cda.pl'
    # Base headers for API requests; logging in augments them with
    # 'User-Agent' and 'Authorization' entries
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    # Cache section under which bearer tokens are stored, keyed by username
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        },
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        },
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli...  najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        },
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        },
    }, {
        # Embedded player URL; only checked against _VALID_URL
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]
 113
 114     def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
 115         data, content_type = multipart_encode({'age_confirm': ''})
 116         return self._download_webpage(
 117             url, video_id, *args,
 118             data=data, headers={
 119                 'Referer': url,
 120                 'Content-Type': content_type,
 121             }, **kwargs)
 122
 123     def _perform_login(self, username, password):
 124         app_version = random.choice((
 125             '1.2.88 build 15306',
 126             '1.2.174 build 18469',
 127         ))
 128         android_version = random.randrange(8, 14)
 129         phone_model = random.choice((
 130             # x-kom.pl top selling Android smartphones, as of 2022-12-26
 131             # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
 132             'ASUS ZenFone 8',
 133             'Motorola edge 20 5G',
 134             'Motorola edge 30 neo 5G',
 135             'Motorola moto g22',
 136             'OnePlus Nord 2T 5G',
 137             'Samsung Galaxy A32 SM‑A325F',
 138             'Samsung Galaxy M13',
 139             'Samsung Galaxy S20 FE 5G',
 140             'Xiaomi 11T',
 141             'Xiaomi POCO M4 Pro',
 142             'Xiaomi Redmi 10',
 143             'Xiaomi Redmi 10C',
 144             'Xiaomi Redmi 9C NFC',
 145             'Xiaomi Redmi Note 10 Pro',
 146             'Xiaomi Redmi Note 11 Pro',
 147             'Xiaomi Redmi Note 11',
 148             'Xiaomi Redmi Note 11S 5G',
 149             'Xiaomi Redmi Note 11S',
 150             'realme 10',
 151             'realme 9 Pro+',
 152             'vivo Y33s',
 153         ))
 154         self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
 155
 156         cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
 157         if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
 158             self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
 159             return
 160
 161         password_hash = base64.urlsafe_b64encode(hmac.new(
 162             b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
 163             ''.join(f'{bytes((bt & 255, )).hex():0>2}'
 164                     for bt in hashlib.md5(password.encode()).digest()).encode(),
 165             hashlib.sha256).digest()).decode().replace('=', '')
 166
 167         token_res = self._download_json(
 168             f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
 169             headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
 170             query={
 171                 'grant_type': 'password',
 172                 'login': username,
 173                 'password': password_hash,
 174             })
 175         self.cache.store(self._BEARER_CACHE, username, {
 176             'token': token_res['access_token'],
 177             'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
 178         })
 179         self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
 180
 181     def _real_extract(self, url):
 182         video_id = self._match_id(url)
 183
 184         if 'Authorization' in self._API_HEADERS:
 185             return self._api_extract(video_id)
 186         else:
 187             return self._web_extract(video_id)
 188
 189     def _api_extract(self, video_id):
 190         meta = self._download_json(
 191             f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
 192
 193         uploader = traverse_obj(meta, 'author', 'login')
 194
 195         formats = [{
 196             'url': quality['file'],
 197             'format': quality.get('title'),
 198             'resolution': quality.get('name'),
 199             'height': try_call(lambda: int(quality['name'][:-1])),
 200             'filesize': quality.get('length'),
 201         } for quality in meta['qualities'] if quality.get('file')]
 202
 203         if meta.get('premium') and not meta.get('premium_free') and not formats:
 204             raise ExtractorError(
 205                 'Video requires CDA Premium - subscription needed', expected=True)
 206
 207         return {
 208             'id': video_id,
 209             'title': meta.get('title'),
 210             'description': meta.get('description'),
 211             'uploader': None if uploader == 'anonim' else uploader,
 212             'average_rating': float_or_none(meta.get('rating')),
 213             'thumbnail': meta.get('thumb'),
 214             'formats': formats,
 215             'duration': meta.get('duration'),
 216             'age_limit': 18 if meta.get('for_adults') else 0,
 217             'view_count': meta.get('views'),
 218         }
 219
 220     def _web_extract(self, video_id):
 221         self._set_cookie('cda.pl', 'cda.player', 'html5')
 222         webpage, urlh = self._download_webpage_handle(
 223             f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
 224
 225         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
 226             self.raise_login_required('This video is only available for premium users')
 227
 228         if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
 229             self.raise_geo_restricted()
 230
 231         need_confirm_age = False
 232         if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
 233                                    webpage, 'birthday validate form', default=None):
 234             webpage = self._download_age_confirm_page(
 235                 urlh.url, video_id, note='Confirming age')
 236             need_confirm_age = True
 237
 238         formats = []
 239
 240         uploader = self._search_regex(r'''(?x)
 241             <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
 242             (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
 243             <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
 244         ''', webpage, 'uploader', default=None, group='uploader')
 245         average_rating = self._search_regex(
 246             (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
 247              r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
 248             group='rating_value')
 249
 250         info_dict = {
 251             'id': video_id,
 252             'title': self._og_search_title(webpage),
 253             'description': self._og_search_description(webpage),
 254             'uploader': uploader,
 255             'average_rating': float_or_none(average_rating),
 256             'thumbnail': self._og_search_thumbnail(webpage),
 257             'formats': formats,
 258             'duration': None,
 259             'age_limit': 18 if need_confirm_age else 0,
 260         }
 261
 262         info = self._search_json_ld(webpage, video_id, default={})
 263
 264         # Source: https://www.cda.pl/js/player.js?t=1606154898
 265         def decrypt_file(a):
 266             for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
 267                 a = a.replace(p, '')
 268             a = urllib.parse.unquote(a)
 269             b = []
 270             for c in a:
 271                 f = compat_ord(c)
 272                 b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
 273             a = ''.join(b)
 274             a = a.replace('.cda.mp4', '')
 275             for p in ('.2cda.pl', '.3cda.pl'):
 276                 a = a.replace(p, '.cda.pl')
 277             if '/upstream' in a:
 278                 a = a.replace('/upstream', '.mp4/upstream')
 279                 return 'https://' + a
 280             return 'https://' + a + '.mp4'
 281
 282         def extract_format(page, version):
 283             json_str = self._html_search_regex(
 284                 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
 285                 f'{version} player_json', fatal=False, group='player_data')
 286             if not json_str:
 287                 return
 288             player_data = self._parse_json(
 289                 json_str, f'{version} player_data', fatal=False)
 290             if not player_data:
 291                 return
 292             video = player_data.get('video')
 293             if not video or 'file' not in video:
 294                 self.report_warning(f'Unable to extract {version} version information')
 295                 return
 296             if video['file'].startswith('uggc'):
 297                 video['file'] = codecs.decode(video['file'], 'rot_13')
 298                 if video['file'].endswith('adc.mp4'):
 299                     video['file'] = video['file'].replace('adc.mp4', '.mp4')
 300             elif not video['file'].startswith('http'):
 301                 video['file'] = decrypt_file(video['file'])
 302             video_quality = video.get('quality')
 303             qualities = video.get('qualities', {})
 304             video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
 305             info_dict['formats'].append({
 306                 'url': video['file'],
 307                 'format_id': video_quality,
 308                 'height': int_or_none(video_quality[:-1]),
 309             })
 310             for quality, cda_quality in qualities.items():
 311                 if quality == video_quality:
 312                     continue
 313                 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
 314                         'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
 315                 data = json.dumps(data).encode()
 316                 video_url = self._download_json(
 317                     f'https://www.cda.pl/video/{video_id}', video_id, headers={
 318                         'Content-Type': 'application/json',
 319                         'X-Requested-With': 'XMLHttpRequest',
 320                     }, data=data, note=f'Fetching {quality} url',
 321                     errnote=f'Failed to fetch {quality} url', fatal=False)
 322                 if try_get(video_url, lambda x: x['result']['status']) == 'ok':
 323                     video_url = try_get(video_url, lambda x: x['result']['resp'])
 324                     info_dict['formats'].append({
 325                         'url': video_url,
 326                         'format_id': quality,
 327                         'height': int_or_none(quality[:-1]),
 328                     })
 329
 330             if not info_dict['duration']:
 331                 info_dict['duration'] = parse_duration(video.get('duration'))
 332
 333         extract_format(webpage, 'default')
 334
 335         for href, resolution in re.findall(
 336                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
 337                 webpage):
 338             if need_confirm_age:
 339                 handler = self._download_age_confirm_page
 340             else:
 341                 handler = self._download_webpage
 342
 343             webpage = handler(
 344                 urljoin(self._BASE_URL, href), video_id,
 345                 f'Downloading {resolution} version information', fatal=False)
 346             if not webpage:
 347                 # Manually report warning because empty page is returned when
 348                 # invalid version is requested.
 349                 self.report_warning(f'Unable to download {resolution} version information')
 350                 continue
 351
 352             extract_format(webpage, resolution)
 353
 354         return merge_dicts(info_dict, info)
 355
 356
 357 class CDAFolderIE(InfoExtractor):
 358     _MAX_PAGE_SIZE = 36
 359     _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)'
 360     _TESTS = [
 361         {
 362             'url': 'https://www.cda.pl/domino264/folder/31188385',
 363             'info_dict': {
 364                 'id': '31188385',
 365                 'title': 'SERIA DRUGA',
 366             },
 367             'playlist_mincount': 13,
 368         },
 369         {
 370             'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm',
 371             'info_dict': {
 372                 'id': '2664592',
 373                 'title': 'VideoDowcipy - wszystkie odcinki',
 374             },
 375             'playlist_mincount': 71,
 376         },
 377         {
 378             'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm',
 379             'info_dict': {
 380                 'id': '19129979',
 381                 'title': 'TESTY KOSMETYKÓW',
 382             },
 383             'playlist_mincount': 139,
 384         }]
 385
 386     def _real_extract(self, url):
 387         folder_id, channel = self._match_valid_url(url).group('id', 'channel')
 388
 389         webpage = self._download_webpage(url, folder_id)
 390
 391         def extract_page_entries(page):
 392             webpage = self._download_webpage(
 393                 f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page + 1}', folder_id,
 394                 f'Downloading page {page + 1}', expected_status=404)
 395             items = re.findall(r'<a[^>]+href="/video/([0-9a-z]+)"', webpage)
 396             for video_id in items:
 397                 yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id)
 398
 399         return self.playlist_result(
 400             OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE),
 401             folder_id, self._og_search_title(webpage))