[ie/cwtv:movie] Add extractor (#12227)
[yt-dlp.git] / yt_dlp / extractor / cda.py
blob b2738e492f70b2b9a81b34b89946acbd4952cd87
1 import base64
2 import codecs
3 import datetime as dt
4 import hashlib
5 import hmac
6 import json
7 import random
8 import re
9 import urllib.parse
11 from .common import InfoExtractor
12 from ..compat import compat_ord
13 from ..utils import (
14 ExtractorError,
15 OnDemandPagedList,
16 float_or_none,
17 int_or_none,
18 merge_dicts,
19 multipart_encode,
20 parse_duration,
21 traverse_obj,
22 try_call,
23 try_get,
24 urljoin,
class CDAIE(InfoExtractor):
    """Extractor for single cda.pl videos (regular pages and ebd.cda.pl embeds).

    When an ``Authorization`` header has been stored in ``_API_HEADERS``
    (done by ``_perform_login``), metadata is read from the JSON API;
    otherwise the public web page is scraped.
    """

    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    _BASE_URL = 'https://www.cda.pl'
    _BASE_API_URL = 'https://api.cda.pl'
    # NOTE: this dict is defined on the class and mutated in place by
    # _perform_login (User-Agent / Authorization are added to it).
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        },
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        },
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        },
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        },
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]

    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
        """Fetch ``url`` while submitting the age-confirmation form.

        Sends a multipart body containing an empty ``age_confirm`` field, with
        ``url`` itself as the Referer. Extra positional and keyword arguments
        are forwarded to ``_download_webpage``, whose result is returned.
        """
        data, content_type = multipart_encode({'age_confirm': ''})
        return self._download_webpage(
            url, video_id, *args,
            data=data, headers={
                'Referer': url,
                'Content-Type': content_type,
            }, **kwargs)

    def _perform_login(self, username, password):
        """Obtain an API bearer token and store it in ``_API_HEADERS``.

        A token cached under ``username`` is reused while it remains valid for
        more than five more seconds; otherwise a new one is requested from the
        OAuth endpoint and cached together with its expiry timestamp.
        """
        # Build a randomized User-Agent in the format
        # 'pl.cda 1.0 (version ...; Android ...; <model>)'
        app_version = random.choice((
            '1.2.88 build 15306',
            '1.2.174 build 18469',
        ))
        android_version = random.randrange(8, 14)
        phone_model = random.choice((
            # x-kom.pl top selling Android smartphones, as of 2022-12-26
            # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
            'ASUS ZenFone 8',
            'Motorola edge 20 5G',
            'Motorola edge 30 neo 5G',
            'Motorola moto g22',
            'OnePlus Nord 2T 5G',
            'Samsung Galaxy A32 SM‑A325F',
            'Samsung Galaxy M13',
            'Samsung Galaxy S20 FE 5G',
            'Xiaomi 11T',
            'Xiaomi POCO M4 Pro',
            'Xiaomi Redmi 10',
            'Xiaomi Redmi 10C',
            'Xiaomi Redmi 9C NFC',
            'Xiaomi Redmi Note 10 Pro',
            'Xiaomi Redmi Note 11 Pro',
            'Xiaomi Redmi Note 11',
            'Xiaomi Redmi Note 11S 5G',
            'Xiaomi Redmi Note 11S',
            'realme 10',
            'realme 9 Pro+',
            'vivo Y33s',
        ))
        self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'

        cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
        # Keep a 5 second safety margin so the token does not expire mid-request
        if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
            self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
            return

        # The HMAC message is the lowercase hex MD5 digest of the password;
        # hexdigest() yields the same two-hex-chars-per-byte string that the
        # per-byte join produced before.
        md5_hex = hashlib.md5(password.encode()).hexdigest().encode()
        signature = hmac.new(
            b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
            md5_hex, hashlib.sha256).digest()
        # URL-safe base64 with the '=' padding stripped
        password_hash = base64.urlsafe_b64encode(signature).decode().replace('=', '')

        token_res = self._download_json(
            f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
            headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
            query={
                'grant_type': 'password',
                'login': username,
                'password': password_hash,
            })
        self.cache.store(self._BEARER_CACHE, username, {
            'token': token_res['access_token'],
            'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
        })
        self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'

    def _real_extract(self, url):
        """Dispatch to the API extractor when logged in, else to the web one."""
        video_id = self._match_id(url)

        if 'Authorization' in self._API_HEADERS:
            return self._api_extract(video_id)
        return self._web_extract(video_id)

    def _api_extract(self, video_id):
        """Build the info dict for ``video_id`` from the JSON API.

        Raises ExtractorError when the video is flagged ``premium`` (and not
        ``premium_free``) and no playable format was returned.
        """
        meta = self._download_json(
            f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']

        # NOTE(review): 'author' and 'login' are passed as separate path
        # arguments - confirm whether a nested ('author', 'login') lookup
        # was intended.
        uploader = traverse_obj(meta, 'author', 'login')

        # A missing/null 'qualities' is treated as "no formats" so that the
        # premium check below can still run instead of failing on a KeyError.
        formats = [{
            'url': quality['file'],
            'format': quality.get('title'),
            'resolution': quality.get('name'),
            'height': try_call(lambda: int(quality['name'][:-1])),
            'filesize': quality.get('length'),
        } for quality in meta.get('qualities') or [] if quality.get('file')]

        if meta.get('premium') and not meta.get('premium_free') and not formats:
            raise ExtractorError(
                'Video requires CDA Premium - subscription needed', expected=True)

        return {
            'id': video_id,
            'title': meta.get('title'),
            'description': meta.get('description'),
            'uploader': None if uploader == 'anonim' else uploader,
            'average_rating': float_or_none(meta.get('rating')),
            'thumbnail': meta.get('thumb'),
            'formats': formats,
            'duration': meta.get('duration'),
            'age_limit': 18 if meta.get('for_adults') else 0,
            'view_count': meta.get('views'),
        }

    def _web_extract(self, video_id):
        """Build the info dict for ``video_id`` by scraping the web player.

        The default player page is parsed first, then every additional
        resolution linked from it. Premium-only and geo-blocked pages are
        reported through ``raise_login_required`` / ``raise_geo_restricted``.
        """
        self._set_cookie('cda.pl', 'cda.player', 'html5')
        webpage, urlh = self._download_webpage_handle(
            f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            self.raise_login_required('This video is only available for premium users')

        if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
            self.raise_geo_restricted()

        need_confirm_age = False
        if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                                   webpage, 'birthday validate form', default=None):
            # Re-request the final (possibly redirected) URL with the form submitted
            webpage = self._download_age_confirm_page(
                urlh.url, video_id, note='Confirming age')
            need_confirm_age = True

        formats = []

        uploader = self._search_regex(r'''(?x)
            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
        ''', webpage, 'uploader', default=None, group='uploader')
        average_rating = self._search_regex(
            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
            group='rating_value')

        info_dict = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'uploader': uploader,
            'average_rating': float_or_none(average_rating),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
            'duration': None,
            'age_limit': 18 if need_confirm_age else 0,
        }

        info = self._search_json_ld(webpage, video_id, default={})

        def quality_height(label):
            """Turn a label such as '720p' into 720; None if it is not a string."""
            return int_or_none(label[:-1]) if isinstance(label, str) else None

        # Source: https://www.cda.pl/js/player.js?t=1606154898
        def decrypt_file(a):
            """Decode an obfuscated ``file`` value into an https URL."""
            for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
                a = a.replace(p, '')
            a = urllib.parse.unquote(a)
            b = []
            for c in a:
                f = compat_ord(c)
                # ROT47: rotate the printable ASCII range 33..126 by 47 places
                b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
            a = ''.join(b)
            a = a.replace('.cda.mp4', '')
            for p in ('.2cda.pl', '.3cda.pl'):
                a = a.replace(p, '.cda.pl')
            if '/upstream' in a:
                a = a.replace('/upstream', '.mp4/upstream')
                return 'https://' + a
            return 'https://' + a + '.mp4'

        def extract_format(page, version):
            """Append the formats found in ``page`` to ``info_dict['formats']``."""
            json_str = self._html_search_regex(
                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                f'{version} player_json', fatal=False, group='player_data')
            if not json_str:
                return
            player_data = self._parse_json(
                json_str, f'{version} player_data', fatal=False)
            if not player_data:
                return
            video = player_data.get('video')
            if not video or 'file' not in video:
                self.report_warning(f'Unable to extract {version} version information')
                return

            file_url = video['file']
            if file_url.startswith('uggc'):
                # 'uggc' is 'http' in ROT13
                file_url = codecs.decode(file_url, 'rot_13')
                if file_url.endswith('adc.mp4'):
                    file_url = file_url.replace('adc.mp4', '.mp4')
            elif not file_url.startswith('http'):
                file_url = decrypt_file(file_url)

            video_quality = video.get('quality')
            # Tolerate a missing or null 'qualities' mapping
            qualities = video.get('qualities') or {}
            # Map the internal quality name back to its label; keep it as-is if unknown
            video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
            info_dict['formats'].append({
                'url': file_url,
                'format_id': video_quality,
                # 'quality' may be absent, in which case there is no label to slice
                'height': quality_height(video_quality),
            })

            # Ask the JSON-RPC endpoint for every other advertised quality
            for quality, cda_quality in qualities.items():
                if quality == video_quality:
                    continue
                data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                        'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
                data = json.dumps(data).encode()
                response = self._download_json(
                    f'https://www.cda.pl/video/{video_id}', video_id, headers={
                        'Content-Type': 'application/json',
                        'X-Requested-With': 'XMLHttpRequest',
                    }, data=data, note=f'Fetching {quality} url',
                    errnote=f'Failed to fetch {quality} url', fatal=False)
                if try_get(response, lambda x: x['result']['status']) != 'ok':
                    continue
                video_url = try_get(response, lambda x: x['result']['resp'])
                if not video_url:
                    # An 'ok' answer without a link cannot become a usable format
                    continue
                info_dict['formats'].append({
                    'url': video_url,
                    'format_id': quality,
                    'height': quality_height(quality),
                })

            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(video.get('duration'))

        extract_format(webpage, 'default')

        # The handler does not depend on the resolution, so pick it once
        if need_confirm_age:
            handler = self._download_age_confirm_page
        else:
            handler = self._download_webpage

        for href, resolution in re.findall(
                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                webpage):
            version_page = handler(
                urljoin(self._BASE_URL, href), video_id,
                f'Downloading {resolution} version information', fatal=False)
            if not version_page:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning(f'Unable to download {resolution} version information')
                continue

            extract_format(version_page, resolution)

        return merge_dicts(info_dict, info)
class CDAFolderIE(InfoExtractor):
    """Extractor for cda.pl channel folders, returned as a lazily paged playlist."""

    # Page size handed to OnDemandPagedList
    _MAX_PAGE_SIZE = 36
    _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://www.cda.pl/domino264/folder/31188385',
            'info_dict': {
                'id': '31188385',
                'title': 'SERIA DRUGA',
            },
            'playlist_mincount': 13,
        },
        {
            'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm',
            'info_dict': {
                'id': '2664592',
                'title': 'VideoDowcipy - wszystkie odcinki',
            },
            'playlist_mincount': 71,
        },
        {
            'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm',
            'info_dict': {
                'id': '19129979',
                'title': 'TESTY KOSMETYKÓW',
            },
            'playlist_mincount': 139,
        },
    ]

    def _real_extract(self, url):
        """Return a playlist whose entries are fetched page by page on demand.

        The playlist title is the OpenGraph title of the folder's landing page.
        """
        folder_id, channel = self._match_valid_url(url).group('id', 'channel')

        webpage = self._download_webpage(url, folder_id)

        def extract_page_entries(page):
            """Yield a url_result for every video link on zero-based ``page``."""
            page_number = page + 1  # the site's page URLs are one-based
            # A 404 response is accepted (expected_status), so its body is
            # still searched for links below.
            page_html = self._download_webpage(
                f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page_number}', folder_id,
                f'Downloading page {page_number}', expected_status=404)
            for video_id in re.findall(r'<a[^>]+href="/video/([0-9a-z]+)"', page_html):
                yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id)

        return self.playlist_result(
            OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE),
            folder_id, self._og_search_title(webpage))