[ie/youtube] Add age-gate workaround for some embeddable videos (#11821)
[yt-dlp.git] / yt_dlp / extractor / iprima.py
blob9b91a454b1d3f4b749c74eb0721e6103bf27fb82
1 import re
2 import time
4 from .common import InfoExtractor
5 from ..utils import (
6 ExtractorError,
7 determine_ext,
8 js_to_json,
9 parse_qs,
10 traverse_obj,
11 urlencode_postdata,
class IPrimaIE(InfoExtractor):
    """Extractor for video pages on ``*.iprima.cz`` hosts that do not start with ``cnn``.

    An access token is mandatory: ``_real_initialize`` raises a login-required
    error unless ``_perform_login`` has stored one in ``access_token``.
    """
    _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _GEO_BYPASS = False
    _NETRC_MACHINE = 'iprima'
    _AUTH_ROOT = 'https://auth.iprima.cz'
    # Set by _perform_login; sent as the X-OTT-Access-Token header in _real_extract
    access_token = None

    _TESTS = [{
        'url': 'https://prima.iprima.cz/particka/92-epizoda',
        'info_dict': {
            'id': 'p51388',
            'ext': 'mp4',
            'title': 'Partička (92)',
            'description': 'md5:57943f6a50d6188288c3a579d2fd5f01',
            'episode': 'Partička (92)',
            'season': 'Partička',
            'series': 'Prima Partička',
            'episode_number': 92,
            'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg',
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne',
        'info_dict': {
            'id': 'p1412199',
            'ext': 'mp4',
            'episode_number': 3,
            'episode': 'Tenerife: V říši ohně',
            'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c',
            'duration': 3111.0,
            'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768',
            'title': 'Tenerife: V říši ohně',
            'timestamp': 1711825800,
            'upload_date': '20240330',
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        'url': 'http://play.iprima.cz/particka/particka-92',
        'only_matching': True,
    }, {
        # geo restricted
        'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1',
        'only_matching': True,
    }, {
        'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2',
        'only_matching': True,
    }, {
        'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha',
        'only_matching': True,
    }, {
        'url': 'http://www.iprima.cz/filmy/desne-rande',
        'only_matching': True,
    }, {
        'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby',
        'only_matching': True,
    }, {
        'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy',
        'only_matching': True,
    }, {
        'url': 'https://cool.iprima.cz/derava-silnice-nevadi',
        'only_matching': True,
    }, {
        'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi',
        'only_matching': True,
    }]

    def _perform_login(self, username, password):
        """Log in via the OAuth2 endpoints under ``_AUTH_ROOT`` and store the access token.

        Returns immediately when ``self.access_token`` is already set.

        Raises:
            ExtractorError: when the final login URL carries no ``code`` query
                parameter, or the token response has no ``access_token``.
        """
        if self.access_token:
            return

        login_url = f'{self._AUTH_ROOT}/oauth2/login'
        form_page = self._download_webpage(
            login_url, None, note='Downloading login page',
            errnote='Downloading login page failed')

        # Start from the form's hidden inputs and add the credentials on top
        credentials = self._hidden_inputs(form_page)
        credentials.update({
            '_email': username,
            '_password': password,
        })

        profile_page, response = self._download_webpage_handle(
            login_url, None, data=urlencode_postdata(credentials),
            note='Logging in')

        # a profile may need to be selected first, even when there is only a single one
        if '/profile-select' in response.url:
            profile_id = self._search_regex(
                r'data-identifier\s*=\s*["\']?(\w+)', profile_page, 'profile id')
            response = self._request_webpage(
                f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None,
                query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile')

        # The authorization code is read from the query string of the final URL
        auth_code = traverse_obj(response.url, ({parse_qs}, 'code', 0))
        if not auth_code:
            raise ExtractorError('Login failed', expected=True)

        token_response = self._download_json(
            f'{self._AUTH_ROOT}/oauth2/token', None,
            note='Downloading token', errnote='Downloading token failed',
            data=urlencode_postdata({
                'scope': 'openid+email+profile+phone+address+offline_access',
                'client_id': 'prima_sso',
                'grant_type': 'authorization_code',
                'code': auth_code,
                'redirect_uri': f'{self._AUTH_ROOT}/sso/auth-check',
            }))

        self.access_token = token_response.get('access_token')
        if self.access_token is None:
            raise ExtractorError('Getting token failed', expected=True)

    def _real_initialize(self):
        """Demand password login whenever no access token is available."""
        if self.access_token:
            return
        self.raise_login_required('Login is required to access any iPrima content', method='password')

    def _raise_access_error(self, error_code):
        """Report an API ``errorCode``; a ``None`` code is silently accepted."""
        if error_code is None:
            return
        if error_code == 'PLAY_GEOIP_DENIED':
            self.raise_geo_restricted(countries=['CZ'], metadata_available=True)
        else:
            self.raise_no_formats('Access to stream infos forbidden', expected=True)

    def _real_extract(self, url):
        """Find the product ID in the page, query the play API and return the info dict."""
        slug = self._match_id(url)
        webpage = self._download_webpage(url, slug)

        title = self._html_extract_title(webpage)
        if not title:
            title = self._html_search_meta(
                ['og:title', 'twitter:title'],
                webpage, 'title', default=None)

        # First attempt: product ID assigned inline in the page's scripts
        id_patterns = (
            r'productId\s*=\s*([\'"])(?P<id>p\d+)\1',
            r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1',
            r'let\s+videos\s*=\s*([\'"])(?P<id>p\d+)\1',
        )
        video_id = self._search_regex(
            id_patterns, webpage, 'real id', group='id', default=None)

        # Second attempt: "videoPlayId" inside the Nuxt data
        # (video_id is still falsy here and is passed through unchanged)
        if not video_id:
            nuxt_state = self._search_nuxt_data(webpage, video_id, traverse='data', fatal=False)
            video_id = traverse_obj(
                nuxt_state, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False)

        # Third attempt: first value in the __NUXT_DATA__ JSON array that looks like "p<digits>"
        if not video_id:
            nuxt_payload = self._search_json(
                r'<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]*>',
                webpage, 'nuxt data', None, end_pattern=r'</script>', contains_pattern=r'\[(?s:.+)\]')
            video_id = traverse_obj(
                nuxt_payload, lambda _, value: re.fullmatch(r'p\d+', value), get_all=False)

        if not video_id:
            self.raise_no_formats('Unable to extract video ID from webpage')

        # NOTE(review): the double slash after "v1" is reproduced exactly as found
        # HTTP 403 is accepted so that the error payload can be inspected below
        metadata = self._download_json(
            f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play',
            video_id, note='Getting manifest URLs', errnote='Failed to get manifest URLs',
            headers={'X-OTT-Access-Token': self.access_token},
            expected_status=403)

        self._raise_access_error(metadata.get('errorCode'))

        formats = []
        stream_infos = metadata.get('streamInfos')
        if stream_infos is None:
            self.raise_no_formats('Reading stream infos failed', expected=True)
        else:
            for stream in stream_infos:
                stream_type = stream.get('type')
                stream_url = stream.get('url')
                ext = determine_ext(stream_url)
                # The declared type wins; the URL extension is the fallback hint
                if stream_type == 'HLS' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        stream_url, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls', fatal=False))
                elif stream_type == 'DASH' or ext == 'mpd':
                    formats.extend(self._extract_mpd_formats(
                        stream_url, video_id, mpd_id='dash', fatal=False))

        # JSON-LD metadata forms the base; the fields below override it
        info = self._search_json_ld(webpage, video_id, default={})
        info.update({
            'id': video_id,
            'title': info.get('title') or title,
            'thumbnail': self._html_search_meta(
                ['thumbnail', 'og:image', 'twitter:image'],
                webpage, 'thumbnail', default=None),
            'formats': formats,
            'description': self._html_search_meta(
                ['description', 'og:description', 'twitter:description'],
                webpage, 'description', default=None),
        })

        return info
class IPrimaCNNIE(InfoExtractor):
    """Extractor for pages on ``cnn.iprima.cz``.

    The product ID is located in the article page, then the player init page
    is fetched from ``play.iprima.cz`` and scanned for stream sources.
    """
    _VALID_URL = r'https?://cnn\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _GEO_BYPASS = False

    _TESTS = [{
        'url': 'https://cnn.iprima.cz/porady/strunc/24072020-koronaviru-mam-plne-zuby-strasit-druhou-vlnou-je-absurdni-rika-senatorka-dernerova',
        'info_dict': {
            'id': 'p716177',
            'ext': 'mp4',
            'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict (id, title, thumbnail, formats, description) for ``url``."""
        video_id = self._match_id(url)

        self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1')

        webpage = self._download_webpage(url, video_id)

        # Prefer the OpenGraph title; fall back to the first <h1>
        title = self._og_search_title(
            webpage, default=None) or self._search_regex(
            r'<h1>([^<]+)', webpage, 'title')

        # From here on video_id is the product ID found in the page, not the URL slug
        video_id = self._search_regex(
            (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)',
             r'data-product="([^"]+)">',
             r'id=["\']player-(p\d+)"',
             r'playerId\s*:\s*["\']player-(p\d+)',
             r'\bvideos\s*=\s*["\'](p\d+)'),
            webpage, 'real id')

        playerpage = self._download_webpage(
            'http://play.iprima.cz/prehravac/init',
            video_id, note='Downloading player', query={
                '_infuse': 1,
                '_ts': round(time.time()),
                'productId': video_id,
            }, headers={'Referer': url})

        formats = []

        def extract_formats(format_url, format_key=None, lang=None):
            """Append the HLS formats found at ``format_url`` to ``formats``.

            ``lang`` is applied to every extracted format that has no language set.
            """
            ext = determine_ext(format_url)
            if not (format_key == 'hls' or ext == 'm3u8'):
                # Only HLS is extracted. DASH sources were already skipped before:
                # the previous code returned ahead of its MPD extraction call,
                # leaving that call unreachable. Other source types yield nothing.
                return
            new_formats = self._extract_m3u8_formats(
                format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False)
            if lang:
                for f in new_formats:
                    if not f.get('language'):
                        f['language'] = lang
            formats.extend(new_formats)

        options = self._parse_json(
            self._search_regex(
                r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]',
                playerpage, 'player options', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        # Parsing is non-fatal and the payload is untrusted, so check every level's
        # type rather than letting a null or malformed "tracks" raise AttributeError
        tracks_by_key = options.get('tracks') if isinstance(options, dict) else None
        if isinstance(tracks_by_key, dict):
            for key, tracks in tracks_by_key.items():
                if not isinstance(tracks, list):
                    continue
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    src = track.get('src')
                    if src:
                        extract_formats(src, key.lower(), track.get('lang'))

        # Fallback: scan the raw player page for quoted "src" values
        if not formats:
            for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage):
                extract_formats(src)

        if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage:
            self.raise_geo_restricted(countries=['CZ'], metadata_available=True)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'formats': formats,
            'description': self._og_search_description(webpage, default=None),
        }