[ie/youtube] Add age-gate workaround for some embeddable videos (#11821)
[yt-dlp.git] / yt_dlp / extractor / rtvcplay.py
blob5b0eee96598bf9e1b63ce758857e1d017eaf85c0
1 import re
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 clean_html,
7 determine_ext,
8 float_or_none,
9 int_or_none,
10 js_to_json,
11 mimetype2ext,
12 traverse_obj,
13 url_or_none,
14 urljoin,
# Common base for the RTVCPlay extractors: holds the shared URL prefix and the
# helpers that read the inline player ``config`` object.
class RTVCPlayBaseIE(InfoExtractor):
    _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co'

    def _extract_player_config(self, webpage, video_id):
        """Parse the ``config`` object assigned inside an inline <script> tag.

        Occurrences of ``" + "`` (a closing quote, a plus sign and an opening
        quote) are removed first, which joins concatenated string literals
        into one before the object is converted with ``js_to_json``.
        """
        joined_webpage = re.sub(r'"\s*\+\s*"', '', webpage)
        config_start = r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*='
        return self._search_json(
            config_start, joined_webpage,
            'player_config', video_id, transform_source=js_to_json)

    def _extract_formats_and_subtitles_player_config(self, player_config, video_id):
        """Collect formats and subtitles from the ``sources`` of a player config.

        Only sources carrying a valid ``url`` are considered. HLS sources are
        expanded non-fatally; any other source becomes a single direct format.
        Returns a ``(formats, subtitles)`` tuple.
        """
        formats = []
        subtitles = {}
        usable_sources = traverse_obj(
            player_config, ('sources', ..., lambda _, v: url_or_none(v['url'])))

        for src in usable_sources:
            src_url = src['url']
            # The declared mimetype wins; the URL's extension is only the fallback
            ext = mimetype2ext(src.get('mimetype'), default=determine_ext(src_url))
            if ext != 'm3u8':
                formats.append({
                    'url': src_url,
                    'ext': ext,
                })
                continue

            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                src_url, video_id, 'mp4', fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        return formats, subtitles
# Extractor for rtvcplay.co content pages: single videos, live channels
# (category ``en-vivo``), multi-season programs and podcast series.
class RTVCPlayIE(RTVCPlayBaseIE):
    _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional',
        'info_dict': {
            'id': 'canal-institucional',
            'title': r're:^Canal Institucional',
            'description': 'md5:eff9e548394175928059320c006031ea',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia',
        'info_dict': {
            'id': 'senal-colombia',
            'title': r're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional',
        'info_dict': {
            'id': 'radio-nacional',
            'title': r're:^Radio Nacional',
            'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas',
        'md5': '1288ee6f6d1330d880f98bff2ed710a3',
        'info_dict': {
            'id': 'senoritas',
            'title': 'Señoritas',
            'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022',
        'md5': 'f040a7380a269ad633cf837384d5e9fc',
        'info_dict': {
            'id': 'james-regresa-clases-28022022',
            'title': 'James regresa a clases - 28/02/2022',
            'description': 'md5:c5dcdf757c7ab29305e8763c6007e675',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo',
        'info_dict': {
            'id': 'llinas-el-cerebro-y-el-universo',
            'title': 'Llinás, el cerebro y el universo',
            'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa',
        'info_dict': {
            'id': 'profe-en-tu-casa',
            'title': 'Profe en tu casa',
            'description': 'md5:47dbe20e263194413b1db2a2805a4f2e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 537,
    }, {
        'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
        'info_dict': {
            'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
            'title': 'Relato de un náufrago: una travesía del periodismo a la literatura',
            'description': 'md5:6da28fdca4a5a568ea47ef65ef775603',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones',
        'info_dict': {
            'id': 'diez-versiones',
            'title': 'Diez versiones',
            'description': 'md5:997471ed971cb3fd8e41969457675306',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 20,
    }]

    def _podcast_entries(self, hydration):
        """Return url_transparent results for the items under ``audios``.

        Items without a truthy ``file`` value are skipped, so a single
        malformed item cannot abort the whole playlist with a ``KeyError``.
        """
        return [
            self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, {
                'title': 'title',
                'description': ('description', {clean_html}),
                'episode_number': ('chapter_number', {float_or_none}, {int_or_none}),
                'season_number': ('season', {int_or_none}),
            }))
            for episode in traverse_obj(hydration, ('audios', lambda _, v: v['file']))
        ]

    def _season_entries(self, url, seasons):
        """Return url_transparent results for every episode of every season.

        Episode URLs are built by joining each ``slug`` onto the page ``url``.
        Episodes without a truthy ``slug`` are skipped rather than raising.
        """
        entries = []
        for season in seasons:
            # Season-level fields are the same for all of its episodes
            season_info = traverse_obj(season, {
                'season': 'title',
                'season_number': ('season', {int_or_none}),
            })
            for episode in traverse_obj(season, ('contents', lambda _, v: v['slug'])):
                entries.append(self.url_result(
                    urljoin(url, episode['slug']), url_transparent=True,
                    **season_info, **traverse_obj(episode, {
                        'title': 'title',
                        'thumbnail': ('image', 'cover', 'path'),
                        'episode_number': ('chapter_number', {int_or_none}),
                    })))
        return entries

    def _real_extract(self, url):
        """Extract a single video/live stream, or a playlist for program pages.

        Raises ExtractorError when the page has neither a stream, a season
        list, nor any usable podcast episode.
        """
        video_id, category = self._match_valid_url(url).group('id', 'category')
        webpage = self._download_webpage(url, video_id)

        # Page state assigned to window.__RTVCPLAY_STATE__ in the webpage
        hydration = self._search_json(
            r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration',
            video_id, transform_source=js_to_json)['content']['currentContent']

        asset_id = traverse_obj(hydration, ('video', 'assetid'))
        if asset_id:
            # The HLS URL is a template with a placeholder for the asset id
            hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id)
        else:
            hls_url = traverse_obj(hydration, ('channel', 'hls'))

        metadata = traverse_obj(hydration, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'),
        }, get_all=False)

        if hls_url:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4')
            return {
                'id': video_id,
                'formats': formats,
                'subtitles': subtitles,
                'is_live': category == 'en-vivo',
                **metadata,
            }

        # No stream URL: probably it's a program's page
        seasons = traverse_obj(
            hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'),
            get_all=False)
        if seasons:
            return self.playlist_result(self._season_entries(url, seasons), video_id, **metadata)

        podcast_entries = self._podcast_entries(hydration)
        if not podcast_entries:
            raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes')

        return self.playlist_result(podcast_entries, video_id, **metadata)
# Extractor for rtvcplay.co ``/embed/<id>`` player pages.
class RTVCPlayEmbedIE(RTVCPlayBaseIE):
    _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9',
        'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8',
        'info_dict': {
            'id': '72b0e699-248b-4929-a4a8-3782702fa7f9',
            'title': 'Tráiler: Señoritas',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'ext': 'mp4',
        },
    }]

    def _real_extract(self, url):
        """Extract formats from the embedded player config plus CMS metadata."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_config = self._extract_player_config(webpage, video_id)
        formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)

        # Metadata is requested only when the config exposes an asset id, and
        # the request is non-fatal
        asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid'))
        if asset_id:
            metadata = self._download_json(
                f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False)
        else:
            metadata = {}

        info = {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
        }
        info.update(traverse_obj(metadata, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ('image', ..., 'thumbnail', 'path'),
        }, get_all=False))
        return info
# Extractor for the player pages under media.rtvc.gov.co/kalturartvc/;
# results are always reported as live.
class RTVCKalturaIE(RTVCPlayBaseIE):
    _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html',
        'info_dict': {
            'id': 'indexSC',
            'title': r're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }]

    def _real_extract(self, url):
        """Combine formats from the player config with the channel's HLS stream."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_config = self._extract_player_config(webpage, video_id)
        formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)

        # Channel metadata is requested only when the config exposes a channel
        # id, and the request is non-fatal
        channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId'))
        if channel_id:
            metadata = self._download_json(
                f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False)
        else:
            metadata = {}

        # The channel's own HLS stream is added on top of the config's formats
        channel_hls_url = traverse_obj(metadata, ('channel', 'hls'))
        hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
            channel_hls_url, video_id, 'mp4', fatal=False)
        formats.extend(hls_formats)
        self._merge_subtitles(hls_subtitles, target=subtitles)

        info = {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
        info.update(traverse_obj(metadata, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ('channel', 'image', 'logo', 'path'),
        }))
        return info