[cleanup] Misc (#8968)
[yt-dlp.git] / yt_dlp / extractor / polskieradio.py
blobe0b22fffdfcab682fe1ca8c7950a922666958cb3
1 import itertools
2 import json
3 import math
4 import re
5 import urllib.parse
7 from .common import InfoExtractor
8 from ..compat import compat_str
9 from ..utils import (
10 ExtractorError,
11 InAdvancePagedList,
12 determine_ext,
13 extract_attributes,
14 int_or_none,
15 js_to_json,
16 parse_iso8601,
17 strip_or_none,
18 traverse_obj,
19 unescapeHTML,
20 unified_timestamp,
21 url_or_none,
22 urljoin,
class PolskieRadioBaseExtractor(InfoExtractor):
    # Shared helper for extractors that scrape embedded ``data-media`` players

    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
        """Yield one info dict per distinct ``data-media`` player in *webpage*.

        @param webpage      HTML text that is scanned for ``data-media`` attributes
        @param playlist_id  ID passed to ``_parse_json`` for its messages
        @param base_data    dict of fields copied into every yielded entry

        Players whose JSON cannot be parsed, that lack ``file`` or ``desc``,
        or whose media URL was already yielded are skipped.
        """
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
            media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
            # With fatal=False a malformed attribute yields a falsy result
            # instead of raising, so it must be checked before calling .get()
            if not media or not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'])
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': compat_str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            # 'desc' is percent-encoded; only override the base title when
            # the decoded value is non-empty
            entry_title = urllib.parse.unquote(media['desc'])
            if entry_title:
                entry['title'] = entry_title
            yield entry
class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
    # legacy sites
    IE_NAME = 'polskieradio:legacy'
    _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
        'info_dict': {
            'id': '2534482',
            'title': 'Żagaryści. Poezja jak spoiwo',
            'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
        },
        'playlist': [{
            'md5': 'd07559829f61d5a93a75755987ded760',
            'info_dict': {
                'id': '2516679',
                'ext': 'mp3',
                'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
                'timestamp': 1592654400,
                'upload_date': '20200620',
                'duration': 1430,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
            },
        }],
    }, {
        # PR4 audition - other frontend
        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
        'info_dict': {
            'id': '2610977',
            'ext': 'mp3',
            'title': 'Pogłos 29 października godz. 23:01',
        },
    }, {
        'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist of the article's players, or a single audio entry.

        If the request ends up on a URL that PolskieRadioIE accepts, the
        extraction is delegated to that extractor instead.
        """
        article_id = self._match_id(url)

        webpage, urlh = self._download_webpage_handle(url, article_id)
        final_url = urlh.url
        if PolskieRadioIE.suitable(final_url):
            return self.url_result(final_url, PolskieRadioIE, article_id)

        article_html = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content', default=None)

        raw_timestamp = self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', default=None)
        timestamp = unified_timestamp(raw_timestamp)

        thumbnail_url = self._og_search_thumbnail(webpage, default=None)

        title = self._og_search_title(webpage).strip()

        description = strip_or_none(self._og_search_description(webpage, default=None))
        if description is not None:
            # replace non-breaking spaces with regular ones
            description = description.replace('\xa0', ' ')

        if content_found := bool(article_html):
            shared_fields = {
                'title': title,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url,
            }
            entries = self._extract_webpage_player_entries(article_html, article_id, shared_fields)
            return self.playlist_result(entries, article_id, title, description)

        # No article body matched: look for a single audio source in the page
        audio_url = self._search_regex(
            r"source:\s*'(//static\.prsa\.pl/[^']+)'",
            webpage, 'audition record url')
        return {
            'id': article_id,
            'url': self._proto_relative_url(audio_url),
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail_url,
        }
class PolskieRadioIE(PolskieRadioBaseExtractor):
    # new next.js sites
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
    _TESTS = [{
        # articleData, attachments
        'url': 'https://jedynka.polskieradio.pl/artykul/1587943',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '7a85d429-5356-4def-a347-925e4ae7406b',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
            },
        }],
    }, {
        # post, legacy html players
        'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager',
        'info_dict': {
            'id': '2589163',
            'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?',
            'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473',
        },
        'playlist': [{
            'info_dict': {
                'id': '2577880',
                'ext': 'mp3',
                'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a',
                'duration': 321,
            },
        }],
    }, {
        # data, legacy
        'url': 'https://radiokierowcow.pl/artykul/2694529',
        'info_dict': {
            'id': '2694529',
            'title': 'Zielona fala reliktem przeszłości?',
            'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://trojka.polskieradio.pl/artykul/1632955',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'https://trojka.polskieradio.pl/artykul/1634903',
        'only_matching': True,
    }, {
        'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist from the article's audio attachments.

        Falls back to the ``data-media`` players embedded in the article
        body when the JSON lists no audio attachments.

        Raises ExtractorError when no article data is found in the page's
        next.js data.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # The article object is looked up at several alternative locations;
        # the first one present is used
        article_data = traverse_obj(
            self._search_nextjs_data(webpage, playlist_id), (
                'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False)
        if not article_data:
            # Without this check the subscript below fails with a bare TypeError
            raise ExtractorError('Unable to extract article data', video_id=playlist_id)

        title = strip_or_none(article_data['title'])

        description = strip_or_none(article_data.get('lead'))

        entries = [{
            'url': entry['file'],
            'ext': determine_ext(entry.get('fileName')),
            'id': self._search_regex(
                r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'),
            'title': strip_or_none(entry.get('description')) or title,
        } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )]

        if not entries:
            # some legacy articles have no json attachments, but players in body
            # (a missing body is treated as empty markup rather than crashing)
            entries = self._extract_webpage_player_entries(article_data.get('content') or '', playlist_id, {
                'title': title,
            })

        return self.playlist_result(entries, playlist_id, title, description)
class PolskieRadioAuditionIE(InfoExtractor):
    # new next.js sites
    IE_NAME = 'polskieradio:audition'
    _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)'
    _TESTS = [{
        # articles, PR1
        'url': 'https://jedynka.polskieradio.pl/audycje/5102',
        'info_dict': {
            'id': '5102',
            'title': 'Historia żywa',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 38,
    }, {
        # episodes, PR1
        'url': 'https://jedynka.polskieradio.pl/audycje/5769',
        'info_dict': {
            'id': '5769',
            'title': 'AgroFakty',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 269,
    }, {
        # both episodes and articles, PR3
        'url': 'https://trojka.polskieradio.pl/audycja/8906',
        'info_dict': {
            'id': '8906',
            'title': 'Trójka budzi',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 722,
    }, {
        # some articles were "promoted to main page" and thus link to old frontend
        'url': 'https://trojka.polskieradio.pl/audycja/305',
        'info_dict': {
            'id': '305',
            'title': 'Co w mowie piszczy?',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_count': 1523,
    }]

    def _call_lp3(self, path, query, video_id, note):
        """Download JSON from *path* on the lp3test host, sending the API key header."""
        endpoint = f'https://lp3test.polskieradio.pl/{path}'
        api_headers = {'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}
        return self._download_json(endpoint, video_id, note, query=query, headers=api_headers)

    def _entries(self, playlist_id, has_episodes, has_articles):
        """Yield episode entries first, then article entries, page by page.

        Each listing is requested only when its flag is truthy, and paging
        stops at the first response without a non-empty ``data`` value.
        """
        if has_episodes:
            for page_index in itertools.count(0):
                episode_query = {
                    'categoryId': playlist_id,
                    'PageSize': 10,
                    'skip': page_index,
                    'format': 400,
                }
                page = self._call_lp3(
                    'AudioArticle/GetListByCategoryId', episode_query,
                    playlist_id, f'Downloading episode list page {page_index + 1}')
                if not traverse_obj(page, 'data'):
                    break
                for episode in page['data']:
                    yield {
                        'id': str(episode['id']),
                        'url': episode['file'],
                        'title': episode.get('title'),
                        'duration': int_or_none(episode.get('duration')),
                        'timestamp': parse_iso8601(episode.get('datePublic')),
                    }

        if has_articles:
            for page_index in itertools.count(0):
                article_query = {
                    'categoryId': playlist_id,
                    'PageSize': 9,
                    'skip': page_index,
                    'format': 400,
                }
                page = self._call_lp3(
                    'Article/GetListByCategoryId', article_query,
                    playlist_id, f'Downloading article list page {page_index + 1}')
                if not traverse_obj(page, 'data'):
                    break
                for article in page['data']:
                    # url_transparent: the article URL is resolved by another
                    # extractor while the fields given here are kept
                    yield {
                        '_type': 'url_transparent',
                        'id': str(article['id']),
                        'url': article['url'],
                        'title': article.get('shortTitle'),
                        'description': traverse_obj(article, ('description', 'lead')),
                        'timestamp': parse_iso8601(article.get('datePublic')),
                    }

    def _real_extract(self, url):
        """Return a lazily evaluated playlist of the audition's episodes and articles."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)
        nextjs_data = self._search_nextjs_data(webpage, playlist_id)
        page_props = traverse_obj(
            nextjs_data, ('props', 'pageProps', ('data', None)), get_all=False)

        has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios'))
        has_articles = bool(traverse_obj(page_props, 'articles'))

        entries = self._entries(playlist_id, has_episodes, has_articles)
        return self.playlist_result(
            entries, playlist_id,
            title=traverse_obj(page_props, ('details', 'name')),
            description=traverse_obj(page_props, ('details', 'description', 'lead')),
            thumbnail=traverse_obj(page_props, ('details', 'photo')))
class PolskieRadioCategoryIE(InfoExtractor):
    # legacy sites
    IE_NAME = 'polskieradio:category'
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61
    }, {
        # billennium tabs
        'url': 'https://www.polskieradio.pl/8/2385',
        'info_dict': {
            'id': '2385',
            'title': 'Droga przez mąkę',
        },
        'playlist_mincount': 111,
    }, {
        'url': 'https://www.polskieradio.pl/10/4930',
        'info_dict': {
            'id': '4930',
            'title': 'Teraz K-pop!',
        },
        'playlist_mincount': 392,
    }, {
        # post back pages, audio content directly without articles
        'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
        'info_dict': {
            'id': '7376',
            'title': 'Nowa mowa',
        },
        'playlist_mincount': 244,
    }, {
        'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
        'info_dict': {
            'id': '175458',
            'title': 'Krzysztof Dziuba',
        },
        'playlist_mincount': 420,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that PolskieRadioLegacyIE accepts; otherwise defer to the base check."""
        if PolskieRadioLegacyIE.suitable(url):
            return False
        return super().suitable(url)

    def _entries(self, url, page, category_id):
        """Yield entries from *page* and from every following page.

        The paging mechanism is picked once from the first page's markup:
        ``TB_LoadTab`` tabs (JSON endpoint), ``__doPostBack`` forms (POST to
        the same URL), or a plain "next" link. Iteration ends when no
        next-page information is found.
        """
        uses_tabs = 'onclick="TB_LoadTab(' in page
        uses_post_back = 'onclick="__doPostBack(' in page
        content = page
        # Only the tab mode keeps the pager markup separately from the content
        pagination = page if uses_tabs else None

        # Names paired positionally with the arguments of the TB_LoadTab(...) call
        tab_param_names = (
            'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
            'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
            'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber'
        )

        # The first page is already downloaded, so numbering starts at 2
        for page_num in itertools.count(2):
            article_links = re.findall(
                r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                content)
            for anchor_tag, entry_id in article_links:
                attrs = extract_attributes(anchor_tag)
                if not attrs.get('href'):
                    continue
                yield self.url_result(
                    urljoin(url, attrs['href']), PolskieRadioLegacyIE, entry_id, attrs.get('title'))

            for media_json in re.findall(r'<span data-media=({[^ ]+})', content):
                media = self._parse_json(media_json, category_id)
                yield traverse_obj(media, {
                    'url': 'file',
                    'id': 'uid',
                    'duration': 'length',
                    'title': ('title', {urllib.parse.unquote}),
                    'description': ('desc', {urllib.parse.unquote}),
                })

            if uses_tabs:
                # The argument list of the "next" link's TB_LoadTab( call is
                # parsed as a JSON array by prefixing it with '['
                params = self._search_json(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
                    pagination, 'next page params', category_id, default=None, close_objects=1,
                    contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x)))
                if not params:
                    return
                payload = json.dumps(dict(zip(tab_param_names, params))).encode()
                tab_content = self._download_json(
                    'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
                    category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
                    data=payload)['d']
                content = tab_content['Content']
                pagination = tab_content.get('PagerContent')
            elif uses_post_back:
                target = self._search_regex(
                    r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)',
                    content, 'pagination postback target', group='target', default=None)
                if not target:
                    return
                # Re-submit the page's hidden inputs together with the "Next" event
                form_fields = {
                    **self._hidden_inputs(content),
                    '__EVENTTARGET': target,
                    '__EVENTARGUMENT': 'Next',
                }
                content = self._download_webpage(
                    url, category_id, f'Downloading page {page_num}',
                    data=urllib.parse.urlencode(form_fields).encode())
            else:
                next_href = self._search_regex(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                    content, 'next page url', group='url', default=None)
                next_url = urljoin(url, next_href)
                if not next_url:
                    return
                content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')

    def _real_extract(self, url):
        """Return the category playlist, or delegate when the final URL is an audition page."""
        category_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, category_id)
        final_url = urlh.url
        if PolskieRadioAuditionIE.suitable(final_url):
            return self.url_result(final_url, PolskieRadioAuditionIE, category_id)
        title = self._html_search_regex(
            r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
            webpage, 'title', fatal=False)
        entries = self._entries(url, webpage, category_id)
        return self.playlist_result(entries, category_id, title)
class PolskieRadioPlayerIE(InfoExtractor):
    # Live channels of player.polskieradio.pl
    IE_NAME = 'polskieradio:player'
    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'

    _BASE_URL = 'https://player.polskieradio.pl'
    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'

    _TESTS = [{
        'url': 'https://player.polskieradio.pl/anteny/trojka',
        'info_dict': {
            'id': '3',
            'ext': 'm4a',
            'title': 'Trójka',
        },
        'params': {
            'format': 'bestaudio',
            'skip_download': 'endless stream',
        },
    }]

    def _get_channel_list(self, channel_url='no_channel'):
        """Download the player's JS bundle and return the channel list parsed from it.

        @param channel_url  identifier passed to the download and parse helpers
        """
        player_code = self._download_webpage(
            self._PLAYER_URL, channel_url,
            note='Downloading js player')
        # The list is a JS array literal in the bundle; convert it to JSON first
        channel_list = js_to_json(self._search_regex(
            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
        return self._parse_json(channel_list, channel_url)

    def _real_extract(self, url):
        """Return a live-stream info dict for the channel named in *url*.

        Raises ExtractorError when the channel is not in the player's channel
        list or when no station from the stations API matches it.
        """
        channel_url = self._match_id(url)
        channel_list = self._get_channel_list(channel_url)

        channel = next((c for c in channel_list if c.get('url') == channel_url), None)

        if not channel:
            raise ExtractorError('Channel not found')

        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
                                           note='Downloading stream url list',
                                           headers={
                                               'Accept': 'application/json',
                                               'Referer': url,
                                               'Origin': self._BASE_URL,
                                           })
        # Stations are matched by name, preferring the channel's streamName
        station = next((s for s in station_list
                        if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
        if not station:
            raise ExtractorError('Station not found even though we extracted channel')

        formats = []
        for stream_url in station['Streams']:
            stream_url = self._proto_relative_url(stream_url)
            if stream_url.endswith('/playlist.m3u8'):
                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
            elif stream_url.endswith('/manifest.f4m'):
                # An .f4m manifest is HDS, not DASH, so it needs the f4m parser.
                # Non-fatal so a broken HDS manifest cannot abort an extraction
                # that the other stream URLs can still satisfy.
                formats.extend(self._extract_f4m_formats(stream_url, channel_url, fatal=False))
            elif stream_url.endswith('/Manifest'):
                formats.extend(self._extract_ism_formats(stream_url, channel_url))
            else:
                formats.append({
                    'url': stream_url,
                })

        return {
            'id': compat_str(channel['id']),
            'formats': formats,
            'title': channel.get('name') or channel.get('streamName'),
            'display_id': channel_url,
            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
            'is_live': True,
        }
class PolskieRadioPodcastBaseExtractor(InfoExtractor):
    # Shared API base URL and episode parsing for the podcast extractors
    _API_BASE = 'https://apipodcasts.polskieradio.pl/api'

    def _parse_episode(self, data):
        """Convert one episode object *data* from the API into an info dict.

        ``guid``, ``url`` and ``title`` are required keys; the remaining
        fields are optional.
        """
        episode_id = data['guid']
        audio_format = {
            'url': data['url'],
            'filesize': int_or_none(data.get('fileSize')),
        }
        title = data['title']
        return {
            'id': episode_id,
            'formats': [audio_format],
            'title': title,
            'description': data.get('description'),
            'duration': int_or_none(data.get('length')),
            'timestamp': parse_iso8601(data.get('publishDate')),
            'thumbnail': url_or_none(data.get('image')),
            'series': data.get('podcastTitle'),
            'episode': title,
        }
class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast:list'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/podcast/8/',
        'info_dict': {
            'id': '8',
            'title': 'Śniadanie w Trójce',
            'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
            'uploader': 'Beata Michniewicz',
        },
        'playlist_mincount': 714,
    }]
    _PAGE_SIZE = 10

    def _call_api(self, podcast_id, page):
        """Download one page (numbered from 1) of the podcast's listing."""
        api_url = f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}'
        return self._download_json(api_url, podcast_id, f'Downloading page {page}')

    def _real_extract(self, url):
        """Return a paged playlist of all episodes of the podcast."""
        podcast_id = self._match_id(url)
        first_page = self._call_api(podcast_id, 1)

        def get_page(page_num):
            # page_num is 0-based here; page 0 reuses the response fetched above
            if page_num:
                page_data = self._call_api(podcast_id, page_num + 1)
            else:
                page_data = first_page
            for episode in page_data['items']:
                yield self._parse_episode(episode)

        page_count = math.ceil(first_page['itemCount'] / self._PAGE_SIZE)
        return {
            '_type': 'playlist',
            'entries': InAdvancePagedList(get_page, page_count, self._PAGE_SIZE),
            'id': str(first_page['id']),
            'title': first_page.get('title'),
            'description': first_page.get('description'),
            'uploader': first_page.get('announcer'),
        }
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
        'info_dict': {
            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
            'ext': 'mp3',
            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
            'episode': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'duration': 2893,
            'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg',
            'series': 'Raport o stanie świata',
        },
    }]

    def _real_extract(self, url):
        """Fetch metadata for a single podcast track and return its info dict.

        The track GUID from the URL is POSTed as JSON to the ``audio``
        endpoint and the first item of the response is parsed.

        Raises ExtractorError when the response contains no episode object.
        """
        podcast_id = self._match_id(url)
        data = self._download_json(
            f'{self._API_BASE}/audio',
            podcast_id, 'Downloading podcast metadata',
            data=json.dumps({
                'guids': [podcast_id],
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        # An empty or unexpectedly shaped response would otherwise surface as
        # a bare IndexError/KeyError from data[0]
        episode = traverse_obj(data, 0, expected_type=dict)
        if not episode:
            raise ExtractorError('No episode metadata returned by the API', video_id=podcast_id)
        return self._parse_episode(episode)