7 from .common
import InfoExtractor
8 from ..compat
import compat_str
26 class PolskieRadioBaseExtractor(InfoExtractor
):
27 def _extract_webpage_player_entries(self
, webpage
, playlist_id
, base_data
):
30 for data_media
in re
.findall(r
'<[^>]+data-media="?({[^>]+})"?', webpage
):
31 media
= self
._parse
_json
(data_media
, playlist_id
, transform_source
=unescapeHTML
, fatal
=False)
32 if not media
.get('file') or not media
.get('desc'):
34 media_url
= self
._proto
_relative
_url
(media
['file'])
35 if media_url
in media_urls
:
37 media_urls
.add(media_url
)
38 entry
= base_data
.copy()
40 'id': compat_str(media
['id']),
42 'duration': int_or_none(media
.get('length')),
43 'vcodec': 'none' if media
.get('provider') == 'audio' else None,
45 entry_title
= urllib
.parse
.unquote(media
['desc'])
47 entry
['title'] = entry_title
51 class PolskieRadioLegacyIE(PolskieRadioBaseExtractor
):
53 IE_NAME
= 'polskieradio:legacy'
54 _VALID_URL
= r
'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
56 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
59 'title': 'Żagaryści. Poezja jak spoiwo',
60 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
63 'md5': 'd07559829f61d5a93a75755987ded760',
67 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
68 'timestamp': 1592654400,
69 'upload_date': '20200620',
71 'thumbnail': r
're:^https?://static\.prsa\.pl/images/.*\.jpg$'
75 # PR4 audition - other frontend
76 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
80 'title': 'Pogłos 29 października godz. 23:01',
83 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
84 'only_matching': True,
87 def _real_extract(self
, url
):
88 playlist_id
= self
._match
_id
(url
)
90 webpage
, urlh
= self
._download
_webpage
_handle
(url
, playlist_id
)
91 if PolskieRadioIE
.suitable(urlh
.url
):
92 return self
.url_result(urlh
.url
, PolskieRadioIE
, playlist_id
)
94 content
= self
._search
_regex
(
95 r
'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
96 webpage
, 'content', default
=None)
98 timestamp
= unified_timestamp(self
._html
_search
_regex
(
99 r
'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
100 webpage
, 'timestamp', default
=None))
102 thumbnail_url
= self
._og
_search
_thumbnail
(webpage
, default
=None)
104 title
= self
._og
_search
_title
(webpage
).strip()
106 description
= strip_or_none(self
._og
_search
_description
(webpage
, default
=None))
107 description
= description
.replace('\xa0', ' ') if description
is not None else None
112 'url': self
._proto
_relative
_url
(
114 r
"source:\s*'(//static\.prsa\.pl/[^']+)'",
115 webpage
, 'audition record url')),
117 'description': description
,
118 'timestamp': timestamp
,
119 'thumbnail': thumbnail_url
,
122 entries
= self
._extract
_webpage
_player
_entries
(content
, playlist_id
, {
124 'timestamp': timestamp
,
125 'thumbnail': thumbnail_url
,
128 return self
.playlist_result(entries
, playlist_id
, title
, description
)
131 class PolskieRadioIE(PolskieRadioBaseExtractor
):
133 _VALID_URL
= r
'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
135 # articleData, attachments
136 'url': 'https://jedynka.polskieradio.pl/artykul/1587943',
139 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
140 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
143 'md5': '2984ee6ce9046d91fc233bc1a864a09a',
145 'id': '7a85d429-5356-4def-a347-925e4ae7406b',
147 'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
151 # post, legacy html players
152 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager',
155 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?',
156 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473',
162 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a',
168 'url': 'https://radiokierowcow.pl/artykul/2694529',
171 'title': 'Zielona fala reliktem przeszłości?',
172 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0',
176 'url': 'https://trojka.polskieradio.pl/artykul/1632955',
177 'only_matching': True,
180 'url': 'https://trojka.polskieradio.pl/artykul/1634903',
181 'only_matching': True,
183 'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego',
184 'only_matching': True,
187 def _real_extract(self
, url
):
188 playlist_id
= self
._match
_id
(url
)
190 webpage
= self
._download
_webpage
(url
, playlist_id
)
192 article_data
= traverse_obj(
193 self
._search
_nextjs
_data
(webpage
, playlist_id
), (
194 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all
=False)
196 title
= strip_or_none(article_data
['title'])
198 description
= strip_or_none(article_data
.get('lead'))
201 'url': entry
['file'],
202 'ext': determine_ext(entry
.get('fileName')),
203 'id': self
._search
_regex
(
204 r
'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry
['file'], 'entry id'),
205 'title': strip_or_none(entry
.get('description')) or title
,
206 } for entry
in article_data
.get('attachments') or () if entry
.get('fileType') in ('Audio', )]
209 # some legacy articles have no json attachments, but players in body
210 entries
= self
._extract
_webpage
_player
_entries
(article_data
['content'], playlist_id
, {
214 return self
.playlist_result(entries
, playlist_id
, title
, description
)
217 class PolskieRadioAuditionIE(InfoExtractor
):
219 IE_NAME
= 'polskieradio:audition'
220 _VALID_URL
= r
'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)'
223 'url': 'https://jedynka.polskieradio.pl/audycje/5102',
226 'title': 'Historia żywa',
227 'thumbnail': r
're:https://static\.prsa\.pl/images/.+',
229 'playlist_mincount': 38,
232 'url': 'https://jedynka.polskieradio.pl/audycje/5769',
235 'title': 'AgroFakty',
236 'thumbnail': r
're:https://static\.prsa\.pl/images/.+',
238 'playlist_mincount': 269,
240 # both episodes and articles, PR3
241 'url': 'https://trojka.polskieradio.pl/audycja/8906',
244 'title': 'Trójka budzi',
245 'thumbnail': r
're:https://static\.prsa\.pl/images/.+',
247 'playlist_mincount': 722,
249 # some articles were "promoted to main page" and thus link to old frontend
250 'url': 'https://trojka.polskieradio.pl/audycja/305',
253 'title': 'Co w mowie piszczy?',
254 'thumbnail': r
're:https://static\.prsa\.pl/images/.+',
256 'playlist_count': 1523,
def _call_lp3(self, path, query, video_id, note):
    """Fetch JSON from the lp3test.polskieradio.pl API.

    Args:
        path: Endpoint path appended to ``https://lp3test.polskieradio.pl/``.
        query: Mapping of query-string parameters for the request.
        video_id: Identifier handed to ``_download_json``.
        note: Progress message handed to ``_download_json``.

    Returns:
        Whatever ``self._download_json`` returns for the request.
    """
    # Every request carries the same fixed ``x-api-key`` header.
    return self._download_json(
        f'https://lp3test.polskieradio.pl/{path}', video_id, note,
        query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'})
264 def _entries(self
, playlist_id
, has_episodes
, has_articles
):
265 for i
in itertools
.count(0) if has_episodes
else []:
266 page
= self
._call
_lp
3(
267 'AudioArticle/GetListByCategoryId', {
268 'categoryId': playlist_id
,
272 }, playlist_id
, f
'Downloading episode list page {i + 1}')
273 if not traverse_obj(page
, 'data'):
275 for episode
in page
['data']:
277 'id': str(episode
['id']),
278 'url': episode
['file'],
279 'title': episode
.get('title'),
280 'duration': int_or_none(episode
.get('duration')),
281 'timestamp': parse_iso8601(episode
.get('datePublic')),
284 for i
in itertools
.count(0) if has_articles
else []:
285 page
= self
._call
_lp
3(
286 'Article/GetListByCategoryId', {
287 'categoryId': playlist_id
,
291 }, playlist_id
, f
'Downloading article list page {i + 1}')
292 if not traverse_obj(page
, 'data'):
294 for article
in page
['data']:
296 '_type': 'url_transparent',
297 'id': str(article
['id']),
298 'url': article
['url'],
299 'title': article
.get('shortTitle'),
300 'description': traverse_obj(article
, ('description', 'lead')),
301 'timestamp': parse_iso8601(article
.get('datePublic')),
def _real_extract(self, url):
    """Build a playlist result for an audition page.

    Downloads the page, reads its Next.js data blob and uses it both to
    decide which listings to request and to fill in the playlist metadata.

    Args:
        url: Page URL; the playlist id is taken from it via ``_match_id``.

    Returns:
        The value of ``self.playlist_result`` for the entries produced by
        ``self._entries``, with title, description and thumbnail read from
        the page properties.
    """
    playlist_id = self._match_id(url)

    # The properties are looked up under props.pageProps.data, falling back
    # to props.pageProps itself (the ``None`` branch); ``get_all=False``
    # keeps only the first match.
    page_props = traverse_obj(
        self._search_nextjs_data(self._download_webpage(url, playlist_id), playlist_id),
        ('props', 'pageProps', ('data', None)), get_all=False)

    # Truthiness of these keys decides which listings ``_entries`` requests.
    has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios'))
    has_articles = bool(traverse_obj(page_props, 'articles'))

    return self.playlist_result(
        self._entries(playlist_id, has_episodes, has_articles), playlist_id,
        title=traverse_obj(page_props, ('details', 'name')),
        description=traverse_obj(page_props, ('details', 'description', 'lead')),
        thumbnail=traverse_obj(page_props, ('details', 'photo')))
321 class PolskieRadioCategoryIE(InfoExtractor
):
323 IE_NAME
= 'polskieradio:category'
324 _VALID_URL
= r
'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
326 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
329 'title': 'Kierunek Kraków',
331 'playlist_mincount': 61
333 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
338 'playlist_mincount': 61
341 'url': 'https://www.polskieradio.pl/8/2385',
344 'title': 'Droga przez mąkę',
346 'playlist_mincount': 111,
348 'url': 'https://www.polskieradio.pl/10/4930',
351 'title': 'Teraz K-pop!',
353 'playlist_mincount': 392,
355 # post back pages, audio content directly without articles
356 'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
359 'title': 'Nowa mowa',
361 'playlist_mincount': 244,
363 'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
366 'title': 'Krzysztof Dziuba',
368 'playlist_mincount': 420,
370 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
371 'only_matching': True,
# NOTE(review): this takes ``cls``, so a ``@classmethod`` decorator is
# presumably on the line just above this span -- confirm it is present.
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    Args:
        url: The URL to check.

    Returns:
        ``False`` when ``PolskieRadioLegacyIE.suitable(url)`` is truthy,
        otherwise the result of the parent class's ``suitable(url)``.
    """
    # URLs claimed by the legacy article extractor are never handled here.
    if PolskieRadioLegacyIE.suitable(url):
        return False
    return super().suitable(url)
378 def _entries(self
, url
, page
, category_id
):
380 is_billennium_tabs
= 'onclick="TB_LoadTab(' in page
381 is_post_back
= 'onclick="__doPostBack(' in page
382 pagination
= page
if is_billennium_tabs
else None
383 for page_num
in itertools
.count(2):
384 for a_entry
, entry_id
in re
.findall(
385 r
'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?
:(?
:https?
)?
://[^
/]+)?
/\d
+/\d
+/Artykul
/(\d
+)[^
>]+>).*?
</article
>',
387 entry = extract_attributes(a_entry)
388 if entry.get('href
'):
389 yield self.url_result(
390 urljoin(url, entry['href
']), PolskieRadioLegacyIE, entry_id, entry.get('title
'))
391 for a_entry in re.findall(r'<span data
-media
=({[^
]+})', content):
392 yield traverse_obj(self._parse_json(a_entry, category_id), {
395 'duration
': 'length
',
396 'title
': ('title
', {urllib.parse.unquote}),
397 'description
': ('desc
', {urllib.parse.unquote}),
399 if is_billennium_tabs:
400 params = self._search_json(
401 r'<div
[^
>]+class=["\']next["\'][^
>]*>\s
*<a
[^
>]+onclick
=["\']TB_LoadTab\(',
402 pagination, 'next page params', category_id, default=None, close_objects=1,
403 contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x)))
406 tab_content = self._download_json(
407 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
408 category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
409 data=json.dumps(dict(zip((
410 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
411 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
412 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber'
413 ), params))).encode())['d']
414 content, pagination = tab_content['Content'], tab_content.get('PagerContent')
416 target = self._search_regex(
417 r'onclick=(?:["\'])__doPostBack\
((?P
<q1
>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P
=q2
)',
418 content, 'pagination postback target
', group='target
', default=None)
421 content = self._download_webpage(
422 url, category_id, f'Downloading page {page_num}
',
423 data=urllib.parse.urlencode({
424 **self._hidden_inputs(content),
425 '__EVENTTARGET
': target,
426 '__EVENTARGUMENT
': 'Next
',
429 next_url = urljoin(url, self._search_regex(
430 r'<div
[^
>]+class=["\']next["\'][^
>]*>\s
*<a
[^
>]+href
=(["\'])(?P<url>(?:(?!\1).)+)\1',
431 content, 'next page url', group='url', default=None))
434 content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')
436 def _real_extract(self, url):
437 category_id = self._match_id(url)
438 webpage, urlh = self._download_webpage_handle(url, category_id)
439 if PolskieRadioAuditionIE.suitable(urlh.url):
440 return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id)
441 title = self._html_search_regex(
442 r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
443 webpage, 'title', fatal=False)
444 return self.playlist_result(
445 self._entries(url, webpage, category_id),
449 class PolskieRadioPlayerIE(InfoExtractor):
450 IE_NAME = 'polskieradio:player'
451 _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'
453 _BASE_URL = 'https://player.polskieradio.pl'
454 _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
455 _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'
458 'url': 'https://player.polskieradio.pl/anteny/trojka',
465 'format': 'bestaudio',
466 'skip_download': 'endless stream',
def _get_channel_list(self, channel_url='no_channel'):
    """Download the player bundle and parse the channel list embedded in it.

    Args:
        channel_url: Identifier passed to the download and parse helpers;
            defaults to ``'no_channel'``.

    Returns:
        The value ``self._parse_json`` produces from the array literal found
        in the script at ``self._PLAYER_URL``.
    """
    player_code = self._download_webpage(
        self._PLAYER_URL, channel_url,
        note='Downloading js player')
    # The list is a JavaScript array literal inside the bundle, so it goes
    # through js_to_json before being parsed as JSON.
    channel_list = js_to_json(self._search_regex(
        r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
    return self._parse_json(channel_list, channel_url)
478 def _real_extract(self, url):
479 channel_url = self._match_id(url)
480 channel_list = self._get_channel_list(channel_url)
482 channel = next((c for c in channel_list if c.get('url') == channel_url), None)
485 raise ExtractorError('Channel not found')
487 station_list = self._download_json(self._STATIONS_API_URL, channel_url,
488 note='Downloading stream url list',
490 'Accept': 'application/json',
492 'Origin': self._BASE_URL,
494 station = next((s for s in station_list
495 if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
497 raise ExtractorError('Station not found even though we extracted channel')
500 for stream_url in station['Streams']:
501 stream_url = self._proto_relative_url(stream_url)
502 if stream_url.endswith('/playlist.m3u8'):
503 formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
504 elif stream_url.endswith('/manifest.f4m'):
505 formats.extend(self._extract_mpd_formats(stream_url, channel_url))
506 elif stream_url.endswith('/Manifest'):
507 formats.extend(self._extract_ism_formats(stream_url, channel_url))
514 'id': compat_str(channel['id']),
516 'title': channel.get('name') or channel.get('streamName'),
517 'display_id': channel_url,
518 'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
523 class PolskieRadioPodcastBaseExtractor(InfoExtractor):
524 _API_BASE = 'https://apipodcasts.polskieradio.pl/api'
526 def _parse_episode(self, data):
531 'filesize': int_or_none(data.get('fileSize')),
533 'title': data['title'],
534 'description': data.get('description'),
535 'duration': int_or_none(data.get('length')),
536 'timestamp': parse_iso8601(data.get('publishDate')),
537 'thumbnail': url_or_none(data.get('image')),
538 'series': data.get('podcastTitle'),
539 'episode': data['title'],
543 class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
544 IE_NAME = 'polskieradio:podcast:list'
545 _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
547 'url': 'https://podcasty.polskieradio.pl/podcast/8/',
550 'title': 'Śniadanie w Trójce',
551 'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
552 'uploader': 'Beata Michniewicz',
554 'playlist_mincount': 714,
def _call_api(self, podcast_id, page):
    """Download one page of a podcast's listing from the podcasts API.

    Args:
        podcast_id: Podcast identifier placed in the request path; also
            passed to ``_download_json`` as the id.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        Whatever ``self._download_json`` returns for the request.
    """
    # NOTE(review): ``self._PAGE_SIZE`` is not defined in the visible part of
    # this class -- presumably a class attribute declared just above; confirm.
    return self._download_json(
        f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}',
        podcast_id, f'Downloading page {page}')
563 def _real_extract(self, url):
564 podcast_id = self._match_id(url)
565 data = self._call_api(podcast_id, 1)
567 def get_page(page_num):
568 page_data = self._call_api(podcast_id, page_num + 1) if page_num else data
569 yield from (self._parse_episode(ep) for ep in page_data['items'])
573 'entries': InAdvancePagedList(
574 get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE),
575 'id': str(data['id']),
576 'title': data.get('title'),
577 'description': data.get('description'),
578 'uploader': data.get('announcer'),
582 class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
583 IE_NAME = 'polskieradio:podcast'
584 _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
586 'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
588 'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
590 'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
591 'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
592 'episode': 'Theresa May rezygnuje. Co dalej z brexitem?',
594 'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg',
595 'series': 'Raport o stanie świata',
599 def _real_extract(self, url):
600 podcast_id = self._match_id(url)
601 data = self._download_json(
602 f'{self._API_BASE}/audio',
603 podcast_id, 'Downloading podcast metadata',
605 'guids': [podcast_id],
608 'Content-Type': 'application/json',
610 return self._parse_episode(data[0])