[ie/soundcloud] Various fixes (#11820)
[yt-dlp.git] / yt_dlp / extractor / podbayfm.py
blob0141eca90925fbe68c3f88ce132ed368649df34b
1 from .common import InfoExtractor
2 from ..utils import (
3 OnDemandPagedList,
4 clean_html,
5 int_or_none,
6 jwt_decode_hs256,
7 url_or_none,
9 from ..utils.traversal import traverse_obj
def result_from_props(props):
    """Build the info dict for one episode from its podbay.fm metadata mapping.

    `props` is the per-episode object as delivered by the site (either from
    the episode page data or from an entry of the channel API response).
    The individual fields are looked up with `traverse_obj`, each passed
    through the type check / sanitizer given in the mapping below.

    Returns a dict with the traversed fields plus the fixed audio-only
    markers `ext` and `vcodec`.
    """
    return {
        **traverse_obj(props, {
            'id': ('_id', {str}),
            'title': ('title', {str}),
            'url': ('mediaURL', {url_or_none}),
            'description': ('description', {clean_html}),
            # The `image` value is run through jwt_decode_hs256 and the
            # `url` member of the decoded result is used as the thumbnail
            'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
            'timestamp': ('timestamp', {int_or_none}),
            'duration': ('duration', {int_or_none}),
        }),
        # Every result is declared as audio-only MP3
        'ext': 'mp3',
        'vcodec': 'none',
    }
class PodbayFMIE(InfoExtractor):
    # Single episode page; the numeric last path component is captured as `id`
    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
        'md5': '895ac8505de349515f5ee8a4a3195c93',
        'info_dict': {
            'id': '62306451f4a48e58d0c4d6a8',
            'title': 'Part One: Kissinger',
            'ext': 'mp3',
            'description': r're:^We begin our epic six part series on Henry Kissinger.+',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1647338400,
            'duration': 5001,
            'upload_date': '20220315',
        },
    }]

    def _real_extract(self, url):
        """Extract a single episode from its podbay.fm page.

        Downloads the page, pulls the embedded page data out of it with
        `_search_nextjs_data` and converts the episode object found under
        props -> pageProps -> episode with `result_from_props`.
        """
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)
        data = self._search_nextjs_data(webpage, episode_id)
        # Plain indexing on purpose: a missing episode object is a hard failure
        return result_from_props(data['props']['pageProps']['episode'])
class PodbayFMChannelIE(InfoExtractor):
    # Podcast (channel) page; the slug after /p/ is captured as `id`.
    # The trailing group keeps this pattern from matching episode URLs.
    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://podbay.fm/p/behind-the-bastards',
        'info_dict': {
            'id': 'behind-the-bastards',
            'title': 'Behind the Bastards',
        },
        'playlist_mincount': 21,
    }]
    # Page size handed to OnDemandPagedList in _real_extract
    _PAGE_SIZE = 10

    def _fetch_page(self, channel_id, pagenum):
        """Download one page of the podcast API and return its `podcast` object.

        `pagenum` is zero-based and sent as-is in the query string; the
        progress note shows it one-based.
        """
        # The second positional parameter of _download_json is the video ID
        # and the third is the note, so the channel slug has to come first.
        return self._download_json(
            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
            channel_id, note=f'Downloading channel JSON page {pagenum + 1}')['podcast']

    @staticmethod
    def _results_from_page(channel_id, page):
        """Turn the `episodes` list of one API page into a list of info dicts.

        Each entry is attributed to the single-episode extractor and given
        the URL of its own episode page.
        """
        return [{
            **result_from_props(e),
            'extractor': PodbayFMIE.IE_NAME,
            'extractor_key': PodbayFMIE.ie_key(),
            # somehow they use timestamps as the episode identifier
            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
        } for e in page['episodes']]

    def _real_extract(self, url):
        """Return a playlist of all episodes of the podcast at `url`.

        The first API page is fetched eagerly because it supplies the
        playlist title; further pages are fetched on demand.
        """
        channel_id = self._match_id(url)

        first_page = self._fetch_page(channel_id, 0)
        entries = OnDemandPagedList(
            # Reuse the already downloaded first page for page 0
            lambda pagenum: self._results_from_page(
                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
            self._PAGE_SIZE)

        return self.playlist_result(entries, channel_id, first_page.get('title'))