yt_dlp/extractor/podbayfm.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     OnDemandPagedList,
   4     clean_html,
   5     int_or_none,
   6     jwt_decode_hs256,
   7     url_or_none,
   8 )
   9 from ..utils.traversal import traverse_obj
  10
  11
  12 def result_from_props(props):
  13     return {
  14         **traverse_obj(props, {
  15             'id': ('_id', {str}),
  16             'title': ('title', {str}),
  17             'url': ('mediaURL', {url_or_none}),
  18             'description': ('description', {clean_html}),
  19             'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
  20             'timestamp': ('timestamp', {int_or_none}),
  21             'duration': ('duration', {int_or_none}),
  22         }),
  23         'ext': 'mp3',
  24         'vcodec': 'none',
  25     }
  26
  27
  28 class PodbayFMIE(InfoExtractor):
  29     _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
  30     _TESTS = [{
  31         'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
  32         'md5': '895ac8505de349515f5ee8a4a3195c93',
  33         'info_dict': {
  34             'id': '62306451f4a48e58d0c4d6a8',
  35             'title': 'Part One: Kissinger',
  36             'ext': 'mp3',
  37             'description': r're:^We begin our epic six part series on Henry Kissinger.+',
  38             'thumbnail': r're:^https?://.*\.jpg',
  39             'timestamp': 1647338400,
  40             'duration': 5001,
  41             'upload_date': '20220315',
  42         },
  43     }]
  44
  45     def _real_extract(self, url):
  46         episode_id = self._match_id(url)
  47         webpage = self._download_webpage(url, episode_id)
  48         data = self._search_nextjs_data(webpage, episode_id)
  49         return result_from_props(data['props']['pageProps']['episode'])
  50
  51
  52 class PodbayFMChannelIE(InfoExtractor):
  53     _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
  54     _TESTS = [{
  55         'url': 'https://podbay.fm/p/behind-the-bastards',
  56         'info_dict': {
  57             'id': 'behind-the-bastards',
  58             'title': 'Behind the Bastards',
  59         },
  60         'playlist_mincount': 21,
  61     }]
  62     _PAGE_SIZE = 10
  63
  64     def _fetch_page(self, channel_id, pagenum):
  65         return self._download_json(
  66             f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
  67             f'Downloading channel JSON page {pagenum + 1}', channel_id)['podcast']
  68
  69     @staticmethod
  70     def _results_from_page(channel_id, page):
  71         return [{
  72             **result_from_props(e),
  73             'extractor': PodbayFMIE.IE_NAME,
  74             'extractor_key': PodbayFMIE.ie_key(),
  75             # somehow they use timestamps as the episode identifier
  76             'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
  77         } for e in page['episodes']]
  78
  79     def _real_extract(self, url):
  80         channel_id = self._match_id(url)
  81
  82         first_page = self._fetch_page(channel_id, 0)
  83         entries = OnDemandPagedList(
  84             lambda pagenum: self._results_from_page(
  85                 channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
  86             self._PAGE_SIZE)
  87
  88         return self.playlist_result(entries, channel_id, first_page.get('title'))