yt_dlp/extractor/spreaker.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     filter_dict,
   6     float_or_none,
   7     int_or_none,
   8     parse_qs,
   9     str_or_none,
  10     try_get,
  11     unified_timestamp,
  12     url_or_none,
  13 )
  14 from ..utils.traversal import traverse_obj
  15
  16
  17 def _extract_episode(data, episode_id=None):
  18     title = data['title']
  19     download_url = data['download_url']
  20
  21     series = try_get(data, lambda x: x['show']['title'], str)
  22     uploader = try_get(data, lambda x: x['author']['fullname'], str)
  23
  24     thumbnails = []
  25     for image in ('image_original', 'image_medium', 'image'):
  26         image_url = url_or_none(data.get(f'{image}_url'))
  27         if image_url:
  28             thumbnails.append({'url': image_url})
  29
  30     def stats(key):
  31         return int_or_none(try_get(
  32             data,
  33             (lambda x: x[f'{key}s_count'],
  34              lambda x: x['stats'][f'{key}s'])))
  35
  36     def duration(key):
  37         return float_or_none(data.get(key), scale=1000)
  38
  39     return {
  40         'id': str(episode_id or data['episode_id']),
  41         'url': download_url,
  42         'display_id': data.get('permalink'),
  43         'title': title,
  44         'description': data.get('description'),
  45         'timestamp': unified_timestamp(data.get('published_at')),
  46         'uploader': uploader,
  47         'uploader_id': str_or_none(data.get('author_id')),
  48         'creator': uploader,
  49         'duration': duration('duration') or duration('length'),
  50         'view_count': stats('play'),
  51         'like_count': stats('like'),
  52         'comment_count': stats('message'),
  53         'format': 'MPEG Layer 3',
  54         'format_id': 'mp3',
  55         'container': 'mp3',
  56         'ext': 'mp3',
  57         'thumbnails': thumbnails,
  58         'series': series,
  59         'extractor_key': SpreakerIE.ie_key(),
  60     }
  61
  62
  63 class SpreakerIE(InfoExtractor):
  64     _VALID_URL = [
  65         r'https?://api\.spreaker\.com/(?:(?:download/)?episode|v2/episodes)/(?P<id>\d+)',
  66         r'https?://(?:www\.)?spreaker\.com/episode/[^#?/]*?(?P<id>\d+)/?(?:[?#]|$)',
  67     ]
  68     _TESTS = [{
  69         'url': 'https://api.spreaker.com/episode/12534508',
  70         'info_dict': {
  71             'id': '12534508',
  72             'display_id': 'swm-ep15-how-to-market-your-music-part-2',
  73             'ext': 'mp3',
  74             'title': 'EP:15 | Music Marketing (Likes) - Part 2',
  75             'description': 'md5:0588c43e27be46423e183076fa071177',
  76             'timestamp': 1502250336,
  77             'upload_date': '20170809',
  78             'uploader': 'SWM',
  79             'uploader_id': '9780658',
  80             'duration': 1063.42,
  81             'view_count': int,
  82             'like_count': int,
  83             'comment_count': int,
  84             'series': 'Success With Music | SWM',
  85             'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/t_square_limited_160/images.spreaker.com/original/777ce4f96b71b0e1b7c09a5e625210e3.jpg',
  86             'creators': ['SWM'],
  87         },
  88     }, {
  89         'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
  90         'only_matching': True,
  91     }, {
  92         'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
  93         'only_matching': True,
  94     }, {
  95         'note': 'episode',
  96         'url': 'https://www.spreaker.com/episode/grunge-music-origins-the-raw-sound-that-defined-a-generation--60269615',
  97         'info_dict': {
  98             'id': '60269615',
  99             'display_id': 'grunge-music-origins-the-raw-sound-that-',
 100             'ext': 'mp3',
 101             'title': 'Grunge Music Origins - The Raw Sound that Defined a Generation',
 102             'description': str,
 103             'timestamp': 1717468905,
 104             'upload_date': '20240604',
 105             'uploader': 'Katie Brown 2',
 106             'uploader_id': '17733249',
 107             'duration': 818.83,
 108             'view_count': int,
 109             'like_count': int,
 110             'comment_count': int,
 111             'series': '90s Grunge',
 112             'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/t_square_limited_160/images.spreaker.com/original/bb0d4178f7cf57cc8786dedbd9c5d969.jpg',
 113             'creators': ['Katie Brown 2'],
 114         },
 115     }, {
 116         'url': 'https://www.spreaker.com/episode/60269615',
 117         'only_matching': True,
 118     }]
 119
 120     def _real_extract(self, url):
 121         episode_id = self._match_id(url)
 122         data = self._download_json(
 123             f'https://api.spreaker.com/v2/episodes/{episode_id}', episode_id,
 124             query=traverse_obj(parse_qs(url), {'key': ('key', 0)}))['response']['episode']
 125         return _extract_episode(data, episode_id)
 126
 127
 128 class SpreakerShowIE(InfoExtractor):
 129     _VALID_URL = [
 130         r'https?://api\.spreaker\.com/show/(?P<id>\d+)',
 131         r'https?://(?:www\.)?spreaker\.com/podcast/[\w-]+--(?P<id>[\d]+)',
 132         r'https?://(?:www\.)?spreaker\.com/show/(?P<id>\d+)/episodes/feed',
 133     ]
 134     _TESTS = [{
 135         'url': 'https://api.spreaker.com/show/4652058',
 136         'info_dict': {
 137             'id': '4652058',
 138         },
 139         'playlist_mincount': 118,
 140     }, {
 141         'url': 'https://www.spreaker.com/podcast/health-wealth--5918323',
 142         'info_dict': {
 143             'id': '5918323',
 144         },
 145         'playlist_mincount': 60,
 146     }, {
 147         'url': 'https://www.spreaker.com/show/5887186/episodes/feed',
 148         'info_dict': {
 149             'id': '5887186',
 150         },
 151         'playlist_mincount': 290,
 152     }]
 153
 154     def _entries(self, show_id, key=None):
 155         for page_num in itertools.count(1):
 156             episodes = self._download_json(
 157                 f'https://api.spreaker.com/show/{show_id}/episodes',
 158                 show_id, note=f'Downloading JSON page {page_num}', query=filter_dict({
 159                     'page': page_num,
 160                     'max_per_page': 100,
 161                     'key': key,
 162                 }))
 163             pager = try_get(episodes, lambda x: x['response']['pager'], dict)
 164             if not pager:
 165                 break
 166             results = pager.get('results')
 167             if not results or not isinstance(results, list):
 168                 break
 169             for result in results:
 170                 if not isinstance(result, dict):
 171                     continue
 172                 yield _extract_episode(result)
 173             if page_num == pager.get('last_page'):
 174                 break
 175
 176     def _real_extract(self, url):
 177         show_id = self._match_id(url)
 178         key = traverse_obj(parse_qs(url), ('key', 0))
 179         return self.playlist_result(self._entries(show_id, key), playlist_id=show_id)