yt_dlp/extractor/stitcher.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     ExtractorError,
   4     clean_html,
   5     clean_podcast_url,
   6     int_or_none,
   7     str_or_none,
   8     try_get,
   9     url_or_none,
  10 )
  11
  12
  13 class StitcherBaseIE(InfoExtractor):
  14     _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'
  15
  16     def _call_api(self, path, video_id, query):
  17         resp = self._download_json(
  18             'https://api.prod.stitcher.com/' + path,
  19             video_id, query=query)
  20         error_massage = try_get(resp, lambda x: x['errors'][0]['message'])
  21         if error_massage:
  22             raise ExtractorError(error_massage, expected=True)
  23         return resp['data']
  24
  25     def _extract_description(self, data):
  26         return clean_html(data.get('html_description') or data.get('description'))
  27
  28     def _extract_audio_url(self, episode):
  29         return url_or_none(episode.get('audio_url') or episode.get('guid'))
  30
  31     def _extract_show_info(self, show):
  32         return {
  33             'thumbnail': show.get('image_base_url'),
  34             'series': show.get('title'),
  35         }
  36
  37     def _extract_episode(self, episode, audio_url, show_info):
  38         info = {
  39             'id': str(episode['id']),
  40             'display_id': episode.get('slug'),
  41             'title': episode['title'].strip(),
  42             'description': self._extract_description(episode),
  43             'duration': int_or_none(episode.get('duration')),
  44             'url': clean_podcast_url(audio_url),
  45             'vcodec': 'none',
  46             'timestamp': int_or_none(episode.get('date_published')),
  47             'season_number': int_or_none(episode.get('season')),
  48             'season_id': str_or_none(episode.get('season_id')),
  49         }
  50         info.update(show_info)
  51         return info
  52
  53
  54 class StitcherIE(StitcherBaseIE):
  55     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
  56     _TESTS = [{
  57         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
  58         'md5': 'e9635098e0da10b21a0e2b85585530f6',
  59         'info_dict': {
  60             'id': '40789481',
  61             'ext': 'mp3',
  62             'title': 'Machine Learning Mastery and Cancer Clusters',
  63             'description': 'md5:547adb4081864be114ae3831b4c2b42f',
  64             'duration': 1604,
  65             'thumbnail': r're:^https?://.*\.jpg',
  66             'upload_date': '20151008',
  67             'timestamp': 1444285800,
  68             'series': 'Talking Machines',
  69         },
  70     }, {
  71         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
  72         'info_dict': {
  73             'id': '40846275',
  74             'display_id': 'the-rare-hourlong-comedy-plus',
  75             'ext': 'mp3',
  76             'title': "The CW's 'Crazy Ex-Girlfriend'",
  77             'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
  78             'duration': 2235,
  79             'thumbnail': r're:^https?://.*\.jpg',
  80         },
  81         'params': {
  82             'skip_download': True,
  83         },
  84         'skip': 'Page Not Found',
  85     }, {
  86         # escaped title
  87         'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
  88         'only_matching': True,
  89     }, {
  90         'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
  91         'only_matching': True,
  92     }, {
  93         'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
  94         'only_matching': True,
  95     }]
  96
  97     def _real_extract(self, url):
  98         audio_id = self._match_id(url)
  99         data = self._call_api(
 100             'shows/episodes', audio_id, {'episode_ids': audio_id})
 101         episode = data['episodes'][0]
 102         audio_url = self._extract_audio_url(episode)
 103         if not audio_url:
 104             self.raise_login_required()
 105         show = try_get(data, lambda x: x['shows'][0], dict) or {}
 106         return self._extract_episode(
 107             episode, audio_url, self._extract_show_info(show))
 108
 109
 110 class StitcherShowIE(StitcherBaseIE):
 111     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)'
 112     _TESTS = [{
 113         'url': 'http://www.stitcher.com/podcast/the-talking-machines',
 114         'info_dict': {
 115             'id': 'the-talking-machines',
 116             'title': 'Talking Machines',
 117             'description': 'md5:831f0995e40f26c10231af39cf1ebf0b',
 118         },
 119         'playlist_mincount': 106,
 120     }, {
 121         'url': 'https://www.stitcher.com/show/the-talking-machines',
 122         'only_matching': True,
 123     }]
 124
 125     def _real_extract(self, url):
 126         show_slug = self._match_id(url)
 127         data = self._call_api(
 128             f'search/show/{show_slug}/allEpisodes', show_slug, {'count': 10000})
 129         show = try_get(data, lambda x: x['shows'][0], dict) or {}
 130         show_info = self._extract_show_info(show)
 131
 132         entries = []
 133         for episode in (data.get('episodes') or []):
 134             audio_url = self._extract_audio_url(episode)
 135             if not audio_url:
 136                 continue
 137             entries.append(self._extract_episode(episode, audio_url, show_info))
 138
 139         return self.playlist_result(
 140             entries, show_slug, show.get('title'),
 141             self._extract_description(show))