yt_dlp/extractor/globalplayer.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     clean_html,
   4     join_nonempty,
   5     parse_duration,
   6     str_or_none,
   7     traverse_obj,
   8     unified_strdate,
   9     unified_timestamp,
  10     urlhandle_detect_ext,
  11 )
  12
  13
  14 class GlobalPlayerBaseIE(InfoExtractor):
  15     def _get_page_props(self, url, video_id):
  16         webpage = self._download_webpage(url, video_id)
  17         return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
  18
  19     def _request_ext(self, url, video_id):
  20         return urlhandle_detect_ext(self._request_webpage(  # Server rejects HEAD requests
  21             url, video_id, note='Determining source extension'))
  22
  23     def _extract_audio(self, episode, series):
  24         return {
  25             'vcodec': 'none',
  26             **traverse_obj(series, {
  27                 'series': 'title',
  28                 'series_id': 'id',
  29                 'thumbnail': 'imageUrl',
  30                 'uploader': 'itunesAuthor',  # podcasts only
  31             }),
  32             **traverse_obj(episode, {
  33                 'id': 'id',
  34                 'description': ('description', {clean_html}),
  35                 'duration': ('duration', {parse_duration}),
  36                 'thumbnail': 'imageUrl',
  37                 'url': 'streamUrl',
  38                 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}),
  39                 'title': 'title',
  40             }, get_all=False),
  41         }
  42
  43
  44 class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
  45     _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
  46     _TESTS = [{
  47         'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
  48         'info_dict': {
  49             'id': '2mx1E',
  50             'ext': 'aac',
  51             'display_id': 'smoothchill-uk',
  52             'title': 're:^Smooth Chill.+$',
  53             'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
  54             'description': 'Music To Chill To',
  55             'live_status': 'is_live',
  56         },
  57     }, {
  58         # national station
  59         'url': 'https://www.globalplayer.com/live/heart/uk/',
  60         'info_dict': {
  61             'id': '2mwx4',
  62             'ext': 'aac',
  63             'description': 'turn up the feel good!',
  64             'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
  65             'live_status': 'is_live',
  66             'title': 're:^Heart UK.+$',
  67             'display_id': 'heart-uk',
  68         },
  69     }, {
  70         # regional variation
  71         'url': 'https://www.globalplayer.com/live/heart/london/',
  72         'info_dict': {
  73             'id': 'AMqg',
  74             'ext': 'aac',
  75             'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
  76             'title': 're:^Heart London.+$',
  77             'live_status': 'is_live',
  78             'display_id': 'heart-london',
  79             'description': 'turn up the feel good!',
  80         },
  81     }]
  82
  83     def _real_extract(self, url):
  84         video_id = self._match_id(url)
  85         station = self._get_page_props(url, video_id)['station']
  86         stream_url = station['streamUrl']
  87
  88         return {
  89             'id': station['id'],
  90             'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'),
  91             'url': stream_url,
  92             'ext': self._request_ext(stream_url, video_id),
  93             'vcodec': 'none',
  94             'is_live': True,
  95             **traverse_obj(station, {
  96                 'title': (('name', 'brandName'), {str_or_none}),
  97                 'description': 'tagline',
  98                 'thumbnail': 'brandLogo',
  99             }, get_all=False),
 100         }
 101
 102
 103 class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
 104     _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
 105     _TESTS = [{
 106         # "live playlist"
 107         'url': 'https://www.globalplayer.com/playlists/8bLk/',
 108         'info_dict': {
 109             'id': '8bLk',
 110             'ext': 'aac',
 111             'live_status': 'is_live',
 112             'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d',
 113             'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
 114             'title': 're:^Classic FM Hall of Fame.+$',
 115         },
 116     }]
 117
 118     def _real_extract(self, url):
 119         video_id = self._match_id(url)
 120         station = self._get_page_props(url, video_id)['playlistData']
 121         stream_url = station['streamUrl']
 122
 123         return {
 124             'id': video_id,
 125             'url': stream_url,
 126             'ext': self._request_ext(stream_url, video_id),
 127             'vcodec': 'none',
 128             'is_live': True,
 129             **traverse_obj(station, {
 130                 'title': 'title',
 131                 'description': 'description',
 132                 'thumbnail': 'image',
 133             }),
 134         }
 135
 136
 137 class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
 138     _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
 139     _TESTS = [{
 140         # podcast
 141         'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
 142         'playlist_mincount': 5,
 143         'info_dict': {
 144             'id': '42KuaM',
 145             'title': 'Filthy Ritual',
 146             'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
 147             'categories': ['Society & Culture', 'True Crime'],
 148             'uploader': 'Global',
 149             'description': 'md5:da5b918eac9ae319454a10a563afacf9',
 150         },
 151     }, {
 152         # radio catchup
 153         'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
 154         'playlist_mincount': 3,
 155         'info_dict': {
 156             'id': '46vyD7z',
 157             'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
 158             'title': 'Nick Ferrari',
 159             'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
 160         },
 161     }]
 162
 163     def _real_extract(self, url):
 164         video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
 165         props = self._get_page_props(url, video_id)
 166         series = props['podcastInfo'] if podcast else props['catchupInfo']
 167
 168         return {
 169             '_type': 'playlist',
 170             'id': video_id,
 171             'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
 172                         series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
 173             'categories': traverse_obj(series, ('categories', ..., 'name')) or None,
 174             **traverse_obj(series, {
 175                 'description': 'description',
 176                 'thumbnail': 'imageUrl',
 177                 'title': 'title',
 178                 'uploader': 'itunesAuthor',  # podcasts only
 179             }),
 180         }
 181
 182
 183 class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
 184     _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
 185     _TESTS = [{
 186         # podcast
 187         'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
 188         'info_dict': {
 189             'id': '7DrfNnE',
 190             'ext': 'mp3',
 191             'title': 'Filthy Ritual - Trailer',
 192             'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
 193             'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
 194             'duration': 225.0,
 195             'timestamp': 1681254900,
 196             'series': 'Filthy Ritual',
 197             'series_id': '42KuaM',
 198             'upload_date': '20230411',
 199             'uploader': 'Global',
 200         },
 201     }, {
 202         # radio catchup
 203         'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
 204         'info_dict': {
 205             'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
 206             'ext': 'm4a',
 207             'timestamp': 1682056800,
 208             'series': 'Nick Ferrari',
 209             'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
 210             'upload_date': '20230421',
 211             'series_id': '46vyD7z',
 212             'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
 213             'title': 'Nick Ferrari',
 214             'duration': 10800.0,
 215         },
 216     }]
 217
 218     def _real_extract(self, url):
 219         video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
 220         props = self._get_page_props(url, video_id)
 221         episode = props['podcastEpisode'] if podcast else props['catchupEpisode']
 222
 223         return self._extract_audio(
 224             episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})
 225
 226
 227 class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
 228     _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
 229     _TESTS = [{
 230         'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
 231         'info_dict': {
 232             'id': '2JsSZ7Gm2uP',
 233             'ext': 'mp4',
 234             'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
 235             'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
 236             'upload_date': '20230420',
 237             'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
 238         },
 239     }]
 240
 241     def _real_extract(self, url):
 242         video_id = self._match_id(url)
 243         meta = self._get_page_props(url, video_id)['videoData']
 244
 245         return {
 246             'id': video_id,
 247             **traverse_obj(meta, {
 248                 'url': 'url',
 249                 'thumbnail': ('image', 'url'),
 250                 'title': 'title',
 251                 'upload_date': ('publish_date', {unified_strdate}),
 252                 'description': 'description',
 253             }),
 254         }