yt_dlp/extractor/nebula.py

   1 import itertools
   2 import json
   3
   4 from .art19 import Art19IE
   5 from .common import InfoExtractor
   6 from ..networking.exceptions import HTTPError
   7 from ..utils import (
   8     ExtractorError,
   9     int_or_none,
  10     make_archive_id,
  11     parse_iso8601,
  12     smuggle_url,
  13     try_call,
  14     unsmuggle_url,
  15     update_url_query,
  16     url_or_none,
  17     urljoin,
  18 )
  19 from ..utils.traversal import traverse_obj
  20
  21 _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
  22
  23
  24 class NebulaBaseIE(InfoExtractor):
  25     _NETRC_MACHINE = 'watchnebula'
  26     _token = _api_token = None
  27
  28     def _perform_login(self, username, password):
  29         try:
  30             response = self._download_json(
  31                 'https://nebula.tv/auth/login/', None,
  32                 'Logging in to Nebula', 'Login failed',
  33                 data=json.dumps({'email': username, 'password': password}).encode(),
  34                 headers={'content-type': 'application/json'})
  35         except ExtractorError as e:
  36             if isinstance(e.cause, HTTPError) and e.cause.status == 400:
  37                 raise ExtractorError('Login failed: Invalid username or password', expected=True)
  38             raise
  39         self._api_token = traverse_obj(response, ('key', {str}))
  40         if not self._api_token:
  41             raise ExtractorError('Login failed: No token')
  42
  43     def _call_api(self, *args, **kwargs):
  44         if self._token:
  45             kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
  46         try:
  47             return self._download_json(*args, **kwargs)
  48         except ExtractorError as e:
  49             if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403):
  50                 raise
  51             self.to_screen(
  52                 f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}')
  53             self._real_initialize()
  54             if self._token:
  55                 kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
  56             return self._download_json(*args, **kwargs)
  57
  58     def _real_initialize(self):
  59         if not self._api_token:
  60             self._api_token = try_call(
  61                 lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value)
  62         self._token = self._download_json(
  63             'https://users.api.nebula.app/api/v1/authorization/', None,
  64             headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None,
  65             note='Authorizing to Nebula', data=b'')['token']
  66
  67     def _extract_formats(self, content_id, slug):
  68         for retry in (False, True):
  69             try:
  70                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
  71                     f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8',
  72                     slug, 'mp4', query={
  73                         'token': self._token,
  74                         'app_version': '23.10.0',
  75                         'platform': 'ios',
  76                     })
  77                 return {'formats': fmts, 'subtitles': subs}
  78             except ExtractorError as e:
  79                 if isinstance(e.cause, HTTPError) and e.cause.status == 401:
  80                     self.raise_login_required()
  81                 if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403:
  82                     self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error')
  83                     self._real_initialize()
  84                     continue
  85                 raise
  86
  87     def _extract_video_metadata(self, episode):
  88         channel_url = traverse_obj(
  89             episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False)
  90         return {
  91             'id': episode['id'].partition(':')[2],
  92             **traverse_obj(episode, {
  93                 'display_id': 'slug',
  94                 'title': 'title',
  95                 'description': 'description',
  96                 'timestamp': ('published_at', {parse_iso8601}),
  97                 'duration': ('duration', {int_or_none}),
  98                 'channel_id': 'channel_slug',
  99                 'uploader_id': 'channel_slug',
 100                 'channel': 'channel_title',
 101                 'uploader': 'channel_title',
 102                 'series': 'channel_title',
 103                 'creator': 'channel_title',
 104                 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}),
 105                 'episode_number': ('order', {int_or_none}),
 106                 # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
 107                 '_old_archive_ids': ('zype_id', {lambda x: [
 108                     make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}),
 109             }),
 110             'channel_url': channel_url,
 111             'uploader_url': channel_url,
 112         }
 113
 114
 115 class NebulaIE(NebulaBaseIE):
 116     IE_NAME = 'nebula:video'
 117     _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[\w-]+)'
 118     _TESTS = [{
 119         'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
 120         'info_dict': {
 121             'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
 122             'ext': 'mp4',
 123             'title': 'That Time Disney Remade Beauty and the Beast',
 124             'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4',
 125             'upload_date': '20180731',
 126             'timestamp': 1533009600,
 127             'channel': 'Lindsay Ellis',
 128             'channel_id': 'lindsayellis',
 129             'uploader': 'Lindsay Ellis',
 130             'uploader_id': 'lindsayellis',
 131             'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis',
 132             'series': 'Lindsay Ellis',
 133             'display_id': 'that-time-disney-remade-beauty-and-the-beast',
 134             'channel_url': r're:https://nebula\.(tv|app)/lindsayellis',
 135             'creator': 'Lindsay Ellis',
 136             'duration': 2212,
 137             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 138             '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'],
 139         },
 140         'params': {'skip_download': 'm3u8'},
 141     }, {
 142         'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
 143         'md5': 'd05739cf6c38c09322422f696b569c23',
 144         'info_dict': {
 145             'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
 146             'ext': 'mp4',
 147             'title': 'Landing Craft - How The Allies Got Ashore',
 148             'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
 149             'upload_date': '20200327',
 150             'timestamp': 1585348140,
 151             'channel': 'Real Engineering — The Logistics of D-Day',
 152             'channel_id': 'd-day',
 153             'uploader': 'Real Engineering — The Logistics of D-Day',
 154             'uploader_id': 'd-day',
 155             'series': 'Real Engineering — The Logistics of D-Day',
 156             'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
 157             'creator': 'Real Engineering — The Logistics of D-Day',
 158             'duration': 841,
 159             'channel_url': 'https://nebula.tv/d-day',
 160             'uploader_url': 'https://nebula.tv/d-day',
 161             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 162             '_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'],
 163         },
 164         'params': {'skip_download': 'm3u8'},
 165     }, {
 166         'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
 167         'md5': 'ebe28a7ad822b9ee172387d860487868',
 168         'info_dict': {
 169             'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
 170             'ext': 'mp4',
 171             'title': 'Episode 1: The Draw',
 172             'description': r'contains:There’s free money on offer… if the players can all work together.',
 173             'upload_date': '20200323',
 174             'timestamp': 1584980400,
 175             'channel': 'Tom Scott Presents: Money',
 176             'channel_id': 'tom-scott-presents-money',
 177             'uploader': 'Tom Scott Presents: Money',
 178             'uploader_id': 'tom-scott-presents-money',
 179             'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
 180             'duration': 825,
 181             'channel_url': 'https://nebula.tv/tom-scott-presents-money',
 182             'series': 'Tom Scott Presents: Money',
 183             'display_id': 'money-episode-1-the-draw',
 184             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 185             'creator': 'Tom Scott Presents: Money',
 186             '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'],
 187         },
 188         'params': {'skip_download': 'm3u8'},
 189     }, {
 190         'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
 191         'only_matching': True,
 192     }, {
 193         'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
 194         'info_dict': {
 195             'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d',
 196             'ext': 'mp4',
 197             'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
 198             'title': 'Did the US Really Blow Up the NordStream Pipelines?',
 199             'description': 'md5:b4e2a14e3ff08f546a3209c75261e789',
 200             'upload_date': '20230223',
 201             'timestamp': 1677144070,
 202             'channel': 'TLDR News EU',
 203             'channel_id': 'tldrnewseu',
 204             'uploader': 'TLDR News EU',
 205             'uploader_id': 'tldrnewseu',
 206             'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu',
 207             'duration': 524,
 208             'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu',
 209             'series': 'TLDR News EU',
 210             'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
 211             'creator': 'TLDR News EU',
 212             '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'],
 213         },
 214         'params': {'skip_download': 'm3u8'},
 215     }, {
 216         'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
 217         'only_matching': True,
 218     }]
 219
 220     def _real_extract(self, url):
 221         slug = self._match_id(url)
 222         url, smuggled_data = unsmuggle_url(url, {})
 223         if smuggled_data.get('id'):
 224             return {
 225                 'id': smuggled_data['id'],
 226                 'display_id': slug,
 227                 'title': '',
 228                 **self._extract_formats(smuggled_data['id'], slug),
 229             }
 230
 231         metadata = self._call_api(
 232             f'https://content.api.nebula.app/content/videos/{slug}',
 233             slug, note='Fetching video metadata')
 234         return {
 235             **self._extract_video_metadata(metadata),
 236             **self._extract_formats(metadata['id'], slug),
 237         }
 238
 239
 240 class NebulaClassIE(NebulaBaseIE):
 241     IE_NAME = 'nebula:media'
 242     _VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P<id>[\w-]+)/(?P<ep>[\w-]+)/?(?:$|[?#])'
 243     _TESTS = [{
 244         'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
 245         'info_dict': {
 246             'id': 'd7432cdc-c608-474d-942c-f74345daed7b',
 247             'ext': 'mp4',
 248             'display_id': '14',
 249             'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit',
 250             'episode_number': 14,
 251             'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9',
 252             'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit',
 253             'duration': 646,
 254             'episode': 'Episode 14',
 255             'title': 'Photos, Sculpture, and Video',
 256         },
 257         'params': {'skip_download': 'm3u8'},
 258     }, {
 259         'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town',
 260         'info_dict': {
 261             'ext': 'mp3',
 262             'id': '018f65f0-0033-4021-8f87-2d132beb19aa',
 263             'description': 'md5:05d2b23ab780c955e2511a2b9127acff',
 264             'series_id': '335e8159-d663-491a-888f-1732285706ac',
 265             'modified_timestamp': 1599091504,
 266             'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa',
 267             'series': 'Extremities',
 268             'modified_date': '20200903',
 269             'upload_date': '20200902',
 270             'title': 'Pyramiden: The High-Arctic Soviet Ghost Town',
 271             'release_timestamp': 1571237958,
 272             'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
 273             'duration': 1546.05714,
 274             'timestamp': 1599085608,
 275             'release_date': '20191016',
 276         },
 277     }, {
 278         'url': 'https://nebula.tv/thelayover/the-layover-episode-1',
 279         'info_dict': {
 280             'ext': 'mp3',
 281             'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
 282             'episode_number': 1,
 283             'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
 284             'release_date': '20230304',
 285             'modified_date': '20230403',
 286             'series': 'The Layover',
 287             'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
 288             'modified_timestamp': 1680554566,
 289             'duration': 3130.46401,
 290             'release_timestamp': 1677943800,
 291             'title': 'The Layover — Episode 1',
 292             'series_id': '874303a5-4900-4626-a4b6-2aacac34466a',
 293             'upload_date': '20230303',
 294             'episode': 'Episode 1',
 295             'timestamp': 1677883672,
 296             'description': 'md5:002cca89258e3bc7c268d5b8c24ba482',
 297         },
 298     }]
 299
 300     def _real_extract(self, url):
 301         slug, episode = self._match_valid_url(url).group('id', 'ep')
 302         url, smuggled_data = unsmuggle_url(url, {})
 303         if smuggled_data.get('id'):
 304             return {
 305                 'id': smuggled_data['id'],
 306                 'display_id': slug,
 307                 'title': '',
 308                 **self._extract_formats(smuggled_data['id'], slug),
 309             }
 310
 311         metadata = self._call_api(
 312             f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
 313             slug, note='Fetching class/podcast metadata')
 314         content_type = metadata.get('type')
 315         if content_type == 'lesson':
 316             return {
 317                 **self._extract_video_metadata(metadata),
 318                 **self._extract_formats(metadata['id'], slug),
 319             }
 320         elif content_type == 'podcast_episode':
 321             episode_url = metadata['episode_url']
 322             if not episode_url and metadata.get('premium'):
 323                 self.raise_login_required()
 324
 325             if Art19IE.suitable(episode_url):
 326                 return self.url_result(episode_url, Art19IE)
 327             return traverse_obj(metadata, {
 328                 'id': ('id', {str}),
 329                 'url': ('episode_url', {url_or_none}),
 330                 'title': ('title', {str}),
 331                 'description': ('description', {str}),
 332                 'timestamp': ('published_at', {parse_iso8601}),
 333                 'duration': ('duration', {int_or_none}),
 334                 'channel_id': ('channel_id', {str}),
 335                 'chnanel': ('channel_title', {str}),
 336                 'thumbnail': ('assets', 'regular', {url_or_none}),
 337             })
 338
 339         raise ExtractorError(f'Unexpected content type {content_type!r}')
 340
 341
 342 class NebulaSubscriptionsIE(NebulaBaseIE):
 343     IE_NAME = 'nebula:subscriptions'
 344     _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)/?(?:$|[?#])'
 345     _TESTS = [{
 346         'url': 'https://nebula.tv/myshows',
 347         'playlist_mincount': 1,
 348         'info_dict': {
 349             'id': 'myshows',
 350         },
 351     }]
 352
 353     def _generate_playlist_entries(self):
 354         next_url = update_url_query('https://content.api.nebula.app/video_episodes/', {
 355             'following': 'true',
 356             'include': 'engagement',
 357             'ordering': '-published_at',
 358         })
 359         for page_num in itertools.count(1):
 360             channel = self._call_api(
 361                 next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}')
 362             for episode in channel['results']:
 363                 metadata = self._extract_video_metadata(episode)
 364                 yield self.url_result(smuggle_url(
 365                     f'https://nebula.tv/videos/{metadata["display_id"]}',
 366                     {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
 367             next_url = channel.get('next')
 368             if not next_url:
 369                 return
 370
 371     def _real_extract(self, url):
 372         return self.playlist_result(self._generate_playlist_entries(), 'myshows')
 373
 374
 375 class NebulaChannelIE(NebulaBaseIE):
 376     IE_NAME = 'nebula:channel'
 377     _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P<id>[\w-]+)/?(?:$|[?#])'
 378     _TESTS = [{
 379         'url': 'https://nebula.tv/tom-scott-presents-money',
 380         'info_dict': {
 381             'id': 'tom-scott-presents-money',
 382             'title': 'Tom Scott Presents: Money',
 383             'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
 384         },
 385         'playlist_count': 5,
 386     }, {
 387         'url': 'https://nebula.tv/lindsayellis',
 388         'info_dict': {
 389             'id': 'lindsayellis',
 390             'title': 'Lindsay Ellis',
 391             'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
 392         },
 393         'playlist_mincount': 2,
 394     }, {
 395         'url': 'https://nebula.tv/johnnyharris',
 396         'info_dict': {
 397             'id': 'johnnyharris',
 398             'title': 'Johnny Harris',
 399             'description': 'I make videos about maps and many other things.',
 400         },
 401         'playlist_mincount': 90,
 402     }, {
 403         'url': 'https://nebula.tv/copyright-for-fun-and-profit',
 404         'info_dict': {
 405             'id': 'copyright-for-fun-and-profit',
 406             'title': 'Copyright for Fun and Profit',
 407             'description': 'md5:6690248223eed044a9f11cd5a24f9742',
 408         },
 409         'playlist_count': 23,
 410     }, {
 411         'url': 'https://nebula.tv/trussissuespodcast',
 412         'info_dict': {
 413             'id': 'trussissuespodcast',
 414             'title': 'The TLDR News Podcast',
 415             'description': 'md5:a08c4483bc0b705881d3e0199e721385',
 416         },
 417         'playlist_mincount': 80,
 418     }]
 419
 420     def _generate_playlist_entries(self, collection_id, collection_slug):
 421         next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at'
 422         for page_num in itertools.count(1):
 423             episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}')
 424             for episode in episodes['results']:
 425                 metadata = self._extract_video_metadata(episode)
 426                 yield self.url_result(smuggle_url(
 427                     episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}',
 428                     {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
 429             next_url = episodes.get('next')
 430             if not next_url:
 431                 break
 432
 433     def _generate_class_entries(self, channel):
 434         for lesson in channel['lessons']:
 435             metadata = self._extract_video_metadata(lesson)
 436             yield self.url_result(smuggle_url(
 437                 lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
 438                 {'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
 439
 440     def _generate_podcast_entries(self, collection_id, collection_slug):
 441         next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true'
 442         for page_num in itertools.count(1):
 443             episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}')
 444
 445             for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))):
 446                 yield self.url_result(episode['share_url'], NebulaClassIE)
 447             next_url = episodes.get('next')
 448             if not next_url:
 449                 break
 450
 451     def _real_extract(self, url):
 452         collection_slug = self._match_id(url)
 453         channel = self._call_api(
 454             f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons',
 455             collection_slug, note='Retrieving channel')
 456
 457         if channel.get('type') == 'class':
 458             entries = self._generate_class_entries(channel)
 459         elif channel.get('type') == 'podcast_channel':
 460             entries = self._generate_podcast_entries(channel['id'], collection_slug)
 461         else:
 462             entries = self._generate_playlist_entries(channel['id'], collection_slug)
 463
 464         return self.playlist_result(
 465             entries=entries,
 466             playlist_id=collection_slug,
 467             playlist_title=channel.get('title'),
 468             playlist_description=channel.get('description'))