yt_dlp/extractor/rudovideo.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     ExtractorError,
   4     determine_ext,
   5     js_to_json,
   6     traverse_obj,
   7     update_url_query,
   8     url_or_none,
   9 )
  10
  11
  12 class RudoVideoIE(InfoExtractor):
  13     _VALID_URL = r'https?://rudo\.video/(?P<type>vod|podcast|live)/(?P<id>[^/?&#]+)'
  14     _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)//rudo\.video/(?:vod|podcast|live)/[^\'"]+)']
  15     _TESTS = [{
  16         'url': 'https://rudo.video/podcast/cz2wrUy8l0o',
  17         'md5': '28ed82b477708dc5e12e072da2449221',
  18         'info_dict': {
  19             'id': 'cz2wrUy8l0o',
  20             'title': 'Diego Cabot',
  21             'ext': 'mp4',
  22             'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
  23         },
  24     }, {
  25         'url': 'https://rudo.video/podcast/bQkt07',
  26         'md5': '36b22a9863de0f47f00fc7532a32a898',
  27         'info_dict': {
  28             'id': 'bQkt07',
  29             'title': 'Tubular Bells',
  30             'ext': 'mp4',
  31             'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
  32         },
  33     }, {
  34         'url': 'https://rudo.video/podcast/b42ZUznHX0',
  35         'md5': 'b91c70d832938871367f8ad10c895821',
  36         'info_dict': {
  37             'id': 'b42ZUznHX0',
  38             'title': 'Columna Ruperto Concha',
  39             'ext': 'mp3',
  40             'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
  41         },
  42     }, {
  43         'url': 'https://rudo.video/vod/bN5AaJ',
  44         'md5': '01324a329227e2591530ecb4f555c881',
  45         'info_dict': {
  46             'id': 'bN5AaJ',
  47             'title': 'Ucrania 19.03',
  48             'creator': 'La Tercera',
  49             'ext': 'mp4',
  50             'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
  51         },
  52     }, {
  53         'url': 'https://rudo.video/live/bbtv',
  54         'info_dict': {
  55             'id': 'bbtv',
  56             'ext': 'mp4',
  57             'creator': 'BioBioTV',
  58             'live_status': 'is_live',
  59             'title': r're:^LIVE BBTV\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}$',
  60             'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
  61         },
  62     }, {
  63         'url': 'https://rudo.video/live/c13',
  64         'info_dict': {
  65             'id': 'c13',
  66             'title': 'CANAL13',
  67             'ext': 'mp4',
  68         },
  69         'skip': 'Geo-restricted to Chile',
  70     }, {
  71         'url': 'https://rudo.video/live/t13-13cl',
  72         'info_dict': {
  73             'id': 't13-13cl',
  74             'title': 'T13',
  75             'ext': 'mp4',
  76         },
  77         'skip': 'Geo-restricted to Chile',
  78     }]
  79
  80     def _real_extract(self, url):
  81         video_id, type_ = self._match_valid_url(url).group('id', 'type')
  82         is_live = type_ == 'live'
  83
  84         webpage = self._download_webpage(url, video_id)
  85         if 'Streaming is not available in your area' in webpage:
  86             self.raise_geo_restricted()
  87
  88         media_url = (
  89             self._search_regex(
  90                 r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None)
  91             # Source URL must be used only if streamURL is unavailable
  92             or self._search_regex(
  93                 r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'source url', default=None))
  94         if not media_url:
  95             youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)',
  96                                              webpage, 'youtube url', default=None)
  97             if youtube_url:
  98                 return self.url_result(youtube_url, 'Youtube')
  99             raise ExtractorError('Unable to extract stream url')
 100
 101         token_array = self._search_json(
 102             r'<script>var\s+_\$_[a-zA-Z0-9]+\s*=', webpage, 'access token array', video_id,
 103             contains_pattern=r'\[(?s:.+)\]', default=None, transform_source=js_to_json)
 104         if token_array:
 105             token_url = traverse_obj(token_array, (..., {url_or_none}), get_all=False)
 106             if not token_url:
 107                 raise ExtractorError('Invalid access token array')
 108             access_token = self._download_json(
 109                 token_url, video_id, note='Downloading access token')['data']['authToken']
 110             media_url = update_url_query(media_url, {'auth-token': access_token})
 111
 112         ext = determine_ext(media_url)
 113         if ext == 'm3u8':
 114             formats = self._extract_m3u8_formats(media_url, video_id, live=is_live)
 115         elif ext == 'mp3':
 116             formats = [{
 117                 'url': media_url,
 118                 'vcodec': 'none',
 119             }]
 120         else:
 121             formats = [{'url': media_url}]
 122
 123         return {
 124             'id': video_id,
 125             'title': (self._search_regex(r'var\s+titleVideo\s*=\s*[\'"]([^\'"]+)',
 126                                          webpage, 'title', default=None)
 127                       or self._og_search_title(webpage)),
 128             'creator': self._search_regex(r'var\s+videoAuthor\s*=\s*[\'"]([^?\'"]+)',
 129                                           webpage, 'videoAuthor', default=None),
 130             'thumbnail': (self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)',
 131                                              webpage, 'thumbnail', default=None)
 132                           or self._og_search_thumbnail(webpage)),
 133             'formats': formats,
 134             'is_live': is_live,
 135         }