yt_dlp/extractor/sproutvideo.py

   1 import base64
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..networking.exceptions import HTTPError
   6 from ..utils import (
   7     ExtractorError,
   8     int_or_none,
   9     qualities,
  10     remove_start,
  11     smuggle_url,
  12     unsmuggle_url,
  13     update_url_query,
  14     url_or_none,
  15     urlencode_postdata,
  16 )
  17 from ..utils.traversal import traverse_obj
  18
  19
  20 class SproutVideoIE(InfoExtractor):
  21     _NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P<id>[\da-f]+)/[\da-f]+'
  22     _VALID_URL = rf'https?:{_NO_SCHEME_RE}'
  23     _EMBED_REGEX = [rf'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?{_NO_SCHEME_RE}[^"\']*)["\']']
  24     _TESTS = [{
  25         'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
  26         'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
  27         'info_dict': {
  28             'id': '4c9dddb01910e3c9c4',
  29             'ext': 'mp4',
  30             'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
  31             'duration': 576,
  32             'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  33         },
  34     }, {
  35         'url': 'https://videos.sproutvideo.com/embed/a79fdcb21f1be2c62e/93bf31e41e39ca27',
  36         'md5': 'cebae5cf558cca83271917cf4ec03f26',
  37         'info_dict': {
  38             'id': 'a79fdcb21f1be2c62e',
  39             'ext': 'mp4',
  40             'title': 'HS_01_Live Stream 2023-01-14 10:00',
  41             'duration': 703,
  42             'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  43         },
  44     }, {
  45         # http formats 'sd' and 'hd' are available
  46         'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90',
  47         'md5': 'f368c78df07e78a749508b221528672c',
  48         'info_dict': {
  49             'id': '119cd6bc1a18e6cd98',
  50             'ext': 'mp4',
  51             'title': '3. Updating your Partner details',
  52             'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  53             'duration': 60,
  54         },
  55         'params': {'format': 'hd'},
  56     }, {
  57         # subtitles
  58         'url': 'https://videos.sproutvideo.com/embed/119dd8ba121ee0cc98/4ee50c88a343215d?type=hd',
  59         'md5': '7f6798f037d7a3e3e07e67959de68fc6',
  60         'info_dict': {
  61             'id': '119dd8ba121ee0cc98',
  62             'ext': 'mp4',
  63             'title': 'Recipients Setup - Domestic Wire Only',
  64             'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  65             'duration': 77,
  66             'subtitles': {'en': 'count:1'},
  67         },
  68     }]
  69     _WEBPAGE_TESTS = [{
  70         'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
  71         'info_dict': {
  72             'id': '4c9dddb01910e3c9c4',
  73             'ext': 'mp4',
  74             'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
  75             'duration': 576,
  76             'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  77         },
  78     }]
  79     _M3U8_URL_TMPL = 'https://{base}.videos.sproutvideo.com/{s3_user_hash}/{s3_video_hash}/video/index.m3u8'
  80     _QUALITIES = ('hd', 'uhd', 'source')  # Exclude 'sd' to prioritize hls formats above it
  81
  82     @staticmethod
  83     def _policy_to_qs(policy, signature_key, as_string=False):
  84         query = {}
  85         for key, value in policy['signatures'][signature_key].items():
  86             query[remove_start(key, 'CloudFront-')] = value
  87         query['sessionID'] = policy['sessionID']
  88         return urllib.parse.urlencode(query, doseq=True) if as_string else query
  89
  90     @classmethod
  91     def _extract_embed_urls(cls, url, webpage):
  92         for embed_url in super()._extract_embed_urls(url, webpage):
  93             if embed_url.startswith('//'):
  94                 embed_url = f'https:{embed_url}'
  95             yield smuggle_url(embed_url, {'referer': url})
  96
  97     def _real_extract(self, url):
  98         url, smuggled_data = unsmuggle_url(url, {})
  99         video_id = self._match_id(url)
 100         webpage = self._download_webpage(
 101             url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
 102         data = self._search_json(
 103             r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, contains_pattern=r'[A-Za-z0-9+/=]+',
 104             end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode())
 105
 106         formats, subtitles = [], {}
 107         headers = {
 108             'Accept': '*/*',
 109             'Origin': 'https://videos.sproutvideo.com',
 110             'Referer': url,
 111         }
 112
 113         # HLS extraction is fatal; only attempt it if the JSON data says it's available
 114         if traverse_obj(data, 'hls'):
 115             manifest_query = self._policy_to_qs(data, 'm')
 116             fragment_query = self._policy_to_qs(data, 't', as_string=True)
 117             key_query = self._policy_to_qs(data, 'k', as_string=True)
 118
 119             formats.extend(self._extract_m3u8_formats(
 120                 self._M3U8_URL_TMPL.format(**data), video_id, 'mp4',
 121                 m3u8_id='hls', headers=headers, query=manifest_query))
 122             for fmt in formats:
 123                 fmt.update({
 124                     'url': update_url_query(fmt['url'], manifest_query),
 125                     'extra_param_to_segment_url': fragment_query,
 126                     'extra_param_to_key_url': key_query,
 127                 })
 128
 129         if downloads := traverse_obj(data, ('downloads', {dict.items}, lambda _, v: url_or_none(v[1]))):
 130             quality = qualities(self._QUALITIES)
 131             acodec = 'none' if data.get('has_audio') is False else None
 132             formats.extend([{
 133                 'format_id': str(format_id),
 134                 'url': format_url,
 135                 'ext': 'mp4',
 136                 'quality': quality(format_id),
 137                 'acodec': acodec,
 138             } for format_id, format_url in downloads])
 139
 140         for sub_data in traverse_obj(data, ('subtitleData', lambda _, v: url_or_none(v['src']))):
 141             subtitles.setdefault(sub_data.get('srclang', 'en'), []).append({
 142                 'url': sub_data['src'],
 143             })
 144
 145         return {
 146             'id': video_id,
 147             'formats': formats,
 148             'subtitles': subtitles,
 149             'http_headers': headers,
 150             **traverse_obj(data, {
 151                 'title': ('title', {str}),
 152                 'duration': ('duration', {int_or_none}),
 153                 'thumbnail': ('posterframe_url', {url_or_none}),
 154             }),
 155         }
 156
 157
 158 class VidsIoIE(InfoExtractor):
 159     IE_NAME = 'vids.io'
 160     _VALID_URL = r'https?://[\w-]+\.vids\.io/videos/(?P<id>[\da-f]+)/(?P<display_id>[\w-]+)'
 161     _TESTS = [{
 162         'url': 'https://how-to-video.vids.io/videos/799cd8b11c10efc1f0/how-to-video-live-streaming',
 163         'md5': '9bbbb2c0c0739eb163b80f87b8d77c9e',
 164         'info_dict': {
 165             'id': '799cd8b11c10efc1f0',
 166             'ext': 'mp4',
 167             'title': 'How to Video: Live Streaming',
 168             'duration': 2787,
 169             'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
 170         },
 171     }]
 172
 173     def _real_extract(self, url):
 174         video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
 175         webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=403)
 176
 177         if urlh.status == 403:
 178             password = self.get_param('videopassword')
 179             if not password:
 180                 raise ExtractorError(
 181                     'This video is password-protected; use the --video-password option', expected=True)
 182             try:
 183                 webpage = self._download_webpage(
 184                     url, display_id, 'Submitting video password',
 185                     data=urlencode_postdata({
 186                         'password': password,
 187                         **self._hidden_inputs(webpage),
 188                     }))
 189                 # Requests with user's session cookie `_sproutvideo_session` are now authorized
 190             except ExtractorError as e:
 191                 if isinstance(e.cause, HTTPError) and e.cause.status == 403:
 192                     raise ExtractorError('Incorrect password', expected=True)
 193                 raise
 194
 195         if embed_url := next(SproutVideoIE._extract_embed_urls(url, webpage), None):
 196             return self.url_result(embed_url, SproutVideoIE, video_id)
 197
 198         raise ExtractorError('Unable to extract any SproutVideo embed url')