yt_dlp/extractor/wistia.py

   1 import base64
   2 import re
   3 import urllib.parse
   4
   5 from .common import InfoExtractor
   6 from ..networking import HEADRequest
   7 from ..networking.exceptions import HTTPError
   8 from ..utils import (
   9     ExtractorError,
  10     determine_ext,
  11     float_or_none,
  12     int_or_none,
  13     parse_qs,
  14     traverse_obj,
  15     try_get,
  16     update_url_query,
  17     urlhandle_detect_ext,
  18 )
  19
  20
  21 class WistiaBaseIE(InfoExtractor):
  22     _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})'
  23     _VALID_URL_BASE = r'https?://(?:\w+\.)?wistia\.(?:net|com)/(?:embed/)?'
  24     _EMBED_BASE_URL = 'http://fast.wistia.net/embed/'
  25
  26     def _download_embed_config(self, config_type, config_id, referer):
  27         base_url = self._EMBED_BASE_URL + f'{config_type}/{config_id}'
  28         embed_config = self._download_json(
  29             base_url + '.json', config_id, headers={
  30                 'Referer': referer if referer.startswith('http') else base_url,  # Some videos require this.
  31             })
  32
  33         error = traverse_obj(embed_config, 'error')
  34         if error:
  35             raise ExtractorError(
  36                 f'Error while getting the playlist: {error}', expected=True)
  37
  38         return embed_config
  39
  40     def _get_real_ext(self, url):
  41         ext = determine_ext(url, default_ext='bin')
  42         if ext == 'bin':
  43             urlh = self._request_webpage(
  44                 HEADRequest(url), None, note='Checking media extension',
  45                 errnote='HEAD request returned error', fatal=False)
  46             if urlh:
  47                 ext = urlhandle_detect_ext(urlh, default='bin')
  48         return 'mp4' if ext == 'mov' else ext
  49
  50     def _extract_media(self, embed_config):
  51         data = embed_config['media']
  52         video_id = data['hashedId']
  53         title = data['name']
  54
  55         formats = []
  56         thumbnails = []
  57         for a in data['assets']:
  58             aurl = a.get('url')
  59             if not aurl:
  60                 continue
  61             astatus = a.get('status')
  62             atype = a.get('type')
  63             if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
  64                 continue
  65             elif atype in ('still', 'still_image'):
  66                 thumbnails.append({
  67                     'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'),
  68                     'width': int_or_none(a.get('width')),
  69                     'height': int_or_none(a.get('height')),
  70                     'filesize': int_or_none(a.get('size')),
  71                 })
  72             else:
  73                 aext = a.get('ext') or self._get_real_ext(aurl)
  74                 display_name = a.get('display_name')
  75                 format_id = atype
  76                 if atype and atype.endswith('_video') and display_name:
  77                     format_id = f'{atype[:-6]}-{display_name}'
  78                 f = {
  79                     'format_id': format_id,
  80                     'url': aurl,
  81                     'tbr': int_or_none(a.get('bitrate')) or None,
  82                     'quality': 1 if atype == 'original' else None,
  83                 }
  84                 if display_name == 'Audio':
  85                     f.update({
  86                         'vcodec': 'none',
  87                     })
  88                 else:
  89                     f.update({
  90                         'width': int_or_none(a.get('width')),
  91                         'height': int_or_none(a.get('height')),
  92                         'vcodec': a.get('codec'),
  93                     })
  94                 if a.get('container') == 'm3u8' or aext == 'm3u8':
  95                     ts_f = f.copy()
  96                     ts_f.update({
  97                         'ext': 'ts',
  98                         'format_id': f['format_id'].replace('hls-', 'ts-'),
  99                         'url': f['url'].replace('.bin', '.ts'),
 100                     })
 101                     formats.append(ts_f)
 102                     f.update({
 103                         'ext': 'mp4',
 104                         'protocol': 'm3u8_native',
 105                     })
 106                 else:
 107                     f.update({
 108                         'container': a.get('container'),
 109                         'ext': aext,
 110                         'filesize': int_or_none(a.get('size')),
 111                     })
 112                 formats.append(f)
 113
 114         subtitles = {}
 115         for caption in data.get('captions', []):
 116             language = caption.get('language')
 117             if not language:
 118                 continue
 119             subtitles[language] = [{
 120                 'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language,
 121             }]
 122
 123         return {
 124             'id': video_id,
 125             'title': title,
 126             'description': data.get('seoDescription'),
 127             'formats': formats,
 128             'thumbnails': thumbnails,
 129             'duration': float_or_none(data.get('duration')),
 130             'timestamp': int_or_none(data.get('createdAt')),
 131             'subtitles': subtitles,
 132         }
 133
 134     @classmethod
 135     def _extract_from_webpage(cls, url, webpage):
 136         from .teachable import TeachableIE
 137
 138         if list(TeachableIE._extract_embed_urls(url, webpage)):
 139             return
 140
 141         yield from super()._extract_from_webpage(url, webpage)
 142
 143     @classmethod
 144     def _extract_wistia_async_embed(cls, webpage):
 145         # https://wistia.com/support/embed-and-share/video-on-your-website
 146         # https://wistia.com/support/embed-and-share/channel-embeds
 147         yield from re.finditer(
 148             r'''(?sx)
 149                 <(?:div|section)[^>]+class=([\"'])(?:(?!\1).)*?(?P<type>wistia[a-z_0-9]+)\s*\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
 150             ''', webpage)
 151
 152     @classmethod
 153     def _extract_url_media_id(cls, url):
 154         mobj = re.search(r'(?:wmediaid|wvideo(?:id)?)]?=(?P<id>[a-z0-9]{10})', urllib.parse.unquote_plus(url))
 155         if mobj:
 156             return mobj.group('id')
 157
 158
 159 class WistiaIE(WistiaBaseIE):
 160     _VALID_URL = rf'(?:wistia:|{WistiaBaseIE._VALID_URL_BASE}(?:iframe|medias)/){WistiaBaseIE._VALID_ID_REGEX}'
 161     _EMBED_REGEX = [
 162         r'''(?x)
 163             <(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\']
 164             (?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})
 165             ''']
 166     _TESTS = [{
 167         # with hls video
 168         'url': 'wistia:807fafadvk',
 169         'md5': 'daff0f3687a41d9a71b40e0e8c2610fe',
 170         'info_dict': {
 171             'id': '807fafadvk',
 172             'ext': 'mp4',
 173             'title': 'Drip Brennan Dunn Workshop',
 174             'description': 'a JV Webinars video',
 175             'upload_date': '20160518',
 176             'timestamp': 1463607249,
 177             'duration': 4987.11,
 178         },
 179         'skip': 'video unavailable',
 180     }, {
 181         'url': 'wistia:a6ndpko1wg',
 182         'md5': '10c1ce9c4dde638202513ed17a3767bd',
 183         'info_dict': {
 184             'id': 'a6ndpko1wg',
 185             'ext': 'mp4',
 186             'title': 'Episode 2: Boxed Water\'s retention is thirsty',
 187             'upload_date': '20210324',
 188             'description': 'md5:da5994c2c2d254833b412469d9666b7a',
 189             'duration': 966.0,
 190             'timestamp': 1616614369,
 191             'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png',
 192         },
 193     }, {
 194         'url': 'wistia:5vd7p4bct5',
 195         'md5': 'b9676d24bf30945d97060638fbfe77f0',
 196         'info_dict': {
 197             'id': '5vd7p4bct5',
 198             'ext': 'mp4',
 199             'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679',
 200             'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f',
 201             'upload_date': '20220915',
 202             'timestamp': 1663258727,
 203             'duration': 623.019,
 204             'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$',
 205         },
 206     }, {
 207         'url': 'wistia:sh7fpupwlt',
 208         'only_matching': True,
 209     }, {
 210         'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
 211         'only_matching': True,
 212     }, {
 213         'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
 214         'only_matching': True,
 215     }, {
 216         'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
 217         'only_matching': True,
 218     }]
 219
 220     _WEBPAGE_TESTS = [{
 221         'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool',
 222         'info_dict': {
 223             'id': 'cqwukac3z1',
 224             'ext': 'mp4',
 225             'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content',
 226             'duration': 158.125,
 227             'timestamp': 1618974400,
 228             'description': 'md5:27abc99a758573560be72600ef95cece',
 229             'upload_date': '20210421',
 230             'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg',
 231         },
 232     }, {
 233         'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
 234         'md5': 'b9676d24bf30945d97060638fbfe77f0',
 235         'info_dict': {
 236             'id': '5vd7p4bct5',
 237             'ext': 'mp4',
 238             'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
 239             'upload_date': '20220915',
 240             'timestamp': 1663258727,
 241             'duration': 623.019,
 242             'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg',
 243             'description': 'a Paywall Videos video',
 244         },
 245     }]
 246
 247     def _real_extract(self, url):
 248         video_id = self._match_id(url)
 249         embed_config = self._download_embed_config('medias', video_id, url)
 250         return self._extract_media(embed_config)
 251
 252     @classmethod
 253     def _extract_embed_urls(cls, url, webpage):
 254         urls = list(super()._extract_embed_urls(url, webpage))
 255         for match in cls._extract_wistia_async_embed(webpage):
 256             if match.group('type') != 'wistia_channel':
 257                 urls.append('wistia:{}'.format(match.group('id')))
 258         for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})',
 259                                  webpage):
 260             urls.append('wistia:{}'.format(match.group('id')))
 261         if not WistiaChannelIE._extract_embed_urls(url, webpage):  # Fallback
 262             media_id = cls._extract_url_media_id(url)
 263             if media_id:
 264                 urls.append('wistia:{}'.format(match.group('id')))
 265         return urls
 266
 267
 268 class WistiaPlaylistIE(WistiaBaseIE):
 269     _VALID_URL = rf'{WistiaBaseIE._VALID_URL_BASE}playlists/{WistiaBaseIE._VALID_ID_REGEX}'
 270
 271     _TEST = {
 272         'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc',
 273         'info_dict': {
 274             'id': 'aodt9etokc',
 275         },
 276         'playlist_count': 3,
 277     }
 278
 279     def _real_extract(self, url):
 280         playlist_id = self._match_id(url)
 281         playlist = self._download_embed_config('playlists', playlist_id, url)
 282
 283         entries = []
 284         for media in (try_get(playlist, lambda x: x[0]['medias']) or []):
 285             embed_config = media.get('embed_config')
 286             if not embed_config:
 287                 continue
 288             entries.append(self._extract_media(embed_config))
 289
 290         return self.playlist_result(entries, playlist_id)
 291
 292
 293 class WistiaChannelIE(WistiaBaseIE):
 294     _VALID_URL = rf'(?:wistiachannel:|{WistiaBaseIE._VALID_URL_BASE}channel/){WistiaBaseIE._VALID_ID_REGEX}'
 295
 296     _TESTS = [{
 297         # JSON Embed API returns 403, should fall back to webpage
 298         'url': 'https://fast.wistia.net/embed/channel/yvyvu7wjbg?wchannelid=yvyvu7wjbg',
 299         'info_dict': {
 300             'id': 'yvyvu7wjbg',
 301             'title': 'Copysmith Tutorials and Education!',
 302             'description': 'Learn all things Copysmith via short and informative videos!',
 303         },
 304         'playlist_mincount': 7,
 305         'expected_warnings': ['falling back to webpage'],
 306     }, {
 307         'url': 'https://fast.wistia.net/embed/channel/3802iirk0l',
 308         'info_dict': {
 309             'id': '3802iirk0l',
 310             'title': 'The Roof',
 311         },
 312         'playlist_mincount': 20,
 313     }, {
 314         # link to popup video, follow --no-playlist
 315         'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n',
 316         'info_dict': {
 317             'id': 'sp5dqjzw3n',
 318             'ext': 'mp4',
 319             'title': 'The Roof S2: The Modern CRO',
 320             'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png',
 321             'duration': 86.487,
 322             'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n',
 323             'timestamp': 1619790290,
 324             'upload_date': '20210430',
 325         },
 326         'params': {'noplaylist': True, 'skip_download': True},
 327     }]
 328     _WEBPAGE_TESTS = [{
 329         'url': 'https://www.profitwell.com/recur/boxed-out',
 330         'info_dict': {
 331             'id': '6jyvmqz6zs',
 332             'title': 'Boxed Out',
 333             'description': 'md5:14a8a93a1dbe236718e6a59f8c8c7bae',
 334         },
 335         'playlist_mincount': 30,
 336     }, {
 337         # section instead of div
 338         'url': 'https://360learning.com/studio/onboarding-joei/',
 339         'info_dict': {
 340             'id': 'z874k93n2o',
 341             'title': 'Onboarding Joei.',
 342             'description': 'Coming to you weekly starting Feb 19th.',
 343         },
 344         'playlist_mincount': 20,
 345     }, {
 346         'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&amp%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt',
 347         'info_dict': {
 348             'id': 'pz0m0l0if3',
 349             'title': 'A Framework for Improving Product Team Performance',
 350             'ext': 'mp4',
 351             'timestamp': 1653935275,
 352             'upload_date': '20220530',
 353             'description': 'Learn how to help your company improve and achieve your product related goals.',
 354             'duration': 1854.39,
 355             'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png',
 356         },
 357         'params': {'noplaylist': True, 'skip_download': True},
 358     }]
 359
 360     def _real_extract(self, url):
 361         channel_id = self._match_id(url)
 362         media_id = self._extract_url_media_id(url)
 363         if not self._yes_playlist(channel_id, media_id, playlist_label='channel'):
 364             return self.url_result(f'wistia:{media_id}', 'Wistia')
 365
 366         try:
 367             data = self._download_embed_config('channel', channel_id, url)
 368         except (ExtractorError, HTTPError):
 369             # Some channels give a 403 from the JSON API
 370             self.report_warning('Failed to download channel data from API, falling back to webpage.')
 371             webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id)
 372             data = self._parse_json(
 373                 self._search_regex(rf'wchanneljsonp-{channel_id}\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)', webpage, 'jsonp', channel_id),
 374                 channel_id, transform_source=lambda x: urllib.parse.unquote_plus(base64.b64decode(x).decode('utf-8')))
 375
 376         # XXX: can there be more than one series?
 377         series = traverse_obj(data, ('series', 0), default={})
 378
 379         entries = [
 380             self.url_result(f'wistia:{video["hashedId"]}', WistiaIE, title=video.get('name'))
 381             for video in traverse_obj(series, ('sections', ..., 'videos', ...)) or []
 382             if video.get('hashedId')
 383         ]
 384
 385         return self.playlist_result(
 386             entries, channel_id, playlist_title=series.get('title'), playlist_description=series.get('description'))
 387
 388     @classmethod
 389     def _extract_embed_urls(cls, url, webpage):
 390         yield from super()._extract_embed_urls(url, webpage)
 391         for match in cls._extract_wistia_async_embed(webpage):
 392             if match.group('type') == 'wistia_channel':
 393                 # original url may contain wmediaid query param
 394                 yield update_url_query(f'wistiachannel:{match.group("id")}', parse_qs(url))