yt_dlp/extractor/espn.py

   1 import base64
   2 import json
   3 import re
   4 import urllib.parse
   5
   6 from .adobepass import AdobePassIE
   7 from .common import InfoExtractor
   8 from .once import OnceIE
   9 from ..utils import (
  10     determine_ext,
  11     dict_get,
  12     int_or_none,
  13     traverse_obj,
  14     unified_strdate,
  15     unified_timestamp,
  16 )
  17
  18
  19 class ESPNIE(OnceIE):
  20     _VALID_URL = r'''(?x)
  21                     https?://
  22                         (?:
  23                             (?:
  24                                 (?:
  25                                     (?:(?:\w+\.)+)?espn\.go|
  26                                     (?:www\.)?espn
  27                                 )\.com/
  28                                 (?:
  29                                     (?:
  30                                         video/(?:clip|iframe/twitter)|
  31                                     )
  32                                     (?:
  33                                         .*?\?.*?\bid=|
  34                                         /_/id/
  35                                     )|
  36                                     [^/]+/video/
  37                                 )
  38                             )|
  39                             (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/
  40                         )
  41                         (?P<id>\d+)
  42                     '''
  43
  44     _TESTS = [{
  45         'url': 'http://espn.go.com/video/clip?id=10365079',
  46         'info_dict': {
  47             'id': '10365079',
  48             'ext': 'mp4',
  49             'title': '30 for 30 Shorts: Judging Jewell',
  50             'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f',
  51             'timestamp': 1390936111,
  52             'upload_date': '20140128',
  53             'duration': 1302,
  54             'thumbnail': r're:https://.+\.jpg',
  55         },
  56         'params': {
  57             'skip_download': True,
  58         },
  59     }, {
  60         'url': 'https://broadband.espn.go.com/video/clip?id=18910086',
  61         'info_dict': {
  62             'id': '18910086',
  63             'ext': 'mp4',
  64             'title': 'Kyrie spins around defender for two',
  65             'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b',
  66             'timestamp': 1489539155,
  67             'upload_date': '20170315',
  68         },
  69         'params': {
  70             'skip_download': True,
  71         },
  72         'expected_warnings': ['Unable to download f4m manifest'],
  73     }, {
  74         'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672',
  75         'only_matching': True,
  76     }, {
  77         'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774',
  78         'only_matching': True,
  79     }, {
  80         'url': 'http://www.espn.com/video/clip?id=10365079',
  81         'only_matching': True,
  82     }, {
  83         'url': 'http://www.espn.com/video/clip/_/id/17989860',
  84         'only_matching': True,
  85     }, {
  86         'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
  87         'only_matching': True,
  88     }, {
  89         'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls',
  90         'only_matching': True,
  91     }, {
  92         'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets',
  93         'only_matching': True,
  94     }, {
  95         'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings',
  96         'only_matching': True,
  97     }, {
  98         'url': 'http://www.espn.com/watch/player?id=19141491',
  99         'only_matching': True,
 100     }, {
 101         'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875',
 102         'only_matching': True,
 103     }]
 104
 105     def _real_extract(self, url):
 106         video_id = self._match_id(url)
 107
 108         clip = self._download_json(
 109             f'http://api-app.espn.com/v1/video/clips/{video_id}',
 110             video_id)['videos'][0]
 111
 112         title = clip['headline']
 113
 114         format_urls = set()
 115         formats = []
 116
 117         def traverse_source(source, base_source_id=None):
 118             for src_id, src_item in source.items():
 119                 if src_id == 'alert':
 120                     continue
 121                 elif isinstance(src_item, str):
 122                     extract_source(src_item, base_source_id)
 123                 elif isinstance(src_item, dict):
 124                     traverse_source(
 125                         src_item,
 126                         f'{base_source_id}-{src_id}'
 127                         if base_source_id else src_id)
 128
 129         def extract_source(source_url, source_id=None):
 130             if source_url in format_urls:
 131                 return
 132             format_urls.add(source_url)
 133             ext = determine_ext(source_url)
 134             if OnceIE.suitable(source_url):
 135                 formats.extend(self._extract_once_formats(source_url))
 136             elif ext == 'smil':
 137                 formats.extend(self._extract_smil_formats(
 138                     source_url, video_id, fatal=False))
 139             elif ext == 'f4m':
 140                 formats.extend(self._extract_f4m_formats(
 141                     source_url, video_id, f4m_id=source_id, fatal=False))
 142             elif ext == 'm3u8':
 143                 formats.extend(self._extract_m3u8_formats(
 144                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
 145                     m3u8_id=source_id, fatal=False))
 146             else:
 147                 f = {
 148                     'url': source_url,
 149                     'format_id': source_id,
 150                 }
 151                 mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url)
 152                 if mobj:
 153                     f.update({
 154                         'height': int(mobj.group(1)),
 155                         'fps': int(mobj.group(2)),
 156                         'tbr': int(mobj.group(3)),
 157                     })
 158                 if source_id == 'mezzanine':
 159                     f['quality'] = 1
 160                 formats.append(f)
 161
 162         links = clip.get('links', {})
 163         traverse_source(links.get('source', {}))
 164         traverse_source(links.get('mobile', {}))
 165
 166         description = clip.get('caption') or clip.get('description')
 167         thumbnail = clip.get('thumbnail')
 168         duration = int_or_none(clip.get('duration'))
 169         timestamp = unified_timestamp(clip.get('originalPublishDate'))
 170
 171         return {
 172             'id': video_id,
 173             'title': title,
 174             'description': description,
 175             'thumbnail': thumbnail,
 176             'timestamp': timestamp,
 177             'duration': duration,
 178             'formats': formats,
 179         }
 180
 181
 182 class ESPNArticleIE(InfoExtractor):
 183     _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)'
 184     _TESTS = [{
 185         'url': 'http://espn.go.com/nba/recap?gameId=400793786',
 186         'only_matching': True,
 187     }, {
 188         'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
 189         'only_matching': True,
 190     }, {
 191         'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
 192         'only_matching': True,
 193     }, {
 194         'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
 195         'only_matching': True,
 196     }]
 197
 198     @classmethod
 199     def suitable(cls, url):
 200         return False if (ESPNIE.suitable(url) or WatchESPNIE.suitable(url)) else super().suitable(url)
 201
 202     def _real_extract(self, url):
 203         video_id = self._match_id(url)
 204
 205         webpage = self._download_webpage(url, video_id)
 206
 207         video_id = self._search_regex(
 208             r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P<id>\d+)',
 209             webpage, 'video id', group='id')
 210
 211         return self.url_result(
 212             f'http://espn.go.com/video/clip?id={video_id}', ESPNIE.ie_key())
 213
 214
 215 class FiveThirtyEightIE(InfoExtractor):
 216     _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)'
 217     _TEST = {
 218         'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/',
 219         'info_dict': {
 220             'id': '56032156',
 221             'ext': 'flv',
 222             'title': 'FiveThirtyEight: The Raiders can still make the playoffs',
 223             'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.',
 224         },
 225         'params': {
 226             'skip_download': True,
 227         },
 228     }
 229
 230     def _real_extract(self, url):
 231         video_id = self._match_id(url)
 232
 233         webpage = self._download_webpage(url, video_id)
 234
 235         embed_url = self._search_regex(
 236             r'<iframe[^>]+src=["\'](https?://fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/\d+)',
 237             webpage, 'embed url')
 238
 239         return self.url_result(embed_url, 'AbcNewsVideo')
 240
 241
 242 class ESPNCricInfoIE(InfoExtractor):
 243     _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/(?:cricket-)?videos?/[^#$&?/]+-(?P<id>\d+)'
 244     _TESTS = [{
 245         'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135',
 246         'info_dict': {
 247             'id': '1289135',
 248             'ext': 'mp4',
 249             'title': 'Finch: Chasing comes with \'risks\' despite World Cup trend',
 250             'description': 'md5:ea32373303e25efbb146efdfc8a37829',
 251             'upload_date': '20211113',
 252             'duration': 96,
 253         },
 254         'params': {'skip_download': True},
 255     }, {
 256         'url': 'https://www.espncricinfo.com/cricket-videos/daryl-mitchell-mitchell-santner-is-one-of-the-best-white-ball-spinners-india-vs-new-zealand-1356225',
 257         'info_dict': {
 258             'id': '1356225',
 259             'ext': 'mp4',
 260             'description': '"Santner has done it for a long time for New Zealand - we\'re lucky to have him"',
 261             'upload_date': '20230128',
 262             'title': 'Mitchell: \'Santner is one of the best white-ball spinners at the moment\'',
 263             'duration': 87,
 264         },
 265         'params': {'skip_download': 'm3u8'},
 266     }]
 267
 268     def _real_extract(self, url):
 269         video_id = self._match_id(url)
 270         data_json = self._download_json(
 271             f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={video_id}', video_id)['video']
 272         formats, subtitles = [], {}
 273         for item in data_json.get('playbacks') or []:
 274             if item.get('type') == 'HLS' and item.get('url'):
 275                 m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], video_id)
 276                 formats.extend(m3u8_frmts)
 277                 subtitles = self._merge_subtitles(subtitles, m3u8_subs)
 278             elif item.get('type') == 'AUDIO' and item.get('url'):
 279                 formats.append({
 280                     'url': item['url'],
 281                     'vcodec': 'none',
 282                 })
 283         return {
 284             'id': video_id,
 285             'title': data_json.get('title'),
 286             'description': data_json.get('summary'),
 287             'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))),
 288             'duration': data_json.get('duration'),
 289             'formats': formats,
 290             'subtitles': subtitles,
 291         }
 292
 293
 294 class WatchESPNIE(AdobePassIE):
 295     _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'
 296     _TESTS = [{
 297         'url': 'https://www.espn.com/watch/player/_/id/11ce417a-6ac9-42b6-8a15-46aeb9ad5710',
 298         'info_dict': {
 299             'id': '11ce417a-6ac9-42b6-8a15-46aeb9ad5710',
 300             'ext': 'mp4',
 301             'title': 'Abilene Chrstn vs. Texas Tech',
 302             'duration': 14166,
 303             'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/11ce417a-6ac9-42b6-8a15-46aeb9ad5710/16x9.jpg?timestamp=202407252343&showBadge=true&cb=12&package=ESPN_PLUS',
 304         },
 305         'params': {
 306             'skip_download': True,
 307         },
 308     }, {
 309         'url': 'https://www.espn.com/watch/player/_/id/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3',
 310         'info_dict': {
 311             'id': '90a2c85d-75e0-4b1e-a878-8e428a3cb2f3',
 312             'ext': 'mp4',
 313             'title': 'UC Davis vs. California',
 314             'duration': 9547,
 315             'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
 316         },
 317         'params': {
 318             'skip_download': True,
 319         },
 320     }, {
 321         'url': 'https://www.espn.com/watch/player/_/id/c4313bbe-95b5-4bb8-b251-ac143ea0fc54',
 322         'info_dict': {
 323             'id': 'c4313bbe-95b5-4bb8-b251-ac143ea0fc54',
 324             'ext': 'mp4',
 325             'title': 'The College Football Show',
 326             'duration': 3639,
 327             'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/c4313bbe-95b5-4bb8-b251-ac143ea0fc54/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
 328         },
 329         'params': {
 330             'skip_download': True,
 331         },
 332     }]
 333
 334     _API_KEY = 'ZXNwbiZicm93c2VyJjEuMC4w.ptUt7QxsteaRruuPmGZFaJByOoqKvDP2a5YkInHrc7c'
 335
 336     def _call_bamgrid_api(self, path, video_id, payload=None, headers={}):
 337         if 'Authorization' not in headers:
 338             headers['Authorization'] = f'Bearer {self._API_KEY}'
 339         parse = urllib.parse.urlencode if path == 'token' else json.dumps
 340         return self._download_json(
 341             f'https://espn.api.edge.bamgrid.com/{path}', video_id, headers=headers, data=parse(payload).encode())
 342
 343     def _real_extract(self, url):
 344         video_id = self._match_id(url)
 345         cdn_data = self._download_json(
 346             f'https://watch-cdn.product.api.espn.com/api/product/v3/watchespn/web/playback/event?id={video_id}',
 347             video_id)
 348         video_data = cdn_data['playbackState']
 349
 350         # ESPN+ subscription required, through cookies
 351         if 'DTC' in video_data.get('sourceId'):
 352             cookie = self._get_cookies(url).get('ESPN-ONESITE.WEB-PROD.token')
 353             if not cookie:
 354                 self.raise_login_required(method='cookies')
 355
 356             jwt = self._search_regex(r'=([^|]+)\|', cookie.value, 'cookie jwt')
 357             id_token = self._download_json(
 358                 'https://registerdisney.go.com/jgc/v6/client/ESPN-ONESITE.WEB-PROD/guest/refresh-auth',
 359                 None, 'Refreshing token', headers={'Content-Type': 'application/json'}, data=json.dumps({
 360                     'refreshToken': json.loads(base64.urlsafe_b64decode(f'{jwt}==='))['refresh_token'],
 361                 }).encode())['data']['token']['id_token']
 362
 363             assertion = self._call_bamgrid_api(
 364                 'devices', video_id,
 365                 headers={'Content-Type': 'application/json; charset=UTF-8'},
 366                 payload={
 367                     'deviceFamily': 'android',
 368                     'applicationRuntime': 'android',
 369                     'deviceProfile': 'tv',
 370                     'attributes': {},
 371                 })['assertion']
 372             token = self._call_bamgrid_api(
 373                 'token', video_id, payload={
 374                     'subject_token': assertion,
 375                     'subject_token_type': 'urn:bamtech:params:oauth:token-type:device',
 376                     'platform': 'android',
 377                     'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange',
 378                 })['access_token']
 379
 380             assertion = self._call_bamgrid_api(
 381                 'accounts/grant', video_id, payload={'id_token': id_token},
 382                 headers={
 383                     'Authorization': token,
 384                     'Content-Type': 'application/json; charset=UTF-8',
 385                 })['assertion']
 386             token = self._call_bamgrid_api(
 387                 'token', video_id, payload={
 388                     'subject_token': assertion,
 389                     'subject_token_type': 'urn:bamtech:params:oauth:token-type:account',
 390                     'platform': 'android',
 391                     'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange',
 392                 })['access_token']
 393
 394             playback = self._download_json(
 395                 video_data['videoHref'].format(scenario='browser~ssai'), video_id,
 396                 headers={
 397                     'Accept': 'application/vnd.media-service+json; version=5',
 398                     'Authorization': token,
 399                 })
 400             m3u8_url, headers = playback['stream']['complete'][0]['url'], {'authorization': token}
 401
 402         # No login required
 403         elif video_data.get('sourceId') == 'ESPN_FREE':
 404             asset = self._download_json(
 405                 f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb',
 406                 video_id)
 407             m3u8_url, headers = asset['stream'], {}
 408
 409         # TV Provider required
 410         else:
 411             resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None)
 412             auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource).encode()
 413
 414             asset = self._download_json(
 415                 f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb',
 416                 video_id, data=f'adobeToken={urllib.parse.quote_plus(base64.b64encode(auth))}&drmSupport=HLS'.encode())
 417             m3u8_url, headers = asset['stream'], {}
 418
 419         formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
 420
 421         return {
 422             'id': video_id,
 423             'duration': traverse_obj(cdn_data, ('tracking', 'duration')),
 424             'title': video_data.get('name'),
 425             'formats': formats,
 426             'subtitles': subtitles,
 427             'thumbnail': video_data.get('posterHref'),
 428             'http_headers': headers,
 429         }