[ie/dplay] Fix extractors (#10471)
[yt-dlp3.git] / yt_dlp / extractor / nba.py
blob91ae1d14c65c8fa0f0efa92efe035c6c881ecd7d
import functools
import re
import urllib.parse

from .turner import TurnerBaseIE
from ..utils import (
    ExtractorError,
    OnDemandPagedList,
    int_or_none,
    merge_dicts,
    parse_duration,
    parse_iso8601,
    parse_qs,
    try_get,
    update_url_query,
    urljoin,
)
class NBACVPBaseIE(TurnerBaseIE):
    """Base class for NBA extractors that read CVP XML hosted on secure.nba.com."""

    def _extract_nba_cvp_info(self, path, video_id, fatal=False):
        """Delegate to ``_extract_cvp_info`` for the XML at ``secure.nba.com/<path>``.

        ``path`` is appended verbatim to ``http://secure.nba.com/``; ``video_id``
        and ``fatal`` are passed through unchanged.  The media-source mapping
        supplies the base URLs used for the ``default`` and ``m3u8`` entries.
        """
        cvp_url = f'http://secure.nba.com/{path}'
        media_sources = {
            'default': {
                'media_src': 'http://nba.cdn.turner.com/nba/big',
            },
            'm3u8': {
                'media_src': 'http://nbavod-f.akamaihd.net',
            },
        }
        return self._extract_cvp_info(
            cvp_url, video_id, media_sources, fatal=fatal)
class NBAWatchBaseIE(NBACVPBaseIE):
    """Shared lookup/extraction logic for the nba.com/watch and watch.nba.com extractors."""

    _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/'

    def _extract_video(self, filter_key, filter_value):
        """Look up a single program in the search index and build its info dict.

        :param filter_key: name of the search field to match on (callers in
            this file pass ``'pid'`` or ``'seoName'``).
        :param filter_value: value the field must equal; also used as the
            identifier shown while downloading the search result.
        :returns: an info dict with ``id``, ``title``, ``formats`` and the
            optional metadata found in the search document.
        :raises ExtractorError: if the search returns no documents.
        """
        docs = self._download_json(
            'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch',
            filter_value, query={
                'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName',
                'q': filter_key + ':' + filter_value,
                'wt': 'json',
            })['response']['docs']
        if not docs:
            # An empty result list used to surface as a bare IndexError;
            # report it as an expected "not found" condition instead.
            raise ExtractorError(
                f'No video found for {filter_key} "{filter_value}"', expected=True)
        video = docs[0]

        video_id = str(video['pid'])
        title = video['name']

        formats = []
        # The publish point is queried with a mobile User-Agent; a failure
        # here is non-fatal because CVP formats may still be available below.
        m3u8_url = (self._download_json(
            'https://watch.nba.com/service/publishpoint', video_id, query={
                'type': 'video',
                'format': 'json',
                'id': video_id,
            }, headers={
                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
            }, fatal=False) or {}).get('path')
        if m3u8_url:
            # Drop the device-specific suffix ("_pc." / "_iphone.") from the playlist URL.
            m3u8_formats = self._extract_m3u8_formats(
                re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False)
            formats.extend(m3u8_formats)
            # Derive a direct HTTP variant of every HLS format by stripping
            # the ".m3u8" part from its URL.
            for f in m3u8_formats:
                http_f = f.copy()
                http_f.update({
                    'format_id': http_f['format_id'].replace('hls-', 'http-'),
                    'protocol': 'http',
                    'url': http_f['url'].replace('.m3u8', ''),
                })
                formats.append(http_f)

        info = {
            'id': video_id,
            'title': title,
            'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')),
            'description': video.get('description'),
            'duration': int_or_none(video.get('runtime')),
            'timestamp': parse_iso8601(video.get('releaseDate')),
            'tags': video.get('tags'),
        }

        seo_name = video.get('seoName')
        # Only SEO names containing a YYYY/MM/DD/ path segment are looked up as CVP XML.
        if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name):
            base_path = ''
            if seo_name.startswith('teams/'):
                # "teams/<team>/..." -> the XML lives under "<team>/video/".
                base_path += seo_name.split('/')[1] + '/'
            base_path += 'video/'
            cvp_info = self._extract_nba_cvp_info(
                base_path + seo_name + '.xml', video_id, False)
            if cvp_info:
                formats.extend(cvp_info['formats'])
                info = merge_dicts(info, cvp_info)

        # Set last so the combined list replaces any 'formats' merged in from the CVP info.
        info['formats'] = formats
        return info
class NBAWatchEmbedIE(NBAWatchBaseIE):
    """Extractor for ``/embed?id=<pid>`` URLs."""

    IE_NAME = 'nba:watch:embed'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://watch.nba.com/embed?id=659395',
        'md5': 'b7e3f9946595f4ca0a13903ce5edd120',
        'info_dict': {
            'id': '659395',
            'ext': 'mp4',
            'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
            'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
            'timestamp': 1492228800,
            'upload_date': '20170415',
        },
    }]

    def _real_extract(self, url):
        """Look the video up by the numeric ``id`` matched from the URL (the ``pid`` field)."""
        return self._extract_video('pid', self._match_id(url))
class NBAWatchIE(NBAWatchBaseIE):
    """Extractor for single ``/video/<seoName>`` pages."""

    IE_NAME = 'nba:watch'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
        'md5': '9d902940d2a127af3f7f9d2f3dc79c96',
        'info_dict': {
            'id': '70946',
            'ext': 'mp4',
            'title': 'Thunder vs. Nets',
            'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
            'duration': 181,
            'timestamp': 1354597200,
            'upload_date': '20121204',
        },
    }, {
        'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
        'only_matching': True,
    }, {
        'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
        'md5': 'b2b39b81cf28615ae0c3360a3f9668c4',
        'info_dict': {
            'id': '330865',
            'ext': 'mp4',
            'title': 'Hawks vs. Cavaliers Game 1',
            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
            'duration': 228,
            'timestamp': 1432094400,
            'upload_date': '20150521',
        },
    }, {
        'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115',
        'only_matching': True,
    }, {
        # only CVP mp4 format available
        'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106',
        'only_matching': True,
    }, {
        'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video, or hand off to the collection extractor.

        The hand-off happens when the URL carries a ``collection`` query
        parameter and ``_yes_playlist`` selects the playlist.
        """
        seo_name = self._match_id(url)
        collection = parse_qs(url).get('collection', [None])[0]

        # Common case first: a plain video looked up by its SEO name.
        if not self._yes_playlist(collection, seo_name):
            return self._extract_video('seoName', seo_name)

        collection_url = 'https://www.nba.com/watch/list/collection/' + collection
        return self.url_result(
            collection_url, NBAWatchCollectionIE.ie_key(), collection)
class NBAWatchCollectionIE(NBAWatchBaseIE):
    """Playlist extractor for ``/list/collection/<id>`` pages."""

    IE_NAME = 'nba:watch:collection'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://watch.nba.com/list/collection/season-preview-2020',
        'info_dict': {
            'id': 'season-preview-2020',
        },
        'playlist_mincount': 43,
    }]
    _PAGE_SIZE = 100

    def _fetch_page(self, collection_id, page):
        """Yield one ``url``-type entry per video on the given page.

        ``page`` is the zero-based index handed over by ``OnDemandPagedList``;
        the API is queried with ``page + 1``.  Videos without a ``seoName`` or
        ``slug`` are skipped because no watch URL can be built for them.
        """
        page_number = page + 1
        api_url = 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id
        payload = self._download_json(
            api_url, collection_id,
            f'Downloading page {page_number} JSON metadata',
            query={
                'count': self._PAGE_SIZE,
                'page': page_number,
            })

        for item in payload['results']['videos']:
            program = item.get('program') or {}
            seo_name = program.get('seoName') or program.get('slug')
            if seo_name:
                yield {
                    '_type': 'url',
                    'id': program.get('id'),
                    'title': program.get('title') or item.get('title'),
                    'url': 'https://www.nba.com/watch/video/' + seo_name,
                    'thumbnail': item.get('image'),
                    'description': program.get('description') or item.get('description'),
                    'duration': parse_duration(program.get('runtimeHours')),
                    'timestamp': parse_iso8601(item.get('releaseDate')),
                }

    def _real_extract(self, url):
        """Return a lazily paged playlist for the collection named in the URL."""
        collection_id = self._match_id(url)
        page_fetcher = functools.partial(self._fetch_page, collection_id)
        return self.playlist_result(
            OnDemandPagedList(page_fetcher, self._PAGE_SIZE), collection_id)
class NBABaseIE(NBACVPBaseIE):
    """Shared logic for the nba.com team-site extractors.

    The ``_real_extract`` defined here reads ``_CONTENT_ID_REGEX`` and calls
    ``_extract_url_results(team, content_id)``; neither is defined on this
    class, so subclasses that use it must provide both.
    """

    _VALID_URL_BASE = r'''(?x)
        https?://(?:www\.)?nba\.com/
            (?P<team>
                blazers|
                bucks|
                bulls|
                cavaliers|
                celtics|
                clippers|
                grizzlies|
                hawks|
                heat|
                hornets|
                jazz|
                kings|
                knicks|
                lakers|
                magic|
                mavericks|
                nets|
                nuggets|
                pacers|
                pelicans|
                pistons|
                raptors|
                rockets|
                sixers|
                spurs|
                suns|
                thunder|
                timberwolves|
                warriors|
                wizards
            )
            (?:/play\#)?/'''
    _CHANNEL_PATH_REGEX = r'video/channel|series'

    def _embed_url_result(self, team, content_id):
        """Return a ``url_result`` pointing at the secure.nba.com iframe player for ``content_id``."""
        return self.url_result(update_url_query(
            'https://secure.nba.com/assets/amp/include/video/iframe.html', {
                'contentId': content_id,
                'team': team,
            }), NBAEmbedIE.ie_key())

    def _call_api(self, team, content_id, query, resource):
        """Query the api.nba.net video endpoint of ``team`` and return ``response.result``.

        :param content_id: identifier shown in the download message.
        :param query: query-string parameters sent with the request.
        :param resource: short label interpolated into the progress note.
        """
        return self._download_json(
            f'https://api.nba.net/2/{team}/video,imported_video,wsc/',
            content_id, f'Downloading {resource} JSON metadata',
            query=query, headers={
                'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b',
            })['response']['result']

    def _extract_video(self, video, team, extract_all=True):
        """Build an info dict from one API video object.

        :param video: a dict taken from the API result list.
        :param team: team slug used when ``video`` carries no ``brand``.
        :param extract_all: when true, also extract the source file, the
            HLS/Akamai formats and the CVP XML; when false, only the plain
            ``mp4`` format is collected and the entry is turned into a URL
            result for the embed player.
        :returns: the info dict, always carrying ``formats`` and ``subtitles``.
        """
        video_id = str(video['nid'])
        # Prefer the brand reported by the API, but fall back to the caller's
        # team instead of raising KeyError when the field is missing.
        team = video.get('brand') or team

        info = {
            'id': video_id,
            'title': video.get('title') or video.get('headline') or video['shortHeadline'],
            'description': video.get('description'),
            'timestamp': parse_iso8601(video.get('published')),
        }

        # Every sidecar caption URL is filed under 'en'.
        subtitles = {}
        captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {}
        for caption_url in captions.values():
            subtitles.setdefault('en', []).append({'url': caption_url})

        formats = []
        mp4_url = video.get('mp4')
        if mp4_url:
            formats.append({
                'url': mp4_url,
            })

        if extract_all:
            source_url = video.get('videoSource')
            # s3:// sources are skipped; anything else is probed before being added.
            if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'):
                formats.append({
                    'format_id': 'source',
                    'url': source_url,
                    'quality': 1,
                })

            m3u8_url = video.get('m3u8')
            if m3u8_url:
                if '.akamaihd.net/i/' in m3u8_url:
                    formats.extend(self._extract_akamai_formats(
                        m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'}))
                else:
                    formats.extend(self._extract_m3u8_formats(
                        m3u8_url, video_id, 'mp4',
                        'm3u8_native', m3u8_id='hls', fatal=False))

            content_xml = video.get('contentXml')
            if team and content_xml:
                # The CVP XML path is the team slug followed by the contentXml value.
                cvp_info = self._extract_nba_cvp_info(
                    team + content_xml, video_id, fatal=False)
                if cvp_info:
                    formats.extend(cvp_info['formats'])
                    subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles'])
                    info = merge_dicts(info, cvp_info)
        else:
            info.update(self._embed_url_result(team, video['videoId']))

        # Applied last so the combined lists override anything merged in above.
        info.update({
            'formats': formats,
            'subtitles': subtitles,
        })

        return info

    def _real_extract(self, url):
        """Resolve the content id for ``url`` and pass it to ``_extract_url_results``.

        For ``/play#/`` URLs the id is the URL-decoded path component;
        otherwise the page is downloaded and the id is read from the value
        that follows ``_CONTENT_ID_REGEX`` in the markup.
        """
        team, display_id = self._match_valid_url(url).groups()
        if '/play#/' in url:
            display_id = urllib.parse.unquote(display_id)
        else:
            webpage = self._download_webpage(url, display_id)
            display_id = self._search_regex(
                self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id')
        return self._extract_url_results(team, display_id)
class NBAEmbedIE(NBABaseIE):
    """Extractor for the secure.nba.com ``iframe.html`` / ``topIframe.html`` embed player."""

    IE_NAME = 'nba:embed'
    _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)'
    _TESTS = [{
        'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&ampEnv=',
        'only_matching': True,
    }, {
        'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the embedded video identified by the ``contentId`` query parameter.

        Without a ``team`` parameter the content id is forwarded to the
        watch.nba.com extractor; otherwise the team API is queried.

        :raises ExtractorError: if the team API returns no video for the id.
        """
        qs = parse_qs(url)
        content_id = qs['contentId'][0]
        team = qs.get('team', [None])[0]
        if not team:
            return self.url_result(
                'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key())
        results = self._call_api(team, content_id, {'videoid': content_id}, 'video')
        if not results:
            # An empty result list used to surface as a bare IndexError.
            raise ExtractorError(
                f'No video found for content id "{content_id}"', expected=True)
        return self._extract_video(results[0], team)
class NBAIE(NBABaseIE):
    """Extractor for single videos on nba.com team sites."""

    IE_NAME = 'nba'
    # The negative lookahead keeps channel/series paths out of this extractor.
    _VALID_URL = rf'{NBABaseIE._VALID_URL_BASE}(?!{NBABaseIE._CHANNEL_PATH_REGEX})video/(?P<id>(?:[^/]+/)*[^/?#&]+)'
    _CONTENT_ID_REGEX = r'videoID'
    _TESTS = [{
        'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774',
        'info_dict': {
            'id': '45039',
            'ext': 'mp4',
            'title': 'AND WE BACK.',
            'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.',
            'duration': 94,
            'timestamp': 1607112000,
            'upload_date': '20201218',
        },
    }, {
        'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860',
        'only_matching': True,
    }, {
        'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0',
        'only_matching': True,
    }]

    def _extract_url_results(self, team, content_id):
        """Hand the resolved content id to the embed-player extractor."""
        embed_result = self._embed_url_result(team, content_id)
        return embed_result
class NBAChannelIE(NBABaseIE):
    """Playlist extractor for team-site video channels and series."""

    IE_NAME = 'nba:channel'
    _VALID_URL = rf'{NBABaseIE._VALID_URL_BASE}(?:{NBABaseIE._CHANNEL_PATH_REGEX})/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.nba.com/blazers/video/channel/summer_league',
        'info_dict': {
            'title': 'Summer League',
        },
        'playlist_mincount': 138,
    }, {
        'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date',
        'only_matching': True,
    }]
    _CONTENT_ID_REGEX = r'videoSubCategory'
    _PAGE_SIZE = 100

    def _fetch_page(self, team, channel, page):
        """Yield the entries of one zero-based result page of ``channel``.

        The API is addressed by offset (``page * _PAGE_SIZE``); each video is
        converted with ``_extract_video`` with full extraction disabled.
        """
        api_query = {
            'channels': channel,
            'count': self._PAGE_SIZE,
            'offset': page * self._PAGE_SIZE,
        }
        page_label = f'page {page + 1}'
        for entry in self._call_api(team, channel, api_query, page_label):
            yield self._extract_video(entry, team, False)

    def _extract_url_results(self, team, content_id):
        """Return a lazily paged playlist titled after the channel id."""
        page_fetcher = functools.partial(self._fetch_page, team, content_id)
        paged_entries = OnDemandPagedList(page_fetcher, self._PAGE_SIZE)
        return self.playlist_result(paged_entries, playlist_title=content_id)