[ie/youtube] Fix `uploader_id` extraction (#11818)
[yt-dlp.git] / yt_dlp / extractor / tubitv.py
blob694a92fcd4c236afa22cf674f1e792779a8dcb20
1 import re
3 from .common import InfoExtractor
4 from ..networking import Request
5 from ..utils import (
6 ExtractorError,
7 int_or_none,
8 js_to_json,
9 strip_or_none,
10 traverse_obj,
11 url_or_none,
12 urlencode_postdata,
class TubiTvIE(InfoExtractor):
    """Extractor for single tubitv.com pages under /video/, /movies/ and /tv-shows/."""

    IE_NAME = 'tubitv'
    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?P<type>video|movies|tv-shows)/(?P<id>\d+)'
    # Credentials are POSTed to this URL, so it must use HTTPS to keep them out of cleartext
    _LOGIN_URL = 'https://tubitv.com/login'
    _NETRC_MACHINE = 'tubitv'
    _TESTS = [{
        'url': 'https://tubitv.com/movies/100004539/the-39-steps',
        'info_dict': {
            'id': '100004539',
            'ext': 'mp4',
            'title': 'The 39 Steps',
            'description': 'md5:bb2f2dd337f0dc58c06cb509943f54c8',
            'uploader_id': 'abc2558d54505d4f0f32be94f2e7108c',
            'release_year': 1935,
            'thumbnail': r're:^https?://.+\.(jpe?g|png)$',
            'duration': 5187,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://tubitv.com/tv-shows/554628/s01-e01-rise-of-the-snakes',
        'info_dict': {
            'id': '554628',
            'ext': 'mp4',
            'title': 'S01:E01 - Rise of the Snakes',
            'description': 'md5:ba136f586de53af0372811e783a3f57d',
            'episode': 'Rise of the Snakes',
            'episode_number': 1,
            'season': 'Season 1',
            'season_number': 1,
            'uploader_id': '2a9273e728c510d22aa5c57d0646810b',
            'release_year': 2011,
            'thumbnail': r're:^https?://.+\.(jpe?g|png)$',
            'duration': 1376,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday',
        'md5': '43ac06be9326f41912dc64ccf7a80320',
        'info_dict': {
            'id': '283829',
            'ext': 'mp4',
            'title': 'The Comedian at The Friday',
            'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.',
            'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434',
        },
        'skip': 'Content Unavailable',
    }, {
        'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories',
        'only_matching': True,
    }, {
        'url': 'https://tubitv.com/movies/560057/penitentiary?start=true',
        'info_dict': {
            'id': '560057',
            'ext': 'mp4',
            'title': 'Penitentiary',
            'description': 'md5:8d2fc793a93cc1575ff426fdcb8dd3f9',
            'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2',
            'release_year': 1979,
        },
        'skip': 'Content Unavailable',
    }]

    # DRM formats are included only to raise appropriate error
    _UNPLAYABLE_FORMATS = ('hlsv6_widevine', 'hlsv6_widevine_nonclearlead', 'hlsv6_playready_psshv0',
                           'hlsv6_fairplay', 'dash_widevine', 'dash_widevine_nonclearlead')

    def _perform_login(self, username, password):
        """POST the credentials to the login URL and verify the response.

        Raises an expected ExtractorError when the returned page does not
        contain the `tubi-logout` element.
        """
        self.report_login()
        payload = urlencode_postdata({
            'username': username,
            'password': password,
        })
        request = Request(self._LOGIN_URL, payload)
        request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
        login_page = self._download_webpage(
            request, None, False, 'Wrong login info')
        # Absence of the logout element is treated as a rejected login
        if not re.search(r'id="tubi-logout"', login_page):
            raise ExtractorError(
                'Login failed (invalid username/password)', expected=True)

    def _real_extract(self, url):
        """Build the info dict from the `window.__data` JSON embedded in the page.

        Reports DRM when only DRM-protected resources are listed, and raises an
        expected ExtractorError when no formats are found and `policy_match`
        is falsy.
        """
        video_id, video_type = self._match_valid_url(url).group('id', 'type')
        webpage = self._download_webpage(f'https://tubitv.com/{video_type}/{video_id}/', video_id)
        video_data = self._search_json(
            r'window\.__data\s*=', webpage, 'data', video_id,
            transform_source=js_to_json)['video']['byId'][video_id]

        formats = []
        drm_formats = False

        # Only resources that carry a valid manifest URL are considered
        for resource in traverse_obj(video_data, ('video_resources', lambda _, v: url_or_none(v['manifest']['url']))):
            resource_type = resource.get('type')
            manifest_url = resource['manifest']['url']
            if resource_type == 'dash':
                formats.extend(self._extract_mpd_formats(manifest_url, video_id, mpd_id=resource_type, fatal=False))
            elif resource_type in ('hlsv3', 'hlsv6'):
                formats.extend(self._extract_m3u8_formats(manifest_url, video_id, 'mp4', m3u8_id=resource_type, fatal=False))
            elif resource_type in self._UNPLAYABLE_FORMATS:
                drm_formats = True
            else:
                self.report_warning(f'Skipping unknown resource type "{resource_type}"')

        if not formats and drm_formats:
            self.report_drm(video_id)
        elif not formats and not video_data.get('policy_match'):  # policy_match is False if content was removed
            raise ExtractorError('This content is currently unavailable', expected=True)

        subtitles = {}
        for sub in traverse_obj(video_data, ('subtitles', lambda _, v: url_or_none(v['url']))):
            # `or` also covers a null/empty `lang`, not just a missing key
            subtitles.setdefault(sub.get('lang') or 'English', []).append({
                'url': self._proto_relative_url(sub['url']),
            })

        title = traverse_obj(video_data, ('title', {str}))
        # Episode titles look like "S01:E01 - Name"; anything else yields three Nones via `default`
        season_number, episode_number, episode_title = self._search_regex(
            r'^S(\d+):E(\d+) - (.+)', title, 'episode info', group=(1, 2, 3), default=(None, None, None))

        return {
            'id': video_id,
            'title': strip_or_none(title),
            'formats': formats,
            'subtitles': subtitles,
            'season_number': int_or_none(season_number),
            'episode_number': int_or_none(episode_number),
            'episode': strip_or_none(episode_title),
            **traverse_obj(video_data, {
                'description': ('description', {str}),
                'duration': ('duration', {int_or_none}),
                'uploader_id': ('publisher_id', {str}),
                'release_year': ('year', {int_or_none}),
                'thumbnails': ('thumbnails', ..., {url_or_none}, {'url': {self._proto_relative_url}}),
            }),
        }
class TubiTvShowIE(InfoExtractor):
    """Playlist extractor for tubitv.com /series/ pages, optionally limited to one season."""

    IE_NAME = 'tubitv:series'
    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P<show_name>[^/?#]+)(?:/season-(?P<season>\d+))?'
    _TESTS = [{
        'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true',
        'playlist_mincount': 389,
        'info_dict': {
            'id': 'the-joy-of-painting-with-bob-ross',
        },
    }, {
        'url': 'https://tubitv.com/series/2311/the-saddle-club/season-1',
        'playlist_count': 26,
        'info_dict': {
            'id': 'the-saddle-club-season-1',
        },
    }, {
        'url': 'https://tubitv.com/series/2311/the-saddle-club/season-3',
        'playlist_count': 19,
        'info_dict': {
            'id': 'the-saddle-club-season-3',
        },
    }, {
        'url': 'https://tubitv.com/series/2311/the-saddle-club/',
        'playlist_mincount': 71,
        'info_dict': {
            'id': 'the-saddle-club',
        },
    }]

    def _entries(self, show_url, playlist_id, selected_season):
        """Yield a url_result for each episode listed in the show page's `window.__data` JSON.

        When `selected_season` is set, only seasons whose stringified
        `number` equals it are walked; otherwise every dict season is.
        """
        page = self._download_webpage(show_url, playlist_id)

        video_store = self._search_json(
            r'window\.__data\s*=', page, 'data', playlist_id,
            transform_source=js_to_json)['video']

        if selected_season:
            # v['number'] is already a decimal string, but stringify to protect against API changes
            season_path = (lambda _, v: str(v['number']) == selected_season,)
        else:
            season_path = (..., {dict})

        matching_seasons = traverse_obj(
            video_store, ('byId', lambda _, v: v['type'] == 's', 'seasons', *season_path))
        for season_data in matching_seasons:
            season_number = int_or_none(season_data.get('number'))
            # Episodes without a truthy id are filtered out
            for episode_data in traverse_obj(season_data, ('episodes', lambda _, v: v['id'])):
                episode_id = episode_data['id']
                episode_number = int_or_none(episode_data.get('num'))
                yield self.url_result(
                    f'https://tubitv.com/tv-shows/{episode_id}/', TubiTvIE, episode_id,
                    season_number=season_number, episode_number=episode_number)

    def _real_extract(self, url):
        """Return a playlist whose id is the show slug, suffixed with `-season-N` if a season was given."""
        show_name, selected_season = self._match_valid_url(url).group('show_name', 'season')
        playlist_id = f'{show_name}-season-{selected_season}' if selected_season else show_name
        entries = self._entries(url, playlist_id, selected_season)
        return self.playlist_result(entries, playlist_id)