[ie/dropout] Fix extraction (#12102)
[yt-dlp.git] / yt_dlp / extractor / safari.py
blob86f34df478312824dbbe5a814942ad4ffc86dc6a
1 import json
2 import re
3 import urllib.parse
5 from .common import InfoExtractor
6 from ..utils import (
7 ExtractorError,
8 update_url_query,
class SafariBaseIE(InfoExtractor):
    # Shared login handling and API constants for the Safari / O'Reilly
    # extractors defined in this file.
    _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
    _NETRC_MACHINE = 'safari'

    _API_BASE = 'https://learning.oreilly.com/api/v1'
    _API_FORMAT = 'json'

    # Set to True on the instance once _perform_login() has succeeded.
    LOGGED_IN = False

    def _perform_login(self, username, password):
        """Log in with the given credentials and set ``self.LOGGED_IN``.

        Returns early if the login-check request already lands on the home
        page. Raises ExtractorError if the site rejects the credentials, if
        the login redirect carries no ``next`` parameter, or if the final
        redirect does not end up on the home page.
        """
        _, urlh = self._download_webpage_handle(
            'https://learning.oreilly.com/accounts/login-check/', None,
            'Downloading login page')

        def is_logged(urlh):
            return 'learning.oreilly.com/home/' in urlh.url

        if is_logged(urlh):
            self.LOGGED_IN = True
            return

        redirect_url = urlh.url
        parsed_url = urllib.parse.urlparse(redirect_url)
        qs = urllib.parse.parse_qs(parsed_url.query)
        # parse_qs() omits absent/blank parameters, so guard the lookup
        # instead of letting a bare KeyError escape the extractor.
        next_values = qs.get('next')
        if not next_values:
            raise ExtractorError(
                'Unable to log in: login redirect is missing the "next" parameter')
        next_uri = urllib.parse.urljoin(
            'https://api.oreilly.com', next_values[0])

        # HTTP 400 is accepted so that the JSON error body can be inspected
        # below rather than failing on the status code.
        auth, urlh = self._download_json_handle(
            'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
            data=json.dumps({
                'email': username,
                'password': password,
                'redirect_uri': next_uri,
            }).encode(), headers={
                'Content-Type': 'application/json',
                'Referer': redirect_url,
            }, expected_status=400)

        credentials = auth.get('credentials')
        if (not auth.get('logged_in') and not auth.get('redirect_uri')
                and credentials):
            raise ExtractorError(
                f'Unable to login: {credentials}', expected=True)

        # oreilly serves two identical instances of the following cookies
        # in the Set-Cookie header and expects the first one to be actually set
        for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
            self._apply_first_set_cookie_header(urlh, cookie)

        _, urlh = self._download_webpage_handle(
            auth.get('redirect_uri') or next_uri, None, 'Completing login')

        if is_logged(urlh):
            self.LOGGED_IN = True
            return

        raise ExtractorError('Unable to log in')
class SafariIE(SafariBaseIE):
    # Resolves a single chapter/video page to a Kaltura embed URL and
    # delegates the actual extraction to the 'Kaltura' extractor.
    IE_NAME = 'safari'
    IE_DESC = 'safaribooksonline.com online video'
    _VALID_URL = r'''(?x)
                        https?://
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
                                videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
                            )
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
        'md5': 'dcc5a425e79f2564148652616af1f2a3',
        'info_dict': {
            'id': '0_qbqx90ic',
            'ext': 'mp4',
            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
            'timestamp': 1437758058,
            'upload_date': '20150724',
            'uploader_id': 'stork',
        },
    }, {
        # non-digits in course id
        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
        'only_matching': True,
    }]

    # Fallback Kaltura identifiers, used when the page does not supply them.
    _PARTNER_ID = '1926081'
    _UICONF_ID = '29375172'

    def _real_extract(self, url):
        """Build the Kaltura embed URL for *url* and return it as a url_result.

        ``videos/...`` URLs carry the Kaltura reference id directly. For
        ``library/view/...`` URLs the page is downloaded and the reference,
        partner and uiconf ids are taken from the final URL or the page
        markup. When logged in, a Kaltura session token is added if one can
        be fetched.
        """
        mobj = self._match_valid_url(url)

        reference_id = mobj.group('reference_id')
        if reference_id:
            video_id = reference_id
            partner_id = self._PARTNER_ID
            ui_id = self._UICONF_ID
        else:
            video_id = '{}-{}'.format(mobj.group('course_id'), mobj.group('part'))

            webpage, urlh = self._download_webpage_handle(url, video_id)

            # The request may have been redirected to a URL that no longer
            # matches _VALID_URL; in that case re.match() returns None and
            # we fall back to scraping the reference id from the page.
            mobj = re.match(self._VALID_URL, urlh.url)
            reference_id = mobj.group('reference_id') if mobj else None
            if not reference_id:
                reference_id = self._search_regex(
                    r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                    webpage, 'kaltura reference id', group='id')
            partner_id = self._search_regex(
                r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                webpage, 'kaltura widget id', default=self._PARTNER_ID,
                group='id')
            ui_id = self._search_regex(
                r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                webpage, 'kaltura uiconf id', default=self._UICONF_ID,
                group='id')

        query = {
            'wid': f'_{partner_id}',
            'uiconf_id': ui_id,
            'flashvars[referenceId]': reference_id,
        }

        if self.LOGGED_IN:
            # Non-fatal request: the result may be falsy, in which case the
            # embed URL is simply built without a session token.
            kaltura_session = self._download_json(
                f'{self._API_BASE}/player/kaltura_session/?reference_id={reference_id}',
                video_id, 'Downloading kaltura session JSON',
                'Unable to download kaltura session JSON', fatal=False,
                headers={'Accept': 'application/json'})
            session = kaltura_session.get('session') if kaltura_session else None
            if session:
                query['flashvars[ks]'] = session

        return self.url_result(update_url_query(
            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
            'Kaltura')
class SafariApiIE(SafariBaseIE):
    # Maps an API chapter URL to the corresponding web URL and delegates
    # to SafariIE.
    IE_NAME = 'safari:api'
    _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the part JSON for *url* and redirect to its web URL.

        A ``library/view`` web URL is rewritten into the ``videos`` form,
        with its last path segment replaced by
        ``<natural_key[0]>-<natural_key[1] minus its last five characters>``.
        """
        course_id, part_name = self._match_valid_url(url).group('course_id', 'part')
        part_json = self._download_json(
            url, f'{course_id}/{part_name}', 'Downloading part JSON')

        target_url = part_json['web_url']
        if 'library/view' not in target_url:
            return self.url_result(target_url, SafariIE.ie_key())

        videos_url = target_url.replace('library/view', 'videos')
        keys = part_json['natural_key']
        parent_url = videos_url.rsplit('/', 1)[0]
        # The last five characters of the second key are dropped
        # (presumably the '.html' suffix - TODO confirm).
        target_url = f'{parent_url}/{keys[0]}-{keys[1][:-5]}'
        return self.url_result(target_url, SafariIE.ie_key())
class SafariCourseIE(SafariBaseIE):
    # Turns a course/book URL into a playlist of its chapters, each
    # delegated to SafariApiIE.
    IE_NAME = 'safari:course'
    IE_DESC = 'safaribooksonline.com online courses'

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+|
                                api/v1/book|
                                videos/[^/]+
                            )|
                            techbus\.safaribooksonline\.com
                        )
                        /(?P<id>[^/]+)
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'info_dict': {
            'id': '9780133392838',
            'title': 'Hadoop Fundamentals LiveLessons',
        },
        'playlist_count': 22,
        'skip': 'Requires safaribooksonline account credentials',
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
        'only_matching': True,
    }, {
        'url': 'http://techbus.safaribooksonline.com/9780134426365',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that SafariIE or SafariApiIE already claim."""
        if SafariIE.suitable(url) or SafariApiIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one SafariApiIE entry per course chapter.

        Raises ExtractorError when the course JSON has no 'chapters' key.
        """
        course_id = self._match_id(url)

        api_url = f'{self._API_BASE}/book/{course_id}/?override_format={self._API_FORMAT}'
        course_json = self._download_json(
            api_url, course_id, 'Downloading course JSON')

        if 'chapters' not in course_json:
            raise ExtractorError(
                f'No chapters found for course {course_id}', expected=True)

        entries = []
        for chapter_url in course_json['chapters']:
            entries.append(self.url_result(chapter_url, SafariApiIE.ie_key()))

        return self.playlist_result(entries, course_id, course_json['title'])