yt_dlp/extractor/alura.py

   1 import re
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7     clean_html,
   8     int_or_none,
   9     urlencode_postdata,
  10     urljoin,
  11 )
  12
  13
  14 class AluraIE(InfoExtractor):
  15     _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)'
  16     _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
  17     _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video'
  18     _NETRC_MACHINE = 'alura'
  19     _TESTS = [{
  20         'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095',
  21         'info_dict': {
  22             'id': '60095',
  23             'ext': 'mp4',
  24             'title': 'Referências, ref-set e alter',
  25         },
  26         'skip': 'Requires alura account credentials'},
  27         {
  28             # URL without video
  29             'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098',
  30             'only_matching': True},
  31         {
  32             'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219',
  33             'only_matching': True},
  34     ]
  35
  36     def _real_extract(self, url):
  37
  38         course, video_id = self._match_valid_url(url).group('course_name', 'id')
  39         video_url = self._VIDEO_URL % (course, video_id)
  40
  41         video_dict = self._download_json(video_url, video_id, 'Searching for videos')
  42
  43         if video_dict:
  44             webpage = self._download_webpage(url, video_id)
  45             video_title = clean_html(self._search_regex(
  46                 r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)',
  47                 webpage, 'title', group='title'))
  48
  49             formats = []
  50             for video_obj in video_dict:
  51                 video_url_m3u8 = video_obj.get('mp4')
  52                 video_format = self._extract_m3u8_formats(
  53                     video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native',
  54                     m3u8_id='hls', fatal=False)
  55                 for f in video_format:
  56                     m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url'])
  57                     if m:
  58                         if not f.get('height'):
  59                             f['height'] = int('720' if m.group('res') == 'hd' else '480')
  60                 formats.extend(video_format)
  61
  62             return {
  63                 'id': video_id,
  64                 'title': video_title,
  65                 'formats': formats,
  66             }
  67
  68     def _perform_login(self, username, password):
  69
  70         login_page = self._download_webpage(
  71             self._LOGIN_URL, None, 'Downloading login popup')
  72
  73         def is_logged(webpage):
  74             return any(re.search(p, webpage) for p in (
  75                 r'href=[\"|\']?/signout[\"|\']',
  76                 r'>Logout<'))
  77
  78         # already logged in
  79         if is_logged(login_page):
  80             return
  81
  82         login_form = self._hidden_inputs(login_page)
  83
  84         login_form.update({
  85             'username': username,
  86             'password': password,
  87         })
  88
  89         post_url = self._search_regex(
  90             r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page,
  91             'post url', default=self._LOGIN_URL, group='url')
  92
  93         if not post_url.startswith('http'):
  94             post_url = urllib.parse.urljoin(self._LOGIN_URL, post_url)
  95
  96         response = self._download_webpage(
  97             post_url, None, 'Logging in',
  98             data=urlencode_postdata(login_form),
  99             headers={'Content-Type': 'application/x-www-form-urlencoded'})
 100
 101         if not is_logged(response):
 102             error = self._html_search_regex(
 103                 r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>',
 104                 response, 'error message', default=None)
 105             if error:
 106                 raise ExtractorError(f'Unable to login: {error}', expected=True)
 107             raise ExtractorError('Unable to log in')
 108
 109
 110 class AluraCourseIE(AluraIE):  # XXX: Do not subclass from concrete IE
 111
 112     _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)'
 113     _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
 114     _NETRC_MACHINE = 'aluracourse'
 115     _TESTS = [{
 116         'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs',
 117         'only_matching': True,
 118     }]
 119
 120     @classmethod
 121     def suitable(cls, url):
 122         return False if AluraIE.suitable(url) else super().suitable(url)
 123
 124     def _real_extract(self, url):
 125
 126         course_path = self._match_id(url)
 127         webpage = self._download_webpage(url, course_path)
 128
 129         course_title = self._search_regex(
 130             r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage,
 131             'course title', default=course_path, group='course_title')
 132
 133         entries = []
 134         if webpage:
 135             for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage):
 136                 page_url = urljoin(url, path)
 137                 section_path = self._download_webpage(page_url, course_path)
 138                 for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path):
 139                     chapter = clean_html(
 140                         self._search_regex(
 141                             r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',
 142                             section_path,
 143                             'chapter',
 144                             group='chapter'))
 145
 146                     chapter_number = int_or_none(
 147                         self._search_regex(
 148                             r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',
 149                             section_path,
 150                             'chapter number',
 151                             group='chapter_number'))
 152                     video_url = urljoin(url, path_video)
 153
 154                     entry = {
 155                         '_type': 'url_transparent',
 156                         'id': self._match_id(video_url),
 157                         'url': video_url,
 158                         'id_key': self.ie_key(),
 159                         'chapter': chapter,
 160                         'chapter_number': chapter_number,
 161                     }
 162                     entries.append(entry)
 163         return self.playlist_result(entries, course_path, course_title)