yt_dlp/extractor/platzi.py

   1 import base64
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     clean_html,
   7     int_or_none,
   8     str_or_none,
   9     try_get,
  10     url_or_none,
  11     urlencode_postdata,
  12     urljoin,
  13 )
  14
  15
  16 class PlatziBaseIE(InfoExtractor):
  17     _LOGIN_URL = 'https://platzi.com/login/'
  18     _NETRC_MACHINE = 'platzi'
  19
  20     def _perform_login(self, username, password):
  21         login_page = self._download_webpage(
  22             self._LOGIN_URL, None, 'Downloading login page')
  23
  24         login_form = self._hidden_inputs(login_page)
  25
  26         login_form.update({
  27             'email': username,
  28             'password': password,
  29         })
  30
  31         urlh = self._request_webpage(
  32             self._LOGIN_URL, None, 'Logging in',
  33             data=urlencode_postdata(login_form),
  34             headers={'Referer': self._LOGIN_URL})
  35
  36         # login succeeded
  37         if 'platzi.com/login' not in urlh.url:
  38             return
  39
  40         login_error = self._webpage_read_content(
  41             urlh, self._LOGIN_URL, None, 'Downloading login error page')
  42
  43         login = self._parse_json(
  44             self._search_regex(
  45                 r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'),
  46             None)
  47
  48         for kind in ('error', 'password', 'nonFields'):
  49             error = str_or_none(login.get(f'{kind}Error'))
  50             if error:
  51                 raise ExtractorError(
  52                     f'Unable to login: {error}', expected=True)
  53         raise ExtractorError('Unable to log in')
  54
  55
  56 class PlatziIE(PlatziBaseIE):
  57     _VALID_URL = r'''(?x)
  58                     https?://
  59                         (?:
  60                             platzi\.com/clases|           # es version
  61                             courses\.platzi\.com/classes  # en version
  62                         )/[^/]+/(?P<id>\d+)-[^/?\#&]+
  63                     '''
  64
  65     _TESTS = [{
  66         'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/',
  67         'md5': '8f56448241005b561c10f11a595b37e3',
  68         'info_dict': {
  69             'id': '12074',
  70             'ext': 'mp4',
  71             'title': 'Creando nuestra primera página',
  72             'description': 'md5:4c866e45034fc76412fbf6e60ae008bc',
  73             'duration': 420,
  74         },
  75         'skip': 'Requires platzi account credentials',
  76     }, {
  77         'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/',
  78         'info_dict': {
  79             'id': '13430',
  80             'ext': 'mp4',
  81             'title': 'Background',
  82             'description': 'md5:49c83c09404b15e6e71defaf87f6b305',
  83             'duration': 360,
  84         },
  85         'skip': 'Requires platzi account credentials',
  86         'params': {
  87             'skip_download': True,
  88         },
  89     }]
  90
  91     def _real_extract(self, url):
  92         lecture_id = self._match_id(url)
  93
  94         webpage = self._download_webpage(url, lecture_id)
  95
  96         data = self._parse_json(
  97             self._search_regex(
  98                 # client_data may contain "};" so that we have to try more
  99                 # strict regex first
 100                 (r'client_data\s*=\s*({.+?})\s*;\s*\n',
 101                  r'client_data\s*=\s*({.+?})\s*;'),
 102                 webpage, 'client data'),
 103             lecture_id)
 104
 105         material = data['initialState']['material']
 106         desc = material['description']
 107         title = desc['title']
 108
 109         formats = []
 110         for server_id, server in material['videos'].items():
 111             if not isinstance(server, dict):
 112                 continue
 113             for format_id in ('hls', 'dash'):
 114                 format_url = url_or_none(server.get(format_id))
 115                 if not format_url:
 116                     continue
 117                 if format_id == 'hls':
 118                     formats.extend(self._extract_m3u8_formats(
 119                         format_url, lecture_id, 'mp4',
 120                         entry_protocol='m3u8_native', m3u8_id=format_id,
 121                         note=f'Downloading {server_id} m3u8 information',
 122                         fatal=False))
 123                 elif format_id == 'dash':
 124                     formats.extend(self._extract_mpd_formats(
 125                         format_url, lecture_id, mpd_id=format_id,
 126                         note=f'Downloading {server_id} MPD manifest',
 127                         fatal=False))
 128
 129         content = str_or_none(desc.get('content'))
 130         description = (clean_html(base64.b64decode(content).decode('utf-8'))
 131                        if content else None)
 132         duration = int_or_none(material.get('duration'), invscale=60)
 133
 134         return {
 135             'id': lecture_id,
 136             'title': title,
 137             'description': description,
 138             'duration': duration,
 139             'formats': formats,
 140         }
 141
 142
 143 class PlatziCourseIE(PlatziBaseIE):
 144     _VALID_URL = r'''(?x)
 145                     https?://
 146                         (?:
 147                             platzi\.com/clases|           # es version
 148                             courses\.platzi\.com/classes  # en version
 149                         )/(?P<id>[^/?\#&]+)
 150                     '''
 151     _TESTS = [{
 152         'url': 'https://platzi.com/clases/next-js/',
 153         'info_dict': {
 154             'id': '1311',
 155             'title': 'Curso de Next.js',
 156         },
 157         'playlist_count': 22,
 158     }, {
 159         'url': 'https://courses.platzi.com/classes/communication-codestream/',
 160         'info_dict': {
 161             'id': '1367',
 162             'title': 'Codestream Course',
 163         },
 164         'playlist_count': 14,
 165     }]
 166
 167     @classmethod
 168     def suitable(cls, url):
 169         return False if PlatziIE.suitable(url) else super().suitable(url)
 170
 171     def _real_extract(self, url):
 172         course_name = self._match_id(url)
 173
 174         webpage = self._download_webpage(url, course_name)
 175
 176         props = self._parse_json(
 177             self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'),
 178             course_name)['initialProps']
 179
 180         entries = []
 181         for chapter_num, chapter in enumerate(props['concepts'], 1):
 182             if not isinstance(chapter, dict):
 183                 continue
 184             materials = chapter.get('materials')
 185             if not materials or not isinstance(materials, list):
 186                 continue
 187             chapter_title = chapter.get('title')
 188             chapter_id = str_or_none(chapter.get('id'))
 189             for material in materials:
 190                 if not isinstance(material, dict):
 191                     continue
 192                 if material.get('material_type') != 'video':
 193                     continue
 194                 video_url = urljoin(url, material.get('url'))
 195                 if not video_url:
 196                     continue
 197                 entries.append({
 198                     '_type': 'url_transparent',
 199                     'url': video_url,
 200                     'title': str_or_none(material.get('name')),
 201                     'id': str_or_none(material.get('id')),
 202                     'ie_key': PlatziIE.ie_key(),
 203                     'chapter': chapter_title,
 204                     'chapter_number': chapter_num,
 205                     'chapter_id': chapter_id,
 206                 })
 207
 208         course_id = str(try_get(props, lambda x: x['course']['id']))
 209         course_title = try_get(props, lambda x: x['course']['name'], str)
 210
 211         return self.playlist_result(entries, course_id, course_title)