yt_dlp/extractor/mit.py

   1 import json
   2 import re
   3
   4 from .common import InfoExtractor
   5 from .youtube import YoutubeIE
   6 from ..utils import (
   7     ExtractorError,
   8     clean_html,
   9     get_element_by_id,
  10 )
  11
  12
  13 class TechTVMITIE(InfoExtractor):
  14     IE_NAME = 'techtv.mit.edu'
  15     _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
  16
  17     _TEST = {
  18         'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
  19         'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
  20         'info_dict': {
  21             'id': '25418',
  22             'ext': 'mp4',
  23             'title': 'MIT DNA and Protein Sets',
  24             'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
  25         },
  26     }
  27
  28     def _real_extract(self, url):
  29         video_id = self._match_id(url)
  30         raw_page = self._download_webpage(
  31             f'http://techtv.mit.edu/videos/{video_id}', video_id)
  32         clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
  33
  34         base_url = self._proto_relative_url(self._search_regex(
  35             r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
  36         formats_json = self._search_regex(
  37             r'bitrates: (\[.+?\])', raw_page, 'video formats')
  38         formats_mit = json.loads(formats_json)
  39         formats = [
  40             {
  41                 'format_id': f['label'],
  42                 'url': base_url + f['url'].partition(':')[2],
  43                 'ext': f['url'].partition(':')[0],
  44                 'format': f['label'],
  45                 'width': f['width'],
  46                 'vbr': f['bitrate'],
  47             }
  48             for f in formats_mit
  49         ]
  50
  51         title = get_element_by_id('edit-title', clean_page)
  52         description = clean_html(get_element_by_id('edit-description', clean_page))
  53         thumbnail = self._search_regex(
  54             r'playlist:.*?url: \'(.+?)\'',
  55             raw_page, 'thumbnail', flags=re.DOTALL)
  56
  57         return {
  58             'id': video_id,
  59             'title': title,
  60             'formats': formats,
  61             'description': description,
  62             'thumbnail': thumbnail,
  63         }
  64
  65
  66 class OCWMITIE(InfoExtractor):
  67     IE_NAME = 'ocw.mit.edu'
  68     _VALID_URL = r'https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
  69     _BASE_URL = 'http://ocw.mit.edu/'
  70
  71     _TESTS = [
  72         {
  73             'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
  74             'info_dict': {
  75                 'id': 'EObHWIEKGjA',
  76                 'ext': 'webm',
  77                 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
  78                 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
  79                 'upload_date': '20121109',
  80                 'uploader_id': 'MIT',
  81                 'uploader': 'MIT OpenCourseWare',
  82             },
  83         },
  84         {
  85             'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
  86             'info_dict': {
  87                 'id': '7K1sB05pE0A',
  88                 'ext': 'mp4',
  89                 'title': 'Session 1: Introduction to Derivatives',
  90                 'upload_date': '20090818',
  91                 'uploader_id': 'MIT',
  92                 'uploader': 'MIT OpenCourseWare',
  93                 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
  94             },
  95         },
  96     ]
  97
  98     def _real_extract(self, url):
  99         mobj = self._match_valid_url(url)
 100         topic = mobj.group('topic')
 101
 102         webpage = self._download_webpage(url, topic)
 103         title = self._html_search_meta('WT.cg_s', webpage)
 104         description = self._html_search_meta('Description', webpage)
 105
 106         # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
 107         embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
 108         if embed_chapter_media:
 109             metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
 110             metadata = re.split(r', ?', metadata)
 111             yt = metadata[1]
 112         else:
 113             # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
 114             embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
 115             if embed_media:
 116                 metadata = re.sub(r'[\'"]', '', embed_media.group(1))
 117                 metadata = re.split(r', ?', metadata)
 118                 yt = metadata[1]
 119             else:
 120                 raise ExtractorError('Unable to find embedded YouTube video.')
 121         video_id = YoutubeIE.extract_id(yt)
 122
 123         return {
 124             '_type': 'url_transparent',
 125             'id': video_id,
 126             'title': title,
 127             'description': description,
 128             'url': yt,
 129             'ie_key': 'Youtube',
 130         }