yt_dlp/extractor/movingimage.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     parse_duration,
   4     unescapeHTML,
   5 )
   6
   7
   8 class MovingImageIE(InfoExtractor):
   9     _VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P<id>\d+)'
  10     _TEST = {
  11         'url': 'http://movingimage.nls.uk/film/3561',
  12         'md5': '4caa05c2b38453e6f862197571a7be2f',
  13         'info_dict': {
  14             'id': '3561',
  15             'ext': 'mp4',
  16             'title': 'SHETLAND WOOL',
  17             'description': 'md5:c5afca6871ad59b4271e7704fe50ab04',
  18             'duration': 900,
  19             'thumbnail': r're:^https?://.*\.jpg$',
  20         },
  21     }
  22
  23     def _real_extract(self, url):
  24         video_id = self._match_id(url)
  25
  26         webpage = self._download_webpage(url, video_id)
  27
  28         formats = self._extract_m3u8_formats(
  29             self._html_search_regex(r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL'),
  30             video_id, ext='mp4', entry_protocol='m3u8_native')
  31
  32         def search_field(field_name, fatal=False):
  33             return self._search_regex(
  34                 rf'<span\s+class="field_title">{field_name}:</span>\s*<span\s+class="field_content">([^<]+)</span>',
  35                 webpage, 'title', fatal=fatal)
  36
  37         title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]')
  38         description = unescapeHTML(search_field('Description'))
  39         duration = parse_duration(search_field('Running time'))
  40         thumbnail = self._search_regex(
  41             r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
  42
  43         return {
  44             'id': video_id,
  45             'formats': formats,
  46             'title': title,
  47             'description': description,
  48             'duration': duration,
  49             'thumbnail': thumbnail,
  50         }