yt_dlp/extractor/massengeschmacktv.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     clean_html,
   6     determine_ext,
   7     int_or_none,
   8     js_to_json,
   9     mimetype2ext,
  10     parse_filesize,
  11 )
  12
  13
  14 class MassengeschmackTVIE(InfoExtractor):
  15     IE_NAME = 'massengeschmack.tv'
  16     _VALID_URL = r'https?://(?:www\.)?massengeschmack\.tv/play/(?P<id>[^?&#]+)'
  17
  18     _TEST = {
  19         'url': 'https://massengeschmack.tv/play/fktv202',
  20         'md5': '9996f314994a49fefe5f39aa1b07ae21',
  21         'info_dict': {
  22             'id': 'fktv202',
  23             'ext': 'mp4',
  24             'title': 'Fernsehkritik-TV #202',
  25             'thumbnail': 'https://cache.massengeschmack.tv/img/mag/fktv202.jpg',
  26         },
  27     }
  28
  29     def _real_extract(self, url):
  30         episode = self._match_id(url)
  31
  32         webpage = self._download_webpage(url, episode)
  33         sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json)
  34
  35         formats = []
  36         for source in sources:
  37             furl = source.get('src')
  38             if not furl:
  39                 continue
  40             furl = self._proto_relative_url(furl)
  41             ext = determine_ext(furl) or mimetype2ext(source.get('type'))
  42             if ext == 'm3u8':
  43                 formats.extend(self._extract_m3u8_formats(
  44                     furl, episode, 'mp4', 'm3u8_native',
  45                     m3u8_id='hls', fatal=False))
  46             else:
  47                 formats.append({
  48                     'url': furl,
  49                     'format_id': determine_ext(furl),
  50                 })
  51
  52         for (durl, format_id, width, height, filesize) in re.findall(r'''(?x)
  53                                    <a[^>]+?href="(?P<url>(?:https:)?//[^"]+)".*?
  54                                    <strong>(?P<format_id>.+?)</strong>.*?
  55                                    <small>(?:(?P<width>\d+)x(?P<height>\d+))?\s+?\((?P<filesize>[\d,]+\s*[GM]iB)\)</small>
  56                                 ''', webpage):
  57             formats.append({
  58                 'url': durl,
  59                 'format_id': format_id,
  60                 'width': int_or_none(width),
  61                 'height': int_or_none(height),
  62                 'filesize': parse_filesize(filesize),
  63                 'vcodec': 'none' if format_id.startswith('Audio') else None,
  64             })
  65
  66         return {
  67             'id': episode,
  68             'title': clean_html(self._html_search_regex(
  69                 r'<span[^>]+\bid=["\']clip-title["\'][^>]*>([^<]+)', webpage, 'title', fatal=False)),
  70             'formats': formats,
  71             'thumbnail': self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False),
  72         }