yt_dlp/extractor/tvn24.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     NO_DEFAULT,
   4     int_or_none,
   5     unescapeHTML,
   6 )
   7
   8
   9 class TVN24IE(InfoExtractor):
  10     _WORKING = False
  11     _VALID_URL = r'https?://(?:(?!eurosport)[^/]+\.)?tvn24(?:bis)?\.pl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
  12     _TESTS = [{
  13         'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html',
  14         'md5': 'fbdec753d7bc29d96036808275f2130c',
  15         'info_dict': {
  16             'id': '1584444',
  17             'ext': 'mp4',
  18             'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"',
  19             'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości Szkła kontaktowego.',
  20             'thumbnail': 're:https?://.*[.]jpeg',
  21         },
  22     }, {
  23         # different layout
  24         'url': 'https://tvnmeteo.tvn24.pl/magazyny/maja-w-ogrodzie,13/odcinki-online,1,4,1,0/pnacza-ptaki-i-iglaki-odc-691-hgtv-odc-29,1771763.html',
  25         'info_dict': {
  26             'id': '1771763',
  27             'ext': 'mp4',
  28             'title': 'Pnącza, ptaki i iglaki (odc. 691 /HGTV odc. 29)',
  29             'thumbnail': 're:https?://.*',
  30         },
  31         'params': {
  32             'skip_download': True,
  33         },
  34     }, {
  35         'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html',
  36         'only_matching': True,
  37     }, {
  38         'url': 'http://sport.tvn24.pl/pilka-nozna,105/ligue-1-kamil-glik-rozcial-glowe-monaco-tylko-remisuje-z-bastia,716522.html',
  39         'only_matching': True,
  40     }, {
  41         'url': 'http://tvn24bis.pl/poranek,146,m/gen-koziej-w-tvn24-bis-wracamy-do-czasow-zimnej-wojny,715660.html',
  42         'only_matching': True,
  43     }, {
  44         'url': 'https://www.tvn24.pl/magazyn-tvn24/angie-w-jednej-czwartej-polka-od-szarej-myszki-do-cesarzowej-europy,119,2158',
  45         'only_matching': True,
  46     }]
  47
  48     def _real_extract(self, url):
  49         display_id = self._match_id(url)
  50
  51         webpage = self._download_webpage(url, display_id)
  52
  53         title = self._og_search_title(
  54             webpage, default=None) or self._search_regex(
  55             r'<h\d+[^>]+class=["\']magazineItemHeader[^>]+>(.+?)</h',
  56             webpage, 'title')
  57
  58         def extract_json(attr, name, default=NO_DEFAULT, fatal=True):
  59             return self._parse_json(
  60                 self._search_regex(
  61                     rf'\b{attr}=(["\'])(?P<json>(?!\1).+?)\1', webpage,
  62                     name, group='json', default=default, fatal=fatal) or '{}',
  63                 display_id, transform_source=unescapeHTML, fatal=fatal)
  64
  65         quality_data = extract_json('data-quality', 'formats')
  66
  67         formats = []
  68         for format_id, url in quality_data.items():
  69             formats.append({
  70                 'url': url,
  71                 'format_id': format_id,
  72                 'height': int_or_none(format_id.rstrip('p')),
  73             })
  74
  75         description = self._og_search_description(webpage, default=None)
  76         thumbnail = self._og_search_thumbnail(
  77             webpage, default=None) or self._html_search_regex(
  78             r'\bdata-poster=(["\'])(?P<url>(?!\1).+?)\1', webpage,
  79             'thumbnail', group='url')
  80
  81         video_id = None
  82
  83         share_params = extract_json(
  84             'data-share-params', 'share params', default=None)
  85         if isinstance(share_params, dict):
  86             video_id = share_params.get('id')
  87
  88         if not video_id:
  89             video_id = self._search_regex(
  90                 r'data-vid-id=["\'](\d+)', webpage, 'video id',
  91                 default=None) or self._search_regex(
  92                 r',(\d+)\.html', url, 'video id', default=display_id)
  93
  94         return {
  95             'id': video_id,
  96             'title': title,
  97             'description': description,
  98             'thumbnail': thumbnail,
  99             'formats': formats,
 100         }