yt_dlp/extractor/rtp.py

   1 from .common import InfoExtractor
   2 from ..utils import js_to_json
   3 import re
   4 import json
   5 import urllib.parse
   6 import base64
   7
   8
   9 class RTPIE(InfoExtractor):
  10     _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
  11     _TESTS = [{
  12         'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
  13         'md5': 'e736ce0c665e459ddb818546220b4ef8',
  14         'info_dict': {
  15             'id': 'e174042',
  16             'ext': 'mp3',
  17             'title': 'Paixões Cruzadas',
  18             'description': 'As paixões musicais de António Cartaxo e António Macedo',
  19             'thumbnail': r're:^https?://.*\.jpg',
  20         },
  21     }, {
  22         'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
  23         'only_matching': True,
  24     }]
  25
  26     _RX_OBFUSCATION = re.compile(r'''(?xs)
  27         atob\s*\(\s*decodeURIComponent\s*\(\s*
  28             (\[[0-9A-Za-z%,'"]*\])
  29         \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
  30     ''')
  31
  32     def __unobfuscate(self, data, *, video_id):
  33         if data.startswith('{'):
  34             data = self._RX_OBFUSCATION.sub(
  35                 lambda m: json.dumps(
  36                     base64.b64decode(urllib.parse.unquote(
  37                         ''.join(self._parse_json(m.group(1), video_id))
  38                     )).decode('iso-8859-1')),
  39                 data)
  40         return js_to_json(data)
  41
  42     def _real_extract(self, url):
  43         video_id = self._match_id(url)
  44
  45         webpage = self._download_webpage(url, video_id)
  46         title = self._html_search_meta(
  47             'twitter:title', webpage, display_name='title', fatal=True)
  48
  49         f, config = self._search_regex(
  50             r'''(?sx)
  51                 var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
  52                 var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
  53             ''', webpage,
  54             'player config', group=('f', 'config'))
  55
  56         f = self._parse_json(
  57             f, video_id,
  58             lambda data: self.__unobfuscate(data, video_id=video_id))
  59         config = self._parse_json(
  60             config, video_id,
  61             lambda data: self.__unobfuscate(data, video_id=video_id))
  62
  63         formats = []
  64         if isinstance(f, dict):
  65             f_hls = f.get('hls')
  66             if f_hls is not None:
  67                 formats.extend(self._extract_m3u8_formats(
  68                     f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
  69
  70             f_dash = f.get('dash')
  71             if f_dash is not None:
  72                 formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
  73         else:
  74             formats.append({
  75                 'format_id': 'f',
  76                 'url': f,
  77                 'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
  78             })
  79
  80         subtitles = {}
  81
  82         vtt = config.get('vtt')
  83         if vtt is not None:
  84             for lcode, lname, url in vtt:
  85                 subtitles.setdefault(lcode, []).append({
  86                     'name': lname,
  87                     'url': url,
  88                 })
  89
  90         return {
  91             'id': video_id,
  92             'title': title,
  93             'formats': formats,
  94             'description': self._html_search_meta(['description', 'twitter:description'], webpage),
  95             'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
  96             'subtitles': subtitles,
  97         }