yt_dlp/extractor/rtp.py

   1 import base64
   2 import json
   3 import re
   4 import urllib.parse
   5
   6 from .common import InfoExtractor
   7 from ..utils import js_to_json
   8
   9
  10 class RTPIE(InfoExtractor):
  11     _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)'
  12     _TESTS = [{
  13         'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
  14         'md5': 'e736ce0c665e459ddb818546220b4ef8',
  15         'info_dict': {
  16             'id': 'e174042',
  17             'ext': 'mp3',
  18             'title': 'Paixões Cruzadas',
  19             'description': 'As paixões musicais de António Cartaxo e António Macedo',
  20             'thumbnail': r're:^https?://.*\.jpg',
  21         },
  22     }, {
  23         'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril',
  24         'md5': '9a81ed53f2b2197cfa7ed455b12f8ade',
  25         'info_dict': {
  26             'id': 'e757904',
  27             'ext': 'mp4',
  28             'title': '25 Curiosidades, 25 de Abril',
  29             'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr',
  30             'thumbnail': r're:^https?://.*\.jpg',
  31         },
  32     }, {
  33         'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
  34         'only_matching': True,
  35     }, {
  36         'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano',
  37         'only_matching': True,
  38     }, {
  39         'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon',
  40         'only_matching': True,
  41     }]
  42
  43     _RX_OBFUSCATION = re.compile(r'''(?xs)
  44         atob\s*\(\s*decodeURIComponent\s*\(\s*
  45             (\[[0-9A-Za-z%,'"]*\])
  46         \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
  47     ''')
  48
  49     def __unobfuscate(self, data, *, video_id):
  50         if data.startswith('{'):
  51             data = self._RX_OBFUSCATION.sub(
  52                 lambda m: json.dumps(
  53                     base64.b64decode(urllib.parse.unquote(
  54                         ''.join(self._parse_json(m.group(1), video_id)),
  55                     )).decode('iso-8859-1')),
  56                 data)
  57         return js_to_json(data)
  58
  59     def _real_extract(self, url):
  60         video_id = self._match_id(url)
  61
  62         webpage = self._download_webpage(url, video_id)
  63         title = self._html_search_meta(
  64             'twitter:title', webpage, display_name='title', fatal=True)
  65
  66         f, config = self._search_regex(
  67             r'''(?sx)
  68                 (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)?
  69                 var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
  70             ''', webpage,
  71             'player config', group=('f', 'config'))
  72
  73         config = self._parse_json(
  74             config, video_id,
  75             lambda data: self.__unobfuscate(data, video_id=video_id))
  76         f = config['file'] if not f else self._parse_json(
  77             f, video_id,
  78             lambda data: self.__unobfuscate(data, video_id=video_id))
  79
  80         formats = []
  81         if isinstance(f, dict):
  82             f_hls = f.get('hls')
  83             if f_hls is not None:
  84                 formats.extend(self._extract_m3u8_formats(
  85                     f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
  86
  87             f_dash = f.get('dash')
  88             if f_dash is not None:
  89                 formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
  90         else:
  91             formats.append({
  92                 'format_id': 'f',
  93                 'url': f,
  94                 'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
  95             })
  96
  97         subtitles = {}
  98
  99         vtt = config.get('vtt')
 100         if vtt is not None:
 101             for lcode, lname, url in vtt:
 102                 subtitles.setdefault(lcode, []).append({
 103                     'name': lname,
 104                     'url': url,
 105                 })
 106
 107         return {
 108             'id': video_id,
 109             'title': title,
 110             'formats': formats,
 111             'description': self._html_search_meta(['description', 'twitter:description'], webpage),
 112             'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
 113             'subtitles': subtitles,
 114         }