yt_dlp/extractor/sejmpl.py

   1 import datetime as dt
   2
   3 from .common import InfoExtractor
   4 from .redge import RedCDNLivxIE
   5 from ..utils import (
   6     clean_html,
   7     join_nonempty,
   8     js_to_json,
   9     strip_or_none,
  10     update_url_query,
  11 )
  12 from ..utils.traversal import traverse_obj
  13
  14
  15 def is_dst(date):
  16     last_march = dt.datetime(date.year, 3, 31)
  17     last_october = dt.datetime(date.year, 10, 31)
  18     last_sunday_march = last_march - dt.timedelta(days=last_march.isoweekday() % 7)
  19     last_sunday_october = last_october - dt.timedelta(days=last_october.isoweekday() % 7)
  20     return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
  21
  22
  23 def rfc3339_to_atende(date):
  24     date = dt.datetime.fromisoformat(date)
  25     date = date + dt.timedelta(hours=1 if is_dst(date) else 0)
  26     return int((date.timestamp() - 978307200) * 1000)
  27
  28
  29 class SejmIE(InfoExtractor):
  30     _VALID_URL = (
  31         r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
  32         r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
  33         r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
  34     )
  35     IE_NAME = 'sejm'
  36
  37     _TESTS = [{
  38         # multiple cameras, polish SL iterpreter
  39         'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
  40         'info_dict': {
  41             'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
  42             'title': '1. posiedzenie Sejmu X kadencji',
  43             'duration': 20145,
  44             'live_status': 'was_live',
  45             'location': 'Sala Posiedzeń',
  46         },
  47         'playlist': [{
  48             'info_dict': {
  49                 'id': 'ENC01-722340000000-722360145000',
  50                 'ext': 'mp4',
  51                 'duration': 20145,
  52                 'title': '1. posiedzenie Sejmu X kadencji - ENC01',
  53                 'live_status': 'was_live',
  54             },
  55         }, {
  56             'info_dict': {
  57                 'id': 'ENC30-722340000000-722360145000',
  58                 'ext': 'mp4',
  59                 'duration': 20145,
  60                 'title': '1. posiedzenie Sejmu X kadencji - ENC30',
  61                 'live_status': 'was_live',
  62             },
  63         }, {
  64             'info_dict': {
  65                 'id': 'ENC31-722340000000-722360145000',
  66                 'ext': 'mp4',
  67                 'duration': 20145,
  68                 'title': '1. posiedzenie Sejmu X kadencji - ENC31',
  69                 'live_status': 'was_live',
  70             },
  71         }, {
  72             'info_dict': {
  73                 'id': 'ENC32-722340000000-722360145000',
  74                 'ext': 'mp4',
  75                 'duration': 20145,
  76                 'title': '1. posiedzenie Sejmu X kadencji - ENC32',
  77                 'live_status': 'was_live',
  78             },
  79         }, {
  80             # sign lang interpreter
  81             'info_dict': {
  82                 'id': 'Migacz-ENC01-1-722340000000-722360145000',
  83                 'ext': 'mp4',
  84                 'duration': 20145,
  85                 'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
  86                 'live_status': 'was_live',
  87             },
  88         }],
  89     }, {
  90         'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
  91         'info_dict': {
  92             'id': '9377A9D65518E9A5C125808E002E9FF2',
  93             'title': 'Debata "Lepsza Polska: obywatelska"',
  94             'description': 'KP .Nowoczesna',
  95             'duration': 8770,
  96             'live_status': 'was_live',
  97             'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
  98         },
  99         'playlist': [{
 100             'info_dict': {
 101                 'id': 'ENC08-1-503831270000-503840040000',
 102                 'ext': 'mp4',
 103                 'duration': 8770,
 104                 'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
 105                 'live_status': 'was_live',
 106             },
 107         }],
 108     }, {
 109         # 7th term is very special, since it does not use redcdn livx
 110         'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
 111         'info_dict': {
 112             'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
 113             'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
 114             'description': 'SLD - Biuro Prasowe Klubu',
 115             'duration': 514,
 116             'location': 'sala 101/bud. C',
 117             'live_status': 'was_live',
 118         },
 119         'playlist': [{
 120             'info_dict': {
 121                 'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
 122                 'ext': 'mp4',
 123                 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
 124                 'duration': 514,
 125             },
 126         }],
 127     }, {
 128         'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
 129         'only_matching': True,
 130     }]
 131
 132     def _real_extract(self, url):
 133         term, video_id = self._match_valid_url(url).group('term', 'id')
 134         frame = self._download_webpage(
 135             f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
 136             video_id)
 137         # despite it says "transmisje_arch", it works for live streams too!
 138         data = self._download_json(
 139             f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
 140             video_id)
 141         params = data['params']
 142
 143         title = strip_or_none(data.get('title'))
 144
 145         if data.get('status') == 'VIDEO_ENDED':
 146             live_status = 'was_live'
 147         elif data.get('status') == 'VIDEO_PLAYING':
 148             live_status = 'is_live'
 149         else:
 150             live_status = None
 151             self.report_warning(f'unknown status: {data.get("status")}')
 152
 153         start_time = rfc3339_to_atende(params['start'])
 154         # current streams have a stop time of *expected* end of session, but actual times
 155         # can change during the transmission. setting a stop_time would artificially
 156         # end the stream at that time, while the session actually keeps going.
 157         if live_status == 'was_live':
 158             stop_time = rfc3339_to_atende(params['stop'])
 159             duration = (stop_time - start_time) // 1000
 160         else:
 161             stop_time, duration = None, None
 162
 163         entries = []
 164
 165         def add_entry(file, legacy_file=False):
 166             if not file:
 167                 return
 168             file = self._proto_relative_url(file)
 169             if not legacy_file:
 170                 file = update_url_query(file, {'startTime': start_time})
 171                 if stop_time is not None:
 172                     file = update_url_query(file, {'stopTime': stop_time})
 173                 stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
 174             common_info = {
 175                 'url': file,
 176                 'duration': duration,
 177             }
 178             if legacy_file:
 179                 entries.append({
 180                     **common_info,
 181                     'id': video_id,
 182                     'title': title,
 183                 })
 184             else:
 185                 entries.append({
 186                     **common_info,
 187                     '_type': 'url_transparent',
 188                     'ie_key': RedCDNLivxIE.ie_key(),
 189                     'id': stream_id,
 190                     'title': join_nonempty(title, stream_id, delim=' - '),
 191                 })
 192
 193         cameras = self._search_json(
 194             r'var\s+cameras\s*=', frame, 'camera list', video_id,
 195             contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
 196             fatal=False) or []
 197         for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
 198             if camera_file.get('flv'):
 199                 add_entry(camera_file['flv'])
 200             elif camera_file.get('mp4'):
 201                 # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
 202                 add_entry(camera_file['mp4'], legacy_file=True)
 203             else:
 204                 self.report_warning('Unknown camera stream type found')
 205
 206         if params.get('mig'):
 207             add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
 208
 209         return {
 210             '_type': 'playlist',
 211             'entries': entries,
 212             'id': video_id,
 213             'title': title,
 214             'description': clean_html(data.get('desc')) or None,
 215             'duration': duration,
 216             'live_status': live_status,
 217             'location': strip_or_none(data.get('location')),
 218         }