Release 2024.12.23
[yt-dlp.git] / yt_dlp / extractor / sejmpl.py
blobeb433d2ac32186b2953614d0bca1fc2f912c89b9
1 import datetime as dt
3 from .common import InfoExtractor
4 from .redge import RedCDNLivxIE
5 from ..utils import (
6 clean_html,
7 join_nonempty,
8 js_to_json,
9 strip_or_none,
10 update_url_query,
12 from ..utils.traversal import traverse_obj
def is_dst(date):
    """Tell whether *date* falls inside the summer-time (DST) window of its year.

    The window runs from 02:00 on the last Sunday of March up to and including
    03:00 on the last Sunday of October, judged on the wall-clock fields of
    *date* (this presumably mirrors the EU/Polish summer-time schedule).

    @param date  a datetime.datetime, either naive or timezone-aware
    @returns     True if the wall-clock time lies within the window
    """
    # The bounds below are naive datetimes. Comparing them with an aware datetime
    # raises TypeError, so drop any tzinfo and compare wall-clock values only.
    wall_clock = date.replace(tzinfo=None)

    def last_sunday(month):
        # Only used for March and October, which both have 31 days.
        month_end = dt.datetime(date.year, month, 31)
        # isoweekday() is 7 for Sunday, so `% 7` gives the days elapsed since the last Sunday
        return month_end - dt.timedelta(days=month_end.isoweekday() % 7)

    dst_start = last_sunday(3).replace(hour=2)
    dst_end = last_sunday(10).replace(hour=3)
    return dst_start <= wall_clock <= dst_end
def rfc3339_to_atende(date):
    """Convert an ISO-formatted date string into an Atende timestamp.

    The result is the number of milliseconds elapsed since Unix time 978307200
    (2001-01-01T00:00:00 UTC), truncated to an integer.

    @param date  date string accepted by datetime.datetime.fromisoformat
    @returns     int, milliseconds relative to the Atende epoch
    """
    moment = dt.datetime.fromisoformat(date)
    # Dates inside the summer-time window are shifted forward by one hour
    if is_dst(moment):
        moment = moment + dt.timedelta(hours=1)
    # NOTE: for a naive datetime, timestamp() interprets the value in the system's local timezone
    unix_seconds = moment.timestamp()
    return int((unix_seconds - 978307200) * 1000)
class SejmIE(InfoExtractor):
    """Extractor for transmissions published on sejm.gov.pl and its redcdn embed frames."""

    # All patterns capture the parliamentary term number and the hexadecimal transmission id
    _VALID_URL = (
        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
        r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
    )
    IE_NAME = 'sejm'

    _TESTS = [{
        # multiple cameras, Polish sign language interpreter
        'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
        'info_dict': {
            'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
            'title': '1. posiedzenie Sejmu X kadencji',
            'duration': 20145,
            'live_status': 'was_live',
            'location': 'Sala Posiedzeń',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ENC01-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC01',
                'live_status': 'was_live',
            },
        }, {
            'info_dict': {
                'id': 'ENC30-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC30',
                'live_status': 'was_live',
            },
        }, {
            'info_dict': {
                'id': 'ENC31-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC31',
                'live_status': 'was_live',
            },
        }, {
            'info_dict': {
                'id': 'ENC32-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC32',
                'live_status': 'was_live',
            },
        }, {
            # sign language interpreter stream
            'info_dict': {
                'id': 'Migacz-ENC01-1-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
                'live_status': 'was_live',
            },
        }],
    }, {
        'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
        'info_dict': {
            'id': '9377A9D65518E9A5C125808E002E9FF2',
            'title': 'Debata "Lepsza Polska: obywatelska"',
            'description': 'KP .Nowoczesna',
            'duration': 8770,
            'live_status': 'was_live',
            'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ENC08-1-503831270000-503840040000',
                'ext': 'mp4',
                'duration': 8770,
                'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
                'live_status': 'was_live',
            },
        }],
    }, {
        # the 7th term is a special case: it does not use redcdn livx
        'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
        'info_dict': {
            'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
            'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
            'description': 'SLD - Biuro Prasowe Klubu',
            'duration': 514,
            'location': 'sala 101/bud. C',
            'live_status': 'was_live',
        },
        'playlist': [{
            'info_dict': {
                'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
                'ext': 'mp4',
                'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
                'duration': 514,
            },
        }],
    }, {
        'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist holding one entry per camera stream of the transmission."""
        term, video_id = self._match_valid_url(url).group('term', 'id')
        frame = self._download_webpage(
            f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
            video_id)
        # the endpoint is called "transmisje_arch", yet it serves live streams as well
        data = self._download_json(
            f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
            video_id)
        params = data['params']

        title = strip_or_none(data.get('title'))

        status = data.get('status')
        if status == 'VIDEO_ENDED':
            live_status = 'was_live'
        elif status == 'VIDEO_PLAYING':
            live_status = 'is_live'
        else:
            live_status = None
            self.report_warning(f'unknown status: {status}')

        start_time = rfc3339_to_atende(params['start'])
        # For ongoing streams the stop time is only the *expected* end of the session
        # and may shift while the transmission is running. Passing it along would cut
        # the stream at that moment even though the session continues, so it is only
        # used for transmissions that have already ended.
        stop_time = duration = None
        if live_status == 'was_live':
            stop_time = rfc3339_to_atende(params['stop'])
            duration = (stop_time - start_time) // 1000

        entries = []

        def add_entry(stream_url, legacy_file=False):
            # Append a playlist entry for the given stream; empty URLs are ignored
            if not stream_url:
                return
            stream_url = self._proto_relative_url(stream_url)

            if legacy_file:
                # direct file: used as-is under the transmission's own id and title
                entries.append({
                    'url': stream_url,
                    'duration': duration,
                    'id': video_id,
                    'title': title,
                })
                return

            # livx stream: restrict it to the transmission's time window and
            # delegate the actual extraction to the RedCDN livx extractor
            stream_url = update_url_query(stream_url, {'startTime': start_time})
            if stop_time is not None:
                stream_url = update_url_query(stream_url, {'stopTime': stop_time})
            stream_id = self._search_regex(
                r'/o2/sejm/([^/]+)/[^./]+\.livx', stream_url, 'stream id')
            entries.append({
                'url': stream_url,
                'duration': duration,
                '_type': 'url_transparent',
                'ie_key': RedCDNLivxIE.ie_key(),
                'id': stream_id,
                'title': join_nonempty(title, stream_id, delim=' - '),
            })

        cameras = self._search_json(
            r'var\s+cameras\s*=', frame, 'camera list', video_id,
            contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
            fatal=False) or []
        for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
            livx_url, legacy_url = camera_file.get('flv'), camera_file.get('mp4')
            if livx_url:
                add_entry(livx_url)
            elif legacy_url:
                # only occurs in the 7th term: no streams before it, redcdn livx from the 8th on
                add_entry(legacy_url, legacy_file=True)
            else:
                self.report_warning('Unknown camera stream type found')

        if params.get('mig'):
            sli_url = self._search_regex(
                r"var sliUrl\s*=\s*'([^']+)'", frame,
                'sign language interpreter url', fatal=False)
            add_entry(sli_url)

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': video_id,
            'title': title,
            'description': clean_html(data.get('desc')) or None,
            'duration': duration,
            'live_status': live_status,
            'location': strip_or_none(data.get('location')),
        }