[ie/dropbox] Fix password-protected video extraction (#11636)
[yt-dlp3.git] / yt_dlp / extractor / japandiet.py
blob994da22ae0786734c8c07dae6ffa2010a3515abb
1 import re
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 clean_html,
7 int_or_none,
8 join_nonempty,
9 parse_qs,
10 smuggle_url,
11 traverse_obj,
12 try_call,
13 unsmuggle_url,
17 def _parse_japanese_date(text):
18 if not text:
19 return None
20 ERA_TABLE = {
21 '明治': 1868,
22 '大正': 1912,
23 '昭和': 1926,
24 '平成': 1989,
25 '令和': 2019,
27 ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys()))
28 mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
29 if not mobj:
30 return None
31 era, year, month, day = mobj.groups()
32 year, month, day = map(int, (year, month, day))
33 if era:
34 # example input: 令和5年3月34日
35 # even though each era have their end, don't check here
36 year += ERA_TABLE[era]
37 return '%04d%02d%02d' % (year, month, day)
40 def _parse_japanese_duration(text):
41 mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
42 if not mobj:
43 return
44 days, hours, mins, secs = (int_or_none(x, default=0) for x in mobj.groups())
45 return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
class ShugiinItvBaseIE(InfoExtractor):
    """Shared helpers for the shugiintv.go.jp extractors."""

    # Room entries scraped from the index page; stored on this base class
    # so that every subclass reuses the same list once it has been fetched
    _INDEX_ROOMS = None

    @classmethod
    def _find_rooms(cls, webpage):
        """Build one ``url``-type entry per live room linked from *webpage*.

        Each entry's URL has the regex groups (room id, raw title markup)
        smuggled into it under the key ``'g'``.
        """
        return [{
            '_type': 'url',
            'id': x.group(1),
            'title': clean_html(x.group(2)).strip(),
            'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}),
            # The entry URL carries a room_id, which ShugiinItvLiveIE.suitable()
            # rejects. ShugiinItvLiveRoomIE matches such URLs and is the
            # extractor that reads the smuggled 'g' data.
            'ie_key': ShugiinItvLiveRoomIE.ie_key(),
        } for x in re.finditer(r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage)]

    def _fetch_rooms(self):
        """Return the room entries, downloading the index page if none are stored yet."""
        if not self._INDEX_ROOMS:
            webpage = self._download_webpage(
                'https://www.shugiintv.go.jp/jp/index.php', None,
                encoding='euc-jp', note='Downloading proceedings info')
            ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
        return self._INDEX_ROOMS
class ShugiinItvLiveIE(ShugiinItvBaseIE):
    """Playlist of all proceedings listed on the shugiintv.go.jp index page."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
    IE_DESC = '衆議院インターネット審議中継'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php',
        'info_dict': {
            '_type': 'playlist',
            'title': 'All proceedings for today',
        },
        # expect at least one proceedings is running
        'playlist_mincount': 1,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if neither the room nor the VOD extractor claims it."""
        matched = super().suitable(url)
        if not matched:
            return matched
        for other_ie in (ShugiinItvLiveRoomIE, ShugiinItvVodIE):
            if other_ie.suitable(url):
                return False
        return True

    def _real_extract(self, url):
        """Return a playlist made of the room entries found on the index page."""
        self.to_screen(
            'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
        rooms = self._fetch_rooms()
        return self.playlist_result(rooms, playlist_title='All proceedings for today')
class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
    """Live stream of a single room on shugiintv.go.jp."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
    IE_DESC = '衆議院インターネット審議中継 (中継)'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
        'info_dict': {
            'id': 'room01',
            'title': '内閣委員会',
        },
        'skip': 'this runs for a time and not every day',
    }, {
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
        'info_dict': {
            'id': 'room11',
            'title': '外務委員会',
        },
        'skip': 'this runs for a time and not every day',
    }]

    def _real_extract(self, url):
        """Extract the HLS formats for one room.

        The room id and title come from the data smuggled into the URL
        under ``'g'`` when present; otherwise the id is taken from the URL
        and the title is looked up among the fetched room entries.
        """
        url, smuggled = unsmuggle_url(url, default={})
        room_groups = smuggled.get('g')
        if room_groups:
            room_id, title = room_groups
        else:
            room_id = self._match_id(url)
            rooms = self._fetch_rooms()
            title = traverse_obj(rooms, (lambda k, v: v['id'] == room_id, 'title'), get_all=False)

        playlist_url = f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8'
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            playlist_url, room_id, ext='mp4')

        info = {
            'id': room_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
        return info
class ShugiinItvVodIE(ShugiinItvBaseIE):
    """Video library (archived proceedings) on shugiintv.go.jp."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
    IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
        'info_dict': {
            'id': '53846',
            'title': 'ウクライナ大統領国会演説(オンライン)',
            'release_date': '20220323',
            'chapters': 'count:4',
        },
    }, {
        'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract formats, title, release date and chapters of an archived video.

        The Japanese ("jp") page is always the one downloaded, whichever
        language variant *url* points at.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
            encoding='euc-jp')

        m3u8_url = self._search_regex(
            r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
        # Request the playlist over https even if the page links to plain http
        m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, ext='mp4')

        title = self._html_search_regex(
            (r'<td\s+align="left">(.+)\s*\(\d+分\)',
             r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)

        release_date = _parse_japanese_date(self._html_search_regex(
            r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
            webpage, 'release date', fatal=False))

        chapters = []
        for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
            chapters.append({
                'title': clean_html(chp.group(2)).strip(),
                # None when the link has no usable "time" query parameter
                'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
            })
        # NOTE: there are blanks at the first and the end of the videos,
        # so getting/providing the video duration is not possible
        # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
        rows = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)
        cells = re.findall(r'<TD.+?</TD>', rows[-1]) if rows else []
        if chapters and cells:
            # The last cell of the last row is read as the length of the last chapter
            last_start = chapters[-1]['start_time']
            last_duration = _parse_japanese_duration(clean_html(cells[-1]))
            # Skip end_time rather than crash when either operand is unknown
            if last_start is not None and last_duration is not None:
                chapters[-1]['end_time'] = last_start + last_duration

        return {
            'id': video_id,
            'title': title,
            'release_date': release_date,
            'chapters': chapters,
            'formats': formats,
            'subtitles': subtitles,
        }
class SangiinInstructionIE(InfoExtractor):
    """Matches the webtv.sangiin.go.jp index page only to tell the user which link to use."""

    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying usage instructions."""
        instructions = (
            'Copy the link from the button below the video description/player '
            'and use that link to download. If there is no button in the frame, '
            'get the URL of the frame showing the video.')
        raise ExtractorError(instructions, expected=True)
class SangiinIE(InfoExtractor):
    """Video detail pages on webtv.sangiin.go.jp."""

    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
    IE_DESC = '参議院インターネット審議中継 (archive)'

    _TESTS = [{
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
        'info_dict': {
            'id': '7052',
            'title': '2022年10月7日 本会議',
            'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
            'upload_date': '20221007',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
        'info_dict': {
            'id': '7037',
            'title': '2022年10月3日 開会式',
            'upload_date': '20221003',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
        'info_dict': {
            'id': '7076',
            'title': '2022年10月27日 法務委員会',
            'upload_date': '20221027',
            'ext': 'mp4',
            'is_live': True,
        },
        'skip': 'this live is turned into archive after it ends',
    }]

    def _real_extract(self, url):
        """Extract formats and metadata from a detail page.

        The title is the date text and the meeting name joined by a space;
        whichever of the two is missing is left out.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        date = self._html_search_regex(
            r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'date', fatal=False)
        upload_date = _parse_japanese_date(date)

        title = self._html_search_regex(
            r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'title', fatal=False)

        # some videos don't have the elements, so assume it's missing
        description = self._html_search_regex(
            r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
            'description', default=None)

        # this row appears only when it's livestream
        is_live = bool(self._html_search_regex(
            r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'is_live', default=None))

        m3u8_url = self._search_regex(
            r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
            'm3u8 url', group=2)

        formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')

        return {
            'id': video_id,
            'title': join_nonempty(date, title, delim=' '),
            'description': description,
            'upload_date': upload_date,
            'formats': formats,
            'subtitles': subs,
            'is_live': is_live,
        }