[cleanup] Misc (#8968)
[yt-dlp.git] / yt_dlp / extractor / japandiet.py
blob6c650568acda79b96cc1f02fad5a430f2903b1c3
1 import re
3 from ..utils import (
4 ExtractorError,
5 clean_html,
6 int_or_none,
7 join_nonempty,
8 parse_qs,
9 smuggle_url,
10 traverse_obj,
11 try_call,
12 unsmuggle_url
14 from .common import InfoExtractor
17 def _parse_japanese_date(text):
18 if not text:
19 return None
20 ERA_TABLE = {
21 '明治': 1868,
22 '大正': 1912,
23 '昭和': 1926,
24 '平成': 1989,
25 '令和': 2019,
27 ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys()))
28 mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
29 if not mobj:
30 return None
31 era, year, month, day = mobj.groups()
32 year, month, day = map(int, (year, month, day))
33 if era:
34 # example input: 令和5年3月34日
35 # even though each era have their end, don't check here
36 year += ERA_TABLE[era]
37 return '%04d%02d%02d' % (year, month, day)
40 def _parse_japanese_duration(text):
41 mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
42 if not mobj:
43 return
44 days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()]
45 return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
class ShugiinItvBaseIE(InfoExtractor):
    # Shared helpers for the shugiintv.go.jp extractors.

    # Room entries parsed from the index page; stored on this base class so
    # that every subclass reuses the same download
    _INDEX_ROOMS = None

    @classmethod
    def _find_rooms(cls, webpage):
        """Build one `url` entry per live room linked from the index page.

        The room id and the cleaned title are smuggled into the entry URL
        under the key 'g', so that the room extractor does not need to look
        them up again.
        """
        rooms = []
        for mobj in re.finditer(
                r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>',
                webpage):
            room_id = mobj.group(1)
            # The captured cell may contain markup and line breaks; use the
            # cleaned text both for the entry and for the smuggled copy
            title = clean_html(mobj.group(2)).strip()
            rooms.append({
                '_type': 'url',
                'id': room_id,
                'title': title,
                'url': smuggle_url(
                    f'https://www.shugiintv.go.jp/jp/index.php?room_id={room_id}',
                    {'g': [room_id, title]}),
                'ie_key': ShugiinItvLiveIE.ie_key(),
            })
        return rooms

    def _fetch_rooms(self):
        """Return the room entries, downloading the index page if none are cached."""
        if not self._INDEX_ROOMS:
            webpage = self._download_webpage(
                'https://www.shugiintv.go.jp/jp/index.php', None,
                encoding='euc-jp', note='Downloading proceedings info')
            ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
        return self._INDEX_ROOMS
class ShugiinItvLiveIE(ShugiinItvBaseIE):
    # Index page: yields a playlist of every proceeding found on it
    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
    IE_DESC = '衆議院インターネット審議中継'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php',
        'info_dict': {
            '_type': 'playlist',
            'title': 'All proceedings for today',
        },
        # expect at least one proceedings is running
        'playlist_mincount': 1,
    }]

    @classmethod
    def suitable(cls, url):
        # Only claim the URL when neither of the more specific extractors does
        matched = super().suitable(url)
        if not matched:
            return matched
        more_specific = (ShugiinItvLiveRoomIE, ShugiinItvVodIE)
        return not any(ie.suitable(url) for ie in more_specific)

    def _real_extract(self, url):
        self.to_screen(
            'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
        rooms = self._fetch_rooms()
        return self.playlist_result(rooms, playlist_title='All proceedings for today')
class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
    # Live stream of a single room, addressed by its room_id
    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
    IE_DESC = '衆議院インターネット審議中継 (中継)'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
        'info_dict': {
            'id': 'room01',
            'title': '内閣委員会',
        },
        'skip': 'this runs for a time and not every day',
    }, {
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
        'info_dict': {
            'id': 'room11',
            'title': '外務委員会',
        },
        'skip': 'this runs for a time and not every day',
    }]

    def _real_extract(self, url):
        url, smuggled = unsmuggle_url(url, default={})
        room_info = smuggled.get('g')
        if not room_info:
            # Direct link: take the id from the URL and look the title up
            # among the rooms listed on the index page
            room_id = self._match_id(url)
            title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)
        else:
            # Reached through the playlist: id and title were smuggled in
            room_id, title = room_info

        playlist_url = f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8'
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            playlist_url, room_id, ext='mp4')

        return {
            'id': room_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
class ShugiinItvVodIE(ShugiinItvBaseIE):
    # Video library (archived proceedings), addressed by deli_id
    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
    IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
        'info_dict': {
            'id': '53846',
            'title': 'ウクライナ大統領国会演説(オンライン)',
            'release_date': '20220323',
            'chapters': 'count:4',
        },
    }, {
        'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
        'only_matching': True
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # The Japanese page is always requested, whichever language the
        # input URL used
        webpage = self._download_webpage(
            f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
            encoding='euc-jp')

        m3u8_url = self._search_regex(
            r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
        m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, ext='mp4')

        title = self._html_search_regex(
            (r'<td\s+align="left">(.+)\s*\(\d+分\)',
             r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)

        release_date = _parse_japanese_date(self._html_search_regex(
            r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
            webpage, 'release date', fatal=False))

        chapters = [{
            'title': clean_html(chp.group(2)).strip(),
            # None when the link carries no usable `time` parameter
            'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
        } for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage)]

        # NOTE: there are blanks at the first and the end of the videos,
        # so getting/providing the video duration is not possible
        # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
        # This is optional metadata, so a page without the expected table
        # must not abort the extraction
        rows = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)
        cells = re.findall(r'<TD.+?</TD>', rows[-1]) if rows else []
        if cells and chapters:
            last_start = chapters[-1]['start_time']
            last_length = _parse_japanese_duration(clean_html(cells[-1]))
            if last_start is not None and last_length is not None:
                chapters[-1]['end_time'] = last_start + last_length

        return {
            'id': video_id,
            'title': title,
            'release_date': release_date,
            'chapters': chapters,
            'formats': formats,
            'subtitles': subtitles,
        }
class SangiinInstructionIE(InfoExtractor):
    # Matches the House of Councillors index page, which has no video of its
    # own, only to tell the user which link to pass instead
    _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        raise ExtractorError(
            'Copy the link from the button below the video description or player, and use the link to download. '
            'If there is no button in the frame, get the URL of the frame showing the video.', expected=True)
class SangiinIE(InfoExtractor):
    # Detail pages of the House of Councillors web TV
    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
    IE_DESC = '参議院インターネット審議中継 (archive)'

    _TESTS = [{
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
        'info_dict': {
            'id': '7052',
            'title': '2022年10月7日 本会議',
            'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
            'upload_date': '20221007',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
        'info_dict': {
            'id': '7037',
            'title': '2022年10月3日 開会式',
            'upload_date': '20221003',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
        'info_dict': {
            'id': '7076',
            'title': '2022年10月27日 法務委員会',
            'upload_date': '20221027',
            'ext': 'mp4',
            'is_live': True,
        },
        'skip': 'this live is turned into archive after it ends',
    }, ]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # 開会日: date of the sitting; kept verbatim for the title and
        # converted to YYYYMMDD for upload_date
        date = self._html_search_regex(
            r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'date', fatal=False)
        upload_date = _parse_japanese_date(date)

        # 会議名: name of the meeting
        title = self._html_search_regex(
            r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'title', fatal=False)

        # some videos don't have the elements, so assume it's missing
        description = self._html_search_regex(
            r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
            'description', default=None)

        # this row appears only when it's livestream
        is_live = bool(self._html_search_regex(
            r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'is_live', default=None))

        m3u8_url = self._search_regex(
            r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
            'm3u8 url', group=2)

        formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')

        return {
            'id': video_id,
            # Either part may be missing; join whatever was found
            'title': join_nonempty(date, title, delim=' '),
            'description': description,
            'upload_date': upload_date,
            'formats': formats,
            'subtitles': subs,
            'is_live': is_live,
        }