3 from .common
import InfoExtractor
17 def _parse_japanese_date(text
):
27 ERA_RE
= '|'.join(map(re
.escape
, ERA_TABLE
.keys()))
28 mobj
= re
.search(rf
'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re
.sub(r
'[\s\u3000]+', '', text
))
31 era
, year
, month
, day
= mobj
.groups()
32 year
, month
, day
= map(int, (year
, month
, day
))
34 # example input: 令和5年3月34日
35 # even though each era have their end, don't check here
36 year
+= ERA_TABLE
[era
]
37 return '%04d%02d%02d' % (year
, month
, day
)
40 def _parse_japanese_duration(text
):
41 mobj
= re
.search(r
'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re
.sub(r
'[\s\u3000]+', '', text
or ''))
44 days
, hours
, mins
, secs
= (int_or_none(x
, default
=0) for x
in mobj
.groups())
45 return secs
+ mins
* 60 + hours
* 60 * 60 + days
* 24 * 60 * 60
48 class ShugiinItvBaseIE(InfoExtractor
):
52 def _find_rooms(cls
, webpage
):
56 'title': clean_html(x
.group(2)).strip(),
57 'url': smuggle_url(f
'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x
.groups()}),
58 'ie_key': ShugiinItvLiveIE
.ie_key(),
59 } for x
in re
.finditer(r
'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage
)]
def _fetch_rooms(self):
    """Return the room entries, downloading the index page only when none are stored yet.

    The parsed result of ``_find_rooms`` is stored on ``ShugiinItvBaseIE``
    itself (not on the instance), so later calls reuse it.
    """
    known_rooms = self._INDEX_ROOMS
    if known_rooms:
        return known_rooms
    # The index page is decoded as EUC-JP.
    index_page = self._download_webpage(
        'https://www.shugiintv.go.jp/jp/index.php', None,
        encoding='euc-jp', note='Downloading proceedings info')
    ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(index_page)
    return self._INDEX_ROOMS
70 class ShugiinItvLiveIE(ShugiinItvBaseIE
):
71 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
72 IE_DESC
= '衆議院インターネット審議中継'
75 'url': 'https://www.shugiintv.go.jp/jp/index.php',
78 'title': 'All proceedings for today',
# expect at least one proceeding to be running
81 'playlist_mincount': 1,
def suitable(cls, url):
    """Tell whether this extractor, rather than a more specific sibling, should take ``url``.

    The parent's ``suitable`` check must accept the URL, and neither
    ``ShugiinItvLiveRoomIE`` nor ``ShugiinItvVodIE`` may claim it.
    """
    accepted = super().suitable(url)
    if not accepted:
        # Pass the parent's own falsy result through untouched; the sibling
        # extractors are not consulted in this case.
        return accepted
    more_specific = (ShugiinItvLiveRoomIE, ShugiinItvVodIE)
    return not any(ie.suitable(url) for ie in more_specific)
88 def _real_extract(self
, url
):
90 'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
91 return self
.playlist_result(self
._fetch
_rooms
(), playlist_title
='All proceedings for today')
94 class ShugiinItvLiveRoomIE(ShugiinItvBaseIE
):
95 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
96 IE_DESC
= '衆議院インターネット審議中継 (中継)'
99 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
104 'skip': 'this runs for a time and not every day',
106 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
111 'skip': 'this runs for a time and not every day',
114 def _real_extract(self
, url
):
115 url
, smug
= unsmuggle_url(url
, default
={})
117 room_id
, title
= smug
['g']
119 room_id
= self
._match
_id
(url
)
120 title
= traverse_obj(self
._fetch
_rooms
(), (lambda k
, v
: v
['id'] == room_id
, 'title'), get_all
=False)
122 formats
, subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
123 f
'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
130 'subtitles': subtitles
,
135 class ShugiinItvVodIE(ShugiinItvBaseIE
):
136 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
137 IE_DESC
= '衆議院インターネット審議中継 (ビデオライブラリ)'
139 'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
142 'title': 'ウクライナ大統領国会演説(オンライン)',
143 'release_date': '20220323',
144 'chapters': 'count:4',
147 'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
148 'only_matching': True,
151 def _real_extract(self
, url
):
152 video_id
= self
._match
_id
(url
)
153 webpage
= self
._download
_webpage
(
154 f
'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id
,
157 m3u8_url
= self
._search
_regex
(
158 r
'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage
, 'm3u8 url')
159 m3u8_url
= re
.sub(r
'^http://', 'https://', m3u8_url
)
160 formats
, subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
161 m3u8_url
, video_id
, ext
='mp4')
163 title
= self
._html
_search
_regex
(
164 (r
'<td\s+align="left">(.+)\s*\(\d+分\)',
165 r
'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage
, 'title', fatal
=False)
167 release_date
= _parse_japanese_date(self
._html
_search
_regex
(
168 r
'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
169 webpage
, 'title', fatal
=False))
172 for chp
in re
.finditer(r
'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage
):
174 'title': clean_html(chp
.group(2)).strip(),
175 'start_time': try_call(lambda: float(parse_qs(chp
.group(1))['time'][0].strip())),
177 # NOTE: there are blanks at the first and the end of the videos,
178 # so getting/providing the video duration is not possible
179 # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
180 last_tr
= re
.findall(r
'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage
)[-1]
181 if last_tr
and chapters
:
182 last_td
= re
.findall(r
'<TD.+?</TD>', last_tr
)[-1]
184 chapters
[-1]['end_time'] = chapters
[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td
))
189 'release_date': release_date
,
190 'chapters': chapters
,
192 'subtitles': subtitles
,
class SangiinInstructionIE(InfoExtractor):
    # Matches the Sangiin web TV index page and does nothing but raise an
    # error that tells the user which link to pass instead.
    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        """Always raise an ``ExtractorError`` (flagged ``expected``) with usage instructions."""
        instructions = (
            'Copy the link from the button below the video description/player '
            'and use that link to download. If there is no button in the frame, '
            'get the URL of the frame showing the video.')
        raise ExtractorError(instructions, expected=True)
207 class SangiinIE(InfoExtractor
):
208 _VALID_URL
= r
'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
209 IE_DESC
= '参議院インターネット審議中継 (archive)'
212 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
215 'title': '2022年10月7日 本会議',
216 'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
217 'upload_date': '20221007',
221 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
224 'title': '2022年10月3日 開会式',
225 'upload_date': '20221003',
229 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
232 'title': '2022年10月27日 法務委員会',
233 'upload_date': '20221027',
237 'skip': 'this live is turned into archive after it ends',
240 def _real_extract(self
, url
):
241 video_id
= self
._match
_id
(url
)
242 webpage
= self
._download
_webpage
(url
, video_id
)
244 date
= self
._html
_search
_regex
(
245 r
'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage
,
247 upload_date
= _parse_japanese_date(date
)
249 title
= self
._html
_search
_regex
(
250 r
'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage
,
# some videos lack this element, so treat it as optional
254 description
= self
._html
_search
_regex
(
255 r
'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage
,
256 'description', default
=None)
# this row appears only when the video is a livestream
259 is_live
= bool(self
._html
_search
_regex
(
260 r
'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage
,
261 'is_live', default
=None))
263 m3u8_url
= self
._search
_regex
(
264 r
'var\s+videopath\s*=\s*(["\'])([^
"\']+)\1', webpage,
267 formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
271 'title': join_nonempty(date, title, delim=' '),
272 'description': description,
273 'upload_date': upload_date,