14 from .common
import InfoExtractor
def _parse_japanese_date(text):
    """Convert a Japanese-style date string into a ``YYYYMMDD`` string.

    The expected shape is ``[<era>]<year>年<month>月<day>日``. The era prefix
    is optional and must be one of the keys of ``ERA_TABLE``; when present,
    the value stored for that era is added to the parsed year. ASCII
    whitespace and ideographic spaces (U+3000) are stripped from the input
    before matching.

    @param text  The text to search for a date; may be None or empty.
    @returns     The date as an 8-digit string, or None when ``text`` is
                 empty/None or contains no date in the expected shape.
    """
    if not text:
        # Callers feed this from non-fatal regex searches, which can yield None
        return None

    era_re = '|'.join(map(re.escape, ERA_TABLE))
    mobj = re.search(
        rf'({era_re})?(\d+)年(\d+)月(\d+)日',
        re.sub(r'[\s\u3000]+', '', text))
    if not mobj:
        return None

    era = mobj.group(1)
    year, month, day = map(int, mobj.group(2, 3, 4))
    if era:
        # example input: 令和5年3月34日
        # Neither the end of an era nor the validity of the calendar day is
        # checked here; the numbers are passed through as written
        year += ERA_TABLE[era]
    return '%04d%02d%02d' % (year, month, day)
def _parse_japanese_duration(text):
    """Convert a Japanese duration such as ``1時間23分`` into seconds.

    Recognised components, in this order and each optional: days
    (``日`` / ``日間``), hours (``時`` / ``時間``), minutes (``分``) and
    seconds (``秒``). ASCII whitespace and ideographic spaces (U+3000) are
    stripped from the input before matching.

    @param text  The text to search for a duration; may be None or empty.
    @returns     The duration in seconds as an int; 0 when ``text`` is
                 empty/None or contains no duration.
    """
    compact = re.sub(r'[\s\u3000]+', '', text or '')
    pattern = r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?'
    # Every component is optional, so the pattern also matches the empty
    # string at any position. Skip those empty matches; otherwise a duration
    # preceded by any other text would never be reached
    mobj = next((m for m in re.finditer(pattern, compact) if m.group(0)), None)
    if not mobj:
        return 0

    # Unmatched groups are None; matched ones consist of digits only
    days, hours, mins, secs = (int(x) if x else 0 for x in mobj.groups())
    return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
48 class ShugiinItvBaseIE(InfoExtractor
):
52 def _find_rooms(cls
, webpage
):
56 'title': clean_html(x
.group(2)).strip(),
57 'url': smuggle_url(f
'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x
.groups()}),
58 'ie_key': ShugiinItvLiveIE
.ie_key(),
59 } for x
in re
.finditer(r
'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage
)]
def _fetch_rooms(self):
    """Return the room entries parsed from the index page.

    The result is stored on ``ShugiinItvBaseIE._INDEX_ROOMS``; the index page
    is downloaded (decoded as EUC-JP) and parsed with ``_find_rooms`` only
    while that stored value is still falsy.
    """
    cached_rooms = self._INDEX_ROOMS
    if cached_rooms:
        return cached_rooms

    index_page = self._download_webpage(
        'https://www.shugiintv.go.jp/jp/index.php', None,
        encoding='euc-jp', note='Downloading proceedings info')
    # Stored on the base class explicitly rather than on the instance
    ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(index_page)
    return self._INDEX_ROOMS
70 class ShugiinItvLiveIE(ShugiinItvBaseIE
):
71 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
72 IE_DESC
= '衆議院インターネット審議中継'
75 'url': 'https://www.shugiintv.go.jp/jp/index.php',
78 'title': 'All proceedings for today',
80 # expect at least one proceeding to be running
81 'playlist_mincount': 1,
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    The URL must be accepted by the parent implementation and must not be
    accepted by ``ShugiinItvLiveRoomIE`` or ``ShugiinItvVodIE``.
    """
    base_match = super().suitable(url)
    if not base_match:
        return base_match
    # Defer to the more specific extractors whenever one of them matches
    for specific_ie in (ShugiinItvLiveRoomIE, ShugiinItvVodIE):
        if specific_ie.suitable(url):
            return False
    return True
88 def _real_extract(self
, url
):
90 'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
91 return self
.playlist_result(self
._fetch
_rooms
(), playlist_title
='All proceedings for today')
94 class ShugiinItvLiveRoomIE(ShugiinItvBaseIE
):
95 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
96 IE_DESC
= '衆議院インターネット審議中継 (中継)'
99 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
104 'skip': 'this runs for a time and not every day',
106 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
111 'skip': 'this runs for a time and not every day',
114 def _real_extract(self
, url
):
115 url
, smug
= unsmuggle_url(url
, default
={})
117 room_id
, title
= smug
['g']
119 room_id
= self
._match
_id
(url
)
120 title
= traverse_obj(self
._fetch
_rooms
(), (lambda k
, v
: v
['id'] == room_id
, 'title'), get_all
=False)
122 formats
, subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
123 f
'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
130 'subtitles': subtitles
,
135 class ShugiinItvVodIE(ShugiinItvBaseIE
):
136 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
137 IE_DESC
= '衆議院インターネット審議中継 (ビデオライブラリ)'
139 'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
142 'title': 'ウクライナ大統領国会演説(オンライン)',
143 'release_date': '20220323',
144 'chapters': 'count:4',
147 'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
148 'only_matching': True
151 def _real_extract(self
, url
):
152 video_id
= self
._match
_id
(url
)
153 webpage
= self
._download
_webpage
(
154 f
'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id
,
157 m3u8_url
= self
._search
_regex
(
158 r
'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage
, 'm3u8 url')
159 m3u8_url
= re
.sub(r
'^http://', 'https://', m3u8_url
)
160 formats
, subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
161 m3u8_url
, video_id
, ext
='mp4')
163 title
= self
._html
_search
_regex
(
164 (r
'<td\s+align="left">(.+)\s*\(\d+分\)',
165 r
'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage
, 'title', fatal
=False)
167 release_date
= _parse_japanese_date(self
._html
_search
_regex
(
168 r
'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
169 webpage
, 'title', fatal
=False))
172 for chp
in re
.finditer(r
'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage
):
174 'title': clean_html(chp
.group(2)).strip(),
175 'start_time': try_call(lambda: float(parse_qs(chp
.group(1))['time'][0].strip())),
177 # NOTE: there are blanks at the first and the end of the videos,
178 # so getting/providing the video duration is not possible
179 # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
180 last_tr
= re
.findall(r
'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage
)[-1]
181 if last_tr
and chapters
:
182 last_td
= re
.findall(r
'<TD.+?</TD>', last_tr
)[-1]
184 chapters
[-1]['end_time'] = chapters
[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td
))
189 'release_date': release_date
,
190 'chapters': chapters
,
192 'subtitles': subtitles
,
class SangiinInstructionIE(InfoExtractor):
    """Matches the Sangiin web TV ``index.php`` URL and only emits guidance.

    Extraction always fails with an expected error that tells the user which
    link to pass instead.
    """
    _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        """Always raise an expected ``ExtractorError`` with usage instructions."""
        raise ExtractorError(
            'Copy the link from the button below the video description or player, '
            'and use the link to download. If there is no button in the frame, '
            'get the URL of the frame showing the video.', expected=True)
204 class SangiinIE(InfoExtractor
):
205 _VALID_URL
= r
'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
206 IE_DESC
= '参議院インターネット審議中継 (archive)'
209 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
212 'title': '2022年10月7日 本会議',
213 'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
214 'upload_date': '20221007',
218 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
221 'title': '2022年10月3日 開会式',
222 'upload_date': '20221003',
226 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
229 'title': '2022年10月27日 法務委員会',
230 'upload_date': '20221027',
234 'skip': 'this live is turned into archive after it ends',
237 def _real_extract(self
, url
):
238 video_id
= self
._match
_id
(url
)
239 webpage
= self
._download
_webpage
(url
, video_id
)
241 date
= self
._html
_search
_regex
(
242 r
'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage
,
244 upload_date
= _parse_japanese_date(date
)
246 title
= self
._html
_search
_regex
(
247 r
'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage
,
250 # some videos don't have the elements, so assume it's missing
251 description
= self
._html
_search
_regex
(
252 r
'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage
,
253 'description', default
=None)
255 # this row appears only when it's a livestream
256 is_live
= bool(self
._html
_search
_regex
(
257 r
'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage
,
258 'is_live', default
=None))
260 m3u8_url
= self
._search
_regex
(
261 r
'var\s+videopath\s*=\s*(["\'])([^
"\']+)\1', webpage,
264 formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
268 'title': join_nonempty(date, title, delim=' '),
269 'description': description,
270 'upload_date': upload_date,