yt_dlp/extractor/japandiet.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     clean_html,
   7     int_or_none,
   8     join_nonempty,
   9     parse_qs,
  10     smuggle_url,
  11     traverse_obj,
  12     try_call,
  13     unsmuggle_url,
  14 )
  15
  16
  17 def _parse_japanese_date(text):
  18     if not text:
  19         return None
  20     ERA_TABLE = {
  21         '明治': 1868,
  22         '大正': 1912,
  23         '昭和': 1926,
  24         '平成': 1989,
  25         '令和': 2019,
  26     }
  27     ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys()))
  28     mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
  29     if not mobj:
  30         return None
  31     era, year, month, day = mobj.groups()
  32     year, month, day = map(int, (year, month, day))
  33     if era:
  34         # example input: 令和5年3月34日
  35         # even though each era have their end, don't check here
  36         year += ERA_TABLE[era]
  37     return '%04d%02d%02d' % (year, month, day)
  38
  39
  40 def _parse_japanese_duration(text):
  41     mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
  42     if not mobj:
  43         return
  44     days, hours, mins, secs = (int_or_none(x, default=0) for x in mobj.groups())
  45     return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
  46
  47
  48 class ShugiinItvBaseIE(InfoExtractor):
  49     _INDEX_ROOMS = None
  50
  51     @classmethod
  52     def _find_rooms(cls, webpage):
  53         return [{
  54             '_type': 'url',
  55             'id': x.group(1),
  56             'title': clean_html(x.group(2)).strip(),
  57             'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}),
  58             'ie_key': ShugiinItvLiveIE.ie_key(),
  59         } for x in re.finditer(r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage)]
  60
  61     def _fetch_rooms(self):
  62         if not self._INDEX_ROOMS:
  63             webpage = self._download_webpage(
  64                 'https://www.shugiintv.go.jp/jp/index.php', None,
  65                 encoding='euc-jp', note='Downloading proceedings info')
  66             ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
  67         return self._INDEX_ROOMS
  68
  69
  70 class ShugiinItvLiveIE(ShugiinItvBaseIE):
  71     _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
  72     IE_DESC = '衆議院インターネット審議中継'
  73
  74     _TESTS = [{
  75         'url': 'https://www.shugiintv.go.jp/jp/index.php',
  76         'info_dict': {
  77             '_type': 'playlist',
  78             'title': 'All proceedings for today',
  79         },
  80         # expect at least one proceedings is running
  81         'playlist_mincount': 1,
  82     }]
  83
  84     @classmethod
  85     def suitable(cls, url):
  86         return super().suitable(url) and not any(x.suitable(url) for x in (ShugiinItvLiveRoomIE, ShugiinItvVodIE))
  87
  88     def _real_extract(self, url):
  89         self.to_screen(
  90             'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
  91         return self.playlist_result(self._fetch_rooms(), playlist_title='All proceedings for today')
  92
  93
  94 class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
  95     _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
  96     IE_DESC = '衆議院インターネット審議中継 (中継)'
  97
  98     _TESTS = [{
  99         'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
 100         'info_dict': {
 101             'id': 'room01',
 102             'title': '内閣委員会',
 103         },
 104         'skip': 'this runs for a time and not every day',
 105     }, {
 106         'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
 107         'info_dict': {
 108             'id': 'room11',
 109             'title': '外務委員会',
 110         },
 111         'skip': 'this runs for a time and not every day',
 112     }]
 113
 114     def _real_extract(self, url):
 115         url, smug = unsmuggle_url(url, default={})
 116         if smug.get('g'):
 117             room_id, title = smug['g']
 118         else:
 119             room_id = self._match_id(url)
 120             title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)
 121
 122         formats, subtitles = self._extract_m3u8_formats_and_subtitles(
 123             f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
 124             room_id, ext='mp4')
 125
 126         return {
 127             'id': room_id,
 128             'title': title,
 129             'formats': formats,
 130             'subtitles': subtitles,
 131             'is_live': True,
 132         }
 133
 134
 135 class ShugiinItvVodIE(ShugiinItvBaseIE):
 136     _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
 137     IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
 138     _TESTS = [{
 139         'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
 140         'info_dict': {
 141             'id': '53846',
 142             'title': 'ウクライナ大統領国会演説（オンライン）',
 143             'release_date': '20220323',
 144             'chapters': 'count:4',
 145         },
 146     }, {
 147         'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
 148         'only_matching': True,
 149     }]
 150
 151     def _real_extract(self, url):
 152         video_id = self._match_id(url)
 153         webpage = self._download_webpage(
 154             f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
 155             encoding='euc-jp')
 156
 157         m3u8_url = self._search_regex(
 158             r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
 159         m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
 160         formats, subtitles = self._extract_m3u8_formats_and_subtitles(
 161             m3u8_url, video_id, ext='mp4')
 162
 163         title = self._html_search_regex(
 164             (r'<td\s+align="left">(.+)\s*\(\d+分\)',
 165              r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)
 166
 167         release_date = _parse_japanese_date(self._html_search_regex(
 168             r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
 169             webpage, 'title', fatal=False))
 170
 171         chapters = []
 172         for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
 173             chapters.append({
 174                 'title': clean_html(chp.group(2)).strip(),
 175                 'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
 176             })
 177         # NOTE: there are blanks at the first and the end of the videos,
 178         # so getting/providing the video duration is not possible
 179         # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
 180         last_tr = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)[-1]
 181         if last_tr and chapters:
 182             last_td = re.findall(r'<TD.+?</TD>', last_tr)[-1]
 183             if last_td:
 184                 chapters[-1]['end_time'] = chapters[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td))
 185
 186         return {
 187             'id': video_id,
 188             'title': title,
 189             'release_date': release_date,
 190             'chapters': chapters,
 191             'formats': formats,
 192             'subtitles': subtitles,
 193         }
 194
 195
 196 class SangiinInstructionIE(InfoExtractor):
 197     _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
 198     IE_DESC = False  # this shouldn't be listed as a supported site
 199
 200     def _real_extract(self, url):
 201         raise ExtractorError(
 202             'Copy the link from the button below the video description/player '
 203             'and use that link to download. If there is no button in the frame, '
 204             'get the URL of the frame showing the video.', expected=True)
 205
 206
 207 class SangiinIE(InfoExtractor):
 208     _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
 209     IE_DESC = '参議院インターネット審議中継 (archive)'
 210
 211     _TESTS = [{
 212         'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
 213         'info_dict': {
 214             'id': '7052',
 215             'title': '2022年10月7日 本会議',
 216             'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
 217             'upload_date': '20221007',
 218             'ext': 'mp4',
 219         },
 220     }, {
 221         'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
 222         'info_dict': {
 223             'id': '7037',
 224             'title': '2022年10月3日 開会式',
 225             'upload_date': '20221003',
 226             'ext': 'mp4',
 227         },
 228     }, {
 229         'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
 230         'info_dict': {
 231             'id': '7076',
 232             'title': '2022年10月27日 法務委員会',
 233             'upload_date': '20221027',
 234             'ext': 'mp4',
 235             'is_live': True,
 236         },
 237         'skip': 'this live is turned into archive after it ends',
 238     }]
 239
 240     def _real_extract(self, url):
 241         video_id = self._match_id(url)
 242         webpage = self._download_webpage(url, video_id)
 243
 244         date = self._html_search_regex(
 245             r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
 246             'date', fatal=False)
 247         upload_date = _parse_japanese_date(date)
 248
 249         title = self._html_search_regex(
 250             r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
 251             'date', fatal=False)
 252
 253         # some videos don't have the elements, so assume it's missing
 254         description = self._html_search_regex(
 255             r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
 256             'description', default=None)
 257
 258         # this row appears only when it's livestream
 259         is_live = bool(self._html_search_regex(
 260             r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
 261             'is_live', default=None))
 262
 263         m3u8_url = self._search_regex(
 264             r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
 265             'm3u8 url', group=2)
 266
 267         formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
 268
 269         return {
 270             'id': video_id,
 271             'title': join_nonempty(date, title, delim=' '),
 272             'description': description,
 273             'upload_date': upload_date,
 274             'formats': formats,
 275             'subtitles': subs,
 276             'is_live': is_live,
 277         }