yt_dlp/extractor/ixigua.py

   1 import base64
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     get_element_by_id,
   7     int_or_none,
   8     js_to_json,
   9     str_or_none,
  10     traverse_obj,
  11 )
  12
  13
  14 class IxiguaIE(InfoExtractor):
  15     _VALID_URL = r'https?://(?:\w+\.)?ixigua\.com/(?:video/)?(?P<id>\d+).+'
  16     _TESTS = [{
  17         'url': 'https://www.ixigua.com/6996881461559165471',
  18         'info_dict': {
  19             'id': '6996881461559165471',
  20             'ext': 'mp4',
  21             'title': '盲目涉水风险大，亲身示范高水位行车注意事项',
  22             'description': 'md5:8c82f46186299add4a1c455430740229',
  23             'tags': ['video_car'],
  24             'like_count': int,
  25             'dislike_count': int,
  26             'view_count': int,
  27             'uploader': '懂车帝原创',
  28             'uploader_id': '6480145787',
  29             'thumbnail': r're:^https?://.+\.(avif|webp)',
  30             'timestamp': 1629088414,
  31             'duration': 1030,
  32         }
  33     }]
  34
  35     def _get_json_data(self, webpage, video_id):
  36         js_data = get_element_by_id('SSR_HYDRATED_DATA', webpage)
  37         if not js_data:
  38             if self._cookies_passed:
  39                 raise ExtractorError('Failed to get SSR_HYDRATED_DATA')
  40             raise ExtractorError('Cookies (not necessarily logged in) are needed', expected=True)
  41
  42         return self._parse_json(
  43             js_data.replace('window._SSR_HYDRATED_DATA=', ''), video_id, transform_source=js_to_json)
  44
  45     def _media_selector(self, json_data):
  46         for path, override in (
  47             (('video_list', ), {}),
  48             (('dynamic_video', 'dynamic_video_list'), {'acodec': 'none'}),
  49             (('dynamic_video', 'dynamic_audio_list'), {'vcodec': 'none', 'ext': 'm4a'}),
  50         ):
  51             for media in traverse_obj(json_data, (..., *path, lambda _, v: v['main_url'])):
  52                 yield {
  53                     'url': base64.b64decode(media['main_url']).decode(),
  54                     'width': int_or_none(media.get('vwidth')),
  55                     'height': int_or_none(media.get('vheight')),
  56                     'fps': int_or_none(media.get('fps')),
  57                     'vcodec': media.get('codec_type'),
  58                     'format_id': str_or_none(media.get('quality_type')),
  59                     'filesize': int_or_none(media.get('size')),
  60                     'ext': 'mp4',
  61                     **override,
  62                 }
  63
  64     def _real_extract(self, url):
  65         video_id = self._match_id(url)
  66         webpage = self._download_webpage(url, video_id)
  67         json_data = self._get_json_data(webpage, video_id)['anyVideo']['gidInformation']['packerData']['video']
  68
  69         formats = list(self._media_selector(json_data.get('videoResource')))
  70         return {
  71             'id': video_id,
  72             'title': json_data.get('title'),
  73             'description': json_data.get('video_abstract'),
  74             'formats': formats,
  75             'like_count': json_data.get('video_like_count'),
  76             'duration': int_or_none(json_data.get('duration')),
  77             'tags': [json_data.get('tag')],
  78             'uploader_id': traverse_obj(json_data, ('user_info', 'user_id')),
  79             'uploader': traverse_obj(json_data, ('user_info', 'name')),
  80             'view_count': json_data.get('video_watch_count'),
  81             'dislike_count': json_data.get('video_unlike_count'),
  82             'timestamp': int_or_none(json_data.get('video_publish_time')),
  83         }