yt_dlp/extractor/douyutv.py

   1 import hashlib
   2 import time
   3 import urllib
   4 import uuid
   5
   6 from .common import InfoExtractor
   7 from .openload import PhantomJSwrapper
   8 from ..utils import (
   9     ExtractorError,
  10     UserNotLive,
  11     determine_ext,
  12     int_or_none,
  13     js_to_json,
  14     parse_resolution,
  15     str_or_none,
  16     traverse_obj,
  17     unescapeHTML,
  18     url_or_none,
  19     urlencode_postdata,
  20     urljoin,
  21 )
  22
  23
  24 class DouyuBaseIE(InfoExtractor):
  25     def _download_cryptojs_md5(self, video_id):
  26         for url in [
  27             # XXX: Do NOT use cdn.bootcdn.net; ref: https://sansec.io/research/polyfill-supply-chain-attack
  28             'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
  29             'https://unpkg.com/cryptojslib@3.1.2/rollups/md5.js',
  30         ]:
  31             js_code = self._download_webpage(
  32                 url, video_id, note='Downloading signing dependency', fatal=False)
  33             if js_code:
  34                 self.cache.store('douyu', 'crypto-js-md5', js_code)
  35                 return js_code
  36         raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')
  37
  38     def _get_cryptojs_md5(self, video_id):
  39         return self.cache.load(
  40             'douyu', 'crypto-js-md5', min_ver='2024.07.04') or self._download_cryptojs_md5(video_id)
  41
  42     def _calc_sign(self, sign_func, video_id, a):
  43         b = uuid.uuid4().hex
  44         c = round(time.time())
  45         js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'
  46         phantom = PhantomJSwrapper(self)
  47         result = phantom.execute(js_script, video_id,
  48                                  note='Executing JS signing script').strip()
  49         return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()}
  50
  51     def _search_js_sign_func(self, webpage, fatal=True):
  52         # The greedy look-behind ensures last possible script tag is matched
  53         return self._search_regex(
  54             r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal)
  55
  56
  57 class DouyuTVIE(DouyuBaseIE):
  58     IE_DESC = '斗鱼直播'
  59     _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
  60     _TESTS = [{
  61         'url': 'https://www.douyu.com/pigff',
  62         'info_dict': {
  63             'id': '24422',
  64             'display_id': 'pigff',
  65             'ext': 'mp4',
  66             'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  67             'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
  68             'thumbnail': str,
  69             'uploader': 'pigff',
  70             'is_live': True,
  71             'live_status': 'is_live',
  72         },
  73         'params': {
  74             'skip_download': True,
  75         },
  76     }, {
  77         'url': 'http://www.douyutv.com/85982',
  78         'info_dict': {
  79             'id': '85982',
  80             'display_id': '85982',
  81             'ext': 'flv',
  82             'title': 're:^小漠从零单排记！——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  83             'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
  84             'thumbnail': r're:^https?://.*\.png',
  85             'uploader': 'douyu小漠',
  86             'is_live': True,
  87         },
  88         'params': {
  89             'skip_download': True,
  90         },
  91         'skip': 'Room not found',
  92     }, {
  93         'url': 'http://www.douyutv.com/17732',
  94         'info_dict': {
  95             'id': '17732',
  96             'display_id': '17732',
  97             'ext': 'flv',
  98             'title': 're:^清晨醒脑！根本停不下来！ [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  99             'description': r're:.*m7show@163\.com.*',
 100             'thumbnail': r're:^https?://.*\.png',
 101             'uploader': '7师傅',
 102             'is_live': True,
 103         },
 104         'params': {
 105             'skip_download': True,
 106         },
 107     }, {
 108         'url': 'https://www.douyu.com/topic/ydxc?rid=6560603',
 109         'info_dict': {
 110             'id': '6560603',
 111             'display_id': '6560603',
 112             'ext': 'flv',
 113             'title': 're:^阿余：新年快乐恭喜发财！ [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
 114             'description': 're:.*直播时间.*',
 115             'thumbnail': r're:^https?://.*\.png',
 116             'uploader': '阿涛皎月Carry',
 117             'live_status': 'is_live',
 118         },
 119         'params': {
 120             'skip_download': True,
 121         },
 122     }, {
 123         'url': 'http://www.douyu.com/xiaocang',
 124         'only_matching': True,
 125     }, {
 126         # \"room_id\"
 127         'url': 'http://www.douyu.com/t/lpl',
 128         'only_matching': True,
 129     }]
 130
 131     def _get_sign_func(self, room_id, video_id):
 132         return self._download_json(
 133             f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
 134             note='Getting signing script')['data'][f'room{room_id}']
 135
 136     def _extract_stream_formats(self, stream_formats):
 137         formats = []
 138         for stream_info in traverse_obj(stream_formats, (..., 'data')):
 139             stream_url = urljoin(
 140                 traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live'))
 141             if stream_url:
 142                 rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
 143                 rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False)
 144                 ext = determine_ext(stream_url)
 145                 formats.append({
 146                     'url': stream_url,
 147                     'format_id': str_or_none(rate_id),
 148                     'ext': 'mp4' if ext == 'm3u8' else ext,
 149                     'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
 150                     'quality': rate_id % -10000 if rate_id is not None else None,
 151                     **traverse_obj(rate_info, {
 152                         'format': ('name', {str_or_none}),
 153                         'tbr': ('bit', {int_or_none}),
 154                     }),
 155                 })
 156         return formats
 157
 158     def _real_extract(self, url):
 159         video_id = self._match_id(url)
 160
 161         webpage = self._download_webpage(url, video_id)
 162         room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')
 163
 164         if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1':
 165             raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
 166         if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2':
 167             raise UserNotLive(video_id=video_id)
 168
 169         # Grab metadata from API
 170         params = {
 171             'aid': 'wp',
 172             'client_sys': 'wp',
 173             'time': int(time.time()),
 174         }
 175         params['auth'] = hashlib.md5(
 176             f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
 177         room = traverse_obj(self._download_json(
 178             f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
 179             note='Downloading room info', query=params, fatal=False), 'data')
 180
 181         # 1 = live, 2 = offline
 182         if traverse_obj(room, 'show_status') == '2':
 183             raise UserNotLive(video_id=video_id)
 184
 185         js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id)
 186         form_data = {
 187             'rate': 0,
 188             **self._calc_sign(js_sign_func, video_id, room_id),
 189         }
 190         stream_formats = [self._download_json(
 191             f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
 192             video_id, note='Downloading livestream format',
 193             data=urlencode_postdata(form_data))]
 194
 195         for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
 196             if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
 197                 form_data['rate'] = rate_id
 198                 stream_formats.append(self._download_json(
 199                     f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
 200                     video_id, note=f'Downloading livestream format {rate_id}',
 201                     data=urlencode_postdata(form_data)))
 202
 203         return {
 204             'id': room_id,
 205             'formats': self._extract_stream_formats(stream_formats),
 206             'is_live': True,
 207             **traverse_obj(room, {
 208                 'display_id': ('url', {str}, {lambda i: i[1:]}),
 209                 'title': ('room_name', {unescapeHTML}),
 210                 'description': ('show_details', {str}),
 211                 'uploader': ('nickname', {str}),
 212                 'thumbnail': ('room_src', {url_or_none}),
 213             }),
 214         }
 215
 216
 217 class DouyuShowIE(DouyuBaseIE):
 218     _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
 219
 220     _TESTS = [{
 221         'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
 222         'info_dict': {
 223             'id': 'mPyq7oVNe5Yv1gLY',
 224             'ext': 'mp4',
 225             'title': '四川人小时候的味道“蒜苗回锅肉”，传统菜不能丢，要常做来吃',
 226             'duration': 633,
 227             'thumbnail': str,
 228             'uploader': '美食作家王刚V',
 229             'uploader_id': 'OVAO4NVx1m7Q',
 230             'timestamp': 1661850002,
 231             'upload_date': '20220830',
 232             'view_count': int,
 233             'tags': ['美食', '美食综合'],
 234         },
 235     }, {
 236         'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
 237         'only_matching': True,
 238     }]
 239
 240     _FORMATS = {
 241         'super': '原画',
 242         'high': '超清',
 243         'normal': '高清',
 244     }
 245
 246     _QUALITIES = {
 247         'super': -1,
 248         'high': -2,
 249         'normal': -3,
 250     }
 251
 252     _RESOLUTIONS = {
 253         'super': '1920x1080',
 254         'high': '1280x720',
 255         'normal': '852x480',
 256     }
 257
 258     def _real_extract(self, url):
 259         url = url.replace('vmobile.', 'v.')
 260         video_id = self._match_id(url)
 261
 262         webpage = self._download_webpage(url, video_id)
 263
 264         video_info = self._search_json(
 265             r'<script>\s*window\.\$DATA\s*=', webpage,
 266             'video info', video_id, transform_source=js_to_json)
 267
 268         js_sign_func = self._search_js_sign_func(webpage)
 269         form_data = {
 270             'vid': video_id,
 271             **self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']),
 272         }
 273         url_info = self._download_json(
 274             'https://v.douyu.com/api/stream/getStreamUrl', video_id,
 275             data=urlencode_postdata(form_data), note='Downloading video formats')
 276
 277         formats = []
 278         for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)):
 279             video_url = traverse_obj(url, ('url', {url_or_none}))
 280             if video_url:
 281                 ext = determine_ext(video_url)
 282                 formats.append({
 283                     'format': self._FORMATS.get(name),
 284                     'format_id': name,
 285                     'url': video_url,
 286                     'quality': self._QUALITIES.get(name),
 287                     'ext': 'mp4' if ext == 'm3u8' else ext,
 288                     'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
 289                     **parse_resolution(self._RESOLUTIONS.get(name)),
 290                 })
 291             else:
 292                 self.to_screen(
 293                     f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')
 294
 295         return {
 296             'id': video_id,
 297             'formats': formats,
 298             **traverse_obj(video_info, ('DATA', {
 299                 'title': ('content', 'title', {str}),
 300                 'uploader': ('content', 'author', {str}),
 301                 'uploader_id': ('content', 'up_id', {str_or_none}),
 302                 'duration': ('content', 'video_duration', {int_or_none}),
 303                 'thumbnail': ('content', 'video_pic', {url_or_none}),
 304                 'timestamp': ('content', 'create_time', {int_or_none}),
 305                 'view_count': ('content', 'view_num', {int_or_none}),
 306                 'tags': ('videoTag', ..., 'tagName', {str}),
 307             })),
 308         }