yt_dlp/extractor/chaturbate.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     lowercase_escape,
   7     url_or_none,
   8 )
   9
  10
  11 class ChaturbateIE(InfoExtractor):
  12     _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
  13     _TESTS = [{
  14         'url': 'https://www.chaturbate.com/siswet19/',
  15         'info_dict': {
  16             'id': 'siswet19',
  17             'ext': 'mp4',
  18             'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  19             'age_limit': 18,
  20             'is_live': True,
  21         },
  22         'params': {
  23             'skip_download': True,
  24         },
  25         'skip': 'Room is offline',
  26     }, {
  27         'url': 'https://chaturbate.com/fullvideo/?b=caylin',
  28         'only_matching': True,
  29     }, {
  30         'url': 'https://en.chaturbate.com/siswet19/',
  31         'only_matching': True,
  32     }]
  33
  34     _ROOM_OFFLINE = 'Room is currently offline'
  35
  36     def _real_extract(self, url):
  37         video_id = self._match_id(url)
  38
  39         webpage = self._download_webpage(
  40             f'https://chaturbate.com/{video_id}/', video_id,
  41             headers=self.geo_verification_headers())
  42
  43         found_m3u8_urls = []
  44
  45         data = self._parse_json(
  46             self._search_regex(
  47                 r'initialRoomDossier\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
  48                 webpage, 'data', default='{}', group='value'),
  49             video_id, transform_source=lowercase_escape, fatal=False)
  50         if data:
  51             m3u8_url = url_or_none(data.get('hls_source'))
  52             if m3u8_url:
  53                 found_m3u8_urls.append(m3u8_url)
  54
  55         if not found_m3u8_urls:
  56             for m in re.finditer(
  57                     r'(\\u002[27])(?P<url>http.+?\.m3u8.*?)\1', webpage):
  58                 found_m3u8_urls.append(lowercase_escape(m.group('url')))
  59
  60         if not found_m3u8_urls:
  61             for m in re.finditer(
  62                     r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
  63                 found_m3u8_urls.append(m.group('url'))
  64
  65         m3u8_urls = []
  66         for found_m3u8_url in found_m3u8_urls:
  67             m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '')
  68             for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url):
  69                 if m3u8_url not in m3u8_urls:
  70                     m3u8_urls.append(m3u8_url)
  71
  72         if not m3u8_urls:
  73             error = self._search_regex(
  74                 [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
  75                  r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'],
  76                 webpage, 'error', group='error', default=None)
  77             if not error:
  78                 if any(p in webpage for p in (
  79                         self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')):
  80                     error = self._ROOM_OFFLINE
  81             if error:
  82                 raise ExtractorError(error, expected=True)
  83             raise ExtractorError('Unable to find stream URL')
  84
  85         formats = []
  86         for m3u8_url in m3u8_urls:
  87             for known_id in ('fast', 'slow'):
  88                 if f'_{known_id}' in m3u8_url:
  89                     m3u8_id = known_id
  90                     break
  91             else:
  92                 m3u8_id = None
  93             formats.extend(self._extract_m3u8_formats(
  94                 m3u8_url, video_id, ext='mp4',
  95                 # ffmpeg skips segments for fast m3u8
  96                 preference=-10 if m3u8_id == 'fast' else None,
  97                 m3u8_id=m3u8_id, fatal=False, live=True))
  98
  99         return {
 100             'id': video_id,
 101             'title': video_id,
 102             'thumbnail': f'https://roomimg.stream.highwebmedia.com/ri/{video_id}.jpg',
 103             'age_limit': self._rta_search(webpage),
 104             'is_live': True,
 105             'formats': formats,
 106         }