yt_dlp/extractor/bitchute.py

   1 import functools
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..networking import HEADRequest
   6 from ..utils import (
   7     ExtractorError,
   8     OnDemandPagedList,
   9     clean_html,
  10     extract_attributes,
  11     get_element_by_class,
  12     get_element_by_id,
  13     get_element_html_by_class,
  14     get_elements_html_by_class,
  15     int_or_none,
  16     orderedSet,
  17     parse_count,
  18     parse_duration,
  19     traverse_obj,
  20     unified_strdate,
  21     urlencode_postdata,
  22     urljoin,
  23 )
  24
  25
  26 class BitChuteIE(InfoExtractor):
  27     _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
  28     _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
  29     _TESTS = [{
  30         'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
  31         'md5': '7e427d7ed7af5a75b5855705ec750e2b',
  32         'info_dict': {
  33             'id': 'UGlrF9o9b-Q',
  34             'ext': 'mp4',
  35             'title': 'This is the first video on #BitChute !',
  36             'description': 'md5:a0337e7b1fe39e32336974af8173a034',
  37             'thumbnail': r're:^https?://.*\.jpg$',
  38             'uploader': 'BitChute',
  39             'upload_date': '20170103',
  40             'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
  41             'channel': 'BitChute',
  42             'channel_url': 'https://www.bitchute.com/channel/bitchute/',
  43         },
  44     }, {
  45         # test case: video with different channel and uploader
  46         'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/',
  47         'md5': 'f10e6a8e787766235946d0868703f1d0',
  48         'info_dict': {
  49             'id': 'Yti_j9A-UZ4',
  50             'ext': 'mp4',
  51             'title': 'Israel at War | Full Measure',
  52             'description': 'md5:38cf7bc6f42da1a877835539111c69ef',
  53             'thumbnail': r're:^https?://.*\.jpg$',
  54             'uploader': 'sharylattkisson',
  55             'upload_date': '20231106',
  56             'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/',
  57             'channel': 'Full Measure with Sharyl Attkisson',
  58             'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/',
  59         },
  60     }, {
  61         # video not downloadable in browser, but we can recover it
  62         'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
  63         'md5': '05c12397d5354bf24494885b08d24ed1',
  64         'info_dict': {
  65             'id': '2s6B3nZjAk7R',
  66             'ext': 'mp4',
  67             'filesize': 71537926,
  68             'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
  69             'description': 'md5:228ee93bd840a24938f536aeac9cf749',
  70             'thumbnail': r're:^https?://.*\.jpg$',
  71             'uploader': 'BitChute',
  72             'upload_date': '20181113',
  73             'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
  74             'channel': 'BitChute',
  75             'channel_url': 'https://www.bitchute.com/channel/bitchute/',
  76         },
  77         'params': {'check_formats': None},
  78     }, {
  79         # restricted video
  80         'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/',
  81         'info_dict': {
  82             'id': 'WEnQU7XGcTdl',
  83             'ext': 'mp4',
  84             'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft',
  85         },
  86         'params': {'skip_download': True},
  87         'skip': 'Georestricted in DE',
  88     }, {
  89         'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
  90         'only_matching': True,
  91     }, {
  92         'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
  93         'only_matching': True,
  94     }, {
  95         'url': 'https://old.bitchute.com/video/UGlrF9o9b-Q/',
  96         'only_matching': True,
  97     }]
  98     _GEO_BYPASS = False
  99
 100     _HEADERS = {
 101         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
 102         'Referer': 'https://www.bitchute.com/',
 103     }
 104
 105     def _check_format(self, video_url, video_id):
 106         urls = orderedSet(
 107             re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)
 108             for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128',
 109                          'seed132', 'seed150', 'seed151', 'seed152', 'seed153',
 110                          'seed167', 'seed171', 'seed177', 'seed305', 'seed307',
 111                          'seedp29xb', 'zb10-7gsop1v78'))
 112         for url in urls:
 113             try:
 114                 response = self._request_webpage(
 115                     HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS)
 116             except ExtractorError as e:
 117                 self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}')
 118                 continue
 119             return {
 120                 'url': url,
 121                 'filesize': int_or_none(response.headers.get('Content-Length')),
 122             }
 123
 124     def _raise_if_restricted(self, webpage):
 125         page_title = clean_html(get_element_by_class('page-title', webpage)) or ''
 126         if re.fullmatch(r'(?:Channel|Video) Restricted', page_title):
 127             reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title
 128             self.raise_geo_restricted(reason)
 129
 130     @staticmethod
 131     def _make_url(html):
 132         path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href')
 133         return urljoin('https://www.bitchute.com', path)
 134
 135     def _real_extract(self, url):
 136         video_id = self._match_id(url)
 137         webpage = self._download_webpage(
 138             f'https://old.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
 139
 140         self._raise_if_restricted(webpage)
 141         publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
 142         entries = self._parse_html5_media_entries(url, webpage, video_id)
 143
 144         formats = []
 145         for format_ in traverse_obj(entries, (0, 'formats', ...)):
 146             if self.get_param('check_formats') is not False:
 147                 format_.update(self._check_format(format_.pop('url'), video_id) or {})
 148                 if 'url' not in format_:
 149                     continue
 150             formats.append(format_)
 151
 152         if not formats:
 153             self.raise_no_formats(
 154                 'Video is unavailable. Please make sure this video is playable in the browser '
 155                 'before reporting this issue.', expected=True, video_id=video_id)
 156
 157         details = get_element_by_class('details', webpage) or ''
 158         uploader_html = get_element_html_by_class('creator', details) or ''
 159         channel_html = get_element_html_by_class('name', details) or ''
 160
 161         return {
 162             'id': video_id,
 163             'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
 164             'description': self._og_search_description(webpage, default=None),
 165             'thumbnail': self._og_search_thumbnail(webpage),
 166             'uploader': clean_html(uploader_html),
 167             'uploader_url': self._make_url(uploader_html),
 168             'channel': clean_html(channel_html),
 169             'channel_url': self._make_url(channel_html),
 170             'upload_date': unified_strdate(self._search_regex(
 171                 r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
 172             'formats': formats,
 173         }
 174
 175
 176 class BitChuteChannelIE(InfoExtractor):
 177     _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
 178     _TESTS = [{
 179         'url': 'https://www.bitchute.com/channel/bitchute/',
 180         'info_dict': {
 181             'id': 'bitchute',
 182             'title': 'BitChute',
 183             'description': 'md5:2134c37d64fc3a4846787c402956adac',
 184         },
 185         'playlist': [
 186             {
 187                 'md5': '7e427d7ed7af5a75b5855705ec750e2b',
 188                 'info_dict': {
 189                     'id': 'UGlrF9o9b-Q',
 190                     'ext': 'mp4',
 191                     'title': 'This is the first video on #BitChute !',
 192                     'description': 'md5:a0337e7b1fe39e32336974af8173a034',
 193                     'thumbnail': r're:^https?://.*\.jpg$',
 194                     'uploader': 'BitChute',
 195                     'upload_date': '20170103',
 196                     'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
 197                     'channel': 'BitChute',
 198                     'channel_url': 'https://www.bitchute.com/channel/bitchute/',
 199                     'duration': 16,
 200                     'view_count': int,
 201                 },
 202             },
 203         ],
 204         'params': {
 205             'skip_download': True,
 206             'playlist_items': '-1',
 207         },
 208     }, {
 209         'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/',
 210         'playlist_mincount': 20,
 211         'info_dict': {
 212             'id': 'wV9Imujxasw9',
 213             'title': 'Bruce MacDonald and "The Light of Darkness"',
 214             'description': 'md5:747724ef404eebdfc04277714f81863e',
 215         },
 216     }, {
 217         'url': 'https://old.bitchute.com/playlist/wV9Imujxasw9/',
 218         'only_matching': True,
 219     }]
 220
 221     _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
 222     PAGE_SIZE = 25
 223     HTML_CLASS_NAMES = {
 224         'channel': {
 225             'container': 'channel-videos-container',
 226             'title': 'channel-videos-title',
 227             'description': 'channel-videos-text',
 228         },
 229         'playlist': {
 230             'container': 'playlist-video',
 231             'title': 'title',
 232             'description': 'description',
 233         },
 234
 235     }
 236
 237     @staticmethod
 238     def _make_url(playlist_id, playlist_type):
 239         return f'https://old.bitchute.com/{playlist_type}/{playlist_id}/'
 240
 241     def _fetch_page(self, playlist_id, playlist_type, page_num):
 242         playlist_url = self._make_url(playlist_id, playlist_type)
 243         data = self._download_json(
 244             f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}',
 245             data=urlencode_postdata({
 246                 'csrfmiddlewaretoken': self._TOKEN,
 247                 'name': '',
 248                 'offset': page_num * self.PAGE_SIZE,
 249             }), headers={
 250                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
 251                 'Referer': playlist_url,
 252                 'X-Requested-With': 'XMLHttpRequest',
 253                 'Cookie': f'csrftoken={self._TOKEN}',
 254             })
 255         if not data.get('success'):
 256             return
 257         classes = self.HTML_CLASS_NAMES[playlist_type]
 258         for video_html in get_elements_html_by_class(classes['container'], data.get('html')):
 259             video_id = self._search_regex(
 260                 r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None)
 261             if not video_id:
 262                 continue
 263             yield self.url_result(
 264                 f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True,
 265                 title=clean_html(get_element_by_class(classes['title'], video_html)),
 266                 description=clean_html(get_element_by_class(classes['description'], video_html)),
 267                 duration=parse_duration(get_element_by_class('video-duration', video_html)),
 268                 view_count=parse_count(clean_html(get_element_by_class('video-views', video_html))))
 269
 270     def _real_extract(self, url):
 271         playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id')
 272         webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id)
 273
 274         page_func = functools.partial(self._fetch_page, playlist_id, playlist_type)
 275         return self.playlist_result(
 276             OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id,
 277             title=self._html_extract_title(webpage, default=None),
 278             description=self._html_search_meta(
 279                 ('description', 'og:description', 'twitter:description'), webpage, default=None),
 280             playlist_count=int_or_none(self._html_search_regex(
 281                 r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None)))