yt_dlp/extractor/crowdbunker.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     int_or_none,
   6     try_get,
   7     unified_strdate,
   8 )
   9
  10
  11 class CrowdBunkerIE(InfoExtractor):
  12     _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)'
  13
  14     _TESTS = [{
  15         'url': 'https://crowdbunker.com/v/0z4Kms8pi8I',
  16         'info_dict': {
  17             'id': '0z4Kms8pi8I',
  18             'ext': 'mp4',
  19             'title': '117) Pass vax et solutions',
  20             'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c',
  21             'view_count': int,
  22             'duration': 5386,
  23             'uploader': 'Jérémie Mercier',
  24             'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ',
  25             'like_count': int,
  26             'upload_date': '20211218',
  27             'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg',
  28         },
  29         'params': {'skip_download': True},
  30     }]
  31
  32     def _real_extract(self, url):
  33         video_id = self._match_id(url)
  34         data_json = self._download_json(
  35             f'https://api.divulg.org/post/{video_id}/details', video_id,
  36             headers={'accept': 'application/json, text/plain, */*'})
  37         video_json = data_json['video']
  38         formats, subtitles = [], {}
  39         for sub in video_json.get('captions') or []:
  40             sub_url = try_get(sub, lambda x: x['file']['url'])
  41             if not sub_url:
  42                 continue
  43             subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({
  44                 'url': sub_url,
  45             })
  46
  47         mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
  48         if mpd_url:
  49             fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id)
  50             formats.extend(fmts)
  51             subtitles = self._merge_subtitles(subtitles, subs)
  52         m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
  53         if m3u8_url:
  54             fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, video_id)
  55             formats.extend(fmts)
  56             subtitles = self._merge_subtitles(subtitles, subs)
  57
  58         thumbnails = [{
  59             'url': image['url'],
  60             'height': int_or_none(image.get('height')),
  61             'width': int_or_none(image.get('width')),
  62         } for image in video_json.get('thumbnails') or [] if image.get('url')]
  63
  64         return {
  65             'id': video_id,
  66             'title': video_json.get('title'),
  67             'description': video_json.get('description'),
  68             'view_count': video_json.get('viewCount'),
  69             'duration': video_json.get('duration'),
  70             'uploader': try_get(data_json, lambda x: x['channel']['name']),
  71             'uploader_id': try_get(data_json, lambda x: x['channel']['id']),
  72             'like_count': data_json.get('likesCount'),
  73             'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')),
  74             'thumbnails': thumbnails,
  75             'formats': formats,
  76             'subtitles': subtitles,
  77         }
  78
  79
  80 class CrowdBunkerChannelIE(InfoExtractor):
  81     _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)'
  82
  83     _TESTS = [{
  84         'url': 'https://crowdbunker.com/@Milan_UHRIN',
  85         'playlist_mincount': 14,
  86         'info_dict': {
  87             'id': 'Milan_UHRIN',
  88         },
  89     }]
  90
  91     def _entries(self, playlist_id):
  92         last = None
  93
  94         for page in itertools.count():
  95             channel_json = self._download_json(
  96                 f'https://api.divulg.org/organization/{playlist_id}/posts', playlist_id,
  97                 headers={'accept': 'application/json, text/plain, */*'},
  98                 query={'after': last} if last else {}, note=f'Downloading Page {page}')
  99             for item in channel_json.get('items') or []:
 100                 v_id = item.get('uid')
 101                 if not v_id:
 102                     continue
 103                 yield self.url_result(
 104                     f'https://crowdbunker.com/v/{v_id}', ie=CrowdBunkerIE.ie_key(), video_id=v_id)
 105             last = channel_json.get('last')
 106             if not last:
 107                 break
 108
 109     def _real_extract(self, url):
 110         playlist_id = self._match_id(url)
 111         return self.playlist_result(self._entries(playlist_id), playlist_id=playlist_id)