yt_dlp/extractor/bannedvideo.py

   1 import json
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     float_or_none,
   6     int_or_none,
   7     try_get,
   8     unified_timestamp,
   9     url_or_none,
  10 )
  11
  12
  13 class BannedVideoIE(InfoExtractor):
  14     _VALID_URL = r'https?://(?:www\.)?banned\.video/watch\?id=(?P<id>[0-f]{24})'
  15     _TESTS = [{
  16         'url': 'https://banned.video/watch?id=5e7a859644e02200c6ef5f11',
  17         'md5': '14b6e81d41beaaee2215cd75c6ed56e4',
  18         'info_dict': {
  19             'id': '5e7a859644e02200c6ef5f11',
  20             'ext': 'mp4',
  21             'title': 'China Discovers Origin of Corona Virus: Issues Emergency Statement',
  22             'thumbnail': r're:^https?://(?:www\.)?assets\.infowarsmedia.com/images/',
  23             'description': 'md5:560d96f02abbebe6c6b78b47465f6b28',
  24             'upload_date': '20200324',
  25             'timestamp': 1585087895,
  26         },
  27     }]
  28
  29     _GRAPHQL_GETMETADATA_QUERY = '''
  30 query GetVideoAndComments($id: String!) {
  31     getVideo(id: $id) {
  32         streamUrl
  33         directUrl
  34         unlisted
  35         live
  36         tags {
  37             name
  38         }
  39         title
  40         summary
  41         playCount
  42         largeImage
  43         videoDuration
  44         channel {
  45             _id
  46             title
  47         }
  48         createdAt
  49     }
  50     getVideoComments(id: $id, limit: 999999, offset: 0) {
  51         _id
  52         content
  53         user {
  54             _id
  55             username
  56         }
  57         voteCount {
  58             positive
  59         }
  60         createdAt
  61         replyCount
  62     }
  63 }'''
  64
  65     _GRAPHQL_GETCOMMENTSREPLIES_QUERY = '''
  66 query GetCommentReplies($id: String!) {
  67     getCommentReplies(id: $id, limit: 999999, offset: 0) {
  68         _id
  69         content
  70         user {
  71             _id
  72             username
  73         }
  74         voteCount {
  75             positive
  76         }
  77         createdAt
  78         replyCount
  79     }
  80 }'''
  81
  82     _GRAPHQL_QUERIES = {
  83         'GetVideoAndComments': _GRAPHQL_GETMETADATA_QUERY,
  84         'GetCommentReplies': _GRAPHQL_GETCOMMENTSREPLIES_QUERY,
  85     }
  86
  87     def _call_api(self, video_id, id_var, operation, note):
  88         return self._download_json(
  89             'https://api.infowarsmedia.com/graphql', video_id, note=note,
  90             headers={
  91                 'Content-Type': 'application/json; charset=utf-8',
  92             }, data=json.dumps({
  93                 'variables': {'id': id_var},
  94                 'operationName': operation,
  95                 'query': self._GRAPHQL_QUERIES[operation],
  96             }).encode('utf8')).get('data')
  97
  98     def _get_comments(self, video_id, comments, comment_data):
  99         yield from comments
 100         for comment in comment_data.copy():
 101             comment_id = comment.get('_id')
 102             if comment.get('replyCount') > 0:
 103                 reply_json = self._call_api(
 104                     video_id, comment_id, 'GetCommentReplies',
 105                     f'Downloading replies for comment {comment_id}')
 106                 for reply in reply_json.get('getCommentReplies'):
 107                     yield self._parse_comment(reply, comment_id)
 108
 109     @staticmethod
 110     def _parse_comment(comment_data, parent):
 111         return {
 112             'id': comment_data.get('_id'),
 113             'text': comment_data.get('content'),
 114             'author': try_get(comment_data, lambda x: x['user']['username']),
 115             'author_id': try_get(comment_data, lambda x: x['user']['_id']),
 116             'timestamp': unified_timestamp(comment_data.get('createdAt')),
 117             'parent': parent,
 118             'like_count': try_get(comment_data, lambda x: x['voteCount']['positive']),
 119         }
 120
 121     def _real_extract(self, url):
 122         video_id = self._match_id(url)
 123         video_json = self._call_api(video_id, video_id, 'GetVideoAndComments', 'Downloading video metadata')
 124         video_info = video_json['getVideo']
 125         is_live = video_info.get('live')
 126         comments = [self._parse_comment(comment, 'root') for comment in video_json.get('getVideoComments')]
 127
 128         formats = [{
 129             'format_id': 'direct',
 130             'quality': 1,
 131             'url': video_info.get('directUrl'),
 132             'ext': 'mp4',
 133         }] if url_or_none(video_info.get('directUrl')) else []
 134         if video_info.get('streamUrl'):
 135             formats.extend(self._extract_m3u8_formats(
 136                 video_info.get('streamUrl'), video_id, 'mp4',
 137                 entry_protocol='m3u8_native', m3u8_id='hls', live=True))
 138
 139         return {
 140             'id': video_id,
 141             'title': video_info.get('title')[:-1],
 142             'formats': formats,
 143             'is_live': is_live,
 144             'description': video_info.get('summary'),
 145             'channel': try_get(video_info, lambda x: x['channel']['title']),
 146             'channel_id': try_get(video_info, lambda x: x['channel']['_id']),
 147             'view_count': int_or_none(video_info.get('playCount')),
 148             'thumbnail': url_or_none(video_info.get('largeImage')),
 149             'duration': float_or_none(video_info.get('videoDuration')),
 150             'timestamp': unified_timestamp(video_info.get('createdAt')),
 151             'tags': [tag.get('name') for tag in video_info.get('tags')],
 152             'availability': self._availability(is_unlisted=video_info.get('unlisted')),
 153             'comments': comments,
 154             '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments')),
 155         }