yt_dlp/extractor/voicy.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     smuggle_url,
   7     str_or_none,
   8     traverse_obj,
   9     unified_strdate,
  10     unsmuggle_url,
  11 )
  12
  13
  14 class VoicyBaseIE(InfoExtractor):
  15     def _extract_from_playlist_data(self, value):
  16         voice_id = str(value.get('PlaylistId'))
  17         upload_date = unified_strdate(value.get('Published'), False)
  18         items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']]
  19         return {
  20             '_type': 'multi_video',
  21             'entries': items,
  22             'id': voice_id,
  23             'title': str(value.get('PlaylistName')),
  24             'uploader': value.get('SpeakerName'),
  25             'uploader_id': str_or_none(value.get('SpeakerId')),
  26             'channel': value.get('ChannelName'),
  27             'channel_id': str_or_none(value.get('ChannelId')),
  28             'upload_date': upload_date,
  29         }
  30
  31     def _extract_single_article(self, entry):
  32         formats = [{
  33             'url': entry['VoiceHlsFile'],
  34             'format_id': 'hls',
  35             'ext': 'm4a',
  36             'acodec': 'aac',
  37             'vcodec': 'none',
  38             'protocol': 'm3u8_native',
  39         }, {
  40             'url': entry['VoiceFile'],
  41             'format_id': 'mp3',
  42             'ext': 'mp3',
  43             'acodec': 'mp3',
  44             'vcodec': 'none',
  45         }]
  46         return {
  47             'id': str(entry.get('ArticleId')),
  48             'title': entry.get('ArticleTitle'),
  49             'description': entry.get('MediaName'),
  50             'formats': formats,
  51         }
  52
  53     def _call_api(self, url, video_id, **kwargs):
  54         response = self._download_json(url, video_id, **kwargs)
  55         if response.get('Status') != 0:
  56             message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=str)
  57             if not message:
  58                 message = 'There was a error in the response: %d' % response.get('Status')
  59             raise ExtractorError(message, expected=False)
  60         return response.get('Value')
  61
  62
  63 class VoicyIE(VoicyBaseIE):
  64     _WORKING = False
  65     IE_NAME = 'voicy'
  66     _VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
  67     ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
  68     _TESTS = [{
  69         'url': 'https://voicy.jp/channel/1253/122754',
  70         'info_dict': {
  71             'id': '122754',
  72             'title': '1/21(木)声日記：ついに原稿終わった！！',
  73             'uploader': 'ちょまど@ ITエンジニアなオタク',
  74             'uploader_id': '7339',
  75         },
  76         'playlist_mincount': 9,
  77     }]
  78
  79     def _real_extract(self, url):
  80         mobj = self._match_valid_url(url)
  81         assert mobj
  82         voice_id = mobj.group('id')
  83         channel_id = mobj.group('channel_id')
  84         url, article_list = unsmuggle_url(url)
  85         if not article_list:
  86             article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
  87         return self._extract_from_playlist_data(article_list)
  88
  89
  90 class VoicyChannelIE(VoicyBaseIE):
  91     _WORKING = False
  92     IE_NAME = 'voicy:channel'
  93     _VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
  94     PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
  95     _TESTS = [{
  96         'url': 'https://voicy.jp/channel/1253/',
  97         'info_dict': {
  98             'id': '7339',
  99             'title': 'ゆるふわ日常ラジオ #ちょまラジ',
 100             'uploader': 'ちょまど@ ITエンジニアなオタク',
 101             'uploader_id': '7339',
 102         },
 103         'playlist_mincount': 54,
 104     }]
 105
 106     @classmethod
 107     def suitable(cls, url):
 108         return not VoicyIE.suitable(url) and super().suitable(url)
 109
 110     def _entries(self, channel_id):
 111         pager = ''
 112         for count in itertools.count(1):
 113             article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note=f'Paging #{count}')
 114             playlist_data = article_list.get('PlaylistData')
 115             if not playlist_data:
 116                 break
 117             yield from playlist_data
 118             last = playlist_data[-1]
 119             pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])
 120
 121     def _real_extract(self, url):
 122         channel_id = self._match_id(url)
 123         articles = self._entries(channel_id)
 124
 125         first_article = next(articles, None)
 126         title = traverse_obj(first_article, ('ChannelName', ), expected_type=str)
 127         speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=str)
 128         if not title and speaker_name:
 129             title = f'Uploads from {speaker_name}'
 130         if not title:
 131             title = f'Uploads from channel ID {channel_id}'
 132
 133         articles = itertools.chain([first_article], articles) if first_article else articles
 134
 135         playlist = (
 136             self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key())
 137             for value in articles)
 138         return {
 139             '_type': 'playlist',
 140             'entries': playlist,
 141             'id': channel_id,
 142             'title': title,
 143             'channel': speaker_name,
 144             'channel_id': channel_id,
 145         }