yt_dlp/extractor/listennotes.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     clean_html,
   6     extract_attributes,
   7     get_element_by_class,
   8     get_element_html_by_id,
   9     get_element_text_and_html_by_tag,
  10     parse_duration,
  11     strip_or_none,
  12     traverse_obj,
  13     try_call,
  14 )
  15
  16
  17 class ListenNotesIE(InfoExtractor):
  18     _VALID_URL = r'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/'
  19     _TESTS = [{
  20         'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/',
  21         'md5': '5b91a32f841e5788fb82b72a1a8af7f7',
  22         'info_dict': {
  23             'id': 'KrDgvNb_u1n',
  24             'ext': 'mp3',
  25             'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
  26             'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
  27             'duration': 2148.0,
  28             'channel': 'Thriving on Overload',
  29             'channel_id': 'ed84wITivxF',
  30             'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
  31             'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
  32             'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
  33             'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
  34         }
  35     }, {
  36         'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/',
  37         'md5': '62fb4ffe7fc525632a1138bf72a5ce53',
  38         'info_dict': {
  39             'id': 'lwEA3154JzG',
  40             'ext': 'mp3',
  41             'title': 'Episode 177: WireGuard with Jason Donenfeld',
  42             'description': 'md5:24744f36456a3e95f83c1193a3458594',
  43             'duration': 3861.0,
  44             'channel': 'Ask Noah Show',
  45             'channel_id': '4DQTzdS5-j7',
  46             'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
  47             'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
  48             'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
  49             'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
  50         }
  51     }]
  52
  53     def _clean_description(self, description):
  54         return clean_html(re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', description or ''))
  55
  56     def _real_extract(self, url):
  57         audio_id = self._match_id(url)
  58         webpage = self._download_webpage(url, audio_id)
  59         data = self._search_json(
  60             r'<script id="original-content"[^>]+\btype="application/json">', webpage, 'content', audio_id)
  61         data.update(extract_attributes(get_element_html_by_id(
  62             r'episode-play-button-toolbar|episode-no-play-button-toolbar', webpage, escape_value=False)))
  63
  64         duration, description = self._search_regex(
  65             r'(?P<duration>[\d:]+)\s*-\s*(?P<description>.+)',
  66             self._html_search_meta(['og:description', 'description', 'twitter:description'], webpage),
  67             'description', fatal=False, group=('duration', 'description')) or (None, None)
  68
  69         return {
  70             'id': audio_id,
  71             'url': data['audio'],
  72             'title': (data.get('data-title')
  73                       or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
  74                       or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
  75             'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
  76                             or strip_or_none(description)),
  77             'duration': parse_duration(traverse_obj(data, 'audio_length', 'data-duration') or duration),
  78             'episode_id': traverse_obj(data, 'uuid', 'data-episode-uuid'),
  79             **traverse_obj(data, {
  80                 'thumbnail': 'data-image',
  81                 'channel': 'data-channel-title',
  82                 'cast': ('nlp_entities', ..., 'name'),
  83                 'channel_url': 'channel_url',
  84                 'channel_id': 'channel_short_uuid',
  85             })
  86         }