3 from .common
import InfoExtractor
8 get_element_html_by_id
,
9 get_element_text_and_html_by_tag
,
17 class ListenNotesIE(InfoExtractor
):
18 _VALID_URL
= r
'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/'
20 'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/',
21 'md5': '5b91a32f841e5788fb82b72a1a8af7f7',
25 'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
26 'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
28 'channel': 'Thriving on Overload',
29 'channel_id': 'ed84wITivxF',
30 'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
31 'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
32 'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
33 'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
36 'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/',
37 'md5': '62fb4ffe7fc525632a1138bf72a5ce53',
41 'title': 'Episode 177: WireGuard with Jason Donenfeld',
42 'description': 'md5:24744f36456a3e95f83c1193a3458594',
44 'channel': 'Ask Noah Show',
45 'channel_id': '4DQTzdS5-j7',
46 'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
47 'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
48 'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
49 'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
def _clean_description(self, description):
    """Normalise block-level markup in *description* and hand it to ``clean_html``.

    Every run of attribute-less opening/closing ``<div>`` or ``<p>`` tags
    (each optionally followed by whitespace) is collapsed into a single
    ``<br/><br/>`` before the text is passed on.  A falsy *description*
    (for example ``None``) is treated as the empty string.
    """
    raw_markup = description or ''
    # One '<br/><br/>' per run of tags, no matter how many tags the run holds.
    with_breaks = re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', raw_markup)
    # NOTE(review): clean_html is defined outside this chunk; presumably it
    # strips the remaining tags and decodes entities - confirm in utils.
    return clean_html(with_breaks)
56 def _real_extract(self
, url
):
57 audio_id
= self
._match
_id
(url
)
58 webpage
= self
._download
_webpage
(url
, audio_id
)
59 data
= self
._search
_json
(
60 r
'<script id="original-content"[^>]+\btype="application/json">', webpage
, 'content', audio_id
)
61 data
.update(extract_attributes(get_element_html_by_id(
62 r
'episode-play-button-toolbar|episode-no-play-button-toolbar', webpage
, escape_value
=False)))
64 duration
, description
= self
._search
_regex
(
65 r
'(?P<duration>[\d:]+)\s*-\s*(?P<description>.+)',
66 self
._html
_search
_meta
(['og:description', 'description', 'twitter:description'], webpage
),
67 'description', fatal
=False, group
=('duration', 'description')) or (None, None)
72 'title': (data
.get('data-title')
73 or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage
)[0])
74 or self
._html
_search
_meta
(('og:title', 'title', 'twitter:title'), webpage
, 'title')),
75 'description': (self
._clean
_description
(get_element_by_class('ln-text-p', webpage
))
76 or strip_or_none(description
)),
77 'duration': parse_duration(traverse_obj(data
, 'audio_length', 'data-duration') or duration
),
78 'episode_id': traverse_obj(data
, 'uuid', 'data-episode-uuid'),
79 **traverse_obj(data
, {
80 'thumbnail': 'data-image',
81 'channel': 'data-channel-title',
82 'cast': ('nlp_entities', ..., 'name'),
83 'channel_url': 'channel_url',
84 'channel_id': 'channel_short_uuid',