3 from .common
import InfoExtractor
8 get_element_html_by_class
,
9 get_elements_html_by_class
,
class TheGuardianPodcastIE(InfoExtractor):
    """Extract a single podcast episode from a theguardian.com audio page.

    The episode slug at the end of the URL is used as the video id; all
    metadata is read from the page's OpenGraph/meta tags and the audio URL
    from the ``data-source`` attribute of the ``podcast__player`` element.
    """

    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
    # NOTE(review): the 'ext' values below are assumed to be 'mp3'; confirm
    # against the media actually served by the player.
    _TESTS = [{
        'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
        'md5': 'd1771744681789b4cd7da2a08e487702',
        'info_dict': {
            'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
            'ext': 'mp3',
            'title': '‘We are just getting started’: the plastic-eating bacteria that could change the world – podcast',
            'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
            'creator': 'Stephen Buranyi',
            'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
            'release_date': '20231103',
        },
    }, {
        'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
        'md5': 'd1771744681789b4cd7da2a08e487702',
        'info_dict': {
            'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
            'ext': 'mp3',
            'title': 'The trials of Robert Habeck: is the world’s most powerful green politician doomed to fail? – podcast',
            'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
            'creator': 'Philip Oltermann',
            'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
            'release_date': '20231030',
        },
    }, {
        'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
        'md5': 'a2fcff6f8e060a95b1483295273dc35e',
        'info_dict': {
            'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
            'ext': 'mp3',
            'title': 'Arsenal feel hard done by and Luton hold Liverpool – Football Weekly',
            'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
            'creator': 'Max Rushden',
            'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
            'release_date': '20231106',
        },
    }, {
        'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
        'md5': '06a0f7e9701a80c8064a5d35690481ec',
        'info_dict': {
            'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
            'ext': 'mp3',
            'title': 'The Covid inquiry | Politics Weekly UK - podcast',
            'description': 'md5:207c98859c14903582b17d25b014046e',
            'creator': 'Gaby Hinsliff',
            'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
            'release_date': '20231102',
        },
    }]

    def _real_extract(self, url):
        """Download the episode page at *url* and return its info dict."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # NOTE(review): 'id' is taken to be the URL slug, matching the ids
        # expected by the test cases above.
        return {
            'id': video_id,
            # Prefer the OpenGraph title; fall back to the on-page headline.
            'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
            'description': self._og_search_description(webpage),
            'creator': self._html_search_meta('author', webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
            # `or ''` keeps extract_attributes from receiving None when the
            # player element is not found; 'url' is then None.
            'url': extract_attributes(get_element_html_by_class(
                'podcast__player', webpage) or '').get('data-source'),
        }
class TheGuardianPodcastPlaylistIE(InfoExtractor):
    """Extract every episode of a theguardian.com podcast series as a playlist.

    The series slug in the URL is used as the playlist id. Episodes are
    collected by walking the paginated series listing and are delegated to
    ``TheGuardianPodcastIE``.
    """

    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
    _TESTS = [{
        'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
        'info_dict': {
            'id': 'theguardianswomensfootballweekly',
            'title': "The Guardian's Women's Football Weekly",
            'description': 'md5:e2cc021311e582d29935a73614a43f51',
        },
        'playlist_mincount': 69,
    }, {
        'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
        'info_dict': {
            # NOTE(review): id is what _VALID_URL captures for this URL.
            'id': 'todayinfocus',
            'title': 'Today in Focus',
            'description': 'md5:0f097764fc0d359e0b6eb537be0387e2',
        },
        'playlist_mincount': 1261,
    }, {
        'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
        'info_dict': {
            'id': 'the-audio-long-read',
            'title': 'The Audio Long Read',
            'description': 'md5:5462994a27527309562b25b6defc4ef3',
        },
        'playlist_mincount': 996,
    }]

    def _entries(self, url, playlist_id):
        """Yield the ``data-id`` of each media item across all listing pages.

        Pages are requested with an explicit ``page`` query parameter,
        starting at 1, regardless of any page number in *url*.
        """
        for page in itertools.count(1):
            webpage, urlh = self._download_webpage_handle(
                url, playlist_id, f'Downloading page {page}', query={'page': page})
            # Stop once the final (post-redirect) URL no longer carries a
            # `page` parameter.
            # NOTE(review): presumably the site redirects requests past the
            # last page to the bare series URL - confirm.
            if 'page' not in parse_qs(urlh.url):
                break

            episodes = get_elements_html_by_class('fc-item--type-media', webpage)
            yield from traverse_obj(episodes, (..., {extract_attributes}, 'data-id'))

    def _real_extract(self, url):
        """Download the series page at *url* and return a playlist result."""
        podcast_id = self._match_id(url)

        webpage = self._download_webpage(url, podcast_id)

        # Two header layouts are handled; the first matching class wins.
        title = clean_html(get_element_by_class(
            'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage)

        # The getter joins each yielded `data-id` onto the site root to form
        # the episode URL.
        return self.playlist_from_matches(
            self._entries(url, podcast_id), podcast_id, title, description=description,
            ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))