yt_dlp/extractor/theguardian.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     clean_html,
   6     extract_attributes,
   7     get_element_by_class,
   8     get_element_html_by_class,
   9     get_elements_html_by_class,
  10     parse_qs,
  11     traverse_obj,
  12     unified_strdate,
  13     urljoin,
  14 )
  15
  16
  17 class TheGuardianPodcastIE(InfoExtractor):
  18     _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
  19     _TESTS = [{
  20         'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
  21         'md5': 'd1771744681789b4cd7da2a08e487702',
  22         'info_dict': {
  23             'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
  24             'ext': 'mp3',
  25             'title': '‘We are just getting started’: the plastic-eating bacteria that could change the world – podcast',
  26             'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
  27             'creator': 'Stephen Buranyi',
  28             'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
  29             'release_date': '20231103',
  30         },
  31     }, {
  32         'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
  33         'md5': 'd1771744681789b4cd7da2a08e487702',
  34         'info_dict': {
  35             'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
  36             'ext': 'mp3',
  37             'title': 'The trials of Robert Habeck: is the world’s most powerful green politician doomed to fail? – podcast',
  38             'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
  39             'creator': 'Philip Oltermann',
  40             'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
  41             'release_date': '20231030',
  42         },
  43     }, {
  44         'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
  45         'md5': 'a2fcff6f8e060a95b1483295273dc35e',
  46         'info_dict': {
  47             'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
  48             'ext': 'mp3',
  49             'title': 'Arsenal feel hard done by and Luton hold Liverpool – Football Weekly',
  50             'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
  51             'creator': 'Max Rushden',
  52             'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
  53             'release_date': '20231106',
  54         },
  55     }, {
  56         'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
  57         'md5': '06a0f7e9701a80c8064a5d35690481ec',
  58         'info_dict': {
  59             'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
  60             'ext': 'mp3',
  61             'title': 'The Covid inquiry | Politics Weekly UK - podcast',
  62             'description': 'md5:207c98859c14903582b17d25b014046e',
  63             'creator': 'Gaby Hinsliff',
  64             'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
  65             'release_date': '20231102',
  66         },
  67     }]
  68
  69     def _real_extract(self, url):
  70         video_id = self._match_id(url)
  71         webpage = self._download_webpage(url, video_id)
  72         return {
  73             'id': video_id,
  74             'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
  75             'description': self._og_search_description(webpage),
  76             'creator': self._html_search_meta('author', webpage),
  77             'thumbnail': self._og_search_thumbnail(webpage),
  78             'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
  79             'url': extract_attributes(get_element_html_by_class(
  80                 'podcast__player', webpage) or '').get('data-source'),
  81         }
  82
  83
  84 class TheGuardianPodcastPlaylistIE(InfoExtractor):
  85     _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
  86     _TESTS = [{
  87         'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
  88         'info_dict': {
  89             'id': 'theguardianswomensfootballweekly',
  90             'title': "The Guardian's Women's Football Weekly",
  91             'description': 'md5:e2cc021311e582d29935a73614a43f51',
  92         },
  93         'playlist_mincount': 69,
  94     }, {
  95         'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
  96         'info_dict': {
  97             'id': 'todayinfocus',
  98             'title': 'Today in Focus',
  99             'description': 'md5:0f097764fc0d359e0b6eb537be0387e2',
 100         },
 101         'playlist_mincount': 1261,
 102     }, {
 103         'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
 104         'info_dict': {
 105             'id': 'the-audio-long-read',
 106             'title': 'The Audio Long Read',
 107             'description': 'md5:5462994a27527309562b25b6defc4ef3',
 108         },
 109         'playlist_mincount': 996,
 110     }]
 111
 112     def _entries(self, url, playlist_id):
 113         for page in itertools.count(1):
 114             webpage, urlh = self._download_webpage_handle(
 115                 url, playlist_id, f'Downloading page {page}', query={'page': page})
 116             if 'page' not in parse_qs(urlh.url):
 117                 break
 118
 119             episodes = get_elements_html_by_class('fc-item--type-media', webpage)
 120             yield from traverse_obj(episodes, (..., {extract_attributes}, 'data-id'))
 121
 122     def _real_extract(self, url):
 123         podcast_id = self._match_id(url)
 124
 125         webpage = self._download_webpage(url, podcast_id)
 126
 127         title = clean_html(get_element_by_class(
 128             'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
 129         description = self._og_search_description(webpage) or self._html_search_meta(
 130             'description', webpage)
 131
 132         return self.playlist_from_matches(
 133             self._entries(url, podcast_id), podcast_id, title, description=description,
 134             ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))