yt_dlp/extractor/tagesschau.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     UnsupportedError,
   6     extract_attributes,
   7     int_or_none,
   8     js_to_json,
   9     parse_iso8601,
  10     try_get,
  11 )
  12
  13
  14 class TagesschauIE(InfoExtractor):
  15     _WORKING = False
  16     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
  17
  18     _TESTS = [{
  19         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
  20         'md5': 'ccb9359bf8c4795836e43759f3408a93',
  21         'info_dict': {
  22             'id': 'video-102143-1',
  23             'ext': 'mp4',
  24             'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
  25             'duration': 138,
  26         },
  27     }, {
  28         'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
  29         'md5': '5c15e8f3da049e48829ec9786d835536',
  30         'info_dict': {
  31             'id': 'ts-5727-1',
  32             'ext': 'mp4',
  33             'title': 'Ganze Sendung',
  34             'duration': 932,
  35         },
  36     }, {
  37         # exclusive audio
  38         'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
  39         'md5': '4bff8f23504df56a0d86ed312d654182',
  40         'info_dict': {
  41             'id': 'audio-29417-1',
  42             'ext': 'mp3',
  43             'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
  44         },
  45     }, {
  46         'url': 'http://www.tagesschau.de/inland/bnd-303.html',
  47         'md5': 'f049fa1698d7564e9ca4c3325108f034',
  48         'info_dict': {
  49             'id': 'bnd-303-1',
  50             'ext': 'mp3',
  51             'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
  52         },
  53     }, {
  54         'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
  55         'info_dict': {
  56             'id': 'afd-parteitag-135',
  57             'title': 'AfD',
  58         },
  59         'playlist_mincount': 15,
  60     }, {
  61         'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
  62         'info_dict': {
  63             'id': 'audio-29417-1',
  64             'ext': 'mp3',
  65             'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
  66         },
  67     }, {
  68         'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
  69         'info_dict': {
  70             'id': 'podcast-11km-327',
  71             'ext': 'mp3',
  72             'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
  73             'upload_date': '20230322',
  74             'timestamp': 1679482808,
  75             'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
  76             'description': 'md5:dad059931fe4b3693e3656e93a249848',
  77         },
  78     }, {
  79         'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
  80         'only_matching': True,
  81     }, {
  82         'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
  83         'only_matching': True,
  84     }, {
  85         'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
  86         'only_matching': True,
  87     }, {
  88         'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
  89         'only_matching': True,
  90     }, {
  91         'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
  92         'only_matching': True,
  93     }, {
  94         'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
  95         'only_matching': True,
  96     }, {
  97         'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
  98         'only_matching': True,
  99     }, {
 100         'url': 'http://www.tagesschau.de/100sekunden/index.html',
 101         'only_matching': True,
 102     }, {
 103         # playlist article with collapsing sections
 104         'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
 105         'only_matching': True,
 106     }]
 107
 108     def _real_extract(self, url):
 109         mobj = self._match_valid_url(url)
 110         video_id = mobj.group('id') or mobj.group('path')
 111         display_id = video_id.lstrip('-')
 112
 113         webpage = self._download_webpage(url, display_id)
 114
 115         title = self._html_search_regex(
 116             r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
 117             webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
 118
 119         entries = []
 120         videos = re.findall(r'<div[^>]+>', webpage)
 121         num = 0
 122         for video in videos:
 123             video = extract_attributes(video).get('data-config')
 124             if not video:
 125                 continue
 126             video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
 127             video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
 128             if not video_formats:
 129                 continue
 130             num += 1
 131             for video_format in video_formats:
 132                 media_url = video_format.get('_stream') or ''
 133                 formats = []
 134                 if media_url.endswith('master.m3u8'):
 135                     formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
 136                 elif media_url.endswith('.mp3'):
 137                     formats = [{
 138                         'url': media_url,
 139                         'vcodec': 'none',
 140                     }]
 141                 if not formats:
 142                     continue
 143                 entries.append({
 144                     'id': f'{display_id}-{num}',
 145                     'title': try_get(video, lambda x: x['mc']['_title']),
 146                     'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
 147                     'formats': formats,
 148                 })
 149
 150         if not entries:
 151             raise UnsupportedError(url)
 152
 153         if len(entries) > 1:
 154             return self.playlist_result(entries, display_id, title)
 155
 156         return {
 157             'id': display_id,
 158             'title': title,
 159             'thumbnail': self._og_search_thumbnail(webpage),
 160             'formats': entries[0]['formats'],
 161             'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
 162             'description': self._og_search_description(webpage),
 163             'duration': entries[0]['duration'],
 164         }