3 from .common
import InfoExtractor
14 class TagesschauIE(InfoExtractor
):
16 _VALID_URL
= r
'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
19 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
20 'md5': 'ccb9359bf8c4795836e43759f3408a93',
22 'id': 'video-102143-1',
24 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
28 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
29 'md5': '5c15e8f3da049e48829ec9786d835536',
33 'title': 'Ganze Sendung',
38 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
39 'md5': '4bff8f23504df56a0d86ed312d654182',
41 'id': 'audio-29417-1',
43 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
46 'url': 'http://www.tagesschau.de/inland/bnd-303.html',
47 'md5': 'f049fa1698d7564e9ca4c3325108f034',
51 'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
54 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
56 'id': 'afd-parteitag-135',
59 'playlist_mincount': 15,
61 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
63 'id': 'audio-29417-1',
65 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
68 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
70 'id': 'podcast-11km-327',
72 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
73 'upload_date': '20230322',
74 'timestamp': 1679482808,
75 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
76 'description': 'md5:dad059931fe4b3693e3656e93a249848',
79 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
80 'only_matching': True,
82 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
83 'only_matching': True,
85 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
86 'only_matching': True,
88 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
89 'only_matching': True,
91 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
92 'only_matching': True,
94 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
95 'only_matching': True,
97 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
98 'only_matching': True,
100 'url': 'http://www.tagesschau.de/100sekunden/index.html',
101 'only_matching': True,
103 # playlist article with collapsing sections
104 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
105 'only_matching': True,
108 def _real_extract(self
, url
):
109 mobj
= self
._match
_valid
_url
(url
)
110 video_id
= mobj
.group('id') or mobj
.group('path')
111 display_id
= video_id
.lstrip('-')
113 webpage
= self
._download
_webpage
(url
, display_id
)
115 title
= self
._html
_search
_regex
(
116 r
'<span[^>]*class="headline"[^>]*>(.+?)</span>',
117 webpage
, 'title', default
=None) or self
._og
_search
_title
(webpage
, fatal
=False)
120 videos
= re
.findall(r
'<div[^>]+>', webpage
)
123 video
= extract_attributes(video
).get('data-config')
126 video
= self
._parse
_json
(video
, video_id
, transform_source
=js_to_json
, fatal
=False)
127 video_formats
= try_get(video
, lambda x
: x
['mc']['_mediaArray'][0]['_mediaStreamArray'])
128 if not video_formats
:
131 for video_format
in video_formats
:
132 media_url
= video_format
.get('_stream') or ''
134 if media_url
.endswith('master.m3u8'):
135 formats
= self
._extract
_m
3u8_formats
(media_url
, video_id
, 'mp4', m3u8_id
='hls')
136 elif media_url
.endswith('.mp3'):
144 'id': f
'{display_id}-{num}',
145 'title': try_get(video
, lambda x
: x
['mc']['_title']),
146 'duration': int_or_none(try_get(video
, lambda x
: x
['mc']['_duration'])),
151 raise UnsupportedError(url
)
154 return self
.playlist_result(entries
, display_id
, title
)
159 'thumbnail': self
._og
_search
_thumbnail
(webpage
),
160 'formats': entries
[0]['formats'],
161 'timestamp': parse_iso8601(self
._html
_search
_meta
('date', webpage
)),
162 'description': self
._og
_search
_description
(webpage
),
163 'duration': entries
[0]['duration'],