[ie/twitter:spaces] Support video spaces (#10789)
[yt-dlp3.git] / yt_dlp / extractor / bundestag.py
blob71f7726659c5483b78ac65c57185f17e03b33920
1 import functools
2 import re
4 from .common import InfoExtractor
5 from ..networking.exceptions import HTTPError
6 from ..utils import (
7 ExtractorError,
8 bug_reports_message,
9 clean_html,
10 format_field,
11 get_element_text_and_html_by_tag,
12 int_or_none,
13 url_or_none,
15 from ..utils.traversal import traverse_obj
class BundestagIE(InfoExtractor):
    """Extractor for German Bundestag Mediathek videos.

    Accepts ``dbtg.tv`` short links (``/cvid/`` and ``/fvid/``) as well as
    ``www.bundestag.de/mediathek`` URLs carrying a ``videoid`` query parameter.
    """

    _VALID_URL = [
        r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)',
        r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)',
    ]
    _TESTS = [{
        'url': 'https://dbtg.tv/cvid/7605304',
        'info_dict': {
            'id': '7605304',
            'ext': 'mp4',
            'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit',
            'description': 'md5:321a9dc6bdad201264c0045efc371561',
        },
    }, {
        'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek',
        'info_dict': {
            'id': '7602120',
            'ext': 'mp4',
            'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung',
            'description': 'Befragung der Bundesregierung',
        },
    }, {
        'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek',
        'only_matching': True,
    }, {
        'url': 'http://dbtg.tv/fvid/3594346',
        'only_matching': True,
    }]

    # HTML page queried (with ``videoid``/``view`` parameters) for title and description.
    _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay'
    # m3u8 playlist template; the video id is substituted into both ``{0}`` slots.
    _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8'

    # JSON endpoint listing direct links; the video id is appended as ``contentId``.
    _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId='
    # Patterns that pull codec/bitrate/etc. out of the file name portion of a share URL.
    _SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)'
    _SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)'

    def _bt_extract_share_formats(self, video_id):
        """Yield format dicts built from the share-data JSON of ``video_id``.

        This is a generator: the JSON is only downloaded once iteration starts.
        If the response's ``status.code`` is not the integer ``1``, a warning is
        reported and nothing is yielded. Otherwise every top-level entry whose
        key is a string and whose value passes ``url_or_none`` is inspected;
        keys starting with ``audio`` produce audio-only formats and keys
        starting with ``download`` produce video formats. Other keys are ignored.
        """
        share_info = self._download_json(
            f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON')

        status_code = traverse_obj(share_info, ('status', 'code', {int}))
        if status_code != 1:
            api_message = format_field(
                share_info, [('status', 'message', {str})],
                'Share API response: %s', default='Unknown Share API Error')
            self.report_warning(api_message + bug_reports_message())
            return

        channel_counts = {'mono': 1, 'stereo': 2}

        for format_id, format_url in share_info.items():
            # Skip anything that is not a "name -> URL" pair (e.g. the status object)
            if not (isinstance(format_id, str) and url_or_none(format_url)):
                continue

            if format_id.startswith('audio'):
                audio_match = re.search(self._SHARE_AUDIO_REGEX, format_url)
                audio_details = traverse_obj(audio_match, {
                    'acodec': 'codec',
                    'audio_channels': ('channels', {channel_counts.get}),
                    'abr': ('bitrate', {int_or_none}),
                    'ext': 'ext',
                })
                yield {
                    'format_id': format_id,
                    'url': format_url,
                    'vcodec': 'none',
                    **audio_details,
                }

            elif format_id.startswith('download'):
                video_match = re.search(self._SHARE_VIDEO_REGEX, format_url)
                video_details = traverse_obj(video_match, {
                    'vcodec': 'codec',
                    'tbr': ('bitrate', {int_or_none}),
                    'width': ('width', {int_or_none}),
                    'height': ('height', {int_or_none}),
                    'ext': 'ext',
                })
                yield {
                    'format_id': format_id,
                    'url': format_url,
                    **video_details,
                }

    def _real_extract(self, url):
        """Build the info dict (id, formats, title, description) for ``url``.

        Formats come from the m3u8 playlist derived from the video id plus the
        links returned by ``_bt_extract_share_formats``. An HTTP 404 on the
        playlist raises an expected ``ExtractorError``; any other
        ``ExtractorError`` there is downgraded to a warning. Title and
        description are scraped from the overlay page, whose download is
        non-fatal.
        """
        video_id = self._match_id(url)
        formats = []
        playlist_url = self._INSTANCE_FORMAT.format(video_id)

        try:
            hls_formats = self._extract_m3u8_formats(playlist_url, video_id, m3u8_id='instance')
        except ExtractorError as error:
            if isinstance(error.cause, HTTPError) and error.cause.status == 404:
                raise ExtractorError('Could not find video id', expected=True)
            # Any other failure is tolerated so the share links can still be tried
            self.report_warning(f'Error extracting hls formats: {error}', video_id)
        else:
            formats.extend(hls_formats)

        formats.extend(self._bt_extract_share_formats(video_id))
        if not formats:
            self.raise_no_formats('Could not find suitable formats', video_id=video_id)

        overlay_page = self._download_webpage(
            self._OVERLAY_URL, video_id,
            query={'videoid': video_id, 'view': 'main'},
            note='Downloading metadata overlay', fatal=False,
        )

        # Index 0 below picks the first item of the helper's result
        # (presumably the element text rather than the full HTML -- TODO confirm)
        first_h3 = functools.partial(get_element_text_and_html_by_tag, 'h3')
        first_p = functools.partial(get_element_text_and_html_by_tag, 'p')
        # Drops any <span>...</span> fragments from the heading before cleaning it
        drop_spans = functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')

        info = {'id': video_id, 'formats': formats}
        info.update(traverse_obj(overlay_page, {
            'title': ({first_h3}, 0, {drop_spans}, {clean_html}),
            'description': ({first_p}, 0, {clean_html}),
        }))

        return info