[ie/dplay] Fix extractors (#10471)
[yt-dlp3.git] / yt_dlp / extractor / rtlnl.py
blob4537b3dc4e57b11304f75343f01b1db36f8d3672
1 from .common import InfoExtractor
2 from ..utils import (
3 int_or_none,
4 parse_duration,
# Extractor for rtl.nl / rtlxl.nl video pages and the rtl.nl embed players.
class RtlNlIE(InfoExtractor):
    IE_NAME = 'rtl.nl'
    IE_DESC = 'rtl.nl and rtlxl.nl'
    # Matches <iframe> embeds of the rtl.nl player on third-party pages.
    _EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)']
    # Verbose regex; the `id` group captures the video UUID.
    _VALID_URL = r'''(?x)
        https?://(?:(?:www|static)\.)?
        (?:
            rtlxl\.nl/(?:[^\#]*\#!|programma)/[^/]+/|
            rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)|
            embed\.rtl\.nl/\#uuid=
        )
        (?P<id>[0-9a-f-]+)'''

    _TESTS = [{
        # new URL schema
        'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f',
        'md5': '490428f1187b60d714f34e1f2e3af0b6',
        'info_dict': {
            'id': '0bd1384d-d970-3086-98bb-5c104e10c26f',
            'ext': 'mp4',
            'title': 'RTL Nieuws',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'timestamp': 1593293400,
            'upload_date': '20200627',
            'duration': 661.08,
        },
    }, {
        # old URL schema
        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
        'md5': '473d1946c1fdd050b2c0161a4b13c373',
        'info_dict': {
            'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
            'ext': 'mp4',
            'title': 'RTL Nieuws',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'timestamp': 1461951000,
            'upload_date': '20160429',
            'duration': 1167.96,
        },
        'skip': '404',
    }, {
        # best format available a3t
        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
        'md5': 'dea7474214af1271d91ef332fb8be7ea',
        'info_dict': {
            'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed',
            'ext': 'mp4',
            'timestamp': 1424039400,
            'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
            'upload_date': '20150215',
            'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
        },
    }, {
        # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275)
        # best format available nettv
        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
        'info_dict': {
            'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
            'ext': 'mp4',
            'title': 'RTL Nieuws - Meer beelden van overval juwelier',
            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
            'timestamp': 1437233400,
            'upload_date': '20150718',
            'duration': 30.474,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # encrypted m3u8 streams, georestricted
        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
        'only_matching': True,
    }, {
        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
        'only_matching': True,
    }, {
        'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',
        'only_matching': True,
    }, {
        'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/',
        'only_matching': True,
    }, {
        'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl',
        'only_matching': True,
    }, {
        # new embed URL schema
        'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Fetch the JSON description for the UUID in `url` and build the
        # info dict (id, title, formats, timestamp, description, duration,
        # thumbnails) from it.
        uuid = self._match_id(url)
        info = self._download_json(
            f'http://www.rtl.nl/system/s4m/vfd/version=2/uuid={uuid}/fmt=adaptive/',
            uuid)

        # The first material entry and the first abstract name are mandatory:
        # without them there is no video path and no title.
        material = info['material'][0]
        title = info['abstracts'][0]['name']
        subtitle = material.get('title')
        if subtitle:
            title += f' - {subtitle}'
        description = material.get('synopsis')

        # `or {}` also covers an explicit null `meta` in the response.
        meta = info.get('meta') or {}

        videopath = material['videopath']
        # Fall back to the default host when `videohost` is absent or null.
        m3u8_url = (meta.get('videohost') or 'http://manifest.us.rtl.nl') + videopath

        formats = self._extract_m3u8_formats(
            m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)

        thumbnails = []

        # NOTE(review): the second key literally contains double quotes;
        # presumably it mirrors the key name used by the API response.
        # Confirm against a live response before changing it.
        for p in ('poster_base_url', '"thumb_base_url"'):
            base_url = meta.get(p)
            if not base_url:
                continue

            # Width and height are read from the `/sz=<W>x<H>` part of the
            # base URL; both stay None when that part is missing.
            thumbnails.append({
                'url': self._proto_relative_url(base_url + uuid),
                'width': int_or_none(self._search_regex(
                    r'/sz=([0-9]+)', base_url, 'thumbnail width', fatal=False)),
                'height': int_or_none(self._search_regex(
                    r'/sz=[0-9]+x([0-9]+)',
                    base_url, 'thumbnail height', fatal=False)),
            })

        return {
            'id': uuid,
            'title': title,
            'formats': formats,
            # Optional field: a missing or malformed date must not abort
            # an otherwise successful extraction.
            'timestamp': int_or_none(material.get('original_date')),
            'description': description,
            'duration': parse_duration(material.get('duration')),
            'thumbnails': thumbnails,
        }
# Shared page-scraping logic for the rtl.lu extractors that subclass it.
class RTLLuBaseIE(InfoExtractor):
    # One capture group per media type: the attribute value of the player tag.
    _MEDIA_REGEX = {
        'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)',
        'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)',
        'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)',
    }

    def get_media_url(self, webpage, video_id, media_type):
        # Returns the first match of the pattern registered for `media_type`,
        # or None when the page has no such tag. `video_id` is not used here.
        pattern = self._MEDIA_REGEX[media_type]
        return self._search_regex(
            pattern, webpage, f'{media_type} url', default=None)

    def get_formats_and_subtitles(self, webpage, video_id):
        # Builds (formats, subtitles) from the HLS and/or audio URL in the page.
        hls_url = self.get_media_url(webpage, video_id, 'video')
        mp3_url = self.get_media_url(webpage, video_id, 'audio')

        if hls_url is None:
            formats, subtitles = [], {}
        else:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id)

        if mp3_url is not None:
            # The audio player source is added as an extra audio-only format.
            formats.append({
                'url': mp3_url,
                'ext': 'mp3',
                'vcodec': 'none',
            })

        return formats, subtitles

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # TODO: extract comment from https://www.rtl.lu/comments?status=1&order=desc&context=news|article|<video_id>
        # we can get the context from <rtl-comments context=<context> in webpage
        webpage = self._download_webpage(url, video_id)

        formats, subtitles = self.get_formats_and_subtitles(webpage, video_id)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)
        # Prefer the player's poster; fall back to the OpenGraph thumbnail.
        thumbnail = self.get_media_url(webpage, video_id, 'thumbnail')
        if not thumbnail:
            thumbnail = self._og_search_thumbnail(webpage, default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'subtitles': subtitles,
            'thumbnail': thumbnail,
            # Only these three ids are treated as live streams.
            'is_live': video_id in ('live', 'live-2', 'lauschteren'),
        }
# rtl.lu on-demand video pages: /tele/<slug>/v/<id>[.html] and /video/<id>.
class RTLLuTeleVODIE(RTLLuBaseIE):
    IE_NAME = 'rtl.lu:tele-vod'
    _VALID_URL = r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            'url': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html',
            'info_dict': {
                'id': '3266757',
                'title': 'Informatiounsversammlung Héichwaasser',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2021/11/16/d3647fc4-470d-11ec-adc2-3a00abd6e90f_00008.jpg',
                'description': 'md5:b1db974408cc858c9fd241812e4a2a14',
            },
        },
        {
            'url': 'https://www.rtl.lu/video/3295215',
            'info_dict': {
                'id': '3295215',
                'title': 'Kulturassisen iwwer d\'Bestandsopnam vum Lëtzebuerger Konscht',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/06/28/0000_3295215_0000.jpg',
                'description': 'md5:85bcd4e0490aa6ec969d9bf16927437b',
            },
        },
    ]
# rtl.lu article pages (/a/<id>.html) on the www, 5minutes and today subdomains.
class RTLLuArticleIE(RTLLuBaseIE):
    IE_NAME = 'rtl.lu:article'
    _VALID_URL = r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html'
    _TESTS = [
        {
            # Audio-only
            'url': 'https://www.rtl.lu/sport/news/a/1934360.html',
            'info_dict': {
                'id': '1934360',
                'ext': 'mp3',
                'thumbnail': 'https://static.rtl.lu/rtl2008.lu/nt/p/2022/06/28/19/e4b37d66ddf00bab4c45617b91a5bb9b.jpeg',
                'description': 'md5:5eab4a2a911c1fff7efc1682a38f9ef7',
                'title': 'md5:40aa85f135578fbd549d3c9370321f99',
            },
        },
        {
            # 5minutes
            'url': 'https://5minutes.rtl.lu/espace-frontaliers/frontaliers-en-questions/a/1853173.html',
            'info_dict': {
                'id': '1853173',
                'ext': 'mp4',
                'description': 'md5:ac031da0740e997a5cf4633173634fee',
                'title': 'md5:87e17722ed21af0f24be3243f4ec0c46',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/01/26/screenshot_20220126104933_3274749_12b249833469b0d6e4440a1dec83cdfa.jpg',
            },
        },
        {
            # today.lu
            'url': 'https://today.rtl.lu/entertainment/news/a/1936203.html',
            'info_dict': {
                'id': '1936203',
                'ext': 'mp4',
                'title': 'Once Upon A Time...zu Lëtzebuerg: The Three Witches\' Tower',
                'description': 'The witchy theme continues in the latest episode of Once Upon A Time...',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/07/02/screenshot_20220702122859_3290019_412dc5185951b7f6545a4039c8be9235.jpg',
            },
        },
    ]
# rtl.lu live pages under /tele/ and /radio/; the id is the trailing path part.
class RTLLuLiveIE(RTLLuBaseIE):
    _VALID_URL = r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)'
    _TESTS = [
        {
            # Tele:live
            'url': 'https://www.rtl.lu/tele/live',
            'info_dict': {
                'id': 'live',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel1.jpg',
            },
        },
        {
            # Tele:live-2
            'url': 'https://www.rtl.lu/tele/live-2',
            'info_dict': {
                'id': 'live-2',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel2.jpg',
            },
        },
        {
            # Radio:lauschteren
            'url': 'https://www.rtl.lu/radio/lauschteren',
            'info_dict': {
                'id': 'lauschteren',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Radio LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/rtlradiowebtv.jpg',
            },
        },
    ]
283 class RTLLuRadioIE(RTLLuBaseIE):
284 _VALID_URL = r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?'
285 _TESTS = [{
286 'url': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html',
287 'info_dict': {
288 'id': '4033058',
289 'ext': 'mp3',
290 'description': 'md5:f855a4f3e3235393ae47ed1db5d934b9',
291 'title': '5 vir 12 - Stau um Stau',
292 'thumbnail': 'https://static.rtl.lu/rtlg//2022/06/24/c9c19e5694a14be46a3647a3760e1f62.jpg',