[ie/dropout] Fix extraction (#12102)
[yt-dlp.git] / yt_dlp / extractor / npo.py
blob178fd98bf72bd0ba729e4be332ca043c5ca938df
1 import random
2 import re
3 import urllib.parse
5 from .common import InfoExtractor
6 from ..utils import (
7 determine_ext,
8 int_or_none,
9 merge_dicts,
10 orderedSet,
11 str_or_none,
12 try_call,
13 unified_timestamp,
14 url_or_none,
15 urlencode_postdata,
class NPOIE(InfoExtractor):
    # Extractor for programme pages on the npo.nl family of sites and for the
    # internal `npo:<id>` URL scheme that sibling extractors delegate to.
    IE_NAME = 'npo'
    IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
    _VALID_URL = r'''(?x)
                    (?:
                        npo:|
                        https?://
                            (?:www\.)?
                            (?:
                                npo\.nl/(?:[^/]+/)*|
                                (?:ntr|npostart)\.nl/(?:[^/]+/){2,}|
                                omroepwnl\.nl/video/fragment/[^/]+__|
                                (?:zapp|npo3)\.nl/(?:[^/]+/){2,}
                            )
                        )
                        (?P<id>[^/?#]+)
                '''

    _TESTS = [{
        'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
        'md5': '4b3f9c429157ec4775f2c9cb7b911016',
        'info_dict': {
            'id': 'VPWON_1220719',
            'ext': 'm4v',
            'title': 'Nieuwsuur',
            'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
            'upload_date': '20140622',
        },
        'skip': 'Video was removed',
    }, {
        'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
        'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
        'info_dict': {
            'id': 'VARA_101191800',
            'ext': 'm4v',
            'title': 'De Mega Mike & Mega Thomas show: The best of.',
            'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
            'upload_date': '20090227',
            'duration': 2400,
        },
        'skip': 'Video was removed',
    }, {
        'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
        'md5': '1b279c0547f6b270e014c576415268c5',
        'info_dict': {
            'id': 'VPWON_1169289',
            'ext': 'mp4',
            'title': 'Zwart geld: de toekomst komt uit Afrika',
            'description': 'md5:dffaf3d628a9c36f78ca48d834246261',
            'upload_date': '20130225',
            'duration': 3000,
            'creator': 'NED2',
            'series': 'Tegenlicht',
            'timestamp': 1361822340,
            'thumbnail': 'https://images.npo.nl/tile/1280x720/142854.jpg',
            'episode': 'Zwart geld: de toekomst komt uit Afrika',
            'episode_number': 18,
        },
    }, {
        'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
        'info_dict': {
            'id': 'WO_VPRO_043706',
            'ext': 'mp4',
            'title': 'De nieuwe mens - Deel 1',
            'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
            'duration': 4680,
            'episode': 'De nieuwe mens - Deel 1',
            'thumbnail': 'https://images.npo.nl/tile/1280x720/6289.jpg',
            'timestamp': 1279716057,
            'series': 'De nieuwe mens - Deel 1',
            'upload_date': '20100721',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # non asf in streams
        'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
        'info_dict': {
            'id': 'WO_NOS_762771',
            'ext': 'mp4',
            'title': 'Hoe gaat Europa verder na Parijs?',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video was removed',
    }, {
        'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
        'info_dict': {
            'id': 'VPWON_1233944',
            'ext': 'mp4',
            'title': 'Aap, poot, pies',
            'description': 'md5:4b46b1b9553b4c036a04d2a532a137e6',
            'upload_date': '20150508',
            'duration': 599,
            'episode': 'Aap, poot, pies',
            'thumbnail': 'https://images.poms.omroep.nl/image/s1280/c1280x720/608118.jpg',
            'timestamp': 1431064200,
            'series': 'Aap, poot, pies',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
        'info_dict': {
            'id': 'POW_00996502',
            'ext': 'm4v',
            'title': '''"Dit is wel een 'landslide'..."''',
            'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
            'upload_date': '20150508',
            'duration': 462,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video was removed',
    }, {
        # audio
        'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437',
        'info_dict': {
            'id': 'RBX_FUNX_6683215',
            'ext': 'mp3',
            'title': 'Jouw Stad Rotterdam',
            'description': 'md5:db251505244f097717ec59fabc372d9f',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video was removed',
    }, {
        'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
        'only_matching': True,
    }, {
        'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118',
        'only_matching': True,
    }, {
        'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
        'only_matching': True,
    }, {
        'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
        'only_matching': True,
    }, {
        # live stream
        'url': 'npo:LI_NL1_4188102',
        'only_matching': True,
    }, {
        'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373',
        'only_matching': True,
    }, {
        'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927',
        'only_matching': True,
    }, {
        'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996',
        'only_matching': True,
    }, {
        'url': 'https://npo.nl/KN_1698996',
        'only_matching': True,
    }, {
        'url': 'https://www.npo3.nl/the-genius/21-11-2022/VPWON_1341105',
        'info_dict': {
            'id': 'VPWON_1341105',
            'ext': 'mp4',
            'duration': 2658,
            'series': 'The Genius',
            'description': 'md5:db02f1456939ca63f7c408f858044e94',
            'title': 'The Genius',
            'timestamp': 1669062000,
            'creator': 'NED3',
            'episode': 'The Genius',
            'thumbnail': 'https://images.npo.nl/tile/1280x720/1827650.jpg',
            'episode_number': 8,
            'upload_date': '20221121',
        },
        'params': {
            'skip_download': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Return False when a live/radio extractor matches *url*, else defer to the base check."""
        return (False if any(ie.suitable(url)
                             for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE))
                else super().suitable(url))

    def _real_extract(self, url):
        """Resolve a player token, collect formats per stream profile and merge embed-page metadata.

        Returns an info dict. The embed page is optional: any failure to
        download or parse it leaves the basic `id`/`title`/`formats` result.
        """
        video_id = self._match_id(url)
        if urllib.parse.urlparse(url).netloc in ['www.ntr.nl', 'ntr.nl']:
            # ntr.nl exposes the player JSON through its own AJAX endpoint
            player = self._download_json(
                f'https://www.ntr.nl/ajax/player/embed/{video_id}', video_id,
                'Downloading player JSON', query={
                    'parameters[elementId]': f'npo{random.randint(0, 999)}',
                    'parameters[sterReferralUrl]': url,
                    'parameters[autoplay]': 0,
                })
        else:
            # Response body is discarded; presumably this request sets the
            # XSRF-TOKEN cookie read below -- verify against the site
            self._request_webpage(
                'https://www.npostart.nl/api/token', video_id,
                'Downloading token', headers={
                    'Referer': url,
                    'X-Requested-With': 'XMLHttpRequest',
                })
            player = self._download_json(
                f'https://www.npostart.nl/player/{video_id}', video_id,
                'Downloading player JSON', data=urlencode_postdata({
                    'autoplay': 0,
                    'share': 1,
                    'pageUrl': url,
                    'hasAdConsent': 0,
                }), headers={
                    'x-xsrf-token': try_call(lambda: urllib.parse.unquote(
                        self._get_cookies('https://www.npostart.nl')['XSRF-TOKEN'].value)),
                })

        player_token = player['token']

        drm = False
        format_urls = set()
        formats = []
        for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
            streams = self._download_json(
                f'https://start-player.npo.nl/video/{video_id}/streams',
                video_id, f'Downloading {profile} profile JSON', fatal=False,
                query={
                    'profile': profile,
                    'quality': 'npoplus',
                    'tokenId': player_token,
                    'streamType': 'broadcast',
                }, data=b'')  # endpoint requires POST
            if not streams:
                continue
            stream = streams.get('stream')
            if not isinstance(stream, dict):
                continue
            stream_url = url_or_none(stream.get('src'))
            # Different profiles may return the same manifest; process each URL once
            if not stream_url or stream_url in format_urls:
                continue
            format_urls.add(stream_url)
            # Protected streams are only remembered so DRM can be reported below
            if stream.get('protection') is not None or stream.get('keySystemOptions') is not None:
                drm = True
                continue
            stream_type = stream.get('type')
            stream_ext = determine_ext(stream_url)
            if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    stream_url, video_id, mpd_id='dash', fatal=False))
            elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_id, ext='mp4',
                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
            elif re.search(r'\.isml?/Manifest', stream_url):
                formats.extend(self._extract_ism_formats(
                    stream_url, video_id, ism_id='mss', fatal=False))
            else:
                formats.append({
                    'url': stream_url,
                })

        if not formats:
            if not self.get_param('allow_unplayable_formats') and drm:
                self.report_drm(video_id)

        info = {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }

        embed_url = url_or_none(player.get('embedUrl'))
        if embed_url:
            webpage = self._download_webpage(
                embed_url, video_id, 'Downloading embed page', fatal=False)
            if webpage:
                # The embed page only supplies optional metadata, so a JSON
                # blob that fails to parse must not abort the extraction
                video = self._parse_json(
                    self._search_regex(
                        r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
                        default='{}'), video_id, fatal=False)
                if video:
                    title = video.get('episodeTitle')
                    subtitles = {}
                    subtitles_list = video.get('subtitles')
                    if isinstance(subtitles_list, list):
                        for cc in subtitles_list:
                            # Skip malformed entries instead of raising on `.get`
                            if not isinstance(cc, dict):
                                continue
                            cc_url = url_or_none(cc.get('src'))
                            if not cc_url:
                                continue
                            lang = str_or_none(cc.get('language')) or 'nl'
                            subtitles.setdefault(lang, []).append({
                                'url': cc_url,
                            })
                    # Embed metadata is passed first and `info` second;
                    # presumably earlier dicts win in merge_dicts -- verify
                    return merge_dicts({
                        'title': title,
                        'description': video.get('description'),
                        'thumbnail': url_or_none(
                            video.get('still_image_url') or video.get('orig_image_url')),
                        'duration': int_or_none(video.get('duration')),
                        'timestamp': unified_timestamp(video.get('broadcastDate')),
                        'creator': video.get('channel'),
                        'series': video.get('title'),
                        'episode': title,
                        'episode_number': int_or_none(video.get('episodeNumber')),
                        'subtitles': subtitles,
                    }, info)

        return info
class NPOLiveIE(InfoExtractor):
    # Resolves npo.nl / npostart.nl live pages to an `npo:<id>` URL handled by NPOIE.
    IE_NAME = 'npo.nl:live'
    _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?'

    _TESTS = [{
        'url': 'http://www.npo.nl/live/npo-1',
        'info_dict': {
            'id': 'LI_NL1_4188102',
            'display_id': 'npo-1',
            'ext': 'mp4',
            'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'is_live': True,
        },
        'params': {'skip_download': True},
    }, {
        'url': 'http://www.npo.nl/live',
        'only_matching': True,
    }, {
        'url': 'https://www.npostart.nl/live/npo-1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Find the live media id on the page and hand it over to NPOIE."""
        channel_slug = self._match_id(url)
        if not channel_slug:
            # The bare /live URL carries no channel; fall back to 'npo-1'
            channel_slug = 'npo-1'

        page = self._download_webpage(url, channel_slug)

        id_patterns = [r'media-id="([^"]+)"', r'data-prid="([^"]+)"']
        live_id = self._search_regex(id_patterns, page, 'live id')

        result = {
            '_type': 'url_transparent',
            'url': f'npo:{live_id}',
            'ie_key': NPOIE.ie_key(),
            'id': live_id,
            'display_id': channel_slug,
        }
        return result
class NPORadioIE(InfoExtractor):
    # Live radio channel pages under npo.nl/radio/<channel>.
    IE_NAME = 'npo.nl:radio'
    _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)'

    _TEST = {
        'url': 'http://www.npo.nl/radio/radio-1',
        'info_dict': {
            'id': 'radio-1',
            'ext': 'mp3',
            'title': 're:^NPO Radio 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'is_live': True,
        },
        'params': {'skip_download': True},
    }

    @classmethod
    def suitable(cls, url):
        """Reject fragment URLs (handled by NPORadioFragmentIE); otherwise use the base check."""
        if NPORadioFragmentIE.suitable(url):
            return False
        return super().suitable(url)

    @staticmethod
    def _html_get_attribute_regex(attribute):
        """Build a regex capturing the single-quoted value of HTML attribute *attribute*."""
        return rf"{attribute}\s*=\s*\'([^\']+)\'"

    def _real_extract(self, url):
        """Read the channel name and stream JSON from the page's data attributes."""
        channel_id = self._match_id(url)

        page = self._download_webpage(url, channel_id)

        channel_name = self._html_search_regex(
            self._html_get_attribute_regex('data-channel'), page, 'title')

        streams_json = self._html_search_regex(
            self._html_get_attribute_regex('data-streams'), page, 'data-streams')
        stream = self._parse_json(streams_json, channel_id)

        # The codec string doubles as the file extension
        codec = stream.get('codec')

        return {
            'id': channel_id,
            'url': stream['url'],
            'title': channel_name,
            'acodec': codec,
            'ext': codec,
            'is_live': True,
        }
class NPORadioFragmentIE(InfoExtractor):
    # Individual radio fragments under npo.nl/radio/<channel>/fragment/<id>.
    IE_NAME = 'npo.nl:radio:fragment'
    _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/[^/]+/fragment/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.npo.nl/radio/radio-5/fragment/174356',
        'md5': 'dd8cc470dad764d0fdc70a9a1e2d18c2',
        'info_dict': {
            'id': '174356',
            'ext': 'mp3',
            'title': 'Jubileumconcert Willeke Alberti',
        },
    }

    def _real_extract(self, url):
        """Scrape the fragment title and the `data-streams` audio URL from the page."""
        fragment_id = self._match_id(url)

        page = self._download_webpage(url, fragment_id)

        # The title is taken from the link that points back at this fragment
        title_pattern = rf'href="/radio/[^/]+/fragment/{fragment_id}" title="([^"]+)"'
        fragment_title = self._html_search_regex(title_pattern, page, 'title')

        stream_url = self._search_regex(
            r"data-streams='([^']+)'", page, 'audio url')

        return {
            'id': fragment_id,
            'url': stream_url,
            'title': fragment_title,
        }
class NPODataMidEmbedIE(InfoExtractor):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    # Shared logic for sites whose pages embed an NPO media id in a `data-mid` attribute.

    def _real_extract(self, url):
        """Read the `data-mid` value from the page and delegate to the NPO extractor."""
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)
        # Accepts either quote style; the back-reference keeps the quotes balanced
        mid = self._search_regex(
            r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', page, 'video_id', group='id')
        result = {
            '_type': 'url_transparent',
            'ie_key': 'NPO',
            'url': f'npo:{mid}',
            'display_id': slug,
        }
        return result
class SchoolTVIE(NPODataMidEmbedIE):
    # schooltv.nl video pages; extraction is inherited from NPODataMidEmbedIE.
    IE_NAME = 'schooltv'
    _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)'

    _TEST = {
        'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/',
        'info_dict': {
            'id': 'WO_NTR_429477',
            'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam',
            'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?',
            'ext': 'mp4',
            'description': 'md5:abfa0ff690adb73fd0297fd033aaa631',
        },
        # Skip because of m3u8 download
        'params': {'skip_download': True},
    }
class HetKlokhuisIE(NPODataMidEmbedIE):
    # hetklokhuis.nl episode pages; extraction is inherited from NPODataMidEmbedIE.
    IE_NAME = 'hetklokhuis'
    _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/[^/]+/\d+/(?P<id>[^/?#&]+)'

    _TEST = {
        'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven',
        'info_dict': {
            'id': 'VPWON_1260528',
            'display_id': 'Zwaartekrachtsgolven',
            'ext': 'm4v',
            'title': 'Het Klokhuis: Zwaartekrachtsgolven',
            'description': 'md5:c94f31fb930d76c2efa4a4a71651dd48',
            'upload_date': '20170223',
        },
        'params': {'skip_download': True},
    }
class NPOPlaylistBaseIE(NPOIE):  # XXX: Do not subclass from concrete IE
    # Base for playlist pages; subclasses supply `_PLAYLIST_ENTRY_RE` and `_PLAYLIST_TITLE_RE`.

    def _real_extract(self, url):
        """Build a playlist from every unique entry matched by `_PLAYLIST_ENTRY_RE`."""
        playlist_id = self._match_id(url)

        page = self._download_webpage(url, playlist_id)

        entries = []
        for media_ref in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, page)):
            # Absolute URLs are passed through; anything else is treated as an NPO media id
            target = media_ref if media_ref.startswith('http') else f'npo:{media_ref}'
            entries.append(self.url_result(target))

        playlist_title = self._html_search_regex(
            self._PLAYLIST_TITLE_RE, page, 'playlist title', default=None)
        if not playlist_title:
            # Fall back to the page's OpenGraph title
            playlist_title = self._og_search_title(page)

        return self.playlist_result(entries, playlist_id, playlist_title)
class VPROIE(NPOPlaylistBaseIE):
    # vpro.nl, tegenlicht.vpro.nl and 2doc.nl pages ending in `.html`.
    IE_NAME = 'vpro'
    _VALID_URL = r'https?://(?:www\.)?(?:(?:tegenlicht\.)?vpro|2doc)\.nl/(?:[^/]+/)*(?P<id>[^/]+)\.html'
    # Two title patterns are supplied: the platform title and the platform subtitle
    _PLAYLIST_TITLE_RE = (
        r'<h1[^>]+class=["\'].*?\bmedia-platform-title\b.*?["\'][^>]*>([^<]+)',
        r'<h5[^>]+class=["\'].*?\bmedia-platform-subtitle\b.*?["\'][^>]*>([^<]+)',
    )
    _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"'

    _TESTS = [{
        'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html',
        'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
        'info_dict': {
            'id': 'VPWON_1169289',
            'ext': 'm4v',
            'title': 'De toekomst komt uit Afrika',
            'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
            'upload_date': '20130225',
        },
        'skip': 'Video gone',
    }, {
        'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html',
        'info_dict': {
            'id': 'sergio-herman',
            'title': 'sergio herman: fucking perfect',
        },
        'playlist_count': 2,
    }, {
        # playlist with youtube embed
        'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html',
        'info_dict': {
            'id': 'education-education',
            'title': 'education education',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html',
        'info_dict': {
            'id': 'de-tegenprestatie',
            'title': 'De Tegenprestatie',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.2doc.nl/speel~VARA_101375237~mh17-het-verdriet-van-nederland~.html',
        'info_dict': {
            'id': 'VARA_101375237',
            'ext': 'm4v',
            'title': 'MH17: Het verdriet van Nederland',
            'description': 'md5:09e1a37c1fdb144621e22479691a9f18',
            'upload_date': '20150716',
        },
        # Skip because of m3u8 download
        'params': {'skip_download': True},
    }]
class WNLIE(NPOPlaylistBaseIE):
    # omroepwnl.nl video detail pages, split into "Deel N" parts.
    IE_NAME = 'wnl'
    _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
    _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>'
    _PLAYLIST_ENTRY_RE = r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+'

    _TESTS = [{
        'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
        'info_dict': {
            'id': 'vandaag-de-dag-6-mei',
            'title': 'Vandaag de Dag 6 mei',
        },
        'playlist_count': 4,
    }]
class AndereTijdenIE(NPOPlaylistBaseIE):
    # anderetijden.nl programme pages; entries come from `data-prid` on episode figures.
    IE_NAME = 'anderetijden'
    _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/programma/(?:[^/]+/)+(?P<id>[^/?#&]+)'
    _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class=["\'].*?\bpage-title\b.*?["\'][^>]*>(.+?)</h1>'
    _PLAYLIST_ENTRY_RE = r'<figure[^>]+class=["\']episode-container episode-page["\'][^>]+data-prid=["\'](.+?)["\']'

    _TESTS = [{
        'url': 'http://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem',
        'info_dict': {
            'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem',
            'title': 'Duitse soldaten over de Slag bij Arnhem',
        },
        'playlist_count': 3,
    }]