[ie/youtube] Add age-gate workaround for some embeddable videos (#11821)
[yt-dlp.git] / yt_dlp / extractor / wppilot.py
blobb4cc1abc59d448fa0057ac9ffd586a0355dee6c4
1 import json
2 import random
3 import re
5 from .common import InfoExtractor
6 from ..utils import (
7 ExtractorError,
8 try_get,
class WPPilotBaseIE(InfoExtractor):
    """Shared helpers for the WP Pilot extractors.

    Provides the API endpoint templates, the request headers used for API
    calls, retrieval (with caching) of the channel list and conversion of a
    raw channel entry into an info dict.
    """

    # Channel endpoints; '%s' is filled with the channel id.
    _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s'
    _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s'

    # Headers sent with the JSON API requests.
    _HEADERS_WEB = {
        'Content-Type': 'application/json; charset=UTF-8',
        'Referer': 'https://pilot.wp.pl/tv/',
    }

    def _get_channel_list(self, cache=True):
        """Return a ``(channel_list, from_cache)`` tuple.

        When ``cache`` is ``True`` and a non-empty list is stored under the
        'wppilot'/'channel-list' cache key, that list is returned with
        ``from_cache=True``. Otherwise the list is looked up among the
        static query results referenced by the site's page data, stored in
        the cache and returned with ``from_cache=False``.

        Raises ExtractorError when no static query result holds a channel list.
        """
        stored = self.cache.load('wppilot', 'channel-list') if cache is True else None
        if stored:
            return stored, True

        tv_page = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage')
        build_root = self._search_regex(
            r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)',
            tv_page, 'gatsby build version')
        page_data_root = build_root + '/page-data'
        page_data = self._download_json(
            f'{page_data_root}/tv/page-data.json', None, 'Downloading page data')

        # Each static query is stored in its own JSON file; the first one
        # exposing data.allChannels.nodes is taken as the channel list.
        for query_hash in page_data['staticQueryHashes']:
            query_result = self._download_json(
                f'{page_data_root}/sq/d/{query_hash}.json', None,
                'Searching for channel list')
            nodes = try_get(query_result, lambda x: x['data']['allChannels']['nodes'])
            if nodes is not None:
                self.cache.store('wppilot', 'channel-list', nodes)
                return nodes, False

        raise ExtractorError('Unable to find the channel list')

    def _parse_channel(self, chan):
        """Build the info dict (id, title, is_live, thumbnails) for a raw channel entry."""
        channel_id = str(chan['id'])
        channel_title = chan['name']

        # Only the image fields that are present and non-empty become thumbnails.
        thumbnails = []
        for image_key in ('thumbnail', 'thumbnail_mobile', 'icon'):
            if chan.get(image_key):
                thumbnails.append({
                    'id': image_key,
                    'url': chan[image_key],
                })

        return {
            'id': channel_id,
            'title': channel_title,
            'is_live': True,
            'thumbnails': thumbnails,
        }
class WPPilotIE(WPPilotBaseIE):
    """Extractor for a single WP Pilot live channel.

    Accepts ``https://pilot.wp.pl/tv/#<slug>`` URLs as well as the internal
    ``wppilot:<id-or-slug>`` form.
    """

    _VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)'
    IE_NAME = 'wppilot'

    _TESTS = [{
        'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd',
        'info_dict': {
            'id': '158',
            'ext': 'mp4',
            'title': 'Telewizja WP HD',
        },
        'params': {
            'format': 'bestvideo',
        },
    }, {
        # audio only
        'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat',
        'info_dict': {
            'id': '238',
            'ext': 'm4a',
            'title': 'Radio Nowy Świat',
        },
        'params': {
            'format': 'bestaudio',
        },
    }, {
        'url': 'wppilot:9',
        'only_matching': True,
    }]

    @staticmethod
    def _find_channel(channel_list, key, value):
        """Return the first raw channel whose ``key`` field equals ``value``, else None.

        The field is stringified before comparison so that a numeric id
        coming from a URL also matches when the list stores ids as integers
        (``_parse_channel`` stringifies ids the same way).
        """
        for chan in channel_list:
            field = chan.get(key)
            if field is not None and str(field) == value:
                return chan
        return None

    def _get_channel(self, id_or_slug):
        """Look up a channel by numeric id or by slug and return its info dict.

        The cached channel list is consulted first; if the channel is not in
        a cached list, the list is downloaded again and searched once more.

        Raises ExtractorError when the channel cannot be found.
        """
        # All-digit identifiers are treated as ids, everything else as slugs.
        key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug'

        channel_list, is_cached = self._get_channel_list(cache=True)
        chan = self._find_channel(channel_list, key, id_or_slug)

        # if cached channel not found, download and retry
        if chan is None and is_cached:
            channel_list, _ = self._get_channel_list(cache=False)
            chan = self._find_channel(channel_list, key, id_or_slug)

        if chan is None:
            raise ExtractorError('Channel not found')
        return self._parse_channel(chan)

    def _real_extract(self, url):
        """Extract the live stream formats of the channel referenced by ``url``.

        Returns the channel info dict extended with a 'formats' list, or a
        url_result pointing back at ``url`` after a previous stream session
        has been invalidated.

        Raises ExtractorError when the API response holds no stream list.
        """
        channel = self._get_channel(self._match_id(url))
        video_id = str(channel['id'])

        session_cookie = next((c for c in self.cookiejar if c.name == 'netviapisessid'), None)
        # cookies starting with "g:" are assigned to guests
        is_authorized = session_cookie is not None and not session_cookie.value.startswith('g:')

        # 422 is accepted so that the error payload (which may carry a
        # stream token, handled below) can be inspected.
        video = self._download_json(
            (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id,
            video_id, query={
                'device_type': 'web',
            }, headers=self._HEADERS_WEB,
            expected_status=(200, 422))

        stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token'])
        if stream_token:
            close = self._download_json(
                'https://pilot.wp.pl/api/v1/channels/close', video_id,
                'Invalidating previous stream session', headers=self._HEADERS_WEB,
                data=json.dumps({
                    'channelId': video_id,
                    't': stream_token,
                }).encode())
            if try_get(close, lambda x: x['data']['status']) == 'ok':
                # Start over now that the previous session has been closed.
                return self.url_result(url, ie=WPPilotIE.ie_key())

        # Fail with a clear message instead of an opaque KeyError/TypeError
        # when the response carries no stream data (e.g. an error payload
        # that could not be resolved above).
        streams = try_get(video, lambda x: x['data']['stream_channel']['streams'])
        if streams is None:
            raise ExtractorError(
                'Unable to find the stream list in the API response', video_id=video_id)

        formats = []
        for fmt in streams:
            # Live DASH ('dash@live:abr') does not work for now, so only the
            # HLS variant is extracted.
            if fmt['type'] == 'hls@live:abr':
                # fmt['url'] holds several candidate URLs (presumably
                # mirrors - TODO confirm); one is picked at random.
                formats.extend(
                    self._extract_m3u8_formats(
                        random.choice(fmt['url']),
                        video_id, live=True))

        channel['formats'] = formats
        return channel
class WPPilotChannelsIE(WPPilotBaseIE):
    """Playlist extractor listing every WP Pilot channel.

    Matches the bare site / TV page URL (without a channel fragment) and the
    bare ``wppilot:`` form.
    """

    _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$'
    IE_NAME = 'wppilot:channels'

    _TESTS = [{
        'url': 'wppilot:',
        'info_dict': {
            'id': 'wppilot',
            'title': 'WP Pilot',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://pilot.wp.pl/',
        'only_matching': True,
    }]

    def _entries(self):
        """Yield one url_transparent entry per channel, delegating to WPPilotIE."""
        channels, _from_cache = self._get_channel_list()
        for raw_channel in channels:
            # Channel metadata comes first; the routing keys are appended so
            # that extraction of the entry is handed over to WPPilotIE.
            yield {
                **self._parse_channel(raw_channel),
                '_type': 'url_transparent',
                'url': f'wppilot:{raw_channel["id"]}',
                'ie_key': WPPilotIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return a playlist with id 'wppilot' and title 'WP Pilot' of all channels."""
        channel_entries = self._entries()
        return self.playlist_result(channel_entries, 'wppilot', 'WP Pilot')