[ie/youtube] Fix `uploader_id` extraction (#11818)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
blob7609b4017861c0529075020020c1ff422420c8aa
1 import random
2 import re
3 import urllib.parse
5 from .common import InfoExtractor
6 from ..utils import (
7 determine_ext,
8 parse_count,
9 remove_end,
10 unified_timestamp,
class NitterIE(InfoExtractor):
    """Extract the video attached to a tweet viewed through a Nitter instance.

    URLs are recognised by host name: any host listed in INSTANCES, followed
    by ``/<uploader_id>/status/<numeric id>``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    # Every known host is accepted for URL matching, dead ones included.
    # NOTE(review): 'nitter.tokhmi.xyz' and 'tweet.lambda.dance' are listed in both
    # HTTP_INSTANCES and DEAD_INSTANCES. The duplicate is harmless in the regex
    # alternation, but current_instance below may pick them - confirm which list is right.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once when the class body is evaluated; only used to build the test URLs below
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        },
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build the info dict for its video.

        The video URL is mandatory (its regex search is fatal); every other
        field is best-effort and is left as None / empty when the page does
        not provide it.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = urllib.parse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # Set before the page is fetched; presumably asks the instance to
        # expose HLS playback URLs - TODO confirm against Nitter preferences
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # Limit tweet-specific searches to the main tweet and what follows it,
        # so earlier tweets on the page are not picked up.
        # str.find() returns -1 when the marker is absent.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        # urljoin gives the same result as plain concatenation for the usual
        # "/path" value, and also copes with absolute or non-root-relative URLs
        video_url = urllib.parse.urljoin(base_url, self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: handled as an HLS manifest
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the handle shown on the page; fall back to the one from the URL
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        # Only prefix when there is a title, to avoid producing "<uploader> - None"
        if uploader and title:
            title = f'{uploader} - {title}'

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon that is present with no number next to it means zero
            counts[f'{field}_count'] = 0 if count == '' else parse_count(count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Build a URL only when a poster was found; formatting None
            # into the URL would yield a bogus "<base_url>None" thumbnail
            if poster:
                thumbnail = remove_end(urllib.parse.urljoin(base_url, poster), '%3Asmall')

        # Size variants are addressed by appending an URL-encoded ":<size>" suffix
        thumbnails = [
            {'id': id_, 'url': f'{thumbnail}%3A{id_}'}
            for id_ in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }