5 from .common
import InfoExtractor
14 class NitterIE(InfoExtractor
):
15 # Taken from https://github.com/zedeus/nitter/wiki/Instances
17 NON_HTTP_INSTANCES
= (
18 '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
19 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
20 'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
21 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
22 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
23 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
24 '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
25 'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
26 'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
27 'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
28 'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
29 'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
30 'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
31 'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
32 'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
33 'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
34 'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
37 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
39 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
43 'nitter.lacontrevoie.fr',
48 'nitter.domain.glass',
50 'birdsite.xanny.family',
59 'nitter.weiler.rocks',
60 'nitter.sethforprivacy.com',
61 'nitter.cutelab.space',
65 'nitter.esmailelbob.xyz',
67 'nitter.winscloud.net',
68 'nitter.tiekoetter.com',
70 'nitter.privacy.com.de',
72 'nitter.bird.froth.zone',
74 'twitter.dr460nf1r3.org',
75 'nitter.garudalinux.org',
78 'nitter.privacydev.net',
87 'nitter.projectsegfau.lt',
88 'nitter.eu.projectsegfau.lt',
89 'singapore.unofficialbird.com',
90 'canada.unofficialbird.com',
91 'india.unofficialbird.com',
92 'nederland.unofficialbird.com',
93 'uk.unofficialbird.com',
98 'read.whatever.social',
99 'nitter.rawbit.ninja',
101 'ntr.odyssey346.dev',
103 'nitter.privacytools.io',
104 'nitter.sneed.network',
106 'nitter.manasiwibi.com',
111 'nitter.caioalonso.com',
118 'nitter.adminforge.de',
119 'nitter.platypush.tech',
122 'nitter.us.projectsegfau.lt',
123 'nitter.arcticfoxes.net',
126 'nitter.ktachibana.party',
127 'nitter.riverside.rocks',
128 'nitter.girlboss.ceo',
132 'ntr.frail.duckdns.org',
133 'nitter.librenode.org',
142 # official, rate limited
145 'is-nitter.resolv.ee',
146 'lu-nitter.resolv.ee',
149 'nitter.cattube.org',
152 'nitter.himiko.cloud',
154 'nitter.mailstation.de',
155 'nitter.mastodont.cat',
156 'nitter.tedomum.net',
158 'nitter.weaponizedhumiliation.com',
159 'nitter.vxempire.xyz',
160 'tweet.lambda.dance',
163 'nitter.pussthecat.org',
164 'nitter.nixnet.services',
166 'nitter.actionsack.com',
169 'nittereu.moomoo.me',
171 'twitter.grimneko.de',
172 'nitter.alefvanoon.xyz',
173 'n.hyperborea.cloud',
174 'twitter.mstdn.social',
175 'nitter.silkky.cloud',
177 'fuckthesacklers.network',
179 'nitter.datatunnel.xyz',
183 'nitter.mastodon.pro',
186 'nitter.snopyta.org',
# Union of every instance tuple; this is what the URL pattern below is built from.
INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

# Non-capturing alternation of the hostnames, each regex-escaped so that the
# dots in a hostname match literally.
_INSTANCES_RE = '(?:' + '|'.join(map(re.escape, INSTANCES)) + ')'
_VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
# One instance picked at random from HTTP_INSTANCES; the test URLs that follow
# are formatted with it.
current_instance = random.choice(HTTP_INSTANCES)
197 # GIF (wrapped in mp4)
198 'url': f
'https://{current_instance}/firefox/status/1314279897502629888#m',
200 'id': '1314279897502629888',
202 'title': 'md5:7890a9277da4639ab624dd899424c5d8',
203 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
204 'thumbnail': r
're:^https?://.*\.jpg$',
205 'uploader': 'Firefox 🔥',
206 'uploader_id': 'firefox',
207 'uploader_url': f
'https://{current_instance}/firefox',
208 'upload_date': '20201008',
209 'timestamp': 1602183720,
212 'comment_count': int,
215 'url': f
'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
217 'id': '1299715685392756737',
219 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
220 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
221 'thumbnail': r
're:^https?://.*\.jpg$',
222 'uploader': 're:^Le *Doc',
223 'uploader_id': 'Le___Doc',
224 'uploader_url': f
'https://{current_instance}/Le___Doc',
225 'upload_date': '20200829',
226 'timestamp': 1598711340,
230 'comment_count': int,
232 }, { # video embed in a "Streaming Political Ads" box
233 'url': f
'https://{current_instance}/mozilla/status/1321147074491092994#m',
235 'id': '1321147074491092994',
237 'title': 'md5:8290664aabb43b9189145c008386bf12',
238 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
239 'thumbnail': r
're:^https?://.*\.jpg$',
240 'uploader': 'Mozilla',
241 'uploader_id': 'mozilla',
242 'uploader_url': f
'https://{current_instance}/mozilla',
243 'upload_date': '20201027',
244 'timestamp': 1603820940,
248 'comment_count': int,
250 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
251 }, { # not the first tweet but main-tweet
252 'url': f
'https://{current_instance}/firefox/status/1354848277481414657#m',
254 'id': '1354848277481414657',
256 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
257 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
258 'thumbnail': r
're:^https?://.*\.jpg$',
259 'uploader': 'Firefox 🔥',
260 'uploader_id': 'firefox',
261 'uploader_url': f
'https://{current_instance}/firefox',
262 'upload_date': '20210128',
263 'timestamp': 1611855960,
267 'comment_count': int,
269 }, { # no OpenGraph title
270 'url': f
'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
272 'id': '1678455464038735895',
274 'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
275 'description': 'Local man, what did Romanians ever do to you?',
276 'thumbnail': r
're:^https?://.*\.jpg$',
277 'uploader': 'Your Typical Local Man',
278 'uploader_id': 'LocalBateman',
279 'uploader_url': f
'https://{current_instance}/LocalBateman',
280 'upload_date': '20230710',
281 'timestamp': 1689009900,
285 'comment_count': int,
287 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
288 'params': {'skip_download': 'm3u8'},
292 def _real_extract(self
, url
):
293 video_id
, uploader_id
= self
._match
_valid
_url
(url
).group('id', 'uploader_id')
294 parsed_url
= urllib
.parse
.urlparse(url
)
295 base_url
= f
'{parsed_url.scheme}://{parsed_url.netloc}'
297 self
._set
_cookie
(parsed_url
.netloc
, 'hlsPlayback', 'on')
298 full_webpage
= webpage
= self
._download
_webpage
(url
, video_id
)
300 main_tweet_start
= full_webpage
.find('class="main-tweet"')
301 if main_tweet_start
> 0:
302 webpage
= full_webpage
[main_tweet_start
:]
304 video_url
= '{}{}'.format(base_url
, self
._html
_search
_regex
(
305 r
'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage
, 'video url'))
306 ext
= determine_ext(video_url
)
308 if ext
== 'unknown_video':
309 formats
= self
._extract
_m
3u8_formats
(video_url
, video_id
, ext
='mp4')
316 title
= description
= self
._og
_search
_description
(full_webpage
, default
=None) or self
._html
_search
_regex
(
317 r
'<div class="tweet-content[^>]+>([^<]+)</div>', webpage
, 'title', fatal
=False)
319 uploader_id
= self
._html
_search
_regex
(
320 r
'<a class="username"[^>]+title="@([^"]+)"', webpage
, 'uploader id', fatal
=False) or uploader_id
322 uploader
= self
._html
_search
_regex
(
323 r
'<a class="fullname"[^>]+title="([^"]+)"', webpage
, 'uploader name', fatal
=False)
325 title
= f
'{uploader} - {title}'
328 f
'{x[0]}_count': self
._html
_search
_regex
(
329 fr
'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
330 webpage
, f
'{x[0]} count', fatal
=False)
331 for x
in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
333 counts
= {field
: 0 if count
== '' else parse_count(count
) for field
, count
in counts
.items()}
336 self
._html
_search
_meta
('og:image', full_webpage
, 'thumbnail url')
337 or remove_end('{}{}'.format(base_url
, self
._html
_search
_regex
(
338 r
'<video[^>]+poster="([^"]+)"', webpage
, 'thumbnail url', fatal
=False)), '%3Asmall'))
341 {'id': id_
, 'url': f
'{thumbnail}%3A{id_}'}
342 for id_
in ('thumb', 'small', 'large', 'medium', 'orig')
345 date
= self
._html
_search
_regex
(
346 r
'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
347 webpage
, 'upload date', default
='').replace('·', '')
352 'description': description
,
353 'uploader': uploader
,
354 'timestamp': unified_timestamp(date
),
355 'uploader_id': uploader_id
,
356 'uploader_url': f
'{base_url}/{uploader_id}',
358 'thumbnails': thumbnails
,
359 'thumbnail': thumbnail
,