yt_dlp/extractor/telegram.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     clean_html,
   6     format_field,
   7     get_element_by_class,
   8     parse_duration,
   9     parse_qs,
  10     traverse_obj,
  11     unified_timestamp,
  12     update_url_query,
  13     url_basename,
  14 )
  15
  16
  17 class TelegramEmbedIE(InfoExtractor):
  18     IE_NAME = 'telegram:embed'
  19     _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)'
  20     _TESTS = [{
  21         'url': 'https://t.me/europa_press/613',
  22         'md5': 'dd707708aea958c11a590e8068825f22',
  23         'info_dict': {
  24             'id': '613',
  25             'ext': 'mp4',
  26             'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
  27             'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
  28             'channel_id': 'europa_press',
  29             'channel': 'Europa Press ✔',
  30             'thumbnail': r're:^https?://.+',
  31             'timestamp': 1635631203,
  32             'upload_date': '20211030',
  33             'duration': 61,
  34         },
  35     }, {
  36         # 2-video post
  37         'url': 'https://t.me/vorposte/29342',
  38         'info_dict': {
  39             'id': 'vorposte-29342',
  40             'title': 'Форпост 29342',
  41             'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  42         },
  43         'playlist_count': 2,
  44         'params': {
  45             'skip_download': True,
  46         },
  47     }, {
  48         # 2-video post with --no-playlist
  49         'url': 'https://t.me/vorposte/29343',
  50         'md5': '1724e96053c18e788c8464038876e245',
  51         'info_dict': {
  52             'id': '29343',
  53             'ext': 'mp4',
  54             'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  55             'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  56             'channel_id': 'vorposte',
  57             'channel': 'Форпост',
  58             'thumbnail': r're:^https?://.+',
  59             'timestamp': 1666384480,
  60             'upload_date': '20221021',
  61             'duration': 35,
  62         },
  63         'params': {
  64             'noplaylist': True,
  65         },
  66     }, {
  67         # 2-video post with 'single' query param
  68         'url': 'https://t.me/vorposte/29342?single',
  69         'md5': 'd20b202f1e41400a9f43201428add18f',
  70         'info_dict': {
  71             'id': '29342',
  72             'ext': 'mp4',
  73             'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  74             'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  75             'channel_id': 'vorposte',
  76             'channel': 'Форпост',
  77             'thumbnail': r're:^https?://.+',
  78             'timestamp': 1666384480,
  79             'upload_date': '20221021',
  80             'duration': 33,
  81         },
  82     }]
  83
  84     def _real_extract(self, url):
  85         channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id')
  86         embed = self._download_webpage(
  87             url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame')
  88
  89         def clean_text(html_class, html):
  90             text = clean_html(get_element_by_class(html_class, html))
  91             return text.replace('\n', ' ') if text else None
  92
  93         description = clean_text('tgme_widget_message_text', embed)
  94         message = {
  95             'title': description or '',
  96             'description': description,
  97             'channel': clean_text('tgme_widget_message_author', embed),
  98             'channel_id': channel_id,
  99             'timestamp': unified_timestamp(self._search_regex(
 100                 r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False)),
 101         }
 102
 103         videos = []
 104         for video in re.findall(r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed):
 105             video_url = self._search_regex(
 106                 r'<video[^>]+src="([^"]+)"', video, 'video URL', fatal=False)
 107             webpage_url = self._search_regex(
 108                 r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"',
 109                 video, 'webpage URL', fatal=False)
 110             if not video_url or not webpage_url:
 111                 continue
 112             formats = [{
 113                 'url': video_url,
 114                 'ext': 'mp4',
 115             }]
 116             videos.append({
 117                 'id': url_basename(webpage_url),
 118                 'webpage_url': update_url_query(webpage_url, {'single': True}),
 119                 'duration': parse_duration(self._search_regex(
 120                     r'<time[^>]+duration[^>]*>([\d:]+)</time>', video, 'duration', fatal=False)),
 121                 'thumbnail': self._search_regex(
 122                     r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
 123                     video, 'thumbnail', fatal=False),
 124                 'formats': formats,
 125                 **message,
 126             })
 127
 128         playlist_id = None
 129         if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True):
 130             playlist_id = f'{channel_id}-{msg_id}'
 131
 132         if self._yes_playlist(playlist_id, msg_id):
 133             return self.playlist_result(
 134                 videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description)
 135         else:
 136             return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False)