yt_dlp/extractor/thisvid.py

   1 import itertools
   2 import re
   3 import urllib.parse
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     clean_html,
   8     get_element_by_class,
   9     int_or_none,
  10     url_or_none,
  11     urljoin,
  12 )
  13
  14
  15 class ThisVidIE(InfoExtractor):
  16     _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
  17     _TESTS = [{
  18         'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
  19         'md5': '839becb572995687e11a69dc4358a386',
  20         'info_dict': {
  21             'id': '3533241',
  22             'ext': 'mp4',
  23             'title': 'Sitting on ball tight jeans',
  24             'description': 'md5:372353bb995883d1b65fddf507489acd',
  25             'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
  26             'uploader_id': '150629',
  27             'uploader': 'jeanslevisjeans',
  28             'display_id': 'sitting-on-ball-tight-jeans',
  29             'age_limit': 18,
  30         },
  31     }, {
  32         'url': 'https://thisvid.com/embed/3533241/',
  33         'md5': '839becb572995687e11a69dc4358a386',
  34         'info_dict': {
  35             'id': '3533241',
  36             'ext': 'mp4',
  37             'title': 'Sitting on ball tight jeans',
  38             'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
  39             'uploader_id': '150629',
  40             'uploader': 'jeanslevisjeans',
  41             'display_id': 'sitting-on-ball-tight-jeans',
  42             'age_limit': 18,
  43         },
  44     }]
  45
  46     def _real_extract(self, url):
  47         main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
  48         webpage = self._download_webpage(url, main_id)
  49
  50         title = self._html_search_regex(
  51             r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
  52             webpage, 'title')
  53
  54         if type_ == 'embed':
  55             # look for more metadata
  56             video_alt_url = url_or_none(self._search_regex(
  57                 rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''',
  58                 webpage, 'video_alt_url', default=None))
  59             if video_alt_url and video_alt_url != url:
  60                 webpage = self._download_webpage(
  61                     video_alt_url, main_id,
  62                     note='Redirecting embed to main page', fatal=False) or webpage
  63
  64         video_holder = get_element_by_class('video-holder', webpage) or ''
  65         if '>This video is a private video' in video_holder:
  66             self.raise_login_required(
  67                 (clean_html(video_holder) or 'Private video').partition('\n')[0])
  68
  69         uploader = self._html_search_regex(
  70             r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
  71             webpage, 'uploader', default='')
  72         uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
  73         if len(uploader) == 2:
  74             # id must be non-empty, uploader could be ''
  75             uploader_id, uploader = uploader
  76             uploader = uploader or None
  77         else:
  78             uploader_id = uploader = None
  79
  80         return self.url_result(
  81             url, ie='Generic', url_transparent=True,
  82             title=title,
  83             age_limit=18,
  84             uploader=uploader,
  85             uploader_id=uploader_id)
  86
  87
  88 class ThisVidPlaylistBaseIE(InfoExtractor):
  89     _PLAYLIST_URL_RE = None
  90
  91     @classmethod
  92     def _find_urls(cls, html):
  93         for m in re.finditer(rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>''', html):
  94             yield m.group('url')
  95
  96     def _generate_playlist_entries(self, url, playlist_id, html=None):
  97         page_url = url
  98         for page in itertools.count(1):
  99             if not html:
 100                 html = self._download_webpage(
 101                     page_url, playlist_id, note=f'Downloading page {page}',
 102                     fatal=False) or ''
 103
 104             yield from self._find_urls(html)
 105
 106             next_page = get_element_by_class('pagination-next', html) or ''
 107             if next_page:
 108                 # member list page
 109                 next_page = urljoin(url, self._search_regex(
 110                     r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
 111                     next_page, 'next page link', group='url', default=None))
 112
 113             # in case a member page should have pagination-next with empty link, not just `else:`
 114             if next_page is None:
 115                 # playlist page
 116                 parsed_url = urllib.parse.urlparse(page_url)
 117                 base_path, _, num = parsed_url.path.rpartition('/')
 118                 num = int_or_none(num)
 119                 if num is None:
 120                     base_path, num = parsed_url.path.rstrip('/'), 1
 121                 parsed_url = parsed_url._replace(path=f'{base_path}/{num + 1}')
 122                 next_page = urllib.parse.urlunparse(parsed_url)
 123                 if page_url == next_page:
 124                     next_page = None
 125
 126             if not next_page:
 127                 return
 128             page_url, html = next_page, None
 129
 130     def _make_playlist_result(self, url):
 131         playlist_id = self._match_id(url)
 132         webpage = self._download_webpage(url, playlist_id)
 133
 134         title = re.split(
 135             r'(?i)\s*\|\s*ThisVid\.com\s*$',
 136             self._og_search_title(webpage, default=None)
 137             or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', maxsplit=1)[0] or None
 138
 139         return self.playlist_from_matches(
 140             self._generate_playlist_entries(url, playlist_id, webpage),
 141             playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE)
 142
 143
 144 class ThisVidMemberIE(ThisVidPlaylistBaseIE):
 145     _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
 146     _TESTS = [{
 147         'url': 'https://thisvid.com/members/2140501/',
 148         'info_dict': {
 149             'id': '2140501',
 150             'title': 'Rafflesia\'s Profile',
 151         },
 152         'playlist_mincount': 16,
 153     }, {
 154         'url': 'https://thisvid.com/members/2140501/favourite_videos/',
 155         'info_dict': {
 156             'id': '2140501',
 157             'title': 'Rafflesia\'s Favourite Videos',
 158         },
 159         'playlist_mincount': 15,
 160     }, {
 161         'url': 'https://thisvid.com/members/636468/public_videos/',
 162         'info_dict': {
 163             'id': '636468',
 164             'title': 'Happymouth\'s Public Videos',
 165         },
 166         'playlist_mincount': 196,
 167     }]
 168     _PLAYLIST_URL_RE = ThisVidIE._VALID_URL
 169
 170     def _real_extract(self, url):
 171         return self._make_playlist_result(url)
 172
 173
 174 class ThisVidPlaylistIE(ThisVidPlaylistBaseIE):
 175     _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
 176     _TESTS = [{
 177         'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
 178         'info_dict': {
 179             'id': '6615',
 180             'title': 'Underwear Stuff',
 181         },
 182         'playlist_mincount': 200,
 183     }, {
 184         'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
 185         'info_dict': {
 186             'id': '1072387',
 187             'ext': 'mp4',
 188             'title': 'Big Italian Booty 28',
 189             'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
 190             'uploader_id': '367912',
 191             'uploader': 'Jcmusclefun',
 192             'age_limit': 18,
 193             'display_id': 'big-italian-booty-28',
 194             'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg',
 195         },
 196         'params': {
 197             'noplaylist': True,
 198         },
 199     }]
 200     _PLAYLIST_URL_RE = _VALID_URL
 201
 202     def _generate_playlist_entries(self, url, playlist_id, html=None):
 203         for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html):
 204             video_id = re.match(self._VALID_URL, wrapped_url).group('video_id')
 205             yield urljoin(url, f'/videos/{video_id}/')
 206
 207     def _real_extract(self, url):
 208         playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id')
 209
 210         if not self._yes_playlist(playlist_id, video_id):
 211             redirect_url = urljoin(url, f'/videos/{video_id}/')
 212             return self.url_result(redirect_url, ThisVidIE)
 213
 214         result = self._make_playlist_result(url)
 215
 216         # Fix duplicated title (`the title - the title` => `the title`)
 217         title = result['title']
 218         t_len = len(title)
 219         if t_len > 5 and t_len % 2 != 0:
 220             t_len = t_len // 2
 221             if title[t_len] == '-':
 222                 first, second = map(str.strip, (title[:t_len], title[t_len + 1:]))
 223                 if first and first == second:
 224                     result['title'] = first
 225
 226         return result