yt_dlp/extractor/motherless.py

   1 import datetime
   2 import re
   3 import urllib.parse
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     OnDemandPagedList,
   9     remove_end,
  10     str_to_int,
  11     unified_strdate,
  12 )
  13
  14
  15 class MotherlessIE(InfoExtractor):
  16     _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/|G[VIG]?[A-F0-9]+/)?(?P<id>[A-F0-9]+)'
  17     _TESTS = [{
  18         'url': 'http://motherless.com/EE97006',
  19         'md5': 'cb5e7438f7a3c4e886b7bccc1292a3bc',
  20         'info_dict': {
  21             'id': 'EE97006',
  22             'ext': 'mp4',
  23             'title': 'Dogging blond Brit getting glazed (comp)',
  24             'categories': ['UK', 'slag', 'whore', 'dogging', 'cunt', 'cumhound', 'big tits', 'Pearl Necklace'],
  25             'upload_date': '20230519',
  26             'uploader_id': 'deathbird',
  27             'thumbnail': r're:https?://.*\.jpg',
  28             'age_limit': 18,
  29             'comment_count': int,
  30             'view_count': int,
  31             'like_count': int,
  32         },
  33         'params': {
  34             # Incomplete cert chains
  35             'nocheckcertificate': True,
  36         },
  37     }, {
  38         'url': 'http://motherless.com/532291B',
  39         'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
  40         'info_dict': {
  41             'id': '532291B',
  42             'ext': 'mp4',
  43             'title': 'Amazing girl playing the omegle game, PERFECT!',
  44             'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
  45                            'game', 'hairy'],
  46             'upload_date': '20140622',
  47             'uploader_id': 'Sulivana7x',
  48             'thumbnail': r're:https?://.*\.jpg',
  49             'age_limit': 18,
  50         },
  51         'skip': '404',
  52     }, {
  53         'url': 'http://motherless.com/g/cosplay/633979F',
  54         'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
  55         'info_dict': {
  56             'id': '633979F',
  57             'ext': 'mp4',
  58             'title': 'Turtlette',
  59             'categories': ['superheroine heroine superher'],
  60             'upload_date': '20140827',
  61             'uploader_id': 'shade0230',
  62             'thumbnail': r're:https?://.*\.jpg',
  63             'age_limit': 18,
  64             'like_count': int,
  65             'comment_count': int,
  66             'view_count': int,
  67         },
  68         'params': {
  69             'nocheckcertificate': True,
  70         },
  71     }, {
  72         'url': 'http://motherless.com/8B4BBC1',
  73         'info_dict': {
  74             'id': '8B4BBC1',
  75             'ext': 'mp4',
  76             'title': 'VIDEO00441.mp4',
  77             'categories': [],
  78             'upload_date': '20160214',
  79             'uploader_id': 'NMWildGirl',
  80             'thumbnail': r're:https?://.*\.jpg',
  81             'age_limit': 18,
  82             'like_count': int,
  83             'comment_count': int,
  84             'view_count': int,
  85         },
  86         'params': {
  87             'nocheckcertificate': True,
  88         },
  89     }, {
  90         # see https://motherless.com/videos/recent for recent videos with
  91         # uploaded date in "ago" format
  92         'url': 'https://motherless.com/3C3E2CF',
  93         'info_dict': {
  94             'id': '3C3E2CF',
  95             'ext': 'mp4',
  96             'title': 'a/ Hot Teens',
  97             'categories': list,
  98             'upload_date': '20210104',
  99             'uploader_id': 'anonymous',
 100             'thumbnail': r're:https?://.*\.jpg',
 101             'age_limit': 18,
 102             'like_count': int,
 103             'comment_count': int,
 104             'view_count': int,
 105         },
 106         'params': {
 107             'nocheckcertificate': True,
 108         },
 109     }]
 110
 111     def _real_extract(self, url):
 112         video_id = self._match_id(url)
 113         webpage = self._download_webpage(url, video_id)
 114
 115         if any(p in webpage for p in (
 116                 '<title>404 - MOTHERLESS.COM<',
 117                 ">The page you're looking for cannot be found.<")):
 118             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 119
 120         if '>The content you are trying to view is for friends only.' in webpage:
 121             raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
 122
 123         title = self._html_search_regex(
 124             (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
 125              r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
 126         video_url = (self._html_search_regex(
 127             (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
 128              r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
 129             webpage, 'video URL', default=None, group='url')
 130             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
 131         age_limit = self._rta_search(webpage)
 132         view_count = str_to_int(self._html_search_regex(
 133             (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
 134             webpage, 'view count', fatal=False))
 135         like_count = str_to_int(self._html_search_regex(
 136             (r'>([\d,.]+)\s+Favorites<',
 137              r'<strong>Favorited</strong>\s+([^<]+)<'),
 138             webpage, 'like count', fatal=False))
 139
 140         upload_date = unified_strdate(self._search_regex(
 141             r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
 142             'upload date', default=None))
 143         if not upload_date:
 144             uploaded_ago = self._search_regex(
 145                 r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
 146                 default=None)
 147             if uploaded_ago:
 148                 delta = int(uploaded_ago[:-1])
 149                 _AGO_UNITS = {
 150                     'h': 'hours',
 151                     'd': 'days',
 152                 }
 153                 kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
 154                 upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
 155
 156         comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
 157         uploader_id = self._html_search_regex(
 158             (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
 159              r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
 160             webpage, 'uploader_id', fatal=False)
 161         categories = self._html_search_meta('keywords', webpage, default='')
 162         categories = [cat.strip() for cat in categories.split(',') if cat.strip()]
 163
 164         return {
 165             'id': video_id,
 166             'title': title,
 167             'upload_date': upload_date,
 168             'uploader_id': uploader_id,
 169             'thumbnail': self._og_search_thumbnail(webpage),
 170             'categories': categories,
 171             'view_count': view_count,
 172             'like_count': like_count,
 173             'comment_count': comment_count,
 174             'age_limit': age_limit,
 175             'url': video_url,
 176         }
 177
 178
 179 class MotherlessPaginatedIE(InfoExtractor):
 180     _EXTRA_QUERY = {}
 181     _PAGE_SIZE = 60
 182
 183     def _correct_path(self, url, item_id):
 184         raise NotImplementedError('This method must be implemented by subclasses')
 185
 186     def _extract_entries(self, webpage, base):
 187         for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)',
 188                                 webpage):
 189             video_url = urllib.parse.urljoin(base, mobj.group('href'))
 190             video_id = MotherlessIE.get_temp_id(video_url)
 191
 192             if video_id:
 193                 yield self.url_result(video_url, MotherlessIE, video_id, mobj.group('title'))
 194
 195     def _real_extract(self, url):
 196         item_id = self._match_id(url)
 197         real_url = self._correct_path(url, item_id)
 198         webpage = self._download_webpage(real_url, item_id, 'Downloading page 1')
 199
 200         def get_page(idx):
 201             page = idx + 1
 202             current_page = webpage if not idx else self._download_webpage(
 203                 real_url, item_id, note=f'Downloading page {page}', query={'page': page, **self._EXTRA_QUERY})
 204             yield from self._extract_entries(current_page, real_url)
 205
 206         return self.playlist_result(
 207             OnDemandPagedList(get_page, self._PAGE_SIZE), item_id,
 208             remove_end(self._html_extract_title(webpage), ' | MOTHERLESS.COM ™'))
 209
 210
 211 class MotherlessGroupIE(MotherlessPaginatedIE):
 212     _VALID_URL = r'https?://(?:www\.)?motherless\.com/g[vifm]?/(?P<id>[a-z0-9_]+)/?(?:$|[#?])'
 213     _TESTS = [{
 214         'url': 'http://motherless.com/gv/movie_scenes',
 215         'info_dict': {
 216             'id': 'movie_scenes',
 217             'title': 'Movie Scenes - Videos - Hot and sexy scenes from "regular" movies... Beautiful actresses fully',
 218         },
 219         'playlist_mincount': 540,
 220     }, {
 221         'url': 'http://motherless.com/g/sex_must_be_funny',
 222         'info_dict': {
 223             'id': 'sex_must_be_funny',
 224             'title': 'Sex must be funny',
 225         },
 226         'playlist_count': 0,
 227     }, {
 228         'url': 'https://motherless.com/gv/beautiful_cock',
 229         'info_dict': {
 230             'id': 'beautiful_cock',
 231             'title': 'Beautiful Cock',
 232         },
 233         'playlist_mincount': 2040,
 234     }]
 235
 236     def _correct_path(self, url, item_id):
 237         return urllib.parse.urljoin(url, f'/gv/{item_id}')
 238
 239
 240 class MotherlessGalleryIE(MotherlessPaginatedIE):
 241     _VALID_URL = r'https?://(?:www\.)?motherless\.com/G[VIG]?(?P<id>[A-F0-9]+)/?(?:$|[#?])'
 242     _TESTS = [{
 243         'url': 'https://motherless.com/GV338999F',
 244         'info_dict': {
 245             'id': '338999F',
 246             'title': 'Random',
 247         },
 248         'playlist_mincount': 171,
 249     }, {
 250         'url': 'https://motherless.com/GVABD6213',
 251         'info_dict': {
 252             'id': 'ABD6213',
 253             'title': 'Cuties',
 254         },
 255         'playlist_mincount': 2,
 256     }, {
 257         'url': 'https://motherless.com/GVBCF7622',
 258         'info_dict': {
 259             'id': 'BCF7622',
 260             'title': 'Vintage',
 261         },
 262         'playlist_count': 0,
 263     }, {
 264         'url': 'https://motherless.com/G035DE2F',
 265         'info_dict': {
 266             'id': '035DE2F',
 267             'title': 'General',
 268         },
 269         'playlist_mincount': 420,
 270     }]
 271
 272     def _correct_path(self, url, item_id):
 273         return urllib.parse.urljoin(url, f'/GV{item_id}')
 274
 275
 276 class MotherlessUploaderIE(MotherlessPaginatedIE):
 277     _VALID_URL = r'https?://(?:www\.)?motherless\.com/u/(?P<id>\w+)/?(?:$|[?#])'
 278     _TESTS = [{
 279         'url': 'https://motherless.com/u/Mrgo4hrs2023',
 280         'info_dict': {
 281             'id': 'Mrgo4hrs2023',
 282             'title': "Mrgo4hrs2023's Uploads - Videos",
 283         },
 284         'playlist_mincount': 32,
 285     }, {
 286         'url': 'https://motherless.com/u/Happy_couple?t=v',
 287         'info_dict': {
 288             'id': 'Happy_couple',
 289             'title': "Happy_couple's Uploads - Videos",
 290         },
 291         'playlist_mincount': 8,
 292     }]
 293
 294     _EXTRA_QUERY = {'t': 'v'}
 295
 296     def _correct_path(self, url, item_id):
 297         return urllib.parse.urljoin(url, f'/u/{item_id}?t=v')