yt_dlp/extractor/pornhub.py

   1 import functools
   2 import itertools
   3 import math
   4 import operator
   5 import re
   6
   7 from .common import InfoExtractor
   8 from .openload import PhantomJSwrapper
   9 from ..networking import Request
  10 from ..networking.exceptions import HTTPError
  11 from ..utils import (
  12     NO_DEFAULT,
  13     ExtractorError,
  14     clean_html,
  15     determine_ext,
  16     format_field,
  17     int_or_none,
  18     merge_dicts,
  19     orderedSet,
  20     remove_quotes,
  21     remove_start,
  22     str_to_int,
  23     update_url_query,
  24     url_or_none,
  25     urlencode_postdata,
  26 )
  27
  28
  29 class PornHubBaseIE(InfoExtractor):
  30     _NETRC_MACHINE = 'pornhub'
  31     _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
  32
  33     def _download_webpage_handle(self, *args, **kwargs):
  34         def dl(*args, **kwargs):
  35             return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
  36
  37         ret = dl(*args, **kwargs)
  38
  39         if not ret:
  40             return ret
  41
  42         webpage, urlh = ret
  43
  44         if any(re.search(p, webpage) for p in (
  45                 r'<body\b[^>]+\bonload=["\']go\(\)',
  46                 r'document\.cookie\s*=\s*["\']RNKEY=',
  47                 r'document\.location\.reload\(true\)')):
  48             url_or_request = args[0]
  49             url = (url_or_request.url
  50                    if isinstance(url_or_request, Request)
  51                    else url_or_request)
  52             phantom = PhantomJSwrapper(self, required_version='2.0')
  53             phantom.get(url, html=webpage)
  54             webpage, urlh = dl(*args, **kwargs)
  55
  56         return webpage, urlh
  57
  58     def _real_initialize(self):
  59         self._logged_in = False
  60
  61     def _set_age_cookies(self, host):
  62         self._set_cookie(host, 'age_verified', '1')
  63         self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
  64         self._set_cookie(host, 'accessAgeDisclaimerUK', '1')
  65         self._set_cookie(host, 'accessPH', '1')
  66
  67     def _login(self, host):
  68         if self._logged_in:
  69             return
  70
  71         site = host.split('.')[0]
  72
  73         # Both sites pornhub and pornhubpremium have separate accounts
  74         # so there should be an option to provide credentials for both.
  75         # At the same time some videos are available under the same video id
  76         # on both sites so that we have to identify them as the same video.
  77         # For that purpose we have to keep both in the same extractor
  78         # but under different netrc machines.
  79         username, password = self._get_login_info(netrc_machine=site)
  80         if username is None:
  81             return
  82
  83         login_url = 'https://www.{}/{}login'.format(host, 'premium/' if 'premium' in host else '')
  84         login_page = self._download_webpage(
  85             login_url, None, f'Downloading {site} login page')
  86
  87         def is_logged(webpage):
  88             return any(re.search(p, webpage) for p in (
  89                 r'id="profileMenuDropdown"',
  90                 r'class="ph-icon-logout"'))
  91
  92         if is_logged(login_page):
  93             self._logged_in = True
  94             return
  95
  96         login_form = self._hidden_inputs(login_page)
  97
  98         login_form.update({
  99             'email': username,
 100             'password': password,
 101         })
 102
 103         response = self._download_json(
 104             f'https://www.{host}/front/authenticate', None,
 105             f'Logging in to {site}',
 106             data=urlencode_postdata(login_form),
 107             headers={
 108                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
 109                 'Referer': login_url,
 110                 'X-Requested-With': 'XMLHttpRequest',
 111             })
 112
 113         if response.get('success') == '1':
 114             self._logged_in = True
 115             return
 116
 117         message = response.get('message')
 118         if message is not None:
 119             raise ExtractorError(
 120                 f'Unable to login: {message}', expected=True)
 121
 122         raise ExtractorError('Unable to log in')
 123
 124
 125 class PornHubIE(PornHubBaseIE):
 126     IE_DESC = 'PornHub and Thumbzilla'
 127     _VALID_URL = rf'''(?x)
 128                     https?://
 129                         (?:
 130                             (?:[^/]+\.)?
 131                             {PornHubBaseIE._PORNHUB_HOST_RE}
 132                             /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
 133                             (?:www\.)?thumbzilla\.com/video/
 134                         )
 135                         (?P<id>[\da-z]+)
 136                     '''
 137     _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
 138     _TESTS = [{
 139         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
 140         'md5': 'a6391306d050e4547f62b3f485dd9ba9',
 141         'info_dict': {
 142             'id': '648719015',
 143             'ext': 'mp4',
 144             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
 145             'uploader': 'Babes',
 146             'upload_date': '20130628',
 147             'timestamp': 1372447216,
 148             'duration': 361,
 149             'view_count': int,
 150             'like_count': int,
 151             'dislike_count': int,
 152             'comment_count': int,
 153             'age_limit': 18,
 154             'tags': list,
 155             'categories': list,
 156             'cast': list,
 157         },
 158     }, {
 159         # non-ASCII title
 160         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
 161         'info_dict': {
 162             'id': '1331683002',
 163             'ext': 'mp4',
 164             'title': '重庆婷婷女王足交',
 165             'upload_date': '20150213',
 166             'timestamp': 1423804862,
 167             'duration': 1753,
 168             'view_count': int,
 169             'like_count': int,
 170             'dislike_count': int,
 171             'comment_count': int,
 172             'age_limit': 18,
 173             'tags': list,
 174             'categories': list,
 175         },
 176         'params': {
 177             'skip_download': True,
 178         },
 179         'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
 180     }, {
 181         # subtitles
 182         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
 183         'info_dict': {
 184             'id': 'ph5af5fef7c2aa7',
 185             'ext': 'mp4',
 186             'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
 187             'uploader': 'BFFs',
 188             'duration': 622,
 189             'view_count': int,
 190             'like_count': int,
 191             'dislike_count': int,
 192             'comment_count': int,
 193             'age_limit': 18,
 194             'tags': list,
 195             'categories': list,
 196             'subtitles': {
 197                 'en': [{
 198                     'ext': 'srt',
 199                 }],
 200             },
 201         },
 202         'params': {
 203             'skip_download': True,
 204         },
 205         'skip': 'This video has been disabled',
 206     }, {
 207         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
 208         'info_dict': {
 209             'id': 'ph601dc30bae19a',
 210             'uploader': 'Projekt Melody',
 211             'uploader_id': 'projekt-melody',
 212             'upload_date': '20210205',
 213             'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
 214             'thumbnail': r're:https?://.+',
 215         },
 216     }, {
 217         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
 218         'only_matching': True,
 219     }, {
 220         # removed at the request of cam4.com
 221         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
 222         'only_matching': True,
 223     }, {
 224         # removed at the request of the copyright owner
 225         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
 226         'only_matching': True,
 227     }, {
 228         # removed by uploader
 229         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
 230         'only_matching': True,
 231     }, {
 232         # private video
 233         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
 234         'only_matching': True,
 235     }, {
 236         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
 237         'only_matching': True,
 238     }, {
 239         'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
 240         'only_matching': True,
 241     }, {
 242         'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
 243         'only_matching': True,
 244     }, {
 245         'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
 246         'only_matching': True,
 247     }, {
 248         'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
 249         'only_matching': True,
 250     }, {
 251         # Some videos are available with the same id on both premium
 252         # and non-premium sites (e.g. this and the following test)
 253         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
 254         'only_matching': True,
 255     }, {
 256         'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
 257         'only_matching': True,
 258     }, {
 259         # geo restricted
 260         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
 261         'only_matching': True,
 262     }, {
 263         'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
 264         'only_matching': True,
 265     }]
 266
 267     def _extract_count(self, pattern, webpage, name):
 268         return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None))
 269
 270     def _real_extract(self, url):
 271         mobj = self._match_valid_url(url)
 272         host = mobj.group('host') or 'pornhub.com'
 273         video_id = mobj.group('id')
 274
 275         self._login(host)
 276         self._set_age_cookies(host)
 277
 278         def dl_webpage(platform):
 279             self._set_cookie(host, 'platform', platform)
 280             return self._download_webpage(
 281                 f'https://www.{host}/view_video.php?viewkey={video_id}',
 282                 video_id, f'Downloading {platform} webpage')
 283
 284         webpage = dl_webpage('pc')
 285
 286         error_msg = self._html_search_regex(
 287             (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
 288              r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
 289             webpage, 'error message', default=None, group='error')
 290         if error_msg:
 291             error_msg = re.sub(r'\s+', ' ', error_msg)
 292             raise ExtractorError(
 293                 f'PornHub said: {error_msg}',
 294                 expected=True, video_id=video_id)
 295
 296         if any(re.search(p, webpage) for p in (
 297                 r'class=["\']geoBlocked["\']',
 298                 r'>\s*This content is unavailable in your country')):
 299             self.raise_geo_restricted()
 300
 301         # video_title from flashvars contains whitespace instead of non-ASCII (see
 302         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
 303         # on that anymore.
 304         title = self._html_search_meta(
 305             'twitter:title', webpage, default=None) or self._html_search_regex(
 306             (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
 307              r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
 308              r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
 309             webpage, 'title', group='title')
 310
 311         video_urls = []
 312         video_urls_set = set()
 313         subtitles = {}
 314
 315         flashvars = self._parse_json(
 316             self._search_regex(
 317                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
 318             video_id)
 319         if flashvars:
 320             subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
 321             if subtitle_url:
 322                 subtitles.setdefault('en', []).append({
 323                     'url': subtitle_url,
 324                     'ext': 'srt',
 325                 })
 326             thumbnail = flashvars.get('image_url')
 327             duration = int_or_none(flashvars.get('video_duration'))
 328             media_definitions = flashvars.get('mediaDefinitions')
 329             if isinstance(media_definitions, list):
 330                 for definition in media_definitions:
 331                     if not isinstance(definition, dict):
 332                         continue
 333                     video_url = definition.get('videoUrl')
 334                     if not video_url or not isinstance(video_url, str):
 335                         continue
 336                     if video_url in video_urls_set:
 337                         continue
 338                     video_urls_set.add(video_url)
 339                     video_urls.append(
 340                         (video_url, int_or_none(definition.get('quality'))))
 341         else:
 342             thumbnail, duration = [None] * 2
 343
 344         def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
 345             assignments = self._search_regex(
 346                 pattern, webpage, 'encoded url', default=default)
 347             if not assignments:
 348                 return {}
 349
 350             assignments = assignments.split(';')
 351
 352             js_vars = {}
 353
 354             def parse_js_value(inp):
 355                 inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
 356                 if '+' in inp:
 357                     inps = inp.split('+')
 358                     return functools.reduce(
 359                         operator.concat, map(parse_js_value, inps))
 360                 inp = inp.strip()
 361                 if inp in js_vars:
 362                     return js_vars[inp]
 363                 return remove_quotes(inp)
 364
 365             for assn in assignments:
 366                 assn = assn.strip()
 367                 if not assn:
 368                     continue
 369                 assn = re.sub(r'var\s+', '', assn)
 370                 vname, value = assn.split('=', 1)
 371                 js_vars[vname] = parse_js_value(value)
 372             return js_vars
 373
 374         def add_video_url(video_url):
 375             v_url = url_or_none(video_url)
 376             if not v_url:
 377                 return
 378             if v_url in video_urls_set:
 379                 return
 380             video_urls.append((v_url, None))
 381             video_urls_set.add(v_url)
 382
 383         def parse_quality_items(quality_items):
 384             q_items = self._parse_json(quality_items, video_id, fatal=False)
 385             if not isinstance(q_items, list):
 386                 return
 387             for item in q_items:
 388                 if isinstance(item, dict):
 389                     add_video_url(item.get('url'))
 390
 391         if not video_urls:
 392             FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
 393             js_vars = extract_js_vars(
 394                 webpage, r'(var\s+(?:{})_.+)'.format('|'.join(FORMAT_PREFIXES)),
 395                 default=None)
 396             if js_vars:
 397                 for key, format_url in js_vars.items():
 398                     if key.startswith(FORMAT_PREFIXES[-1]):
 399                         parse_quality_items(format_url)
 400                     elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
 401                         add_video_url(format_url)
 402             if not video_urls and re.search(
 403                     r'<[^>]+\bid=["\']lockedPlayer', webpage):
 404                 raise ExtractorError(
 405                     f'Video {video_id} is locked', expected=True)
 406
 407         if not video_urls:
 408             js_vars = extract_js_vars(
 409                 dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
 410             add_video_url(js_vars['mediastring'])
 411
 412         for mobj in re.finditer(
 413                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
 414                 webpage):
 415             video_url = mobj.group('url')
 416             if video_url not in video_urls_set:
 417                 video_urls.append((video_url, None))
 418                 video_urls_set.add(video_url)
 419
 420         upload_date = None
 421         formats = []
 422
 423         def add_format(format_url, height=None):
 424             ext = determine_ext(format_url)
 425             if ext == 'mpd':
 426                 formats.extend(self._extract_mpd_formats(
 427                     format_url, video_id, mpd_id='dash', fatal=False))
 428                 return
 429             if ext == 'm3u8':
 430                 formats.extend(self._extract_m3u8_formats(
 431                     format_url, video_id, 'mp4', entry_protocol='m3u8_native',
 432                     m3u8_id='hls', fatal=False))
 433                 return
 434             if not height:
 435                 height = int_or_none(self._search_regex(
 436                     r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
 437                     default=None))
 438             formats.append({
 439                 'url': format_url,
 440                 'format_id': format_field(height, None, '%dp'),
 441                 'height': height,
 442             })
 443
 444         for video_url, height in video_urls:
 445             if not upload_date:
 446                 upload_date = self._search_regex(
 447                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
 448                 if upload_date:
 449                     upload_date = upload_date.replace('/', '')
 450             if '/video/get_media' in video_url:
 451                 medias = self._download_json(video_url, video_id, fatal=False)
 452                 if isinstance(medias, list):
 453                     for media in medias:
 454                         if not isinstance(media, dict):
 455                             continue
 456                         video_url = url_or_none(media.get('videoUrl'))
 457                         if not video_url:
 458                             continue
 459                         height = int_or_none(media.get('quality'))
 460                         add_format(video_url, height)
 461                 continue
 462             add_format(video_url)
 463
 464         model_profile = self._search_json(
 465             r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False)
 466         video_uploader = self._html_search_regex(
 467             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
 468             webpage, 'uploader', default=None) or model_profile.get('username')
 469
 470         def extract_vote_count(kind, name):
 471             return self._extract_count(
 472                 (rf'<span[^>]+\bclass="votes{kind}"[^>]*>([\d,\.]+)</span>',
 473                  rf'<span[^>]+\bclass=["\']votes{kind}["\'][^>]*\bdata-rating=["\'](\d+)'),
 474                 webpage, name)
 475
 476         view_count = self._extract_count(
 477             r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view')
 478         like_count = extract_vote_count('Up', 'like')
 479         dislike_count = extract_vote_count('Down', 'dislike')
 480         comment_count = self._extract_count(
 481             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 482
 483         def extract_list(meta_key):
 484             div = self._search_regex(
 485                 rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
 486                 webpage, meta_key, default=None)
 487             if div:
 488                 return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
 489
 490         info = self._search_json_ld(webpage, video_id, default={})
 491         # description provided in JSON-LD is irrelevant
 492         info['description'] = None
 493
 494         return merge_dicts({
 495             'id': video_id,
 496             'uploader': video_uploader,
 497             'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'),
 498             'upload_date': upload_date,
 499             'title': title,
 500             'thumbnail': thumbnail,
 501             'duration': duration,
 502             'view_count': view_count,
 503             'like_count': like_count,
 504             'dislike_count': dislike_count,
 505             'comment_count': comment_count,
 506             'formats': formats,
 507             'age_limit': 18,
 508             'tags': extract_list('tags'),
 509             'categories': extract_list('categories'),
 510             'cast': extract_list('pornstars'),
 511             'subtitles': subtitles,
 512         }, info)
 513
 514
 515 class PornHubPlaylistBaseIE(PornHubBaseIE):
 516     def _extract_page(self, url):
 517         return int_or_none(self._search_regex(
 518             r'\bpage=(\d+)', url, 'page', default=None))
 519
 520     def _extract_entries(self, webpage, host):
 521         # Only process container div with main playlist content skipping
 522         # drop-down menu that uses similar pattern for videos (see
 523         # https://github.com/ytdl-org/youtube-dl/issues/11594).
 524         container = self._search_regex(
 525             r'(?s)(<div[^>]+class=["\']container.+)', webpage,
 526             'container', default=webpage)
 527
 528         return [
 529             self.url_result(
 530                 f'http://www.{host}/{video_url}',
 531                 PornHubIE.ie_key(), video_title=title)
 532             for video_url, title in orderedSet(re.findall(
 533                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 534                 container))
 535         ]
 536
 537
 538 class PornHubUserIE(PornHubPlaylistBaseIE):
 539     _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
 540     _TESTS = [{
 541         'url': 'https://www.pornhub.com/model/zoe_ph',
 542         'playlist_mincount': 118,
 543     }, {
 544         'url': 'https://www.pornhub.com/pornstar/liz-vicious',
 545         'info_dict': {
 546             'id': 'liz-vicious',
 547         },
 548         'playlist_mincount': 118,
 549     }, {
 550         'url': 'https://www.pornhub.com/users/russianveet69',
 551         'only_matching': True,
 552     }, {
 553         'url': 'https://www.pornhub.com/channels/povd',
 554         'only_matching': True,
 555     }, {
 556         'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
 557         'only_matching': True,
 558     }, {
 559         # Unavailable via /videos page, but available with direct pagination
 560         # on pornstar page (see [1]), requires premium
 561         # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
 562         'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
 563         'only_matching': True,
 564     }, {
 565         # Same as before, multi page
 566         'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
 567         'only_matching': True,
 568     }, {
 569         'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
 570         'only_matching': True,
 571     }]
 572
 573     def _real_extract(self, url):
 574         mobj = self._match_valid_url(url)
 575         user_id = mobj.group('id')
 576         videos_url = '{}/videos'.format(mobj.group('url'))
 577         self._set_age_cookies(mobj.group('host'))
 578         page = self._extract_page(url)
 579         if page:
 580             videos_url = update_url_query(videos_url, {'page': page})
 581         return self.url_result(
 582             videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
 583
 584
 585 class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
 586     @staticmethod
 587     def _has_more(webpage):
 588         return re.search(
 589             r'''(?x)
 590                 <li[^>]+\bclass=["\']page_next|
 591                 <link[^>]+\brel=["\']next|
 592                 <button[^>]+\bid=["\']moreDataBtn
 593             ''', webpage) is not None
 594
 595     def _entries(self, url, host, item_id):
 596         page = self._extract_page(url)
 597
 598         VIDEOS = '/videos'
 599
 600         def download_page(base_url, num, fallback=False):
 601             note = 'Downloading page {}{}'.format(num, ' (switch to fallback)' if fallback else '')
 602             return self._download_webpage(
 603                 base_url, item_id, note, query={'page': num})
 604
 605         def is_404(e):
 606             return isinstance(e.cause, HTTPError) and e.cause.status == 404
 607
 608         base_url = url
 609         has_page = page is not None
 610         first_page = page if has_page else 1
 611         for page_num in (first_page, ) if has_page else itertools.count(first_page):
 612             try:
 613                 try:
 614                     webpage = download_page(base_url, page_num)
 615                 except ExtractorError as e:
 616                     # Some sources may not be available via /videos page,
 617                     # trying to fallback to main page pagination (see [1])
 618                     # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
 619                     if is_404(e) and page_num == first_page and VIDEOS in base_url:
 620                         base_url = base_url.replace(VIDEOS, '')
 621                         webpage = download_page(base_url, page_num, fallback=True)
 622                     else:
 623                         raise
 624             except ExtractorError as e:
 625                 if is_404(e) and page_num != first_page:
 626                     break
 627                 raise
 628             page_entries = self._extract_entries(webpage, host)
 629             if not page_entries:
 630                 break
 631             yield from page_entries
 632             if not self._has_more(webpage):
 633                 break
 634
 635     def _real_extract(self, url):
 636         mobj = self._match_valid_url(url)
 637         host = mobj.group('host')
 638         item_id = mobj.group('id')
 639
 640         self._login(host)
 641         self._set_age_cookies(host)
 642
 643         return self.playlist_result(self._entries(url, host, item_id), item_id)
 644
 645
 646 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
 647     _VALID_URL = rf'https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)'
 648     _TESTS = [{
 649         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
 650         'only_matching': True,
 651     }, {
 652         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 653         'only_matching': True,
 654     }, {
 655         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
 656         'info_dict': {
 657             'id': 'pornstar/jenny-blighe/videos',
 658         },
 659         'playlist_mincount': 149,
 660     }, {
 661         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
 662         'info_dict': {
 663             'id': 'pornstar/jenny-blighe/videos',
 664         },
 665         'playlist_mincount': 40,
 666     }, {
 667         # default sorting as Top Rated Videos
 668         'url': 'https://www.pornhub.com/channels/povd/videos',
 669         'info_dict': {
 670             'id': 'channels/povd/videos',
 671         },
 672         'playlist_mincount': 293,
 673     }, {
 674         # Top Rated Videos
 675         'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
 676         'only_matching': True,
 677     }, {
 678         # Most Recent Videos
 679         'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
 680         'only_matching': True,
 681     }, {
 682         # Most Viewed Videos
 683         'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
 684         'only_matching': True,
 685     }, {
 686         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 687         'only_matching': True,
 688     }, {
 689         # Most Viewed Videos
 690         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
 691         'only_matching': True,
 692     }, {
 693         # Top Rated Videos
 694         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
 695         'only_matching': True,
 696     }, {
 697         # Longest Videos
 698         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
 699         'only_matching': True,
 700     }, {
 701         # Newest Videos
 702         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
 703         'only_matching': True,
 704     }, {
 705         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
 706         'only_matching': True,
 707     }, {
 708         'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
 709         'only_matching': True,
 710     }, {
 711         'url': 'https://www.pornhub.com/video',
 712         'only_matching': True,
 713     }, {
 714         'url': 'https://www.pornhub.com/video?page=3',
 715         'only_matching': True,
 716     }, {
 717         'url': 'https://www.pornhub.com/video/search?search=123',
 718         'only_matching': True,
 719     }, {
 720         'url': 'https://www.pornhub.com/categories/teen',
 721         'only_matching': True,
 722     }, {
 723         'url': 'https://www.pornhub.com/categories/teen?page=3',
 724         'only_matching': True,
 725     }, {
 726         'url': 'https://www.pornhub.com/hd',
 727         'only_matching': True,
 728     }, {
 729         'url': 'https://www.pornhub.com/hd?page=3',
 730         'only_matching': True,
 731     }, {
 732         'url': 'https://www.pornhub.com/described-video',
 733         'only_matching': True,
 734     }, {
 735         'url': 'https://www.pornhub.com/described-video?page=2',
 736         'only_matching': True,
 737     }, {
 738         'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
 739         'only_matching': True,
 740     }, {
 741         'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
 742         'only_matching': True,
 743     }]
 744
 745     @classmethod
 746     def suitable(cls, url):
 747         return (False
 748                 if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
 749                 else super().suitable(url))
 750
 751
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
    # Extractor for the `/videos/upload` listing of a user, channel, model or
    # pornstar page. It defines no methods of its own: extraction and
    # pagination come from PornHubPagedPlaylistBaseIE, with the `id` group
    # below (the path component after the page type) used as the playlist id.
    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
        'info_dict': {
            'id': 'jenny-blighe',
        },
        'playlist_mincount': 129,
    }, {
        'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
        'only_matching': True,
    }, {
        # onion host
        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
        'only_matching': True,
    }]
 767
 768
 769 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
 770     _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/playlist/(?P<id>[^/?#&]+))'
 771     _TESTS = [{
 772         'url': 'https://www.pornhub.com/playlist/44121572',
 773         'info_dict': {
 774             'id': '44121572',
 775         },
 776         'playlist_count': 77,
 777     }, {
 778         'url': 'https://www.pornhub.com/playlist/4667351',
 779         'only_matching': True,
 780     }, {
 781         'url': 'https://de.pornhub.com/playlist/4667351',
 782         'only_matching': True,
 783     }, {
 784         'url': 'https://de.pornhub.com/playlist/4667351?page=2',
 785         'only_matching': True,
 786     }]
 787
 788     def _entries(self, url, host, item_id):
 789         webpage = self._download_webpage(url, item_id, 'Downloading page 1')
 790         playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
 791         video_count = int_or_none(
 792             self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
 793         token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
 794         page_count = math.ceil((video_count - 36) / 40.) + 1
 795         page_entries = self._extract_entries(webpage, host)
 796
 797         def download_page(page_num):
 798             note = f'Downloading page {page_num}'
 799             page_url = f'https://www.{host}/playlist/viewChunked'
 800             return self._download_webpage(page_url, item_id, note, query={
 801                 'id': playlist_id,
 802                 'page': page_num,
 803                 'token': token,
 804             })
 805
 806         for page_num in range(1, page_count + 1):
 807             if page_num > 1:
 808                 webpage = download_page(page_num)
 809                 page_entries = self._extract_entries(webpage, host)
 810             if not page_entries:
 811                 break
 812             yield from page_entries
 813
 814     def _real_extract(self, url):
 815         mobj = self._match_valid_url(url)
 816         host = mobj.group('host')
 817         item_id = mobj.group('id')
 818
 819         self._login(host)
 820         self._set_age_cookies(host)
 821
 822         return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)