yt_dlp/extractor/tube8.py

   1 import re
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..aes import aes_decrypt_text
   6 from ..utils import (
   7     determine_ext,
   8     format_field,
   9     int_or_none,
  10     str_to_int,
  11     strip_or_none,
  12     url_or_none,
  13 )
  14
  15
  16 class Tube8IE(InfoExtractor):
  17     _WORKING = False
  18     _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
  19     _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)']
  20     _TESTS = [{
  21         'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
  22         'md5': '65e20c48e6abff62ed0c3965fff13a39',
  23         'info_dict': {
  24             'id': '229795',
  25             'display_id': 'kasia-music-video',
  26             'ext': 'mp4',
  27             'description': 'hot teen Kasia grinding',
  28             'uploader': 'unknown',
  29             'title': 'Kasia music video',
  30             'age_limit': 18,
  31             'duration': 230,
  32             'categories': ['Teen'],
  33             'tags': ['dancing'],
  34         },
  35     }, {
  36         'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/',
  37         'only_matching': True,
  38     }]
  39
  40     def _extract_info(self, url, fatal=True):
  41         mobj = self._match_valid_url(url)
  42         video_id = mobj.group('id')
  43         display_id = (mobj.group('display_id')
  44                       if 'display_id' in mobj.groupdict()
  45                       else None) or mobj.group('id')
  46
  47         webpage = self._download_webpage(
  48             url, display_id, headers={'Cookie': 'age_verified=1'})
  49
  50         formats = []
  51         format_urls = set()
  52
  53         title = None
  54         thumbnail = None
  55         duration = None
  56         encrypted = False
  57
  58         def extract_format(format_url, height=None):
  59             format_url = url_or_none(format_url)
  60             if not format_url or not format_url.startswith(('http', '//')):
  61                 return
  62             if format_url in format_urls:
  63                 return
  64             format_urls.add(format_url)
  65             tbr = int_or_none(self._search_regex(
  66                 r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
  67             if not height:
  68                 height = int_or_none(self._search_regex(
  69                     r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
  70             if encrypted:
  71                 format_url = aes_decrypt_text(
  72                     video_url, title, 32).decode('utf-8')
  73             formats.append({
  74                 'url': format_url,
  75                 'format_id': format_field(height, None, '%dp'),
  76                 'height': height,
  77                 'tbr': tbr,
  78             })
  79
  80         flashvars = self._parse_json(
  81             self._search_regex(
  82                 r'flashvars\s*=\s*({.+?});', webpage,
  83                 'flashvars', default='{}'),
  84             display_id, fatal=False)
  85
  86         if flashvars:
  87             title = flashvars.get('video_title')
  88             thumbnail = flashvars.get('image_url')
  89             duration = int_or_none(flashvars.get('video_duration'))
  90             encrypted = flashvars.get('encrypted') is True
  91             for key, value in flashvars.items():
  92                 mobj = re.search(r'quality_(\d+)[pP]', key)
  93                 if mobj:
  94                     extract_format(value, int(mobj.group(1)))
  95             video_url = flashvars.get('video_url')
  96             if video_url and determine_ext(video_url, None):
  97                 extract_format(video_url)
  98
  99         video_url = self._html_search_regex(
 100             r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
 101             webpage, 'video url', default=None, group='url')
 102         if video_url:
 103             extract_format(urllib.parse.unquote(video_url))
 104
 105         if not formats:
 106             if 'title="This video is no longer available"' in webpage:
 107                 self.raise_no_formats(
 108                     f'Video {video_id} is no longer available', expected=True)
 109
 110         if not title:
 111             title = self._html_search_regex(
 112                 r'<h1[^>]*>([^<]+)', webpage, 'title')
 113
 114         return webpage, {
 115             'id': video_id,
 116             'display_id': display_id,
 117             'title': strip_or_none(title),
 118             'thumbnail': thumbnail,
 119             'duration': duration,
 120             'age_limit': 18,
 121             'formats': formats,
 122         }
 123
 124     def _real_extract(self, url):
 125         webpage, info = self._extract_info(url)
 126
 127         if not info['title']:
 128             info['title'] = self._html_search_regex(
 129                 r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
 130
 131         description = self._html_search_regex(
 132             r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False)
 133         uploader = self._html_search_regex(
 134             r'<span class="username">\s*(.+?)\s*<',
 135             webpage, 'uploader', fatal=False)
 136
 137         like_count = int_or_none(self._search_regex(
 138             r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
 139         dislike_count = int_or_none(self._search_regex(
 140             r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
 141         view_count = str_to_int(self._search_regex(
 142             r'Views:\s*</dt>\s*<dd>([\d,\.]+)',
 143             webpage, 'view count', fatal=False))
 144         comment_count = str_to_int(self._search_regex(
 145             r'<span id="allCommentsCount">(\d+)</span>',
 146             webpage, 'comment count', fatal=False))
 147
 148         category = self._search_regex(
 149             r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)',
 150             webpage, 'category', fatal=False)
 151         categories = [category] if category else None
 152
 153         tags_str = self._search_regex(
 154             r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)',
 155             webpage, 'tags', fatal=False)
 156         tags = list(re.findall(
 157             r'<a[^>]+href=[^>]+>([^<]+)', tags_str)) if tags_str else None
 158
 159         info.update({
 160             'description': description,
 161             'uploader': uploader,
 162             'view_count': view_count,
 163             'like_count': like_count,
 164             'dislike_count': dislike_count,
 165             'comment_count': comment_count,
 166             'categories': categories,
 167             'tags': tags,
 168         })
 169
 170         return info