[ie/box] Fix formats extraction (#8649)
[yt-dlp3.git] / yt_dlp / extractor / playvid.py
blob1e0989d0aabe3b50ed2aa6ea97ce8a833e8abccd
1 import re
2 import urllib.parse
4 from .common import InfoExtractor
5 from ..compat import compat_urllib_parse_unquote
6 from ..utils import ExtractorError, clean_html
class PlayvidIE(InfoExtractor):
    """Extractor for playvid.com watch pages.

    Video metadata and the per-resolution media URLs are read from the
    URL-encoded ``flashvars`` attribute embedded in the watch page.
    """

    _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
    _TESTS = [{
        'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
        'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
        'info_dict': {
            'id': 'RnmBNgtrrJu',
            'ext': 'mp4',
            'title': 'md5:9256d01c6317e3f703848b5906880dc8',
            'duration': 82,
            'age_limit': 18,
        },
        'skip': 'Video removed due to ToS',
    }, {
        'url': 'http://www.playvid.com/watch/hwb0GpNkzgH',
        'md5': '39d49df503ad7b8f23a4432cbf046477',
        'info_dict': {
            'id': 'hwb0GpNkzgH',
            'ext': 'mp4',
            'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park',
            'age_limit': 18,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }]

    def _real_extract(self, url):
        """Download the watch page for *url* and build its info dict.

        Raises ExtractorError (marked as expected) when the page contains a
        ``block-error`` element, using that element's text as the message.
        The returned dict always has ``age_limit`` 18 and ``description``
        None; ``thumbnail`` and ``duration`` stay None when the flashvars do
        not provide them.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # An error block on the page carries the message to surface to the user.
        error = re.search(
            r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>',
            webpage)
        if error is not None:
            raise ExtractorError(clean_html(error.group('msg')), expected=True)

        title = duration = thumbnail = None
        formats = []

        # Most of the information is stored in the flashvars attribute.
        flashvars = self._html_search_regex(
            r'flashvars="(.+?)"', webpage, 'flashvars')

        for pair in compat_urllib_parse_unquote(flashvars).split('&'):
            var = re.match(r'^video_vars\[(.+?)\]=(.+?)$', pair)
            if var is None:
                # Only "video_vars[<key>]=<value>" entries are of interest.
                continue
            key, val = var.groups()

            if key == 'title':
                # The title is decoded a second time, treating '+' as a space.
                title = urllib.parse.unquote_plus(val)
            elif key == 'duration':
                try:
                    duration = int(val)
                except ValueError:
                    # A non-numeric duration is ignored; duration stays None.
                    pass
            elif key == 'big_thumb':
                thumbnail = val
            else:
                # Media URL keys look like "video_urls][<height>p...".
                resolution = re.match(
                    r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
                if resolution is not None:
                    formats.append({
                        'height': int(resolution.group('resolution')),
                        'url': val,
                    })

        # The title should be in the flashvars; if not, fall back to the page.
        if title is None:
            title = self._html_extract_title(webpage)

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'description': None,
            'age_limit': 18,
        }