yt_dlp/extractor/kenh14.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     clean_html,
   4     extract_attributes,
   5     get_element_by_class,
   6     get_element_html_by_attribute,
   7     get_elements_html_by_class,
   8     int_or_none,
   9     parse_duration,
  10     parse_iso8601,
  11     remove_start,
  12     strip_or_none,
  13     unescapeHTML,
  14     update_url,
  15     url_or_none,
  16 )
  17 from ..utils.traversal import traverse_obj
  18
  19
  20 class Kenh14VideoIE(InfoExtractor):
  21     _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
  22     _TESTS = [{
  23         'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
  24         'md5': '1ed67f9c3a1e74acf15db69590cf6210',
  25         'info_dict': {
  26             'id': '316173',
  27             'ext': 'mp4',
  28             'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
  29             'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
  30             'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
  31             'tags': [],
  32             'uploader': 'Unbox Therapy',
  33             'upload_date': '20220517',
  34             'view_count': int,
  35             'duration': 722.86,
  36             'timestamp': 1652764468,
  37         },
  38     }, {
  39         'url': 'https://video.kenh14.vn/video-316174.chn',
  40         'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
  41         'info_dict': {
  42             'id': '316174',
  43             'ext': 'mp4',
  44             'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
  45             'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
  46             'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
  47             'tags': [],
  48             'upload_date': '20220517',
  49             'view_count': int,
  50             'duration': 70.04,
  51             'timestamp': 1652766021,
  52         },
  53     }, {
  54         'url': 'https://video.kenh14.vn/0-344740.chn',
  55         'md5': 'b843495d5e728142c8870c09b46df2a9',
  56         'info_dict': {
  57             'id': '344740',
  58             'ext': 'mov',
  59             'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
  60             'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
  61             'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
  62             'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
  63             'uploader': 'Quang Vũ',
  64             'upload_date': '20241024',
  65             'view_count': int,
  66             'duration': 198.88,
  67             'timestamp': 1729741590,
  68         },
  69     }]
  70
  71     def _real_extract(self, url):
  72         video_id = self._match_id(url)
  73         webpage = self._download_webpage(url, video_id)
  74
  75         attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
  76         direct_url = attrs['data-vid']
  77
  78         metadata = self._download_json(
  79             'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
  80                 remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)
  81
  82         formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
  83         subtitles = {}
  84         video_data = self._download_json(
  85             f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
  86         if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
  87             fmts, subs = self._extract_m3u8_formats_and_subtitles(
  88                 hls_url, video_id, m3u8_id='hls', fatal=False)
  89             formats.extend(fmts)
  90             self._merge_subtitles(subs, target=subtitles)
  91         if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
  92             fmts, subs = self._extract_mpd_formats_and_subtitles(
  93                 dash_url, video_id, mpd_id='dash', fatal=False)
  94             formats.extend(fmts)
  95             self._merge_subtitles(subs, target=subtitles)
  96
  97         return {
  98             **traverse_obj(metadata, {
  99                 'duration': ('duration', {parse_duration}),
 100                 'uploader': ('author', {strip_or_none}),
 101                 'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
 102                 'view_count': ('views', {int_or_none}),
 103             }),
 104             'id': video_id,
 105             'title': (
 106                 traverse_obj(metadata, ('title', {strip_or_none}))
 107                 or clean_html(self._og_search_title(webpage))
 108                 or clean_html(get_element_by_class('vdbw-title', webpage))),
 109             'formats': formats,
 110             'subtitles': subtitles,
 111             'description': (
 112                 clean_html(self._og_search_description(webpage))
 113                 or clean_html(get_element_by_class('vdbw-sapo', webpage))),
 114             'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
 115             'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
 116                 {lambda x: x.split(';')}, ..., filter)),
 117         }
 118
 119
 120 class Kenh14PlaylistIE(InfoExtractor):
 121     _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
 122     _TESTS = [{
 123         'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
 124         'info_dict': {
 125             'id': '71',
 126             'title': 'Trần Tình (Naked love) mùa 2',
 127             'description': 'md5:e9522339304956dea931722dd72eddb2',
 128             'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
 129         },
 130         'playlist_count': 9,
 131     }, {
 132         'url': 'https://video.kenh14.vn/playlist/0-72.chn',
 133         'info_dict': {
 134             'id': '72',
 135             'title': 'Lau Lại Đầu Từ',
 136             'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
 137             'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
 138         },
 139         'playlist_count': 6,
 140     }]
 141
 142     def _real_extract(self, url):
 143         playlist_id = self._match_id(url)
 144         webpage = self._download_webpage(url, playlist_id)
 145
 146         category_detail = get_element_by_class('category-detail', webpage) or ''
 147         embed_info = traverse_obj(
 148             self._yield_json_ld(webpage, playlist_id),
 149             (lambda _, v: v['name'] and v['alternateName'], any)) or {}
 150
 151         return self.playlist_from_matches(
 152             get_elements_html_by_class('video-item', webpage), playlist_id,
 153             (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
 154             getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
 155             ie=Kenh14VideoIE, playlist_description=(
 156                 clean_html(get_element_by_class('description', category_detail))
 157                 or unescapeHTML(embed_info.get('alternateName'))),
 158             thumbnail=traverse_obj(
 159                 self._og_search_thumbnail(webpage),
 160                 ({url_or_none}, {update_url(query=None)})))