yt_dlp/extractor/amazon.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     clean_html,
   7     float_or_none,
   8     get_element_by_attribute,
   9     get_element_by_class,
  10     int_or_none,
  11     js_to_json,
  12     traverse_obj,
  13     url_or_none,
  14 )
  15
  16
  17 class AmazonStoreIE(InfoExtractor):
  18     _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'
  19
  20     _TESTS = [{
  21         'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
  22         'info_dict': {
  23             'id': 'B098XNCHLD',
  24             'title': str,
  25         },
  26         'playlist_mincount': 1,
  27         'playlist': [{
  28             'info_dict': {
  29                 'id': 'A1F83G8C2ARO7P',
  30                 'ext': 'mp4',
  31                 'title': 'mcdodo usb c cable 100W 5a',
  32                 'thumbnail': r're:^https?://.*\.jpg$',
  33                 'duration': 34,
  34             },
  35         }],
  36         'expected_warnings': ['Unable to extract data'],
  37     }, {
  38         'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
  39         'info_dict': {
  40             'id': 'B0863TXGM3',
  41             'title': str,
  42         },
  43         'playlist_mincount': 4,
  44         'expected_warnings': ['Unable to extract data'],
  45     }, {
  46         'url': 'https://www.amazon.com/dp/B0845NXCXF/',
  47         'info_dict': {
  48             'id': 'B0845NXCXF',
  49             'title': str,
  50         },
  51         'playlist-mincount': 1,
  52         'expected_warnings': ['Unable to extract data'],
  53     }, {
  54         'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
  55         'info_dict': {
  56             'id': 'B08WX337PQ',
  57             'title': str,
  58         },
  59         'playlist_mincount': 1,
  60         'expected_warnings': ['Unable to extract data'],
  61     }]
  62
  63     def _real_extract(self, url):
  64         playlist_id = self._match_id(url)
  65
  66         for retry in self.RetryManager():
  67             webpage = self._download_webpage(url, playlist_id)
  68             try:
  69                 data_json = self._search_json(
  70                     r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', playlist_id,
  71                     transform_source=js_to_json)
  72             except ExtractorError as e:
  73                 retry.error = e
  74
  75         entries = [{
  76             'id': video['marketPlaceID'],
  77             'url': video['url'],
  78             'title': video.get('title'),
  79             'thumbnail': video.get('thumbUrl') or video.get('thumb'),
  80             'duration': video.get('durationSeconds'),
  81             'height': int_or_none(video.get('videoHeight')),
  82             'width': int_or_none(video.get('videoWidth')),
  83         } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
  84         return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=data_json.get('title'))
  85
  86
  87 class AmazonReviewsIE(InfoExtractor):
  88     _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
  89     _TESTS = [{
  90         'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
  91         'info_dict': {
  92             'id': 'R10VE9VUSY19L3',
  93             'ext': 'mp4',
  94             'title': 'Get squad #Suspicious',
  95             'description': 'md5:7012695052f440a1e064e402d87e0afb',
  96             'uploader': 'Kimberly Cronkright',
  97             'average_rating': 1.0,
  98             'thumbnail': r're:^https?://.*\.jpg$',
  99         },
 100         'expected_warnings': ['Review body was not found in webpage'],
 101     }, {
 102         'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
 103         'info_dict': {
 104             'id': 'R10VE9VUSY19L3',
 105             'ext': 'mp4',
 106             'title': 'Get squad #Suspicious',
 107             'description': 'md5:7012695052f440a1e064e402d87e0afb',
 108             'uploader': 'Kimberly Cronkright',
 109             'average_rating': 1.0,
 110             'thumbnail': r're:^https?://.*\.jpg$',
 111         },
 112         'expected_warnings': ['Review body was not found in webpage'],
 113     }, {
 114         'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
 115         'info_dict': {
 116             'id': 'RV1CO8JN5VGXV',
 117             'ext': 'mp4',
 118             'title': 'Not sure about its durability',
 119             'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
 120             'uploader': 'Shoaib Gulzar',
 121             'average_rating': 2.0,
 122             'thumbnail': r're:^https?://.*\.jpg$',
 123         },
 124         'expected_warnings': ['Review body was not found in webpage'],
 125     }]
 126
 127     def _real_extract(self, url):
 128         video_id = self._match_id(url)
 129
 130         for retry in self.RetryManager():
 131             webpage = self._download_webpage(url, video_id)
 132             review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
 133             if not review_body:
 134                 retry.error = ExtractorError('Review body was not found in webpage', expected=True)
 135
 136         formats, subtitles = [], {}
 137
 138         manifest_url = self._search_regex(
 139             r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
 140         if url_or_none(manifest_url):
 141             fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
 142                 manifest_url, video_id, 'mp4', fatal=False)
 143             formats.extend(fmts)
 144
 145         video_url = self._search_regex(
 146             r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
 147         if url_or_none(video_url):
 148             formats.append({
 149                 'url': video_url,
 150                 'ext': 'mp4',
 151                 'format_id': 'http-mp4',
 152             })
 153
 154         if not formats:
 155             self.raise_no_formats('No video found for this customer review', expected=True)
 156
 157         return {
 158             'id': video_id,
 159             'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
 160                       or self._html_extract_title(webpage)),
 161             'description': clean_html(traverse_obj(re.findall(
 162                 r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body), -1)),
 163             'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
 164             'average_rating': float_or_none(clean_html(get_element_by_attribute(
 165                 'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]),
 166             'thumbnail': self._search_regex(
 167                 r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None),
 168             'formats': formats,
 169             'subtitles': subtitles,
 170         }