3 from .common
import InfoExtractor
8 get_element_by_attribute
,
class AmazonStoreIE(InfoExtractor):
    """Extractor for the videos embedded in an Amazon product page.

    Matches ``/dp/<id>`` and ``/gp/product/<id>`` URLs (optionally preceded by
    one path segment such as a product slug) on ``amazon.<tld>`` hosts, and
    returns the page's videos as a playlist keyed by the product id.
    """

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'

    # NOTE(review): only the test fields visible in the reviewed source are
    # listed here; the `playlist` / `info_dict` nesting around the entry
    # fields of the first case is assumed from the usual test-case layout --
    # confirm against the upstream file before merging.
    _TESTS = [{
        'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
        'playlist_mincount': 1,
        'playlist': [{
            'info_dict': {
                'id': 'A1F83G8C2ARO7P',
                'title': 'mcdodo usb c cable 100W 5a',
                'thumbnail': r're:^https?://.*\.jpg$',
            },
        }],
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
        'playlist_mincount': 4,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.com/dp/B0845NXCXF/',
        # Was spelled 'playlist-mincount'; every other case uses the
        # underscore form, so the hyphenated key was presumably never checked.
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }]

    def _real_extract(self, url):
        """Build a playlist from the video metadata embedded in the product page.

        The page passes a JSON string to ``jQuery.parseJSON('...')`` and
        assigns the result to ``var obj``; that JSON carries a ``videos`` list
        and a ``title``.  Only items flagged ``isVideo`` that also have a
        ``url`` become playlist entries.

        :param url: product page URL matching ``_VALID_URL``.
        :return: the result of ``self.playlist_result`` for the page's videos,
            titled with the ``title`` field of the embedded JSON.
        """
        playlist_id = self._match_id(url)

        for retry in self.RetryManager():
            webpage = self._download_webpage(url, playlist_id)
            try:
                data_json = self._search_json(
                    r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', playlist_id,
                    transform_source=js_to_json)
            except ExtractorError as e:
                # Record the failure on the retry handle instead of raising
                # here; presumably the retry manager then re-downloads the
                # page or re-raises once attempts are exhausted -- confirm.
                retry.error = e

        entries = [{
            'id': video['marketPlaceID'],
            # Always present: the filter below drops items without a 'url'.
            'url': video['url'],
            'title': video.get('title'),
            'thumbnail': video.get('thumbUrl') or video.get('thumb'),
            'duration': video.get('durationSeconds'),
            'height': int_or_none(video.get('videoHeight')),
            'width': int_or_none(video.get('videoWidth')),
        } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=data_json.get('title'))
87 class AmazonReviewsIE(InfoExtractor):
88 _VALID_URL = r'https?
://(?
:www\
.)?amazon\
.(?
:[a
-z
]{2,3})(?
:\
.[a
-z
]{2}
)?
/gp
/customer
-reviews
/(?P
<id>[^
/&#$?]+)'
90 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
92 'id': 'R10VE9VUSY19L3',
94 'title': 'Get squad #Suspicious',
95 'description': 'md5:7012695052f440a1e064e402d87e0afb',
96 'uploader': 'Kimberly Cronkright',
97 'average_rating': 1.0,
98 'thumbnail': r
're:^https?://.*\.jpg$',
100 'expected_warnings': ['Review body was not found in webpage'],
102 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
104 'id': 'R10VE9VUSY19L3',
106 'title': 'Get squad #Suspicious',
107 'description': 'md5:7012695052f440a1e064e402d87e0afb',
108 'uploader': 'Kimberly Cronkright',
109 'average_rating': 1.0,
110 'thumbnail': r
're:^https?://.*\.jpg$',
112 'expected_warnings': ['Review body was not found in webpage'],
114 'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
116 'id': 'RV1CO8JN5VGXV',
118 'title': 'Not sure about its durability',
119 'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
120 'uploader': 'Shoaib Gulzar',
121 'average_rating': 2.0,
122 'thumbnail': r
're:^https?://.*\.jpg$',
124 'expected_warnings': ['Review body was not found in webpage'],
127 def _real_extract(self
, url
):
128 video_id
= self
._match
_id
(url
)
130 for retry
in self
.RetryManager():
131 webpage
= self
._download
_webpage
(url
, video_id
)
132 review_body
= get_element_by_attribute('data-hook', 'review-body', webpage
)
134 retry
.error
= ExtractorError('Review body was not found in webpage', expected
=True)
136 formats
, subtitles
= [], {}
138 manifest_url
= self
._search
_regex
(
139 r
'data-video-url="([^"]+)"', review_body
, 'm3u8 url', default
=None)
140 if url_or_none(manifest_url
):
141 fmts
, subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
142 manifest_url
, video_id
, 'mp4', fatal
=False)
145 video_url
= self
._search
_regex
(
146 r
'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body
, 'mp4 url', default
=None)
147 if url_or_none(video_url
):
151 'format_id': 'http-mp4',
155 self
.raise_no_formats('No video found for this customer review', expected
=True)
159 'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage
))
160 or self
._html
_extract
_title
(webpage
)),
161 'description': clean_html(traverse_obj(re
.findall(
162 r
'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body
), -1)),
163 'uploader': clean_html(get_element_by_class('a-profile-name', webpage
)),
164 'average_rating': float_or_none(clean_html(get_element_by_attribute(
165 'data-hook', 'review-star-rating', webpage
) or '').partition(' ')[0]),
166 'thumbnail': self
._search
_regex
(
167 r
'data-thumbnail-url="([^"]+)"', review_body
, 'thumbnail', default
=None),
169 'subtitles': subtitles
,