4 from .common
import InfoExtractor
5 from .youtube
import YoutubeIE
12 get_element_html_by_id
,
20 class GoogleDriveIE(InfoExtractor
):
24 (?:docs|drive|drive\.usercontent)\.google\.com/
26 (?:uc|open|download)\?.*?id=|
29 video\.google\.com/get_player\?.*?docid=
31 (?P<id>[a-zA-Z0-9_-]{28,})
34 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
35 'md5': '5c602afbbf2c1db91831f5d82f678554',
37 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
39 'title': 'Big Buck Bunny.mp4',
41 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
44 # has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922)
45 'url': 'https://drive.google.com/uc?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
46 'md5': '322db8d63dd19788c04050a4bba67073',
48 'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
50 'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3',
52 'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
55 # video can't be watched anonymously due to view count limit reached,
56 # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
57 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
58 'only_matching': True,
60 # video id is longer than 28 characters
61 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
62 'only_matching': True,
64 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
65 'only_matching': True,
67 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
68 'only_matching': True,
70 'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
71 'only_matching': True,
74 **{k
: v
['ext'] for k
, v
in YoutubeIE
._formats
.items() if v
.get('ext')},
77 _BASE_URL_CAPTIONS
= 'https://drive.google.com/timedtext'
78 _CAPTIONS_ENTRY_TAG
= {
80 'automatic_captions': 'target',
82 _caption_formats_ext
= []
86 def _extract_embed_urls(cls
, url
, webpage
):
88 r
'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
91 yield 'https://drive.google.com/file/d/{}'.format(mobj
.group('id'))
93 def _download_subtitles_xml(self
, video_id
, subtitles_id
, hl
):
94 if self
._captions
_xml
:
96 self
._captions
_xml
= self
._download
_xml
(
97 self
._BASE
_URL
_CAPTIONS
, video_id
, query
={
106 }, note
='Downloading subtitles XML',
107 errnote
='Unable to download subtitles XML', fatal
=False)
108 if self
._captions
_xml
:
109 for f
in self
._captions
_xml
.findall('format'):
110 if f
.attrib
.get('fmt_code') and not f
.attrib
.get('default'):
111 self
._caption
_formats
_ext
.append(f
.attrib
['fmt_code'])
113 def _get_captions_by_type(self
, video_id
, subtitles_id
, caption_type
,
114 origin_lang_code
=None):
115 if not subtitles_id
or not caption_type
:
118 for caption_entry
in self
._captions
_xml
.findall(
119 self
._CAPTIONS
_ENTRY
_TAG
[caption_type
]):
120 caption_lang_code
= caption_entry
.attrib
.get('lang_code')
121 if not caption_lang_code
:
123 caption_format_data
= []
124 for caption_format
in self
._caption
_formats
_ext
:
128 'fmt': caption_format
,
129 'lang': (caption_lang_code
if origin_lang_code
is None
130 else origin_lang_code
),
135 if origin_lang_code
is not None:
136 query
.update({'tlang': caption_lang_code
})
137 caption_format_data
.append({
138 'url': update_url_query(self
._BASE
_URL
_CAPTIONS
, query
),
139 'ext': caption_format
,
141 captions
[caption_lang_code
] = caption_format_data
144 def _get_subtitles(self
, video_id
, subtitles_id
, hl
):
145 if not subtitles_id
or not hl
:
147 self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
)
148 if not self
._captions
_xml
:
150 return self
._get
_captions
_by
_type
(video_id
, subtitles_id
, 'subtitles')
152 def _get_automatic_captions(self
, video_id
, subtitles_id
, hl
):
153 if not subtitles_id
or not hl
:
155 self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
)
156 if not self
._captions
_xml
:
158 track
= self
._captions
_xml
.find('track')
161 origin_lang_code
= track
.attrib
.get('lang_code')
162 if not origin_lang_code
:
164 return self
._get
_captions
_by
_type
(
165 video_id
, subtitles_id
, 'automatic_captions', origin_lang_code
)
167 def _real_extract(self
, url
):
168 video_id
= self
._match
_id
(url
)
169 video_info
= urllib
.parse
.parse_qs(self
._download
_webpage
(
170 'https://drive.google.com/get_video_info',
171 video_id
, 'Downloading video webpage', query
={'docid': video_id
}))
174 return try_get(video_info
, lambda x
: x
[key
][0])
176 reason
= get_value('reason')
177 title
= get_value('title')
180 fmt_stream_map
= (get_value('fmt_stream_map') or '').split(',')
181 fmt_list
= (get_value('fmt_list') or '').split(',')
182 if fmt_stream_map
and fmt_list
:
186 r
'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt
)
188 resolutions
[mobj
.group('format_id')] = (
189 int(mobj
.group('width')), int(mobj
.group('height')))
191 for fmt_stream
in fmt_stream_map
:
192 fmt_stream_split
= fmt_stream
.split('|')
193 if len(fmt_stream_split
) < 2:
195 format_id
, format_url
= fmt_stream_split
[:2]
196 ext
= self
._FORMATS
_EXT
.get(format_id
)
198 self
.report_warning(f
'Unknown format {format_id}{bug_reports_message()}')
200 'url': lowercase_escape(format_url
),
201 'format_id': format_id
,
204 resolution
= resolutions
.get(format_id
)
207 'width': resolution
[0],
208 'height': resolution
[1],
212 source_url
= update_url_query(
213 'https://drive.usercontent.google.com/download', {
215 'export': 'download',
219 def request_source_file(source_url
, kind
, data
=None):
220 return self
._request
_webpage
(
221 source_url
, video_id
, note
=f
'Requesting {kind} file',
222 errnote
=f
'Unable to request {kind} file', fatal
=False, data
=data
)
223 urlh
= request_source_file(source_url
, 'source')
225 def add_source_format(urlh
):
228 title
= self
._search
_regex
(
229 r
'\bfilename="([^"]+)"', urlh
.headers
.get('Content-Disposition'),
230 'title', default
=None)
232 # Use redirect URLs as download URLs in order to calculate
233 # correct cookies in _calc_cookies.
234 # Using original URLs may result in redirect loop due to
235 # google.com's cookies mistakenly used for googleusercontent.com
236 # redirect URLs (see #23919).
238 'ext': determine_ext(title
, 'mp4').lower(),
239 'format_id': 'source',
242 if urlh
.headers
.get('Content-Disposition'):
243 add_source_format(urlh
)
245 confirmation_webpage
= self
._webpage
_read
_content
(
246 urlh
, url
, video_id
, note
='Downloading confirmation page',
247 errnote
='Unable to confirm download', fatal
=False)
248 if confirmation_webpage
:
249 confirmed_source_url
= extract_attributes(
250 get_element_html_by_id('download-form', confirmation_webpage
) or '').get('action')
251 if confirmed_source_url
:
252 urlh
= request_source_file(confirmed_source_url
, 'confirmed source', data
=b
'')
253 if urlh
and urlh
.headers
.get('Content-Disposition'):
254 add_source_format(urlh
)
257 get_element_by_class('uc-error-subcaption', confirmation_webpage
)
258 or get_element_by_class('uc-error-caption', confirmation_webpage
)
259 or 'unable to extract confirmation code')
261 if not formats
and reason
:
263 self
.raise_no_formats(reason
, expected
=True)
265 raise ExtractorError(reason
, expected
=True)
269 ttsurl
= get_value('ttsurl')
271 # the video Id for subtitles will be the last value in the ttsurl
273 subtitles_id
= ttsurl
.encode().decode(
274 'unicode_escape').split('=')[-1]
276 self
.cookiejar
.clear(domain
='.google.com', path
='/', name
='NID')
281 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id
,
282 'duration': int_or_none(get_value('length_seconds')),
284 'subtitles': self
.extract_subtitles(video_id
, subtitles_id
, hl
),
285 'automatic_captions': self
.extract_automatic_captions(
286 video_id
, subtitles_id
, hl
),
290 class GoogleDriveFolderIE(InfoExtractor
):
291 IE_NAME
= 'GoogleDrive:Folder'
292 _VALID_URL
= r
'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
294 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
296 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
301 _BOUNDARY
= '=====vc17a3rwnndj====='
302 _REQUEST
= "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
303 _DATA
= f
'''--{_BOUNDARY}
304 content-type: application/http
305 content-transfer-encoding: binary
312 def _call_api(self
, folder_id
, key
, data
, **kwargs
):
313 response
= self
._download
_webpage
(
314 'https://clients6.google.com/batch/drive/v2beta',
315 folder_id
, data
=data
.encode(),
317 'Content-Type': 'text/plain;charset=UTF-8;',
318 'Origin': 'https://drive.google.com',
320 '$ct': f
'multipart/mixed; boundary="{self._BOUNDARY}"',
323 return self
._search
_json
('', response
, 'api response', folder_id
, **kwargs
) or {}
325 def _get_folder_items(self
, folder_id
, key
):
327 while page_token
is not None:
328 request
= self
._REQUEST
.format(folder_id
=folder_id
, page_token
=page_token
, key
=key
)
329 page
= self
._call
_api
(folder_id
, key
, self
._DATA
% request
)
330 yield from page
['items']
331 page_token
= page
.get('nextPageToken')
def _real_extract(self, url):
    """Turn a Google Drive folder URL into a playlist of its files.

    The folder ID is matched from ``url``. The folder page is downloaded
    and searched for a quoted 39-character word token, which is then
    passed to the API calls as ``key``. Folder metadata is requested with
    ``fatal=False`` and only its ``title`` is read for the playlist title.
    Every item yielded by ``_get_folder_items`` is mapped to a
    ``https://drive.google.com/file/d/<id>`` URL and delegated to
    ``GoogleDriveIE``.
    """
    folder_id = self._match_id(url)

    webpage = self._download_webpage(url, folder_id)
    key = self._search_regex(r'"(\w{39})"', webpage, 'key')

    # Best-effort metadata lookup: a failure here must not abort extraction,
    # the result is only consulted for the playlist title.
    metadata_request = self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1'
    folder_info = self._call_api(folder_id, key, metadata_request, fatal=False)

    def file_url(item):
        # Each folder item is expected to carry an "id" key.
        return f'https://drive.google.com/file/d/{item["id"]}'

    return self.playlist_from_matches(
        self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'),
        ie=GoogleDriveIE, getter=file_url)