1 # -*- coding: utf-8 -*-
2 # Copyright 2013 Google Inc. All Rights Reserved.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 """File and Cloud URL representation classes."""
17 from __future__
import absolute_import
22 from gslib
.exception
import InvalidUrlError
# Matches provider-only strings of the form 'gs://' (nothing may follow the
# '://' separator).
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket', optionally followed by a
# single trailing slash. The bucket group may not contain '/'.
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'. Everything after the
# first slash that follows the bucket is captured as the object name.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object strings of the form 'gs://bucket/obj#1234'. The
# generation is the all-digit suffix after '#'.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object strings of the form 's3://bucket/obj#NULL'. The
# version id is any non-empty suffix after '#'; because the object group is
# greedy, the split happens at the last '#'.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'. Group 1 is the
# scheme including '://'; group 2 ('filepath') is everything after it.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to disallow buckets violating charset or not [3..255] chars total.
# Names must start and end with an alphanumeric character.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Regex to disallow buckets with individual DNS labels longer than 63, i.e.
# any run of 64 characters that contains no '.' separator.
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')
# Regex to determine if a string contains any wildcards ('*', '?', '[', ']').
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Every method and property below other than the comparison helpers raises
  NotImplementedError and is meant to be overridden by subclasses. Equality
  and hashing are defined in terms of url_string, so two URLs are equal
  exactly when their string forms are equal.
  """

  def Clone(self):
    """Returns a copy of this URL. Must be overridden."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns whether this is a file URL. Must be overridden."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns whether this is a cloud URL. Must be overridden."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns whether this URL denotes a stream. Must be overridden."""
    raise NotImplementedError('IsStream not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to
                       the prefix, separated from it by a slash, before being
                       returned.

    Returns:
      A prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """The full string form of this URL. Must be overridden."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """The string form of this URL without version info. Must be overridden."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    """Compares two URLs by their string form.

    Args:
      other: Object to compare against.

    Returns:
      True or False when other is a StorageUrl; NotImplemented otherwise, so
      that comparing against unrelated objects (strings, None, ...) yields
      False instead of raising AttributeError.
    """
    if not isinstance(other, StorageUrl):
      return NotImplemented
    return self.url_string == other.url_string

  def __ne__(self, other):
    """Inverse of __eq__."""
    # Python 2 does not derive != from ==, so define it explicitly to keep the
    # two operators consistent.
    result = self.__eq__(other)
    if result is NotImplemented:
      return result
    return not result

  def __hash__(self):
    """Hashes by string form so equal URLs share a hash."""
    return hash(self.url_string)
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

    This class assists with usage and manipulation of an
    (optionally wildcarded) file URL string.  Depending on the string
    contents, this class represents one or more directories or files.

    For File URLs, scheme is always file, bucket_name is always blank,
    and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False):
    """Parses url_string into a file URL.

    Args:
      url_string: Either a bare path or a string of the form
                  'scheme://path'.
      is_stream: Whether this URL denotes a stream rather than a file.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match and match.lastindex == 2:
      # Drop the leading 'scheme://' and keep only the path.
      self.object_name = match.group(2)
    else:
      # No scheme separator present: the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream

  def Clone(self):
    """Returns a copy of this URL, including its stream flag."""
    # url_string alone does not carry is_stream, so forward it explicitly;
    # otherwise the clone of a stream URL would report IsStream() == False.
    return _FileUrl(self.url_string, is_stream=self.is_stream)

  def IsFileUrl(self):
    """Returns True; this is a file URL."""
    return True

  def IsCloudUrl(self):
    """Returns False; this is not a cloud URL."""
    return False

  def IsStream(self):
    """Returns the is_stream flag supplied at construction."""
    return self.is_stream

  def IsDirectory(self):
    """Returns True if this is not a stream and the path is a directory."""
    return not self.IsStream() and os.path.isdir(self.object_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is ignored."""
    return self.url_string

  @property
  def url_string(self):
    """The URL rendered as 'scheme://path'."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string; generation is always None for file URLs."""
    return self.url_string

  def __str__(self):
    return self.url_string
class _CloudUrl(StorageUrl):
  """Cloud URL class providing parsing and convenience methods.

    This class assists with usage and manipulation of an
    (optionally wildcarded) cloud URL string.  Depending on the string
    contents, this class represents a provider, bucket(s), or object(s).

    This class operates only on strings.  No cloud storage API calls are
    made from this class.
  """

  def __init__(self, url_string):
    """Parses url_string as a provider, bucket or object URL.

    Args:
      url_string: String of the form 'provider://', 'provider://bucket' or
                  'provider://bucket/object', optionally with a '#version'
                  suffix on the object for the gs and s3 schemes.

    Raises:
      InvalidUrlError: if a non-wildcard bucket name in a bucket URL fails
                       validation, or if the string matches none of the URL
                       regexes.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None

    # Provider-only form takes precedence over the bucket form.
    parsed = PROVIDER_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      return

    parsed = BUCKET_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      self.bucket_name = parsed.group('bucket')
      # Wildcarded bucket names are exempt from name validation.
      if not ContainsWildcard(self.bucket_name):
        well_formed = BUCKET_NAME_RE.match(self.bucket_name)
        label_too_long = TOO_LONG_DNS_NAME_COMP.search(self.bucket_name)
        if not well_formed or label_too_long:
          raise InvalidUrlError(
              'Invalid bucket name in URL "%s"' % url_string)
      return

    parsed = OBJECT_REGEX.match(url_string)
    if not parsed:
      raise InvalidUrlError(
          'CloudUrl: URL string %s did not match URL regex' % url_string)
    self.scheme = parsed.group('provider')
    self.bucket_name = parsed.group('bucket')
    self.object_name = parsed.group('object')

    # Per-scheme syntax for the optional '#version' suffix: the regex to try
    # and the name of its group holding the version.
    version_syntax = {
        'gs': (GS_GENERATION_REGEX, 'generation'),
        's3': (S3_VERSION_REGEX, 'version_id'),
    }
    if self.scheme in version_syntax:
      version_regex, version_group = version_syntax[self.scheme]
      versioned = version_regex.match(self.object_name)
      if versioned:
        self.object_name = versioned.group('object')
        self.generation = versioned.group(version_group)

  def Clone(self):
    """Returns a new _CloudUrl parsed from this URL's string form."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    """Returns False; this is not a file URL."""
    return False

  def IsCloudUrl(self):
    """Returns True; this is a cloud URL."""
    return True

  def IsStream(self):
    """Not supported for cloud URLs; always raises NotImplementedError."""
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket is named but no object is."""
    return bool(self.bucket_name) and not self.object_name

  def IsObject(self):
    """Returns True if both a bucket and an object are named."""
    return bool(self.bucket_name) and bool(self.object_name)

  def HasGeneration(self):
    """Returns True if a generation / version id was parsed."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True if a scheme is set but no bucket is named."""
    return bool(self.scheme) and not self.bucket_name

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus any suffix.

    Args:
      wildcard_suffix: If supplied (and non-empty), appended to the prefix
                       after a '/' separator.

    Returns:
      The prefix string.
    """
    base = StripOneSlash(self.versionless_url_string)
    if not wildcard_suffix:
      return base
    return '%s/%s' % (base, wildcard_suffix)

  @property
  def bucket_url_string(self):
    """The URL of the bucket, rendered as 'scheme://bucket/'."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """The full URL, with '#generation' appended when one is present."""
    base = self.versionless_url_string
    if not self.HasGeneration():
      return base
    return base + '#%s' % self.generation

  @property
  def versionless_url_string(self):
    """The URL without any generation / version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    if self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
def _GetSchemeFromUrlString(url_str):
  """Returns scheme component of a URL string.

  Args:
    url_str: URL string, with or without a 'scheme://' prefix.

  Returns:
    The lower-cased text before the first '://', or 'file' when the string
    contains no '://'.
  """
  scheme, separator, _ = url_str.partition('://')
  if not separator:
    # File is the default scheme.
    return 'file'
  return scheme.lower()
def _GetPathFromUrlString(url_str):
  """Returns path component of a URL string.

  Args:
    url_str: URL string, with or without a 'scheme://' prefix.

  Returns:
    Everything after the first '://', or url_str itself when it contains no
    '://'.
  """
  _, separator, path = url_str.partition('://')
  return path if separator else url_str
def IsFileUrlString(url_str):
  """Returns whether a string is a file URL.

  Args:
    url_str: URL string to classify.

  Returns:
    True if the scheme extracted from url_str is 'file', else False.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  return scheme == 'file'
def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string with a 'file', 's3' or 'gs' scheme.

  Returns:
    A _FileUrl for the 'file' scheme, otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: if the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  if scheme == 'file':
    # A path of exactly '-' marks the URL as a stream.
    reads_stream = _GetPathFromUrlString(url_str) == '-'
    return _FileUrl(url_str, is_stream=reads_stream)
  if scheme in ('s3', 'gs'):
    return _CloudUrl(url_str)
  raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
def StripOneSlash(url_str):
  """Removes at most one trailing slash from url_str.

  Args:
    url_str: String to strip; may be empty or None.

  Returns:
    url_str without its final '/' if it ended with one, otherwise url_str
    unchanged.
  """
  if not url_str:
    return url_str
  if url_str[-1] == '/':
    return url_str[:-1]
  return url_str
def ContainsWildcard(url_string):
  """Checks whether url_string contains a wildcard.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX matches anywhere in url_string, else False.
  """
  return WILDCARD_REGEX.search(url_string) is not None