tools/telemetry/third_party/gsutilz/gslib/storage_url.py

   1 # -*- coding: utf-8 -*-
   2 # Copyright 2013 Google Inc. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License, Version 2.0 (the "License");
   5 # you may not use this file except in compliance with the License.
   6 # You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 # See the License for the specific language governing permissions and
  14 # limitations under the License.
  15 """File and Cloud URL representation classes."""
  16
  17 from __future__ import absolute_import
  18
  19 import os
  20 import re
  21
  22 from gslib.exception import InvalidUrlError
  23
# Matches provider-only strings of the form 'gs://' (nothing after '://').
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket', with at most one trailing
# slash ('gs://bucket/').
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'. The object group is
# everything after the first slash that follows the bucket name.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object strings of the form 'gs://bucket/obj#1234', i.e.
# ending in '#' followed by digits only. _CloudUrl applies this to the object
# name portion of the URL.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object strings of the form 's3://bucket/obj#NULL'; the
# version id may be any non-empty string. _CloudUrl applies this to the object
# name portion of the URL.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'; the filepath group is
# everything after the first '<scheme>://' prefix.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to disallow buckets violating charset or not [3..255] chars total.
# The first and last characters must be alphanumeric.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Regex to disallow buckets with individual DNS labels longer than 63: finds a
# run of 64 characters containing no '.' separator. Note that the character
# class only contains lowercase letters.
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')
# Regex to determine if a string contains any wildcards: '*', '?', '[' or ']'.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
  43
  44
  45 class StorageUrl(object):
  46   """Abstract base class for file and Cloud Storage URLs."""
  47
  48   def Clone(self):
  49     raise NotImplementedError('Clone not overridden')
  50
  51   def IsFileUrl(self):
  52     raise NotImplementedError('IsFileUrl not overridden')
  53
  54   def IsCloudUrl(self):
  55     raise NotImplementedError('IsCloudUrl not overridden')
  56
  57   def IsStream(self):
  58     raise NotImplementedError('IsStream not overridden')
  59
  60   def CreatePrefixUrl(self, wildcard_suffix=None):
  61     """Returns a prefix of this URL that can be used for iterating.
  62
  63     Args:
  64       wildcard_suffix: If supplied, this wildcard suffix will be appended to the
  65                        prefix with a trailing slash before being returned.
  66
  67     Returns:
  68       A prefix of this URL that can be used for iterating.
  69
  70     If this URL contains a trailing slash, it will be stripped to create the
  71     prefix. This helps avoid infinite looping when prefixes are iterated, but
  72     preserves other slashes so that objects with '/' in the name are handled
  73     properly.
  74
  75     For example, when recursively listing a bucket with the following contents:
  76       gs://bucket// <-- object named slash
  77       gs://bucket//one-dir-deep
  78     a top-level expansion with '/' as a delimiter will result in the following
  79     URL strings:
  80       'gs://bucket//' : OBJECT
  81       'gs://bucket//' : PREFIX
  82     If we right-strip all slashes from the prefix entry and add a wildcard
  83     suffix, we will get 'gs://bucket/*' which will produce identical results
  84     (and infinitely recurse).
  85
  86     Example return values:
  87       ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
  88       ('gs://bucket/', '*') becomes 'gs://bucket/*'
  89       ('gs://bucket/', None) becomes 'gs://bucket'
  90       ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
  91       ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
  92       ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
  93            'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
  94            as a BucketListingObject, so we will not recurse on it as a subdir
  95            during listing.
  96     """
  97     raise NotImplementedError('CreatePrefixUrl not overridden')
  98
  99   @property
 100   def url_string(self):
 101     raise NotImplementedError('url_string not overridden')
 102
 103   @property
 104   def versionless_url_string(self):
 105     raise NotImplementedError('versionless_url_string not overridden')
 106
 107   def __eq__(self, other):
 108     return self.url_string == other.url_string
 109
 110   def __hash__(self):
 111     return hash(self.url_string)
 112
 113
 114 class _FileUrl(StorageUrl):
 115   """File URL class providing parsing and convenience methods.
 116
 117     This class assists with usage and manipulation of an
 118     (optionally wildcarded) file URL string.  Depending on the string
 119     contents, this class represents one or more directories or files.
 120
 121     For File URLs, scheme is always file, bucket_name is always blank,
 122     and object_name contains the file/directory path.
 123   """
 124
 125   def __init__(self, url_string, is_stream=False):
 126     self.scheme = 'file'
 127     self.bucket_name = ''
 128     match = FILE_OBJECT_REGEX.match(url_string)
 129     if match and match.lastindex == 2:
 130       self.object_name = match.group(2)
 131     else:
 132       self.object_name = url_string
 133     self.generation = None
 134     self.is_stream = is_stream
 135     self.delim = os.sep
 136
 137   def Clone(self):
 138     return _FileUrl(self.url_string)
 139
 140   def IsFileUrl(self):
 141     return True
 142
 143   def IsCloudUrl(self):
 144     return False
 145
 146   def IsStream(self):
 147     return self.is_stream
 148
 149   def IsDirectory(self):
 150     return not self.IsStream() and os.path.isdir(self.object_name)
 151
 152   def CreatePrefixUrl(self, wildcard_suffix=None):
 153     return self.url_string
 154
 155   @property
 156   def url_string(self):
 157     return '%s://%s' % (self.scheme, self.object_name)
 158
 159   @property
 160   def versionless_url_string(self):
 161     return self.url_string
 162
 163   def __str__(self):
 164     return self.url_string
 165
 166
 167 class _CloudUrl(StorageUrl):
 168   """Cloud URL class providing parsing and convenience methods.
 169
 170     This class assists with usage and manipulation of an
 171     (optionally wildcarded) cloud URL string.  Depending on the string
 172     contents, this class represents a provider, bucket(s), or object(s).
 173
 174     This class operates only on strings.  No cloud storage API calls are
 175     made from this class.
 176   """
 177
 178   def __init__(self, url_string):
 179     self.scheme = None
 180     self.bucket_name = None
 181     self.object_name = None
 182     self.generation = None
 183     self.delim = '/'
 184     provider_match = PROVIDER_REGEX.match(url_string)
 185     bucket_match = BUCKET_REGEX.match(url_string)
 186     if provider_match:
 187       self.scheme = provider_match.group('provider')
 188     elif bucket_match:
 189       self.scheme = bucket_match.group('provider')
 190       self.bucket_name = bucket_match.group('bucket')
 191       if (not ContainsWildcard(self.bucket_name) and
 192           (not BUCKET_NAME_RE.match(self.bucket_name) or
 193            TOO_LONG_DNS_NAME_COMP.search(self.bucket_name))):
 194         raise InvalidUrlError('Invalid bucket name in URL "%s"' % url_string)
 195     else:
 196       object_match = OBJECT_REGEX.match(url_string)
 197       if object_match:
 198         self.scheme = object_match.group('provider')
 199         self.bucket_name = object_match.group('bucket')
 200         self.object_name = object_match.group('object')
 201         if self.scheme == 'gs':
 202           generation_match = GS_GENERATION_REGEX.match(self.object_name)
 203           if generation_match:
 204             self.object_name = generation_match.group('object')
 205             self.generation = generation_match.group('generation')
 206         elif self.scheme == 's3':
 207           version_match = S3_VERSION_REGEX.match(self.object_name)
 208           if version_match:
 209             self.object_name = version_match.group('object')
 210             self.generation = version_match.group('version_id')
 211       else:
 212         raise InvalidUrlError(
 213             'CloudUrl: URL string %s did not match URL regex' % url_string)
 214
 215   def Clone(self):
 216     return _CloudUrl(self.url_string)
 217
 218   def IsFileUrl(self):
 219     return False
 220
 221   def IsCloudUrl(self):
 222     return True
 223
 224   def IsStream(self):
 225     raise NotImplementedError('IsStream not supported on CloudUrl')
 226
 227   def IsBucket(self):
 228     return bool(self.bucket_name and not self.object_name)
 229
 230   def IsObject(self):
 231     return bool(self.bucket_name and self.object_name)
 232
 233   def HasGeneration(self):
 234     return bool(self.generation)
 235
 236   def IsProvider(self):
 237     return bool(self.scheme and not self.bucket_name)
 238
 239   def CreatePrefixUrl(self, wildcard_suffix=None):
 240     prefix = StripOneSlash(self.versionless_url_string)
 241     if wildcard_suffix:
 242       prefix = '%s/%s' % (prefix, wildcard_suffix)
 243     return prefix
 244
 245   @property
 246   def bucket_url_string(self):
 247     return '%s://%s/' % (self.scheme, self.bucket_name)
 248
 249   @property
 250   def url_string(self):
 251     url_str = self.versionless_url_string
 252     if self.HasGeneration():
 253       url_str += '#%s' % self.generation
 254     return url_str
 255
 256   @property
 257   def versionless_url_string(self):
 258     if self.IsProvider():
 259       return '%s://' % self.scheme
 260     elif self.IsBucket():
 261       return self.bucket_url_string
 262     return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)
 263
 264   def __str__(self):
 265     return self.url_string
 266
 267
 268 def _GetSchemeFromUrlString(url_str):
 269   """Returns scheme component of a URL string."""
 270
 271   end_scheme_idx = url_str.find('://')
 272   if end_scheme_idx == -1:
 273     # File is the default scheme.
 274     return 'file'
 275   else:
 276     return url_str[0:end_scheme_idx].lower()
 277
 278
 279 def _GetPathFromUrlString(url_str):
 280   """Returns path component of a URL string."""
 281
 282   end_scheme_idx = url_str.find('://')
 283   if end_scheme_idx == -1:
 284     return url_str
 285   else:
 286     return url_str[end_scheme_idx + 3:]
 287
 288
 289 def IsFileUrlString(url_str):
 290   """Returns whether a string is a file URL."""
 291
 292   return _GetSchemeFromUrlString(url_str) == 'file'
 293
 294
 295 def StorageUrlFromString(url_str):
 296   """Static factory function for creating a StorageUrl from a string."""
 297
 298   scheme = _GetSchemeFromUrlString(url_str)
 299
 300   if scheme not in ('file', 's3', 'gs'):
 301     raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
 302   if scheme == 'file':
 303     path = _GetPathFromUrlString(url_str)
 304     is_stream = (path == '-')
 305     return _FileUrl(url_str, is_stream=is_stream)
 306   return _CloudUrl(url_str)
 307
 308
 309 def StripOneSlash(url_str):
 310   if url_str and url_str.endswith('/'):
 311     return url_str[:-1]
 312   return url_str
 313
 314
 315 def ContainsWildcard(url_string):
 316   """Checks whether url_string contains a wildcard.
 317
 318   Args:
 319     url_string: URL string to check.
 320
 321   Returns:
 322     bool indicator.
 323   """
 324   return bool(WILDCARD_REGEX.search(url_string))