# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

"""File Interface for Google Cloud Storage."""

from __future__ import with_statement

import logging
import StringIO
import urllib
import xml.etree.cElementTree as ET

from . import api_utils
from . import common
from . import errors
from . import storage_api


def open(filename,
         mode='r',
         content_type=None,
         options=None,
         read_buffer_size=storage_api.ReadBuffer.DEFAULT_BUFFER_SIZE,
         retry_params=None,
         _account_id=None):
  """Opens a Google Cloud Storage file and returns it as a File-like object.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    mode: 'r' for reading mode. 'w' for writing mode.
      In reading mode, the file must exist. In writing mode, a file will
      be created or overwritten.
    content_type: The MIME type of the file. str. Only valid in writing mode.
    options: A str->basestring dict of additional headers to pass to
      GCS, e.g. {'x-goog-acl': 'private', 'x-goog-meta-foo': 'foo'}.
      Supported options are x-goog-acl, x-goog-meta-, cache-control,
      content-disposition, and content-encoding.
      Only valid in writing mode.
      See https://developers.google.com/storage/docs/reference-headers
      for details.
    read_buffer_size: The buffer size for reads. The reader keeps one buffer
      and prefetches another. To minimize blocking for large files,
      always read by buffer size. To minimize the number of RPC requests for
      small files, set a large buffer size. Max is 30MB.
    retry_params: An instance of api_utils.RetryParams for subsequent calls
      to GCS from this file handle. If None, the default one is used.
    _account_id: Internal-use only.

  Returns:
    A reading or writing buffer that supports a File-like interface. The
    buffer must be closed after operations are done.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
    ValueError: invalid open mode, or if content_type or options are
      specified in reading mode.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  filename = api_utils._quote_filename(filename)

  if mode == 'w':
    common.validate_options(options)
    return storage_api.StreamingBuffer(api, filename, content_type, options)
  elif mode == 'r':
    if content_type or options:
      raise ValueError('Options and content_type can only be specified '
                       'for writing mode.')
    return storage_api.ReadBuffer(api,
                                  filename,
                                  buffer_size=read_buffer_size)
  else:
    raise ValueError('Invalid mode %s.' % mode)
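

# Usage sketch (not part of the original module). Assumes this package is
# importable as `cloudstorage` on a Python 2.7 App Engine runtime and that
# '/my_bucket' is a bucket the app can access:
#
#   import cloudstorage
#
#   # Write, then read back. Buffers must be closed; `with` handles that.
#   with cloudstorage.open('/my_bucket/greeting.txt', 'w',
#                          content_type='text/plain',
#                          options={'x-goog-acl': 'private'}) as gcs_file:
#     gcs_file.write('hello world')
#
#   with cloudstorage.open('/my_bucket/greeting.txt') as gcs_file:
#     contents = gcs_file.read()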


def delete(filename, retry_params=None, _account_id=None):
  """Delete a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Raises:
    errors.NotFoundError: if the file doesn't exist prior to deletion.
  """
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  common.validate_file_path(filename)
  filename = api_utils._quote_filename(filename)
  status, resp_headers, _ = api.delete_object(filename)
  errors.check_status(status, [204], filename, resp_headers=resp_headers)
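

# Sketch (not part of the original module): delete raises NotFoundError for
# a missing object, so callers that don't care can swallow it. Assumes the
# package imports as `cloudstorage`:
#
#   import cloudstorage
#   from cloudstorage import errors
#
#   try:
#     cloudstorage.delete('/my_bucket/old.log')
#   except errors.NotFoundError:
#     pass  # Already gone.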


def stat(filename, retry_params=None, _account_id=None):
  """Get GCSFileStat of a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Returns:
    A GCSFileStat object containing info about this file.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  status, headers, _ = api.head_object(api_utils._quote_filename(filename))
  errors.check_status(status, [200], filename, resp_headers=headers)
  file_stat = common.GCSFileStat(
      filename=filename,
      st_size=headers.get('content-length'),
      st_ctime=common.http_time_to_posix(headers.get('last-modified')),
      etag=headers.get('etag'),
      content_type=headers.get('content-type'),
      metadata=common.get_metadata(headers))

  return file_stat
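

# Sketch (not part of the original module): stat returns a GCSFileStat with
# filename, st_size, st_ctime (POSIX time), etag, content_type, and metadata
# populated. Assumes the package imports as `cloudstorage`:
#
#   info = cloudstorage.stat('/my_bucket/greeting.txt')
#   print '%s: %s bytes, type %s' % (
#       info.filename, info.st_size, info.content_type)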


def _copy2(src, dst, metadata=None, retry_params=None):
  """Copy the file content from src to dst.

  Internal use only!

  Args:
    src: /bucket/filename
    dst: /bucket/filename
    metadata: a dict of metadata for this copy. If None, old metadata is
      copied. For example, {'x-goog-meta-foo': 'bar'}.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(src)
  common.validate_file_path(dst)

  if metadata is None:
    metadata = {}
    copy_meta = 'COPY'
  else:
    copy_meta = 'REPLACE'
  metadata.update({'x-goog-copy-source': src,
                   'x-goog-metadata-directive': copy_meta})

  api = storage_api._get_storage_api(retry_params=retry_params)
  status, resp_headers, _ = api.put_object(
      api_utils._quote_filename(dst), headers=metadata)
  errors.check_status(status, [200], src, metadata, resp_headers)
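

# Sketch (not part of the original module): _copy2 is internal-use only, but
# a server-side copy within GCS looks like this. With metadata=None the
# source's metadata is copied ('COPY'); otherwise it is replaced ('REPLACE'):
#
#   _copy2('/my_bucket/src.txt', '/my_bucket/dst.txt')
#   _copy2('/my_bucket/src.txt', '/my_bucket/dst.txt',
#          metadata={'x-goog-meta-owner': 'me'})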


def listbucket(path_prefix, marker=None, prefix=None, max_keys=None,
               delimiter=None, retry_params=None, _account_id=None):
  """Returns a GCSFileStat iterator over a bucket.

  Optional arguments can limit the result to a subset of files under bucket.

  This function has two modes:
  1. List bucket mode: Lists all files in the bucket without any concept of
     hierarchy. GCS doesn't have real directory hierarchies.
  2. Directory emulation mode: If you specify the 'delimiter' argument,
     it is used as a path separator to emulate a hierarchy of directories.
     In this mode, the "path_prefix" argument should end in the delimiter
     specified (thus designating a logical directory). The logical
     directory's contents, both files and subdirectories, are listed. The
     names of subdirectories returned will end with the delimiter, so
     listbucket can be called with the subdirectory name to list the
     subdirectory's contents.

  Args:
    path_prefix: A Google Cloud Storage path of format "/bucket" or
      "/bucket/prefix". Only objects whose fullpath starts with the
      path_prefix will be returned.
    marker: Another path prefix. Only objects whose fullpath starts
      lexicographically after marker will be returned (exclusive).
    prefix: Deprecated. Use path_prefix.
    max_keys: The limit on the number of objects to return. int.
      For best performance, specify max_keys only if you know how many objects
      you want. Otherwise, this method requests large batches and handles
      pagination for you.
    delimiter: Use to turn on directory mode. str of one or multiple chars
      that your bucket uses as its directory separator.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Examples:
    For files "/bucket/a",
              "/bucket/bar/1",
              "/bucket/foo",
              "/bucket/foo/1", "/bucket/foo/2/1", "/bucket/foo/3/1":

    Regular mode:
    listbucket("/bucket/f", marker="/bucket/foo/1")
    will match "/bucket/foo/2/1", "/bucket/foo/3/1".

    Directory mode:
    listbucket("/bucket/", delimiter="/")
    will match "/bucket/a", "/bucket/bar/", "/bucket/foo", "/bucket/foo/".
    listbucket("/bucket/foo/", delimiter="/")
    will match "/bucket/foo/1", "/bucket/foo/2/", "/bucket/foo/3/".

  Returns:
    Regular mode:
    A GCSFileStat iterator over matched files ordered by filename.
    The iterator returns GCSFileStat objects. filename, etag, st_size,
    st_ctime, and is_dir are set.

    Directory emulation mode:
    A GCSFileStat iterator over matched files and directories ordered by
    name. The iterator returns GCSFileStat objects. For directories,
    only the filename and is_dir fields are set.

    The last name yielded can be used as the next call's marker.
  """
  common.validate_bucket_path(path_prefix)
  bucket, prefix = common._process_path_prefix(path_prefix)

  if marker and marker.startswith(bucket):
    marker = marker[len(bucket) + 1:]

  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  options = {}
  if marker:
    options['marker'] = marker
  if max_keys:
    options['max-keys'] = max_keys
  if prefix:
    options['prefix'] = prefix
  if delimiter:
    options['delimiter'] = delimiter

  return _Bucket(api, bucket, options)
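

# Sketch (not part of the original module): consuming the iterator in
# directory emulation mode; assumes the package imports as `cloudstorage`.
#
#   for entry in cloudstorage.listbucket('/my_bucket/photos/', delimiter='/'):
#     if entry.is_dir:
#       print 'subdir: %s' % entry.filename
#     else:
#       print '%s (%s bytes)' % (entry.filename, entry.st_size)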


class _Bucket(object):
  """A wrapper for a GCS bucket as the return value of listbucket."""

  def __init__(self, api, path, options):
    """Initialize.

    Args:
      api: storage_api instance.
      path: bucket path of form '/bucket'.
      options: a dict of listbucket options. Please see the listbucket doc.
    """
    self._init(api, path, options)

  def _init(self, api, path, options):
    self._api = api
    self._path = path
    self._options = options.copy()
    self._get_bucket_fut = self._api.get_bucket_async(
        self._path + '?' + urllib.urlencode(self._options))
    self._last_yield = None
    self._new_max_keys = self._options.get('max-keys')

  def __getstate__(self):
    # Save enough state so a pickled iterator can resume after the last
    # yielded name.
    options = self._options
    if self._last_yield:
      options['marker'] = self._last_yield.filename[len(self._path) + 1:]
    if self._new_max_keys is not None:
      options['max-keys'] = self._new_max_keys
    return {'api': self._api,
            'path': self._path,
            'options': options}

  def __setstate__(self, state):
    self._init(state['api'], state['path'], state['options'])

  def __iter__(self):
    """Iterate over the bucket.

    Yields:
      GCSFileStat: a GCSFileStat for an object in the bucket.
        They are ordered by GCSFileStat.filename.
    """
    total = 0
    max_keys = self._options.get('max-keys')
    while self._get_bucket_fut:
      status, resp_headers, content = self._get_bucket_fut.get_result()
      errors.check_status(status, [200], self._path,
                          resp_headers=resp_headers, extras=self._options)

      if self._should_get_another_batch(content):
        self._get_bucket_fut = self._api.get_bucket_async(
            self._path + '?' + urllib.urlencode(self._options))
      else:
        self._get_bucket_fut = None

      root = ET.fromstring(content)
      dirs = self._next_dir_gen(root)
      files = self._next_file_gen(root)
      next_file = files.next()
      next_dir = dirs.next()

      # Merge the file and directory streams in lexicographic order.
      while ((max_keys is None or total < max_keys) and
             not (next_file is None and next_dir is None)):
        total += 1
        if next_file is None:
          self._last_yield = next_dir
          next_dir = dirs.next()
        elif next_dir is None:
          self._last_yield = next_file
          next_file = files.next()
        elif next_dir < next_file:
          self._last_yield = next_dir
          next_dir = dirs.next()
        elif next_file < next_dir:
          self._last_yield = next_file
          next_file = files.next()
        else:
          logging.error(
              'Should never reach. next file is %r. next dir is %r.',
              next_file, next_dir)
        if self._new_max_keys:
          self._new_max_keys -= 1
        yield self._last_yield

  def _next_file_gen(self, root):
    """Generator for the next file element in the document.

    Args:
      root: root element of the XML tree.

    Yields:
      GCSFileStat for the next file.
    """
    for e in root.getiterator(common._T_CONTENTS):
      st_ctime, size, etag, key = None, None, None, None
      for child in e.getiterator('*'):
        if child.tag == common._T_LAST_MODIFIED:
          st_ctime = common.dt_str_to_posix(child.text)
        elif child.tag == common._T_ETAG:
          etag = child.text
        elif child.tag == common._T_SIZE:
          size = child.text
        elif child.tag == common._T_KEY:
          key = child.text
      yield common.GCSFileStat(self._path + '/' + key,
                               size, etag, st_ctime)
      e.clear()
    yield None

  def _next_dir_gen(self, root):
    """Generator for the next directory element in the document.

    Args:
      root: root element in the XML tree.

    Yields:
      GCSFileStat for the next directory.
    """
    for e in root.getiterator(common._T_COMMON_PREFIXES):
      yield common.GCSFileStat(
          self._path + '/' + e.find(common._T_PREFIX).text,
          st_size=None, etag=None, st_ctime=None, is_dir=True)
      e.clear()
    yield None

  def _should_get_another_batch(self, content):
    """Whether to issue another GET bucket call.

    Args:
      content: response XML.

    Returns:
      True if another batch should be fetched; also updates self._options
      for the next request.
    """
    if ('max-keys' in self._options and
        self._options['max-keys'] <= common._MAX_GET_BUCKET_RESULT):
      return False

    elements = self._find_elements(
        content, set([common._T_IS_TRUNCATED,
                      common._T_NEXT_MARKER]))
    if elements.get(common._T_IS_TRUNCATED, 'false').lower() != 'true':
      return False

    next_marker = elements.get(common._T_NEXT_MARKER)
    if next_marker is None:
      self._options.pop('marker', None)
      return False
    self._options['marker'] = next_marker
    return True

  def _find_elements(self, result, elements):
    """Find interesting elements from XML.

    This function tries to look only for the specified elements,
    without parsing the entire XML. The specified elements are best
    located near the beginning.

    Args:
      result: response XML.
      elements: a set of interesting element tags.

    Returns:
      A dict from element tag to element value.
    """
    element_mapping = {}
    result = StringIO.StringIO(result)
    for _, e in ET.iterparse(result, events=('end',)):
      if not elements:
        break
      if e.tag in elements:
        element_mapping[e.tag] = e.text
        elements.remove(e.tag)
    return element_mapping
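

# Sketch (not part of the original module): _find_elements stops parsing as
# soon as every requested tag is seen. The plain tag names below are
# hypothetical; the real tags in common.* carry the full XML namespace.
#
#   bucket = _Bucket(api, '/my_bucket', {})
#   xml_doc = ('<r><IsTruncated>true</IsTruncated>'
#              '<NextMarker>foo</NextMarker><Contents/></r>')
#   bucket._find_elements(xml_doc, set(['IsTruncated', 'NextMarker']))
#   # => {'IsTruncated': 'true', 'NextMarker': 'foo'}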