# -*- coding: utf-8 -*-
# Copyright 2011 Google Inc. All Rights Reserved.
# Copyright 2011, Nexenta Systems Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of Unix-like cp command for cloud storage providers."""

from __future__ import absolute_import

import os
import time
import traceback

from gslib import copy_helper
from gslib.cat_helper import CatHelper
from gslib.cloud_api import AccessDeniedException
from gslib.cloud_api import NotFoundException
from gslib.command import Command
from gslib.command_argument import CommandArgument
from gslib.commands.compose import MAX_COMPONENT_COUNT
from gslib.copy_helper import CreateCopyHelperOpts
from gslib.copy_helper import ItemExistsError
from gslib.copy_helper import Manifest
from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE
from gslib.copy_helper import SkipUnsupportedObjectError
from gslib.cs_api_map import ApiSelector
from gslib.exception import CommandException
from gslib.name_expansion import NameExpansionIterator
from gslib.storage_url import ContainsWildcard
from gslib.util import CreateLock
from gslib.util import GetCloudApiInstance
from gslib.util import IsCloudSubdirPlaceholder
from gslib.util import MakeHumanReadable
from gslib.util import NO_MAX
from gslib.util import RemoveCRLFFromString
from gslib.util import StdinIterator

_SYNOPSIS = """
  gsutil cp [OPTION]... src_url dst_url
  gsutil cp [OPTION]... src_url... dst_url
  gsutil cp [OPTION]... -I dst_url
"""

_SYNOPSIS_TEXT = """
<B>SYNOPSIS</B>
""" + _SYNOPSIS

_DESCRIPTION_TEXT = """
<B>DESCRIPTION</B>
  The gsutil cp command allows you to copy data between your local file
  system and the cloud, copy data within the cloud, and copy data between
  cloud storage providers. For example, to copy all text files from the
  local directory to a bucket you could do:

    gsutil cp *.txt gs://my_bucket

  Similarly, you can download text files from a bucket by doing:

    gsutil cp gs://my_bucket/*.txt .

  If you want to copy an entire directory tree you need to use the -r option:

    gsutil cp -r dir gs://my_bucket

  If you have a large number of files to upload you might want to use the
  gsutil -m option, to perform a parallel (multi-threaded/multi-processing)
  copy:

    gsutil -m cp -r dir gs://my_bucket

  You can pass a list of URLs (one per line) to copy on stdin instead of as
  command line arguments by using the -I option. This allows you to use gsutil
  in a pipeline to upload or download files / objects as generated by a program,
  such as:

    some_program | gsutil -m cp -I gs://my_bucket

  or:

    some_program | gsutil -m cp -I ./download_dir

  The contents of stdin can name files, cloud URLs, and wildcards of files
  and cloud URLs.
"""

_NAME_CONSTRUCTION_TEXT = """
<B>HOW NAMES ARE CONSTRUCTED</B>
  The gsutil cp command strives to name objects in a way consistent with how
  Linux cp works, which causes names to be constructed in varying ways depending
  on whether you're performing a recursive directory copy or copying
  individually named objects; and whether you're copying to an existing or
  non-existent directory.

  When performing recursive directory copies, object names are constructed
  that mirror the source directory structure starting at the point of
  recursive processing. For example, the command:

    gsutil cp -r dir1/dir2 gs://my_bucket

  will create objects named like gs://my_bucket/dir2/a/b/c, assuming
  dir1/dir2 contains the file a/b/c.

  In contrast, copying individually named files will result in objects named
  by the final path component of the source files. For example, the command:

    gsutil cp dir1/dir2/** gs://my_bucket

  will create objects named like gs://my_bucket/c.

  The same rules apply for downloads: recursive copies of buckets and
  bucket subdirectories produce a mirrored filename structure, while copying
  individually (or wildcard) named objects produce flatly named files.

  Note that in the above example the '**' wildcard matches all names
  anywhere under dir. The wildcard '*' will match names just one level deep. For
  more details see 'gsutil help wildcards'.

  There's an additional wrinkle when working with subdirectories: the resulting
  names depend on whether the destination subdirectory exists. For example,
  if gs://my_bucket/subdir exists as a subdirectory, the command:

    gsutil cp -r dir1/dir2 gs://my_bucket/subdir

  will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast,
  if gs://my_bucket/subdir does not exist, this same gsutil cp command will
  create objects named like gs://my_bucket/subdir/a/b/c.

  Note: If you use the
  `Google Developers Console <https://console.developers.google.com>`_
  to create folders, it does so by creating a "placeholder" object that ends
  with a "/" character. gsutil skips these objects when downloading from the
  cloud to the local file system, because attempting to create a file that
  ends with a "/" is not allowed on Linux and MacOS. Because of this, it is
  recommended that you not create objects that end with "/" (unless you don't
  need to be able to download such objects using gsutil).
"""

_SUBDIRECTORIES_TEXT = """
<B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B>
  You can use gsutil to copy to and from subdirectories by using a command
  like:

    gsutil cp -r dir gs://my_bucket/data

  This will cause dir and all of its files and nested subdirectories to be
  copied under the specified destination, resulting in objects with names like
  gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket
  subdirectories by using a command like:

    gsutil cp -r gs://my_bucket/data dir

  This will cause everything nested under gs://my_bucket/data to be downloaded
  into dir, resulting in files with names like dir/data/a/b/c.

  Copying subdirectories is useful if you want to add data to an existing
  bucket directory structure over time. It's also useful if you want
  to parallelize uploads and downloads across multiple machines (often
  reducing overall transfer time compared with simply running gsutil -m
  cp on one machine). For example, if your bucket contains this structure:

    gs://my_bucket/data/result_set_01/
    gs://my_bucket/data/result_set_02/
    ...
    gs://my_bucket/data/result_set_99/

  you could perform concurrent downloads across 3 machines by running these
  commands on each machine, respectively:

    gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir
    gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir
    gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir

  Note that dir could be a local directory on each machine, or it could
  be a directory mounted off of a shared file server; whether the latter
  performs acceptably may depend on a number of things, so we recommend
  you experiment and find out what works best for you.
"""

_COPY_IN_CLOUD_TEXT = """
<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
  If both the source and destination URL are cloud URLs from the same
  provider, gsutil copies data "in the cloud" (i.e., without downloading
  to and uploading from the machine where you run gsutil). In addition to
  the performance and cost advantages of doing this, copying in the cloud
  preserves metadata (like Content-Type and Cache-Control). In contrast,
  when you download data from the cloud it ends up in a file, which has
  no associated metadata. Thus, unless you have some way to hold on to
  or re-create that metadata, downloading to a file will not retain the
  metadata.

  Copies spanning locations and/or storage classes cause data to be rewritten
  in the cloud, which may take some time. Such operations can be resumed with
  the same command if they are interrupted, so long as the command parameters
  are identical.

  Note that by default, the gsutil cp command does not copy the object
  ACL to the new object, and instead will use the default bucket ACL (see
  "gsutil help defacl"). You can override this behavior with the -p
  option (see OPTIONS below).
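
  For example, to preserve the source object's ACL when copying in the cloud
  (this assumes you have OWNER access to the source object, as described under
  the -p option in OPTIONS below), you could run:

    gsutil cp -p gs://bucket1/obj gs://bucket2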

  One additional note about copying in the cloud: If the destination bucket has
  versioning enabled, gsutil cp will copy all versions of the source object(s).
  For example:

    gsutil cp gs://bucket1/obj gs://bucket2

  will cause all versions of gs://bucket1/obj to be copied to gs://bucket2.
"""

_CHECKSUM_VALIDATION_TEXT = """
<B>CHECKSUM VALIDATION</B>
  At the end of every upload or download the gsutil cp command validates that
  the checksum it computes for the source file/object matches the checksum
  the service computes. If the checksums do not match, gsutil will delete the
  corrupted object and print a warning message. This very rarely happens, but
  if it does, please contact gs-team@google.com.

  If you know the MD5 of a file before uploading you can specify it in the
  Content-MD5 header, which will cause the cloud storage service to reject the
  upload if the MD5 doesn't match the value computed by the service. For
  example:

    % gsutil hash obj
    Hashing obj:
    Hashes [base64] for obj:
        Hash (crc32c): lIMoIw==
        Hash (md5):    VgyllJgiiaRAbyUUIqDMmw==

    % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj
    Copying file://obj [Content-Type=text/plain]...
    Uploading gs://your-bucket/obj: 182 b/182 B

  If the checksum didn't match the service would instead reject the upload and
  gsutil would print a message like:

    BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw=="
    doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==".

  Even if you don't do this gsutil will delete the object if the computed
  checksum mismatches, but specifying the Content-MD5 header has three
  advantages:

  1. It prevents the corrupted object from becoming visible at all, whereas
     otherwise it would be visible for 1-3 seconds before gsutil deletes it.

  2. It will definitively prevent the corrupted object from being left in
     the cloud, whereas the gsutil approach of deleting after the upload
     completes could fail if (for example) the gsutil process gets ^C'd
     between upload and deletion request.

  3. It supports a customer-to-service integrity check handoff. For example,
     if you have a content production pipeline that generates data to be
     uploaded to the cloud along with checksums of that data, specifying the
     MD5 computed by your content pipeline when you run gsutil cp will ensure
     that the checksums match all the way through the process (e.g., detecting
     if data gets corrupted on your local disk between the time it was written
     by your content pipeline and the time it was uploaded to GCS).

  Note: The Content-MD5 header is ignored for composite objects, because such
  objects only have a CRC32C checksum.
"""

_RETRY_HANDLING_TEXT = """
<B>RETRY HANDLING</B>
  The cp command will retry when failures occur, but if enough failures happen
  during a particular copy or delete operation the command will skip that object
  and move on. At the end of the copy run if any failures were not successfully
  retried, the cp command will report the count of failures, and exit with
  non-zero status.

  Note that there are cases where retrying will never succeed, such as if you
  don't have write permission to the destination bucket or if the destination
  path for some objects is longer than the maximum allowed length.

  For more details about gsutil's retry handling, please see
  "gsutil help retries".
"""

_RESUMABLE_TRANSFERS_TEXT = """
<B>RESUMABLE TRANSFERS</B>
  gsutil automatically uses the Google Cloud Storage resumable upload feature
  whenever you use the cp command to upload an object that is larger than 2
  MiB. You do not need to specify any special command line options to make this
  happen. If your upload is interrupted you can restart the upload by running
  the same cp command that you ran to start the upload. Until the upload
  has completed successfully, it will not be visible at the destination object
  and will not replace any existing object the upload is intended to overwrite.
  (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave
  temporary component objects in place during the upload process.)

  Similarly, gsutil automatically performs resumable downloads (using HTTP
  standard Range GET operations) whenever you use the cp command, unless the
  destination is a stream or null. In this case the partially downloaded file
  will be visible as soon as it starts being written. Thus, before you attempt
  to use any files downloaded by gsutil you should make sure the download
  completed successfully, by checking the exit status from the gsutil command.
  This can be done in a bash script, for example, by doing:

    gsutil cp gs://your-bucket/your-object ./local-file
    if [ "$status" -ne "0" ] ; then
      << Code that handles failures >>
    fi
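
  The "$status" variable in the snippet above is assumed to hold the exit code
  from the gsutil command; one way to populate it is to capture the exit code
  on the line immediately following the gsutil command, for example:

    gsutil cp gs://your-bucket/your-object ./local-file
    status=$?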

  Resumable uploads and downloads store some state information in a file
  in ~/.gsutil named by the destination object or file. If you attempt to
  resume a transfer from a machine with a different directory, the transfer
  will start over from scratch.

  See also "gsutil help prod" for details on using resumable transfers
  in production.
"""

_STREAMING_TRANSFERS_TEXT = """
<B>STREAMING TRANSFERS</B>
  Use '-' in place of src_url or dst_url to perform a streaming
  transfer. For example:

    long_running_computation | gsutil cp - gs://my_bucket/obj
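
  Similarly, you can stream a download to another program by using '-' as the
  source URL, for example:

    gsutil cp gs://my_bucket/obj - | some_program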

  Streaming uploads using the JSON API (see "gsutil help apis") are buffered in
  memory and can retry in the event of network flakiness or service errors.

  Streaming transfers (other than uploads using the JSON API) do not support
  resumable uploads/downloads. If you have a large amount of data to upload
  (say, more than 100 MiB) it is recommended to write the data to a local file
  and then copy that file to the cloud rather than streaming it (and similarly
  for large downloads).

  WARNING: When performing streaming transfers gsutil does not compute a
  checksum of the uploaded or downloaded data. Therefore, we recommend that
  users either perform their own validation of the data or use non-streaming
  transfers (which perform integrity checking automatically).
"""

_PARALLEL_COMPOSITE_UPLOADS_TEXT = """
<B>PARALLEL COMPOSITE UPLOADS</B>
  gsutil can automatically use
  `object composition <https://developers.google.com/storage/docs/composite-objects>`_
  to perform uploads in parallel for large, local files being uploaded to Google
  Cloud Storage. This means that, if enabled (see next paragraph), a large file
  will be split into component pieces that will be uploaded in parallel. Those
  components will then be composed in the cloud, and the temporary components in
  the cloud will be deleted after successful composition. No additional local
  disk space is required for this operation.

  Using parallel composite uploads presents a tradeoff between upload
  performance and download configuration: If you enable parallel composite
  uploads your uploads will run faster, but someone will need to install a
  compiled crcmod (see "gsutil help crcmod") on every machine where objects are
  downloaded by gsutil or other Python applications. For some distributions this
  is easy (e.g., it comes pre-installed on MacOS), but in some cases users have
  found it difficult. Because of this at present parallel composite uploads are
  disabled by default. Google is actively working with a number of the Linux
  distributions to get crcmod included with the stock distribution. Once that is
  done we will re-enable parallel composite uploads by default in gsutil.

  To try parallel composite uploads you can run the command:

    gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket

  where bigfile is larger than 150 MiB. When you do this notice that the upload
  progress indicator continuously updates for several different uploads at once
  (corresponding to each of the sections of the file being uploaded in
  parallel), until the parallel upload completes. If you then want to enable
  parallel composite uploads for all of your future uploads (notwithstanding the
  caveats mentioned earlier), you can uncomment and set the
  "parallel_composite_upload_threshold" config value in your .boto configuration
  file to this value.
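
  For example, the corresponding entry in the "GSUtil" section of the .boto
  configuration file would look like this:

    [GSUtil]
    parallel_composite_upload_threshold = 150M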

  Note that the crcmod problem only impacts downloads via Python applications
  (such as gsutil). If any users who need to download the data using gsutil or
  other Python applications can install crcmod, it makes sense to enable
  parallel composite uploads (see above). For example, if you use gsutil to
  upload video assets and those assets will only ever be served via a Java
  application (there are efficient crc32c implementations available in Java), it
  would make sense to enable parallel composite uploads on your machine.

  If a parallel composite upload fails prior to composition, re-running the
  gsutil command will take advantage of resumable uploads for those components
  that failed, and the component objects will be deleted after the first
  successful attempt. Any temporary objects that were uploaded successfully
  before gsutil failed will still exist until the upload is completed
  successfully. The temporary objects will be named in the following fashion:

    <random ID>%s<hash>

  where <random ID> is some numerical value, and <hash> is an MD5 hash (not
  related to the hash of the contents of the file or object).

  To avoid leaving temporary objects around, you should make sure to check the
  exit status from the gsutil command. This can be done in a bash script, for
  example, by doing:

    gsutil cp ./local-file gs://your-bucket/your-object
    if [ "$status" -ne "0" ] ; then
      << Code that handles failures >>
    fi

  Or, for copying a directory, use this instead:

    gsutil cp -c -L cp.log -r ./dir gs://bucket
    if [ "$status" -ne "0" ] ; then
      << Code that handles failures >>
    fi

  One important caveat is that files uploaded in this fashion are still subject
  to the maximum number of components limit. For example, if you upload a large
  file that gets split into %d components, and try to compose it with another
  object with %d components, the operation will fail because it exceeds the %d
  component limit. If you wish to compose an object later and the component
  limit is a concern, it is recommended that you disable parallel composite
  uploads for that transfer.

  Also note that an object uploaded using this feature will have a CRC32C hash,
  but it will not have an MD5 hash (and because of that, requires users who
  download the object to have crcmod installed, as noted earlier). For details
  see 'gsutil help crc32c'.

  Note that this feature can be completely disabled by setting the
  "parallel_composite_upload_threshold" variable in the .boto config file to 0.
""" % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9,
       MAX_COMPONENT_COUNT)

_CHANGING_TEMP_DIRECTORIES_TEXT = """
<B>CHANGING TEMP DIRECTORIES</B>
  gsutil writes data to a temporary directory in several cases:

  - when compressing data to be uploaded (see the -z option)
  - when decompressing data being downloaded (when the data has
    Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z)
  - when running integration tests (using the gsutil test command)

  In these cases it's possible the temp file location on your system that
  gsutil selects by default may not have enough space. If you find that
  gsutil runs out of space during one of these operations (e.g., raising
  "CommandException: Inadequate temp space available to compress <your file>"
  during a gsutil cp -z operation), you can change where it writes these
  temp files by setting the TMPDIR environment variable. On Linux and MacOS
  you can do this either by running gsutil this way:

    TMPDIR=/some/directory gsutil cp ...

  or by adding this line to your ~/.bashrc file and then restarting the shell
  before running gsutil:

    export TMPDIR=/some/directory

  On Windows 7 you can change the TMPDIR environment variable from Start ->
  Computer -> System -> Advanced System Settings -> Environment Variables.
  You need to reboot after making this change for it to take effect. (Rebooting
  is not necessary after running the export command on Linux and MacOS.)
"""

_OPTIONS_TEXT = """
<B>OPTIONS</B>
  -a canned_acl  Sets named canned_acl when uploaded objects are created. See
                 'gsutil help acls' for further details.

  -c             If an error occurs, continue to attempt to copy the remaining
                 files. If any copies were unsuccessful, gsutil's exit status
                 will be non-zero even if this flag is set. This option is
                 implicitly set when running "gsutil -m cp...". Note: -c only
                 applies to the actual copying operation. If an error occurs
                 while iterating over the files in the local directory (e.g.,
                 invalid Unicode file name) gsutil will print an error message
                 and abort.

  -D             Copy in "daisy chain" mode, i.e., copying between two buckets
                 by hooking a download to an upload, via the machine where
                 gsutil is run. By default, data are copied between two buckets
                 "in the cloud", i.e., without needing to copy via the machine
                 where gsutil runs.

                 By default, a "copy in the cloud" when the source is a
                 composite object will retain the composite nature of the
                 object. However, daisy chain mode can be used to change a
                 composite object into a non-composite object. For example:

                   gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp
                   gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj

                 Note: Daisy chain mode is automatically used when copying
                 between providers (e.g., to copy data from Google Cloud Storage
                 to another provider).

  -e             Exclude symlinks. When specified, symbolic links will not be
                 copied.

  -I             Causes gsutil to read the list of files or objects to copy from
                 stdin. This allows you to run a program that generates the list
                 of files to upload/download.

  -L <file>      Outputs a manifest log file with detailed information about
                 each item that was copied. This manifest contains the following
                 information for each item:

                 - Source path.
                 - Destination path.
                 - Source size.
                 - Bytes transferred.
                 - MD5 hash.
                 - UTC date and time transfer was started in ISO 8601 format.
                 - UTC date and time transfer was completed in ISO 8601 format.
                 - Upload id, if a resumable upload was performed.
                 - Final result of the attempted transfer, success or failure.
                 - Failure details, if any.

                 If the log file already exists, gsutil will use the file as an
                 input to the copy process, and will also append log items to
                 the existing file. Files/objects that are marked in the
                 existing log file as having been successfully copied (or
                 skipped) will be ignored. Files/objects without entries will be
                 copied and ones previously marked as unsuccessful will be
                 retried. This can be used in conjunction with the -c option to
                 build a script that copies a large number of objects reliably,
                 using a bash script like the following:

                   until gsutil cp -c -L cp.log -r ./dir gs://bucket; do
                     sleep 1
                   done

                 The -c option will cause copying to continue after failures
                 occur, and the -L option will allow gsutil to pick up where it
                 left off without duplicating work. The loop will continue
                 running as long as gsutil exits with a non-zero status (such a
                 status indicates there was at least one failure during the
                 gsutil run).

                 Note: If you're trying to synchronize the contents of a
                 directory and a bucket (or two buckets), see
                 'gsutil help rsync'.

  -n             No-clobber. When specified, existing files or objects at the
                 destination will not be overwritten. Any items that are skipped
                 by this option will be reported as being skipped. This option
                 will perform an additional GET request to check if an item
                 exists before attempting to upload the data. This will save
                 retransmitting data, but the additional HTTP requests may make
                 small object transfers slower and more expensive.
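
                 For example, to upload a directory tree while leaving any
                 files or objects that already exist at the destination
                 untouched:

                   gsutil cp -n -r dir gs://my_bucket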

  -p             Causes ACLs to be preserved when copying in the cloud. Note
                 that this option has performance and cost implications when
                 using the XML API, as it requires separate HTTP calls for
                 interacting with ACLs. The performance issue can be mitigated
                 to some degree by using gsutil -m cp to cause parallel copying.
                 Also, this option only works if you have OWNER access to all of
                 the objects that are copied.

                 You can avoid the additional performance and cost of using
                 cp -p if you want all objects in the destination bucket to end
                 up with the same ACL by setting a default object ACL on that
                 bucket instead of using cp -p. See "gsutil help defacl".

                 Note that it's not valid to specify both the -a and -p options
                 together.

  -R, -r         Causes directories, buckets, and bucket subdirectories to be
                 copied recursively. If you neglect to use this option for
                 an upload, gsutil will copy any files it finds and skip any
                 directories. Similarly, neglecting to specify -r for a download
                 will cause gsutil to copy any objects at the current bucket
                 directory level, and skip any subdirectories.

  -U             Skip objects with unsupported object types instead of failing.
                 Unsupported object types are S3 Glacier objects.

  -v             Requests that the version-specific URL for each uploaded object
                 be printed. Given this URL you can make future upload requests
                 that are safe in the face of concurrent updates, because Google
                 Cloud Storage will refuse to perform the update if the current
                 object version doesn't match the version-specific URL. See
                 'gsutil help versions' for more details.

  -z <ext,...>   Applies gzip content-encoding to file uploads with the given
                 extensions. This is useful when uploading files with
                 compressible content (such as .js, .css, or .html files)
                 because it saves network bandwidth and space in Google Cloud
                 Storage, which in turn reduces storage costs.

                 When you specify the -z option, the data from your files is
                 compressed before it is uploaded, but your actual files are
                 left uncompressed on the local disk. The uploaded objects
                 retain the Content-Type and name of the original files but are
                 given a Content-Encoding header with the value "gzip" to
                 indicate that the object data stored are compressed on the
                 Google Cloud Storage servers.

                 For example, the following command:

                   gsutil cp -z html -a public-read cattypes.html gs://mycats

                 will do all of the following:

                 - Upload as the object gs://mycats/cattypes.html (cp command)
                 - Set the Content-Type to text/html (based on file extension)
                 - Compress the data in the file cattypes.html (-z option)
                 - Set the Content-Encoding to gzip (-z option)
                 - Set the ACL to public-read (-a option)
                 - If a user tries to view cattypes.html in a browser, the
                   browser will know to uncompress the data based on the
                   Content-Encoding header, and to render it as HTML based on
                   the Content-Type header.

                 Note that if you download an object with Content-Encoding:gzip
                 gsutil will decompress the content before writing the local
                 file.
"""

_DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT,
                                   _DESCRIPTION_TEXT,
                                   _NAME_CONSTRUCTION_TEXT,
                                   _SUBDIRECTORIES_TEXT,
                                   _COPY_IN_CLOUD_TEXT,
                                   _CHECKSUM_VALIDATION_TEXT,
                                   _RETRY_HANDLING_TEXT,
                                   _RESUMABLE_TRANSFERS_TEXT,
                                   _STREAMING_TRANSFERS_TEXT,
                                   _PARALLEL_COMPOSITE_UPLOADS_TEXT,
                                   _CHANGING_TEMP_DIRECTORIES_TEXT,
                                   _OPTIONS_TEXT])

CP_SUB_ARGS = 'a:cDeIL:MNnprRtUvz:'


def _CopyFuncWrapper(cls, args, thread_state=None):
  cls.CopyFunc(args, thread_state=thread_state)


def _CopyExceptionHandler(cls, e):
  """Simple exception handler to allow post-completion status."""
  cls.logger.error(str(e))
  cls.op_failure_count += 1
  cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
                   traceback.format_exc())


def _RmExceptionHandler(cls, e):
  """Simple exception handler to allow post-completion status."""
  cls.logger.error(str(e))


class CpCommand(Command):
  """Implementation of gsutil cp command.

  Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
  happens by MvCommand calling CpCommand and passing the hidden (undocumented)
  -M option. This allows the copy and remove needed for each mv to run
  together (rather than first running all the cp's and then all the rm's, as
  we originally had implemented), which in turn avoids the following problem
  with removing the wrong objects: starting with a bucket containing only
  the object gs://bucket/obj, say the user does:
    gsutil mv gs://bucket/* gs://bucket/d.txt
  If we ran all the cp's and then all the rm's and we didn't expand the wildcard
  first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt,
  and the rm command would then remove that object. In the implementation
  prior to gsutil release 3.12 we avoided this by building a list of objects
  to process and then running the copies and then the removes; but building
  the list up front limits scalability (compared with the current approach
  of processing the bucket listing iterator on the fly).
  """

  # Command specification. See base class for documentation.
  command_spec = Command.CreateCommandSpec(
      'cp',
      command_name_aliases=['copy'],
      usage_synopsis=_SYNOPSIS,
      min_args=1,
      max_args=NO_MAX,
      # -t is deprecated but leave intact for now to avoid breakage.
      supported_sub_args=CP_SUB_ARGS,
      file_url_ok=True,
      provider_url_ok=False,
      urls_start_arg=0,
      gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
      gs_default_api=ApiSelector.JSON,
      supported_private_args=['testcallbackfile='],
      argparse_arguments=[
          CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument()
      ]
  )
  # Help specification. See help_provider.py for documentation.
  help_spec = Command.HelpSpec(
      help_name='cp',
      help_name_aliases=['copy'],
      help_type='command_help',
      help_one_line_summary='Copy files and objects',
      help_text=_DETAILED_HELP_TEXT,
      subcommand_help_text={},
  )

  # pylint: disable=too-many-statements
  def CopyFunc(self, name_expansion_result, thread_state=None):
    """Worker function for performing the actual copy (and rm, for mv)."""
    gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)

    copy_helper_opts = copy_helper.GetCopyHelperOpts()
    if copy_helper_opts.perform_mv:
      cmd_name = 'mv'
    else:
      cmd_name = self.command_name
    src_url = name_expansion_result.source_storage_url
    exp_src_url = name_expansion_result.expanded_storage_url
    src_url_names_container = name_expansion_result.names_container
    have_multiple_srcs = name_expansion_result.is_multi_source_request

    if src_url.IsCloudUrl() and src_url.IsProvider():
      raise CommandException(
          'The %s command does not allow provider-only source URLs (%s)' %
          (cmd_name, src_url))
    if have_multiple_srcs:
      copy_helper.InsistDstUrlNamesContainer(
          self.exp_dst_url, self.have_existing_dst_container, cmd_name)

    # Various GUI tools (like the GCS web console) create placeholder objects
    # ending with '/' when the user creates an empty directory. Normally these
    # tools should delete those placeholders once objects have been written
    # "under" the directory, but sometimes the placeholders are left around. We
    # need to filter them out here, otherwise if the user tries to rsync from
    # GCS to a local directory it will result in a directory/file conflict
    # (e.g., trying to download an object called "mydata/" where the local
    # directory "mydata" exists).
    if IsCloudSubdirPlaceholder(exp_src_url):
      self.logger.info('Skipping cloud sub-directory placeholder object (%s) '
                       'because such objects aren\'t needed in (and would '
                       'interfere with) directories in the local file system',
                       exp_src_url)
      return

    if copy_helper_opts.use_manifest and self.manifest.WasSuccessful(
        exp_src_url.url_string):
      return

    if copy_helper_opts.perform_mv:
      if name_expansion_result.names_container:
        # Use recursion_requested when performing name expansion for the
        # directory mv case so we can determine if any of the source URLs are
        # directories (and then use cp -r and rm -r to perform the move, to
        # match the behavior of Linux mv (which when moving a directory moves
        # all the contained files).
        self.recursion_requested = True
        # Disallow wildcard src URLs when moving directories, as supporting it
        # would make the name transformation too complex and would also be
        # dangerous (e.g., someone could accidentally move many objects to the
        # wrong name, or accidentally overwrite many objects).
        if ContainsWildcard(src_url.url_string):
          raise CommandException('The mv command disallows naming source '
                                 'directories using wildcards')

    if (self.exp_dst_url.IsFileUrl()
        and not os.path.exists(self.exp_dst_url.object_name)
        and have_multiple_srcs):
      os.makedirs(self.exp_dst_url.object_name)

    dst_url = copy_helper.ConstructDstUrl(
        src_url, exp_src_url, src_url_names_container, have_multiple_srcs,
        self.exp_dst_url, self.have_existing_dst_container,
        self.recursion_requested)
    dst_url = copy_helper.FixWindowsNaming(src_url, dst_url)

    copy_helper.CheckForDirFileConflict(exp_src_url, dst_url)
    if copy_helper.SrcDstSame(exp_src_url, dst_url):
      raise CommandException('%s: "%s" and "%s" are the same file - '
                             'abort.' % (cmd_name, exp_src_url, dst_url))

    if dst_url.IsCloudUrl() and dst_url.HasGeneration():
      raise CommandException('%s: a version-specific URL\n(%s)\ncannot be '
                             'the destination for gsutil cp - abort.'
                             % (cmd_name, dst_url))

    elapsed_time = bytes_transferred = 0
    try:
      if copy_helper_opts.use_manifest:
        self.manifest.Initialize(
            exp_src_url.url_string, dst_url.url_string)
      (elapsed_time, bytes_transferred, result_url, md5) = (
          copy_helper.PerformCopy(
              self.logger, exp_src_url, dst_url, gsutil_api,
              self, _CopyExceptionHandler, allow_splitting=True,
              headers=self.headers, manifest=self.manifest,
              gzip_exts=self.gzip_exts, test_method=self.test_method))
      if copy_helper_opts.use_manifest:
        if md5:
          self.manifest.Set(exp_src_url.url_string, 'md5', md5)
        self.manifest.SetResult(
            exp_src_url.url_string, bytes_transferred, 'OK')
      if copy_helper_opts.print_ver:
        # Some cases don't return a version-specific URL (e.g., if destination
        # is a file).
        self.logger.info('Created: %s', result_url)
    except ItemExistsError:
      message = 'Skipping existing item: %s' % dst_url
      self.logger.info(message)
      if copy_helper_opts.use_manifest:
        self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
    except SkipUnsupportedObjectError, e:
      message = ('Skipping item %s with unsupported object type %s' %
                 (exp_src_url.url_string, e.unsupported_type))
      self.logger.info(message)
      if copy_helper_opts.use_manifest:
        self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
    except copy_helper.FileConcurrencySkipError, e:
      self.logger.warn('Skipping copy of source URL %s because destination URL '
                       '%s is already being copied by another gsutil process '
                       'or thread (did you specify the same source URL twice?) '
                       % (src_url, dst_url))
    except Exception, e:
      if (copy_helper_opts.no_clobber and
          copy_helper.IsNoClobberServerException(e)):
        message = 'Rejected (noclobber): %s' % dst_url
        self.logger.info(message)
        if copy_helper_opts.use_manifest:
          self.manifest.SetResult(
              exp_src_url.url_string, 0, 'skip', message)
      elif self.continue_on_error:
        message = 'Error copying %s: %s' % (src_url, str(e))
        self.op_failure_count += 1
        self.logger.error(message)
        if copy_helper_opts.use_manifest:
          self.manifest.SetResult(
              exp_src_url.url_string, 0, 'error',
              RemoveCRLFFromString(message))
      else:
        if copy_helper_opts.use_manifest:
          self.manifest.SetResult(
              exp_src_url.url_string, 0, 'error', str(e))
        raise
    else:
      if copy_helper_opts.perform_mv:
        self.logger.info('Removing %s...', exp_src_url)
        if exp_src_url.IsCloudUrl():
          gsutil_api.DeleteObject(exp_src_url.bucket_name,
                                  exp_src_url.object_name,
                                  generation=exp_src_url.generation,
                                  provider=exp_src_url.scheme)
        else:
          os.unlink(exp_src_url.object_name)

    with self.stats_lock:
      self.total_elapsed_time += elapsed_time
      self.total_bytes_transferred += bytes_transferred

  # Command entry point.
  def RunCommand(self):
    copy_helper_opts = self._ParseOpts()

    self.total_elapsed_time = self.total_bytes_transferred = 0
    if self.args[-1] == '-' or self.args[-1] == 'file://-':
      return CatHelper(self).CatUrlStrings(self.args[:-1])

    if copy_helper_opts.read_args_from_stdin:
      if len(self.args) != 1:
        raise CommandException('Source URLs cannot be specified with -I option')
      url_strs = StdinIterator()
    else:
      if len(self.args) < 2:
        raise CommandException('Wrong number of arguments for "cp" command.')
      url_strs = self.args[:-1]

    (self.exp_dst_url, self.have_existing_dst_container) = (
        copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api,
                                         self.debug, self.project_id))

    # If the destination bucket has versioning enabled iterate with
    # all_versions=True. That way we'll copy all versions if the source bucket
    # is versioned; and by leaving all_versions=False if the destination bucket
    # has versioning disabled we will avoid copying old versions all to the same
    # un-versioned destination object.
    all_versions = False
    try:
      bucket = self._GetBucketWithVersioningConfig(self.exp_dst_url)
      if bucket and bucket.versioning and bucket.versioning.enabled:
        all_versions = True
    except AccessDeniedException:
      # This happens (in the XML API only) if the user doesn't have OWNER access
      # on the bucket (needed to check if versioning is enabled). In this case
      # fall back to copying all versions (which can be inefficient for the
      # reason noted in the comment above). We don't try to warn the user
      # because that would result in false positive warnings (since we can't
      # check if versioning is enabled on the destination bucket).
      #
      # For JSON, we will silently not return versioning if we don't have
      # access.
      all_versions = True

    name_expansion_iterator = NameExpansionIterator(
        self.command_name, self.debug,
        self.logger, self.gsutil_api, url_strs,
        self.recursion_requested or copy_helper_opts.perform_mv,
        project_id=self.project_id, all_versions=all_versions,
        continue_on_error=self.continue_on_error or self.parallel_operations)

    # Use a lock to ensure accurate statistics in the face of
    # multi-threading/multi-processing.
    self.stats_lock = CreateLock()

    # Tracks if any copies failed.
    self.op_failure_count = 0

    # Start the clock.
    start_time = time.time()

    # Tuple of attributes to share/manage across multiple processes in
    # parallel (-m) mode.
    shared_attrs = ('op_failure_count', 'total_bytes_transferred')

    # Perform copy requests in parallel (-m) mode, if requested, using
    # configured number of parallel processes and threads. Otherwise,
    # perform requests with sequential function calls in current process.
    self.Apply(_CopyFuncWrapper, name_expansion_iterator,
               _CopyExceptionHandler, shared_attrs,
               fail_on_error=(not self.continue_on_error))
    self.logger.debug(
        'total_bytes_transferred: %d', self.total_bytes_transferred)

    end_time = time.time()
    self.total_elapsed_time = end_time - start_time

    # Sometimes, particularly when running unit tests, the total elapsed time
    # is really small. On Windows, the timer resolution is too small and
    # causes total_elapsed_time to be zero.
    try:
      float(self.total_bytes_transferred) / float(self.total_elapsed_time)
    except ZeroDivisionError:
      self.total_elapsed_time = 0.01

    self.total_bytes_per_second = (float(self.total_bytes_transferred) /
                                   float(self.total_elapsed_time))

    if self.debug == 3:
      # Note that this only counts the actual GET and PUT bytes for the copy
      # - not any transfers for doing wildcard expansion, the initial
      # HEAD/GET request performed to get the object metadata, etc.
      if self.total_bytes_transferred != 0:
        self.logger.info(
            'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
            self.total_bytes_transferred, self.total_elapsed_time,
            MakeHumanReadable(self.total_bytes_per_second))
    if self.op_failure_count:
      plural_str = 's' if self.op_failure_count > 1 else ''
      raise CommandException('%d file%s/object%s could not be transferred.' % (
          self.op_failure_count, plural_str, plural_str))

    return 0

  def _ParseOpts(self):
    perform_mv = False
    # exclude_symlinks is handled by Command parent class, so save in Command
    # state rather than CopyHelperOpts.
    self.exclude_symlinks = False
    no_clobber = False
    # continue_on_error is handled by Command parent class, so save in Command
    # state rather than CopyHelperOpts.
    self.continue_on_error = False
    daisy_chain = False
    read_args_from_stdin = False
    print_ver = False
    use_manifest = False
    preserve_acl = False
    canned_acl = None
    # canned_acl is handled by a helper function in parent
    # Command class, so save in Command state rather than CopyHelperOpts.
    self.canned = None

    self.skip_unsupported_objects = False

    # Files matching these extensions should be gzipped before uploading.
    self.gzip_exts = []

    test_callback_file = None

    # self.recursion_requested initialized in command.py (so can be checked
    # in parent class for all commands).
    self.manifest = None
    if self.sub_opts:
      for o, a in self.sub_opts:
        if o == '-a':
          canned_acl = a
          self.canned = True
        if o == '-c':
          self.continue_on_error = True
        elif o == '-D':
          daisy_chain = True
        elif o == '-e':
          self.exclude_symlinks = True
        elif o == '--testcallbackfile':
          # File path of a pickled class that implements ProgressCallback.call.
          # Used for testing transfer interruptions and resumes.
          test_callback_file = a
        elif o == '-I':
          read_args_from_stdin = True
        elif o == '-L':
          use_manifest = True
          self.manifest = Manifest(a)
        elif o == '-M':
          # Note that we signal to the cp command to perform a move (copy
          # followed by remove) and use directory-move naming rules by passing
          # the undocumented (for internal use) -M option when running the cp
          # command from mv.py.
          perform_mv = True
        elif o == '-n':
          no_clobber = True
        elif o == '-p':
          preserve_acl = True
        elif o == '-r' or o == '-R':
          self.recursion_requested = True
        elif o == '-U':
          self.skip_unsupported_objects = True
        elif o == '-v':
          print_ver = True
        elif o == '-z':
          self.gzip_exts = [x.strip() for x in a.split(',')]
    if preserve_acl and canned_acl:
      raise CommandException(
          'Specifying both the -p and -a options together is invalid.')
    return CreateCopyHelperOpts(
        perform_mv=perform_mv,
        no_clobber=no_clobber,
        daisy_chain=daisy_chain,
        read_args_from_stdin=read_args_from_stdin,
        print_ver=print_ver,
        use_manifest=use_manifest,
        preserve_acl=preserve_acl,
        canned_acl=canned_acl,
        skip_unsupported_objects=self.skip_unsupported_objects,
        test_callback_file=test_callback_file)

  def _GetBucketWithVersioningConfig(self, exp_dst_url):
    """Gets versioning config for a bucket and ensures that it exists.

    Args:
      exp_dst_url: Wildcard-expanded destination StorageUrl.

    Raises:
      AccessDeniedException: if there was a permissions problem accessing the
                             bucket or its versioning config.
      CommandException: if URL refers to a cloud bucket that does not exist.

    Returns:
      apitools Bucket with versioning configuration.
    """
    bucket = None
    if exp_dst_url.IsCloudUrl() and exp_dst_url.IsBucket():
      try:
        bucket = self.gsutil_api.GetBucket(
            exp_dst_url.bucket_name, provider=exp_dst_url.scheme,
            fields=['versioning'])
      except AccessDeniedException, e:
        raise
      except NotFoundException, e:
        raise CommandException('Destination bucket %s does not exist.' %
                               exp_dst_url)
      except Exception, e:
        raise CommandException('Error retrieving destination bucket %s: %s' %
                               (exp_dst_url, e.message))
    return bucket