bulk-checkin.c

   1 /*
   2  * Copyright (c) 2011, Google Inc.
   3  */
   4
   5 #define USE_THE_REPOSITORY_VARIABLE
   6 #define DISABLE_SIGN_COMPARE_WARNINGS
   7
   8 #include "git-compat-util.h"
   9 #include "bulk-checkin.h"
  10 #include "environment.h"
  11 #include "gettext.h"
  12 #include "hex.h"
  13 #include "lockfile.h"
  14 #include "repository.h"
  15 #include "csum-file.h"
  16 #include "pack.h"
  17 #include "strbuf.h"
  18 #include "tmp-objdir.h"
  19 #include "packfile.h"
  20 #include "object-file.h"
  21 #include "object-store-ll.h"
  22
/*
 * Number of begin_odb_transaction() calls that have not yet been matched
 * by end_odb_transaction(). Zero means no transaction is open.
 */
static int odb_transaction_nesting;

/*
 * Temporary object directory created lazily by
 * prepare_loose_object_bulk_checkin() and migrated by flush_batch_fsync().
 * NULL while no such directory exists.
 */
static struct tmp_objdir *bulk_fsync_objdir;

/* State of the temporary packfile that streamed blobs are appended to. */
static struct bulk_checkin_packfile {
        /* Name of the temporary pack, filled in by create_tmp_packfile(). */
        char *pack_tmp_name;
        /* Hashfile the pack is written through; NULL while no pack is open. */
        struct hashfile *f;
        /* Bytes written to the pack so far, starting from the pack header. */
        off_t offset;
        /* Index options, reset whenever a new temporary pack is created. */
        struct pack_idx_option pack_idx_opts;

        /* Index entries of the objects written to the current pack. */
        struct pack_idx_entry **written;
        /* Allocated capacity of 'written' (maintained by ALLOC_GROW). */
        uint32_t alloc_written;
        /* Number of valid entries in 'written'. */
        uint32_t nr_written;
} bulk_checkin_packfile;
  37
  38 static void finish_tmp_packfile(struct strbuf *basename,
  39                                 const char *pack_tmp_name,
  40                                 struct pack_idx_entry **written_list,
  41                                 uint32_t nr_written,
  42                                 struct pack_idx_option *pack_idx_opts,
  43                                 unsigned char hash[])
  44 {
  45         char *idx_tmp_name = NULL;
  46
  47         stage_tmp_packfiles(the_hash_algo, basename, pack_tmp_name,
  48                             written_list, nr_written, NULL, pack_idx_opts, hash,
  49                             &idx_tmp_name);
  50         rename_tmp_packfile_idx(basename, &idx_tmp_name);
  51
  52         free(idx_tmp_name);
  53 }
  54
  55 static void flush_bulk_checkin_packfile(struct bulk_checkin_packfile *state)
  56 {
  57         unsigned char hash[GIT_MAX_RAWSZ];
  58         struct strbuf packname = STRBUF_INIT;
  59         int i;
  60
  61         if (!state->f)
  62                 return;
  63
  64         if (state->nr_written == 0) {
  65                 close(state->f->fd);
  66                 free_hashfile(state->f);
  67                 unlink(state->pack_tmp_name);
  68                 goto clear_exit;
  69         } else if (state->nr_written == 1) {
  70                 finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK,
  71                                   CSUM_HASH_IN_STREAM | CSUM_FSYNC | CSUM_CLOSE);
  72         } else {
  73                 int fd = finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK, 0);
  74                 fixup_pack_header_footer(the_hash_algo, fd, hash, state->pack_tmp_name,
  75                                          state->nr_written, hash,
  76                                          state->offset);
  77                 close(fd);
  78         }
  79
  80         strbuf_addf(&packname, "%s/pack/pack-%s.", repo_get_object_directory(the_repository),
  81                     hash_to_hex(hash));
  82         finish_tmp_packfile(&packname, state->pack_tmp_name,
  83                             state->written, state->nr_written,
  84                             &state->pack_idx_opts, hash);
  85         for (i = 0; i < state->nr_written; i++)
  86                 free(state->written[i]);
  87
  88 clear_exit:
  89         free(state->pack_tmp_name);
  90         free(state->written);
  91         memset(state, 0, sizeof(*state));
  92
  93         strbuf_release(&packname);
  94         /* Make objects we just wrote available to ourselves */
  95         reprepare_packed_git(the_repository);
  96 }
  97
  98 /*
  99  * Cleanup after batch-mode fsync_object_files.
 100  */
 101 static void flush_batch_fsync(void)
 102 {
 103         struct strbuf temp_path = STRBUF_INIT;
 104         struct tempfile *temp;
 105
 106         if (!bulk_fsync_objdir)
 107                 return;
 108
 109         /*
 110          * Issue a full hardware flush against a temporary file to ensure
 111          * that all objects are durable before any renames occur. The code in
 112          * fsync_loose_object_bulk_checkin has already issued a writeout
 113          * request, but it has not flushed any writeback cache in the storage
 114          * hardware or any filesystem logs. This fsync call acts as a barrier
 115          * to ensure that the data in each new object file is durable before
 116          * the final name is visible.
 117          */
 118         strbuf_addf(&temp_path, "%s/bulk_fsync_XXXXXX", repo_get_object_directory(the_repository));
 119         temp = xmks_tempfile(temp_path.buf);
 120         fsync_or_die(get_tempfile_fd(temp), get_tempfile_path(temp));
 121         delete_tempfile(&temp);
 122         strbuf_release(&temp_path);
 123
 124         /*
 125          * Make the object files visible in the primary ODB after their data is
 126          * fully durable.
 127          */
 128         tmp_objdir_migrate(bulk_fsync_objdir);
 129         bulk_fsync_objdir = NULL;
 130 }
 131
 132 static int already_written(struct bulk_checkin_packfile *state, struct object_id *oid)
 133 {
 134         int i;
 135
 136         /* The object may already exist in the repository */
 137         if (repo_has_object_file(the_repository, oid))
 138                 return 1;
 139
 140         /* Might want to keep the list sorted */
 141         for (i = 0; i < state->nr_written; i++)
 142                 if (oideq(&state->written[i]->oid, oid))
 143                         return 1;
 144
 145         /* This is a new object we need to keep */
 146         return 0;
 147 }
 148
 149 /*
 150  * Read the contents from fd for size bytes, streaming it to the
 151  * packfile in state while updating the hash in ctx. Signal a failure
 152  * by returning a negative value when the resulting pack would exceed
 153  * the pack size limit and this is not the first object in the pack,
 154  * so that the caller can discard what we wrote from the current pack
 155  * by truncating it and opening a new one. The caller will then call
 156  * us again after rewinding the input fd.
 157  *
 158  * The already_hashed_to pointer is kept untouched by the caller to
 159  * make sure we do not hash the same byte when we are called
 160  * again. This way, the caller does not have to checkpoint its hash
 161  * status before calling us just in case we ask it to call us again
 162  * with a new pack.
 163  */
 164 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 165                                git_hash_ctx *ctx, off_t *already_hashed_to,
 166                                int fd, size_t size, const char *path,
 167                                unsigned flags)
 168 {
 169         git_zstream s;
 170         unsigned char ibuf[16384];
 171         unsigned char obuf[16384];
 172         unsigned hdrlen;
 173         int status = Z_OK;
 174         int write_object = (flags & HASH_WRITE_OBJECT);
 175         off_t offset = 0;
 176
 177         git_deflate_init(&s, pack_compression_level);
 178
 179         hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
 180         s.next_out = obuf + hdrlen;
 181         s.avail_out = sizeof(obuf) - hdrlen;
 182
 183         while (status != Z_STREAM_END) {
 184                 if (size && !s.avail_in) {
 185                         ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
 186                         ssize_t read_result = read_in_full(fd, ibuf, rsize);
 187                         if (read_result < 0)
 188                                 die_errno("failed to read from '%s'", path);
 189                         if (read_result != rsize)
 190                                 die("failed to read %d bytes from '%s'",
 191                                     (int)rsize, path);
 192                         offset += rsize;
 193                         if (*already_hashed_to < offset) {
 194                                 size_t hsize = offset - *already_hashed_to;
 195                                 if (rsize < hsize)
 196                                         hsize = rsize;
 197                                 if (hsize)
 198                                         the_hash_algo->update_fn(ctx, ibuf, hsize);
 199                                 *already_hashed_to = offset;
 200                         }
 201                         s.next_in = ibuf;
 202                         s.avail_in = rsize;
 203                         size -= rsize;
 204                 }
 205
 206                 status = git_deflate(&s, size ? 0 : Z_FINISH);
 207
 208                 if (!s.avail_out || status == Z_STREAM_END) {
 209                         if (write_object) {
 210                                 size_t written = s.next_out - obuf;
 211
 212                                 /* would we bust the size limit? */
 213                                 if (state->nr_written &&
 214                                     pack_size_limit_cfg &&
 215                                     pack_size_limit_cfg < state->offset + written) {
 216                                         git_deflate_abort(&s);
 217                                         return -1;
 218                                 }
 219
 220                                 hashwrite(state->f, obuf, written);
 221                                 state->offset += written;
 222                         }
 223                         s.next_out = obuf;
 224                         s.avail_out = sizeof(obuf);
 225                 }
 226
 227                 switch (status) {
 228                 case Z_OK:
 229                 case Z_BUF_ERROR:
 230                 case Z_STREAM_END:
 231                         continue;
 232                 default:
 233                         die("unexpected deflate failure: %d", status);
 234                 }
 235         }
 236         git_deflate_end(&s);
 237         return 0;
 238 }
 239
 240 /* Lazily create backing packfile for the state */
 241 static void prepare_to_stream(struct bulk_checkin_packfile *state,
 242                               unsigned flags)
 243 {
 244         if (!(flags & HASH_WRITE_OBJECT) || state->f)
 245                 return;
 246
 247         state->f = create_tmp_packfile(&state->pack_tmp_name);
 248         reset_pack_idx_option(&state->pack_idx_opts);
 249
 250         /* Pretend we are going to write only one object */
 251         state->offset = write_pack_header(state->f, 1);
 252         if (!state->offset)
 253                 die_errno("unable to write pack header");
 254 }
 255
 256 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 257                                 struct object_id *result_oid,
 258                                 int fd, size_t size,
 259                                 const char *path, unsigned flags)
 260 {
 261         off_t seekback, already_hashed_to;
 262         git_hash_ctx ctx;
 263         unsigned char obuf[16384];
 264         unsigned header_len;
 265         struct hashfile_checkpoint checkpoint;
 266         struct pack_idx_entry *idx = NULL;
 267
 268         seekback = lseek(fd, 0, SEEK_CUR);
 269         if (seekback == (off_t) -1)
 270                 return error("cannot find the current offset");
 271
 272         header_len = format_object_header((char *)obuf, sizeof(obuf),
 273                                           OBJ_BLOB, size);
 274         the_hash_algo->init_fn(&ctx);
 275         the_hash_algo->update_fn(&ctx, obuf, header_len);
 276
 277         /* Note: idx is non-NULL when we are writing */
 278         if ((flags & HASH_WRITE_OBJECT) != 0) {
 279                 CALLOC_ARRAY(idx, 1);
 280
 281                 prepare_to_stream(state, flags);
 282                 hashfile_checkpoint_init(state->f, &checkpoint);
 283         }
 284
 285         already_hashed_to = 0;
 286
 287         while (1) {
 288                 prepare_to_stream(state, flags);
 289                 if (idx) {
 290                         hashfile_checkpoint(state->f, &checkpoint);
 291                         idx->offset = state->offset;
 292                         crc32_begin(state->f);
 293                 }
 294                 if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
 295                                          fd, size, path, flags))
 296                         break;
 297                 /*
 298                  * Writing this object to the current pack will make
 299                  * it too big; we need to truncate it, start a new
 300                  * pack, and write into it.
 301                  */
 302                 if (!idx)
 303                         BUG("should not happen");
 304                 hashfile_truncate(state->f, &checkpoint);
 305                 state->offset = checkpoint.offset;
 306                 flush_bulk_checkin_packfile(state);
 307                 if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
 308                         return error("cannot seek back");
 309         }
 310         the_hash_algo->final_oid_fn(result_oid, &ctx);
 311         if (!idx)
 312                 return 0;
 313
 314         idx->crc32 = crc32_end(state->f);
 315         if (already_written(state, result_oid)) {
 316                 hashfile_truncate(state->f, &checkpoint);
 317                 state->offset = checkpoint.offset;
 318                 free(idx);
 319         } else {
 320                 oidcpy(&idx->oid, result_oid);
 321                 ALLOC_GROW(state->written,
 322                            state->nr_written + 1,
 323                            state->alloc_written);
 324                 state->written[state->nr_written++] = idx;
 325         }
 326         return 0;
 327 }
 328
 329 void prepare_loose_object_bulk_checkin(void)
 330 {
 331         /*
 332          * We lazily create the temporary object directory
 333          * the first time an object might be added, since
 334          * callers may not know whether any objects will be
 335          * added at the time they call begin_odb_transaction.
 336          */
 337         if (!odb_transaction_nesting || bulk_fsync_objdir)
 338                 return;
 339
 340         bulk_fsync_objdir = tmp_objdir_create(the_repository, "bulk-fsync");
 341         if (bulk_fsync_objdir)
 342                 tmp_objdir_replace_primary_odb(bulk_fsync_objdir, 0);
 343 }
 344
 345 void fsync_loose_object_bulk_checkin(int fd, const char *filename)
 346 {
 347         /*
 348          * If we have an active ODB transaction, we issue a call that
 349          * cleans the filesystem page cache but avoids a hardware flush
 350          * command. Later on we will issue a single hardware flush
 351          * before renaming the objects to their final names as part of
 352          * flush_batch_fsync.
 353          */
 354         if (!bulk_fsync_objdir ||
 355             git_fsync(fd, FSYNC_WRITEOUT_ONLY) < 0) {
 356                 if (errno == ENOSYS)
 357                         warning(_("core.fsyncMethod = batch is unsupported on this platform"));
 358                 fsync_or_die(fd, filename);
 359         }
 360 }
 361
 362 int index_blob_bulk_checkin(struct object_id *oid,
 363                             int fd, size_t size,
 364                             const char *path, unsigned flags)
 365 {
 366         int status = deflate_blob_to_pack(&bulk_checkin_packfile, oid, fd, size,
 367                                           path, flags);
 368         if (!odb_transaction_nesting)
 369                 flush_bulk_checkin_packfile(&bulk_checkin_packfile);
 370         return status;
 371 }
 372
 373 void begin_odb_transaction(void)
 374 {
 375         odb_transaction_nesting += 1;
 376 }
 377
 378 void flush_odb_transaction(void)
 379 {
 380         flush_batch_fsync();
 381         flush_bulk_checkin_packfile(&bulk_checkin_packfile);
 382 }
 383
 384 void end_odb_transaction(void)
 385 {
 386         odb_transaction_nesting -= 1;
 387         if (odb_transaction_nesting < 0)
 388                 BUG("Unbalanced ODB transaction nesting");
 389
 390         if (odb_transaction_nesting)
 391                 return;
 392
 393         flush_odb_transaction();
 394 }