builtin/unpack-objects.c

   1 #define USE_THE_REPOSITORY_VARIABLE
   2 #include "builtin.h"
   3 #include "bulk-checkin.h"
   4 #include "config.h"
   5 #include "environment.h"
   6 #include "gettext.h"
   7 #include "git-zlib.h"
   8 #include "hex.h"
   9 #include "object-store-ll.h"
  10 #include "object.h"
  11 #include "delta.h"
  12 #include "pack.h"
  13 #include "blob.h"
  14 #include "replace-object.h"
  15 #include "strbuf.h"
  16 #include "progress.h"
  17 #include "decorate.h"
  18 #include "fsck.h"
  19
  20 static int dry_run, quiet, recover, has_errors, strict;
  21 static const char unpack_usage[] = "git unpack-objects [-n] [-q] [-r] [--strict]";
  22
  23 /* We always read in 4kB chunks. */
  24 static unsigned char buffer[4096];
  25 static unsigned int offset, len;
  26 static off_t consumed_bytes;
  27 static off_t max_input_size;
  28 static git_hash_ctx ctx;
  29 static struct fsck_options fsck_options = FSCK_OPTIONS_STRICT;
  30 static struct progress *progress;
  31
  32 /*
  33  * When running under --strict mode, objects whose reachability are
  34  * suspect are kept in core without getting written in the object
  35  * store.
  36  */
  37 struct obj_buffer {
  38         char *buffer;
  39         unsigned long size;
  40 };
  41
  42 static struct decoration obj_decorate;
  43
  44 static struct obj_buffer *lookup_object_buffer(struct object *base)
  45 {
  46         return lookup_decoration(&obj_decorate, base);
  47 }
  48
  49 static void add_object_buffer(struct object *object, char *buffer, unsigned long size)
  50 {
  51         struct obj_buffer *obj;
  52         CALLOC_ARRAY(obj, 1);
  53         obj->buffer = buffer;
  54         obj->size = size;
  55         if (add_decoration(&obj_decorate, object, obj))
  56                 die("object %s tried to add buffer twice!", oid_to_hex(&object->oid));
  57 }
  58
  59 /*
  60  * Make sure at least "min" bytes are available in the buffer, and
  61  * return the pointer to the buffer.
  62  */
  63 static void *fill(int min)
  64 {
  65         if (min <= len)
  66                 return buffer + offset;
  67         if (min > sizeof(buffer))
  68                 die("cannot fill %d bytes", min);
  69         if (offset) {
  70                 the_hash_algo->update_fn(&ctx, buffer, offset);
  71                 memmove(buffer, buffer + offset, len);
  72                 offset = 0;
  73         }
  74         do {
  75                 ssize_t ret = xread(0, buffer + len, sizeof(buffer) - len);
  76                 if (ret <= 0) {
  77                         if (!ret)
  78                                 die("early EOF");
  79                         die_errno("read error on input");
  80                 }
  81                 len += ret;
  82         } while (len < min);
  83         return buffer;
  84 }
  85
  86 static void use(int bytes)
  87 {
  88         if (bytes > len)
  89                 die("used more bytes than were available");
  90         len -= bytes;
  91         offset += bytes;
  92
  93         /* make sure off_t is sufficiently large not to wrap */
  94         if (signed_add_overflows(consumed_bytes, bytes))
  95                 die("pack too large for current definition of off_t");
  96         consumed_bytes += bytes;
  97         if (max_input_size && consumed_bytes > max_input_size)
  98                 die(_("pack exceeds maximum allowed size"));
  99         display_throughput(progress, consumed_bytes);
 100 }
 101
 102 /*
 103  * Decompress zstream from the standard input into a newly
 104  * allocated buffer of specified size and return the buffer.
 105  * The caller is responsible to free the returned buffer.
 106  *
 107  * But for dry_run mode, "get_data()" is only used to check the
 108  * integrity of data, and the returned buffer is not used at all.
 109  * Therefore, in dry_run mode, "get_data()" will release the small
 110  * allocated buffer which is reused to hold temporary zstream output
 111  * and return NULL instead of returning garbage data.
 112  */
 113 static void *get_data(unsigned long size)
 114 {
 115         git_zstream stream;
 116         unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
 117         void *buf = xmallocz(bufsize);
 118
 119         memset(&stream, 0, sizeof(stream));
 120
 121         stream.next_out = buf;
 122         stream.avail_out = bufsize;
 123         stream.next_in = fill(1);
 124         stream.avail_in = len;
 125         git_inflate_init(&stream);
 126
 127         for (;;) {
 128                 int ret = git_inflate(&stream, 0);
 129                 use(len - stream.avail_in);
 130                 if (stream.total_out == size && ret == Z_STREAM_END)
 131                         break;
 132                 if (ret != Z_OK) {
 133                         error("inflate returned %d", ret);
 134                         FREE_AND_NULL(buf);
 135                         if (!recover)
 136                                 exit(1);
 137                         has_errors = 1;
 138                         break;
 139                 }
 140                 stream.next_in = fill(1);
 141                 stream.avail_in = len;
 142                 if (dry_run) {
 143                         /* reuse the buffer in dry_run mode */
 144                         stream.next_out = buf;
 145                         stream.avail_out = bufsize > size - stream.total_out ?
 146                                                    size - stream.total_out :
 147                                                    bufsize;
 148                 }
 149         }
 150         git_inflate_end(&stream);
 151         if (dry_run)
 152                 FREE_AND_NULL(buf);
 153         return buf;
 154 }
 155
 156 struct delta_info {
 157         struct object_id base_oid;
 158         unsigned nr;
 159         off_t base_offset;
 160         unsigned long size;
 161         void *delta;
 162         struct delta_info *next;
 163 };
 164
 165 static struct delta_info *delta_list;
 166
 167 static void add_delta_to_list(unsigned nr, const struct object_id *base_oid,
 168                               off_t base_offset,
 169                               void *delta, unsigned long size)
 170 {
 171         struct delta_info *info = xmalloc(sizeof(*info));
 172
 173         oidcpy(&info->base_oid, base_oid);
 174         info->base_offset = base_offset;
 175         info->size = size;
 176         info->delta = delta;
 177         info->nr = nr;
 178         info->next = delta_list;
 179         delta_list = info;
 180 }
 181
 182 struct obj_info {
 183         off_t offset;
 184         struct object_id oid;
 185         struct object *obj;
 186 };
 187
 188 /* Remember to update object flag allocation in object.h */
 189 #define FLAG_OPEN (1u<<20)
 190 #define FLAG_WRITTEN (1u<<21)
 191
 192 static struct obj_info *obj_list;
 193 static unsigned nr_objects;
 194
 195 /*
 196  * Called only from check_object() after it verified this object
 197  * is Ok.
 198  */
 199 static void write_cached_object(struct object *obj, struct obj_buffer *obj_buf)
 200 {
 201         struct object_id oid;
 202
 203         if (write_object_file(obj_buf->buffer, obj_buf->size,
 204                               obj->type, &oid) < 0)
 205                 die("failed to write object %s", oid_to_hex(&obj->oid));
 206         obj->flags |= FLAG_WRITTEN;
 207 }
 208
 209 /*
 210  * At the very end of the processing, write_rest() scans the objects
 211  * that have reachability requirements and calls this function.
 212  * Verify its reachability and validity recursively and write it out.
 213  */
 214 static int check_object(struct object *obj, enum object_type type,
 215                         void *data UNUSED,
 216                         struct fsck_options *options UNUSED)
 217 {
 218         struct obj_buffer *obj_buf;
 219
 220         if (!obj)
 221                 return 1;
 222
 223         if (obj->flags & FLAG_WRITTEN)
 224                 return 0;
 225
 226         if (type != OBJ_ANY && obj->type != type)
 227                 die("object type mismatch");
 228
 229         if (!(obj->flags & FLAG_OPEN)) {
 230                 unsigned long size;
 231                 int type = oid_object_info(the_repository, &obj->oid, &size);
 232                 if (type != obj->type || type <= 0)
 233                         die("object of unexpected type");
 234                 obj->flags |= FLAG_WRITTEN;
 235                 return 0;
 236         }
 237
 238         obj_buf = lookup_object_buffer(obj);
 239         if (!obj_buf)
 240                 die("Whoops! Cannot find object '%s'", oid_to_hex(&obj->oid));
 241         if (fsck_object(obj, obj_buf->buffer, obj_buf->size, &fsck_options))
 242                 die("fsck error in packed object");
 243         fsck_options.walk = check_object;
 244         if (fsck_walk(obj, NULL, &fsck_options))
 245                 die("Error on reachable objects of %s", oid_to_hex(&obj->oid));
 246         write_cached_object(obj, obj_buf);
 247         return 0;
 248 }
 249
 250 static void write_rest(void)
 251 {
 252         unsigned i;
 253         for (i = 0; i < nr_objects; i++) {
 254                 if (obj_list[i].obj)
 255                         check_object(obj_list[i].obj, OBJ_ANY, NULL, NULL);
 256         }
 257 }
 258
 259 static void added_object(unsigned nr, enum object_type type,
 260                          void *data, unsigned long size);
 261
 262 /*
 263  * Write out nr-th object from the list, now we know the contents
 264  * of it.  Under --strict, this buffers structured objects in-core,
 265  * to be checked at the end.
 266  */
 267 static void write_object(unsigned nr, enum object_type type,
 268                          void *buf, unsigned long size)
 269 {
 270         if (!strict) {
 271                 if (write_object_file(buf, size, type,
 272                                       &obj_list[nr].oid) < 0)
 273                         die("failed to write object");
 274                 added_object(nr, type, buf, size);
 275                 free(buf);
 276                 obj_list[nr].obj = NULL;
 277         } else if (type == OBJ_BLOB) {
 278                 struct blob *blob;
 279                 if (write_object_file(buf, size, type,
 280                                       &obj_list[nr].oid) < 0)
 281                         die("failed to write object");
 282                 added_object(nr, type, buf, size);
 283                 free(buf);
 284
 285                 blob = lookup_blob(the_repository, &obj_list[nr].oid);
 286                 if (blob)
 287                         blob->object.flags |= FLAG_WRITTEN;
 288                 else
 289                         die("invalid blob object");
 290                 obj_list[nr].obj = NULL;
 291         } else {
 292                 struct object *obj;
 293                 int eaten;
 294                 hash_object_file(the_hash_algo, buf, size, type,
 295                                  &obj_list[nr].oid);
 296                 added_object(nr, type, buf, size);
 297                 obj = parse_object_buffer(the_repository, &obj_list[nr].oid,
 298                                           type, size, buf,
 299                                           &eaten);
 300                 if (!obj)
 301                         die("invalid %s", type_name(type));
 302                 add_object_buffer(obj, buf, size);
 303                 obj->flags |= FLAG_OPEN;
 304                 obj_list[nr].obj = obj;
 305         }
 306 }
 307
 308 static void resolve_delta(unsigned nr, enum object_type type,
 309                           void *base, unsigned long base_size,
 310                           void *delta, unsigned long delta_size)
 311 {
 312         void *result;
 313         unsigned long result_size;
 314
 315         result = patch_delta(base, base_size,
 316                              delta, delta_size,
 317                              &result_size);
 318         if (!result)
 319                 die("failed to apply delta");
 320         free(delta);
 321         write_object(nr, type, result, result_size);
 322 }
 323
 324 /*
 325  * We now know the contents of an object (which is nr-th in the pack);
 326  * resolve all the deltified objects that are based on it.
 327  */
 328 static void added_object(unsigned nr, enum object_type type,
 329                          void *data, unsigned long size)
 330 {
 331         struct delta_info **p = &delta_list;
 332         struct delta_info *info;
 333
 334         while ((info = *p) != NULL) {
 335                 if (oideq(&info->base_oid, &obj_list[nr].oid) ||
 336                     info->base_offset == obj_list[nr].offset) {
 337                         *p = info->next;
 338                         p = &delta_list;
 339                         resolve_delta(info->nr, type, data, size,
 340                                       info->delta, info->size);
 341                         free(info);
 342                         continue;
 343                 }
 344                 p = &info->next;
 345         }
 346 }
 347
 348 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 349                                    unsigned nr)
 350 {
 351         void *buf = get_data(size);
 352
 353         if (buf)
 354                 write_object(nr, type, buf, size);
 355 }
 356
 357 struct input_zstream_data {
 358         git_zstream *zstream;
 359         unsigned char buf[8192];
 360         int status;
 361 };
 362
 363 static const void *feed_input_zstream(struct input_stream *in_stream,
 364                                       unsigned long *readlen)
 365 {
 366         struct input_zstream_data *data = in_stream->data;
 367         git_zstream *zstream = data->zstream;
 368         void *in = fill(1);
 369
 370         if (in_stream->is_finished) {
 371                 *readlen = 0;
 372                 return NULL;
 373         }
 374
 375         zstream->next_out = data->buf;
 376         zstream->avail_out = sizeof(data->buf);
 377         zstream->next_in = in;
 378         zstream->avail_in = len;
 379
 380         data->status = git_inflate(zstream, 0);
 381
 382         in_stream->is_finished = data->status != Z_OK;
 383         use(len - zstream->avail_in);
 384         *readlen = sizeof(data->buf) - zstream->avail_out;
 385
 386         return data->buf;
 387 }
 388
 389 static void stream_blob(unsigned long size, unsigned nr)
 390 {
 391         git_zstream zstream = { 0 };
 392         struct input_zstream_data data = { 0 };
 393         struct input_stream in_stream = {
 394                 .read = feed_input_zstream,
 395                 .data = &data,
 396         };
 397         struct obj_info *info = &obj_list[nr];
 398
 399         data.zstream = &zstream;
 400         git_inflate_init(&zstream);
 401
 402         if (stream_loose_object(&in_stream, size, &info->oid))
 403                 die(_("failed to write object in stream"));
 404
 405         if (data.status != Z_STREAM_END)
 406                 die(_("inflate returned (%d)"), data.status);
 407         git_inflate_end(&zstream);
 408
 409         if (strict) {
 410                 struct blob *blob = lookup_blob(the_repository, &info->oid);
 411
 412                 if (!blob)
 413                         die(_("invalid blob object from stream"));
 414                 blob->object.flags |= FLAG_WRITTEN;
 415         }
 416         info->obj = NULL;
 417 }
 418
 419 static int resolve_against_held(unsigned nr, const struct object_id *base,
 420                                 void *delta_data, unsigned long delta_size)
 421 {
 422         struct object *obj;
 423         struct obj_buffer *obj_buffer;
 424         obj = lookup_object(the_repository, base);
 425         if (!obj)
 426                 return 0;
 427         obj_buffer = lookup_object_buffer(obj);
 428         if (!obj_buffer)
 429                 return 0;
 430         resolve_delta(nr, obj->type, obj_buffer->buffer,
 431                       obj_buffer->size, delta_data, delta_size);
 432         return 1;
 433 }
 434
 435 static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 436                                unsigned nr)
 437 {
 438         void *delta_data, *base;
 439         unsigned long base_size;
 440         struct object_id base_oid;
 441
 442         if (type == OBJ_REF_DELTA) {
 443                 oidread(&base_oid, fill(the_hash_algo->rawsz), the_repository->hash_algo);
 444                 use(the_hash_algo->rawsz);
 445                 delta_data = get_data(delta_size);
 446                 if (!delta_data)
 447                         return;
 448                 if (repo_has_object_file(the_repository, &base_oid))
 449                         ; /* Ok we have this one */
 450                 else if (resolve_against_held(nr, &base_oid,
 451                                               delta_data, delta_size))
 452                         return; /* we are done */
 453                 else {
 454                         /* cannot resolve yet --- queue it */
 455                         oidclr(&obj_list[nr].oid, the_repository->hash_algo);
 456                         add_delta_to_list(nr, &base_oid, 0, delta_data, delta_size);
 457                         return;
 458                 }
 459         } else {
 460                 unsigned base_found = 0;
 461                 unsigned char *pack, c;
 462                 off_t base_offset;
 463                 unsigned lo, mid, hi;
 464
 465                 pack = fill(1);
 466                 c = *pack;
 467                 use(1);
 468                 base_offset = c & 127;
 469                 while (c & 128) {
 470                         base_offset += 1;
 471                         if (!base_offset || MSB(base_offset, 7))
 472                                 die("offset value overflow for delta base object");
 473                         pack = fill(1);
 474                         c = *pack;
 475                         use(1);
 476                         base_offset = (base_offset << 7) + (c & 127);
 477                 }
 478                 base_offset = obj_list[nr].offset - base_offset;
 479                 if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 480                         die("offset value out of bound for delta base object");
 481
 482                 delta_data = get_data(delta_size);
 483                 if (!delta_data)
 484                         return;
 485                 lo = 0;
 486                 hi = nr;
 487                 while (lo < hi) {
 488                         mid = lo + (hi - lo) / 2;
 489                         if (base_offset < obj_list[mid].offset) {
 490                                 hi = mid;
 491                         } else if (base_offset > obj_list[mid].offset) {
 492                                 lo = mid + 1;
 493                         } else {
 494                                 oidcpy(&base_oid, &obj_list[mid].oid);
 495                                 base_found = !is_null_oid(&base_oid);
 496                                 break;
 497                         }
 498                 }
 499                 if (!base_found) {
 500                         /*
 501                          * The delta base object is itself a delta that
 502                          * has not been resolved yet.
 503                          */
 504                         oidclr(&obj_list[nr].oid, the_repository->hash_algo);
 505                         add_delta_to_list(nr, null_oid(), base_offset,
 506                                           delta_data, delta_size);
 507                         return;
 508                 }
 509         }
 510
 511         if (resolve_against_held(nr, &base_oid, delta_data, delta_size))
 512                 return;
 513
 514         base = repo_read_object_file(the_repository, &base_oid, &type,
 515                                      &base_size);
 516         if (!base) {
 517                 error("failed to read delta-pack base object %s",
 518                       oid_to_hex(&base_oid));
 519                 if (!recover)
 520                         exit(1);
 521                 has_errors = 1;
 522                 return;
 523         }
 524         resolve_delta(nr, type, base, base_size, delta_data, delta_size);
 525         free(base);
 526 }
 527
 528 static void unpack_one(unsigned nr)
 529 {
 530         unsigned shift;
 531         unsigned char *pack;
 532         unsigned long size, c;
 533         enum object_type type;
 534
 535         obj_list[nr].offset = consumed_bytes;
 536
 537         pack = fill(1);
 538         c = *pack;
 539         use(1);
 540         type = (c >> 4) & 7;
 541         size = (c & 15);
 542         shift = 4;
 543         while (c & 0x80) {
 544                 pack = fill(1);
 545                 c = *pack;
 546                 use(1);
 547                 size += (c & 0x7f) << shift;
 548                 shift += 7;
 549         }
 550
 551         switch (type) {
 552         case OBJ_BLOB:
 553                 if (!dry_run && size > big_file_threshold) {
 554                         stream_blob(size, nr);
 555                         return;
 556                 }
 557                 /* fallthrough */
 558         case OBJ_COMMIT:
 559         case OBJ_TREE:
 560         case OBJ_TAG:
 561                 unpack_non_delta_entry(type, size, nr);
 562                 return;
 563         case OBJ_REF_DELTA:
 564         case OBJ_OFS_DELTA:
 565                 unpack_delta_entry(type, size, nr);
 566                 return;
 567         default:
 568                 error("bad object type %d", type);
 569                 has_errors = 1;
 570                 if (recover)
 571                         return;
 572                 exit(1);
 573         }
 574 }
 575
 576 static void unpack_all(void)
 577 {
 578         int i;
 579         struct pack_header *hdr = fill(sizeof(struct pack_header));
 580
 581         nr_objects = ntohl(hdr->hdr_entries);
 582
 583         if (ntohl(hdr->hdr_signature) != PACK_SIGNATURE)
 584                 die("bad pack file");
 585         if (!pack_version_ok(hdr->hdr_version))
 586                 die("unknown pack file version %"PRIu32,
 587                         ntohl(hdr->hdr_version));
 588         use(sizeof(struct pack_header));
 589
 590         if (!quiet)
 591                 progress = start_progress(_("Unpacking objects"), nr_objects);
 592         CALLOC_ARRAY(obj_list, nr_objects);
 593         begin_odb_transaction();
 594         for (i = 0; i < nr_objects; i++) {
 595                 unpack_one(i);
 596                 display_progress(progress, i + 1);
 597         }
 598         end_odb_transaction();
 599         stop_progress(&progress);
 600
 601         if (delta_list)
 602                 die("unresolved deltas left after unpacking");
 603 }
 604
 605 int cmd_unpack_objects(int argc,
 606                        const char **argv,
 607                        const char *prefix UNUSED,
 608                        struct repository *repo UNUSED)
 609 {
 610         int i;
 611         struct object_id oid;
 612         git_hash_ctx tmp_ctx;
 613
 614         disable_replace_refs();
 615
 616         git_config(git_default_config, NULL);
 617
 618         quiet = !isatty(2);
 619
 620         for (i = 1 ; i < argc; i++) {
 621                 const char *arg = argv[i];
 622
 623                 if (*arg == '-') {
 624                         if (!strcmp(arg, "-n")) {
 625                                 dry_run = 1;
 626                                 continue;
 627                         }
 628                         if (!strcmp(arg, "-q")) {
 629                                 quiet = 1;
 630                                 continue;
 631                         }
 632                         if (!strcmp(arg, "-r")) {
 633                                 recover = 1;
 634                                 continue;
 635                         }
 636                         if (!strcmp(arg, "--strict")) {
 637                                 strict = 1;
 638                                 continue;
 639                         }
 640                         if (skip_prefix(arg, "--strict=", &arg)) {
 641                                 strict = 1;
 642                                 fsck_set_msg_types(&fsck_options, arg);
 643                                 continue;
 644                         }
 645                         if (starts_with(arg, "--pack_header=")) {
 646                                 struct pack_header *hdr;
 647                                 char *c;
 648
 649                                 hdr = (struct pack_header *)buffer;
 650                                 hdr->hdr_signature = htonl(PACK_SIGNATURE);
 651                                 hdr->hdr_version = htonl(strtoul(arg + 14, &c, 10));
 652                                 if (*c != ',')
 653                                         die("bad %s", arg);
 654                                 hdr->hdr_entries = htonl(strtoul(c + 1, &c, 10));
 655                                 if (*c)
 656                                         die("bad %s", arg);
 657                                 len = sizeof(*hdr);
 658                                 continue;
 659                         }
 660                         if (skip_prefix(arg, "--max-input-size=", &arg)) {
 661                                 max_input_size = strtoumax(arg, NULL, 10);
 662                                 continue;
 663                         }
 664                         usage(unpack_usage);
 665                 }
 666
 667                 /* We don't take any non-flag arguments now.. Maybe some day */
 668                 usage(unpack_usage);
 669         }
 670         the_hash_algo->init_fn(&ctx);
 671         unpack_all();
 672         the_hash_algo->update_fn(&ctx, buffer, offset);
 673         the_hash_algo->init_fn(&tmp_ctx);
 674         the_hash_algo->clone_fn(&tmp_ctx, &ctx);
 675         the_hash_algo->final_oid_fn(&oid, &tmp_ctx);
 676         if (strict) {
 677                 write_rest();
 678                 if (fsck_finish(&fsck_options))
 679                         die(_("fsck error in pack objects"));
 680         }
 681         if (!hasheq(fill(the_hash_algo->rawsz), oid.hash,
 682                     the_repository->hash_algo))
 683                 die("final sha1 did not match");
 684         use(the_hash_algo->rawsz);
 685
 686         /* Write the last part of the buffer to stdout */
 687         write_in_full(1, buffer + offset, len);
 688
 689         /* All done */
 690         return has_errors;
 691 }