howto/recover-corrupted-object-harder.html

   1 <?xml version="1.0" encoding="UTF-8"?>
   2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
   3     "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
   4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
   5 <head>
   6 <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
   7 <meta name="generator" content="AsciiDoc 10.2.0" />
   8 <title>How to recover an object from scratch</title>
   9 <style type="text/css">
  10 /* Shared CSS for AsciiDoc xhtml11 and html5 backends */
  11
  12 /* Default font. */
  13 body {
  14   font-family: Georgia,serif;
  15 }
  16
  17 /* Title font. */
  18 h1, h2, h3, h4, h5, h6,
  19 div.title, caption.title,
  20 thead, p.table.header,
  21 #toctitle,
  22 #author, #revnumber, #revdate, #revremark,
  23 #footer {
  24   font-family: Arial,Helvetica,sans-serif;
  25 }
  26
  27 body {
  28   margin: 1em 5% 1em 5%;
  29 }
  30
  31 a {
  32   color: blue;
  33   text-decoration: underline;
  34 }
  35 a:visited {
  36   color: fuchsia;
  37 }
  38
  39 em {
  40   font-style: italic;
  41   color: navy;
  42 }
  43
  44 strong {
  45   font-weight: bold;
  46   color: #083194;
  47 }
  48
  49 h1, h2, h3, h4, h5, h6 {
  50   color: #527bbd;
  51   margin-top: 1.2em;
  52   margin-bottom: 0.5em;
  53   line-height: 1.3;
  54 }
  55
  56 h1, h2, h3 {
  57   border-bottom: 2px solid silver;
  58 }
  59 h2 {
  60   padding-top: 0.5em;
  61 }
  62 h3 {
  63   float: left;
  64 }
  65 h3 + * {
  66   clear: left;
  67 }
  68 h5 {
  69   font-size: 1.0em;
  70 }
  71
  72 div.sectionbody {
  73   margin-left: 0;
  74 }
  75
  76 hr {
  77   border: 1px solid silver;
  78 }
  79
  80 p {
  81   margin-top: 0.5em;
  82   margin-bottom: 0.5em;
  83 }
  84
  85 ul, ol, li > p {
  86   margin-top: 0;
  87 }
  88 ul > li     { color: #aaa; }
  89 ul > li > * { color: black; }
  90
  91 .monospaced, code, pre {
  92   font-family: "Courier New", Courier, monospace;
  93   font-size: inherit;
  94   color: navy;
  95   padding: 0;
  96   margin: 0;
  97 }
  98 pre {
  99   white-space: pre-wrap;
 100 }
 101
 102 #author {
 103   color: #527bbd;
 104   font-weight: bold;
 105   font-size: 1.1em;
 106 }
 107 #email {
 108 }
 109 #revnumber, #revdate, #revremark {
 110 }
 111
 112 #footer {
 113   font-size: small;
 114   border-top: 2px solid silver;
 115   padding-top: 0.5em;
 116   margin-top: 4.0em;
 117 }
 118 #footer-text {
 119   float: left;
 120   padding-bottom: 0.5em;
 121 }
 122 #footer-badges {
 123   float: right;
 124   padding-bottom: 0.5em;
 125 }
 126
 127 #preamble {
 128   margin-top: 1.5em;
 129   margin-bottom: 1.5em;
 130 }
 131 div.imageblock, div.exampleblock, div.verseblock,
 132 div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
 133 div.admonitionblock {
 134   margin-top: 1.0em;
 135   margin-bottom: 1.5em;
 136 }
 137 div.admonitionblock {
 138   margin-top: 2.0em;
 139   margin-bottom: 2.0em;
 140   margin-right: 10%;
 141   color: #606060;
 142 }
 143
 144 div.content { /* Block element content. */
 145   padding: 0;
 146 }
 147
 148 /* Block element titles. */
 149 div.title, caption.title {
 150   color: #527bbd;
 151   font-weight: bold;
 152   text-align: left;
 153   margin-top: 1.0em;
 154   margin-bottom: 0.5em;
 155 }
 156 div.title + * {
 157   margin-top: 0;
 158 }
 159
 160 td div.title:first-child {
 161   margin-top: 0.0em;
 162 }
 163 div.content div.title:first-child {
 164   margin-top: 0.0em;
 165 }
 166 div.content + div.title {
 167   margin-top: 0.0em;
 168 }
 169
 170 div.sidebarblock > div.content {
 171   background: #ffffee;
 172   border: 1px solid #dddddd;
 173   border-left: 4px solid #f0f0f0;
 174   padding: 0.5em;
 175 }
 176
 177 div.listingblock > div.content {
 178   border: 1px solid #dddddd;
 179   border-left: 5px solid #f0f0f0;
 180   background: #f8f8f8;
 181   padding: 0.5em;
 182 }
 183
 184 div.quoteblock, div.verseblock {
 185   padding-left: 1.0em;
 186   margin-left: 1.0em;
 187   margin-right: 10%;
 188   border-left: 5px solid #f0f0f0;
 189   color: #888;
 190 }
 191
 192 div.quoteblock > div.attribution {
 193   padding-top: 0.5em;
 194   text-align: right;
 195 }
 196
 197 div.verseblock > pre.content {
 198   font-family: inherit;
 199   font-size: inherit;
 200 }
 201 div.verseblock > div.attribution {
 202   padding-top: 0.75em;
 203   text-align: left;
 204 }
 205 /* DEPRECATED: Pre version 8.2.7 verse style literal block. */
 206 div.verseblock + div.attribution {
 207   text-align: left;
 208 }
 209
 210 div.admonitionblock .icon {
 211   vertical-align: top;
 212   font-size: 1.1em;
 213   font-weight: bold;
 214   text-decoration: underline;
 215   color: #527bbd;
 216   padding-right: 0.5em;
 217 }
 218 div.admonitionblock td.content {
 219   padding-left: 0.5em;
 220   border-left: 3px solid #dddddd;
 221 }
 222
 223 div.exampleblock > div.content {
 224   border-left: 3px solid #dddddd;
 225   padding-left: 0.5em;
 226 }
 227
 228 div.imageblock div.content { padding-left: 0; }
 229 span.image img { border-style: none; vertical-align: text-bottom; }
 230 a.image:visited { color: white; }
 231
 232 dl {
 233   margin-top: 0.8em;
 234   margin-bottom: 0.8em;
 235 }
 236 dt {
 237   margin-top: 0.5em;
 238   margin-bottom: 0;
 239   font-style: normal;
 240   color: navy;
 241 }
 242 dd > *:first-child {
 243   margin-top: 0.1em;
 244 }
 245
 246 ul, ol {
 247     list-style-position: outside;
 248 }
 249 ol.arabic {
 250   list-style-type: decimal;
 251 }
 252 ol.loweralpha {
 253   list-style-type: lower-alpha;
 254 }
 255 ol.upperalpha {
 256   list-style-type: upper-alpha;
 257 }
 258 ol.lowerroman {
 259   list-style-type: lower-roman;
 260 }
 261 ol.upperroman {
 262   list-style-type: upper-roman;
 263 }
 264
 265 div.compact ul, div.compact ol,
 266 div.compact p, div.compact p,
 267 div.compact div, div.compact div {
 268   margin-top: 0.1em;
 269   margin-bottom: 0.1em;
 270 }
 271
 272 tfoot {
 273   font-weight: bold;
 274 }
 275 td > div.verse {
 276   white-space: pre;
 277 }
 278
 279 div.hdlist {
 280   margin-top: 0.8em;
 281   margin-bottom: 0.8em;
 282 }
 283 div.hdlist tr {
 284   padding-bottom: 15px;
 285 }
 286 dt.hdlist1.strong, td.hdlist1.strong {
 287   font-weight: bold;
 288 }
 289 td.hdlist1 {
 290   vertical-align: top;
 291   font-style: normal;
 292   padding-right: 0.8em;
 293   color: navy;
 294 }
 295 td.hdlist2 {
 296   vertical-align: top;
 297 }
 298 div.hdlist.compact tr {
 299   margin: 0;
 300   padding-bottom: 0;
 301 }
 302
 303 .comment {
 304   background: yellow;
 305 }
 306
 307 .footnote, .footnoteref {
 308   font-size: 0.8em;
 309 }
 310
 311 span.footnote, span.footnoteref {
 312   vertical-align: super;
 313 }
 314
 315 #footnotes {
 316   margin: 20px 0 20px 0;
 317   padding: 7px 0 0 0;
 318 }
 319
 320 #footnotes div.footnote {
 321   margin: 0 0 5px 0;
 322 }
 323
 324 #footnotes hr {
 325   border: none;
 326   border-top: 1px solid silver;
 327   height: 1px;
 328   text-align: left;
 329   margin-left: 0;
 330   width: 20%;
 331   min-width: 100px;
 332 }
 333
 334 div.colist td {
 335   padding-right: 0.5em;
 336   padding-bottom: 0.3em;
 337   vertical-align: top;
 338 }
 339 div.colist td img {
 340   margin-top: 0.3em;
 341 }
 342
 343 @media print {
 344   #footer-badges { display: none; }
 345 }
 346
 347 #toc {
 348   margin-bottom: 2.5em;
 349 }
 350
 351 #toctitle {
 352   color: #527bbd;
 353   font-size: 1.1em;
 354   font-weight: bold;
 355   margin-top: 1.0em;
 356   margin-bottom: 0.1em;
 357 }
 358
 359 div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
 360   margin-top: 0;
 361   margin-bottom: 0;
 362 }
 363 div.toclevel2 {
 364   margin-left: 2em;
 365   font-size: 0.9em;
 366 }
 367 div.toclevel3 {
 368   margin-left: 4em;
 369   font-size: 0.9em;
 370 }
 371 div.toclevel4 {
 372   margin-left: 6em;
 373   font-size: 0.9em;
 374 }
 375
 376 span.aqua { color: aqua; }
 377 span.black { color: black; }
 378 span.blue { color: blue; }
 379 span.fuchsia { color: fuchsia; }
 380 span.gray { color: gray; }
 381 span.green { color: green; }
 382 span.lime { color: lime; }
 383 span.maroon { color: maroon; }
 384 span.navy { color: navy; }
 385 span.olive { color: olive; }
 386 span.purple { color: purple; }
 387 span.red { color: red; }
 388 span.silver { color: silver; }
 389 span.teal { color: teal; }
 390 span.white { color: white; }
 391 span.yellow { color: yellow; }
 392
 393 span.aqua-background { background: aqua; }
 394 span.black-background { background: black; }
 395 span.blue-background { background: blue; }
 396 span.fuchsia-background { background: fuchsia; }
 397 span.gray-background { background: gray; }
 398 span.green-background { background: green; }
 399 span.lime-background { background: lime; }
 400 span.maroon-background { background: maroon; }
 401 span.navy-background { background: navy; }
 402 span.olive-background { background: olive; }
 403 span.purple-background { background: purple; }
 404 span.red-background { background: red; }
 405 span.silver-background { background: silver; }
 406 span.teal-background { background: teal; }
 407 span.white-background { background: white; }
 408 span.yellow-background { background: yellow; }
 409
 410 span.big { font-size: 2em; }
 411 span.small { font-size: 0.6em; }
 412
 413 span.underline { text-decoration: underline; }
 414 span.overline { text-decoration: overline; }
 415 span.line-through { text-decoration: line-through; }
 416
 417 div.unbreakable { page-break-inside: avoid; }
 418
 419
 420 /*
 421  * xhtml11 specific
 422  *
 423  * */
 424
 425 div.tableblock {
 426   margin-top: 1.0em;
 427   margin-bottom: 1.5em;
 428 }
 429 div.tableblock > table {
 430   border: 3px solid #527bbd;
 431 }
 432 thead, p.table.header {
 433   font-weight: bold;
 434   color: #527bbd;
 435 }
 436 p.table {
 437   margin-top: 0;
 438 }
 439 /* Because the table frame attribute is overridden by CSS in most browsers. */
 440 div.tableblock > table[frame="void"] {
 441   border-style: none;
 442 }
 443 div.tableblock > table[frame="hsides"] {
 444   border-left-style: none;
 445   border-right-style: none;
 446 }
 447 div.tableblock > table[frame="vsides"] {
 448   border-top-style: none;
 449   border-bottom-style: none;
 450 }
 451
 452
 453 /*
 454  * html5 specific
 455  *
 456  * */
 457
 458 table.tableblock {
 459   margin-top: 1.0em;
 460   margin-bottom: 1.5em;
 461 }
 462 thead, p.tableblock.header {
 463   font-weight: bold;
 464   color: #527bbd;
 465 }
 466 p.tableblock {
 467   margin-top: 0;
 468 }
 469 table.tableblock {
 470   border-width: 3px;
 471   border-spacing: 0px;
 472   border-style: solid;
 473   border-color: #527bbd;
 474   border-collapse: collapse;
 475 }
 476 th.tableblock, td.tableblock {
 477   border-width: 1px;
 478   padding: 4px;
 479   border-style: solid;
 480   border-color: #527bbd;
 481 }
 482
 483 table.tableblock.frame-topbot {
 484   border-left-style: hidden;
 485   border-right-style: hidden;
 486 }
 487 table.tableblock.frame-sides {
 488   border-top-style: hidden;
 489   border-bottom-style: hidden;
 490 }
 491 table.tableblock.frame-none {
 492   border-style: hidden;
 493 }
 494
 495 th.tableblock.halign-left, td.tableblock.halign-left {
 496   text-align: left;
 497 }
 498 th.tableblock.halign-center, td.tableblock.halign-center {
 499   text-align: center;
 500 }
 501 th.tableblock.halign-right, td.tableblock.halign-right {
 502   text-align: right;
 503 }
 504
 505 th.tableblock.valign-top, td.tableblock.valign-top {
 506   vertical-align: top;
 507 }
 508 th.tableblock.valign-middle, td.tableblock.valign-middle {
 509   vertical-align: middle;
 510 }
 511 th.tableblock.valign-bottom, td.tableblock.valign-bottom {
 512   vertical-align: bottom;
 513 }
 514
 515
 516 /*
 517  * manpage specific
 518  *
 519  * */
 520
 521 body.manpage h1 {
 522   padding-top: 0.5em;
 523   padding-bottom: 0.5em;
 524   border-top: 2px solid silver;
 525   border-bottom: 2px solid silver;
 526 }
 527 body.manpage h2 {
 528   border-style: none;
 529 }
 530 body.manpage div.sectionbody {
 531   margin-left: 3em;
 532 }
 533
 534 @media print {
 535   body.manpage div#toc { display: none; }
 536 }
 537
 538
 539 </style>
 540 <script type="text/javascript">
 541 /*<![CDATA[*/
 542 var asciidoc = {  // Namespace.
 543
 544 /////////////////////////////////////////////////////////////////////
 545 // Table Of Contents generator
 546 /////////////////////////////////////////////////////////////////////
 547
 548 /* Author: Mihai Bazon, September 2002
 549  * http://students.infoiasi.ro/~mishoo
 550  *
 551  * Table Of Content generator
 552  * Version: 0.4
 553  *
 554  * Feel free to use this script under the terms of the GNU General Public
 555  * License, as long as you do not remove or alter this notice.
 556  */
 557
 558  /* modified by Troy D. Hanson, September 2006. License: GPL */
 559  /* modified by Stuart Rackham, 2006, 2009. License: GPL */
 560
 561 // Build the TOC inside #toc from headings under #content; toclevels = 1..4.
 562 toc: function (toclevels) {
 563   // Return the concatenated text of every text node beneath el.
 564   function getText(el) {
 565     var text = "";
 566     for (var i = el.firstChild; i != null; i = i.nextSibling) {
 567       if (i.nodeType == 3 /* Node.TEXT_NODE */) // IE doesn't speak constants.
 568         text += i.data;
 569       else if (i.firstChild != null)
 570         text += getText(i);
 571     }
 572     return text;
 573   }
 574   // Record pairing a heading element with its text and TOC depth.
 575   function TocEntry(el, text, toclevel) {
 576     this.element = el;
 577     this.text = text;
 578     this.toclevel = toclevel;
 579   }
 580   // Collect a TocEntry for each h1..h(toclevels+1) tag found under el.
 581   function tocEntries(el, toclevels) {
 582     var result = new Array;
 583     var re = new RegExp('[hH]([1-'+(toclevels+1)+'])');
 584     // Walk the DOM tree depth-first looking for header elements, skipping
 585     // those whose class is "float" (the DOM2 nodeIterator API would be a
 586     // better technique but is not supported by all browsers).
 587     var iterate = function (el) {
 588       for (var i = el.firstChild; i != null; i = i.nextSibling) {
 589         if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
 590           var mo = re.exec(i.tagName);
 591           if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
 592             result[result.length] = new TocEntry(i, getText(i), mo[1]-1);  // level = heading number - 1
 593           }
 594           iterate(i);
 595         }
 596       }
 597     }
 598     iterate(el);
 599     return result;
 600   }
 601   // Nothing to do when the page has no #toc container.
 602   var toc = document.getElementById("toc");
 603   if (!toc) {
 604     return;
 605   }
 606
 607   // Delete existing TOC entries in case we're reloading the TOC.
 608   var tocEntriesToRemove = [];
 609   var i;
 610   for (i = 0; i < toc.childNodes.length; i++) {
 611     var entry = toc.childNodes[i];
 612     if (entry.nodeName.toLowerCase() == 'div'
 613      && entry.getAttribute("class")
 614      && entry.getAttribute("class").match(/^toclevel/))
 615       tocEntriesToRemove.push(entry);
 616   }
 617   for (i = 0; i < tocEntriesToRemove.length; i++) {
 618     toc.removeChild(tocEntriesToRemove[i]);
 619   }
 620
 621   // Rebuild TOC entries, giving id-less headings a generated "_toc_N" id.
 622   var entries = tocEntries(document.getElementById("content"), toclevels);
 623   for (var i = 0; i < entries.length; ++i) {
 624     var entry = entries[i];
 625     if (entry.element.id == "")
 626       entry.element.id = "_toc_" + i;
 627     var a = document.createElement("a");
 628     a.href = "#" + entry.element.id;
 629     a.appendChild(document.createTextNode(entry.text));
 630     var div = document.createElement("div");
 631     div.appendChild(a);
 632     div.className = "toclevel" + entry.toclevel;
 633     toc.appendChild(div);
 634   }
 635   if (entries.length == 0)  // No headings found: remove the empty #toc element.
 636     toc.parentNode.removeChild(toc);
 637 },
 638
 639
 640 /////////////////////////////////////////////////////////////////////
 641 // Footnotes generator
 642 /////////////////////////////////////////////////////////////////////
 643
 644 /* Based on footnote generation code from:
 645  * http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
 646  */
 647
 648 footnotes: function () {  // Rebuild the #footnotes list from "footnote" spans in #content.
 649   // Delete existing footnote entries in case we're reloading the footnotes.
 650   var i;
 651   var noteholder = document.getElementById("footnotes");
 652   if (!noteholder) {
 653     return;
 654   }
 655   var entriesToRemove = [];
 656   for (i = 0; i < noteholder.childNodes.length; i++) {
 657     var entry = noteholder.childNodes[i];
 658     if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote")
 659       entriesToRemove.push(entry);
 660   }
 661   for (i = 0; i < entriesToRemove.length; i++) {
 662     noteholder.removeChild(entriesToRemove[i]);
 663   }
 664
 665   // Rebuild footnote entries.
 666   var cont = document.getElementById("content");
 667   var spans = cont.getElementsByTagName("span");
 668   var refs = {};  // Maps "#" + span id to its footnote number.
 669   var n = 0;  // Running footnote number.
 670   for (i=0; i<spans.length; i++) {
 671     if (spans[i].className == "footnote") {
 672       n++;
 673       var note = spans[i].getAttribute("data-note");  // Stored below by an earlier pass, if any.
 674       if (!note) {
 675         // Use [\s\S] in place of . so multi-line matches work,
 676         // because older JavaScript engines lack the s (dotall) regex flag.
 677         note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
 678         spans[i].innerHTML =
 679           "[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
 680           "' title='View footnote' class='footnote'>" + n + "</a>]";
 681         spans[i].setAttribute("data-note", note);
 682       }
 683       noteholder.innerHTML +=
 684         "<div class='footnote' id='_footnote_" + n + "'>" +
 685         "<a href='#_footnoteref_" + n + "' title='Return to text'>" +
 686         n + "</a>. " + note + "</div>";
 687       var id =spans[i].getAttribute("id");
 688       if (id != null) refs["#"+id] = n;
 689     }
 690   }
 691   if (n == 0)  // No footnotes: remove the empty holder.
 692     noteholder.parentNode.removeChild(noteholder);
 693   else {
 694     // Process footnoterefs: relabel each with the number recorded for its target id.
 695     for (i=0; i<spans.length; i++) {
 696       if (spans[i].className == "footnoteref") {
 697         var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
 698         href = href.match(/#.*/)[0];  // Because IE returns the full URL.
 699         n = refs[href];
 700         spans[i].innerHTML =
 701           "[<a href='#_footnote_" + n +
 702           "' title='View footnote' class='footnote'>" + n + "</a>]";
 703       }
 704     }
 705   }
 706 },
 707
 708 install: function(toclevels) {  // Schedule footnote (and, if toclevels is truthy, TOC) generation.
 709   var timerId;
 710   // One generation pass: footnotes always, TOC only when toclevels was given.
 711   function reinstall() {
 712     asciidoc.footnotes();
 713     if (toclevels) {
 714       asciidoc.toc(toclevels);
 715     }
 716   }
 717   // Load handler: stop the polling timer, then run one last pass.
 718   function reinstallAndRemoveTimer() {
 719     clearInterval(timerId);
 720     reinstall();
 721   }
 722   // Regenerate every 500 ms until the load handler below cancels the timer.
 723   timerId = setInterval(reinstall, 500);
 724   if (document.addEventListener)
 725     document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false);
 726   else
 727     window.onload = reinstallAndRemoveTimer;
 728 }
 729
 730 }
 731 asciidoc.install();
 732 /*]]>*/
 733 </script>
 734 </head>
 735 <body class="article">
 736 <div id="header">
 737 <h1>How to recover an object from scratch</h1>
 738 <span id="revdate">2023-09-22</span>
 739 </div>
 740 <div id="content">
 741 <div id="preamble">
 742 <div class="sectionbody">
 743 <div class="paragraph"><p>I was recently presented with a repository with a corrupted packfile,
 744 and was asked if the data was recoverable. This post-mortem describes
 745 the steps I took to investigate and fix the problem. I thought others
 746 might find the process interesting, and it might help somebody in the
 747 same situation.</p></div>
 748 <div class="sidebarblock">
 749 <div class="content">
 750 <div class="paragraph"><p>Note: In this case, no good copy of the repository was available. For
 751 the much easier case where you can get the corrupted object from
 752 elsewhere, see <a href="recover-corrupted-blob-object.html">this howto</a>.</p></div>
 753 </div></div>
 754 <div class="paragraph"><p>I started with an fsck, which found a problem with exactly one object
 755 (I&#8217;ve used $pack and $obj below to keep the output readable, and also
 756 because I&#8217;ll refer to them later):</p></div>
 757 <div class="listingblock">
 758 <div class="content">
 759 <pre><code>    $ git fsck
 760     error: $pack SHA1 checksum mismatch
 761     error: index CRC mismatch for object $obj from $pack at offset 51653873
 762     error: inflate: data stream error (incorrect data check)
 763     error: cannot unpack $obj from $pack at offset 51653873</code></pre>
 764 </div></div>
 765 <div class="paragraph"><p>The pack checksum failing means a byte is munged somewhere, and it is
 766 presumably in the object mentioned (since both the index checksum and
 767 zlib were failing).</p></div>
 768 <div class="paragraph"><p>Reading the zlib source code, I found that "incorrect data check" means
 769 that the adler-32 checksum at the end of the zlib data did not match the
 770 inflated data. So stepping the data through zlib would not help, as it
 771 did not fail until the very end, when we realize the checksum does not match.
 772 The problematic bytes could be anywhere in the object data.</p></div>
 773 <div class="paragraph"><p>The first thing I did was pull the broken data out of the packfile. I
 774 needed to know how big the object was, which I found out with:</p></div>
 775 <div class="listingblock">
 776 <div class="content">
 777 <pre><code>    $ git show-index &lt;$idx | cut -d' ' -f1 | sort -n | grep -A1 51653873
 778     51653873
 779     51664736</code></pre>
 780 </div></div>
 781 <div class="paragraph"><p>Show-index gives us the list of objects and their offsets. We throw away
 782 everything but the offsets, and then sort them so that our interesting
 783 offset (which we got from the fsck output above) is followed immediately
 784 by the offset of the next object. Now we know that the object data is
 785 10863 bytes long, and we can grab it with:</p></div>
 786 <div class="listingblock">
 787 <div class="content">
 788 <pre><code>  dd if=$pack of=object bs=1 skip=51653873 count=10863</code></pre>
 789 </div></div>
 790 <div class="paragraph"><p>I inspected a hexdump of the data, looking for any obvious bogosity
 791 (e.g., a 4K run of zeroes would be a good sign of filesystem
 792 corruption). But everything looked pretty reasonable.</p></div>
 793 <div class="paragraph"><p>Note that the "object" file isn&#8217;t fit for feeding straight to zlib; it
 794 has the git packed object header, which is variable-length. We want to
 795 strip that off so we can start playing with the zlib data directly. You
 796 can either work your way through it manually (the format is described in
 797 <a href="../gitformat-pack.html">gitformat-pack(5)</a>),
 798 or you can walk through it in a debugger. I did the latter, creating a
 799 valid pack like:</p></div>
 800 <div class="listingblock">
 801 <div class="content">
 802 <pre><code>    # pack magic and version
 803     printf 'PACK\0\0\0\2' &gt;tmp.pack
 804     # pack has one object
 805     printf '\0\0\0\1' &gt;&gt;tmp.pack
 806     # now add our object data
 807     cat object &gt;&gt;tmp.pack
 808     # and then append the pack trailer
 809     /path/to/git.git/t/helper/test-tool sha1 -b &lt;tmp.pack &gt;trailer
 810     cat trailer &gt;&gt;tmp.pack</code></pre>
 811 </div></div>
 812 <div class="paragraph"><p>and then running "git index-pack tmp.pack" in the debugger (stop at
 813 unpack_raw_entry). Doing this, I found that there were 3 bytes of header
 814 (and the header itself had a sane type and size). So I stripped those
 815 off with:</p></div>
 816 <div class="listingblock">
 817 <div class="content">
 818 <pre><code>    dd if=object of=zlib bs=1 skip=3</code></pre>
 819 </div></div>
 820 <div class="paragraph"><p>I ran the result through zlib&#8217;s inflate using a custom C program. And
 821 while it did report the error, I did get the right number of output
 822 bytes (i.e., it matched git&#8217;s size header that we decoded above). But
 823 feeding the result back to "git hash-object" didn&#8217;t produce the same
 824 sha1. So there were some wrong bytes, but I didn&#8217;t know which. The file
 825 happened to be C source code, so I hoped I could notice something
 826 obviously wrong with it, but I didn&#8217;t. I even got it to compile!</p></div>
 827 <div class="paragraph"><p>I also tried comparing it to other versions of the same path in the
 828 repository, hoping that there would be some part of the diff that didn&#8217;t
 829 make sense. Unfortunately, this happened to be the only revision of this
 830 particular file in the repository, so I had nothing to compare against.</p></div>
 831 <div class="paragraph"><p>So I took a different approach. Working under the guess that the
 832 corruption was limited to a single byte, I wrote a program to munge each
 833 byte individually, and try inflating the result. Since the object was
 834 only 10K compressed, that worked out to about 2.5M attempts, which took
 835 a few minutes.</p></div>
 836 <div class="paragraph"><p>The program I used is here:</p></div>
 837 <div class="listingblock">
 838 <div class="content">
 839 <pre><code>#include &lt;stdio.h&gt;
 840 #include &lt;unistd.h&gt;
 841 #include &lt;string.h&gt;
 842 #include &lt;signal.h&gt;
 843 #include &lt;zlib.h&gt;
 844
 845 static int try_zlib(unsigned char *buf, int len)
 846 {
 847         /* make this absurdly large so we don't have to loop */
 848         static unsigned char out[1024*1024];
 849         z_stream z;
 850         int ret;
 851
 852         memset(&amp;z, 0, sizeof(z));
 853         inflateInit(&amp;z);
 854
 855         z.next_in = buf;
 856         z.avail_in = len;
 857         z.next_out = out;
 858         z.avail_out = sizeof(out);
 859
 860         ret = inflate(&amp;z, 0);
 861         inflateEnd(&amp;z);
 862         return ret &gt;= 0;
 863 }
 864
 865 /* eye candy */
 866 static int counter = 0;
 867 static void progress(int sig)
 868 {
 869         fprintf(stderr, "\r%d", counter);
 870         alarm(1);
 871 }
 872
 873 int main(void)
 874 {
 875         /* oversized so we can read the whole buffer in */
 876         unsigned char buf[1024*1024];
 877         int len;
 878         unsigned i, j;
 879
 880         signal(SIGALRM, progress);
 881         alarm(1);
 882
 883         len = read(0, buf, sizeof(buf));
 884         for (i = 0; i &lt; len; i++) {
 885                 unsigned char c = buf[i];
 886                 for (j = 0; j &lt;= 0xff; j++) {
 887                         buf[i] = j;
 888
 889                         counter++;
 890                         if (try_zlib(buf, len))
 891                                 printf("i=%d, j=%x\n", i, j);
 892                 }
 893                 buf[i] = c;
 894         }
 895
 896         alarm(0);
 897         fprintf(stderr, "\n");
 898         return 0;
 899 }</code></pre>
 900 </div></div>
 901 <div class="paragraph"><p>I compiled and ran with:</p></div>
 902 <div class="listingblock">
 903 <div class="content">
 904 <pre><code>  gcc -Wall -Werror -O3 munge.c -o munge -lz
 905   ./munge &lt;zlib</code></pre>
 906 </div></div>
 907 <div class="paragraph"><p>There were a few false positives early on (if you write "no data" in the
 908 zlib header, zlib thinks it&#8217;s just fine :) ). But I got a hit about
 909 halfway through:</p></div>
 910 <div class="listingblock">
 911 <div class="content">
 912 <pre><code>  i=5642, j=c7</code></pre>
 913 </div></div>
 914 <div class="paragraph"><p>I let it run to completion, and got a few more hits at the end (where it
 915 was munging the adler-32 checksum to match our broken data). So there was a good
 916 chance this middle hit was the source of the problem.</p></div>
 917 <div class="paragraph"><p>I confirmed by tweaking the byte in a hex editor, zlib inflating the
 918 result (no errors!), and then piping the output into "git hash-object",
 919 which reported the expected sha1 ($obj) of the previously broken object. Success!</p></div>
 920 <div class="paragraph"><p>I fixed the packfile itself with:</p></div>
 921 <div class="listingblock">
 922 <div class="content">
 923 <pre><code>  chmod +w $pack
 924   printf '\xc7' | dd of=$pack bs=1 seek=51659518 conv=notrunc
 925   chmod -w $pack</code></pre>
 926 </div></div>
 927 <div class="paragraph"><p>The <code>\xc7</code> comes from the replacement byte our "munge" program found.
 928 The offset 51659518 is derived by taking the original object offset
 929 (51653873), adding the replacement offset found by "munge" (5642), and
 930 then adding back in the 3 bytes of git header we stripped.</p></div>
 931 <div class="paragraph"><p>After that, "git fsck" ran clean.</p></div>
 932 <div class="paragraph"><p>As for the corruption itself, I was lucky that it was indeed a single
 933 byte. In fact, it turned out to be a single bit. The byte 0xc7 was
 934 corrupted to 0xc5. So presumably it was caused by faulty hardware, or a
 935 cosmic ray.</p></div>
 936 <div class="paragraph"><p>And the aborted attempt to look at the inflated output to see what was
 937 wrong? I could have looked forever and never found it. Here&#8217;s the diff
 938 between what the corrupted data inflates to, versus the real data:</p></div>
 939 <div class="listingblock">
 940 <div class="content">
 941 <pre><code>  -       cp = strtok (arg, "+");
 942   +       cp = strtok (arg, ".");</code></pre>
 943 </div></div>
 944 <div class="paragraph"><p>It tweaked one byte and still ended up as valid, readable C that just
 945 happened to do something totally different! One takeaway is that on a
 946 less unlucky day, looking at the zlib output might have actually been
 947 helpful, as most random changes would actually break the C code.</p></div>
 948 <div class="paragraph"><p>But more importantly, git&#8217;s hashing and checksumming noticed a problem
 949 that easily could have gone undetected in another system. The result
 950 still compiled, but would have caused an interesting bug (that would
 951 have been blamed on some random commit).</p></div>
 952 </div>
 953 </div>
 954 <div class="sect1">
 955 <h2 id="_the_adventure_continues_8230">The adventure continues&#8230;</h2>
 956 <div class="sectionbody">
 957 <div class="paragraph"><p>I ended up doing this again! Same entity, new hardware. The assumption
 958 at this point is that the old disk corrupted the packfile, and then the
 959 corruption was migrated to the new hardware (because it was done by
 960 rsync or similar, and no fsck was done at the time of migration).</p></div>
 961 <div class="paragraph"><p>This time, the affected blob was over 20 megabytes, which was far too
 962 large to do a brute-force on. I followed the instructions above to
 963 create the <code>zlib</code> file. I then used the <code>inflate</code> program below to pull
 964 the corrupted data from that. Examining that output gave me a hint about
 965 where in the file the corruption was. But now I was working with the
 966 file itself, not the zlib contents. So knowing the sha1 of the object
 967 and the approximate area of the corruption, I used the <code>sha1-munge</code>
 968 program below to brute-force the correct byte.</p></div>
 969 <div class="paragraph"><p>Here&#8217;s the inflate program (it&#8217;s essentially <code>gunzip</code> but without the
 970 <code>.gz</code> header processing):</p></div>
 971 <div class="listingblock">
 972 <div class="content">
 973 <pre><code>#include &lt;stdio.h&gt;
 974 #include &lt;string.h&gt;
 975 #include &lt;zlib.h&gt;
 976 #include &lt;stdlib.h&gt;
     #include &lt;unistd.h&gt;
 977
 978 int main(int argc, char **argv)
 979 {
 980         /*
 981          * oversized so we can read the whole buffer in;
 982          * this could actually be switched to streaming
 983          * to avoid any memory limitations
 984          */
 985         static unsigned char buf[25 * 1024 * 1024];
 986         static unsigned char out[25 * 1024 * 1024];
 987         int len;
 988         z_stream z;
 989         int ret;
 990
 991         len = read(0, buf, sizeof(buf));
 992         memset(&amp;z, 0, sizeof(z));
 993         inflateInit(&amp;z);
 994
 995         z.next_in = buf;
 996         z.avail_in = len;
 997         z.next_out = out;
 998         z.avail_out = sizeof(out);
 999
1000         ret = inflate(&amp;z, 0);
1001         if (ret != Z_OK &amp;&amp; ret != Z_STREAM_END)
1002                 fprintf(stderr, "initial inflate failed (%d)\n", ret);
1003
1004         fprintf(stderr, "outputting %lu bytes", z.total_out);
1005         fwrite(out, 1, z.total_out, stdout);
1006         return 0;
1007 }</code></pre>
1008 </div></div>
1009 <div class="paragraph"><p>And here is the <code>sha1-munge</code> program:</p></div>
1010 <div class="listingblock">
1011 <div class="content">
1012 <pre><code>#include &lt;stdio.h&gt;
1013 #include &lt;unistd.h&gt;
1014 #include &lt;string.h&gt;
1015 #include &lt;signal.h&gt;
1016 #include &lt;openssl/sha.h&gt;
1017 #include &lt;stdlib.h&gt;
1018
1019 /* eye candy: SIGALRM handler that shows how many offsets have been tried */
1020 static int counter = 0; /* bumped by main() after each byte offset is tried */
1021 static void progress(int sig) /* sig is unused */
1022 { /* NOTE(review): fprintf() is not async-signal-safe; tolerable only as eye candy */
1023         fprintf(stderr, "\r%d", counter); /* "\r" returns to column 0 so the count overwrites itself */
1024         alarm(1); /* re-arm the one-second timer */
1025 }
1026
1027 static const signed char hexval_table[256] = { /* byte value to hex digit value; -1 marks a non-hex byte */
1028          -1, -1, -1, -1, -1, -1, -1, -1,                /* 00-07 */
1029          -1, -1, -1, -1, -1, -1, -1, -1,                /* 08-0f */
1030          -1, -1, -1, -1, -1, -1, -1, -1,                /* 10-17 */
1031          -1, -1, -1, -1, -1, -1, -1, -1,                /* 18-1f */
1032          -1, -1, -1, -1, -1, -1, -1, -1,                /* 20-27 */
1033          -1, -1, -1, -1, -1, -1, -1, -1,                /* 28-2f */
1034           0,  1,  2,  3,  4,  5,  6,  7,                /* 30-37 */
1035           8,  9, -1, -1, -1, -1, -1, -1,                /* 38-3f */
1036          -1, 10, 11, 12, 13, 14, 15, -1,                /* 40-47 */
1037          -1, -1, -1, -1, -1, -1, -1, -1,                /* 48-4f */
1038          -1, -1, -1, -1, -1, -1, -1, -1,                /* 50-57 */
1039          -1, -1, -1, -1, -1, -1, -1, -1,                /* 58-5f */
1040          -1, 10, 11, 12, 13, 14, 15, -1,                /* 60-67 */
1041          -1, -1, -1, -1, -1, -1, -1, -1,                /* 68-6f */
1042          -1, -1, -1, -1, -1, -1, -1, -1,                /* 70-77 */
1043          -1, -1, -1, -1, -1, -1, -1, -1,                /* 78-7f */
1044          -1, -1, -1, -1, -1, -1, -1, -1,                /* 80-87 */
1045          -1, -1, -1, -1, -1, -1, -1, -1,                /* 88-8f */
1046          -1, -1, -1, -1, -1, -1, -1, -1,                /* 90-97 */
1047          -1, -1, -1, -1, -1, -1, -1, -1,                /* 98-9f */
1048          -1, -1, -1, -1, -1, -1, -1, -1,                /* a0-a7 */
1049          -1, -1, -1, -1, -1, -1, -1, -1,                /* a8-af */
1050          -1, -1, -1, -1, -1, -1, -1, -1,                /* b0-b7 */
1051          -1, -1, -1, -1, -1, -1, -1, -1,                /* b8-bf */
1052          -1, -1, -1, -1, -1, -1, -1, -1,                /* c0-c7 */
1053          -1, -1, -1, -1, -1, -1, -1, -1,                /* c8-cf */
1054          -1, -1, -1, -1, -1, -1, -1, -1,                /* d0-d7 */
1055          -1, -1, -1, -1, -1, -1, -1, -1,                /* d8-df */
1056          -1, -1, -1, -1, -1, -1, -1, -1,                /* e0-e7 */
1057          -1, -1, -1, -1, -1, -1, -1, -1,                /* e8-ef */
1058          -1, -1, -1, -1, -1, -1, -1, -1,                /* f0-f7 */
1059          -1, -1, -1, -1, -1, -1, -1, -1,                /* f8-ff */
1060 };
1061
1062 static inline unsigned int hexval(unsigned char c) /* hex digit value of byte c */
1063 {
1064 return hexval_table[c]; /* a -1 entry (not a hex digit) converts to all-ones, which get_sha1_hex() rejects */
1065 }
1066
1067 static int get_sha1_hex(const char *hex, unsigned char *sha1)
1068 { /* parse 40 hex digits from hex into 20 bytes at sha1; returns 0 on success, -1 on bad input */
1069         int i;
1070         for (i = 0; i &lt; 20; i++) { /* one output byte per pair of digits; text after digit 40 is ignored */
1071                 unsigned int val;
1072                 /*
1073                  * hex[1]=='\0' is caught when val is checked below,
1074                  * but if hex[0] is NUL we have to avoid reading
1075                  * past the end of the string:
1076                  */
1077                 if (!hex[0])
1078                         return -1;
1079                 val = (hexval(hex[0]) &lt;&lt; 4) | hexval(hex[1]);
1080                 if (val &amp; ~0xff) /* a non-hex digit made hexval() return all-ones, setting high bits */
1081                         return -1;
1082                 *sha1++ = val;
1083                 hex += 2;
1084         }
1085         return 0;
1086 }
1087
1088 /* Try all 256 values for every byte of the blob on stdin, from offset argv[2] */
1089 /* (default 0) on, and print each offset/value whose object sha1 equals argv[1]. */
1090 int main(int argc, char **argv)
1091 {
1092         /* oversized so we can read the whole buffer in */
1093         static unsigned char buf[25 * 1024 * 1024];
1094         char header[32];
1095         unsigned char have[20], want[20];
1096         int header_len, start, len;
1097         SHA_CTX orig;
1098         unsigned i, j;
1099
1100         if (!argv[1] || get_sha1_hex(argv[1], want)) {
1101                 fprintf(stderr, "usage: sha1-munge &lt;sha1&gt; [start] &lt;file.in\n");
1102                 return 1;
1103         }
1104         start = argv[2] ? atoi(argv[2]) : 0;
1105         len = read(0, buf, sizeof(buf));
1106         if (len &lt; 0 || start &lt; 0 || start &gt; len) { /* keeps the unsigned loop index in bounds */
1107                 fprintf(stderr, "read failed or start offset out of range\n");
1108                 return 1;
1109         }
1110         header_len = sprintf(header, "blob %d", len) + 1; /* +1 so the terminating NUL is hashed too */
1111         fprintf(stderr, "using header: %s\n", header);
1112
1113         /*
1114          * We keep a running sha1 so that if you are munging
1115          * near the end of the file, we do not have to re-sha1
1116          * the unchanged earlier bytes
1117          */
1118         SHA1_Init(&amp;orig);
1119         SHA1_Update(&amp;orig, header, header_len);
1120         if (start)
1121                 SHA1_Update(&amp;orig, buf, start);
1122
1123         signal(SIGALRM, progress);
1124         alarm(1);
1125
1126         for (i = start; i &lt; len; i++) {
1127                 unsigned char c;
1128                 SHA_CTX x;
1129
1130 #if 0
1131                 /*
1132                  * deletion -- this would not actually work in practice,
1133                  * I think, because we've already committed to a
1134                  * particular size in the header. Ditto for addition
1135                  * below. In those cases, you'd have to do the whole
1136                  * sha1 from scratch, or possibly keep three running
1137                  * "orig" sha1 computations going.
1138                  */
1139                 memcpy(&amp;x, &amp;orig, sizeof(x));
1140                 SHA1_Update(&amp;x, buf + i + 1, len - i - 1);
1141                 SHA1_Final(have, &amp;x);
1142                 if (!memcmp(have, want, 20))
1143                         printf("i=%d, deletion\n", i);
1144 #endif
1145
1146                 /*
1147                  * replacement -- note that this tries each of the 256
1148                  * possible bytes. If you suspect a single-bit flip,
1149                  * it would be much shorter to just try the 8
1150                  * bit-flipped variants.
1151                  */
1152                 c = buf[i];
1153                 for (j = 0; j &lt;= 0xff; j++) {
1154                         buf[i] = j;
1155
1156                         memcpy(&amp;x, &amp;orig, sizeof(x)); /* restart from the already-hashed prefix */
1157                         SHA1_Update(&amp;x, buf + i, len - i);
1158                         SHA1_Final(have, &amp;x);
1159                         if (!memcmp(have, want, 20))
1160                                 printf("i=%d, j=%02x\n", i, j);
1161                 }
1162                 buf[i] = c; /* put the original byte back before moving on */
1163
1164 #if 0
1165                 /* addition */
1166                 for (j = 0; j &lt;= 0xff; j++) {
1167                         unsigned char extra = j;
1168                         memcpy(&amp;x, &amp;orig, sizeof(x));
1169                         SHA1_Update(&amp;x, &amp;extra, 1);
1170                         SHA1_Update(&amp;x, buf + i, len - i);
1171                         SHA1_Final(have, &amp;x);
1172                         if (!memcmp(have, want, 20))
1173                                 printf("i=%d, addition=%02x\n", i, j);
1174                 }
1175 #endif
1176
1177                 SHA1_Update(&amp;orig, buf + i, 1); /* byte i is settled; extend the running prefix */
1178                 counter++;
1179         }
1180
1181         alarm(0);
1182         fprintf(stderr, "\r%d\n", counter);
1183         return 0;
1184 }</code></pre>
1185 </div></div>
1186 </div>
1187 </div>
1188 </div>
1189 <div id="footnotes"><hr /></div>
1190 <div id="footer">
1191 <div id="footer-text">
1192 Last updated
1193  2023-09-22 17:05:17 PDT
1194 </div>
1195 </div>
1196 </body>
1197 </html>