courgette/third_party/bsdiff_create.cc

   1 /*
   2   bsdiff.c -- Binary patch generator.
   3
   4   Copyright 2003 Colin Percival
   5
   6   For the terms under which this work may be distributed, please see
   7   the adjoining file "LICENSE".
   8
   9   ChangeLog:
  10   2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
  11                values throughout.
  12                  --Benjamin Smedberg <benjamin@smedbergs.us>
  13   2005-05-18 - Use the same CRC algorithm as bzip2, and leverage the CRC table
  14                provided by libbz2.
  15                  --Darin Fisher <darin@meer.net>
  16   2007-11-14 - Changed to use Crc from Lzma library instead of Bzip library
  17                  --Rahul Kuchhal
  18   2009-03-31 - Change to use Streams.  Added lots of comments.
  19                  --Stephen Adams <sra@chromium.org>
  20   2010-05-26 - Use a paged array for V and I. The address space may be too
  21                fragmented for these big arrays to be contiguous.
  22                  --Stephen Adams <sra@chromium.org>
  23   2015-08-03 - Extract qsufsort portion to a separate file.
  24                  --Samuel Huang <huangs@chromium.org>
  25 */
  26
  27 #include "courgette/third_party/bsdiff.h"
  28
  29 #include <stdlib.h>
  30 #include <algorithm>
  31
  32 #include "base/logging.h"
  33 #include "base/memory/scoped_ptr.h"
  34 #include "base/strings/string_util.h"
  35 #include "base/time/time.h"
  36
  37 #include "courgette/crc.h"
  38 #include "courgette/streams.h"
  39 #include "courgette/third_party/paged_array.h"
  40 #include "courgette/third_party/qsufsort.h"
  41
  42 namespace courgette {
  43
  44 static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {
       // Emits the patch header to |stream|: the raw tag bytes, then |slen|,
       // |scrc32| and |dlen| as varints.  Every field is attempted even when an
       // earlier write failed; the result is true only if all four succeeded.
  45   const bool wrote_tag = stream->Write(header->tag, sizeof(header->tag));
  46   const bool wrote_slen = stream->WriteVarint32(header->slen);
  47   const bool wrote_scrc32 = stream->WriteVarint32(header->scrc32);
  48   const bool wrote_dlen = stream->WriteVarint32(header->dlen);
  49   return wrote_tag && wrote_slen && wrote_scrc32 && wrote_dlen;
  50 }
  51
  52 BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
  53                                SourceStream* new_stream,
  54                                SinkStream* patch_stream)
  55 {
       // Generates a bsdiff-style binary patch that expresses the remaining bytes
       // of |new_stream| in terms of the remaining bytes of |old_stream|, and
       // writes it to |patch_stream|.
       //
       // The output is a header (tag, old length, CRC of old, new length)
       // followed by six sub-streams gathered in a SinkStreamSet: copy counts,
       // extra counts, seek adjustments, diff skips, diff bytes and extra bytes.
       //
       // Returns MEM_ERROR if I[] or V[] cannot be allocated or if any stream
       // write fails; returns OK otherwise.
       //
       // NOTE(review): both input sizes are narrowed from Remaining() to int
       // with no range check -- presumably callers never pass inputs larger
       // than INT_MAX bytes; confirm.

       // Captured up front so the closing VLOG can report elapsed time and the
       // number of bytes this call added to |patch_stream|.
  56   base::Time start_bsdiff_time = base::Time::Now();
  57   VLOG(1) << "Start bsdiff";
  58   size_t initial_patch_stream_length = patch_stream->Length();
  59
       // The patch body is accumulated in six separate sub-streams which are
       // copied into |patch_stream| in one go after the header has been written.
  60   SinkStreamSet patch_streams;
  61   SinkStream* control_stream_copy_counts = patch_streams.stream(0);
  62   SinkStream* control_stream_extra_counts = patch_streams.stream(1);
  63   SinkStream* control_stream_seeks = patch_streams.stream(2);
  64   SinkStream* diff_skips = patch_streams.stream(3);
  65   SinkStream* diff_bytes = patch_streams.stream(4);
  66   SinkStream* extra_bytes = patch_streams.stream(5);
  67
  68   const uint8* old = old_stream->Buffer();
  69   const int oldsize = static_cast<int>(old_stream->Remaining());
  70
       // Run length of zero diff bytes seen since the last non-zero diff byte.
       // It is written to |diff_skips| before every non-zero diff byte and once
       // more after the main loop, and it carries over from one triple to the
       // next.
  71   uint32 pending_diff_zeros = 0;
  72
       // Work arrays handed to qsufsort.  V is cleared as soon as sorting is
       // done; I is kept because qsuf::search() reads it in the main loop.
  73   PagedArray<int> I;
  74   PagedArray<int> V;
  75
  76   if (!I.Allocate(oldsize + 1)) {
  77     LOG(ERROR) << "Could not allocate I[], " << ((oldsize + 1) * sizeof(int))
  78                << " bytes";
  79     return MEM_ERROR;
  80   }
  81
  82   if (!V.Allocate(oldsize + 1)) {
  83     LOG(ERROR) << "Could not allocate V[], " << ((oldsize + 1) * sizeof(int))
  84                << " bytes";
  85     return MEM_ERROR;
  86   }
  87
  88   base::Time q_start_time = base::Time::Now();
  89   qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);
  90   VLOG(1) << " done qsufsort "
  91           << (base::Time::Now() - q_start_time).InSecondsF();
  92   V.clear();
  93
  94   const uint8* newbuf = new_stream->Buffer();
  95   const int newsize = static_cast<int>(new_stream->Remaining());
  96
       // Statistics, used only by the VLOG at the end of the function.
  97   int control_length = 0;
  98   int diff_bytes_length = 0;
  99   int diff_bytes_nonzero = 0;
 100   int extra_bytes_length = 0;
 101
 102   // The patch format is a sequence of triples <copy,extra,seek> where 'copy' is
 103   // the number of bytes to copy from the old file (possibly with mistakes),
 104   // 'extra' is the number of bytes to copy from a stream of fresh bytes, and
 105   // 'seek' is an offset to move to the position to copy for the next triple.
 106   //
 107   // The invariant at the top of this loop is that we are committed to emitting
 108   // a triple for the part of |newbuf| surrounding a 'seed' match near
 109   // |lastscan|.  We are searching for a second match that will be the 'seed' of
 110   // the next triple.  As we scan through |newbuf|, one of four things can
 111   // happen at the current position |scan|:
 112   //
 113   //  1. We find a nice match that appears to be consistent with the current
 114   //     seed.  Continue scanning.  It is likely that this match will become
 115   //     part of the 'copy'.
 116   //
 117   //  2. We find a match which does much better than extending the current
 118   //     seed's old match.  Emit a triple for the current seed and take this
 119   //     match as the new seed for a new triple.  By 'much better' we mean that
 120   //     taking the new seed removes more than 8 mismatched bytes.
 121   //
 122   //  3. There is not a good match.  Continue scanning.  These bytes will likely
 123   //     become part of the 'extra'.
 124   //
 125   //  4. There is no match because we reached the end of the input, |newbuf|.
 126
 127   // This is how the loop advances through the bytes of |newbuf|:
 128   //
 129   // ...012345678901234567890123456789...
 130   //    ssssssssss                      Seed at |lastscan|
 131   //              xxyyyxxyyxy           |scan| forward, cases (3)(x) & (1)(y)
 132   //                         mmmmmmmm   New match will start new seed case (2).
 133   //    fffffffffffffff                 |lenf| = scan forward from |lastscan|
 134   //                     bbbb           |lenb| = scan back from new seed |scan|.
 135   //    ddddddddddddddd                 Emit diff bytes for the 'copy'.
 136   //                   xx               Emit extra bytes.
 137   //                     ssssssssssss   |lastscan = scan - lenb| is new seed.
 138   //                                 x  Cases (1) and (3) ....
 139
 140
       // |lastscan| and |lastpos| are the start of the current seed in |newbuf|
       // and in |old| respectively; |lastoffset| is always |lastpos - lastscan|.
 141   int lastscan = 0, lastpos = 0, lastoffset = 0;
 142
       // |scan| is the current position in |newbuf|; |match_length| is the value
       // returned by the most recent qsuf::search() call (0 before the first).
 143   int scan = 0;
 144   int match_length = 0;
 145
 146   while (scan < newsize) {
           // Filled in by qsuf::search(); used below as an index into |old|.
 147     int pos = 0;
 148     int oldscore = 0;  // Count of how many bytes of the current match at |scan|
 149                        // extend the match at |lastscan|.
 150
           // Step over the match found on the previous pass before searching again.
 151     scan += match_length;
           // |scsc| marks how far |oldscore| has been accumulated; it only moves
           // forward, so each position in |newbuf| is scored once per outer pass.
 152     for (int scsc = scan;  scan < newsize;  ++scan) {
 153       match_length = qsuf::search<PagedArray<int>&>(
 154           I, old, oldsize, newbuf + scan, newsize - scan, 0, oldsize, &pos);
 155
             // Count the bytes of the new match that the current seed's offset
             // would also have reproduced correctly.
 156       for ( ; scsc < scan + match_length ; scsc++)
 157         if ((scsc + lastoffset < oldsize) &&
 158             (old[scsc + lastoffset] == newbuf[scsc]))
 159           oldscore++;
 160
 161       if ((match_length == oldscore) && (match_length != 0))
 162         break;  // Good continuing match, case (1)
 163       if (match_length > oldscore + 8)
 164         break;  // New seed match, case (2)
 165
             // |scan| is about to advance, so position |scan| drops out of the
             // scored window: discount it if it agrees with |old| at |lastoffset|.
 166       if ((scan + lastoffset < oldsize) &&
 167           (old[scan + lastoffset] == newbuf[scan]))
 168         oldscore--;
 169       // Case (3) continues in this loop until we fall out of the loop (4).
 170     }
 171
 172     if ((match_length != oldscore) || (scan == newsize)) {  // Cases (2) and (4)
 173       // This next chunk of code finds the boundary between the bytes to be
 174       // copied as part of the current triple, and the bytes to be copied as
 175       // part of the next triple.  The |lastscan| match is extended forwards as
 176       // far as possible provided doing so does not add too many mistakes.  The
 177       // |scan| match is extended backwards in a similar way.
 178
 179       // Extend the current match (if any) backwards.  |lenb| is the maximal
 180       // extension for which less than half the byte positions in the extension
 181       // are wrong.
 182       int lenb = 0;
 183       if (scan < newsize) {  // i.e. not case (4); there is a match to extend.
 184         int score = 0, Sb = 0;
               // Walk backwards without going before |lastscan| in |newbuf| or
               // before index 0 in |old|.
 185         for (int i = 1;  (scan >= lastscan + i) && (pos >= i);  i++) {
 186           if (old[pos - i] == newbuf[scan - i]) score++;
 187           if (score*2 - i > Sb*2 - lenb) { Sb = score; lenb = i; }
 188         }
 189       }
 190
 191       // Extend the lastscan match forward; |lenf| is the maximal extension for
 192       // which less than half of the byte positions in the entire lastscan match
 193       // are wrong.  There is a subtle point here: |lastscan| points to before
 194       // the seed match by |lenb| bytes from the previous iteration.  This is
 195       // why the loop measures the total number of mistakes in the match, not
 196       // just those from the seed match.
 197       int lenf = 0;
 198       {
 199         int score = 0, Sf = 0;
               // Walk forwards without reaching |scan| in |newbuf| or running off
               // the end of |old|.
 200         for (int i = 0;  (lastscan + i < scan) && (lastpos + i < oldsize);  ) {
 201           if (old[lastpos + i] == newbuf[lastscan + i]) score++;
 202           i++;
 203           if (score*2 - i > Sf*2 - lenf) { Sf = score; lenf = i; }
 204         }
 205       }
 206
 207       // If the extended scans overlap, pick a position in the overlap region
 208       // that maximizes the exact matching bytes.
 209       if (lastscan + lenf > scan - lenb) {
 210         int overlap = (lastscan + lenf) - (scan - lenb);
 211         int score = 0;
 212         int Ss = 0, lens = 0;
               // |score| is (matches under the old alignment) minus (matches under
               // the new alignment) over the first i + 1 overlap bytes; |lens| is
               // the prefix length at which that difference peaks.
 213         for (int i = 0;  i < overlap;  i++) {
 214           if (newbuf[lastscan + lenf - overlap + i] ==
 215               old[lastpos + lenf - overlap + i]) score++;
 216           if (newbuf[scan - lenb + i] ==  old[pos - lenb + i]) score--;
 217           if (score > Ss) { Ss = score; lens = i + 1; }
 218         }
 219
               // Give the first |lens| overlap bytes to the forward extension and
               // the rest to the backward one, so the two regions now meet exactly.
 220         lenf += lens - overlap;
 221         lenb -= lens;
 222       };
 223
             // Emit the 'copy' part as byte-wise differences (new - old, truncated
             // to 8 bits).  Only non-zero differences are stored in |diff_bytes|;
             // each is preceded in |diff_skips| by the count of zero differences
             // that came before it.
 224       for (int i = 0;  i < lenf;  i++) {
 225         uint8 diff_byte = newbuf[lastscan + i] - old[lastpos + i];
 226         if (diff_byte) {
 227           ++diff_bytes_nonzero;
 228           if (!diff_skips->WriteVarint32(pending_diff_zeros))
 229             return MEM_ERROR;
 230           pending_diff_zeros = 0;
 231           if (!diff_bytes->Write(&diff_byte, 1))
 232             return MEM_ERROR;
 233         } else {
 234           ++pending_diff_zeros;
 235         }
 236       }
             // |gap| is the stretch of |newbuf| between the end of the forward
             // extension and the start of the backward extension; those bytes are
             // emitted verbatim as the 'extra' part of the triple.
 237       int gap = (scan - lenb) - (lastscan + lenf);
 238       for (int i = 0;  i < gap;  i++) {
 239         if (!extra_bytes->Write(&newbuf[lastscan + lenf + i], 1))
 240           return MEM_ERROR;
 241       }
 242
 243       diff_bytes_length += lenf;
 244       extra_bytes_length += gap;
 245
 246       uint32 copy_count = lenf;
 247       uint32 extra_count = gap;
             // Signed distance in |old| from where this copy ends (lastpos + lenf)
             // to where the next triple's copy begins (pos - lenb).
 248       int32 seek_adjustment = ((pos - lenb) - (lastpos + lenf));
 249
 250       if (!control_stream_copy_counts->WriteVarint32(copy_count) ||
 251           !control_stream_extra_counts->WriteVarint32(extra_count) ||
 252           !control_stream_seeks->WriteVarint32Signed(seek_adjustment)) {
 253         return MEM_ERROR;
 254       }
 255
 256       ++control_length;
 257 #ifdef DEBUG_bsmedberg
 258       VLOG(1) << StringPrintf("Writing a block:  copy: %-8u extra: %-8u seek: "
 259                               "%+-9d", copy_count, extra_count,
 260                               seek_adjustment);
 261 #endif
 262
 263       lastscan = scan - lenb;   // Include the backward extension in seed.
 264       lastpos = pos - lenb;     //  ditto.
 265       lastoffset = lastpos - lastscan;
 266     }
 267   }
 268
       // Flush the run of zero diff bytes that followed the last non-zero one.
 269   if (!diff_skips->WriteVarint32(pending_diff_zeros))
 270     return MEM_ERROR;
 271
 272   I.clear();
 273
 274   MBSPatchHeader header;
 275   // The string will have a null terminator that we don't use, hence '-1'.
 276   static_assert(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header.tag),
 277                 "MBS_PATCH_HEADER_TAG must match header field size");
 278   memcpy(header.tag, MBS_PATCH_HEADER_TAG, sizeof(header.tag));
 279   header.slen     = oldsize;
 280   header.scrc32   = CalculateCrc(old, oldsize);
 281   header.dlen     = newsize;
 282
 283   if (!WriteHeader(patch_stream, &header))
 284     return MEM_ERROR;
 285
       // NOTE(review): the length is sampled before CopyTo(), presumably because
       // CopyTo() may empty the sub-streams -- confirm against SinkStreamSet.
 286   size_t diff_skips_length = diff_skips->Length();
 287   if (!patch_streams.CopyTo(patch_stream))
 288     return MEM_ERROR;
 289
 290   VLOG(1) << "Control tuples: " << control_length
 291           << "  copy bytes: " << diff_bytes_length
 292           << "  mistakes: " << diff_bytes_nonzero
 293           << "  (skips: " << diff_skips_length << ")"
 294           << "  extra bytes: " << extra_bytes_length
 295           << "\nUncompressed bsdiff patch size "
 296           << patch_stream->Length() - initial_patch_stream_length
 297           << "\nEnd bsdiff "
 298           << (base::Time::Now() - start_bsdiff_time).InSecondsF();
 299
 300   return OK;
 301 }
 302
 303 }  // namespace courgette