courgette/third_party/bsdiff_create.cc

   1 /*
   2   bsdiff.c -- Binary patch generator.
   3
   4   Copyright 2003 Colin Percival
   5
   6   For the terms under which this work may be distributed, please see
   7   the adjoining file "LICENSE".
   8
   9   ChangeLog:
  10   2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
  11                values throughout.
  12                  --Benjamin Smedberg <benjamin@smedbergs.us>
  13   2005-05-18 - Use the same CRC algorithm as bzip2, and leverage the CRC table
  14                provided by libbz2.
  15                  --Darin Fisher <darin@meer.net>
  16   2007-11-14 - Changed to use Crc from Lzma library instead of Bzip library
  17                  --Rahul Kuchhal
  18   2009-03-31 - Change to use Streams.  Added lots of comments.
  19                  --Stephen Adams <sra@chromium.org>
  20   2010-05-26 - Use a paged array for V and I. The address space may be too
  21                fragmented for these big arrays to be contiguous.
  22                  --Stephen Adams <sra@chromium.org>
  23   2015-08-03 - Extract qsufsort portion to a separate file.
  24                  --Samuel Huang <huangs@chromium.org>
  25   2015-08-12 - Interface change to qsufsort search().
  26                  --Samuel Huang <huangs@chromium.org>
  27 */
  28
  29 #include "courgette/third_party/bsdiff.h"
  30
  31 #include <stdlib.h>
  32 #include <algorithm>
  33
  34 #include "base/logging.h"
  35 #include "base/memory/scoped_ptr.h"
  36 #include "base/strings/string_util.h"
  37 #include "base/time/time.h"
  38
  39 #include "courgette/crc.h"
  40 #include "courgette/streams.h"
  41 #include "courgette/third_party/paged_array.h"
  42 #include "courgette/third_party/qsufsort.h"
  43
  44 namespace courgette {
  45
  46 static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {
  47   bool ok = stream->Write(header->tag, sizeof(header->tag));
  48   ok &= stream->WriteVarint32(header->slen);
  49   ok &= stream->WriteVarint32(header->scrc32);
  50   ok &= stream->WriteVarint32(header->dlen);
  51   return ok;
  52 }
  53
  54 BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
  55                                SourceStream* new_stream,
  56                                SinkStream* patch_stream)
  57 {
  58   base::Time start_bsdiff_time = base::Time::Now();
  59   VLOG(1) << "Start bsdiff";
  60   size_t initial_patch_stream_length = patch_stream->Length();
  61
  62   SinkStreamSet patch_streams;
  63   SinkStream* control_stream_copy_counts = patch_streams.stream(0);
  64   SinkStream* control_stream_extra_counts = patch_streams.stream(1);
  65   SinkStream* control_stream_seeks = patch_streams.stream(2);
  66   SinkStream* diff_skips = patch_streams.stream(3);
  67   SinkStream* diff_bytes = patch_streams.stream(4);
  68   SinkStream* extra_bytes = patch_streams.stream(5);
  69
  70   const uint8* old = old_stream->Buffer();
  71   const int oldsize = static_cast<int>(old_stream->Remaining());
  72
  73   uint32 pending_diff_zeros = 0;
  74
  75   PagedArray<int> I;
  76   PagedArray<int> V;
  77
  78   if (!I.Allocate(oldsize + 1)) {
  79     LOG(ERROR) << "Could not allocate I[], " << ((oldsize + 1) * sizeof(int))
  80                << " bytes";
  81     return MEM_ERROR;
  82   }
  83
  84   if (!V.Allocate(oldsize + 1)) {
  85     LOG(ERROR) << "Could not allocate V[], " << ((oldsize + 1) * sizeof(int))
  86                << " bytes";
  87     return MEM_ERROR;
  88   }
  89
  90   base::Time q_start_time = base::Time::Now();
  91   qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);
  92   VLOG(1) << " done qsufsort "
  93           << (base::Time::Now() - q_start_time).InSecondsF();
  94   V.clear();
  95
  96   const uint8* newbuf = new_stream->Buffer();
  97   const int newsize = static_cast<int>(new_stream->Remaining());
  98
  99   int control_length = 0;
 100   int diff_bytes_length = 0;
 101   int diff_bytes_nonzero = 0;
 102   int extra_bytes_length = 0;
 103
 104   // The patch format is a sequence of triples <copy,extra,seek> where 'copy' is
 105   // the number of bytes to copy from the old file (possibly with mistakes),
 106   // 'extra' is the number of bytes to copy from a stream of fresh bytes, and
 107   // 'seek' is an offset to move to the position to copy for the next triple.
 108   //
 109   // The invariant at the top of this loop is that we are committed to emitting
 110   // a triple for the part of |newbuf| surrounding a 'seed' match near
 111   // |lastscan|.  We are searching for a second match that will be the 'seed' of
 112   // the next triple.  As we scan through |newbuf|, one of four things can
 113   // happen at the current position |scan|:
 114   //
 115   //  1. We find a nice match that appears to be consistent with the current
 116   //     seed.  Continue scanning.  It is likely that this match will become
 117   //     part of the 'copy'.
 118   //
 119   //  2. We find match which does much better than extending the current seed
 120   //     old match.  Emit a triple for the current seed and take this match as
 121   //     the new seed for a new triple.  By 'much better' we remove 8 mismatched
 122   //     bytes by taking the new seed.
 123   //
 124   //  3. There is not a good match.  Continue scanning.  These bytes will likely
 125   //     become part of the 'extra'.
 126   //
 127   //  4. There is no match because we reached the end of the input, |newbuf|.
 128
 129   // This is how the loop advances through the bytes of |newbuf|:
 130   //
 131   // ...012345678901234567890123456789...
 132   //    ssssssssss                      Seed at |lastscan|
 133   //              xxyyyxxyyxy           |scan| forward, cases (3)(x) & (1)(y)
 134   //                         mmmmmmmm   New match will start new seed case (2).
 135   //    fffffffffffffff                 |lenf| = scan forward from |lastscan|
 136   //                     bbbb           |lenb| = scan back from new seed |scan|.
 137   //    ddddddddddddddd                 Emit diff bytes for the 'copy'.
 138   //                   xx               Emit extra bytes.
 139   //                     ssssssssssss   |lastscan = scan - lenb| is new seed.
 140   //                                 x  Cases (1) and (3) ....
 141
 142
 143   int lastscan = 0, lastpos = 0, lastoffset = 0;
 144
 145   int scan = 0;
 146   int match_length = 0;
 147
 148   while (scan < newsize) {
 149     int pos = 0;
 150     int oldscore = 0;  // Count of how many bytes of the current match at |scan|
 151                        // extend the match at |lastscan|.
 152
 153     scan += match_length;
 154     for (int scsc = scan;  scan < newsize;  ++scan) {
 155       match_length = qsuf::search<PagedArray<int>&>(
 156           I, old, oldsize, newbuf + scan, newsize - scan, &pos);
 157
 158       for ( ; scsc < scan + match_length ; scsc++)
 159         if ((scsc + lastoffset < oldsize) &&
 160             (old[scsc + lastoffset] == newbuf[scsc]))
 161           oldscore++;
 162
 163       if ((match_length == oldscore) && (match_length != 0))
 164         break;  // Good continuing match, case (1)
 165       if (match_length > oldscore + 8)
 166         break;  // New seed match, case (2)
 167
 168       if ((scan + lastoffset < oldsize) &&
 169           (old[scan + lastoffset] == newbuf[scan]))
 170         oldscore--;
 171       // Case (3) continues in this loop until we fall out of the loop (4).
 172     }
 173
 174     if ((match_length != oldscore) || (scan == newsize)) {  // Cases (2) and (4)
 175       // This next chunk of code finds the boundary between the bytes to be
 176       // copied as part of the current triple, and the bytes to be copied as
 177       // part of the next triple.  The |lastscan| match is extended forwards as
 178       // far as possible provided doing to does not add too many mistakes.  The
 179       // |scan| match is extended backwards in a similar way.
 180
 181       // Extend the current match (if any) backwards.  |lenb| is the maximal
 182       // extension for which less than half the byte positions in the extension
 183       // are wrong.
 184       int lenb = 0;
 185       if (scan < newsize) {  // i.e. not case (4); there is a match to extend.
 186         int score = 0, Sb = 0;
 187         for (int i = 1;  (scan >= lastscan + i) && (pos >= i);  i++) {
 188           if (old[pos - i] == newbuf[scan - i]) score++;
 189           if (score*2 - i > Sb*2 - lenb) { Sb = score; lenb = i; }
 190         }
 191       }
 192
 193       // Extend the lastscan match forward; |lenf| is the maximal extension for
 194       // which less than half of the byte positions in entire lastscan match are
 195       // wrong.  There is a subtle point here: |lastscan| points to before the
 196       // seed match by |lenb| bytes from the previous iteration.  This is why
 197       // the loop measures the total number of mistakes in the the match, not
 198       // just the from the match.
 199       int lenf = 0;
 200       {
 201         int score = 0, Sf = 0;
 202         for (int i = 0;  (lastscan + i < scan) && (lastpos + i < oldsize);  ) {
 203           if (old[lastpos + i] == newbuf[lastscan + i]) score++;
 204           i++;
 205           if (score*2 - i > Sf*2 - lenf) { Sf = score; lenf = i; }
 206         }
 207       }
 208
 209       // If the extended scans overlap, pick a position in the overlap region
 210       // that maximizes the exact matching bytes.
 211       if (lastscan + lenf > scan - lenb) {
 212         int overlap = (lastscan + lenf) - (scan - lenb);
 213         int score = 0;
 214         int Ss = 0, lens = 0;
 215         for (int i = 0;  i < overlap;  i++) {
 216           if (newbuf[lastscan + lenf - overlap + i] ==
 217               old[lastpos + lenf - overlap + i]) score++;
 218           if (newbuf[scan - lenb + i] ==  old[pos - lenb + i]) score--;
 219           if (score > Ss) { Ss = score; lens = i + 1; }
 220         }
 221
 222         lenf += lens - overlap;
 223         lenb -= lens;
 224       };
 225
 226       for (int i = 0;  i < lenf;  i++) {
 227         uint8 diff_byte = newbuf[lastscan + i] - old[lastpos + i];
 228         if (diff_byte) {
 229           ++diff_bytes_nonzero;
 230           if (!diff_skips->WriteVarint32(pending_diff_zeros))
 231             return MEM_ERROR;
 232           pending_diff_zeros = 0;
 233           if (!diff_bytes->Write(&diff_byte, 1))
 234             return MEM_ERROR;
 235         } else {
 236           ++pending_diff_zeros;
 237         }
 238       }
 239       int gap = (scan - lenb) - (lastscan + lenf);
 240       for (int i = 0;  i < gap;  i++) {
 241         if (!extra_bytes->Write(&newbuf[lastscan + lenf + i], 1))
 242           return MEM_ERROR;
 243       }
 244
 245       diff_bytes_length += lenf;
 246       extra_bytes_length += gap;
 247
 248       uint32 copy_count = lenf;
 249       uint32 extra_count = gap;
 250       int32 seek_adjustment = ((pos - lenb) - (lastpos + lenf));
 251
 252       if (!control_stream_copy_counts->WriteVarint32(copy_count) ||
 253           !control_stream_extra_counts->WriteVarint32(extra_count) ||
 254           !control_stream_seeks->WriteVarint32Signed(seek_adjustment)) {
 255         return MEM_ERROR;
 256       }
 257
 258       ++control_length;
 259 #ifdef DEBUG_bsmedberg
 260       VLOG(1) << StringPrintf("Writing a block:  copy: %-8u extra: %-8u seek: "
 261                               "%+-9d", copy_count, extra_count,
 262                               seek_adjustment);
 263 #endif
 264
 265       lastscan = scan - lenb;   // Include the backward extension in seed.
 266       lastpos = pos - lenb;     //  ditto.
 267       lastoffset = lastpos - lastscan;
 268     }
 269   }
 270
 271   if (!diff_skips->WriteVarint32(pending_diff_zeros))
 272     return MEM_ERROR;
 273
 274   I.clear();
 275
 276   MBSPatchHeader header;
 277   // The string will have a null terminator that we don't use, hence '-1'.
 278   static_assert(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header.tag),
 279                 "MBS_PATCH_HEADER_TAG must match header field size");
 280   memcpy(header.tag, MBS_PATCH_HEADER_TAG, sizeof(header.tag));
 281   header.slen     = oldsize;
 282   header.scrc32   = CalculateCrc(old, oldsize);
 283   header.dlen     = newsize;
 284
 285   if (!WriteHeader(patch_stream, &header))
 286     return MEM_ERROR;
 287
 288   size_t diff_skips_length = diff_skips->Length();
 289   if (!patch_streams.CopyTo(patch_stream))
 290     return MEM_ERROR;
 291
 292   VLOG(1) << "Control tuples: " << control_length
 293           << "  copy bytes: " << diff_bytes_length
 294           << "  mistakes: " << diff_bytes_nonzero
 295           << "  (skips: " << diff_skips_length << ")"
 296           << "  extra bytes: " << extra_bytes_length
 297           << "\nUncompressed bsdiff patch size "
 298           << patch_stream->Length() - initial_patch_stream_length
 299           << "\nEnd bsdiff "
 300           << (base::Time::Now() - start_bsdiff_time).InSecondsF();
 301
 302   return OK;
 303 }
 304
 305 }  // namespace courgette