subversion/libsvn_delta/xdelta.c

   1 /*
   2  * xdelta.c:  xdelta generator.
   3  *
   4  * ====================================================================
   5  * Copyright (c) 2000-2006 CollabNet.  All rights reserved.
   6  *
   7  * This software is licensed as described in the file COPYING, which
   8  * you should have received as part of this distribution.  The terms
   9  * are also available at http://subversion.tigris.org/license-1.html.
  10  * If newer versions of this license are posted there, you may use a
  11  * newer version instead, at your option.
  12  *
  13  * This software consists of voluntary contributions made by many
  14  * individuals.  For exact contribution history, see the revision
  15  * history and logs, available at http://subversion.tigris.org/.
  16  * ====================================================================
  17  */
  18
  19
  20 #include <assert.h>
  21
  22 #include <apr_general.h>        /* for APR_INLINE */
  23 #include <apr_hash.h>
  24
  25 #include "svn_delta.h"
  26 #include "delta.h"
  27 \f
  28 /* This is pseudo-adler32. It is adler32 without the prime modulus.
  29    The idea is borrowed from monotone, and is a translation of the C++
  30    code.  Graydon Hoare, the author of the original code, gave his
  31    explicit permission to use it under these terms at 8:02pm on
  32    Friday, February 11th, 2005.  */
  33
  34 #define ADLER32_MASK      0x0000ffff
  35 #define ADLER32_CHAR_MASK 0x000000ff
  36
  37 /* Structure to store the state of our adler32 checksum.  */
  38 struct adler32
  39 {
  40   apr_uint32_t s1;
  41   apr_uint32_t s2;
  42   apr_uint32_t len;
  43 };
  44
  45 /* Feed C into the adler32 checksum.  */
  46
  47 static APR_INLINE void
  48 adler32_in(struct adler32 *ad, const char c)
  49 {
  50   ad->s1 += ((apr_uint32_t) (c)) & ADLER32_CHAR_MASK;
  51   ad->s1 &= ADLER32_MASK;
  52   ad->s2 += ad->s1;
  53   ad->s2 &= ADLER32_MASK;
  54   ad->len++;
  55 }
  56
  57 /* Remove the result of byte C from the adler32 checksum.  */
  58
  59 static APR_INLINE void
  60 adler32_out(struct adler32 *ad, const char c)
  61 {
  62   ad->s1 -= ((apr_uint32_t) (c)) & ADLER32_CHAR_MASK;
  63   ad->s1 &= ADLER32_MASK;
  64   ad->s2 -= (ad->len * (((apr_uint32_t) c) & ADLER32_CHAR_MASK)) + 1;
  65   ad->s2 &= ADLER32_MASK;
  66   --ad->len;
  67 }
  68
  69 /* Return the current adler32 checksum in the adler32 structure.  */
  70
  71 static APR_INLINE apr_uint32_t
  72 adler32_sum(const struct adler32 *ad)
  73 {
  74   return (ad->s2 << 16) | (ad->s1);
  75 }
  76
  77 /* Initialize an adler32 checksum structure with DATA, which has length
  78    DATALEN.  Return the initialized structure.  */
  79
  80 static APR_INLINE struct adler32 *
  81 init_adler32(struct adler32 *ad, const char *data, apr_uint32_t datalen)
  82 {
  83   ad->s1 = 1;
  84   ad->s2 = 0;
  85   ad->len = 0;
  86   while (datalen--)
  87     adler32_in(ad, *(data++));
  88   return ad;
  89 }
  90
  91 /* Size of the blocks we compute checksums for. This was chosen out of
  92    thin air.  Monotone used 64, xdelta1 used 64, rsync uses 128.  */
  93 #define MATCH_BLOCKSIZE 64
  94
  95 /* Information for a block of the delta source.  The length of the
  96    block is the smaller of MATCH_BLOCKSIZE and the difference between
  97    the size of the source data and the position of this block. */
  98 struct block
  99 {
 100   apr_uint32_t adlersum;
 101   apr_size_t pos;
 102 };
 103
 104 /* A hash table, using open addressing, of the blocks of the source. */
 105 struct blocks
 106 {
 107   /* The largest valid index of slots. */
 108   apr_size_t max;
 109   /* The vector of blocks.  A pos value of (apr_size_t)-1 represents an unused
 110      slot. */
 111   struct block *slots;
 112 };
 113
 114
 115 /* Return a hash value calculated from the adler32 SUM, suitable for use with
 116    our hash table. */
 117 static apr_size_t hash_func(apr_uint32_t sum)
 118 {
 119   /* Since the adl32 checksum have a bad distribution for the 11th to 16th
 120      bits when used for our small block size, we add some bits from the
 121      other half of the checksum. */
 122   return sum ^ (sum >> 12);
 123 }
 124
 125 /* Insert a block with the checksum ADLERSUM at position POS in the source data
 126    into the table BLOCKS.  Ignore duplicates. */
 127 static void
 128 add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_size_t pos)
 129 {
 130   apr_size_t h = hash_func(adlersum) & blocks->max;
 131
 132   /* This will terminate, since we know that we will not fill the table. */
 133   while (blocks->slots[h].pos != (apr_size_t)-1)
 134     {
 135       /* No duplicates! */
 136       if (blocks->slots[h].adlersum == adlersum)
 137         return;
 138       h = (h + 1) & blocks->max;
 139     }
 140   blocks->slots[h].adlersum = adlersum;
 141   blocks->slots[h].pos = pos;
 142 }
 143
 144 /* Find a block in BLOCKS with the checksum ADLERSUM, returning its position
 145    in the source data.  If there is no such block, return (apr_size_t)-1. */
 146 static apr_size_t
 147 find_block(const struct blocks *blocks, apr_uint32_t adlersum)
 148 {
 149   apr_size_t h = hash_func(adlersum) & blocks->max;
 150
 151   while (blocks->slots[h].adlersum != adlersum
 152          && blocks->slots[h].pos != (apr_size_t)-1)
 153     h = (h + 1) & blocks->max;
 154
 155   return blocks->slots[h].pos;
 156 }
 157
 158 /* Initialize the matches table from DATA of size DATALEN.  This goes
 159    through every block of MATCH_BLOCKSIZE bytes in the source and
 160    checksums it, inserting the result into the BLOCKS table.  */
 161 static void
 162 init_blocks_table(const char *data,
 163                    apr_size_t datalen,
 164                   struct blocks *blocks,
 165                    apr_pool_t *pool)
 166 {
 167   apr_size_t i;
 168   struct adler32 adler;
 169   apr_size_t nblocks;
 170   apr_size_t nslots = 1;
 171
 172   /* Be pesimistic about the block count. */
 173   nblocks = datalen / MATCH_BLOCKSIZE + 1;
 174   /* Find nearest larger power of two. */
 175   while (nslots <= nblocks)
 176     nslots *= 2;
 177   /* Double the number of slots to avoid a too high load. */
 178   nslots *= 2;
 179   blocks->max = nslots - 1;
 180   blocks->slots = apr_palloc(pool, nslots * sizeof(*(blocks->slots)));
 181   for (i = 0; i < nslots; ++i)
 182     {
 183       /* Avoid using an indeterminate value in the lookup. */
 184       blocks->slots[i].adlersum = 0;
 185       blocks->slots[i].pos = (apr_size_t)-1;
 186     }
 187
 188   for (i = 0; i < datalen; i += MATCH_BLOCKSIZE)
 189     {
 190       /* If this is the last piece, it may be blocksize large */
 191       apr_uint32_t step =
 192         ((i + MATCH_BLOCKSIZE) >= datalen) ? (datalen - i) : MATCH_BLOCKSIZE;
 193       apr_uint32_t adlersum =
 194         adler32_sum(init_adler32(&adler, data + i, step));
 195       add_block(blocks, adlersum, i);
 196     }
 197 }
 198
 199 /* Try to find a match for the target data B in BLOCKS, and then
 200    extend the match as long as data in A and B at the match position
 201    continues to match.  We set the position in a we ended up in (in
 202    case we extended it backwards) in APOSP, the length of the match in
 203    ALENP, and the amount to advance B in BADVANCEP.
 204    *PENDING_INSERT_LENP is the length of the last insert operation that
 205    has not been committed yet to the delta stream, or 0 if there is no
 206    pending insert.  This is used when extending the match backwards,
 207    in which case *PENDING_INSERT_LENP is adjusted, possibly
 208    alleviating the need for the insert entirely.  Return TRUE if the
 209    lookup found a match, regardless of length.  Return FALSE
 210    otherwise.  */
 211 static svn_boolean_t
 212 find_match(const struct blocks *blocks,
 213            const struct adler32 *rolling,
 214            const char *a,
 215            apr_size_t asize,
 216            const char *b,
 217            apr_size_t bsize,
 218            apr_size_t bpos,
 219            apr_size_t *aposp,
 220            apr_size_t *alenp,
 221            apr_size_t *badvancep,
 222            apr_size_t *pending_insert_lenp)
 223 {
 224   apr_uint32_t sum = adler32_sum(rolling);
 225   apr_size_t alen, badvance, apos;
 226   apr_size_t tpos, tlen;
 227
 228   tpos = find_block(blocks, sum);
 229
 230   /* See if we have a match.  */
 231   if (tpos == (apr_size_t)-1)
 232     return FALSE;
 233
 234   tlen = ((tpos + MATCH_BLOCKSIZE) >= asize)
 235     ? (asize - tpos) : MATCH_BLOCKSIZE;
 236
 237   /* Make sure it's not a false match.  */
 238   if (memcmp(a + tpos, b + bpos, tlen) != 0)
 239     return FALSE;
 240
 241   apos = tpos;
 242   alen = tlen;
 243   badvance = tlen;
 244   /* Extend the match forward as far as possible */
 245   while ((apos + alen < asize)
 246          && (bpos + badvance < bsize)
 247          && (a[apos + alen] == b[bpos + badvance]))
 248     {
 249       ++alen;
 250       ++badvance;
 251     }
 252
 253   /* See if we can extend backwards into a previous insert hunk.  */
 254   while (apos > 0
 255          && bpos > 0
 256          && a[apos - 1] == b[bpos - 1]
 257          && *pending_insert_lenp > 0)
 258     {
 259       --(*pending_insert_lenp);
 260       --apos;
 261       --bpos;
 262       ++alen;
 263     }
 264
 265   *aposp = apos;
 266   *alenp = alen;
 267   *badvancep = badvance;
 268   return TRUE;
 269 }
 270
 271
 272 /* Compute a delta from A to B using xdelta.
 273
 274    The basic xdelta algorithm is as follows:
 275
 276    1. Go through the source data, checksumming every MATCH_BLOCKSIZE
 277       block of bytes using adler32, and inserting the checksum into a
 278       match table with the position of the match.
 279    2. Go through the target byte by byte, seeing if that byte starts a
 280       match that we have in the match table.
 281       2a. If so, try to extend the match as far as possible both
 282           forwards and backwards, and then insert a source copy
 283           operation into the delta ops builder for the match.
 284       2b. If not, insert the byte as new data using an insert delta op.
 285
 286    Our implementation doesn't immediately insert "insert" operations,
 287    it waits until we have another copy, or we are done.  The reasoning
 288    is twofold:
 289
 290    1. Otherwise, we would just be building a ton of 1 byte insert
 291       operations
 292    2. So that we can extend a source match backwards into a pending
 293      insert operation, and possibly remove the need for the insert
 294      entirely.  This can happen due to stream alignment.
 295 */
 296 static void
 297 compute_delta(svn_txdelta__ops_baton_t *build_baton,
 298               const char *a,
 299               apr_uint32_t asize,
 300               const char *b,
 301               apr_uint32_t bsize,
 302               apr_pool_t *pool)
 303 {
 304   struct blocks blocks;
 305   struct adler32 rolling;
 306   apr_size_t sz, lo, hi, pending_insert_start = 0, pending_insert_len = 0;
 307
 308   /* If the size of the target is smaller than the match blocksize, just
 309      insert the entire target.  */
 310   if (bsize < MATCH_BLOCKSIZE)
 311     {
 312       svn_txdelta__insert_op(build_baton, svn_txdelta_new,
 313                              0, bsize, b, pool);
 314       return;
 315     }
 316
 317   /* Initialize the matches table.  */
 318   init_blocks_table(a, asize, &blocks, pool);
 319
 320   /* Initialize our rolling checksum.  */
 321   init_adler32(&rolling, b, MATCH_BLOCKSIZE);
 322   for (sz = bsize, lo = 0, hi = MATCH_BLOCKSIZE; lo < sz;)
 323     {
 324       apr_size_t apos = 0;
 325       apr_size_t alen = 1;
 326       apr_size_t badvance = 1;
 327       apr_size_t next;
 328       svn_boolean_t match;
 329
 330       match = find_match(&blocks, &rolling, a, asize, b, bsize, lo, &apos,
 331                          &alen, &badvance, &pending_insert_len);
 332
 333       /* If we didn't find a real match, insert the byte at the target
 334          position into the pending insert.  */
 335       if (! match)
 336         ++pending_insert_len;
 337       else
 338         {
 339           if (pending_insert_len > 0)
 340             {
 341               svn_txdelta__insert_op(build_baton, svn_txdelta_new,
 342                                      0, pending_insert_len,
 343                                      b + pending_insert_start, pool);
 344               pending_insert_len = 0;
 345             }
 346           /* Reset the pending insert start to immediately after the
 347              match. */
 348           pending_insert_start = lo + badvance;
 349           svn_txdelta__insert_op(build_baton, svn_txdelta_source,
 350                                  apos, alen, NULL, pool);
 351         }
 352       next = lo;
 353       for (; next < lo + badvance; ++next)
 354         {
 355           adler32_out(&rolling, b[next]);
 356           if (next + MATCH_BLOCKSIZE < bsize)
 357             adler32_in(&rolling, b[next + MATCH_BLOCKSIZE]);
 358         }
 359       lo = next;
 360       hi = lo + MATCH_BLOCKSIZE;
 361     }
 362
 363   /* If we still have an insert pending at the end, throw it in.  */
 364   if (pending_insert_len > 0)
 365     {
 366       svn_txdelta__insert_op(build_baton, svn_txdelta_new,
 367                              0, pending_insert_len,
 368                              b + pending_insert_start, pool);
 369     }
 370 }
 371
 372 void
 373 svn_txdelta__xdelta(svn_txdelta__ops_baton_t *build_baton,
 374                     const char *data,
 375                     apr_size_t source_len,
 376                     apr_size_t target_len,
 377                     apr_pool_t *pool)
 378 {
 379   /*  We should never be asked to compute something when the source_len is 0,
 380       because it should have asked vdelta or some other compressor.  */
 381   assert(source_len != 0);
 382   compute_delta(build_baton, data, source_len,
 383                 data + source_len, target_len,
 384                 pool);
 385 }