Change the format of the revprops block sent in svnserve for
[svn.git] / subversion / libsvn_delta / xdelta.c
blob59c61bfd800d8e2377ab4ccdbbde4a22a2db7ef1
1 /*
2 * xdelta.c: xdelta generator.
4 * ====================================================================
5 * Copyright (c) 2000-2006 CollabNet. All rights reserved.
7 * This software is licensed as described in the file COPYING, which
8 * you should have received as part of this distribution. The terms
9 * are also available at http://subversion.tigris.org/license-1.html.
10 * If newer versions of this license are posted there, you may use a
11 * newer version instead, at your option.
13 * This software consists of voluntary contributions made by many
14 * individuals. For exact contribution history, see the revision
15 * history and logs, available at http://subversion.tigris.org/.
16 * ====================================================================
20 #include <assert.h>
22 #include <apr_general.h> /* for APR_INLINE */
23 #include <apr_hash.h>
25 #include "svn_delta.h"
26 #include "delta.h"
28 /* This is pseudo-adler32. It is adler32 without the prime modulus.
29 The idea is borrowed from monotone, and is a translation of the C++
30 code. Graydon Hoare, the author of the original code, gave his
31 explicit permission to use it under these terms at 8:02pm on
32 Friday, February 11th, 2005. */
34 #define ADLER32_MASK 0x0000ffff
35 #define ADLER32_CHAR_MASK 0x000000ff
37 /* Structure to store the state of our adler32 checksum. */
38 struct adler32
40 apr_uint32_t s1;
41 apr_uint32_t s2;
42 apr_uint32_t len;
45 /* Feed C into the adler32 checksum. */
47 static APR_INLINE void
48 adler32_in(struct adler32 *ad, const char c)
50 ad->s1 += ((apr_uint32_t) (c)) & ADLER32_CHAR_MASK;
51 ad->s1 &= ADLER32_MASK;
52 ad->s2 += ad->s1;
53 ad->s2 &= ADLER32_MASK;
54 ad->len++;
57 /* Remove the result of byte C from the adler32 checksum. */
59 static APR_INLINE void
60 adler32_out(struct adler32 *ad, const char c)
62 ad->s1 -= ((apr_uint32_t) (c)) & ADLER32_CHAR_MASK;
63 ad->s1 &= ADLER32_MASK;
64 ad->s2 -= (ad->len * (((apr_uint32_t) c) & ADLER32_CHAR_MASK)) + 1;
65 ad->s2 &= ADLER32_MASK;
66 --ad->len;
69 /* Return the current adler32 checksum in the adler32 structure. */
71 static APR_INLINE apr_uint32_t
72 adler32_sum(const struct adler32 *ad)
74 return (ad->s2 << 16) | (ad->s1);
77 /* Initialize an adler32 checksum structure with DATA, which has length
78 DATALEN. Return the initialized structure. */
80 static APR_INLINE struct adler32 *
81 init_adler32(struct adler32 *ad, const char *data, apr_uint32_t datalen)
83 ad->s1 = 1;
84 ad->s2 = 0;
85 ad->len = 0;
86 while (datalen--)
87 adler32_in(ad, *(data++));
88 return ad;
91 /* Size of the blocks we compute checksums for. This was chosen out of
92 thin air. Monotone used 64, xdelta1 used 64, rsync uses 128. */
93 #define MATCH_BLOCKSIZE 64
95 /* Information for a block of the delta source. The length of the
96 block is the smaller of MATCH_BLOCKSIZE and the difference between
97 the size of the source data and the position of this block. */
98 struct block
100 apr_uint32_t adlersum;
101 apr_size_t pos;
104 /* A hash table, using open addressing, of the blocks of the source. */
105 struct blocks
107 /* The largest valid index of slots. */
108 apr_size_t max;
109 /* The vector of blocks. A pos value of (apr_size_t)-1 represents an unused
110 slot. */
111 struct block *slots;
115 /* Return a hash value calculated from the adler32 SUM, suitable for use with
116 our hash table. */
117 static apr_size_t hash_func(apr_uint32_t sum)
119 /* Since the adl32 checksum have a bad distribution for the 11th to 16th
120 bits when used for our small block size, we add some bits from the
121 other half of the checksum. */
122 return sum ^ (sum >> 12);
125 /* Insert a block with the checksum ADLERSUM at position POS in the source data
126 into the table BLOCKS. Ignore duplicates. */
127 static void
128 add_block(struct blocks *blocks, apr_uint32_t adlersum, apr_size_t pos)
130 apr_size_t h = hash_func(adlersum) & blocks->max;
132 /* This will terminate, since we know that we will not fill the table. */
133 while (blocks->slots[h].pos != (apr_size_t)-1)
135 /* No duplicates! */
136 if (blocks->slots[h].adlersum == adlersum)
137 return;
138 h = (h + 1) & blocks->max;
140 blocks->slots[h].adlersum = adlersum;
141 blocks->slots[h].pos = pos;
144 /* Find a block in BLOCKS with the checksum ADLERSUM, returning its position
145 in the source data. If there is no such block, return (apr_size_t)-1. */
146 static apr_size_t
147 find_block(const struct blocks *blocks, apr_uint32_t adlersum)
149 apr_size_t h = hash_func(adlersum) & blocks->max;
151 while (blocks->slots[h].adlersum != adlersum
152 && blocks->slots[h].pos != (apr_size_t)-1)
153 h = (h + 1) & blocks->max;
155 return blocks->slots[h].pos;
158 /* Initialize the matches table from DATA of size DATALEN. This goes
159 through every block of MATCH_BLOCKSIZE bytes in the source and
160 checksums it, inserting the result into the BLOCKS table. */
161 static void
162 init_blocks_table(const char *data,
163 apr_size_t datalen,
164 struct blocks *blocks,
165 apr_pool_t *pool)
167 apr_size_t i;
168 struct adler32 adler;
169 apr_size_t nblocks;
170 apr_size_t nslots = 1;
172 /* Be pesimistic about the block count. */
173 nblocks = datalen / MATCH_BLOCKSIZE + 1;
174 /* Find nearest larger power of two. */
175 while (nslots <= nblocks)
176 nslots *= 2;
177 /* Double the number of slots to avoid a too high load. */
178 nslots *= 2;
179 blocks->max = nslots - 1;
180 blocks->slots = apr_palloc(pool, nslots * sizeof(*(blocks->slots)));
181 for (i = 0; i < nslots; ++i)
183 /* Avoid using an indeterminate value in the lookup. */
184 blocks->slots[i].adlersum = 0;
185 blocks->slots[i].pos = (apr_size_t)-1;
188 for (i = 0; i < datalen; i += MATCH_BLOCKSIZE)
190 /* If this is the last piece, it may be blocksize large */
191 apr_uint32_t step =
192 ((i + MATCH_BLOCKSIZE) >= datalen) ? (datalen - i) : MATCH_BLOCKSIZE;
193 apr_uint32_t adlersum =
194 adler32_sum(init_adler32(&adler, data + i, step));
195 add_block(blocks, adlersum, i);
199 /* Try to find a match for the target data B in BLOCKS, and then
200 extend the match as long as data in A and B at the match position
201 continues to match. We set the position in a we ended up in (in
202 case we extended it backwards) in APOSP, the length of the match in
203 ALENP, and the amount to advance B in BADVANCEP.
204 *PENDING_INSERT_LENP is the length of the last insert operation that
205 has not been committed yet to the delta stream, or 0 if there is no
206 pending insert. This is used when extending the match backwards,
207 in which case *PENDING_INSERT_LENP is adjusted, possibly
208 alleviating the need for the insert entirely. Return TRUE if the
209 lookup found a match, regardless of length. Return FALSE
210 otherwise. */
211 static svn_boolean_t
212 find_match(const struct blocks *blocks,
213 const struct adler32 *rolling,
214 const char *a,
215 apr_size_t asize,
216 const char *b,
217 apr_size_t bsize,
218 apr_size_t bpos,
219 apr_size_t *aposp,
220 apr_size_t *alenp,
221 apr_size_t *badvancep,
222 apr_size_t *pending_insert_lenp)
224 apr_uint32_t sum = adler32_sum(rolling);
225 apr_size_t alen, badvance, apos;
226 apr_size_t tpos, tlen;
228 tpos = find_block(blocks, sum);
230 /* See if we have a match. */
231 if (tpos == (apr_size_t)-1)
232 return FALSE;
234 tlen = ((tpos + MATCH_BLOCKSIZE) >= asize)
235 ? (asize - tpos) : MATCH_BLOCKSIZE;
237 /* Make sure it's not a false match. */
238 if (memcmp(a + tpos, b + bpos, tlen) != 0)
239 return FALSE;
241 apos = tpos;
242 alen = tlen;
243 badvance = tlen;
244 /* Extend the match forward as far as possible */
245 while ((apos + alen < asize)
246 && (bpos + badvance < bsize)
247 && (a[apos + alen] == b[bpos + badvance]))
249 ++alen;
250 ++badvance;
253 /* See if we can extend backwards into a previous insert hunk. */
254 while (apos > 0
255 && bpos > 0
256 && a[apos - 1] == b[bpos - 1]
257 && *pending_insert_lenp > 0)
259 --(*pending_insert_lenp);
260 --apos;
261 --bpos;
262 ++alen;
265 *aposp = apos;
266 *alenp = alen;
267 *badvancep = badvance;
268 return TRUE;
272 /* Compute a delta from A to B using xdelta.
274 The basic xdelta algorithm is as follows:
276 1. Go through the source data, checksumming every MATCH_BLOCKSIZE
277 block of bytes using adler32, and inserting the checksum into a
278 match table with the position of the match.
279 2. Go through the target byte by byte, seeing if that byte starts a
280 match that we have in the match table.
281 2a. If so, try to extend the match as far as possible both
282 forwards and backwards, and then insert a source copy
283 operation into the delta ops builder for the match.
284 2b. If not, insert the byte as new data using an insert delta op.
286 Our implementation doesn't immediately insert "insert" operations,
287 it waits until we have another copy, or we are done. The reasoning
288 is twofold:
290 1. Otherwise, we would just be building a ton of 1 byte insert
291 operations
292 2. So that we can extend a source match backwards into a pending
293 insert operation, and possibly remove the need for the insert
294 entirely. This can happen due to stream alignment.
296 static void
297 compute_delta(svn_txdelta__ops_baton_t *build_baton,
298 const char *a,
299 apr_uint32_t asize,
300 const char *b,
301 apr_uint32_t bsize,
302 apr_pool_t *pool)
304 struct blocks blocks;
305 struct adler32 rolling;
306 apr_size_t sz, lo, hi, pending_insert_start = 0, pending_insert_len = 0;
308 /* If the size of the target is smaller than the match blocksize, just
309 insert the entire target. */
310 if (bsize < MATCH_BLOCKSIZE)
312 svn_txdelta__insert_op(build_baton, svn_txdelta_new,
313 0, bsize, b, pool);
314 return;
317 /* Initialize the matches table. */
318 init_blocks_table(a, asize, &blocks, pool);
320 /* Initialize our rolling checksum. */
321 init_adler32(&rolling, b, MATCH_BLOCKSIZE);
322 for (sz = bsize, lo = 0, hi = MATCH_BLOCKSIZE; lo < sz;)
324 apr_size_t apos = 0;
325 apr_size_t alen = 1;
326 apr_size_t badvance = 1;
327 apr_size_t next;
328 svn_boolean_t match;
330 match = find_match(&blocks, &rolling, a, asize, b, bsize, lo, &apos,
331 &alen, &badvance, &pending_insert_len);
333 /* If we didn't find a real match, insert the byte at the target
334 position into the pending insert. */
335 if (! match)
336 ++pending_insert_len;
337 else
339 if (pending_insert_len > 0)
341 svn_txdelta__insert_op(build_baton, svn_txdelta_new,
342 0, pending_insert_len,
343 b + pending_insert_start, pool);
344 pending_insert_len = 0;
346 /* Reset the pending insert start to immediately after the
347 match. */
348 pending_insert_start = lo + badvance;
349 svn_txdelta__insert_op(build_baton, svn_txdelta_source,
350 apos, alen, NULL, pool);
352 next = lo;
353 for (; next < lo + badvance; ++next)
355 adler32_out(&rolling, b[next]);
356 if (next + MATCH_BLOCKSIZE < bsize)
357 adler32_in(&rolling, b[next + MATCH_BLOCKSIZE]);
359 lo = next;
360 hi = lo + MATCH_BLOCKSIZE;
363 /* If we still have an insert pending at the end, throw it in. */
364 if (pending_insert_len > 0)
366 svn_txdelta__insert_op(build_baton, svn_txdelta_new,
367 0, pending_insert_len,
368 b + pending_insert_start, pool);
372 void
373 svn_txdelta__xdelta(svn_txdelta__ops_baton_t *build_baton,
374 const char *data,
375 apr_size_t source_len,
376 apr_size_t target_len,
377 apr_pool_t *pool)
379 /* We should never be asked to compute something when the source_len is 0,
380 because it should have asked vdelta or some other compressor. */
381 assert(source_len != 0);
382 compute_delta(build_baton, data, source_len,
383 data + source_len, target_len,
384 pool);