Followup to r29659: *really* fix a bunch of error leaks in the
[svn.git] / subversion / libsvn_delta / vdelta.c
blobf571454f659fc05aa9b3431f7485cddfd430acc8
1 /*
2 * vdelta.c: vdelta generator.
4 * ====================================================================
5 * Copyright (c) 2000-2004 CollabNet. All rights reserved.
7 * This software is licensed as described in the file COPYING, which
8 * you should have received as part of this distribution. The terms
9 * are also available at http://subversion.tigris.org/license-1.html.
10 * If newer versions of this license are posted there, you may use a
11 * newer version instead, at your option.
13 * This software consists of voluntary contributions made by many
14 * individuals. For exact contribution history, see the revision
15 * history and logs, available at http://subversion.tigris.org/.
16 * ====================================================================
20 #include <assert.h>
22 #include <apr_general.h> /* for APR_INLINE */
24 #include "svn_delta.h"
25 #include "delta.h"
28 /* ==================================================================== */
29 /* Hash table for vdelta hashing.
31 Each hash bucket is a chain of slots. The index of the slot in
32 the slots array is also the index of the key string in the
33 current window's data stream. The hash table implements a multimap
34 (i.e., hash and key collisions are allowed).
36 To store a key->index mapping, just add slot[index] to the slot
37 chain tn key's bucket (see store_mapping).
39 For a given key, you can traverse the list of match candidates (some
40 of which may be hash collisions) like this:
42 for (slot = buckets[get_bucket(key)]; slot != NULL; slot = slot->next)
44 index = slot - slots;
45 ...
50 /* Size of a vdelta hash key. */
51 #define VD_KEY_SIZE 4
54 /* Hash slot. */
55 typedef struct hash_slot_t {
56 struct hash_slot_t *next;
57 } hash_slot_t;
59 /* Hash table. */
60 typedef struct hash_table_t {
61 int num_buckets; /* Number of buckets in the table. */
62 hash_slot_t **buckets; /* Bucket array. */
63 hash_slot_t *slots; /* Slots array. */
64 } hash_table_t;
67 /* Create a hash table with NUM_SLOTS slots. NUM_SLOTS should be the sum
68 of the size of the source and target parts of the delta window.
69 Allocate from POOL. */
70 static hash_table_t *
71 create_hash_table(apr_size_t num_slots, apr_pool_t *pool)
73 int i;
74 apr_size_t j;
75 hash_table_t* table = apr_palloc(pool, sizeof(*table));
77 /* This should be a reasonable number of buckets ... */
78 table->num_buckets = (num_slots / 3) | 1;
79 table->buckets = apr_palloc(pool, (table->num_buckets
80 * sizeof(*table->buckets)));
81 for (i = 0; i < table->num_buckets; ++i)
82 table->buckets[i] = NULL;
84 table->slots = apr_palloc(pool, num_slots * sizeof(*table->slots));
85 for (j = 0; j < num_slots; ++j)
86 table->slots[j].next = NULL;
88 return table;
92 /* Convert a key to a pointer to the key's hash bucket.
93 We use a 2-universal multiplicative hash function. If you're
94 wondering about the selected multiplier, take a look at the
95 comments in apr/tables/apr_hash.c:find_entry for a discussion
96 on fast string hashes; it's very illuminating.
98 [ We use 127 instead of 33 here because I happen to like
99 interesting prime numbers, so there. --xbc ] */
100 static APR_INLINE apr_uint32_t
101 get_bucket(const hash_table_t *table, const char *key)
103 int i;
104 apr_uint32_t hash = 0;
105 for (i = 0; i < VD_KEY_SIZE; ++i)
106 hash = hash * 127 + *key++;
107 return hash % table->num_buckets;
111 /* Store a key->index mapping into the hash table. */
112 static APR_INLINE void
113 store_mapping(hash_table_t *table, const char* key, apr_size_t idx)
115 apr_uint32_t bucket = get_bucket(table, key);
116 assert(table->slots[idx].next == NULL);
117 table->slots[idx].next = table->buckets[bucket];
118 table->buckets[bucket] = &table->slots[idx];
123 /* ==================================================================== */
124 /* Vdelta generator.
126 The article "Delta Algorithms: An Empirical Analysis" by Hunt,
127 Vo and Tichy contains a description of the vdelta algorithm,
128 but it's incomplete. Here's a detailed description:
130 1. Look up the four bytes starting at the current position
131 pointer. If there are no matches for those four bytes,
132 output an insert, move the position pointer forward by one,
133 and go back to step 1.
135 2. Determine which of the candidates yields the longest
136 extension. This will be called the "current match".
138 3. Look up the last three bytes of the current match plus one
139 unmatched byte. If there is no match for those four bytes,
140 the current match is the best match; go to step 6.
142 4. For each candidate, check backwards to see if it matches
143 the entire match so far. If no candidates satisfy that
144 constraint, the current match is the best match; go to step 6.
146 5. Among the candidates which do satisfy the constraint,
147 determine which one yields the longest extension. This
148 will be the new "current match." Go back to step 3.
150 6. Output a block copy instruction, add indexes for the last
151 three positions of the matched data, advance the position
152 pointer by the length of the match, and go back to step 1.
154 Inserts and copies are generated only when the current position
155 is within the target data.
157 Note that the vdelta algorithm allows copies that cross the
158 source/target data boundary. Because our internal delta
159 representation has different opcodes for source and target copies,
160 we split them in two. This means that the opcode stream in the
161 delta window can contain copies shorter than VD_KEY_SIZE. These
162 could be represented by insert ops instead, but we'll leave them
163 in, so that we can merge them again when we convert the delta
164 window to an external format like vcdiff that supports cross
165 -boundary copies. */
168 /* Find the length of a match within the data window.
169 Note that (match < from && from <= end) must always be true here. */
171 static APR_INLINE int
172 find_match_len(const char *match, const char *from, const char *end)
174 const char *here = from;
175 while (here < end && *match == *here)
177 ++match;
178 ++here;
180 return here - from;
184 /* This is the main vdelta generator. */
186 static void
187 vdelta(svn_txdelta__ops_baton_t *build_baton,
188 const char *data,
189 const char *start,
190 const char *end,
191 svn_boolean_t outputflag,
192 hash_table_t *table,
193 apr_pool_t *pool)
195 const char *here = start; /* Current position in the buffer. */
196 const char *insert_from = NULL; /* Start of byte range for insertion. */
198 for (;;)
200 const char *current_match, *key;
201 apr_size_t current_match_len = 0;
202 hash_slot_t *slot;
203 svn_boolean_t progress;
205 /* If we're near the end, just insert the last few bytes. */
206 if (end - here < VD_KEY_SIZE)
208 const char *from = ((insert_from != NULL) ? insert_from : here);
210 if (outputflag && from < end)
211 svn_txdelta__insert_op(build_baton, svn_txdelta_new, 0,
212 end - from, from, pool);
213 return;
216 /* Search for the longest match. */
217 current_match = NULL;
218 current_match_len = 0;
219 key = here;
222 /* Try to extend the current match. Our key is the last
223 three matched bytes plus one unmatched byte if we already
224 have a current match, or just the four bytes where we are
225 if we don't have a current match yet. See which mapping
226 yields the longest extension. */
227 progress = FALSE;
228 for (slot = table->buckets[get_bucket(table, key)];
229 slot != NULL;
230 slot = slot->next)
232 const char *match;
233 apr_size_t match_len;
235 if (slot - table->slots < key - here) /* Too close to start */
236 continue;
237 match = data + (slot - table->slots) - (key - here);
238 match_len = find_match_len(match, here, end);
240 /* We can only copy from the source or from the target, so
241 don't let the match cross START. */
242 if (match < start && match + match_len > start)
243 match_len = start - match;
245 if (match_len >= VD_KEY_SIZE && match_len > current_match_len)
247 /* We have a longer match; record it. */
248 current_match = match;
249 current_match_len = match_len;
250 progress = TRUE;
253 if (progress)
254 key = here + current_match_len - (VD_KEY_SIZE - 1);
256 while (progress && end - key >= VD_KEY_SIZE);
258 if (current_match_len < VD_KEY_SIZE)
260 /* There is no match here; store a mapping and insert this byte. */
261 store_mapping(table, here, here - data);
262 if (insert_from == NULL)
263 insert_from = here;
264 here++;
265 continue;
267 else if (outputflag)
269 if (insert_from != NULL)
271 /* Commit the pending insert. */
272 svn_txdelta__insert_op(build_baton, svn_txdelta_new,
273 0, here - insert_from,
274 insert_from, pool);
275 insert_from = NULL;
277 if (current_match < start) /* Copy from source. */
278 svn_txdelta__insert_op(build_baton, svn_txdelta_source,
279 current_match - data,
280 current_match_len,
281 NULL, pool);
282 else /* Copy from target */
283 svn_txdelta__insert_op(build_baton, svn_txdelta_target,
284 current_match - start,
285 current_match_len,
286 NULL, pool);
289 /* Adjust the current position and insert mappings for the
290 last three bytes of the match. */
291 here += current_match_len;
292 if (end - here >= VD_KEY_SIZE)
294 char const *last = here - (VD_KEY_SIZE - 1);
295 for (; last < here; ++last)
296 store_mapping(table, last, last - data);
302 void
303 svn_txdelta__vdelta(svn_txdelta__ops_baton_t *build_baton,
304 const char *data,
305 apr_size_t source_len,
306 apr_size_t target_len,
307 apr_pool_t *pool)
309 hash_table_t *table = create_hash_table(source_len + target_len, pool);
311 vdelta(build_baton, data, data, data + source_len, FALSE, table, pool);
312 vdelta(build_baton, data, data + source_len, data + source_len + target_len,
313 TRUE, table, pool);
315 #if 0
316 /* This bit of code calculates the hash load and the
317 number of collisions. Please note that a the number
318 of collisions per bucket is one less than the length
319 of the chain. :-) --xbc */
321 int i;
322 int empty = 0;
323 int collisions = 0;
324 for (i = 0; i < table->num_buckets; ++i)
326 hash_slot_t *slot = table->buckets[i];
327 if (!slot)
328 ++empty;
329 else
331 slot = slot->next;
332 while (slot != NULL)
334 ++collisions;
335 slot = slot->next;
339 fprintf(stderr, "Hash stats: load %d, collisions %d\n",
340 100 - 100 * empty / table->num_buckets, collisions);
342 #endif