2 * vdelta.c: vdelta generator.
4 * ====================================================================
5 * Copyright (c) 2000-2004 CollabNet. All rights reserved.
7 * This software is licensed as described in the file COPYING, which
8 * you should have received as part of this distribution. The terms
9 * are also available at http://subversion.tigris.org/license-1.html.
10 * If newer versions of this license are posted there, you may use a
11 * newer version instead, at your option.
13 * This software consists of voluntary contributions made by many
14 * individuals. For exact contribution history, see the revision
15 * history and logs, available at http://subversion.tigris.org/.
16 * ====================================================================
22 #include <apr_general.h> /* for APR_INLINE */
24 #include "svn_delta.h"
28 /* ==================================================================== */
29 /* Hash table for vdelta hashing.
31 Each hash bucket is a chain of slots. The index of the slot in
32 the slots array is also the index of the key string in the
33 current window's data stream. The hash table implements a multimap
34 (i.e., hash and key collisions are allowed).
36 To store a key->index mapping, just add slot[index] to the slot
37 chain tn key's bucket (see store_mapping).
39 For a given key, you can traverse the list of match candidates (some
40 of which may be hash collisions) like this:
42 for (slot = buckets[get_bucket(key)]; slot != NULL; slot = slot->next)
50 /* Size of a vdelta hash key. */
55 typedef struct hash_slot_t
{
56 struct hash_slot_t
*next
;
60 typedef struct hash_table_t
{
61 int num_buckets
; /* Number of buckets in the table. */
62 hash_slot_t
**buckets
; /* Bucket array. */
63 hash_slot_t
*slots
; /* Slots array. */
67 /* Create a hash table with NUM_SLOTS slots. NUM_SLOTS should be the sum
68 of the size of the source and target parts of the delta window.
69 Allocate from POOL. */
71 create_hash_table(apr_size_t num_slots
, apr_pool_t
*pool
)
75 hash_table_t
* table
= apr_palloc(pool
, sizeof(*table
));
77 /* This should be a reasonable number of buckets ... */
78 table
->num_buckets
= (num_slots
/ 3) | 1;
79 table
->buckets
= apr_palloc(pool
, (table
->num_buckets
80 * sizeof(*table
->buckets
)));
81 for (i
= 0; i
< table
->num_buckets
; ++i
)
82 table
->buckets
[i
] = NULL
;
84 table
->slots
= apr_palloc(pool
, num_slots
* sizeof(*table
->slots
));
85 for (j
= 0; j
< num_slots
; ++j
)
86 table
->slots
[j
].next
= NULL
;
92 /* Convert a key to a pointer to the key's hash bucket.
93 We use a 2-universal multiplicative hash function. If you're
94 wondering about the selected multiplier, take a look at the
95 comments in apr/tables/apr_hash.c:find_entry for a discussion
96 on fast string hashes; it's very illuminating.
98 [ We use 127 instead of 33 here because I happen to like
99 interesting prime numbers, so there. --xbc ] */
100 static APR_INLINE apr_uint32_t
101 get_bucket(const hash_table_t
*table
, const char *key
)
104 apr_uint32_t hash
= 0;
105 for (i
= 0; i
< VD_KEY_SIZE
; ++i
)
106 hash
= hash
* 127 + *key
++;
107 return hash
% table
->num_buckets
;
111 /* Store a key->index mapping into the hash table. */
112 static APR_INLINE
void
113 store_mapping(hash_table_t
*table
, const char* key
, apr_size_t idx
)
115 apr_uint32_t bucket
= get_bucket(table
, key
);
116 assert(table
->slots
[idx
].next
== NULL
);
117 table
->slots
[idx
].next
= table
->buckets
[bucket
];
118 table
->buckets
[bucket
] = &table
->slots
[idx
];
123 /* ==================================================================== */
126 The article "Delta Algorithms: An Empirical Analysis" by Hunt,
127 Vo and Tichy contains a description of the vdelta algorithm,
128 but it's incomplete. Here's a detailed description:
130 1. Look up the four bytes starting at the current position
131 pointer. If there are no matches for those four bytes,
132 output an insert, move the position pointer forward by one,
133 and go back to step 1.
135 2. Determine which of the candidates yields the longest
136 extension. This will be called the "current match".
138 3. Look up the last three bytes of the current match plus one
139 unmatched byte. If there is no match for those four bytes,
140 the current match is the best match; go to step 6.
142 4. For each candidate, check backwards to see if it matches
143 the entire match so far. If no candidates satisfy that
144 constraint, the current match is the best match; go to step 6.
146 5. Among the candidates which do satisfy the constraint,
147 determine which one yields the longest extension. This
148 will be the new "current match." Go back to step 3.
150 6. Output a block copy instruction, add indexes for the last
151 three positions of the matched data, advance the position
152 pointer by the length of the match, and go back to step 1.
154 Inserts and copies are generated only when the current position
155 is within the target data.
157 Note that the vdelta algorithm allows copies that cross the
158 source/target data boundary. Because our internal delta
159 representation has different opcodes for source and target copies,
160 we split them in two. This means that the opcode stream in the
161 delta window can contain copies shorter than VD_KEY_SIZE. These
162 could be represented by insert ops instead, but we'll leave them
163 in, so that we can merge them again when we convert the delta
164 window to an external format like vcdiff that supports cross
168 /* Find the length of a match within the data window.
169 Note that (match < from && from <= end) must always be true here. */
171 static APR_INLINE
int
172 find_match_len(const char *match
, const char *from
, const char *end
)
174 const char *here
= from
;
175 while (here
< end
&& *match
== *here
)
184 /* This is the main vdelta generator. */
187 vdelta(svn_txdelta__ops_baton_t
*build_baton
,
191 svn_boolean_t outputflag
,
195 const char *here
= start
; /* Current position in the buffer. */
196 const char *insert_from
= NULL
; /* Start of byte range for insertion. */
200 const char *current_match
, *key
;
201 apr_size_t current_match_len
= 0;
203 svn_boolean_t progress
;
205 /* If we're near the end, just insert the last few bytes. */
206 if (end
- here
< VD_KEY_SIZE
)
208 const char *from
= ((insert_from
!= NULL
) ? insert_from
: here
);
210 if (outputflag
&& from
< end
)
211 svn_txdelta__insert_op(build_baton
, svn_txdelta_new
, 0,
212 end
- from
, from
, pool
);
216 /* Search for the longest match. */
217 current_match
= NULL
;
218 current_match_len
= 0;
222 /* Try to extend the current match. Our key is the last
223 three matched bytes plus one unmatched byte if we already
224 have a current match, or just the four bytes where we are
225 if we don't have a current match yet. See which mapping
226 yields the longest extension. */
228 for (slot
= table
->buckets
[get_bucket(table
, key
)];
233 apr_size_t match_len
;
235 if (slot
- table
->slots
< key
- here
) /* Too close to start */
237 match
= data
+ (slot
- table
->slots
) - (key
- here
);
238 match_len
= find_match_len(match
, here
, end
);
240 /* We can only copy from the source or from the target, so
241 don't let the match cross START. */
242 if (match
< start
&& match
+ match_len
> start
)
243 match_len
= start
- match
;
245 if (match_len
>= VD_KEY_SIZE
&& match_len
> current_match_len
)
247 /* We have a longer match; record it. */
248 current_match
= match
;
249 current_match_len
= match_len
;
254 key
= here
+ current_match_len
- (VD_KEY_SIZE
- 1);
256 while (progress
&& end
- key
>= VD_KEY_SIZE
);
258 if (current_match_len
< VD_KEY_SIZE
)
260 /* There is no match here; store a mapping and insert this byte. */
261 store_mapping(table
, here
, here
- data
);
262 if (insert_from
== NULL
)
269 if (insert_from
!= NULL
)
271 /* Commit the pending insert. */
272 svn_txdelta__insert_op(build_baton
, svn_txdelta_new
,
273 0, here
- insert_from
,
277 if (current_match
< start
) /* Copy from source. */
278 svn_txdelta__insert_op(build_baton
, svn_txdelta_source
,
279 current_match
- data
,
282 else /* Copy from target */
283 svn_txdelta__insert_op(build_baton
, svn_txdelta_target
,
284 current_match
- start
,
289 /* Adjust the current position and insert mappings for the
290 last three bytes of the match. */
291 here
+= current_match_len
;
292 if (end
- here
>= VD_KEY_SIZE
)
294 char const *last
= here
- (VD_KEY_SIZE
- 1);
295 for (; last
< here
; ++last
)
296 store_mapping(table
, last
, last
- data
);
303 svn_txdelta__vdelta(svn_txdelta__ops_baton_t
*build_baton
,
305 apr_size_t source_len
,
306 apr_size_t target_len
,
309 hash_table_t
*table
= create_hash_table(source_len
+ target_len
, pool
);
311 vdelta(build_baton
, data
, data
, data
+ source_len
, FALSE
, table
, pool
);
312 vdelta(build_baton
, data
, data
+ source_len
, data
+ source_len
+ target_len
,
316 /* This bit of code calculates the hash load and the
317 number of collisions. Please note that a the number
318 of collisions per bucket is one less than the length
319 of the chain. :-) --xbc */
324 for (i
= 0; i
< table
->num_buckets
; ++i
)
326 hash_slot_t
*slot
= table
->buckets
[i
];
339 fprintf(stderr
, "Hash stats: load %d, collisions %d\n",
340 100 - 100 * empty
/ table
->num_buckets
, collisions
);