#include "git-compat-util.h"
#include "bloom.h"
#include "diff.h"
#include "diffcore.h"
#include "hashmap.h"
#include "commit-graph.h"
#include "commit.h"
#include "commit-slab.h"
#include "tree.h"
#include "tree-walk.h"
#include "config.h"
#include "repository.h"
define_commit_slab(bloom_filter_slab, struct bloom_filter);

static struct bloom_filter_slab bloom_filters;

struct pathmap_hash_entry {
        struct hashmap_entry entry;
        const char path[FLEX_ARRAY];
};
static uint32_t rotate_left(uint32_t value, int32_t count)
{
        uint32_t mask = 8 * sizeof(uint32_t) - 1;
        count &= mask;
        return ((value << count) | (value >> ((-count) & mask)));
}

static inline unsigned char get_bitmask(uint32_t pos)
{
        return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
}
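/*
 * Editorial note (not part of the original file): BITS_PER_WORD is 8 in
 * bloom.h, so '& (BITS_PER_WORD - 1)' selects the bit index within a
 * single byte, e.g.:
 *
 *      get_bitmask(13) == 1 << (13 & 7) == 0x20
 *
 * The matching byte index is pos / BITS_PER_WORD; add_key_to_filter()
 * and bloom_filter_contains() below use exactly this split to set and
 * test individual filter bits.
 */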
static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
                              uint32_t offset)
{
        /*
         * Note that we allow offsets equal to the data size, which would set
         * our pointers at one past the end of the chunk memory. This is
         * necessary because the on-disk index points to the end of the
         * entries (so we can compute size by comparing adjacent ones). And
         * naturally the final entry's end is one-past-the-end of the chunk.
         */
        if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
                return 0;

        warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
                " filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
                (uintmax_t)offset, (uintmax_t)pos,
                g->filename, (uintmax_t)g->chunk_bloom_data_size);
        return -1;
}
int load_bloom_filter_from_graph(struct commit_graph *g,
                                 struct bloom_filter *filter,
                                 uint32_t graph_pos)
{
        uint32_t lex_pos, start_index, end_index;

        while (graph_pos < g->num_commits_in_base)
                g = g->base_graph;

        /* The commit graph this commit lives in doesn't carry Bloom filters. */
        if (!g->chunk_bloom_indexes)
                return 0;

        lex_pos = graph_pos - g->num_commits_in_base;

        end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);

        if (lex_pos > 0)
                start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
        else
                start_index = 0;

        if (check_bloom_offset(g, lex_pos, end_index) < 0 ||
            check_bloom_offset(g, lex_pos - 1, start_index) < 0)
                return 0;

        if (end_index < start_index) {
                warning("ignoring decreasing changed-path index offsets"
                        " (%"PRIuMAX" > %"PRIuMAX") for positions"
                        " %"PRIuMAX" and %"PRIuMAX" of %s",
                        (uintmax_t)start_index, (uintmax_t)end_index,
                        (uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
                        g->filename);
                return 0;
        }

        filter->len = end_index - start_index;
        filter->data = (unsigned char *)(g->chunk_bloom_data +
                                         sizeof(unsigned char) * start_index +
                                         BLOOMDATA_CHUNK_HEADER_SIZE);
        filter->version = g->bloom_filter_settings->hash_version;
        filter->to_free = NULL;

        return 1;
}
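/*
 * Worked example (editorial, not part of the original file): if the
 * BIDX chunk stores cumulative offsets 5, 5 and 12 for lexicographic
 * positions 0, 1 and 2, the code above computes:
 *
 *      lex_pos 0: start_index = 0, end_index = 5  -> filter->len = 5
 *      lex_pos 1: start_index = 5, end_index = 5  -> filter->len = 0
 *      lex_pos 2: start_index = 5, end_index = 12 -> filter->len = 7
 *
 * i.e. each filter's length is the difference of adjacent offsets, with
 * position 0 implicitly starting at offset 0, and filter->data points
 * start_index bytes past the BDAT chunk header.
 */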
/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 */
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
        const uint32_t c1 = 0xcc9e2d51;
        const uint32_t c2 = 0x1b873593;
        const uint32_t r1 = 15;
        const uint32_t r2 = 13;
        const uint32_t m = 5;
        const uint32_t n = 0xe6546b64;
        int i;
        uint32_t k1 = 0;
        const char *tail;

        int len4 = len / sizeof(uint32_t);

        uint32_t k;
        for (i = 0; i < len4; i++) {
                uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
                uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
                uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
                uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
                k = byte1 | byte2 | byte3 | byte4;
                k *= c1;
                k = rotate_left(k, r1);
                k *= c2;

                seed ^= k;
                seed = rotate_left(seed, r2) * m + n;
        }

        tail = (data + len4 * sizeof(uint32_t));

        switch (len & (sizeof(uint32_t) - 1)) {
        case 3:
                k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
                /* fallthrough */
        case 2:
                k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
                /* fallthrough */
        case 1:
                k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
                k1 *= c1;
                k1 = rotate_left(k1, r1);
                k1 *= c2;
                seed ^= k1;
                break;
        }

        seed ^= (uint32_t)len;
        seed ^= (seed >> 16);
        seed *= 0x85ebca6b;
        seed ^= (seed >> 13);
        seed *= 0xc2b2ae35;
        seed ^= (seed >> 16);

        return seed;
}
static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
{
        const uint32_t c1 = 0xcc9e2d51;
        const uint32_t c2 = 0x1b873593;
        const uint32_t r1 = 15;
        const uint32_t r2 = 13;
        const uint32_t m = 5;
        const uint32_t n = 0xe6546b64;
        int i;
        uint32_t k1 = 0;
        const char *tail;

        int len4 = len / sizeof(uint32_t);

        uint32_t k;
        for (i = 0; i < len4; i++) {
                uint32_t byte1 = (uint32_t)data[4*i];
                uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
                uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
                uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
                k = byte1 | byte2 | byte3 | byte4;
                k *= c1;
                k = rotate_left(k, r1);
                k *= c2;

                seed ^= k;
                seed = rotate_left(seed, r2) * m + n;
        }

        tail = (data + len4 * sizeof(uint32_t));

        switch (len & (sizeof(uint32_t) - 1)) {
        case 3:
                k1 ^= ((uint32_t)tail[2]) << 16;
                /* fallthrough */
        case 2:
                k1 ^= ((uint32_t)tail[1]) << 8;
                /* fallthrough */
        case 1:
                k1 ^= ((uint32_t)tail[0]) << 0;
                k1 *= c1;
                k1 = rotate_left(k1, r1);
                k1 *= c2;
                seed ^= k1;
                break;
        }

        seed ^= (uint32_t)len;
        seed ^= (seed >> 16);
        seed *= 0x85ebca6b;
        seed ^= (seed >> 13);
        seed *= 0xc2b2ae35;
        seed ^= (seed >> 16);

        return seed;
}
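/*
 * Editorial note (not part of the original file): the only difference
 * between murmur3_seeded_v1() above and murmur3_seeded_v2() is that v1
 * reads the input through a plain char, so on platforms where char is
 * signed, bytes with the high bit set are sign-extended before being
 * ORed into each 32-bit block. The two functions therefore agree on
 * ASCII-only paths and diverge only for paths containing bytes >= 0x80,
 * which is what upgrade_filter() below checks for. A minimal sketch of
 * the divergence (assuming signed char):
 *
 *      const char byte[1] = { (char)0xe9 };
 *      uint32_t a = murmur3_seeded_v1(0, byte, 1); // mixes in 0xffffffe9
 *      uint32_t b = murmur3_seeded_v2(0, byte, 1); // mixes in 0x000000e9
 *      // a != b, so v1 and v2 filters are not interchangeable here
 */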
void fill_bloom_key(const char *data,
                    size_t len,
                    struct bloom_key *key,
                    const struct bloom_filter_settings *settings)
{
        int i;
        const uint32_t seed0 = 0x293ae76f;
        const uint32_t seed1 = 0x7e646e2c;
        uint32_t hash0, hash1;
        if (settings->hash_version == 2) {
                hash0 = murmur3_seeded_v2(seed0, data, len);
                hash1 = murmur3_seeded_v2(seed1, data, len);
        } else {
                hash0 = murmur3_seeded_v1(seed0, data, len);
                hash1 = murmur3_seeded_v1(seed1, data, len);
        }

        key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
        for (i = 0; i < settings->num_hashes; i++)
                key->hashes[i] = hash0 + i * hash1;
}
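/*
 * Editorial note (not part of the original file): the loop above is the
 * standard double-hashing construction for Bloom filters: rather than
 * evaluating num_hashes independent hash functions, all probe positions
 * are derived from two seeded murmur3 values. With num_hashes == 7 (the
 * current default) this yields, modulo 2^32:
 *
 *      key->hashes[0] = hash0
 *      key->hashes[1] = hash0 + hash1
 *      ...
 *      key->hashes[6] = hash0 + 6 * hash1
 */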
void clear_bloom_key(struct bloom_key *key)
{
        FREE_AND_NULL(key->hashes);
}
void add_key_to_filter(const struct bloom_key *key,
                       struct bloom_filter *filter,
                       const struct bloom_filter_settings *settings)
{
        int i;
        uint64_t mod = filter->len * BITS_PER_WORD;

        for (i = 0; i < settings->num_hashes; i++) {
                uint64_t hash_mod = key->hashes[i] % mod;
                uint64_t block_pos = hash_mod / BITS_PER_WORD;

                filter->data[block_pos] |= get_bitmask(hash_mod);
        }
}
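/*
 * Worked example (editorial, not part of the original file): for a
 * 4-byte filter, mod is 32, so a hash value of 70 sets bit 6:
 *
 *      hash_mod  = 70 % 32 = 6
 *      block_pos = 6 / 8   = 0
 *      filter->data[0] |= get_bitmask(6);  // 0x40
 *
 * Each of the key's num_hashes values sets one such bit; a later lookup
 * reports "maybe present" only if every one of those bits is set.
 */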
void init_bloom_filters(void)
{
        init_bloom_filter_slab(&bloom_filters);
}

static void free_one_bloom_filter(struct bloom_filter *filter)
{
        if (!filter)
                return;
        free(filter->to_free);
}

void deinit_bloom_filters(void)
{
        deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
}

static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
                       const struct hashmap_entry *eptr,
                       const struct hashmap_entry *entry_or_key,
                       const void *keydata UNUSED)
{
        const struct pathmap_hash_entry *e1, *e2;

        e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
        e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);

        return strcmp(e1->path, e2->path);
}
static void init_truncated_large_filter(struct bloom_filter *filter,
                                        int version)
{
        filter->data = filter->to_free = xmalloc(1);
        filter->data[0] = 0xFF;
        filter->len = 1;
        filter->version = version;
}
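/*
 * Editorial note (not part of the original file): a "truncated large"
 * filter is a single byte with all bits set, so every possible bit
 * position tests as set and bloom_filter_contains() answers "maybe" for
 * any key. Callers then fall back to a real tree diff instead of
 * wrongly skipping the commit. Sketch:
 *
 *      struct bloom_filter f;
 *      init_truncated_large_filter(&f, 1);
 *      // bloom_filter_contains(&f, key, settings) == 1 for every key
 */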
#define VISITED (1u<<21)
#define HIGH_BITS (1u<<22)

static int has_entries_with_high_bit(struct repository *r, struct tree *t)
{
        if (parse_tree(t))
                return 1;

        if (!(t->object.flags & VISITED)) {
                struct tree_desc desc;
                struct name_entry entry;

                init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
                while (tree_entry(&desc, &entry)) {
                        size_t i;
                        for (i = 0; i < entry.pathlen; i++) {
                                if (entry.path[i] & 0x80) {
                                        t->object.flags |= HIGH_BITS;
                                        goto done;
                                }
                        }

                        if (S_ISDIR(entry.mode)) {
                                struct tree *sub = lookup_tree(r, &entry.oid);
                                if (sub && has_entries_with_high_bit(r, sub)) {
                                        t->object.flags |= HIGH_BITS;
                                        goto done;
                                }
                        }
                }

done:
                t->object.flags |= VISITED;
        }

        return !!(t->object.flags & HIGH_BITS);
}
static int commit_tree_has_high_bit_paths(struct repository *r,
                                          struct commit *c)
{
        struct tree *t;
        if (repo_parse_commit(r, c))
                return 1;
        t = repo_get_commit_tree(r, c);
        if (!t)
                return 1;
        return has_entries_with_high_bit(r, t);
}
static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
                                           struct bloom_filter *filter,
                                           int hash_version)
{
        struct commit_list *p = c->parents;
        if (commit_tree_has_high_bit_paths(r, c))
                return NULL;

        if (p && commit_tree_has_high_bit_paths(r, p->item))
                return NULL;

        filter->version = hash_version;

        return filter;
}
struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
{
        struct bloom_filter *filter;
        int hash_version;

        filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
        if (!filter)
                return NULL;

        prepare_repo_settings(r);
        hash_version = r->settings.commit_graph_changed_paths_version;

        if (!(hash_version == -1 || hash_version == filter->version))
                return NULL; /* unusable filter */
        return filter;
}
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
                                                 struct commit *c,
                                                 int compute_if_not_present,
                                                 const struct bloom_filter_settings *settings,
                                                 enum bloom_filter_computed *computed)
{
        struct bloom_filter *filter;
        int i;
        struct diff_options diffopt;

        if (computed)
                *computed = BLOOM_NOT_COMPUTED;

        if (!bloom_filters.slab_size)
                return NULL;

        filter = bloom_filter_slab_at(&bloom_filters, c);

        if (!filter->data) {
                uint32_t graph_pos;
                if (repo_find_commit_pos_in_graph(r, c, &graph_pos))
                        load_bloom_filter_from_graph(r->objects->commit_graph,
                                                     filter, graph_pos);
        }

        if (filter->data && filter->len) {
                struct bloom_filter *upgrade;
                if (!settings || settings->hash_version == filter->version)
                        return filter;

                /* version mismatch, see if we can upgrade */
                if (compute_if_not_present &&
                    git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
                        upgrade = upgrade_filter(r, c, filter,
                                                 settings->hash_version);
                        if (upgrade) {
                                if (computed)
                                        *computed |= BLOOM_UPGRADED;
                                return upgrade;
                        }
                }
        }
        if (!compute_if_not_present)
                return NULL;

        repo_diff_setup(r, &diffopt);
        diffopt.flags.recursive = 1;
        diffopt.detect_rename = 0;
        diffopt.max_changes = settings->max_changed_paths;
        diff_setup_done(&diffopt);

        /* ensure commit is parsed so we have parent information */
        repo_parse_commit(r, c);

        if (c->parents)
                diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
        else
                diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
        diffcore_std(&diffopt);

        if (diff_queued_diff.nr <= settings->max_changed_paths) {
                struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
                struct pathmap_hash_entry *e;
                struct hashmap_iter iter;

                for (i = 0; i < diff_queued_diff.nr; i++) {
                        const char *path = diff_queued_diff.queue[i]->two->path;

                        /*
                         * Add each leading directory of the changed file, i.e. for
                         * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
                         * the Bloom filter could be used to speed up commands like
                         * 'git log dir/subdir', too.
                         *
                         * Note that directories are added without the trailing '/'.
                         */
                        do {
                                char *last_slash = strrchr(path, '/');

                                FLEX_ALLOC_STR(e, path, path);
                                hashmap_entry_init(&e->entry, strhash(path));

                                if (!hashmap_get(&pathmap, &e->entry, NULL))
                                        hashmap_add(&pathmap, &e->entry);
                                else
                                        free(e);

                                if (!last_slash)
                                        last_slash = (char*)path;
                                *last_slash = '\0';

                        } while (*path);
                }

                if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
                        init_truncated_large_filter(filter,
                                                    settings->hash_version);
                        if (computed)
                                *computed |= BLOOM_TRUNC_LARGE;
                        goto cleanup;
                }

                filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
                filter->version = settings->hash_version;
                if (!filter->len) {
                        if (computed)
                                *computed |= BLOOM_TRUNC_EMPTY;
                        filter->len = 1;
                }
                CALLOC_ARRAY(filter->data, filter->len);
                filter->to_free = filter->data;

                hashmap_for_each_entry(&pathmap, &iter, e, entry) {
                        struct bloom_key key;
                        fill_bloom_key(e->path, strlen(e->path), &key, settings);
                        add_key_to_filter(&key, filter, settings);
                        clear_bloom_key(&key);
                }

        cleanup:
                hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
        } else {
                init_truncated_large_filter(filter, settings->hash_version);

                if (computed)
                        *computed |= BLOOM_TRUNC_LARGE;
        }

        if (computed)
                *computed |= BLOOM_COMPUTED;

        diff_queue_clear(&diff_queued_diff);
        return filter;
}
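/*
 * Worked example (editorial, not part of the original file): a commit
 * that changes only 'dir/subdir/file' contributes three keys to its
 * filter: "dir", "dir/subdir" and "dir/subdir/file". With the default
 * of 10 bits per entry the sizing above gives
 *
 *      filter->len = (3 * 10 + BITS_PER_WORD - 1) / BITS_PER_WORD = 4
 *
 * i.e. a 4-byte filter, into which each key is hashed via
 * fill_bloom_key() and set via add_key_to_filter(). A later
 * 'git log -- dir/subdir' may skip diffing this commit only if
 * "dir/subdir" misses in the filter, which cannot happen here.
 */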
int bloom_filter_contains(const struct bloom_filter *filter,
                          const struct bloom_key *key,
                          const struct bloom_filter_settings *settings)
{
        int i;
        uint64_t mod = filter->len * BITS_PER_WORD;

        if (!mod)
                return -1;

        for (i = 0; i < settings->num_hashes; i++) {
                uint64_t hash_mod = key->hashes[i] % mod;
                uint64_t block_pos = hash_mod / BITS_PER_WORD;
                if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
                        return 0;
        }

        return 1;
}
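/*
 * Usage sketch (editorial, not part of the original file): a typical
 * caller builds one key per pathspec item and tests it against each
 * commit's filter, roughly:
 *
 *      struct bloom_key key;
 *      struct bloom_filter *filter;
 *
 *      fill_bloom_key("dir/subdir", strlen("dir/subdir"), &key, settings);
 *      filter = get_bloom_filter(r, commit);
 *      if (filter && !bloom_filter_contains(filter, &key, settings))
 *              ;       // 0: path definitely unchanged, skip the tree diff
 *      else
 *              ;       // 1 ("maybe") or -1 (no usable data): do the full diff
 *      clear_bloom_key(&key);
 *
 * A result of 1 is probabilistic: false positives only cost an extra
 * diff, never a wrong answer.
 */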