/*
Copyright 2020 Google LLC

Use of this source code is governed by a BSD-style
license that can be found in the LICENSE file or at
https://developers.google.com/open-source/licenses/bsd
*/

#include "stack.h"

#include "../write-or-die.h"
#include "system.h"
#include "constants.h"
#include "merged.h"
#include "reader.h"
#include "reftable-error.h"
#include "reftable-record.h"
#include "reftable-merged.h"
#include "writer.h"
#include "tempfile.h"

static int stack_try_add(struct reftable_stack *st,
			 int (*write_table)(struct reftable_writer *wr,
					    void *arg),
			 void *arg);
static int stack_write_compact(struct reftable_stack *st,
			       struct reftable_writer *wr,
			       size_t first, size_t last,
			       struct reftable_log_expiry_config *config);
static void reftable_addition_close(struct reftable_addition *add);
static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
					     int reuse_open);

static void stack_filename(struct strbuf *dest, struct reftable_stack *st,
			   const char *name)
{
	strbuf_reset(dest);
	strbuf_addstr(dest, st->reftable_dir);
	strbuf_addstr(dest, "/");
	strbuf_addstr(dest, name);
}

static ssize_t reftable_fd_write(void *arg, const void *data, size_t sz)
{
	int *fdp = (int *)arg;
	return write_in_full(*fdp, data, sz);
}

static int reftable_fd_flush(void *arg)
{
	int *fdp = (int *)arg;

	return fsync_component(FSYNC_COMPONENT_REFERENCE, *fdp);
}

int reftable_new_stack(struct reftable_stack **dest, const char *dir,
		       const struct reftable_write_options *_opts)
{
	struct reftable_stack *p = reftable_calloc(1, sizeof(*p));
	struct strbuf list_file_name = STRBUF_INIT;
	struct reftable_write_options opts = {0};
	int err = 0;

	if (_opts)
		opts = *_opts;
	if (opts.hash_id == 0)
		opts.hash_id = GIT_SHA1_FORMAT_ID;

	*dest = NULL;

	strbuf_reset(&list_file_name);
	strbuf_addstr(&list_file_name, dir);
	strbuf_addstr(&list_file_name, "/tables.list");

	p->list_file = strbuf_detach(&list_file_name, NULL);
	p->list_fd = -1;
	p->reftable_dir = xstrdup(dir);
	p->opts = opts;

	err = reftable_stack_reload_maybe_reuse(p, 1);
	if (err < 0) {
		reftable_stack_destroy(p);
	} else {
		*dest = p;
	}
	return err;
}

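/* Read the file behind `fd` in full and split its contents into a
 * NULL-terminated array of table names. */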
static int fd_read_lines(int fd, char ***namesp)
{
	off_t size = lseek(fd, 0, SEEK_END);
	char *buf = NULL;
	int err = 0;
	if (size < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}
	err = lseek(fd, 0, SEEK_SET);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	REFTABLE_ALLOC_ARRAY(buf, size + 1);
	if (read_in_full(fd, buf, size) != size) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}
	buf[size] = 0;

	parse_names(buf, size, namesp);

done:
	reftable_free(buf);
	return err;
}

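/* Like fd_read_lines(), but opens `filename` first. A missing file yields an
 * empty list of names instead of an error. */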
int read_lines(const char *filename, char ***namesp)
{
	int fd = open(filename, O_RDONLY);
	int err = 0;
	if (fd < 0) {
		if (errno == ENOENT) {
			REFTABLE_CALLOC_ARRAY(*namesp, 1);
			return 0;
		}

		return REFTABLE_IO_ERROR;
	}
	err = fd_read_lines(fd, namesp);
	close(fd);
	return err;
}

void reftable_stack_init_ref_iterator(struct reftable_stack *st,
				      struct reftable_iterator *it)
{
	merged_table_init_iter(reftable_stack_merged_table(st),
			       it, BLOCK_TYPE_REF);
}

void reftable_stack_init_log_iterator(struct reftable_stack *st,
				      struct reftable_iterator *it)
{
	merged_table_init_iter(reftable_stack_merged_table(st),
			       it, BLOCK_TYPE_LOG);
}

struct reftable_merged_table *
reftable_stack_merged_table(struct reftable_stack *st)
{
	return st->merged;
}

static int has_name(char **names, const char *name)
{
	while (*names) {
		if (!strcmp(*names, name))
			return 1;
		names++;
	}
	return 0;
}

/* Close and free the stack */
void reftable_stack_destroy(struct reftable_stack *st)
{
	char **names = NULL;
	int err = 0;
	if (st->merged) {
		reftable_merged_table_free(st->merged);
		st->merged = NULL;
	}

	err = read_lines(st->list_file, &names);
	if (err < 0) {
		FREE_AND_NULL(names);
	}

	if (st->readers) {
		int i = 0;
		struct strbuf filename = STRBUF_INIT;
		for (i = 0; i < st->readers_len; i++) {
			const char *name = reader_name(st->readers[i]);
			strbuf_reset(&filename);
			if (names && !has_name(names, name)) {
				stack_filename(&filename, st, name);
			}
			reftable_reader_decref(st->readers[i]);

			if (filename.len) {
				/* On Windows, can only unlink after closing. */
				unlink(filename.buf);
			}
		}
		strbuf_release(&filename);
		st->readers_len = 0;
		FREE_AND_NULL(st->readers);
	}

	if (st->list_fd >= 0) {
		close(st->list_fd);
		st->list_fd = -1;
	}

	FREE_AND_NULL(st->list_file);
	FREE_AND_NULL(st->reftable_dir);
	reftable_free(st);
	free_names(names);
}

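/* Return a shallow copy of the first `cur_len` readers of the stack. */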
static struct reftable_reader **stack_copy_readers(struct reftable_stack *st,
						   int cur_len)
{
	struct reftable_reader **cur = reftable_calloc(cur_len, sizeof(*cur));
	int i = 0;
	for (i = 0; i < cur_len; i++) {
		cur[i] = st->readers[i];
	}
	return cur;
}

static int reftable_stack_reload_once(struct reftable_stack *st,
				      const char **names,
				      int reuse_open)
{
	size_t cur_len = !st->merged ? 0 : st->merged->readers_len;
	struct reftable_reader **cur = stack_copy_readers(st, cur_len);
	struct reftable_reader **reused = NULL;
	size_t reused_len = 0, reused_alloc = 0;
	size_t names_len = names_length(names);
	struct reftable_reader **new_readers =
		reftable_calloc(names_len, sizeof(*new_readers));
	size_t new_readers_len = 0;
	struct reftable_merged_table *new_merged = NULL;
	struct strbuf table_path = STRBUF_INIT;
	int err = 0;
	size_t i;

	while (*names) {
		struct reftable_reader *rd = NULL;
		const char *name = *names++;

		/* this is linear; we assume compaction keeps the number of
		   tables under control so this is not quadratic. */
		for (i = 0; reuse_open && i < cur_len; i++) {
			if (cur[i] && 0 == strcmp(cur[i]->name, name)) {
				rd = cur[i];
				cur[i] = NULL;

				/*
				 * When reloading the stack fails, we end up
				 * releasing all new readers. This also
				 * includes the reused readers, even though
				 * they are still in use by the old stack. We
				 * thus need to keep them alive here, which we
				 * do by bumping their refcount.
				 */
				REFTABLE_ALLOC_GROW(reused, reused_len + 1, reused_alloc);
				reused[reused_len++] = rd;
				reftable_reader_incref(rd);
				break;
			}
		}

		if (!rd) {
			struct reftable_block_source src = { NULL };
			stack_filename(&table_path, st, name);

			err = reftable_block_source_from_file(&src,
							      table_path.buf);
			if (err < 0)
				goto done;

			err = reftable_reader_new(&rd, &src, name);
			if (err < 0)
				goto done;
		}

		new_readers[new_readers_len] = rd;
		new_readers_len++;
	}

	/* success! */
	err = reftable_merged_table_new(&new_merged, new_readers,
					new_readers_len, st->opts.hash_id);
	if (err < 0)
		goto done;

	/*
	 * Close the old, non-reused readers and proactively try to unlink
	 * them. This is done for systems like Windows, where the underlying
	 * file of such an open reader wouldn't have been possible to be
	 * unlinked by the compacting process.
	 */
	for (i = 0; i < cur_len; i++) {
		if (cur[i]) {
			const char *name = reader_name(cur[i]);
			stack_filename(&table_path, st, name);
			reftable_reader_decref(cur[i]);
			unlink(table_path.buf);
		}
	}

	/* Update the stack to point to the new tables. */
	if (st->merged)
		reftable_merged_table_free(st->merged);
	new_merged->suppress_deletions = 1;
	st->merged = new_merged;

	if (st->readers)
		reftable_free(st->readers);
	st->readers = new_readers;
	st->readers_len = new_readers_len;
	new_readers = NULL;
	new_readers_len = 0;

	/*
	 * Decrement the refcount of reused readers again. This only needs to
	 * happen on the successful case, because on the unsuccessful one we
	 * decrement their refcount via `new_readers`.
	 */
	for (i = 0; i < reused_len; i++)
		reftable_reader_decref(reused[i]);

done:
	for (i = 0; i < new_readers_len; i++)
		reftable_reader_decref(new_readers[i]);
	reftable_free(new_readers);
	reftable_free(reused);
	reftable_free(cur);
	strbuf_release(&table_path);
	return err;
}

/* return negative if a before b. */
static int tv_cmp(struct timeval *a, struct timeval *b)
{
	time_t diff = a->tv_sec - b->tv_sec;
	int udiff = a->tv_usec - b->tv_usec;

	if (diff != 0)
		return diff;

	return udiff;
}

static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
					     int reuse_open)
{
	char **names = NULL, **names_after = NULL;
	struct timeval deadline;
	int64_t delay = 0;
	int tries = 0, err;
	int fd = -1;

	err = gettimeofday(&deadline, NULL);
	if (err < 0)
		goto out;
	deadline.tv_sec += 3;

	while (1) {
		struct timeval now;

		err = gettimeofday(&now, NULL);
		if (err < 0)
			goto out;

		/*
		 * Only look at deadlines after the first few times. This
		 * simplifies debugging in GDB.
		 */
		tries++;
		if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
			goto out;

		fd = open(st->list_file, O_RDONLY);
		if (fd < 0) {
			if (errno != ENOENT) {
				err = REFTABLE_IO_ERROR;
				goto out;
			}

			REFTABLE_CALLOC_ARRAY(names, 1);
		} else {
			err = fd_read_lines(fd, &names);
			if (err < 0)
				goto out;
		}

		err = reftable_stack_reload_once(st, (const char **) names, reuse_open);
		if (!err)
			break;
		if (err != REFTABLE_NOT_EXIST_ERROR)
			goto out;

		/*
		 * REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
		 * writer. Check if there was one by checking if the name list
		 * changed.
		 */
		err = read_lines(st->list_file, &names_after);
		if (err < 0)
			goto out;
		if (names_equal((const char **) names_after,
				(const char **) names)) {
			err = REFTABLE_NOT_EXIST_ERROR;
			goto out;
		}

		free_names(names);
		names = NULL;
		free_names(names_after);
		names_after = NULL;
		close(fd);
		fd = -1;

		delay = delay + (delay * rand()) / RAND_MAX + 1;
		sleep_millisec(delay);
	}

out:
	/*
	 * Invalidate the stat cache. It is sufficient to only close the file
	 * descriptor and keep the cached stat info because we never use the
	 * latter when the former is negative.
	 */
	if (st->list_fd >= 0) {
		close(st->list_fd);
		st->list_fd = -1;
	}

	/*
	 * Cache stat information in case it provides a useful signal to us.
	 * According to POSIX, "The st_ino and st_dev fields taken together
	 * uniquely identify the file within the system." That being said,
	 * Windows is not POSIX compliant and we do not have these fields
	 * available. So the information we have there is insufficient to
	 * determine whether two file descriptors point to the same file.
	 *
	 * While we could fall back to using other signals like the file's
	 * mtime, those are not sufficient to avoid races. We thus refrain from
	 * using the stat cache on such systems and fall back to the secondary
	 * caching mechanism, which is to check whether contents of the file
	 * have changed.
	 *
	 * On other systems which are POSIX compliant we must keep the file
	 * descriptor open. This is to avoid a race condition where two
	 * processes access the reftable stack at the same point in time:
	 *
	 *   1. A reads the reftable stack and caches its stat info.
	 *
	 *   2. B updates the stack, appending a new table to "tables.list".
	 *      This will both use a new inode and result in a different file
	 *      size, thus invalidating A's cache in theory.
	 *
	 *   3. B decides to auto-compact the stack and merges two tables. The
	 *      file size now matches what A has cached again. Furthermore, the
	 *      filesystem may decide to recycle the inode number of the file
	 *      we have replaced in (2) because it is not in use anymore.
	 *
	 *   4. A reloads the reftable stack. Neither the inode number nor the
	 *      file size changed. If the timestamps did not change either then
	 *      we think the cached copy of our stack is up-to-date.
	 *
	 * By keeping the file descriptor open the inode number cannot be
	 * recycled, mitigating the race.
	 */
	if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
	    st->list_st.st_dev && st->list_st.st_ino) {
		st->list_fd = fd;
		fd = -1;
	}

	if (fd >= 0)
		close(fd);
	free_names(names);
	free_names(names_after);
	return err;
}

/* -1 = error
   0 = up to date
   1 = changed. */
static int stack_uptodate(struct reftable_stack *st)
{
	char **names = NULL;
	int err;
	int i = 0;

	/*
	 * When we have cached stat information available then we use it to
	 * verify whether the file has been rewritten.
	 *
	 * Note that we explicitly do not want to use `stat_validity_check()`
	 * and friends here because they may end up not comparing the `st_dev`
	 * and `st_ino` fields. These functions thus cannot guarantee that we
	 * indeed still have the same file.
	 */
	if (st->list_fd >= 0) {
		struct stat list_st;

		if (stat(st->list_file, &list_st) < 0) {
			/*
			 * It's fine for "tables.list" to not exist. In that
			 * case, we have to refresh when the loaded stack has
			 * any readers.
			 */
			if (errno == ENOENT)
				return !!st->readers_len;
			return REFTABLE_IO_ERROR;
		}

		/*
		 * When "tables.list" refers to the same file we can assume
		 * that it didn't change. This is because we always use
		 * rename(3P) to update the file and never write to it
		 * directly.
		 */
		if (st->list_st.st_dev == list_st.st_dev &&
		    st->list_st.st_ino == list_st.st_ino)
			return 0;
	}

	err = read_lines(st->list_file, &names);
	if (err < 0)
		return err;

	for (i = 0; i < st->readers_len; i++) {
		if (!names[i]) {
			err = 1;
			goto done;
		}

		if (strcmp(st->readers[i]->name, names[i])) {
			err = 1;
			goto done;
		}
	}

	if (names[st->merged->readers_len]) {
		err = 1;
		goto done;
	}

done:
	free_names(names);
	return err;
}

int reftable_stack_reload(struct reftable_stack *st)
{
	int err = stack_uptodate(st);
	if (err > 0)
		return reftable_stack_reload_maybe_reuse(st, 1);
	return err;
}

int reftable_stack_add(struct reftable_stack *st,
		       int (*write)(struct reftable_writer *wr, void *arg),
		       void *arg)
{
	int err = stack_try_add(st, write, arg);
	if (err < 0) {
		if (err == REFTABLE_OUTDATED_ERROR) {
			/* Ignore error return, we want to propagate
			   REFTABLE_OUTDATED_ERROR.
			 */
			reftable_stack_reload(st);
		}
		return err;
	}

	return 0;
}

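/* Format a table name of the form 0x<min>-0x<max>-<random>; callers append
 * the file suffix themselves. */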
static void format_name(struct strbuf *dest, uint64_t min, uint64_t max)
{
	char buf[100];
	uint32_t rnd = (uint32_t)git_rand();
	snprintf(buf, sizeof(buf), "0x%012" PRIx64 "-0x%012" PRIx64 "-%08x",
		 min, max, rnd);
	strbuf_reset(dest);
	strbuf_addstr(dest, buf);
}

struct reftable_addition {
	struct lock_file tables_list_lock;
	struct reftable_stack *stack;

	char **new_tables;
	size_t new_tables_len, new_tables_cap;
	uint64_t next_update_index;
};

#define REFTABLE_ADDITION_INIT {0}

static int reftable_stack_init_addition(struct reftable_addition *add,
					struct reftable_stack *st,
					unsigned int flags)
{
	struct strbuf lock_file_name = STRBUF_INIT;
	int err;

	add->stack = st;

	err = hold_lock_file_for_update_timeout(&add->tables_list_lock,
						st->list_file,
						LOCK_NO_DEREF,
						st->opts.lock_timeout_ms);
	if (err < 0) {
		if (errno == EEXIST) {
			err = REFTABLE_LOCK_ERROR;
		} else {
			err = REFTABLE_IO_ERROR;
		}
		goto done;
	}
	if (st->opts.default_permissions) {
		if (chmod(get_lock_file_path(&add->tables_list_lock),
			  st->opts.default_permissions) < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}

	err = stack_uptodate(st);
	if (err < 0)
		goto done;
	if (err > 0 && flags & REFTABLE_STACK_NEW_ADDITION_RELOAD) {
		err = reftable_stack_reload_maybe_reuse(add->stack, 1);
		if (err)
			goto done;
	}
	if (err > 0) {
		err = REFTABLE_OUTDATED_ERROR;
		goto done;
	}

	add->next_update_index = reftable_stack_next_update_index(st);
done:
	if (err)
		reftable_addition_close(add);
	strbuf_release(&lock_file_name);
	return err;
}

static void reftable_addition_close(struct reftable_addition *add)
{
	struct strbuf nm = STRBUF_INIT;
	size_t i;

	for (i = 0; i < add->new_tables_len; i++) {
		stack_filename(&nm, add->stack, add->new_tables[i]);
		unlink(nm.buf);
		reftable_free(add->new_tables[i]);
		add->new_tables[i] = NULL;
	}
	reftable_free(add->new_tables);
	add->new_tables = NULL;
	add->new_tables_len = 0;
	add->new_tables_cap = 0;

	rollback_lock_file(&add->tables_list_lock);
	strbuf_release(&nm);
}

void reftable_addition_destroy(struct reftable_addition *add)
{
	if (!add) {
		return;
	}
	reftable_addition_close(add);
	reftable_free(add);
}

int reftable_addition_commit(struct reftable_addition *add)
{
	struct strbuf table_list = STRBUF_INIT;
	int lock_file_fd = get_lock_file_fd(&add->tables_list_lock);
	int err = 0;
	size_t i;

	if (add->new_tables_len == 0)
		goto done;

	for (i = 0; i < add->stack->merged->readers_len; i++) {
		strbuf_addstr(&table_list, add->stack->readers[i]->name);
		strbuf_addstr(&table_list, "\n");
	}
	for (i = 0; i < add->new_tables_len; i++) {
		strbuf_addstr(&table_list, add->new_tables[i]);
		strbuf_addstr(&table_list, "\n");
	}

	err = write_in_full(lock_file_fd, table_list.buf, table_list.len);
	strbuf_release(&table_list);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	err = fsync_component(FSYNC_COMPONENT_REFERENCE, lock_file_fd);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	err = commit_lock_file(&add->tables_list_lock);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	/* success, no more state to clean up. */
	for (i = 0; i < add->new_tables_len; i++)
		reftable_free(add->new_tables[i]);
	reftable_free(add->new_tables);
	add->new_tables = NULL;
	add->new_tables_len = 0;
	add->new_tables_cap = 0;

	err = reftable_stack_reload_maybe_reuse(add->stack, 1);
	if (err)
		goto done;

	if (!add->stack->opts.disable_auto_compact) {
		/*
		 * Auto-compact the stack to keep the number of tables in
		 * control. It is possible that a concurrent writer is already
		 * trying to compact parts of the stack, which would lead to a
		 * `REFTABLE_LOCK_ERROR` because parts of the stack are locked
		 * already. This is a benign error though, so we ignore it.
		 */
		err = reftable_stack_auto_compact(add->stack);
		if (err < 0 && err != REFTABLE_LOCK_ERROR)
			goto done;
		err = 0;
	}

done:
	reftable_addition_close(add);
	return err;
}

int reftable_stack_new_addition(struct reftable_addition **dest,
				struct reftable_stack *st,
				unsigned int flags)
{
	int err = 0;
	struct reftable_addition empty = REFTABLE_ADDITION_INIT;
	REFTABLE_CALLOC_ARRAY(*dest, 1);
	**dest = empty;
	err = reftable_stack_init_addition(*dest, st, flags);
	if (err) {
		reftable_free(*dest);
		*dest = NULL;
	}
	return err;
}

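/* Write a single new table via `write_table` and commit it to "tables.list"
 * in one step, without retrying on an outdated stack. */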
static int stack_try_add(struct reftable_stack *st,
			 int (*write_table)(struct reftable_writer *wr,
					    void *arg),
			 void *arg)
{
	struct reftable_addition add = REFTABLE_ADDITION_INIT;
	int err = reftable_stack_init_addition(&add, st, 0);
	if (err < 0)
		goto done;

	err = reftable_addition_add(&add, write_table, arg);
	if (err < 0)
		goto done;

	err = reftable_addition_commit(&add);
done:
	reftable_addition_close(&add);
	return err;
}

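/* Write one new table into a tempfile via `write_table`, rename it into place
 * and record its name as part of the pending addition. */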
int reftable_addition_add(struct reftable_addition *add,
			  int (*write_table)(struct reftable_writer *wr,
					     void *arg),
			  void *arg)
{
	struct strbuf temp_tab_file_name = STRBUF_INIT;
	struct strbuf tab_file_name = STRBUF_INIT;
	struct strbuf next_name = STRBUF_INIT;
	struct reftable_writer *wr = NULL;
	struct tempfile *tab_file = NULL;
	int err = 0;
	int tab_fd;

	strbuf_reset(&next_name);
	format_name(&next_name, add->next_update_index, add->next_update_index);

	stack_filename(&temp_tab_file_name, add->stack, next_name.buf);
	strbuf_addstr(&temp_tab_file_name, ".temp.XXXXXX");

	tab_file = mks_tempfile(temp_tab_file_name.buf);
	if (!tab_file) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}
	if (add->stack->opts.default_permissions) {
		if (chmod(get_tempfile_path(tab_file),
			  add->stack->opts.default_permissions)) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}
	tab_fd = get_tempfile_fd(tab_file);

	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush, &tab_fd,
				 &add->stack->opts);
	err = write_table(wr, arg);
	if (err < 0)
		goto done;

	err = reftable_writer_close(wr);
	if (err == REFTABLE_EMPTY_TABLE_ERROR) {
		err = 0;
		goto done;
	}
	if (err < 0)
		goto done;

	err = close_tempfile_gently(tab_file);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	if (wr->min_update_index < add->next_update_index) {
		err = REFTABLE_API_ERROR;
		goto done;
	}

	format_name(&next_name, wr->min_update_index, wr->max_update_index);
	strbuf_addstr(&next_name, ".ref");
	stack_filename(&tab_file_name, add->stack, next_name.buf);

	/*
	  On windows, this relies on rand() picking a unique destination name.
	  Maybe we should do retry loop as well?
	 */
	err = rename_tempfile(&tab_file, tab_file_name.buf);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	REFTABLE_ALLOC_GROW(add->new_tables, add->new_tables_len + 1,
			    add->new_tables_cap);
	add->new_tables[add->new_tables_len++] = strbuf_detach(&next_name, NULL);
done:
	delete_tempfile(&tab_file);
	strbuf_release(&temp_tab_file_name);
	strbuf_release(&tab_file_name);
	strbuf_release(&next_name);
	reftable_writer_free(wr);
	return err;
}

uint64_t reftable_stack_next_update_index(struct reftable_stack *st)
{
	int sz = st->merged->readers_len;
	if (sz > 0)
		return reftable_reader_max_update_index(st->readers[sz - 1]) +
			1;
	return 1;
}

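/* Compact the tables in the range [first, last] into a new tempfile, handing
 * the still-open tempfile back to the caller via `tab_file_out`. */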
static int stack_compact_locked(struct reftable_stack *st,
				size_t first, size_t last,
				struct reftable_log_expiry_config *config,
				struct tempfile **tab_file_out)
{
	struct strbuf next_name = STRBUF_INIT;
	struct strbuf tab_file_path = STRBUF_INIT;
	struct reftable_writer *wr = NULL;
	struct tempfile *tab_file;
	int tab_fd, err = 0;

	format_name(&next_name,
		    reftable_reader_min_update_index(st->readers[first]),
		    reftable_reader_max_update_index(st->readers[last]));
	stack_filename(&tab_file_path, st, next_name.buf);
	strbuf_addstr(&tab_file_path, ".temp.XXXXXX");

	tab_file = mks_tempfile(tab_file_path.buf);
	if (!tab_file) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}
	tab_fd = get_tempfile_fd(tab_file);

	if (st->opts.default_permissions &&
	    chmod(get_tempfile_path(tab_file), st->opts.default_permissions) < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush,
				 &tab_fd, &st->opts);
	err = stack_write_compact(st, wr, first, last, config);
	if (err < 0)
		goto done;

	err = reftable_writer_close(wr);
	if (err < 0)
		goto done;

	err = close_tempfile_gently(tab_file);
	if (err < 0)
		goto done;

	*tab_file_out = tab_file;
	tab_file = NULL;

done:
	delete_tempfile(&tab_file);
	reftable_writer_free(wr);
	strbuf_release(&next_name);
	strbuf_release(&tab_file_path);
	return err;
}

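/* Write the merged contents of tables [first, last] to `wr`. Deletion records
 * are dropped when compacting down to the base table, and log records may be
 * expired according to `config`. */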
static int stack_write_compact(struct reftable_stack *st,
			       struct reftable_writer *wr,
			       size_t first, size_t last,
			       struct reftable_log_expiry_config *config)
{
	struct reftable_merged_table *mt = NULL;
	struct reftable_iterator it = { NULL };
	struct reftable_ref_record ref = { NULL };
	struct reftable_log_record log = { NULL };
	size_t subtabs_len = last - first + 1;
	uint64_t entries = 0;
	int err = 0;

	for (size_t i = first; i <= last; i++)
		st->stats.bytes += st->readers[i]->size;
	reftable_writer_set_limits(wr, st->readers[first]->min_update_index,
				   st->readers[last]->max_update_index);

	err = reftable_merged_table_new(&mt, st->readers + first, subtabs_len,
					st->opts.hash_id);
	if (err < 0)
		goto done;

	merged_table_init_iter(mt, &it, BLOCK_TYPE_REF);
	err = reftable_iterator_seek_ref(&it, "");
	if (err < 0)
		goto done;

	while (1) {
		err = reftable_iterator_next_ref(&it, &ref);
		if (err > 0) {
			err = 0;
			break;
		}
		if (err < 0)
			goto done;

		if (first == 0 && reftable_ref_record_is_deletion(&ref)) {
			continue;
		}

		err = reftable_writer_add_ref(wr, &ref);
		if (err < 0)
			goto done;
		entries++;
	}
	reftable_iterator_destroy(&it);

	merged_table_init_iter(mt, &it, BLOCK_TYPE_LOG);
	err = reftable_iterator_seek_log(&it, "");
	if (err < 0)
		goto done;

	while (1) {
		err = reftable_iterator_next_log(&it, &log);
		if (err > 0) {
			err = 0;
			break;
		}
		if (err < 0)
			goto done;
		if (first == 0 && reftable_log_record_is_deletion(&log)) {
			continue;
		}

		if (config && config->min_update_index > 0 &&
		    log.update_index < config->min_update_index) {
			continue;
		}

		if (config && config->time > 0 &&
		    log.value.update.time < config->time) {
			continue;
		}

		err = reftable_writer_add_log(wr, &log);
		if (err < 0)
			goto done;
		entries++;
	}

done:
	reftable_iterator_destroy(&it);
	if (mt)
		reftable_merged_table_free(mt);
	reftable_ref_record_release(&ref);
	reftable_log_record_release(&log);
	st->stats.entries_written += entries;
	return err;
}

enum stack_compact_range_flags {
	/*
	 * Perform a best-effort compaction. That is, even if we cannot lock
	 * all tables in the specified range, we will try to compact the
	 * remaining slice.
	 */
	STACK_COMPACT_RANGE_BEST_EFFORT = (1 << 0),
};

/*
 * Compact all tables in the inclusive range `[first, last]` into a single new
 * table.
 *
 * This function returns `0` on success or a code `< 0` on failure. When the
 * stack or any of the tables in the specified range are already locked then
 * this function returns `REFTABLE_LOCK_ERROR`. This is a benign error that
 * callers can either ignore, or they may choose to retry compaction after some
 * amount of time.
 */
static int stack_compact_range(struct reftable_stack *st,
			       size_t first, size_t last,
			       struct reftable_log_expiry_config *expiry,
			       unsigned int flags)
{
	struct strbuf tables_list_buf = STRBUF_INIT;
	struct strbuf new_table_name = STRBUF_INIT;
	struct strbuf new_table_path = STRBUF_INIT;
	struct strbuf table_name = STRBUF_INIT;
	struct lock_file tables_list_lock = LOCK_INIT;
	struct lock_file *table_locks = NULL;
	struct tempfile *new_table = NULL;
	int is_empty_table = 0, err = 0;
	size_t first_to_replace, last_to_replace;
	size_t i, nlocks = 0;
	char **names = NULL;

	if (first > last || (!expiry && first == last)) {
		err = 0;
		goto done;
	}

	st->stats.attempts++;

	/*
	 * Hold the lock so that we can read "tables.list" and lock all tables
	 * which are part of the user-specified range.
	 */
	err = hold_lock_file_for_update_timeout(&tables_list_lock,
						st->list_file,
						LOCK_NO_DEREF,
						st->opts.lock_timeout_ms);
	if (err < 0) {
		if (errno == EEXIST)
			err = REFTABLE_LOCK_ERROR;
		else
			err = REFTABLE_IO_ERROR;
		goto done;
	}

	err = stack_uptodate(st);
	if (err)
		goto done;

	/*
	 * Lock all tables in the user-provided range. This is the slice of our
	 * stack which we'll compact.
	 *
	 * Note that we lock tables in reverse order from last to first. The
	 * intent behind this is to allow a newer process to perform best
	 * effort compaction of tables that it has added in the case where an
	 * older process is still busy compacting tables which are preexisting
	 * from the point of view of the newer process.
	 */
	REFTABLE_CALLOC_ARRAY(table_locks, last - first + 1);
	for (i = last + 1; i > first; i--) {
		stack_filename(&table_name, st, reader_name(st->readers[i - 1]));

		err = hold_lock_file_for_update(&table_locks[nlocks],
						table_name.buf, LOCK_NO_DEREF);
		if (err < 0) {
			/*
			 * When the table is locked already we may do a
			 * best-effort compaction and compact only the tables
			 * that we have managed to lock so far. This of course
			 * requires that we have been able to lock at least two
			 * tables, otherwise there would be nothing to compact.
			 * In that case, we return a lock error to our caller.
			 */
			if (errno == EEXIST && last - (i - 1) >= 2 &&
			    flags & STACK_COMPACT_RANGE_BEST_EFFORT) {
				err = 0;
				/*
				 * The subtraction is to offset the index, the
				 * addition is to only compact up to the table
				 * of the preceding iteration. They obviously
				 * cancel each other out, but that may be
				 * non-obvious when it was omitted.
				 */
				first = (i - 1) + 1;
				break;
			} else if (errno == EEXIST) {
				err = REFTABLE_LOCK_ERROR;
				goto done;
			} else {
				err = REFTABLE_IO_ERROR;
				goto done;
			}
		}

		/*
		 * We need to close the lockfiles as we might otherwise easily
		 * run into file descriptor exhaustion when we compress a lot
		 * of tables.
		 */
		err = close_lock_file_gently(&table_locks[nlocks++]);
		if (err < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}

	/*
	 * We have locked all tables in our range and can thus release the
	 * "tables.list" lock while compacting the locked tables. This allows
	 * concurrent updates to the stack to proceed.
	 */
	err = rollback_lock_file(&tables_list_lock);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	/*
	 * Compact the now-locked tables into a new table. Note that compacting
	 * these tables may end up with an empty new table in case tombstones
	 * end up cancelling out all refs in that range.
	 */
	err = stack_compact_locked(st, first, last, expiry, &new_table);
	if (err < 0) {
		if (err != REFTABLE_EMPTY_TABLE_ERROR)
			goto done;
		is_empty_table = 1;
	}

	/*
	 * Now that we have written the new, compacted table we need to re-lock
	 * "tables.list". We'll then replace the compacted range of tables with
	 * the new table.
	 */
	err = hold_lock_file_for_update_timeout(&tables_list_lock,
						st->list_file,
						LOCK_NO_DEREF,
						st->opts.lock_timeout_ms);
	if (err < 0) {
		if (errno == EEXIST)
			err = REFTABLE_LOCK_ERROR;
		else
			err = REFTABLE_IO_ERROR;
		goto done;
	}

	if (st->opts.default_permissions) {
		if (chmod(get_lock_file_path(&tables_list_lock),
			  st->opts.default_permissions) < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}

	/*
	 * As we have unlocked the stack while compacting our slice of tables
	 * it may have happened that a concurrently running process has updated
	 * the stack while we were compacting. In that case, we need to check
	 * whether the tables that we have just compacted still exist in the
	 * stack in the exact same order as we have compacted them.
	 *
	 * If they do exist, then it is fine to continue and replace those
	 * tables with our compacted version. If they don't, then we need to
	 * abort.
	 */
	err = stack_uptodate(st);
	if (err < 0)
		goto done;
	if (err > 0) {
		ssize_t new_offset = -1;
		int fd;

		fd = open(st->list_file, O_RDONLY);
		if (fd < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}

		err = fd_read_lines(fd, &names);
		close(fd);
		if (err < 0)
			goto done;

		/*
		 * Search for the offset of the first table that we have
		 * compacted in the updated "tables.list" file.
		 */
		for (size_t i = 0; names[i]; i++) {
			if (strcmp(names[i], st->readers[first]->name))
				continue;

			/*
			 * We have found the first entry. Verify that all the
			 * subsequent tables we have compacted still exist in
			 * the modified stack in the exact same order as we
			 * have compacted them.
			 */
			for (size_t j = 1; j < last - first + 1; j++) {
				const char *old = first + j < st->merged->readers_len ?
					st->readers[first + j]->name : NULL;
				const char *new = names[i + j];

				/*
				 * If some entries are missing or in case the tables
				 * have changed then we need to bail out. Again, this
				 * shouldn't ever happen because we have locked the
				 * tables we are compacting.
				 */
				if (!old || !new || strcmp(old, new)) {
					err = REFTABLE_OUTDATED_ERROR;
					goto done;
				}
			}

			new_offset = i;
			break;
		}

		/*
		 * In case we didn't find our compacted tables in the stack we
		 * need to bail out. In theory, this should have never happened
		 * because we locked the tables we are compacting.
		 */
		if (new_offset < 0) {
			err = REFTABLE_OUTDATED_ERROR;
			goto done;
		}

		/*
		 * We have found the new range that we want to replace, so
		 * let's update the range of tables that we want to replace.
		 */
		first_to_replace = new_offset;
		last_to_replace = last + (new_offset - first);
	} else {
		/*
		 * `fd_read_lines()` uses a `NULL` sentinel to indicate that
		 * the array is at its end. As we use `free_names()` to free
		 * the array, we need to include this sentinel value here and
		 * thus have to allocate `readers_len + 1` many entries.
		 */
		REFTABLE_CALLOC_ARRAY(names, st->merged->readers_len + 1);
		for (size_t i = 0; i < st->merged->readers_len; i++)
			names[i] = xstrdup(st->readers[i]->name);
		first_to_replace = first;
		last_to_replace = last;
	}

	/*
	 * If the resulting compacted table is not empty, then we need to move
	 * it into place now.
	 */
	if (!is_empty_table) {
		format_name(&new_table_name, st->readers[first]->min_update_index,
			    st->readers[last]->max_update_index);
		strbuf_addstr(&new_table_name, ".ref");
		stack_filename(&new_table_path, st, new_table_name.buf);

		err = rename_tempfile(&new_table, new_table_path.buf);
		if (err < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}

	/*
	 * Write the new "tables.list" contents with the compacted table we
	 * have just written. In case the compacted table became empty we
	 * simply skip writing it.
	 */
	for (i = 0; i < first_to_replace; i++)
		strbuf_addf(&tables_list_buf, "%s\n", names[i]);
	if (!is_empty_table)
		strbuf_addf(&tables_list_buf, "%s\n", new_table_name.buf);
	for (i = last_to_replace + 1; names[i]; i++)
		strbuf_addf(&tables_list_buf, "%s\n", names[i]);

	err = write_in_full(get_lock_file_fd(&tables_list_lock),
			    tables_list_buf.buf, tables_list_buf.len);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		unlink(new_table_path.buf);
		goto done;
	}

	err = fsync_component(FSYNC_COMPONENT_REFERENCE, get_lock_file_fd(&tables_list_lock));
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		unlink(new_table_path.buf);
		goto done;
	}

	err = commit_lock_file(&tables_list_lock);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		unlink(new_table_path.buf);
		goto done;
	}

	/*
	 * Reload the stack before deleting the compacted tables. We can only
	 * delete the files after we closed them on Windows, so this needs to
	 * happen first.
	 */
	err = reftable_stack_reload_maybe_reuse(st, first < last);
	if (err < 0)
		goto done;

	/*
	 * Delete the old tables. They may still be in use by concurrent
	 * readers, so it is expected that unlinking tables may fail.
	 */
	for (i = 0; i < nlocks; i++) {
		struct lock_file *table_lock = &table_locks[i];
		char *table_path = get_locked_file_path(table_lock);
		unlink(table_path);
		free(table_path);
	}

done:
	rollback_lock_file(&tables_list_lock);
	for (i = 0; table_locks && i < nlocks; i++)
		rollback_lock_file(&table_locks[i]);
	reftable_free(table_locks);

	delete_tempfile(&new_table);
	strbuf_release(&new_table_name);
	strbuf_release(&new_table_path);
	strbuf_release(&tables_list_buf);
	strbuf_release(&table_name);
	free_names(names);

	if (err == REFTABLE_LOCK_ERROR)
		st->stats.failures++;

	return err;
}

int reftable_stack_compact_all(struct reftable_stack *st,
			       struct reftable_log_expiry_config *config)
{
	size_t last = st->merged->readers_len ? st->merged->readers_len - 1 : 0;
	return stack_compact_range(st, 0, last, config, 0);
}

static int segment_size(struct segment *s)
{
	return s->end - s->start;
}

struct segment suggest_compaction_segment(uint64_t *sizes, size_t n,
					  uint8_t factor)
{
	struct segment seg = { 0 };
	uint64_t bytes;
	size_t i;

	if (!factor)
		factor = DEFAULT_GEOMETRIC_FACTOR;

	/*
	 * If there are no tables or only a single one then we don't have to
	 * compact anything. The sequence is geometric by definition already.
	 */
	if (n <= 1)
		return seg;

	/*
	 * Find the ending table of the compaction segment needed to restore the
	 * geometric sequence. Note that the segment end is exclusive.
	 *
	 * To do so, we iterate backwards starting from the most recent table
	 * until a valid segment end is found. If the preceding table is smaller
	 * than the current table multiplied by the geometric factor (2), the
	 * compaction segment end has been identified.
	 *
	 * Tables after the ending point are not added to the byte count because
	 * they are already valid members of the geometric sequence. Due to the
	 * properties of a geometric sequence, it is not possible for the sum of
	 * these tables to exceed the value of the ending point table.
	 *
	 * Example table size sequence requiring no compaction:
	 * 	64, 32, 16, 8, 4, 2, 1
	 *
	 * Example table size sequence where compaction segment end is set to
	 * the last table. Since the segment end is exclusive, the last table is
	 * excluded during subsequent compaction and the table with size 3 is
	 * the final table included:
	 * 	64, 32, 16, 8, 4, 3, 1
	 */
	for (i = n - 1; i > 0; i--) {
		if (sizes[i - 1] < sizes[i] * factor) {
			seg.end = i + 1;
			bytes = sizes[i];
			break;
		}
	}

	/*
	 * Find the starting table of the compaction segment by iterating
	 * through the remaining tables and keeping track of the accumulated
	 * size of all tables seen from the segment end table. The previous
	 * table is compared to the accumulated size because the tables from the
	 * segment end are merged backwards recursively.
	 *
	 * Note that we keep iterating even after we have found the first
	 * starting point. This is because there may be tables in the stack
	 * preceding that first starting point which violate the geometric
	 * sequence.
	 *
	 * Example compaction segment start set to table with size 32:
	 * 	128, 32, 16, 8, 4, 3, 1
	 */
	for (; i > 0; i--) {
		uint64_t curr = bytes;
		bytes += sizes[i - 1];

		if (sizes[i - 1] < curr * factor) {
			seg.start = i - 1;
			seg.bytes = bytes;
		}
	}

	return seg;
}

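/* Return the size of each table in the stack, with the fixed header overhead
 * subtracted. */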
static uint64_t *stack_table_sizes_for_compaction(struct reftable_stack *st)
{
	int version = (st->opts.hash_id == GIT_SHA1_FORMAT_ID) ? 1 : 2;
	int overhead = header_size(version) - 1;
	uint64_t *sizes;

	REFTABLE_CALLOC_ARRAY(sizes, st->merged->readers_len);

	for (size_t i = 0; i < st->merged->readers_len; i++)
		sizes[i] = st->readers[i]->size - overhead;

	return sizes;
}

int reftable_stack_auto_compact(struct reftable_stack *st)
{
	uint64_t *sizes = stack_table_sizes_for_compaction(st);
	struct segment seg =
		suggest_compaction_segment(sizes, st->merged->readers_len,
					   st->opts.auto_compaction_factor);
	reftable_free(sizes);
	if (segment_size(&seg) > 0)
		return stack_compact_range(st, seg.start, seg.end - 1,
					   NULL, STACK_COMPACT_RANGE_BEST_EFFORT);

	return 0;
}

struct reftable_compaction_stats *
reftable_stack_compaction_stats(struct reftable_stack *st)
{
	return &st->stats;
}

int reftable_stack_read_ref(struct reftable_stack *st, const char *refname,
			    struct reftable_ref_record *ref)
{
	struct reftable_iterator it = { 0 };
	int ret;

	reftable_merged_table_init_ref_iterator(st->merged, &it);
	ret = reftable_iterator_seek_ref(&it, refname);
	if (ret)
		goto out;

	ret = reftable_iterator_next_ref(&it, ref);
	if (ret)
		goto out;

	if (strcmp(ref->refname, refname) ||
	    reftable_ref_record_is_deletion(ref)) {
		reftable_ref_record_release(ref);
		ret = 1;
		goto out;
	}

out:
	reftable_iterator_destroy(&it);
	return ret;
}

int reftable_stack_read_log(struct reftable_stack *st, const char *refname,
			    struct reftable_log_record *log)
{
	struct reftable_iterator it = {0};
	int err;

	reftable_stack_init_log_iterator(st, &it);
	err = reftable_iterator_seek_log(&it, refname);
	if (err)
		goto done;

	err = reftable_iterator_next_log(&it, log);
	if (err)
		goto done;

	if (strcmp(log->refname, refname) ||
	    reftable_log_record_is_deletion(log)) {
		err = 1;
		goto done;
	}

done:
	if (err) {
		reftable_log_record_release(log);
	}
	reftable_iterator_destroy(&it);
	return err;
}

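/* Return whether `s` looks like a table file name, i.e. ends in ".ref". */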
static int is_table_name(const char *s)
{
	const char *dot = strrchr(s, '.');
	return dot && !strcmp(dot, ".ref");
}

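/* Unlink the named table if its maximum update index shows that it is already
 * covered by the stack, i.e. it is at most `max`. */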
static void remove_maybe_stale_table(struct reftable_stack *st, uint64_t max,
				     const char *name)
{
	int err = 0;
	uint64_t update_idx = 0;
	struct reftable_block_source src = { NULL };
	struct reftable_reader *rd = NULL;
	struct strbuf table_path = STRBUF_INIT;
	stack_filename(&table_path, st, name);

	err = reftable_block_source_from_file(&src, table_path.buf);
	if (err < 0)
		goto done;

	err = reftable_reader_new(&rd, &src, name);
	if (err < 0)
		goto done;

	update_idx = reftable_reader_max_update_index(rd);
	reftable_reader_decref(rd);

	if (update_idx <= max) {
		unlink(table_path.buf);
	}
done:
	strbuf_release(&table_path);
}

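/* Scan the stack's directory and remove stale tables that are no longer
 * referenced by "tables.list". The caller is expected to hold the stack
 * lock. */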
static int reftable_stack_clean_locked(struct reftable_stack *st)
{
	uint64_t max = reftable_merged_table_max_update_index(
		reftable_stack_merged_table(st));
	DIR *dir = opendir(st->reftable_dir);
	struct dirent *d = NULL;
	if (!dir) {
		return REFTABLE_IO_ERROR;
	}

	while ((d = readdir(dir))) {
		int i = 0;
		int found = 0;
		if (!is_table_name(d->d_name))
			continue;

		for (i = 0; !found && i < st->readers_len; i++) {
			found = !strcmp(reader_name(st->readers[i]), d->d_name);
		}
		if (found)
			continue;

		remove_maybe_stale_table(st, max, d->d_name);
	}

	closedir(dir);
	return 0;
}

int reftable_stack_clean(struct reftable_stack *st)
{
	struct reftable_addition *add = NULL;
	int err = reftable_stack_new_addition(&add, st, 0);
	if (err < 0) {
		goto done;
	}

	err = reftable_stack_reload(st);
	if (err < 0) {
		goto done;
	}

	err = reftable_stack_clean_locked(st);

done:
	reftable_addition_destroy(add);
	return err;
}