// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11
#define DID_FILL_FROM_SCAN	12
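/*
 * Positive return codes used by the topology repair code below to tell the
 * caller what to do with the node(s) just examined; they are deliberately
 * distinct from 0 and from negative error codes.
 */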
static const char * const bch2_gc_phase_strs[] = {
#define x(n)	#n,
	GC_PHASES()
#undef x
	NULL
};
void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
{
	prt_str(out, bch2_gc_phase_strs[p->phase]);
	bch2_btree_id_to_text(out, p->btree);
	prt_printf(out, " l=%u ", p->level);
	bch2_bpos_to_text(out, p->pos);
}
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
	return (struct bkey_s) {{{
		(struct bkey *) k.k,
		(struct bch_val *) k.v
	}}};
}
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
}
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
	__gc_pos_set(c, new_pos);
}
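/*
 * Rewrite a node's key as a btree_ptr_v2 key: v2 keys carry min_key and the
 * RANGE_UPDATED flag, which set_node_min()/set_node_max() below rely on when
 * repairing node boundaries.
 */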
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p		= src->k.p;
		dst->v.mem_ptr		= 0;
		dst->v.seq		= b->data->keys.seq;
		dst->v.sectors_written	= 0;
		dst->v.flags		= 0;
		dst->v.min_key		= b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}
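/*
 * Repair helper: give node @b a new min_key, both in the node itself and in
 * its key, and record the updated key in the journal so the change takes
 * effect on the next read of the parent node.
 */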
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_min);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key	= new_min;
	new->v.min_key		= new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	bkey_copy(&b->key, &new->k_i);
	return 0;
}
static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key	= new_max;
	new->k.p		= new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	__bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}
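/*
 * Check that child node @cur begins exactly where @prev ended (or at the
 * parent's min_key if there is no @prev). A gap is filled from nodes found by
 * btree node scan, or repaired by rewriting min/max keys; an overlap is
 * resolved by dropping or trimming whichever node is older (lower
 * BTREE_NODE_SEQ).
 */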
static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
				       struct btree *prev, struct btree *cur,
				       struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	if (bpos_eq(expected_start, cur->data->min_key))
		return 0;

	prt_printf(&buf, " at btree %s level %u:\n parent: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (prev) {
		prt_printf(&buf, "\n prev: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
	}

	prt_str(&buf, "\n next: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));

	if (bpos_lt(expected_start, cur->data->min_key)) {			/* gap */
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     expected_start,
						     bpos_predecessor(cur->data->min_key));
			if (ret)
				goto err;

			*pulled_from_scan = cur->data->min_key;
			ret = DID_FILL_FROM_SCAN;
		} else {
			if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
					     "btree node with incorrect min_key%s", buf.buf))
				ret = set_node_min(c, cur, expected_start);
		}
	} else {								/* overlap */
		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {	/* cur overwrites prev */
			if (bpos_ge(prev->data->min_key, cur->data->min_key)) {		/* fully? */
				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
						     "btree node overwritten by next node%s", buf.buf))
					ret = DROP_PREV_NODE;
			} else {
				if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
						     "btree node with incorrect max_key%s", buf.buf))
					ret = set_node_max(c, prev,
							   bpos_predecessor(cur->data->min_key));
			}
		} else {
			if (bpos_ge(expected_start, cur->data->max_key)) {		/* fully? */
				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
						     "btree node overwritten by prev node%s", buf.buf))
					ret = DROP_THIS_NODE;
			} else {
				if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
						     "btree node with incorrect min_key%s", buf.buf))
					ret = set_node_min(c, cur, expected_start);
			}
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
				 struct btree *child, struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (bpos_eq(child->key.k.p, b->key.k.p))
		return 0;

	prt_printf(&buf, "at btree %s level %u:\n parent: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\n child: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));

	if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
			     "btree node with incorrect max_key%s", buf.buf)) {
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     bpos_successor(child->key.k.p), b->key.k.p);
			if (ret)
				goto err;

			*pulled_from_scan = b->key.k.p;
			ret = DID_FILL_FROM_SCAN;
		} else {
			ret = set_node_max(c, child, b->key.k.p);
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
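/*
 * Walk the children of interior node @b: drop unreadable or stale children,
 * fix up boundary mismatches between siblings, then recurse into each child.
 * Returns DROP_THIS_NODE if @b ends up with no usable children.
 */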
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
					      struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, new_pass = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
again:
	cur = prev = NULL;
	have_child = new_pass = false;
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
				trans, btree_node_unreadable,
				"Topology repair: unreadable btree node at btree %s level %u:\n"
				"  %s",
				bch2_btree_id_str(b->c.btree_id),
				b->c.level - 1,
				buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			cur = NULL;
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			if (ret)
				break;

			if (!btree_id_is_alloc(b->c.btree_id)) {
				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
				if (ret)
					break;
			}
			continue;
		}

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			break;

		if (bch2_btree_node_is_stale(c, cur)) {
			bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch_info(c, "dropped prev node");
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);

	if (new_pass)
		goto again;

	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			goto err;

		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			new_pass = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child,
			trans, btree_node_topology_interior_node_empty,
			"empty interior btree node at btree %s level %u\n"
			"  %s",
			bch2_btree_id_str(b->c.btree_id),
			b->c.level, buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);

	if (!ret && new_pass)
		goto again;

	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));

	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);
	printbuf_exit(&buf);
	return ret;
}
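/*
 * Topology repair pass entry point: for each btree, reconstruct an unreadable
 * root from scanned nodes if necessary, then repair the subtree beneath it.
 */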
int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bpos pulled_from_scan = POS_MIN;
	int ret = 0;

	bch2_trans_srcu_unlock(trans);

	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);
		bool reconstructed_root = false;

		if (r->error) {
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
			if (ret)
				break;
reconstruct_root:
			bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));

			r->alive = false;
			r->error = 0;

			if (!bch2_btree_has_scanned_nodes(c, i)) {
				mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
						 "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
				bch2_btree_root_alloc_fake_trans(trans, i, 0);
			} else {
				bch2_btree_root_alloc_fake_trans(trans, i, 1);
				bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
				ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
				if (ret)
					break;
			}

			reconstructed_root = true;
		}

		struct btree *b = r->b;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			mutex_lock(&c->btree_cache.lock);
			bch2_btree_node_hash_remove(&c->btree_cache, b);
			mutex_unlock(&c->btree_cache.lock);

			r->b = NULL;

			if (!reconstructed_root)
				goto reconstruct_root;

			bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
			bch2_btree_root_alloc_fake_trans(trans, i, 0);
			r->alive = false;
			ret = 0;
		}
	}
fsck_err:
	bch2_trans_put(trans);
	return ret;
}
/* marking of btree keys/nodes: */
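/*
 * GC-mark a single key: sanity check key version numbers and the btree
 * allocated bitmap in the member info section, then run the GC triggers to
 * recompute allocation information for everything the key references.
 */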
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, struct btree **prev,
			    struct btree_iter *iter, struct bkey_s_c k,
			    bool initial)
{
	struct bch_fs *c = trans->c;

	if (iter) {
		struct btree_path *path = btree_iter_path(trans, iter);
		struct btree *b = path_l(path)->b;

		if (*prev != b) {
			int ret = bch2_btree_node_check_topology(trans, b);
			if (ret)
				return ret;

			*prev = b;
		}
	}

	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	deleted.p = k.k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k.k->bversion.lo > atomic64_read(&c->journal.seq));

		if (fsck_err_on(btree_id != BTREE_ID_accounting &&
				k.k->bversion.lo > atomic64_read(&c->key_version),
				trans, bkey_version_in_future,
				"key version number higher than recorded %llu\n %s",
				atomic64_read(&c->key_version),
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			atomic64_set(&c->key_version, k.k->bversion.lo);
	}

	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
				trans, btree_bitmap_not_marked,
				"btree ptr not marked in member info btree allocated bitmap\n %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
		mutex_lock(&c->sb_lock);
		bch2_dev_btree_bitmap_mark(c, k);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	/*
	 * We require a commit before key_trigger() because
	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
	 * wrong result if we run it multiple times.
	 */
	unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_check_repair|flags);
	if (ret)
		goto out;

	if (trans->nr_updates) {
		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
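/*
 * Walk one btree level by level, bottom up, GC-marking every key, finishing
 * with the btree root's own key.
 */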
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
	struct bch_fs *c = trans->c;
	unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
	int ret = 0;

	/* We need to make sure every leaf node is readable before going RW */
	if (initial)
		target_depth = 0;

	for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
		struct btree *prev = NULL;
		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
					  BTREE_ITER_prefetch);

		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
		}));
		if (ret)
			goto err;
	}

	/* root */
	do {
retry_root:
		bch2_trans_begin(trans);

		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
					  0, bch2_btree_id_root(c, btree)->b->c.level, 0);
		struct btree *b = bch2_btree_iter_peek_node(&iter);
		ret = PTR_ERR_OR_ZERO(b);
		if (ret)
			goto err_root;

		if (b != btree_node_root(c, b)) {
			bch2_trans_iter_exit(trans, &iter);
			goto retry_root;
		}

		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
		ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
err_root:
		bch2_trans_iter_exit(trans, &iter);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
err:
	bch_err_fn(c, ret);
	return ret;
}
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return cmp_int(gc_btree_order(l), gc_btree_order(r));
}
static int bch2_gc_btrees(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;

		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
			continue;

		ret = bch2_gc_btree(trans, btree, true);

		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
					trans, btree_node_read_error,
					"btree node read error for %s",
					bch2_btree_id_str(btree)))
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
	}
fsck_err:
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}
static int bch2_mark_superblocks(struct bch_fs *c)
{
	gc_pos_set(c, gc_phase(GC_PHASE_sb));

	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
}
static void bch2_gc_free(struct bch_fs *c)
{
	bch2_accounting_gc_free(c);

	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(c, ca)
		genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		int ret = bch2_dev_usage_init(ca, true);
		if (ret) {
			bch2_dev_put(ca);
			return ret;
		}
	}

	return 0;
}
/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return  l.gen != r.gen				||
		l.oldest_gen != r.oldest_gen		||
		l.data_type != r.data_type		||
		l.dirty_sectors != r.dirty_sectors	||
		l.stripe_sectors != r.stripe_sectors	||
		l.cached_sectors != r.cached_sectors	||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}
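/*
 * Compare the alloc key for one bucket against the state GC just recomputed;
 * mismatches are reported via fsck_err and the alloc key is rewritten with
 * the GC values.
 */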
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bch_dev *ca,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_gc, gc, old_convert, new;
	const struct bch_alloc_v4 *old;
	int ret;

	if (!bucket_valid(ca, k.k->p.offset))
		return 0;

	old = bch2_alloc_to_v4(k, &old_convert);
	gc = new = *old;

	percpu_down_read(&c->mark_lock);
	__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));

	old_gc = gc;

	if ((old->data_type == BCH_DATA_sb ||
	     old->data_type == BCH_DATA_journal) &&
	    !bch2_dev_is_online(ca)) {
		gc.data_type = old->data_type;
		gc.dirty_sectors = old->dirty_sectors;
	}
	percpu_up_read(&c->mark_lock);

	/*
	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	alloc_data_type_set(&gc, gc.data_type);
	if (gc.data_type != old_gc.data_type ||
	    gc.dirty_sectors != old_gc.dirty_sectors) {
		ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
		if (ret)
			return ret;

		/*
		 * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
		 * safe w.r.t. transaction restarts, so fixup the gc_bucket so
		 * we don't run it twice:
		 */
		percpu_down_read(&c->mark_lock);
		struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
		gc_m->data_type = gc.data_type;
		gc_m->dirty_sectors = gc.dirty_sectors;
		percpu_up_read(&c->mark_lock);
	}

	if (fsck_err_on(new.data_type != gc.data_type,
			trans, alloc_key_data_type_wrong,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_type_str(new.data_type),
			bch2_data_type_str(gc.data_type)))
		new.data_type = gc.data_type;

#define copy_bucket_field(_errtype, _f)					\
	if (fsck_err_on(new._f != gc._f,				\
			trans, _errtype,				\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %llu, should be %llu",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_type_str(gc.data_type),		\
			(u64) new._f, (u64) gc._f))			\
		new._f = gc._f;						\

	copy_bucket_field(alloc_key_gen_wrong,			gen);
	copy_bucket_field(alloc_key_dirty_sectors_wrong,	dirty_sectors);
	copy_bucket_field(alloc_key_stripe_sectors_wrong,	stripe_sectors);
	copy_bucket_field(alloc_key_cached_sectors_wrong,	cached_sectors);
	copy_bucket_field(alloc_key_stripe_wrong,		stripe);
	copy_bucket_field(alloc_key_stripe_redundancy_wrong,	stripe_redundancy);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure these are set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
	return ret;
}
static int bch2_gc_alloc_done(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = bch2_trans_run(c,
			for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
					POS(ca->dev_idx, ca->mi.first_bucket),
					POS(ca->dev_idx, ca->mi.nbuckets - 1),
					BTREE_ITER_slots|BTREE_ITER_prefetch, k,
					NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
				bch2_alloc_write_key(trans, &iter, ca, k)));
		if (ret) {
			bch2_dev_put(ca);
			break;
		}
	}

	return ret;
}
static int bch2_gc_alloc_start(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
		if (ret) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}
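/*
 * Compare one reflink key's refcount against the count GC accumulated in
 * reflink_gc_table, fixing the key (or deleting it if the count is zero).
 */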
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k,
				     size_t *idx)
{
	struct bch_fs *c = trans->c;
	const __le64 *refcount = bkey_refcount_c(k);
	struct printbuf buf = PRINTBUF;
	struct reflink_gc *r;
	int ret = 0;

	if (!refcount)
		return 0;

	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
	       r->offset < k.k->p.offset)
		++*idx;

	if (!r ||
	    r->offset != k.k->p.offset ||
	    r->size != k.k->size) {
		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
		return -EINVAL;
	}

	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
			trans, reflink_v_refcount_wrong,
			"reflink key has wrong refcount:\n"
			"  %s\n"
			"  should be %u",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
			r->refcount)) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto out;

		if (!r->refcount)
			new->k.type = KEY_TYPE_deleted;
		else
			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
		ret = bch2_trans_update(trans, iter, new, 0);
	}
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
static int bch2_gc_reflink_done(struct bch_fs *c)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_reflink, POS_MIN,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
	c->reflink_gc_nr = 0;
	return ret;
}
static int bch2_gc_reflink_start(struct bch_fs *c)
{
	c->reflink_gc_nr = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			const __le64 *refcount = bkey_refcount_c(k);

			if (!refcount)
				continue;

			struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
							c->reflink_gc_nr++, GFP_KERNEL);
			if (!r) {
				ret = -BCH_ERR_ENOMEM_gc_reflink_start;
				break;
			}

			r->offset	= k.k->p.offset;
			r->size		= k.k->size;
			r->refcount	= 0;
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad,
			trans, stripe_sector_count_wrong,
			"%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
static int bch2_gc_stripes_done(struct bch_fs *c)
{
	return bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_stripes, POS_MIN,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_stripes_key(trans, &iter, k)));
}
/**
 * bch2_check_allocations - walk all references to buckets, and recompute them:
 *
 * @c:		filesystem object
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_check_allocations(struct bch_fs *c)
{
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret   = bch2_gc_accounting_start(c) ?:
		bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c) ?:
		bch2_gc_reflink_start(c);
	if (ret)
		goto out;

	gc_pos_set(c, gc_phase(GC_PHASE_start));

	ret = bch2_mark_superblocks(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto out;

	ret = bch2_gc_btrees(c);
	if (ret)
		goto out;

	ret   = bch2_gc_alloc_done(c) ?:
		bch2_gc_accounting_done(c) ?:
		bch2_gc_stripes_done(c) ?:
		bch2_gc_reflink_done(c);
out:
	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_not_running));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);

	return ret;
}
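/*
 * gc_gens: for each pointer, either rewrite the key if one of its buckets is
 * badly stale, or record the pointer's generation in ca->oldest_gen so that
 * bucket generation numbers can be safely reused.
 */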
static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bkey_i *u;
	int ret;

	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
		return -EROFS;

	percpu_down_read(&c->mark_lock);
	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		if (dev_ptr_stale(ca, ptr) > 16) {
			rcu_read_unlock();
			percpu_up_read(&c->mark_lock);
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	rcu_read_unlock();
	percpu_up_read(&c->mark_lock);
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}
static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
				       struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
	alloc_data_type_set(&a_mut->v, a_mut->v.data_type);

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
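/*
 * Walk every btree that contains pointers, collecting the oldest generation
 * still referenced for each bucket, then write the results back to the alloc
 * btree via bch2_alloc_write_oldest_gen().
 */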
int bch2_gc_gens(struct bch_fs *c)
{
	u64 b, start_time = local_clock();
	int ret;

	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);

	/*
	 * We have to use trylock here. Otherwise, we would
	 * introduce a deadlock in the RO path - we take the
	 * state lock at the start of going RO.
	 */
	if (!down_read_trylock(&c->state_lock)) {
		mutex_unlock(&c->gc_gens_lock);
		return 0;
	}

	for_each_member_device(c, ca) {
		struct bucket_gens *gens = bucket_gens(ca);

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = bch2_trans_run(c,
				for_each_btree_key_commit(trans, iter, i,
						POS_MIN,
						BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
						k,
						NULL, NULL,
						BCH_TRANS_COMMIT_no_enospc,
					gc_btree_gens_key(trans, &iter, k)));
			if (ret)
				goto err;
		}

	struct bch_dev *ca = NULL;
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN,
				BTREE_ITER_prefetch,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}
			bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
		})));
	bch2_dev_put(ca);
	if (ret)
		goto err;

	c->gc_gens_btree	= 0;
	c->gc_gens_pos		= POS_MIN;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(c, ca) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	up_read(&c->state_lock);
	mutex_unlock(&c->gc_gens_lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}
static void bch2_gc_gens_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
	bch2_gc_gens(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
void bch2_gc_gens_async(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
	    !queue_work(c->write_ref_wq, &c->gc_gens_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
void bch2_fs_gc_init(struct bch_fs *c)
{
	seqcount_init(&c->gc_pos_lock);

	INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
}
);