// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "error.h"
#include "journal_io.h"
#include "recovery_passes.h"

#include <linux/kthread.h>
#include <linux/sort.h>

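/*
 * Scan devices for btree nodes to repair from when the interior btree is
 * damaged: probe every bucket-aligned btree-node-sized chunk, keep the nodes
 * that validate, then merge replicas and resolve overwritten ranges so the
 * survivors can be handed back to the recovery passes below.
 */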
struct find_btree_nodes_worker {
	struct closure		*cl;
	struct find_btree_nodes	*f;
	struct bch_dev		*ca;
};

static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
	prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ",
		   bch2_btree_id_str(n->btree_id), n->level, n->seq,
		   n->journal_seq, n->cookie);
	bch2_bpos_to_text(out, n->min_key);
	prt_str(out, "-");
	bch2_bpos_to_text(out, n->max_key);

	if (n->range_updated)
		prt_str(out, " range updated");
	if (n->overwritten)
		prt_str(out, " overwritten");

	for (unsigned i = 0; i < n->nr_ptrs; i++) {
		prt_char(out, ' ');
		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
	}
}

static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);
	darray_for_each(nodes, i) {
		found_btree_node_to_text(out, c, i);
		prt_newline(out);
	}
	printbuf_indent_sub(out, 2);
}

static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
{
	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);

	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
	bp->k.p			= f->max_key;
	bp->v.seq		= cpu_to_le64(f->cookie);
	bp->v.sectors_written	= cpu_to_le16(f->sectors_written);
	bp->v.flags		= 0;
	bp->v.min_key		= f->min_key;
	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}

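/*
 * inode_v3 keys carry the journal sequence number they were written at; this
 * lets us date a node's contents below even when two copies of a node share
 * the same node-level sequence number.
 */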
static inline u64 bkey_journal_seq(struct bkey_s_c k)
{
	switch (k.k->type) {
	case KEY_TYPE_inode_v3:
		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
	default:
		return 0;
	}
}

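/*
 * Check that a scanned node can actually be read back: build a btree_ptr_v2
 * key for it and attempt a real read. While the node is in cache, record how
 * many sectors were written and the newest journal_seq it contains, so
 * found_btree_node_cmp_time() can prefer the most recent version later.
 */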
static bool found_btree_node_is_readable(struct btree_trans *trans,
					 struct found_btree_node *f)
{
	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;

	found_btree_node_to_key(&tmp.k, f);

	struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
	bool ret = !IS_ERR_OR_NULL(b);
	if (!ret)
		return ret;

	f->sectors_written = b->written;
	f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);

	struct bkey_s_c k;
	struct bkey unpacked;
	struct btree_node_iter iter;
	for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
		f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));

	six_unlock_read(&b->c.lock);

	/*
	 * We might update this node's range; if that happens, we need the node
	 * to be re-read so the read path can trim keys that are no longer in
	 * this node:
	 */
	if (b != btree_node_root(trans->c, b))
		bch2_btree_node_evict(trans, &tmp.k);

	return ret;
}

static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return  cmp_int(l->btree_id,	r->btree_id) ?:
		cmp_int(l->level,	r->level) ?:
		cmp_int(l->cookie,	r->cookie);
}

/*
 * Given two found btree nodes, if their sequence numbers are equal, take the
 * one that's readable:
 */
static int found_btree_node_cmp_time(const struct found_btree_node *l,
				     const struct found_btree_node *r)
{
	return  cmp_int(l->seq, r->seq) ?:
		cmp_int(l->journal_seq, r->journal_seq);
}

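/*
 * Sort order for the main node list: btree id, then level (descending, so
 * parents sort before children), then min_key; ties at the same position are
 * broken by taking the newest node first.
 */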
static int found_btree_node_cmp_pos(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return  cmp_int(l->btree_id,	r->btree_id) ?:
	       -cmp_int(l->level,	r->level) ?:
		bpos_cmp(l->min_key,	r->min_key) ?:
	       -found_btree_node_cmp_time(l, r);
}

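/*
 * Read a candidate btree node at @offset and, if it passes basic sanity
 * checks and proves readable, append it to the list of found nodes. Alloc
 * btree nodes are skipped: alloc info is reconstructed by fsck rather than
 * recovered this way.
 */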
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
				struct bio *bio, struct btree_node *bn, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);

	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector	= offset;
	bch2_bio_map(bio, bn, PAGE_SIZE);

	submit_bio_wait(bio);
	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
			       "IO error in try_read_btree_node() at %llu: %s",
			       offset, bch2_blk_status_to_str(bio->bi_status)))
		return;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
		struct nonce nonce = btree_nonce(&bn->keys, 0);
		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
	}

	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
		return;

	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
		return;

	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
		return;

	rcu_read_lock();
	struct found_btree_node n = {
		.btree_id	= BTREE_NODE_ID(bn),
		.level		= BTREE_NODE_LEVEL(bn),
		.seq		= BTREE_NODE_SEQ(bn),
		.cookie		= le64_to_cpu(bn->keys.seq),
		.min_key	= bn->min_key,
		.max_key	= bn->max_key,
		.nr_ptrs	= 1,
		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset	= offset,
		.ptrs[0].dev	= ca->dev_idx,
		.ptrs[0].gen	= bucket_gen_get(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();

	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
		mutex_lock(&f->lock);
		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
			bch_err(c, "try_read_btree_node() can't handle endian conversion");
			f->ret = -EINVAL;
			goto unlock;
		}

		if (darray_push(&f->nodes, n))
			f->ret = -ENOMEM;
unlock:
		mutex_unlock(&f->lock);
	}
}

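/*
 * Worker thread, one per device: walk every bucket allowed to hold btree
 * data, probing each btree-node-sized chunk. On filesystems new enough to
 * have the member btree bitmap, chunks not marked as potentially containing
 * btree nodes are skipped, which avoids reading most of the device.
 */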
static int read_btree_nodes_worker(void *p)
{
	struct find_btree_nodes_worker *w = p;
	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
	struct bch_dev *ca = w->ca;
	void *buf = (void *) __get_free_page(GFP_KERNEL);
	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
	unsigned long last_print = jiffies;

	if (!buf || !bio) {
		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
		w->f->ret = -ENOMEM;
		goto err;
	}

	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
		for (unsigned bucket_offset = 0;
		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
		     bucket_offset += btree_sectors(c)) {
			if (time_after(jiffies, last_print + HZ * 30)) {
				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;

				bch_info(ca, "%s: %2u%% done", __func__,
					 (unsigned) div64_u64(cur_sector * 100, end_sector));
				last_print = jiffies;
			}

			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;

			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
				continue;

			try_read_btree_node(w->f, ca, bio, buf, sector);
		}
err:
	bio_put(bio);
	free_page((unsigned long) buf);
	percpu_ref_put(&ca->io_ref);
	closure_put(w->cl);
	kfree(w);
	return 0;
}

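/*
 * Kick off one scanning kthread per online device that may hold btree data,
 * then wait for all of them on a stack closure. Per-worker errors are
 * reported through f->ret.
 */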
static int read_btree_nodes(struct find_btree_nodes *f)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
	struct closure cl;
	int ret = 0;

	closure_init_stack(&cl);

	for_each_online_member(c, ca) {
		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
			continue;

		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
		struct task_struct *t;

		if (!w) {
			percpu_ref_put(&ca->io_ref);
			ret = -ENOMEM;
			goto err;
		}

		percpu_ref_get(&ca->io_ref);
		closure_get(&cl);
		w->cl	= &cl;
		w->f	= f;
		w->ca	= ca;

		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
		ret = PTR_ERR_OR_ZERO(t);
		if (ret) {
			percpu_ref_put(&ca->io_ref);
			closure_put(&cl);
			kfree(w);
			f->ret = ret;
			bch_err(c, "error starting kthread: %i", ret);
			break;
		}
	}
err:
	closure_sync(&cl);
	return f->ret ?: ret;
}

static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
{
	while (n + 1 < end &&
	       found_btree_node_cmp_pos(n, n + 1) > 0) {
		swap(*n, *(n + 1));
		n++;
	}
}

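/*
 * Resolve overlaps between @start and the nodes after it that cover the same
 * keyspace: the older node is either marked overwritten outright or has its
 * range trimmed to the remainder. Shrinking a node's min_key can break the
 * sort order, so such nodes are bubbled back into place and the scan
 * restarts.
 */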
static int handle_overwrites(struct bch_fs *c,
			     struct found_btree_node *start,
			     struct found_btree_node *end)
{
	struct found_btree_node *n;
again:
	for (n = start + 1;
	     n < end &&
	     n->btree_id	== start->btree_id &&
	     n->level		== start->level &&
	     bpos_lt(n->min_key, start->max_key);
	     n++)  {
		int cmp = found_btree_node_cmp_time(start, n);

		if (cmp > 0) {
			if (bpos_cmp(start->max_key, n->max_key) >= 0)
				n->overwritten = true;
			else {
				n->min_key = bpos_successor(start->max_key);
				n->range_updated = true;
				bubble_up(n, end);
				goto again;
			}
		} else if (cmp < 0) {
			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);

			start->max_key = bpos_predecessor(n->min_key);
			start->range_updated = true;
		} else if (n->level) {
			n->overwritten = true;
		} else {
			if (bpos_cmp(start->max_key, n->max_key) >= 0)
				n->overwritten = true;
			else {
				n->min_key = bpos_successor(start->max_key);
				n->range_updated = true;
				bubble_up(n, end);
				goto again;
			}
		}
	}

	return 0;
}

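/*
 * Top-level scan pass: read nodes from every device, merge entries that are
 * replicas of the same node (same cookie) into one entry with multiple
 * pointers, drop nodes that were overwritten by newer ones, then sort the
 * survivors into an eytzinger layout for fast range lookups.
 */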
int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;
	struct printbuf buf = PRINTBUF;
	size_t dst;
	int ret = 0;

	if (f->nodes.nr)
		return 0;

	mutex_init(&f->lock);

	ret = read_btree_nodes(f);
	if (ret)
		return ret;

	if (!f->nodes.nr) {
		bch_err(c, "%s: no btree nodes found", __func__);
		ret = -EINVAL;
		goto err;
	}

	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);

	dst = 0;
	darray_for_each(f->nodes, i) {
		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;

		if (prev &&
		    prev->cookie == i->cookie) {
			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
				bch_err(c, "%s: found too many replicas for btree node", __func__);
				ret = -EINVAL;
				goto err;
			}
			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
		} else {
			f->nodes.data[dst++] = *i;
		}
	}
	f->nodes.nr = dst;

	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);

	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	dst = 0;
	darray_for_each(f->nodes, i) {
		if (i->overwritten)
			continue;

		ret = handle_overwrites(c, i, &darray_top(f->nodes));
		if (ret)
			goto err;

		BUG_ON(i->overwritten);
		f->nodes.data[dst++] = *i;
	}
	f->nodes.nr = dst;

	if (c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
	printbuf_exit(&buf);
	return ret;
}

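/*
 * Compare a found node against a search range: with eytzinger0_find_gt(),
 * this locates the first node (in btree id / level / position order) whose
 * max_key is past the search range's min_key.
 */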
static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return  cmp_int(l->btree_id,	r->btree_id) ?:
	       -cmp_int(l->level,	r->level) ?:
		bpos_cmp(l->max_key,	r->min_key);
}

#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,	\
					      sizeof((_f)->nodes.data[0]),		\
					      found_btree_node_range_start_cmp,		\
					      &(_search));				\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&			\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))

bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;

	struct found_btree_node search = {
		.btree_id	= b->c.btree_id,
		.level		= b->c.level,
		.min_key	= b->data->min_key,
		.max_key	= b->key.k.p,
	};

	for_each_found_btree_node_in_range(f, search, idx)
		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
			return true;
	return false;
}

bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
	struct found_btree_node search = {
		.btree_id	= btree,
		.level		= 0,
		.min_key	= POS_MIN,
		.max_key	= SPOS_MAX,
	};

	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
		return true;
	return false;
}

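/*
 * Hand scanned nodes for [node_min, node_max] at the given level back to
 * recovery: each matching node is clamped to the range, converted to a
 * btree_ptr_v2 key and inserted into the journal keys at level + 1, where
 * the btree read path will pick it up as if it had a parent pointer.
 */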
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
			   unsigned level, struct bpos node_min, struct bpos node_max)
{
	if (btree_id_is_alloc(btree))
		return 0;

	struct find_btree_nodes *f = &c->found_btree_nodes;

	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
	if (ret)
		return ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
		bch2_bpos_to_text(&buf, node_min);
		prt_str(&buf, " - ");
		bch2_bpos_to_text(&buf, node_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	struct found_btree_node search = {
		.btree_id	= btree,
		.level		= level,
		.min_key	= node_min,
		.max_key	= node_max,
	};

	for_each_found_btree_node_in_range(f, search, idx) {
		struct found_btree_node n = f->nodes.data[idx];

		n.range_updated |= bpos_lt(n.min_key, node_min);
		n.min_key = bpos_max(n.min_key, node_min);

		n.range_updated |= bpos_gt(n.max_key, node_max);
		n.max_key = bpos_min(n.max_key, node_max);

		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;

		found_btree_node_to_key(&tmp.k, &n);

		if (c->opts.verbose) {
			struct printbuf buf = PRINTBUF;
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
			bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
			printbuf_exit(&buf);
		}

		BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0));

		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
		if (ret)
			return ret;
	}

	return 0;
}

void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
{
	darray_exit(&f->nodes);
}