// SPDX-License-Identifier: GPL-2.0

#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"

#include <linux/sched/mm.h>
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
	prt_printf(out, "btree=%s l=%u seq %llux\n",
		   bch2_btree_id_str(BTREE_NODE_ID(bn)),
		   (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
	prt_str(out, "min: ");
	bch2_bpos_to_text(out, bn->min_key);
	prt_str(out, "max: ");
	bch2_bpos_to_text(out, bn->max_key);
}
void bch2_btree_node_io_unlock(struct btree *b)
{
	EBUG_ON(!btree_node_write_in_flight(b));

	clear_btree_node_write_in_flight_inner(b);
	clear_btree_node_write_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

void bch2_btree_node_io_lock(struct btree *b)
{
	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
			    TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}
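/*
 * Note on the helpers above: they all park on bits in b->flags
 * (BTREE_NODE_read_in_flight / BTREE_NODE_write_in_flight) and pair with the
 * wake_up_bit() calls in the read and write completion paths further down in
 * this file.
 */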
static void verify_no_dups(struct btree *b,
			   struct bkey_packed *start,
			   struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bkey_packed *k, *p;

	for (p = start, k = bkey_p_next(start);
	     k != end;
	     p = k, k = bkey_p_next(k)) {
		struct bkey l = bkey_unpack_key(b, p);
		struct bkey r = bkey_unpack_key(b, k);

		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
	}
#endif
}
static void set_needs_whiteout(struct bset *i, int v)
{
	struct bkey_packed *k;

	for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
		k->needs_whiteout = v;
}
static void btree_bounce_free(struct bch_fs *c, size_t size,
			      bool used_mempool, void *p)
{
	if (used_mempool)
		mempool_free(p, &c->btree_bounce_pool);
	else
		kvfree(p);
}

static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
				bool *used_mempool)
{
	unsigned flags = memalloc_nofs_save();
	void *p;

	BUG_ON(size > c->opts.btree_node_size);

	*used_mempool = false;
	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
	if (!p) {
		*used_mempool = true;
		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
	}
	memalloc_nofs_restore(flags);
	return p;
}
static void sort_bkey_ptrs(const struct btree *bt,
			   struct bkey_packed **ptrs, unsigned nr)
{
	unsigned n = nr, a = nr / 2, b, c, d;

	if (!a)
		return;

	/* Heap sort: see lib/sort.c: */
	while (1) {
		if (a)
			a--;
		else if (--n)
			swap(ptrs[0], ptrs[n]);
		else
			break;

		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
			b = bch2_bkey_cmp_packed(bt,
						 ptrs[c],
						 ptrs[d]) >= 0 ? c : d;
		if (d == n)
			b = c;

		while (b != a &&
		       bch2_bkey_cmp_packed(bt,
					    ptrs[a],
					    ptrs[b]) >= 0)
			b = (b - 1) / 2;
		c = b;
		while (b != a) {
			b = (b - 1) / 2;
			swap(ptrs[b], ptrs[c]);
		}
	}
}
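/*
 * Design note: sort_bkey_ptrs() above sorts an array of pointers to packed
 * keys rather than the variable-size keys themselves; bch2_sort_whiteouts()
 * below builds that pointer array over the unwritten whiteouts, sorts it, and
 * then copies the whiteouts back out in order.
 */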
static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
{
	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
	bool used_mempool = false;
	size_t bytes = b->whiteout_u64s * sizeof(u64);

	if (!b->whiteout_u64s)
		return;

	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);

	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);

	for (k = unwritten_whiteouts_start(b);
	     k != unwritten_whiteouts_end(b);
	     k = bkey_p_next(k))
		*--ptrs = k;

	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);

	k = new_whiteouts;

	while (ptrs != ptrs_end) {
		bkey_p_copy(k, *ptrs);
		k = bkey_p_next(k);
		ptrs++;
	}

	verify_no_dups(b, new_whiteouts,
		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));

	memcpy_u64s(unwritten_whiteouts_start(b),
		    new_whiteouts, b->whiteout_u64s);

	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
}
static bool should_compact_bset(struct btree *b, struct bset_tree *t,
				bool compacting, enum compact_mode mode)
{
	if (!bset_dead_u64s(b, t))
		return false;

	switch (mode) {
	case COMPACT_LAZY:
		return should_compact_bset_lazy(b, t) ||
			(compacting && !bset_written(b, bset(b, t)));
	case COMPACT_ALL:
		return true;
	default:
		BUG();
	}
}
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
	bool ret = false;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k, *n, *out, *start, *end;
		struct btree_node_entry *src = NULL, *dst = NULL;

		if (t != b->set && !bset_written(b, i)) {
			src = container_of(i, struct btree_node_entry, keys);
			dst = max(write_block(b),
				  (void *) btree_bkey_last(b, t - 1));
		}

		if (src != dst)
			ret = true;

		if (!should_compact_bset(b, t, ret, mode)) {
			if (src != dst) {
				memmove(dst, src, sizeof(*src) +
					le16_to_cpu(src->keys.u64s) *
					sizeof(u64));
				i = &dst->keys;
				set_btree_bset(b, t, i);
			}
			continue;
		}

		start	= btree_bkey_first(b, t);
		end	= btree_bkey_last(b, t);

		if (src != dst) {
			memmove(dst, src, sizeof(*src));
			i = &dst->keys;
			set_btree_bset(b, t, i);
		}

		out = i->start;

		for (k = start; k != end; k = n) {
			n = bkey_p_next(k);

			if (!bkey_deleted(k)) {
				bkey_p_copy(out, k);
				out = bkey_p_next(out);
			} else {
				BUG_ON(k->needs_whiteout);
			}
		}

		i->u64s = cpu_to_le16((u64 *) out - i->_data);
		set_btree_bset_end(b, t);
		bch2_bset_set_no_aux_tree(b, t);
		ret = true;
	}

	bch2_verify_btree_nr_keys(b);

	bch2_btree_build_aux_trees(b);

	return ret;
}
bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
			    enum compact_mode mode)
{
	return bch2_drop_whiteouts(b, mode);
}
static void btree_node_sort(struct bch_fs *c, struct btree *b,
			    unsigned start_idx,
			    unsigned end_idx)
{
	struct btree_node *out;
	struct sort_iter_stack sort_iter;
	struct bset_tree *t;
	struct bset *start_bset = bset(b, &b->set[start_idx]);
	bool used_mempool = false;
	u64 start_time, seq = 0;
	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
	bool sorting_entire_node = start_idx == 0 &&
		end_idx == b->nsets;

	sort_iter_stack_init(&sort_iter, b);

	for (t = b->set + start_idx;
	     t < b->set + end_idx;
	     t++) {
		u64s += le16_to_cpu(bset(b, t)->u64s);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
	}

	bytes = sorting_entire_node
		? btree_buf_bytes(b)
		: __vstruct_bytes(struct btree_node, u64s);

	out = btree_bounce_alloc(c, bytes, &used_mempool);

	start_time = local_clock();

	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);

	out->keys.u64s = cpu_to_le16(u64s);

	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);

	if (sorting_entire_node)
		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
				       start_time);

	/* Make sure we preserve bset journal_seq: */
	for (t = b->set + start_idx; t < b->set + end_idx; t++)
		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
	start_bset->journal_seq = cpu_to_le64(seq);

	if (sorting_entire_node) {
		u64s = le16_to_cpu(out->keys.u64s);

		BUG_ON(bytes != btree_buf_bytes(b));

		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */
		*out = *b->data;
		out->keys.u64s = cpu_to_le16(u64s);
		swap(out, b->data);
		set_btree_bset(b, b->set, &b->data->keys);
	} else {
		start_bset->u64s = out->keys.u64s;
		memcpy_u64s(start_bset->start,
			    out->keys.start,
			    le16_to_cpu(out->keys.u64s));
	}

	for (i = start_idx + 1; i < end_idx; i++)
		b->nr.bset_u64s[start_idx] +=
			b->nr.bset_u64s[i];

	b->nsets -= shift;

	for (i = start_idx + 1; i < b->nsets; i++) {
		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
		b->set[i]		= b->set[i + shift];
	}

	for (i = b->nsets; i < MAX_BSETS; i++)
		b->nr.bset_u64s[i] = 0;

	set_btree_bset_end(b, &b->set[start_idx]);
	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);

	btree_bounce_free(c, bytes, used_mempool, out);

	bch2_verify_btree_nr_keys(b);
}
void bch2_btree_sort_into(struct bch_fs *c,
			  struct btree *dst,
			  struct btree *src)
{
	struct btree_nr_keys nr;
	struct btree_node_iter src_iter;
	u64 start_time = local_clock();

	BUG_ON(dst->nsets != 1);

	bch2_bset_set_no_aux_tree(dst, dst->set);

	bch2_btree_node_iter_init_from_start(&src_iter, src);

	nr = bch2_sort_repack(btree_bset_first(dst),
			      src, &src_iter,
			      &dst->format,
			      true);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
			       start_time);

	set_btree_bset_end(dst, dst->set);

	dst->nr.live_u64s	+= nr.live_u64s;
	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
	dst->nr.packed_keys	+= nr.packed_keys;
	dst->nr.unpacked_keys	+= nr.unpacked_keys;

	bch2_verify_btree_nr_keys(dst);
}
/*
 * We're about to add another bset to the btree node, so if there's currently
 * too many bsets - sort some of them together:
 */
static bool btree_node_compact(struct bch_fs *c, struct btree *b)
{
	unsigned unwritten_idx;
	bool ret = false;

	for (unwritten_idx = 0;
	     unwritten_idx < b->nsets;
	     unwritten_idx++)
		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
			break;

	if (b->nsets - unwritten_idx > 1) {
		btree_node_sort(c, b, unwritten_idx, b->nsets);
		ret = true;
	}

	if (unwritten_idx > 1) {
		btree_node_sort(c, b, 0, unwritten_idx);
		ret = true;
	}

	return ret;
}
void bch2_btree_build_aux_trees(struct btree *b)
{
	for_each_bset(b, t)
		bch2_bset_build_aux_tree(b, t,
				!bset_written(b, bset(b, t)) &&
				t == bset_tree_last(b));
}
/*
 * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
 *
 * The first bset is going to be of similar order to the size of the node, the
 * last bset is bounded by btree_write_set_buffer(), which is set to keep the
 * memmove on insert from being too expensive: the middle bset should, ideally,
 * be the geometric mean of the first and the last.
 *
 * Returns true if the middle bset is greater than that geometric mean:
 */
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
{
	unsigned mid_u64s_bits =
		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;

	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
}
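/*
 * Worked example (illustrative numbers only, not taken from a real config):
 * if ilog2(btree_max_u64s(c)) were 15 and BTREE_WRITE_SET_U64s_BITS were 9,
 * then mid_u64s_bits = (15 + 9) / 2 = 12 - averaging the exponents is the
 * geometric mean in log space - so the node would be compacted once the
 * middle bset grew past 2^12 u64s.
 */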
/*
 * @bch_btree_init_next - initialize a new (unwritten) bset that can then be
 * inserted into
 *
 * Safe to call if there already is an unwritten bset - will only add a new bset
 * if @b doesn't already have one.
 *
 * Returns true if we sorted (i.e. invalidated iterators
 */
void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_node_entry *bne;
	bool reinit_iter = false;

	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
	BUG_ON(bset_written(b, bset(b, &b->set[1])));
	BUG_ON(btree_node_just_written(b));

	if (b->nsets == MAX_BSETS &&
	    !btree_node_write_in_flight(b) &&
	    should_compact_all(c, b)) {
		bch2_btree_node_write(c, b, SIX_LOCK_write,
				      BTREE_WRITE_init_next_bset);
		reinit_iter = true;
	}

	if (b->nsets == MAX_BSETS &&
	    btree_node_compact(c, b))
		reinit_iter = true;

	BUG_ON(b->nsets >= MAX_BSETS);

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	if (reinit_iter)
		bch2_trans_node_reinit_iter(trans, b);
}
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
			  struct bch_dev *ca,
			  struct btree *b, struct bset *i, struct bkey_packed *k,
			  unsigned offset, int write)
{
	prt_printf(out, bch2_log_msg(c, "%s"),
		   write == READ
		   ? "error validating btree node "
		   : "corrupt btree node before write ");
	if (ca)
		prt_printf(out, "on %s ", ca->name);
	prt_printf(out, "at btree ");
	bch2_btree_pos_to_text(out, c, b);

	printbuf_indent_add(out, 2);

	prt_printf(out, "\nnode offset %u/%u",
		   b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
	if (i)
		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
	if (k)
		prt_printf(out, " bset byte offset %lu",
			   (unsigned long)(void *)k -
			   ((unsigned long)(void *)i & ~511UL));
	prt_str(out, ": ");
}
static int __btree_err(int ret,
		       struct bch_fs *c,
		       struct bch_dev *ca,
		       struct btree *b,
		       struct bset *i,
		       struct bkey_packed *k,
		       int write,
		       bool have_retry,
		       enum bch_sb_error_id err_type,
		       const char *fmt, ...)
{
	struct printbuf out = PRINTBUF;
	bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
	va_list args;

	btree_err_msg(&out, c, ca, b, i, k, b->written, write);

	va_start(args, fmt);
	prt_vprintf(&out, fmt, args);
	va_end(args);

	if (write == WRITE) {
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = c->opts.errors == BCH_ON_ERROR_continue
			? 0
			: -BCH_ERR_fsck_errors_not_fixed;
		goto out;
	}

	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
		ret = -BCH_ERR_btree_node_read_err_fixable;
	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
		ret = -BCH_ERR_btree_node_read_err_bad_node;

	if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
		bch2_sb_error_count(c, err_type);

	switch (ret) {
	case -BCH_ERR_btree_node_read_err_fixable:
		ret = !silent
			? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
			: -BCH_ERR_fsck_fix;
		if (ret != -BCH_ERR_fsck_fix &&
		    ret != -BCH_ERR_fsck_ignore)
			goto fsck_err;
		ret = -BCH_ERR_fsck_fix;
		break;
	case -BCH_ERR_btree_node_read_err_want_retry:
	case -BCH_ERR_btree_node_read_err_must_retry:
		if (!silent)
			bch2_print_string_as_lines(KERN_ERR, out.buf);
		break;
	case -BCH_ERR_btree_node_read_err_bad_node:
		if (!silent)
			bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = bch2_topology_error(c);
		break;
	case -BCH_ERR_btree_node_read_err_incompatible:
		if (!silent)
			bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = -BCH_ERR_fsck_errors_not_fixed;
		break;
	default:
		BUG();
	}
out:
fsck_err:
	printbuf_exit(&out);
	return ret;
}
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...)		\
({									\
	int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry,\
			       BCH_FSCK_ERR_##_err_type,		\
			       msg, ##__VA_ARGS__);			\
									\
	if (_ret != -BCH_ERR_fsck_fix) {				\
		ret = _ret;						\
		goto fsck_err;						\
	}								\
									\
	*saw_error = true;						\
	true;								\
})

#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)
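/*
 * Usage note: btree_err()/btree_err_on() expand in place and rely on locals
 * named write, have_retry, ret and saw_error, plus a fsck_err label, existing
 * in the calling function - see validate_bset(), validate_bset_keys() and
 * bch2_btree_node_read_done() below.
 */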
/*
 * When btree topology repair changes the start or end of a node, that might
 * mean we have to drop keys that are no longer inside the node:
 */
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k;

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
				break;

		if (k != i->start) {
			unsigned shift = (u64 *) k - (u64 *) i->start;

			memmove_u64s_down(i->start, k,
					  (u64 *) vstruct_end(i) - (u64 *) k);
			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
			set_btree_bset_end(b, t);
		}

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
				break;

		if (k != vstruct_last(i)) {
			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
			set_btree_bset_end(b, t);
		}
	}

	/*
	 * Always rebuild search trees: eytzinger search tree nodes directly
	 * depend on the values of min/max key:
	 */
	bch2_bset_set_no_aux_tree(b, b->set);
	bch2_btree_build_aux_trees(b);
	b->nr = bch2_btree_node_count_keys(b);

	struct bkey_s_c k;
	struct bkey unpacked;
	struct btree_node_iter iter;
	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
	}
}
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
			 struct btree *b, struct bset *i,
			 unsigned offset, unsigned sectors,
			 int write, bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret = 0;

	btree_err_on(!bch2_version_compatible(version),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i, NULL,
		     btree_node_unsupported_version,
		     "unsupported bset version %u.%u",
		     BCH_VERSION_MAJOR(version),
		     BCH_VERSION_MINOR(version));

	if (btree_err_on(version < c->sb.version_min,
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i, NULL,
			 btree_node_bset_older_than_sb_min,
			 "bset version %u older than superblock version_min %u",
			 version, c->sb.version_min)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version_min = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	if (btree_err_on(BCH_VERSION_MAJOR(version) >
			 BCH_VERSION_MAJOR(c->sb.version),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i, NULL,
			 btree_node_bset_newer_than_sb,
			 "bset version %u newer than superblock version %u",
			 version, c->sb.version)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i, NULL,
		     btree_node_unsupported_version,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

	if (btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, ca, b, i, NULL,
			 bset_past_end_of_btree_node,
			 "bset past end of btree node (offset %u len %u but written %zu)",
			 offset, sectors, ptr_written ?: btree_sectors(c)))
		i->u64s = 0;

	btree_err_on(offset && !i->u64s,
		     -BCH_ERR_btree_node_read_err_fixable,
		     c, ca, b, i, NULL,
		     bset_empty,
		     "empty bset");

	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
		     -BCH_ERR_btree_node_read_err_want_retry,
		     c, ca, b, i, NULL,
		     bset_wrong_sector_offset,
		     "bset at wrong sector offset");

	if (!offset) {
		struct btree_node *bn =
			container_of(i, struct btree_node, keys);
		/* These indicate that we read the wrong btree node: */

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			btree_err_on(bp->seq != bn->keys.seq,
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL, NULL,
				     bset_bad_seq,
				     "incorrect sequence number (wrong btree node)");
		}

		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i, NULL,
			     btree_node_bad_btree,
			     "incorrect btree id");

		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i, NULL,
			     btree_node_bad_level,
			     "incorrect level");

		if (!write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			if (BTREE_PTR_RANGE_UPDATED(bp)) {
				b->data->min_key = bp->min_key;
				b->data->max_key = b->key.k.p;
			}

			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL, NULL,
				     btree_node_bad_min_key,
				     "incorrect min_key: got %s should be %s",
				     (printbuf_reset(&buf1),
				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
				     (printbuf_reset(&buf2),
				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
		}

		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i, NULL,
			     btree_node_bad_max_key,
			     "incorrect max key %s",
			     (printbuf_reset(&buf1),
			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));

		if (write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
			     -BCH_ERR_btree_node_read_err_bad_node,
			     c, ca, b, i, NULL,
			     btree_node_bad_format,
			     "invalid bkey format: %s\n %s", buf1.buf,
			     (printbuf_reset(&buf2),
			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
		printbuf_reset(&buf1);

		compat_bformat(b->c.level, b->c.btree_id, version,
			       BSET_BIG_ENDIAN(i), write,
			       &bn->format);
	}
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}
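/*
 * Summary note: validate_bset() above checks per-bset metadata (version,
 * sector offset, and - for the first bset - the node's seq, btree id, level,
 * min/max keys and bkey format); the per-key checks live in
 * validate_bset_keys() below.
 */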
static int bset_key_validate(struct bch_fs *c, struct btree *b,
			     struct bkey_s_c k,
			     bool updated_range, int rw)
{
	return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?:
		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?:
		(rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0);
}

static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
			      struct bset *i, struct bkey_packed *k)
{
	if (bkey_p_next(k) > vstruct_last(i))
		return false;

	if (k->format > KEY_FORMAT_CURRENT)
		return false;

	if (!bkeyp_u64s_valid(&b->format, k))
		return false;

	struct bkey tmp;
	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
	return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent);
}
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
			      struct bset *i, int write,
			      bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct bkey_packed *k, *prev = NULL;
	struct printbuf buf = PRINTBUF;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	int ret = 0;

	for (k = i->start;
	     k != vstruct_last(i);) {
		struct bkey_s u;
		struct bkey tmp;
		unsigned next_good_key;

		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i, k,
				 btree_node_bkey_past_bset_end,
				 "key extends past end of bset")) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i, k,
				 btree_node_bkey_bad_format,
				 "invalid bkey format %u", k->format))
			goto drop_this_key;

		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i, k,
				 btree_node_bkey_bad_u64s,
				 "bad k->u64s %u (min %u max %zu)", k->u64s,
				 bkeyp_key_u64s(&b->format, k),
				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
			goto drop_this_key;

		if (!write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		u = __bkey_disassemble(b, k, &tmp);

		ret = bset_key_validate(c, b, u.s_c, updated_range, write);
		if (ret == -BCH_ERR_fsck_delete_bkey)
			goto drop_this_key;
		if (ret)
			goto fsck_err;

		if (write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
			struct bkey up = bkey_unpack_key(b, prev);

			printbuf_reset(&buf);
			prt_printf(&buf, "keys out of order: ");
			bch2_bkey_to_text(&buf, &up);
			prt_printf(&buf, " > ");
			bch2_bkey_to_text(&buf, u.k);

			if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
				      c, NULL, b, i, k,
				      btree_node_bkey_out_of_order,
				      "%s", buf.buf))
				goto drop_this_key;
		}

		prev = k;
		k = bkey_p_next(k);
		continue;
drop_this_key:
		next_good_key = k->u64s;

		if (!next_good_key ||
		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
		     version >= bcachefs_metadata_version_snapshot)) {
			/*
			 * only do scanning if bch2_bkey_compat() has nothing to
			 * do
			 */

			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
				for (next_good_key = 1;
				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
				     next_good_key++)
					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
						goto got_good_key;
			}

			/*
			 * didn't find a good key, have to truncate the rest of
			 * the bset
			 */
			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
		}
got_good_key:
		le16_add_cpu(&i->u64s, -next_good_key);
		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
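/*
 * Recovery strategy used above when a key fails validation: advance past the
 * key's stated u64s if something valid follows, otherwise scan forward one
 * u64 at a time for the next thing that parses as a valid packed key, and
 * truncate the bset if nothing does.
 */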
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			      struct btree *b, bool have_retry, bool *saw_error)
{
	struct btree_node_entry *bne;
	struct sort_iter *iter;
	struct btree_node *sorted;
	struct bkey_packed *k;
	struct bset *i;
	bool used_mempool, blacklisted;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	unsigned u64s;
	unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
	u64 max_journal_seq = 0;
	struct printbuf buf = PRINTBUF;
	int ret = 0, retry_read = 0, write = READ;
	u64 start_time = local_clock();

	b->version_ondisk = U16_MAX;
	/* We might get called multiple times on read retry: */
	b->written = 0;

	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);

	if (bch2_meta_read_fault("btree"))
		btree_err(-BCH_ERR_btree_node_read_err_must_retry,
			  c, ca, b, NULL, NULL,
			  btree_node_fault_injected,
			  "dynamic fault");

	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
		     -BCH_ERR_btree_node_read_err_must_retry,
		     c, ca, b, NULL, NULL,
		     btree_node_bad_magic,
		     "bad magic: want %llx, got %llx",
		     bset_magic(c), le64_to_cpu(b->data->magic));
->key
.k
.type
== KEY_TYPE_btree_ptr_v2
) {
1013 struct bch_btree_ptr_v2
*bp
=
1014 &bkey_i_to_btree_ptr_v2(&b
->key
)->v
;
1016 bch2_bpos_to_text(&buf
, b
->data
->min_key
);
1018 bch2_bpos_to_text(&buf
, b
->data
->max_key
);
1020 btree_err_on(b
->data
->keys
.seq
!= bp
->seq
,
1021 -BCH_ERR_btree_node_read_err_must_retry
,
1022 c
, ca
, b
, NULL
, NULL
,
1024 "got wrong btree node: got\n%s",
1025 (printbuf_reset(&buf
),
1026 bch2_btree_node_header_to_text(&buf
, b
->data
),
1029 btree_err_on(!b
->data
->keys
.seq
,
1030 -BCH_ERR_btree_node_read_err_must_retry
,
1031 c
, ca
, b
, NULL
, NULL
,
1033 "bad btree header: seq 0\n%s",
1034 (printbuf_reset(&buf
),
1035 bch2_btree_node_header_to_text(&buf
, b
->data
),
1039 while (b
->written
< (ptr_written
?: btree_sectors(c
))) {
1042 bool first
= !b
->written
;
1048 btree_err_on(!bch2_checksum_type_valid(c
, BSET_CSUM_TYPE(i
)),
1049 -BCH_ERR_btree_node_read_err_want_retry
,
1052 "unknown checksum type %llu", BSET_CSUM_TYPE(i
));
1054 nonce
= btree_nonce(i
, b
->written
<< 9);
1056 struct bch_csum csum
= csum_vstruct(c
, BSET_CSUM_TYPE(i
), nonce
, b
->data
);
1057 csum_bad
= bch2_crc_cmp(b
->data
->csum
, csum
);
1059 bch2_io_error(ca
, BCH_MEMBER_ERROR_checksum
);
1061 btree_err_on(csum_bad
,
1062 -BCH_ERR_btree_node_read_err_want_retry
,
1066 (printbuf_reset(&buf
),
1067 bch2_csum_err_msg(&buf
, BSET_CSUM_TYPE(i
), b
->data
->csum
, csum
),
1070 ret
= bset_encrypt(c
, i
, b
->written
<< 9);
1071 if (bch2_fs_fatal_err_on(ret
, c
,
1072 "decrypting btree node: %s", bch2_err_str(ret
)))
1075 btree_err_on(btree_node_type_is_extents(btree_node_type(b
)) &&
1076 !BTREE_NODE_NEW_EXTENT_OVERWRITE(b
->data
),
1077 -BCH_ERR_btree_node_read_err_incompatible
,
1078 c
, NULL
, b
, NULL
, NULL
,
1079 btree_node_unsupported_version
,
1080 "btree node does not have NEW_EXTENT_OVERWRITE set");
1082 sectors
= vstruct_sectors(b
->data
, c
->block_bits
);
1084 bne
= write_block(b
);
1087 if (i
->seq
!= b
->data
->keys
.seq
)
1090 btree_err_on(!bch2_checksum_type_valid(c
, BSET_CSUM_TYPE(i
)),
1091 -BCH_ERR_btree_node_read_err_want_retry
,
1094 "unknown checksum type %llu", BSET_CSUM_TYPE(i
));
1096 nonce
= btree_nonce(i
, b
->written
<< 9);
1097 struct bch_csum csum
= csum_vstruct(c
, BSET_CSUM_TYPE(i
), nonce
, bne
);
1098 csum_bad
= bch2_crc_cmp(bne
->csum
, csum
);
1100 bch2_io_error(ca
, BCH_MEMBER_ERROR_checksum
);
1102 btree_err_on(csum_bad
,
1103 -BCH_ERR_btree_node_read_err_want_retry
,
1107 (printbuf_reset(&buf
),
1108 bch2_csum_err_msg(&buf
, BSET_CSUM_TYPE(i
), bne
->csum
, csum
),
1111 ret
= bset_encrypt(c
, i
, b
->written
<< 9);
1112 if (bch2_fs_fatal_err_on(ret
, c
,
1113 "decrypting btree node: %s", bch2_err_str(ret
)))
1116 sectors
= vstruct_sectors(bne
, c
->block_bits
);
1119 b
->version_ondisk
= min(b
->version_ondisk
,
1120 le16_to_cpu(i
->version
));
1122 ret
= validate_bset(c
, ca
, b
, i
, b
->written
, sectors
,
1123 READ
, have_retry
, saw_error
);
1128 btree_node_set_format(b
, b
->data
->format
);
1130 ret
= validate_bset_keys(c
, b
, i
, READ
, have_retry
, saw_error
);
1134 SET_BSET_BIG_ENDIAN(i
, CPU_BIG_ENDIAN
);
1136 blacklisted
= bch2_journal_seq_is_blacklisted(c
,
1137 le64_to_cpu(i
->journal_seq
),
1140 btree_err_on(blacklisted
&& first
,
1141 -BCH_ERR_btree_node_read_err_fixable
,
1143 bset_blacklisted_journal_seq
,
1144 "first btree node bset has blacklisted journal seq (%llu)",
1145 le64_to_cpu(i
->journal_seq
));
1147 btree_err_on(blacklisted
&& ptr_written
,
1148 -BCH_ERR_btree_node_read_err_fixable
,
1150 first_bset_blacklisted_journal_seq
,
1151 "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
1152 le64_to_cpu(i
->journal_seq
),
1153 b
->written
, b
->written
+ sectors
, ptr_written
);
1155 b
->written
+= sectors
;
1157 if (blacklisted
&& !first
)
1164 max_journal_seq
= max(max_journal_seq
, le64_to_cpu(i
->journal_seq
));
	if (ptr_written) {
		btree_err_on(b->written < ptr_written,
			     -BCH_ERR_btree_node_read_err_want_retry,
			     c, ca, b, NULL, NULL,
			     btree_node_data_missing,
			     "btree node data missing: expected %u sectors, found %u",
			     ptr_written, b->written);
	} else {
		for (bne = write_block(b);
		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
		     bne = (void *) bne + block_bytes(c))
			btree_err_on(bne->keys.seq == b->data->keys.seq &&
				     !bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq),
							true),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, NULL, NULL,
				     btree_node_bset_after_end,
				     "found bset signature after last bset");
	}

	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
	sorted->keys.u64s = 0;

	set_btree_bset(b, b->set, &b->data->keys);

	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
	memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
	       btree_buf_bytes(b) -
	       sizeof(struct btree_node) -
	       b->nr.live_u64s * sizeof(u64));

	u64s = le16_to_cpu(sorted->keys.u64s);
	*sorted = *b->data;
	sorted->keys.u64s = cpu_to_le16(u64s);
	swap(sorted, b->data);
	set_btree_bset(b, b->set, &b->data->keys);
	b->nsets = 1;
	b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);

	BUG_ON(b->nr.live_u64s != u64s);

	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);

	if (updated_range)
		bch2_btree_node_drop_keys_outside_node(b);

	i = &b->data->keys;
	for (k = i->start; k != vstruct_last(i);) {
		struct bkey tmp;
		struct bkey_s u = __bkey_disassemble(b, k, &tmp);

		ret = bch2_bkey_val_validate(c, u.s_c, READ);
		if (ret == -BCH_ERR_fsck_delete_bkey ||
		    (bch2_inject_invalid_keys &&
		     !bversion_cmp(u.k->bversion, MAX_VERSION))) {
			btree_keys_account_key_drop(&b->nr, 0, k);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_p_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			set_btree_bset_end(b, b->set);
			continue;
		}
		if (ret)
			goto fsck_err;

		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);

			bp.v->mem_ptr = 0;
		}

		k = bkey_p_next(k);
	}

	bch2_bset_build_aux_tree(b, b->set, false);

	set_needs_whiteout(btree_bset_first(b), true);

	btree_node_reset_sib_u64s(b);

	rcu_read_lock();
	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
		struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);

		if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
			set_btree_node_need_rewrite(b);
	}
	rcu_read_unlock();

	if (!ptr_written)
		set_btree_node_need_rewrite(b);
out:
	mempool_free(iter, &c->fill_iter);
	printbuf_exit(&buf);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
	return retry_read;
fsck_err:
	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
	    ret == -BCH_ERR_btree_node_read_err_must_retry) {
		retry_read = 1;
	} else {
		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
	}
	goto out;
}
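/*
 * Read path summary: bch2_btree_node_read_done() above verifies the header
 * (magic/seq), then walks each bset - checksum, decrypt, validate_bset() and
 * validate_bset_keys() - before merging everything with
 * bch2_key_sort_fix_overlapping() and dropping any keys that fail value
 * validation.
 */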
static void btree_node_read_work(struct work_struct *work)
{
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c	= rb->c;
	struct bch_dev *ca	= rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
	struct btree *b		= rb->b;
	struct bio *bio		= &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	struct printbuf buf = PRINTBUF;
	bool saw_error = false;
	bool retry = false;
	bool can_retry;

	goto start;
	while (1) {
		retry = true;
		bch_info(c, "retrying read");
		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
		rb->have_ioref		= ca != NULL;
		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
		bio->bi_iter.bi_size	= btree_buf_bytes(b);

		if (rb->have_ioref) {
			bio_set_dev(bio, ca->disk_sb.bdev);
			submit_bio_wait(bio);
		} else {
			bio->bi_status = BLK_STS_REMOVED;
		}
start:
		printbuf_reset(&buf);
		bch2_btree_pos_to_text(&buf, c, b);
		bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				   "btree read error %s for %s",
				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
		if (rb->have_ioref)
			percpu_ref_put(&ca->io_ref);
		rb->have_ioref = false;

		bch2_mark_io_failure(&failed, &rb->pick);

		can_retry = bch2_bkey_pick_read_device(c,
					bkey_i_to_s_c(&b->key),
					&failed, &rb->pick) > 0;

		if (!bio->bi_status &&
		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
			if (retry)
				bch_info(c, "retry success");
			break;
		}

		saw_error = true;

		if (!can_retry) {
			set_btree_node_read_error(b);
			bch2_btree_lost_data(c, b->c.btree_id);
			break;
		}
	}

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
			       rb->start_time);
	bio_put(&rb->bio);

	if (saw_error &&
	    !btree_node_read_error(b) &&
	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
		printbuf_reset(&buf);
		bch2_bpos_to_text(&buf, b->key.k.p);
		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
				    __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);

		bch2_btree_node_rewrite_async(c, b);
	}

	printbuf_exit(&buf);
	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
static void btree_node_read_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c	= rb->c;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
	}

	queue_work(c->btree_read_complete_wq, &rb->work);
}
struct btree_node_read_all {
	struct closure		cl;
	struct bch_fs		*c;
	struct btree		*b;
	unsigned		nr;
	void			*buf[BCH_REPLICAS_MAX];
	struct bio		*bio[BCH_REPLICAS_MAX];
	blk_status_t		err[BCH_REPLICAS_MAX];
};

static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < btree_sectors(c)) {
		if (!offset)
			offset += vstruct_sectors(bn, c->block_bits);
		else {
			bne = data + (offset << 9);
			if (bne->keys.seq != bn->keys.seq)
				break;
			offset += vstruct_sectors(bne, c->block_bits);
		}
	}

	return offset;
}

static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	if (!offset)
		return false;

	while (offset < btree_sectors(c)) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}
static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
	closure_type(ra, struct btree_node_read_all, cl);
	struct bch_fs *c = ra->c;
	struct btree *b = ra->b;
	struct printbuf buf = PRINTBUF;
	bool dump_bset_maps = false;
	bool have_retry = false;
	int ret = 0, best = -1, write = READ;
	unsigned i, written = 0, written2 = 0;
	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
	bool _saw_error = false, *saw_error = &_saw_error;

	for (i = 0; i < ra->nr; i++) {
		struct btree_node *bn = ra->buf[i];

		if (ra->err[i])
			continue;

		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
		    (seq && seq != bn->keys.seq))
			continue;

		if (best < 0) {
			best = i;
			written = btree_node_sectors_written(c, bn);
			continue;
		}

		written2 = btree_node_sectors_written(c, ra->buf[i]);
		if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL, NULL,
				 btree_node_replicas_sectors_written_mismatch,
				 "btree node sectors written mismatch: %u != %u",
				 written, written2) ||
		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL, NULL,
				 btree_node_bset_after_end,
				 "found bset signature after last bset") ||
		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL, NULL,
				 btree_node_replicas_data_mismatch,
				 "btree node replicas content mismatch"))
			dump_bset_maps = true;

		if (written2 > written) {
			written = written2;
			best = i;
		}
	}
fsck_err:
	if (dump_bset_maps) {
		for (i = 0; i < ra->nr; i++) {
			struct btree_node *bn = ra->buf[i];
			struct btree_node_entry *bne = NULL;
			unsigned offset = 0, sectors;
			bool gap = false;

			if (ra->err[i])
				continue;

			printbuf_reset(&buf);

			while (offset < btree_sectors(c)) {
				if (!offset) {
					sectors = vstruct_sectors(bn, c->block_bits);
				} else {
					bne = ra->buf[i] + (offset << 9);
					if (bne->keys.seq != bn->keys.seq)
						break;
					sectors = vstruct_sectors(bne, c->block_bits);
				}

				prt_printf(&buf, " %u-%u", offset, offset + sectors);
				if (bne && bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
					prt_printf(&buf, "*");
				offset += sectors;
			}

			while (offset < btree_sectors(c)) {
				bne = ra->buf[i] + (offset << 9);
				if (bne->keys.seq == bn->keys.seq) {
					if (!gap)
						prt_printf(&buf, " GAP");
					gap = true;

					sectors = vstruct_sectors(bne, c->block_bits);
					prt_printf(&buf, " %u-%u", offset, offset + sectors);
					if (bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
						prt_printf(&buf, "*");
				}
				offset++;
			}

			bch_err(c, "replica %u:%s", i, buf.buf);
		}
	}

	if (best >= 0) {
		memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
		ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
	} else {
		ret = -1;
	}

	if (ret) {
		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
	} else if (*saw_error)
		bch2_btree_node_rewrite_async(c, b);

	for (i = 0; i < ra->nr; i++) {
		mempool_free(ra->buf[i], &c->btree_bounce_pool);
		bio_put(ra->bio[i]);
	}

	closure_debug_destroy(&ra->cl);
	kfree(ra);
	printbuf_exit(&buf);

	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
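/*
 * When reading all replicas, the "best" copy is the one with the most sectors
 * written; mismatches in written sectors or content between replicas are
 * reported (and the per-replica bset maps dumped) before the best copy is
 * handed to bch2_btree_node_read_done().
 */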
static void btree_node_read_all_replicas_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c	= rb->c;
	struct btree_node_read_all *ra = rb->ra;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	ra->err[rb->idx] = bio->bi_status;
	closure_put(&ra->cl);
}
/*
 * XXX This allocates multiple times from the same mempools, and can deadlock
 * under sufficient memory pressure (but is only a debug path)
 */
static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
{
	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded pick;
	struct btree_node_read_all *ra;
	unsigned i;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;

	closure_init(&ra->cl, NULL);
	ra->c	= c;
	ra->b	= b;
	ra->nr	= bch2_bkey_nr_ptrs(k);

	for (i = 0; i < ra->nr; i++) {
		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
		ra->bio[i] = bio_alloc_bioset(NULL,
					      buf_pages(ra->buf[i], btree_buf_bytes(b)),
					      REQ_OP_READ|REQ_SYNC|REQ_META,
					      GFP_NOFS,
					      &c->btree_bio);
	}

	i = 0;
	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
		struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
		struct btree_read_bio *rb =
			container_of(ra->bio[i], struct btree_read_bio, bio);

		rb->c			= c;
		rb->b			= b;
		rb->ra			= ra;
		rb->start_time		= local_clock();
		rb->have_ioref		= ca != NULL;
		rb->idx			= i;
		rb->pick		= pick;
		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
		bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));

		if (rb->have_ioref) {
			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
				     bio_sectors(&rb->bio));
			bio_set_dev(&rb->bio, ca->disk_sb.bdev);

			closure_get(&ra->cl);
			submit_bio(&rb->bio);
		} else {
			ra->err[i] = BLK_STS_REMOVED;
		}

		i++;
	}

	if (sync) {
		closure_sync(&ra->cl);
		btree_node_read_all_replicas_done(&ra->cl.work);
	} else {
		continue_at(&ra->cl, btree_node_read_all_replicas_done,
			    c->btree_read_complete_wq);
	}

	return 0;
}
void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
			  bool sync)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct btree_read_bio *rb;
	struct bch_dev *ca;
	struct bio *bio;
	int ret;

	trace_and_count(c, btree_node_read, trans, b);

	if (bch2_verify_all_btree_replicas &&
	    !btree_node_read_all_replicas(c, b, sync))
		return;

	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
					 NULL, &pick);

	if (ret <= 0) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "btree node read error: no device to read from\n at ");
		bch2_btree_pos_to_text(&buf, c, b);
		bch_err_ratelimited(c, "%s", buf.buf);

		if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
			bch2_fatal_error(c);

		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
		clear_btree_node_read_in_flight(b);
		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
		printbuf_exit(&buf);
		return;
	}

	ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	bio = bio_alloc_bioset(NULL,
			       buf_pages(b->data, btree_buf_bytes(b)),
			       REQ_OP_READ|REQ_SYNC|REQ_META,
			       GFP_NOFS,
			       &c->btree_bio);
	rb = container_of(bio, struct btree_read_bio, bio);
	rb->c			= c;
	rb->b			= b;
	rb->start_time		= local_clock();
	rb->have_ioref		= ca != NULL;
	rb->pick		= pick;
	INIT_WORK(&rb->work, btree_node_read_work);
	bio->bi_iter.bi_sector	= pick.ptr.offset;
	bio->bi_end_io		= btree_node_read_endio;
	bch2_bio_map(bio, b->data, btree_buf_bytes(b));

	if (rb->have_ioref) {
		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
			     bio_sectors(bio));
		bio_set_dev(bio, ca->disk_sb.bdev);

		if (sync) {
			submit_bio_wait(bio);
			bch2_latency_acct(ca, rb->start_time, READ);
			btree_node_read_work(&rb->work);
		} else {
			submit_bio(bio);
		}
	} else {
		bio->bi_status = BLK_STS_REMOVED;

		if (sync)
			btree_node_read_work(&rb->work);
		else
			queue_work(c->btree_read_complete_wq, &rb->work);
	}
}
static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
				  const struct bkey_i *k, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, level != 0);
	bch2_btree_cache_cannibalize_unlock(trans);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

	set_btree_node_read_in_flight(b);

	/* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
	bch2_trans_unlock(trans);
	bch2_btree_node_read(trans, b, true);

	if (btree_node_read_error(b)) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);

		ret = -BCH_ERR_btree_node_read_error;
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return ret;
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			 const struct bkey_i *k, unsigned level)
{
	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
				      struct btree_write *w)
{
	unsigned long old, new;

	old = READ_ONCE(b->will_make_reachable);
	do {
		new = old;
		if (!(old & 1))
			break;

		new &= ~1UL;
	} while (!try_cmpxchg(&b->will_make_reachable, &old, new));

	if (old & 1)
		closure_put(&((struct btree_update *) new)->cl);

	bch2_journal_pin_drop(&c->journal, &w->journal);
}

static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_write *w = btree_prev_write(b);
	unsigned long old, new;
	unsigned type = 0;

	bch2_btree_complete_write(c, b, w);

	old = READ_ONCE(b->flags);
	do {
		new = old;

		if ((old & (1U << BTREE_NODE_dirty)) &&
		    (old & (1U << BTREE_NODE_need_write)) &&
		    !(old & (1U << BTREE_NODE_never_write)) &&
		    !(old & (1U << BTREE_NODE_write_blocked)) &&
		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
			new &= ~(1U << BTREE_NODE_dirty);
			new &= ~(1U << BTREE_NODE_need_write);
			new |=  (1U << BTREE_NODE_write_in_flight);
			new |=  (1U << BTREE_NODE_write_in_flight_inner);
			new |=  (1U << BTREE_NODE_just_written);
			new ^=  (1U << BTREE_NODE_write_idx);

			type = new & BTREE_WRITE_TYPE_MASK;
			new &= ~BTREE_WRITE_TYPE_MASK;
		} else {
			new &= ~(1U << BTREE_NODE_write_in_flight);
			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
		}
	} while (!try_cmpxchg(&b->flags, &old, new));

	if (new & (1U << BTREE_NODE_write_in_flight))
		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
	else
		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
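/*
 * The cmpxchg loop above is a small state machine on b->flags: if the node was
 * redirtied and needs another write (and nothing blocks it), the in-flight
 * bits stay set and the write is immediately restarted with
 * BTREE_WRITE_ALREADY_STARTED; otherwise the in-flight bits are cleared and
 * waiters on BTREE_NODE_write_in_flight are woken.
 */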
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_trans *trans = bch2_trans_get(c);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);

	/* we don't need transaction context anymore after we got the lock. */
	bch2_trans_put(trans);
	__btree_node_write_done(c, b);
	six_unlock_read(&b->c.lock);
}
static void btree_node_write_work(struct work_struct *work)
{
	struct btree_write_bio *wbio =
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c	= wbio->wbio.c;
	struct btree *b		= wbio->wbio.bio.bi_private;
	int ret = 0;

	btree_bounce_free(c,
		wbio->data_bytes,
		wbio->wbio.used_mempool,
		wbio->data);

	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
		ret = -BCH_ERR_btree_node_write_all_failed;
		goto err;
	}

	if (wbio->wbio.first_btree_write) {
		if (wbio->wbio.failed.nr) {

		}
	} else {
		ret = bch2_trans_do(c,
			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
					BCH_WATERMARK_interior_updates|
					BCH_TRANS_COMMIT_journal_reclaim|
					BCH_TRANS_COMMIT_no_enospc|
					BCH_TRANS_COMMIT_no_check_rw,
					!wbio->wbio.failed.nr));
		if (ret)
			goto err;
	}
out:
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
	return;
err:
	set_btree_node_noevict(b);
	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
			     "writing btree node: %s", bch2_err_str(ret));
	goto out;
}
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig	= parent ?: wbio;
	struct btree_write_bio *wb	= container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c		= wbio->c;
	struct btree *b			= wbio->bio.bi_private;
	struct bch_dev *ca		= wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
	unsigned long flags;

	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (!ca ||
	    bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
		return;
	}

	clear_btree_node_write_in_flight_inner(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
	INIT_WORK(&wb->work, btree_node_write_work);
	queue_work(c->btree_io_complete_wq, &wb->work);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	bool saw_error;

	int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
				     BKEY_TYPE_btree, WRITE);
	if (ret) {
		bch2_fs_inconsistent(c, "invalid btree node key before write");
		return ret;
	}

	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
	if (ret)
		bch2_inconsistent_error(c);

	return ret;
}
static void btree_write_submit(struct work_struct *work)
{
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

	bkey_copy(&tmp.k, &wbio->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
		ptr->offset += wbio->sector_offset;

	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
				  &tmp.k, false);
}
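/*
 * The key is copied to the stack here because the pointer offsets need to be
 * adjusted by sector_offset: a btree node is written out incrementally, so
 * each write lands that many sectors into the extent the node key points at.
 */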
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
	struct btree_write_bio *wbio;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	struct sort_iter_stack sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
	void *data;
	int ret;

	if (flags & BTREE_WRITE_ALREADY_STARTED)
		goto do_write;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is our
	 * "lock" against racing with other threads that may be trying to start
	 * a write, we do a write iff we clear the dirty bit. Since setting the
	 * dirty bit requires a write lock, we can't race with other threads
	 * redirtying it:
	 */
	old = READ_ONCE(b->flags);
	do {
		new = old;

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
		    !(old & (1 << BTREE_NODE_need_write)))
			return;

		if (old &
		    ((1 << BTREE_NODE_never_write)|
		     (1 << BTREE_NODE_write_blocked)))
			return;

		if (b->written &&
		    (old & (1 << BTREE_NODE_will_make_reachable)))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight))
			return;

		if (flags & BTREE_WRITE_ONLY_IF_NEED)
			type = new & BTREE_WRITE_TYPE_MASK;
		new &= ~BTREE_WRITE_TYPE_MASK;

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |=  (1 << BTREE_NODE_write_in_flight);
		new |=  (1 << BTREE_NODE_write_in_flight_inner);
		new |=  (1 << BTREE_NODE_just_written);
		new ^=  (1 << BTREE_NODE_write_idx);
	} while (!try_cmpxchg_acquire(&b->flags, &old, new));

	if (new & (1U << BTREE_NODE_need_write))
		return;
do_write:
	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));

	atomic_long_dec(&c->btree_cache.nr_dirty);

	BUG_ON(btree_node_fake(b));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= btree_sectors(c));
	BUG_ON(b->written & (block_sectors(c) - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

	bch2_sort_whiteouts(c, b);

	sort_iter_stack_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	BUG_ON(b->written && !seq);

	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
	bytes += 8;

	/* buffer must be a multiple of the block size */
	bytes = round_up(bytes, block_bytes(c));

	data = btree_bounce_alloc(c, bytes, &used_mempool);

	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq = cpu_to_le64(seq);
	i->u64s = 0;

	sort_iter_add(&sort_iter.iter,
		      unwritten_whiteouts_start(b),
		      unwritten_whiteouts_end(b));
	SET_BSET_SEPARATE_WHITEOUTS(i, false);

	u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
	le16_add_cpu(&i->u64s, u64s);

	b->whiteout_u64s = 0;

	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);

	set_needs_whiteout(i, false);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	if (!b->written &&
	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
		BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = cpu_to_le16(c->sb.version);
	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"encrypting btree node: %s", bch2_err_str(ret)))
		goto err;

	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Also on journal error, the pending write may have updates that were
	 * never journalled (interior nodes, see btree_update_nodes_written()) -
	 * it's critical that we don't do the write in that case otherwise we
	 * will have updates visible that weren't in the journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META,
				GFP_NOFS,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data			= data;
	wbio->data_bytes		= bytes;
	wbio->sector_offset		= b->written;
	wbio->wbio.c			= c;
	wbio->wbio.used_mempool		= used_mempool;
	wbio->wbio.first_btree_write	= !b->written;
	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
	wbio->wbio.bio.bi_private	= b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

	bkey_copy(&wbio->key, &b->key);

	b->written += sectors_to_write;

	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
			cpu_to_le16(b->written);

	atomic64_inc(&c->btree_write_stats[type].nr);
	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);

	INIT_WORK(&wbio->work, btree_write_submit);
	queue_work(c->btree_write_submit_wq, &wbio->work);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
	__btree_node_write_done(c, b);
}
/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, 0, b->nsets);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
	}

	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t), true);

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we have
	 * to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}
/*
 * Use this one if the node is intent locked:
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held,
			   unsigned flags)
{
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->c.lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}
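/*
 * Locking note for bch2_btree_node_write() above: with an intent lock (or a
 * read lock we can upgrade), post-write cleanup is attempted right away via a
 * write trylock and the lock is downgraded again afterwards; with only a read
 * lock that can't be upgraded, the cleanup is skipped here and happens the
 * next time the node is write locked.
 */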
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	bool ret = false;
	unsigned i;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			ret = true;
			goto restart;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
static const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
	BCH_BTREE_WRITE_TYPES()
};

void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 20);
	printbuf_tabstop_push(out, 10);

	prt_printf(out, "\tnr\tsize\n");

	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
		u64 nr		= atomic64_read(&c->btree_write_stats[i].nr);
		u64 bytes	= atomic64_read(&c->btree_write_stats[i].bytes);

		prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
		prt_newline(out);
	}
}
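/*
 * The resulting table has one row per write type (see BCH_BTREE_WRITE_TYPES),
 * showing the number of writes of that type and the average size per write
 * (bytes / nr, printed in human-readable units).
 */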