// SPDX-License-Identifier: GPL-2.0
4 #include "alloc_background.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
13 #include "disk_accounting.h"
16 #include "fs-common.h"
17 #include "journal_io.h"
18 #include "journal_reclaim.h"
19 #include "journal_seq_blacklist.h"
20 #include "logged_ops.h"
23 #include "rebalance.h"
25 #include "recovery_passes.h"
28 #include "sb-downgrade.h"
32 #include <linux/sort.h>
33 #include <linux/stat.h>
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
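/*
 * Note which btrees are known to have lost data, by setting the corresponding
 * bit in the superblock's btrees_lost_data field - recovery passes and future
 * mounts use this to decide what needs repair.
 */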
void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
	if (btree >= BTREE_ID_NR_MAX)
		return;

	u64 b = BIT_ULL(btree);

	if (!(c->sb.btrees_lost_data & b)) {
		bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));

		mutex_lock(&c->sb_lock);
		bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}
}
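/*
 * Alloc info (the allocation, freespace, LRU and backpointer btrees) is
 * derived state that can be rebuilt by walking the other btrees, which is why
 * it's safe to drop it wholesale here and schedule the repair passes below.
 */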
/* for -o reconstruct_alloc: */
static void bch2_reconstruct_alloc(struct bch_fs *c)
{
	bch2_journal_log_msg(c, "dropping alloc info");
	bch_info(c, "dropping and reconstructing all alloc info");

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);

	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);

	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
	__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);

	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
	bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/*
 * Btree node pointers have a field to stack a pointer to the in memory btree
 * node; we need to zero out this field when reading in btree nodes, or when
 * reading in keys from the journal:
 */
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
	darray_for_each(*keys, i)
		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}

/* journal replay: */
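/*
 * Advance the journal's replay cursor to @seq, dropping journal pins on every
 * entry we've finished replaying so that journal space can be reclaimed.
 */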
static void replay_now_at(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->replay_journal_seq);

	seq = min(seq, j->replay_journal_seq_end);

	while (j->replay_journal_seq < seq)
		bch2_journal_pin_put(j, j->replay_journal_seq++);
}
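/*
 * Accounting keys are deltas: rather than overwriting, replay accumulates the
 * journal key into the existing btree key - unless the btree version already
 * includes this delta, as detected by the bversion comparison below.
 */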
static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
					      struct journal_key *k)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  BTREE_ITER_intent);
	int ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	struct bkey u;
	struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);

	/* Has this delta already been applied to the btree? */
	if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
		ret = 0;
		goto out;
	}

	struct bkey_i *new = k->k;
	if (old.k->type == KEY_TYPE_accounting) {
		new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto out;

		bch2_accounting_accumulate(bkey_i_to_accounting(new),
					   bkey_s_c_to_accounting(old));
	}

	trans->journal_res.seq = k->journal_seq;

	ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
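/*
 * Replay a single non-accounting journal key: re-insert it at its btree
 * position with BTREE_TRIGGER_norun, since triggers already ran when the
 * update was originally journalled.
 */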
static int bch2_journal_replay_key(struct btree_trans *trans,
				   struct journal_key *k)
{
	struct btree_iter iter;
	unsigned iter_flags =
		BTREE_ITER_intent|
		BTREE_ITER_not_extents;
	unsigned update_flags = BTREE_TRIGGER_norun;
	int ret;

	if (k->overwritten)
		return 0;

	trans->journal_res.seq = k->journal_seq;

	/*
	 * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
	 * keep the key cache coherent with the underlying btree. Nothing
	 * besides the allocator is doing updates yet so we don't need key cache
	 * coherency for non-alloc btrees, and key cache fills for snapshots
	 * btrees use BTREE_ITER_filter_snapshots, which isn't available until
	 * the snapshots recovery pass runs.
	 */
	if (!k->level && k->btree_id == BTREE_ID_alloc)
		iter_flags |= BTREE_ITER_cached;
	else
		update_flags |= BTREE_UPDATE_key_cache_reclaim;

	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  iter_flags);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	struct btree_path *path = btree_iter_path(trans, &iter);
	if (unlikely(!btree_path_node(path, k->level))) {
		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
					  BTREE_MAX_DEPTH, 0, iter_flags);
		ret =   bch2_btree_iter_traverse(&iter) ?:
			bch2_btree_increase_depth(trans, iter.path, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	/* Must be checked with btree locked: */
	if (k->overwritten)
		goto out;

	if (k->k->k.type == KEY_TYPE_accounting) {
		ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
		goto out;
	}

	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = *((const struct journal_key **)_l);
	const struct journal_key *r = *((const struct journal_key **)_r);

	/*
	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
	 *
	 * journal_seq == 0 means that the key comes from early repair, and
	 * should be inserted last so as to avoid overflowing the journal
	 */
	return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}
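/*
 * Journal replay proper: accounting keys go first (the write buffer can't be
 * allowed to flush accounting until they're done), then everything else in
 * btree-sorted order for locality, falling back to journal order for any keys
 * whose sorted-order commit would risk deadlocking on journal space.
 */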
int bch2_journal_replay(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	DARRAY(struct journal_key *) keys_sorted = { 0 };
	struct journal *j = &c->journal;
	u64 start_seq	= c->journal_replay_seq_start;
	u64 end_seq	= c->journal_replay_seq_end;
	struct btree_trans *trans = NULL;
	bool immediate_flush = false;
	int ret = 0;

	if (keys->nr) {
		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
					   keys->nr, start_seq, end_seq);
		if (ret)
			goto err;
	}

	BUG_ON(!atomic_read(&keys->ref));

	move_gap(keys, keys->nr);
	trans = bch2_trans_get(c);

	/*
	 * Replay accounting keys first: we can't allow the write buffer to
	 * flush accounting keys until we're done
	 */
	darray_for_each(*keys, k) {
		if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
			continue;

		cond_resched();

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_journal_reclaim|
				BCH_TRANS_COMMIT_skip_accounting_apply|
				BCH_TRANS_COMMIT_no_journal_res|
				BCH_WATERMARK_reclaim,
			     bch2_journal_replay_accounting_key(trans, k));
		if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
			goto err;

		k->overwritten = true;
	}

	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	/*
	 * First, attempt to replay keys in sorted order. This is more
	 * efficient - better locality of btree access - but some might fail if
	 * that would cause a journal deadlock.
	 */
	darray_for_each(*keys, k) {
		cond_resched();

		/*
		 * k->allocated means the key wasn't read in from the journal,
		 * rather it was from early repair code
		 */
		if (k->allocated)
			immediate_flush = true;

		/* Skip fastpath if we're low on space in the journal */
		ret = c->journal.watermark ? -1 :
			commit_do(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc|
				  BCH_TRANS_COMMIT_journal_reclaim|
				  BCH_TRANS_COMMIT_skip_accounting_apply|
				  (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
			     bch2_journal_replay_key(trans, k));
		BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
		if (ret) {
			ret = darray_push(&keys_sorted, k);
			if (ret)
				goto err;
		}
	}

	bch2_trans_unlock_long(trans);
	/*
	 * Now, replay any remaining keys in the order in which they appear in
	 * the journal, unpinning those journal entries as we go:
	 */
	sort(keys_sorted.data, keys_sorted.nr,
	     sizeof(keys_sorted.data[0]),
	     journal_sort_seq_cmp, NULL);

	darray_for_each(keys_sorted, kp) {
		cond_resched();

		struct journal_key *k = *kp;

		if (k->journal_seq)
			replay_now_at(j, k->journal_seq);
		else
			replay_now_at(j, j->replay_journal_seq_end);

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_skip_accounting_apply|
				(!k->allocated
				 ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
				 : 0),
			     bch2_journal_replay_key(trans, k));
		bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
			    bch2_btree_id_str(k->btree_id), k->level);
		if (ret)
			goto err;

		BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
	}

	/*
	 * We need to put our btree_trans before calling flush_all_pins(), since
	 * that will use a btree_trans internally
	 */
	bch2_trans_put(trans);
	trans = NULL;

	if (!c->opts.retain_recovery_info &&
	    c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
		bch2_journal_keys_put_initial(c);

	replay_now_at(j, j->replay_journal_seq_end);
	j->replay_journal_seq = 0;

	bch2_journal_set_replay_done(j);

	/* if we did any repair, flush it immediately */
	if (immediate_flush) {
		bch2_journal_flush_all_pins(&c->journal);
		ret = bch2_journal_meta(&c->journal);
	}

	if (keys->nr)
		bch2_journal_log_msg(c, "journal replay finished");
err:
	if (trans)
		bch2_trans_put(trans);
	darray_exit(&keys_sorted);
	bch_err_fn(c, ret);
	return ret;
}

/* journal replay early: */
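/*
 * Early replay handles jset entries that must take effect before the btrees
 * are usable: btree roots, usage counters, blacklisted journal sequence
 * numbers and IO clocks are applied directly to in-memory state.
 */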
static int journal_replay_entry_early(struct bch_fs *c,
				      struct jset_entry *entry)
{
	int ret = 0;

	switch (entry->type) {
	case BCH_JSET_ENTRY_btree_root: {
		struct btree_root *r;

		if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
				c, invalid_btree_id,
				"invalid btree id %u (max %u)",
				entry->btree_id, BTREE_ID_NR_MAX))
			return 0;

		while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
			ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
			if (ret)
				return ret;
		}

		r = bch2_btree_id_root(c, entry->btree_id);

		if (entry->u64s) {
			r->level = entry->level;
			bkey_copy(&r->key, (struct bkey_i *) entry->start);
			r->error = 0;
		} else {
			r->error = -BCH_ERR_btree_node_read_error;
		}
		r->alive = true;
		break;
	}
	case BCH_JSET_ENTRY_usage: {
		struct jset_entry_usage *u =
			container_of(entry, struct jset_entry_usage, entry);

		switch (entry->btree_id) {
		case BCH_FS_USAGE_key_version:
			atomic64_set(&c->key_version, le64_to_cpu(u->v));
			break;
		}
		break;
	}
	case BCH_JSET_ENTRY_blacklist: {
		struct jset_entry_blacklist *bl_entry =
			container_of(entry, struct jset_entry_blacklist, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->seq),
				le64_to_cpu(bl_entry->seq) + 1);
		break;
	}
	case BCH_JSET_ENTRY_blacklist_v2: {
		struct jset_entry_blacklist_v2 *bl_entry =
			container_of(entry, struct jset_entry_blacklist_v2, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->start),
				le64_to_cpu(bl_entry->end) + 1);
		break;
	}
	case BCH_JSET_ENTRY_clock: {
		struct jset_entry_clock *clock =
			container_of(entry, struct jset_entry_clock, entry);

		atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
	}
	}
fsck_err:
	return ret;
}
static int journal_replay_early(struct bch_fs *c,
				struct bch_sb_field_clean *clean)
{
	if (clean) {
		for (struct jset_entry *entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			int ret = journal_replay_entry_early(c, entry);
			if (ret)
				return ret;
		}
	} else {
		struct genradix_iter iter;
		struct journal_replay *i, **_i;

		genradix_for_each(&c->journal_entries, iter, _i) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			vstruct_for_each(&i->j, entry) {
				int ret = journal_replay_entry_early(c, entry);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

/* sb clean section: */
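/*
 * Read in every btree root recorded in the superblock/journal; a root that
 * fails validation or reading gets its repair passes scheduled, and btrees
 * with no root at all are given empty fake roots so later passes can run.
 */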
static int read_btree_roots(struct bch_fs *c)
{
	int ret = 0;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
			continue;

		if (mustfix_fsck_err_on((ret = r->error),
					c, btree_root_bkey_invalid,
					"invalid btree root %s",
					bch2_btree_id_str(i)) ||
		    mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
					c, btree_root_read_error,
					"error reading btree root %s l=%u: %s",
					bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
			if (btree_id_is_alloc(i)) {
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
				r->error = 0;
			} else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
				bch_info(c, "will run btree node scan");
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
				c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
			}

			ret = 0;
			bch2_btree_lost_data(c, i);
		}
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->b && !r->error) {
			r->alive = false;
			r->level = 0;
			bch2_btree_root_alloc_fake(c, i, 0);
		}
	}
fsck_err:
	return ret;
}
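/*
 * Decide at mount time whether the superblock version should be upgraded,
 * honouring the version_upgrade option; returns true if the superblock was
 * modified and needs to be written.
 */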
static bool check_version_upgrade(struct bch_fs *c)
{
	unsigned latest_version	= bcachefs_metadata_version_current;
	unsigned latest_compatible = min(latest_version,
					 bch2_latest_compatible_version(c->sb.version));
	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
	unsigned new_version = 0;

	if (old_version < bcachefs_metadata_required_upgrade_below) {
		if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
		    latest_compatible < bcachefs_metadata_required_upgrade_below)
			new_version = latest_version;
		else
			new_version = latest_compatible;
	} else {
		switch (c->opts.version_upgrade) {
		case BCH_VERSION_UPGRADE_compatible:
			new_version = latest_compatible;
			break;
		case BCH_VERSION_UPGRADE_incompatible:
			new_version = latest_version;
			break;
		case BCH_VERSION_UPGRADE_none:
			new_version = min(old_version, latest_version);
			break;
		}
	}

	if (new_version > old_version) {
		struct printbuf buf = PRINTBUF;

		if (old_version < bcachefs_metadata_required_upgrade_below)
			prt_str(&buf, "Version upgrade required:\n");

		if (old_version != c->sb.version) {
			prt_str(&buf, "Version upgrade from ");
			bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
			prt_str(&buf, " to ");
			bch2_version_to_text(&buf, c->sb.version);
			prt_str(&buf, " incomplete\n");
		}

		prt_printf(&buf, "Doing %s version upgrade from ",
			   BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
			   ? "incompatible" : "compatible");
		bch2_version_to_text(&buf, old_version);
		prt_str(&buf, " to ");
		bch2_version_to_text(&buf, new_version);

		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_upgrade(c, old_version, new_version);
		passes = ext->recovery_passes_required[0] & ~passes;

		if (passes) {
			prt_str(&buf, " running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_info(c, "%s", buf.buf);

		bch2_sb_upgrade(c, new_version);

		printbuf_exit(&buf);
		return true;
	}

	return false;
}
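/*
 * Main recovery path, run at every mount: read the superblock clean section
 * or the journal, replay journal keys, read btree roots, then run whichever
 * recovery passes the superblock and mount options require.
 */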
int bch2_fs_recovery(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean = NULL;
	struct jset *last_journal_entry = NULL;
	u64 last_seq = 0, blacklist_seq, journal_seq;
	int ret = 0;

	if (c->sb.clean) {
		clean = bch2_read_superblock_clean(c);
		ret = PTR_ERR_OR_ZERO(clean);
		if (ret)
			goto err;

		bch_info(c, "recovering from clean shutdown, journal seq %llu",
			 le64_to_cpu(clean->journal_seq));
	} else {
		bch_info(c, "recovering from unclean shutdown");
	}

	if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
		bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
		ret = -EINVAL;
		goto err;
	}

	if (!c->sb.clean &&
	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
		ret = -EINVAL;
		goto err;
	}

	if (c->opts.norecovery)
		c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	bool write_sb = false;

	if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
		ext->recovery_passes_required[0] |=
			cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
		write_sb = true;
	}

	u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
	if (sb_passes) {
		struct printbuf buf = PRINTBUF;
		prt_str(&buf, "superblock requires following recovery passes to be run:\n  ");
		prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	if (bch2_check_version_downgrade(c)) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Version downgrade required:");

		__le64 passes = ext->recovery_passes_required[0];
		bch2_sb_set_downgrade(c,
				      BCH_VERSION_MINOR(bcachefs_metadata_version_current),
				      BCH_VERSION_MINOR(c->sb.version));
		passes = ext->recovery_passes_required[0] & ~passes;
		if (passes) {
			prt_str(&buf, "\n  running recovery passes: ");
			prt_bitflags(&buf, bch2_recovery_passes,
				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
		}

		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
		write_sb = true;
	}

	if (check_version_upgrade(c))
		write_sb = true;

	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);

	if (c->opts.fsck)
		set_bit(BCH_FS_fsck_running, &c->flags);
	if (c->sb.clean)
		set_bit(BCH_FS_clean_recovery, &c->flags);

	ret = bch2_blacklist_table_initialize(c);
	if (ret) {
		bch_err(c, "error initializing blacklist table");
		goto err;
	}

	bch2_journal_pos_from_member_info_resume(c);

	if (!c->sb.clean || c->opts.retain_recovery_info) {
		struct genradix_iter iter;
		struct journal_replay **i;

		bch_verbose(c, "starting journal read");
		ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
		if (ret)
			goto err;

		/*
		 * note: cmd_list_journal needs the blacklist table fully up to date so
		 * it can asterisk ignored journal entries:
		 */
		if (c->opts.read_journal_only)
			goto out;

		genradix_for_each_reverse(&c->journal_entries, iter, i)
			if (!journal_replay_ignore(*i)) {
				last_journal_entry = &(*i)->j;
				break;
			}

		if (mustfix_fsck_err_on(c->sb.clean &&
					last_journal_entry &&
					!journal_entry_empty(last_journal_entry), c,
				clean_but_journal_not_empty,
				"filesystem marked clean but journal not empty")) {
			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
			c->sb.clean = false;
		}

		if (!last_journal_entry) {
			fsck_err_on(!c->sb.clean, c,
				    dirty_but_no_journal_entries,
				    "no journal entries found");
			if (clean)
				goto use_clean;

			genradix_for_each_reverse(&c->journal_entries, iter, i)
				if (*i) {
					last_journal_entry = &(*i)->j;
					(*i)->ignore_blacklisted = false;
					(*i)->ignore_not_dirty = false;
					/*
					 * This was probably a NO_FLUSH entry,
					 * so last_seq was garbage - but we know
					 * we're only using a single journal
					 * entry, set it here:
					 */
					(*i)->j.last_seq = (*i)->j.seq;
					break;
				}
		}

		ret = bch2_journal_keys_sort(c);
		if (ret)
			goto err;

		if (c->sb.clean && last_journal_entry) {
			ret = bch2_verify_superblock_clean(c, &clean,
							   last_journal_entry);
			if (ret)
				goto err;
		}
	} else {
use_clean:
		if (!clean) {
			bch_err(c, "no superblock clean section found");
			ret = -BCH_ERR_fsck_repair_impossible;
			goto err;
		}
		blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
	}

	c->journal_replay_seq_start	= last_seq;
	c->journal_replay_seq_end	= blacklist_seq - 1;

	if (c->opts.reconstruct_alloc)
		bch2_reconstruct_alloc(c);

	zero_out_btree_mem_ptr(&c->journal_keys);

	ret = journal_replay_early(c, clean);
	if (ret)
		goto err;

	/*
	 * After an unclean shutdown, skip the next few journal sequence
	 * numbers as they may have been referenced by btree writes that
	 * happened before their corresponding journal writes - those btree
	 * writes need to be ignored, by skipping and blacklisting the next few
	 * journal sequence numbers:
	 */
	if (!c->sb.clean)
		journal_seq += JOURNAL_BUF_NR * 4;

	if (blacklist_seq != journal_seq) {
		ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
					     blacklist_seq, journal_seq) ?:
			bch2_journal_seq_blacklist_add(c,
						       blacklist_seq, journal_seq);
		if (ret) {
			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
			goto err;
		}
	}

	ret =   bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
				     journal_seq, last_seq, blacklist_seq - 1) ?:
		bch2_fs_journal_start(&c->journal, journal_seq);
	if (ret)
		goto err;

	/*
	 * Skip past versions that might have possibly been used (as nonces),
	 * but hadn't had their pointers written:
	 */
	if (c->sb.encryption_type && !c->sb.clean)
		atomic64_add(1 << 16, &c->key_version);

	ret = read_btree_roots(c);
	if (ret)
		goto err;

	set_bit(BCH_FS_btree_running, &c->flags);

	ret = bch2_sb_set_upgrade_extra(c);

	ret = bch2_run_recovery_passes(c);
	if (ret)
		goto err;

	/*
	 * Normally set by the appropriate recovery pass: when cleared, this
	 * indicates we're in early recovery and btree updates should be done by
	 * being applied to the journal replay keys. _Must_ be cleared before
	 * multithreaded use:
	 */
	set_bit(BCH_FS_may_go_rw, &c->flags);
	clear_bit(BCH_FS_fsck_running, &c->flags);

	/* in case we don't run journal replay, i.e. norecovery mode */
	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	/* fsync if we fixed errors */
	if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
		bch2_journal_flush_all_pins(&c->journal);
		bch2_journal_meta(&c->journal);
		bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
	}

	/* If we fixed errors, verify that fs is actually clean now: */
	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
	    !test_bit(BCH_FS_error, &c->flags)) {
		bch2_flush_fsck_errs(c);

		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
		clear_bit(BCH_FS_errors_fixed, &c->flags);

		c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;

		ret = bch2_run_recovery_passes(c);
		if (ret)
			goto err;

		if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
		    test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
			bch_err(c, "Second fsck run was not clean");
			set_bit(BCH_FS_errors_not_fixed, &c->flags);
		}

		set_bit(BCH_FS_errors_fixed, &c->flags);
	}

	if (enabled_qtypes(c)) {
		bch_verbose(c, "reading quotas");
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
		bch_verbose(c, "quotas done");
	}

	mutex_lock(&c->sb_lock);
	ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	write_sb = false;

	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
		write_sb = true;
	}

	if (!test_bit(BCH_FS_error, &c->flags) &&
	    !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
		memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
		write_sb = true;
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
	    ext->btrees_lost_data) {
		ext->btrees_lost_data = 0;
		write_sb = true;
	}

	if (c->opts.fsck &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
		write_sb = true;
	}

	if (bch2_blacklist_entries_gc(c))
		write_sb = true;

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
		struct bch_move_stats stats;

		bch2_move_stats_init(&stats, "recovery");

		struct printbuf buf = PRINTBUF;
		bch2_version_to_text(&buf, c->sb.version_min);
		bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
		printbuf_exit(&buf);

		ret =   bch2_fs_read_write_early(c) ?:
			bch2_scan_old_btree_nodes(c, &stats);
		if (ret)
			goto err;
		bch_info(c, "scanning for old btree nodes done");
	}

	ret = 0;
out:
	bch2_flush_fsck_errs(c);

	if (!c->opts.retain_recovery_info) {
		bch2_journal_keys_put_initial(c);
		bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	}
	if (!IS_ERR(clean))
		kfree(clean);

	if (!ret &&
	    test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
	    !c->opts.nochanges) {
		bch2_fs_read_write_early(c);
		bch2_delete_dead_snapshots_async(c);
	}

	bch_err_fn(c, ret);
	return ret;
err:
fsck_err:
	bch2_fs_emergency_read_only(c);
	goto out;
}
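/*
 * Format-time initialization of a brand new filesystem: allocate the journal,
 * mark superblocks, initialize freespace and subvolumes, and create the root
 * and lost+found directories.
 */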
int bch2_fs_initialize(struct bch_fs *c)
{
	struct bch_inode_unpacked root_inode, lostfound_inode;
	struct bkey_inode_buf packed_inode;
	struct qstr lostfound = QSTR("lost+found");
	struct bch_member *m;
	int ret;

	bch_notice(c, "initializing new filesystem");
	set_bit(BCH_FS_new_fs, &c->flags);

	mutex_lock(&c->sb_lock);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);

	bch2_check_version_downgrade(c);

	if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
		bch2_sb_upgrade(c, bcachefs_metadata_version_current);
		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
		bch2_write_super(c);
	}

	for_each_member_device(c, ca) {
		m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
		SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
		ca->mi = bch2_mi_to_cpu(m);
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
	set_bit(BCH_FS_btree_running, &c->flags);
	set_bit(BCH_FS_may_go_rw, &c->flags);

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		bch2_btree_root_alloc_fake(c, i, 0);

	ret = bch2_fs_journal_alloc(c);
	if (ret)
		goto err;

	/*
	 * journal_res_get() will crash if called before this has
	 * set up the journal.pin FIFO and journal.cur pointer:
	 */
	bch2_fs_journal_start(&c->journal, 1);
	set_bit(BCH_FS_accounting_replay_done, &c->flags);
	bch2_journal_set_replay_done(&c->journal);

	ret = bch2_fs_read_write_early(c);
	if (ret)
		goto err;

	for_each_member_device(c, ca) {
		ret = bch2_dev_usage_init(ca, false);
		if (ret) {
			bch2_dev_put(ca);
			goto err;
		}
	}

	/*
	 * Write out the superblock and journal buckets, now that we can do
	 * btree updates
	 */
	bch_verbose(c, "marking superblocks");
	ret = bch2_trans_mark_dev_sbs(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto err;

	for_each_online_member(c, ca)
		ca->new_fs_bucket_idx = 0;

	ret = bch2_fs_freespace_init(c);
	if (ret)
		goto err;

	ret = bch2_initialize_subvolumes(c);
	if (ret)
		goto err;

	bch_verbose(c, "reading snapshots table");
	ret = bch2_snapshots_read(c);
	if (ret)
		goto err;
	bch_verbose(c, "reading snapshots done");

	bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
	root_inode.bi_inum	= BCACHEFS_ROOT_INO;
	root_inode.bi_subvol	= BCACHEFS_ROOT_SUBVOL;
	bch2_inode_pack(&packed_inode, &root_inode);
	packed_inode.inode.k.p.snapshot = U32_MAX;

	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
	bch_err_msg(c, ret, "creating root directory");
	if (ret)
		goto err;

	bch2_inode_init_early(c, &lostfound_inode);

	ret = bch2_trans_commit_do(c, NULL, NULL, 0,
		bch2_create_trans(trans,
				  BCACHEFS_ROOT_SUBVOL_INUM,
				  &root_inode, &lostfound_inode,
				  &lostfound,
				  0, 0, S_IFDIR|0700, 0,
				  NULL, NULL, (subvol_inum) { 0 }, 0));
	bch_err_msg(c, ret, "creating lost+found");
	if (ret)
		goto err;

	c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;

	if (enabled_qtypes(c)) {
		ret = bch2_fs_quota_read(c);
		if (ret)
			goto err;
	}

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(c, ret, "writing first journal entry");
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
err:
	bch_err_fn(c, ret);
	return ret;
}