// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>
/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */
void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}
int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				enum bch_validate_flags flags)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}
int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}
struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}
static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}
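
/*
 * In-memory snapshot ancestry checks:
 *
 * Until the check_snapshots recovery pass has run, the skiplist and
 * is_ancestor bitmap fields can't be trusted, so ancestry queries fall back
 * to walking parent pointers one node at a time:
 */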
static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);

		id = s ? s->parent : 0;
	}
	return id == ancestor;
}
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	rcu_read_lock();
	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
	rcu_read_unlock();

	return ret;
}
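
/*
 * Walk one step towards @ancestor using the skiplist: take the furthest
 * skiplist entry that doesn't overshoot @ancestor, falling back to the
 * direct parent:
 */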
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}
static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}
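
/*
 * Returns true if @ancestor is an ancestor of @id (or equal to it): the
 * skiplist gets us to within IS_ANCESTOR_BITMAP of @ancestor, then the
 * is_ancestor bitmap answers the query in O(1) - expected O(log n) overall:
 */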
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}
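
/*
 * The snapshot table is an RCU-protected flat array indexed by U32_MAX - id;
 * growing it means allocating a bigger array, copying the old contents over,
 * and freeing the old table after a grace period:
 */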
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
		   BCH_SNAPSHOT_SUBVOL(s.v),
		   BCH_SNAPSHOT_DELETED(s.v),
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}
int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}
static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
{
	struct snapshot_t *t = snapshot_t_mut(c, id);
	u32 parent = id;

	while ((parent = bch2_snapshot_parent_early(c, parent)) &&
	       parent - id - 1 < IS_ANCESTOR_BITMAP)
		__set_bit(parent - id - 1, t->is_ancestor);
}
static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	__set_is_ancestor_bitmap(c, id);
	mutex_unlock(&c->snapshot_table_lock);
}
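
/*
 * Trigger for the snapshots btree: keeps the in-memory snapshot_t entry for
 * @new in sync with the on-disk key, and kicks off dead snapshot cleanup if
 * the node is flagged as deleted:
 */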
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		__set_is_ancestor_bitmap(c, id);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}
static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
{
	struct bch_snapshot v;
	int ret;

	if (!id)
		return 0;

	ret = bch2_snapshot_lookup(trans, id, &v);
	if (bch2_err_matches(ret, ENOENT))
		bch_err(trans->c, "snapshot node %u not found", id);
	if (ret)
		return ret;

	return !BCH_SNAPSHOT_DELETED(&v);
}
/*
 * If @k is a snapshot with just one live child, it's part of a linear chain,
 * which we consider to be an equivalence class: and then after snapshot
 * deletion cleanup, there should only be a single key at a given position in
 * this equivalence class.
 *
 * This sets the equivalence class of @k to be the child's equivalence class, if
 * it's part of such a linear chain: this correctly sets equivalence classes on
 * startup if we run leaf to root (i.e. in natural key order).
 */
static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	unsigned i, nr_live = 0, live_idx = 0;
	struct bkey_s_c_snapshot snap;
	u32 id = k.k->p.offset, child[2];

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);

	child[0] = le32_to_cpu(snap.v->children[0]);
	child[1] = le32_to_cpu(snap.v->children[1]);

	for (i = 0; i < 2; i++) {
		int ret = bch2_snapshot_live(trans, child[i]);
		if (ret < 0)
			return ret;

		if (ret)
			live_idx = i;
		nr_live += ret;
	}

	mutex_lock(&c->snapshot_table_lock);

	snapshot_t_mut(c, id)->equiv = nr_live == 1
		? snapshot_t_mut(c, child[live_idx])->equiv
		: id;

	mutex_unlock(&c->snapshot_table_lock);

	return 0;
}
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}
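
/*
 * Pre-order depth-first traversal of a snapshot tree: descend to the left
 * child if there is one, otherwise backtrack to the first ancestor with an
 * unvisited right child; returns 0 when the traversal is complete:
 */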
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}
static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	while (id) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}

	return subvol;
}
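
/*
 * Find a master subvolume for @snapshot_root's tree: prefer an existing
 * non-snapshot subvolume within the tree; failing that, promote the oldest
 * subvolume in the tree by clearing its BCH_SUBVOLUME_SNAP flag:
 */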
static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}

	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	ret = bch2_snapshot_lookup(trans, root_id, &s);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
				 false, 0, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendent:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}
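
/*
 * Pick a skiplist target for a new node under @id: a randomly chosen ancestor
 * of @id (possibly @id itself), so that skiplist walks are expected
 * logarithmic in tree depth:
 */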
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}
/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}
static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n  %s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	if (bch2_snapshot_equiv(c, id))
		return 0;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
		bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
}
/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;
	struct bpos			cur_pos;
	snapshot_id_list		cur_ids;
	DARRAY(snapshot_id_list)	trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}
static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}
static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	darray_for_each(*l, i)
		if (snapshot_list_has_id(r, *i))
			return true;
	return false;
}
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;

	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}
static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}
static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}
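
/*
 * Scan every btree that uses snapshots, collecting the set of snapshot IDs
 * seen at each position into per-tree ID lists, then recreate any snapshot
 * nodes that are missing:
 */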
int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;
		}
	}

	snapshot_tree_reconstruct_next(c, &r);

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s;
	int ret = 0;

	s = bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}
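
/*
 * Splice a node out of the snapshot tree: repoint the parent's child pointer
 * and the child's parent pointer past the deleted node, and update or delete
 * the snapshot_tree entry if we're deleting a root:
 */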
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct btree_iter c_iter = (struct btree_iter) { NULL };
	struct btree_iter tree_iter = (struct btree_iter) { NULL };
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
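
/*
 * Allocate @nr_snapids new snapshot IDs, working downwards from the lowest
 * existing snapshot ID: IDs are handed out from the top of the keyspace
 * down, so ancestors always have higher IDs than their descendants:
 */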
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					   bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;

		mutex_lock(&c->snapshot_table_lock);
		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
		mutex_unlock(&c->snapshot_table_lock);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
					      u32 *new_snapids,
					      u32 *snapshot_subvols,
					      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
					  u32 *new_snapids,
					  u32 *snapshot_subvols,
					  unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			       new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}
int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);
}
/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * deleted as well, once that's resolved?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendent leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *deleted,
					     snapshot_id_list *equiv_seen,
					     struct bpos *last_pos)
{
	int ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret)
		return ret < 0 ? ret : 0;

	struct bch_fs *c = trans->c;
	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
	if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
		return 0;

	if (!bkey_eq(k.k->p, *last_pos))
		equiv_seen->nr = 0;

	if (snapshot_list_has_id(deleted, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	if (!bpos_eq(*last_pos, k.k->p) &&
	    snapshot_list_has_id(equiv_seen, equiv))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	*last_pos = k.k->p;

	ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
	if (ret)
		return ret;

	/*
	 * When we have a linear chain of snapshot nodes, we consider
	 * those to form an equivalence class: we're going to collapse
	 * them all down to a single node, and keep the leaf-most node -
	 * which has the same id as the equivalence class id.
	 *
	 * If there are multiple keys in different snapshots at the same
	 * position, we're only going to keep the one in the newest
	 * snapshot (we delete the others above) - the rest have been
	 * overwritten and are redundant, and for the key we're going to keep we
	 * need to move it to the equivalence class ID if it's not there
	 * already.
	 */
	if (equiv != k.k->p.snapshot) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = equiv;

		struct btree_iter new_iter;
		bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
				     BTREE_ITER_all_snapshots|
				     BTREE_ITER_cached|
				     BTREE_ITER_intent);

		ret =   bch2_btree_iter_traverse(&new_iter) ?:
			bch2_trans_update(trans, &new_iter, new,
					BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_btree_delete_at(trans, iter,
					BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &new_iter);
		if (ret)
			return ret;
	}

	return 0;
}
static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_s_c_snapshot snap;
	u32 children[2];
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    BCH_SNAPSHOT_SUBVOL(snap.v))
		return 0;

	children[0] = le32_to_cpu(snap.v->children[0]);
	children[1] = le32_to_cpu(snap.v->children[1]);

	ret   = bch2_snapshot_live(trans, children[0]) ?:
		bch2_snapshot_live(trans, children[1]);
	if (ret < 0)
		return ret;
	return !ret;
}
/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * deleted:
 */
static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
	int ret = bch2_snapshot_needs_delete(trans, k);

	return ret <= 0
		? ret
		: bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
}
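
/*
 * Like bch2_snapshot_nth_parent(), but any IDs in @skip are passed over
 * without being counted:
 */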
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						snapshot_id_list *skip)
{
	rcu_read_lock();
	while (snapshot_list_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (snapshot_list_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}
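
/*
 * Interior snapshot nodes in @deleted are about to be removed: for each
 * surviving node, subtract the number of deleted ancestors from its depth
 * field and repoint any skiplist entries that referenced a deleted node:
 */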
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      snapshot_id_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (snapshot_list_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (snapshot_list_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans;
	snapshot_id_list deleted = { 0 };
	snapshot_id_list deleted_interior = { 0 };
	int ret = 0;

	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	trans = bch2_trans_get(c);

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
			POS_MIN, 0, k,
			NULL, NULL, 0,
		bch2_delete_redundant_snapshot(trans, k));
	bch_err_msg(c, ret, "deleting redundant snapshots");
	if (ret)
		goto err;

	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
				 POS_MIN, 0, k,
		bch2_snapshot_set_equiv(trans, k));
	bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
	if (ret)
		goto err;

	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
				 POS_MIN, 0, k, ({
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
			? snapshot_list_add(c, &deleted, k.k->p.offset)
			: 0;
	}));
	bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct bpos last_pos = POS_MIN;
		snapshot_id_list equiv_seen = { 0 };
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
							  &equiv_seen, &last_pos));

		bch2_disk_reservation_put(c, &res);
		darray_exit(&equiv_seen);

		bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	bch2_trans_unlock(trans);
	down_write(&c->snapshot_create_lock);

	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
				 POS_MIN, 0, k, ({
		u32 snapshot = k.k->p.offset;
		u32 equiv = bch2_snapshot_equiv(c, snapshot);

		equiv != snapshot
			? snapshot_list_add(c, &deleted_interior, snapshot)
			: 0;
	}));
	bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err_create_lock;

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
	if (ret)
		goto err_create_lock;

	darray_for_each(deleted, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err_create_lock;
	}

	darray_for_each(deleted_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err_create_lock;
	}
err_create_lock:
	up_write(&c->snapshot_create_lock);
err:
	darray_exit(&deleted_interior);
	darray_exit(&deleted);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
	    !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
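
/*
 * Returns nonzero if a key at @pos exists in a snapshot that is a descendant
 * of pos.snapshot - i.e. if the key has been overwritten in a child snapshot:
 */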
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, id, pos,
			     BTREE_ITER_not_extents|
			     BTREE_ITER_all_snapshots);
	while (1) {
		k = bch2_btree_iter_prev(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		if (!k.k)
			break;

		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot snap;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
	    (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
		set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
		return 0;
	}

	return ret;
}
*c
)
1785 int ret
= bch2_trans_run(c
,
1786 for_each_btree_key(trans
, iter
, BTREE_ID_snapshots
,
1788 __bch2_mark_snapshot(trans
, BTREE_ID_snapshots
, 0, bkey_s_c_null
, k
, 0) ?:
1789 bch2_snapshot_set_equiv(trans
, k
) ?:
1790 bch2_check_snapshot_needs_deletion(trans
, k
)) ?:
1791 for_each_btree_key(trans
, iter
, BTREE_ID_snapshots
,
1793 (set_is_ancestor_bitmap(c
, k
.k
->p
.offset
), 0)));
1797 * It's important that we check if we need to reconstruct snapshots
1798 * before going RW, so we mark that pass as required in the superblock -
1799 * otherwise, we could end up deleting keys with missing snapshot nodes
1802 BUG_ON(!test_bit(BCH_FS_new_fs
, &c
->flags
) &&
1803 test_bit(BCH_FS_may_go_rw
, &c
->flags
));
1805 if (bch2_err_matches(ret
, EIO
) ||
1806 (c
->sb
.btrees_lost_data
& BIT_ULL(BTREE_ID_snapshots
)))
1807 ret
= bch2_run_explicit_recovery_pass_persistent(c
, BCH_RECOVERY_PASS_reconstruct_snapshots
);
1812 void bch2_fs_snapshots_exit(struct bch_fs
*c
)
1814 kvfree(rcu_dereference_protected(c
->snapshots
, true));