// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
#include "scrub/rcbag.h"

/*
 * Rebuilding the Reference Count Btree
 * ====================================
 *
 * This algorithm is "borrowed" from xfs_repair.  Imagine the rmap
 * entries as rectangles representing extents of physical blocks, and
 * that the rectangles can be laid down to allow them to overlap each
 * other; then we know that we must emit a refcnt btree entry wherever
 * the amount of overlap changes, i.e. the emission stimulus is
 * level-triggered:
 *
 *                 -    ---
 *       --      ----- ----   ---        ------
 * --   ----     ----------- ----     ---------
 * -------------------------------- -----------
 * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^ ^^ ^  ^     ^
 * 2 1  23 21    3 43 234  2123  1 01 2  3     0
 *
 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
 *
 * Note that in the actual refcnt btree we don't store the refcount < 2
 * cases because the bnobt tells us which blocks are free; single-use
 * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
 * supports storing multiple entries covering a given block we could
 * theoretically dispense with the refcntbt and simply count rmaps, but
 * that's inefficient in the (hot) write path, so we'll take the cost of
 * the extra tree to save time.  Also there's no guarantee that rmap
 * will be enabled.
 *
 * Given an array of rmaps sorted by physical block number, a starting
 * physical block (sp), a bag to hold rmaps that cover sp, and the next
 * physical block where the level changes (np), we can reconstruct the
 * refcount btree as follows:
 *
 * While there are still unprocessed rmaps in the array,
 *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
 *  - Add to the bag all rmaps in the array where startblock == sp.
 *  - Set np to the physical block where the bag size will change.  This
 *    is the minimum of (the pblk of the next unprocessed rmap) and
 *    (startblock + len of each rmap in the bag).
 *  - Record the bag size as old_bag_size.
 *
 *  - While the bag isn't empty,
 *     - Remove from the bag all rmaps where startblock + len == np.
 *     - Add to the bag all rmaps in the array where startblock == np.
 *     - If the bag size isn't old_bag_size, store the refcount entry
 *       (sp, np - sp, bag_size) in the refcnt btree.
 *     - If the bag is empty, break out of the inner loop.
 *     - Set old_bag_size to the bag size.
 *     - Set sp = np.
 *     - Set np to the physical block where the bag size will change.
 *       This is the minimum of (the pblk of the next unprocessed rmap)
 *       and (startblock + len of each rmap in the bag).
 *
 * Like all the other repairers, we make a list of all the refcount
 * records we need, then reinitialize the refcount btree root and
 * insert all the records.  A sketch of this sweep follows this comment.
 */
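
/*
 * Hedged, userspace-style sketch of the sweep above; this is not part of
 * the kernel build.  rmaps[], bag_add(), bag_count(), bag_min_endblock(),
 * bag_remove_ending_at(), min(), and emit() are hypothetical stand-ins
 * for the rmapbt cursor, the rcbag, and xrep_refc_stash() used by the
 * real code below.
 *
 *	void build_refcounts(const struct rmap *rmaps, size_t n)
 *	{
 *		size_t i = 0;
 *
 *		while (i < n) {
 *			uint32_t sp = rmaps[i].startblock;
 *			uint32_t cbno = sp;	// start of pending record
 *			uint32_t np;
 *			uint64_t old_size;
 *
 *			// Bag every rmap that starts at sp.
 *			while (i < n && rmaps[i].startblock == sp)
 *				bag_add(&rmaps[i++]);
 *
 *			// np = next place the overlap count changes.
 *			np = min(i < n ? rmaps[i].startblock : UINT32_MAX,
 *				 bag_min_endblock());
 *			old_size = bag_count();
 *
 *			while (bag_count() > 0) {
 *				bag_remove_ending_at(np);
 *				while (i < n && rmaps[i].startblock == np)
 *					bag_add(&rmaps[i++]);
 *
 *				// Overlap changed: [cbno, np) had refcount
 *				// old_size.  Skip refcount == 1 extents,
 *				// which the bnobt already implies.
 *				if (bag_count() != old_size) {
 *					if (old_size > 1)
 *						emit(cbno, np - cbno,
 *								old_size);
 *					cbno = np;
 *				}
 *				if (bag_count() == 0)
 *					break;
 *				old_size = bag_count();
 *				np = min(i < n ? rmaps[i].startblock :
 *							UINT32_MAX,
 *					 bag_min_endblock());
 *			}
 *		}
 *	}
 */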

struct xrep_refc {
	/* refcount extents */
	struct xfarray		*refcount_records;

	/* new refcountbt information */
	struct xrep_newbt	new_btree;

	/* old refcountbt blocks */
	struct xagb_bitmap	old_refcountbt_blocks;

	struct xfs_scrub	*sc;

	/* get_records()'s position in the refcount record array. */
	xfarray_idx_t		array_cur;

	/* # of refcountbt blocks */
	xfs_extlen_t		btblocks;
};

/* Set us up to repair refcount btrees. */
int
xrep_setup_ag_refcountbt(
	struct xfs_scrub	*sc)
{
	char			*descr;
	int			error;

	descr = xchk_xfile_ag_descr(sc, "rmap record bag");
	error = xrep_setup_xfbtree(sc, descr);
	kfree(descr);
	return error;
}

/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
	struct xfs_scrub		*sc,
	const struct xfs_refcount_irec	*rec)
{
	enum xbtree_recpacking		outcome;
	int				error;

	if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
			rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->rc_startblock, rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Record a reference count extent. */
STATIC int
xrep_refc_stash(
	struct xrep_refc	*rr,
	enum xfs_refc_domain	domain,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len,
	uint64_t		refcount)
{
	struct xfs_refcount_irec irec = {
		.rc_startblock	= agbno,
		.rc_blockcount	= len,
		.rc_domain	= domain,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);

	error = xrep_refc_check_ext(rr->sc, &irec);
	if (error)
		return error;

	trace_xrep_refc_found(sc->sa.pag, &irec);

	return xfarray_append(rr->refcount_records, &irec);
}
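
/*
 * Worked example of the min_t() clamp above (illustrative numbers only):
 * MAXREFCOUNT is the all-ones 32-bit value, so if the rcbag were to
 * report 0x100000005 owners of an extent, the stashed record stores
 * exactly MAXREFCOUNT.  Ondisk refcounts saturate at that ceiling rather
 * than wrapping around to a small (and dangerously wrong) value.
 */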

/* Record a CoW staging extent. */
STATIC int
xrep_refc_stash_cow(
	struct xrep_refc	*rr,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
}

/* Decide if an rmap could describe a shared extent. */
static inline bool
xrep_refc_rmap_shareable(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rmap)
{
	/* AG metadata are never shareable */
	if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
		return false;

	/* Metadata in files are never shareable */
	if (xfs_is_sb_inum(mp, rmap->rm_owner))
		return false;

	/* Metadata and unwritten file blocks are not shareable. */
	if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
			      XFS_RMAP_UNWRITTEN))
		return false;

	return true;
}

/*
 * Walk along the reverse mapping records until we find one that could describe
 * a shared extent.
 */
STATIC int
xrep_refc_walk_rmaps(
	struct xrep_refc	*rr,
	struct xfs_rmap_irec	*rmap,
	bool			*have_rec)
{
	struct xfs_btree_cur	*cur = rr->sc->sa.rmap_cur;
	struct xfs_mount	*mp = cur->bc_mp;
	int			have_gt;
	int			error = 0;

	*have_rec = false;

	/*
	 * Loop through the remaining rmaps.  Remember CoW staging
	 * extents and the refcountbt blocks from the old tree for later
	 * disposal.  We can only share written data fork extents, so
	 * keep looping until we find an rmap for one.
	 */
	do {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfs_btree_increment(cur, 0, &have_gt);
		if (error)
			return error;
		if (!have_gt)
			return 0;

		error = xfs_rmap_get_rec(cur, rmap, &have_gt);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(mp, !have_gt)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}

		if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
			error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
					rmap->rm_blockcount);
			if (error)
				return error;
		} else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
			/* refcountbt block, dump it when we're done. */
			rr->btblocks += rmap->rm_blockcount;
			error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
					rmap->rm_startblock,
					rmap->rm_blockcount);
			if (error)
				return error;
		}
	} while (!xrep_refc_rmap_shareable(mp, rmap));

	*have_rec = true;
	return 0;
}

static inline uint32_t
xrep_refc_encode_startblock(
	const struct xfs_refcount_irec	*irec)
{
	uint32_t			start;

	start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
	if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
		start |= XFS_REFC_COWFLAG;

	return start;
}
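
/*
 * Worked example (illustrative numbers): XFS_REFC_COWFLAG is the high
 * bit of the 32-bit startblock, so a shared-domain record at agbno 12
 * encodes to 12, while a CoW-domain record at agbno 12 encodes to
 * (12 | 0x80000000) = 0x8000000c.  Comparing encoded values therefore
 * sorts every CoW staging record after every shared record, which is
 * the ondisk key order.
 */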

/* Sort in the same order as the ondisk records. */
static int
xrep_refc_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_refcount_irec	*ap = a;
	const struct xfs_refcount_irec	*bp = b;
	uint32_t			sa, sb;

	sa = xrep_refc_encode_startblock(ap);
	sb = xrep_refc_encode_startblock(bp);

	if (sa > sb)
		return 1;
	else if (sa < sb)
		return -1;
	return 0;
}

/*
 * Sort the refcount extents by startblock or else the btree records will be in
 * the wrong order.  Make sure the records do not overlap in physical space.
 */
STATIC int
xrep_refc_sort_records(
	struct xrep_refc		*rr)
{
	struct xfs_refcount_irec	irec;
	xfarray_idx_t			cur;
	enum xfs_refc_domain		dom = XFS_REFC_DOMAIN_SHARED;
	xfs_agblock_t			next_agbno = 0;
	int				error;

	error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
			XFARRAY_SORT_KILLABLE);
	if (error)
		return error;

	foreach_xfarray_idx(rr->refcount_records, cur) {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfarray_load(rr->refcount_records, cur, &irec);
		if (error)
			return error;

		if (dom == XFS_REFC_DOMAIN_SHARED &&
		    irec.rc_domain == XFS_REFC_DOMAIN_COW) {
			dom = irec.rc_domain;
			next_agbno = 0;
		}

		if (dom != irec.rc_domain)
			return -EFSCORRUPTED;
		if (irec.rc_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = irec.rc_startblock + irec.rc_blockcount;
	}

	return error;
}

/*
 * Walk forward through the rmap btree to collect all rmaps starting at
 * @bno in @rcstack.  These represent the file(s) that share ownership of
 * the current block.  Upon return, the rmap cursor points to the last record
 * satisfying the startblock constraint.
 */
STATIC int
xrep_refc_push_rmaps_at(
	struct xrep_refc	*rr,
	struct rcbag		*rcstack,
	xfs_agblock_t		bno,
	struct xfs_rmap_irec	*rmap,
	bool			*have)
{
	struct xfs_scrub	*sc = rr->sc;
	int			have_gt;
	int			error;

	while (*have && rmap->rm_startblock == bno) {
		error = rcbag_add(rcstack, rr->sc->tp, rmap);
		if (error)
			return error;

		error = xrep_refc_walk_rmaps(rr, rmap, have);
		if (error)
			return error;
	}

	/* Step back to the last rmap that starts at @bno. */
	error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
		xfs_btree_mark_sick(sc->sa.rmap_cur);
		return -EFSCORRUPTED;
	}

	return 0;
}

/* Iterate all the rmap records to generate reference count data. */
STATIC int
xrep_refc_find_refcounts(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct rcbag		*rcstack;
	uint64_t		old_stack_height;
	xfs_agblock_t		sbno;
	xfs_agblock_t		cbno;
	xfs_agblock_t		nbno;
	bool			have;
	int			error;

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Set up a bag to store all the rmap records that we're tracking to
	 * generate a reference count record.  If the size of the bag exceeds
	 * MAXREFCOUNT, we clamp rc_refcount.
	 */
	error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
	if (error)
		goto out_cur;

	/* Start the rmapbt cursor to the left of all records. */
	error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
	if (error)
		goto out_bag;

	/* Process reverse mappings into refcount data. */
	while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
		struct xfs_rmap_irec	rmap;

		/* Push all rmaps with pblk == sbno onto the stack */
		error = xrep_refc_walk_rmaps(rr, &rmap, &have);
		if (error)
			goto out_bag;
		if (!have)
			break;
		sbno = cbno = rmap.rm_startblock;
		error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
				&have);
		if (error)
			goto out_bag;

		/* Set nbno to the bno of the next refcount change */
		error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
		if (error)
			goto out_bag;

		ASSERT(nbno > sbno);
		old_stack_height = rcbag_count(rcstack);

		/* While stack isn't empty... */
		while (rcbag_count(rcstack) > 0) {
			/* Pop all rmaps that end at nbno */
			error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
			if (error)
				goto out_bag;

			/* Push array items that start at nbno */
			error = xrep_refc_walk_rmaps(rr, &rmap, &have);
			if (error)
				goto out_bag;
			if (have) {
				error = xrep_refc_push_rmaps_at(rr, rcstack,
						nbno, &rmap, &have);
				if (error)
					goto out_bag;
			}

			/* Emit refcount if necessary */
			ASSERT(nbno > cbno);
			if (rcbag_count(rcstack) != old_stack_height) {
				if (old_stack_height > 1) {
					error = xrep_refc_stash(rr,
							XFS_REFC_DOMAIN_SHARED,
							cbno, nbno - cbno,
							old_stack_height);
					if (error)
						goto out_bag;
				}
				cbno = nbno;
			}

			/* Stack empty, go find the next rmap */
			if (rcbag_count(rcstack) == 0)
				break;
			old_stack_height = rcbag_count(rcstack);
			sbno = nbno;

			/* Set nbno to the bno of the next refcount change */
			error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
					&nbno);
			if (error)
				goto out_bag;

			ASSERT(nbno > sbno);
		}
	}

	ASSERT(rcbag_count(rcstack) == 0);
out_bag:
	rcbag_free(&rcstack);
out_cur:
	xchk_ag_btcur_free(&sc->sa);
	return error;
}

/* Retrieve refcountbt data for bulk load. */
STATIC int
xrep_refc_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
	struct xrep_refc		*rr = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(rr->refcount_records, rr->array_cur++,
				irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}
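
/*
 * Hedged sketch of the bulk-load flow assumed to drive the two callbacks
 * wired up in xrep_refc_build_new_tree() below; this is a simplification
 * of what xfs_btree_bload() does, not its actual code:
 *
 *	while (leaf records remain) {
 *		bload->claim_block(cur, &ptr, priv);	// take a reserved block
 *		nr = bload->get_records(cur, idx, block, nr_wanted, priv);
 *		// ...then link the block into the level above, and repeat
 *		// one level up until the new root is reached.
 *	}
 *
 * Staging each record in cur->bc_rec.rc above lets ->init_rec_from_cur()
 * do the ondisk encoding; the return value tells the loader how many
 * records actually landed in the block.
 */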

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_refc_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_refc	*rr = priv;

	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/* Update the AGF counters. */
STATIC int
xrep_refc_reset_counters(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the refcountbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_refcount_level = pag->pagf_refcount_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Use the collected refcount information to stage a new refcount btree.  If
 * this is successful we'll return with the new btree root information logged
 * to the repair transaction but not yet committed.
 */
STATIC int
xrep_refc_build_new_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*refc_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	error = xrep_refc_sort_records(rr);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC,
			xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)),
			XFS_AG_RESV_METADATA);
	rr->new_btree.bload.get_records = xrep_refc_get_records;
	rr->new_btree.bload.claim_block = xrep_refc_claim_block;

	/* Compute how many blocks we'll need. */
	refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
	error = xfs_btree_bload_compute_geometry(refc_cur,
			&rr->new_btree.bload,
			xfarray_length(rr->refcount_records));
	if (error)
		goto err_cur;

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btree. */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			rr->new_btree.bload.nr_blocks);
	if (error)
		goto err_cur;

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the incore btree height so
	 * that we don't trip the verifiers when writing the new btree blocks
	 * to disk.
	 */
	pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;

	/* Add all observed refcount records. */
	rr->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
	if (error)
		goto err_level;

	/*
	 * Install the new btree in the AG header.  After this point the old
	 * btree is no longer accessible and the new tree is live.
	 */
	xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(refc_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_refc_reset_counters(rr);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	error = xrep_newbt_commit(&rr->new_btree);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_level:
	pag->pagf_repair_refcount_level = 0;
err_cur:
	xfs_btree_del_cursor(refc_cur, error);
err_newbt:
	xrep_newbt_cancel(&rr->new_btree);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_refc_remove_old_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	/* Free the old refcountbt blocks if they're not in use. */
	error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old refcountbt blocks we can turn off
	 * the alternate height mechanism and reset the per-AG space
	 * reservation.
	 */
	pag->pagf_repair_refcount_level = 0;
	sc->flags |= XREP_RESET_PERAG_RESV;
	return 0;
}

/* Rebuild the refcount btree. */
int
xrep_refcountbt(
	struct xfs_scrub	*sc)
{
	struct xrep_refc	*rr;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;
	rr->sc = sc;

	/* Set up enough storage to handle one refcount record per block. */
	descr = xchk_xfile_ag_descr(sc, "reference count records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks,
			sizeof(struct xfs_refcount_irec),
			&rr->refcount_records);
	kfree(descr);
	if (error)
		goto out_rr;

	/* Collect all reference counts. */
	xagb_bitmap_init(&rr->old_refcountbt_blocks);
	error = xrep_refc_find_refcounts(rr);
	if (error)
		goto out_bitmap;

	/* Rebuild the refcount information. */
	error = xrep_refc_build_new_tree(rr);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_refc_remove_old_tree(rr);

out_bitmap:
	xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
	xfarray_destroy(rr->refcount_records);
out_rr:
	kfree(rr);
	return error;
}