// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
#include "scrub/rcbag.h"

/*
 * Rebuilding the Reference Count Btree
 * ====================================
 *
 * This algorithm is "borrowed" from xfs_repair.  Imagine the rmap
 * entries as rectangles representing extents of physical blocks, and
 * that the rectangles can be laid down to allow them to overlap each
 * other; then we know that we must emit a refcnt btree entry wherever
 * the amount of overlap changes, i.e. the emission stimulus is
 * level-triggered:
 *
 *                 -    ---
 *       --      ----- ----   ---        ------
 * --   ----     ----------- ----     ---------
 * -------------------------------- -----------
 * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^ ^^ ^  ^     ^
 * 2 1  23 21    3 43 234  2123  1 01 2  3     0
 *
 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
 *
 * Note that in the actual refcnt btree we don't store the refcount < 2
 * cases because the bnobt tells us which blocks are free; single-use
 * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
 * supports storing multiple entries covering a given block we could
 * theoretically dispense with the refcntbt and simply count rmaps, but
 * that's inefficient in the (hot) write path, so we'll take the cost of
 * the extra tree to save time.  Also there's no guarantee that rmap
 * will be enabled.
 *
 * Given an array of rmaps sorted by physical block number, a starting
 * physical block (sp), a bag to hold rmaps that cover sp, and the next
 * physical block where the level changes (np), we can reconstruct the
 * refcount btree as follows:
 *
 * While there are still unprocessed rmaps in the array,
 *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
 *  - Add to the bag all rmaps in the array where startblock == sp.
 *  - Set np to the physical block where the bag size will change.  This
 *    is the minimum of (the pblk of the next unprocessed rmap) and
 *    (startblock + len of each rmap in the bag).
 *  - Record the bag size as old_bag_size.
 *
 *  - While the bag isn't empty,
 *     - Remove from the bag all rmaps where startblock + len == np.
 *     - Add to the bag all rmaps in the array where startblock == np.
 *     - If the bag size isn't old_bag_size, store the refcount entry
 *       (sp, np - sp, bag_size) in the refcnt btree.
 *     - If the bag is empty, break out of the inner loop.
 *     - Set old_bag_size to the bag size.
 *     - Set sp = np.
 *     - Set np to the physical block where the bag size will change.
 *       This is the minimum of (the pblk of the next unprocessed rmap)
 *       and (startblock + len of each rmap in the bag).
 *
 * Like all the other repairers, we make a list of all the refcount
 * records we need, then reinitialize the refcount btree root and
 * insert all the records.
 */

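/*
 * A brief worked example of the loops above, with made-up rmaps keyed
 * as (startblock, len): given A = (0, 4), B = (2, 4), and C = (2, 2),
 * we start with sp = 0 and bag = {A}.  At np = 2, B and C join the
 * bag; the size during [0, 2) was 1, so no refcount entry is stored.
 * At np = 4, A and C leave the bag and the size changes from 3 to 1,
 * so we store (2, 2, 3).  At np = 6 the bag empties; [4, 6) also had a
 * refcount of 1 and is likewise not stored.
 */
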
struct xrep_refc {
	/* refcount extents */
	struct xfarray		*refcount_records;

	/* new refcountbt information */
	struct xrep_newbt	new_btree;

	/* old refcountbt blocks */
	struct xagb_bitmap	old_refcountbt_blocks;

	struct xfs_scrub	*sc;

	/* get_records()'s position in the refcount record array. */
	xfarray_idx_t		array_cur;

	/* # of refcountbt blocks */
	xfs_extlen_t		btblocks;
};

/* Set us up to repair refcount btrees. */
int
xrep_setup_ag_refcountbt(
	struct xfs_scrub	*sc)
{
	char			*descr;
	int			error;

	descr = xchk_xfile_ag_descr(sc, "rmap record bag");
	error = xrep_setup_xfbtree(sc, descr);
	kfree(descr);
	return error;
}

/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
	struct xfs_scrub		*sc,
	const struct xfs_refcount_irec	*rec)
{
	enum xbtree_recpacking		outcome;
	int				error;

	if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
			rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->rc_startblock, rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Record a reference count extent. */
STATIC int
xrep_refc_stash(
	struct xrep_refc	*rr,
	enum xfs_refc_domain	domain,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len,
	uint64_t		refcount)
{
	struct xfs_refcount_irec irec = {
		.rc_startblock	= agbno,
		.rc_blockcount	= len,
		.rc_domain	= domain,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);

	error = xrep_refc_check_ext(rr->sc, &irec);
	if (error)
		return error;

	trace_xrep_refc_found(sc->sa.pag, &irec);

	return xfarray_append(rr->refcount_records, &irec);
}

/* Record a CoW staging extent. */
STATIC int
xrep_refc_stash_cow(
	struct xrep_refc	*rr,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
}

/* Decide if an rmap could describe a shared extent. */
static inline bool
xrep_refc_rmap_shareable(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rmap)
{
	/* AG metadata are never shareable */
	if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
		return false;

	/* Metadata in files are never shareable */
	if (xfs_is_sb_inum(mp, rmap->rm_owner))
		return false;

	/* Metadata and unwritten file blocks are not shareable. */
	if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
			      XFS_RMAP_UNWRITTEN))
		return false;

	return true;
}

/*
 * Walk along the reverse mapping records until we find one that could describe
 * a shared extent.
 */
STATIC int
xrep_refc_walk_rmaps(
	struct xrep_refc	*rr,
	struct xfs_rmap_irec	*rmap,
	bool			*have_rec)
{
	struct xfs_btree_cur	*cur = rr->sc->sa.rmap_cur;
	struct xfs_mount	*mp = cur->bc_mp;
	int			have_gt;
	int			error = 0;

	*have_rec = false;

	/*
	 * Loop through the remaining rmaps.  Remember CoW staging
	 * extents and the refcountbt blocks from the old tree for later
	 * disposal.  We can only share written data fork extents, so
	 * keep looping until we find an rmap for one.
	 */
	do {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfs_btree_increment(cur, 0, &have_gt);
		if (error)
			return error;
		if (!have_gt)
			return 0;

		error = xfs_rmap_get_rec(cur, rmap, &have_gt);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(mp, !have_gt)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}

		if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
			error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
					rmap->rm_blockcount);
			if (error)
				return error;
		} else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
			/* refcountbt block, dump it when we're done. */
			rr->btblocks += rmap->rm_blockcount;
			error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
					rmap->rm_startblock,
					rmap->rm_blockcount);
			if (error)
				return error;
		}
	} while (!xrep_refc_rmap_shareable(mp, rmap));

	*have_rec = true;
	return 0;
}

static inline uint32_t
xrep_refc_encode_startblock(
	const struct xfs_refcount_irec	*irec)
{
	uint32_t			start;

	start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
	if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
		start |= XFS_REFC_COWFLAG;

	return start;
}

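/*
 * For example (values illustrative): a shared extent starting at agbno
 * 100 encodes as 0x64, while a CoW staging extent at the same agbno
 * encodes as 0x80000064, since XFS_REFC_COWFLAG is the high bit of the
 * ondisk startblock.  All CoW staging records therefore sort after all
 * shared records.
 */
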
/* Sort in the same order as the ondisk records. */
static int
xrep_refc_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_refcount_irec	*ap = a;
	const struct xfs_refcount_irec	*bp = b;
	uint32_t			sa, sb;

	sa = xrep_refc_encode_startblock(ap);
	sb = xrep_refc_encode_startblock(bp);

	if (sa > sb)
		return 1;
	if (sa < sb)
		return -1;
	return 0;
}

/*
 * Sort the refcount extents by startblock or else the btree records will be in
 * the wrong order.  Make sure the records do not overlap in physical space.
 */
STATIC int
xrep_refc_sort_records(
	struct xrep_refc		*rr)
{
	struct xfs_refcount_irec	irec;
	xfarray_idx_t			cur;
	enum xfs_refc_domain		dom = XFS_REFC_DOMAIN_SHARED;
	xfs_agblock_t			next_agbno = 0;
	int				error;

	error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
			XFARRAY_SORT_KILLABLE);
	if (error)
		return error;

	foreach_xfarray_idx(rr->refcount_records, cur) {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfarray_load(rr->refcount_records, cur, &irec);
		if (error)
			return error;

		if (dom == XFS_REFC_DOMAIN_SHARED &&
		    irec.rc_domain == XFS_REFC_DOMAIN_COW) {
			dom = irec.rc_domain;
			next_agbno = 0;
		}

		if (dom != irec.rc_domain)
			return -EFSCORRUPTED;
		if (irec.rc_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = irec.rc_startblock + irec.rc_blockcount;
	}

	return error;
}

/*
 * Walk forward through the rmap btree to collect all rmaps starting at
 * @bno in @rmap_bag.  These represent the file(s) that share ownership of
 * the current block.  Upon return, the rmap cursor points to the last record
 * satisfying the startblock constraint.
 */
static int
xrep_refc_push_rmaps_at(
	struct xrep_refc	*rr,
	struct rcbag		*rcstack,
	xfs_agblock_t		bno,
	struct xfs_rmap_irec	*rmap,
	bool			*have)
{
	struct xfs_scrub	*sc = rr->sc;
	int			have_gt;
	int			error;

	while (*have && rmap->rm_startblock == bno) {
		error = rcbag_add(rcstack, rr->sc->tp, rmap);
		if (error)
			return error;

		error = xrep_refc_walk_rmaps(rr, rmap, have);
		if (error)
			return error;
	}

	error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
		xfs_btree_mark_sick(sc->sa.rmap_cur);
		return -EFSCORRUPTED;
	}

	return 0;
}

/* Iterate all the rmap records to generate reference count data. */
STATIC int
xrep_refc_find_refcounts(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct rcbag		*rcstack;
	uint64_t		old_stack_height;
	xfs_agblock_t		sbno;
	xfs_agblock_t		cbno;
	xfs_agblock_t		nbno;
	bool			have;
	int			error;

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Set up a bag to store all the rmap records that we're tracking to
	 * generate a reference count record.  If the size of the bag exceeds
	 * MAXREFCOUNT, we clamp rc_refcount.
	 */
	error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
	if (error)
		goto out_cur;

	/* Start the rmapbt cursor to the left of all records. */
	error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
	if (error)
		goto out_bag;

	/* Process reverse mappings into refcount data. */
	while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
		struct xfs_rmap_irec	rmap;

		/* Push all rmaps with pblk == sbno onto the stack */
		error = xrep_refc_walk_rmaps(rr, &rmap, &have);
		if (error)
			goto out_bag;
		if (!have)
			break;
		sbno = cbno = rmap.rm_startblock;
		error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
				&have);
		if (error)
			goto out_bag;

		/* Set nbno to the bno of the next refcount change */
		error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
		if (error)
			goto out_bag;

		ASSERT(nbno > sbno);
		old_stack_height = rcbag_count(rcstack);

		/* While stack isn't empty... */
		while (rcbag_count(rcstack) > 0) {
			/* Pop all rmaps that end at nbno */
			error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
			if (error)
				goto out_bag;

			/* Push array items that start at nbno */
			error = xrep_refc_walk_rmaps(rr, &rmap, &have);
			if (error)
				goto out_bag;
			if (have) {
				error = xrep_refc_push_rmaps_at(rr, rcstack,
						nbno, &rmap, &have);
				if (error)
					goto out_bag;
			}

			/* Emit refcount if necessary */
			ASSERT(nbno > cbno);
			if (rcbag_count(rcstack) != old_stack_height) {
				if (old_stack_height > 1) {
					error = xrep_refc_stash(rr,
							XFS_REFC_DOMAIN_SHARED,
							cbno, nbno - cbno,
							old_stack_height);
					if (error)
						goto out_bag;
				}
				cbno = nbno;
			}

			/* Stack empty, go find the next rmap */
			if (rcbag_count(rcstack) == 0)
				break;
			old_stack_height = rcbag_count(rcstack);
			sbno = nbno;

			/* Set nbno to the bno of the next refcount change */
			error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
					&nbno);
			if (error)
				goto out_bag;

			ASSERT(nbno > sbno);
		}
	}

	ASSERT(rcbag_count(rcstack) == 0);
out_bag:
	rcbag_free(&rcstack);
out_cur:
	xchk_ag_btcur_free(&sc->sa);
	return error;
}

/* Retrieve refcountbt data for bulk load. */
STATIC int
xrep_refc_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
	struct xrep_refc		*rr = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(rr->refcount_records, rr->array_cur++,
				irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_refc_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_refc	*rr = priv;

	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/* Update the AGF counters. */
STATIC int
xrep_refc_reset_counters(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the refcountbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_refcount_level = pag->pagf_refcount_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Use the collected refcount information to stage a new refcount btree.  If
 * this is successful we'll return with the new btree root information logged
 * to the repair transaction but not yet committed.
 */
STATIC int
xrep_refc_build_new_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*refc_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	error = xrep_refc_sort_records(rr);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC,
			xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)),
			XFS_AG_RESV_METADATA);
	rr->new_btree.bload.get_records = xrep_refc_get_records;
	rr->new_btree.bload.claim_block = xrep_refc_claim_block;

	/* Compute how many blocks we'll need. */
	refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
	error = xfs_btree_bload_compute_geometry(refc_cur,
			&rr->new_btree.bload,
			xfarray_length(rr->refcount_records));
	if (error)
		goto err_cur;

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btree. */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			rr->new_btree.bload.nr_blocks);
	if (error)
		goto err_cur;

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the incore btree height so
	 * that we don't trip the verifiers when writing the new btree blocks
	 * to disk.
	 */
	pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;

	/* Add all observed refcount records. */
	rr->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
	if (error)
		goto err_level;

	/*
	 * Install the new btree in the AG header.  After this point the old
	 * btree is no longer accessible and the new tree is live.
	 */
	xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(refc_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_refc_reset_counters(rr);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	error = xrep_newbt_commit(&rr->new_btree);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_level:
	pag->pagf_repair_refcount_level = 0;
err_cur:
	xfs_btree_del_cursor(refc_cur, error);
err_newbt:
	xrep_newbt_cancel(&rr->new_btree);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_refc_remove_old_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	/* Free the old refcountbt blocks if they're not in use. */
	error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old refcountbt blocks we can turn off
	 * the alternate height mechanism and reset the per-AG space
	 * reservations.
	 */
	pag->pagf_repair_refcount_level = 0;
	sc->flags |= XREP_RESET_PERAG_RESV;
	return 0;
}

/* Rebuild the refcount btree. */
int
xrep_refcountbt(
	struct xfs_scrub	*sc)
{
	struct xrep_refc	*rr;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;
	rr->sc = sc;

	/* Set up enough storage to handle one refcount record per block. */
	descr = xchk_xfile_ag_descr(sc, "reference count records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks,
			sizeof(struct xfs_refcount_irec),
			&rr->refcount_records);
	kfree(descr);
	if (error)
		goto out_rr;

	/* Collect all reference counts. */
	xagb_bitmap_init(&rr->old_refcountbt_blocks);
	error = xrep_refc_find_refcounts(rr);
	if (error)
		goto out_bitmap;

	/* Rebuild the refcount information. */
	error = xrep_refc_build_new_tree(rr);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_refc_remove_old_tree(rr);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
	xfarray_destroy(rr->refcount_records);
out_rr:
	kfree(rr);
	return error;
}