1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_inode.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_alloc.h"
21 #include "xfs_refcount.h"
22 #include "xfs_quota.h"
23 #include "xfs_ialloc.h"
25 #include "xfs_error.h"
26 #include "xfs_errortag.h"
27 #include "xfs_icache.h"
28 #include "xfs_refcount_btree.h"
29 #include "scrub/xfs_scrub.h"
30 #include "scrub/scrub.h"
31 #include "scrub/common.h"
32 #include "scrub/trace.h"
33 #include "scrub/repair.h"
34 #include "scrub/bitmap.h"
35 #include "scrub/off_bitmap.h"
36 #include "scrub/fsb_bitmap.h"
37 #include "scrub/reap.h"
40 * CoW Fork Mapping Repair
41 * =======================
43 * Although CoW staging extents are owned by incore CoW inode forks, on disk
44 * they are owned by the refcount btree. The ondisk metadata does not record
45 * any ownership information, which limits what we can do to repair the
46 * mappings in the CoW fork. At most, we can replace ifork mappings that lack
47 * an entry in the refcount btree or are described by a reverse mapping record
48 * whose owner is not OWN_COW.
50 * Replacing extents is also tricky -- we can't touch written CoW fork extents
51 * since they are undergoing writeback, and delalloc extents do not require
52 * repair since they only exist incore. Hence the most we can do is find the
53 * bad parts of unwritten mappings, allocate a replacement set of blocks, and
54 * replace the incore mapping. We use the regular reaping process to unmap
55 * or free the discarded blocks, as appropriate.
60 /* Bitmap of file offset ranges that need replacing. */
61 struct xoff_bitmap bad_fileoffs
;
63 /* Bitmap of fsblocks that were removed from the CoW fork. */
64 struct xfsb_bitmap old_cowfork_fsblocks
;
66 /* CoW fork mappings used to scan for bad CoW staging extents. */
67 struct xfs_bmbt_irec irec
;
69 /* refcount btree block number of irec.br_startblock */
70 unsigned int irec_startbno
;
72 /* refcount btree block number of the next refcount record we expect */
73 unsigned int next_bno
;
76 /* CoW staging extent. */
77 struct xrep_cow_extent
{
83 * Mark the part of the file range that corresponds to the given physical
84 * space. Caller must ensure that the physical range is within xc->irec.
87 xrep_cow_mark_file_range(
89 xfs_fsblock_t startblock
,
90 xfs_filblks_t blockcount
)
92 xfs_fileoff_t startoff
;
94 startoff
= xc
->irec
.br_startoff
+
95 (startblock
- xc
->irec
.br_startblock
);
97 trace_xrep_cow_mark_file_range(xc
->sc
->ip
, startblock
, startoff
,
100 return xoff_bitmap_set(&xc
->bad_fileoffs
, startoff
, blockcount
);
104 * Trim @src to fit within the CoW fork mapping being examined, and put the
108 xrep_cow_trim_refcount(
110 struct xfs_refcount_irec
*dst
,
111 const struct xfs_refcount_irec
*src
)
115 memcpy(dst
, src
, sizeof(*dst
));
117 if (dst
->rc_startblock
< xc
->irec_startbno
) {
118 adj
= xc
->irec_startbno
- dst
->rc_startblock
;
119 dst
->rc_blockcount
-= adj
;
120 dst
->rc_startblock
+= adj
;
123 if (dst
->rc_startblock
+ dst
->rc_blockcount
>
124 xc
->irec_startbno
+ xc
->irec
.br_blockcount
) {
125 adj
= (dst
->rc_startblock
+ dst
->rc_blockcount
) -
126 (xc
->irec_startbno
+ xc
->irec
.br_blockcount
);
127 dst
->rc_blockcount
-= adj
;
131 /* Mark any shared CoW staging extents. */
133 xrep_cow_mark_shared_staging(
134 struct xfs_btree_cur
*cur
,
135 const struct xfs_refcount_irec
*rec
,
138 struct xrep_cow
*xc
= priv
;
139 struct xfs_refcount_irec rrec
;
141 if (!xfs_refcount_check_domain(rec
) ||
142 rec
->rc_domain
!= XFS_REFC_DOMAIN_SHARED
)
143 return -EFSCORRUPTED
;
145 xrep_cow_trim_refcount(xc
, &rrec
, rec
);
147 return xrep_cow_mark_file_range(xc
,
148 xfs_agbno_to_fsb(to_perag(cur
->bc_group
),
154 * Mark any portion of the CoW fork file offset range where there is not a CoW
155 * staging extent record in the refcountbt, and keep a record of where we did
156 * find correct refcountbt records. Staging records are always cleaned out at
157 * mount time, so any two inodes trying to map the same staging area would have
158 * already taken the fs down due to refcount btree verifier errors. Hence this
159 * inode should be the sole creator of the staging extent records ondisk.
162 xrep_cow_mark_missing_staging(
163 struct xfs_btree_cur
*cur
,
164 const struct xfs_refcount_irec
*rec
,
167 struct xrep_cow
*xc
= priv
;
168 struct xfs_refcount_irec rrec
;
171 if (!xfs_refcount_check_domain(rec
) ||
172 rec
->rc_domain
!= XFS_REFC_DOMAIN_COW
)
173 return -EFSCORRUPTED
;
175 xrep_cow_trim_refcount(xc
, &rrec
, rec
);
177 if (xc
->next_bno
>= rrec
.rc_startblock
)
181 error
= xrep_cow_mark_file_range(xc
,
182 xfs_agbno_to_fsb(to_perag(cur
->bc_group
), xc
->next_bno
),
183 rrec
.rc_startblock
- xc
->next_bno
);
188 xc
->next_bno
= rrec
.rc_startblock
+ rrec
.rc_blockcount
;
193 * Mark any area that does not correspond to a CoW staging rmap. These are
194 * cross-linked areas that must be avoided.
197 xrep_cow_mark_missing_staging_rmap(
198 struct xfs_btree_cur
*cur
,
199 const struct xfs_rmap_irec
*rec
,
202 struct xrep_cow
*xc
= priv
;
203 xfs_agblock_t rec_bno
;
204 xfs_extlen_t rec_len
;
207 if (rec
->rm_owner
== XFS_RMAP_OWN_COW
)
210 rec_bno
= rec
->rm_startblock
;
211 rec_len
= rec
->rm_blockcount
;
212 if (rec_bno
< xc
->irec_startbno
) {
213 adj
= xc
->irec_startbno
- rec_bno
;
218 if (rec_bno
+ rec_len
> xc
->irec_startbno
+ xc
->irec
.br_blockcount
) {
219 adj
= (rec_bno
+ rec_len
) -
220 (xc
->irec_startbno
+ xc
->irec
.br_blockcount
);
224 return xrep_cow_mark_file_range(xc
,
225 xfs_agbno_to_fsb(to_perag(cur
->bc_group
), rec_bno
),
230 * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
231 * extent and mark the corresponding part of the file range in the bitmap.
237 struct xfs_refcount_irec rc_low
= { 0 };
238 struct xfs_refcount_irec rc_high
= { 0 };
239 struct xfs_rmap_irec rm_low
= { 0 };
240 struct xfs_rmap_irec rm_high
= { 0 };
241 struct xfs_perag
*pag
;
242 struct xfs_scrub
*sc
= xc
->sc
;
246 agno
= XFS_FSB_TO_AGNO(sc
->mp
, xc
->irec
.br_startblock
);
247 xc
->irec_startbno
= XFS_FSB_TO_AGBNO(sc
->mp
, xc
->irec
.br_startblock
);
249 pag
= xfs_perag_get(sc
->mp
, agno
);
251 return -EFSCORRUPTED
;
253 error
= xrep_ag_init(sc
, pag
, &sc
->sa
);
257 /* Mark any CoW fork extents that are shared. */
258 rc_low
.rc_startblock
= xc
->irec_startbno
;
259 rc_high
.rc_startblock
= xc
->irec_startbno
+ xc
->irec
.br_blockcount
- 1;
260 rc_low
.rc_domain
= rc_high
.rc_domain
= XFS_REFC_DOMAIN_SHARED
;
261 error
= xfs_refcount_query_range(sc
->sa
.refc_cur
, &rc_low
, &rc_high
,
262 xrep_cow_mark_shared_staging
, xc
);
266 /* Make sure there are CoW staging extents for the whole mapping. */
267 rc_low
.rc_startblock
= xc
->irec_startbno
;
268 rc_high
.rc_startblock
= xc
->irec_startbno
+ xc
->irec
.br_blockcount
- 1;
269 rc_low
.rc_domain
= rc_high
.rc_domain
= XFS_REFC_DOMAIN_COW
;
270 xc
->next_bno
= xc
->irec_startbno
;
271 error
= xfs_refcount_query_range(sc
->sa
.refc_cur
, &rc_low
, &rc_high
,
272 xrep_cow_mark_missing_staging
, xc
);
276 if (xc
->next_bno
< xc
->irec_startbno
+ xc
->irec
.br_blockcount
) {
277 error
= xrep_cow_mark_file_range(xc
,
278 xfs_agbno_to_fsb(pag
, xc
->next_bno
),
279 xc
->irec_startbno
+ xc
->irec
.br_blockcount
-
285 /* Mark any area that has an rmap that isn't a COW staging extent. */
286 rm_low
.rm_startblock
= xc
->irec_startbno
;
287 memset(&rm_high
, 0xFF, sizeof(rm_high
));
288 rm_high
.rm_startblock
= xc
->irec_startbno
+ xc
->irec
.br_blockcount
- 1;
289 error
= xfs_rmap_query_range(sc
->sa
.rmap_cur
, &rm_low
, &rm_high
,
290 xrep_cow_mark_missing_staging_rmap
, xc
);
295 * If userspace is forcing us to rebuild the CoW fork or someone turned
296 * on the debugging knob, replace everything in the CoW fork.
298 if ((sc
->sm
->sm_flags
& XFS_SCRUB_IFLAG_FORCE_REBUILD
) ||
299 XFS_TEST_ERROR(false, sc
->mp
, XFS_ERRTAG_FORCE_SCRUB_REPAIR
)) {
300 error
= xrep_cow_mark_file_range(xc
, xc
->irec
.br_startblock
,
301 xc
->irec
.br_blockcount
);
307 xchk_ag_free(sc
, &sc
->sa
);
314 * Allocate a replacement CoW staging extent of up to the given number of
315 * blocks, and fill out the mapping.
319 struct xfs_scrub
*sc
,
321 struct xrep_cow_extent
*repl
)
323 struct xfs_alloc_arg args
= {
326 .oinfo
= XFS_RMAP_OINFO_SKIP_UPDATE
,
330 .resv
= XFS_AG_RESV_NONE
,
331 .datatype
= XFS_ALLOC_USERDATA
,
335 error
= xfs_trans_reserve_more(sc
->tp
, maxlen
, 0);
339 error
= xfs_alloc_vextent_start_ag(&args
,
340 XFS_INO_TO_FSB(sc
->mp
, sc
->ip
->i_ino
));
343 if (args
.fsbno
== NULLFSBLOCK
)
346 xfs_refcount_alloc_cow_extent(sc
->tp
, args
.fsbno
, args
.len
);
348 repl
->fsbno
= args
.fsbno
;
349 repl
->len
= args
.len
;
354 * Look up the current CoW fork mapping so that we only allocate enough to
355 * replace a single mapping. If we don't find a mapping that covers the start
356 * of the file range, or we find a delalloc or written extent, something is
357 * seriously wrong, since we didn't drop the ILOCK.
360 xrep_cow_find_mapping(
362 struct xfs_iext_cursor
*icur
,
363 xfs_fileoff_t startoff
,
364 struct xfs_bmbt_irec
*got
)
366 struct xfs_inode
*ip
= xc
->sc
->ip
;
367 struct xfs_ifork
*ifp
= xfs_ifork_ptr(ip
, XFS_COW_FORK
);
369 if (!xfs_iext_lookup_extent(ip
, ifp
, startoff
, icur
, got
))
372 if (got
->br_startoff
> startoff
)
375 if (got
->br_blockcount
== 0)
378 if (isnullstartblock(got
->br_startblock
))
381 if (xfs_bmap_is_written_extent(got
))
387 return -EFSCORRUPTED
;
390 #define REPLACE_LEFT_SIDE (1U << 0)
391 #define REPLACE_RIGHT_SIDE (1U << 1)
394 * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
395 * beginning of @got with the space described by @repl.
398 xrep_cow_replace_mapping(
399 struct xfs_inode
*ip
,
400 struct xfs_iext_cursor
*icur
,
401 const struct xfs_bmbt_irec
*got
,
402 const struct xrep_cow_extent
*repl
)
404 struct xfs_bmbt_irec
new = *got
; /* struct copy */
406 ASSERT(repl
->len
> 0);
407 ASSERT(!isnullstartblock(got
->br_startblock
));
409 trace_xrep_cow_replace_mapping(ip
, got
, repl
->fsbno
, repl
->len
);
411 if (got
->br_blockcount
== repl
->len
) {
413 * The new extent is a complete replacement for the existing
414 * extent. Update the COW fork record.
416 new.br_startblock
= repl
->fsbno
;
417 xfs_iext_update_extent(ip
, BMAP_COWFORK
, icur
, &new);
422 * The new extent can replace the beginning of the COW fork record.
423 * Move the left side of @got upwards, then insert the new record.
425 new.br_startoff
+= repl
->len
;
426 new.br_startblock
+= repl
->len
;
427 new.br_blockcount
-= repl
->len
;
428 xfs_iext_update_extent(ip
, BMAP_COWFORK
, icur
, &new);
430 new.br_startoff
= got
->br_startoff
;
431 new.br_startblock
= repl
->fsbno
;
432 new.br_blockcount
= repl
->len
;
433 xfs_iext_insert(ip
, icur
, &new, BMAP_COWFORK
);
437 * Replace the unwritten CoW staging extent backing the given file range with a
438 * new space extent that isn't as problematic.
441 xrep_cow_replace_range(
443 xfs_fileoff_t startoff
,
444 xfs_extlen_t
*blockcount
)
446 struct xfs_iext_cursor icur
;
447 struct xrep_cow_extent repl
;
448 struct xfs_bmbt_irec got
;
449 struct xfs_scrub
*sc
= xc
->sc
;
450 xfs_fileoff_t nextoff
;
451 xfs_extlen_t alloc_len
;
455 * Put the existing CoW fork mapping in @got. If @got ends before the
456 * requested range does, clamp the range to replace one mapping at a time.
458 error
= xrep_cow_find_mapping(xc
, &icur
, startoff
, &got
);
461 nextoff
= min(startoff
+ *blockcount
,
462 got
.br_startoff
+ got
.br_blockcount
);
465 * Allocate a replacement extent. If we don't fill all the blocks,
466 * shorten the quantity that will be deleted in this step.
468 alloc_len
= min_t(xfs_fileoff_t
, XFS_MAX_BMBT_EXTLEN
,
470 error
= xrep_cow_alloc(sc
, alloc_len
, &repl
);
475 * Replace the old mapping with the new one, and commit the metadata
476 * changes made so far.
478 xrep_cow_replace_mapping(sc
->ip
, &icur
, &got
, &repl
);
480 xfs_inode_set_cowblocks_tag(sc
->ip
);
481 error
= xfs_defer_finish(&sc
->tp
);
485 /* Note the old CoW staging extents; we'll reap them all later. */
486 error
= xfsb_bitmap_set(&xc
->old_cowfork_fsblocks
, got
.br_startblock
,
491 *blockcount
= repl
.len
;
496 * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
505 struct xrep_cow
*xc
= priv
;
508 while (blockcount
> 0) {
509 xfs_extlen_t len
= min_t(xfs_filblks_t
, blockcount
,
510 XFS_MAX_BMBT_EXTLEN
);
512 error
= xrep_cow_replace_range(xc
, startoff
, &len
);
524 * Repair an inode's CoW fork. The CoW fork is an in-core structure, so
525 * there's no btree to rebuild. Instead, we replace any mappings that are
526 * cross-linked or lack ondisk CoW fork records in the refcount btree.
530 struct xfs_scrub
*sc
)
533 struct xfs_iext_cursor icur
;
534 struct xfs_ifork
*ifp
= xfs_ifork_ptr(sc
->ip
, XFS_COW_FORK
);
537 if (!xfs_has_rmapbt(sc
->mp
) || !xfs_has_reflink(sc
->mp
))
543 /* realtime files aren't supported yet */
544 if (XFS_IS_REALTIME_INODE(sc
->ip
))
548 * If we're somehow not in extents format, then reinitialize it to
549 * an empty extent mapping fork and exit.
551 if (ifp
->if_format
!= XFS_DINODE_FMT_EXTENTS
) {
552 ifp
->if_format
= XFS_DINODE_FMT_EXTENTS
;
553 ifp
->if_nextents
= 0;
557 xc
= kzalloc(sizeof(struct xrep_cow
), XCHK_GFP_FLAGS
);
561 xfs_trans_ijoin(sc
->tp
, sc
->ip
, 0);
564 xoff_bitmap_init(&xc
->bad_fileoffs
);
565 xfsb_bitmap_init(&xc
->old_cowfork_fsblocks
);
567 for_each_xfs_iext(ifp
, &icur
, &xc
->irec
) {
568 if (xchk_should_terminate(sc
, &error
))
572 * delalloc reservations only exist incore, so there is no
573 * ondisk metadata that we can examine. Hence we leave them
576 if (isnullstartblock(xc
->irec
.br_startblock
))
580 * COW fork extents are only in the written state if writeback
581 * is actively writing to disk. We cannot restart the write
582 * at a different disk address since we've already issued the
583 * IO, so we leave these alone and hope for the best.
585 if (xfs_bmap_is_written_extent(&xc
->irec
))
588 error
= xrep_cow_find_bad(xc
);
593 /* Replace any bad unwritten mappings with fresh reservations. */
594 error
= xoff_bitmap_walk(&xc
->bad_fileoffs
, xrep_cow_replace
, xc
);
599 * Reap as many of the old CoW blocks as we can. They are owned ondisk
600 * by the refcount btree, not the inode, so it is correct to treat them
601 * like inode metadata.
603 error
= xrep_reap_fsblocks(sc
, &xc
->old_cowfork_fsblocks
,
604 &XFS_RMAP_OINFO_COW
);
609 xfsb_bitmap_destroy(&xc
->old_cowfork_fsblocks
);
610 xoff_bitmap_destroy(&xc
->bad_fileoffs
);