// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2017 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"

/*
 * Set us up to scrub inode btrees.
 * If we detect a discrepancy between the inobt and the inode,
 * try again after forcing logged inode cores out to disk.
 */
int
xchk_setup_ag_iallocbt(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER);
}

/* Inode btree scrubber. */

struct xchk_iallocbt {
	/* Number of inodes we see while scanning inobt. */
	unsigned long long	inodes;

	/* Expected next startino, for big block filesystems. */
	xfs_agino_t		next_startino;

	/* Expected end of the current inode cluster. */
	xfs_agino_t		next_cluster_ino;
};

/*
 * If we're checking the finobt, cross-reference with the inobt.
 * Otherwise we're checking the inobt; if there is an finobt, make sure
 * we have a record or not depending on freecount.
 */
static inline void
xchk_iallocbt_chunk_xref_other(
	struct xfs_scrub		*sc,
	struct xfs_inobt_rec_incore	*irec,
	xfs_agino_t			agino)
{
	struct xfs_btree_cur		**pcur;
	bool				has_irec;
	int				error;

	if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
		pcur = &sc->sa.ino_cur;
	else
		pcur = &sc->sa.fino_cur;
	if (!(*pcur))
		return;
	error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec);
	if (!xchk_should_check_xref(sc, &error, pcur))
		return;
	if (((irec->ir_freecount > 0 && !has_irec) ||
	     (irec->ir_freecount == 0 && has_irec)))
		xchk_btree_xref_set_corrupt(sc, *pcur, 0);
}

/* Cross-reference with the other btrees. */
STATIC void
xchk_iallocbt_chunk_xref(
	struct xfs_scrub		*sc,
	struct xfs_inobt_rec_incore	*irec,
	xfs_agino_t			agino,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len)
{
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return;

	xchk_xref_is_used_space(sc, agbno, len);
	xchk_iallocbt_chunk_xref_other(sc, irec, agino);
	xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
	xchk_xref_is_not_shared(sc, agbno, len);
}

/* Is this chunk worth checking? */
STATIC bool
xchk_iallocbt_chunk(
	struct xchk_btree		*bs,
	struct xfs_inobt_rec_incore	*irec,
	xfs_agino_t			agino,
	xfs_extlen_t			len)
{
	struct xfs_mount		*mp = bs->cur->bc_mp;
	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
	xfs_agblock_t			bno;

	bno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (bno + len <= bno ||
	    !xfs_verify_agbno(mp, agno, bno) ||
	    !xfs_verify_agbno(mp, agno, bno + len - 1))
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);

	xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);

	return true;
}

/* Count the number of free inodes. */
static unsigned int
xchk_iallocbt_freecount(
	xfs_inofree_t			freemask)
{
	BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
	return hweight64(freemask);
}

/*
 * Check that an inode's allocation status matches ir_free in the inobt
 * record.  First we try querying the in-core inode state, and if the inode
 * isn't loaded we examine the on-disk inode directly.
 *
 * Since there can be 1:M and M:1 mappings between inobt records and inode
 * clusters, we pass in the inode location information as an inobt record;
 * the index of an inode cluster within the inobt record (as well as the
 * cluster buffer itself); and the index of the inode within the cluster.
 *
 * @irec is the inobt record.
 * @irec_ino is the inode offset from the start of the record.
 * @dip is the on-disk inode.
 */
STATIC int
xchk_iallocbt_check_cluster_ifree(
	struct xchk_btree		*bs,
	struct xfs_inobt_rec_incore	*irec,
	unsigned int			irec_ino,
	struct xfs_dinode		*dip)
{
	struct xfs_mount		*mp = bs->cur->bc_mp;
	xfs_ino_t			fsino;
	xfs_agino_t			agino;
	bool				irec_free;
	bool				ino_inuse;
	bool				freemask_ok;
	int				error = 0;

	if (xchk_should_terminate(bs->sc, &error))
		return error;

	/*
	 * Given an inobt record and the offset of an inode from the start of
	 * the record, compute which fs inode we're talking about.
	 */
	agino = irec->ir_startino + irec_ino;
	fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
	irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));

	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
	    (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		goto out;
	}
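
	/*
	 * Decide how to learn the inode's allocation state: ask the inode
	 * cache first; -ENODATA means the inode isn't cached, in which case
	 * we fall back to di_mode in the on-disk buffer we already hold.
	 */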
	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
			&ino_inuse);
	if (error == -ENODATA) {
		/* Not cached, just read the disk buffer */
		freemask_ok = irec_free ^ !!(dip->di_mode);
		if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok)
			return -EDEADLOCK;
	} else if (error < 0) {
		/*
		 * Inode is only half assembled, or there was an IO error,
		 * or the verifier failed, so don't bother trying to check.
		 * The inode scrubber can deal with this.
		 */
		goto out;
	} else {
		/* Inode is all there. */
		freemask_ok = irec_free ^ ino_inuse;
	}
	if (!freemask_ok)
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
out:
	return 0;
}

/*
 * Check that the holemask and freemask of a hypothetical inode cluster match
 * what's actually on disk.  If sparse inodes are enabled, the cluster does
 * not actually have to map to inodes if the corresponding holemask bit is set.
 *
 * @cluster_base is the first inode in the cluster within the @irec.
 */
STATIC int
xchk_iallocbt_check_cluster(
	struct xchk_btree		*bs,
	struct xfs_inobt_rec_incore	*irec,
	unsigned int			cluster_base)
{
	struct xfs_imap			imap;
	struct xfs_mount		*mp = bs->cur->bc_mp;
	struct xfs_dinode		*dip;
	struct xfs_buf			*cluster_bp;
	unsigned int			nr_inodes;
	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
	xfs_agblock_t			agbno;
	unsigned int			cluster_index;
	uint16_t			cluster_mask = 0;
	uint16_t			ir_holemask;
	int				error = 0;

	nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
			M_IGEO(mp)->inodes_per_cluster);

	/* Map this inode cluster */
	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);

	/* Compute a bitmask for this cluster that can be used for holemask. */
	for (cluster_index = 0;
	     cluster_index < nr_inodes;
	     cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
		cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
				XFS_INODES_PER_HOLEMASK_BIT);
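
	/*
	 * (Each holemask bit covers XFS_INODES_PER_HOLEMASK_BIT inodes, so
	 * the loop above sets one cluster_mask bit for every holemask-sized
	 * span of this cluster.)
	 */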

	/*
	 * Map the first inode of this cluster to a buffer and offset.
	 * Be careful about inobt records that don't align with the start of
	 * the inode buffer when block sizes are large enough to hold multiple
	 * inode chunks.  When this happens, cluster_base will be zero but
	 * ir_startino can be large enough to make im_boffset nonzero.
	 */
	ir_holemask = (irec->ir_holemask & cluster_mask);
	imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
	imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
	imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
			mp->m_sb.sb_inodelog;

	if (imap.im_boffset != 0 && cluster_base != 0) {
		ASSERT(imap.im_boffset == 0 || cluster_base == 0);
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		return 0;
	}

	trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
			imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
			cluster_mask, ir_holemask,
			XFS_INO_TO_OFFSET(mp, irec->ir_startino +
					  cluster_base));

	/* The whole cluster must be a hole or not a hole. */
	if (ir_holemask != cluster_mask && ir_holemask != 0) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		return 0;
	}

	/* If any part of this is a hole, skip it. */
	if (ir_holemask) {
		xchk_xref_is_not_owned_by(bs->sc, agbno,
				M_IGEO(mp)->blocks_per_cluster,
				&XFS_RMAP_OINFO_INODES);
		return 0;
	}

	xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
			&XFS_RMAP_OINFO_INODES);

	/* Grab the inode cluster buffer. */
	error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
			0, 0);
	if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
		return error;

	/* Check free status of each inode within this cluster. */
	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
		struct xfs_dinode	*dip;

		if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
			break;
		}

		dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
		error = xchk_iallocbt_check_cluster_ifree(bs, irec,
				cluster_base + cluster_index, dip);
		if (error)
			break;
		imap.im_boffset += mp->m_sb.sb_inodesize;
	}

	xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
	return error;
}

/*
 * For all the inode clusters that could map to this inobt record, make sure
 * that the holemask makes sense and that the allocation status of each inode
 * matches the freemask.
 */
STATIC int
xchk_iallocbt_check_clusters(
	struct xchk_btree		*bs,
	struct xfs_inobt_rec_incore	*irec)
{
	unsigned int			cluster_base;
	int				error = 0;

	/*
	 * For the common case where this inobt record maps to multiple inode
	 * clusters this will call _check_cluster for each cluster.
	 *
	 * For the case that multiple inobt records map to a single cluster,
	 * this will call _check_cluster once.
	 */
	for (cluster_base = 0;
	     cluster_base < XFS_INODES_PER_CHUNK;
	     cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) {
		error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
		if (error)
			break;
	}

	return error;
}

/*
 * Make sure this inode btree record is aligned properly.  Because a fs block
 * contains multiple inodes, we check that the inobt record is aligned to the
 * correct inode, not just the correct block on disk.  This results in a finer
 * grained corruption check.
 */
STATIC void
xchk_iallocbt_rec_alignment(
	struct xchk_btree		*bs,
	struct xfs_inobt_rec_incore	*irec)
{
	struct xfs_mount		*mp = bs->sc->mp;
	struct xchk_iallocbt		*iabt = bs->private;
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);

	/*
	 * finobt records have different positioning requirements than inobt
	 * records: each finobt record must have a corresponding inobt record.
	 * That is checked in the xref function, so for now we only catch the
	 * obvious case where the record isn't at all aligned properly.
	 *
	 * Note that if a fs block contains more than a single chunk of inodes,
	 * we will have finobt records only for those chunks containing free
	 * inodes, and therefore expect chunk alignment of finobt records.
	 * Otherwise, we expect that the finobt record is aligned to the
	 * cluster alignment as told by the superblock.
	 */
	if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
		unsigned int	imask;
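
		/*
		 * A finobt record need only be aligned to the smaller of the
		 * chunk size and the cluster alignment, so build the mask
		 * from whichever is smaller.
		 */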
		imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
				igeo->cluster_align_inodes) - 1;
		if (irec->ir_startino & imask)
			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		return;
	}

	if (iabt->next_startino != NULLAGINO) {
		/*
		 * We're midway through a cluster of inodes that is mapped by
		 * multiple inobt records.  Did we get the record for the next
		 * irec in the sequence?
		 */
		if (irec->ir_startino != iabt->next_startino) {
			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
			return;
		}

		iabt->next_startino += XFS_INODES_PER_CHUNK;

		/* Are we done with the cluster? */
		if (iabt->next_startino >= iabt->next_cluster_ino) {
			iabt->next_startino = NULLAGINO;
			iabt->next_cluster_ino = NULLAGINO;
		}
		return;
	}

	/* inobt records must be aligned to cluster and inoalignment size. */
	if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		return;
	}

	if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		return;
	}

	if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK)
		return;

	/*
	 * If this is the start of an inode cluster that can be mapped by
	 * multiple inobt records, the next inobt record must follow exactly
	 * after this one.
	 */
	iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
	iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster;
}

/* Scrub an inobt/finobt record. */
STATIC int
xchk_iallocbt_rec(
	struct xchk_btree		*bs,
	union xfs_btree_rec		*rec)
{
	struct xfs_mount		*mp = bs->cur->bc_mp;
	struct xchk_iallocbt		*iabt = bs->private;
	struct xfs_inobt_rec_incore	irec;
	uint64_t			holes;
	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
	xfs_agino_t			agino;
	xfs_extlen_t			len;
	int				holecount;
	int				i;
	int				error = 0;
	unsigned int			real_freecount;
	uint16_t			holemask;

	xfs_inobt_btrec_to_irec(mp, rec, &irec);

	if (irec.ir_count > XFS_INODES_PER_CHUNK ||
	    irec.ir_freecount > XFS_INODES_PER_CHUNK)
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
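
	/*
	 * Inodes that the record doesn't cover (the sparse holes) are always
	 * marked free in ir_free, so add them back into the freecount before
	 * comparing against the popcount of the free mask.
	 */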
	real_freecount = irec.ir_freecount +
			(XFS_INODES_PER_CHUNK - irec.ir_count);
	if (real_freecount != xchk_iallocbt_freecount(irec.ir_free))
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);

	agino = irec.ir_startino;
	/* Record has to be properly aligned within the AG. */
	if (!xfs_verify_agino(mp, agno, agino) ||
	    !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		goto out;
	}

	xchk_iallocbt_rec_alignment(bs, &irec);
	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		goto out;

	iabt->inodes += irec.ir_count;

	/* Handle non-sparse inodes */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		len = XFS_B_TO_FSB(mp,
				XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
		if (irec.ir_count != XFS_INODES_PER_CHUNK)
			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);

		if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
			goto out;
		goto check_clusters;
	}

	/* Check each chunk of a sparse inode cluster. */
	holemask = irec.ir_holemask;
	holecount = 0;
	len = XFS_B_TO_FSB(mp,
			XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
	holes = ~xfs_inobt_irec_to_allocmask(&irec);
	if ((holes & irec.ir_free) != holes ||
	    irec.ir_freecount > irec.ir_count)
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
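
	/*
	 * Walk the holemask one bit at a time.  Each set bit is a hole and
	 * only adds to holecount; each clear bit maps real inodes that get
	 * cross-referenced like any other chunk.
	 */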
	for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
		if (holemask & 1)
			holecount += XFS_INODES_PER_HOLEMASK_BIT;
		else if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
			break;
		holemask >>= 1;
		agino += XFS_INODES_PER_HOLEMASK_BIT;
	}

	if (holecount > XFS_INODES_PER_CHUNK ||
	    holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);

check_clusters:
	error = xchk_iallocbt_check_clusters(bs, &irec);

out:
	return error;
}

/*
 * Make sure the inode btrees are as large as the rmap thinks they are.
 * Don't bother if we're missing btree cursors, as we're already corrupt.
 */
STATIC void
xchk_iallocbt_xref_rmap_btreeblks(
	struct xfs_scrub	*sc,
	int			which)
{
	xfs_filblks_t		blocks;
	xfs_extlen_t		inobt_blocks = 0;
	xfs_extlen_t		finobt_blocks = 0;
	int			error;

	if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
	    (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) ||
	    xchk_skip_xref(sc->sm))
		return;

	/* Check that we saw as many inobt blocks as the rmap says. */
	error = xfs_btree_count_blocks(sc->sa.ino_cur, &inobt_blocks);
	if (!xchk_process_error(sc, 0, 0, &error))
		return;

	if (sc->sa.fino_cur) {
		error = xfs_btree_count_blocks(sc->sa.fino_cur, &finobt_blocks);
		if (!xchk_process_error(sc, 0, 0, &error))
			return;
	}

	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
			&XFS_RMAP_OINFO_INOBT, &blocks);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
		return;
	if (blocks != inobt_blocks + finobt_blocks)
		xchk_btree_set_corrupt(sc, sc->sa.ino_cur, 0);
}

/*
 * Make sure that the inobt records point to the same number of blocks as
 * the rmap says are owned by inodes.
 */
STATIC void
xchk_iallocbt_xref_rmap_inodes(
	struct xfs_scrub	*sc,
	int			which,
	unsigned long long	inodes)
{
	xfs_filblks_t		blocks;
	xfs_filblks_t		inode_blocks;
	int			error;

	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
		return;

	/* Check that we saw as many inode blocks as the rmap knows about. */
	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
			&XFS_RMAP_OINFO_INODES, &blocks);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
		return;
	inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize);
	if (blocks != inode_blocks)
		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}

/* Scrub the inode btrees for some AG. */
STATIC int
xchk_iallocbt(
	struct xfs_scrub	*sc,
	xfs_btnum_t		which)
{
	struct xfs_btree_cur	*cur;
	struct xchk_iallocbt	iabt = {
		.inodes		= 0,
		.next_startino	= NULLAGINO,
		.next_cluster_ino = NULLAGINO,
	};
	int			error;

	cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
	error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
			&iabt);
	if (error)
		return error;

	xchk_iallocbt_xref_rmap_btreeblks(sc, which);

	/*
	 * If we're scrubbing the inode btree, inode_blocks is the number of
	 * blocks pointed to by all the inode chunk records.  Therefore, we
	 * should compare to the number of inode chunk blocks that the rmap
	 * knows about.  We can't do this for the finobt since it only points
	 * to inode chunks with free inodes.
	 */
	if (which == XFS_BTNUM_INO)
		xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);

	return error;
}

int
xchk_inobt(
	struct xfs_scrub	*sc)
{
	return xchk_iallocbt(sc, XFS_BTNUM_INO);
}

int
xchk_finobt(
	struct xfs_scrub	*sc)
{
	return xchk_iallocbt(sc, XFS_BTNUM_FINO);
}

/* See if an inode btree has (or doesn't have) an inode chunk record. */
static inline void
xchk_xref_inode_check(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len,
	struct xfs_btree_cur	**icur,
	bool			should_have_inodes)
{
	bool			has_inodes;
	int			error;

	if (!(*icur) || xchk_skip_xref(sc->sm))
		return;
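
	/*
	 * icur is a pointer to the cursor pointer so that, if the xref query
	 * fails, the xref helper can tear down the cursor and NULL it out
	 * for the rest of the scrub.
	 */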
	error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
	if (!xchk_should_check_xref(sc, &error, icur))
		return;
	if (has_inodes != should_have_inodes)
		xchk_btree_xref_set_corrupt(sc, *icur, 0);
}

/* xref check that the extent is not covered by inodes */
void
xchk_xref_is_not_inode_chunk(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false);
	xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false);
}

/* xref check that the extent is covered by inodes */
void
xchk_xref_is_inode_chunk(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true);
}