drm/atomic-helper: document drm_atomic_helper_check() restrictions
[drm/drm-misc.git] / fs / xfs / scrub / inode_repair.c
blob5a58ddd27bd2f5fc7c625342ccd7b153965b554a
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_bit.h"
15 #include "xfs_log_format.h"
16 #include "xfs_trans.h"
17 #include "xfs_sb.h"
18 #include "xfs_inode.h"
19 #include "xfs_icache.h"
20 #include "xfs_inode_buf.h"
21 #include "xfs_inode_fork.h"
22 #include "xfs_ialloc.h"
23 #include "xfs_da_format.h"
24 #include "xfs_reflink.h"
25 #include "xfs_alloc.h"
26 #include "xfs_rmap.h"
27 #include "xfs_rmap_btree.h"
28 #include "xfs_bmap.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_bmap_util.h"
31 #include "xfs_dir2.h"
32 #include "xfs_dir2_priv.h"
33 #include "xfs_quota_defs.h"
34 #include "xfs_quota.h"
35 #include "xfs_ag.h"
36 #include "xfs_rtbitmap.h"
37 #include "xfs_attr_leaf.h"
38 #include "xfs_log_priv.h"
39 #include "xfs_health.h"
40 #include "xfs_symlink_remote.h"
41 #include "scrub/xfs_scrub.h"
42 #include "scrub/scrub.h"
43 #include "scrub/common.h"
44 #include "scrub/btree.h"
45 #include "scrub/trace.h"
46 #include "scrub/repair.h"
47 #include "scrub/iscan.h"
48 #include "scrub/readdir.h"
49 #include "scrub/tempfile.h"
52 * Inode Record Repair
53 * ===================
55 * Roughly speaking, inode problems can be classified based on whether or not
56 * they trip the dinode verifiers. If those trip, then we won't be able to
57 * xfs_iget ourselves the inode.
59 * Therefore, the xrep_dinode_* functions fix anything that will cause the
60 * inode buffer verifier or the dinode verifier to fail. The xrep_inode_*
61 * functions fix things on live incore inodes. The inode repair functions make
62 * decisions with security and usability implications when reviving a file:
64 * - Files with zero di_mode or a garbage di_mode are converted to a regular
65 * file that only root can read. This file may not actually contain user data,
66 * if the file was not previously a regular file. Setuid and setgid bits
67 * are cleared.
69 * - Zero-size directories can be truncated to look empty. It is necessary to
70 * run the bmapbtd and directory repair functions to fully rebuild the
71 * directory.
73 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary
74 * to run the bmapbtd and symlink repair functions to salvage the symlink.
76 * - Invalid extent size hints will be removed.
78 * - Quotacheck will be scheduled if we repaired an inode that was so badly
79 * damaged that the ondisk inode had to be rebuilt.
81 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
82 * Setuid and setgid bits are cleared.
84 * - Data and attr forks are reset to extents format with zero extents if the
85 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
86 * repair functions to recover the space mapping.
88 * - ACLs will not be recovered if the attr fork is zapped or the extended
89 * attribute structure itself requires salvaging.
91 * - If the attr fork is zapped, the user and group ids are reset to root and
92 * the setuid and setgid bits are removed.
96 * All the information we need to repair the ondisk inode if we can't iget the
97 * incore inode. We don't allocate this buffer unless we're going to perform
98 * a repair to the ondisk inode cluster buffer.
100 struct xrep_inode {
101 /* Inode mapping that we saved from the initial lookup attempt. */
102 struct xfs_imap imap;
104 struct xfs_scrub *sc;
106 /* Blocks in use on the data device by data extents or bmbt blocks. */
107 xfs_rfsblock_t data_blocks;
109 /* Blocks in use on the rt device. */
110 xfs_rfsblock_t rt_blocks;
112 /* Blocks in use by the attr fork. */
113 xfs_rfsblock_t attr_blocks;
115 /* Number of data device extents for the data fork. */
116 xfs_extnum_t data_extents;
119 * Number of realtime device extents for the data fork. If
120 * data_extents and rt_extents indicate that the data fork has extents
121 * on both devices, we'll just back away slowly.
123 xfs_extnum_t rt_extents;
125 /* Number of (data device) extents for the attr fork. */
126 xfs_aextnum_t attr_extents;
128 /* Sick state to set after zapping parts of the inode. */
129 unsigned int ino_sick_mask;
131 /* Must we remove all access from this file? */
132 bool zap_acls;
134 /* Inode scanner to see if we can find the ftype from dirents */
135 struct xchk_iscan ftype_iscan;
136 uint8_t alleged_ftype;
140 * Setup function for inode repair. @imap contains the ondisk inode mapping
141 * information so that we can correct the ondisk inode cluster buffer if
142 * necessary to make iget work.
145 xrep_setup_inode(
146 struct xfs_scrub *sc,
147 const struct xfs_imap *imap)
149 struct xrep_inode *ri;
151 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
152 if (!sc->buf)
153 return -ENOMEM;
155 ri = sc->buf;
156 memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
157 ri->sc = sc;
158 return 0;
162 * Make sure this ondisk inode can pass the inode buffer verifier. This is
163 * not the same as the dinode verifier.
165 STATIC void
166 xrep_dinode_buf_core(
167 struct xfs_scrub *sc,
168 struct xfs_buf *bp,
169 unsigned int ioffset)
171 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
172 struct xfs_trans *tp = sc->tp;
173 struct xfs_mount *mp = sc->mp;
174 xfs_agino_t agino;
175 bool crc_ok = false;
176 bool magic_ok = false;
177 bool unlinked_ok = false;
179 agino = be32_to_cpu(dip->di_next_unlinked);
181 if (xfs_verify_agino_or_null(bp->b_pag, agino))
182 unlinked_ok = true;
184 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
185 xfs_dinode_good_version(mp, dip->di_version))
186 magic_ok = true;
188 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
189 XFS_DINODE_CRC_OFF))
190 crc_ok = true;
192 if (magic_ok && unlinked_ok && crc_ok)
193 return;
195 if (!magic_ok) {
196 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
197 dip->di_version = 3;
199 if (!unlinked_ok)
200 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
201 xfs_dinode_calc_crc(mp, dip);
202 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
203 xfs_trans_log_buf(tp, bp, ioffset,
204 ioffset + sizeof(struct xfs_dinode) - 1);
207 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */
208 STATIC void
209 xrep_dinode_buf(
210 struct xfs_scrub *sc,
211 struct xfs_buf *bp)
213 struct xfs_mount *mp = sc->mp;
214 int i;
215 int ni;
217 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
218 for (i = 0; i < ni; i++)
219 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
222 /* Reinitialize things that never change in an inode. */
223 STATIC void
224 xrep_dinode_header(
225 struct xfs_scrub *sc,
226 struct xfs_dinode *dip)
228 trace_xrep_dinode_header(sc, dip);
230 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
231 if (!xfs_dinode_good_version(sc->mp, dip->di_version))
232 dip->di_version = 3;
233 dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
234 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
235 dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
239 * If this directory entry points to the scrub target inode, then the directory
240 * we're scanning is the parent of the scrub target inode.
242 STATIC int
243 xrep_dinode_findmode_dirent(
244 struct xfs_scrub *sc,
245 struct xfs_inode *dp,
246 xfs_dir2_dataptr_t dapos,
247 const struct xfs_name *name,
248 xfs_ino_t ino,
249 void *priv)
251 struct xrep_inode *ri = priv;
252 int error = 0;
254 if (xchk_should_terminate(ri->sc, &error))
255 return error;
257 if (ino != sc->sm->sm_ino)
258 return 0;
260 /* Ignore garbage directory entry names. */
261 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
262 return -EFSCORRUPTED;
264 /* Don't pick up dot or dotdot entries; we only want child dirents. */
265 if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
266 xfs_dir2_samename(name, &xfs_name_dot))
267 return 0;
270 * Uhoh, more than one parent for this inode and they don't agree on
271 * the file type?
273 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
274 ri->alleged_ftype != name->type) {
275 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
276 ri->alleged_ftype);
277 return -EFSCORRUPTED;
280 /* We found a potential parent; remember the ftype. */
281 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
282 ri->alleged_ftype = name->type;
283 return 0;
286 /* Try to lock a directory, or wait a jiffy. */
287 static inline int
288 xrep_dinode_ilock_nowait(
289 struct xfs_inode *dp,
290 unsigned int lock_mode)
292 if (xfs_ilock_nowait(dp, lock_mode))
293 return true;
295 schedule_timeout_killable(1);
296 return false;
300 * Try to lock a directory to look for ftype hints. Since we already hold the
301 * AGI buffer, we cannot block waiting for the ILOCK because rename can take
302 * the ILOCK and then try to lock AGIs.
304 STATIC int
305 xrep_dinode_trylock_directory(
306 struct xrep_inode *ri,
307 struct xfs_inode *dp,
308 unsigned int *lock_modep)
310 unsigned long deadline = jiffies + msecs_to_jiffies(30000);
311 unsigned int lock_mode;
312 int error = 0;
314 do {
315 if (xchk_should_terminate(ri->sc, &error))
316 return error;
318 if (xfs_need_iread_extents(&dp->i_df))
319 lock_mode = XFS_ILOCK_EXCL;
320 else
321 lock_mode = XFS_ILOCK_SHARED;
323 if (xrep_dinode_ilock_nowait(dp, lock_mode)) {
324 *lock_modep = lock_mode;
325 return 0;
327 } while (!time_is_before_jiffies(deadline));
328 return -EBUSY;
332 * If this is a directory, walk the dirents looking for any that point to the
333 * scrub target inode.
335 STATIC int
336 xrep_dinode_findmode_walk_directory(
337 struct xrep_inode *ri,
338 struct xfs_inode *dp)
340 struct xfs_scrub *sc = ri->sc;
341 unsigned int lock_mode;
342 int error = 0;
344 /* Ignore temporary repair directories. */
345 if (xrep_is_tempfile(dp))
346 return 0;
349 * Scan the directory to see if there it contains an entry pointing to
350 * the directory that we are repairing.
352 error = xrep_dinode_trylock_directory(ri, dp, &lock_mode);
353 if (error)
354 return error;
357 * If this directory is known to be sick, we cannot scan it reliably
358 * and must abort.
360 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
361 XFS_SICK_INO_BMBTD |
362 XFS_SICK_INO_DIR)) {
363 error = -EFSCORRUPTED;
364 goto out_unlock;
368 * We cannot complete our parent pointer scan if a directory looks as
369 * though it has been zapped by the inode record repair code.
371 if (xchk_dir_looks_zapped(dp)) {
372 error = -EBUSY;
373 goto out_unlock;
376 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
377 if (error)
378 goto out_unlock;
380 out_unlock:
381 xfs_iunlock(dp, lock_mode);
382 return error;
386 * Try to find the mode of the inode being repaired by looking for directories
387 * that point down to this file.
389 STATIC int
390 xrep_dinode_find_mode(
391 struct xrep_inode *ri,
392 uint16_t *mode)
394 struct xfs_scrub *sc = ri->sc;
395 struct xfs_inode *dp;
396 int error;
398 /* No ftype means we have no other metadata to consult. */
399 if (!xfs_has_ftype(sc->mp)) {
400 *mode = S_IFREG;
401 return 0;
405 * Scan all directories for parents that might point down to this
406 * inode. Skip the inode being repaired during the scan since it
407 * cannot be its own parent. Note that we still hold the AGI locked
408 * so there's a real possibility that _iscan_iter can return EBUSY.
410 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
411 xchk_iscan_set_agi_trylock(&ri->ftype_iscan);
412 ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
413 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
414 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
415 if (S_ISDIR(VFS_I(dp)->i_mode))
416 error = xrep_dinode_findmode_walk_directory(ri, dp);
417 xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
418 xchk_irele(sc, dp);
419 if (error < 0)
420 break;
421 if (xchk_should_terminate(sc, &error))
422 break;
424 xchk_iscan_iter_finish(&ri->ftype_iscan);
425 xchk_iscan_teardown(&ri->ftype_iscan);
427 if (error == -EBUSY) {
428 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
430 * If we got an EBUSY after finding at least one
431 * dirent, that means the scan found an inode on the
432 * inactivation list and could not open it. Accept the
433 * alleged ftype and install a new mode below.
435 error = 0;
436 } else if (!(sc->flags & XCHK_TRY_HARDER)) {
438 * Otherwise, retry the operation one time to see if
439 * the reason for the delay is an inode from the same
440 * cluster buffer waiting on the inactivation list.
442 error = -EDEADLOCK;
445 if (error)
446 return error;
449 * Convert the discovered ftype into the file mode. If all else fails,
450 * return S_IFREG.
452 switch (ri->alleged_ftype) {
453 case XFS_DIR3_FT_DIR:
454 *mode = S_IFDIR;
455 break;
456 case XFS_DIR3_FT_WHT:
457 case XFS_DIR3_FT_CHRDEV:
458 *mode = S_IFCHR;
459 break;
460 case XFS_DIR3_FT_BLKDEV:
461 *mode = S_IFBLK;
462 break;
463 case XFS_DIR3_FT_FIFO:
464 *mode = S_IFIFO;
465 break;
466 case XFS_DIR3_FT_SOCK:
467 *mode = S_IFSOCK;
468 break;
469 case XFS_DIR3_FT_SYMLINK:
470 *mode = S_IFLNK;
471 break;
472 default:
473 *mode = S_IFREG;
474 break;
476 return 0;
479 /* Turn di_mode into /something/ recognizable. Returns true if we succeed. */
480 STATIC int
481 xrep_dinode_mode(
482 struct xrep_inode *ri,
483 struct xfs_dinode *dip)
485 struct xfs_scrub *sc = ri->sc;
486 uint16_t mode = be16_to_cpu(dip->di_mode);
487 int error;
489 trace_xrep_dinode_mode(sc, dip);
491 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
492 return 0;
494 /* Try to fix the mode. If we cannot, then leave everything alone. */
495 error = xrep_dinode_find_mode(ri, &mode);
496 switch (error) {
497 case -EINTR:
498 case -EBUSY:
499 case -EDEADLOCK:
500 /* temporary failure or fatal signal */
501 return error;
502 case 0:
503 /* found mode */
504 break;
505 default:
506 /* some other error, assume S_IFREG */
507 mode = S_IFREG;
508 break;
511 /* bad mode, so we set it to a file that only root can read */
512 dip->di_mode = cpu_to_be16(mode);
513 dip->di_uid = 0;
514 dip->di_gid = 0;
515 ri->zap_acls = true;
516 return 0;
519 /* Fix unused link count fields having nonzero values. */
520 STATIC void
521 xrep_dinode_nlinks(
522 struct xfs_dinode *dip)
524 if (dip->di_version < 2) {
525 dip->di_nlink = 0;
526 return;
529 if (xfs_dinode_is_metadir(dip)) {
530 if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
531 dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN);
532 } else {
533 dip->di_metatype = 0;
537 /* Fix any conflicting flags that the verifiers complain about. */
538 STATIC void
539 xrep_dinode_flags(
540 struct xfs_scrub *sc,
541 struct xfs_dinode *dip,
542 bool isrt)
544 struct xfs_mount *mp = sc->mp;
545 uint64_t flags2 = be64_to_cpu(dip->di_flags2);
546 uint16_t flags = be16_to_cpu(dip->di_flags);
547 uint16_t mode = be16_to_cpu(dip->di_mode);
549 trace_xrep_dinode_flags(sc, dip);
551 if (isrt)
552 flags |= XFS_DIFLAG_REALTIME;
553 else
554 flags &= ~XFS_DIFLAG_REALTIME;
557 * For regular files on a reflink filesystem, set the REFLINK flag to
558 * protect shared extents. A later stage will actually check those
559 * extents and clear the flag if possible.
561 if (xfs_has_reflink(mp) && S_ISREG(mode))
562 flags2 |= XFS_DIFLAG2_REFLINK;
563 else
564 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
565 if (flags & XFS_DIFLAG_REALTIME)
566 flags2 &= ~XFS_DIFLAG2_REFLINK;
567 if (!xfs_has_bigtime(mp))
568 flags2 &= ~XFS_DIFLAG2_BIGTIME;
569 if (!xfs_has_large_extent_counts(mp))
570 flags2 &= ~XFS_DIFLAG2_NREXT64;
571 if (flags2 & XFS_DIFLAG2_NREXT64)
572 dip->di_nrext64_pad = 0;
573 else if (dip->di_version >= 3)
574 dip->di_v3_pad = 0;
576 if (flags2 & XFS_DIFLAG2_METADATA) {
577 xfs_failaddr_t fa;
579 fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags,
580 flags2);
581 if (fa)
582 flags2 &= ~XFS_DIFLAG2_METADATA;
585 dip->di_flags = cpu_to_be16(flags);
586 dip->di_flags2 = cpu_to_be64(flags2);
590 * Blow out symlink; now it points nowhere. We don't have to worry about
591 * incore state because this inode is failing the verifiers.
593 STATIC void
594 xrep_dinode_zap_symlink(
595 struct xrep_inode *ri,
596 struct xfs_dinode *dip)
598 struct xfs_scrub *sc = ri->sc;
599 char *p;
601 trace_xrep_dinode_zap_symlink(sc, dip);
603 dip->di_format = XFS_DINODE_FMT_LOCAL;
604 dip->di_size = cpu_to_be64(1);
605 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
606 *p = '?';
607 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
611 * Blow out dir, make the parent point to the root. In the future repair will
612 * reconstruct this directory for us. Note that there's no in-core directory
613 * inode because the sf verifier tripped, so we don't have to worry about the
614 * dentry cache.
616 STATIC void
617 xrep_dinode_zap_dir(
618 struct xrep_inode *ri,
619 struct xfs_dinode *dip)
621 struct xfs_scrub *sc = ri->sc;
622 struct xfs_mount *mp = sc->mp;
623 struct xfs_dir2_sf_hdr *sfp;
624 int i8count;
626 trace_xrep_dinode_zap_dir(sc, dip);
628 dip->di_format = XFS_DINODE_FMT_LOCAL;
629 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
630 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
631 sfp->count = 0;
632 sfp->i8count = i8count;
633 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
634 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
635 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
638 /* Make sure we don't have a garbage file size. */
639 STATIC void
640 xrep_dinode_size(
641 struct xrep_inode *ri,
642 struct xfs_dinode *dip)
644 struct xfs_scrub *sc = ri->sc;
645 uint64_t size = be64_to_cpu(dip->di_size);
646 uint16_t mode = be16_to_cpu(dip->di_mode);
648 trace_xrep_dinode_size(sc, dip);
650 switch (mode & S_IFMT) {
651 case S_IFIFO:
652 case S_IFCHR:
653 case S_IFBLK:
654 case S_IFSOCK:
655 /* di_size can't be nonzero for special files */
656 dip->di_size = 0;
657 break;
658 case S_IFREG:
659 /* Regular files can't be larger than 2^63-1 bytes. */
660 dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
661 break;
662 case S_IFLNK:
664 * Truncate ridiculously oversized symlinks. If the size is
665 * zero, reset it to point to the current directory. Both of
666 * these conditions trigger dinode verifier errors, so there
667 * is no in-core state to reset.
669 if (size > XFS_SYMLINK_MAXLEN)
670 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
671 else if (size == 0)
672 xrep_dinode_zap_symlink(ri, dip);
673 break;
674 case S_IFDIR:
676 * Directories can't have a size larger than 32G. If the size
677 * is zero, reset it to an empty directory. Both of these
678 * conditions trigger dinode verifier errors, so there is no
679 * in-core state to reset.
681 if (size > XFS_DIR2_SPACE_SIZE)
682 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
683 else if (size == 0)
684 xrep_dinode_zap_dir(ri, dip);
685 break;
689 /* Fix extent size hints. */
690 STATIC void
691 xrep_dinode_extsize_hints(
692 struct xfs_scrub *sc,
693 struct xfs_dinode *dip)
695 struct xfs_mount *mp = sc->mp;
696 uint64_t flags2 = be64_to_cpu(dip->di_flags2);
697 uint16_t flags = be16_to_cpu(dip->di_flags);
698 uint16_t mode = be16_to_cpu(dip->di_mode);
700 xfs_failaddr_t fa;
702 trace_xrep_dinode_extsize_hints(sc, dip);
704 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
705 mode, flags);
706 if (fa) {
707 dip->di_extsize = 0;
708 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
709 XFS_DIFLAG_EXTSZINHERIT);
712 if (dip->di_version < 3)
713 return;
715 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
716 mode, flags, flags2);
717 if (fa) {
718 dip->di_cowextsize = 0;
719 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
723 /* Count extents and blocks for an inode given an rmap. */
724 STATIC int
725 xrep_dinode_walk_rmap(
726 struct xfs_btree_cur *cur,
727 const struct xfs_rmap_irec *rec,
728 void *priv)
730 struct xrep_inode *ri = priv;
731 int error = 0;
733 if (xchk_should_terminate(ri->sc, &error))
734 return error;
736 /* We only care about this inode. */
737 if (rec->rm_owner != ri->sc->sm->sm_ino)
738 return 0;
740 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
741 ri->attr_blocks += rec->rm_blockcount;
742 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
743 ri->attr_extents++;
745 return 0;
748 ri->data_blocks += rec->rm_blockcount;
749 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
750 ri->data_extents++;
752 return 0;
755 /* Count extents and blocks for an inode from all AG rmap data. */
756 STATIC int
757 xrep_dinode_count_ag_rmaps(
758 struct xrep_inode *ri,
759 struct xfs_perag *pag)
761 struct xfs_btree_cur *cur;
762 struct xfs_buf *agf;
763 int error;
765 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
766 if (error)
767 return error;
769 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
770 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
771 xfs_btree_del_cursor(cur, error);
772 xfs_trans_brelse(ri->sc->tp, agf);
773 return error;
776 /* Count extents and blocks for a given inode from all rmap data. */
777 STATIC int
778 xrep_dinode_count_rmaps(
779 struct xrep_inode *ri)
781 struct xfs_perag *pag = NULL;
782 int error;
784 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
785 return -EOPNOTSUPP;
787 while ((pag = xfs_perag_next(ri->sc->mp, pag))) {
788 error = xrep_dinode_count_ag_rmaps(ri, pag);
789 if (error) {
790 xfs_perag_rele(pag);
791 return error;
795 /* Can't have extents on both the rt and the data device. */
796 if (ri->data_extents && ri->rt_extents)
797 return -EFSCORRUPTED;
799 trace_xrep_dinode_count_rmaps(ri->sc,
800 ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
801 ri->data_extents, ri->rt_extents, ri->attr_extents);
802 return 0;
805 /* Return true if this extents-format ifork looks like garbage. */
806 STATIC bool
807 xrep_dinode_bad_extents_fork(
808 struct xfs_scrub *sc,
809 struct xfs_dinode *dip,
810 unsigned int dfork_size,
811 int whichfork)
813 struct xfs_bmbt_irec new;
814 struct xfs_bmbt_rec *dp;
815 xfs_extnum_t nex;
816 bool isrt;
817 unsigned int i;
819 nex = xfs_dfork_nextents(dip, whichfork);
820 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
821 return true;
823 dp = XFS_DFORK_PTR(dip, whichfork);
825 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
826 for (i = 0; i < nex; i++, dp++) {
827 xfs_failaddr_t fa;
829 xfs_bmbt_disk_get_all(dp, &new);
830 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
831 &new);
832 if (fa)
833 return true;
836 return false;
839 /* Return true if this btree-format ifork looks like garbage. */
840 STATIC bool
841 xrep_dinode_bad_bmbt_fork(
842 struct xfs_scrub *sc,
843 struct xfs_dinode *dip,
844 unsigned int dfork_size,
845 int whichfork)
847 struct xfs_bmdr_block *dfp;
848 xfs_extnum_t nex;
849 unsigned int i;
850 unsigned int dmxr;
851 unsigned int nrecs;
852 unsigned int level;
854 nex = xfs_dfork_nextents(dip, whichfork);
855 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
856 return true;
858 if (dfork_size < sizeof(struct xfs_bmdr_block))
859 return true;
861 dfp = XFS_DFORK_PTR(dip, whichfork);
862 nrecs = be16_to_cpu(dfp->bb_numrecs);
863 level = be16_to_cpu(dfp->bb_level);
865 if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size)
866 return true;
867 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
868 return true;
870 dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
871 for (i = 1; i <= nrecs; i++) {
872 struct xfs_bmbt_key *fkp;
873 xfs_bmbt_ptr_t *fpp;
874 xfs_fileoff_t fileoff;
875 xfs_fsblock_t fsbno;
877 fkp = xfs_bmdr_key_addr(dfp, i);
878 fileoff = be64_to_cpu(fkp->br_startoff);
879 if (!xfs_verify_fileoff(sc->mp, fileoff))
880 return true;
882 fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr);
883 fsbno = be64_to_cpu(*fpp);
884 if (!xfs_verify_fsbno(sc->mp, fsbno))
885 return true;
888 return false;
892 * Check the data fork for things that will fail the ifork verifiers or the
893 * ifork formatters.
895 STATIC bool
896 xrep_dinode_check_dfork(
897 struct xfs_scrub *sc,
898 struct xfs_dinode *dip,
899 uint16_t mode)
901 void *dfork_ptr;
902 int64_t data_size;
903 unsigned int fmt;
904 unsigned int dfork_size;
907 * Verifier functions take signed int64_t, so check for bogus negative
908 * values first.
910 data_size = be64_to_cpu(dip->di_size);
911 if (data_size < 0)
912 return true;
914 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
915 switch (mode & S_IFMT) {
916 case S_IFIFO:
917 case S_IFCHR:
918 case S_IFBLK:
919 case S_IFSOCK:
920 if (fmt != XFS_DINODE_FMT_DEV)
921 return true;
922 break;
923 case S_IFREG:
924 if (fmt == XFS_DINODE_FMT_LOCAL)
925 return true;
926 fallthrough;
927 case S_IFLNK:
928 case S_IFDIR:
929 switch (fmt) {
930 case XFS_DINODE_FMT_LOCAL:
931 case XFS_DINODE_FMT_EXTENTS:
932 case XFS_DINODE_FMT_BTREE:
933 break;
934 default:
935 return true;
937 break;
938 default:
939 return true;
942 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
943 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
945 switch (fmt) {
946 case XFS_DINODE_FMT_DEV:
947 break;
948 case XFS_DINODE_FMT_LOCAL:
949 /* dir/symlink structure cannot be larger than the fork */
950 if (data_size > dfork_size)
951 return true;
952 /* directory structure must pass verification. */
953 if (S_ISDIR(mode) &&
954 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
955 return true;
956 /* symlink structure must pass verification. */
957 if (S_ISLNK(mode) &&
958 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
959 return true;
960 break;
961 case XFS_DINODE_FMT_EXTENTS:
962 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
963 XFS_DATA_FORK))
964 return true;
965 break;
966 case XFS_DINODE_FMT_BTREE:
967 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
968 XFS_DATA_FORK))
969 return true;
970 break;
971 default:
972 return true;
975 return false;
978 static void
979 xrep_dinode_set_data_nextents(
980 struct xfs_dinode *dip,
981 xfs_extnum_t nextents)
983 if (xfs_dinode_has_large_extent_counts(dip))
984 dip->di_big_nextents = cpu_to_be64(nextents);
985 else
986 dip->di_nextents = cpu_to_be32(nextents);
989 static void
990 xrep_dinode_set_attr_nextents(
991 struct xfs_dinode *dip,
992 xfs_extnum_t nextents)
994 if (xfs_dinode_has_large_extent_counts(dip))
995 dip->di_big_anextents = cpu_to_be32(nextents);
996 else
997 dip->di_anextents = cpu_to_be16(nextents);
1000 /* Reset the data fork to something sane. */
1001 STATIC void
1002 xrep_dinode_zap_dfork(
1003 struct xrep_inode *ri,
1004 struct xfs_dinode *dip,
1005 uint16_t mode)
1007 struct xfs_scrub *sc = ri->sc;
1009 trace_xrep_dinode_zap_dfork(sc, dip);
1011 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
1013 xrep_dinode_set_data_nextents(dip, 0);
1014 ri->data_blocks = 0;
1015 ri->rt_blocks = 0;
1017 /* Special files always get reset to DEV */
1018 switch (mode & S_IFMT) {
1019 case S_IFIFO:
1020 case S_IFCHR:
1021 case S_IFBLK:
1022 case S_IFSOCK:
1023 dip->di_format = XFS_DINODE_FMT_DEV;
1024 dip->di_size = 0;
1025 return;
1029 * If we have data extents, reset to an empty map and hope the user
1030 * will run the bmapbtd checker next.
1032 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
1033 dip->di_format = XFS_DINODE_FMT_EXTENTS;
1034 return;
1037 /* Otherwise, reset the local format to the minimum. */
1038 switch (mode & S_IFMT) {
1039 case S_IFLNK:
1040 xrep_dinode_zap_symlink(ri, dip);
1041 break;
1042 case S_IFDIR:
1043 xrep_dinode_zap_dir(ri, dip);
1044 break;
1049 * Check the attr fork for things that will fail the ifork verifiers or the
1050 * ifork formatters.
1052 STATIC bool
1053 xrep_dinode_check_afork(
1054 struct xfs_scrub *sc,
1055 struct xfs_dinode *dip)
1057 struct xfs_attr_sf_hdr *afork_ptr;
1058 size_t attr_size;
1059 unsigned int afork_size;
1061 if (XFS_DFORK_BOFF(dip) == 0)
1062 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
1063 xfs_dfork_attr_extents(dip) != 0;
1065 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1066 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1068 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
1069 case XFS_DINODE_FMT_LOCAL:
1070 /* Fork has to be large enough to extract the xattr size. */
1071 if (afork_size < sizeof(struct xfs_attr_sf_hdr))
1072 return true;
1074 /* xattr structure cannot be larger than the fork */
1075 attr_size = be16_to_cpu(afork_ptr->totsize);
1076 if (attr_size > afork_size)
1077 return true;
1079 /* xattr structure must pass verification. */
1080 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
1081 case XFS_DINODE_FMT_EXTENTS:
1082 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
1083 XFS_ATTR_FORK))
1084 return true;
1085 break;
1086 case XFS_DINODE_FMT_BTREE:
1087 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
1088 XFS_ATTR_FORK))
1089 return true;
1090 break;
1091 default:
1092 return true;
1095 return false;
1099 * Reset the attr fork to empty. Since the attr fork could have contained
1100 * ACLs, make the file readable only by root.
1102 STATIC void
1103 xrep_dinode_zap_afork(
1104 struct xrep_inode *ri,
1105 struct xfs_dinode *dip,
1106 uint16_t mode)
1108 struct xfs_scrub *sc = ri->sc;
1110 trace_xrep_dinode_zap_afork(sc, dip);
1112 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
1114 dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
1115 xrep_dinode_set_attr_nextents(dip, 0);
1116 ri->attr_blocks = 0;
1119 * If the data fork is in btree format, removing the attr fork entirely
1120 * might cause verifier failures if the next level down in the bmbt
1121 * could now fit in the data fork area.
1123 if (dip->di_format != XFS_DINODE_FMT_BTREE)
1124 dip->di_forkoff = 0;
1125 dip->di_mode = cpu_to_be16(mode & ~0777);
1126 dip->di_uid = 0;
1127 dip->di_gid = 0;
1130 /* Make sure the fork offset is a sensible value. */
1131 STATIC void
1132 xrep_dinode_ensure_forkoff(
1133 struct xrep_inode *ri,
1134 struct xfs_dinode *dip,
1135 uint16_t mode)
1137 struct xfs_bmdr_block *bmdr;
1138 struct xfs_scrub *sc = ri->sc;
1139 xfs_extnum_t attr_extents, data_extents;
1140 size_t bmdr_minsz = xfs_bmdr_space_calc(1);
1141 unsigned int lit_sz = XFS_LITINO(sc->mp);
1142 unsigned int afork_min, dfork_min;
1144 trace_xrep_dinode_ensure_forkoff(sc, dip);
1147 * Before calling this function, xrep_dinode_core ensured that both
1148 * forks actually fit inside their respective literal areas. If this
1149 * was not the case, the fork was reset to FMT_EXTENTS with zero
1150 * records. If the rmapbt scan found attr or data fork blocks, this
1151 * will be noted in the dinode_stats, and we must leave enough room
1152 * for the bmap repair code to reconstruct the mapping structure.
1154 * First, compute the minimum space required for the attr fork.
1156 switch (dip->di_aformat) {
1157 case XFS_DINODE_FMT_LOCAL:
1159 * If we still have a shortform xattr structure at all, that
1160 * means the attr fork area was exactly large enough to fit
1161 * the sf structure.
1163 afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1164 break;
1165 case XFS_DINODE_FMT_EXTENTS:
1166 attr_extents = xfs_dfork_attr_extents(dip);
1167 if (attr_extents) {
1169 * We must maintain sufficient space to hold the entire
1170 * extent map array in the data fork. Note that we
1171 * previously zapped the fork if it had no chance of
1172 * fitting in the inode.
1174 afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
1175 } else if (ri->attr_extents > 0) {
1177 * The attr fork thinks it has zero extents, but we
1178 * found some xattr extents. We need to leave enough
1179 * empty space here so that the incore attr fork will
1180 * get created (and hence trigger the attr fork bmap
1181 * repairer).
1183 afork_min = bmdr_minsz;
1184 } else {
1185 /* No extents on disk or found in rmapbt. */
1186 afork_min = 0;
1188 break;
1189 case XFS_DINODE_FMT_BTREE:
1190 /* Must have space for btree header and key/pointers. */
1191 bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1192 afork_min = xfs_bmap_broot_space(sc->mp, bmdr);
1193 break;
1194 default:
1195 /* We should never see any other formats. */
1196 afork_min = 0;
1197 break;
1200 /* Compute the minimum space required for the data fork. */
1201 switch (dip->di_format) {
1202 case XFS_DINODE_FMT_DEV:
1203 dfork_min = sizeof(__be32);
1204 break;
1205 case XFS_DINODE_FMT_UUID:
1206 dfork_min = sizeof(uuid_t);
1207 break;
1208 case XFS_DINODE_FMT_LOCAL:
1210 * If we still have a shortform data fork at all, that means
1211 * the data fork area was large enough to fit whatever was in
1212 * there.
1214 dfork_min = be64_to_cpu(dip->di_size);
1215 break;
1216 case XFS_DINODE_FMT_EXTENTS:
1217 data_extents = xfs_dfork_data_extents(dip);
1218 if (data_extents) {
1220 * We must maintain sufficient space to hold the entire
1221 * extent map array in the data fork. Note that we
1222 * previously zapped the fork if it had no chance of
1223 * fitting in the inode.
1225 dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
1226 } else if (ri->data_extents > 0 || ri->rt_extents > 0) {
1228 * The data fork thinks it has zero extents, but we
1229 * found some data extents. We need to leave enough
1230 * empty space here so that the data fork bmap repair
1231 * will recover the mappings.
1233 dfork_min = bmdr_minsz;
1234 } else {
1235 /* No extents on disk or found in rmapbt. */
1236 dfork_min = 0;
1238 break;
1239 case XFS_DINODE_FMT_BTREE:
1240 /* Must have space for btree header and key/pointers. */
1241 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
1242 dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
1243 break;
1244 default:
1245 dfork_min = 0;
1246 break;
1250 * Round all values up to the nearest 8 bytes, because that is the
1251 * precision of di_forkoff.
1253 afork_min = roundup(afork_min, 8);
1254 dfork_min = roundup(dfork_min, 8);
1255 bmdr_minsz = roundup(bmdr_minsz, 8);
1257 ASSERT(dfork_min <= lit_sz);
1258 ASSERT(afork_min <= lit_sz);
1261 * If the data fork was zapped and we don't have enough space for the
1262 * recovery fork, move the attr fork up.
1264 if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
1265 xfs_dfork_data_extents(dip) == 0 &&
1266 (ri->data_extents > 0 || ri->rt_extents > 0) &&
1267 bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
1268 if (bmdr_minsz + afork_min > lit_sz) {
1270 * The attr fork and the stub fork we need to recover
1271 * the data fork won't both fit. Zap the attr fork.
1273 xrep_dinode_zap_afork(ri, dip, mode);
1274 afork_min = bmdr_minsz;
1275 } else {
1276 void *before, *after;
1278 /* Otherwise, just slide the attr fork up. */
1279 before = XFS_DFORK_APTR(dip);
1280 dip->di_forkoff = bmdr_minsz >> 3;
1281 after = XFS_DFORK_APTR(dip);
1282 memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
1287 * If the attr fork was zapped and we don't have enough space for the
1288 * recovery fork, move the attr fork down.
1290 if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
1291 xfs_dfork_attr_extents(dip) == 0 &&
1292 ri->attr_extents > 0 &&
1293 bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
1294 if (dip->di_format == XFS_DINODE_FMT_BTREE) {
1296 * If the data fork is in btree format then we can't
1297 * adjust forkoff because that runs the risk of
1298 * violating the extents/btree format transition rules.
1300 } else if (bmdr_minsz + dfork_min > lit_sz) {
1302 * If we can't move the attr fork, too bad, we lose the
1303 * attr fork and leak its blocks.
1305 xrep_dinode_zap_afork(ri, dip, mode);
1306 } else {
1308 * Otherwise, just slide the attr fork down. The attr
1309 * fork is empty, so we don't have any old contents to
1310 * move here.
1312 dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
1318 * Zap the data/attr forks if we spot anything that isn't going to pass the
1319 * ifork verifiers or the ifork formatters, because we need to get the inode
1320 * into good enough shape that the higher level repair functions can run.
1322 STATIC void
1323 xrep_dinode_zap_forks(
1324 struct xrep_inode *ri,
1325 struct xfs_dinode *dip)
1327 struct xfs_scrub *sc = ri->sc;
1328 xfs_extnum_t data_extents;
1329 xfs_extnum_t attr_extents;
1330 xfs_filblks_t nblocks;
1331 uint16_t mode;
1332 bool zap_datafork = false;
1333 bool zap_attrfork = ri->zap_acls;
1335 trace_xrep_dinode_zap_forks(sc, dip);
1337 mode = be16_to_cpu(dip->di_mode);
1339 data_extents = xfs_dfork_data_extents(dip);
1340 attr_extents = xfs_dfork_attr_extents(dip);
1341 nblocks = be64_to_cpu(dip->di_nblocks);
1343 /* Inode counters don't make sense? */
1344 if (data_extents > nblocks)
1345 zap_datafork = true;
1346 if (attr_extents > nblocks)
1347 zap_attrfork = true;
1348 if (data_extents + attr_extents > nblocks)
1349 zap_datafork = zap_attrfork = true;
1351 if (!zap_datafork)
1352 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1353 if (!zap_attrfork)
1354 zap_attrfork = xrep_dinode_check_afork(sc, dip);
1356 /* Zap whatever's bad. */
1357 if (zap_attrfork)
1358 xrep_dinode_zap_afork(ri, dip, mode);
1359 if (zap_datafork)
1360 xrep_dinode_zap_dfork(ri, dip, mode);
1361 xrep_dinode_ensure_forkoff(ri, dip, mode);
1364 * Zero di_nblocks if we don't have any extents at all to satisfy the
1365 * buffer verifier.
1367 data_extents = xfs_dfork_data_extents(dip);
1368 attr_extents = xfs_dfork_attr_extents(dip);
1369 if (data_extents + attr_extents == 0)
1370 dip->di_nblocks = 0;
1373 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
1374 STATIC int
1375 xrep_dinode_core(
1376 struct xrep_inode *ri)
1378 struct xfs_scrub *sc = ri->sc;
1379 struct xfs_buf *bp;
1380 struct xfs_dinode *dip;
1381 xfs_ino_t ino = sc->sm->sm_ino;
1382 int error;
1383 int iget_error;
1385 /* Figure out what this inode had mapped in both forks. */
1386 error = xrep_dinode_count_rmaps(ri);
1387 if (error)
1388 return error;
1390 /* Read the inode cluster buffer. */
1391 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
1392 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
1393 NULL);
1394 if (error)
1395 return error;
1397 /* Make sure we can pass the inode buffer verifier. */
1398 xrep_dinode_buf(sc, bp);
1399 bp->b_ops = &xfs_inode_buf_ops;
1401 /* Fix everything the verifier will complain about. */
1402 dip = xfs_buf_offset(bp, ri->imap.im_boffset);
1403 xrep_dinode_header(sc, dip);
1404 iget_error = xrep_dinode_mode(ri, dip);
1405 if (iget_error)
1406 goto write;
1407 xrep_dinode_nlinks(dip);
1408 xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
1409 xrep_dinode_size(ri, dip);
1410 xrep_dinode_extsize_hints(sc, dip);
1411 xrep_dinode_zap_forks(ri, dip);
1413 write:
1414 /* Write out the inode. */
1415 trace_xrep_dinode_fixed(sc, dip);
1416 xfs_dinode_calc_crc(sc->mp, dip);
1417 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
1418 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
1419 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
1422 * In theory, we've fixed the ondisk inode record enough that we should
1423 * be able to load the inode into the cache. Try to iget that inode
1424 * now while we hold the AGI and the inode cluster buffer and take the
1425 * IOLOCK so that we can continue with repairs without anyone else
1426 * accessing the inode. If iget fails, we still need to commit the
1427 * changes.
1429 if (!iget_error)
1430 iget_error = xchk_iget(sc, ino, &sc->ip);
1431 if (!iget_error)
1432 xchk_ilock(sc, XFS_IOLOCK_EXCL);
1435 * Commit the inode cluster buffer updates and drop the AGI buffer that
1436 * we've been holding since scrub setup. From here on out, repairs
1437 * deal only with the cached inode.
1439 error = xrep_trans_commit(sc);
1440 if (error)
1441 return error;
1443 if (iget_error)
1444 return iget_error;
1446 error = xchk_trans_alloc(sc, 0);
1447 if (error)
1448 return error;
1450 error = xrep_ino_dqattach(sc);
1451 if (error)
1452 return error;
1454 xchk_ilock(sc, XFS_ILOCK_EXCL);
1455 if (ri->ino_sick_mask)
1456 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
1457 return 0;
1460 /* Fix everything xfs_dinode_verify cares about. */
1461 STATIC int
1462 xrep_dinode_problems(
1463 struct xrep_inode *ri)
1465 struct xfs_scrub *sc = ri->sc;
1466 int error;
1468 error = xrep_dinode_core(ri);
1469 if (error)
1470 return error;
1472 /* We had to fix a totally busted inode, schedule quotacheck. */
1473 if (XFS_IS_UQUOTA_ON(sc->mp))
1474 xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1475 if (XFS_IS_GQUOTA_ON(sc->mp))
1476 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1477 if (XFS_IS_PQUOTA_ON(sc->mp))
1478 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1480 return 0;
1484 * Fix problems that the verifiers don't care about. In general these are
1485 * errors that don't cause problems elsewhere in the kernel that we can easily
1486 * detect, so we don't check them all that rigorously.
1489 /* Make sure block and extent counts are ok. */
1490 STATIC int
1491 xrep_inode_blockcounts(
1492 struct xfs_scrub *sc)
1494 struct xfs_ifork *ifp;
1495 xfs_filblks_t count;
1496 xfs_filblks_t acount;
1497 xfs_extnum_t nextents;
1498 int error;
1500 trace_xrep_inode_blockcounts(sc);
1502 /* Set data fork counters from the data fork mappings. */
1503 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1504 &nextents, &count);
1505 if (error)
1506 return error;
1507 if (xfs_is_reflink_inode(sc->ip)) {
1509 * data fork blockcount can exceed physical storage if a user
1510 * reflinks the same block over and over again.
1513 } else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1514 if (count >= sc->mp->m_sb.sb_rblocks)
1515 return -EFSCORRUPTED;
1516 } else {
1517 if (count >= sc->mp->m_sb.sb_dblocks)
1518 return -EFSCORRUPTED;
1520 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1521 if (error)
1522 return error;
1523 sc->ip->i_df.if_nextents = nextents;
1525 /* Set attr fork counters from the attr fork mappings. */
1526 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1527 if (ifp) {
1528 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1529 &nextents, &acount);
1530 if (error)
1531 return error;
1532 if (count >= sc->mp->m_sb.sb_dblocks)
1533 return -EFSCORRUPTED;
1534 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1535 nextents);
1536 if (error)
1537 return error;
1538 ifp->if_nextents = nextents;
1539 } else {
1540 acount = 0;
1543 sc->ip->i_nblocks = count + acount;
1544 return 0;
1547 /* Check for invalid uid/gid/prid. */
1548 STATIC void
1549 xrep_inode_ids(
1550 struct xfs_scrub *sc)
1552 bool dirty = false;
1554 trace_xrep_inode_ids(sc);
1556 if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1557 i_uid_write(VFS_I(sc->ip), 0);
1558 dirty = true;
1559 if (XFS_IS_UQUOTA_ON(sc->mp))
1560 xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1563 if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1564 i_gid_write(VFS_I(sc->ip), 0);
1565 dirty = true;
1566 if (XFS_IS_GQUOTA_ON(sc->mp))
1567 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1570 if (sc->ip->i_projid == -1U) {
1571 sc->ip->i_projid = 0;
1572 dirty = true;
1573 if (XFS_IS_PQUOTA_ON(sc->mp))
1574 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1577 /* strip setuid/setgid if we touched any of the ids */
1578 if (dirty)
1579 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
1582 static inline void
1583 xrep_clamp_timestamp(
1584 struct xfs_inode *ip,
1585 struct timespec64 *ts)
1587 ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
1588 *ts = timestamp_truncate(*ts, VFS_I(ip));
1591 /* Nanosecond counters can't have more than 1 billion. */
1592 STATIC void
1593 xrep_inode_timestamps(
1594 struct xfs_inode *ip)
1596 struct timespec64 tstamp;
1597 struct inode *inode = VFS_I(ip);
1599 tstamp = inode_get_atime(inode);
1600 xrep_clamp_timestamp(ip, &tstamp);
1601 inode_set_atime_to_ts(inode, tstamp);
1603 tstamp = inode_get_mtime(inode);
1604 xrep_clamp_timestamp(ip, &tstamp);
1605 inode_set_mtime_to_ts(inode, tstamp);
1607 tstamp = inode_get_ctime(inode);
1608 xrep_clamp_timestamp(ip, &tstamp);
1609 inode_set_ctime_to_ts(inode, tstamp);
1611 xrep_clamp_timestamp(ip, &ip->i_crtime);
1614 /* Fix inode flags that don't make sense together. */
1615 STATIC void
1616 xrep_inode_flags(
1617 struct xfs_scrub *sc)
1619 uint16_t mode;
1621 trace_xrep_inode_flags(sc);
1623 mode = VFS_I(sc->ip)->i_mode;
1625 /* Clear junk flags */
1626 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1627 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1629 /* NEWRTBM only applies to realtime bitmaps */
1630 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1631 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
1632 else
1633 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1635 /* These only make sense for directories. */
1636 if (!S_ISDIR(mode))
1637 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
1638 XFS_DIFLAG_EXTSZINHERIT |
1639 XFS_DIFLAG_PROJINHERIT |
1640 XFS_DIFLAG_NOSYMLINKS);
1642 /* These only make sense for files. */
1643 if (!S_ISREG(mode))
1644 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
1645 XFS_DIFLAG_EXTSIZE);
1647 /* These only make sense for non-rt files. */
1648 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1649 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1651 /* Immutable and append only? Drop the append. */
1652 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1653 (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1654 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1656 /* Clear junk flags. */
1657 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1658 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1660 /* No reflink flag unless we support it and it's a file. */
1661 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
1662 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1664 /* DAX only applies to files and dirs. */
1665 if (!(S_ISREG(mode) || S_ISDIR(mode)))
1666 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1668 /* No reflink files on the realtime device. */
1669 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1670 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1674 * Fix size problems with block/node format directories. If we fail to find
1675 * the extent list, just bail out and let the bmapbtd repair functions clean
1676 * up that mess.
1678 STATIC void
1679 xrep_inode_blockdir_size(
1680 struct xfs_scrub *sc)
1682 struct xfs_iext_cursor icur;
1683 struct xfs_bmbt_irec got;
1684 struct xfs_ifork *ifp;
1685 xfs_fileoff_t off;
1686 int error;
1688 trace_xrep_inode_blockdir_size(sc);
1690 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1691 if (error)
1692 return;
1694 /* Find the last block before 32G; this is the dir size. */
1695 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1696 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1697 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1698 /* zero-extents directory? */
1699 return;
1702 off = got.br_startoff + got.br_blockcount;
1703 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1704 XFS_FSB_TO_B(sc->mp, off));
1707 /* Fix size problems with short format directories. */
1708 STATIC void
1709 xrep_inode_sfdir_size(
1710 struct xfs_scrub *sc)
1712 struct xfs_ifork *ifp;
1714 trace_xrep_inode_sfdir_size(sc);
1716 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1717 sc->ip->i_disk_size = ifp->if_bytes;
1721 * Fix any irregularities in a directory inode's size now that we can iterate
1722 * extent maps and access other regular inode data.
1724 STATIC void
1725 xrep_inode_dir_size(
1726 struct xfs_scrub *sc)
1728 trace_xrep_inode_dir_size(sc);
1730 switch (sc->ip->i_df.if_format) {
1731 case XFS_DINODE_FMT_EXTENTS:
1732 case XFS_DINODE_FMT_BTREE:
1733 xrep_inode_blockdir_size(sc);
1734 break;
1735 case XFS_DINODE_FMT_LOCAL:
1736 xrep_inode_sfdir_size(sc);
1737 break;
1741 /* Fix extent size hint problems. */
1742 STATIC void
1743 xrep_inode_extsize(
1744 struct xfs_scrub *sc)
1746 /* Fix misaligned extent size hints on a directory. */
1747 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1748 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1749 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
1750 sc->ip->i_extsize = 0;
1751 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1755 /* Ensure this file has an attr fork if it needs to hold a parent pointer. */
1756 STATIC int
1757 xrep_inode_pptr(
1758 struct xfs_scrub *sc)
1760 struct xfs_mount *mp = sc->mp;
1761 struct xfs_inode *ip = sc->ip;
1762 struct inode *inode = VFS_I(ip);
1764 if (!xfs_has_parent(mp))
1765 return 0;
1768 * Unlinked inodes that cannot be added to the directory tree will not
1769 * have a parent pointer.
1771 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
1772 return 0;
1774 /* Children of the superblock do not have parent pointers. */
1775 if (xchk_inode_is_sb_rooted(ip))
1776 return 0;
1778 /* Inode already has an attr fork; no further work possible here. */
1779 if (xfs_inode_has_attr_fork(ip))
1780 return 0;
1782 return xfs_bmap_add_attrfork(sc->tp, ip,
1783 sizeof(struct xfs_attr_sf_hdr), true);
1786 /* Fix any irregularities in an inode that the verifiers don't catch. */
1787 STATIC int
1788 xrep_inode_problems(
1789 struct xfs_scrub *sc)
1791 int error;
1793 error = xrep_inode_blockcounts(sc);
1794 if (error)
1795 return error;
1796 error = xrep_inode_pptr(sc);
1797 if (error)
1798 return error;
1799 xrep_inode_timestamps(sc->ip);
1800 xrep_inode_flags(sc);
1801 xrep_inode_ids(sc);
1803 * We can now do a better job fixing the size of a directory now that
1804 * we can scan the data fork extents than we could in xrep_dinode_size.
1806 if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1807 xrep_inode_dir_size(sc);
1808 xrep_inode_extsize(sc);
1810 trace_xrep_inode_fixed(sc);
1811 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1812 return xrep_roll_trans(sc);
1816 * Make sure this inode's unlinked list pointers are consistent with its
1817 * link count.
1819 STATIC int
1820 xrep_inode_unlinked(
1821 struct xfs_scrub *sc)
1823 unsigned int nlink = VFS_I(sc->ip)->i_nlink;
1824 int error;
1827 * If this inode is linked from the directory tree and on the unlinked
1828 * list, remove it from the unlinked list.
1830 if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) {
1831 struct xfs_perag *pag;
1832 int error;
1834 pag = xfs_perag_get(sc->mp,
1835 XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
1836 error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
1837 xfs_perag_put(pag);
1838 if (error)
1839 return error;
1843 * If this inode is not linked from the directory tree yet not on the
1844 * unlinked list, put it on the unlinked list.
1846 if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) {
1847 error = xfs_iunlink(sc->tp, sc->ip);
1848 if (error)
1849 return error;
1852 return 0;
1855 /* Repair an inode's fields. */
1857 xrep_inode(
1858 struct xfs_scrub *sc)
1860 int error = 0;
1863 * No inode? That means we failed the _iget verifiers. Repair all
1864 * the things that the inode verifiers care about, then retry _iget.
1866 if (!sc->ip) {
1867 struct xrep_inode *ri = sc->buf;
1869 ASSERT(ri != NULL);
1871 error = xrep_dinode_problems(ri);
1872 if (error == -EBUSY) {
1874 * Directory scan to recover inode mode encountered a
1875 * busy inode, so we did not continue repairing things.
1877 return 0;
1879 if (error)
1880 return error;
1882 /* By this point we had better have a working incore inode. */
1883 if (!sc->ip)
1884 return -EFSCORRUPTED;
1887 xfs_trans_ijoin(sc->tp, sc->ip, 0);
1889 /* If we found corruption of any kind, try to fix it. */
1890 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
1891 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1892 error = xrep_inode_problems(sc);
1893 if (error)
1894 return error;
1897 /* See if we can clear the reflink flag. */
1898 if (xfs_is_reflink_inode(sc->ip)) {
1899 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1900 if (error)
1901 return error;
1904 /* Reconnect incore unlinked list */
1905 error = xrep_inode_unlinked(sc);
1906 if (error)
1907 return error;
1909 return xrep_defer_finish(sc);