fs/xfs/scrub/scrub.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <djwong@kernel.org>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_log_format.h"
  13 #include "xfs_trans.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_quota.h"
  16 #include "xfs_qm.h"
  17 #include "xfs_scrub.h"
  18 #include "xfs_buf_mem.h"
  19 #include "xfs_rmap.h"
  20 #include "xfs_exchrange.h"
  21 #include "xfs_exchmaps.h"
  22 #include "xfs_dir2.h"
  23 #include "xfs_parent.h"
  24 #include "xfs_icache.h"
  25 #include "scrub/scrub.h"
  26 #include "scrub/common.h"
  27 #include "scrub/trace.h"
  28 #include "scrub/repair.h"
  29 #include "scrub/health.h"
  30 #include "scrub/stats.h"
  31 #include "scrub/xfile.h"
  32 #include "scrub/tempfile.h"
  33 #include "scrub/orphanage.h"
  34
  35 /*
  36  * Online Scrub and Repair
  37  *
  38  * Traditionally, XFS (the kernel driver) did not know how to check or
  39  * repair on-disk data structures.  That task was left to the xfs_check
  40  * and xfs_repair tools, both of which require taking the filesystem
  41  * offline for a thorough but time consuming examination.  Online
  42  * scrub & repair, on the other hand, enables us to check the metadata
  43  * for obvious errors while carefully stepping around the filesystem's
  44  * ongoing operations, locking rules, etc.
  45  *
  46  * Given that most XFS metadata consist of records stored in a btree,
  47  * most of the checking functions iterate the btree blocks themselves
  48  * looking for irregularities.  When a record block is encountered, each
  49  * record can be checked for obviously bad values.  Record values can
  50  * also be cross-referenced against other btrees to look for potential
  51  * misunderstandings between pieces of metadata.
  52  *
  53  * It is expected that the checkers responsible for per-AG metadata
  54  * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
  55  * metadata structure, and perform any relevant cross-referencing before
  56  * unlocking the AG and returning the results to userspace.  These
  57  * scrubbers must not keep an AG locked for too long to avoid tying up
  58  * the block and inode allocators.
  59  *
  60  * Block maps and b-trees rooted in an inode present a special challenge
  61  * because they can involve extents from any AG.  The general scrubber
  62  * structure of lock -> check -> xref -> unlock still holds, but AG
  63  * locking order rules /must/ be obeyed to avoid deadlocks.  The
  64  * ordering rule, of course, is that we must lock in increasing AG
  65  * order.  Helper functions are provided to track which AG headers we've
  66  * already locked.  If we detect an imminent locking order violation, we
  67  * can signal a potential deadlock, in which case the scrubber can jump
  68  * out to the top level, lock all the AGs in order, and retry the scrub.
  69  *
  70  * For file data (directories, extended attributes, symlinks) scrub, we
  71  * can simply lock the inode and walk the data.  For btree data
  72  * (directories and attributes) we follow the same btree-scrubbing
  73  * strategy outlined previously to check the records.
  74  *
  75  * We use a bit of trickery with transactions to avoid buffer deadlocks
  76  * if there is a cycle in the metadata.  The basic problem is that
  77  * travelling down a btree involves locking the current buffer at each
  78  * tree level.  If a pointer should somehow point back to a buffer that
  79  * we've already examined, we will deadlock due to the second buffer
  80  * locking attempt.  Note however that grabbing a buffer in transaction
  81  * context links the locked buffer to the transaction.  If we try to
  82  * re-grab the buffer in the context of the same transaction, we avoid
  83  * the second lock attempt and continue.  Between the verifier and the
  84  * scrubber, something will notice that something is amiss and report
  85  * the corruption.  Therefore, each scrubber will allocate an empty
  86  * transaction, attach buffers to it, and cancel the transaction at the
  87  * end of the scrub run.  Cancelling a non-dirty transaction simply
  88  * unlocks the buffers.
  89  *
  90  * There are four pieces of data that scrub can communicate to
  91  * userspace.  The first is the error code (errno), which can be used to
  92  * communicate operational errors in performing the scrub.  There are
  93  * also three flags that can be set in the scrub context.  If the data
  94  * structure itself is corrupt, the CORRUPT flag will be set.  If
  95  * the metadata is correct but otherwise suboptimal, the PREEN flag
  96  * will be set.
  97  *
  98  * We perform secondary validation of filesystem metadata by
  99  * cross-referencing every record with all other available metadata.
 100  * For example, for block mapping extents, we verify that there are no
 101  * records in the free space and inode btrees corresponding to that
 102  * space extent and that there is a corresponding entry in the reverse
 103  * mapping btree.  Inconsistent metadata is noted by setting the
 104  * XCORRUPT flag; btree query function errors are noted by setting the
 105  * XFAIL flag and deleting the cursor to prevent further attempts to
 106  * cross-reference with a defective btree.
 107  *
 108  * If a piece of metadata proves corrupt or suboptimal, the userspace
 109  * program can ask the kernel to apply some tender loving care (TLC) to
 110  * the metadata object by setting the REPAIR flag and re-calling the
 111  * scrub ioctl.  "Corruption" is defined by metadata violating the
 112  * on-disk specification; operations cannot continue if the violation is
 113  * left untreated.  It is possible for XFS to continue if an object is
 114  * "suboptimal", however performance may be degraded.  Repairs are
 115  * usually performed by rebuilding the metadata entirely out of
 116  * redundant metadata.  Optimizing, on the other hand, can sometimes be
 117  * done without rebuilding entire structures.
 118  *
 119  * Generally speaking, the repair code has the following code structure:
 120  * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
 121  * The first check helps us figure out if we need to rebuild or simply
 122  * optimize the structure so that the rebuild knows what to do.  The
 123  * second check evaluates the completeness of the repair; that is what
 124  * is reported to userspace.
 125  *
 126  * A quick note on symbol prefixes:
 127  * - "xfs_" are general XFS symbols.
 128  * - "xchk_" are symbols related to metadata checking.
 129  * - "xrep_" are symbols related to metadata repair.
 130  * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
 131  */
 132
 133 /*
 134  * Scrub probe -- userspace uses this to probe if we're willing to scrub
 135  * or repair a given mountpoint.  This will be used by xfs_scrub to
 136  * probe the kernel's abilities to scrub (and repair) the metadata.  We
 137  * do this by validating the ioctl inputs from userspace, preparing the
 138  * filesystem for a scrub (or a repair) operation, and immediately
 139  * returning to userspace.  Userspace can use the returned errno and
 140  * structure state to decide (in broad terms) if scrub/repair are
 141  * supported by the running kernel.
 142  */
 143 static int
 144 xchk_probe(
 145         struct xfs_scrub        *sc)
 146 {
 147         int                     error = 0;
 148
 149         if (xchk_should_terminate(sc, &error))
 150                 return error;
 151
 152         return 0;
 153 }
 154
 155 /* Scrub setup and teardown */
 156
 157 static inline void
 158 xchk_fsgates_disable(
 159         struct xfs_scrub        *sc)
 160 {
 161         if (!(sc->flags & XCHK_FSGATES_ALL))
 162                 return;
 163
 164         trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
 165
 166         if (sc->flags & XCHK_FSGATES_DRAIN)
 167                 xfs_drain_wait_disable();
 168
 169         if (sc->flags & XCHK_FSGATES_QUOTA)
 170                 xfs_dqtrx_hook_disable();
 171
 172         if (sc->flags & XCHK_FSGATES_DIRENTS)
 173                 xfs_dir_hook_disable();
 174
 175         if (sc->flags & XCHK_FSGATES_RMAP)
 176                 xfs_rmap_hook_disable();
 177
 178         sc->flags &= ~XCHK_FSGATES_ALL;
 179 }
 180
 181 /* Free the resources associated with a scrub subtype. */
 182 void
 183 xchk_scrub_free_subord(
 184         struct xfs_scrub_subord *sub)
 185 {
 186         struct xfs_scrub        *sc = sub->parent_sc;
 187
 188         ASSERT(sc->ip == sub->sc.ip);
 189         ASSERT(sc->orphanage == sub->sc.orphanage);
 190         ASSERT(sc->tempip == sub->sc.tempip);
 191
 192         sc->sm->sm_type = sub->old_smtype;
 193         sc->sm->sm_flags = sub->old_smflags |
 194                                 (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
 195         sc->tp = sub->sc.tp;
 196
 197         if (sub->sc.buf) {
 198                 if (sub->sc.buf_cleanup)
 199                         sub->sc.buf_cleanup(sub->sc.buf);
 200                 kvfree(sub->sc.buf);
 201         }
 202         if (sub->sc.xmbtp)
 203                 xmbuf_free(sub->sc.xmbtp);
 204         if (sub->sc.xfile)
 205                 xfile_destroy(sub->sc.xfile);
 206
 207         sc->ilock_flags = sub->sc.ilock_flags;
 208         sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
 209         sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
 210
 211         kfree(sub);
 212 }
 213
 214 /* Free all the resources and finish the transactions. */
 215 STATIC int
 216 xchk_teardown(
 217         struct xfs_scrub        *sc,
 218         int                     error)
 219 {
 220         xchk_ag_free(sc, &sc->sa);
 221         if (sc->tp) {
 222                 if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 223                         error = xfs_trans_commit(sc->tp);
 224                 else
 225                         xfs_trans_cancel(sc->tp);
 226                 sc->tp = NULL;
 227         }
 228         if (sc->sr.rtg)
 229                 xchk_rtgroup_free(sc, &sc->sr);
 230         if (sc->ip) {
 231                 if (sc->ilock_flags)
 232                         xchk_iunlock(sc, sc->ilock_flags);
 233                 xchk_irele(sc, sc->ip);
 234                 sc->ip = NULL;
 235         }
 236         if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
 237                 sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
 238                 mnt_drop_write_file(sc->file);
 239         }
 240         if (sc->xmbtp) {
 241                 xmbuf_free(sc->xmbtp);
 242                 sc->xmbtp = NULL;
 243         }
 244         if (sc->xfile) {
 245                 xfile_destroy(sc->xfile);
 246                 sc->xfile = NULL;
 247         }
 248         if (sc->buf) {
 249                 if (sc->buf_cleanup)
 250                         sc->buf_cleanup(sc->buf);
 251                 kvfree(sc->buf);
 252                 sc->buf_cleanup = NULL;
 253                 sc->buf = NULL;
 254         }
 255
 256         xrep_tempfile_rele(sc);
 257         xrep_orphanage_rele(sc);
 258         xchk_fsgates_disable(sc);
 259         return error;
 260 }
 261
 262 /* Scrubbing dispatch. */
 263
 264 static const struct xchk_meta_ops meta_scrub_ops[] = {
 265         [XFS_SCRUB_TYPE_PROBE] = {      /* ioctl presence test */
 266                 .type   = ST_NONE,
 267                 .setup  = xchk_setup_fs,
 268                 .scrub  = xchk_probe,
 269                 .repair = xrep_probe,
 270         },
 271         [XFS_SCRUB_TYPE_SB] = {         /* superblock */
 272                 .type   = ST_PERAG,
 273                 .setup  = xchk_setup_agheader,
 274                 .scrub  = xchk_superblock,
 275                 .repair = xrep_superblock,
 276         },
 277         [XFS_SCRUB_TYPE_AGF] = {        /* agf */
 278                 .type   = ST_PERAG,
 279                 .setup  = xchk_setup_agheader,
 280                 .scrub  = xchk_agf,
 281                 .repair = xrep_agf,
 282         },
 283         [XFS_SCRUB_TYPE_AGFL]= {        /* agfl */
 284                 .type   = ST_PERAG,
 285                 .setup  = xchk_setup_agheader,
 286                 .scrub  = xchk_agfl,
 287                 .repair = xrep_agfl,
 288         },
 289         [XFS_SCRUB_TYPE_AGI] = {        /* agi */
 290                 .type   = ST_PERAG,
 291                 .setup  = xchk_setup_agheader,
 292                 .scrub  = xchk_agi,
 293                 .repair = xrep_agi,
 294         },
 295         [XFS_SCRUB_TYPE_BNOBT] = {      /* bnobt */
 296                 .type   = ST_PERAG,
 297                 .setup  = xchk_setup_ag_allocbt,
 298                 .scrub  = xchk_allocbt,
 299                 .repair = xrep_allocbt,
 300                 .repair_eval = xrep_revalidate_allocbt,
 301         },
 302         [XFS_SCRUB_TYPE_CNTBT] = {      /* cntbt */
 303                 .type   = ST_PERAG,
 304                 .setup  = xchk_setup_ag_allocbt,
 305                 .scrub  = xchk_allocbt,
 306                 .repair = xrep_allocbt,
 307                 .repair_eval = xrep_revalidate_allocbt,
 308         },
 309         [XFS_SCRUB_TYPE_INOBT] = {      /* inobt */
 310                 .type   = ST_PERAG,
 311                 .setup  = xchk_setup_ag_iallocbt,
 312                 .scrub  = xchk_iallocbt,
 313                 .repair = xrep_iallocbt,
 314                 .repair_eval = xrep_revalidate_iallocbt,
 315         },
 316         [XFS_SCRUB_TYPE_FINOBT] = {     /* finobt */
 317                 .type   = ST_PERAG,
 318                 .setup  = xchk_setup_ag_iallocbt,
 319                 .scrub  = xchk_iallocbt,
 320                 .has    = xfs_has_finobt,
 321                 .repair = xrep_iallocbt,
 322                 .repair_eval = xrep_revalidate_iallocbt,
 323         },
 324         [XFS_SCRUB_TYPE_RMAPBT] = {     /* rmapbt */
 325                 .type   = ST_PERAG,
 326                 .setup  = xchk_setup_ag_rmapbt,
 327                 .scrub  = xchk_rmapbt,
 328                 .has    = xfs_has_rmapbt,
 329                 .repair = xrep_rmapbt,
 330         },
 331         [XFS_SCRUB_TYPE_REFCNTBT] = {   /* refcountbt */
 332                 .type   = ST_PERAG,
 333                 .setup  = xchk_setup_ag_refcountbt,
 334                 .scrub  = xchk_refcountbt,
 335                 .has    = xfs_has_reflink,
 336                 .repair = xrep_refcountbt,
 337         },
 338         [XFS_SCRUB_TYPE_INODE] = {      /* inode record */
 339                 .type   = ST_INODE,
 340                 .setup  = xchk_setup_inode,
 341                 .scrub  = xchk_inode,
 342                 .repair = xrep_inode,
 343         },
 344         [XFS_SCRUB_TYPE_BMBTD] = {      /* inode data fork */
 345                 .type   = ST_INODE,
 346                 .setup  = xchk_setup_inode_bmap,
 347                 .scrub  = xchk_bmap_data,
 348                 .repair = xrep_bmap_data,
 349         },
 350         [XFS_SCRUB_TYPE_BMBTA] = {      /* inode attr fork */
 351                 .type   = ST_INODE,
 352                 .setup  = xchk_setup_inode_bmap,
 353                 .scrub  = xchk_bmap_attr,
 354                 .repair = xrep_bmap_attr,
 355         },
 356         [XFS_SCRUB_TYPE_BMBTC] = {      /* inode CoW fork */
 357                 .type   = ST_INODE,
 358                 .setup  = xchk_setup_inode_bmap,
 359                 .scrub  = xchk_bmap_cow,
 360                 .repair = xrep_bmap_cow,
 361         },
 362         [XFS_SCRUB_TYPE_DIR] = {        /* directory */
 363                 .type   = ST_INODE,
 364                 .setup  = xchk_setup_directory,
 365                 .scrub  = xchk_directory,
 366                 .repair = xrep_directory,
 367         },
 368         [XFS_SCRUB_TYPE_XATTR] = {      /* extended attributes */
 369                 .type   = ST_INODE,
 370                 .setup  = xchk_setup_xattr,
 371                 .scrub  = xchk_xattr,
 372                 .repair = xrep_xattr,
 373         },
 374         [XFS_SCRUB_TYPE_SYMLINK] = {    /* symbolic link */
 375                 .type   = ST_INODE,
 376                 .setup  = xchk_setup_symlink,
 377                 .scrub  = xchk_symlink,
 378                 .repair = xrep_symlink,
 379         },
 380         [XFS_SCRUB_TYPE_PARENT] = {     /* parent pointers */
 381                 .type   = ST_INODE,
 382                 .setup  = xchk_setup_parent,
 383                 .scrub  = xchk_parent,
 384                 .repair = xrep_parent,
 385         },
 386         [XFS_SCRUB_TYPE_RTBITMAP] = {   /* realtime bitmap */
 387                 .type   = ST_RTGROUP,
 388                 .setup  = xchk_setup_rtbitmap,
 389                 .scrub  = xchk_rtbitmap,
 390                 .repair = xrep_rtbitmap,
 391         },
 392         [XFS_SCRUB_TYPE_RTSUM] = {      /* realtime summary */
 393                 .type   = ST_RTGROUP,
 394                 .setup  = xchk_setup_rtsummary,
 395                 .scrub  = xchk_rtsummary,
 396                 .repair = xrep_rtsummary,
 397         },
 398         [XFS_SCRUB_TYPE_UQUOTA] = {     /* user quota */
 399                 .type   = ST_FS,
 400                 .setup  = xchk_setup_quota,
 401                 .scrub  = xchk_quota,
 402                 .repair = xrep_quota,
 403         },
 404         [XFS_SCRUB_TYPE_GQUOTA] = {     /* group quota */
 405                 .type   = ST_FS,
 406                 .setup  = xchk_setup_quota,
 407                 .scrub  = xchk_quota,
 408                 .repair = xrep_quota,
 409         },
 410         [XFS_SCRUB_TYPE_PQUOTA] = {     /* project quota */
 411                 .type   = ST_FS,
 412                 .setup  = xchk_setup_quota,
 413                 .scrub  = xchk_quota,
 414                 .repair = xrep_quota,
 415         },
 416         [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
 417                 .type   = ST_FS,
 418                 .setup  = xchk_setup_fscounters,
 419                 .scrub  = xchk_fscounters,
 420                 .repair = xrep_fscounters,
 421         },
 422         [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
 423                 .type   = ST_FS,
 424                 .setup  = xchk_setup_quotacheck,
 425                 .scrub  = xchk_quotacheck,
 426                 .repair = xrep_quotacheck,
 427         },
 428         [XFS_SCRUB_TYPE_NLINKS] = {     /* inode link counts */
 429                 .type   = ST_FS,
 430                 .setup  = xchk_setup_nlinks,
 431                 .scrub  = xchk_nlinks,
 432                 .repair = xrep_nlinks,
 433         },
 434         [XFS_SCRUB_TYPE_HEALTHY] = {    /* fs healthy; clean all reminders */
 435                 .type   = ST_FS,
 436                 .setup  = xchk_setup_fs,
 437                 .scrub  = xchk_health_record,
 438                 .repair = xrep_notsupported,
 439         },
 440         [XFS_SCRUB_TYPE_DIRTREE] = {    /* directory tree structure */
 441                 .type   = ST_INODE,
 442                 .setup  = xchk_setup_dirtree,
 443                 .scrub  = xchk_dirtree,
 444                 .has    = xfs_has_parent,
 445                 .repair = xrep_dirtree,
 446         },
 447         [XFS_SCRUB_TYPE_METAPATH] = {   /* metadata directory tree path */
 448                 .type   = ST_GENERIC,
 449                 .setup  = xchk_setup_metapath,
 450                 .scrub  = xchk_metapath,
 451                 .has    = xfs_has_metadir,
 452                 .repair = xrep_metapath,
 453         },
 454         [XFS_SCRUB_TYPE_RGSUPER] = {    /* realtime group superblock */
 455                 .type   = ST_RTGROUP,
 456                 .setup  = xchk_setup_rgsuperblock,
 457                 .scrub  = xchk_rgsuperblock,
 458                 .has    = xfs_has_rtsb,
 459                 .repair = xrep_rgsuperblock,
 460         },
 461 };
 462
 463 static int
 464 xchk_validate_inputs(
 465         struct xfs_mount                *mp,
 466         struct xfs_scrub_metadata       *sm)
 467 {
 468         int                             error;
 469         const struct xchk_meta_ops      *ops;
 470
 471         error = -EINVAL;
 472         /* Check our inputs. */
 473         sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
 474         if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
 475                 goto out;
 476         /* sm_reserved[] must be zero */
 477         if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
 478                 goto out;
 479
 480         error = -ENOENT;
 481         /* Do we know about this type of metadata? */
 482         if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
 483                 goto out;
 484         ops = &meta_scrub_ops[sm->sm_type];
 485         if (ops->setup == NULL || ops->scrub == NULL)
 486                 goto out;
 487         /* Does this fs even support this type of metadata? */
 488         if (ops->has && !ops->has(mp))
 489                 goto out;
 490
 491         error = -EINVAL;
 492         /* restricting fields must be appropriate for type */
 493         switch (ops->type) {
 494         case ST_NONE:
 495         case ST_FS:
 496                 if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
 497                         goto out;
 498                 break;
 499         case ST_PERAG:
 500                 if (sm->sm_ino || sm->sm_gen ||
 501                     sm->sm_agno >= mp->m_sb.sb_agcount)
 502                         goto out;
 503                 break;
 504         case ST_INODE:
 505                 if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
 506                         goto out;
 507                 break;
 508         case ST_GENERIC:
 509                 break;
 510         case ST_RTGROUP:
 511                 if (sm->sm_ino || sm->sm_gen)
 512                         goto out;
 513                 if (xfs_has_rtgroups(mp)) {
 514                         /*
 515                          * On a rtgroups filesystem, there won't be an rtbitmap
 516                          * or rtsummary file for group 0 unless there's
 517                          * actually a realtime volume attached.  However, older
 518                          * xfs_scrub always calls the rtbitmap/rtsummary
 519                          * scrubbers with sm_agno==0 so transform the error
 520                          * code to ENOENT.
 521                          */
 522                         if (sm->sm_agno >= mp->m_sb.sb_rgcount) {
 523                                 if (sm->sm_agno == 0)
 524                                         error = -ENOENT;
 525                                 goto out;
 526                         }
 527                 } else {
 528                         /*
 529                          * Prior to rtgroups, the rtbitmap/rtsummary scrubbers
 530                          * accepted sm_agno==0, so we still accept that for
 531                          * scrubbing pre-rtgroups filesystems.
 532                          */
 533                         if (sm->sm_agno != 0)
 534                                 goto out;
 535                 }
 536                 break;
 537         default:
 538                 goto out;
 539         }
 540
 541         /* No rebuild without repair. */
 542         if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
 543             !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 544                 return -EINVAL;
 545
 546         /*
 547          * We only want to repair read-write v5+ filesystems.  Defer the check
 548          * for ops->repair until after our scrub confirms that we need to
 549          * perform repairs so that we avoid failing due to not supporting
 550          * repairing an object that doesn't need repairs.
 551          */
 552         if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
 553                 error = -EOPNOTSUPP;
 554                 if (!xfs_has_crc(mp))
 555                         goto out;
 556
 557                 error = -EROFS;
 558                 if (xfs_is_readonly(mp))
 559                         goto out;
 560         }
 561
 562         error = 0;
 563 out:
 564         return error;
 565 }
 566
 567 #ifdef CONFIG_XFS_ONLINE_REPAIR
 568 static inline void xchk_postmortem(struct xfs_scrub *sc)
 569 {
 570         /*
 571          * Userspace asked us to repair something, we repaired it, rescanned
 572          * it, and the rescan says it's still broken.  Scream about this in
 573          * the system logs.
 574          */
 575         if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
 576             (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
 577                                  XFS_SCRUB_OFLAG_XCORRUPT)))
 578                 xrep_failure(sc->mp);
 579 }
 580 #else
 581 static inline void xchk_postmortem(struct xfs_scrub *sc)
 582 {
 583         /*
 584          * Userspace asked us to scrub something, it's broken, and we have no
 585          * way of fixing it.  Scream in the logs.
 586          */
 587         if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
 588                                 XFS_SCRUB_OFLAG_XCORRUPT))
 589                 xfs_alert_ratelimited(sc->mp,
 590                                 "Corruption detected during scrub.");
 591 }
 592 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 593
 594 /*
 595  * Create a new scrub context from an existing one, but with a different scrub
 596  * type.
 597  */
 598 struct xfs_scrub_subord *
 599 xchk_scrub_create_subord(
 600         struct xfs_scrub        *sc,
 601         unsigned int            subtype)
 602 {
 603         struct xfs_scrub_subord *sub;
 604
 605         sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
 606         if (!sub)
 607                 return ERR_PTR(-ENOMEM);
 608
 609         sub->old_smtype = sc->sm->sm_type;
 610         sub->old_smflags = sc->sm->sm_flags;
 611         sub->parent_sc = sc;
 612         memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
 613         sub->sc.ops = &meta_scrub_ops[subtype];
 614         sub->sc.sm->sm_type = subtype;
 615         sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
 616         sub->sc.buf = NULL;
 617         sub->sc.buf_cleanup = NULL;
 618         sub->sc.xfile = NULL;
 619         sub->sc.xmbtp = NULL;
 620
 621         return sub;
 622 }
 623
 624 /* Dispatch metadata scrubbing. */
 625 STATIC int
 626 xfs_scrub_metadata(
 627         struct file                     *file,
 628         struct xfs_scrub_metadata       *sm)
 629 {
 630         struct xchk_stats_run           run = { };
 631         struct xfs_scrub                *sc;
 632         struct xfs_mount                *mp = XFS_I(file_inode(file))->i_mount;
 633         u64                             check_start;
 634         int                             error = 0;
 635
 636         BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
 637                 (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
 638
 639         trace_xchk_start(XFS_I(file_inode(file)), sm, error);
 640
 641         /* Forbidden if we are shut down or mounted norecovery. */
 642         error = -ESHUTDOWN;
 643         if (xfs_is_shutdown(mp))
 644                 goto out;
 645         error = -ENOTRECOVERABLE;
 646         if (xfs_has_norecovery(mp))
 647                 goto out;
 648
 649         error = xchk_validate_inputs(mp, sm);
 650         if (error)
 651                 goto out;
 652
 653         xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);
 654
 655         sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
 656         if (!sc) {
 657                 error = -ENOMEM;
 658                 goto out;
 659         }
 660
 661         sc->mp = mp;
 662         sc->file = file;
 663         sc->sm = sm;
 664         sc->ops = &meta_scrub_ops[sm->sm_type];
 665         sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
 666         sc->relax = INIT_XCHK_RELAX;
 667 retry_op:
 668         /*
 669          * When repairs are allowed, prevent freezing or readonly remount while
 670          * scrub is running with a real transaction.
 671          */
 672         if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
 673                 error = mnt_want_write_file(sc->file);
 674                 if (error)
 675                         goto out_sc;
 676
 677                 sc->flags |= XCHK_HAVE_FREEZE_PROT;
 678         }
 679
 680         /* Set up for the operation. */
 681         error = sc->ops->setup(sc);
 682         if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
 683                 goto try_harder;
 684         if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
 685                 goto need_drain;
 686         if (error)
 687                 goto out_teardown;
 688
 689         /* Scrub for errors. */
 690         check_start = xchk_stats_now();
 691         if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
 692                 error = sc->ops->repair_eval(sc);
 693         else
 694                 error = sc->ops->scrub(sc);
 695         run.scrub_ns += xchk_stats_elapsed_ns(check_start);
 696         if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
 697                 goto try_harder;
 698         if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
 699                 goto need_drain;
 700         if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
 701                 goto out_teardown;
 702
 703         xchk_update_health(sc);
 704
 705         if (xchk_could_repair(sc)) {
 706                 /*
 707                  * If userspace asked for a repair but it wasn't necessary,
 708                  * report that back to userspace.
 709                  */
 710                 if (!xrep_will_attempt(sc)) {
 711                         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
 712                         goto out_nofix;
 713                 }
 714
 715                 /*
 716                  * If it's broken, userspace wants us to fix it, and we haven't
 717                  * already tried to fix it, then attempt a repair.
 718                  */
 719                 error = xrep_attempt(sc, &run);
 720                 if (error == -EAGAIN) {
 721                         /*
 722                          * Either the repair function succeeded or it couldn't
 723                          * get all the resources it needs; either way, we go
 724                          * back to the beginning and call the scrub function.
 725                          */
 726                         error = xchk_teardown(sc, 0);
 727                         if (error) {
 728                                 xrep_failure(mp);
 729                                 goto out_sc;
 730                         }
 731                         goto retry_op;
 732                 }
 733         }
 734
 735 out_nofix:
 736         xchk_postmortem(sc);
 737 out_teardown:
 738         error = xchk_teardown(sc, error);
 739 out_sc:
 740         if (error != -ENOENT)
 741                 xchk_stats_merge(mp, sm, &run);
 742         kfree(sc);
 743 out:
 744         trace_xchk_done(XFS_I(file_inode(file)), sm, error);
 745         if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
 746                 sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 747                 error = 0;
 748         }
 749         return error;
 750 need_drain:
 751         error = xchk_teardown(sc, 0);
 752         if (error)
 753                 goto out_sc;
 754         sc->flags |= XCHK_NEED_DRAIN;
 755         run.retries++;
 756         goto retry_op;
 757 try_harder:
 758         /*
 759          * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
 760          * everything we hold, then set up again with preparation for
 761          * worst-case scenarios.
 762          */
 763         error = xchk_teardown(sc, 0);
 764         if (error)
 765                 goto out_sc;
 766         sc->flags |= XCHK_TRY_HARDER;
 767         run.retries++;
 768         goto retry_op;
 769 }
 770
 771 /* Scrub one aspect of one piece of metadata. */
 772 int
 773 xfs_ioc_scrub_metadata(
 774         struct file                     *file,
 775         void                            __user *arg)
 776 {
 777         struct xfs_scrub_metadata       scrub;
 778         int                             error;
 779
 780         if (!capable(CAP_SYS_ADMIN))
 781                 return -EPERM;
 782
 783         if (copy_from_user(&scrub, arg, sizeof(scrub)))
 784                 return -EFAULT;
 785
 786         error = xfs_scrub_metadata(file, &scrub);
 787         if (error)
 788                 return error;
 789
 790         if (copy_to_user(arg, &scrub, sizeof(scrub)))
 791                 return -EFAULT;
 792
 793         return 0;
 794 }
 795
 796 /* Decide if there have been any scrub failures up to this point. */
 797 static inline int
 798 xfs_scrubv_check_barrier(
 799         struct xfs_mount                *mp,
 800         const struct xfs_scrub_vec      *vectors,
 801         const struct xfs_scrub_vec      *stop_vec)
 802 {
 803         const struct xfs_scrub_vec      *v;
 804         __u32                           failmask;
 805
 806         failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
 807
 808         for (v = vectors; v < stop_vec; v++) {
 809                 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
 810                         continue;
 811
 812                 /*
 813                  * Runtime errors count as a previous failure, except the ones
 814                  * used to ask userspace to retry.
 815                  */
 816                 switch (v->sv_ret) {
 817                 case -EBUSY:
 818                 case -ENOENT:
 819                 case -EUSERS:
 820                 case 0:
 821                         break;
 822                 default:
 823                         return -ECANCELED;
 824                 }
 825
 826                 /*
 827                  * If any of the out-flags on the scrub vector match the mask
 828                  * that was set on the barrier vector, that's a previous fail.
 829                  */
 830                 if (v->sv_flags & failmask)
 831                         return -ECANCELED;
 832         }
 833
 834         return 0;
 835 }
 836
 837 /*
 838  * If the caller provided us with a nonzero inode number that isn't the ioctl
 839  * file, try to grab a reference to it to eliminate all further untrusted inode
 840  * lookups.  If we can't get the inode, let each scrub function try again.
 841  */
 842 STATIC struct xfs_inode *
 843 xchk_scrubv_open_by_handle(
 844         struct xfs_mount                *mp,
 845         const struct xfs_scrub_vec_head *head)
 846 {
 847         struct xfs_trans                *tp;
 848         struct xfs_inode                *ip;
 849         int                             error;
 850
 851         error = xfs_trans_alloc_empty(mp, &tp);
 852         if (error)
 853                 return NULL;
 854
 855         error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
 856         xfs_trans_cancel(tp);
 857         if (error)
 858                 return NULL;
 859
 860         if (VFS_I(ip)->i_generation != head->svh_gen) {
 861                 xfs_irele(ip);
 862                 return NULL;
 863         }
 864
 865         return ip;
 866 }
 867
 868 /* Vectored scrub implementation to reduce ioctl calls. */
 869 int
 870 xfs_ioc_scrubv_metadata(
 871         struct file                     *file,
 872         void                            __user *arg)
 873 {
 874         struct xfs_scrub_vec_head       head;
 875         struct xfs_scrub_vec_head       __user *uhead = arg;
 876         struct xfs_scrub_vec            *vectors;
 877         struct xfs_scrub_vec            __user *uvectors;
 878         struct xfs_inode                *ip_in = XFS_I(file_inode(file));
 879         struct xfs_mount                *mp = ip_in->i_mount;
 880         struct xfs_inode                *handle_ip = NULL;
 881         struct xfs_scrub_vec            *v;
 882         size_t                          vec_bytes;
 883         unsigned int                    i;
 884         int                             error = 0;
 885
 886         if (!capable(CAP_SYS_ADMIN))
 887                 return -EPERM;
 888
 889         if (copy_from_user(&head, uhead, sizeof(head)))
 890                 return -EFAULT;
 891
 892         if (head.svh_reserved)
 893                 return -EINVAL;
 894         if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
 895                 return -EINVAL;
 896         if (head.svh_nr == 0)
 897                 return 0;
 898
 899         vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
 900         if (vec_bytes > PAGE_SIZE)
 901                 return -ENOMEM;
 902
 903         uvectors = u64_to_user_ptr(head.svh_vectors);
 904         vectors = memdup_user(uvectors, vec_bytes);
 905         if (IS_ERR(vectors))
 906                 return PTR_ERR(vectors);
 907
 908         trace_xchk_scrubv_start(ip_in, &head);
 909
 910         for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
 911                 if (v->sv_reserved) {
 912                         error = -EINVAL;
 913                         goto out_free;
 914                 }
 915
 916                 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
 917                     (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
 918                         error = -EINVAL;
 919                         goto out_free;
 920                 }
 921
 922                 trace_xchk_scrubv_item(mp, &head, i, v);
 923         }
 924
 925         /*
 926          * If the caller wants us to do a scrub-by-handle and the file used to
 927          * call the ioctl is not the same file, load the incore inode and pin
 928          * it across all the scrubv actions to avoid repeated UNTRUSTED
 929          * lookups.  The reference is not passed to deeper layers of scrub
 930          * because each scrubber gets to decide its own strategy and return
 931          * values for getting an inode.
 932          */
 933         if (head.svh_ino && head.svh_ino != ip_in->i_ino)
 934                 handle_ip = xchk_scrubv_open_by_handle(mp, &head);
 935
 936         /* Run all the scrubbers. */
 937         for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
 938                 struct xfs_scrub_metadata       sm = {
 939                         .sm_type                = v->sv_type,
 940                         .sm_flags               = v->sv_flags,
 941                         .sm_ino                 = head.svh_ino,
 942                         .sm_gen                 = head.svh_gen,
 943                         .sm_agno                = head.svh_agno,
 944                 };
 945
 946                 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
 947                         v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
 948                         if (v->sv_ret) {
 949                                 trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
 950                                 break;
 951                         }
 952
 953                         continue;
 954                 }
 955
 956                 v->sv_ret = xfs_scrub_metadata(file, &sm);
 957                 v->sv_flags = sm.sm_flags;
 958
 959                 trace_xchk_scrubv_outcome(mp, &head, i, v);
 960
 961                 if (head.svh_rest_us) {
 962                         ktime_t         expires;
 963
 964                         expires = ktime_add_ns(ktime_get(),
 965                                         head.svh_rest_us * 1000);
 966                         set_current_state(TASK_KILLABLE);
 967                         schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
 968                 }
 969
 970                 if (fatal_signal_pending(current)) {
 971                         error = -EINTR;
 972                         goto out_free;
 973                 }
 974         }
 975
 976         if (copy_to_user(uvectors, vectors, vec_bytes) ||
 977             copy_to_user(uhead, &head, sizeof(head))) {
 978                 error = -EFAULT;
 979                 goto out_free;
 980         }
 981
 982 out_free:
 983         if (handle_ip)
 984                 xfs_irele(handle_ip);
 985         kfree(vectors);
 986         return error;
 987 }