fs/xfs/xfs_iwalk.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2019 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_btree.h"
  15 #include "xfs_ialloc.h"
  16 #include "xfs_ialloc_btree.h"
  17 #include "xfs_iwalk.h"
  18 #include "xfs_error.h"
  19 #include "xfs_trace.h"
  20 #include "xfs_icache.h"
  21 #include "xfs_health.h"
  22 #include "xfs_trans.h"
  23 #include "xfs_pwork.h"
  24
  25 /*
  26  * Walking Inodes in the Filesystem
  27  * ================================
  28  *
  29  * This iterator function walks a subset of filesystem inodes in increasing
  30  * order from @startino until there are no more inodes.  For each allocated
  31  * inode it finds, it calls a walk function with the relevant inode number and
  32  * a pointer to caller-provided data.  The walk function can return the usual
  33  * negative error code to stop the iteration; 0 to continue the iteration; or
  34  * -ECANCELED to stop the iteration.  This return value is returned to the
  35  * caller.
  36  *
  37  * Internally, we allow the walk function to do anything, which means that we
  38  * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
  39  * therefore cache the inobt records in kernel memory and only call the walk
  40  * function when our memory buffer is full.  @nr_recs is the number of records
  41  * that we've cached, and @sz_recs is the size of our cache.
  42  *
  43  * It is the responsibility of the walk function to ensure it accesses
  44  * allocated inodes, as the inobt records may be stale by the time they are
  45  * acted upon.
  46  */
  47
/*
 * Per-walk state shared by the inode walk (xfs_iwalk*) and inobt record walk
 * (xfs_inobt_walk) code in this file.
 */
struct xfs_iwalk_ag {
        /*
         * Parallel work control data.  This is an embedded structure, not a
         * pointer: single-threaded walks initialize it to
         * XFS_PWORK_SINGLE_THREADED, and threaded walks queue it as the work
         * item from which the walk state is recovered via container_of().
         */
        struct xfs_pwork                pwork;

        /* Mount and (possibly NULL) transaction passed to the callbacks. */
        struct xfs_mount                *mp;
        struct xfs_trans                *tp;

        /* Where do we start the traversal? */
        xfs_ino_t                       startino;

        /*
         * What was the last inode number we saw when iterating the inobt?
         * NULLFSINO until the first record has been read.
         */
        xfs_ino_t                       lastino;

        /* Array of inobt records we cache. */
        struct xfs_inobt_rec_incore     *recs;

        /* Number of entries allocated for the @recs array. */
        unsigned int                    sz_recs;

        /* Number of entries in the @recs array that are in use. */
        unsigned int                    nr_recs;

        /*
         * Walk functions and their data pointer.  @inobt_walk_fn, if set, is
         * called once per cached inobt record; @iwalk_fn, if set, is called
         * once per inode that the cached record does not mark free.
         */
        xfs_iwalk_fn                    iwalk_fn;
        xfs_inobt_walk_fn               inobt_walk_fn;
        void                            *data;

        /*
         * Make it look like the inodes up to startino are free so that
         * bulkstat can start its inode iteration at the correct place without
         * needing to special case everywhere.
         */
        unsigned int                    trim_start:1;

        /* Skip empty inobt records? */
        unsigned int                    skip_empty:1;
};
  85
  86 /*
  87  * Loop over all clusters in a chunk for a given incore inode allocation btree
  88  * record.  Do a readahead if there are any allocated inodes in that cluster.
  89  */
  90 STATIC void
  91 xfs_iwalk_ichunk_ra(
  92         struct xfs_mount                *mp,
  93         xfs_agnumber_t                  agno,
  94         struct xfs_inobt_rec_incore     *irec)
  95 {
  96         struct xfs_ino_geometry         *igeo = M_IGEO(mp);
  97         xfs_agblock_t                   agbno;
  98         struct blk_plug                 plug;
  99         int                             i;      /* inode chunk index */
 100
 101         agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
 102
 103         blk_start_plug(&plug);
 104         for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
 105                 xfs_inofree_t   imask;
 106
 107                 imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
 108                 if (imask & ~irec->ir_free) {
 109                         xfs_btree_reada_bufs(mp, agno, agbno,
 110                                         igeo->blocks_per_cluster,
 111                                         &xfs_inode_buf_ops);
 112                 }
 113                 agbno += igeo->blocks_per_cluster;
 114         }
 115         blk_finish_plug(&plug);
 116 }
 117
 118 /*
 119  * Set the bits in @irec's free mask that correspond to the inodes before
 120  * @agino so that we skip them.  This is how we restart an inode walk that was
 121  * interrupted in the middle of an inode record.
 122  */
 123 STATIC void
 124 xfs_iwalk_adjust_start(
 125         xfs_agino_t                     agino,  /* starting inode of chunk */
 126         struct xfs_inobt_rec_incore     *irec)  /* btree record */
 127 {
 128         int                             idx;    /* index into inode chunk */
 129         int                             i;
 130
 131         idx = agino - irec->ir_startino;
 132
 133         /*
 134          * We got a right chunk with some left inodes allocated at it.  Grab
 135          * the chunk record.  Mark all the uninteresting inodes free because
 136          * they're before our start point.
 137          */
 138         for (i = 0; i < idx; i++) {
 139                 if (XFS_INOBT_MASK(i) & ~irec->ir_free)
 140                         irec->ir_freecount++;
 141         }
 142
 143         irec->ir_free |= xfs_inobt_maskn(0, idx);
 144 }
 145
 146 /* Allocate memory for a walk. */
 147 STATIC int
 148 xfs_iwalk_alloc(
 149         struct xfs_iwalk_ag     *iwag)
 150 {
 151         size_t                  size;
 152
 153         ASSERT(iwag->recs == NULL);
 154         iwag->nr_recs = 0;
 155
 156         /* Allocate a prefetch buffer for inobt records. */
 157         size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
 158         iwag->recs = kmem_alloc(size, KM_MAYFAIL);
 159         if (iwag->recs == NULL)
 160                 return -ENOMEM;
 161
 162         return 0;
 163 }
 164
 165 /* Free memory we allocated for a walk. */
 166 STATIC void
 167 xfs_iwalk_free(
 168         struct xfs_iwalk_ag     *iwag)
 169 {
 170         kmem_free(iwag->recs);
 171         iwag->recs = NULL;
 172 }
 173
 174 /* For each inuse inode in each cached inobt record, call our function. */
 175 STATIC int
 176 xfs_iwalk_ag_recs(
 177         struct xfs_iwalk_ag             *iwag)
 178 {
 179         struct xfs_mount                *mp = iwag->mp;
 180         struct xfs_trans                *tp = iwag->tp;
 181         xfs_ino_t                       ino;
 182         unsigned int                    i, j;
 183         xfs_agnumber_t                  agno;
 184         int                             error;
 185
 186         agno = XFS_INO_TO_AGNO(mp, iwag->startino);
 187         for (i = 0; i < iwag->nr_recs; i++) {
 188                 struct xfs_inobt_rec_incore     *irec = &iwag->recs[i];
 189
 190                 trace_xfs_iwalk_ag_rec(mp, agno, irec);
 191
 192                 if (xfs_pwork_want_abort(&iwag->pwork))
 193                         return 0;
 194
 195                 if (iwag->inobt_walk_fn) {
 196                         error = iwag->inobt_walk_fn(mp, tp, agno, irec,
 197                                         iwag->data);
 198                         if (error)
 199                                 return error;
 200                 }
 201
 202                 if (!iwag->iwalk_fn)
 203                         continue;
 204
 205                 for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
 206                         if (xfs_pwork_want_abort(&iwag->pwork))
 207                                 return 0;
 208
 209                         /* Skip if this inode is free */
 210                         if (XFS_INOBT_MASK(j) & irec->ir_free)
 211                                 continue;
 212
 213                         /* Otherwise call our function. */
 214                         ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
 215                         error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
 216                         if (error)
 217                                 return error;
 218                 }
 219         }
 220
 221         return 0;
 222 }
 223
 224 /* Delete cursor and let go of AGI. */
 225 static inline void
 226 xfs_iwalk_del_inobt(
 227         struct xfs_trans        *tp,
 228         struct xfs_btree_cur    **curpp,
 229         struct xfs_buf          **agi_bpp,
 230         int                     error)
 231 {
 232         if (*curpp) {
 233                 xfs_btree_del_cursor(*curpp, error);
 234                 *curpp = NULL;
 235         }
 236         if (*agi_bpp) {
 237                 xfs_trans_brelse(tp, *agi_bpp);
 238                 *agi_bpp = NULL;
 239         }
 240 }
 241
 242 /*
 243  * Set ourselves up for walking inobt records starting from a given point in
 244  * the filesystem.
 245  *
 246  * If caller passed in a nonzero start inode number, load the record from the
 247  * inobt and make the record look like all the inodes before agino are free so
 248  * that we skip them, and then move the cursor to the next inobt record.  This
 249  * is how we support starting an iwalk in the middle of an inode chunk.
 250  *
 251  * If the caller passed in a start number of zero, move the cursor to the first
 252  * inobt record.
 253  *
 254  * The caller is responsible for cleaning up the cursor and buffer pointer
 255  * regardless of the error status.
 256  */
 257 STATIC int
 258 xfs_iwalk_ag_start(
 259         struct xfs_iwalk_ag     *iwag,
 260         xfs_agnumber_t          agno,
 261         xfs_agino_t             agino,
 262         struct xfs_btree_cur    **curpp,
 263         struct xfs_buf          **agi_bpp,
 264         int                     *has_more)
 265 {
 266         struct xfs_mount        *mp = iwag->mp;
 267         struct xfs_trans        *tp = iwag->tp;
 268         struct xfs_inobt_rec_incore *irec;
 269         int                     error;
 270
 271         /* Set up a fresh cursor and empty the inobt cache. */
 272         iwag->nr_recs = 0;
 273         error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
 274         if (error)
 275                 return error;
 276
 277         /* Starting at the beginning of the AG?  That's easy! */
 278         if (agino == 0)
 279                 return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
 280
 281         /*
 282          * Otherwise, we have to grab the inobt record where we left off, stuff
 283          * the record into our cache, and then see if there are more records.
 284          * We require a lookup cache of at least two elements so that the
 285          * caller doesn't have to deal with tearing down the cursor to walk the
 286          * records.
 287          */
 288         error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
 289         if (error)
 290                 return error;
 291
 292         /*
 293          * If the LE lookup at @agino yields no records, jump ahead to the
 294          * inobt cursor increment to see if there are more records to process.
 295          */
 296         if (!*has_more)
 297                 goto out_advance;
 298
 299         /* Get the record, should always work */
 300         irec = &iwag->recs[iwag->nr_recs];
 301         error = xfs_inobt_get_rec(*curpp, irec, has_more);
 302         if (error)
 303                 return error;
 304         if (XFS_IS_CORRUPT(mp, *has_more != 1))
 305                 return -EFSCORRUPTED;
 306
 307         iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
 308                                 irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
 309
 310         /*
 311          * If the LE lookup yielded an inobt record before the cursor position,
 312          * skip it and see if there's another one after it.
 313          */
 314         if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
 315                 goto out_advance;
 316
 317         /*
 318          * If agino fell in the middle of the inode record, make it look like
 319          * the inodes up to agino are free so that we don't return them again.
 320          */
 321         if (iwag->trim_start)
 322                 xfs_iwalk_adjust_start(agino, irec);
 323
 324         /*
 325          * The prefetch calculation is supposed to give us a large enough inobt
 326          * record cache that grab_ichunk can stage a partial first record and
 327          * the loop body can cache a record without having to check for cache
 328          * space until after it reads an inobt record.
 329          */
 330         iwag->nr_recs++;
 331         ASSERT(iwag->nr_recs < iwag->sz_recs);
 332
 333 out_advance:
 334         return xfs_btree_increment(*curpp, 0, has_more);
 335 }
 336
 337 /*
 338  * The inobt record cache is full, so preserve the inobt cursor state and
 339  * run callbacks on the cached inobt records.  When we're done, restore the
 340  * cursor state to wherever the cursor would have been had the cache not been
 341  * full (and therefore we could've just incremented the cursor) if *@has_more
 342  * is true.  On exit, *@has_more will indicate whether or not the caller should
 343  * try for more inode records.
 344  */
 345 STATIC int
 346 xfs_iwalk_run_callbacks(
 347         struct xfs_iwalk_ag             *iwag,
 348         xfs_agnumber_t                  agno,
 349         struct xfs_btree_cur            **curpp,
 350         struct xfs_buf                  **agi_bpp,
 351         int                             *has_more)
 352 {
 353         struct xfs_mount                *mp = iwag->mp;
 354         struct xfs_trans                *tp = iwag->tp;
 355         struct xfs_inobt_rec_incore     *irec;
 356         xfs_agino_t                     next_agino;
 357         int                             error;
 358
 359         next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
 360
 361         ASSERT(iwag->nr_recs > 0);
 362
 363         /* Delete cursor but remember the last record we cached... */
 364         xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
 365         irec = &iwag->recs[iwag->nr_recs - 1];
 366         ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
 367
 368         error = xfs_iwalk_ag_recs(iwag);
 369         if (error)
 370                 return error;
 371
 372         /* ...empty the cache... */
 373         iwag->nr_recs = 0;
 374
 375         if (!has_more)
 376                 return 0;
 377
 378         /* ...and recreate the cursor just past where we left off. */
 379         error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
 380         if (error)
 381                 return error;
 382
 383         return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
 384 }
 385
 386 /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
 387 STATIC int
 388 xfs_iwalk_ag(
 389         struct xfs_iwalk_ag             *iwag)
 390 {
 391         struct xfs_mount                *mp = iwag->mp;
 392         struct xfs_trans                *tp = iwag->tp;
 393         struct xfs_buf                  *agi_bp = NULL;
 394         struct xfs_btree_cur            *cur = NULL;
 395         xfs_agnumber_t                  agno;
 396         xfs_agino_t                     agino;
 397         int                             has_more;
 398         int                             error = 0;
 399
 400         /* Set up our cursor at the right place in the inode btree. */
 401         agno = XFS_INO_TO_AGNO(mp, iwag->startino);
 402         agino = XFS_INO_TO_AGINO(mp, iwag->startino);
 403         error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
 404
 405         while (!error && has_more) {
 406                 struct xfs_inobt_rec_incore     *irec;
 407                 xfs_ino_t                       rec_fsino;
 408
 409                 cond_resched();
 410                 if (xfs_pwork_want_abort(&iwag->pwork))
 411                         goto out;
 412
 413                 /* Fetch the inobt record. */
 414                 irec = &iwag->recs[iwag->nr_recs];
 415                 error = xfs_inobt_get_rec(cur, irec, &has_more);
 416                 if (error || !has_more)
 417                         break;
 418
 419                 /* Make sure that we always move forward. */
 420                 rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
 421                 if (iwag->lastino != NULLFSINO &&
 422                     XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
 423                         error = -EFSCORRUPTED;
 424                         goto out;
 425                 }
 426                 iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
 427
 428                 /* No allocated inodes in this chunk; skip it. */
 429                 if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
 430                         error = xfs_btree_increment(cur, 0, &has_more);
 431                         if (error)
 432                                 break;
 433                         continue;
 434                 }
 435
 436                 /*
 437                  * Start readahead for this inode chunk in anticipation of
 438                  * walking the inodes.
 439                  */
 440                 if (iwag->iwalk_fn)
 441                         xfs_iwalk_ichunk_ra(mp, agno, irec);
 442
 443                 /*
 444                  * If there's space in the buffer for more records, increment
 445                  * the btree cursor and grab more.
 446                  */
 447                 if (++iwag->nr_recs < iwag->sz_recs) {
 448                         error = xfs_btree_increment(cur, 0, &has_more);
 449                         if (error || !has_more)
 450                                 break;
 451                         continue;
 452                 }
 453
 454                 /*
 455                  * Otherwise, we need to save cursor state and run the callback
 456                  * function on the cached records.  The run_callbacks function
 457                  * is supposed to return a cursor pointing to the record where
 458                  * we would be if we had been able to increment like above.
 459                  */
 460                 ASSERT(has_more);
 461                 error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
 462                                 &has_more);
 463         }
 464
 465         if (iwag->nr_recs == 0 || error)
 466                 goto out;
 467
 468         /* Walk the unprocessed records in the cache. */
 469         error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
 470
 471 out:
 472         xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
 473         return error;
 474 }
 475
 476 /*
 477  * We experimentally determined that the reduction in ioctl call overhead
 478  * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 479  * prefetch at this point.
 480  */
 481 #define IWALK_MAX_INODE_PREFETCH        (2048U)
 482
 483 /*
 484  * Given the number of inodes to prefetch, set the number of inobt records that
 485  * we cache in memory, which controls the number of inodes we try to read
 486  * ahead.  Set the maximum if @inodes == 0.
 487  */
 488 static inline unsigned int
 489 xfs_iwalk_prefetch(
 490         unsigned int            inodes)
 491 {
 492         unsigned int            inobt_records;
 493
 494         /*
 495          * If the caller didn't tell us the number of inodes they wanted,
 496          * assume the maximum prefetch possible for best performance.
 497          * Otherwise, cap prefetch at that maximum so that we don't start an
 498          * absurd amount of prefetch.
 499          */
 500         if (inodes == 0)
 501                 inodes = IWALK_MAX_INODE_PREFETCH;
 502         inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);
 503
 504         /* Round the inode count up to a full chunk. */
 505         inodes = round_up(inodes, XFS_INODES_PER_CHUNK);
 506
 507         /*
 508          * In order to convert the number of inodes to prefetch into an
 509          * estimate of the number of inobt records to cache, we require a
 510          * conversion factor that reflects our expectations of the average
 511          * loading factor of an inode chunk.  Based on data gathered, most
 512          * (but not all) filesystems manage to keep the inode chunks totally
 513          * full, so we'll underestimate slightly so that our readahead will
 514          * still deliver the performance we want on aging filesystems:
 515          *
 516          * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
 517          *
 518          * The funny math is to avoid integer division.
 519          */
 520         inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
 521
 522         /*
 523          * Allocate enough space to prefetch at least two inobt records so that
 524          * we can cache both the record where the iwalk started and the next
 525          * record.  This simplifies the AG inode walk loop setup code.
 526          */
 527         return max(inobt_records, 2U);
 528 }
 529
 530 /*
 531  * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
 532  * will be called for each allocated inode, being passed the inode's number and
 533  * @data.  @max_prefetch controls how many inobt records' worth of inodes we
 534  * try to readahead.
 535  */
 536 int
 537 xfs_iwalk(
 538         struct xfs_mount        *mp,
 539         struct xfs_trans        *tp,
 540         xfs_ino_t               startino,
 541         unsigned int            flags,
 542         xfs_iwalk_fn            iwalk_fn,
 543         unsigned int            inode_records,
 544         void                    *data)
 545 {
 546         struct xfs_iwalk_ag     iwag = {
 547                 .mp             = mp,
 548                 .tp             = tp,
 549                 .iwalk_fn       = iwalk_fn,
 550                 .data           = data,
 551                 .startino       = startino,
 552                 .sz_recs        = xfs_iwalk_prefetch(inode_records),
 553                 .trim_start     = 1,
 554                 .skip_empty     = 1,
 555                 .pwork          = XFS_PWORK_SINGLE_THREADED,
 556                 .lastino        = NULLFSINO,
 557         };
 558         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
 559         int                     error;
 560
 561         ASSERT(agno < mp->m_sb.sb_agcount);
 562         ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
 563
 564         error = xfs_iwalk_alloc(&iwag);
 565         if (error)
 566                 return error;
 567
 568         for (; agno < mp->m_sb.sb_agcount; agno++) {
 569                 error = xfs_iwalk_ag(&iwag);
 570                 if (error)
 571                         break;
 572                 iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
 573                 if (flags & XFS_INOBT_WALK_SAME_AG)
 574                         break;
 575         }
 576
 577         xfs_iwalk_free(&iwag);
 578         return error;
 579 }
 580
 581 /* Run per-thread iwalk work. */
 582 static int
 583 xfs_iwalk_ag_work(
 584         struct xfs_mount        *mp,
 585         struct xfs_pwork        *pwork)
 586 {
 587         struct xfs_iwalk_ag     *iwag;
 588         int                     error = 0;
 589
 590         iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
 591         if (xfs_pwork_want_abort(pwork))
 592                 goto out;
 593
 594         error = xfs_iwalk_alloc(iwag);
 595         if (error)
 596                 goto out;
 597
 598         error = xfs_iwalk_ag(iwag);
 599         xfs_iwalk_free(iwag);
 600 out:
 601         kmem_free(iwag);
 602         return error;
 603 }
 604
 605 /*
 606  * Walk all the inodes in the filesystem using multiple threads to process each
 607  * AG.
 608  */
 609 int
 610 xfs_iwalk_threaded(
 611         struct xfs_mount        *mp,
 612         xfs_ino_t               startino,
 613         unsigned int            flags,
 614         xfs_iwalk_fn            iwalk_fn,
 615         unsigned int            inode_records,
 616         bool                    polled,
 617         void                    *data)
 618 {
 619         struct xfs_pwork_ctl    pctl;
 620         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
 621         unsigned int            nr_threads;
 622         int                     error;
 623
 624         ASSERT(agno < mp->m_sb.sb_agcount);
 625         ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
 626
 627         nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
 628         error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
 629                         nr_threads);
 630         if (error)
 631                 return error;
 632
 633         for (; agno < mp->m_sb.sb_agcount; agno++) {
 634                 struct xfs_iwalk_ag     *iwag;
 635
 636                 if (xfs_pwork_ctl_want_abort(&pctl))
 637                         break;
 638
 639                 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
 640                 iwag->mp = mp;
 641                 iwag->iwalk_fn = iwalk_fn;
 642                 iwag->data = data;
 643                 iwag->startino = startino;
 644                 iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
 645                 iwag->lastino = NULLFSINO;
 646                 xfs_pwork_queue(&pctl, &iwag->pwork);
 647                 startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
 648                 if (flags & XFS_INOBT_WALK_SAME_AG)
 649                         break;
 650         }
 651
 652         if (polled)
 653                 xfs_pwork_poll(&pctl);
 654         return xfs_pwork_destroy(&pctl);
 655 }
 656
 657 /*
 658  * Allow callers to cache up to a page's worth of inobt records.  This reflects
 659  * the existing inumbers prefetching behavior.  Since the inobt walk does not
 660  * itself do anything with the inobt records, we can set a fairly high limit
 661  * here.
 662  */
 663 #define MAX_INOBT_WALK_PREFETCH \
 664         (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
 665
 666 /*
 667  * Given the number of records that the user wanted, set the number of inobt
 668  * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
 669  */
 670 static inline unsigned int
 671 xfs_inobt_walk_prefetch(
 672         unsigned int            inobt_records)
 673 {
 674         /*
 675          * If the caller didn't tell us the number of inobt records they
 676          * wanted, assume the maximum prefetch possible for best performance.
 677          */
 678         if (inobt_records == 0)
 679                 inobt_records = MAX_INOBT_WALK_PREFETCH;
 680
 681         /*
 682          * Allocate enough space to prefetch at least two inobt records so that
 683          * we can cache both the record where the iwalk started and the next
 684          * record.  This simplifies the AG inode walk loop setup code.
 685          */
 686         inobt_records = max(inobt_records, 2U);
 687
 688         /*
 689          * Cap prefetch at that maximum so that we don't use an absurd amount
 690          * of memory.
 691          */
 692         return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
 693 }
 694
 695 /*
 696  * Walk all inode btree records in the filesystem starting from @startino.  The
 697  * @inobt_walk_fn will be called for each btree record, being passed the incore
 698  * record and @data.  @max_prefetch controls how many inobt records we try to
 699  * cache ahead of time.
 700  */
 701 int
 702 xfs_inobt_walk(
 703         struct xfs_mount        *mp,
 704         struct xfs_trans        *tp,
 705         xfs_ino_t               startino,
 706         unsigned int            flags,
 707         xfs_inobt_walk_fn       inobt_walk_fn,
 708         unsigned int            inobt_records,
 709         void                    *data)
 710 {
 711         struct xfs_iwalk_ag     iwag = {
 712                 .mp             = mp,
 713                 .tp             = tp,
 714                 .inobt_walk_fn  = inobt_walk_fn,
 715                 .data           = data,
 716                 .startino       = startino,
 717                 .sz_recs        = xfs_inobt_walk_prefetch(inobt_records),
 718                 .pwork          = XFS_PWORK_SINGLE_THREADED,
 719                 .lastino        = NULLFSINO,
 720         };
 721         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
 722         int                     error;
 723
 724         ASSERT(agno < mp->m_sb.sb_agcount);
 725         ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
 726
 727         error = xfs_iwalk_alloc(&iwag);
 728         if (error)
 729                 return error;
 730
 731         for (; agno < mp->m_sb.sb_agcount; agno++) {
 732                 error = xfs_iwalk_ag(&iwag);
 733                 if (error)
 734                         break;
 735                 iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
 736                 if (flags & XFS_INOBT_WALK_SAME_AG)
 737                         break;
 738         }
 739
 740         xfs_iwalk_free(&iwag);
 741         return error;
 742 }