// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_iwalk.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"

/*
 * Walking Inodes in the Filesystem
 * ================================
 *
 * This iterator function walks a subset of filesystem inodes in increasing
 * order from @startino until there are no more inodes.  For each allocated
 * inode it finds, it calls a walk function with the relevant inode number and
 * a pointer to caller-provided data.  The walk function can return the usual
 * negative error code to stop the iteration; 0 to continue the iteration; or
 * -ECANCELED to stop the iteration early.  Whatever the walk function
 * returns is passed back to the caller.
 *
 * Internally, we allow the walk function to do anything, which means that we
 * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
 * therefore cache the inobt records in kernel memory and only call the walk
 * function when our memory buffer is full.  @nr_recs is the number of records
 * that we've cached, and @sz_recs is the size of our cache.
 *
 * It is the responsibility of the walk function to ensure it accesses
 * allocated inodes, as the inobt records may be stale by the time they are
 * acted upon.
 */
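
/*
 * A minimal usage sketch (hypothetical caller, not part of this file): a
 * walk function matching the xfs_iwalk_fn signature counts allocated inodes.
 *
 *	static int xfs_count_one(struct xfs_mount *mp, struct xfs_trans *tp,
 *			xfs_ino_t ino, void *data)
 *	{
 *		(*(uint64_t *)data)++;
 *		return 0;
 *	}
 *
 *	uint64_t count = 0;
 *	error = xfs_iwalk(mp, NULL, 0, 0, xfs_count_one, 0, &count);
 */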

struct xfs_iwalk_ag {
	/* parallel work control data; will be null if single threaded */
	struct xfs_pwork		pwork;

	struct xfs_mount		*mp;
	struct xfs_trans		*tp;

	/* Where do we start the traversal? */
	xfs_ino_t			startino;

	/* Array of inobt records we cache. */
	struct xfs_inobt_rec_incore	*recs;

	/* Number of entries allocated for the @recs array. */
	unsigned int			sz_recs;

	/* Number of entries in the @recs array that are in use. */
	unsigned int			nr_recs;

	/* Inode walk function and data pointer. */
	xfs_iwalk_fn			iwalk_fn;
	xfs_inobt_walk_fn		inobt_walk_fn;
	void				*data;

	/*
	 * Make it look like the inodes up to startino are free so that
	 * bulkstat can start its inode iteration at the correct place without
	 * needing to special case everywhere.
	 */
	unsigned int			trim_start:1;

	/* Skip empty inobt records? */
	unsigned int			skip_empty:1;
};

/*
 * Loop over all clusters in a chunk for a given incore inode allocation btree
 * record.  Do a readahead if there are any allocated inodes in that cluster.
 */
STATIC void
xfs_iwalk_ichunk_ra(
	struct xfs_mount		*mp,
	xfs_agnumber_t			agno,
	struct xfs_inobt_rec_incore	*irec)
{
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
	xfs_agblock_t			agbno;
	struct blk_plug			plug;
	int				i;	/* inode chunk index */

	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);

	blk_start_plug(&plug);
	for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
		xfs_inofree_t	imask;

		imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
		if (imask & ~irec->ir_free) {
			xfs_btree_reada_bufs(mp, agno, agbno,
					igeo->blocks_per_cluster,
					&xfs_inode_buf_ops);
		}
		agbno += igeo->blocks_per_cluster;
	}
	blk_finish_plug(&plug);
}
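
/*
 * For example, on a filesystem whose 64-inode chunks span two clusters of 32
 * inodes each (the exact geometry depends on inode and block size), the loop
 * above issues at most two cluster readaheads per chunk.
 */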

/*
 * Set the bits in @irec's free mask that correspond to the inodes before
 * @agino so that we skip them.  This is how we restart an inode walk that was
 * interrupted in the middle of an inode record.
 */
STATIC void
xfs_iwalk_adjust_start(
	xfs_agino_t			agino,	/* starting inode of chunk */
	struct xfs_inobt_rec_incore	*irec)	/* btree record */
{
	int				idx;	/* index into inode chunk */
	int				i;

	idx = agino - irec->ir_startino;

	/*
	 * We found the chunk containing @agino, but some of the inodes
	 * before our start point are allocated.  Mark all the uninteresting
	 * inodes free because they're before our start point.
	 */
	for (i = 0; i < idx; i++) {
		if (XFS_INOBT_MASK(i) & ~irec->ir_free)
			irec->ir_freecount++;
	}

	irec->ir_free |= xfs_inobt_maskn(0, idx);
}
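
/*
 * Worked example: if @agino is three inodes past ir_startino, then idx is 3,
 * xfs_inobt_maskn(0, 3) sets bits 0-2 of ir_free, and ir_freecount is bumped
 * once for each of those three inodes that was actually allocated.
 */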

/* Allocate memory for a walk. */
STATIC int
xfs_iwalk_alloc(
	struct xfs_iwalk_ag	*iwag)
{
	size_t			size;

	ASSERT(iwag->recs == NULL);
	iwag->nr_recs = 0;

	/* Allocate a prefetch buffer for inobt records. */
	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
	if (iwag->recs == NULL)
		return -ENOMEM;

	return 0;
}

/* Free memory we allocated for a walk. */
STATIC void
xfs_iwalk_free(
	struct xfs_iwalk_ag	*iwag)
{
	kmem_free(iwag->recs);
	iwag->recs = NULL;
}

/* For each inuse inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
	struct xfs_iwalk_ag		*iwag)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	xfs_ino_t			ino;
	unsigned int			i, j;
	xfs_agnumber_t			agno;
	int				error;

	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
	for (i = 0; i < iwag->nr_recs; i++) {
		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];

		trace_xfs_iwalk_ag_rec(mp, agno, irec);

		if (xfs_pwork_want_abort(&iwag->pwork))
			return 0;

		if (iwag->inobt_walk_fn) {
			error = iwag->inobt_walk_fn(mp, tp, agno, irec,
					iwag->data);
			if (error)
				return error;
		}

		if (!iwag->iwalk_fn)
			continue;

		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
			if (xfs_pwork_want_abort(&iwag->pwork))
				return 0;

			/* Skip if this inode is free */
			if (XFS_INOBT_MASK(j) & irec->ir_free)
				continue;

			/* Otherwise call our function. */
			ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
			if (error)
				return error;
		}
	}

	return 0;
}

/* Delete cursor and let go of AGI. */
static inline void
xfs_iwalk_del_inobt(
	struct xfs_trans	*tp,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			error)
{
	if (*curpp) {
		xfs_btree_del_cursor(*curpp, error);
		*curpp = NULL;
	}
	if (*agi_bpp) {
		xfs_trans_brelse(tp, *agi_bpp);
		*agi_bpp = NULL;
	}
}

/*
 * Set ourselves up for walking inobt records starting from a given point in
 * the filesystem.
 *
 * If the caller passed in a nonzero start inode number, load the record from
 * the inobt and make the record look like all the inodes before agino are
 * free so that we skip them, and then move the cursor to the next inobt
 * record.  This is how we support starting an iwalk in the middle of an
 * inode chunk.
 *
 * If the caller passed in a start number of zero, move the cursor to the
 * first record in the requested AG.
 *
 * The caller is responsible for cleaning up the cursor and buffer pointer
 * regardless of the error status.
 */
STATIC int
xfs_iwalk_ag_start(
	struct xfs_iwalk_ag	*iwag,
	xfs_agnumber_t		agno,
	xfs_agino_t		agino,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			*has_more)
{
	struct xfs_mount	*mp = iwag->mp;
	struct xfs_trans	*tp = iwag->tp;
	struct xfs_inobt_rec_incore *irec;
	int			error;

	/* Set up a fresh cursor and empty the inobt cache. */
	iwag->nr_recs = 0;
	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	/* Starting at the beginning of the AG?  That's easy! */
	if (agino == 0)
		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);

	/*
	 * Otherwise, we have to grab the inobt record where we left off, stuff
	 * the record into our cache, and then see if there are more records.
	 * We require a lookup cache of at least two elements so that the
	 * caller doesn't have to deal with tearing down the cursor to walk the
	 * records.
	 */
	error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
	if (error)
		return error;

	/*
	 * If the LE lookup at @agino yields no records, jump ahead to the
	 * inobt cursor increment to see if there are more records to process.
	 */
	if (!*has_more)
		goto out_advance;

	/* Get the record, should always work */
	irec = &iwag->recs[iwag->nr_recs];
	error = xfs_inobt_get_rec(*curpp, irec, has_more);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(mp, *has_more != 1))
		return -EFSCORRUPTED;

	/*
	 * If the LE lookup yielded an inobt record before the cursor position,
	 * skip it and see if there's another one after it.
	 */
	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
		goto out_advance;

	/*
	 * If agino fell in the middle of the inode record, make it look like
	 * the inodes up to agino are free so that we don't return them again.
	 */
	if (iwag->trim_start)
		xfs_iwalk_adjust_start(agino, irec);

	/*
	 * The prefetch calculation is supposed to give us a large enough inobt
	 * record cache that this function can stage a partial first record and
	 * the loop body can cache a record without having to check for cache
	 * space until after it reads an inobt record.
	 */
	iwag->nr_recs++;
	ASSERT(iwag->nr_recs < iwag->sz_recs);

out_advance:
	return xfs_btree_increment(*curpp, 0, has_more);
}
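
/*
 * On success, the cursor is left pointing at the first record the caller has
 * not yet cached (or *has_more is false if there are none), which is exactly
 * the state the main loop in xfs_iwalk_ag() expects to find.
 */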

/*
 * The inobt record cache is full, so preserve the inobt cursor state and
 * run callbacks on the cached inobt records.  When we're done, restore the
 * cursor state to wherever the cursor would have been had the cache not been
 * full (and therefore we could've just incremented the cursor) if *@has_more
 * is true.  On exit, *@has_more will indicate whether or not the caller should
 * try for more inode records.
 */
STATIC int
xfs_iwalk_run_callbacks(
	struct xfs_iwalk_ag		*iwag,
	xfs_agnumber_t			agno,
	struct xfs_btree_cur		**curpp,
	struct xfs_buf			**agi_bpp,
	int				*has_more)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	struct xfs_inobt_rec_incore	*irec;
	xfs_agino_t			restart;
	int				error;

	ASSERT(iwag->nr_recs > 0);

	/* Delete cursor but remember the last record we cached... */
	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
	irec = &iwag->recs[iwag->nr_recs - 1];
	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;

	error = xfs_iwalk_ag_recs(iwag);
	if (error)
		return error;

	/* ...empty the cache... */
	iwag->nr_recs = 0;

	if (!has_more)
		return 0;

	/* ...and recreate the cursor just past where we left off. */
	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
}
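
/*
 * Note the restart math above: @restart is the last inode of the last cached
 * chunk, so the XFS_LOOKUP_GE relookup lands on the first record we have not
 * yet cached.  For example, if the last cached record had ir_startino 128,
 * restart is 191 and the lookup skips ahead to the chunk at 192 or beyond.
 */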

/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
STATIC int
xfs_iwalk_ag(
	struct xfs_iwalk_ag		*iwag)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	struct xfs_buf			*agi_bp = NULL;
	struct xfs_btree_cur		*cur = NULL;
	xfs_agnumber_t			agno;
	xfs_agino_t			agino;
	int				has_more;
	int				error = 0;

	/* Set up our cursor at the right place in the inode btree. */
	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);

	while (!error && has_more) {
		struct xfs_inobt_rec_incore	*irec;

		cond_resched();
		if (xfs_pwork_want_abort(&iwag->pwork))
			goto out;

		/* Fetch the inobt record. */
		irec = &iwag->recs[iwag->nr_recs];
		error = xfs_inobt_get_rec(cur, irec, &has_more);
		if (error || !has_more)
			break;

		/* No allocated inodes in this chunk; skip it. */
		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error)
				break;
			continue;
		}

		/*
		 * Start readahead for this inode chunk in anticipation of
		 * walking the inodes.
		 */
		if (iwag->iwalk_fn)
			xfs_iwalk_ichunk_ra(mp, agno, irec);

		/*
		 * If there's space in the buffer for more records, increment
		 * the btree cursor and grab more.
		 */
		if (++iwag->nr_recs < iwag->sz_recs) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error || !has_more)
				break;
			continue;
		}

		/*
		 * Otherwise, we need to save cursor state and run the callback
		 * function on the cached records.  The run_callbacks function
		 * is supposed to return a cursor pointing to the record where
		 * we would be if we had been able to increment like above.
		 */
		ASSERT(has_more);
		error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
				&has_more);
	}

	if (iwag->nr_recs == 0 || error)
		goto out;

	/* Walk the unprocessed records in the cache. */
	error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);

out:
	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
	return error;
}

/*
 * We experimentally determined that the reduction in ioctl call overhead
 * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 * prefetch at this point.
 */
#define IWALK_MAX_INODE_PREFETCH	(2048U)

/*
 * Given the number of inodes to prefetch, set the number of inobt records that
 * we cache in memory, which controls the number of inodes we try to read
 * ahead.  Set the maximum if @inodes == 0.
 */
static inline unsigned int
xfs_iwalk_prefetch(
	unsigned int		inodes)
{
	unsigned int		inobt_records;

	/*
	 * If the caller didn't tell us the number of inodes they wanted,
	 * assume the maximum prefetch possible for best performance.
	 * Otherwise, cap prefetch at that maximum so that we don't start an
	 * absurd amount of prefetch.
	 */
	if (inodes == 0)
		inodes = IWALK_MAX_INODE_PREFETCH;
	inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);

	/* Round the inode count up to a full chunk. */
	inodes = round_up(inodes, XFS_INODES_PER_CHUNK);

	/*
	 * In order to convert the number of inodes to prefetch into an
	 * estimate of the number of inobt records to cache, we require a
	 * conversion factor that reflects our expectations of the average
	 * loading factor of an inode chunk.  Based on data gathered, most
	 * (but not all) filesystems manage to keep the inode chunks totally
	 * full, so we'll underestimate slightly so that our readahead will
	 * still deliver the performance we want on aging filesystems:
	 *
	 * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
	 *
	 * The funny math is to avoid integer division.
	 */
	inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record.  This simplifies the AG inode walk loop setup code.
	 */
	return max(inobt_records, 2U);
}
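
/*
 * Worked example of the conversion above: the 2048-inode maximum rounds to
 * 2048 and yields (2048 * 5) / (4 * 64) = 40 cached inobt records; at the
 * assumed 80% average chunk fill, those 40 records cover about 2048 inodes.
 */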

/*
 * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
 * will be called for each allocated inode, being passed the inode's number and
 * @data.  @inode_records controls how many inobt records' worth of inodes we
 * try to read ahead.
 */
int
xfs_iwalk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.iwalk_fn	= iwalk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_iwalk_prefetch(inode_records),
		.trim_start	= 1,
		.skip_empty	= 1,
		.pwork		= XFS_PWORK_SINGLE_THREADED,
	};
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for (; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
	}

	xfs_iwalk_free(&iwag);
	return error;
}

/* Run per-thread iwalk work. */
static int
xfs_iwalk_ag_work(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_iwalk_ag	*iwag;
	int			error = 0;

	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
	if (xfs_pwork_want_abort(pwork))
		goto out;

	error = xfs_iwalk_alloc(iwag);
	if (error)
		goto out;

	error = xfs_iwalk_ag(iwag);
	xfs_iwalk_free(iwag);
out:
	kmem_free(iwag);
	return error;
}

/*
 * Walk all the inodes in the filesystem using multiple threads to process each
 * AG.
 */
int
xfs_iwalk_threaded(
	struct xfs_mount	*mp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	bool			polled,
	void			*data)
{
	struct xfs_pwork_ctl	pctl;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	unsigned int		nr_threads;
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
			nr_threads);
	if (error)
		return error;

	for (; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_iwalk_ag	*iwag;

		if (xfs_pwork_ctl_want_abort(&pctl))
			break;

		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
		iwag->mp = mp;
		iwag->iwalk_fn = iwalk_fn;
		iwag->data = data;
		iwag->startino = startino;
		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
		xfs_pwork_queue(&pctl, &iwag->pwork);
		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
	}

	if (polled)
		xfs_pwork_poll(&pctl);
	return xfs_pwork_destroy(&pctl);
}
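
/*
 * Note that ownership of each xfs_iwalk_ag structure queued above passes to
 * the worker: xfs_iwalk_ag_work() frees it with kmem_free() even on abort.
 */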

/*
 * Allow callers to cache up to a page's worth of inobt records.  This reflects
 * the existing inumbers prefetching behavior.  Since the inobt walk does not
 * itself do anything with the inobt records, we can set a fairly high limit
 * here.
 */
#define MAX_INOBT_WALK_PREFETCH	\
	(PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
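
/*
 * For example, with 4 KiB pages and a 16-byte incore inobt record (sizes vary
 * by configuration), this caps the cache at 256 records, or 16384 inodes.
 */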

/*
 * Given the number of records that the user wanted, set the number of inobt
 * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
 */
static inline unsigned int
xfs_inobt_walk_prefetch(
	unsigned int		inobt_records)
{
	/*
	 * If the caller didn't tell us the number of inobt records they
	 * wanted, assume the maximum prefetch possible for best performance.
	 */
	if (inobt_records == 0)
		inobt_records = MAX_INOBT_WALK_PREFETCH;

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record.  This simplifies the AG inode walk loop setup code.
	 */
	inobt_records = max(inobt_records, 2U);

	/*
	 * Cap prefetch at that maximum so that we don't use an absurd amount
	 * of memory.
	 */
	return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
}

/*
 * Walk all inode btree records in the filesystem starting from @startino.  The
 * @inobt_walk_fn will be called for each btree record, being passed the incore
 * record and @data.  @inobt_records controls how many inobt records we try to
 * cache ahead of time.
 */
int
xfs_inobt_walk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_inobt_walk_fn	inobt_walk_fn,
	unsigned int		inobt_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.inobt_walk_fn	= inobt_walk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
		.pwork		= XFS_PWORK_SINGLE_THREADED,
	};
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for (; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
	}

	xfs_iwalk_free(&iwag);
	return error;
}