// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"
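
/*
 * Rough order of operations, going by the functions below: a repair
 * function stages a new btree by setting up a context with
 * xrep_newbt_init_ag(), xrep_newbt_init_inode(), or xrep_newbt_init_bare();
 * reserving disk space with xrep_newbt_alloc_blocks() or
 * xrep_newbt_add_extent(); feeding reserved blocks to the btree bulk loader
 * through xrep_newbt_claim_block(); and finally calling xrep_newbt_commit()
 * or xrep_newbt_cancel() to dispose of anything left over.
 */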

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}
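
/*
 * Worked example for the function above: on a hypothetical 100G AG with
 * only 8G free, free < div_u64(sz, 10), so any slack value still at -1
 * drops to two free slots per new btree block.
 */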

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}
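
/*
 * Hypothetical example (not a caller in this file): staging a new per-AG
 * rmap btree might look roughly like this, assuming sc->sa is set up:
 *
 *	struct xrep_newbt	xnr;
 *
 *	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG, hint,
 *			XFS_AG_RESV_RMAPBT);
 *	error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
 *	if (error)
 *		xrep_newbt_cancel(&xnr);
 */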

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}
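
/*
 * Note: once the function above returns zero, the fake inode fork belongs
 * to the staging context; xrep_newbt_free() releases it when the btree is
 * committed or cancelled.
 */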

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}
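
/*
 * The autoreap scheduled above arms a deferred-free intent (EFI) covering
 * the new reservation, so the blocks should come back automatically if the
 * repair never completes.  xrep_newbt_free_extent() below either lets that
 * intent run (xfs_alloc_commit_autoreap) or disarms it
 * (xfs_alloc_cancel_autoreap) once we know the fate of the blocks.
 */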

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}
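
/*
 * Because args.tp is NULL above, no autoreap intent is created for this
 * extent; as the comment says, callers must reap the reservation
 * themselves if the repair is cancelled.
 */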

/* Don't let our allocation hint take us beyond this AG */
STATIC void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == pag_agno(sc->sa.pag) &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint =
		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
		if (agno != pag_agno(sc->sa.pag)) {
			ASSERT(agno == pag_agno(sc->sa.pag));
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
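
/*
 * Note that every pass through the loop above finishes deferred work with
 * xrep_defer_finish(), presumably so that a large allocation split across
 * many small extents cannot pin an oversized transaction.
 */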

/* Don't let our allocation hint take us beyond EOFS */
STATIC void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_file_blocks(pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
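
/*
 * sc->ip is nonnull only when we're repairing a file-based (inode fork)
 * btree, so its presence presumably selects the file allocator here.
 */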

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
				xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
			xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	error = xfs_free_extent_later(sc->tp,
			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);

		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
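
/*
 * The XREP_MAX_ITRUNCATE_EFIS batching above bounds how many free-extent
 * intents can pile up before xrep_defer_finish() flushes them; judging by
 * the name, this is the same cap applied when truncating files.
 */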

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}
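
/*
 * Hypothetical usage sketch (not a caller in this file): a repair function
 * could record how much of its reservation went unused before committing:
 *
 *	unused = xrep_newbt_unused_blocks(&xnr);
 *	error = xrep_newbt_commit(&xnr);
 */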