fs/xfs/libxfs/xfs_rmap_btree.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2014 Red Hat, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_sb.h"
  14 #include "xfs_mount.h"
  15 #include "xfs_defer.h"
  16 #include "xfs_inode.h"
  17 #include "xfs_trans.h"
  18 #include "xfs_alloc.h"
  19 #include "xfs_btree.h"
  20 #include "xfs_rmap.h"
  21 #include "xfs_rmap_btree.h"
  22 #include "xfs_trace.h"
  23 #include "xfs_cksum.h"
  24 #include "xfs_error.h"
  25 #include "xfs_extent_busy.h"
  26 #include "xfs_ag_resv.h"
  27
  28 /*
  29  * Reverse map btree.
  30  *
  31  * This is a per-ag tree used to track the owner(s) of a given extent. With
  32  * reflink it is possible for there to be multiple owners, which is a departure
  33  * from classic XFS. Owner records for data extents are inserted when the
  34  * extent is mapped and removed when an extent is unmapped.  Owner records for
  35  * all other block types (i.e. metadata) are inserted when an extent is
  36  * allocated and removed when an extent is freed. There can only be one owner
  37  * of a metadata extent, usually an inode or some other metadata structure like
  38  * an AG btree.
  39  *
  40  * The rmap btree is part of the free space management, so blocks for the tree
  41  * are sourced from the agfl. Hence we need transaction reservation support for
  42  * this tree so that the freelist is always large enough. This also impacts on
  43  * the minimum space we need to leave free in the AG.
  44  *
  45  * The tree is ordered by [ag block, owner, offset]. This is a large key size,
  46  * but it is the only way to enforce unique keys when a block can be owned by
  47  * multiple files at any offset. There's no need to order/search by extent
  48  * size for online updating/management of the tree. It is intended that most
  49  * reverse lookups will be to find the owner(s) of a particular block, or to
  50  * try to recover tree and file data from corrupt primary metadata.
  51  */
  52
  53 static struct xfs_btree_cur *
  54 xfs_rmapbt_dup_cursor(
  55         struct xfs_btree_cur    *cur)
  56 {
  57         return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
  58                         cur->bc_private.a.agbp, cur->bc_private.a.agno);
  59 }
  60
  61 STATIC void
  62 xfs_rmapbt_set_root(
  63         struct xfs_btree_cur    *cur,
  64         union xfs_btree_ptr     *ptr,
  65         int                     inc)
  66 {
  67         struct xfs_buf          *agbp = cur->bc_private.a.agbp;
  68         struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
  69         xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
  70         int                     btnum = cur->bc_btnum;
  71         struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
  72
  73         ASSERT(ptr->s != 0);
  74
  75         agf->agf_roots[btnum] = ptr->s;
  76         be32_add_cpu(&agf->agf_levels[btnum], inc);
  77         pag->pagf_levels[btnum] += inc;
  78         xfs_perag_put(pag);
  79
  80         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
  81 }
  82
  83 STATIC int
  84 xfs_rmapbt_alloc_block(
  85         struct xfs_btree_cur    *cur,
  86         union xfs_btree_ptr     *start,
  87         union xfs_btree_ptr     *new,
  88         int                     *stat)
  89 {
  90         struct xfs_buf          *agbp = cur->bc_private.a.agbp;
  91         struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
  92         int                     error;
  93         xfs_agblock_t           bno;
  94
  95         /* Allocate the new block from the freelist. If we can't, give up.  */
  96         error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
  97                                        &bno, 1);
  98         if (error)
  99                 return error;
 100
 101         trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
 102                         bno, 1);
 103         if (bno == NULLAGBLOCK) {
 104                 *stat = 0;
 105                 return 0;
 106         }
 107
 108         xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
 109                         false);
 110
 111         xfs_trans_agbtree_delta(cur->bc_tp, 1);
 112         new->s = cpu_to_be32(bno);
 113         be32_add_cpu(&agf->agf_rmap_blocks, 1);
 114         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
 115
 116         xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
 117
 118         *stat = 1;
 119         return 0;
 120 }
 121
 122 STATIC int
 123 xfs_rmapbt_free_block(
 124         struct xfs_btree_cur    *cur,
 125         struct xfs_buf          *bp)
 126 {
 127         struct xfs_buf          *agbp = cur->bc_private.a.agbp;
 128         struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
 129         xfs_agblock_t           bno;
 130         int                     error;
 131
 132         bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
 133         trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
 134                         bno, 1);
 135         be32_add_cpu(&agf->agf_rmap_blocks, -1);
 136         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
 137         error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
 138         if (error)
 139                 return error;
 140
 141         xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
 142                               XFS_EXTENT_BUSY_SKIP_DISCARD);
 143         xfs_trans_agbtree_delta(cur->bc_tp, -1);
 144
 145         xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
 146
 147         return 0;
 148 }
 149
 150 STATIC int
 151 xfs_rmapbt_get_minrecs(
 152         struct xfs_btree_cur    *cur,
 153         int                     level)
 154 {
 155         return cur->bc_mp->m_rmap_mnr[level != 0];
 156 }
 157
 158 STATIC int
 159 xfs_rmapbt_get_maxrecs(
 160         struct xfs_btree_cur    *cur,
 161         int                     level)
 162 {
 163         return cur->bc_mp->m_rmap_mxr[level != 0];
 164 }
 165
 166 STATIC void
 167 xfs_rmapbt_init_key_from_rec(
 168         union xfs_btree_key     *key,
 169         union xfs_btree_rec     *rec)
 170 {
 171         key->rmap.rm_startblock = rec->rmap.rm_startblock;
 172         key->rmap.rm_owner = rec->rmap.rm_owner;
 173         key->rmap.rm_offset = rec->rmap.rm_offset;
 174 }
 175
 176 /*
 177  * The high key for a reverse mapping record can be computed by shifting
 178  * the startblock and offset to the highest value that would still map
 179  * to that record.  In practice this means that we add blockcount-1 to
 180  * the startblock for all records, and if the record is for a data/attr
 181  * fork mapping, we add blockcount-1 to the offset too.
 182  */
 183 STATIC void
 184 xfs_rmapbt_init_high_key_from_rec(
 185         union xfs_btree_key     *key,
 186         union xfs_btree_rec     *rec)
 187 {
 188         uint64_t                off;
 189         int                     adj;
 190
 191         adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
 192
 193         key->rmap.rm_startblock = rec->rmap.rm_startblock;
 194         be32_add_cpu(&key->rmap.rm_startblock, adj);
 195         key->rmap.rm_owner = rec->rmap.rm_owner;
 196         key->rmap.rm_offset = rec->rmap.rm_offset;
 197         if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
 198             XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
 199                 return;
 200         off = be64_to_cpu(key->rmap.rm_offset);
 201         off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
 202         key->rmap.rm_offset = cpu_to_be64(off);
 203 }
 204
 205 STATIC void
 206 xfs_rmapbt_init_rec_from_cur(
 207         struct xfs_btree_cur    *cur,
 208         union xfs_btree_rec     *rec)
 209 {
 210         rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
 211         rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
 212         rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
 213         rec->rmap.rm_offset = cpu_to_be64(
 214                         xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
 215 }
 216
 217 STATIC void
 218 xfs_rmapbt_init_ptr_from_cur(
 219         struct xfs_btree_cur    *cur,
 220         union xfs_btree_ptr     *ptr)
 221 {
 222         struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 223
 224         ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
 225
 226         ptr->s = agf->agf_roots[cur->bc_btnum];
 227 }
 228
 229 STATIC int64_t
 230 xfs_rmapbt_key_diff(
 231         struct xfs_btree_cur    *cur,
 232         union xfs_btree_key     *key)
 233 {
 234         struct xfs_rmap_irec    *rec = &cur->bc_rec.r;
 235         struct xfs_rmap_key     *kp = &key->rmap;
 236         __u64                   x, y;
 237         int64_t                 d;
 238
 239         d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
 240         if (d)
 241                 return d;
 242
 243         x = be64_to_cpu(kp->rm_owner);
 244         y = rec->rm_owner;
 245         if (x > y)
 246                 return 1;
 247         else if (y > x)
 248                 return -1;
 249
 250         x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
 251         y = rec->rm_offset;
 252         if (x > y)
 253                 return 1;
 254         else if (y > x)
 255                 return -1;
 256         return 0;
 257 }
 258
 259 STATIC int64_t
 260 xfs_rmapbt_diff_two_keys(
 261         struct xfs_btree_cur    *cur,
 262         union xfs_btree_key     *k1,
 263         union xfs_btree_key     *k2)
 264 {
 265         struct xfs_rmap_key     *kp1 = &k1->rmap;
 266         struct xfs_rmap_key     *kp2 = &k2->rmap;
 267         int64_t                 d;
 268         __u64                   x, y;
 269
 270         d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
 271                        be32_to_cpu(kp2->rm_startblock);
 272         if (d)
 273                 return d;
 274
 275         x = be64_to_cpu(kp1->rm_owner);
 276         y = be64_to_cpu(kp2->rm_owner);
 277         if (x > y)
 278                 return 1;
 279         else if (y > x)
 280                 return -1;
 281
 282         x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
 283         y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
 284         if (x > y)
 285                 return 1;
 286         else if (y > x)
 287                 return -1;
 288         return 0;
 289 }
 290
 291 static xfs_failaddr_t
 292 xfs_rmapbt_verify(
 293         struct xfs_buf          *bp)
 294 {
 295         struct xfs_mount        *mp = bp->b_target->bt_mount;
 296         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 297         struct xfs_perag        *pag = bp->b_pag;
 298         xfs_failaddr_t          fa;
 299         unsigned int            level;
 300
 301         /*
 302          * magic number and level verification
 303          *
 304          * During growfs operations, we can't verify the exact level or owner as
 305          * the perag is not fully initialised and hence not attached to the
 306          * buffer.  In this case, check against the maximum tree depth.
 307          *
 308          * Similarly, during log recovery we will have a perag structure
 309          * attached, but the agf information will not yet have been initialised
 310          * from the on disk AGF. Again, we can only check against maximum limits
 311          * in this case.
 312          */
 313         if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
 314                 return __this_address;
 315
 316         if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
 317                 return __this_address;
 318         fa = xfs_btree_sblock_v5hdr_verify(bp);
 319         if (fa)
 320                 return fa;
 321
 322         level = be16_to_cpu(block->bb_level);
 323         if (pag && pag->pagf_init) {
 324                 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
 325                         return __this_address;
 326         } else if (level >= mp->m_rmap_maxlevels)
 327                 return __this_address;
 328
 329         return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
 330 }
 331
 332 static void
 333 xfs_rmapbt_read_verify(
 334         struct xfs_buf  *bp)
 335 {
 336         xfs_failaddr_t  fa;
 337
 338         if (!xfs_btree_sblock_verify_crc(bp))
 339                 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 340         else {
 341                 fa = xfs_rmapbt_verify(bp);
 342                 if (fa)
 343                         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 344         }
 345
 346         if (bp->b_error)
 347                 trace_xfs_btree_corrupt(bp, _RET_IP_);
 348 }
 349
 350 static void
 351 xfs_rmapbt_write_verify(
 352         struct xfs_buf  *bp)
 353 {
 354         xfs_failaddr_t  fa;
 355
 356         fa = xfs_rmapbt_verify(bp);
 357         if (fa) {
 358                 trace_xfs_btree_corrupt(bp, _RET_IP_);
 359                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 360                 return;
 361         }
 362         xfs_btree_sblock_calc_crc(bp);
 363
 364 }
 365
 366 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
 367         .name                   = "xfs_rmapbt",
 368         .verify_read            = xfs_rmapbt_read_verify,
 369         .verify_write           = xfs_rmapbt_write_verify,
 370         .verify_struct          = xfs_rmapbt_verify,
 371 };
 372
 373 STATIC int
 374 xfs_rmapbt_keys_inorder(
 375         struct xfs_btree_cur    *cur,
 376         union xfs_btree_key     *k1,
 377         union xfs_btree_key     *k2)
 378 {
 379         uint32_t                x;
 380         uint32_t                y;
 381         uint64_t                a;
 382         uint64_t                b;
 383
 384         x = be32_to_cpu(k1->rmap.rm_startblock);
 385         y = be32_to_cpu(k2->rmap.rm_startblock);
 386         if (x < y)
 387                 return 1;
 388         else if (x > y)
 389                 return 0;
 390         a = be64_to_cpu(k1->rmap.rm_owner);
 391         b = be64_to_cpu(k2->rmap.rm_owner);
 392         if (a < b)
 393                 return 1;
 394         else if (a > b)
 395                 return 0;
 396         a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
 397         b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
 398         if (a <= b)
 399                 return 1;
 400         return 0;
 401 }
 402
 403 STATIC int
 404 xfs_rmapbt_recs_inorder(
 405         struct xfs_btree_cur    *cur,
 406         union xfs_btree_rec     *r1,
 407         union xfs_btree_rec     *r2)
 408 {
 409         uint32_t                x;
 410         uint32_t                y;
 411         uint64_t                a;
 412         uint64_t                b;
 413
 414         x = be32_to_cpu(r1->rmap.rm_startblock);
 415         y = be32_to_cpu(r2->rmap.rm_startblock);
 416         if (x < y)
 417                 return 1;
 418         else if (x > y)
 419                 return 0;
 420         a = be64_to_cpu(r1->rmap.rm_owner);
 421         b = be64_to_cpu(r2->rmap.rm_owner);
 422         if (a < b)
 423                 return 1;
 424         else if (a > b)
 425                 return 0;
 426         a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
 427         b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
 428         if (a <= b)
 429                 return 1;
 430         return 0;
 431 }
 432
 433 static const struct xfs_btree_ops xfs_rmapbt_ops = {
 434         .rec_len                = sizeof(struct xfs_rmap_rec),
 435         .key_len                = 2 * sizeof(struct xfs_rmap_key),
 436
 437         .dup_cursor             = xfs_rmapbt_dup_cursor,
 438         .set_root               = xfs_rmapbt_set_root,
 439         .alloc_block            = xfs_rmapbt_alloc_block,
 440         .free_block             = xfs_rmapbt_free_block,
 441         .get_minrecs            = xfs_rmapbt_get_minrecs,
 442         .get_maxrecs            = xfs_rmapbt_get_maxrecs,
 443         .init_key_from_rec      = xfs_rmapbt_init_key_from_rec,
 444         .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
 445         .init_rec_from_cur      = xfs_rmapbt_init_rec_from_cur,
 446         .init_ptr_from_cur      = xfs_rmapbt_init_ptr_from_cur,
 447         .key_diff               = xfs_rmapbt_key_diff,
 448         .buf_ops                = &xfs_rmapbt_buf_ops,
 449         .diff_two_keys          = xfs_rmapbt_diff_two_keys,
 450         .keys_inorder           = xfs_rmapbt_keys_inorder,
 451         .recs_inorder           = xfs_rmapbt_recs_inorder,
 452 };
 453
 454 /*
 455  * Allocate a new allocation btree cursor.
 456  */
 457 struct xfs_btree_cur *
 458 xfs_rmapbt_init_cursor(
 459         struct xfs_mount        *mp,
 460         struct xfs_trans        *tp,
 461         struct xfs_buf          *agbp,
 462         xfs_agnumber_t          agno)
 463 {
 464         struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
 465         struct xfs_btree_cur    *cur;
 466
 467         cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
 468         cur->bc_tp = tp;
 469         cur->bc_mp = mp;
 470         /* Overlapping btree; 2 keys per pointer. */
 471         cur->bc_btnum = XFS_BTNUM_RMAP;
 472         cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
 473         cur->bc_blocklog = mp->m_sb.sb_blocklog;
 474         cur->bc_ops = &xfs_rmapbt_ops;
 475         cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
 476         cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
 477
 478         cur->bc_private.a.agbp = agbp;
 479         cur->bc_private.a.agno = agno;
 480
 481         return cur;
 482 }
 483
 484 /*
 485  * Calculate number of records in an rmap btree block.
 486  */
 487 int
 488 xfs_rmapbt_maxrecs(
 489         int                     blocklen,
 490         int                     leaf)
 491 {
 492         blocklen -= XFS_RMAP_BLOCK_LEN;
 493
 494         if (leaf)
 495                 return blocklen / sizeof(struct xfs_rmap_rec);
 496         return blocklen /
 497                 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
 498 }
 499
 500 /* Compute the maximum height of an rmap btree. */
 501 void
 502 xfs_rmapbt_compute_maxlevels(
 503         struct xfs_mount                *mp)
 504 {
 505         /*
 506          * On a non-reflink filesystem, the maximum number of rmap
 507          * records is the number of blocks in the AG, hence the max
 508          * rmapbt height is log_$maxrecs($agblocks).  However, with
 509          * reflink each AG block can have up to 2^32 (per the refcount
 510          * record format) owners, which means that theoretically we
 511          * could face up to 2^64 rmap records.
 512          *
 513          * That effectively means that the max rmapbt height must be
 514          * XFS_BTREE_MAXLEVELS.  "Fortunately" we'll run out of AG
 515          * blocks to feed the rmapbt long before the rmapbt reaches
 516          * maximum height.  The reflink code uses ag_resv_critical to
 517          * disallow reflinking when less than 10% of the per-AG metadata
 518          * block reservation since the fallback is a regular file copy.
 519          */
 520         if (xfs_sb_version_hasreflink(&mp->m_sb))
 521                 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
 522         else
 523                 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
 524                                 mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
 525 }
 526
 527 /* Calculate the refcount btree size for some records. */
 528 xfs_extlen_t
 529 xfs_rmapbt_calc_size(
 530         struct xfs_mount        *mp,
 531         unsigned long long      len)
 532 {
 533         return xfs_btree_calc_size(mp->m_rmap_mnr, len);
 534 }
 535
 536 /*
 537  * Calculate the maximum refcount btree size.
 538  */
 539 xfs_extlen_t
 540 xfs_rmapbt_max_size(
 541         struct xfs_mount        *mp,
 542         xfs_agblock_t           agblocks)
 543 {
 544         /* Bail out if we're uninitialized, which can happen in mkfs. */
 545         if (mp->m_rmap_mxr[0] == 0)
 546                 return 0;
 547
 548         return xfs_rmapbt_calc_size(mp, agblocks);
 549 }
 550
 551 /*
 552  * Figure out how many blocks to reserve and how many are used by this btree.
 553  */
 554 int
 555 xfs_rmapbt_calc_reserves(
 556         struct xfs_mount        *mp,
 557         struct xfs_trans        *tp,
 558         xfs_agnumber_t          agno,
 559         xfs_extlen_t            *ask,
 560         xfs_extlen_t            *used)
 561 {
 562         struct xfs_buf          *agbp;
 563         struct xfs_agf          *agf;
 564         xfs_agblock_t           agblocks;
 565         xfs_extlen_t            tree_len;
 566         int                     error;
 567
 568         if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
 569                 return 0;
 570
 571         error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
 572         if (error)
 573                 return error;
 574
 575         agf = XFS_BUF_TO_AGF(agbp);
 576         agblocks = be32_to_cpu(agf->agf_length);
 577         tree_len = be32_to_cpu(agf->agf_rmap_blocks);
 578         xfs_trans_brelse(tp, agbp);
 579
 580         /* Reserve 1% of the AG or enough for 1 block per record. */
 581         *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
 582         *used += tree_len;
 583
 584         return error;
 585 }