fs/xfs/libxfs/xfs_ialloc.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_mount.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_btree.h"
  16 #include "xfs_ialloc.h"
  17 #include "xfs_ialloc_btree.h"
  18 #include "xfs_alloc.h"
  19 #include "xfs_errortag.h"
  20 #include "xfs_error.h"
  21 #include "xfs_bmap.h"
  22 #include "xfs_trans.h"
  23 #include "xfs_buf_item.h"
  24 #include "xfs_icreate_item.h"
  25 #include "xfs_icache.h"
  26 #include "xfs_trace.h"
  27 #include "xfs_log.h"
  28 #include "xfs_rmap.h"
  29 #include "xfs_ag.h"
  30 #include "xfs_health.h"
  31
  32 /*
  33  * Lookup a record by ino in the btree given by cur.
  34  */
  35 int                                     /* error */
  36 xfs_inobt_lookup(
  37         struct xfs_btree_cur    *cur,   /* btree cursor */
  38         xfs_agino_t             ino,    /* starting inode of chunk */
  39         xfs_lookup_t            dir,    /* <=, >=, == */
  40         int                     *stat)  /* success/failure */
  41 {
  42         cur->bc_rec.i.ir_startino = ino;
  43         cur->bc_rec.i.ir_holemask = 0;
  44         cur->bc_rec.i.ir_count = 0;
  45         cur->bc_rec.i.ir_freecount = 0;
  46         cur->bc_rec.i.ir_free = 0;
  47         return xfs_btree_lookup(cur, dir, stat);
  48 }
  49
  50 /*
  51  * Update the record referred to by cur to the value given.
  52  * This either works (return 0) or gets an EFSCORRUPTED error.
  53  */
  54 STATIC int                              /* error */
  55 xfs_inobt_update(
  56         struct xfs_btree_cur    *cur,   /* btree cursor */
  57         xfs_inobt_rec_incore_t  *irec)  /* btree record */
  58 {
  59         union xfs_btree_rec     rec;
  60
  61         rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
  62         if (xfs_has_sparseinodes(cur->bc_mp)) {
  63                 rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
  64                 rec.inobt.ir_u.sp.ir_count = irec->ir_count;
  65                 rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
  66         } else {
  67                 /* ir_holemask/ir_count not supported on-disk */
  68                 rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
  69         }
  70         rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
  71         return xfs_btree_update(cur, &rec);
  72 }
  73
  74 /* Convert on-disk btree record to incore inobt record. */
  75 void
  76 xfs_inobt_btrec_to_irec(
  77         struct xfs_mount                *mp,
  78         const union xfs_btree_rec       *rec,
  79         struct xfs_inobt_rec_incore     *irec)
  80 {
  81         irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
  82         if (xfs_has_sparseinodes(mp)) {
  83                 irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
  84                 irec->ir_count = rec->inobt.ir_u.sp.ir_count;
  85                 irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
  86         } else {
  87                 /*
  88                  * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
  89                  * values for full inode chunks.
  90                  */
  91                 irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
  92                 irec->ir_count = XFS_INODES_PER_CHUNK;
  93                 irec->ir_freecount =
  94                                 be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
  95         }
  96         irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
  97 }
  98
  99 /* Compute the freecount of an incore inode record. */
 100 uint8_t
 101 xfs_inobt_rec_freecount(
 102         const struct xfs_inobt_rec_incore       *irec)
 103 {
 104         uint64_t                                realfree = irec->ir_free;
 105
 106         if (xfs_inobt_issparse(irec->ir_holemask))
 107                 realfree &= xfs_inobt_irec_to_allocmask(irec);
 108         return hweight64(realfree);
 109 }
 110
 111 /* Simple checks for inode records. */
 112 xfs_failaddr_t
 113 xfs_inobt_check_irec(
 114         struct xfs_perag                        *pag,
 115         const struct xfs_inobt_rec_incore       *irec)
 116 {
 117         /* Record has to be properly aligned within the AG. */
 118         if (!xfs_verify_agino(pag, irec->ir_startino))
 119                 return __this_address;
 120         if (!xfs_verify_agino(pag,
 121                                 irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
 122                 return __this_address;
 123         if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
 124             irec->ir_count > XFS_INODES_PER_CHUNK)
 125                 return __this_address;
 126         if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
 127                 return __this_address;
 128
 129         if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
 130                 return __this_address;
 131
 132         return NULL;
 133 }
 134
 135 static inline int
 136 xfs_inobt_complain_bad_rec(
 137         struct xfs_btree_cur            *cur,
 138         xfs_failaddr_t                  fa,
 139         const struct xfs_inobt_rec_incore *irec)
 140 {
 141         struct xfs_mount                *mp = cur->bc_mp;
 142
 143         xfs_warn(mp,
 144                 "%sbt record corruption in AG %d detected at %pS!",
 145                 cur->bc_ops->name, cur->bc_group->xg_gno, fa);
 146         xfs_warn(mp,
 147 "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
 148                 irec->ir_startino, irec->ir_count, irec->ir_freecount,
 149                 irec->ir_free, irec->ir_holemask);
 150         xfs_btree_mark_sick(cur);
 151         return -EFSCORRUPTED;
 152 }
 153
 154 /*
 155  * Get the data from the pointed-to record.
 156  */
 157 int
 158 xfs_inobt_get_rec(
 159         struct xfs_btree_cur            *cur,
 160         struct xfs_inobt_rec_incore     *irec,
 161         int                             *stat)
 162 {
 163         struct xfs_mount                *mp = cur->bc_mp;
 164         union xfs_btree_rec             *rec;
 165         xfs_failaddr_t                  fa;
 166         int                             error;
 167
 168         error = xfs_btree_get_rec(cur, &rec, stat);
 169         if (error || *stat == 0)
 170                 return error;
 171
 172         xfs_inobt_btrec_to_irec(mp, rec, irec);
 173         fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec);
 174         if (fa)
 175                 return xfs_inobt_complain_bad_rec(cur, fa, irec);
 176
 177         return 0;
 178 }
 179
 180 /*
 181  * Insert a single inobt record. Cursor must already point to desired location.
 182  */
 183 int
 184 xfs_inobt_insert_rec(
 185         struct xfs_btree_cur    *cur,
 186         uint16_t                holemask,
 187         uint8_t                 count,
 188         int32_t                 freecount,
 189         xfs_inofree_t           free,
 190         int                     *stat)
 191 {
 192         cur->bc_rec.i.ir_holemask = holemask;
 193         cur->bc_rec.i.ir_count = count;
 194         cur->bc_rec.i.ir_freecount = freecount;
 195         cur->bc_rec.i.ir_free = free;
 196         return xfs_btree_insert(cur, stat);
 197 }
 198
 199 /*
 200  * Insert records describing a newly allocated inode chunk into the inobt.
 201  */
 202 STATIC int
 203 xfs_inobt_insert(
 204         struct xfs_perag        *pag,
 205         struct xfs_trans        *tp,
 206         struct xfs_buf          *agbp,
 207         xfs_agino_t             newino,
 208         xfs_agino_t             newlen,
 209         bool                    is_finobt)
 210 {
 211         struct xfs_btree_cur    *cur;
 212         xfs_agino_t             thisino;
 213         int                     i;
 214         int                     error;
 215
 216         if (is_finobt)
 217                 cur = xfs_finobt_init_cursor(pag, tp, agbp);
 218         else
 219                 cur = xfs_inobt_init_cursor(pag, tp, agbp);
 220
 221         for (thisino = newino;
 222              thisino < newino + newlen;
 223              thisino += XFS_INODES_PER_CHUNK) {
 224                 error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
 225                 if (error) {
 226                         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 227                         return error;
 228                 }
 229                 ASSERT(i == 0);
 230
 231                 error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
 232                                              XFS_INODES_PER_CHUNK,
 233                                              XFS_INODES_PER_CHUNK,
 234                                              XFS_INOBT_ALL_FREE, &i);
 235                 if (error) {
 236                         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 237                         return error;
 238                 }
 239                 ASSERT(i == 1);
 240         }
 241
 242         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 243
 244         return 0;
 245 }
 246
 247 /*
 248  * Verify that the number of free inodes in the AGI is correct.
 249  */
 250 #ifdef DEBUG
 251 static int
 252 xfs_check_agi_freecount(
 253         struct xfs_btree_cur    *cur)
 254 {
 255         if (cur->bc_nlevels == 1) {
 256                 xfs_inobt_rec_incore_t rec;
 257                 int             freecount = 0;
 258                 int             error;
 259                 int             i;
 260
 261                 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 262                 if (error)
 263                         return error;
 264
 265                 do {
 266                         error = xfs_inobt_get_rec(cur, &rec, &i);
 267                         if (error)
 268                                 return error;
 269
 270                         if (i) {
 271                                 freecount += rec.ir_freecount;
 272                                 error = xfs_btree_increment(cur, 0, &i);
 273                                 if (error)
 274                                         return error;
 275                         }
 276                 } while (i == 1);
 277
 278                 if (!xfs_is_shutdown(cur->bc_mp)) {
 279                         ASSERT(freecount ==
 280                                 to_perag(cur->bc_group)->pagi_freecount);
 281                 }
 282         }
 283         return 0;
 284 }
 285 #else
 286 #define xfs_check_agi_freecount(cur)    0
 287 #endif
 288
 289 /*
 290  * Initialise a new set of inodes. When called without a transaction context
 291  * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 292  * than logging them (which in a transaction context puts them into the AIL
 293  * for writeback rather than the xfsbufd queue).
 294  */
 295 int
 296 xfs_ialloc_inode_init(
 297         struct xfs_mount        *mp,
 298         struct xfs_trans        *tp,
 299         struct list_head        *buffer_list,
 300         int                     icount,
 301         xfs_agnumber_t          agno,
 302         xfs_agblock_t           agbno,
 303         xfs_agblock_t           length,
 304         unsigned int            gen)
 305 {
 306         struct xfs_buf          *fbuf;
 307         struct xfs_dinode       *free;
 308         int                     nbufs;
 309         int                     version;
 310         int                     i, j;
 311         xfs_daddr_t             d;
 312         xfs_ino_t               ino = 0;
 313         int                     error;
 314
 315         /*
 316          * Loop over the new block(s), filling in the inodes.  For small block
 317          * sizes, manipulate the inodes in buffers  which are multiples of the
 318          * blocks size.
 319          */
 320         nbufs = length / M_IGEO(mp)->blocks_per_cluster;
 321
 322         /*
 323          * Figure out what version number to use in the inodes we create.  If
 324          * the superblock version has caught up to the one that supports the new
 325          * inode format, then use the new inode version.  Otherwise use the old
 326          * version so that old kernels will continue to be able to use the file
 327          * system.
 328          *
 329          * For v3 inodes, we also need to write the inode number into the inode,
 330          * so calculate the first inode number of the chunk here as
 331          * XFS_AGB_TO_AGINO() only works within a filesystem block, not
 332          * across multiple filesystem blocks (such as a cluster) and so cannot
 333          * be used in the cluster buffer loop below.
 334          *
 335          * Further, because we are writing the inode directly into the buffer
 336          * and calculating a CRC on the entire inode, we have ot log the entire
 337          * inode so that the entire range the CRC covers is present in the log.
 338          * That means for v3 inode we log the entire buffer rather than just the
 339          * inode cores.
 340          */
 341         if (xfs_has_v3inodes(mp)) {
 342                 version = 3;
 343                 ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
 344
 345                 /*
 346                  * log the initialisation that is about to take place as an
 347                  * logical operation. This means the transaction does not
 348                  * need to log the physical changes to the inode buffers as log
 349                  * recovery will know what initialisation is actually needed.
 350                  * Hence we only need to log the buffers as "ordered" buffers so
 351                  * they track in the AIL as if they were physically logged.
 352                  */
 353                 if (tp)
 354                         xfs_icreate_log(tp, agno, agbno, icount,
 355                                         mp->m_sb.sb_inodesize, length, gen);
 356         } else
 357                 version = 2;
 358
 359         for (j = 0; j < nbufs; j++) {
 360                 /*
 361                  * Get the block.
 362                  */
 363                 d = XFS_AGB_TO_DADDR(mp, agno, agbno +
 364                                 (j * M_IGEO(mp)->blocks_per_cluster));
 365                 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 366                                 mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
 367                                 XBF_UNMAPPED, &fbuf);
 368                 if (error)
 369                         return error;
 370
 371                 /* Initialize the inode buffers and log them appropriately. */
 372                 fbuf->b_ops = &xfs_inode_buf_ops;
 373                 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
 374                 for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
 375                         int     ioffset = i << mp->m_sb.sb_inodelog;
 376
 377                         free = xfs_make_iptr(mp, fbuf, i);
 378                         free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 379                         free->di_version = version;
 380                         free->di_gen = cpu_to_be32(gen);
 381                         free->di_next_unlinked = cpu_to_be32(NULLAGINO);
 382
 383                         if (version == 3) {
 384                                 free->di_ino = cpu_to_be64(ino);
 385                                 ino++;
 386                                 uuid_copy(&free->di_uuid,
 387                                           &mp->m_sb.sb_meta_uuid);
 388                                 xfs_dinode_calc_crc(mp, free);
 389                         } else if (tp) {
 390                                 /* just log the inode core */
 391                                 xfs_trans_log_buf(tp, fbuf, ioffset,
 392                                           ioffset + XFS_DINODE_SIZE(mp) - 1);
 393                         }
 394                 }
 395
 396                 if (tp) {
 397                         /*
 398                          * Mark the buffer as an inode allocation buffer so it
 399                          * sticks in AIL at the point of this allocation
 400                          * transaction. This ensures the they are on disk before
 401                          * the tail of the log can be moved past this
 402                          * transaction (i.e. by preventing relogging from moving
 403                          * it forward in the log).
 404                          */
 405                         xfs_trans_inode_alloc_buf(tp, fbuf);
 406                         if (version == 3) {
 407                                 /*
 408                                  * Mark the buffer as ordered so that they are
 409                                  * not physically logged in the transaction but
 410                                  * still tracked in the AIL as part of the
 411                                  * transaction and pin the log appropriately.
 412                                  */
 413                                 xfs_trans_ordered_buf(tp, fbuf);
 414                         }
 415                 } else {
 416                         fbuf->b_flags |= XBF_DONE;
 417                         xfs_buf_delwri_queue(fbuf, buffer_list);
 418                         xfs_buf_relse(fbuf);
 419                 }
 420         }
 421         return 0;
 422 }
 423
 424 /*
 425  * Align startino and allocmask for a recently allocated sparse chunk such that
 426  * they are fit for insertion (or merge) into the on-disk inode btrees.
 427  *
 428  * Background:
 429  *
 430  * When enabled, sparse inode support increases the inode alignment from cluster
 431  * size to inode chunk size. This means that the minimum range between two
 432  * non-adjacent inode records in the inobt is large enough for a full inode
 433  * record. This allows for cluster sized, cluster aligned block allocation
 434  * without need to worry about whether the resulting inode record overlaps with
 435  * another record in the tree. Without this basic rule, we would have to deal
 436  * with the consequences of overlap by potentially undoing recent allocations in
 437  * the inode allocation codepath.
 438  *
 439  * Because of this alignment rule (which is enforced on mount), there are two
 440  * inobt possibilities for newly allocated sparse chunks. One is that the
 441  * aligned inode record for the chunk covers a range of inodes not already
 442  * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 443  * other is that a record already exists at the aligned startino that considers
 444  * the newly allocated range as sparse. In the latter case, record content is
 445  * merged in hope that sparse inode chunks fill to full chunks over time.
 446  */
 447 STATIC void
 448 xfs_align_sparse_ino(
 449         struct xfs_mount                *mp,
 450         xfs_agino_t                     *startino,
 451         uint16_t                        *allocmask)
 452 {
 453         xfs_agblock_t                   agbno;
 454         xfs_agblock_t                   mod;
 455         int                             offset;
 456
 457         agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
 458         mod = agbno % mp->m_sb.sb_inoalignmt;
 459         if (!mod)
 460                 return;
 461
 462         /* calculate the inode offset and align startino */
 463         offset = XFS_AGB_TO_AGINO(mp, mod);
 464         *startino -= offset;
 465
 466         /*
 467          * Since startino has been aligned down, left shift allocmask such that
 468          * it continues to represent the same physical inodes relative to the
 469          * new startino.
 470          */
 471         *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
 472 }
 473
 474 /*
 475  * Determine whether the source inode record can merge into the target. Both
 476  * records must be sparse, the inode ranges must match and there must be no
 477  * allocation overlap between the records.
 478  */
 479 STATIC bool
 480 __xfs_inobt_can_merge(
 481         struct xfs_inobt_rec_incore     *trec,  /* tgt record */
 482         struct xfs_inobt_rec_incore     *srec)  /* src record */
 483 {
 484         uint64_t                        talloc;
 485         uint64_t                        salloc;
 486
 487         /* records must cover the same inode range */
 488         if (trec->ir_startino != srec->ir_startino)
 489                 return false;
 490
 491         /* both records must be sparse */
 492         if (!xfs_inobt_issparse(trec->ir_holemask) ||
 493             !xfs_inobt_issparse(srec->ir_holemask))
 494                 return false;
 495
 496         /* both records must track some inodes */
 497         if (!trec->ir_count || !srec->ir_count)
 498                 return false;
 499
 500         /* can't exceed capacity of a full record */
 501         if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
 502                 return false;
 503
 504         /* verify there is no allocation overlap */
 505         talloc = xfs_inobt_irec_to_allocmask(trec);
 506         salloc = xfs_inobt_irec_to_allocmask(srec);
 507         if (talloc & salloc)
 508                 return false;
 509
 510         return true;
 511 }
 512
 513 /*
 514  * Merge the source inode record into the target. The caller must call
 515  * __xfs_inobt_can_merge() to ensure the merge is valid.
 516  */
 517 STATIC void
 518 __xfs_inobt_rec_merge(
 519         struct xfs_inobt_rec_incore     *trec,  /* target */
 520         struct xfs_inobt_rec_incore     *srec)  /* src */
 521 {
 522         ASSERT(trec->ir_startino == srec->ir_startino);
 523
 524         /* combine the counts */
 525         trec->ir_count += srec->ir_count;
 526         trec->ir_freecount += srec->ir_freecount;
 527
 528         /*
 529          * Merge the holemask and free mask. For both fields, 0 bits refer to
 530          * allocated inodes. We combine the allocated ranges with bitwise AND.
 531          */
 532         trec->ir_holemask &= srec->ir_holemask;
 533         trec->ir_free &= srec->ir_free;
 534 }
 535
 536 /*
 537  * Insert a new sparse inode chunk into the associated inode allocation btree.
 538  * The inode record for the sparse chunk is pre-aligned to a startino that
 539  * should match any pre-existing sparse inode record in the tree. This allows
 540  * sparse chunks to fill over time.
 541  *
 542  * If no preexisting record exists, the provided record is inserted.
 543  * If there is a preexisting record, the provided record is merged with the
 544  * existing record and updated in place. The merged record is returned in nrec.
 545  *
 546  * It is considered corruption if a merge is requested and not possible. Given
 547  * the sparse inode alignment constraints, this should never happen.
 548  */
 549 STATIC int
 550 xfs_inobt_insert_sprec(
 551         struct xfs_perag                *pag,
 552         struct xfs_trans                *tp,
 553         struct xfs_buf                  *agbp,
 554         struct xfs_inobt_rec_incore     *nrec)  /* in/out: new/merged rec. */
 555 {
 556         struct xfs_mount                *mp = pag_mount(pag);
 557         struct xfs_btree_cur            *cur;
 558         int                             error;
 559         int                             i;
 560         struct xfs_inobt_rec_incore     rec;
 561
 562         cur = xfs_inobt_init_cursor(pag, tp, agbp);
 563
 564         /* the new record is pre-aligned so we know where to look */
 565         error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
 566         if (error)
 567                 goto error;
 568         /* if nothing there, insert a new record and return */
 569         if (i == 0) {
 570                 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
 571                                              nrec->ir_count, nrec->ir_freecount,
 572                                              nrec->ir_free, &i);
 573                 if (error)
 574                         goto error;
 575                 if (XFS_IS_CORRUPT(mp, i != 1)) {
 576                         xfs_btree_mark_sick(cur);
 577                         error = -EFSCORRUPTED;
 578                         goto error;
 579                 }
 580
 581                 goto out;
 582         }
 583
 584         /*
 585          * A record exists at this startino.  Merge the records.
 586          */
 587         error = xfs_inobt_get_rec(cur, &rec, &i);
 588         if (error)
 589                 goto error;
 590         if (XFS_IS_CORRUPT(mp, i != 1)) {
 591                 xfs_btree_mark_sick(cur);
 592                 error = -EFSCORRUPTED;
 593                 goto error;
 594         }
 595         if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
 596                 xfs_btree_mark_sick(cur);
 597                 error = -EFSCORRUPTED;
 598                 goto error;
 599         }
 600
 601         /*
 602          * This should never fail. If we have coexisting records that
 603          * cannot merge, something is seriously wrong.
 604          */
 605         if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
 606                 xfs_btree_mark_sick(cur);
 607                 error = -EFSCORRUPTED;
 608                 goto error;
 609         }
 610
 611         trace_xfs_irec_merge_pre(pag, &rec, nrec);
 612
 613         /* merge to nrec to output the updated record */
 614         __xfs_inobt_rec_merge(nrec, &rec);
 615
 616         trace_xfs_irec_merge_post(pag, nrec);
 617
 618         error = xfs_inobt_rec_check_count(mp, nrec);
 619         if (error)
 620                 goto error;
 621
 622         error = xfs_inobt_update(cur, nrec);
 623         if (error)
 624                 goto error;
 625
 626 out:
 627         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 628         return 0;
 629 error:
 630         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 631         return error;
 632 }
 633
 634 /*
 635  * Insert a new sparse inode chunk into the free inode btree. The inode
 636  * record for the sparse chunk is pre-aligned to a startino that should match
 637  * any pre-existing sparse inode record in the tree. This allows sparse chunks
 638  * to fill over time.
 639  *
 640  * The new record is always inserted, overwriting a pre-existing record if
 641  * there is one.
 642  */
 643 STATIC int
 644 xfs_finobt_insert_sprec(
 645         struct xfs_perag                *pag,
 646         struct xfs_trans                *tp,
 647         struct xfs_buf                  *agbp,
 648         struct xfs_inobt_rec_incore     *nrec)  /* in/out: new rec. */
 649 {
 650         struct xfs_mount                *mp = pag_mount(pag);
 651         struct xfs_btree_cur            *cur;
 652         int                             error;
 653         int                             i;
 654
 655         cur = xfs_finobt_init_cursor(pag, tp, agbp);
 656
 657         /* the new record is pre-aligned so we know where to look */
 658         error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
 659         if (error)
 660                 goto error;
 661         /* if nothing there, insert a new record and return */
 662         if (i == 0) {
 663                 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
 664                                              nrec->ir_count, nrec->ir_freecount,
 665                                              nrec->ir_free, &i);
 666                 if (error)
 667                         goto error;
 668                 if (XFS_IS_CORRUPT(mp, i != 1)) {
 669                         xfs_btree_mark_sick(cur);
 670                         error = -EFSCORRUPTED;
 671                         goto error;
 672                 }
 673         } else {
 674                 error = xfs_inobt_update(cur, nrec);
 675                 if (error)
 676                         goto error;
 677         }
 678
 679         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 680         return 0;
 681 error:
 682         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 683         return error;
 684 }
 685
 686
 687 /*
 688  * Allocate new inodes in the allocation group specified by agbp.  Returns 0 if
 689  * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
 690  * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
 691  * inode count threshold, or the usual negative error code for other errors.
 692  */
 693 STATIC int
 694 xfs_ialloc_ag_alloc(
 695         struct xfs_perag        *pag,
 696         struct xfs_trans        *tp,
 697         struct xfs_buf          *agbp)
 698 {
 699         struct xfs_agi          *agi;
 700         struct xfs_alloc_arg    args;
 701         int                     error;
 702         xfs_agino_t             newino;         /* new first inode's number */
 703         xfs_agino_t             newlen;         /* new number of inodes */
 704         int                     isaligned = 0;  /* inode allocation at stripe */
 705                                                 /* unit boundary */
 706         /* init. to full chunk */
 707         struct xfs_inobt_rec_incore rec;
 708         struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
 709         uint16_t                allocmask = (uint16_t) -1;
 710         int                     do_sparse = 0;
 711
 712         memset(&args, 0, sizeof(args));
 713         args.tp = tp;
 714         args.mp = tp->t_mountp;
 715         args.fsbno = NULLFSBLOCK;
 716         args.oinfo = XFS_RMAP_OINFO_INODES;
 717         args.pag = pag;
 718
 719 #ifdef DEBUG
 720         /* randomly do sparse inode allocations */
 721         if (xfs_has_sparseinodes(tp->t_mountp) &&
 722             igeo->ialloc_min_blks < igeo->ialloc_blks)
 723                 do_sparse = get_random_u32_below(2);
 724 #endif
 725
 726         /*
 727          * Locking will ensure that we don't have two callers in here
 728          * at one time.
 729          */
 730         newlen = igeo->ialloc_inos;
 731         if (igeo->maxicount &&
 732             percpu_counter_read_positive(&args.mp->m_icount) + newlen >
 733                                                         igeo->maxicount)
 734                 return -ENOSPC;
 735         args.minlen = args.maxlen = igeo->ialloc_blks;
 736         /*
 737          * First try to allocate inodes contiguous with the last-allocated
 738          * chunk of inodes.  If the filesystem is striped, this will fill
 739          * an entire stripe unit with inodes.
 740          */
 741         agi = agbp->b_addr;
 742         newino = be32_to_cpu(agi->agi_newino);
 743         args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 744                      igeo->ialloc_blks;
 745         if (do_sparse)
 746                 goto sparse_alloc;
 747         if (likely(newino != NULLAGINO &&
 748                   (args.agbno < be32_to_cpu(agi->agi_length)))) {
 749                 args.prod = 1;
 750
 751                 /*
 752                  * We need to take into account alignment here to ensure that
 753                  * we don't modify the free list if we fail to have an exact
 754                  * block. If we don't have an exact match, and every oher
 755                  * attempt allocation attempt fails, we'll end up cancelling
 756                  * a dirty transaction and shutting down.
 757                  *
 758                  * For an exact allocation, alignment must be 1,
 759                  * however we need to take cluster alignment into account when
 760                  * fixing up the freelist. Use the minalignslop field to
 761                  * indicate that extra blocks might be required for alignment,
 762                  * but not to use them in the actual exact allocation.
 763                  */
 764                 args.alignment = 1;
 765                 args.minalignslop = igeo->cluster_align - 1;
 766
 767                 /* Allow space for the inode btree to split. */
 768                 args.minleft = igeo->inobt_maxlevels;
 769                 error = xfs_alloc_vextent_exact_bno(&args,
 770                                 xfs_agbno_to_fsb(pag, args.agbno));
 771                 if (error)
 772                         return error;
 773
 774                 /*
 775                  * This request might have dirtied the transaction if the AG can
 776                  * satisfy the request, but the exact block was not available.
 777                  * If the allocation did fail, subsequent requests will relax
 778                  * the exact agbno requirement and increase the alignment
 779                  * instead. It is critical that the total size of the request
 780                  * (len + alignment + slop) does not increase from this point
 781                  * on, so reset minalignslop to ensure it is not included in
 782                  * subsequent requests.
 783                  */
 784                 args.minalignslop = 0;
 785         }
 786
 787         if (unlikely(args.fsbno == NULLFSBLOCK)) {
 788                 /*
 789                  * Set the alignment for the allocation.
 790                  * If stripe alignment is turned on then align at stripe unit
 791                  * boundary.
 792                  * If the cluster size is smaller than a filesystem block
 793                  * then we're doing I/O for inodes in filesystem block size
 794                  * pieces, so don't need alignment anyway.
 795                  */
 796                 isaligned = 0;
 797                 if (igeo->ialloc_align) {
 798                         ASSERT(!xfs_has_noalign(args.mp));
 799                         args.alignment = args.mp->m_dalign;
 800                         isaligned = 1;
 801                 } else
 802                         args.alignment = igeo->cluster_align;
 803                 /*
 804                  * Allocate a fixed-size extent of inodes.
 805                  */
 806                 args.prod = 1;
 807                 /*
 808                  * Allow space for the inode btree to split.
 809                  */
 810                 args.minleft = igeo->inobt_maxlevels;
 811                 error = xfs_alloc_vextent_near_bno(&args,
 812                                 xfs_agbno_to_fsb(pag,
 813                                         be32_to_cpu(agi->agi_root)));
 814                 if (error)
 815                         return error;
 816         }
 817
 818         /*
 819          * If stripe alignment is turned on, then try again with cluster
 820          * alignment.
 821          */
 822         if (isaligned && args.fsbno == NULLFSBLOCK) {
 823                 args.alignment = igeo->cluster_align;
 824                 error = xfs_alloc_vextent_near_bno(&args,
 825                                 xfs_agbno_to_fsb(pag,
 826                                         be32_to_cpu(agi->agi_root)));
 827                 if (error)
 828                         return error;
 829         }
 830
 831         /*
 832          * Finally, try a sparse allocation if the filesystem supports it and
 833          * the sparse allocation length is smaller than a full chunk.
 834          */
 835         if (xfs_has_sparseinodes(args.mp) &&
 836             igeo->ialloc_min_blks < igeo->ialloc_blks &&
 837             args.fsbno == NULLFSBLOCK) {
 838 sparse_alloc:
 839                 args.alignment = args.mp->m_sb.sb_spino_align;
 840                 args.prod = 1;
 841
 842                 args.minlen = igeo->ialloc_min_blks;
 843                 args.maxlen = args.minlen;
 844
 845                 /*
 846                  * The inode record will be aligned to full chunk size. We must
 847                  * prevent sparse allocation from AG boundaries that result in
 848                  * invalid inode records, such as records that start at agbno 0
 849                  * or extend beyond the AG.
 850                  *
 851                  * Set min agbno to the first aligned, non-zero agbno and max to
 852                  * the last aligned agbno that is at least one full chunk from
 853                  * the end of the AG.
 854                  */
 855                 args.min_agbno = args.mp->m_sb.sb_inoalignmt;
 856                 args.max_agbno = round_down(xfs_ag_block_count(args.mp,
 857                                                         pag_agno(pag)),
 858                                             args.mp->m_sb.sb_inoalignmt) -
 859                                  igeo->ialloc_blks;
 860
 861                 error = xfs_alloc_vextent_near_bno(&args,
 862                                 xfs_agbno_to_fsb(pag,
 863                                         be32_to_cpu(agi->agi_root)));
 864                 if (error)
 865                         return error;
 866
 867                 newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
 868                 ASSERT(newlen <= XFS_INODES_PER_CHUNK);
 869                 allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
 870         }
 871
 872         if (args.fsbno == NULLFSBLOCK)
 873                 return -EAGAIN;
 874
 875         ASSERT(args.len == args.minlen);
 876
 877         /*
 878          * Stamp and write the inode buffers.
 879          *
 880          * Seed the new inode cluster with a random generation number. This
 881          * prevents short-term reuse of generation numbers if a chunk is
 882          * freed and then immediately reallocated. We use random numbers
 883          * rather than a linear progression to prevent the next generation
 884          * number from being easily guessable.
 885          */
 886         error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag),
 887                         args.agbno, args.len, get_random_u32());
 888
 889         if (error)
 890                 return error;
 891         /*
 892          * Convert the results.
 893          */
 894         newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
 895
 896         if (xfs_inobt_issparse(~allocmask)) {
 897                 /*
 898                  * We've allocated a sparse chunk. Align the startino and mask.
 899                  */
 900                 xfs_align_sparse_ino(args.mp, &newino, &allocmask);
 901
 902                 rec.ir_startino = newino;
 903                 rec.ir_holemask = ~allocmask;
 904                 rec.ir_count = newlen;
 905                 rec.ir_freecount = newlen;
 906                 rec.ir_free = XFS_INOBT_ALL_FREE;
 907
 908                 /*
 909                  * Insert the sparse record into the inobt and allow for a merge
 910                  * if necessary. If a merge does occur, rec is updated to the
 911                  * merged record.
 912                  */
 913                 error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
 914                 if (error == -EFSCORRUPTED) {
 915                         xfs_alert(args.mp,
 916         "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
 917                                   xfs_agino_to_ino(pag, rec.ir_startino),
 918                                   rec.ir_holemask, rec.ir_count);
 919                         xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
 920                 }
 921                 if (error)
 922                         return error;
 923
 924                 /*
 925                  * We can't merge the part we've just allocated as for the inobt
 926                  * due to finobt semantics. The original record may or may not
 927                  * exist independent of whether physical inodes exist in this
 928                  * sparse chunk.
 929                  *
 930                  * We must update the finobt record based on the inobt record.
 931                  * rec contains the fully merged and up to date inobt record
 932                  * from the previous call. Set merge false to replace any
 933                  * existing record with this one.
 934                  */
 935                 if (xfs_has_finobt(args.mp)) {
 936                         error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
 937                         if (error)
 938                                 return error;
 939                 }
 940         } else {
 941                 /* full chunk - insert new records to both btrees */
 942                 error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
 943                 if (error)
 944                         return error;
 945
 946                 if (xfs_has_finobt(args.mp)) {
 947                         error = xfs_inobt_insert(pag, tp, agbp, newino,
 948                                                  newlen, true);
 949                         if (error)
 950                                 return error;
 951                 }
 952         }
 953
 954         /*
 955          * Update AGI counts and newino.
 956          */
 957         be32_add_cpu(&agi->agi_count, newlen);
 958         be32_add_cpu(&agi->agi_freecount, newlen);
 959         pag->pagi_freecount += newlen;
 960         pag->pagi_count += newlen;
 961         agi->agi_newino = cpu_to_be32(newino);
 962
 963         /*
 964          * Log allocation group header fields
 965          */
 966         xfs_ialloc_log_agi(tp, agbp,
 967                 XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
 968         /*
 969          * Modify/log superblock values for inode count and inode free count.
 970          */
 971         xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
 972         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
 973         return 0;
 974 }
 975
 976 /*
 977  * Try to retrieve the next record to the left/right from the current one.
 978  */
 979 STATIC int
 980 xfs_ialloc_next_rec(
 981         struct xfs_btree_cur    *cur,
 982         xfs_inobt_rec_incore_t  *rec,
 983         int                     *done,
 984         int                     left)
 985 {
 986         int                     error;
 987         int                     i;
 988
 989         if (left)
 990                 error = xfs_btree_decrement(cur, 0, &i);
 991         else
 992                 error = xfs_btree_increment(cur, 0, &i);
 993
 994         if (error)
 995                 return error;
 996         *done = !i;
 997         if (i) {
 998                 error = xfs_inobt_get_rec(cur, rec, &i);
 999                 if (error)
1000                         return error;
1001                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1002                         xfs_btree_mark_sick(cur);
1003                         return -EFSCORRUPTED;
1004                 }
1005         }
1006
1007         return 0;
1008 }
1009
1010 STATIC int
1011 xfs_ialloc_get_rec(
1012         struct xfs_btree_cur    *cur,
1013         xfs_agino_t             agino,
1014         xfs_inobt_rec_incore_t  *rec,
1015         int                     *done)
1016 {
1017         int                     error;
1018         int                     i;
1019
1020         error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
1021         if (error)
1022                 return error;
1023         *done = !i;
1024         if (i) {
1025                 error = xfs_inobt_get_rec(cur, rec, &i);
1026                 if (error)
1027                         return error;
1028                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1029                         xfs_btree_mark_sick(cur);
1030                         return -EFSCORRUPTED;
1031                 }
1032         }
1033
1034         return 0;
1035 }
1036
1037 /*
1038  * Return the offset of the first free inode in the record. If the inode chunk
1039  * is sparsely allocated, we convert the record holemask to inode granularity
1040  * and mask off the unallocated regions from the inode free mask.
1041  */
1042 STATIC int
1043 xfs_inobt_first_free_inode(
1044         struct xfs_inobt_rec_incore     *rec)
1045 {
1046         xfs_inofree_t                   realfree;
1047
1048         /* if there are no holes, return the first available offset */
1049         if (!xfs_inobt_issparse(rec->ir_holemask))
1050                 return xfs_lowbit64(rec->ir_free);
1051
1052         realfree = xfs_inobt_irec_to_allocmask(rec);
1053         realfree &= rec->ir_free;
1054
1055         return xfs_lowbit64(realfree);
1056 }
1057
1058 /*
1059  * If this AG has corrupt inodes, check if allocating this inode would fail
1060  * with corruption errors.  Returns 0 if we're clear, or EAGAIN to try again
1061  * somewhere else.
1062  */
1063 static int
1064 xfs_dialloc_check_ino(
1065         struct xfs_perag        *pag,
1066         struct xfs_trans        *tp,
1067         xfs_ino_t               ino)
1068 {
1069         struct xfs_imap         imap;
1070         struct xfs_buf          *bp;
1071         int                     error;
1072
1073         error = xfs_imap(pag, tp, ino, &imap, 0);
1074         if (error)
1075                 return -EAGAIN;
1076
1077         error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp);
1078         if (error)
1079                 return -EAGAIN;
1080
1081         xfs_trans_brelse(tp, bp);
1082         return 0;
1083 }
1084
1085 /*
1086  * Allocate an inode using the inobt-only algorithm.
1087  */
1088 STATIC int
1089 xfs_dialloc_ag_inobt(
1090         struct xfs_perag        *pag,
1091         struct xfs_trans        *tp,
1092         struct xfs_buf          *agbp,
1093         xfs_ino_t               parent,
1094         xfs_ino_t               *inop)
1095 {
1096         struct xfs_mount        *mp = tp->t_mountp;
1097         struct xfs_agi          *agi = agbp->b_addr;
1098         xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
1099         xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
1100         struct xfs_btree_cur    *cur, *tcur;
1101         struct xfs_inobt_rec_incore rec, trec;
1102         xfs_ino_t               ino;
1103         int                     error;
1104         int                     offset;
1105         int                     i, j;
1106         int                     searchdistance = 10;
1107
1108         ASSERT(xfs_perag_initialised_agi(pag));
1109         ASSERT(xfs_perag_allows_inodes(pag));
1110         ASSERT(pag->pagi_freecount > 0);
1111
1112  restart_pagno:
1113         cur = xfs_inobt_init_cursor(pag, tp, agbp);
1114         /*
1115          * If pagino is 0 (this is the root inode allocation) use newino.
1116          * This must work because we've just allocated some.
1117          */
1118         if (!pagino)
1119                 pagino = be32_to_cpu(agi->agi_newino);
1120
1121         error = xfs_check_agi_freecount(cur);
1122         if (error)
1123                 goto error0;
1124
1125         /*
1126          * If in the same AG as the parent, try to get near the parent.
1127          */
1128         if (pagno == pag_agno(pag)) {
1129                 int             doneleft;       /* done, to the left */
1130                 int             doneright;      /* done, to the right */
1131
1132                 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
1133                 if (error)
1134                         goto error0;
1135                 if (XFS_IS_CORRUPT(mp, i != 1)) {
1136                         xfs_btree_mark_sick(cur);
1137                         error = -EFSCORRUPTED;
1138                         goto error0;
1139                 }
1140
1141                 error = xfs_inobt_get_rec(cur, &rec, &j);
1142                 if (error)
1143                         goto error0;
1144                 if (XFS_IS_CORRUPT(mp, j != 1)) {
1145                         xfs_btree_mark_sick(cur);
1146                         error = -EFSCORRUPTED;
1147                         goto error0;
1148                 }
1149
1150                 if (rec.ir_freecount > 0) {
1151                         /*
1152                          * Found a free inode in the same chunk
1153                          * as the parent, done.
1154                          */
1155                         goto alloc_inode;
1156                 }
1157
1158
1159                 /*
1160                  * In the same AG as parent, but parent's chunk is full.
1161                  */
1162
1163                 /* duplicate the cursor, search left & right simultaneously */
1164                 error = xfs_btree_dup_cursor(cur, &tcur);
1165                 if (error)
1166                         goto error0;
1167
1168                 /*
1169                  * Skip to last blocks looked up if same parent inode.
1170                  */
1171                 if (pagino != NULLAGINO &&
1172                     pag->pagl_pagino == pagino &&
1173                     pag->pagl_leftrec != NULLAGINO &&
1174                     pag->pagl_rightrec != NULLAGINO) {
1175                         error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
1176                                                    &trec, &doneleft);
1177                         if (error)
1178                                 goto error1;
1179
1180                         error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
1181                                                    &rec, &doneright);
1182                         if (error)
1183                                 goto error1;
1184                 } else {
1185                         /* search left with tcur, back up 1 record */
1186                         error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
1187                         if (error)
1188                                 goto error1;
1189
1190                         /* search right with cur, go forward 1 record. */
1191                         error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
1192                         if (error)
1193                                 goto error1;
1194                 }
1195
1196                 /*
1197                  * Loop until we find an inode chunk with a free inode.
1198                  */
1199                 while (--searchdistance > 0 && (!doneleft || !doneright)) {
1200                         int     useleft;  /* using left inode chunk this time */
1201
1202                         /* figure out the closer block if both are valid. */
1203                         if (!doneleft && !doneright) {
1204                                 useleft = pagino -
1205                                  (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
1206                                   rec.ir_startino - pagino;
1207                         } else {
1208                                 useleft = !doneleft;
1209                         }
1210
1211                         /* free inodes to the left? */
1212                         if (useleft && trec.ir_freecount) {
1213                                 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1214                                 cur = tcur;
1215
1216                                 pag->pagl_leftrec = trec.ir_startino;
1217                                 pag->pagl_rightrec = rec.ir_startino;
1218                                 pag->pagl_pagino = pagino;
1219                                 rec = trec;
1220                                 goto alloc_inode;
1221                         }
1222
1223                         /* free inodes to the right? */
1224                         if (!useleft && rec.ir_freecount) {
1225                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1226
1227                                 pag->pagl_leftrec = trec.ir_startino;
1228                                 pag->pagl_rightrec = rec.ir_startino;
1229                                 pag->pagl_pagino = pagino;
1230                                 goto alloc_inode;
1231                         }
1232
1233                         /* get next record to check */
1234                         if (useleft) {
1235                                 error = xfs_ialloc_next_rec(tcur, &trec,
1236                                                                  &doneleft, 1);
1237                         } else {
1238                                 error = xfs_ialloc_next_rec(cur, &rec,
1239                                                                  &doneright, 0);
1240                         }
1241                         if (error)
1242                                 goto error1;
1243                 }
1244
1245                 if (searchdistance <= 0) {
1246                         /*
1247                          * Not in range - save last search
1248                          * location and allocate a new inode
1249                          */
1250                         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1251                         pag->pagl_leftrec = trec.ir_startino;
1252                         pag->pagl_rightrec = rec.ir_startino;
1253                         pag->pagl_pagino = pagino;
1254
1255                 } else {
1256                         /*
1257                          * We've reached the end of the btree. because
1258                          * we are only searching a small chunk of the
1259                          * btree each search, there is obviously free
1260                          * inodes closer to the parent inode than we
1261                          * are now. restart the search again.
1262                          */
1263                         pag->pagl_pagino = NULLAGINO;
1264                         pag->pagl_leftrec = NULLAGINO;
1265                         pag->pagl_rightrec = NULLAGINO;
1266                         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1267                         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1268                         goto restart_pagno;
1269                 }
1270         }
1271
1272         /*
1273          * In a different AG from the parent.
1274          * See if the most recently allocated block has any free.
1275          */
1276         if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1277                 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1278                                          XFS_LOOKUP_EQ, &i);
1279                 if (error)
1280                         goto error0;
1281
1282                 if (i == 1) {
1283                         error = xfs_inobt_get_rec(cur, &rec, &j);
1284                         if (error)
1285                                 goto error0;
1286
1287                         if (j == 1 && rec.ir_freecount > 0) {
1288                                 /*
1289                                  * The last chunk allocated in the group
1290                                  * still has a free inode.
1291                                  */
1292                                 goto alloc_inode;
1293                         }
1294                 }
1295         }
1296
1297         /*
1298          * None left in the last group, search the whole AG
1299          */
1300         error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1301         if (error)
1302                 goto error0;
1303         if (XFS_IS_CORRUPT(mp, i != 1)) {
1304                 xfs_btree_mark_sick(cur);
1305                 error = -EFSCORRUPTED;
1306                 goto error0;
1307         }
1308
1309         for (;;) {
1310                 error = xfs_inobt_get_rec(cur, &rec, &i);
1311                 if (error)
1312                         goto error0;
1313                 if (XFS_IS_CORRUPT(mp, i != 1)) {
1314                         xfs_btree_mark_sick(cur);
1315                         error = -EFSCORRUPTED;
1316                         goto error0;
1317                 }
1318                 if (rec.ir_freecount > 0)
1319                         break;
1320                 error = xfs_btree_increment(cur, 0, &i);
1321                 if (error)
1322                         goto error0;
1323                 if (XFS_IS_CORRUPT(mp, i != 1)) {
1324                         xfs_btree_mark_sick(cur);
1325                         error = -EFSCORRUPTED;
1326                         goto error0;
1327                 }
1328         }
1329
1330 alloc_inode:
1331         offset = xfs_inobt_first_free_inode(&rec);
1332         ASSERT(offset >= 0);
1333         ASSERT(offset < XFS_INODES_PER_CHUNK);
1334         ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1335                                    XFS_INODES_PER_CHUNK) == 0);
1336         ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);
1337
1338         if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
1339                 error = xfs_dialloc_check_ino(pag, tp, ino);
1340                 if (error)
1341                         goto error0;
1342         }
1343
1344         rec.ir_free &= ~XFS_INOBT_MASK(offset);
1345         rec.ir_freecount--;
1346         error = xfs_inobt_update(cur, &rec);
1347         if (error)
1348                 goto error0;
1349         be32_add_cpu(&agi->agi_freecount, -1);
1350         xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1351         pag->pagi_freecount--;
1352
1353         error = xfs_check_agi_freecount(cur);
1354         if (error)
1355                 goto error0;
1356
1357         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1358         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1359         *inop = ino;
1360         return 0;
1361 error1:
1362         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1363 error0:
1364         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1365         return error;
1366 }
1367
1368 /*
1369  * Use the free inode btree to allocate an inode based on distance from the
1370  * parent. Note that the provided cursor may be deleted and replaced.
1371  */
1372 STATIC int
1373 xfs_dialloc_ag_finobt_near(
1374         xfs_agino_t                     pagino,
1375         struct xfs_btree_cur            **ocur,
1376         struct xfs_inobt_rec_incore     *rec)
1377 {
1378         struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
1379         struct xfs_btree_cur            *rcur;  /* right search cursor */
1380         struct xfs_inobt_rec_incore     rrec;
1381         int                             error;
1382         int                             i, j;
1383
1384         error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
1385         if (error)
1386                 return error;
1387
1388         if (i == 1) {
1389                 error = xfs_inobt_get_rec(lcur, rec, &i);
1390                 if (error)
1391                         return error;
1392                 if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
1393                         xfs_btree_mark_sick(lcur);
1394                         return -EFSCORRUPTED;
1395                 }
1396
1397                 /*
1398                  * See if we've landed in the parent inode record. The finobt
1399                  * only tracks chunks with at least one free inode, so record
1400                  * existence is enough.
1401                  */
1402                 if (pagino >= rec->ir_startino &&
1403                     pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
1404                         return 0;
1405         }
1406
1407         error = xfs_btree_dup_cursor(lcur, &rcur);
1408         if (error)
1409                 return error;
1410
1411         error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
1412         if (error)
1413                 goto error_rcur;
1414         if (j == 1) {
1415                 error = xfs_inobt_get_rec(rcur, &rrec, &j);
1416                 if (error)
1417                         goto error_rcur;
1418                 if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
1419                         xfs_btree_mark_sick(lcur);
1420                         error = -EFSCORRUPTED;
1421                         goto error_rcur;
1422                 }
1423         }
1424
1425         if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
1426                 xfs_btree_mark_sick(lcur);
1427                 error = -EFSCORRUPTED;
1428                 goto error_rcur;
1429         }
1430         if (i == 1 && j == 1) {
1431                 /*
1432                  * Both the left and right records are valid. Choose the closer
1433                  * inode chunk to the target.
1434                  */
1435                 if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
1436                     (rrec.ir_startino - pagino)) {
1437                         *rec = rrec;
1438                         xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1439                         *ocur = rcur;
1440                 } else {
1441                         xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1442                 }
1443         } else if (j == 1) {
1444                 /* only the right record is valid */
1445                 *rec = rrec;
1446                 xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1447                 *ocur = rcur;
1448         } else if (i == 1) {
1449                 /* only the left record is valid */
1450                 xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1451         }
1452
1453         return 0;
1454
1455 error_rcur:
1456         xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
1457         return error;
1458 }
1459
1460 /*
1461  * Use the free inode btree to find a free inode based on a newino hint. If
1462  * the hint is NULL, find the first free inode in the AG.
1463  */
1464 STATIC int
1465 xfs_dialloc_ag_finobt_newino(
1466         struct xfs_agi                  *agi,
1467         struct xfs_btree_cur            *cur,
1468         struct xfs_inobt_rec_incore     *rec)
1469 {
1470         int error;
1471         int i;
1472
1473         if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1474                 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1475                                          XFS_LOOKUP_EQ, &i);
1476                 if (error)
1477                         return error;
1478                 if (i == 1) {
1479                         error = xfs_inobt_get_rec(cur, rec, &i);
1480                         if (error)
1481                                 return error;
1482                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1483                                 xfs_btree_mark_sick(cur);
1484                                 return -EFSCORRUPTED;
1485                         }
1486                         return 0;
1487                 }
1488         }
1489
1490         /*
1491          * Find the first inode available in the AG.
1492          */
1493         error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1494         if (error)
1495                 return error;
1496         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1497                 xfs_btree_mark_sick(cur);
1498                 return -EFSCORRUPTED;
1499         }
1500
1501         error = xfs_inobt_get_rec(cur, rec, &i);
1502         if (error)
1503                 return error;
1504         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1505                 xfs_btree_mark_sick(cur);
1506                 return -EFSCORRUPTED;
1507         }
1508
1509         return 0;
1510 }
1511
1512 /*
1513  * Update the inobt based on a modification made to the finobt. Also ensure that
1514  * the records from both trees are equivalent post-modification.
1515  */
1516 STATIC int
1517 xfs_dialloc_ag_update_inobt(
1518         struct xfs_btree_cur            *cur,   /* inobt cursor */
1519         struct xfs_inobt_rec_incore     *frec,  /* finobt record */
1520         int                             offset) /* inode offset */
1521 {
1522         struct xfs_inobt_rec_incore     rec;
1523         int                             error;
1524         int                             i;
1525
1526         error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1527         if (error)
1528                 return error;
1529         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1530                 xfs_btree_mark_sick(cur);
1531                 return -EFSCORRUPTED;
1532         }
1533
1534         error = xfs_inobt_get_rec(cur, &rec, &i);
1535         if (error)
1536                 return error;
1537         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1538                 xfs_btree_mark_sick(cur);
1539                 return -EFSCORRUPTED;
1540         }
1541         ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1542                                    XFS_INODES_PER_CHUNK) == 0);
1543
1544         rec.ir_free &= ~XFS_INOBT_MASK(offset);
1545         rec.ir_freecount--;
1546
1547         if (XFS_IS_CORRUPT(cur->bc_mp,
1548                            rec.ir_free != frec->ir_free ||
1549                            rec.ir_freecount != frec->ir_freecount)) {
1550                 xfs_btree_mark_sick(cur);
1551                 return -EFSCORRUPTED;
1552         }
1553
1554         return xfs_inobt_update(cur, &rec);
1555 }
1556
1557 /*
1558  * Allocate an inode using the free inode btree, if available. Otherwise, fall
1559  * back to the inobt search algorithm.
1560  *
1561  * The caller selected an AG for us, and made sure that free inodes are
1562  * available.
1563  */
1564 static int
1565 xfs_dialloc_ag(
1566         struct xfs_perag        *pag,
1567         struct xfs_trans        *tp,
1568         struct xfs_buf          *agbp,
1569         xfs_ino_t               parent,
1570         xfs_ino_t               *inop)
1571 {
1572         struct xfs_mount                *mp = tp->t_mountp;
1573         struct xfs_agi                  *agi = agbp->b_addr;
1574         xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
1575         xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
1576         struct xfs_btree_cur            *cur;   /* finobt cursor */
1577         struct xfs_btree_cur            *icur;  /* inobt cursor */
1578         struct xfs_inobt_rec_incore     rec;
1579         xfs_ino_t                       ino;
1580         int                             error;
1581         int                             offset;
1582         int                             i;
1583
1584         if (!xfs_has_finobt(mp))
1585                 return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);
1586
1587         /*
1588          * If pagino is 0 (this is the root inode allocation) use newino.
1589          * This must work because we've just allocated some.
1590          */
1591         if (!pagino)
1592                 pagino = be32_to_cpu(agi->agi_newino);
1593
1594         cur = xfs_finobt_init_cursor(pag, tp, agbp);
1595
1596         error = xfs_check_agi_freecount(cur);
1597         if (error)
1598                 goto error_cur;
1599
1600         /*
1601          * The search algorithm depends on whether we're in the same AG as the
1602          * parent. If so, find the closest available inode to the parent. If
1603          * not, consider the agi hint or find the first free inode in the AG.
1604          */
1605         if (pag_agno(pag) == pagno)
1606                 error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
1607         else
1608                 error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
1609         if (error)
1610                 goto error_cur;
1611
1612         offset = xfs_inobt_first_free_inode(&rec);
1613         ASSERT(offset >= 0);
1614         ASSERT(offset < XFS_INODES_PER_CHUNK);
1615         ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1616                                    XFS_INODES_PER_CHUNK) == 0);
1617         ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);
1618
1619         if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
1620                 error = xfs_dialloc_check_ino(pag, tp, ino);
1621                 if (error)
1622                         goto error_cur;
1623         }
1624
1625         /*
1626          * Modify or remove the finobt record.
1627          */
1628         rec.ir_free &= ~XFS_INOBT_MASK(offset);
1629         rec.ir_freecount--;
1630         if (rec.ir_freecount)
1631                 error = xfs_inobt_update(cur, &rec);
1632         else
1633                 error = xfs_btree_delete(cur, &i);
1634         if (error)
1635                 goto error_cur;
1636
1637         /*
1638          * The finobt has now been updated appropriately. We haven't updated the
1639          * agi and superblock yet, so we can create an inobt cursor and validate
1640          * the original freecount. If all is well, make the equivalent update to
1641          * the inobt using the finobt record and offset information.
1642          */
1643         icur = xfs_inobt_init_cursor(pag, tp, agbp);
1644
1645         error = xfs_check_agi_freecount(icur);
1646         if (error)
1647                 goto error_icur;
1648
1649         error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
1650         if (error)
1651                 goto error_icur;
1652
1653         /*
1654          * Both trees have now been updated. We must update the perag and
1655          * superblock before we can check the freecount for each btree.
1656          */
1657         be32_add_cpu(&agi->agi_freecount, -1);
1658         xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1659         pag->pagi_freecount--;
1660
1661         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1662
1663         error = xfs_check_agi_freecount(icur);
1664         if (error)
1665                 goto error_icur;
1666         error = xfs_check_agi_freecount(cur);
1667         if (error)
1668                 goto error_icur;
1669
1670         xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
1671         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1672         *inop = ino;
1673         return 0;
1674
1675 error_icur:
1676         xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
1677 error_cur:
1678         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1679         return error;
1680 }
1681
1682 static int
1683 xfs_dialloc_roll(
1684         struct xfs_trans        **tpp,
1685         struct xfs_buf          *agibp)
1686 {
1687         struct xfs_trans        *tp = *tpp;
1688         struct xfs_dquot_acct   *dqinfo;
1689         int                     error;
1690
1691         /*
1692          * Hold to on to the agibp across the commit so no other allocation can
1693          * come in and take the free inodes we just allocated for our caller.
1694          */
1695         xfs_trans_bhold(tp, agibp);
1696
1697         /*
1698          * We want the quota changes to be associated with the next transaction,
1699          * NOT this one. So, detach the dqinfo from this and attach it to the
1700          * next transaction.
1701          */
1702         dqinfo = tp->t_dqinfo;
1703         tp->t_dqinfo = NULL;
1704
1705         error = xfs_trans_roll(&tp);
1706
1707         /* Re-attach the quota info that we detached from prev trx. */
1708         tp->t_dqinfo = dqinfo;
1709
1710         /*
1711          * Join the buffer even on commit error so that the buffer is released
1712          * when the caller cancels the transaction and doesn't have to handle
1713          * this error case specially.
1714          */
1715         xfs_trans_bjoin(tp, agibp);
1716         *tpp = tp;
1717         return error;
1718 }
1719
1720 static bool
1721 xfs_dialloc_good_ag(
1722         struct xfs_perag        *pag,
1723         struct xfs_trans        *tp,
1724         umode_t                 mode,
1725         int                     flags,
1726         bool                    ok_alloc)
1727 {
1728         struct xfs_mount        *mp = tp->t_mountp;
1729         xfs_extlen_t            ineed;
1730         xfs_extlen_t            longest = 0;
1731         int                     needspace;
1732         int                     error;
1733
1734         if (!pag)
1735                 return false;
1736         if (!xfs_perag_allows_inodes(pag))
1737                 return false;
1738
1739         if (!xfs_perag_initialised_agi(pag)) {
1740                 error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
1741                 if (error)
1742                         return false;
1743         }
1744
1745         if (pag->pagi_freecount)
1746                 return true;
1747         if (!ok_alloc)
1748                 return false;
1749
1750         if (!xfs_perag_initialised_agf(pag)) {
1751                 error = xfs_alloc_read_agf(pag, tp, flags, NULL);
1752                 if (error)
1753                         return false;
1754         }
1755
1756         /*
1757          * Check that there is enough free space for the file plus a chunk of
1758          * inodes if we need to allocate some. If this is the first pass across
1759          * the AGs, take into account the potential space needed for alignment
1760          * of inode chunks when checking the longest contiguous free space in
1761          * the AG - this prevents us from getting ENOSPC because we have free
1762          * space larger than ialloc_blks but alignment constraints prevent us
1763          * from using it.
1764          *
1765          * If we can't find an AG with space for full alignment slack to be
1766          * taken into account, we must be near ENOSPC in all AGs.  Hence we
1767          * don't include alignment for the second pass and so if we fail
1768          * allocation due to alignment issues then it is most likely a real
1769          * ENOSPC condition.
1770          *
1771          * XXX(dgc): this calculation is now bogus thanks to the per-ag
1772          * reservations that xfs_alloc_fix_freelist() now does via
1773          * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
1774          * be more than large enough for the check below to succeed, but
1775          * xfs_alloc_space_available() will fail because of the non-zero
1776          * metadata reservation and hence we won't actually be able to allocate
1777          * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
1778          * because of this.
1779          */
1780         ineed = M_IGEO(mp)->ialloc_min_blks;
1781         if (flags && ineed > 1)
1782                 ineed += M_IGEO(mp)->cluster_align;
1783         longest = pag->pagf_longest;
1784         if (!longest)
1785                 longest = pag->pagf_flcount > 0;
1786         needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
1787
1788         if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
1789                 return false;
1790         return true;
1791 }
1792
1793 static int
1794 xfs_dialloc_try_ag(
1795         struct xfs_perag        *pag,
1796         struct xfs_trans        **tpp,
1797         xfs_ino_t               parent,
1798         xfs_ino_t               *new_ino,
1799         bool                    ok_alloc)
1800 {
1801         struct xfs_buf          *agbp;
1802         xfs_ino_t               ino;
1803         int                     error;
1804
1805         /*
1806          * Then read in the AGI buffer and recheck with the AGI buffer
1807          * lock held.
1808          */
1809         error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
1810         if (error)
1811                 return error;
1812
1813         if (!pag->pagi_freecount) {
1814                 if (!ok_alloc) {
1815                         error = -EAGAIN;
1816                         goto out_release;
1817                 }
1818
1819                 error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
1820                 if (error < 0)
1821                         goto out_release;
1822
1823                 /*
1824                  * We successfully allocated space for an inode cluster in this
1825                  * AG.  Roll the transaction so that we can allocate one of the
1826                  * new inodes.
1827                  */
1828                 ASSERT(pag->pagi_freecount > 0);
1829                 error = xfs_dialloc_roll(tpp, agbp);
1830                 if (error)
1831                         goto out_release;
1832         }
1833
1834         /* Allocate an inode in the found AG */
1835         error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
1836         if (!error)
1837                 *new_ino = ino;
1838         return error;
1839
1840 out_release:
1841         xfs_trans_brelse(*tpp, agbp);
1842         return error;
1843 }
1844
1845 /*
1846  * Pick an AG for the new inode.
1847  *
1848  * Directories, symlinks, and regular files frequently allocate at least one
1849  * block, so factor that potential expansion when we examine whether an AG has
1850  * enough space for file creation.  Try to keep metadata files all in the same
1851  * AG.
1852  */
1853 static inline xfs_agnumber_t
1854 xfs_dialloc_pick_ag(
1855         struct xfs_mount        *mp,
1856         struct xfs_inode        *dp,
1857         umode_t                 mode)
1858 {
1859         xfs_agnumber_t          start_agno;
1860
1861         if (!dp)
1862                 return 0;
1863         if (xfs_is_metadir_inode(dp)) {
1864                 if (mp->m_sb.sb_logstart)
1865                         return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
1866                 return 0;
1867         }
1868
1869         if (S_ISDIR(mode))
1870                 return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
1871
1872         start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
1873         if (start_agno >= mp->m_maxagi)
1874                 start_agno = 0;
1875
1876         return start_agno;
1877 }
1878
1879 /*
1880  * Allocate an on-disk inode.
1881  *
1882  * Mode is used to tell whether the new inode is a directory and hence where to
1883  * locate it. The on-disk inode that is allocated will be returned in @new_ino
1884  * on success, otherwise an error will be set to indicate the failure (e.g.
1885  * -ENOSPC).
1886  */
1887 int
1888 xfs_dialloc(
1889         struct xfs_trans        **tpp,
1890         const struct xfs_icreate_args *args,
1891         xfs_ino_t               *new_ino)
1892 {
1893         struct xfs_mount        *mp = (*tpp)->t_mountp;
1894         struct xfs_perag        *pag;
1895         struct xfs_ino_geometry *igeo = M_IGEO(mp);
1896         xfs_ino_t               ino = NULLFSINO;
1897         xfs_ino_t               parent = args->pip ? args->pip->i_ino : 0;
1898         xfs_agnumber_t          agno;
1899         xfs_agnumber_t          start_agno;
1900         umode_t                 mode = args->mode & S_IFMT;
1901         bool                    ok_alloc = true;
1902         bool                    low_space = false;
1903         int                     flags;
1904         int                     error = 0;
1905
1906         start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode);
1907
1908         /*
1909          * If we have already hit the ceiling of inode blocks then clear
1910          * ok_alloc so we scan all available agi structures for a free
1911          * inode.
1912          *
1913          * Read rough value of mp->m_icount by percpu_counter_read_positive,
1914          * which will sacrifice the preciseness but improve the performance.
1915          */
1916         if (igeo->maxicount &&
1917             percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
1918                                                         > igeo->maxicount) {
1919                 ok_alloc = false;
1920         }
1921
1922         /*
1923          * If we are near to ENOSPC, we want to prefer allocation from AGs that
1924          * have free inodes in them rather than use up free space allocating new
1925          * inode chunks. Hence we turn off allocation for the first non-blocking
1926          * pass through the AGs if we are near ENOSPC to consume free inodes
1927          * that we can immediately allocate, but then we allow allocation on the
1928          * second pass if we fail to find an AG with free inodes in it.
1929          */
1930         if (percpu_counter_read_positive(&mp->m_fdblocks) <
1931                         mp->m_low_space[XFS_LOWSP_1_PCNT]) {
1932                 ok_alloc = false;
1933                 low_space = true;
1934         }
1935
1936         /*
1937          * Loop until we find an allocation group that either has free inodes
1938          * or in which we can allocate some inodes.  Iterate through the
1939          * allocation groups upward, wrapping at the end.
1940          */
1941         flags = XFS_ALLOC_FLAG_TRYLOCK;
1942 retry:
1943         for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
1944                 if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
1945                         error = xfs_dialloc_try_ag(pag, tpp, parent,
1946                                         &ino, ok_alloc);
1947                         if (error != -EAGAIN)
1948                                 break;
1949                         error = 0;
1950                 }
1951
1952                 if (xfs_is_shutdown(mp)) {
1953                         error = -EFSCORRUPTED;
1954                         break;
1955                 }
1956         }
1957         if (pag)
1958                 xfs_perag_rele(pag);
1959         if (error)
1960                 return error;
1961         if (ino == NULLFSINO) {
1962                 if (flags) {
1963                         flags = 0;
1964                         if (low_space)
1965                                 ok_alloc = true;
1966                         goto retry;
1967                 }
1968                 return -ENOSPC;
1969         }
1970
1971         /*
1972          * Protect against obviously corrupt allocation btree records. Later
1973          * xfs_iget checks will catch re-allocation of other active in-memory
1974          * and on-disk inodes. If we don't catch reallocating the parent inode
1975          * here we will deadlock in xfs_iget() so we have to do these checks
1976          * first.
1977          */
1978         if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
1979                 xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
1980                 xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
1981                                 XFS_SICK_AG_INOBT);
1982                 return -EFSCORRUPTED;
1983         }
1984
1985         *new_ino = ino;
1986         return 0;
1987 }
1988
1989 /*
1990  * Free the blocks of an inode chunk. We must consider that the inode chunk
1991  * might be sparse and only free the regions that are allocated as part of the
1992  * chunk.
1993  */
1994 static int
1995 xfs_difree_inode_chunk(
1996         struct xfs_trans                *tp,
1997         struct xfs_perag                *pag,
1998         struct xfs_inobt_rec_incore     *rec)
1999 {
2000         struct xfs_mount                *mp = tp->t_mountp;
2001         xfs_agblock_t                   sagbno = XFS_AGINO_TO_AGBNO(mp,
2002                                                         rec->ir_startino);
2003         int                             startidx, endidx;
2004         int                             nextbit;
2005         xfs_agblock_t                   agbno;
2006         int                             contigblk;
2007         DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
2008
2009         if (!xfs_inobt_issparse(rec->ir_holemask)) {
2010                 /* not sparse, calculate extent info directly */
2011                 return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno),
2012                                 M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
2013                                 XFS_AG_RESV_NONE, 0);
2014         }
2015
2016         /* holemask is only 16-bits (fits in an unsigned long) */
2017         ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
2018         holemask[0] = rec->ir_holemask;
2019
2020         /*
2021          * Find contiguous ranges of zeroes (i.e., allocated regions) in the
2022          * holemask and convert the start/end index of each range to an extent.
2023          * We start with the start and end index both pointing at the first 0 in
2024          * the mask.
2025          */
2026         startidx = endidx = find_first_zero_bit(holemask,
2027                                                 XFS_INOBT_HOLEMASK_BITS);
2028         nextbit = startidx + 1;
2029         while (startidx < XFS_INOBT_HOLEMASK_BITS) {
2030                 int error;
2031
2032                 nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
2033                                              nextbit);
2034                 /*
2035                  * If the next zero bit is contiguous, update the end index of
2036                  * the current range and continue.
2037                  */
2038                 if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
2039                     nextbit == endidx + 1) {
2040                         endidx = nextbit;
2041                         goto next;
2042                 }
2043
2044                 /*
2045                  * nextbit is not contiguous with the current end index. Convert
2046                  * the current start/end to an extent and add it to the free
2047                  * list.
2048                  */
2049                 agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
2050                                   mp->m_sb.sb_inopblock;
2051                 contigblk = ((endidx - startidx + 1) *
2052                              XFS_INODES_PER_HOLEMASK_BIT) /
2053                             mp->m_sb.sb_inopblock;
2054
2055                 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
2056                 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
2057                 error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno),
2058                                 contigblk, &XFS_RMAP_OINFO_INODES,
2059                                 XFS_AG_RESV_NONE, 0);
2060                 if (error)
2061                         return error;
2062
2063                 /* reset range to current bit and carry on... */
2064                 startidx = endidx = nextbit;
2065
2066 next:
2067                 nextbit++;
2068         }
2069         return 0;
2070 }
2071
2072 STATIC int
2073 xfs_difree_inobt(
2074         struct xfs_perag                *pag,
2075         struct xfs_trans                *tp,
2076         struct xfs_buf                  *agbp,
2077         xfs_agino_t                     agino,
2078         struct xfs_icluster             *xic,
2079         struct xfs_inobt_rec_incore     *orec)
2080 {
2081         struct xfs_mount                *mp = pag_mount(pag);
2082         struct xfs_agi                  *agi = agbp->b_addr;
2083         struct xfs_btree_cur            *cur;
2084         struct xfs_inobt_rec_incore     rec;
2085         int                             ilen;
2086         int                             error;
2087         int                             i;
2088         int                             off;
2089
2090         ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2091         ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
2092
2093         /*
2094          * Initialize the cursor.
2095          */
2096         cur = xfs_inobt_init_cursor(pag, tp, agbp);
2097
2098         error = xfs_check_agi_freecount(cur);
2099         if (error)
2100                 goto error0;
2101
2102         /*
2103          * Look for the entry describing this inode.
2104          */
2105         if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
2106                 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
2107                         __func__, error);
2108                 goto error0;
2109         }
2110         if (XFS_IS_CORRUPT(mp, i != 1)) {
2111                 xfs_btree_mark_sick(cur);
2112                 error = -EFSCORRUPTED;
2113                 goto error0;
2114         }
2115         error = xfs_inobt_get_rec(cur, &rec, &i);
2116         if (error) {
2117                 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
2118                         __func__, error);
2119                 goto error0;
2120         }
2121         if (XFS_IS_CORRUPT(mp, i != 1)) {
2122                 xfs_btree_mark_sick(cur);
2123                 error = -EFSCORRUPTED;
2124                 goto error0;
2125         }
2126         /*
2127          * Get the offset in the inode chunk.
2128          */
2129         off = agino - rec.ir_startino;
2130         ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
2131         ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
2132         /*
2133          * Mark the inode free & increment the count.
2134          */
2135         rec.ir_free |= XFS_INOBT_MASK(off);
2136         rec.ir_freecount++;
2137
2138         /*
2139          * When an inode chunk is free, it becomes eligible for removal. Don't
2140          * remove the chunk if the block size is large enough for multiple inode
2141          * chunks (that might not be free).
2142          */
2143         if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
2144             mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
2145                 xic->deleted = true;
2146                 xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
2147                 xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
2148
2149                 /*
2150                  * Remove the inode cluster from the AGI B+Tree, adjust the
2151                  * AGI and Superblock inode counts, and mark the disk space
2152                  * to be freed when the transaction is committed.
2153                  */
2154                 ilen = rec.ir_freecount;
2155                 be32_add_cpu(&agi->agi_count, -ilen);
2156                 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
2157                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
2158                 pag->pagi_freecount -= ilen - 1;
2159                 pag->pagi_count -= ilen;
2160                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
2161                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
2162
2163                 if ((error = xfs_btree_delete(cur, &i))) {
2164                         xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
2165                                 __func__, error);
2166                         goto error0;
2167                 }
2168
2169                 error = xfs_difree_inode_chunk(tp, pag, &rec);
2170                 if (error)
2171                         goto error0;
2172         } else {
2173                 xic->deleted = false;
2174
2175                 error = xfs_inobt_update(cur, &rec);
2176                 if (error) {
2177                         xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
2178                                 __func__, error);
2179                         goto error0;
2180                 }
2181
2182                 /*
2183                  * Change the inode free counts and log the ag/sb changes.
2184                  */
2185                 be32_add_cpu(&agi->agi_freecount, 1);
2186                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
2187                 pag->pagi_freecount++;
2188                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
2189         }
2190
2191         error = xfs_check_agi_freecount(cur);
2192         if (error)
2193                 goto error0;
2194
2195         *orec = rec;
2196         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2197         return 0;
2198
2199 error0:
2200         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2201         return error;
2202 }
2203
2204 /*
2205  * Free an inode in the free inode btree.
2206  */
2207 STATIC int
2208 xfs_difree_finobt(
2209         struct xfs_perag                *pag,
2210         struct xfs_trans                *tp,
2211         struct xfs_buf                  *agbp,
2212         xfs_agino_t                     agino,
2213         struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
2214 {
2215         struct xfs_mount                *mp = pag_mount(pag);
2216         struct xfs_btree_cur            *cur;
2217         struct xfs_inobt_rec_incore     rec;
2218         int                             offset = agino - ibtrec->ir_startino;
2219         int                             error;
2220         int                             i;
2221
2222         cur = xfs_finobt_init_cursor(pag, tp, agbp);
2223
2224         error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
2225         if (error)
2226                 goto error;
2227         if (i == 0) {
2228                 /*
2229                  * If the record does not exist in the finobt, we must have just
2230                  * freed an inode in a previously fully allocated chunk. If not,
2231                  * something is out of sync.
2232                  */
2233                 if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
2234                         xfs_btree_mark_sick(cur);
2235                         error = -EFSCORRUPTED;
2236                         goto error;
2237                 }
2238
2239                 error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2240                                              ibtrec->ir_count,
2241                                              ibtrec->ir_freecount,
2242                                              ibtrec->ir_free, &i);
2243                 if (error)
2244                         goto error;
2245                 ASSERT(i == 1);
2246
2247                 goto out;
2248         }
2249
2250         /*
2251          * Read and update the existing record. We could just copy the ibtrec
2252          * across here, but that would defeat the purpose of having redundant
2253          * metadata. By making the modifications independently, we can catch
2254          * corruptions that we wouldn't see if we just copied from one record
2255          * to another.
2256          */
2257         error = xfs_inobt_get_rec(cur, &rec, &i);
2258         if (error)
2259                 goto error;
2260         if (XFS_IS_CORRUPT(mp, i != 1)) {
2261                 xfs_btree_mark_sick(cur);
2262                 error = -EFSCORRUPTED;
2263                 goto error;
2264         }
2265
2266         rec.ir_free |= XFS_INOBT_MASK(offset);
2267         rec.ir_freecount++;
2268
2269         if (XFS_IS_CORRUPT(mp,
2270                            rec.ir_free != ibtrec->ir_free ||
2271                            rec.ir_freecount != ibtrec->ir_freecount)) {
2272                 xfs_btree_mark_sick(cur);
2273                 error = -EFSCORRUPTED;
2274                 goto error;
2275         }
2276
2277         /*
2278          * The content of inobt records should always match between the inobt
2279          * and finobt. The lifecycle of records in the finobt is different from
2280          * the inobt in that the finobt only tracks records with at least one
2281          * free inode. Hence, if all of the inodes are free and we aren't
2282          * keeping inode chunks permanently on disk, remove the record.
2283          * Otherwise, update the record with the new information.
2284          *
2285          * Note that we currently can't free chunks when the block size is large
2286          * enough for multiple chunks. Leave the finobt record to remain in sync
2287          * with the inobt.
2288          */
2289         if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
2290             mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
2291                 error = xfs_btree_delete(cur, &i);
2292                 if (error)
2293                         goto error;
2294                 ASSERT(i == 1);
2295         } else {
2296                 error = xfs_inobt_update(cur, &rec);
2297                 if (error)
2298                         goto error;
2299         }
2300
2301 out:
2302         error = xfs_check_agi_freecount(cur);
2303         if (error)
2304                 goto error;
2305
2306         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2307         return 0;
2308
2309 error:
2310         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2311         return error;
2312 }
2313
2314 /*
2315  * Free disk inode.  Carefully avoids touching the incore inode, all
2316  * manipulations incore are the caller's responsibility.
2317  * The on-disk inode is not changed by this operation, only the
2318  * btree (free inode mask) is changed.
2319  */
2320 int
2321 xfs_difree(
2322         struct xfs_trans        *tp,
2323         struct xfs_perag        *pag,
2324         xfs_ino_t               inode,
2325         struct xfs_icluster     *xic)
2326 {
2327         /* REFERENCED */
2328         xfs_agblock_t           agbno;  /* block number containing inode */
2329         struct xfs_buf          *agbp;  /* buffer for allocation group header */
2330         xfs_agino_t             agino;  /* allocation group inode number */
2331         int                     error;  /* error return value */
2332         struct xfs_mount        *mp = tp->t_mountp;
2333         struct xfs_inobt_rec_incore rec;/* btree record */
2334
2335         /*
2336          * Break up inode number into its components.
2337          */
2338         if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) {
2339                 xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).",
2340                         __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag));
2341                 ASSERT(0);
2342                 return -EINVAL;
2343         }
2344         agino = XFS_INO_TO_AGINO(mp, inode);
2345         if (inode != xfs_agino_to_ino(pag, agino))  {
2346                 xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
2347                         __func__, (unsigned long long)inode,
2348                         (unsigned long long)xfs_agino_to_ino(pag, agino));
2349                 ASSERT(0);
2350                 return -EINVAL;
2351         }
2352         agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2353         if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
2354                 xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
2355                         __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag)));
2356                 ASSERT(0);
2357                 return -EINVAL;
2358         }
2359         /*
2360          * Get the allocation group header.
2361          */
2362         error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
2363         if (error) {
2364                 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
2365                         __func__, error);
2366                 return error;
2367         }
2368
2369         /*
2370          * Fix up the inode allocation btree.
2371          */
2372         error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
2373         if (error)
2374                 goto error0;
2375
2376         /*
2377          * Fix up the free inode btree.
2378          */
2379         if (xfs_has_finobt(mp)) {
2380                 error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
2381                 if (error)
2382                         goto error0;
2383         }
2384
2385         return 0;
2386
2387 error0:
2388         return error;
2389 }
2390
2391 STATIC int
2392 xfs_imap_lookup(
2393         struct xfs_perag        *pag,
2394         struct xfs_trans        *tp,
2395         xfs_agino_t             agino,
2396         xfs_agblock_t           agbno,
2397         xfs_agblock_t           *chunk_agbno,
2398         xfs_agblock_t           *offset_agbno,
2399         int                     flags)
2400 {
2401         struct xfs_mount        *mp = pag_mount(pag);
2402         struct xfs_inobt_rec_incore rec;
2403         struct xfs_btree_cur    *cur;
2404         struct xfs_buf          *agbp;
2405         int                     error;
2406         int                     i;
2407
2408         error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
2409         if (error) {
2410                 xfs_alert(mp,
2411                         "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
2412                         __func__, error, pag_agno(pag));
2413                 return error;
2414         }
2415
2416         /*
2417          * Lookup the inode record for the given agino. If the record cannot be
2418          * found, then it's an invalid inode number and we should abort. Once
2419          * we have a record, we need to ensure it contains the inode number
2420          * we are looking up.
2421          */
2422         cur = xfs_inobt_init_cursor(pag, tp, agbp);
2423         error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
2424         if (!error) {
2425                 if (i)
2426                         error = xfs_inobt_get_rec(cur, &rec, &i);
2427                 if (!error && i == 0)
2428                         error = -EINVAL;
2429         }
2430
2431         xfs_trans_brelse(tp, agbp);
2432         xfs_btree_del_cursor(cur, error);
2433         if (error)
2434                 return error;
2435
2436         /* check that the returned record contains the required inode */
2437         if (rec.ir_startino > agino ||
2438             rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
2439                 return -EINVAL;
2440
2441         /* for untrusted inodes check it is allocated first */
2442         if ((flags & XFS_IGET_UNTRUSTED) &&
2443             (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
2444                 return -EINVAL;
2445
2446         *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
2447         *offset_agbno = agbno - *chunk_agbno;
2448         return 0;
2449 }
2450
2451 /*
2452  * Return the location of the inode in imap, for mapping it into a buffer.
2453  */
2454 int
2455 xfs_imap(
2456         struct xfs_perag        *pag,
2457         struct xfs_trans        *tp,
2458         xfs_ino_t               ino,    /* inode to locate */
2459         struct xfs_imap         *imap,  /* location map structure */
2460         uint                    flags)  /* flags for inode btree lookup */
2461 {
2462         struct xfs_mount        *mp = pag_mount(pag);
2463         xfs_agblock_t           agbno;  /* block number of inode in the alloc group */
2464         xfs_agino_t             agino;  /* inode number within alloc group */
2465         xfs_agblock_t           chunk_agbno;    /* first block in inode chunk */
2466         xfs_agblock_t           cluster_agbno;  /* first block in inode cluster */
2467         int                     error;  /* error code */
2468         int                     offset; /* index of inode in its buffer */
2469         xfs_agblock_t           offset_agbno;   /* blks from chunk start to inode */
2470
2471         ASSERT(ino != NULLFSINO);
2472
2473         /*
2474          * Split up the inode number into its parts.
2475          */
2476         agino = XFS_INO_TO_AGINO(mp, ino);
2477         agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2478         if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
2479             ino != xfs_agino_to_ino(pag, agino)) {
2480                 error = -EINVAL;
2481 #ifdef DEBUG
2482                 /*
2483                  * Don't output diagnostic information for untrusted inodes
2484                  * as they can be invalid without implying corruption.
2485                  */
2486                 if (flags & XFS_IGET_UNTRUSTED)
2487                         return error;
2488                 if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
2489                         xfs_alert(mp,
2490                 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
2491                                 __func__, (unsigned long long)agbno,
2492                                 (unsigned long)xfs_ag_block_count(mp,
2493                                                         pag_agno(pag)));
2494                 }
2495                 if (ino != xfs_agino_to_ino(pag, agino)) {
2496                         xfs_alert(mp,
2497                 "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
2498                                 __func__, ino,
2499                                 xfs_agino_to_ino(pag, agino));
2500                 }
2501                 xfs_stack_trace();
2502 #endif /* DEBUG */
2503                 return error;
2504         }
2505
2506         /*
2507          * For bulkstat and handle lookups, we have an untrusted inode number
2508          * that we have to verify is valid. We cannot do this just by reading
2509          * the inode buffer as it may have been unlinked and removed leaving
2510          * inodes in stale state on disk. Hence we have to do a btree lookup
2511          * in all cases where an untrusted inode number is passed.
2512          */
2513         if (flags & XFS_IGET_UNTRUSTED) {
2514                 error = xfs_imap_lookup(pag, tp, agino, agbno,
2515                                         &chunk_agbno, &offset_agbno, flags);
2516                 if (error)
2517                         return error;
2518                 goto out_map;
2519         }
2520
2521         /*
2522          * If the inode cluster size is the same as the blocksize or
2523          * smaller we get to the buffer by simple arithmetics.
2524          */
2525         if (M_IGEO(mp)->blocks_per_cluster == 1) {
2526                 offset = XFS_INO_TO_OFFSET(mp, ino);
2527                 ASSERT(offset < mp->m_sb.sb_inopblock);
2528
2529                 imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
2530                 imap->im_len = XFS_FSB_TO_BB(mp, 1);
2531                 imap->im_boffset = (unsigned short)(offset <<
2532                                                         mp->m_sb.sb_inodelog);
2533                 return 0;
2534         }
2535
2536         /*
2537          * If the inode chunks are aligned then use simple maths to
2538          * find the location. Otherwise we have to do a btree
2539          * lookup to find the location.
2540          */
2541         if (M_IGEO(mp)->inoalign_mask) {
2542                 offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
2543                 chunk_agbno = agbno - offset_agbno;
2544         } else {
2545                 error = xfs_imap_lookup(pag, tp, agino, agbno,
2546                                         &chunk_agbno, &offset_agbno, flags);
2547                 if (error)
2548                         return error;
2549         }
2550
2551 out_map:
2552         ASSERT(agbno >= chunk_agbno);
2553         cluster_agbno = chunk_agbno +
2554                 ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
2555                  M_IGEO(mp)->blocks_per_cluster);
2556         offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
2557                 XFS_INO_TO_OFFSET(mp, ino);
2558
2559         imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
2560         imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
2561         imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
2562
2563         /*
2564          * If the inode number maps to a block outside the bounds
2565          * of the file system then return NULL rather than calling
2566          * read_buf and panicing when we get an error from the
2567          * driver.
2568          */
2569         if ((imap->im_blkno + imap->im_len) >
2570             XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2571                 xfs_alert(mp,
2572         "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
2573                         __func__, (unsigned long long) imap->im_blkno,
2574                         (unsigned long long) imap->im_len,
2575                         XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2576                 return -EINVAL;
2577         }
2578         return 0;
2579 }
2580
2581 /*
2582  * Log specified fields for the ag hdr (inode section). The growth of the agi
2583  * structure over time requires that we interpret the buffer as two logical
2584  * regions delineated by the end of the unlinked list. This is due to the size
2585  * of the hash table and its location in the middle of the agi.
2586  *
2587  * For example, a request to log a field before agi_unlinked and a field after
2588  * agi_unlinked could cause us to log the entire hash table and use an excessive
2589  * amount of log space. To avoid this behavior, log the region up through
2590  * agi_unlinked in one call and the region after agi_unlinked through the end of
2591  * the structure in another.
2592  */
2593 void
2594 xfs_ialloc_log_agi(
2595         struct xfs_trans        *tp,
2596         struct xfs_buf          *bp,
2597         uint32_t                fields)
2598 {
2599         int                     first;          /* first byte number */
2600         int                     last;           /* last byte number */
2601         static const short      offsets[] = {   /* field starting offsets */
2602                                         /* keep in sync with bit definitions */
2603                 offsetof(xfs_agi_t, agi_magicnum),
2604                 offsetof(xfs_agi_t, agi_versionnum),
2605                 offsetof(xfs_agi_t, agi_seqno),
2606                 offsetof(xfs_agi_t, agi_length),
2607                 offsetof(xfs_agi_t, agi_count),
2608                 offsetof(xfs_agi_t, agi_root),
2609                 offsetof(xfs_agi_t, agi_level),
2610                 offsetof(xfs_agi_t, agi_freecount),
2611                 offsetof(xfs_agi_t, agi_newino),
2612                 offsetof(xfs_agi_t, agi_dirino),
2613                 offsetof(xfs_agi_t, agi_unlinked),
2614                 offsetof(xfs_agi_t, agi_free_root),
2615                 offsetof(xfs_agi_t, agi_free_level),
2616                 offsetof(xfs_agi_t, agi_iblocks),
2617                 sizeof(xfs_agi_t)
2618         };
2619 #ifdef DEBUG
2620         struct xfs_agi          *agi = bp->b_addr;
2621
2622         ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2623 #endif
2624
2625         /*
2626          * Compute byte offsets for the first and last fields in the first
2627          * region and log the agi buffer. This only logs up through
2628          * agi_unlinked.
2629          */
2630         if (fields & XFS_AGI_ALL_BITS_R1) {
2631                 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2632                                   &first, &last);
2633                 xfs_trans_log_buf(tp, bp, first, last);
2634         }
2635
2636         /*
2637          * Mask off the bits in the first region and calculate the first and
2638          * last field offsets for any bits in the second region.
2639          */
2640         fields &= ~XFS_AGI_ALL_BITS_R1;
2641         if (fields) {
2642                 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2643                                   &first, &last);
2644                 xfs_trans_log_buf(tp, bp, first, last);
2645         }
2646 }
2647
2648 static xfs_failaddr_t
2649 xfs_agi_verify(
2650         struct xfs_buf          *bp)
2651 {
2652         struct xfs_mount        *mp = bp->b_mount;
2653         struct xfs_agi          *agi = bp->b_addr;
2654         xfs_failaddr_t          fa;
2655         uint32_t                agi_seqno = be32_to_cpu(agi->agi_seqno);
2656         uint32_t                agi_length = be32_to_cpu(agi->agi_length);
2657         int                     i;
2658
2659         if (xfs_has_crc(mp)) {
2660                 if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
2661                         return __this_address;
2662                 if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
2663                         return __this_address;
2664         }
2665
2666         /*
2667          * Validate the magic number of the agi block.
2668          */
2669         if (!xfs_verify_magic(bp, agi->agi_magicnum))
2670                 return __this_address;
2671         if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2672                 return __this_address;
2673
2674         fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
2675         if (fa)
2676                 return fa;
2677
2678         if (be32_to_cpu(agi->agi_level) < 1 ||
2679             be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
2680                 return __this_address;
2681
2682         if (xfs_has_finobt(mp) &&
2683             (be32_to_cpu(agi->agi_free_level) < 1 ||
2684              be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
2685                 return __this_address;
2686
2687         for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
2688                 if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
2689                         continue;
2690                 if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
2691                         return __this_address;
2692         }
2693
2694         return NULL;
2695 }
2696
2697 static void
2698 xfs_agi_read_verify(
2699         struct xfs_buf  *bp)
2700 {
2701         struct xfs_mount *mp = bp->b_mount;
2702         xfs_failaddr_t  fa;
2703
2704         if (xfs_has_crc(mp) &&
2705             !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2706                 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2707         else {
2708                 fa = xfs_agi_verify(bp);
2709                 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
2710                         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2711         }
2712 }
2713
2714 static void
2715 xfs_agi_write_verify(
2716         struct xfs_buf  *bp)
2717 {
2718         struct xfs_mount        *mp = bp->b_mount;
2719         struct xfs_buf_log_item *bip = bp->b_log_item;
2720         struct xfs_agi          *agi = bp->b_addr;
2721         xfs_failaddr_t          fa;
2722
2723         fa = xfs_agi_verify(bp);
2724         if (fa) {
2725                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2726                 return;
2727         }
2728
2729         if (!xfs_has_crc(mp))
2730                 return;
2731
2732         if (bip)
2733                 agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2734         xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
2735 }
2736
2737 const struct xfs_buf_ops xfs_agi_buf_ops = {
2738         .name = "xfs_agi",
2739         .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
2740         .verify_read = xfs_agi_read_verify,
2741         .verify_write = xfs_agi_write_verify,
2742         .verify_struct = xfs_agi_verify,
2743 };
2744
2745 /*
2746  * Read in the allocation group header (inode allocation section)
2747  */
2748 int
2749 xfs_read_agi(
2750         struct xfs_perag        *pag,
2751         struct xfs_trans        *tp,
2752         xfs_buf_flags_t         flags,
2753         struct xfs_buf          **agibpp)
2754 {
2755         struct xfs_mount        *mp = pag_mount(pag);
2756         int                     error;
2757
2758         trace_xfs_read_agi(pag);
2759
2760         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2761                         XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
2762                         XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
2763         if (xfs_metadata_is_sick(error))
2764                 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
2765         if (error)
2766                 return error;
2767         if (tp)
2768                 xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
2769
2770         xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
2771         return 0;
2772 }
2773
2774 /*
2775  * Read in the agi and initialise the per-ag data. If the caller supplies a
2776  * @agibpp, return the locked AGI buffer to them, otherwise release it.
2777  */
2778 int
2779 xfs_ialloc_read_agi(
2780         struct xfs_perag        *pag,
2781         struct xfs_trans        *tp,
2782         int                     flags,
2783         struct xfs_buf          **agibpp)
2784 {
2785         struct xfs_buf          *agibp;
2786         struct xfs_agi          *agi;
2787         int                     error;
2788
2789         trace_xfs_ialloc_read_agi(pag);
2790
2791         error = xfs_read_agi(pag, tp,
2792                         (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2793                         &agibp);
2794         if (error)
2795                 return error;
2796
2797         agi = agibp->b_addr;
2798         if (!xfs_perag_initialised_agi(pag)) {
2799                 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
2800                 pag->pagi_count = be32_to_cpu(agi->agi_count);
2801                 set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
2802         }
2803
2804         /*
2805          * It's possible for these to be out of sync if
2806          * we are in the middle of a forced shutdown.
2807          */
2808         ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
2809                 xfs_is_shutdown(pag_mount(pag)));
2810         if (agibpp)
2811                 *agibpp = agibp;
2812         else
2813                 xfs_trans_brelse(tp, agibp);
2814         return 0;
2815 }
2816
2817 /* How many inodes are backed by inode clusters ondisk? */
2818 STATIC int
2819 xfs_ialloc_count_ondisk(
2820         struct xfs_btree_cur            *cur,
2821         xfs_agino_t                     low,
2822         xfs_agino_t                     high,
2823         unsigned int                    *allocated)
2824 {
2825         struct xfs_inobt_rec_incore     irec;
2826         unsigned int                    ret = 0;
2827         int                             has_record;
2828         int                             error;
2829
2830         error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
2831         if (error)
2832                 return error;
2833
2834         while (has_record) {
2835                 unsigned int            i, hole_idx;
2836
2837                 error = xfs_inobt_get_rec(cur, &irec, &has_record);
2838                 if (error)
2839                         return error;
2840                 if (irec.ir_startino > high)
2841                         break;
2842
2843                 for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
2844                         if (irec.ir_startino + i < low)
2845                                 continue;
2846                         if (irec.ir_startino + i > high)
2847                                 break;
2848
2849                         hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
2850                         if (!(irec.ir_holemask & (1U << hole_idx)))
2851                                 ret++;
2852                 }
2853
2854                 error = xfs_btree_increment(cur, 0, &has_record);
2855                 if (error)
2856                         return error;
2857         }
2858
2859         *allocated = ret;
2860         return 0;
2861 }
2862
2863 /* Is there an inode record covering a given extent? */
2864 int
2865 xfs_ialloc_has_inodes_at_extent(
2866         struct xfs_btree_cur    *cur,
2867         xfs_agblock_t           bno,
2868         xfs_extlen_t            len,
2869         enum xbtree_recpacking  *outcome)
2870 {
2871         xfs_agino_t             agino;
2872         xfs_agino_t             last_agino;
2873         unsigned int            allocated;
2874         int                     error;
2875
2876         agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
2877         last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
2878
2879         error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
2880         if (error)
2881                 return error;
2882
2883         if (allocated == 0)
2884                 *outcome = XBTREE_RECPACKING_EMPTY;
2885         else if (allocated == last_agino - agino + 1)
2886                 *outcome = XBTREE_RECPACKING_FULL;
2887         else
2888                 *outcome = XBTREE_RECPACKING_SPARSE;
2889         return 0;
2890 }
2891
2892 struct xfs_ialloc_count_inodes {
2893         xfs_agino_t                     count;
2894         xfs_agino_t                     freecount;
2895 };
2896
2897 /* Record inode counts across all inobt records. */
2898 STATIC int
2899 xfs_ialloc_count_inodes_rec(
2900         struct xfs_btree_cur            *cur,
2901         const union xfs_btree_rec       *rec,
2902         void                            *priv)
2903 {
2904         struct xfs_inobt_rec_incore     irec;
2905         struct xfs_ialloc_count_inodes  *ci = priv;
2906         xfs_failaddr_t                  fa;
2907
2908         xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
2909         fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
2910         if (fa)
2911                 return xfs_inobt_complain_bad_rec(cur, fa, &irec);
2912
2913         ci->count += irec.ir_count;
2914         ci->freecount += irec.ir_freecount;
2915
2916         return 0;
2917 }
2918
2919 /* Count allocated and free inodes under an inobt. */
2920 int
2921 xfs_ialloc_count_inodes(
2922         struct xfs_btree_cur            *cur,
2923         xfs_agino_t                     *count,
2924         xfs_agino_t                     *freecount)
2925 {
2926         struct xfs_ialloc_count_inodes  ci = {0};
2927         int                             error;
2928
2929         ASSERT(xfs_btree_is_ino(cur->bc_ops));
2930         error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
2931         if (error)
2932                 return error;
2933
2934         *count = ci.count;
2935         *freecount = ci.freecount;
2936         return 0;
2937 }
2938
2939 /*
2940  * Initialize inode-related geometry information.
2941  *
2942  * Compute the inode btree min and max levels and set maxicount.
2943  *
2944  * Set the inode cluster size.  This may still be overridden by the file
2945  * system block size if it is larger than the chosen cluster size.
2946  *
2947  * For v5 filesystems, scale the cluster size with the inode size to keep a
2948  * constant ratio of inode per cluster buffer, but only if mkfs has set the
2949  * inode alignment value appropriately for larger cluster sizes.
2950  *
2951  * Then compute the inode cluster alignment information.
2952  */
2953 void
2954 xfs_ialloc_setup_geometry(
2955         struct xfs_mount        *mp)
2956 {
2957         struct xfs_sb           *sbp = &mp->m_sb;
2958         struct xfs_ino_geometry *igeo = M_IGEO(mp);
2959         uint64_t                icount;
2960         uint                    inodes;
2961
2962         igeo->new_diflags2 = 0;
2963         if (xfs_has_bigtime(mp))
2964                 igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
2965         if (xfs_has_large_extent_counts(mp))
2966                 igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
2967
2968         /* Compute inode btree geometry. */
2969         igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
2970         igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
2971         igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
2972         igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
2973         igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
2974
2975         igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
2976                         sbp->sb_inopblock);
2977         igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
2978
2979         if (sbp->sb_spino_align)
2980                 igeo->ialloc_min_blks = sbp->sb_spino_align;
2981         else
2982                 igeo->ialloc_min_blks = igeo->ialloc_blks;
2983
2984         /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
2985         inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2986         igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2987                         inodes);
2988         ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
2989
2990         /*
2991          * Set the maximum inode count for this filesystem, being careful not
2992          * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
2993          * users should never get here due to failing sb verification, but
2994          * certain users (xfs_db) need to be usable even with corrupt metadata.
2995          */
2996         if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2997                 /*
2998                  * Make sure the maximum inode count is a multiple
2999                  * of the units we allocate inodes in.
3000                  */
3001                 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
3002                 do_div(icount, 100);
3003                 do_div(icount, igeo->ialloc_blks);
3004                 igeo->maxicount = XFS_FSB_TO_INO(mp,
3005                                 icount * igeo->ialloc_blks);
3006         } else {
3007                 igeo->maxicount = 0;
3008         }
3009
3010         /*
3011          * Compute the desired size of an inode cluster buffer size, which
3012          * starts at 8K and (on v5 filesystems) scales up with larger inode
3013          * sizes.
3014          *
3015          * Preserve the desired inode cluster size because the sparse inodes
3016          * feature uses that desired size (not the actual size) to compute the
3017          * sparse inode alignment.  The mount code validates this value, so we
3018          * cannot change the behavior.
3019          */
3020         igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
3021         if (xfs_has_v3inodes(mp)) {
3022                 int     new_size = igeo->inode_cluster_size_raw;
3023
3024                 new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
3025                 if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
3026                         igeo->inode_cluster_size_raw = new_size;
3027         }
3028
3029         /* Calculate inode cluster ratios. */
3030         if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
3031                 igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
3032                                 igeo->inode_cluster_size_raw);
3033         else
3034                 igeo->blocks_per_cluster = 1;
3035         igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
3036         igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
3037
3038         /* Calculate inode cluster alignment. */
3039         if (xfs_has_align(mp) &&
3040             mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
3041                 igeo->cluster_align = mp->m_sb.sb_inoalignmt;
3042         else
3043                 igeo->cluster_align = 1;
3044         igeo->inoalign_mask = igeo->cluster_align - 1;
3045         igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
3046
3047         /*
3048          * If we are using stripe alignment, check whether
3049          * the stripe unit is a multiple of the inode alignment
3050          */
3051         if (mp->m_dalign && igeo->inoalign_mask &&
3052             !(mp->m_dalign & igeo->inoalign_mask))
3053                 igeo->ialloc_align = mp->m_dalign;
3054         else
3055                 igeo->ialloc_align = 0;
3056
3057         if (mp->m_sb.sb_blocksize > PAGE_SIZE)
3058                 igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
3059         else
3060                 igeo->min_folio_order = 0;
3061 }
3062
3063 /* Compute the location of the root directory inode that is laid out by mkfs. */
3064 xfs_ino_t
3065 xfs_ialloc_calc_rootino(
3066         struct xfs_mount        *mp,
3067         int                     sunit)
3068 {
3069         struct xfs_ino_geometry *igeo = M_IGEO(mp);
3070         xfs_agblock_t           first_bno;
3071
3072         /*
3073          * Pre-calculate the geometry of AG 0.  We know what it looks like
3074          * because libxfs knows how to create allocation groups now.
3075          *
3076          * first_bno is the first block in which mkfs could possibly have
3077          * allocated the root directory inode, once we factor in the metadata
3078          * that mkfs formats before it.  Namely, the four AG headers...
3079          */
3080         first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
3081
3082         /* ...the two free space btree roots... */
3083         first_bno += 2;
3084
3085         /* ...the inode btree root... */
3086         first_bno += 1;
3087
3088         /* ...the initial AGFL... */
3089         first_bno += xfs_alloc_min_freelist(mp, NULL);
3090
3091         /* ...the free inode btree root... */
3092         if (xfs_has_finobt(mp))
3093                 first_bno++;
3094
3095         /* ...the reverse mapping btree root... */
3096         if (xfs_has_rmapbt(mp))
3097                 first_bno++;
3098
3099         /* ...the reference count btree... */
3100         if (xfs_has_reflink(mp))
3101                 first_bno++;
3102
3103         /*
3104          * ...and the log, if it is allocated in the first allocation group.
3105          *
3106          * This can happen with filesystems that only have a single
3107          * allocation group, or very odd geometries created by old mkfs
3108          * versions on very small filesystems.
3109          */
3110         if (xfs_ag_contains_log(mp, 0))
3111                  first_bno += mp->m_sb.sb_logblocks;
3112
3113         /*
3114          * Now round first_bno up to whatever allocation alignment is given
3115          * by the filesystem or was passed in.
3116          */
3117         if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
3118                 first_bno = roundup(first_bno, sunit);
3119         else if (xfs_has_align(mp) &&
3120                         mp->m_sb.sb_inoalignmt > 1)
3121                 first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
3122
3123         return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
3124 }
3125
3126 /*
3127  * Ensure there are not sparse inode clusters that cross the new EOAG.
3128  *
3129  * This is a no-op for non-spinode filesystems since clusters are always fully
3130  * allocated and checking the bnobt suffices.  However, a spinode filesystem
3131  * could have a record where the upper inodes are free blocks.  If those blocks
3132  * were removed from the filesystem, the inode record would extend beyond EOAG,
3133  * which will be flagged as corruption.
3134  */
3135 int
3136 xfs_ialloc_check_shrink(
3137         struct xfs_perag        *pag,
3138         struct xfs_trans        *tp,
3139         struct xfs_buf          *agibp,
3140         xfs_agblock_t           new_length)
3141 {
3142         struct xfs_inobt_rec_incore rec;
3143         struct xfs_btree_cur    *cur;
3144         xfs_agino_t             agino;
3145         int                     has;
3146         int                     error;
3147
3148         if (!xfs_has_sparseinodes(pag_mount(pag)))
3149                 return 0;
3150
3151         cur = xfs_inobt_init_cursor(pag, tp, agibp);
3152
3153         /* Look up the inobt record that would correspond to the new EOFS. */
3154         agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
3155         error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
3156         if (error || !has)
3157                 goto out;
3158
3159         error = xfs_inobt_get_rec(cur, &rec, &has);
3160         if (error)
3161                 goto out;
3162
3163         if (!has) {
3164                 xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
3165                 error = -EFSCORRUPTED;
3166                 goto out;
3167         }
3168
3169         /* If the record covers inodes that would be beyond EOFS, bail out. */
3170         if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
3171                 error = -ENOSPC;
3172                 goto out;
3173         }
3174 out:
3175         xfs_btree_del_cursor(cur, error);
3176         return error;
3177 }