kernel/fs/ufs/ufs_bmap.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  26 /*        All Rights Reserved   */
  27
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37
  38
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/systm.h>
  43 #include <sys/signal.h>
  44 #include <sys/user.h>
  45 #include <sys/vnode.h>
  46 #include <sys/buf.h>
  47 #include <sys/disp.h>
  48 #include <sys/proc.h>
  49 #include <sys/conf.h>
  50 #include <sys/fs/ufs_inode.h>
  51 #include <sys/fs/ufs_fs.h>
  52 #include <sys/fs/ufs_quota.h>
  53 #include <sys/fs/ufs_trans.h>
  54 #include <sys/fs/ufs_bio.h>
  55 #include <vm/seg.h>
  56 #include <sys/errno.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/vfs.h>
  59 #include <sys/debug.h>
  60 #include <sys/kmem.h>
  61 #include <sys/cmn_err.h>
  62
  63 /*
  64  * This structure is used to track blocks as we allocate them, so that
  65  * we can free them if we encounter an error during allocation.  We
  66  * keep track of five pieces of information for each allocated block:
  67  *   - The number of the newly allocated block
  68  *   - The size of the block (lets us deal with fragments if we want)
  69  *   - The number of the block containing a pointer to it; or whether
  70  *     the pointer is in the inode
  71  *   - The offset within the block (or inode) containing a pointer to it.
  72  *   - A flag indicating the usage of the block.  (Logging needs to know
  73  *     this to avoid overwriting a data block if it was previously used
  74  *     for metadata.)
  75  */
  76
  77 enum ufs_owner_type {
  78         ufs_no_owner,           /* Owner has not yet been updated */
  79         ufs_inode_direct,       /* Listed in inode's direct block table */
  80         ufs_inode_indirect,     /* Listed in inode's indirect block table */
  81         ufs_indirect_block      /* Listed in an indirect block */
  82 };
  83
  84 struct ufs_allocated_block {
  85         daddr_t this_block;         /* Number of this block */
  86         off_t block_size;           /* Size of this block, in bytes */
  87         enum ufs_owner_type owner;  /* Who points to this block? */
  88         daddr_t owner_block;        /* Number of the owning block */
  89         uint_t owner_offset;        /* Offset within that block or inode */
  90         int usage_flags;            /* Usage flags, as expected by free() */
  91 };
  92
  93
  94 static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
  95                 int maxtrans);
  96
  97 static void ufs_undo_allocation(inode_t *ip, int block_count,
  98         struct ufs_allocated_block table[], int inode_sector_adjust);
  99
 100 /*
 101  * Find the extent and the matching block number.
 102  *
 103  * bsize > PAGESIZE
 104  *      boff indicates that we want a page in the middle
 105  *      min expression is supposed to make sure no extra page[s] after EOF
 106  * PAGESIZE >= bsize
 107  *      we assume that a page is a multiple of bsize, i.e.,
 108  *      boff always == 0
 109  *
 110  * We always return a length that is suitable for a disk transfer.
 111  */
 112 #define DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
 113         register daddr32_t *dp = (tblp);                                \
 114         register int _chkfrag = chkfrag; /* for lint. sigh */           \
 115                                                                         \
 116         if (*dp == 0) {                                                 \
 117                 *(bnp) = UFS_HOLE;                                      \
 118         } else {                                                        \
 119                 register int len;                                       \
 120                                                                         \
 121                 len = findextent(fs, dp, (int)(n), lenp, maxtrans) <<   \
 122                         (fs)->fs_bshift;                                \
 123                 if (_chkfrag) {                                         \
 124                         register uoff_t tmp;                    \
 125                                                                         \
 126                         tmp = fragroundup((fs), size) -                 \
 127                             (((uoff_t)lbn) << fs->fs_bshift);   \
 128                         len = (int)MIN(tmp, len);                       \
 129                 }                                                       \
 130                 len -= (boff);                                          \
 131                 if (len <= 0) {                                         \
 132                         *(bnp) = UFS_HOLE;                              \
 133                 } else {                                                \
 134                         *(bnp) = fsbtodb(fs, *dp) + btodb(boff);        \
 135                         *(lenp) = len;                                  \
 136                 }                                                       \
 137         }                                                               \
 138 }
 139
 140 /*
 141  * The maximum supported file size is actually somewhat less than 1
 142  * terabyte.  This is because the total number of blocks used for the
 143  * file and its metadata must fit into the ic_blocks field of the
 144  * inode, which is a signed 32-bit quantity.  The metadata allocated
 145  * for a file (that is, the single, double, and triple indirect blocks
 146  * used to reference the file blocks) is actually quite small,
 147  * but just to make sure, we check for overflow in the
 148  * ic_blocks field for all files whose total block count is
 149  * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 150  * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 151  * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 152  * field if the number of blocks currently allocated to the file is
 153  * greater than VERYLARGEFILESIZE.
 154  *
 155  * Note that file "size" is not the same as file "length".  A
 156  * file's "size" is the number of blocks allocated to it.  A file's
 157  * "length" is the maximum offset in the file.  A UFS FILE can have a
 158  * length of a terabyte, but the size is limited to somewhat less than
 159  * a terabyte, as described above.
 160  */
 161 #define VERYLARGEFILESIZE       0x7FE00000
 162
 163 /*
 164  * bmap{read,write} define the structure of file system storage by mapping
 165  * a logical offset in a file to a physical block number on the device.
 166  * It should be called with a locked inode when allocation is to be
 167  * done (bmap_write).  Note this strangeness: bmap_write is always called from
 168  * getpage(), not putpage(), since getpage() is where all the allocation
 169  * is done.
 170  *
 171  * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
 172  *
 173  * NOTICE: the block number returned is the disk block number, not the
 174  * file system block number.  All the worries about block offsets and
 175  * page/block sizes are hidden inside of bmap.  Well, not quite,
 176  * unfortunately.  It's impossible to find one place to hide all this
 177  * mess.  There are 3 cases:
 178  *
 179  * PAGESIZE < bsize
 180  *      In this case, the {get,put}page routines will attempt to align to
 181  *      a file system block boundary (XXX - maybe this is a mistake?).  Since
 182  *      the kluster routines may be out of memory, we don't always get all
 183  *      the pages we wanted.  If we called bmap first, to find out how much
 184  *      to kluster, we handed in the block aligned offset.  If we didn't get
 185  *      all the pages, we have to chop off the amount we didn't get from the
 186  *      amount handed back by bmap.
 187  *
 188  * PAGESIZE == bsize
 189  *      Life is quite pleasant here, no extra work needed, mainly because we
 190  *      (probably?) won't kluster backwards, just forwards.
 191  *
 192  * PAGESIZE > bsize
 193  *      This one has a different set of problems, specifically, we may have to
 194  *      do N reads to fill one page.  Let us hope that Sun will stay with small
 195  *      pages.
 196  *
 197  * Returns 0 on success, or a non-zero errno if an error occurs.
 198  *
 199  * TODO
 200  *      LMXXX - add a bmap cache.  This could be a couple of extents in the
 201  *      inode.  Two is nice for PAGESIZE > bsize.
 202  */
 203
 204 int
 205 bmap_read(struct inode *ip, uoff_t off, daddr_t *bnp, int *lenp)
 206 {
 207         daddr_t lbn;
 208         ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
 209         struct  fs *fs = ufsvfsp->vfs_fs;
 210         struct  buf *bp;
 211         int     i, j, boff;
 212         int     shft;                   /* we maintain sh = 1 << shft */
 213         daddr_t ob, nb, tbn;
 214         daddr32_t *bap;
 215         int     nindirshift, nindiroffset;
 216
 217         ASSERT(RW_LOCK_HELD(&ip->i_contents));
 218         lbn = (daddr_t)lblkno(fs, off);
 219         boff = (int)blkoff(fs, off);
 220         if (lbn < 0)
 221                 return (EFBIG);
 222
 223         /*
 224          * The first NDADDR blocks are direct blocks.
 225          */
 226         if (lbn < NDADDR) {
 227                 DOEXTENT(fs, lbn, boff, bnp, lenp,
 228                     ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
 229                     ufsvfsp->vfs_iotransz);
 230                 return (0);
 231         }
 232
 233         nindirshift = ufsvfsp->vfs_nindirshift;
 234         nindiroffset = ufsvfsp->vfs_nindiroffset;
 235         /*
 236          * Determine how many levels of indirection.
 237          */
 238         shft = 0;                               /* sh = 1 */
 239         tbn = lbn - NDADDR;
 240         for (j = NIADDR; j > 0; j--) {
 241                 longlong_t      sh;
 242
 243                 shft += nindirshift;            /* sh *= nindir */
 244                 sh = 1LL << shft;
 245                 if (tbn < sh)
 246                         break;
 247                 tbn -= sh;
 248         }
 249         if (j == 0)
 250                 return (EFBIG);
 251
 252         /*
 253          * Fetch the first indirect block.
 254          */
 255         nb = ip->i_ib[NIADDR - j];
 256         if (nb == 0) {
 257                 *bnp = UFS_HOLE;
 258                 return (0);
 259         }
 260
 261         /*
 262          * Fetch through the indirect blocks.
 263          */
 264         for (; j <= NIADDR; j++) {
 265                 ob = nb;
 266                 bp = UFS_BREAD(ufsvfsp,
 267                     ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
 268                 if (bp->b_flags & B_ERROR) {
 269                         brelse(bp);
 270                         return (EIO);
 271                 }
 272                 bap = bp->b_un.b_daddr;
 273
 274                 ASSERT(!ufs_indir_badblock(ip, bap));
 275
 276                 shft -= nindirshift;            /* sh / nindir */
 277                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
 278                 nb = bap[i];
 279                 if (nb == 0) {
 280                         *bnp = UFS_HOLE;
 281                         brelse(bp);
 282                         return (0);
 283                 }
 284                 if (j != NIADDR)
 285                         brelse(bp);
 286         }
 287         DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
 288             MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
 289             0, ufsvfsp->vfs_iotransz);
 290         brelse(bp);
 291         return (0);
 292 }
 293
 294 /*
 295  * See bmap_read for general notes.
 296  *
 297  * The block must be at least size bytes and will be extended or
 298  * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
 299  * will not create any in-core pages that correspond to the new disk allocation.
 300  * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
 301  * and security is maintained b/c upon reading a negative block number pages
 302  * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
 303  * be created and initialized as needed.
 304  *
 305  * Returns 0 on success, or a non-zero errno if an error occurs.
 306  */
 307 int
 308 bmap_write(struct inode *ip, uoff_t     off, int size,
 309     enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
 310 {
 311         struct  fs *fs;
 312         struct  buf *bp;
 313         int     i;
 314         struct  buf *nbp;
 315         int     j;
 316         int     shft;                           /* we maintain sh = 1 << shft */
 317         daddr_t ob, nb, pref, lbn, llbn, tbn;
 318         daddr32_t *bap;
 319         struct  vnode *vp = ITOV(ip);
 320         long    bsize = VBSIZE(vp);
 321         long    osize, nsize;
 322         int     issync, metaflag, isdirquota;
 323         int     err;
 324         dev_t   dev;
 325         struct  fbuf *fbp;
 326         int     nindirshift;
 327         int     nindiroffset;
 328         struct  ufsvfs  *ufsvfsp;
 329         int     added_sectors;          /* sectors added to this inode */
 330         int     alloced_blocks;         /* fs blocks newly allocated */
 331         struct  ufs_allocated_block undo_table[NIADDR+1];
 332         int     verylargefile = 0;
 333
 334         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 335
 336         if (allocblk)
 337                 *allocblk = 0;
 338
 339         ufsvfsp = ip->i_ufsvfs;
 340         fs = ufsvfsp->vfs_bufp->b_un.b_fs;
 341         lbn = (daddr_t)lblkno(fs, off);
 342         if (lbn < 0)
 343                 return (EFBIG);
 344         if (ip->i_blocks >= VERYLARGEFILESIZE)
 345                 verylargefile = 1;
 346         llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
 347         metaflag = isdirquota = 0;
 348         if (((ip->i_mode & IFMT) == IFDIR) ||
 349             ((ip->i_mode & IFMT) == IFATTRDIR))
 350                 isdirquota = metaflag = I_DIR;
 351         else if ((ip->i_mode & IFMT) == IFSHAD)
 352                 metaflag = I_SHAD;
 353         else if (ip->i_ufsvfs->vfs_qinod == ip)
 354                 isdirquota = metaflag = I_QUOTA;
 355
 356         issync = ((ip->i_flag & ISYNC) != 0);
 357
 358         if (isdirquota || issync) {
 359                 alloc_type = BI_NORMAL; /* make sure */
 360         }
 361
 362         /*
 363          * If the next write will extend the file into a new block,
 364          * and the file is currently composed of a fragment
 365          * this fragment has to be extended to be a full block.
 366          */
 367         if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
 368                 osize = blksize(fs, ip, llbn);
 369                 if (osize < bsize && osize > 0) {
 370                         /*
 371                          * Check to see if doing this will make the file too
 372                          * big.  Only check if we are dealing with a very
 373                          * large file.
 374                          */
 375                         if (verylargefile == 1) {
 376                                 if (((unsigned)ip->i_blocks +
 377                                     btodb(bsize - osize)) > INT_MAX) {
 378                                         return (EFBIG);
 379                                 }
 380                         }
 381                         /*
 382                          * Make sure we have all needed pages setup correctly.
 383                          *
 384                          * We pass S_OTHER to fbread here because we want
 385                          * an exclusive lock on the page in question
 386                          * (see ufs_getpage). I/O to the old block location
 387                          * may still be in progress and we are about to free
 388                          * the old block. We don't want anyone else to get
 389                          * a hold of the old block once we free it until
 390                          * the I/O is complete.
 391                          */
 392                         err =
 393                             fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
 394                             (uint_t)bsize, S_OTHER, &fbp);
 395                         if (err)
 396                                 return (err);
 397                         pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
 398                         err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
 399                             &nb, cr);
 400                         if (err) {
 401                                 if (fbp)
 402                                         fbrelse(fbp, S_OTHER);
 403                                 return (err);
 404                         }
 405                         ASSERT(!ufs_badblock(ip, nb));
 406
 407                         /*
 408                          * Update the inode before releasing the
 409                          * lock on the page. If we released the page
 410                          * lock first, the data could be written to its
 411                          * old address and then destroyed.
 412                          */
 413                         TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
 414                         ip->i_db[llbn] = nb;
 415                         UFS_SET_ISIZE(((uoff_t)(llbn + 1)) << fs->fs_bshift,
 416                             ip);
 417                         ip->i_blocks += btodb(bsize - osize);
 418                         ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 419                         TRANS_INODE(ufsvfsp, ip);
 420                         ip->i_flag |= IUPD | ICHG | IATTCHG;
 421
 422                         /* Caller is responsible for updating i_seq */
 423                         /*
 424                          * Don't check metaflag here, directories won't do this
 425                          *
 426                          */
 427                         if (issync) {
 428                                 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
 429                         } else {
 430                                 ASSERT(fbp);
 431                                 fbrelse(fbp, S_WRITE);
 432                         }
 433
 434                         if (nb != ob) {
 435                                 (void) free(ip, ob, (off_t)osize, metaflag);
 436                         }
 437                 }
 438         }
 439
 440         /*
 441          * The first NDADDR blocks are direct blocks.
 442          */
 443         if (lbn < NDADDR) {
 444                 nb = ip->i_db[lbn];
 445                 if (nb == 0 ||
 446                     ip->i_size < ((uoff_t)(lbn + 1)) << fs->fs_bshift) {
 447                         if (nb != 0) {
 448                                 /* consider need to reallocate a frag */
 449                                 osize = fragroundup(fs, blkoff(fs, ip->i_size));
 450                                 nsize = fragroundup(fs, size);
 451                                 if (nsize <= osize)
 452                                         goto gotit;
 453                                 /*
 454                                  * Check to see if doing this will make the
 455                                  * file too big.  Only check if we are dealing
 456                                  * with a very large file.
 457                                  */
 458                                 if (verylargefile == 1) {
 459                                         if (((unsigned)ip->i_blocks +
 460                                             btodb(nsize - osize)) > INT_MAX) {
 461                                                 return (EFBIG);
 462                                         }
 463                                 }
 464                                 /*
 465                                  * need to re-allocate a block or frag
 466                                  */
 467                                 ob = nb;
 468                                 pref = blkpref(ip, lbn, (int)lbn,
 469                                     &ip->i_db[0]);
 470                                 err = realloccg(ip, ob, pref, (int)osize,
 471                                     (int)nsize, &nb, cr);
 472                                 if (err)
 473                                         return (err);
 474                                 if (allocblk)
 475                                         *allocblk = nb;
 476                                 ASSERT(!ufs_badblock(ip, nb));
 477
 478                         } else {
 479                                 /*
 480                                  * need to allocate a block or frag
 481                                  */
 482                                 osize = 0;
 483                                 if (ip->i_size <
 484                                     ((uoff_t)(lbn + 1)) << fs->fs_bshift)
 485                                         nsize = fragroundup(fs, size);
 486                                 else
 487                                         nsize = bsize;
 488                                 /*
 489                                  * Check to see if doing this will make the
 490                                  * file too big.  Only check if we are dealing
 491                                  * with a very large file.
 492                                  */
 493                                 if (verylargefile == 1) {
 494                                         if (((unsigned)ip->i_blocks +
 495                                             btodb(nsize - osize)) > INT_MAX) {
 496                                                 return (EFBIG);
 497                                         }
 498                                 }
 499                                 pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
 500                                 err = alloc(ip, pref, (int)nsize, &nb, cr);
 501                                 if (err)
 502                                         return (err);
 503                                 if (allocblk)
 504                                         *allocblk = nb;
 505                                 ASSERT(!ufs_badblock(ip, nb));
 506                                 ob = nb;
 507                         }
 508
 509                         /*
 510                          * Read old/create new zero pages
 511                          */
 512                         fbp = NULL;
 513                         if (osize == 0) {
 514                                 /*
 515                                  * mmap S_WRITE faults always enter here
 516                                  */
 517                                 /*
 518                                  * We zero it if its also BI_FALLOCATE, but
 519                                  * only for direct blocks!
 520                                  */
 521                                 if (alloc_type == BI_NORMAL ||
 522                                     alloc_type == BI_FALLOCATE ||
 523                                     P2ROUNDUP_TYPED(size,
 524                                     PAGESIZE, uoff_t) < nsize) {
 525                                         /* fbzero doesn't cause a pagefault */
 526                                         fbzero(ITOV(ip),
 527                                             ((offset_t)lbn << fs->fs_bshift),
 528                                             (uint_t)nsize, &fbp);
 529                                 }
 530                         } else {
 531                                 err = fbread(vp,
 532                                     ((offset_t)lbn << fs->fs_bshift),
 533                                     (uint_t)nsize, S_OTHER, &fbp);
 534                                 if (err) {
 535                                         if (nb != ob) {
 536                                                 (void) free(ip, nb,
 537                                                     (off_t)nsize, metaflag);
 538                                         } else {
 539                                                 (void) free(ip,
 540                                                     ob + numfrags(fs, osize),
 541                                                     (off_t)(nsize - osize),
 542                                                     metaflag);
 543                                         }
 544                                         ASSERT(nsize >= osize);
 545                                         (void) chkdq(ip,
 546                                             -(long)btodb(nsize - osize),
 547                                             0, cr, (char **)NULL, NULL);
 548                                         return (err);
 549                                 }
 550                         }
 551                         TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
 552                         ip->i_db[lbn] = nb;
 553                         ip->i_blocks += btodb(nsize - osize);
 554                         ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 555                         TRANS_INODE(ufsvfsp, ip);
 556                         ip->i_flag |= IUPD | ICHG | IATTCHG;
 557
 558                         /* Caller is responsible for updating i_seq */
 559
 560                         /*
 561                          * Write directory and shadow blocks synchronously so
 562                          * that they never appear with garbage in them on the
 563                          * disk.
 564                          *
 565                          */
 566                         if (isdirquota && (ip->i_size ||
 567                             TRANS_ISTRANS(ufsvfsp))) {
 568                         /*
 569                          * XXX may not be necessary with harpy trans
 570                          * bug id 1130055
 571                          */
 572                                 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
 573                         } else if (fbp) {
 574                                 fbrelse(fbp, S_WRITE);
 575                         }
 576
 577                         if (nb != ob)
 578                                 (void) free(ip, ob, (off_t)osize, metaflag);
 579                 }
 580 gotit:
 581                 return (0);
 582         }
 583
 584         added_sectors = alloced_blocks = 0;     /* No blocks alloced yet */
 585
 586         /*
 587          * Determine how many levels of indirection.
 588          */
 589         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 590         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
 591         pref = 0;
 592         shft = 0;                               /* sh = 1 */
 593         tbn = lbn - NDADDR;
 594         for (j = NIADDR; j > 0; j--) {
 595                 longlong_t      sh;
 596
 597                 shft += nindirshift;            /* sh *= nindir */
 598                 sh = 1LL << shft;
 599                 if (tbn < sh)
 600                         break;
 601                 tbn -= sh;
 602         }
 603
 604         if (j == 0)
 605                 return (EFBIG);
 606
 607         /*
 608          * Fetch the first indirect block.
 609          */
 610         dev = ip->i_dev;
 611         nb = ip->i_ib[NIADDR - j];
 612         if (nb == 0) {
 613                 /*
 614                  * Check to see if doing this will make the
 615                  * file too big.  Only check if we are dealing
 616                  * with a very large file.
 617                  */
 618                 if (verylargefile == 1) {
 619                         if (((unsigned)ip->i_blocks + btodb(bsize))
 620                             > INT_MAX) {
 621                                 return (EFBIG);
 622                         }
 623                 }
 624                 /*
 625                  * Need to allocate an indirect block.
 626                  */
 627                 pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
 628                 err = alloc(ip, pref, (int)bsize, &nb, cr);
 629                 if (err)
 630                         return (err);
 631                 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
 632                 ASSERT(!ufs_badblock(ip, nb));
 633
 634                 /*
 635                  * Keep track of this allocation so we can undo it if we
 636                  * get an error later.
 637                  */
 638
 639                 ASSERT(alloced_blocks <= NIADDR);
 640
 641                 undo_table[alloced_blocks].this_block = nb;
 642                 undo_table[alloced_blocks].block_size = bsize;
 643                 undo_table[alloced_blocks].owner = ufs_no_owner;
 644                 undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
 645
 646                 alloced_blocks++;
 647
 648                 /*
 649                  * Write zero block synchronously so that
 650                  * indirect blocks never point at garbage.
 651                  */
 652                 bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
 653
 654                 clrbuf(bp);
 655                 /* XXX Maybe special-case this? */
 656                 TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
 657                 UFS_BWRITE2(ufsvfsp, bp);
 658                 if (bp->b_flags & B_ERROR) {
 659                         err = geterror(bp);
 660                         brelse(bp);
 661                         ufs_undo_allocation(ip, alloced_blocks,
 662                             undo_table, added_sectors);
 663                         return (err);
 664                 }
 665                 brelse(bp);
 666
 667                 ip->i_ib[NIADDR - j] = nb;
 668                 added_sectors += btodb(bsize);
 669                 ip->i_blocks += btodb(bsize);
 670                 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 671                 TRANS_INODE(ufsvfsp, ip);
 672                 ip->i_flag |= IUPD | ICHG | IATTCHG;
 673                 /* Caller is responsible for updating i_seq */
 674
 675                 /*
 676                  * Update the 'undo table' now that we've linked this block
 677                  * to an inode.
 678                  */
 679
 680                 undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
 681                 undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
 682
 683                 /*
 684                  * In the ISYNC case, wrip will notice that the block
 685                  * count on the inode has changed and will be sure to
 686                  * ufs_iupdat the inode at the end of wrip.
 687                  */
 688         }
 689
 690         /*
 691          * Fetch through the indirect blocks.
 692          */
 693         for (; j <= NIADDR; j++) {
 694                 ob = nb;
 695                 bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
 696
 697                 if (bp->b_flags & B_ERROR) {
 698                         err = geterror(bp);
 699                         brelse(bp);
 700                         /*
 701                          * Return any partial allocations.
 702                          *
 703                          * It is possible that we have not yet made any
 704                          * allocations at this point (if this is the first
 705                          * pass through the loop and we didn't have to
 706                          * allocate the first indirect block, above).
 707                          * In this case, alloced_blocks and added_sectors will
 708                          * be zero, and ufs_undo_allocation will do nothing.
 709                          */
 710                         ufs_undo_allocation(ip, alloced_blocks,
 711                             undo_table, added_sectors);
 712                         return (err);
 713                 }
 714                 bap = bp->b_un.b_daddr;
 715                 shft -= nindirshift;            /* sh /= nindir */
 716                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
 717                 nb = bap[i];
 718
 719                 if (nb == 0) {
 720                         /*
 721                          * Check to see if doing this will make the
 722                          * file too big.  Only check if we are dealing
 723                          * with a very large file.
 724                          */
 725                         if (verylargefile == 1) {
 726                                 if (((unsigned)ip->i_blocks + btodb(bsize))
 727                                     > INT_MAX) {
 728                                         brelse(bp);
 729                                         ufs_undo_allocation(ip, alloced_blocks,
 730                                             undo_table, added_sectors);
 731                                         return (EFBIG);
 732                                 }
 733                         }
 734                         if (pref == 0) {
 735                                 if (j < NIADDR) {
 736                                         /* Indirect block */
 737                                         pref = blkpref(ip, lbn, 0,
 738                                             (daddr32_t *)0);
 739                                 } else {
 740                                         /* Data block */
 741                                         pref = blkpref(ip, lbn, i, &bap[0]);
 742                                 }
 743                         }
 744
 745                         /*
 746                          * release "bp" buf to avoid deadlock (re-bread later)
 747                          */
 748                         brelse(bp);
 749
 750                         err = alloc(ip, pref, (int)bsize, &nb, cr);
 751                         if (err) {
 752                                 /*
 753                                  * Return any partial allocations.
 754                                  */
 755                                 ufs_undo_allocation(ip, alloced_blocks,
 756                                     undo_table, added_sectors);
 757                                 return (err);
 758                         }
 759
 760                         ASSERT(!ufs_badblock(ip, nb));
 761                         ASSERT(alloced_blocks <= NIADDR);
 762
 763                         if (allocblk)
 764                                 *allocblk = nb;
 765
 766                         undo_table[alloced_blocks].this_block = nb;
 767                         undo_table[alloced_blocks].block_size = bsize;
 768                         undo_table[alloced_blocks].owner = ufs_no_owner;
 769                         undo_table[alloced_blocks].usage_flags = metaflag |
 770                             ((j < NIADDR) ? I_IBLK : 0);
 771
 772                         alloced_blocks++;
 773
 774                         if (j < NIADDR) {
 775                                 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
 776                                 /*
 777                                  * Write synchronously so indirect
 778                                  * blocks never point at garbage.
 779                                  */
 780                                 nbp = UFS_GETBLK(
 781                                     ufsvfsp, dev, fsbtodb(fs, nb), bsize);
 782
 783                                 clrbuf(nbp);
 784                                 /* XXX Maybe special-case this? */
 785                                 TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
 786                                 UFS_BWRITE2(ufsvfsp, nbp);
 787                                 if (nbp->b_flags & B_ERROR) {
 788                                         err = geterror(nbp);
 789                                         brelse(nbp);
 790                                         /*
 791                                          * Return any partial
 792                                          * allocations.
 793                                          */
 794                                         ufs_undo_allocation(ip,
 795                                             alloced_blocks,
 796                                             undo_table, added_sectors);
 797                                         return (err);
 798                                 }
 799                                 brelse(nbp);
 800                         } else if (alloc_type == BI_NORMAL ||
 801                             P2ROUNDUP_TYPED(size,
 802                             PAGESIZE, uoff_t) < bsize) {
 803                                 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
 804                                 fbzero(ITOV(ip),
 805                                     ((offset_t)lbn << fs->fs_bshift),
 806                                     (uint_t)bsize, &fbp);
 807
 808                                 /*
 809                                  * Cases which we need to do a synchronous
 810                                  * write of the zeroed data pages:
 811                                  *
 812                                  * 1) If we are writing a directory then we
 813                                  * want to write synchronously so blocks in
 814                                  * directories never contain garbage.
 815                                  *
 816                                  * 2) If we are filling in a hole and the
 817                                  * indirect block is going to be synchronously
 818                                  * written back below we need to make sure
 819                                  * that the zeroes are written here before
 820                                  * the indirect block is updated so that if
 821                                  * we crash before the real data is pushed
 822                                  * we will not end up with random data is
 823                                  * the middle of the file.
 824                                  *
 825                                  * 3) If the size of the request rounded up
 826                                  * to the system page size is smaller than
 827                                  * the file system block size, we want to
 828                                  * write out all the pages now so that
 829                                  * they are not aborted before they actually
 830                                  * make it to ufs_putpage since the length
 831                                  * of the inode will not include the pages.
 832                                  */
 833
 834                                 if (isdirquota || (issync &&
 835                                     lbn < llbn))
 836                                         (void) ufs_fbiwrite(fbp, ip, nb,
 837                                             fs->fs_fsize);
 838                                 else
 839                                         fbrelse(fbp, S_WRITE);
 840                         }
 841
 842                         /*
 843                          * re-acquire "bp" buf
 844                          */
 845                         bp = UFS_BREAD(ufsvfsp,
 846                             ip->i_dev, fsbtodb(fs, ob), bsize);
 847                         if (bp->b_flags & B_ERROR) {
 848                                 err = geterror(bp);
 849                                 brelse(bp);
 850                                 /*
 851                                  * Return any partial allocations.
 852                                  */
 853                                 ufs_undo_allocation(ip,
 854                                     alloced_blocks,
 855                                     undo_table, added_sectors);
 856                                 return (err);
 857                         }
 858                         bap = bp->b_un.b_daddr;
 859                         bap[i] = nb;
 860
 861                         /*
 862                          * The magic explained: j will be equal to NIADDR
 863                          * when we are at the lowest level, this is where the
 864                          * array entries point directly to data blocks. Since
 865                          * we will be 'fallocate'ing we will go ahead and negate
 866                          * the addresses.
 867                          */
 868                         if (alloc_type == BI_FALLOCATE && j == NIADDR)
 869                                 bap[i] = -bap[i];
 870
 871                         TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
 872                         added_sectors += btodb(bsize);
 873                         ip->i_blocks += btodb(bsize);
 874                         ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 875                         TRANS_INODE(ufsvfsp, ip);
 876                         ip->i_flag |= IUPD | ICHG | IATTCHG;
 877
 878                         /* Caller is responsible for updating i_seq */
 879
 880                         undo_table[alloced_blocks-1].owner =
 881                             ufs_indirect_block;
 882                         undo_table[alloced_blocks-1].owner_block = ob;
 883                         undo_table[alloced_blocks-1].owner_offset = i;
 884
 885                         if (issync) {
 886                                 UFS_BWRITE2(ufsvfsp, bp);
 887                                 if (bp->b_flags & B_ERROR) {
 888                                         err = geterror(bp);
 889                                         brelse(bp);
 890                                         /*
 891                                          * Return any partial
 892                                          * allocations.
 893                                          */
 894                                         ufs_undo_allocation(ip,
 895                                             alloced_blocks,
 896                                             undo_table, added_sectors);
 897                                         return (err);
 898                                 }
 899                                 brelse(bp);
 900                         } else {
 901                                 bdrwrite(bp);
 902                         }
 903                 } else {
 904                         brelse(bp);
 905                 }
 906         }
 907         return (0);
 908 }
 909
 910 /*
 911  * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
 912  * is in the critical region of wrip().
 913  */
 914 int
 915 bmap_has_holes(struct inode *ip)
 916 {
 917         struct fs *fs = ip->i_fs;
 918         uint_t  dblks;                  /* # of data blocks */
 919         uint_t  mblks;                  /* # of data + metadata blocks */
 920         int     nindirshift;
 921         int     nindiroffset;
 922         uint_t  cnt;
 923         int     n, j, shft;
 924         uint_t nindirblks;
 925
 926         int     fsbshift = fs->fs_bshift;
 927         int     fsboffset = (1 << fsbshift) - 1;
 928
 929         /*
 930          * Check for writer in critical region, if found then we
 931          * cannot trust the values of i_size and i_blocks
 932          * simply return true.
 933          */
 934         if (ip->i_writer != NULL && ip->i_writer != curthread) {
 935                 return (1);
 936         }
 937
 938         dblks = (ip->i_size + fsboffset) >> fsbshift;
 939         mblks = (ldbtob((uoff_t)ip->i_blocks) + fsboffset) >> fsbshift;
 940
 941         /*
 942          * File has only direct blocks.
 943          */
 944         if (dblks <= NDADDR)
 945                 return (mblks < dblks);
 946         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 947
 948         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
 949         nindirblks = nindiroffset + 1;
 950
 951         dblks -= NDADDR;
 952         shft = 0;
 953         /*
 954          * Determine how many levels of indirection.
 955          */
 956         for (j = NIADDR; j > 0; j--) {
 957                 longlong_t      sh;
 958
 959                 shft += nindirshift;    /* sh *= nindir */
 960                 sh = 1LL << shft;
 961                 if (dblks <= sh)
 962                         break;
 963                 dblks -= sh;
 964         }
 965         /* LINTED: warning: logical expression always true: op "||" */
 966         ASSERT(NIADDR <= 3);
 967         ASSERT(j <= NIADDR);
 968         if (j == NIADDR)        /* single level indirection */
 969                 cnt = NDADDR + 1 + dblks;
 970         else if (j == NIADDR-1) /* double indirection */
 971                 cnt = NDADDR + 1 + nindirblks +
 972                     1 + (dblks + nindiroffset)/nindirblks + dblks;
 973         else if (j == NIADDR-2) { /* triple indirection */
 974                 n = (dblks + nindiroffset)/nindirblks;
 975                 cnt = NDADDR + 1 + nindirblks +
 976                     1 + nindirblks + nindirblks*nindirblks +
 977                     1 + (n + nindiroffset)/nindirblks + n + dblks;
 978         }
 979
 980         return (mblks < cnt);
 981 }
 982
 983 /*
 984  * find some contig blocks starting at *sbp and going for min(n, max_contig)
 985  * return the number of blocks (not frags) found.
 986  * The array passed in must be at least [0..n-1].
 987  */
 988 static int
 989 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
 990 {
 991         register daddr_t bn, nextbn;
 992         register daddr32_t *bp;
 993         register int diff;
 994         int maxtransblk;
 995
 996         if (n <= 0)
 997                 return (0);
 998         bn = *sbp;
 999         if (bn == 0)
1000                 return (0);
1001
1002         diff = fs->fs_frag;
1003         if (*lenp) {
1004                 n = MIN(n, lblkno(fs, *lenp));
1005         } else {
1006                 /*
1007                  * If the user has set the value for maxcontig lower than
1008                  * the drive transfer size, then assume they want this
1009                  * to be the maximum value for the size of the data transfer.
1010                  */
1011                 maxtransblk = maxtransfer >> DEV_BSHIFT;
1012                 if (fs->fs_maxcontig < maxtransblk) {
1013                         n = MIN(n, fs->fs_maxcontig);
1014                 } else {
1015                         n = MIN(n, maxtransblk);
1016                 }
1017         }
1018         bp = sbp;
1019         while (--n > 0) {
1020                 nextbn = *(bp + 1);
1021                 if (nextbn == 0 || bn + diff != nextbn)
1022                         break;
1023                 bn = nextbn;
1024                 bp++;
1025         }
1026         return ((int)(bp - sbp) + 1);
1027 }
1028
1029 /*
1030  * Free any blocks which had been successfully allocated.  Always called
1031  * as a result of an error, so we don't bother returning an error code
1032  * from here.
1033  *
1034  * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1035  * Thus it is safe to call this as part of error handling, whether or not
1036  * any blocks have been allocated.
1037  *
1038  * The ufs_inode_direct case is currently unused.
1039  */
1040
1041 static void
1042 ufs_undo_allocation(
1043         inode_t *ip,
1044         int block_count,
1045         struct ufs_allocated_block table[],
1046         int inode_sector_adjust)
1047 {
1048         int i;
1049         int inode_changed;
1050         int error_updating_pointers;
1051         struct ufsvfs *ufsvfsp;
1052
1053         inode_changed = 0;
1054         error_updating_pointers = 0;
1055
1056         ufsvfsp = ip->i_ufsvfs;
1057
1058         /*
1059          * Update pointers on disk before freeing blocks.  If we fail,
1060          * some blocks may remain busy; but they will be reclaimed by
1061          * an fsck.  (This is better than letting a block wind up with
1062          * two owners if we successfully freed it but could not remove
1063          * the pointer to it.)
1064          */
1065
1066         for (i = 0; i < block_count; i++) {
1067                 switch (table[i].owner) {
1068                 case ufs_no_owner:
1069                         /* Nothing to do here, nobody points to us */
1070                         break;
1071                 case ufs_inode_direct:
1072                         ASSERT(table[i].owner_offset < NDADDR);
1073                         ip->i_db[table[i].owner_offset] = 0;
1074                         inode_changed = 1;
1075                         break;
1076                 case ufs_inode_indirect:
1077                         ASSERT(table[i].owner_offset < NIADDR);
1078                         ip->i_ib[table[i].owner_offset] = 0;
1079                         inode_changed = 1;
1080                         break;
1081                 case ufs_indirect_block: {
1082                         buf_t *bp;
1083                         daddr32_t *block_data;
1084
1085                         /* Read/modify/log/write. */
1086
1087                         ASSERT(table[i].owner_offset <
1088                             (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1089
1090                         bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1091                             fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1092                             VBSIZE(ITOV(ip)));
1093
1094                         if (bp->b_flags & B_ERROR) {
1095                                 /* Couldn't read this block; give up. */
1096                                 error_updating_pointers = 1;
1097                                 brelse(bp);
1098                                 break;          /* out of SWITCH */
1099                         }
1100
1101                         block_data = bp->b_un.b_daddr;
1102                         block_data[table[i].owner_offset] = 0;
1103
1104                         /* Write a log entry which includes the zero. */
1105                         /* It might be possible to optimize this by using */
1106                         /* TRANS_BUF directly and zeroing only the four */
1107                         /* bytes involved, but an attempt to do that led */
1108                         /* to panics in the logging code.  The attempt was */
1109                         /* TRANS_BUF(ufsvfsp,                             */
1110                         /*    table[i].owner_offset * sizeof (daddr32_t), */
1111                         /*    sizeof (daddr32_t),                         */
1112                         /*    bp,                                         */
1113                         /*    DT_ABZERO);                                 */
1114
1115                         TRANS_BUF_ITEM_128(ufsvfsp,
1116                             block_data[table[i].owner_offset],
1117                             block_data, bp, DT_AB);
1118
1119                         /* Now we can write the buffer itself. */
1120
1121                         UFS_BWRITE2(ufsvfsp, bp);
1122
1123                         if (bp->b_flags & B_ERROR) {
1124                                 error_updating_pointers = 1;
1125                         }
1126
1127                         brelse(bp);
1128                         break;
1129                 }
1130                 default:
1131                         (void) ufs_fault(ITOV(ip),
1132                             "ufs_undo_allocation failure\n");
1133                         break;
1134                 }
1135         }
1136
1137         /*
1138          * If the inode changed, or if we need to update its block count,
1139          * then do that now.  We update the inode synchronously on disk
1140          * to ensure that it won't transiently point at a block we've
1141          * freed (only necessary if we're not logging).
1142          *
1143          * NOTE: Currently ufs_iupdat() does not check for errors.  When
1144          * it is fixed, we should verify that we successfully updated the
1145          * inode before freeing blocks below.
1146          */
1147
1148         if (inode_changed || (inode_sector_adjust != 0)) {
1149                 ip->i_blocks -= inode_sector_adjust;
1150                 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1151                 TRANS_INODE(ufsvfsp, ip);
1152                 ip->i_flag |= IUPD | ICHG | IATTCHG;
1153                 ip->i_seq++;
1154                 if (!TRANS_ISTRANS(ufsvfsp))
1155                         ufs_iupdat(ip, I_SYNC);
1156         }
1157
1158         /*
1159          * Now we go through and actually free the blocks, but only if we
1160          * successfully removed the pointers to them.
1161          */
1162
1163         if (!error_updating_pointers) {
1164                 for (i = 0; i < block_count; i++) {
1165                         free(ip, table[i].this_block, table[i].block_size,
1166                             table[i].usage_flags);
1167                 }
1168         }
1169 }
1170
1171 /*
1172  * Find the next hole or data block in file starting at *off
1173  * Return found offset in *off, which can be less than the
1174  * starting offset if not block aligned.
1175  * This code is based on bmap_read().
1176  * Errors: ENXIO for end of file
1177  *         EIO for block read error.
1178  */
1179 int
1180 bmap_find(struct inode *ip, boolean_t hole, uoff_t *off)
1181 {
1182         ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1183         struct fs *fs = ufsvfsp->vfs_fs;
1184         buf_t *bp[NIADDR];
1185         int i, j;
1186         int shft;                       /* we maintain sh = 1 << shft */
1187         int nindirshift, nindiroffset;
1188         daddr_t ob, nb, tbn, lbn, skip;
1189         daddr32_t *bap;
1190         uoff_t isz = (offset_t)ip->i_size;
1191         int32_t bs = fs->fs_bsize; /* file system block size */
1192         int32_t nindir = fs->fs_nindir;
1193         dev_t dev;
1194         int error = 0;
1195         daddr_t limits[NIADDR];
1196
1197         ASSERT(*off < isz);
1198         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1199         lbn = (daddr_t)lblkno(fs, *off);
1200         ASSERT(lbn >= 0);
1201
1202         for (i = 0; i < NIADDR; i++)
1203                 bp[i] = NULL;
1204
1205         /*
1206          * The first NDADDR blocks are direct blocks.
1207          */
1208         if (lbn < NDADDR) {
1209                 for (; lbn < NDADDR; lbn++) {
1210                         if ((hole && (ip->i_db[lbn] == 0)) ||
1211                             (!hole && (ip->i_db[lbn] != 0))) {
1212                                 goto out;
1213                         }
1214                 }
1215                 if ((uoff_t)lbn << fs->fs_bshift >= isz)
1216                         goto out;
1217         }
1218
1219         nindir = fs->fs_nindir;
1220         nindirshift = ufsvfsp->vfs_nindirshift;
1221         nindiroffset = ufsvfsp->vfs_nindiroffset;
1222         dev = ip->i_dev;
1223
1224         /* Set up limits array */
1225         for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
1226                 limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1227
1228 loop:
1229         /*
1230          * Determine how many levels of indirection.
1231          */
1232         shft = 0;                               /* sh = 1 */
1233         tbn = lbn - NDADDR;
1234         for (j = NIADDR; j > 0; j--) {
1235                 longlong_t sh;
1236
1237                 shft += nindirshift;            /* sh *= nindir */
1238                 sh = 1LL << shft;
1239                 if (tbn < sh)
1240                         break;
1241                 tbn -= sh;
1242         }
1243         if (j == 0) {
1244                 /* must have passed end of file */
1245                 ASSERT(((uoff_t)lbn << fs->fs_bshift) >= isz);
1246                 goto out;
1247         }
1248
1249         /*
1250          * Fetch the first indirect block.
1251          */
1252         nb = ip->i_ib[NIADDR - j];
1253         if (nb == 0) {
1254                 if (hole) {
1255                         lbn = limits[NIADDR - j];
1256                         goto out;
1257                 } else {
1258                         lbn = limits[NIADDR - j + 1];
1259                         if ((uoff_t)lbn << fs->fs_bshift >= isz)
1260                                 goto out;
1261                         goto loop;
1262                 }
1263         }
1264
1265         /*
1266          * Fetch through the indirect blocks.
1267          */
1268         for (; ((j <= NIADDR) && (nb != 0)); j++) {
1269                 ob = nb;
1270                 /*
1271                  * if there's a different block at this level then release
1272                  * the old one and in with the new.
1273                  */
1274                 if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1275                         if (bp[j-1] != NULL)
1276                                 brelse(bp[j-1]);
1277                         bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1278                         if (bp[j-1]->b_flags & B_ERROR) {
1279                                 error = EIO;
1280                                 goto out;
1281                         }
1282                 }
1283                 bap = bp[j-1]->b_un.b_daddr;
1284
1285                 shft -= nindirshift;            /* sh / nindir */
1286                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1287                 nb = bap[i];
1288                 skip = 1LL << (nindirshift * (NIADDR - j));
1289         }
1290
1291         /*
1292          * Scan through the blocks in this array.
1293          */
1294         for (; i < nindir; i++, lbn += skip) {
1295                 if (hole && (bap[i] == 0))
1296                         goto out;
1297                 if (!hole && (bap[i] != 0)) {
1298                         if (skip == 1) {
1299                                 /* we're at the lowest level */
1300                                 goto out;
1301                         } else {
1302                                 goto loop;
1303                         }
1304                 }
1305         }
1306         if (((uoff_t)lbn << fs->fs_bshift) < isz)
1307                 goto loop;
1308 out:
1309         for (i = 0; i < NIADDR; i++) {
1310                 if (bp[i])
1311                         brelse(bp[i]);
1312         }
1313         if (error == 0) {
1314                 if (((uoff_t)lbn << fs->fs_bshift) >= isz) {
1315                         error = ENXIO;
1316                 } else {
1317                         /* success */
1318                         *off = (uoff_t)lbn << fs->fs_bshift;
1319                 }
1320         }
1321         return (error);
1322 }
1323
1324 /*
1325  * Set a particular offset in the inode list to be a certain block.
1326  * User is responsible for calling TRANS* functions
1327  */
1328 int
1329 bmap_set_bn(struct vnode *vp, uoff_t off, daddr32_t bn)
1330 {
1331         daddr_t lbn;
1332         struct inode *ip;
1333         ufsvfs_t *ufsvfsp;
1334         struct  fs *fs;
1335         struct  buf *bp;
1336         int     i, j;
1337         int     shft;                   /* we maintain sh = 1 << shft */
1338         int err;
1339         daddr_t ob, nb, tbn;
1340         daddr32_t *bap;
1341         int     nindirshift, nindiroffset;
1342
1343         ip = VTOI(vp);
1344         ufsvfsp = ip->i_ufsvfs;
1345         fs = ufsvfsp->vfs_fs;
1346         lbn = (daddr_t)lblkno(fs, off);
1347
1348         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1349
1350         if (lbn < 0)
1351                 return (EFBIG);
1352
1353         /*
1354          * Take care of direct block assignment
1355          */
1356         if (lbn < NDADDR) {
1357                 ip->i_db[lbn] = bn;
1358                 return (0);
1359         }
1360
1361         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1362         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1363         /*
1364          * Determine how many levels of indirection.
1365          */
1366         shft = 0;                               /* sh = 1 */
1367         tbn = lbn - NDADDR;
1368         for (j = NIADDR; j > 0; j--) {
1369                 longlong_t      sh;
1370
1371                 shft += nindirshift;            /* sh *= nindir */
1372                 sh = 1LL << shft;
1373                 if (tbn < sh)
1374                         break;
1375                 tbn -= sh;
1376         }
1377         if (j == 0)
1378                 return (EFBIG);
1379
1380         /*
1381          * Fetch the first indirect block.
1382          */
1383         nb = ip->i_ib[NIADDR - j];
1384         if (nb == 0) {
1385                 err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1386                 return (err);
1387         }
1388
1389         /*
1390          * Fetch through the indirect blocks.
1391          */
1392         for (; j <= NIADDR; j++) {
1393                 ob = nb;
1394                 bp = UFS_BREAD(ufsvfsp,
1395                     ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1396                 if (bp->b_flags & B_ERROR) {
1397                         err = geterror(bp);
1398                         brelse(bp);
1399                         return (err);
1400                 }
1401                 bap = bp->b_un.b_daddr;
1402
1403                 ASSERT(!ufs_indir_badblock(ip, bap));
1404
1405                 shft -= nindirshift;            /* sh / nindir */
1406                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1407
1408                 nb = bap[i];
1409                 if (nb == 0) {
1410                         err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1411                         return (err);
1412                 }
1413
1414                 if (j == NIADDR) {
1415                         bap[i] = bn;
1416                         bdrwrite(bp);
1417                         return (0);
1418                 }
1419
1420                 brelse(bp);
1421         }
1422         return (0);
1423 }