usr/src/uts/common/fs/ufs/ufs_alloc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 #include <sys/condvar_impl.h>
  40 #include <sys/types.h>
  41 #include <sys/t_lock.h>
  42 #include <sys/debug.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/signal.h>
  46 #include <sys/cred.h>
  47 #include <sys/proc.h>
  48 #include <sys/disp.h>
  49 #include <sys/user.h>
  50 #include <sys/buf.h>
  51 #include <sys/vfs.h>
  52 #include <sys/vnode.h>
  53 #include <sys/acl.h>
  54 #include <sys/fs/ufs_fs.h>
  55 #include <sys/fs/ufs_inode.h>
  56 #include <sys/fs/ufs_acl.h>
  57 #include <sys/fs/ufs_bio.h>
  58 #include <sys/fs/ufs_quota.h>
  59 #include <sys/kmem.h>
  60 #include <sys/fs/ufs_trans.h>
  61 #include <sys/fs/ufs_panic.h>
  62 #include <sys/errno.h>
  63 #include <sys/time.h>
  64 #include <sys/sysmacros.h>
  65 #include <sys/file.h>
  66 #include <sys/fcntl.h>
  67 #include <sys/flock.h>
  68 #include <fs/fs_subr.h>
  69 #include <sys/cmn_err.h>
  70 #include <sys/policy.h>
  71 #include <sys/fs/ufs_log.h>
  72
  73 static ino_t    hashalloc();
  74 static daddr_t  fragextend();
  75 static daddr_t  alloccg();
  76 static daddr_t  alloccgblk();
  77 static ino_t    ialloccg();
  78 static daddr_t  mapsearch();
  79 static int      findlogstartcg();
  80
  81 extern int      inside[], around[];
  82 extern uchar_t  *fragtbl[];
  83 void delay();
  84
  85 /*
  86  * Allocate a block in the file system.
  87  *
  88  * The size of the requested block is given, which must be some
  89  * multiple of fs_fsize and <= fs_bsize.
  90  * A preference may be optionally specified. If a preference is given
  91  * the following hierarchy is used to allocate a block:
  92  *   1) allocate the requested block.
  93  *   2) allocate a rotationally optimal block in the same cylinder.
  94  *   3) allocate a block in the same cylinder group.
  95  *   4) quadratically rehash into other cylinder groups, until an
  96  *      available block is located.
  97  * If no block preference is given the following hierarchy is used
  98  * to allocate a block:
  99  *   1) allocate a block in the cylinder group that contains the
 100  *      inode for the file.
 101  *   2) quadratically rehash into other cylinder groups, until an
 102  *      available block is located.
 103  */
 104 int
 105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
 106 {
 107         struct fs *fs;
 108         struct ufsvfs *ufsvfsp;
 109         daddr_t bno;
 110         int cg;
 111         int err;
 112         char *errmsg = NULL;
 113         size_t len;
 114         clock_t now;
 115
 116         ufsvfsp = ip->i_ufsvfs;
 117         fs = ufsvfsp->vfs_fs;
 118         if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 119                 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
 120                     " bsize = %d, size = %d, fs = %s\n",
 121                     ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
 122                 return (err);
 123         }
 124         if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 125                 goto nospace;
 126         if (freespace(fs, ufsvfsp) <= 0 &&
 127             secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
 128                 goto nospace;
 129         err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
 130         /* Note that may not have err, but may have errmsg */
 131         if (errmsg != NULL) {
 132                 uprintf(errmsg);
 133                 kmem_free(errmsg, len);
 134                 errmsg = NULL;
 135         }
 136         if (err)
 137                 return (err);
 138         if (bpref >= fs->fs_size)
 139                 bpref = 0;
 140         if (bpref == 0)
 141                 cg = (int)itog(fs, ip->i_number);
 142         else
 143                 cg = dtog(fs, bpref);
 144
 145         bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
 146             (ulong_t (*)())alloccg);
 147         if (bno > 0) {
 148                 *bnp = bno;
 149                 return (0);
 150         }
 151
 152         /*
 153          * hashalloc() failed because some other thread grabbed
 154          * the last block so unwind the quota operation.  We can
 155          * ignore the return because subtractions don't fail and
 156          * size is guaranteed to be >= zero by our caller.
 157          */
 158         (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
 159             (size_t *)NULL);
 160
 161 nospace:
 162         now = ddi_get_lbolt();
 163         mutex_enter(&ufsvfsp->vfs_lock);
 164         if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
 165             (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
 166                 ufsvfsp->vfs_lastwhinetime = now;
 167                 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
 168         }
 169         mutex_exit(&ufsvfsp->vfs_lock);
 170         return (ENOSPC);
 171 }
 172
 173 /*
 174  * Reallocate a fragment to a bigger size
 175  *
 176  * The number and size of the old block is given, and a preference
 177  * and new size is also specified.  The allocator attempts to extend
 178  * the original block.  Failing that, the regular block allocator is
 179  * invoked to get an appropriate block.
 180  */
 181 int
 182 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
 183     int nsize, daddr_t *bnp, cred_t *cr)
 184 {
 185         daddr_t bno;
 186         struct fs *fs;
 187         struct ufsvfs *ufsvfsp;
 188         int cg, request;
 189         int err;
 190         char *errmsg = NULL;
 191         size_t len;
 192         clock_t now;
 193
 194         ufsvfsp = ip->i_ufsvfs;
 195         fs = ufsvfsp->vfs_fs;
 196         if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
 197             (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 198                 err = ufs_fault(ITOV(ip),
 199                     "realloccg: bad size, dev=0x%lx, bsize=%d, "
 200                     "osize=%d, nsize=%d, fs=%s\n",
 201                     ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
 202                 return (err);
 203         }
 204         if (freespace(fs, ufsvfsp) <= 0 &&
 205             secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
 206                 goto nospace;
 207         if (bprev == 0) {
 208                 err = ufs_fault(ITOV(ip),
 209                     "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
 210                     " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
 211                     fs->fs_fsmnt);
 212                 return (err);
 213         }
 214         err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
 215         /* Note that may not have err, but may have errmsg */
 216         if (errmsg != NULL) {
 217                 uprintf(errmsg);
 218                 kmem_free(errmsg, len);
 219                 errmsg = NULL;
 220         }
 221         if (err)
 222                 return (err);
 223         cg = dtog(fs, bprev);
 224         bno = fragextend(ip, cg, (long)bprev, osize, nsize);
 225         if (bno != 0) {
 226                 *bnp = bno;
 227                 return (0);
 228         }
 229         if (bpref >= fs->fs_size)
 230                 bpref = 0;
 231
 232         /*
 233          * When optimizing for time we allocate a full block and
 234          * then only use the upper portion for this request. When
 235          * this file grows again it will grow into the unused portion
 236          * of the block (See fragextend() above).  This saves time
 237          * because an extra disk write would be needed if the frags
 238          * following the current allocation were not free. The extra
 239          * disk write is needed to move the data from its current
 240          * location into the newly allocated position.
 241          *
 242          * When optimizing for space we allocate a run of frags
 243          * that is just the right size for this request.
 244          */
 245         request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
 246         bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
 247             (ulong_t (*)())alloccg);
 248         if (bno > 0) {
 249                 *bnp = bno;
 250                 if (nsize < request)
 251                         (void) free(ip, bno + numfrags(fs, nsize),
 252                             (off_t)(request - nsize), I_NOCANCEL);
 253                 return (0);
 254         }
 255
 256         /*
 257          * hashalloc() failed because some other thread grabbed
 258          * the last block so unwind the quota operation.  We can
 259          * ignore the return because subtractions don't fail, and
 260          * our caller guarantees nsize >= osize.
 261          */
 262         (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
 263             (size_t *)NULL);
 264
 265 nospace:
 266         now = ddi_get_lbolt();
 267         mutex_enter(&ufsvfsp->vfs_lock);
 268         if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
 269             (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
 270                 ufsvfsp->vfs_lastwhinetime = now;
 271                 cmn_err(CE_NOTE,
 272                     "realloccg %s: file system full", fs->fs_fsmnt);
 273         }
 274         mutex_exit(&ufsvfsp->vfs_lock);
 275         return (ENOSPC);
 276 }
 277
 278 /*
 279  * Allocate an inode in the file system.
 280  *
 281  * A preference may be optionally specified. If a preference is given
 282  * the following hierarchy is used to allocate an inode:
 283  *   1) allocate the requested inode.
 284  *   2) allocate an inode in the same cylinder group.
 285  *   3) quadratically rehash into other cylinder groups, until an
 286  *      available inode is located.
 287  * If no inode preference is given the following hierarchy is used
 288  * to allocate an inode:
 289  *   1) allocate an inode in cylinder group 0.
 290  *   2) quadratically rehash into other cylinder groups, until an
 291  *      available inode is located.
 292  */
 293 int
 294 ufs_ialloc(struct inode *pip,
 295     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
 296 {
 297         struct inode *ip;
 298         struct fs *fs;
 299         int cg;
 300         ino_t ino;
 301         int err;
 302         int nifree;
 303         struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
 304         char *errmsg = NULL;
 305         size_t len;
 306
 307         ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
 308         fs = pip->i_fs;
 309 loop:
 310         nifree = fs->fs_cstotal.cs_nifree;
 311
 312         if (nifree == 0)
 313                 goto noinodes;
 314         /*
 315          * Shadow inodes don't count against a user's inode allocation.
 316          * They are an implementation method and not a resource.
 317          */
 318         if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
 319                 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
 320                     /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
 321                     cr, &errmsg, &len);
 322                 /*
 323                  * As we haven't acquired any locks yet, dump the message
 324                  * now.
 325                  */
 326                 if (errmsg != NULL) {
 327                         uprintf(errmsg);
 328                         kmem_free(errmsg, len);
 329                         errmsg = NULL;
 330                 }
 331                 if (err)
 332                         return (err);
 333         }
 334
 335         if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
 336                 ipref = 0;
 337         cg = (int)itog(fs, ipref);
 338         ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
 339             (ulong_t (*)())ialloccg);
 340         if (ino == 0) {
 341                 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
 342                         /*
 343                          * We can safely ignore the return from chkiq()
 344                          * because deallocations can only fail if we
 345                          * can't get the user's quota info record off
 346                          * the disk due to an I/O error.  In that case,
 347                          * the quota subsystem is already messed up.
 348                          */
 349                         (void) chkiq(ufsvfsp, /* change */ -1,
 350                             (struct inode *)NULL, crgetuid(cr), 0, cr,
 351                             (char **)NULL, (size_t *)NULL);
 352                 }
 353                 goto noinodes;
 354         }
 355         err = ufs_iget(pip->i_vfs, ino, ipp, cr);
 356         if (err) {
 357                 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
 358                         /*
 359                          * See above comment about why it is safe to ignore an
 360                          * error return here.
 361                          */
 362                         (void) chkiq(ufsvfsp, /* change */ -1,
 363                             (struct inode *)NULL, crgetuid(cr), 0, cr,
 364                             (char **)NULL, (size_t *)NULL);
 365                 }
 366                 ufs_ifree(pip, ino, 0);
 367                 return (err);
 368         }
 369         ip = *ipp;
 370         ASSERT(!ip->i_ufs_acl);
 371         ASSERT(!ip->i_dquot);
 372         rw_enter(&ip->i_contents, RW_WRITER);
 373
 374         /*
 375          * Check if we really got a free inode, if not then complain
 376          * and mark the inode ISTALE so that it will be freed by the
 377          * ufs idle thread eventually and will not be sent to ufs_delete().
 378          */
 379         if (ip->i_mode || (ip->i_nlink > 0)) {
 380                 ip->i_flag |= ISTALE;
 381                 rw_exit(&ip->i_contents);
 382                 VN_RELE(ITOV(ip));
 383                 cmn_err(CE_WARN,
 384                     "%s: unexpected allocated inode %d, run fsck(1M)%s",
 385                     fs->fs_fsmnt, (int)ino,
 386                     (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
 387                 goto loop;
 388         }
 389
 390         /*
 391          * Check the inode has no size or data blocks.
 392          * This could have happened if the truncation failed when
 393          * deleting the inode. It used to be possible for this to occur
 394          * if a block allocation failed when iteratively truncating a
 395          * large file using logging and with a full file system.
 396          * This was fixed with bug fix 4348738. However, truncation may
 397          * still fail on an IO error. So in all cases for safety and
 398          * security we clear out the size; the blocks allocated; and
 399          * pointers to the blocks. This will ultimately cause a fsck
 400          * error of un-accounted for blocks, but its a fairly benign error,
 401          * and possibly the correct thing to do anyway as accesssing those
 402          * blocks agains may lead to more IO errors.
 403          */
 404         if (ip->i_size || ip->i_blocks) {
 405                 int i;
 406
 407                 if (ip->i_size) {
 408                         cmn_err(CE_WARN,
 409                             "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
 410                             fs->fs_fsmnt, (int)ino, ip->i_size,
 411                             (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
 412                 }
 413                 /*
 414                  * Clear any garbage left behind.
 415                  */
 416                 ip->i_size = (u_offset_t)0;
 417                 ip->i_blocks = 0;
 418                 for (i = 0; i < NDADDR; i++)
 419                         ip->i_db[i] = 0;
 420                 for (i = 0; i < NIADDR; i++)
 421                         ip->i_ib[i] = 0;
 422         }
 423
 424         /*
 425          * Initialize the link count
 426          */
 427         ip->i_nlink = 0;
 428
 429         /*
 430          * Clear the old flags
 431          */
 432         ip->i_flag &= IREF;
 433
 434         /*
 435          * Access times are not really defined if the fs is mounted
 436          * with 'noatime'. But it can cause nfs clients to fail
 437          * open() if the atime is not a legal value. Set a legal value
 438          * here when the inode is allocated.
 439          */
 440         if (ufsvfsp->vfs_noatime) {
 441                 mutex_enter(&ufs_iuniqtime_lock);
 442                 ip->i_atime = iuniqtime;
 443                 mutex_exit(&ufs_iuniqtime_lock);
 444         }
 445         rw_exit(&ip->i_contents);
 446         return (0);
 447 noinodes:
 448         if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
 449                 cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
 450         return (ENOSPC);
 451 }
 452
 453 /*
 454  * Find a cylinder group to place a directory.
 455  * Returns an inumber within the selected cylinder group.
 456  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
 457  *
 458  * If the switch ufs_close_dirs is set, then the policy is to use
 459  * the current cg if it has more than 25% free inodes and more
 460  * than 25% free blocks. Otherwise the cgs are searched from
 461  * the beginning and the first cg with the same criteria is
 462  * used. If that is also null then we revert to the old algorithm.
 463  * This tends to cluster files at the beginning of the disk
 464  * until the disk gets full.
 465  *
 466  * Otherwise if ufs_close_dirs is not set then the original policy is
 467  * used which is to select from among those cylinder groups with
 468  * above the average number of free inodes, the one with the smallest
 469  * number of directories.
 470  */
 471
 472 int ufs_close_dirs = 1; /* allocate directories close as possible */
 473
 474 ino_t
 475 dirpref(inode_t *dp)
 476 {
 477         int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
 478         struct fs *fs = dp->i_fs;
 479
 480         cg = itog(fs, dp->i_number);
 481         mininode = fs->fs_ipg >> 2;
 482         minbpg = fs->fs_maxbpg >> 2;
 483         if (ufs_close_dirs &&
 484             (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
 485             (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
 486                 return (dp->i_number);
 487         }
 488
 489         avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 490         minndir = fs->fs_ipg;
 491         mincg = 0;
 492         for (cg = 0; cg < fs->fs_ncg; cg++) {
 493                 ifree = fs->fs_cs(fs, cg).cs_nifree;
 494                 if (ufs_close_dirs &&
 495                     (ifree > mininode) &&
 496                     (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
 497                         return ((ino_t)(fs->fs_ipg * cg));
 498                 }
 499                 if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
 500                     (ifree >= avgifree)) {
 501                         mincg = cg;
 502                         minndir = fs->fs_cs(fs, cg).cs_ndir;
 503                 }
 504         }
 505         return ((ino_t)(fs->fs_ipg * mincg));
 506 }
 507
 508 /*
 509  * Select the desired position for the next block in a file.  The file is
 510  * logically divided into sections. The first section is composed of the
 511  * direct blocks. Each additional section contains fs_maxbpg blocks.
 512  *
 513  * If no blocks have been allocated in the first section, the policy is to
 514  * request a block in the same cylinder group as the inode that describes
 515  * the file. If no blocks have been allocated in any other section, the
 516  * policy is to place the section in a cylinder group with a greater than
 517  * average number of free blocks.  An appropriate cylinder group is found
 518  * by using a rotor that sweeps the cylinder groups. When a new group of
 519  * blocks is needed, the sweep begins in the cylinder group following the
 520  * cylinder group from which the previous allocation was made. The sweep
 521  * continues until a cylinder group with greater than the average number
 522  * of free blocks is found. If the allocation is for the first block in an
 523  * indirect block, the information on the previous allocation is unavailable;
 524  * here a best guess is made based upon the logical block number being
 525  * allocated.
 526  *
 527  * If a section is already partially allocated, the policy is to
 528  * contiguously allocate fs_maxcontig blocks.  The end of one of these
 529  * contiguous blocks and the beginning of the next is physically separated
 530  * so that the disk head will be in transit between them for at least
 531  * fs_rotdelay milliseconds.  This is to allow time for the processor to
 532  * schedule another I/O transfer.
 533  */
 534 daddr_t
 535 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
 536 {
 537         struct fs *fs;
 538         struct ufsvfs *ufsvfsp;
 539         int cg;
 540         int avgbfree, startcg;
 541         daddr_t nextblk;
 542
 543         ufsvfsp = ip->i_ufsvfs;
 544         fs = ip->i_fs;
 545         if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
 546                 if (lbn < NDADDR) {
 547                         cg = itog(fs, ip->i_number);
 548                         return (fs->fs_fpg * cg + fs->fs_frag);
 549                 }
 550                 /*
 551                  * Find a cylinder with greater than average
 552                  * number of unused data blocks.
 553                  */
 554                 if (indx == 0 || bap[indx - 1] == 0)
 555                         startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
 556                 else
 557                         startcg = dtog(fs, bap[indx - 1]) + 1;
 558                 startcg %= fs->fs_ncg;
 559
 560                 mutex_enter(&ufsvfsp->vfs_lock);
 561                 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 562                 /*
 563                  * used for computing log space for writes/truncs
 564                  */
 565                 ufsvfsp->vfs_avgbfree = avgbfree;
 566                 for (cg = startcg; cg < fs->fs_ncg; cg++)
 567                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 568                                 fs->fs_cgrotor = cg;
 569                                 mutex_exit(&ufsvfsp->vfs_lock);
 570                                 return (fs->fs_fpg * cg + fs->fs_frag);
 571                         }
 572                 for (cg = 0; cg <= startcg; cg++)
 573                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 574                                 fs->fs_cgrotor = cg;
 575                                 mutex_exit(&ufsvfsp->vfs_lock);
 576                                 return (fs->fs_fpg * cg + fs->fs_frag);
 577                         }
 578                 mutex_exit(&ufsvfsp->vfs_lock);
 579                 return (NULL);
 580         }
 581         /*
 582          * One or more previous blocks have been laid out. If less
 583          * than fs_maxcontig previous blocks are contiguous, the
 584          * next block is requested contiguously, otherwise it is
 585          * requested rotationally delayed by fs_rotdelay milliseconds.
 586          */
 587
 588         nextblk = bap[indx - 1];
 589         /*
 590          * Provision for fallocate to return positive
 591          * blk preference based on last allocation
 592          */
 593         if (nextblk < 0 && nextblk != UFS_HOLE) {
 594                 nextblk = (-bap[indx - 1]) + fs->fs_frag;
 595         } else {
 596                 nextblk = bap[indx - 1] + fs->fs_frag;
 597         }
 598
 599         if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
 600             blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
 601                 return (nextblk);
 602         }
 603         if (fs->fs_rotdelay != 0)
 604                 /*
 605                  * Here we convert ms of delay to frags as:
 606                  * (frags) = (ms) * (rev/sec) * (sect/rev) /
 607                  *      ((sect/frag) * (ms/sec))
 608                  * then round up to the next block.
 609                  */
 610                 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
 611                     (NSPF(fs) * 1000), fs->fs_frag);
 612         return (nextblk);
 613 }
 614
 615 /*
 616  * Free a block or fragment.
 617  *
 618  * The specified block or fragment is placed back in the
 619  * free map. If a fragment is deallocated, a possible
 620  * block reassembly is checked.
 621  */
 622 void
 623 free(struct inode *ip, daddr_t bno, off_t size, int flags)
 624 {
 625         struct fs *fs = ip->i_fs;
 626         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 627         struct ufs_q *delq = &ufsvfsp->vfs_delete;
 628         struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
 629         struct cg *cgp;
 630         struct buf *bp;
 631         int cg, bmap, bbase;
 632         int i;
 633         uchar_t *blksfree;
 634         int *blktot;
 635         short *blks;
 636         daddr_t blkno, cylno, rpos;
 637
 638         /*
 639          * fallocate'd files will have negative block address.
 640          * So negate it again to get original block address.
 641          */
 642         if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
 643                 bno = -bno;
 644         }
 645
 646         if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 647                 (void) ufs_fault(ITOV(ip),
 648                     "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
 649                     "fs = %s\n", ip->i_dev, fs->fs_bsize,
 650                     (int)size, fs->fs_fsmnt);
 651                 return;
 652         }
 653         cg = dtog(fs, bno);
 654         ASSERT(!ufs_badblock(ip, bno));
 655         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
 656             (int)fs->fs_cgsize);
 657
 658         cgp = bp->b_un.b_cg;
 659         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 660                 brelse(bp);
 661                 return;
 662         }
 663
 664         if (!(flags & I_NOCANCEL))
 665                 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
 666         if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
 667                 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
 668         }
 669         blksfree = cg_blksfree(cgp);
 670         blktot = cg_blktot(cgp);
 671         mutex_enter(&ufsvfsp->vfs_lock);
 672         cgp->cg_time = gethrestime_sec();
 673         bno = dtogd(fs, bno);
 674         if (size == fs->fs_bsize) {
 675                 blkno = fragstoblks(fs, bno);
 676                 cylno = cbtocylno(fs, bno);
 677                 rpos = cbtorpos(ufsvfsp, bno);
 678                 blks = cg_blks(ufsvfsp, cgp, cylno);
 679                 if (!isclrblock(fs, blksfree, blkno)) {
 680                         mutex_exit(&ufsvfsp->vfs_lock);
 681                         brelse(bp);
 682                         (void) ufs_fault(ITOV(ip), "free: freeing free block, "
 683                             "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
 684                             ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
 685                         return;
 686                 }
 687                 setblock(fs, blksfree, blkno);
 688                 blks[rpos]++;
 689                 blktot[cylno]++;
 690                 cgp->cg_cs.cs_nbfree++;         /* Log below */
 691                 fs->fs_cstotal.cs_nbfree++;
 692                 fs->fs_cs(fs, cg).cs_nbfree++;
 693                 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
 694                         mutex_enter(&delq->uq_mutex);
 695                         delq_info->delq_unreclaimed_blocks -=
 696                             btodb(fs->fs_bsize);
 697                         mutex_exit(&delq->uq_mutex);
 698                 }
 699         } else {
 700                 bbase = bno - fragnum(fs, bno);
 701                 /*
 702                  * Decrement the counts associated with the old frags
 703                  */
 704                 bmap = blkmap(fs, blksfree, bbase);
 705                 fragacct(fs, bmap, cgp->cg_frsum, -1);
 706                 /*
 707                  * Deallocate the fragment
 708                  */
 709                 for (i = 0; i < numfrags(fs, size); i++) {
 710                         if (isset(blksfree, bno + i)) {
 711                                 brelse(bp);
 712                                 mutex_exit(&ufsvfsp->vfs_lock);
 713                                 (void) ufs_fault(ITOV(ip),
 714                                     "free: freeing free frag, "
 715                                     "dev:0x%lx, blk:%ld, cg:%d, "
 716                                     "ino:%lu, fs:%s",
 717                                     ip->i_dev,
 718                                     bno + i,
 719                                     cgp->cg_cgx,
 720                                     ip->i_number,
 721                                     fs->fs_fsmnt);
 722                                 return;
 723                         }
 724                         setbit(blksfree, bno + i);
 725                 }
 726                 cgp->cg_cs.cs_nffree += i;
 727                 fs->fs_cstotal.cs_nffree += i;
 728                 fs->fs_cs(fs, cg).cs_nffree += i;
 729                 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
 730                         mutex_enter(&delq->uq_mutex);
 731                         delq_info->delq_unreclaimed_blocks -=
 732                             btodb(i * fs->fs_fsize);
 733                         mutex_exit(&delq->uq_mutex);
 734                 }
 735                 /*
 736                  * Add back in counts associated with the new frags
 737                  */
 738                 bmap = blkmap(fs, blksfree, bbase);
 739                 fragacct(fs, bmap, cgp->cg_frsum, 1);
 740                 /*
 741                  * If a complete block has been reassembled, account for it
 742                  */
 743                 blkno = fragstoblks(fs, bbase);
 744                 if (isblock(fs, blksfree, blkno)) {
 745                         cylno = cbtocylno(fs, bbase);
 746                         rpos = cbtorpos(ufsvfsp, bbase);
 747                         blks = cg_blks(ufsvfsp, cgp, cylno);
 748                         blks[rpos]++;
 749                         blktot[cylno]++;
 750                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
 751                         fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 752                         fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
 753                         cgp->cg_cs.cs_nbfree++;
 754                         fs->fs_cstotal.cs_nbfree++;
 755                         fs->fs_cs(fs, cg).cs_nbfree++;
 756                 }
 757         }
 758         fs->fs_fmod = 1;
 759         ufs_notclean(ufsvfsp);
 760         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
 761         TRANS_SI(ufsvfsp, fs, cg);
 762         bdrwrite(bp);
 763 }
 764
 765 /*
 766  * Free an inode.
 767  *
 768  * The specified inode is placed back in the free map.
 769  */
 770 void
 771 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
 772 {
 773         struct fs *fs = ip->i_fs;
 774         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 775         struct cg *cgp;
 776         struct buf *bp;
 777         unsigned int inot;
 778         int cg;
 779         char *iused;
 780
 781         if (ip->i_number == ino && ip->i_mode != 0) {
 782                 (void) ufs_fault(ITOV(ip),
 783                     "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
 784                     "fs = %s\n",
 785                     ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
 786                 return;
 787         }
 788         if (ino >= fs->fs_ipg * fs->fs_ncg) {
 789                 (void) ufs_fault(ITOV(ip),
 790                     "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
 791                     (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
 792                 return;
 793         }
 794         cg = (int)itog(fs, ino);
 795         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
 796             (int)fs->fs_cgsize);
 797
 798         cgp = bp->b_un.b_cg;
 799         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 800                 brelse(bp);
 801                 return;
 802         }
 803         mutex_enter(&ufsvfsp->vfs_lock);
 804         cgp->cg_time = gethrestime_sec();
 805         iused = cg_inosused(cgp);
 806         inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
 807         if (isclr(iused, inot)) {
 808                 mutex_exit(&ufsvfsp->vfs_lock);
 809                 brelse(bp);
 810                 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
 811                     "mode: (imode) %o, (omode) %o, ino:%d, "
 812                     "fs:%s",
 813                     ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
 814                 return;
 815         }
 816         clrbit(iused, inot);
 817
 818         if (inot < (ulong_t)cgp->cg_irotor)
 819                 cgp->cg_irotor = inot;
 820         cgp->cg_cs.cs_nifree++;
 821         fs->fs_cstotal.cs_nifree++;
 822         fs->fs_cs(fs, cg).cs_nifree++;
 823         if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
 824                 cgp->cg_cs.cs_ndir--;
 825                 fs->fs_cstotal.cs_ndir--;
 826                 fs->fs_cs(fs, cg).cs_ndir--;
 827         }
 828         fs->fs_fmod = 1;
 829         ufs_notclean(ufsvfsp);
 830         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
 831         TRANS_SI(ufsvfsp, fs, cg);
 832         bdrwrite(bp);
 833 }
 834
 835 /*
 836  * Implement the cylinder overflow algorithm.
 837  *
 838  * The policy implemented by this algorithm is:
 839  *   1) allocate the block in its requested cylinder group.
 840  *   2) quadratically rehash on the cylinder group number.
 841  *   3) brute force search for a free block.
 842  * The size parameter means size for data blocks, mode for inodes.
 843  */
 844 static ino_t
 845 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
 846 {
 847         struct fs *fs;
 848         int i;
 849         long result;
 850         int icg = cg;
 851
 852         fs = ip->i_fs;
 853         /*
 854          * 1: preferred cylinder group
 855          */
 856         result = (*allocator)(ip, cg, pref, size);
 857         if (result)
 858                 return (result);
 859         /*
 860          * 2: quadratic rehash
 861          */
 862         for (i = 1; i < fs->fs_ncg; i *= 2) {
 863                 cg += i;
 864                 if (cg >= fs->fs_ncg)
 865                         cg -= fs->fs_ncg;
 866                 result = (*allocator)(ip, cg, 0, size);
 867                 if (result)
 868                         return (result);
 869         }
 870         /*
 871          * 3: brute force search
 872          * Note that we start at i == 2, since 0 was checked initially,
 873          * and 1 is always checked in the quadratic rehash.
 874          */
 875         cg = (icg + 2) % fs->fs_ncg;
 876         for (i = 2; i < fs->fs_ncg; i++) {
 877                 result = (*allocator)(ip, cg, 0, size);
 878                 if (result)
 879                         return (result);
 880                 cg++;
 881                 if (cg == fs->fs_ncg)
 882                         cg = 0;
 883         }
 884         return (NULL);
 885 }
 886
 887 /*
 888  * Determine whether a fragment can be extended.
 889  *
 890  * Check to see if the necessary fragments are available, and
 891  * if they are, allocate them.
 892  */
 893 static daddr_t
 894 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
 895 {
 896         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 897         struct fs *fs = ip->i_fs;
 898         struct buf *bp;
 899         struct cg *cgp;
 900         uchar_t *blksfree;
 901         long bno;
 902         int frags, bbase;
 903         int i, j;
 904
 905         if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
 906                 return (NULL);
 907         frags = numfrags(fs, nsize);
 908         bbase = (int)fragnum(fs, bprev);
 909         if (bbase > fragnum(fs, (bprev + frags - 1))) {
 910                 /* cannot extend across a block boundary */
 911                 return (NULL);
 912         }
 913
 914         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
 915             (int)fs->fs_cgsize);
 916         cgp = bp->b_un.b_cg;
 917         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 918                 brelse(bp);
 919                 return (NULL);
 920         }
 921
 922         blksfree = cg_blksfree(cgp);
 923         mutex_enter(&ufsvfsp->vfs_lock);
 924         bno = dtogd(fs, bprev);
 925         for (i = numfrags(fs, osize); i < frags; i++) {
 926                 if (isclr(blksfree, bno + i)) {
 927                         mutex_exit(&ufsvfsp->vfs_lock);
 928                         brelse(bp);
 929                         return (NULL);
 930                 }
 931                 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
 932                     fs->fs_fsize))) {
 933                         mutex_exit(&ufsvfsp->vfs_lock);
 934                         brelse(bp);
 935                         return (NULL);
 936                 }
 937         }
 938
 939         cgp->cg_time = gethrestime_sec();
 940         /*
 941          * The current fragment can be extended,
 942          * deduct the count on fragment being extended into
 943          * increase the count on the remaining fragment (if any)
 944          * allocate the extended piece.
 945          */
 946         for (i = frags; i < fs->fs_frag - bbase; i++)
 947                 if (isclr(blksfree, bno + i))
 948                         break;
 949         j = i - numfrags(fs, osize);
 950         cgp->cg_frsum[j]--;
 951         ASSERT(cgp->cg_frsum[j] >= 0);
 952         if (i != frags)
 953                 cgp->cg_frsum[i - frags]++;
 954         for (i = numfrags(fs, osize); i < frags; i++) {
 955                 clrbit(blksfree, bno + i);
 956                 cgp->cg_cs.cs_nffree--;
 957                 fs->fs_cs(fs, cg).cs_nffree--;
 958                 fs->fs_cstotal.cs_nffree--;
 959         }
 960         fs->fs_fmod = 1;
 961         ufs_notclean(ufsvfsp);
 962         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
 963         TRANS_SI(ufsvfsp, fs, cg);
 964         bdrwrite(bp);
 965         return ((daddr_t)bprev);
 966 }
 967
 968 /*
 969  * Determine whether a block can be allocated.
 970  *
 971  * Check to see if a block of the apprpriate size
 972  * is available, and if it is, allocate it.
 973  */
 974 static daddr_t
 975 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
 976 {
 977         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 978         struct fs *fs = ip->i_fs;
 979         struct buf *bp;
 980         struct cg *cgp;
 981         uchar_t *blksfree;
 982         int bno, frags;
 983         int allocsiz;
 984         int i;
 985
 986         /*
 987          * Searching for space could be time expensive so do some
 988          * up front checking to verify that there is actually space
 989          * available (free blocks or free frags).
 990          */
 991         if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
 992                 if (size == fs->fs_bsize)
 993                         return (0);
 994
 995                 /*
 996                  * If there are not enough free frags then return.
 997                  */
 998                 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
 999                         return (0);
1000         }
1001
1002         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1003             (int)fs->fs_cgsize);
1004
1005         cgp = bp->b_un.b_cg;
1006         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1007             (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1008                 brelse(bp);
1009                 return (0);
1010         }
1011         blksfree = cg_blksfree(cgp);
1012         mutex_enter(&ufsvfsp->vfs_lock);
1013         cgp->cg_time = gethrestime_sec();
1014         if (size == fs->fs_bsize) {
1015                 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1016                         goto errout;
1017                 fs->fs_fmod = 1;
1018                 ufs_notclean(ufsvfsp);
1019                 TRANS_SI(ufsvfsp, fs, cg);
1020                 bdrwrite(bp);
1021                 return (bno);
1022         }
1023         /*
1024          * Check fragment bitmap to see if any fragments are already available.
1025          * mapsearch() may fail because the fragment that fits this request
1026          * might still be on the cancel list and not available for re-use yet.
1027          * Look for a bigger sized fragment to allocate first before we have
1028          * to give up and fragment a whole new block eventually.
1029          */
1030         frags = numfrags(fs, size);
1031         allocsiz = frags;
1032 next_size:
1033         for (; allocsiz < fs->fs_frag; allocsiz++)
1034                 if (cgp->cg_frsum[allocsiz] != 0)
1035                         break;
1036
1037         if (allocsiz != fs->fs_frag) {
1038                 bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1039                 if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
1040                         allocsiz++;
1041                         goto next_size;
1042                 }
1043         }
1044
1045         if (allocsiz == fs->fs_frag || bno < 0) {
1046                 /*
1047                  * No fragments were available, so a block
1048                  * will be allocated and hacked up.
1049                  */
1050                 if (cgp->cg_cs.cs_nbfree == 0)
1051                         goto errout;
1052                 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1053                         goto errout;
1054                 bpref = dtogd(fs, bno);
1055                 for (i = frags; i < fs->fs_frag; i++)
1056                         setbit(blksfree, bpref + i);
1057                 i = fs->fs_frag - frags;
1058                 cgp->cg_cs.cs_nffree += i;
1059                 fs->fs_cstotal.cs_nffree += i;
1060                 fs->fs_cs(fs, cg).cs_nffree += i;
1061                 cgp->cg_frsum[i]++;
1062                 fs->fs_fmod = 1;
1063                 ufs_notclean(ufsvfsp);
1064                 TRANS_SI(ufsvfsp, fs, cg);
1065                 bdrwrite(bp);
1066                 return (bno);
1067         }
1068
1069         for (i = 0; i < frags; i++)
1070                 clrbit(blksfree, bno + i);
1071         cgp->cg_cs.cs_nffree -= frags;
1072         fs->fs_cstotal.cs_nffree -= frags;
1073         fs->fs_cs(fs, cg).cs_nffree -= frags;
1074         cgp->cg_frsum[allocsiz]--;
1075         ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1076         if (frags != allocsiz) {
1077                 cgp->cg_frsum[allocsiz - frags]++;
1078         }
1079         fs->fs_fmod = 1;
1080         ufs_notclean(ufsvfsp);
1081         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1082         TRANS_SI(ufsvfsp, fs, cg);
1083         bdrwrite(bp);
1084         return (cg * fs->fs_fpg + bno);
1085 errout:
1086         mutex_exit(&ufsvfsp->vfs_lock);
1087         brelse(bp);
1088         return (0);
1089 }
1090
1091 /*
1092  * Allocate a block in a cylinder group.
1093  *
1094  * This algorithm implements the following policy:
1095  *   1) allocate the requested block.
1096  *   2) allocate a rotationally optimal block in the same cylinder.
1097  *   3) allocate the next available block on the block rotor for the
1098  *      specified cylinder group.
1099  * Note that this routine only allocates fs_bsize blocks; these
1100  * blocks may be fragmented by the routine that allocates them.
1101  */
1102 static daddr_t
1103 alloccgblk(
1104         struct ufsvfs *ufsvfsp,
1105         struct cg *cgp,
1106         daddr_t bpref,
1107         struct buf *bp)
1108 {
1109         daddr_t bno;
1110         int cylno, pos, delta, rotbl_size;
1111         short *cylbp;
1112         int i;
1113         struct fs *fs;
1114         uchar_t *blksfree;
1115         daddr_t blkno, rpos, frag;
1116         short *blks;
1117         int32_t *blktot;
1118
1119         ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1120         fs = ufsvfsp->vfs_fs;
1121         blksfree = cg_blksfree(cgp);
1122         if (bpref == 0) {
1123                 bpref = cgp->cg_rotor;
1124                 goto norot;
1125         }
1126         bpref = blknum(fs, bpref);
1127         bpref = dtogd(fs, bpref);
1128         /*
1129          * If the requested block is available, use it.
1130          */
1131         if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
1132                 bno = bpref;
1133                 goto gotit;
1134         }
1135         /*
1136          * Check for a block available on the same cylinder.
1137          */
1138         cylno = cbtocylno(fs, bpref);
1139         if (cg_blktot(cgp)[cylno] == 0)
1140                 goto norot;
1141         if (fs->fs_cpc == 0) {
1142                 /*
1143                  * Block layout info is not available, so just
1144                  * have to take any block in this cylinder.
1145                  */
1146                 bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
1147                 goto norot;
1148         }
1149         /*
1150          * Check the summary information to see if a block is
1151          * available in the requested cylinder starting at the
1152          * requested rotational position and proceeding around.
1153          */
1154         cylbp = cg_blks(ufsvfsp, cgp, cylno);
1155         pos = cbtorpos(ufsvfsp, bpref);
1156         for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
1157                 if (cylbp[i] > 0)
1158                         break;
1159         if (i == ufsvfsp->vfs_nrpos)
1160                 for (i = 0; i < pos; i++)
1161                         if (cylbp[i] > 0)
1162                                 break;
1163         if (cylbp[i] > 0) {
1164                 /*
1165                  * Found a rotational position, now find the actual
1166                  * block.  A "panic" if none is actually there.
1167                  */
1168
1169                 /*
1170                  * Up to this point, "pos" has referred to the rotational
1171                  * position of the desired block.  From now on, it holds
1172                  * the offset of the current cylinder within a cylinder
1173                  * cycle.  (A cylinder cycle refers to a set of cylinders
1174                  * which are described by a single rotational table; the
1175                  * size of the cycle is fs_cpc.)
1176                  *
1177                  * bno is set to the block number of the first block within
1178                  * the current cylinder cycle.
1179                  */
1180
1181                 pos = cylno % fs->fs_cpc;
1182                 bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1183
1184                 /*
1185                  * The blocks within a cylinder are grouped into equivalence
1186                  * classes according to their "rotational position."  There
1187                  * are two tables used to determine these classes.
1188                  *
1189                  * The positional offset table (fs_postbl) has an entry for
1190                  * each rotational position of each cylinder in a cylinder
1191                  * cycle.  This entry contains the relative block number
1192                  * (counting from the start of the cylinder cycle) of the
1193                  * first block in the equivalence class for that position
1194                  * and that cylinder.  Positions for which no blocks exist
1195                  * are indicated by a -1.
1196                  *
1197                  * The rotational delta table (fs_rotbl) has an entry for
1198                  * each block in a cylinder cycle.  This entry contains
1199                  * the offset from that block to the next block in the
1200                  * same equivalence class.  The last block in the class
1201                  * is indicated by a zero in the table.
1202                  *
1203                  * The following code, then, walks through all of the blocks
1204                  * in the cylinder (cylno) which we're allocating within
1205                  * which are in the equivalence class for the rotational
1206                  * position (i) which we're allocating within.
1207                  */
1208
1209                 if (fs_postbl(ufsvfsp, pos)[i] == -1) {
1210                         (void) ufs_fault(ufsvfsp->vfs_root,
1211                             "alloccgblk: cyl groups corrupted, pos = %d, "
1212                             "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1213                         return (0);
1214                 }
1215
1216                 /*
1217                  * There is one entry in the rotational table for each block
1218                  * in the cylinder cycle.  These are whole blocks, not frags.
1219                  */
1220
1221                 rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
1222                     (fs->fs_fragshift + fs->fs_fsbtodb);
1223
1224                 /*
1225                  * As we start, "i" is the rotational position within which
1226                  * we're searching.  After the next line, it will be a block
1227                  * number (relative to the start of the cylinder cycle)
1228                  * within the equivalence class of that rotational position.
1229                  */
1230
1231                 i = fs_postbl(ufsvfsp, pos)[i];
1232
1233                 for (;;) {
1234                         if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
1235                                 bno = blkstofrags(fs, (bno + i));
1236                                 goto gotit;
1237                         }
1238                         delta = fs_rotbl(fs)[i];
1239                         if (delta <= 0 ||               /* End of chain, or */
1240                             delta + i > rotbl_size)     /* end of table? */
1241                                 break;                  /* If so, panic. */
1242                         i += delta;
1243                 }
1244                 (void) ufs_fault(ufsvfsp->vfs_root,
1245                     "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1246                     "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
1247                 return (0);
1248         }
1249 norot:
1250         /*
1251          * No blocks in the requested cylinder, so take
1252          * next available one in this cylinder group.
1253          */
1254         bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
1255         if (bno < 0)
1256                 return (0);
1257         cgp->cg_rotor = bno;
1258 gotit:
1259         blkno = fragstoblks(fs, bno);
1260         frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
1261         if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
1262                 goto norot;
1263         clrblock(fs, blksfree, (long)blkno);
1264         /*
1265          * the other cg/sb/si fields are TRANS'ed by the caller
1266          */
1267         cgp->cg_cs.cs_nbfree--;
1268         fs->fs_cstotal.cs_nbfree--;
1269         fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1270         cylno = cbtocylno(fs, bno);
1271         blks = cg_blks(ufsvfsp, cgp, cylno);
1272         rpos = cbtorpos(ufsvfsp, bno);
1273         blktot = cg_blktot(cgp);
1274         blks[rpos]--;
1275         blktot[cylno]--;
1276         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1277         fs->fs_fmod = 1;
1278         return (frag);
1279 }
1280
1281 /*
1282  * Determine whether an inode can be allocated.
1283  *
1284  * Check to see if an inode is available, and if it is,
1285  * allocate it using the following policy:
1286  *   1) allocate the requested inode.
1287  *   2) allocate the next available inode after the requested
1288  *      inode in the specified cylinder group.
1289  */
1290 static ino_t
1291 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1292 {
1293         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1294         struct fs *fs = ip->i_fs;
1295         struct cg *cgp;
1296         struct buf *bp;
1297         int start, len, loc, map, i;
1298         char *iused;
1299
1300         if (fs->fs_cs(fs, cg).cs_nifree == 0)
1301                 return (0);
1302         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1303             (int)fs->fs_cgsize);
1304
1305         cgp = bp->b_un.b_cg;
1306         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1307             cgp->cg_cs.cs_nifree == 0) {
1308                 brelse(bp);
1309                 return (0);
1310         }
1311         iused = cg_inosused(cgp);
1312         mutex_enter(&ufsvfsp->vfs_lock);
1313         /*
1314          * While we are waiting for the mutex, someone may have taken
1315          * the last available inode.  Need to recheck.
1316          */
1317         if (cgp->cg_cs.cs_nifree == 0) {
1318                 mutex_exit(&ufsvfsp->vfs_lock);
1319                 brelse(bp);
1320                 return (0);
1321         }
1322
1323         cgp->cg_time = gethrestime_sec();
1324         if (ipref) {
1325                 ipref %= fs->fs_ipg;
1326                 if (isclr(iused, ipref))
1327                         goto gotit;
1328         }
1329         start = cgp->cg_irotor / NBBY;
1330         len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1331         loc = skpc(0xff, (uint_t)len, &iused[start]);
1332         if (loc == 0) {
1333                 len = start + 1;
1334                 start = 0;
1335                 loc = skpc(0xff, (uint_t)len, &iused[0]);
1336                 if (loc == 0) {
1337                         mutex_exit(&ufsvfsp->vfs_lock);
1338                         (void) ufs_fault(ITOV(ip),
1339                             "ialloccg: map corrupted, cg = %d, irotor = %d, "
1340                             "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1341                         return (0);
1342                 }
1343         }
1344         i = start + len - loc;
1345         map = iused[i];
1346         ipref = i * NBBY;
1347         for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1348                 if ((map & i) == 0) {
1349                         cgp->cg_irotor = ipref;
1350                         goto gotit;
1351                 }
1352         }
1353
1354         mutex_exit(&ufsvfsp->vfs_lock);
1355         (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1356             fs->fs_fsmnt);
1357         return (0);
1358 gotit:
1359         setbit(iused, ipref);
1360         cgp->cg_cs.cs_nifree--;
1361         fs->fs_cstotal.cs_nifree--;
1362         fs->fs_cs(fs, cg).cs_nifree--;
1363         if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1364                 cgp->cg_cs.cs_ndir++;
1365                 fs->fs_cstotal.cs_ndir++;
1366                 fs->fs_cs(fs, cg).cs_ndir++;
1367         }
1368         fs->fs_fmod = 1;
1369         ufs_notclean(ufsvfsp);
1370         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1371         TRANS_SI(ufsvfsp, fs, cg);
1372         bdrwrite(bp);
1373         return (cg * fs->fs_ipg + ipref);
1374 }
1375
1376 /*
1377  * Find a block of the specified size in the specified cylinder group.
1378  *
1379  * It is a panic if a request is made to find a block if none are
1380  * available.
1381  */
1382 static daddr_t
1383 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
1384         int allocsiz)
1385 {
1386         struct fs *fs   = ufsvfsp->vfs_fs;
1387         daddr_t bno, cfrag;
1388         int start, len, loc, i, last, first, secondtime;
1389         int blk, field, subfield, pos;
1390         int gotit;
1391
1392         /*
1393          * ufsvfs->vfs_lock is held when calling this.
1394          */
1395         /*
1396          * Find the fragment by searching through the
1397          * free block map for an appropriate bit pattern.
1398          */
1399         if (bpref)
1400                 start = dtogd(fs, bpref) / NBBY;
1401         else
1402                 start = cgp->cg_frotor / NBBY;
1403         /*
1404          * the following loop performs two scans -- the first scan
1405          * searches the bottom half of the array for a match and the
1406          * second scan searches the top half of the array.  The loops
1407          * have been merged just to make things difficult.
1408          */
1409         first = start;
1410         last = howmany(fs->fs_fpg, NBBY);
1411         secondtime = 0;
1412         cfrag = cgp->cg_cgx * fs->fs_fpg;
1413         while (first < last) {
1414                 len = last - first;
1415                 /*
1416                  * search the array for a match
1417                  */
1418                 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
1419                     (uchar_t *)fragtbl[fs->fs_frag],
1420                     (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1421                 /*
1422                  * match found
1423                  */
1424                 if (loc) {
1425                         bno = (last - loc) * NBBY;
1426
1427                         /*
1428                          * Found the byte in the map, sift
1429                          * through the bits to find the selected frag
1430                          */
1431                         cgp->cg_frotor = bno;
1432                         gotit = 0;
1433                         for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1434                                 blk = blkmap(fs, cg_blksfree(cgp), bno);
1435                                 blk <<= 1;
1436                                 field = around[allocsiz];
1437                                 subfield = inside[allocsiz];
1438                                 for (pos = 0;
1439                                     pos <= fs->fs_frag - allocsiz;
1440                                     pos++) {
1441                                         if ((blk & field) == subfield) {
1442                                                 gotit++;
1443                                                 break;
1444                                         }
1445                                         field <<= 1;
1446                                         subfield <<= 1;
1447                                 }
1448                                 if (gotit)
1449                                         break;
1450                         }
1451                         bno += pos;
1452
1453                         /*
1454                          * success if block is *not* being converted from
1455                          * metadata into userdata (harpy).  If so, ignore.
1456                          */
1457                         if (!TRANS_ISCANCEL(ufsvfsp,
1458                             ldbtob(fsbtodb(fs, (cfrag+bno))),
1459                             allocsiz * fs->fs_fsize))
1460                                 return (bno);
1461
1462                         /*
1463                          * keep looking -- this block is being converted
1464                          */
1465                         first = (last - loc) + 1;
1466                         loc = 0;
1467                         if (first < last)
1468                                 continue;
1469                 }
1470                 /*
1471                  * no usable matches in bottom half -- now search the top half
1472                  */
1473                 if (secondtime)
1474                         /*
1475                          * no usable matches in top half -- all done
1476                          */
1477                         break;
1478                 secondtime = 1;
1479                 last = start + 1;
1480                 first = 0;
1481         }
1482         /*
1483          * no usable matches
1484          */
1485         return ((daddr_t)-1);
1486 }
1487
1488 #define UFSNADDR (NDADDR + NIADDR)      /* NADDR applies to (obsolete) S5FS */
1489 #define IB(i)   (NDADDR + (i))  /* index of i'th indirect block ptr */
1490 #define SINGLE  0               /* single indirect block ptr */
1491 #define DOUBLE  1               /* double indirect block ptr */
1492 #define TRIPLE  2               /* triple indirect block ptr */
1493
1494 /*
1495  * Acquire a write lock, and keep trying till we get it
1496  */
1497 static int
1498 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1499 {
1500         int err = 0;
1501
1502 lockagain:
1503         do {
1504                 err = ufs_fiolfss(vp, lf);
1505                 if (err)
1506                         return (err);
1507         } while (!LOCKFS_IS_ULOCK(lf));
1508
1509         lf->lf_lock = LOCKFS_WLOCK;
1510         lf->lf_flags = 0;
1511         lf->lf_comment = NULL;
1512         err = ufs__fiolfs(vp, lf, 1, 0);
1513
1514         if (err == EBUSY || err == EINVAL)
1515                 goto lockagain;
1516
1517         return (err);
1518 }
1519
1520 /*
1521  * Release the write lock
1522  */
1523 static int
1524 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1525 {
1526         int err = 0;
1527
1528         lf->lf_lock = LOCKFS_ULOCK;
1529         lf->lf_flags = 0;
1530         err = ufs__fiolfs(vp, lf, 1, 0);
1531         return (err);
1532 }
1533
1534 struct allocsp_undo {
1535         daddr_t offset;
1536         daddr_t blk;
1537         struct allocsp_undo *next;
1538 };
1539
1540 /*
1541  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1542  * file system. For direct blocks, the blocks are allocated from the offset
1543  * requested to the block boundary, then any full blocks are allocated,
1544  * and finally any remainder.
1545  * For indirect blocks the blocks are not initialized and are
1546  * only marked as allocated. These addresses are then stored as negative
1547  * block numbers in the inode to imply special handling. UFS has been modified
1548  * where necessary to understand this new notion.
1549  * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1550  */
1551 int
1552 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1553 {
1554         struct lockfs lf;
1555         int berr, err, resv, issync;
1556         off_t istart, len; /* istart, special for idb */
1557         struct inode *ip;
1558         struct fs *fs;
1559         struct ufsvfs *ufsvfsp;
1560         u_offset_t resid, i, uoff;
1561         daddr32_t db_undo[NDADDR];      /* old direct blocks */
1562         struct allocsp_undo *ib_undo = NULL;    /* ib undo */
1563         struct allocsp_undo *undo = NULL;
1564         u_offset_t osz;                 /* old file size */
1565         int chunkblks = 0;              /* # of blocks in 1 allocation */
1566         int cnt = 0;
1567         daddr_t allocblk;
1568         daddr_t totblks = 0;
1569         struct ulockfs  *ulp;
1570         size_t done_len;
1571         int nbytes, offsetn;
1572
1573
1574         ASSERT(vp->v_type == VREG);
1575
1576         ip = VTOI(vp);
1577         fs = ip->i_fs;
1578         if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1579                 err = EIO;
1580                 goto out_allocsp;
1581         }
1582
1583         istart = blkroundup(fs, (lp->l_start));
1584         len = blkroundup(fs, (lp->l_len));
1585         chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1586         ulp = &ufsvfsp->vfs_ulockfs;
1587
1588         if (lp->l_start < 0 || lp->l_len <= 0)
1589                 return (EINVAL);
1590
1591         /* Quickly check to make sure we have space before we proceed */
1592         if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1593                 if (TRANS_ISTRANS(ufsvfsp)) {
1594                         ufs_delete_drain_wait(ufsvfsp, 1);
1595                         if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1596                                 return (ENOSPC);
1597                 } else
1598                         return (ENOSPC);
1599         }
1600
1601         /*
1602          * We will keep i_rwlock locked as WRITER through out the function
1603          * since we don't want anyone else reading or writing to the inode
1604          * while we are in the middle of fallocating the file.
1605          */
1606         rw_enter(&ip->i_rwlock, RW_WRITER);
1607
1608         /* Back up the direct block list, used for undo later if necessary */
1609         rw_enter(&ip->i_contents, RW_READER);
1610         for (i = 0; i < NDADDR; i++)
1611                 db_undo[i] = ip->i_db[i];
1612         osz = ip->i_size;
1613         rw_exit(&ip->i_contents);
1614
1615         /* Write lock the file system */
1616         if (err = allocsp_wlockfs(vp, &lf))
1617                 goto exit;
1618
1619         /*
1620          * Allocate any direct blocks now.
1621          * Blocks are allocated from the offset requested to the block
1622          * boundary, then any full blocks are allocated, and finally any
1623          * remainder.
1624          */
1625         if (lblkno(fs, lp->l_start) < NDADDR) {
1626                 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1627                     &resv, &resid);
1628                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1629
1630                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1631                 rw_enter(&ip->i_contents, RW_WRITER);
1632
1633                 done_len = 0;
1634                 while ((done_len < lp->l_len) &&
1635                     (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
1636                         uoff = (offset_t)(lp->l_start + done_len);
1637                         offsetn = (int)blkoff(fs, uoff);
1638                         nbytes = (int)MIN(fs->fs_bsize - offsetn,
1639                             lp->l_len - done_len);
1640
1641                         berr = bmap_write(ip, uoff, offsetn + nbytes,
1642                             BI_FALLOCATE, &allocblk, cr);
1643                         /* Yikes error, quit */
1644                         if (berr) {
1645                                 TRANS_INODE(ufsvfsp, ip);
1646                                 rw_exit(&ip->i_contents);
1647                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1648                                 TRANS_END_CSYNC(ufsvfsp, err, issync,
1649                                     TOP_ALLOCSP, resv);
1650                                 err = allocsp_unlockfs(vp, &lf);
1651                                 goto exit;
1652                         }
1653
1654                         if (allocblk) {
1655                                 totblks++;
1656                                 if ((uoff + nbytes) > ip->i_size)
1657                                         ip->i_size = (uoff + nbytes);
1658                         }
1659                         done_len += nbytes;
1660                 }
1661
1662                 TRANS_INODE(ufsvfsp, ip);
1663                 rw_exit(&ip->i_contents);
1664                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1665                 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1666
1667                 /* start offset for indirect allocation */
1668                 istart =  (uoff + nbytes);
1669         }
1670
1671         /* Break the transactions into vfs_iotransz units */
1672         ufs_trans_trunc_resv(ip, ip->i_size +
1673             blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1674         TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1675
1676         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1677         rw_enter(&ip->i_contents, RW_WRITER);
1678
1679         /* Now go about fallocating necessary indirect blocks */
1680         for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
1681                 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1682                     &allocblk, cr);
1683                 if (berr) {
1684                         TRANS_INODE(ufsvfsp, ip);
1685                         rw_exit(&ip->i_contents);
1686                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1687                         TRANS_END_CSYNC(ufsvfsp, err, issync,
1688                             TOP_ALLOCSP, resv);
1689                         err = allocsp_unlockfs(vp, &lf);
1690                         goto exit;
1691                 }
1692
1693                 /* Update the blk counter only if new block was added */
1694                 if (allocblk) {
1695                         /* Save undo information */
1696                         undo = kmem_alloc(sizeof (struct allocsp_undo),
1697                             KM_SLEEP);
1698                         undo->offset = i;
1699                         undo->blk = allocblk;
1700                         undo->next = ib_undo;
1701                         ib_undo = undo;
1702                         totblks++;
1703
1704                         if (i >= ip->i_size)
1705                                 ip->i_size += fs->fs_bsize;
1706                 }
1707                 cnt++;
1708
1709                 /* Being a good UFS citizen, let others get a share */
1710                 if (cnt == chunkblks) {
1711                         /*
1712                          * If there are waiters or the fs is hard locked,
1713                          * error locked, or read-only error locked,
1714                          * quit with EIO
1715                          */
1716                         if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1717                             ULOCKFS_IS_ROELOCK(ulp)) {
1718                                 ip->i_cflags |= IFALLOCATE;
1719                                 TRANS_INODE(ufsvfsp, ip);
1720                                 rw_exit(&ip->i_contents);
1721                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1722
1723                                 TRANS_END_CSYNC(ufsvfsp, err, issync,
1724                                     TOP_ALLOCSP, resv);
1725                                 rw_exit(&ip->i_rwlock);
1726                                 (void) allocsp_unlockfs(vp, &lf);
1727                                 return (EIO);
1728                         }
1729
1730                         TRANS_INODE(ufsvfsp, ip);
1731                         rw_exit(&ip->i_contents);
1732                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1733
1734                         /* End the current transaction */
1735                         TRANS_END_CSYNC(ufsvfsp, err, issync,
1736                             TOP_ALLOCSP, resv);
1737
1738                         if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1739                                 /* Release the write lock */
1740                                 if (err = allocsp_unlockfs(vp, &lf))
1741                                         goto exit;
1742
1743                                 /* Wake up others waiting to do operations */
1744                                 mutex_enter(&ulp->ul_lock);
1745                                 cv_broadcast(&ulp->ul_cv);
1746                                 mutex_exit(&ulp->ul_lock);
1747
1748                                 /* Grab the write lock again */
1749                                 if (err = allocsp_wlockfs(vp, &lf))
1750                                         goto exit;
1751                         } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1752
1753                         /* Reserve more space in log for this file */
1754                         ufs_trans_trunc_resv(ip,
1755                             ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1756                             &resv, &resid);
1757                         TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1758
1759                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1760                         rw_enter(&ip->i_contents, RW_WRITER);
1761
1762                         cnt = 0;        /* reset cnt b/c of new transaction */
1763                 }
1764         }
1765
1766         if (!err && !berr)
1767                 ip->i_cflags |= IFALLOCATE;
1768
1769         /* If the file has grown then correct the file size */
1770         if (osz < (lp->l_start + lp->l_len))
1771                 ip->i_size = (lp->l_start + lp->l_len);
1772
1773         /* Release locks, end log transaction and unlock fs */
1774         TRANS_INODE(ufsvfsp, ip);
1775         rw_exit(&ip->i_contents);
1776         rw_exit(&ufsvfsp->vfs_dqrwlock);
1777
1778         TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1779         err = allocsp_unlockfs(vp, &lf);
1780
1781         /*
1782          * @ exit label, we should no longer be holding the fs write lock, and
1783          * all logging transactions should have been ended. We still hold
1784          * ip->i_rwlock.
1785          */
1786 exit:
1787         /*
1788          * File has grown larger than 2GB. Set flag
1789          * in superblock to indicate this, if it
1790          * is not already set.
1791          */
1792         if ((ip->i_size > MAXOFF32_T) &&
1793             !(fs->fs_flags & FSLARGEFILES)) {
1794                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1795                 mutex_enter(&ufsvfsp->vfs_lock);
1796                 fs->fs_flags |= FSLARGEFILES;
1797                 ufs_sbwrite(ufsvfsp);
1798                 mutex_exit(&ufsvfsp->vfs_lock);
1799         }
1800
1801         /*
1802          * Since we couldn't allocate completely, we will undo the allocations.
1803          */
1804         if (berr) {
1805                 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1806                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1807
1808                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1809                 rw_enter(&ip->i_contents, RW_WRITER);
1810
1811                 /* Direct blocks */
1812                 for (i = 0; i < NDADDR; i++) {
1813                         /*
1814                          * Only free the block if they are not same, and
1815                          * the old one isn't zero (the fragment was
1816                          * re-allocated).
1817                          */
1818                         if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1819                                 free(ip, ip->i_db[i], fs->fs_bsize, 0);
1820                                 ip->i_db[i] = 0;
1821                         }
1822                 }
1823
1824                 /* Undo the indirect blocks */
1825                 while (ib_undo != NULL) {
1826                         undo = ib_undo;
1827                         err = bmap_set_bn(vp, undo->offset, 0);
1828                         if (err)
1829                                 cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1830                                     "undo allocation of block %ld",
1831                                     undo->offset);
1832                         free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1833                         ib_undo = undo->next;
1834                         kmem_free(undo, sizeof (struct allocsp_undo));
1835                 }
1836
1837                 ip->i_size = osz;
1838                 TRANS_INODE(ufsvfsp, ip);
1839
1840                 rw_exit(&ip->i_contents);
1841                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1842
1843                 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1844
1845                 rw_exit(&ip->i_rwlock);
1846                 return (berr);
1847         }
1848
1849         /*
1850          * Don't forget to free the undo chain :)
1851          */
1852         while (ib_undo != NULL) {
1853                 undo = ib_undo;
1854                 ib_undo = undo->next;
1855                 kmem_free(undo, sizeof (struct allocsp_undo));
1856         }
1857
1858         rw_exit(&ip->i_rwlock);
1859
1860 out_allocsp:
1861         return (err);
1862 }
1863
1864 /*
1865  * Free storage space associated with the specified inode.  The portion
1866  * to be freed is specified by lp->l_start and lp->l_len (already
1867  * normalized to a "whence" of 0).
1868  *
1869  * This is an experimental facility whose continued existence is not
1870  * guaranteed.  Currently, we only support the special case
1871  * of l_len == 0, meaning free to end of file.
1872  *
1873  * Blocks are freed in reverse order.  This FILO algorithm will tend to
1874  * maintain a contiguous free list much longer than FIFO.
1875  * See also ufs_itrunc() in ufs_inode.c.
1876  *
1877  * Bug: unused bytes in the last retained block are not cleared.
1878  * This may result in a "hole" in the file that does not read as zeroes.
1879  */
1880 /* ARGSUSED */
1881 int
1882 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1883 {
1884         int i;
1885         struct inode *ip = VTOI(vp);
1886         int error;
1887
1888         ASSERT(vp->v_type == VREG);
1889         ASSERT(lp->l_start >= 0);       /* checked by convoff */
1890
1891         if (lp->l_len != 0)
1892                 return (EINVAL);
1893
1894         rw_enter(&ip->i_contents, RW_READER);
1895         if (ip->i_size == (u_offset_t)lp->l_start) {
1896                 rw_exit(&ip->i_contents);
1897                 return (0);
1898         }
1899
1900         /*
1901          * Check if there is any active mandatory lock on the
1902          * range that will be truncated/expanded.
1903          */
1904         if (MANDLOCK(vp, ip->i_mode)) {
1905                 offset_t save_start;
1906
1907                 save_start = lp->l_start;
1908
1909                 if (ip->i_size < lp->l_start) {
1910                         /*
1911                          * "Truncate up" case: need to make sure there
1912                          * is no lock beyond current end-of-file. To
1913                          * do so, we need to set l_start to the size
1914                          * of the file temporarily.
1915                          */
1916                         lp->l_start = ip->i_size;
1917                 }
1918                 lp->l_type = F_WRLCK;
1919                 lp->l_sysid = 0;
1920                 lp->l_pid = ttoproc(curthread)->p_pid;
1921                 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1922                 rw_exit(&ip->i_contents);
1923                 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1924                     lp->l_type != F_UNLCK) {
1925                         return (i ? i : EAGAIN);
1926                 }
1927                 rw_enter(&ip->i_contents, RW_READER);
1928
1929                 lp->l_start = save_start;
1930         }
1931
1932         /*
1933          * Make sure a write isn't in progress (allocating blocks)
1934          * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1935          * truncate while it was allocating blocks).
1936          * Grab the locks in the right order.
1937          */
1938         rw_exit(&ip->i_contents);
1939         rw_enter(&ip->i_rwlock, RW_WRITER);
1940         error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1941         rw_exit(&ip->i_rwlock);
1942         return (error);
1943 }
1944
1945 /*
1946  * Find a cg with as close to nb contiguous bytes as possible
1947  *      THIS MAY TAKE MANY DISK READS!
1948  *
1949  * Implemented in an attempt to allocate contiguous blocks for
1950  * writing the ufs log file to, minimizing future disk head seeking
1951  */
1952 daddr_t
1953 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
1954 {
1955         struct fs       *fs     = ufsvfsp->vfs_fs;
1956         daddr_t         nblk    = lblkno(fs, blkroundup(fs, nb));
1957         daddr_t         minblk  = lblkno(fs, blkroundup(fs, minb));
1958         daddr_t         savebno, curbno, cgbno;
1959         int             cg, cgblks, savecg, savenblk, curnblk, startcg;
1960         uchar_t         *blksfree;
1961         buf_t           *bp;
1962         struct cg       *cgp;
1963
1964         savenblk = 0;
1965         savecg = 0;
1966         savebno = 0;
1967
1968         if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
1969                 cg = 0; /* Nothing suitable found */
1970         else
1971                 cg = startcg;
1972
1973         for (; cg < fs->fs_ncg; ++cg) {
1974                 /*
1975                  * find the largest contiguous range in this cg
1976                  */
1977                 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1978                     (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1979                     (int)fs->fs_cgsize);
1980                 cgp = bp->b_un.b_cg;
1981                 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1982                         brelse(bp);
1983                         continue;
1984                 }
1985                 blksfree = cg_blksfree(cgp);        /* free array */
1986                 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1987                 cgbno = 0;
1988                 while (cgbno < cgblks && savenblk < nblk) {
1989                         /* find a free block */
1990                         for (; cgbno < cgblks; ++cgbno) {
1991                                 if (isblock(fs, blksfree, cgbno)) {
1992                                         if (startcg != -1) {
1993                                                 brelse(bp);
1994                                                 savecg = startcg;
1995                                                 savebno = cgbno;
1996                                                 goto done;
1997                                         } else
1998                                                 break;
1999                                 }
2000                         }
2001                         curbno = cgbno;
2002                         /* count the number of free blocks */
2003                         for (curnblk = 0; cgbno < cgblks; ++cgbno) {
2004                                 if (!isblock(fs, blksfree, cgbno))
2005                                         break;
2006                                 if (++curnblk >= nblk)
2007                                         break;
2008                         }
2009                         if (curnblk > savenblk) {
2010                                 savecg = cg;
2011                                 savenblk = curnblk;
2012                                 savebno = curbno;
2013                         }
2014                 }
2015                 brelse(bp);
2016                 if (savenblk >= nblk)
2017                         break;
2018         }
2019
2020 done:
2021
2022         /* convert block offset in cg to frag offset in cg */
2023         savebno = blkstofrags(fs, savebno);
2024
2025         /* convert frag offset in cg to frag offset in fs */
2026         savebno += (savecg * fs->fs_fpg);
2027
2028         return (savebno);
2029 }
2030
2031 /*
2032  * The object of this routine is to find a start point for the UFS log.
2033  * Ideally the space should be allocated from the smallest possible number
2034  * of contiguous cylinder groups. This is found by using a sliding window
2035  * technique. The smallest window of contiguous cylinder groups, which is
2036  * still able to accommodate the target, is found by moving the window
2037  * through the cylinder groups in a single pass. The end of the window is
2038  * advanced until the space is accommodated, then the start is advanced until
2039  * it no longer fits, the end is then advanced again and so on until the
2040  * final cylinder group is reached. The first suitable instance is recorded
2041  * and its starting cg number is returned.
2042  *
2043  * If we are not able to find a minimum amount of space, represented by
2044  * minblk, or to do so uses more than the available extents, then return -1.
2045  */
2046
2047 int
2048 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
2049 {
2050         int      ncgs;           /* number of cylinder groups */
2051         daddr_t target;          /* amount of space sought */
2052         int      cwidth, ctotal; /* current window width and total */
2053         int      bwidth, btotal; /* best window width and total so far */
2054         int      s;     /* index of the first element in the current window */
2055         int      e;     /* index of the first element + the width */
2056                         /*  (i.e. 1 + index of last element) */
2057         int      bs; /* index of the first element in the best window so far */
2058         int      header, max_extents;
2059
2060         target = requested;
2061         ncgs = fs->fs_ncg;
2062
2063         header = sizeof (extent_block_t) - sizeof (extent_t);
2064         max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
2065         cwidth = ctotal = 0;
2066         btotal = -1;
2067         bwidth = ncgs;
2068         s = e = 0;
2069         while (e < ncgs) {
2070         /* Advance the end of the window until it accommodates the target. */
2071                 while (ctotal < target && e < ncgs) {
2072                         ctotal += fs->fs_cs(fs, e).cs_nbfree;
2073                         e++;
2074                 }
2075
2076                 /*
2077                  * Advance the start of the window until it no longer
2078                  * accommodates the target.
2079                  */
2080                 while (ctotal >= target && s < e) {
2081                         /* See if this is the smallest window so far. */
2082                         cwidth = e - s;
2083                         if (cwidth <= bwidth) {
2084                                 if (cwidth == bwidth && ctotal <= btotal)
2085                                         goto more;
2086                                 bwidth = cwidth;
2087                                 btotal = ctotal;
2088                                 bs = s;
2089                         }
2090 more:
2091                         ctotal -= fs->fs_cs(fs, s).cs_nbfree;
2092                         s++;
2093                 }
2094         }
2095
2096         /*
2097          * If we cannot allocate the minimum required or we use too many
2098          * extents to do so, return -1.
2099          */
2100         if (btotal < minblk || bwidth > max_extents)
2101                 bs = -1;
2102
2103         return (bs);
2104 }