sys/ufs/ffs/ffs_alloc.c

   1 /*      $NetBSD: ffs_alloc.c,v 1.145 2013/11/12 03:29:22 dholland Exp $ */
   2
   3 /*-
   4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Wasabi Systems, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * Copyright (c) 2002 Networks Associates Technology, Inc.
  34  * All rights reserved.
  35  *
  36  * This software was developed for the FreeBSD Project by Marshall
  37  * Kirk McKusick and Network Associates Laboratories, the Security
  38  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  39  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  40  * research program
  41  *
  42  * Copyright (c) 1982, 1986, 1989, 1993
  43  *      The Regents of the University of California.  All rights reserved.
  44  *
  45  * Redistribution and use in source and binary forms, with or without
  46  * modification, are permitted provided that the following conditions
  47  * are met:
  48  * 1. Redistributions of source code must retain the above copyright
  49  *    notice, this list of conditions and the following disclaimer.
  50  * 2. Redistributions in binary form must reproduce the above copyright
  51  *    notice, this list of conditions and the following disclaimer in the
  52  *    documentation and/or other materials provided with the distribution.
  53  * 3. Neither the name of the University nor the names of its contributors
  54  *    may be used to endorse or promote products derived from this software
  55  *    without specific prior written permission.
  56  *
  57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  67  * SUCH DAMAGE.
  68  *
  69  *      @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
  70  */
  71
  72 #include <sys/cdefs.h>
  73 __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.145 2013/11/12 03:29:22 dholland Exp $");
  74
  75 #if defined(_KERNEL_OPT)
  76 #include "opt_ffs.h"
  77 #include "opt_quota.h"
  78 #include "opt_uvm_page_trkown.h"
  79 #endif
  80
  81 #include <sys/param.h>
  82 #include <sys/systm.h>
  83 #include <sys/buf.h>
  84 #include <sys/cprng.h>
  85 #include <sys/fstrans.h>
  86 #include <sys/kauth.h>
  87 #include <sys/kernel.h>
  88 #include <sys/mount.h>
  89 #include <sys/proc.h>
  90 #include <sys/syslog.h>
  91 #include <sys/vnode.h>
  92 #include <sys/wapbl.h>
  93
  94 #include <miscfs/specfs/specdev.h>
  95 #include <ufs/ufs/quota.h>
  96 #include <ufs/ufs/ufsmount.h>
  97 #include <ufs/ufs/inode.h>
  98 #include <ufs/ufs/ufs_extern.h>
  99 #include <ufs/ufs/ufs_bswap.h>
 100 #include <ufs/ufs/ufs_wapbl.h>
 101
 102 #include <ufs/ffs/fs.h>
 103 #include <ufs/ffs/ffs_extern.h>
 104
 105 #ifdef UVM_PAGE_TRKOWN
 106 #include <uvm/uvm.h>
 107 #endif
 108
 109 static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int);
 110 static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
 111 static ino_t ffs_dirpref(struct inode *);
 112 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
 113 static void ffs_fserr(struct fs *, u_int, const char *);
 114 static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
 115     daddr_t (*)(struct inode *, int, daddr_t, int, int));
 116 static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
 117 static int32_t ffs_mapsearch(struct fs *, struct cg *,
 118                                       daddr_t, int);
 119 static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
 120     daddr_t, long, bool);
 121 static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
 122     int, bool);
 123
 124 /* if 1, changes in optimalization strategy are logged */
 125 int ffs_log_changeopt = 0;
 126
 127 /* in ffs_tables.c */
 128 extern const int inside[], around[];
 129 extern const u_char * const fragtbl[];
 130
 131 /* Basic consistency check for block allocations */
 132 static int
 133 ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
 134     long size, dev_t dev, ino_t inum)
 135 {
 136         if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
 137             ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
 138                 printf("dev = 0x%llx, bno = %" PRId64 " bsize = %d, "
 139                     "size = %ld, fs = %s\n",
 140                     (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
 141                 panic("%s: bad size", func);
 142         }
 143
 144         if (bno >= fs->fs_size) {
 145                 printf("bad block %" PRId64 ", ino %llu\n", bno,
 146                     (unsigned long long)inum);
 147                 ffs_fserr(fs, inum, "bad block");
 148                 return EINVAL;
 149         }
 150         return 0;
 151 }
 152
 153 /*
 154  * Allocate a block in the file system.
 155  *
 156  * The size of the requested block is given, which must be some
 157  * multiple of fs_fsize and <= fs_bsize.
 158  * A preference may be optionally specified. If a preference is given
 159  * the following hierarchy is used to allocate a block:
 160  *   1) allocate the requested block.
 161  *   2) allocate a rotationally optimal block in the same cylinder.
 162  *   3) allocate a block in the same cylinder group.
 163  *   4) quadradically rehash into other cylinder groups, until an
 164  *      available block is located.
 165  * If no block preference is given the following hierarchy is used
 166  * to allocate a block:
 167  *   1) allocate a block in the cylinder group that contains the
 168  *      inode for the file.
 169  *   2) quadradically rehash into other cylinder groups, until an
 170  *      available block is located.
 171  *
 172  * => called with um_lock held
 173  * => releases um_lock before returning
 174  */
 175 int
 176 ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags,
 177     kauth_cred_t cred, daddr_t *bnp)
 178 {
 179         struct ufsmount *ump;
 180         struct fs *fs;
 181         daddr_t bno;
 182         int cg;
 183 #if defined(QUOTA) || defined(QUOTA2)
 184         int error;
 185 #endif
 186
 187         fs = ip->i_fs;
 188         ump = ip->i_ump;
 189
 190         KASSERT(mutex_owned(&ump->um_lock));
 191
 192 #ifdef UVM_PAGE_TRKOWN
 193
 194         /*
 195          * Sanity-check that allocations within the file size
 196          * do not allow other threads to read the stale contents
 197          * of newly allocated blocks.
 198          * Usually pages will exist to cover the new allocation.
 199          * There is an optimization in ffs_write() where we skip
 200          * creating pages if several conditions are met:
 201          *  - the file must not be mapped (in any user address space).
 202          *  - the write must cover whole pages and whole blocks.
 203          * If those conditions are not met then pages must exist and
 204          * be locked by the current thread.
 205          */
 206
 207         if (ITOV(ip)->v_type == VREG &&
 208             ffs_lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) {
 209                 struct vm_page *pg;
 210                 struct vnode *vp = ITOV(ip);
 211                 struct uvm_object *uobj = &vp->v_uobj;
 212                 voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
 213                 voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);
 214
 215                 mutex_enter(uobj->vmobjlock);
 216                 while (off < endoff) {
 217                         pg = uvm_pagelookup(uobj, off);
 218                         KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == 0 &&
 219                                  (size & PAGE_MASK) == 0 &&
 220                                  ffs_blkoff(fs, size) == 0) ||
 221                                 (pg != NULL && pg->owner == curproc->p_pid &&
 222                                  pg->lowner == curlwp->l_lid));
 223                         off += PAGE_SIZE;
 224                 }
 225                 mutex_exit(uobj->vmobjlock);
 226         }
 227 #endif
 228
 229         *bnp = 0;
 230 #ifdef DIAGNOSTIC
 231         if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0) {
 232                 printf("dev = 0x%llx, bsize = %d, size = %d, fs = %s\n",
 233                     (unsigned long long)ip->i_dev, fs->fs_bsize, size,
 234                     fs->fs_fsmnt);
 235                 panic("ffs_alloc: bad size");
 236         }
 237         if (cred == NOCRED)
 238                 panic("ffs_alloc: missing credential");
 239 #endif /* DIAGNOSTIC */
 240         if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 241                 goto nospace;
 242         if (freespace(fs, fs->fs_minfree) <= 0 &&
 243             kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
 244             NULL, NULL) != 0)
 245                 goto nospace;
 246 #if defined(QUOTA) || defined(QUOTA2)
 247         mutex_exit(&ump->um_lock);
 248         if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
 249                 return (error);
 250         mutex_enter(&ump->um_lock);
 251 #endif
 252
 253         if (bpref >= fs->fs_size)
 254                 bpref = 0;
 255         if (bpref == 0)
 256                 cg = ino_to_cg(fs, ip->i_number);
 257         else
 258                 cg = dtog(fs, bpref);
 259         bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg);
 260         if (bno > 0) {
 261                 DIP_ADD(ip, blocks, btodb(size));
 262                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 263                 *bnp = bno;
 264                 return (0);
 265         }
 266 #if defined(QUOTA) || defined(QUOTA2)
 267         /*
 268          * Restore user's disk quota because allocation failed.
 269          */
 270         (void) chkdq(ip, -btodb(size), cred, FORCE);
 271 #endif
 272         if (flags & B_CONTIG) {
 273                 /*
 274                  * XXX ump->um_lock handling is "suspect" at best.
 275                  * For the case where ffs_hashalloc() fails early
 276                  * in the B_CONTIG case we reach here with um_lock
 277                  * already unlocked, so we can't release it again
 278                  * like in the normal error path.  See kern/39206.
 279                  *
 280                  *
 281                  * Fail silently - it's up to our caller to report
 282                  * errors.
 283                  */
 284                 return (ENOSPC);
 285         }
 286 nospace:
 287         mutex_exit(&ump->um_lock);
 288         ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
 289         uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
 290         return (ENOSPC);
 291 }
 292
 293 /*
 294  * Reallocate a fragment to a bigger size
 295  *
 296  * The number and size of the old block is given, and a preference
 297  * and new size is also specified. The allocator attempts to extend
 298  * the original block. Failing that, the regular block allocator is
 299  * invoked to get an appropriate block.
 300  *
 301  * => called with um_lock held
 302  * => return with um_lock released
 303  */
 304 int
 305 ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
 306     int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop)
 307 {
 308         struct ufsmount *ump;
 309         struct fs *fs;
 310         struct buf *bp;
 311         int cg, request, error;
 312         daddr_t bprev, bno;
 313
 314         fs = ip->i_fs;
 315         ump = ip->i_ump;
 316
 317         KASSERT(mutex_owned(&ump->um_lock));
 318
 319 #ifdef UVM_PAGE_TRKOWN
 320
 321         /*
 322          * Sanity-check that allocations within the file size
 323          * do not allow other threads to read the stale contents
 324          * of newly allocated blocks.
 325          * Unlike in ffs_alloc(), here pages must always exist
 326          * for such allocations, because only the last block of a file
 327          * can be a fragment and ffs_write() will reallocate the
 328          * fragment to the new size using ufs_balloc_range(),
 329          * which always creates pages to cover blocks it allocates.
 330          */
 331
 332         if (ITOV(ip)->v_type == VREG) {
 333                 struct vm_page *pg;
 334                 struct uvm_object *uobj = &ITOV(ip)->v_uobj;
 335                 voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
 336                 voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);
 337
 338                 mutex_enter(uobj->vmobjlock);
 339                 while (off < endoff) {
 340                         pg = uvm_pagelookup(uobj, off);
 341                         KASSERT(pg->owner == curproc->p_pid &&
 342                                 pg->lowner == curlwp->l_lid);
 343                         off += PAGE_SIZE;
 344                 }
 345                 mutex_exit(uobj->vmobjlock);
 346         }
 347 #endif
 348
 349 #ifdef DIAGNOSTIC
 350         if ((u_int)osize > fs->fs_bsize || ffs_fragoff(fs, osize) != 0 ||
 351             (u_int)nsize > fs->fs_bsize || ffs_fragoff(fs, nsize) != 0) {
 352                 printf(
 353                     "dev = 0x%llx, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
 354                     (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
 355                     fs->fs_fsmnt);
 356                 panic("ffs_realloccg: bad size");
 357         }
 358         if (cred == NOCRED)
 359                 panic("ffs_realloccg: missing credential");
 360 #endif /* DIAGNOSTIC */
 361         if (freespace(fs, fs->fs_minfree) <= 0 &&
 362             kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
 363             NULL, NULL) != 0) {
 364                 mutex_exit(&ump->um_lock);
 365                 goto nospace;
 366         }
 367         if (fs->fs_magic == FS_UFS2_MAGIC)
 368                 bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs));
 369         else
 370                 bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs));
 371
 372         if (bprev == 0) {
 373                 printf("dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s\n",
 374                     (unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
 375                     fs->fs_fsmnt);
 376                 panic("ffs_realloccg: bad bprev");
 377         }
 378         mutex_exit(&ump->um_lock);
 379
 380         /*
 381          * Allocate the extra space in the buffer.
 382          */
 383         if (bpp != NULL &&
 384             (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) {
 385                 return (error);
 386         }
 387 #if defined(QUOTA) || defined(QUOTA2)
 388         if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
 389                 if (bpp != NULL) {
 390                         brelse(bp, 0);
 391                 }
 392                 return (error);
 393         }
 394 #endif
 395         /*
 396          * Check for extension in the existing location.
 397          */
 398         cg = dtog(fs, bprev);
 399         mutex_enter(&ump->um_lock);
 400         if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
 401                 DIP_ADD(ip, blocks, btodb(nsize - osize));
 402                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 403
 404                 if (bpp != NULL) {
 405                         if (bp->b_blkno != FFS_FSBTODB(fs, bno))
 406                                 panic("bad blockno");
 407                         allocbuf(bp, nsize, 1);
 408                         memset((char *)bp->b_data + osize, 0, nsize - osize);
 409                         mutex_enter(bp->b_objlock);
 410                         KASSERT(!cv_has_waiters(&bp->b_done));
 411                         bp->b_oflags |= BO_DONE;
 412                         mutex_exit(bp->b_objlock);
 413                         *bpp = bp;
 414                 }
 415                 if (blknop != NULL) {
 416                         *blknop = bno;
 417                 }
 418                 return (0);
 419         }
 420         /*
 421          * Allocate a new disk location.
 422          */
 423         if (bpref >= fs->fs_size)
 424                 bpref = 0;
 425         switch ((int)fs->fs_optim) {
 426         case FS_OPTSPACE:
 427                 /*
 428                  * Allocate an exact sized fragment. Although this makes
 429                  * best use of space, we will waste time relocating it if
 430                  * the file continues to grow. If the fragmentation is
 431                  * less than half of the minimum free reserve, we choose
 432                  * to begin optimizing for time.
 433                  */
 434                 request = nsize;
 435                 if (fs->fs_minfree < 5 ||
 436                     fs->fs_cstotal.cs_nffree >
 437                     fs->fs_dsize * fs->fs_minfree / (2 * 100))
 438                         break;
 439
 440                 if (ffs_log_changeopt) {
 441                         log(LOG_NOTICE,
 442                                 "%s: optimization changed from SPACE to TIME\n",
 443                                 fs->fs_fsmnt);
 444                 }
 445
 446                 fs->fs_optim = FS_OPTTIME;
 447                 break;
 448         case FS_OPTTIME:
 449                 /*
 450                  * At this point we have discovered a file that is trying to
 451                  * grow a small fragment to a larger fragment. To save time,
 452                  * we allocate a full sized block, then free the unused portion.
 453                  * If the file continues to grow, the `ffs_fragextend' call
 454                  * above will be able to grow it in place without further
 455                  * copying. If aberrant programs cause disk fragmentation to
 456                  * grow within 2% of the free reserve, we choose to begin
 457                  * optimizing for space.
 458                  */
 459                 request = fs->fs_bsize;
 460                 if (fs->fs_cstotal.cs_nffree <
 461                     fs->fs_dsize * (fs->fs_minfree - 2) / 100)
 462                         break;
 463
 464                 if (ffs_log_changeopt) {
 465                         log(LOG_NOTICE,
 466                                 "%s: optimization changed from TIME to SPACE\n",
 467                                 fs->fs_fsmnt);
 468                 }
 469
 470                 fs->fs_optim = FS_OPTSPACE;
 471                 break;
 472         default:
 473                 printf("dev = 0x%llx, optim = %d, fs = %s\n",
 474                     (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
 475                 panic("ffs_realloccg: bad optim");
 476                 /* NOTREACHED */
 477         }
 478         bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg);
 479         if (bno > 0) {
 480                 if ((ip->i_ump->um_mountp->mnt_wapbl) &&
 481                     (ITOV(ip)->v_type != VREG)) {
 482                         UFS_WAPBL_REGISTER_DEALLOCATION(
 483                             ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
 484                             osize);
 485                 } else {
 486                         ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
 487                             ip->i_number);
 488                 }
 489                 if (nsize < request) {
 490                         if ((ip->i_ump->um_mountp->mnt_wapbl) &&
 491                             (ITOV(ip)->v_type != VREG)) {
 492                                 UFS_WAPBL_REGISTER_DEALLOCATION(
 493                                     ip->i_ump->um_mountp,
 494                                     FFS_FSBTODB(fs, (bno + ffs_numfrags(fs, nsize))),
 495                                     request - nsize);
 496                         } else
 497                                 ffs_blkfree(fs, ip->i_devvp,
 498                                     bno + ffs_numfrags(fs, nsize),
 499                                     (long)(request - nsize), ip->i_number);
 500                 }
 501                 DIP_ADD(ip, blocks, btodb(nsize - osize));
 502                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 503                 if (bpp != NULL) {
 504                         bp->b_blkno = FFS_FSBTODB(fs, bno);
 505                         allocbuf(bp, nsize, 1);
 506                         memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
 507                         mutex_enter(bp->b_objlock);
 508                         KASSERT(!cv_has_waiters(&bp->b_done));
 509                         bp->b_oflags |= BO_DONE;
 510                         mutex_exit(bp->b_objlock);
 511                         *bpp = bp;
 512                 }
 513                 if (blknop != NULL) {
 514                         *blknop = bno;
 515                 }
 516                 return (0);
 517         }
 518         mutex_exit(&ump->um_lock);
 519
 520 #if defined(QUOTA) || defined(QUOTA2)
 521         /*
 522          * Restore user's disk quota because allocation failed.
 523          */
 524         (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
 525 #endif
 526         if (bpp != NULL) {
 527                 brelse(bp, 0);
 528         }
 529
 530 nospace:
 531         /*
 532          * no space available
 533          */
 534         ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
 535         uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
 536         return (ENOSPC);
 537 }
 538
 539 /*
 540  * Allocate an inode in the file system.
 541  *
 542  * If allocating a directory, use ffs_dirpref to select the inode.
 543  * If allocating in a directory, the following hierarchy is followed:
 544  *   1) allocate the preferred inode.
 545  *   2) allocate an inode in the same cylinder group.
 546  *   3) quadradically rehash into other cylinder groups, until an
 547  *      available inode is located.
 548  * If no inode preference is given the following hierarchy is used
 549  * to allocate an inode:
 550  *   1) allocate an inode in cylinder group 0.
 551  *   2) quadradically rehash into other cylinder groups, until an
 552  *      available inode is located.
 553  *
 554  * => um_lock not held upon entry or return
 555  */
 556 int
 557 ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
 558     struct vnode **vpp)
 559 {
 560         struct ufsmount *ump;
 561         struct inode *pip;
 562         struct fs *fs;
 563         struct inode *ip;
 564         struct timespec ts;
 565         ino_t ino, ipref;
 566         int cg, error;
 567
 568         UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
 569
 570         *vpp = NULL;
 571         pip = VTOI(pvp);
 572         fs = pip->i_fs;
 573         ump = pip->i_ump;
 574
 575         error = UFS_WAPBL_BEGIN(pvp->v_mount);
 576         if (error) {
 577                 return error;
 578         }
 579         mutex_enter(&ump->um_lock);
 580         if (fs->fs_cstotal.cs_nifree == 0)
 581                 goto noinodes;
 582
 583         if ((mode & IFMT) == IFDIR)
 584                 ipref = ffs_dirpref(pip);
 585         else
 586                 ipref = pip->i_number;
 587         if (ipref >= fs->fs_ncg * fs->fs_ipg)
 588                 ipref = 0;
 589         cg = ino_to_cg(fs, ipref);
 590         /*
 591          * Track number of dirs created one after another
 592          * in a same cg without intervening by files.
 593          */
 594         if ((mode & IFMT) == IFDIR) {
 595                 if (fs->fs_contigdirs[cg] < 255)
 596                         fs->fs_contigdirs[cg]++;
 597         } else {
 598                 if (fs->fs_contigdirs[cg] > 0)
 599                         fs->fs_contigdirs[cg]--;
 600         }
 601         ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg);
 602         if (ino == 0)
 603                 goto noinodes;
 604         UFS_WAPBL_END(pvp->v_mount);
 605         error = VFS_VGET(pvp->v_mount, ino, vpp);
 606         if (error) {
 607                 int err;
 608                 err = UFS_WAPBL_BEGIN(pvp->v_mount);
 609                 if (err == 0)
 610                         ffs_vfree(pvp, ino, mode);
 611                 if (err == 0)
 612                         UFS_WAPBL_END(pvp->v_mount);
 613                 return (error);
 614         }
 615         KASSERT((*vpp)->v_type == VNON);
 616         ip = VTOI(*vpp);
 617         if (ip->i_mode) {
 618 #if 0
 619                 printf("mode = 0%o, inum = %d, fs = %s\n",
 620                     ip->i_mode, ip->i_number, fs->fs_fsmnt);
 621 #else
 622                 printf("dmode %x mode %x dgen %x gen %x\n",
 623                     DIP(ip, mode), ip->i_mode,
 624                     DIP(ip, gen), ip->i_gen);
 625                 printf("size %llx blocks %llx\n",
 626                     (long long)DIP(ip, size), (long long)DIP(ip, blocks));
 627                 printf("ino %llu ipref %llu\n", (unsigned long long)ino,
 628                     (unsigned long long)ipref);
 629 #if 0
 630                 error = bread(ump->um_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ino)),
 631                     (int)fs->fs_bsize, NOCRED, 0, &bp);
 632 #endif
 633
 634 #endif
 635                 panic("ffs_valloc: dup alloc");
 636         }
 637         if (DIP(ip, blocks)) {                          /* XXX */
 638                 printf("free inode %llu on %s had %" PRId64 " blocks\n",
 639                     (unsigned long long)ino, fs->fs_fsmnt, DIP(ip, blocks));
 640                 DIP_ASSIGN(ip, blocks, 0);
 641         }
 642         ip->i_flag &= ~IN_SPACECOUNTED;
 643         ip->i_flags = 0;
 644         DIP_ASSIGN(ip, flags, 0);
 645         /*
 646          * Set up a new generation number for this inode.
 647          */
 648         ip->i_gen++;
 649         DIP_ASSIGN(ip, gen, ip->i_gen);
 650         if (fs->fs_magic == FS_UFS2_MAGIC) {
 651                 vfs_timestamp(&ts);
 652                 ip->i_ffs2_birthtime = ts.tv_sec;
 653                 ip->i_ffs2_birthnsec = ts.tv_nsec;
 654         }
 655         return (0);
 656 noinodes:
 657         mutex_exit(&ump->um_lock);
 658         UFS_WAPBL_END(pvp->v_mount);
 659         ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
 660         uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
 661         return (ENOSPC);
 662 }
 663
 664 /*
 665  * Find a cylinder group in which to place a directory.
 666  *
 667  * The policy implemented by this algorithm is to allocate a
 668  * directory inode in the same cylinder group as its parent
 669  * directory, but also to reserve space for its files inodes
 670  * and data. Restrict the number of directories which may be
 671  * allocated one after another in the same cylinder group
 672  * without intervening allocation of files.
 673  *
 674  * If we allocate a first level directory then force allocation
 675  * in another cylinder group.
 676  */
 677 static ino_t
 678 ffs_dirpref(struct inode *pip)
 679 {
 680         register struct fs *fs;
 681         int cg, prefcg;
 682         int64_t dirsize, cgsize, curdsz;
 683         int avgifree, avgbfree, avgndir;
 684         int minifree, minbfree, maxndir;
 685         int mincg, minndir;
 686         int maxcontigdirs;
 687
 688         KASSERT(mutex_owned(&pip->i_ump->um_lock));
 689
 690         fs = pip->i_fs;
 691
 692         avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 693         avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 694         avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
 695
 696         /*
 697          * Force allocation in another cg if creating a first level dir.
 698          */
 699         if (ITOV(pip)->v_vflag & VV_ROOT) {
 700                 prefcg = random() % fs->fs_ncg;
 701                 mincg = prefcg;
 702                 minndir = fs->fs_ipg;
 703                 for (cg = prefcg; cg < fs->fs_ncg; cg++)
 704                         if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
 705                             fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
 706                             fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 707                                 mincg = cg;
 708                                 minndir = fs->fs_cs(fs, cg).cs_ndir;
 709                         }
 710                 for (cg = 0; cg < prefcg; cg++)
 711                         if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
 712                             fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
 713                             fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 714                                 mincg = cg;
 715                                 minndir = fs->fs_cs(fs, cg).cs_ndir;
 716                         }
 717                 return ((ino_t)(fs->fs_ipg * mincg));
 718         }
 719
 720         /*
 721          * Count various limits which used for
 722          * optimal allocation of a directory inode.
 723          * Try cylinder groups with >75% avgifree and avgbfree.
 724          * Avoid cylinder groups with no free blocks or inodes as that
 725          * triggers an I/O-expensive cylinder group scan.
 726          */
 727         maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
 728         minifree = avgifree - avgifree / 4;
 729         if (minifree < 1)
 730                 minifree = 1;
 731         minbfree = avgbfree - avgbfree / 4;
 732         if (minbfree < 1)
 733                 minbfree = 1;
 734         cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
 735         dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
 736         if (avgndir != 0) {
 737                 curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
 738                 if (dirsize < curdsz)
 739                         dirsize = curdsz;
 740         }
 741         if (cgsize < dirsize * 255)
 742                 maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
 743         else
 744                 maxcontigdirs = 255;
 745         if (fs->fs_avgfpdir > 0)
 746                 maxcontigdirs = min(maxcontigdirs,
 747                                     fs->fs_ipg / fs->fs_avgfpdir);
 748         if (maxcontigdirs == 0)
 749                 maxcontigdirs = 1;
 750
 751         /*
 752          * Limit number of dirs in one cg and reserve space for
 753          * regular files, but only if we have no deficit in
 754          * inodes or space.
 755          */
 756         prefcg = ino_to_cg(fs, pip->i_number);
 757         for (cg = prefcg; cg < fs->fs_ncg; cg++)
 758                 if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 759                     fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 760                     fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
 761                         if (fs->fs_contigdirs[cg] < maxcontigdirs)
 762                                 return ((ino_t)(fs->fs_ipg * cg));
 763                 }
 764         for (cg = 0; cg < prefcg; cg++)
 765                 if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 766                     fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 767                     fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
 768                         if (fs->fs_contigdirs[cg] < maxcontigdirs)
 769                                 return ((ino_t)(fs->fs_ipg * cg));
 770                 }
 771         /*
 772          * This is a backstop when we are deficient in space.
 773          */
 774         for (cg = prefcg; cg < fs->fs_ncg; cg++)
 775                 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 776                         return ((ino_t)(fs->fs_ipg * cg));
 777         for (cg = 0; cg < prefcg; cg++)
 778                 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 779                         break;
 780         return ((ino_t)(fs->fs_ipg * cg));
 781 }
 782
 783 /*
 784  * Select the desired position for the next block in a file.  The file is
 785  * logically divided into sections. The first section is composed of the
 786  * direct blocks. Each additional section contains fs_maxbpg blocks.
 787  *
 788  * If no blocks have been allocated in the first section, the policy is to
 789  * request a block in the same cylinder group as the inode that describes
 790  * the file. If no blocks have been allocated in any other section, the
 791  * policy is to place the section in a cylinder group with a greater than
 792  * average number of free blocks.  An appropriate cylinder group is found
 793  * by using a rotor that sweeps the cylinder groups. When a new group of
 794  * blocks is needed, the sweep begins in the cylinder group following the
 795  * cylinder group from which the previous allocation was made. The sweep
 796  * continues until a cylinder group with greater than the average number
 797  * of free blocks is found. If the allocation is for the first block in an
 798  * indirect block, the information on the previous allocation is unavailable;
 799  * here a best guess is made based upon the logical block number being
 800  * allocated.
 801  *
 802  * If a section is already partially allocated, the policy is to
 803  * contiguously allocate fs_maxcontig blocks.  The end of one of these
 804  * contiguous blocks and the beginning of the next is laid out
 805  * contigously if possible.
 806  *
 807  * => um_lock held on entry and exit
 808  */
 809 daddr_t
 810 ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
 811     int32_t *bap /* XXX ondisk32 */)
 812 {
 813         struct fs *fs;
 814         int cg;
 815         int avgbfree, startcg;
 816
 817         KASSERT(mutex_owned(&ip->i_ump->um_lock));
 818
 819         fs = ip->i_fs;
 820
 821         /*
 822          * If allocating a contiguous file with B_CONTIG, use the hints
 823          * in the inode extentions to return the desired block.
 824          *
 825          * For metadata (indirect blocks) return the address of where
 826          * the first indirect block resides - we'll scan for the next
 827          * available slot if we need to allocate more than one indirect
 828          * block.  For data, return the address of the actual block
 829          * relative to the address of the first data block.
 830          */
 831         if (flags & B_CONTIG) {
 832                 KASSERT(ip->i_ffs_first_data_blk != 0);
 833                 KASSERT(ip->i_ffs_first_indir_blk != 0);
 834                 if (flags & B_METAONLY)
 835                         return ip->i_ffs_first_indir_blk;
 836                 else
 837                         return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
 838         }
 839
 840         if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
 841                 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
 842                         cg = ino_to_cg(fs, ip->i_number);
 843                         return (cgbase(fs, cg) + fs->fs_frag);
 844                 }
 845                 /*
 846                  * Find a cylinder with greater than average number of
 847                  * unused data blocks.
 848                  */
 849                 if (indx == 0 || bap[indx - 1] == 0)
 850                         startcg =
 851                             ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
 852                 else
 853                         startcg = dtog(fs,
 854                                 ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
 855                 startcg %= fs->fs_ncg;
 856                 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 857                 for (cg = startcg; cg < fs->fs_ncg; cg++)
 858                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 859                                 return (cgbase(fs, cg) + fs->fs_frag);
 860                         }
 861                 for (cg = 0; cg < startcg; cg++)
 862                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 863                                 return (cgbase(fs, cg) + fs->fs_frag);
 864                         }
 865                 return (0);
 866         }
 867         /*
 868          * We just always try to lay things out contiguously.
 869          */
 870         return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
 871 }
 872
 873 daddr_t
 874 ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
 875     int64_t *bap)
 876 {
 877         struct fs *fs;
 878         int cg;
 879         int avgbfree, startcg;
 880
 881         KASSERT(mutex_owned(&ip->i_ump->um_lock));
 882
 883         fs = ip->i_fs;
 884
 885         /*
 886          * If allocating a contiguous file with B_CONTIG, use the hints
 887          * in the inode extentions to return the desired block.
 888          *
 889          * For metadata (indirect blocks) return the address of where
 890          * the first indirect block resides - we'll scan for the next
 891          * available slot if we need to allocate more than one indirect
 892          * block.  For data, return the address of the actual block
 893          * relative to the address of the first data block.
 894          */
 895         if (flags & B_CONTIG) {
 896                 KASSERT(ip->i_ffs_first_data_blk != 0);
 897                 KASSERT(ip->i_ffs_first_indir_blk != 0);
 898                 if (flags & B_METAONLY)
 899                         return ip->i_ffs_first_indir_blk;
 900                 else
 901                         return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
 902         }
 903
 904         if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
 905                 if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
 906                         cg = ino_to_cg(fs, ip->i_number);
 907                         return (cgbase(fs, cg) + fs->fs_frag);
 908                 }
 909                 /*
 910                  * Find a cylinder with greater than average number of
 911                  * unused data blocks.
 912                  */
 913                 if (indx == 0 || bap[indx - 1] == 0)
 914                         startcg =
 915                             ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
 916                 else
 917                         startcg = dtog(fs,
 918                                 ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
 919                 startcg %= fs->fs_ncg;
 920                 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 921                 for (cg = startcg; cg < fs->fs_ncg; cg++)
 922                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 923                                 return (cgbase(fs, cg) + fs->fs_frag);
 924                         }
 925                 for (cg = 0; cg < startcg; cg++)
 926                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 927                                 return (cgbase(fs, cg) + fs->fs_frag);
 928                         }
 929                 return (0);
 930         }
 931         /*
 932          * We just always try to lay things out contiguously.
 933          */
 934         return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
 935 }
 936
 937
 938 /*
 939  * Implement the cylinder overflow algorithm.
 940  *
 941  * The policy implemented by this algorithm is:
 942  *   1) allocate the block in its requested cylinder group.
 943  *   2) quadradically rehash on the cylinder group number.
 944  *   3) brute force search for a free block.
 945  *
 946  * => called with um_lock held
 947  * => returns with um_lock released on success, held on failure
 948  *    (*allocator releases lock on success, retains lock on failure)
 949  */
 950 /*VARARGS5*/
 951 static daddr_t
 952 ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
 953     int size /* size for data blocks, mode for inodes */,
 954     int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int))
 955 {
 956         struct fs *fs;
 957         daddr_t result;
 958         int i, icg = cg;
 959
 960         fs = ip->i_fs;
 961         /*
 962          * 1: preferred cylinder group
 963          */
 964         result = (*allocator)(ip, cg, pref, size, flags);
 965         if (result)
 966                 return (result);
 967
 968         if (flags & B_CONTIG)
 969                 return (result);
 970         /*
 971          * 2: quadratic rehash
 972          */
 973         for (i = 1; i < fs->fs_ncg; i *= 2) {
 974                 cg += i;
 975                 if (cg >= fs->fs_ncg)
 976                         cg -= fs->fs_ncg;
 977                 result = (*allocator)(ip, cg, 0, size, flags);
 978                 if (result)
 979                         return (result);
 980         }
 981         /*
 982          * 3: brute force search
 983          * Note that we start at i == 2, since 0 was checked initially,
 984          * and 1 is always checked in the quadratic rehash.
 985          */
 986         cg = (icg + 2) % fs->fs_ncg;
 987         for (i = 2; i < fs->fs_ncg; i++) {
 988                 result = (*allocator)(ip, cg, 0, size, flags);
 989                 if (result)
 990                         return (result);
 991                 cg++;
 992                 if (cg == fs->fs_ncg)
 993                         cg = 0;
 994         }
 995         return (0);
 996 }
 997
 998 /*
 999  * Determine whether a fragment can be extended.
1000  *
1001  * Check to see if the necessary fragments are available, and
1002  * if they are, allocate them.
1003  *
1004  * => called with um_lock held
1005  * => returns with um_lock released on success, held on failure
1006  */
1007 static daddr_t
1008 ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize)
1009 {
1010         struct ufsmount *ump;
1011         struct fs *fs;
1012         struct cg *cgp;
1013         struct buf *bp;
1014         daddr_t bno;
1015         int frags, bbase;
1016         int i, error;
1017         u_int8_t *blksfree;
1018
1019         fs = ip->i_fs;
1020         ump = ip->i_ump;
1021
1022         KASSERT(mutex_owned(&ump->um_lock));
1023
1024         if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
1025                 return (0);
1026         frags = ffs_numfrags(fs, nsize);
1027         bbase = ffs_fragnum(fs, bprev);
1028         if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
1029                 /* cannot extend across a block boundary */
1030                 return (0);
1031         }
1032         mutex_exit(&ump->um_lock);
1033         error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1034                 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
1035         if (error)
1036                 goto fail;
1037         cgp = (struct cg *)bp->b_data;
1038         if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
1039                 goto fail;
1040         cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
1041         if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1042             (fs->fs_old_flags & FS_FLAGS_UPDATED))
1043                 cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
1044         bno = dtogd(fs, bprev);
1045         blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
1046         for (i = ffs_numfrags(fs, osize); i < frags; i++)
1047                 if (isclr(blksfree, bno + i))
1048                         goto fail;
1049         /*
1050          * the current fragment can be extended
1051          * deduct the count on fragment being extended into
1052          * increase the count on the remaining fragment (if any)
1053          * allocate the extended piece
1054          */
1055         for (i = frags; i < fs->fs_frag - bbase; i++)
1056                 if (isclr(blksfree, bno + i))
1057                         break;
1058         ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
1059         if (i != frags)
1060                 ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
1061         mutex_enter(&ump->um_lock);
1062         for (i = ffs_numfrags(fs, osize); i < frags; i++) {
1063                 clrbit(blksfree, bno + i);
1064                 ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
1065                 fs->fs_cstotal.cs_nffree--;
1066                 fs->fs_cs(fs, cg).cs_nffree--;
1067         }
1068         fs->fs_fmod = 1;
1069         ACTIVECG_CLR(fs, cg);
1070         mutex_exit(&ump->um_lock);
1071         bdwrite(bp);
1072         return (bprev);
1073
1074  fail:
1075         if (bp != NULL)
1076                 brelse(bp, 0);
1077         mutex_enter(&ump->um_lock);
1078         return (0);
1079 }
1080
1081 /*
1082  * Determine whether a block can be allocated.
1083  *
1084  * Check to see if a block of the appropriate size is available,
1085  * and if it is, allocate it.
1086  */
1087 static daddr_t
1088 ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags)
1089 {
1090         struct ufsmount *ump;
1091         struct fs *fs = ip->i_fs;
1092         struct cg *cgp;
1093         struct buf *bp;
1094         int32_t bno;
1095         daddr_t blkno;
1096         int error, frags, allocsiz, i;
1097         u_int8_t *blksfree;
1098         const int needswap = UFS_FSNEEDSWAP(fs);
1099
1100         ump = ip->i_ump;
1101
1102         KASSERT(mutex_owned(&ump->um_lock));
1103
1104         if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
1105                 return (0);
1106         mutex_exit(&ump->um_lock);
1107         error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1108                 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
1109         if (error)
1110                 goto fail;
1111         cgp = (struct cg *)bp->b_data;
1112         if (!cg_chkmagic(cgp, needswap) ||
1113             (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
1114                 goto fail;
1115         cgp->cg_old_time = ufs_rw32(time_second, needswap);
1116         if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1117             (fs->fs_old_flags & FS_FLAGS_UPDATED))
1118                 cgp->cg_time = ufs_rw64(time_second, needswap);
1119         if (size == fs->fs_bsize) {
1120                 mutex_enter(&ump->um_lock);
1121                 blkno = ffs_alloccgblk(ip, bp, bpref, flags);
1122                 ACTIVECG_CLR(fs, cg);
1123                 mutex_exit(&ump->um_lock);
1124                 bdwrite(bp);
1125                 return (blkno);
1126         }
1127         /*
1128          * check to see if any fragments are already available
1129          * allocsiz is the size which will be allocated, hacking
1130          * it down to a smaller size if necessary
1131          */
1132         blksfree = cg_blksfree(cgp, needswap);
1133         frags = ffs_numfrags(fs, size);
1134         for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
1135                 if (cgp->cg_frsum[allocsiz] != 0)
1136                         break;
1137         if (allocsiz == fs->fs_frag) {
1138                 /*
1139                  * no fragments were available, so a block will be
1140                  * allocated, and hacked up
1141                  */
1142                 if (cgp->cg_cs.cs_nbfree == 0)
1143                         goto fail;
1144                 mutex_enter(&ump->um_lock);
1145                 blkno = ffs_alloccgblk(ip, bp, bpref, flags);
1146                 bno = dtogd(fs, blkno);
1147                 for (i = frags; i < fs->fs_frag; i++)
1148                         setbit(blksfree, bno + i);
1149                 i = fs->fs_frag - frags;
1150                 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1151                 fs->fs_cstotal.cs_nffree += i;
1152                 fs->fs_cs(fs, cg).cs_nffree += i;
1153                 fs->fs_fmod = 1;
1154                 ufs_add32(cgp->cg_frsum[i], 1, needswap);
1155                 ACTIVECG_CLR(fs, cg);
1156                 mutex_exit(&ump->um_lock);
1157                 bdwrite(bp);
1158                 return (blkno);
1159         }
1160         bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
1161 #if 0
1162         /*
1163          * XXX fvdl mapsearch will panic, and never return -1
1164          *          also: returning NULL as daddr_t ?
1165          */
1166         if (bno < 0)
1167                 goto fail;
1168 #endif
1169         for (i = 0; i < frags; i++)
1170                 clrbit(blksfree, bno + i);
1171         mutex_enter(&ump->um_lock);
1172         ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
1173         fs->fs_cstotal.cs_nffree -= frags;
1174         fs->fs_cs(fs, cg).cs_nffree -= frags;
1175         fs->fs_fmod = 1;
1176         ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
1177         if (frags != allocsiz)
1178                 ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
1179         blkno = cgbase(fs, cg) + bno;
1180         ACTIVECG_CLR(fs, cg);
1181         mutex_exit(&ump->um_lock);
1182         bdwrite(bp);
1183         return blkno;
1184
1185  fail:
1186         if (bp != NULL)
1187                 brelse(bp, 0);
1188         mutex_enter(&ump->um_lock);
1189         return (0);
1190 }
1191
1192 /*
1193  * Allocate a block in a cylinder group.
1194  *
1195  * This algorithm implements the following policy:
1196  *   1) allocate the requested block.
1197  *   2) allocate a rotationally optimal block in the same cylinder.
1198  *   3) allocate the next available block on the block rotor for the
1199  *      specified cylinder group.
1200  * Note that this routine only allocates fs_bsize blocks; these
1201  * blocks may be fragmented by the routine that allocates them.
1202  */
1203 static daddr_t
1204 ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags)
1205 {
1206         struct fs *fs = ip->i_fs;
1207         struct cg *cgp;
1208         int cg;
1209         daddr_t blkno;
1210         int32_t bno;
1211         u_int8_t *blksfree;
1212         const int needswap = UFS_FSNEEDSWAP(fs);
1213
1214         KASSERT(mutex_owned(&ip->i_ump->um_lock));
1215
1216         cgp = (struct cg *)bp->b_data;
1217         blksfree = cg_blksfree(cgp, needswap);
1218         if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
1219                 bpref = ufs_rw32(cgp->cg_rotor, needswap);
1220         } else {
1221                 bpref = ffs_blknum(fs, bpref);
1222                 bno = dtogd(fs, bpref);
1223                 /*
1224                  * if the requested block is available, use it
1225                  */
1226                 if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
1227                         goto gotit;
1228                 /*
1229                  * if the requested data block isn't available and we are
1230                  * trying to allocate a contiguous file, return an error.
1231                  */
1232                 if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
1233                         return (0);
1234         }
1235
1236         /*
1237          * Take the next available block in this cylinder group.
1238          */
1239         bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
1240         if (bno < 0)
1241                 return (0);
1242         cgp->cg_rotor = ufs_rw32(bno, needswap);
1243 gotit:
1244         blkno = ffs_fragstoblks(fs, bno);
1245         ffs_clrblock(fs, blksfree, blkno);
1246         ffs_clusteracct(fs, cgp, blkno, -1);
1247         ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1248         fs->fs_cstotal.cs_nbfree--;
1249         fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
1250         if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1251             ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1252                 int cylno;
1253                 cylno = old_cbtocylno(fs, bno);
1254                 KASSERT(cylno >= 0);
1255                 KASSERT(cylno < fs->fs_old_ncyl);
1256                 KASSERT(old_cbtorpos(fs, bno) >= 0);
1257                 KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
1258                 ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
1259                     needswap);
1260                 ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
1261         }
1262         fs->fs_fmod = 1;
1263         cg = ufs_rw32(cgp->cg_cgx, needswap);
1264         blkno = cgbase(fs, cg) + bno;
1265         return (blkno);
1266 }
1267
1268 /*
1269  * Determine whether an inode can be allocated.
1270  *
1271  * Check to see if an inode is available, and if it is,
1272  * allocate it using the following policy:
1273  *   1) allocate the requested inode.
1274  *   2) allocate the next available inode after the requested
1275  *      inode in the specified cylinder group.
1276  */
1277 static daddr_t
1278 ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags)
1279 {
1280         struct ufsmount *ump = ip->i_ump;
1281         struct fs *fs = ip->i_fs;
1282         struct cg *cgp;
1283         struct buf *bp, *ibp;
1284         u_int8_t *inosused;
1285         int error, start, len, loc, map, i;
1286         int32_t initediblk;
1287         daddr_t nalloc;
1288         struct ufs2_dinode *dp2;
1289         const int needswap = UFS_FSNEEDSWAP(fs);
1290
1291         KASSERT(mutex_owned(&ump->um_lock));
1292         UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
1293
1294         if (fs->fs_cs(fs, cg).cs_nifree == 0)
1295                 return (0);
1296         mutex_exit(&ump->um_lock);
1297         ibp = NULL;
1298         initediblk = -1;
1299 retry:
1300         error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1301                 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
1302         if (error)
1303                 goto fail;
1304         cgp = (struct cg *)bp->b_data;
1305         if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
1306                 goto fail;
1307
1308         if (ibp != NULL &&
1309             initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
1310                 /* Another thread allocated more inodes so we retry the test. */
1311                 brelse(ibp, 0);
1312                 ibp = NULL;
1313         }
1314         /*
1315          * Check to see if we need to initialize more inodes.
1316          */
1317         if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
1318                 initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
1319                 nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
1320                 if (nalloc + FFS_INOPB(fs) > initediblk &&
1321                     initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
1322                         /*
1323                          * We have to release the cg buffer here to prevent
1324                          * a deadlock when reading the inode block will
1325                          * run a copy-on-write that might use this cg.
1326                          */
1327                         brelse(bp, 0);
1328                         bp = NULL;
1329                         error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
1330                             ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
1331                             FFS_NOBLK, fs->fs_bsize, false, &ibp);
1332                         if (error)
1333                                 goto fail;
1334                         goto retry;
1335                 }
1336         }
1337
1338         cgp->cg_old_time = ufs_rw32(time_second, needswap);
1339         if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1340             (fs->fs_old_flags & FS_FLAGS_UPDATED))
1341                 cgp->cg_time = ufs_rw64(time_second, needswap);
1342         inosused = cg_inosused(cgp, needswap);
1343         if (ipref) {
1344                 ipref %= fs->fs_ipg;
1345                 if (isclr(inosused, ipref))
1346                         goto gotit;
1347         }
1348         start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
1349         len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
1350                 NBBY);
1351         loc = skpc(0xff, len, &inosused[start]);
1352         if (loc == 0) {
1353                 len = start + 1;
1354                 start = 0;
1355                 loc = skpc(0xff, len, &inosused[0]);
1356                 if (loc == 0) {
1357                         printf("cg = %d, irotor = %d, fs = %s\n",
1358                             cg, ufs_rw32(cgp->cg_irotor, needswap),
1359                                 fs->fs_fsmnt);
1360                         panic("ffs_nodealloccg: map corrupted");
1361                         /* NOTREACHED */
1362                 }
1363         }
1364         i = start + len - loc;
1365         map = inosused[i] ^ 0xff;
1366         if (map == 0) {
1367                 printf("fs = %s\n", fs->fs_fsmnt);
1368                 panic("ffs_nodealloccg: block not in map");
1369         }
1370         ipref = i * NBBY + ffs(map) - 1;
1371         cgp->cg_irotor = ufs_rw32(ipref, needswap);
1372 gotit:
1373         UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
1374             mode);
1375         /*
1376          * Check to see if we need to initialize more inodes.
1377          */
1378         if (ibp != NULL) {
1379                 KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
1380                 memset(ibp->b_data, 0, fs->fs_bsize);
1381                 dp2 = (struct ufs2_dinode *)(ibp->b_data);
1382                 for (i = 0; i < FFS_INOPB(fs); i++) {
1383                         /*
1384                          * Don't bother to swap, it's supposed to be
1385                          * random, after all.
1386                          */
1387                         dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
1388                         dp2++;
1389                 }
1390                 initediblk += FFS_INOPB(fs);
1391                 cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
1392         }
1393
1394         mutex_enter(&ump->um_lock);
1395         ACTIVECG_CLR(fs, cg);
1396         setbit(inosused, ipref);
1397         ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
1398         fs->fs_cstotal.cs_nifree--;
1399         fs->fs_cs(fs, cg).cs_nifree--;
1400         fs->fs_fmod = 1;
1401         if ((mode & IFMT) == IFDIR) {
1402                 ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
1403                 fs->fs_cstotal.cs_ndir++;
1404                 fs->fs_cs(fs, cg).cs_ndir++;
1405         }
1406         mutex_exit(&ump->um_lock);
1407         if (ibp != NULL) {
1408                 bwrite(bp);
1409                 bawrite(ibp);
1410         } else
1411                 bdwrite(bp);
1412         return (cg * fs->fs_ipg + ipref);
1413  fail:
1414         if (bp != NULL)
1415                 brelse(bp, 0);
1416         if (ibp != NULL)
1417                 brelse(ibp, 0);
1418         mutex_enter(&ump->um_lock);
1419         return (0);
1420 }
1421
1422 /*
1423  * Allocate a block or fragment.
1424  *
1425  * The specified block or fragment is removed from the
1426  * free map, possibly fragmenting a block in the process.
1427  *
1428  * This implementation should mirror fs_blkfree
1429  *
1430  * => um_lock not held on entry or exit
1431  */
1432 int
1433 ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
1434 {
1435         int error;
1436
1437         error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
1438             ip->i_dev, ip->i_uid);
1439         if (error)
1440                 return error;
1441
1442         return ffs_blkalloc_ump(ip->i_ump, bno, size);
1443 }
1444
1445 int
1446 ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
1447 {
1448         struct fs *fs = ump->um_fs;
1449         struct cg *cgp;
1450         struct buf *bp;
1451         int32_t fragno, cgbno;
1452         int i, error, cg, blk, frags, bbase;
1453         u_int8_t *blksfree;
1454         const int needswap = UFS_FSNEEDSWAP(fs);
1455
1456         KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
1457             ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
1458         KASSERT(bno < fs->fs_size);
1459
1460         cg = dtog(fs, bno);
1461         error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1462                 (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
1463         if (error) {
1464                 return error;
1465         }
1466         cgp = (struct cg *)bp->b_data;
1467         if (!cg_chkmagic(cgp, needswap)) {
1468                 brelse(bp, 0);
1469                 return EIO;
1470         }
1471         cgp->cg_old_time = ufs_rw32(time_second, needswap);
1472         cgp->cg_time = ufs_rw64(time_second, needswap);
1473         cgbno = dtogd(fs, bno);
1474         blksfree = cg_blksfree(cgp, needswap);
1475
1476         mutex_enter(&ump->um_lock);
1477         if (size == fs->fs_bsize) {
1478                 fragno = ffs_fragstoblks(fs, cgbno);
1479                 if (!ffs_isblock(fs, blksfree, fragno)) {
1480                         mutex_exit(&ump->um_lock);
1481                         brelse(bp, 0);
1482                         return EBUSY;
1483                 }
1484                 ffs_clrblock(fs, blksfree, fragno);
1485                 ffs_clusteracct(fs, cgp, fragno, -1);
1486                 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1487                 fs->fs_cstotal.cs_nbfree--;
1488                 fs->fs_cs(fs, cg).cs_nbfree--;
1489         } else {
1490                 bbase = cgbno - ffs_fragnum(fs, cgbno);
1491
1492                 frags = ffs_numfrags(fs, size);
1493                 for (i = 0; i < frags; i++) {
1494                         if (isclr(blksfree, cgbno + i)) {
1495                                 mutex_exit(&ump->um_lock);
1496                                 brelse(bp, 0);
1497                                 return EBUSY;
1498                         }
1499                 }
1500                 /*
1501                  * if a complete block is being split, account for it
1502                  */
1503                 fragno = ffs_fragstoblks(fs, bbase);
1504                 if (ffs_isblock(fs, blksfree, fragno)) {
1505                         ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
1506                         fs->fs_cstotal.cs_nffree += fs->fs_frag;
1507                         fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
1508                         ffs_clusteracct(fs, cgp, fragno, -1);
1509                         ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1510                         fs->fs_cstotal.cs_nbfree--;
1511                         fs->fs_cs(fs, cg).cs_nbfree--;
1512                 }
1513                 /*
1514                  * decrement the counts associated with the old frags
1515                  */
1516                 blk = blkmap(fs, blksfree, bbase);
1517                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
1518                 /*
1519                  * allocate the fragment
1520                  */
1521                 for (i = 0; i < frags; i++) {
1522                         clrbit(blksfree, cgbno + i);
1523                 }
1524                 ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
1525                 fs->fs_cstotal.cs_nffree -= i;
1526                 fs->fs_cs(fs, cg).cs_nffree -= i;
1527                 /*
1528                  * add back in counts associated with the new frags
1529                  */
1530                 blk = blkmap(fs, blksfree, bbase);
1531                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
1532         }
1533         fs->fs_fmod = 1;
1534         ACTIVECG_CLR(fs, cg);
1535         mutex_exit(&ump->um_lock);
1536         bdwrite(bp);
1537         return 0;
1538 }
1539
1540 /*
1541  * Free a block or fragment.
1542  *
1543  * The specified block or fragment is placed back in the
1544  * free map. If a fragment is deallocated, a possible
1545  * block reassembly is checked.
1546  *
1547  * => um_lock not held on entry or exit
1548  */
1549 static void
1550 ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
1551 {
1552         struct cg *cgp;
1553         struct buf *bp;
1554         struct ufsmount *ump;
1555         daddr_t cgblkno;
1556         int error, cg;
1557         dev_t dev;
1558         const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1559         const int needswap = UFS_FSNEEDSWAP(fs);
1560
1561         KASSERT(!devvp_is_snapshot);
1562
1563         cg = dtog(fs, bno);
1564         dev = devvp->v_rdev;
1565         ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1566         KASSERT(fs == ump->um_fs);
1567         cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));
1568
1569         error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1570             NOCRED, B_MODIFY, &bp);
1571         if (error) {
1572                 return;
1573         }
1574         cgp = (struct cg *)bp->b_data;
1575         if (!cg_chkmagic(cgp, needswap)) {
1576                 brelse(bp, 0);
1577                 return;
1578         }
1579
1580         ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1581
1582         bdwrite(bp);
1583 }
1584
1585 struct discardopdata {
1586         struct work wk; /* must be first */
1587         struct vnode *devvp;
1588         daddr_t bno;
1589         long size;
1590 };
1591
1592 struct discarddata {
1593         struct fs *fs;
1594         struct discardopdata *entry;
1595         long maxsize;
1596         kmutex_t entrylk;
1597         struct workqueue *wq;
1598         int wqcnt, wqdraining;
1599         kmutex_t wqlk;
1600         kcondvar_t wqcv;
1601         /* timer for flush? */
1602 };
1603
1604 static void
1605 ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
1606 {
1607         long todo;
1608
1609         while (td->size) {
1610                 todo = min(td->size,
1611                   ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
1612                 ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
1613                 td->bno += ffs_numfrags(fs, todo);
1614                 td->size -= todo;
1615         }
1616 }
1617
1618 static void
1619 ffs_discardcb(struct work *wk, void *arg)
1620 {
1621         struct discardopdata *td = (void *)wk;
1622         struct discarddata *ts = arg;
1623         struct fs *fs = ts->fs;
1624         struct disk_discard_range ta;
1625 #ifdef TRIMDEBUG
1626         int error;
1627 #endif
1628
1629         ta.bno = FFS_FSBTODB(fs, td->bno);
1630         ta.size = td->size >> DEV_BSHIFT;
1631 #ifdef TRIMDEBUG
1632         error =
1633 #endif
1634                 VOP_IOCTL(td->devvp, DIOCDISCARD, &ta, FWRITE, FSCRED);
1635 #ifdef TRIMDEBUG
1636         printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
1637 #endif
1638
1639         ffs_blkfree_td(fs, td);
1640         kmem_free(td, sizeof(*td));
1641         mutex_enter(&ts->wqlk);
1642         ts->wqcnt--;
1643         if (ts->wqdraining && !ts->wqcnt)
1644                 cv_signal(&ts->wqcv);
1645         mutex_exit(&ts->wqlk);
1646 }
1647
1648 void *
1649 ffs_discard_init(struct vnode *devvp, struct fs *fs)
1650 {
1651         struct disk_discard_params tp;
1652         struct discarddata *ts;
1653         int error;
1654
1655         error = VOP_IOCTL(devvp, DIOCGDISCARDPARAMS, &tp, FREAD, FSCRED);
1656         if (error) {
1657                 printf("DIOCGDISCARDPARAMS: %d\n", error);
1658                 return NULL;
1659         }
1660         if (tp.maxsize * DEV_BSIZE < fs->fs_bsize) {
1661                 printf("tp.maxsize=%ld, fs_bsize=%d\n", tp.maxsize, fs->fs_bsize);
1662                 return NULL;
1663         }
1664
1665         ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
1666         error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
1667                                  0, 0, 0);
1668         if (error) {
1669                 kmem_free(ts, sizeof (*ts));
1670                 return NULL;
1671         }
1672         mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
1673         mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
1674         cv_init(&ts->wqcv, "trimwqcv");
1675         ts->maxsize = max(tp.maxsize * DEV_BSIZE, 100*1024); /* XXX */
1676         ts->fs = fs;
1677         return ts;
1678 }
1679
1680 void
1681 ffs_discard_finish(void *vts, int flags)
1682 {
1683         struct discarddata *ts = vts;
1684         struct discardopdata *td = NULL;
1685         int res = 0;
1686
1687         /* wait for workqueue to drain */
1688         mutex_enter(&ts->wqlk);
1689         if (ts->wqcnt) {
1690                 ts->wqdraining = 1;
1691                 res = cv_timedwait(&ts->wqcv, &ts->wqlk, mstohz(5000));
1692         }
1693         mutex_exit(&ts->wqlk);
1694         if (res)
1695                 printf("ffs_discarddata drain timeout\n");
1696
1697         mutex_enter(&ts->entrylk);
1698         if (ts->entry) {
1699                 td = ts->entry;
1700                 ts->entry = NULL;
1701         }
1702         mutex_exit(&ts->entrylk);
1703         if (td) {
1704                 /* XXX don't tell disk, its optional */
1705                 ffs_blkfree_td(ts->fs, td);
1706 #ifdef TRIMDEBUG
1707                 printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
1708 #endif
1709                 kmem_free(td, sizeof(*td));
1710         }
1711
1712         cv_destroy(&ts->wqcv);
1713         mutex_destroy(&ts->entrylk);
1714         mutex_destroy(&ts->wqlk);
1715         workqueue_destroy(ts->wq);
1716         kmem_free(ts, sizeof(*ts));
1717 }
1718
1719 void
1720 ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
1721     ino_t inum)
1722 {
1723         struct ufsmount *ump;
1724         int error;
1725         dev_t dev;
1726         struct discarddata *ts;
1727         struct discardopdata *td;
1728
1729         dev = devvp->v_rdev;
1730         ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1731         if (ffs_snapblkfree(fs, devvp, bno, size, inum))
1732                 return;
1733
1734         error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1735         if (error)
1736                 return;
1737
1738         if (!ump->um_discarddata) {
1739                 ffs_blkfree_cg(fs, devvp, bno, size);
1740                 return;
1741         }
1742
1743 #ifdef TRIMDEBUG
1744         printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
1745 #endif
1746         ts = ump->um_discarddata;
1747         td = NULL;
1748
1749         mutex_enter(&ts->entrylk);
1750         if (ts->entry) {
1751                 td = ts->entry;
1752                 /* ffs deallocs backwards, check for prepend only */
1753                 if (td->bno == bno + ffs_numfrags(fs, size)
1754                     && td->size + size <= ts->maxsize) {
1755                         td->bno = bno;
1756                         td->size += size;
1757                         if (td->size < ts->maxsize) {
1758 #ifdef TRIMDEBUG
1759                                 printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1760 #endif
1761                                 mutex_exit(&ts->entrylk);
1762                                 return;
1763                         }
1764                         size = 0; /* mark done */
1765                 }
1766                 ts->entry = NULL;
1767         }
1768         mutex_exit(&ts->entrylk);
1769
1770         if (td) {
1771 #ifdef TRIMDEBUG
1772                 printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
1773 #endif
1774                 mutex_enter(&ts->wqlk);
1775                 ts->wqcnt++;
1776                 mutex_exit(&ts->wqlk);
1777                 workqueue_enqueue(ts->wq, &td->wk, NULL);
1778         }
1779         if (!size)
1780                 return;
1781
1782         td = kmem_alloc(sizeof(*td), KM_SLEEP);
1783         td->devvp = devvp;
1784         td->bno = bno;
1785         td->size = size;
1786
1787         if (td->size < ts->maxsize) { /* XXX always the case */
1788                 mutex_enter(&ts->entrylk);
1789                 if (!ts->entry) { /* possible race? */
1790 #ifdef TRIMDEBUG
1791                         printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1792 #endif
1793                         ts->entry = td;
1794                         td = NULL;
1795                 }
1796                 mutex_exit(&ts->entrylk);
1797         }
1798         if (td) {
1799 #ifdef TRIMDEBUG
1800                 printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
1801 #endif
1802                 mutex_enter(&ts->wqlk);
1803                 ts->wqcnt++;
1804                 mutex_exit(&ts->wqlk);
1805                 workqueue_enqueue(ts->wq, &td->wk, NULL);
1806         }
1807 }
1808
1809 /*
1810  * Free a block or fragment from a snapshot cg copy.
1811  *
1812  * The specified block or fragment is placed back in the
1813  * free map. If a fragment is deallocated, a possible
1814  * block reassembly is checked.
1815  *
1816  * => um_lock not held on entry or exit
1817  */
1818 void
1819 ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
1820     ino_t inum)
1821 {
1822         struct cg *cgp;
1823         struct buf *bp;
1824         struct ufsmount *ump;
1825         daddr_t cgblkno;
1826         int error, cg;
1827         dev_t dev;
1828         const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1829         const int needswap = UFS_FSNEEDSWAP(fs);
1830
1831         KASSERT(devvp_is_snapshot);
1832
1833         cg = dtog(fs, bno);
1834         dev = VTOI(devvp)->i_devvp->v_rdev;
1835         ump = VFSTOUFS(devvp->v_mount);
1836         cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
1837
1838         error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1839         if (error)
1840                 return;
1841
1842         error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1843             NOCRED, B_MODIFY, &bp);
1844         if (error) {
1845                 return;
1846         }
1847         cgp = (struct cg *)bp->b_data;
1848         if (!cg_chkmagic(cgp, needswap)) {
1849                 brelse(bp, 0);
1850                 return;
1851         }
1852
1853         ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1854
1855         bdwrite(bp);
1856 }
1857
1858 static void
1859 ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
1860     struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
1861 {
1862         struct cg *cgp;
1863         int32_t fragno, cgbno;
1864         int i, cg, blk, frags, bbase;
1865         u_int8_t *blksfree;
1866         const int needswap = UFS_FSNEEDSWAP(fs);
1867
1868         cg = dtog(fs, bno);
1869         cgp = (struct cg *)bp->b_data;
1870         cgp->cg_old_time = ufs_rw32(time_second, needswap);
1871         if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1872             (fs->fs_old_flags & FS_FLAGS_UPDATED))
1873                 cgp->cg_time = ufs_rw64(time_second, needswap);
1874         cgbno = dtogd(fs, bno);
1875         blksfree = cg_blksfree(cgp, needswap);
1876         mutex_enter(&ump->um_lock);
1877         if (size == fs->fs_bsize) {
1878                 fragno = ffs_fragstoblks(fs, cgbno);
1879                 if (!ffs_isfreeblock(fs, blksfree, fragno)) {
1880                         if (devvp_is_snapshot) {
1881                                 mutex_exit(&ump->um_lock);
1882                                 return;
1883                         }
1884                         printf("dev = 0x%llx, block = %" PRId64 ", fs = %s\n",
1885                             (unsigned long long)dev, bno, fs->fs_fsmnt);
1886                         panic("blkfree: freeing free block");
1887                 }
1888                 ffs_setblock(fs, blksfree, fragno);
1889                 ffs_clusteracct(fs, cgp, fragno, 1);
1890                 ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1891                 fs->fs_cstotal.cs_nbfree++;
1892                 fs->fs_cs(fs, cg).cs_nbfree++;
1893                 if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1894                     ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1895                         i = old_cbtocylno(fs, cgbno);
1896                         KASSERT(i >= 0);
1897                         KASSERT(i < fs->fs_old_ncyl);
1898                         KASSERT(old_cbtorpos(fs, cgbno) >= 0);
1899                         KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
1900                         ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
1901                             needswap);
1902                         ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
1903                 }
1904         } else {
1905                 bbase = cgbno - ffs_fragnum(fs, cgbno);
1906                 /*
1907                  * decrement the counts associated with the old frags
1908                  */
1909                 blk = blkmap(fs, blksfree, bbase);
1910                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
1911                 /*
1912                  * deallocate the fragment
1913                  */
1914                 frags = ffs_numfrags(fs, size);
1915                 for (i = 0; i < frags; i++) {
1916                         if (isset(blksfree, cgbno + i)) {
1917                                 printf("dev = 0x%llx, block = %" PRId64
1918                                        ", fs = %s\n",
1919                                     (unsigned long long)dev, bno + i,
1920                                     fs->fs_fsmnt);
1921                                 panic("blkfree: freeing free frag");
1922                         }
1923                         setbit(blksfree, cgbno + i);
1924                 }
1925                 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1926                 fs->fs_cstotal.cs_nffree += i;
1927                 fs->fs_cs(fs, cg).cs_nffree += i;
1928                 /*
1929                  * add back in counts associated with the new frags
1930                  */
1931                 blk = blkmap(fs, blksfree, bbase);
1932                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
1933                 /*
1934                  * if a complete block has been reassembled, account for it
1935                  */
1936                 fragno = ffs_fragstoblks(fs, bbase);
1937                 if (ffs_isblock(fs, blksfree, fragno)) {
1938                         ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
1939                         fs->fs_cstotal.cs_nffree -= fs->fs_frag;
1940                         fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
1941                         ffs_clusteracct(fs, cgp, fragno, 1);
1942                         ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1943                         fs->fs_cstotal.cs_nbfree++;
1944                         fs->fs_cs(fs, cg).cs_nbfree++;
1945                         if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1946                             ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1947                                 i = old_cbtocylno(fs, bbase);
1948                                 KASSERT(i >= 0);
1949                                 KASSERT(i < fs->fs_old_ncyl);
1950                                 KASSERT(old_cbtorpos(fs, bbase) >= 0);
1951                                 KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
1952                                 ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
1953                                     bbase)], 1, needswap);
1954                                 ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
1955                         }
1956                 }
1957         }
1958         fs->fs_fmod = 1;
1959         ACTIVECG_CLR(fs, cg);
1960         mutex_exit(&ump->um_lock);
1961 }
1962
1963 /*
1964  * Free an inode.
1965  */
1966 int
1967 ffs_vfree(struct vnode *vp, ino_t ino, int mode)
1968 {
1969
1970         return ffs_freefile(vp->v_mount, ino, mode);
1971 }
1972
1973 /*
1974  * Do the actual free operation.
1975  * The specified inode is placed back in the free map.
1976  *
1977  * => um_lock not held on entry or exit
1978  */
1979 int
1980 ffs_freefile(struct mount *mp, ino_t ino, int mode)
1981 {
1982         struct ufsmount *ump = VFSTOUFS(mp);
1983         struct fs *fs = ump->um_fs;
1984         struct vnode *devvp;
1985         struct cg *cgp;
1986         struct buf *bp;
1987         int error, cg;
1988         daddr_t cgbno;
1989         dev_t dev;
1990         const int needswap = UFS_FSNEEDSWAP(fs);
1991
1992         cg = ino_to_cg(fs, ino);
1993         devvp = ump->um_devvp;
1994         dev = devvp->v_rdev;
1995         cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
1996
1997         if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
1998                 panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
1999                     (long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
2000         error = bread(devvp, cgbno, (int)fs->fs_cgsize,
2001             NOCRED, B_MODIFY, &bp);
2002         if (error) {
2003                 return (error);
2004         }
2005         cgp = (struct cg *)bp->b_data;
2006         if (!cg_chkmagic(cgp, needswap)) {
2007                 brelse(bp, 0);
2008                 return (0);
2009         }
2010
2011         ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
2012
2013         bdwrite(bp);
2014
2015         return 0;
2016 }
2017
2018 int
2019 ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
2020 {
2021         struct ufsmount *ump;
2022         struct cg *cgp;
2023         struct buf *bp;
2024         int error, cg;
2025         daddr_t cgbno;
2026         dev_t dev;
2027         const int needswap = UFS_FSNEEDSWAP(fs);
2028
2029         KASSERT(devvp->v_type != VBLK);
2030
2031         cg = ino_to_cg(fs, ino);
2032         dev = VTOI(devvp)->i_devvp->v_rdev;
2033         ump = VFSTOUFS(devvp->v_mount);
2034         cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2035         if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2036                 panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
2037                     (unsigned long long)dev, (unsigned long long)ino,
2038                     fs->fs_fsmnt);
2039         error = bread(devvp, cgbno, (int)fs->fs_cgsize,
2040             NOCRED, B_MODIFY, &bp);
2041         if (error) {
2042                 return (error);
2043         }
2044         cgp = (struct cg *)bp->b_data;
2045         if (!cg_chkmagic(cgp, needswap)) {
2046                 brelse(bp, 0);
2047                 return (0);
2048         }
2049         ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
2050
2051         bdwrite(bp);
2052
2053         return 0;
2054 }
2055
2056 static void
2057 ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
2058     struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
2059 {
2060         int cg;
2061         struct cg *cgp;
2062         u_int8_t *inosused;
2063         const int needswap = UFS_FSNEEDSWAP(fs);
2064
2065         cg = ino_to_cg(fs, ino);
2066         cgp = (struct cg *)bp->b_data;
2067         cgp->cg_old_time = ufs_rw32(time_second, needswap);
2068         if ((fs->fs_magic != FS_UFS1_MAGIC) ||
2069             (fs->fs_old_flags & FS_FLAGS_UPDATED))
2070                 cgp->cg_time = ufs_rw64(time_second, needswap);
2071         inosused = cg_inosused(cgp, needswap);
2072         ino %= fs->fs_ipg;
2073         if (isclr(inosused, ino)) {
2074                 printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
2075                     (unsigned long long)dev, (unsigned long long)ino +
2076                     cg * fs->fs_ipg, fs->fs_fsmnt);
2077                 if (fs->fs_ronly == 0)
2078                         panic("ifree: freeing free inode");
2079         }
2080         clrbit(inosused, ino);
2081         if (!devvp_is_snapshot)
2082                 UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
2083                     ino + cg * fs->fs_ipg, mode);
2084         if (ino < ufs_rw32(cgp->cg_irotor, needswap))
2085                 cgp->cg_irotor = ufs_rw32(ino, needswap);
2086         ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
2087         mutex_enter(&ump->um_lock);
2088         fs->fs_cstotal.cs_nifree++;
2089         fs->fs_cs(fs, cg).cs_nifree++;
2090         if ((mode & IFMT) == IFDIR) {
2091                 ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
2092                 fs->fs_cstotal.cs_ndir--;
2093                 fs->fs_cs(fs, cg).cs_ndir--;
2094         }
2095         fs->fs_fmod = 1;
2096         ACTIVECG_CLR(fs, cg);
2097         mutex_exit(&ump->um_lock);
2098 }
2099
2100 /*
2101  * Check to see if a file is free.
2102  */
2103 int
2104 ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
2105 {
2106         struct cg *cgp;
2107         struct buf *bp;
2108         daddr_t cgbno;
2109         int ret, cg;
2110         u_int8_t *inosused;
2111         const bool devvp_is_snapshot = (devvp->v_type != VBLK);
2112
2113         KASSERT(devvp_is_snapshot);
2114
2115         cg = ino_to_cg(fs, ino);
2116         if (devvp_is_snapshot)
2117                 cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2118         else
2119                 cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
2120         if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2121                 return 1;
2122         if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) {
2123                 return 1;
2124         }
2125         cgp = (struct cg *)bp->b_data;
2126         if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
2127                 brelse(bp, 0);
2128                 return 1;
2129         }
2130         inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
2131         ino %= fs->fs_ipg;
2132         ret = isclr(inosused, ino);
2133         brelse(bp, 0);
2134         return ret;
2135 }
2136
2137 /*
2138  * Find a block of the specified size in the specified cylinder group.
2139  *
2140  * It is a panic if a request is made to find a block if none are
2141  * available.
2142  */
2143 static int32_t
2144 ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
2145 {
2146         int32_t bno;
2147         int start, len, loc, i;
2148         int blk, field, subfield, pos;
2149         int ostart, olen;
2150         u_int8_t *blksfree;
2151         const int needswap = UFS_FSNEEDSWAP(fs);
2152
2153         /* KASSERT(mutex_owned(&ump->um_lock)); */
2154
2155         /*
2156          * find the fragment by searching through the free block
2157          * map for an appropriate bit pattern
2158          */
2159         if (bpref)
2160                 start = dtogd(fs, bpref) / NBBY;
2161         else
2162                 start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
2163         blksfree = cg_blksfree(cgp, needswap);
2164         len = howmany(fs->fs_fpg, NBBY) - start;
2165         ostart = start;
2166         olen = len;
2167         loc = scanc((u_int)len,
2168                 (const u_char *)&blksfree[start],
2169                 (const u_char *)fragtbl[fs->fs_frag],
2170                 (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
2171         if (loc == 0) {
2172                 len = start + 1;
2173                 start = 0;
2174                 loc = scanc((u_int)len,
2175                         (const u_char *)&blksfree[0],
2176                         (const u_char *)fragtbl[fs->fs_frag],
2177                         (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
2178                 if (loc == 0) {
2179                         printf("start = %d, len = %d, fs = %s\n",
2180                             ostart, olen, fs->fs_fsmnt);
2181                         printf("offset=%d %ld\n",
2182                                 ufs_rw32(cgp->cg_freeoff, needswap),
2183                                 (long)blksfree - (long)cgp);
2184                         printf("cg %d\n", cgp->cg_cgx);
2185                         panic("ffs_alloccg: map corrupted");
2186                         /* NOTREACHED */
2187                 }
2188         }
2189         bno = (start + len - loc) * NBBY;
2190         cgp->cg_frotor = ufs_rw32(bno, needswap);
2191         /*
2192          * found the byte in the map
2193          * sift through the bits to find the selected frag
2194          */
2195         for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
2196                 blk = blkmap(fs, blksfree, bno);
2197                 blk <<= 1;
2198                 field = around[allocsiz];
2199                 subfield = inside[allocsiz];
2200                 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
2201                         if ((blk & field) == subfield)
2202                                 return (bno + pos);
2203                         field <<= 1;
2204                         subfield <<= 1;
2205                 }
2206         }
2207         printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
2208         panic("ffs_alloccg: block not in map");
2209         /* return (-1); */
2210 }
2211
2212 /*
2213  * Fserr prints the name of a file system with an error diagnostic.
2214  *
2215  * The form of the error message is:
2216  *      fs: error message
2217  */
2218 static void
2219 ffs_fserr(struct fs *fs, u_int uid, const char *cp)
2220 {
2221
2222         log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
2223             uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
2224 }