sys/ufs/lfs/lfs_inode.c

   1 /*      $NetBSD: lfs_inode.c,v 1.147 2015/09/01 06:13:09 dholland Exp $ */
   2
   3 /*-
   4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Konrad E. Schroder <perseant@hhhh.org>.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31 /*
  32  * Copyright (c) 1986, 1989, 1991, 1993
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 3. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      @(#)lfs_inode.c 8.9 (Berkeley) 5/8/95
  60  */
  61
  62 #include <sys/cdefs.h>
  63 __KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.147 2015/09/01 06:13:09 dholland Exp $");
  64
  65 #if defined(_KERNEL_OPT)
  66 #include "opt_quota.h"
  67 #endif
  68
  69 #include <sys/param.h>
  70 #include <sys/systm.h>
  71 #include <sys/mount.h>
  72 #include <sys/malloc.h>
  73 #include <sys/proc.h>
  74 #include <sys/file.h>
  75 #include <sys/buf.h>
  76 #include <sys/vnode.h>
  77 #include <sys/kernel.h>
  78 #include <sys/trace.h>
  79 #include <sys/resourcevar.h>
  80 #include <sys/kauth.h>
  81
  82 #include <ufs/lfs/ulfs_quotacommon.h>
  83 #include <ufs/lfs/ulfs_inode.h>
  84 #include <ufs/lfs/ulfsmount.h>
  85 #include <ufs/lfs/ulfs_extern.h>
  86
  87 #include <ufs/lfs/lfs.h>
  88 #include <ufs/lfs/lfs_accessors.h>
  89 #include <ufs/lfs/lfs_extern.h>
  90 #include <ufs/lfs/lfs_kernel.h>
  91
  92 static int lfs_update_seguse(struct lfs *, struct inode *ip, long, size_t);
  93 static int lfs_indirtrunc(struct inode *, daddr_t, daddr_t,
  94                           daddr_t, int, daddr_t *, daddr_t *,
  95                           long *, size_t *);
  96 static int lfs_blkfree (struct lfs *, struct inode *, daddr_t, size_t, long *, size_t *);
  97 static int lfs_vtruncbuf(struct vnode *, daddr_t, bool, int);
  98
  99 /* Search a block for a specific dinode. */
 100 union lfs_dinode *
 101 lfs_ifind(struct lfs *fs, ino_t ino, struct buf *bp)
 102 {
 103         union lfs_dinode *ldip;
 104         unsigned num, i;
 105
 106         ASSERT_NO_SEGLOCK(fs);
 107         /*
 108          * Read the inode block backwards, since later versions of the
 109          * inode will supercede earlier ones.  Though it is unlikely, it is
 110          * possible that the same inode will appear in the same inode block.
 111          */
 112         num = LFS_INOPB(fs);
 113         for (i = num; i-- > 0; ) {
 114                 ldip = DINO_IN_BLOCK(fs, bp->b_data, i);
 115                 if (lfs_dino_getinumber(fs, ldip) == ino)
 116                         return (ldip);
 117         }
 118
 119         printf("searched %u entries for %ju\n", num, (uintmax_t)ino);
 120         printf("offset is 0x%jx (seg %d)\n", (uintmax_t)lfs_sb_getoffset(fs),
 121                lfs_dtosn(fs, lfs_sb_getoffset(fs)));
 122         printf("block is 0x%jx (seg %d)\n",
 123                (uintmax_t)LFS_DBTOFSB(fs, bp->b_blkno),
 124                lfs_dtosn(fs, LFS_DBTOFSB(fs, bp->b_blkno)));
 125
 126         return NULL;
 127 }
 128
 129 int
 130 lfs_update(struct vnode *vp, const struct timespec *acc,
 131     const struct timespec *mod, int updflags)
 132 {
 133         struct inode *ip;
 134         struct lfs *fs = VFSTOULFS(vp->v_mount)->um_lfs;
 135         int flags;
 136
 137         ASSERT_NO_SEGLOCK(fs);
 138         if (vp->v_mount->mnt_flag & MNT_RDONLY)
 139                 return (0);
 140         ip = VTOI(vp);
 141
 142         /*
 143          * If we are called from vinvalbuf, and the file's blocks have
 144          * already been scheduled for writing, but the writes have not
 145          * yet completed, lfs_vflush will not be called, and vinvalbuf
 146          * will cause a panic.  So, we must wait until any pending write
 147          * for our inode completes, if we are called with UPDATE_WAIT set.
 148          */
 149         mutex_enter(vp->v_interlock);
 150         while ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT &&
 151             WRITEINPROG(vp)) {
 152                 DLOG((DLOG_SEG, "lfs_update: sleeping on ino %d"
 153                       " (in progress)\n", ip->i_number));
 154                 cv_wait(&vp->v_cv, vp->v_interlock);
 155         }
 156         mutex_exit(vp->v_interlock);
 157         LFS_ITIMES(ip, acc, mod, NULL);
 158         if (updflags & UPDATE_CLOSE)
 159                 flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED | IN_CLEANING);
 160         else
 161                 flags = ip->i_flag & (IN_MODIFIED | IN_CLEANING);
 162         if (flags == 0)
 163                 return (0);
 164
 165         /* If sync, push back the vnode and any dirty blocks it may have. */
 166         if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT) {
 167                 /* Avoid flushing VU_DIROP. */
 168                 mutex_enter(&lfs_lock);
 169                 ++fs->lfs_diropwait;
 170                 while (vp->v_uflag & VU_DIROP) {
 171                         DLOG((DLOG_DIROP, "lfs_update: sleeping on inode %d"
 172                               " (dirops)\n", ip->i_number));
 173                         DLOG((DLOG_DIROP, "lfs_update: vflags 0x%x, iflags"
 174                               " 0x%x\n",
 175                               vp->v_iflag | vp->v_vflag | vp->v_uflag,
 176                               ip->i_flag));
 177                         if (fs->lfs_dirops == 0)
 178                                 lfs_flush_fs(fs, SEGM_SYNC);
 179                         else
 180                                 mtsleep(&fs->lfs_writer, PRIBIO+1, "lfs_fsync",
 181                                         0, &lfs_lock);
 182                         /* XXX KS - by falling out here, are we writing the vn
 183                         twice? */
 184                 }
 185                 --fs->lfs_diropwait;
 186                 mutex_exit(&lfs_lock);
 187                 return lfs_vflush(vp);
 188         }
 189         return 0;
 190 }
 191
 192 #define SINGLE  0       /* index of single indirect block */
 193 #define DOUBLE  1       /* index of double indirect block */
 194 #define TRIPLE  2       /* index of triple indirect block */
 195 /*
 196  * Truncate the inode oip to at most length size, freeing the
 197  * disk blocks.
 198  */
 199 /* VOP_BWRITE 1 + ULFS_NIADDR + lfs_balloc == 2 + 2*ULFS_NIADDR times */
 200
 201 int
 202 lfs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
 203 {
 204         daddr_t lastblock;
 205         struct inode *oip = VTOI(ovp);
 206         daddr_t bn, lbn, lastiblock[ULFS_NIADDR], indir_lbn[ULFS_NIADDR];
 207         /* note: newblks is set but only actually used if DIAGNOSTIC */
 208         daddr_t newblks[ULFS_NDADDR + ULFS_NIADDR] __diagused;
 209         struct lfs *fs;
 210         struct buf *bp;
 211         int offset, size, level;
 212         daddr_t count, rcount;
 213         daddr_t blocksreleased = 0, real_released = 0;
 214         int i, nblocks;
 215         int aflags, error, allerror = 0;
 216         off_t osize;
 217         long lastseg;
 218         size_t bc;
 219         int obufsize, odb;
 220         int usepc;
 221
 222         if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
 223             ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
 224                 KASSERT(oip->i_size == 0);
 225                 return 0;
 226         }
 227
 228         if (length < 0)
 229                 return (EINVAL);
 230
 231         /*
 232          * Just return and not update modification times.
 233          */
 234         if (oip->i_size == length) {
 235                 /* still do a uvm_vnp_setsize() as writesize may be larger */
 236                 uvm_vnp_setsize(ovp, length);
 237                 return (0);
 238         }
 239
 240         fs = oip->i_lfs;
 241
 242         if (ovp->v_type == VLNK &&
 243             (oip->i_size < fs->um_maxsymlinklen ||
 244              (fs->um_maxsymlinklen == 0 &&
 245               lfs_dino_getblocks(fs, oip->i_din) == 0))) {
 246 #ifdef DIAGNOSTIC
 247                 if (length != 0)
 248                         panic("lfs_truncate: partial truncate of symlink");
 249 #endif
 250                 memset((char *)SHORTLINK(oip), 0, (u_int)oip->i_size);
 251                 oip->i_size = 0;
 252                 lfs_dino_setsize(fs, oip->i_din, 0);
 253                 oip->i_flag |= IN_CHANGE | IN_UPDATE;
 254                 return (lfs_update(ovp, NULL, NULL, 0));
 255         }
 256         if (oip->i_size == length) {
 257                 oip->i_flag |= IN_CHANGE | IN_UPDATE;
 258                 return (lfs_update(ovp, NULL, NULL, 0));
 259         }
 260         lfs_imtime(fs);
 261         osize = oip->i_size;
 262         usepc = (ovp->v_type == VREG && ovp != fs->lfs_ivnode);
 263
 264         ASSERT_NO_SEGLOCK(fs);
 265         /*
 266          * Lengthen the size of the file. We must ensure that the
 267          * last byte of the file is allocated. Since the smallest
 268          * value of osize is 0, length will be at least 1.
 269          */
 270         if (osize < length) {
 271                 if (length > fs->um_maxfilesize)
 272                         return (EFBIG);
 273                 aflags = B_CLRBUF;
 274                 if (ioflag & IO_SYNC)
 275                         aflags |= B_SYNC;
 276                 if (usepc) {
 277                         if (lfs_lblkno(fs, osize) < ULFS_NDADDR &&
 278                             lfs_lblkno(fs, osize) != lfs_lblkno(fs, length) &&
 279                             lfs_blkroundup(fs, osize) != osize) {
 280                                 off_t eob;
 281
 282                                 eob = lfs_blkroundup(fs, osize);
 283                                 uvm_vnp_setwritesize(ovp, eob);
 284                                 error = ulfs_balloc_range(ovp, osize,
 285                                     eob - osize, cred, aflags);
 286                                 if (error) {
 287                                         (void) lfs_truncate(ovp, osize,
 288                                                     ioflag & IO_SYNC, cred);
 289                                         return error;
 290                                 }
 291                                 if (ioflag & IO_SYNC) {
 292                                         mutex_enter(ovp->v_interlock);
 293                                         VOP_PUTPAGES(ovp,
 294                                             trunc_page(osize & lfs_sb_getbmask(fs)),
 295                                             round_page(eob),
 296                                             PGO_CLEANIT | PGO_SYNCIO);
 297                                 }
 298                         }
 299                         uvm_vnp_setwritesize(ovp, length);
 300                         error = ulfs_balloc_range(ovp, length - 1, 1, cred,
 301                                                  aflags);
 302                         if (error) {
 303                                 (void) lfs_truncate(ovp, osize,
 304                                                     ioflag & IO_SYNC, cred);
 305                                 return error;
 306                         }
 307                         uvm_vnp_setsize(ovp, length);
 308                         oip->i_flag |= IN_CHANGE | IN_UPDATE;
 309                         KASSERT(ovp->v_size == oip->i_size);
 310                         oip->i_lfs_hiblk = lfs_lblkno(fs, oip->i_size + lfs_sb_getbsize(fs) - 1) - 1;
 311                         return (lfs_update(ovp, NULL, NULL, 0));
 312                 } else {
 313                         error = lfs_reserve(fs, ovp, NULL,
 314                             lfs_btofsb(fs, (ULFS_NIADDR + 2) << lfs_sb_getbshift(fs)));
 315                         if (error)
 316                                 return (error);
 317                         error = lfs_balloc(ovp, length - 1, 1, cred,
 318                                            aflags, &bp);
 319                         lfs_reserve(fs, ovp, NULL,
 320                             -lfs_btofsb(fs, (ULFS_NIADDR + 2) << lfs_sb_getbshift(fs)));
 321                         if (error)
 322                                 return (error);
 323                         oip->i_size = length;
 324                         lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 325                         uvm_vnp_setsize(ovp, length);
 326                         (void) VOP_BWRITE(bp->b_vp, bp);
 327                         oip->i_flag |= IN_CHANGE | IN_UPDATE;
 328                         oip->i_lfs_hiblk = lfs_lblkno(fs, oip->i_size + lfs_sb_getbsize(fs) - 1) - 1;
 329                         return (lfs_update(ovp, NULL, NULL, 0));
 330                 }
 331         }
 332
 333         if ((error = lfs_reserve(fs, ovp, NULL,
 334             lfs_btofsb(fs, (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(fs)))) != 0)
 335                 return (error);
 336
 337         /*
 338          * Shorten the size of the file. If the file is not being
 339          * truncated to a block boundary, the contents of the
 340          * partial block following the end of the file must be
 341          * zero'ed in case it ever becomes accessible again because
 342          * of subsequent file growth. Directories however are not
 343          * zero'ed as they should grow back initialized to empty.
 344          */
 345         offset = lfs_blkoff(fs, length);
 346         lastseg = -1;
 347         bc = 0;
 348
 349         if (ovp != fs->lfs_ivnode)
 350                 lfs_seglock(fs, SEGM_PROT);
 351         if (offset == 0) {
 352                 oip->i_size = length;
 353                 lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 354         } else if (!usepc) {
 355                 lbn = lfs_lblkno(fs, length);
 356                 aflags = B_CLRBUF;
 357                 if (ioflag & IO_SYNC)
 358                         aflags |= B_SYNC;
 359                 error = lfs_balloc(ovp, length - 1, 1, cred, aflags, &bp);
 360                 if (error) {
 361                         lfs_reserve(fs, ovp, NULL,
 362                             -lfs_btofsb(fs, (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(fs)));
 363                         goto errout;
 364                 }
 365                 obufsize = bp->b_bufsize;
 366                 odb = lfs_btofsb(fs, bp->b_bcount);
 367                 oip->i_size = length;
 368                 lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 369                 size = lfs_blksize(fs, oip, lbn);
 370                 if (ovp->v_type != VDIR)
 371                         memset((char *)bp->b_data + offset, 0,
 372                                (u_int)(size - offset));
 373                 allocbuf(bp, size, 1);
 374                 if ((bp->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) {
 375                         mutex_enter(&lfs_lock);
 376                         locked_queue_bytes -= obufsize - bp->b_bufsize;
 377                         mutex_exit(&lfs_lock);
 378                 }
 379                 if (bp->b_oflags & BO_DELWRI) {
 380                         lfs_sb_addavail(fs, odb - lfs_btofsb(fs, size));
 381                         /* XXX shouldn't this wake up on lfs_availsleep? */
 382                 }
 383                 (void) VOP_BWRITE(bp->b_vp, bp);
 384         } else { /* vp->v_type == VREG && length < osize && offset != 0 */
 385                 /*
 386                  * When truncating a regular file down to a non-block-aligned
 387                  * size, we must zero the part of last block which is past
 388                  * the new EOF.  We must synchronously flush the zeroed pages
 389                  * to disk since the new pages will be invalidated as soon
 390                  * as we inform the VM system of the new, smaller size.
 391                  * We must do this before acquiring the GLOCK, since fetching
 392                  * the pages will acquire the GLOCK internally.
 393                  * So there is a window where another thread could see a whole
 394                  * zeroed page past EOF, but that's life.
 395                  */
 396                 daddr_t xlbn;
 397                 voff_t eoz;
 398
 399                 aflags = ioflag & IO_SYNC ? B_SYNC : 0;
 400                 error = ulfs_balloc_range(ovp, length - 1, 1, cred, aflags);
 401                 if (error) {
 402                         lfs_reserve(fs, ovp, NULL,
 403                                     -lfs_btofsb(fs, (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(fs)));
 404                         goto errout;
 405                 }
 406                 xlbn = lfs_lblkno(fs, length);
 407                 size = lfs_blksize(fs, oip, xlbn);
 408                 eoz = MIN(lfs_lblktosize(fs, xlbn) + size, osize);
 409                 ubc_zerorange(&ovp->v_uobj, length, eoz - length,
 410                     UBC_UNMAP_FLAG(ovp));
 411                 if (round_page(eoz) > round_page(length)) {
 412                         mutex_enter(ovp->v_interlock);
 413                         error = VOP_PUTPAGES(ovp, round_page(length),
 414                             round_page(eoz),
 415                             PGO_CLEANIT | PGO_DEACTIVATE |
 416                             ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
 417                         if (error) {
 418                                 lfs_reserve(fs, ovp, NULL,
 419                                             -lfs_btofsb(fs, (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(fs)));
 420                                 goto errout;
 421                         }
 422                 }
 423         }
 424
 425         genfs_node_wrlock(ovp);
 426
 427         oip->i_size = length;
 428         lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 429         uvm_vnp_setsize(ovp, length);
 430
 431         /*
 432          * Calculate index into inode's block list of
 433          * last direct and indirect blocks (if any)
 434          * which we want to keep.  Lastblock is -1 when
 435          * the file is truncated to 0.
 436          */
 437         /* Avoid sign overflow - XXX assumes that off_t is a quad_t. */
 438         if (length > QUAD_MAX - lfs_sb_getbsize(fs))
 439                 lastblock = lfs_lblkno(fs, QUAD_MAX - lfs_sb_getbsize(fs));
 440         else
 441                 lastblock = lfs_lblkno(fs, length + lfs_sb_getbsize(fs) - 1) - 1;
 442         lastiblock[SINGLE] = lastblock - ULFS_NDADDR;
 443         lastiblock[DOUBLE] = lastiblock[SINGLE] - LFS_NINDIR(fs);
 444         lastiblock[TRIPLE] = lastiblock[DOUBLE] - LFS_NINDIR(fs) * LFS_NINDIR(fs);
 445         nblocks = lfs_btofsb(fs, lfs_sb_getbsize(fs));
 446         /*
 447          * Record changed file and block pointers before we start
 448          * freeing blocks.  lastiblock values are also normalized to -1
 449          * for calls to lfs_indirtrunc below.
 450          */
 451         for (i=0; i<ULFS_NDADDR; i++) {
 452                 newblks[i] = lfs_dino_getdb(fs, oip->i_din, i);
 453         }
 454         for (i=0; i<ULFS_NIADDR; i++) {
 455                 newblks[ULFS_NDADDR + i] = lfs_dino_getib(fs, oip->i_din, i);
 456         }
 457         for (level = TRIPLE; level >= SINGLE; level--)
 458                 if (lastiblock[level] < 0) {
 459                         newblks[ULFS_NDADDR+level] = 0;
 460                         lastiblock[level] = -1;
 461                 }
 462         for (i = ULFS_NDADDR - 1; i > lastblock; i--)
 463                 newblks[i] = 0;
 464
 465         oip->i_size = osize;
 466         lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 467         error = lfs_vtruncbuf(ovp, lastblock + 1, false, 0);
 468         if (error && !allerror)
 469                 allerror = error;
 470
 471         /*
 472          * Indirect blocks first.
 473          */
 474         indir_lbn[SINGLE] = -ULFS_NDADDR;
 475         indir_lbn[DOUBLE] = indir_lbn[SINGLE] - LFS_NINDIR(fs) - 1;
 476         indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - LFS_NINDIR(fs) * LFS_NINDIR(fs) - 1;
 477         for (level = TRIPLE; level >= SINGLE; level--) {
 478                 bn = lfs_dino_getib(fs, oip->i_din, level);
 479                 if (bn != 0) {
 480                         error = lfs_indirtrunc(oip, indir_lbn[level],
 481                                                bn, lastiblock[level],
 482                                                level, &count, &rcount,
 483                                                &lastseg, &bc);
 484                         if (error)
 485                                 allerror = error;
 486                         real_released += rcount;
 487                         blocksreleased += count;
 488                         if (lastiblock[level] < 0) {
 489                                 if (lfs_dino_getib(fs, oip->i_din, level) > 0)
 490                                         real_released += nblocks;
 491                                 blocksreleased += nblocks;
 492                                 lfs_dino_setib(fs, oip->i_din, level, 0);
 493                                 lfs_blkfree(fs, oip, bn, lfs_sb_getbsize(fs),
 494                                             &lastseg, &bc);
 495                                 lfs_deregister_block(ovp, bn);
 496                         }
 497                 }
 498                 if (lastiblock[level] >= 0)
 499                         goto done;
 500         }
 501
 502         /*
 503          * All whole direct blocks or frags.
 504          */
 505         for (i = ULFS_NDADDR - 1; i > lastblock; i--) {
 506                 long bsize, obsize;
 507
 508                 bn = lfs_dino_getdb(fs, oip->i_din, i);
 509                 if (bn == 0)
 510                         continue;
 511                 bsize = lfs_blksize(fs, oip, i);
 512                 if (lfs_dino_getdb(fs, oip->i_din, i) > 0) {
 513                         /* Check for fragment size changes */
 514                         obsize = oip->i_lfs_fragsize[i];
 515                         real_released += lfs_btofsb(fs, obsize);
 516                         oip->i_lfs_fragsize[i] = 0;
 517                 } else
 518                         obsize = 0;
 519                 blocksreleased += lfs_btofsb(fs, bsize);
 520                 lfs_dino_setdb(fs, oip->i_din, i, 0);
 521                 lfs_blkfree(fs, oip, bn, obsize, &lastseg, &bc);
 522                 lfs_deregister_block(ovp, bn);
 523         }
 524         if (lastblock < 0)
 525                 goto done;
 526
 527         /*
 528          * Finally, look for a change in size of the
 529          * last direct block; release any frags.
 530          */
 531         bn = lfs_dino_getdb(fs, oip->i_din, lastblock);
 532         if (bn != 0) {
 533                 long oldspace, newspace;
 534 #if 0
 535                 long olddspace;
 536 #endif
 537
 538                 /*
 539                  * Calculate amount of space we're giving
 540                  * back as old block size minus new block size.
 541                  */
 542                 oldspace = lfs_blksize(fs, oip, lastblock);
 543 #if 0
 544                 olddspace = oip->i_lfs_fragsize[lastblock];
 545 #endif
 546
 547                 oip->i_size = length;
 548                 lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 549                 newspace = lfs_blksize(fs, oip, lastblock);
 550                 if (newspace == 0)
 551                         panic("itrunc: newspace");
 552                 if (oldspace - newspace > 0) {
 553                         blocksreleased += lfs_btofsb(fs, oldspace - newspace);
 554                 }
 555 #if 0
 556                 if (bn > 0 && olddspace - newspace > 0) {
 557                         /* No segment accounting here, just vnode */
 558                         real_released += lfs_btofsb(fs, olddspace - newspace);
 559                 }
 560 #endif
 561         }
 562
 563 done:
 564         /* Finish segment accounting corrections */
 565         lfs_update_seguse(fs, oip, lastseg, bc);
 566 #ifdef DIAGNOSTIC
 567         for (level = SINGLE; level <= TRIPLE; level++)
 568                 if ((newblks[ULFS_NDADDR + level] == 0) !=
 569                     (lfs_dino_getib(fs, oip->i_din, level) == 0)) {
 570                         panic("lfs itrunc1");
 571                 }
 572         for (i = 0; i < ULFS_NDADDR; i++)
 573                 if ((newblks[i] == 0) !=
 574                     (lfs_dino_getdb(fs, oip->i_din, i) == 0)) {
 575                         panic("lfs itrunc2");
 576                 }
 577         if (length == 0 &&
 578             (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd)))
 579                 panic("lfs itrunc3");
 580 #endif /* DIAGNOSTIC */
 581         /*
 582          * Put back the real size.
 583          */
 584         oip->i_size = length;
 585         lfs_dino_setsize(fs, oip->i_din, oip->i_size);
 586         oip->i_lfs_effnblks -= blocksreleased;
 587         lfs_dino_setblocks(fs, oip->i_din,
 588             lfs_dino_getblocks(fs, oip->i_din) - real_released);
 589         mutex_enter(&lfs_lock);
 590         lfs_sb_addbfree(fs, blocksreleased);
 591         mutex_exit(&lfs_lock);
 592 #ifdef DIAGNOSTIC
 593         if (oip->i_size == 0 &&
 594             (lfs_dino_getblocks(fs, oip->i_din) != 0 || oip->i_lfs_effnblks != 0)) {
 595                 printf("lfs_truncate: truncate to 0 but %jd blks/%jd effblks\n",
 596                        (intmax_t)lfs_dino_getblocks(fs, oip->i_din),
 597                        (intmax_t)oip->i_lfs_effnblks);
 598                 panic("lfs_truncate: persistent blocks");
 599         }
 600 #endif
 601
 602         /*
 603          * If we truncated to zero, take us off the paging queue.
 604          */
 605         mutex_enter(&lfs_lock);
 606         if (oip->i_size == 0 && oip->i_flags & IN_PAGING) {
 607                 oip->i_flags &= ~IN_PAGING;
 608                 TAILQ_REMOVE(&fs->lfs_pchainhd, oip, i_lfs_pchain);
 609         }
 610         mutex_exit(&lfs_lock);
 611
 612         oip->i_flag |= IN_CHANGE;
 613 #if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
 614         (void) lfs_chkdq(oip, -blocksreleased, NOCRED, 0);
 615 #endif
 616         lfs_reserve(fs, ovp, NULL,
 617             -lfs_btofsb(fs, (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(fs)));
 618         genfs_node_unlock(ovp);
 619   errout:
 620         oip->i_lfs_hiblk = lfs_lblkno(fs, oip->i_size + lfs_sb_getbsize(fs) - 1) - 1;
 621         if (ovp != fs->lfs_ivnode)
 622                 lfs_segunlock(fs);
 623         return (allerror ? allerror : error);
 624 }
 625
 626 /* Update segment and avail usage information when removing a block. */
 627 static int
 628 lfs_blkfree(struct lfs *fs, struct inode *ip, daddr_t daddr,
 629             size_t bsize, long *lastseg, size_t *num)
 630 {
 631         long seg;
 632         int error = 0;
 633
 634         ASSERT_SEGLOCK(fs);
 635         bsize = lfs_fragroundup(fs, bsize);
 636         if (daddr > 0) {
 637                 if (*lastseg != (seg = lfs_dtosn(fs, daddr))) {
 638                         error = lfs_update_seguse(fs, ip, *lastseg, *num);
 639                         *num = bsize;
 640                         *lastseg = seg;
 641                 } else
 642                         *num += bsize;
 643         }
 644
 645         return error;
 646 }
 647
 648 /* Finish the accounting updates for a segment. */
 649 static int
 650 lfs_update_seguse(struct lfs *fs, struct inode *ip, long lastseg, size_t num)
 651 {
 652         struct segdelta *sd;
 653
 654         ASSERT_SEGLOCK(fs);
 655         if (lastseg < 0 || num == 0)
 656                 return 0;
 657
 658         LIST_FOREACH(sd, &ip->i_lfs_segdhd, list)
 659                 if (sd->segnum == lastseg)
 660                         break;
 661         if (sd == NULL) {
 662                 sd = malloc(sizeof(*sd), M_SEGMENT, M_WAITOK);
 663                 sd->segnum = lastseg;
 664                 sd->num = 0;
 665                 LIST_INSERT_HEAD(&ip->i_lfs_segdhd, sd, list);
 666         }
 667         sd->num += num;
 668
 669         return 0;
 670 }
 671
 672 static void
 673 lfs_finalize_seguse(struct lfs *fs, void *v)
 674 {
 675         SEGUSE *sup;
 676         struct buf *bp;
 677         struct segdelta *sd;
 678         LIST_HEAD(, segdelta) *hd = v;
 679
 680         ASSERT_SEGLOCK(fs);
 681         while((sd = LIST_FIRST(hd)) != NULL) {
 682                 LIST_REMOVE(sd, list);
 683                 LFS_SEGENTRY(sup, fs, sd->segnum, bp);
 684                 if (sd->num > sup->su_nbytes) {
 685                         printf("lfs_finalize_seguse: segment %ld short by %ld\n",
 686                                 sd->segnum, (long)(sd->num - sup->su_nbytes));
 687                         panic("lfs_finalize_seguse: negative bytes");
 688                         sup->su_nbytes = sd->num;
 689                 }
 690                 sup->su_nbytes -= sd->num;
 691                 LFS_WRITESEGENTRY(sup, fs, sd->segnum, bp);
 692                 free(sd, M_SEGMENT);
 693         }
 694 }
 695
/*
 * Finish the segment accounting updates queued on an inode: hand the
 * inode's pending segment-delta list (ip->i_lfs_segdhd) to
 * lfs_finalize_seguse(), which applies each delta to its segment usage
 * entry and frees the list records.
 */
void
lfs_finalize_ino_seguse(struct lfs *fs, struct inode *ip)
{
        ASSERT_SEGLOCK(fs);
        lfs_finalize_seguse(fs, &ip->i_lfs_segdhd);
}
 703
/*
 * Finish the segment accounting updates queued on the file system itself:
 * hand the file-system-wide pending segment-delta list (fs->lfs_segdhd) to
 * lfs_finalize_seguse(), which applies each delta to its segment usage
 * entry and frees the list records.
 */
void
lfs_finalize_fs_seguse(struct lfs *fs)
{
        ASSERT_SEGLOCK(fs);
        lfs_finalize_seguse(fs, &fs->lfs_segdhd);
}
 711
/*
 * Release blocks associated with the inode ip and stored in the indirect
 * block bn.  Blocks are free'd in LIFO order up to (but not including)
 * lastbn.  If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indirtrunc must be used to cleanse other indirect
 * blocks.
 *
 * Parameters:
 *   ip        inode being truncated; the file system is taken from ip->i_lfs.
 *   lbn       logical block number of this indirect block (used for getblk).
 *   dbn       on-disk address of this indirect block; used to set b_blkno
 *             when the block has to be read in.
 *   lastbn    last block to keep, relative to this indirect block; a
 *             negative value means the whole block is being released.
 *   level     indirection level of this block (SINGLE or higher).
 *   countp    out: number of blocks released, in the units produced by
 *             lfs_btofsb().
 *   rcountp   out: like *countp, but at this level an entry is only
 *             counted when its stored address is greater than zero.
 *   lastsegp, bcp
 *             running segment-accounting state, passed through unchanged
 *             to lfs_blkfree() and to recursive calls.
 *
 * Returns the error from reading the indirect block (with both counts set
 * to zero), otherwise the most recent nonzero error from writing the block
 * back or from a recursive call, or 0.
 *
 * NB: triple indirect blocks are untested.
 */
static int
lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn,
               daddr_t lastbn, int level, daddr_t *countp,
               daddr_t *rcountp, long *lastsegp, size_t *bcp)
{
        int i;
        struct buf *bp;
        struct lfs *fs = ip->i_lfs;
        void *bap;
        bool bap_needs_free;
        struct vnode *vp;
        daddr_t nb, nlbn, last;
        daddr_t blkcount, rblkcount, factor;
        int nblocks;
        daddr_t blocksreleased = 0, real_released = 0;
        int error = 0, allerror = 0;

        ASSERT_SEGLOCK(fs);
        /*
         * Calculate index in current block of last
         * block to be kept.  -1 indicates the entire
         * block so we need not calculate the index.
         */
        /* factor = number of blocks covered by one entry at this level. */
        factor = 1;
        for (i = SINGLE; i < level; i++)
                factor *= LFS_NINDIR(fs);
        last = lastbn;
        if (lastbn > 0)
                last /= factor;
        /* Size of one full file system block, as converted by lfs_btofsb(). */
        nblocks = lfs_btofsb(fs, lfs_sb_getbsize(fs));
        /*
         * Get buffer of block pointers, zero those entries corresponding
         * to blocks to be free'd, and update on disk copy first.  Since
         * a double(triple) indirect block is updated before the
         * single(double) indirect blocks beneath it, calls
         * to bmap on these blocks will fail.  However, we already have
         * the on disk address, so we have to set the b_blkno field
         * explicitly instead of letting bread do everything for us.
         */
        vp = ITOV(ip);
        bp = getblk(vp, lbn, lfs_sb_getbsize(fs), 0, 0);
        if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
                /* Braces must be here in case trace evaluates to nothing. */
                trace(TR_BREADHIT, pack(vp, lfs_sb_getbsize(fs)), lbn);
        } else {
                /* Buffer contents are not valid yet: read them from dbn. */
                trace(TR_BREADMISS, pack(vp, lfs_sb_getbsize(fs)), lbn);
                curlwp->l_ru.ru_inblock++; /* pay for read */
                bp->b_flags |= B_READ;
                if (bp->b_bcount > bp->b_bufsize)
                        panic("lfs_indirtrunc: bad buffer size");
                bp->b_blkno = LFS_FSBTODB(fs, dbn);
                VOP_STRATEGY(vp, bp);
                error = biowait(bp);
        }
        if (error) {
                /* Could not read the indirect block; nothing was released. */
                brelse(bp, 0);
                *countp = *rcountp = 0;
                return (error);
        }

        if (lastbn >= 0) {
                /*
                 * We still need this block, so copy the data for
                 * subsequent processing; then in the original block,
                 * zero out the dying block pointers and send it off.
                 */
                bap = lfs_malloc(fs, lfs_sb_getbsize(fs), LFS_NB_IBLOCK);
                memcpy(bap, bp->b_data, lfs_sb_getbsize(fs));
                bap_needs_free = true;

                for (i = last + 1; i < LFS_NINDIR(fs); i++) {
                        lfs_iblock_set(fs, bp->b_data, i, 0);
                }
                /* bp is handed off here; only the private copy is used below. */
                error = VOP_BWRITE(bp->b_vp, bp);
                if (error)
                        allerror = error;
        } else {
                /*
                 * The whole block is going away: work directly on the
                 * buffer's data and keep bp until the end of the function.
                 */
                bap = bp->b_data;
                bap_needs_free = false;
        }

        /*
         * Recursively free totally unused blocks.
         */
        /*
         * Walk the entries from the end of the block down to, but not
         * including, index last.  nlbn is advanced by factor for every
         * entry visited, including entries skipped because they are zero.
         */
        for (i = LFS_NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
            i--, nlbn += factor) {
                nb = lfs_iblock_get(fs, bap, i);
                if (nb == 0)
                        continue;
                if (level > SINGLE) {
                        /* Release everything below this entry first. */
                        error = lfs_indirtrunc(ip, nlbn, nb,
                                               (daddr_t)-1, level - 1,
                                               &blkcount, &rblkcount,
                                               lastsegp, bcp);
                        if (error)
                                allerror = error;
                        blocksreleased += blkcount;
                        real_released += rblkcount;
                }
                /* NOTE(review): the return value of lfs_blkfree() is ignored. */
                lfs_blkfree(fs, ip, nb, lfs_sb_getbsize(fs), lastsegp, bcp);
                /* Only entries with a positive stored address count as "real". */
                if (lfs_iblock_get(fs, bap, i) > 0)
                        real_released += nblocks;
                blocksreleased += nblocks;
        }

        /*
         * Recursively free last partial block.
         */
        if (level > SINGLE && lastbn >= 0) {
                /*
                 * i and nlbn are left where the loop above stopped, i.e. at
                 * the boundary entry; last is reused as the index of the
                 * last block to keep within that child.
                 */
                last = lastbn % factor;
                nb = lfs_iblock_get(fs, bap, i);
                if (nb != 0) {
                        error = lfs_indirtrunc(ip, nlbn, nb,
                                               last, level - 1, &blkcount,
                                               &rblkcount, lastsegp, bcp);
                        if (error)
                                allerror = error;
                        real_released += rblkcount;
                        blocksreleased += blkcount;
                }
        }

        if (bap_needs_free) {
                /* Private copy made above; bp itself was already written. */
                lfs_free(fs, bap, LFS_NB_IBLOCK);
        } else {
                /*
                 * bp is still held and its block is being released: if it
                 * was a delayed write, unlock it and add its size back to
                 * the file system's available count, then release the
                 * buffer with BC_INVAL.
                 */
                mutex_enter(&bufcache_lock);
                if (bp->b_oflags & BO_DELWRI) {
                        LFS_UNLOCK_BUF(bp);
                        lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
                        wakeup(&fs->lfs_availsleep);
                }
                brelsel(bp, BC_INVAL);
                mutex_exit(&bufcache_lock);
        }

        *countp = blocksreleased;
        *rcountp = real_released;
        return (allerror);
}
 859
 860 /*
 861  * Destroy any in core blocks past the truncation length.
 862  * Inlined from vtruncbuf, so that lfs_avail could be updated.
 863  * We take the seglock to prevent cleaning from occurring while we are
 864  * invalidating blocks.
 865  */
 866 static int
 867 lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
 868 {
 869         struct buf *bp, *nbp;
 870         int error;
 871         struct lfs *fs;
 872         voff_t off;
 873
 874         off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
 875         mutex_enter(vp->v_interlock);
 876         error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
 877         if (error)
 878                 return error;
 879
 880         fs = VTOI(vp)->i_lfs;
 881
 882         ASSERT_SEGLOCK(fs);
 883
 884         mutex_enter(&bufcache_lock);
 885 restart:
 886         for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 887                 nbp = LIST_NEXT(bp, b_vnbufs);
 888                 if (bp->b_lblkno < lbn)
 889                         continue;
 890                 error = bbusy(bp, catch, slptimeo, NULL);
 891                 if (error == EPASSTHROUGH)
 892                         goto restart;
 893                 if (error != 0) {
 894                         mutex_exit(&bufcache_lock);
 895                         return (error);
 896                 }
 897                 mutex_enter(bp->b_objlock);
 898                 if (bp->b_oflags & BO_DELWRI) {
 899                         bp->b_oflags &= ~BO_DELWRI;
 900                         lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
 901                         wakeup(&fs->lfs_availsleep);
 902                 }
 903                 mutex_exit(bp->b_objlock);
 904                 LFS_UNLOCK_BUF(bp);
 905                 brelsel(bp, BC_INVAL | BC_VFLUSH);
 906         }
 907
 908         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 909                 nbp = LIST_NEXT(bp, b_vnbufs);
 910                 if (bp->b_lblkno < lbn)
 911                         continue;
 912                 error = bbusy(bp, catch, slptimeo, NULL);
 913                 if (error == EPASSTHROUGH)
 914                         goto restart;
 915                 if (error != 0) {
 916                         mutex_exit(&bufcache_lock);
 917                         return (error);
 918                 }
 919                 mutex_enter(bp->b_objlock);
 920                 if (bp->b_oflags & BO_DELWRI) {
 921                         bp->b_oflags &= ~BO_DELWRI;
 922                         lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
 923                         wakeup(&fs->lfs_availsleep);
 924                 }
 925                 mutex_exit(bp->b_objlock);
 926                 LFS_UNLOCK_BUF(bp);
 927                 brelsel(bp, BC_INVAL | BC_VFLUSH);
 928         }
 929         mutex_exit(&bufcache_lock);
 930
 931         return (0);
 932 }
 933