sys/ufs/lfs/lfs_segment.c

   1 /*      $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $        */
   2
   3 /*-
   4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Konrad E. Schroder <perseant@hhhh.org>.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31 /*
  32  * Copyright (c) 1991, 1993
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 3. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      @(#)lfs_segment.c       8.10 (Berkeley) 6/10/95
  60  */
  61
  62 #include <sys/cdefs.h>
  63 __KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $");
   64
      /*
       * Debug logging helpers for the vnode-writing code.
       *
       * vndebug(vp, str): with DEBUG defined, and only when the inode behind
       * vp has IN_CLEANING set, passes a "not writing ino ..." message with
       * the reason (str) to DLOG.  The expansion also prints a variable named
       * `op', so the macro can only be used where such a variable is in
       * scope.  Without DEBUG it expands to nothing.
       *
       * ivndebug(vp, str): defined regardless of DEBUG; passes the inode
       * number of vp and str to DLOG.
       */
   65 #ifdef DEBUG
   66 # define vndebug(vp, str) do {                                          \
   67         if (VTOI(vp)->i_flag & IN_CLEANING)                             \
   68                 DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \
   69                      VTOI(vp)->i_number, (str), op));                   \
   70 } while(0)
   71 #else
   72 # define vndebug(vp, str)
   73 #endif
   74 #define ivndebug(vp, str) \
   75         DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str)))
  76
  77 #if defined(_KERNEL_OPT)
  78 #include "opt_ddb.h"
  79 #endif
  80
  81 #include <sys/param.h>
  82 #include <sys/systm.h>
  83 #include <sys/namei.h>
  84 #include <sys/kernel.h>
  85 #include <sys/resourcevar.h>
  86 #include <sys/file.h>
  87 #include <sys/stat.h>
  88 #include <sys/buf.h>
  89 #include <sys/proc.h>
  90 #include <sys/vnode.h>
  91 #include <sys/mount.h>
  92 #include <sys/kauth.h>
  93 #include <sys/syslog.h>
  94
  95 #include <miscfs/specfs/specdev.h>
  96 #include <miscfs/fifofs/fifo.h>
  97
  98 #include <ufs/ufs/inode.h>
  99 #include <ufs/ufs/dir.h>
 100 #include <ufs/ufs/ufsmount.h>
 101 #include <ufs/ufs/ufs_extern.h>
 102
 103 #include <ufs/lfs/lfs.h>
 104 #include <ufs/lfs/lfs_extern.h>
 105
 106 #include <uvm/uvm.h>
 107 #include <uvm/uvm_extern.h>
  108
      /* Defines the M_SEGMENT malloc type (short name "LFS segment"). */
  109 MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");
  110
      /*
       * Prototypes for file-local helpers that take a struct buf.
       * NOTE(review): judging by the *_aiodone / *_callback names these are
       * presumably buffer I/O completion handlers; confirm against their
       * definitions.
       */
  111 static void lfs_generic_callback(struct buf *, void (*)(struct buf *));
  112 static void lfs_free_aiodone(struct buf *);
  113 static void lfs_super_aiodone(struct buf *);
  114 static void lfs_cluster_aiodone(struct buf *);
  115 static void lfs_cluster_callback(struct buf *);
 116
  117 /*
  118  * Determine if it's OK to start a partial in this segment, or if we need
  119  * to go on to a new segment.
       *
       * The test is true when the space left in the current segment, computed
       * as lfs_fsbpseg minus the distance of lfs_offset past lfs_curseg, is
       * strictly greater than lfs_frag.
  120  */
  121 #define LFS_PARTIAL_FITS(fs) \
  122         ((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
  123         (fs)->lfs_frag)
 124
 125 /*
 126  * Figure out whether we should do a checkpoint write or go ahead with
 127  * an ordinary write.
 128  */
 129 #define LFS_SHOULD_CHECKPOINT(fs, flags) \
 130         ((flags & SEGM_CLEAN) == 0 &&                                   \
 131           ((fs->lfs_nactive > LFS_MAX_ACTIVE ||                         \
 132             (flags & SEGM_CKP) ||                                       \
 133             fs->lfs_nclean < LFS_MAX_ACTIVE)))
  134
      /*
       * Prototypes for non-static routines.  Note that the definition of
       * lfs_writevnodes() names its last parameter `op' (not `dirops') and
       * compares it against the VN_* values defined below.
       */
  135 int      lfs_match_fake(struct lfs *, struct buf *);
  136 void     lfs_newseg(struct lfs *);
  137 /* XXX ondisk32 */
  138 void     lfs_shellsort(struct buf **, int32_t *, int, int);
  139 void     lfs_supercallback(struct buf *);
  140 void     lfs_updatemeta(struct segment *);
  141 void     lfs_writesuper(struct lfs *, daddr_t);
  142 int      lfs_writevnodes(struct lfs *fs, struct mount *mp,
  143             struct segment *sp, int dirops);
  144
  145 int     lfs_allclean_wakeup;            /* Cleaner wakeup address. */
  146 int     lfs_writeindir = 1;             /* whether to flush indir on non-ckp */
  147 int     lfs_clean_vnhead = 0;           /* Allow freeing to head of vn list */
  148 int     lfs_dirvcount = 0;              /* # active dirops */
  149
  150 /* Statistics Counters; lfs_vflush() updates lfs_stats only if lfs_dostats != 0 */
  151 int lfs_dostats = 1;
  152 struct lfs_stats lfs_stats;
  153
  154 /* op values to lfs_writevnodes */
  155 #define VN_REG          0       /* every vnode without VU_DIROP */
  156 #define VN_DIROP        1       /* only vnodes with VU_DIROP */
  157 #define VN_EMPTY        2       /* non-dirop vnodes for which VPISEMPTY() holds */
  158 #define VN_CLEAN        3       /* Ifile, lfs_flushvp, and IN_CLEANING inodes */
 159
 160 /*
 161  * XXX KS - Set modification time on the Ifile, so the cleaner can
 162  * read the fs mod time off of it.  We don't set IN_UPDATE here,
 163  * since we don't really need this to be flushed to disk (and in any
 164  * case that wouldn't happen to the Ifile until we checkpoint).
 165  */
 166 void
 167 lfs_imtime(struct lfs *fs)
 168 {
 169         struct timespec ts;
 170         struct inode *ip;
 171
 172         ASSERT_MAYBE_SEGLOCK(fs);
 173         vfs_timestamp(&ts);
 174         ip = VTOI(fs->lfs_ivnode);
 175         ip->i_ffs1_mtime = ts.tv_sec;
 176         ip->i_ffs1_mtimensec = ts.tv_nsec;
 177 }
 178
  179 /*
  180  * Ifile and meta data blocks are not marked busy, so segment writes MUST be
  181  * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
  182  * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
  183  * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
  184  */
  185
      /* True when vp is the vnode currently recorded in fs->lfs_flushvp. */
  186 #define IS_FLUSHING(fs,vp)  ((fs)->lfs_flushvp == (vp))
 187
 188 int
 189 lfs_vflush(struct vnode *vp)
 190 {
 191         struct inode *ip;
 192         struct lfs *fs;
 193         struct segment *sp;
 194         struct buf *bp, *nbp, *tbp, *tnbp;
 195         int error;
 196         int flushed;
 197         int relock;
 198         int loopcount;
 199
 200         ip = VTOI(vp);
 201         fs = VFSTOUFS(vp->v_mount)->um_lfs;
 202         relock = 0;
 203
 204     top:
 205         ASSERT_NO_SEGLOCK(fs);
 206         if (ip->i_flag & IN_CLEANING) {
 207                 ivndebug(vp,"vflush/in_cleaning");
 208                 mutex_enter(&lfs_lock);
 209                 LFS_CLR_UINO(ip, IN_CLEANING);
 210                 LFS_SET_UINO(ip, IN_MODIFIED);
 211                 mutex_exit(&lfs_lock);
 212
 213                 /*
 214                  * Toss any cleaning buffers that have real counterparts
 215                  * to avoid losing new data.
 216                  */
 217                 mutex_enter(vp->v_interlock);
 218                 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 219                         nbp = LIST_NEXT(bp, b_vnbufs);
 220                         if (!LFS_IS_MALLOC_BUF(bp))
 221                                 continue;
 222                         /*
 223                          * Look for pages matching the range covered
 224                          * by cleaning blocks.  It's okay if more dirty
 225                          * pages appear, so long as none disappear out
 226                          * from under us.
 227                          */
 228                         if (bp->b_lblkno > 0 && vp->v_type == VREG &&
 229                             vp != fs->lfs_ivnode) {
 230                                 struct vm_page *pg;
 231                                 voff_t off;
 232
 233                                 for (off = lblktosize(fs, bp->b_lblkno);
 234                                      off < lblktosize(fs, bp->b_lblkno + 1);
 235                                      off += PAGE_SIZE) {
 236                                         pg = uvm_pagelookup(&vp->v_uobj, off);
 237                                         if (pg == NULL)
 238                                                 continue;
 239                                         if ((pg->flags & PG_CLEAN) == 0 ||
 240                                             pmap_is_modified(pg)) {
 241                                                 fs->lfs_avail += btofsb(fs,
 242                                                         bp->b_bcount);
 243                                                 wakeup(&fs->lfs_avail);
 244                                                 mutex_exit(vp->v_interlock);
 245                                                 lfs_freebuf(fs, bp);
 246                                                 mutex_enter(vp->v_interlock);
 247                                                 bp = NULL;
 248                                                 break;
 249                                         }
 250                                 }
 251                         }
 252                         for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp;
 253                             tbp = tnbp)
 254                         {
 255                                 tnbp = LIST_NEXT(tbp, b_vnbufs);
 256                                 if (tbp->b_vp == bp->b_vp
 257                                    && tbp->b_lblkno == bp->b_lblkno
 258                                    && tbp != bp)
 259                                 {
 260                                         fs->lfs_avail += btofsb(fs,
 261                                                 bp->b_bcount);
 262                                         wakeup(&fs->lfs_avail);
 263                                         mutex_exit(vp->v_interlock);
 264                                         lfs_freebuf(fs, bp);
 265                                         mutex_enter(vp->v_interlock);
 266                                         bp = NULL;
 267                                         break;
 268                                 }
 269                         }
 270                 }
 271         } else {
 272                 mutex_enter(vp->v_interlock);
 273         }
 274
 275         /* If the node is being written, wait until that is done */
 276         while (WRITEINPROG(vp)) {
 277                 ivndebug(vp,"vflush/writeinprog");
 278                 cv_wait(&vp->v_cv, vp->v_interlock);
 279         }
 280         mutex_exit(vp->v_interlock);
 281
 282         /* Protect against VI_XLOCK deadlock in vinvalbuf() */
 283         lfs_seglock(fs, SEGM_SYNC);
 284
 285         /* If we're supposed to flush a freed inode, just toss it */
 286         if (ip->i_lfs_iflags & LFSI_DELETED) {
 287                 DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n",
 288                       ip->i_number));
 289                 /* Drain v_numoutput */
 290                 mutex_enter(vp->v_interlock);
 291                 while (vp->v_numoutput > 0) {
 292                         cv_wait(&vp->v_cv, vp->v_interlock);
 293                 }
 294                 KASSERT(vp->v_numoutput == 0);
 295                 mutex_exit(vp->v_interlock);
 296
 297                 mutex_enter(&bufcache_lock);
 298                 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 299                         nbp = LIST_NEXT(bp, b_vnbufs);
 300
 301                         KASSERT((bp->b_flags & B_GATHERED) == 0);
 302                         if (bp->b_oflags & BO_DELWRI) { /* XXX always true? */
 303                                 fs->lfs_avail += btofsb(fs, bp->b_bcount);
 304                                 wakeup(&fs->lfs_avail);
 305                         }
 306                         /* Copied from lfs_writeseg */
 307                         if (bp->b_iodone != NULL) {
 308                                 mutex_exit(&bufcache_lock);
 309                                 biodone(bp);
 310                                 mutex_enter(&bufcache_lock);
 311                         } else {
 312                                 bremfree(bp);
 313                                 LFS_UNLOCK_BUF(bp);
 314                                 mutex_enter(vp->v_interlock);
 315                                 bp->b_flags &= ~(B_READ | B_GATHERED);
 316                                 bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE;
 317                                 bp->b_error = 0;
 318                                 reassignbuf(bp, vp);
 319                                 mutex_exit(vp->v_interlock);
 320                                 brelse(bp, 0);
 321                         }
 322                 }
 323                 mutex_exit(&bufcache_lock);
 324                 LFS_CLR_UINO(ip, IN_CLEANING);
 325                 LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED);
 326                 ip->i_flag &= ~IN_ALLMOD;
 327                 DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n",
 328                       ip->i_number));
 329                 lfs_segunlock(fs);
 330
 331                 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
 332
 333                 return 0;
 334         }
 335
 336         fs->lfs_flushvp = vp;
 337         if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
 338                 error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC);
 339                 fs->lfs_flushvp = NULL;
 340                 KASSERT(fs->lfs_flushvp_fakevref == 0);
 341                 lfs_segunlock(fs);
 342
 343                 /* Make sure that any pending buffers get written */
 344                 mutex_enter(vp->v_interlock);
 345                 while (vp->v_numoutput > 0) {
 346                         cv_wait(&vp->v_cv, vp->v_interlock);
 347                 }
 348                 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
 349                 KASSERT(vp->v_numoutput == 0);
 350                 mutex_exit(vp->v_interlock);
 351
 352                 return error;
 353         }
 354         sp = fs->lfs_sp;
 355
 356         flushed = 0;
 357         if (VPISEMPTY(vp)) {
 358                 lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
 359                 ++flushed;
 360         } else if ((ip->i_flag & IN_CLEANING) &&
 361                   (fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
 362                 ivndebug(vp,"vflush/clean");
 363                 lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
 364                 ++flushed;
 365         } else if (lfs_dostats) {
 366                 if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD))
 367                         ++lfs_stats.vflush_invoked;
 368                 ivndebug(vp,"vflush");
 369         }
 370
 371 #ifdef DIAGNOSTIC
 372         if (vp->v_uflag & VU_DIROP) {
 373                 DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n"));
 374                 /* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */
 375         }
 376 #endif
 377
 378         do {
 379                 loopcount = 0;
 380                 do {
 381                         if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
 382                                 relock = lfs_writefile(fs, sp, vp);
 383                                 if (relock) {
 384                                         /*
 385                                          * Might have to wait for the
 386                                          * cleaner to run; but we're
 387                                          * still not done with this vnode.
 388                                          */
 389                                         KDASSERT(ip->i_number != LFS_IFILE_INUM);
 390                                         lfs_writeinode(fs, sp, ip);
 391                                         mutex_enter(&lfs_lock);
 392                                         LFS_SET_UINO(ip, IN_MODIFIED);
 393                                         mutex_exit(&lfs_lock);
 394                                         lfs_writeseg(fs, sp);
 395                                         lfs_segunlock(fs);
 396                                         lfs_segunlock_relock(fs);
 397                                         goto top;
 398                                 }
 399                         }
 400                         /*
 401                          * If we begin a new segment in the middle of writing
 402                          * the Ifile, it creates an inconsistent checkpoint,
 403                          * since the Ifile information for the new segment
 404                          * is not up-to-date.  Take care of this here by
 405                          * sending the Ifile through again in case there
 406                          * are newly dirtied blocks.  But wait, there's more!
 407                          * This second Ifile write could *also* cross a segment
 408                          * boundary, if the first one was large.  The second
 409                          * one is guaranteed to be no more than 8 blocks,
 410                          * though (two segment blocks and supporting indirects)
 411                          * so the third write *will not* cross the boundary.
 412                          */
 413                         if (vp == fs->lfs_ivnode) {
 414                                 lfs_writefile(fs, sp, vp);
 415                                 lfs_writefile(fs, sp, vp);
 416                         }
 417 #ifdef DEBUG
 418                         if (++loopcount > 2)
 419                                 log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount);
 420 #endif
 421                 } while (lfs_writeinode(fs, sp, ip));
 422         } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
 423
 424         if (lfs_dostats) {
 425                 ++lfs_stats.nwrites;
 426                 if (sp->seg_flags & SEGM_SYNC)
 427                         ++lfs_stats.nsync_writes;
 428                 if (sp->seg_flags & SEGM_CKP)
 429                         ++lfs_stats.ncheckpoints;
 430         }
 431         /*
 432          * If we were called from somewhere that has already held the seglock
 433          * (e.g., lfs_markv()), the lfs_segunlock will not wait for
 434          * the write to complete because we are still locked.
 435          * Since lfs_vflush() must return the vnode with no dirty buffers,
 436          * we must explicitly wait, if that is the case.
 437          *
 438          * We compare the iocount against 1, not 0, because it is
 439          * artificially incremented by lfs_seglock().
 440          */
 441         mutex_enter(&lfs_lock);
 442         if (fs->lfs_seglock > 1) {
 443                 while (fs->lfs_iocount > 1)
 444                         (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
 445                                      "lfs_vflush", 0, &lfs_lock);
 446         }
 447         mutex_exit(&lfs_lock);
 448
 449         lfs_segunlock(fs);
 450
 451         /* Wait for these buffers to be recovered by aiodoned */
 452         mutex_enter(vp->v_interlock);
 453         while (vp->v_numoutput > 0) {
 454                 cv_wait(&vp->v_cv, vp->v_interlock);
 455         }
 456         KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
 457         KASSERT(vp->v_numoutput == 0);
 458         mutex_exit(vp->v_interlock);
 459
 460         fs->lfs_flushvp = NULL;
 461         KASSERT(fs->lfs_flushvp_fakevref == 0);
 462
 463         return (0);
 464 }
  465
      /*
       * lfs_writevnodes: walk the vnodes of mount point mp, newest first, and
       * add the file data and inode of each selected vnode to segment sp.
       *
       * op selects which vnodes are considered:
       *   VN_DIROP  only vnodes with VU_DIROP set;
       *   VN_REG    all vnodes without VU_DIROP;
       *   VN_EMPTY  vnodes without VU_DIROP for which VPISEMPTY() holds;
       *   VN_CLEAN  the Ifile, fs->lfs_flushvp, and inodes marked IN_CLEANING
       *             (VU_DIROP is not examined for this op).
       * Vnodes of type VNON, marker vnodes and vnodes with VI_CLEAN set are
       * always skipped, as are vnodes for which lfs_vref() returns nonzero.
       * The Ifile may be selected, but neither lfs_writefile() nor
       * lfs_writeinode() is called for it here.
       *
       * The segment lock must be held on entry (ASSERT_SEGLOCK).
       * mntvnode_lock is held while stepping through the list and released
       * while an individual vnode is being written.
       *
       * Returns EAGAIN if lfs_writefile() returned EAGAIN for some vnode, in
       * which case the walk stops at that vnode; any other lfs_writefile()
       * error is discarded and 0 is returned.
       */
  466 int
  467 lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
  468 {
  469         struct inode *ip;
  470         struct vnode *vp;
              /* inodes_written is only incremented; its value is not read here. */
  471         int inodes_written = 0, only_cleaning;
  472         int error = 0;
  473
  474         ASSERT_SEGLOCK(fs);
  475  loop:
  476         /* start at last (newest) vnode. */
  477         mutex_enter(&mntvnode_lock);
  478         TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
  479                 /*
  480                  * If the vnode that we are about to sync is no longer
  481                  * associated with this mount point, start over.
  482                  */
  483                 if (vp->v_mount != mp) {
  484                         DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n"));
  485                         /*
  486                          * After this, pages might be busy
  487                          * due to our own previous putpages.
  488                          * Start actual segment write here to avoid deadlock.
  489                          */
  490                         mutex_exit(&mntvnode_lock);
  491                         (void)lfs_writeseg(fs, sp);
  492                         goto loop;
  493                 }
  494
  495                 mutex_enter(vp->v_interlock);
  496                 if (vp->v_type == VNON || vismarker(vp) ||
  497                     (vp->v_iflag & VI_CLEAN) != 0) {
  498                         mutex_exit(vp->v_interlock);
  499                         continue;
  500                 }
  501
  502                 ip = VTOI(vp);
                      /* Dirop vnodes are written only for VN_DIROP (and VN_CLEAN). */
  503                 if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) ||
  504                     (op != VN_DIROP && op != VN_CLEAN &&
  505                     (vp->v_uflag & VU_DIROP))) {
  506                         mutex_exit(vp->v_interlock);
  507                         vndebug(vp,"dirop");
  508                         continue;
  509                 }
  510
  511                 if (op == VN_EMPTY && !VPISEMPTY(vp)) {
  512                         mutex_exit(vp->v_interlock);
  513                         vndebug(vp,"empty");
  514                         continue;
  515                 }
  516
  517                 if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
  518                    && vp != fs->lfs_flushvp
  519                    && !(ip->i_flag & IN_CLEANING)) {
  520                         mutex_exit(vp->v_interlock);
  521                         vndebug(vp,"cleaning");
  522                         continue;
  523                 }
  524
                      /*
                       * NOTE(review): vp->v_interlock is still held at this
                       * point; presumably lfs_vref() releases it - confirm.
                       */
  525                 mutex_exit(&mntvnode_lock);
  526                 if (lfs_vref(vp)) {
  527                         vndebug(vp,"vref");
  528                         mutex_enter(&mntvnode_lock);
  529                         continue;
  530                 }
  531
  532                 only_cleaning = 0;
  533                 /*
  534                  * Write the inode/file if dirty and it's not the IFILE.
  535                  */
  536                 if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) {
                              /* True when IN_CLEANING is the only IN_ALLMOD bit set. */
  537                         only_cleaning =
  538                             ((ip->i_flag & IN_ALLMOD) == IN_CLEANING);
  539
  540                         if (ip->i_number != LFS_IFILE_INUM) {
  541                                 error = lfs_writefile(fs, sp, vp);
  542                                 if (error) {
                                              /*
                                               * NOTE(review): vp and ip are still
                                               * used below after this unref;
                                               * confirm the vnode cannot go away.
                                               */
  543                                         lfs_vunref(vp);
  544                                         if (error == EAGAIN) {
  545                                                 /*
  546                                                  * This error from lfs_putpages
  547                                                  * indicates we need to drop
  548                                                  * the segment lock and start
  549                                                  * over after the cleaner has
  550                                                  * had a chance to run.
  551                                                  */
  552                                                 lfs_writeinode(fs, sp, ip);
  553                                                 lfs_writeseg(fs, sp);
  554                                                 if (!VPISEMPTY(vp) &&
  555                                                     !WRITEINPROG(vp) &&
  556                                                     !(ip->i_flag & IN_ALLMOD)) {
  557                                                         mutex_enter(&lfs_lock);
  558                                                         LFS_SET_UINO(ip, IN_MODIFIED);
  559                                                         mutex_exit(&lfs_lock);
  560                                                 }
  561                                                 mutex_enter(&mntvnode_lock);
  562                                                 break;
  563                                         }
  564                                         error = 0; /* XXX not quite right */
  565                                         mutex_enter(&mntvnode_lock);
  566                                         continue;
  567                                 }
  568
                                      /*
                                       * Buffers remain but no modification flag
                                       * is set and no write is in progress:
                                       * mark the inode IN_MODIFIED.
                                       */
  569                                 if (!VPISEMPTY(vp)) {
  570                                         if (WRITEINPROG(vp)) {
  571                                                 ivndebug(vp,"writevnodes/write2");
  572                                         } else if (!(ip->i_flag & IN_ALLMOD)) {
  573                                                 mutex_enter(&lfs_lock);
  574                                                 LFS_SET_UINO(ip, IN_MODIFIED);
  575                                                 mutex_exit(&lfs_lock);
  576                                         }
  577                                 }
  578                                 (void) lfs_writeinode(fs, sp, ip);
  579                                 inodes_written++;
  580                         }
  581                 }
  582
  583                 if (lfs_clean_vnhead && only_cleaning)
  584                         lfs_vunref_head(vp);
  585                 else
  586                         lfs_vunref(vp);
  587
  588                 mutex_enter(&mntvnode_lock);
  589         }
  590         mutex_exit(&mntvnode_lock);
  591         return error;
  592 }
 593
 594 /*
 595  * Do a checkpoint.
 596  */
 597 int
 598 lfs_segwrite(struct mount *mp, int flags)
 599 {
 600         struct buf *bp;
 601         struct inode *ip;
 602         struct lfs *fs;
 603         struct segment *sp;
 604         struct vnode *vp;
 605         SEGUSE *segusep;
 606         int do_ckp, did_ckp, error;
 607         unsigned n, segleft, maxseg, sn, i, curseg;
 608         int writer_set = 0;
 609         int dirty;
 610         int redo;
 611         int um_error;
 612         int loopcount;
 613
 614         fs = VFSTOUFS(mp)->um_lfs;
 615         ASSERT_MAYBE_SEGLOCK(fs);
 616
 617         if (fs->lfs_ronly)
 618                 return EROFS;
 619
 620         lfs_imtime(fs);
 621
 622         /*
 623          * Allocate a segment structure and enough space to hold pointers to
 624          * the maximum possible number of buffers which can be described in a
 625          * single summary block.
 626          */
 627         do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
 628
 629         lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
 630         sp = fs->lfs_sp;
 631         if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
 632                 do_ckp = 1;
 633
 634         /*
 635          * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
 636          * in which case we have to flush *all* buffers off of this vnode.
 637          * We don't care about other nodes, but write any non-dirop nodes
 638          * anyway in anticipation of another getnewvnode().
 639          *
 640          * If we're cleaning we only write cleaning and ifile blocks, and
 641          * no dirops, since otherwise we'd risk corruption in a crash.
 642          */
 643         if (sp->seg_flags & SEGM_CLEAN)
 644                 lfs_writevnodes(fs, mp, sp, VN_CLEAN);
 645         else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
 646                 do {
 647                         um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
 648
 649                         if (do_ckp || fs->lfs_dirops == 0) {
 650                                 if (!writer_set) {
 651                                         lfs_writer_enter(fs, "lfs writer");
 652                                         writer_set = 1;
 653                                 }
 654                                 error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
 655                                 if (um_error == 0)
 656                                         um_error = error;
 657                                 /* In case writevnodes errored out */
 658                                 lfs_flush_dirops(fs);
 659                                 ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
 660                                 lfs_finalize_fs_seguse(fs);
 661                         }
 662                         if (do_ckp && um_error) {
 663                                 lfs_segunlock_relock(fs);
 664                                 sp = fs->lfs_sp;
 665                         }
 666                 } while (do_ckp && um_error != 0);
 667         }
 668
 669         /*
 670          * If we are doing a checkpoint, mark everything since the
 671          * last checkpoint as no longer ACTIVE.
 672          */
 673         if (do_ckp || fs->lfs_doifile) {
 674                 segleft = fs->lfs_nseg;
 675                 curseg = 0;
 676                 for (n = 0; n < fs->lfs_segtabsz; n++) {
 677                         dirty = 0;
 678                         if (bread(fs->lfs_ivnode, fs->lfs_cleansz + n,
 679                             fs->lfs_bsize, NOCRED, B_MODIFY, &bp))
 680                                 panic("lfs_segwrite: ifile read");
 681                         segusep = (SEGUSE *)bp->b_data;
 682                         maxseg = min(segleft, fs->lfs_sepb);
 683                         for (i = 0; i < maxseg; i++) {
 684                                 sn = curseg + i;
 685                                 if (sn != dtosn(fs, fs->lfs_curseg) &&
 686                                     segusep->su_flags & SEGUSE_ACTIVE) {
 687                                         segusep->su_flags &= ~SEGUSE_ACTIVE;
 688                                         --fs->lfs_nactive;
 689                                         ++dirty;
 690                                 }
 691                                 fs->lfs_suflags[fs->lfs_activesb][sn] =
 692                                         segusep->su_flags;
 693                                 if (fs->lfs_version > 1)
 694                                         ++segusep;
 695                                 else
 696                                         segusep = (SEGUSE *)
 697                                                 ((SEGUSE_V1 *)segusep + 1);
 698                         }
 699
 700                         if (dirty)
 701                                 error = LFS_BWRITE_LOG(bp); /* Ifile */
 702                         else
 703                                 brelse(bp, 0);
 704                         segleft -= fs->lfs_sepb;
 705                         curseg += fs->lfs_sepb;
 706                 }
 707         }
 708
 709         KASSERT(LFS_SEGLOCK_HELD(fs));
 710
 711         did_ckp = 0;
 712         if (do_ckp || fs->lfs_doifile) {
 713                 vp = fs->lfs_ivnode;
 714                 vn_lock(vp, LK_EXCLUSIVE);
 715                 loopcount = 0;
 716                 do {
 717 #ifdef DEBUG
 718                         LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid);
 719 #endif
 720                         mutex_enter(&lfs_lock);
 721                         fs->lfs_flags &= ~LFS_IFDIRTY;
 722                         mutex_exit(&lfs_lock);
 723
 724                         ip = VTOI(vp);
 725
 726                         if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
 727                                 /*
 728                                  * Ifile has no pages, so we don't need
 729                                  * to check error return here.
 730                                  */
 731                                 lfs_writefile(fs, sp, vp);
 732                                 /*
 733                                  * Ensure the Ifile takes the current segment
 734                                  * into account.  See comment in lfs_vflush.
 735                                  */
 736                                 lfs_writefile(fs, sp, vp);
 737                                 lfs_writefile(fs, sp, vp);
 738                         }
 739
 740                         if (ip->i_flag & IN_ALLMOD)
 741                                 ++did_ckp;
 742 #if 0
 743                         redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0);
 744 #else
 745                         redo = lfs_writeinode(fs, sp, ip);
 746 #endif
 747                         redo += lfs_writeseg(fs, sp);
 748                         mutex_enter(&lfs_lock);
 749                         redo += (fs->lfs_flags & LFS_IFDIRTY);
 750                         mutex_exit(&lfs_lock);
 751 #ifdef DEBUG
 752                         if (++loopcount > 2)
 753                                 log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n",
 754                                         loopcount);
 755 #endif
 756                 } while (redo && do_ckp);
 757
 758                 /*
 759                  * Unless we are unmounting, the Ifile may continue to have
 760                  * dirty blocks even after a checkpoint, due to changes to
 761                  * inodes' atime.  If we're checkpointing, it's "impossible"
 762                  * for other parts of the Ifile to be dirty after the loop
 763                  * above, since we hold the segment lock.
 764                  */
 765                 mutex_enter(vp->v_interlock);
 766                 if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
 767                         LFS_CLR_UINO(ip, IN_ALLMOD);
 768                 }
 769 #ifdef DIAGNOSTIC
 770                 else if (do_ckp) {
 771                         int do_panic = 0;
 772                         LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
 773                                 if (bp->b_lblkno < fs->lfs_cleansz +
 774                                     fs->lfs_segtabsz &&
 775                                     !(bp->b_flags & B_GATHERED)) {
 776                                         printf("ifile lbn %ld still dirty (flags %lx)\n",
 777                                                 (long)bp->b_lblkno,
 778                                                 (long)bp->b_flags);
 779                                         ++do_panic;
 780                                 }
 781                         }
 782                         if (do_panic)
 783                                 panic("dirty blocks");
 784                 }
 785 #endif
 786                 mutex_exit(vp->v_interlock);
 787                 VOP_UNLOCK(vp);
 788         } else {
 789                 (void) lfs_writeseg(fs, sp);
 790         }
 791
 792         /* Note Ifile no longer needs to be written */
 793         fs->lfs_doifile = 0;
 794         if (writer_set)
 795                 lfs_writer_leave(fs);
 796
 797         /*
 798          * If we didn't write the Ifile, we didn't really do anything.
 799          * That means that (1) there is a checkpoint on disk and (2)
 800          * nothing has changed since it was written.
 801          *
 802          * Take the flags off of the segment so that lfs_segunlock
 803          * doesn't have to write the superblock either.
 804          */
 805         if (do_ckp && !did_ckp) {
 806                 sp->seg_flags &= ~SEGM_CKP;
 807         }
 808
 809         if (lfs_dostats) {
 810                 ++lfs_stats.nwrites;
 811                 if (sp->seg_flags & SEGM_SYNC)
 812                         ++lfs_stats.nsync_writes;
 813                 if (sp->seg_flags & SEGM_CKP)
 814                         ++lfs_stats.ncheckpoints;
 815         }
 816         lfs_segunlock(fs);
 817         return (0);
 818 }
 819
 820 /*
 821  * Write the dirty blocks associated with a vnode.
 822  */
 823 int
 824 lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
 825 {
 826         struct finfo *fip;
 827         struct inode *ip;
 828         int i, frag;
 829         int error;
 830
 831         ASSERT_SEGLOCK(fs);
 832         error = 0;
 833         ip = VTOI(vp);
 834
 835         fip = sp->fip;
 836         lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
 837
 838         if (vp->v_uflag & VU_DIROP)
 839                 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
 840
 841         if (sp->seg_flags & SEGM_CLEAN) {
 842                 lfs_gather(fs, sp, vp, lfs_match_fake);
 843                 /*
 844                  * For a file being flushed, we need to write *all* blocks.
 845                  * This means writing the cleaning blocks first, and then
 846                  * immediately following with any non-cleaning blocks.
 847                  * The same is true of the Ifile since checkpoints assume
 848                  * that all valid Ifile blocks are written.
 849                  */
 850                 if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) {
 851                         lfs_gather(fs, sp, vp, lfs_match_data);
 852                         /*
 853                          * Don't call VOP_PUTPAGES: if we're flushing,
 854                          * we've already done it, and the Ifile doesn't
 855                          * use the page cache.
 856                          */
 857                 }
 858         } else {
 859                 lfs_gather(fs, sp, vp, lfs_match_data);
 860                 /*
 861                  * If we're flushing, we've already called VOP_PUTPAGES
 862                  * so don't do it again.  Otherwise, we want to write
 863                  * everything we've got.
 864                  */
 865                 if (!IS_FLUSHING(fs, vp)) {
 866                         mutex_enter(vp->v_interlock);
 867                         error = VOP_PUTPAGES(vp, 0, 0,
 868                                 PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
 869                 }
 870         }
 871
 872         /*
 873          * It may not be necessary to write the meta-data blocks at this point,
 874          * as the roll-forward recovery code should be able to reconstruct the
 875          * list.
 876          *
 877          * We have to write them anyway, though, under two conditions: (1) the
 878          * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
 879          * checkpointing.
 880          *
 881          * BUT if we are cleaning, we might have indirect blocks that refer to
 882          * new blocks not being written yet, in addition to fragments being
 883          * moved out of a cleaned segment.  If that is the case, don't
 884          * write the indirect blocks, or the finfo will have a small block
 885          * in the middle of it!
 886          * XXX in this case isn't the inode size wrong too?
 887          */
 888         frag = 0;
 889         if (sp->seg_flags & SEGM_CLEAN) {
 890                 for (i = 0; i < NDADDR; i++)
 891                         if (ip->i_lfs_fragsize[i] > 0 &&
 892                             ip->i_lfs_fragsize[i] < fs->lfs_bsize)
 893                                 ++frag;
 894         }
 895 #ifdef DIAGNOSTIC
 896         if (frag > 1)
 897                 panic("lfs_writefile: more than one fragment!");
 898 #endif
 899         if (IS_FLUSHING(fs, vp) ||
 900             (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) {
 901                 lfs_gather(fs, sp, vp, lfs_match_indir);
 902                 lfs_gather(fs, sp, vp, lfs_match_dindir);
 903                 lfs_gather(fs, sp, vp, lfs_match_tindir);
 904         }
 905         fip = sp->fip;
 906         lfs_release_finfo(fs);
 907
 908         return error;
 909 }
 910
 911 /*
 912  * Update segment accounting to reflect this inode's change of address.
 913  */
 914 static int
 915 lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr)
 916 {
 917         struct buf *bp;
 918         daddr_t daddr;
 919         IFILE *ifp;
 920         SEGUSE *sup;
 921         ino_t ino;
 922         int redo_ifile, error;
 923         u_int32_t sn;
 924
 925         redo_ifile = 0;
 926
 927         /*
 928          * If updating the ifile, update the super-block.  Update the disk
 929          * address and access times for this inode in the ifile.
 930          */
 931         ino = ip->i_number;
 932         if (ino == LFS_IFILE_INUM) {
 933                 daddr = fs->lfs_idaddr;
 934                 fs->lfs_idaddr = dbtofsb(fs, ndaddr);
 935         } else {
 936                 LFS_IENTRY(ifp, fs, ino, bp);
 937                 daddr = ifp->if_daddr;
 938                 ifp->if_daddr = dbtofsb(fs, ndaddr);
 939                 error = LFS_BWRITE_LOG(bp); /* Ifile */
 940         }
 941
 942         /*
 943          * If this is the Ifile and lfs_offset is set to the first block
 944          * in the segment, dirty the new segment's accounting block
 945          * (XXX should already be dirty?) and tell the caller to do it again.
 946          */
 947         if (ip->i_number == LFS_IFILE_INUM) {
 948                 sn = dtosn(fs, fs->lfs_offset);
 949                 if (sntod(fs, sn) + btofsb(fs, fs->lfs_sumsize) ==
 950                     fs->lfs_offset) {
 951                         LFS_SEGENTRY(sup, fs, sn, bp);
 952                         KASSERT(bp->b_oflags & BO_DELWRI);
 953                         LFS_WRITESEGENTRY(sup, fs, sn, bp);
 954                         /* fs->lfs_flags |= LFS_IFDIRTY; */
 955                         redo_ifile |= 1;
 956                 }
 957         }
 958
 959         /*
 960          * The inode's last address should not be in the current partial
 961          * segment, except under exceptional circumstances (lfs_writevnodes
 962          * had to start over, and in the meantime more blocks were written
 963          * to a vnode).  Both inodes will be accounted to this segment
 964          * in lfs_writeseg so we need to subtract the earlier version
 965          * here anyway.  The segment count can temporarily dip below
 966          * zero here; keep track of how many duplicates we have in
 967          * "dupino" so we don't panic below.
 968          */
 969         if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) {
 970                 ++sp->ndupino;
 971                 DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg "
 972                       "(ino %d daddr 0x%llx) ndupino=%d\n", ino,
 973                       (long long)daddr, sp->ndupino));
 974         }
 975         /*
 976          * Account the inode: it no longer belongs to its former segment,
 977          * though it will not belong to the new segment until that segment
 978          * is actually written.
 979          */
 980         if (daddr != LFS_UNUSED_DADDR) {
 981                 u_int32_t oldsn = dtosn(fs, daddr);
 982 #ifdef DIAGNOSTIC
 983                 int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : 0;
 984 #endif
 985                 LFS_SEGENTRY(sup, fs, oldsn, bp);
 986 #ifdef DIAGNOSTIC
 987                 if (sup->su_nbytes +
 988                     sizeof (struct ufs1_dinode) * ndupino
 989                       < sizeof (struct ufs1_dinode)) {
 990                         printf("lfs_writeinode: negative bytes "
 991                                "(segment %" PRIu32 " short by %d, "
 992                                "oldsn=%" PRIu32 ", cursn=%" PRIu32
 993                                ", daddr=%" PRId64 ", su_nbytes=%u, "
 994                                "ndupino=%d)\n",
 995                                dtosn(fs, daddr),
 996                                (int)sizeof (struct ufs1_dinode) *
 997                                    (1 - sp->ndupino) - sup->su_nbytes,
 998                                oldsn, sp->seg_number, daddr,
 999                                (unsigned int)sup->su_nbytes,
1000                                sp->ndupino);
1001                         panic("lfs_writeinode: negative bytes");
1002                         sup->su_nbytes = sizeof (struct ufs1_dinode);
1003                 }
1004 #endif
1005                 DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n",
1006                       dtosn(fs, daddr), sizeof (struct ufs1_dinode), ino));
1007                 sup->su_nbytes -= sizeof (struct ufs1_dinode);
1008                 redo_ifile |=
1009                         (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
1010                 if (redo_ifile) {
1011                         mutex_enter(&lfs_lock);
1012                         fs->lfs_flags |= LFS_IFDIRTY;
1013                         mutex_exit(&lfs_lock);
1014                         /* Don't double-account */
1015                         fs->lfs_idaddr = 0x0;
1016                 }
1017                 LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */
1018         }
1019
1020         return redo_ifile;
1021 }
1022
1023 int
1024 lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
1025 {
1026         struct buf *bp;
1027         struct ufs1_dinode *cdp;
1028         daddr_t daddr;
1029         int32_t *daddrp;        /* XXX ondisk32 */
1030         int i, ndx;
1031         int redo_ifile = 0;
1032         int gotblk = 0;
1033         int count;
1034
1035         ASSERT_SEGLOCK(fs);
1036         if (!(ip->i_flag & IN_ALLMOD))
1037                 return (0);
1038
1039         /* Can't write ifile when writer is not set */
1040         KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 ||
1041                 (sp->seg_flags & SEGM_CLEAN));
1042
1043         /*
1044          * If this is the Ifile, see if writing it here will generate a
1045          * temporary misaccounting.  If it will, do the accounting and write
1046          * the blocks, postponing the inode write until the accounting is
1047          * solid.
1048          */
1049         count = 0;
1050         while (ip->i_number == LFS_IFILE_INUM) {
1051                 int redo = 0;
1052
1053                 if (sp->idp == NULL && sp->ibp == NULL &&
1054                     (sp->seg_bytes_left < fs->lfs_ibsize ||
1055                      sp->sum_bytes_left < sizeof(int32_t))) {
1056                         (void) lfs_writeseg(fs, sp);
1057                         continue;
1058                 }
1059
1060                 /* Look for dirty Ifile blocks */
1061                 LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) {
1062                         if (!(bp->b_flags & B_GATHERED)) {
1063                                 redo = 1;
1064                                 break;
1065                         }
1066                 }
1067
1068                 if (redo == 0)
1069                         redo = lfs_update_iaddr(fs, sp, ip, 0x0);
1070                 if (redo == 0)
1071                         break;
1072
1073                 if (sp->idp) {
1074                         sp->idp->di_inumber = 0;
1075                         sp->idp = NULL;
1076                 }
1077                 ++count;
1078                 if (count > 2)
1079                         log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count);
1080                 lfs_writefile(fs, sp, fs->lfs_ivnode);
1081         }
1082
1083         /* Allocate a new inode block if necessary. */
1084         if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) &&
1085             sp->ibp == NULL) {
1086                 /* Allocate a new segment if necessary. */
1087                 if (sp->seg_bytes_left < fs->lfs_ibsize ||
1088                     sp->sum_bytes_left < sizeof(int32_t))
1089                         (void) lfs_writeseg(fs, sp);
1090
1091                 /* Get next inode block. */
1092                 daddr = fs->lfs_offset;
1093                 fs->lfs_offset += btofsb(fs, fs->lfs_ibsize);
1094                 sp->ibp = *sp->cbpp++ =
1095                         getblk(VTOI(fs->lfs_ivnode)->i_devvp,
1096                             fsbtodb(fs, daddr), fs->lfs_ibsize, 0, 0);
1097                 gotblk++;
1098
1099                 /* Zero out inode numbers */
1100                 for (i = 0; i < INOPB(fs); ++i)
1101                         ((struct ufs1_dinode *)sp->ibp->b_data)[i].di_inumber =
1102                             0;
1103
1104                 ++sp->start_bpp;
1105                 fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize);
1106                 /* Set remaining space counters. */
1107                 sp->seg_bytes_left -= fs->lfs_ibsize;
1108                 sp->sum_bytes_left -= sizeof(int32_t);
1109                 ndx = fs->lfs_sumsize / sizeof(int32_t) -
1110                         sp->ninodes / INOPB(fs) - 1;
1111                 ((int32_t *)(sp->segsum))[ndx] = daddr;
1112         }
1113
1114         /* Check VU_DIROP in case there is a new file with no data blocks */
1115         if (ITOV(ip)->v_uflag & VU_DIROP)
1116                 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
1117
1118         /* Update the inode times and copy the inode onto the inode page. */
1119         /* XXX kludge --- don't redirty the ifile just to put times on it */
1120         if (ip->i_number != LFS_IFILE_INUM)
1121                 LFS_ITIMES(ip, NULL, NULL, NULL);
1122
1123         /*
1124          * If this is the Ifile, and we've already written the Ifile in this
1125          * partial segment, just overwrite it (it's not on disk yet) and
1126          * continue.
1127          *
1128          * XXX we know that the bp that we get the second time around has
1129          * already been gathered.
1130          */
1131         if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
1132                 *(sp->idp) = *ip->i_din.ffs1_din;
1133                 ip->i_lfs_osize = ip->i_size;
1134                 return 0;
1135         }
1136
1137         bp = sp->ibp;
1138         cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
1139         *cdp = *ip->i_din.ffs1_din;
1140
1141         /*
1142          * If cleaning, link counts and directory file sizes cannot change,
1143          * since those would be directory operations---even if the file
1144          * we are writing is marked VU_DIROP we should write the old values.
1145          * If we're not cleaning, of course, update the values so we get
1146          * current values the next time we clean.
1147          */
1148         if (sp->seg_flags & SEGM_CLEAN) {
1149                 if (ITOV(ip)->v_uflag & VU_DIROP) {
1150                         cdp->di_nlink = ip->i_lfs_odnlink;
1151                         /* if (ITOV(ip)->v_type == VDIR) */
1152                         cdp->di_size = ip->i_lfs_osize;
1153                 }
1154         } else {
1155                 ip->i_lfs_odnlink = cdp->di_nlink;
1156                 ip->i_lfs_osize = ip->i_size;
1157         }
1158
1159
1160         /* We can finish the segment accounting for truncations now */
1161         lfs_finalize_ino_seguse(fs, ip);
1162
1163         /*
1164          * If we are cleaning, ensure that we don't write UNWRITTEN disk
1165          * addresses to disk; possibly change the on-disk record of
1166          * the inode size, either by reverting to the previous size
1167          * (in the case of cleaning) or by verifying the inode's block
1168          * holdings (in the case of files being allocated as they are being
1169          * written).
1170          * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail
1171          * XXX count on disk wrong by the same amount.  We should be
1172          * XXX able to "borrow" from lfs_avail and return it after the
1173          * XXX Ifile is written.  See also in lfs_writeseg.
1174          */
1175
1176         /* Check file size based on highest allocated block */
1177         if (((ip->i_ffs1_mode & IFMT) == IFREG ||
1178              (ip->i_ffs1_mode & IFMT) == IFDIR) &&
1179             ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) {
1180                 cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift;
1181                 DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %"
1182                       PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size));
1183         }
1184         if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) {
1185                 DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)"
1186                       " at %x\n", ip->i_number, ip->i_lfs_effnblks,
1187                       ip->i_ffs1_blocks, fs->lfs_offset));
1188                 for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR;
1189                      daddrp++) {
1190                         if (*daddrp == UNWRITTEN) {
1191                                 DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
1192                                 *daddrp = 0;
1193                         }
1194                 }
1195         }
1196
1197 #ifdef DIAGNOSTIC
1198         /*
1199          * Check dinode held blocks against dinode size.
1200          * This should be identical to the check in lfs_vget().
1201          */
1202         for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift;
1203              i < NDADDR; i++) {
1204                 KASSERT(i >= 0);
1205                 if ((cdp->di_mode & IFMT) == IFLNK)
1206                         continue;
1207                 if (((cdp->di_mode & IFMT) == IFBLK ||
1208                      (cdp->di_mode & IFMT) == IFCHR) && i == 0)
1209                         continue;
1210                 if (cdp->di_db[i] != 0) {
1211 # ifdef DEBUG
1212                         lfs_dump_dinode(cdp);
1213 # endif
1214                         panic("writing inconsistent inode");
1215                 }
1216         }
1217 #endif /* DIAGNOSTIC */
1218
1219         if (ip->i_flag & IN_CLEANING)
1220                 LFS_CLR_UINO(ip, IN_CLEANING);
1221         else {
1222                 /* XXX IN_ALLMOD */
1223                 LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE |
1224                              IN_UPDATE | IN_MODIFY);
1225                 if (ip->i_lfs_effnblks == ip->i_ffs1_blocks)
1226                         LFS_CLR_UINO(ip, IN_MODIFIED);
1227                 else {
1228                         DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real "
1229                             "blks=%d, eff=%d\n", ip->i_number,
1230                             ip->i_ffs1_blocks, ip->i_lfs_effnblks));
1231                 }
1232         }
1233
1234         if (ip->i_number == LFS_IFILE_INUM) {
1235                 /* We know sp->idp == NULL */
1236                 sp->idp = ((struct ufs1_dinode *)bp->b_data) +
1237                         (sp->ninodes % INOPB(fs));
1238
1239                 /* Not dirty any more */
1240                 mutex_enter(&lfs_lock);
1241                 fs->lfs_flags &= ~LFS_IFDIRTY;
1242                 mutex_exit(&lfs_lock);
1243         }
1244
1245         if (gotblk) {
1246                 mutex_enter(&bufcache_lock);
1247                 LFS_LOCK_BUF(bp);
1248                 brelsel(bp, 0);
1249                 mutex_exit(&bufcache_lock);
1250         }
1251
1252         /* Increment inode count in segment summary block. */
1253         ++((SEGSUM *)(sp->segsum))->ss_ninos;
1254
1255         /* If this page is full, set flag to allocate a new page. */
1256         if (++sp->ninodes % INOPB(fs) == 0)
1257                 sp->ibp = NULL;
1258
1259         redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno);
1260
1261         KASSERT(redo_ifile == 0);
1262         return (redo_ifile);
1263 }
1264
1265 int
1266 lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr)
1267 {
1268         struct lfs *fs;
1269         int vers;
1270         int j, blksinblk;
1271
1272         ASSERT_SEGLOCK(sp->fs);
1273         /*
1274          * If full, finish this segment.  We may be doing I/O, so
1275          * release and reacquire the splbio().
1276          */
1277 #ifdef DIAGNOSTIC
1278         if (sp->vp == NULL)
1279                 panic ("lfs_gatherblock: Null vp in segment");
1280 #endif
1281         fs = sp->fs;
1282         blksinblk = howmany(bp->b_bcount, fs->lfs_bsize);
1283         if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk ||
1284             sp->seg_bytes_left < bp->b_bcount) {
1285                 if (mptr)
1286                         mutex_exit(mptr);
1287                 lfs_updatemeta(sp);
1288
1289                 vers = sp->fip->fi_version;
1290                 (void) lfs_writeseg(fs, sp);
1291
1292                 /* Add the current file to the segment summary. */
1293                 lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers);
1294
1295                 if (mptr)
1296                         mutex_enter(mptr);
1297                 return (1);
1298         }
1299
1300         if (bp->b_flags & B_GATHERED) {
1301                 DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d,"
1302                       " lbn %" PRId64 "\n",
1303                       sp->fip->fi_ino, bp->b_lblkno));
1304                 return (0);
1305         }
1306
1307         /* Insert into the buffer list, update the FINFO block. */
1308         bp->b_flags |= B_GATHERED;
1309
1310         *sp->cbpp++ = bp;
1311         for (j = 0; j < blksinblk; j++) {
1312                 sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j;
1313                 /* This block's accounting moves from lfs_favail to lfs_avail */
1314                 lfs_deregister_block(sp->vp, bp->b_lblkno + j);
1315         }
1316
1317         sp->sum_bytes_left -= sizeof(int32_t) * blksinblk;
1318         sp->seg_bytes_left -= bp->b_bcount;
1319         return (0);
1320 }
1321
1322 int
1323 lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp,
1324     int (*match)(struct lfs *, struct buf *))
1325 {
1326         struct buf *bp, *nbp;
1327         int count = 0;
1328
1329         ASSERT_SEGLOCK(fs);
1330         if (vp->v_type == VBLK)
1331                 return 0;
1332         KASSERT(sp->vp == NULL);
1333         sp->vp = vp;
1334         mutex_enter(&bufcache_lock);
1335
1336 #ifndef LFS_NO_BACKBUF_HACK
1337 /* This is a hack to see if ordering the blocks in LFS makes a difference. */
1338 # define        BUF_OFFSET      \
1339         (((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp)
1340 # define        BACK_BUF(BP)    \
1341         ((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET))
1342 # define        BEG_OF_LIST     \
1343         ((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET))
1344
1345 loop:
1346         /* Find last buffer. */
1347         for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
1348              bp && LIST_NEXT(bp, b_vnbufs) != NULL;
1349              bp = LIST_NEXT(bp, b_vnbufs))
1350                 /* nothing */;
1351         for (; bp && bp != BEG_OF_LIST; bp = nbp) {
1352                 nbp = BACK_BUF(bp);
1353 #else /* LFS_NO_BACKBUF_HACK */
1354 loop:
1355         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1356                 nbp = LIST_NEXT(bp, b_vnbufs);
1357 #endif /* LFS_NO_BACKBUF_HACK */
1358                 if ((bp->b_cflags & BC_BUSY) != 0 ||
1359                     (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) {
1360 #ifdef DEBUG
1361                         if (vp == fs->lfs_ivnode &&
1362                             (bp->b_cflags & BC_BUSY) != 0 &&
1363                             (bp->b_flags & B_GATHERED) == 0)
1364                                 log(LOG_NOTICE, "lfs_gather: ifile lbn %"
1365                                       PRId64 " busy (%x) at 0x%x",
1366                                       bp->b_lblkno, bp->b_flags,
1367                                       (unsigned)fs->lfs_offset);
1368 #endif
1369                         continue;
1370                 }
1371 #ifdef DIAGNOSTIC
1372 # ifdef LFS_USE_B_INVAL
1373                 if ((bp->b_flags & BC_INVAL) != 0 && bp->b_iodone == NULL) {
1374                         DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
1375                               " is BC_INVAL\n", bp->b_lblkno));
1376                         VOP_PRINT(bp->b_vp);
1377                 }
1378 # endif /* LFS_USE_B_INVAL */
1379                 if (!(bp->b_oflags & BO_DELWRI))
1380                         panic("lfs_gather: bp not BO_DELWRI");
1381                 if (!(bp->b_flags & B_LOCKED)) {
1382                         DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
1383                               " blk %" PRId64 " not B_LOCKED\n",
1384                               bp->b_lblkno,
1385                               dbtofsb(fs, bp->b_blkno)));
1386                         VOP_PRINT(bp->b_vp);
1387                         panic("lfs_gather: bp not B_LOCKED");
1388                 }
1389 #endif
1390                 if (lfs_gatherblock(sp, bp, &bufcache_lock)) {
1391                         goto loop;
1392                 }
1393                 count++;
1394         }
1395         mutex_exit(&bufcache_lock);
1396         lfs_updatemeta(sp);
1397         KASSERT(sp->vp == vp);
1398         sp->vp = NULL;
1399         return count;
1400 }
1401
/*
 * DEBUG_OOFF(n): debugging aid used by lfs_update_single().
 *
 * In DEBUG kernels, logs a warning through DLOG when the enclosing
 * scope's "ooff" is zero; n identifies the call site in the message.
 * The macro expands in the caller's scope and refers to the caller's
 * ooff, ip, lbn, ndaddr and daddr variables.  Without DEBUG it expands
 * to nothing.
 *
 * Guarded with #ifdef, like the other DEBUG sections in this file.
 */
#ifdef DEBUG
# define DEBUG_OOFF(n) do {						\
	if (ooff == 0) {						\
		DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \
			"ino %d lbn %" PRId64 " at 0x%" PRIx32		\
			", was 0x0 (or %" PRId64 ")\n",			\
			(n), ip->i_number, lbn, ndaddr, daddr));	\
	}								\
} while (0)
#else
# define DEBUG_OOFF(n)
#endif
1414
1415 /*
1416  * Change the given block's address to ndaddr, finding its previous
1417  * location using ufs_bmaparray().
1418  *
1419  * Account for this change in the segment table.
1420  *
1421  * called with sp == NULL by roll-forwarding code.
1422  */
1423 void
1424 lfs_update_single(struct lfs *fs, struct segment *sp,
1425     struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size)
1426 {
1427         SEGUSE *sup;
1428         struct buf *bp;
1429         struct indir a[NIADDR + 2], *ap;
1430         struct inode *ip;
1431         daddr_t daddr, ooff;
1432         int num, error;
1433         int bb, osize, obb;
1434
1435         ASSERT_SEGLOCK(fs);
1436         KASSERT(sp == NULL || sp->vp == vp);
1437         ip = VTOI(vp);
1438
1439         error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL);
1440         if (error)
1441                 panic("lfs_updatemeta: ufs_bmaparray returned %d", error);
1442
1443         daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
1444         KASSERT(daddr <= LFS_MAX_DADDR);
1445         if (daddr > 0)
1446                 daddr = dbtofsb(fs, daddr);
1447
1448         bb = numfrags(fs, size);
1449         switch (num) {
1450             case 0:
1451                     ooff = ip->i_ffs1_db[lbn];
1452                     DEBUG_OOFF(0);
1453                     if (ooff == UNWRITTEN)
1454                             ip->i_ffs1_blocks += bb;
1455                     else {
1456                             /* possible fragment truncation or extension */
1457                             obb = btofsb(fs, ip->i_lfs_fragsize[lbn]);
1458                             ip->i_ffs1_blocks += (bb - obb);
1459                     }
1460                     ip->i_ffs1_db[lbn] = ndaddr;
1461                     break;
1462             case 1:
1463                     ooff = ip->i_ffs1_ib[a[0].in_off];
1464                     DEBUG_OOFF(1);
1465                     if (ooff == UNWRITTEN)
1466                             ip->i_ffs1_blocks += bb;
1467                     ip->i_ffs1_ib[a[0].in_off] = ndaddr;
1468                     break;
1469             default:
1470                     ap = &a[num - 1];
1471                     if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED,
1472                         B_MODIFY, &bp))
1473                             panic("lfs_updatemeta: bread bno %" PRId64,
1474                                   ap->in_lbn);
1475
1476                     /* XXX ondisk32 */
1477                     ooff = ((int32_t *)bp->b_data)[ap->in_off];
1478                     DEBUG_OOFF(num);
1479                     if (ooff == UNWRITTEN)
1480                             ip->i_ffs1_blocks += bb;
1481                     /* XXX ondisk32 */
1482                     ((int32_t *)bp->b_data)[ap->in_off] = ndaddr;
1483                     (void) VOP_BWRITE(bp->b_vp, bp);
1484         }
1485
1486         KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr);
1487
1488         /* Update hiblk when extending the file */
1489         if (lbn > ip->i_lfs_hiblk)
1490                 ip->i_lfs_hiblk = lbn;
1491
1492         /*
1493          * Though we'd rather it couldn't, this *can* happen right now
1494          * if cleaning blocks and regular blocks coexist.
1495          */
1496         /* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */
1497
1498         /*
1499          * Update segment usage information, based on old size
1500          * and location.
1501          */
1502         if (daddr > 0) {
1503                 u_int32_t oldsn = dtosn(fs, daddr);
1504 #ifdef DIAGNOSTIC
1505                 int ndupino;
1506
1507                 if (sp && sp->seg_number == oldsn) {
1508                         ndupino = sp->ndupino;
1509                 } else {
1510                         ndupino = 0;
1511                 }
1512 #endif
1513                 KASSERT(oldsn < fs->lfs_nseg);
1514                 if (lbn >= 0 && lbn < NDADDR)
1515                         osize = ip->i_lfs_fragsize[lbn];
1516                 else
1517                         osize = fs->lfs_bsize;
1518                 LFS_SEGENTRY(sup, fs, oldsn, bp);
1519 #ifdef DIAGNOSTIC
1520                 if (sup->su_nbytes + sizeof (struct ufs1_dinode) * ndupino
1521                     < osize) {
1522                         printf("lfs_updatemeta: negative bytes "
1523                                "(segment %" PRIu32 " short by %" PRId64
1524                                ")\n", dtosn(fs, daddr),
1525                                (int64_t)osize -
1526                                (sizeof (struct ufs1_dinode) * ndupino +
1527                                 sup->su_nbytes));
1528                         printf("lfs_updatemeta: ino %llu, lbn %" PRId64
1529                                ", addr = 0x%" PRIx64 "\n",
1530                                (unsigned long long)ip->i_number, lbn, daddr);
1531                         printf("lfs_updatemeta: ndupino=%d\n", ndupino);
1532                         panic("lfs_updatemeta: negative bytes");
1533                         sup->su_nbytes = osize -
1534                             sizeof (struct ufs1_dinode) * ndupino;
1535                 }
1536 #endif
1537                 DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64
1538                       " db 0x%" PRIx64 "\n",
1539                       dtosn(fs, daddr), osize,
1540                       ip->i_number, lbn, daddr));
1541                 sup->su_nbytes -= osize;
1542                 if (!(bp->b_flags & B_GATHERED)) {
1543                         mutex_enter(&lfs_lock);
1544                         fs->lfs_flags |= LFS_IFDIRTY;
1545                         mutex_exit(&lfs_lock);
1546                 }
1547                 LFS_WRITESEGENTRY(sup, fs, oldsn, bp);
1548         }
1549         /*
1550          * Now that this block has a new address, and its old
1551          * segment no longer owns it, we can forget about its
1552          * old size.
1553          */
1554         if (lbn >= 0 && lbn < NDADDR)
1555                 ip->i_lfs_fragsize[lbn] = size;
1556 }
1557
1558 /*
1559  * Update the metadata that points to the blocks listed in the FINFO
1560  * array.
1561  */
1562 void
1563 lfs_updatemeta(struct segment *sp)
1564 {
1565         struct buf *sbp;
1566         struct lfs *fs;
1567         struct vnode *vp;
1568         daddr_t lbn;
1569         int i, nblocks, num;
1570         int bb;
1571         int bytesleft, size;
1572
1573         ASSERT_SEGLOCK(sp->fs);
1574         vp = sp->vp;
1575         nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
1576         KASSERT(nblocks >= 0);
1577         KASSERT(vp != NULL);
1578         if (nblocks == 0)
1579                 return;
1580
1581         /*
1582          * This count may be high due to oversize blocks from lfs_gop_write.
1583          * Correct for this. (XXX we should be able to keep track of these.)
1584          */
1585         fs = sp->fs;
1586         for (i = 0; i < nblocks; i++) {
1587                 if (sp->start_bpp[i] == NULL) {
1588                         DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks));
1589                         nblocks = i;
1590                         break;
1591                 }
1592                 num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize);
1593                 KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1);
1594                 nblocks -= num - 1;
1595         }
1596
1597         KASSERT(vp->v_type == VREG ||
1598            nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp);
1599         KASSERT(nblocks == sp->cbpp - sp->start_bpp);
1600
1601         /*
1602          * Sort the blocks.
1603          *
1604          * We have to sort even if the blocks come from the
1605          * cleaner, because there might be other pending blocks on the
1606          * same inode...and if we don't sort, and there are fragments
1607          * present, blocks may be written in the wrong place.
1608          */
1609         lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize);
1610
1611         /*
1612          * Record the length of the last block in case it's a fragment.
1613          * If there are indirect blocks present, they sort last.  An
1614          * indirect block will be lfs_bsize and its presence indicates
1615          * that you cannot have fragments.
1616          *
1617          * XXX This last is a lie.  A cleaned fragment can coexist with
1618          * XXX a later indirect block.  This will continue to be
1619          * XXX true until lfs_markv is fixed to do everything with
1620          * XXX fake blocks (including fake inodes and fake indirect blocks).
1621          */
1622         sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) &
1623                 fs->lfs_bmask) + 1;
1624
1625         /*
1626          * Assign disk addresses, and update references to the logical
1627          * block and the segment usage information.
1628          */
1629         for (i = nblocks; i--; ++sp->start_bpp) {
1630                 sbp = *sp->start_bpp;
1631                 lbn = *sp->start_lbp;
1632                 KASSERT(sbp->b_lblkno == lbn);
1633
1634                 sbp->b_blkno = fsbtodb(fs, fs->lfs_offset);
1635
1636                 /*
1637                  * If we write a frag in the wrong place, the cleaner won't
1638                  * be able to correctly identify its size later, and the
1639                  * segment will be uncleanable.  (Even worse, it will assume
1640                  * that the indirect block that actually ends the list
1641                  * is of a smaller size!)
1642                  */
1643                 if ((sbp->b_bcount & fs->lfs_bmask) && i != 0)
1644                         panic("lfs_updatemeta: fragment is not last block");
1645
1646                 /*
1647                  * For each subblock in this possibly oversized block,
1648                  * update its address on disk.
1649                  */
1650                 KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize);
1651                 KASSERT(vp == sbp->b_vp);
1652                 for (bytesleft = sbp->b_bcount; bytesleft > 0;
1653                      bytesleft -= fs->lfs_bsize) {
1654                         size = MIN(bytesleft, fs->lfs_bsize);
1655                         bb = numfrags(fs, size);
1656                         lbn = *sp->start_lbp++;
1657                         lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset,
1658                             size);
1659                         fs->lfs_offset += bb;
1660                 }
1661
1662         }
1663
1664         /* This inode has been modified */
1665         LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
1666 }
1667
1668 /*
1669  * Move lfs_offset to a segment earlier than sn.
1670  */
1671 int
1672 lfs_rewind(struct lfs *fs, int newsn)
1673 {
1674         int sn, osn, isdirty;
1675         struct buf *bp;
1676         SEGUSE *sup;
1677
1678         ASSERT_SEGLOCK(fs);
1679
1680         osn = dtosn(fs, fs->lfs_offset);
1681         if (osn < newsn)
1682                 return 0;
1683
1684         /* lfs_avail eats the remaining space in this segment */
1685         fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg);
1686
1687         /* Find a low-numbered segment */
1688         for (sn = 0; sn < fs->lfs_nseg; ++sn) {
1689                 LFS_SEGENTRY(sup, fs, sn, bp);
1690                 isdirty = sup->su_flags & SEGUSE_DIRTY;
1691                 brelse(bp, 0);
1692
1693                 if (!isdirty)
1694                         break;
1695         }
1696         if (sn == fs->lfs_nseg)
1697                 panic("lfs_rewind: no clean segments");
1698         if (newsn >= 0 && sn >= newsn)
1699                 return ENOENT;
1700         fs->lfs_nextseg = sn;
1701         lfs_newseg(fs);
1702         fs->lfs_offset = fs->lfs_curseg;
1703
1704         return 0;
1705 }
1706
1707 /*
1708  * Start a new partial segment.
1709  *
1710  * Return 1 when we entered to a new segment.
1711  * Otherwise, return 0.
1712  */
1713 int
1714 lfs_initseg(struct lfs *fs)
1715 {
1716         struct segment *sp = fs->lfs_sp;
1717         SEGSUM *ssp;
1718         struct buf *sbp;        /* buffer for SEGSUM */
1719         int repeat = 0;         /* return value */
1720
1721         ASSERT_SEGLOCK(fs);
1722         /* Advance to the next segment. */
1723         if (!LFS_PARTIAL_FITS(fs)) {
1724                 SEGUSE *sup;
1725                 struct buf *bp;
1726
1727                 /* lfs_avail eats the remaining space */
1728                 fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset -
1729                                                    fs->lfs_curseg);
1730                 /* Wake up any cleaning procs waiting on this file system. */
1731                 lfs_wakeup_cleaner(fs);
1732                 lfs_newseg(fs);
1733                 repeat = 1;
1734                 fs->lfs_offset = fs->lfs_curseg;
1735
1736                 sp->seg_number = dtosn(fs, fs->lfs_curseg);
1737                 sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg);
1738
1739                 /*
1740                  * If the segment contains a superblock, update the offset
1741                  * and summary address to skip over it.
1742                  */
1743                 LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
1744                 if (sup->su_flags & SEGUSE_SUPERBLOCK) {
1745                         fs->lfs_offset += btofsb(fs, LFS_SBPAD);
1746                         sp->seg_bytes_left -= LFS_SBPAD;
1747                 }
1748                 brelse(bp, 0);
1749                 /* Segment zero could also contain the labelpad */
1750                 if (fs->lfs_version > 1 && sp->seg_number == 0 &&
1751                     fs->lfs_start < btofsb(fs, LFS_LABELPAD)) {
1752                         fs->lfs_offset +=
1753                             btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
1754                         sp->seg_bytes_left -=
1755                             LFS_LABELPAD - fsbtob(fs, fs->lfs_start);
1756                 }
1757         } else {
1758                 sp->seg_number = dtosn(fs, fs->lfs_curseg);
1759                 sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg -
1760                                       (fs->lfs_offset - fs->lfs_curseg));
1761         }
1762         fs->lfs_lastpseg = fs->lfs_offset;
1763
1764         /* Record first address of this partial segment */
1765         if (sp->seg_flags & SEGM_CLEAN) {
1766                 fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset;
1767                 if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) {
1768                         /* "1" is the artificial inc in lfs_seglock */
1769                         mutex_enter(&lfs_lock);
1770                         while (fs->lfs_iocount > 1) {
1771                                 mtsleep(&fs->lfs_iocount, PRIBIO + 1,
1772                                     "lfs_initseg", 0, &lfs_lock);
1773                         }
1774                         mutex_exit(&lfs_lock);
1775                         fs->lfs_cleanind = 0;
1776                 }
1777         }
1778
1779         sp->fs = fs;
1780         sp->ibp = NULL;
1781         sp->idp = NULL;
1782         sp->ninodes = 0;
1783         sp->ndupino = 0;
1784
1785         sp->cbpp = sp->bpp;
1786
1787         /* Get a new buffer for SEGSUM */
1788         sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
1789             fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY);
1790
1791         /* ... and enter it into the buffer list. */
1792         *sp->cbpp = sbp;
1793         sp->cbpp++;
1794         fs->lfs_offset += btofsb(fs, fs->lfs_sumsize);
1795
1796         sp->start_bpp = sp->cbpp;
1797
1798         /* Set point to SEGSUM, initialize it. */
1799         ssp = sp->segsum = sbp->b_data;
1800         memset(ssp, 0, fs->lfs_sumsize);
1801         ssp->ss_next = fs->lfs_nextseg;
1802         ssp->ss_nfinfo = ssp->ss_ninos = 0;
1803         ssp->ss_magic = SS_MAGIC;
1804
1805         /* Set pointer to first FINFO, initialize it. */
1806         sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs));
1807         sp->fip->fi_nblocks = 0;
1808         sp->start_lbp = &sp->fip->fi_blocks[0];
1809         sp->fip->fi_lastlength = 0;
1810
1811         sp->seg_bytes_left -= fs->lfs_sumsize;
1812         sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs);
1813
1814         return (repeat);
1815 }
1816
1817 /*
1818  * Remove SEGUSE_INVAL from all segments.
1819  */
1820 void
1821 lfs_unset_inval_all(struct lfs *fs)
1822 {
1823         SEGUSE *sup;
1824         struct buf *bp;
1825         int i;
1826
1827         for (i = 0; i < fs->lfs_nseg; i++) {
1828                 LFS_SEGENTRY(sup, fs, i, bp);
1829                 if (sup->su_flags & SEGUSE_INVAL) {
1830                         sup->su_flags &= ~SEGUSE_INVAL;
1831                         LFS_WRITESEGENTRY(sup, fs, i, bp);
1832                 } else
1833                         brelse(bp, 0);
1834         }
1835 }
1836
1837 /*
1838  * Return the next segment to write.
1839  */
1840 void
1841 lfs_newseg(struct lfs *fs)
1842 {
1843         CLEANERINFO *cip;
1844         SEGUSE *sup;
1845         struct buf *bp;
1846         int curseg, isdirty, sn, skip_inval;
1847
1848         ASSERT_SEGLOCK(fs);
1849
1850         /* Honor LFCNWRAPSTOP */
1851         mutex_enter(&lfs_lock);
1852         while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
1853                 if (fs->lfs_wrappass) {
1854                         log(LOG_NOTICE, "%s: wrappass=%d\n",
1855                                 fs->lfs_fsmnt, fs->lfs_wrappass);
1856                         fs->lfs_wrappass = 0;
1857                         break;
1858                 }
1859                 fs->lfs_wrapstatus = LFS_WRAP_WAITING;
1860                 wakeup(&fs->lfs_nowrap);
1861                 log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt);
1862                 mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz,
1863                         &lfs_lock);
1864         }
1865         fs->lfs_wrapstatus = LFS_WRAP_GOING;
1866         mutex_exit(&lfs_lock);
1867
1868         LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
1869         DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n",
1870               dtosn(fs, fs->lfs_nextseg)));
1871         sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
1872         sup->su_nbytes = 0;
1873         sup->su_nsums = 0;
1874         sup->su_ninos = 0;
1875         LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
1876
1877         LFS_CLEANERINFO(cip, fs, bp);
1878         --cip->clean;
1879         ++cip->dirty;
1880         fs->lfs_nclean = cip->clean;
1881         LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
1882
1883         fs->lfs_lastseg = fs->lfs_curseg;
1884         fs->lfs_curseg = fs->lfs_nextseg;
1885         skip_inval = 1;
1886         for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) {
1887                 sn = (sn + 1) % fs->lfs_nseg;
1888
1889                 if (sn == curseg) {
1890                         if (skip_inval)
1891                                 skip_inval = 0;
1892                         else
1893                                 panic("lfs_nextseg: no clean segments");
1894                 }
1895                 LFS_SEGENTRY(sup, fs, sn, bp);
1896                 isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? SEGUSE_INVAL : 0));
1897                 /* Check SEGUSE_EMPTY as we go along */
1898                 if (isdirty && sup->su_nbytes == 0 &&
1899                     !(sup->su_flags & SEGUSE_EMPTY))
1900                         LFS_WRITESEGENTRY(sup, fs, sn, bp);
1901                 else
1902                         brelse(bp, 0);
1903
1904                 if (!isdirty)
1905                         break;
1906         }
1907         if (skip_inval == 0)
1908                 lfs_unset_inval_all(fs);
1909
1910         ++fs->lfs_nactive;
1911         fs->lfs_nextseg = sntod(fs, sn);
1912         if (lfs_dostats) {
1913                 ++lfs_stats.segsused;
1914         }
1915 }
1916
1917 static struct buf *
1918 lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr,
1919     int n)
1920 {
1921         struct lfs_cluster *cl;
1922         struct buf **bpp, *bp;
1923
1924         ASSERT_SEGLOCK(fs);
1925         cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK);
1926         bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK);
1927         memset(cl, 0, sizeof(*cl));
1928         cl->fs = fs;
1929         cl->bpp = bpp;
1930         cl->bufcount = 0;
1931         cl->bufsize = 0;
1932
1933         /* If this segment is being written synchronously, note that */
1934         if (fs->lfs_sp->seg_flags & SEGM_SYNC) {
1935                 cl->flags |= LFS_CL_SYNC;
1936                 cl->seg = fs->lfs_sp;
1937                 ++cl->seg->seg_iocount;
1938         }
1939
1940         /* Get an empty buffer header, or maybe one with something on it */
1941         bp = getiobuf(vp, true);
1942         bp->b_dev = NODEV;
1943         bp->b_blkno = bp->b_lblkno = addr;
1944         bp->b_iodone = lfs_cluster_callback;
1945         bp->b_private = cl;
1946
1947         return bp;
1948 }
1949
1950 int
1951 lfs_writeseg(struct lfs *fs, struct segment *sp)
1952 {
1953         struct buf **bpp, *bp, *cbp, *newbp, *unbusybp;
1954         SEGUSE *sup;
1955         SEGSUM *ssp;
1956         int i;
1957         int do_again, nblocks, byteoffset;
1958         size_t el_size;
1959         struct lfs_cluster *cl;
1960         u_short ninos;
1961         struct vnode *devvp;
1962         char *p = NULL;
1963         struct vnode *vp;
1964         int32_t *daddrp;        /* XXX ondisk32 */
1965         int changed;
1966         u_int32_t sum;
1967 #ifdef DEBUG
1968         FINFO *fip;
1969         int findex;
1970 #endif
1971
1972         ASSERT_SEGLOCK(fs);
1973
1974         ssp = (SEGSUM *)sp->segsum;
1975
1976         /*
1977          * If there are no buffers other than the segment summary to write,
1978          * don't do anything.  If we are the end of a dirop sequence, however,
1979          * write the empty segment summary anyway, to help out the
1980          * roll-forward agent.
1981          */
1982         if ((nblocks = sp->cbpp - sp->bpp) == 1) {
1983                 if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP)
1984                         return 0;
1985         }
1986
1987         /* Note if partial segment is being written by the cleaner */
1988         if (sp->seg_flags & SEGM_CLEAN)
1989                 ssp->ss_flags |= SS_CLEAN;
1990
1991         devvp = VTOI(fs->lfs_ivnode)->i_devvp;
1992
1993         /* Update the segment usage information. */
1994         LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
1995
1996         /* Loop through all blocks, except the segment summary. */
1997         for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
1998                 if ((*bpp)->b_vp != devvp) {
1999                         sup->su_nbytes += (*bpp)->b_bcount;
2000                         DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d"
2001                               " lbn %" PRId64 " db 0x%" PRIx64 "\n",
2002                               sp->seg_number, (*bpp)->b_bcount,
2003                               VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno,
2004                               (*bpp)->b_blkno));
2005                 }
2006         }
2007
2008 #ifdef DEBUG
2009         /* Check for zero-length and zero-version FINFO entries. */
2010         fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs));
2011         for (findex = 0; findex < ssp->ss_nfinfo; findex++) {
2012                 KDASSERT(fip->fi_nblocks > 0);
2013                 KDASSERT(fip->fi_version > 0);
2014                 fip = (FINFO *)((char *)fip + FINFOSIZE +
2015                         sizeof(int32_t) * fip->fi_nblocks);
2016         }
2017 #endif /* DEBUG */
2018
2019         ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
2020         DLOG((DLOG_SU, "seg %d += %d for %d inodes\n",
2021               sp->seg_number, ssp->ss_ninos * sizeof (struct ufs1_dinode),
2022               ssp->ss_ninos));
2023         sup->su_nbytes += ssp->ss_ninos * sizeof (struct ufs1_dinode);
2024         /* sup->su_nbytes += fs->lfs_sumsize; */
2025         if (fs->lfs_version == 1)
2026                 sup->su_olastmod = time_second;
2027         else
2028                 sup->su_lastmod = time_second;
2029         sup->su_ninos += ninos;
2030         ++sup->su_nsums;
2031         fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize);
2032
2033         do_again = !(bp->b_flags & B_GATHERED);
2034         LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */
2035
2036         /*
2037          * Mark blocks B_BUSY, to prevent then from being changed between
2038          * the checksum computation and the actual write.
2039          *
2040          * If we are cleaning, check indirect blocks for UNWRITTEN, and if
2041          * there are any, replace them with copies that have UNASSIGNED
2042          * instead.
2043          */
2044         mutex_enter(&bufcache_lock);
2045         for (bpp = sp->bpp, i = nblocks - 1; i--;) {
2046                 ++bpp;
2047                 bp = *bpp;
2048                 if (bp->b_iodone != NULL) {      /* UBC or malloced buffer */
2049                         bp->b_cflags |= BC_BUSY;
2050                         continue;
2051                 }
2052
2053                 while (bp->b_cflags & BC_BUSY) {
2054                         DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
2055                               " data summary corruption for ino %d, lbn %"
2056                               PRId64 "\n",
2057                               VTOI(bp->b_vp)->i_number, bp->b_lblkno));
2058                         bp->b_cflags |= BC_WANTED;
2059                         cv_wait(&bp->b_busy, &bufcache_lock);
2060                 }
2061                 bp->b_cflags |= BC_BUSY;
2062                 mutex_exit(&bufcache_lock);
2063                 unbusybp = NULL;
2064
2065                 /*
2066                  * Check and replace indirect block UNWRITTEN bogosity.
2067                  * XXX See comment in lfs_writefile.
2068                  */
2069                 if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp &&
2070                    VTOI(bp->b_vp)->i_ffs1_blocks !=
2071                    VTOI(bp->b_vp)->i_lfs_effnblks) {
2072                         DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n",
2073                               VTOI(bp->b_vp)->i_number,
2074                               VTOI(bp->b_vp)->i_lfs_effnblks,
2075                               VTOI(bp->b_vp)->i_ffs1_blocks));
2076                         /* Make a copy we'll make changes to */
2077                         newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
2078                                            bp->b_bcount, LFS_NB_IBLOCK);
2079                         newbp->b_blkno = bp->b_blkno;
2080                         memcpy(newbp->b_data, bp->b_data,
2081                                newbp->b_bcount);
2082
2083                         changed = 0;
2084                         /* XXX ondisk32 */
2085                         for (daddrp = (int32_t *)(newbp->b_data);
2086                              daddrp < (int32_t *)((char *)newbp->b_data +
2087                                                   newbp->b_bcount); daddrp++) {
2088                                 if (*daddrp == UNWRITTEN) {
2089                                         ++changed;
2090                                         *daddrp = 0;
2091                                 }
2092                         }
2093                         /*
2094                          * Get rid of the old buffer.  Don't mark it clean,
2095                          * though, if it still has dirty data on it.
2096                          */
2097                         if (changed) {
2098                                 DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):"
2099                                       " bp = %p newbp = %p\n", changed, bp,
2100                                       newbp));
2101                                 *bpp = newbp;
2102                                 bp->b_flags &= ~B_GATHERED;
2103                                 bp->b_error = 0;
2104                                 if (bp->b_iodone != NULL) {
2105                                         DLOG((DLOG_SEG, "lfs_writeseg: "
2106                                               "indir bp should not be B_CALL\n"));
2107                                         biodone(bp);
2108                                         bp = NULL;
2109                                 } else {
2110                                         /* Still on free list, leave it there */
2111                                         unbusybp = bp;
2112                                         /*
2113                                          * We have to re-decrement lfs_avail
2114                                          * since this block is going to come
2115                                          * back around to us in the next
2116                                          * segment.
2117                                          */
2118                                         fs->lfs_avail -=
2119                                             btofsb(fs, bp->b_bcount);
2120                                 }
2121                         } else {
2122                                 lfs_freebuf(fs, newbp);
2123                         }
2124                 }
2125                 mutex_enter(&bufcache_lock);
2126                 if (unbusybp != NULL) {
2127                         unbusybp->b_cflags &= ~BC_BUSY;
2128                         if (unbusybp->b_cflags & BC_WANTED)
2129                                 cv_broadcast(&bp->b_busy);
2130                 }
2131         }
2132         mutex_exit(&bufcache_lock);
2133
2134         /*
2135          * Compute checksum across data and then across summary; the first
2136          * block (the summary block) is skipped.  Set the create time here
2137          * so that it's guaranteed to be later than the inode mod times.
2138          */
2139         sum = 0;
2140         if (fs->lfs_version == 1)
2141                 el_size = sizeof(u_long);
2142         else
2143                 el_size = sizeof(u_int32_t);
2144         for (bpp = sp->bpp, i = nblocks - 1; i--; ) {
2145                 ++bpp;
2146                 /* Loop through gop_write cluster blocks */
2147                 for (byteoffset = 0; byteoffset < (*bpp)->b_bcount;
2148                      byteoffset += fs->lfs_bsize) {
2149 #ifdef LFS_USE_B_INVAL
2150                         if (((*bpp)->b_cflags & BC_INVAL) != 0 &&
2151                             (*bpp)->b_iodone != NULL) {
2152                                 if (copyin((void *)(*bpp)->b_saveaddr +
2153                                            byteoffset, dp, el_size)) {
2154                                         panic("lfs_writeseg: copyin failed [1]:"
2155                                                 " ino %d blk %" PRId64,
2156                                                 VTOI((*bpp)->b_vp)->i_number,
2157                                                 (*bpp)->b_lblkno);
2158                                 }
2159                         } else
2160 #endif /* LFS_USE_B_INVAL */
2161                         {
2162                                 sum = lfs_cksum_part((char *)
2163                                     (*bpp)->b_data + byteoffset, el_size, sum);
2164                         }
2165                 }
2166         }
2167         if (fs->lfs_version == 1)
2168                 ssp->ss_ocreate = time_second;
2169         else {
2170                 ssp->ss_create = time_second;
2171                 ssp->ss_serial = ++fs->lfs_serial;
2172                 ssp->ss_ident  = fs->lfs_ident;
2173         }
2174         ssp->ss_datasum = lfs_cksum_fold(sum);
2175         ssp->ss_sumsum = cksum(&ssp->ss_datasum,
2176             fs->lfs_sumsize - sizeof(ssp->ss_sumsum));
2177
2178         mutex_enter(&lfs_lock);
2179         fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) +
2180                           btofsb(fs, fs->lfs_sumsize));
2181         fs->lfs_dmeta += (btofsb(fs, ninos * fs->lfs_ibsize) +
2182                           btofsb(fs, fs->lfs_sumsize));
2183         mutex_exit(&lfs_lock);
2184
2185         /*
2186          * When we simply write the blocks we lose a rotation for every block
2187          * written.  To avoid this problem, we cluster the buffers into a
2188          * chunk and write the chunk.  MAXPHYS is the largest size I/O
2189          * devices can handle, use that for the size of the chunks.
2190          *
2191          * Blocks that are already clusters (from GOP_WRITE), however, we
2192          * don't bother to copy into other clusters.
2193          */
2194
2195 #define CHUNKSIZE MAXPHYS
2196
2197         if (devvp == NULL)
2198                 panic("devvp is NULL");
2199         for (bpp = sp->bpp, i = nblocks; i;) {
2200                 cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
2201                 cl = cbp->b_private;
2202
2203                 cbp->b_flags |= B_ASYNC;
2204                 cbp->b_cflags |= BC_BUSY;
2205                 cbp->b_bcount = 0;
2206
2207 #if defined(DEBUG) && defined(DIAGNOSTIC)
2208                 if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs))
2209                     / sizeof(int32_t)) {
2210                         panic("lfs_writeseg: real bpp overwrite");
2211                 }
2212                 if (bpp - sp->bpp > segsize(fs) / fs->lfs_fsize) {
2213                         panic("lfs_writeseg: theoretical bpp overwrite");
2214                 }
2215 #endif
2216
2217                 /*
2218                  * Construct the cluster.
2219                  */
2220                 mutex_enter(&lfs_lock);
2221                 ++fs->lfs_iocount;
2222                 mutex_exit(&lfs_lock);
2223                 while (i && cbp->b_bcount < CHUNKSIZE) {
2224                         bp = *bpp;
2225
2226                         if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
2227                                 break;
2228                         if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC))
2229                                 break;
2230
2231                         /* Clusters from GOP_WRITE are expedited */
2232                         if (bp->b_bcount > fs->lfs_bsize) {
2233                                 if (cbp->b_bcount > 0)
2234                                         /* Put in its own buffer */
2235                                         break;
2236                                 else {
2237                                         cbp->b_data = bp->b_data;
2238                                 }
2239                         } else if (cbp->b_bcount == 0) {
2240                                 p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE,
2241                                                              LFS_NB_CLUSTER);
2242                                 cl->flags |= LFS_CL_MALLOC;
2243                         }
2244 #ifdef DIAGNOSTIC
2245                         if (dtosn(fs, dbtofsb(fs, bp->b_blkno +
2246                                               btodb(bp->b_bcount - 1))) !=
2247                             sp->seg_number) {
2248                                 printf("blk size %d daddr %" PRIx64
2249                                     " not in seg %d\n",
2250                                     bp->b_bcount, bp->b_blkno,
2251                                     sp->seg_number);
2252                                 panic("segment overwrite");
2253                         }
2254 #endif
2255
2256 #ifdef LFS_USE_B_INVAL
2257                         /*
2258                          * Fake buffers from the cleaner are marked as B_INVAL.
2259                          * We need to copy the data from user space rather than
2260                          * from the buffer indicated.
2261                          * XXX == what do I do on an error?
2262                          */
2263                         if ((bp->b_cflags & BC_INVAL) != 0 &&
2264                             bp->b_iodone != NULL) {
2265                                 if (copyin(bp->b_saveaddr, p, bp->b_bcount))
2266                                         panic("lfs_writeseg: "
2267                                             "copyin failed [2]");
2268                         } else
2269 #endif /* LFS_USE_B_INVAL */
2270                         if (cl->flags & LFS_CL_MALLOC) {
2271                                 /* copy data into our cluster. */
2272                                 memcpy(p, bp->b_data, bp->b_bcount);
2273                                 p += bp->b_bcount;
2274                         }
2275
2276                         cbp->b_bcount += bp->b_bcount;
2277                         cl->bufsize += bp->b_bcount;
2278
2279                         bp->b_flags &= ~B_READ;
2280                         bp->b_error = 0;
2281                         cl->bpp[cl->bufcount++] = bp;
2282
2283                         vp = bp->b_vp;
2284                         mutex_enter(&bufcache_lock);
2285                         mutex_enter(vp->v_interlock);
2286                         bp->b_oflags &= ~(BO_DELWRI | BO_DONE);
2287                         reassignbuf(bp, vp);
2288                         vp->v_numoutput++;
2289                         mutex_exit(vp->v_interlock);
2290                         mutex_exit(&bufcache_lock);
2291
2292                         bpp++;
2293                         i--;
2294                 }
2295                 if (fs->lfs_sp->seg_flags & SEGM_SYNC)
2296                         BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL);
2297                 else
2298                         BIO_SETPRIO(cbp, BPRIO_TIMELIMITED);
2299                 mutex_enter(devvp->v_interlock);
2300                 devvp->v_numoutput++;
2301                 mutex_exit(devvp->v_interlock);
2302                 VOP_STRATEGY(devvp, cbp);
2303                 curlwp->l_ru.ru_oublock++;
2304         }
2305
2306         if (lfs_dostats) {
2307                 ++lfs_stats.psegwrites;
2308                 lfs_stats.blocktot += nblocks - 1;
2309                 if (fs->lfs_sp->seg_flags & SEGM_SYNC)
2310                         ++lfs_stats.psyncwrites;
2311                 if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
2312                         ++lfs_stats.pcleanwrites;
2313                         lfs_stats.cleanblocks += nblocks - 1;
2314                 }
2315         }
2316
2317         return (lfs_initseg(fs) || do_again);
2318 }
2319
2320 void
2321 lfs_writesuper(struct lfs *fs, daddr_t daddr)
2322 {
2323         struct buf *bp;
2324         struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2325         int s;
2326
2327         ASSERT_MAYBE_SEGLOCK(fs);
2328 #ifdef DIAGNOSTIC
2329         KASSERT(fs->lfs_magic == LFS_MAGIC);
2330 #endif
2331         /*
2332          * If we can write one superblock while another is in
2333          * progress, we risk not having a complete checkpoint if we crash.
2334          * So, block here if a superblock write is in progress.
2335          */
2336         mutex_enter(&lfs_lock);
2337         s = splbio();
2338         while (fs->lfs_sbactive) {
2339                 mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0,
2340                         &lfs_lock);
2341         }
2342         fs->lfs_sbactive = daddr;
2343         splx(s);
2344         mutex_exit(&lfs_lock);
2345
2346         /* Set timestamp of this version of the superblock */
2347         if (fs->lfs_version == 1)
2348                 fs->lfs_otstamp = time_second;
2349         fs->lfs_tstamp = time_second;
2350
2351         /* Checksum the superblock and copy it into a buffer. */
2352         fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs));
2353         bp = lfs_newbuf(fs, devvp,
2354             fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK);
2355         memset((char *)bp->b_data + sizeof(struct dlfs), 0,
2356             LFS_SBPAD - sizeof(struct dlfs));
2357         *(struct dlfs *)bp->b_data = fs->lfs_dlfs;
2358
2359         bp->b_cflags |= BC_BUSY;
2360         bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC;
2361         bp->b_oflags &= ~(BO_DONE | BO_DELWRI);
2362         bp->b_error = 0;
2363         bp->b_iodone = lfs_supercallback;
2364
2365         if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC)
2366                 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
2367         else
2368                 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
2369         curlwp->l_ru.ru_oublock++;
2370
2371         mutex_enter(devvp->v_interlock);
2372         devvp->v_numoutput++;
2373         mutex_exit(devvp->v_interlock);
2374
2375         mutex_enter(&lfs_lock);
2376         ++fs->lfs_iocount;
2377         mutex_exit(&lfs_lock);
2378         VOP_STRATEGY(devvp, bp);
2379 }
2380
2381 /*
2382  * Logical block number match routines used when traversing the dirty block
2383  * chain.
2384  */
int
lfs_match_fake(struct lfs *fs, struct buf *bp)
{
        int is_fake;

        ASSERT_SEGLOCK(fs);

        /* Nonzero when LFS_IS_MALLOC_BUF() classifies bp as a malloc buf. */
        is_fake = LFS_IS_MALLOC_BUF(bp);
        return is_fake;
}
2392
#if 0
/*
 * Currently compiled out.  Matches buffers that lfs_match_data() accepts
 * and lfs_match_fake() rejects.
 */
int
lfs_match_real(struct lfs *fs, struct buf *bp)
{

        ASSERT_SEGLOCK(fs);
        if (!lfs_match_data(fs, bp))
                return 0;
        return !lfs_match_fake(fs, bp);
}
#endif
2402
2403 int
2404 lfs_match_data(struct lfs *fs, struct buf *bp)
2405 {
2406
2407         ASSERT_SEGLOCK(fs);
2408         return (bp->b_lblkno >= 0);
2409 }
2410
2411 int
2412 lfs_match_indir(struct lfs *fs, struct buf *bp)
2413 {
2414         daddr_t lbn;
2415
2416         ASSERT_SEGLOCK(fs);
2417         lbn = bp->b_lblkno;
2418         return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
2419 }
2420
2421 int
2422 lfs_match_dindir(struct lfs *fs, struct buf *bp)
2423 {
2424         daddr_t lbn;
2425
2426         ASSERT_SEGLOCK(fs);
2427         lbn = bp->b_lblkno;
2428         return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
2429 }
2430
2431 int
2432 lfs_match_tindir(struct lfs *fs, struct buf *bp)
2433 {
2434         daddr_t lbn;
2435
2436         ASSERT_SEGLOCK(fs);
2437         lbn = bp->b_lblkno;
2438         return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
2439 }
2440
2441 static void
2442 lfs_free_aiodone(struct buf *bp)
2443 {
2444         struct lfs *fs;
2445
2446         KERNEL_LOCK(1, curlwp);
2447         fs = bp->b_private;
2448         ASSERT_NO_SEGLOCK(fs);
2449         lfs_freebuf(fs, bp);
2450         KERNEL_UNLOCK_LAST(curlwp);
2451 }
2452
2453 static void
2454 lfs_super_aiodone(struct buf *bp)
2455 {
2456         struct lfs *fs;
2457
2458         KERNEL_LOCK(1, curlwp);
2459         fs = bp->b_private;
2460         ASSERT_NO_SEGLOCK(fs);
2461         mutex_enter(&lfs_lock);
2462         fs->lfs_sbactive = 0;
2463         if (--fs->lfs_iocount <= 1)
2464                 wakeup(&fs->lfs_iocount);
2465         wakeup(&fs->lfs_sbactive);
2466         mutex_exit(&lfs_lock);
2467         lfs_freebuf(fs, bp);
2468         KERNEL_UNLOCK_LAST(curlwp);
2469 }
2470
2471 static void
2472 lfs_cluster_aiodone(struct buf *bp)
2473 {
2474         struct lfs_cluster *cl;
2475         struct lfs *fs;
2476         struct buf *tbp, *fbp;
2477         struct vnode *vp, *devvp, *ovp;
2478         struct inode *ip;
2479         int error;
2480
2481         KERNEL_LOCK(1, curlwp);
2482
2483         error = bp->b_error;
2484         cl = bp->b_private;
2485         fs = cl->fs;
2486         devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2487         ASSERT_NO_SEGLOCK(fs);
2488
2489         /* Put the pages back, and release the buffer */
2490         while (cl->bufcount--) {
2491                 tbp = cl->bpp[cl->bufcount];
2492                 KASSERT(tbp->b_cflags & BC_BUSY);
2493                 if (error) {
2494                         tbp->b_error = error;
2495                 }
2496
2497                 /*
2498                  * We're done with tbp.  If it has not been re-dirtied since
2499                  * the cluster was written, free it.  Otherwise, keep it on
2500                  * the locked list to be written again.
2501                  */
2502                 vp = tbp->b_vp;
2503
2504                 tbp->b_flags &= ~B_GATHERED;
2505
2506                 LFS_BCLEAN_LOG(fs, tbp);
2507
2508                 mutex_enter(&bufcache_lock);
2509                 if (tbp->b_iodone == NULL) {
2510                         KASSERT(tbp->b_flags & B_LOCKED);
2511                         bremfree(tbp);
2512                         if (vp) {
2513                                 mutex_enter(vp->v_interlock);
2514                                 reassignbuf(tbp, vp);
2515                                 mutex_exit(vp->v_interlock);
2516                         }
2517                         tbp->b_flags |= B_ASYNC; /* for biodone */
2518                 }
2519
2520                 if (((tbp->b_flags | tbp->b_oflags) &
2521                     (B_LOCKED | BO_DELWRI)) == B_LOCKED)
2522                         LFS_UNLOCK_BUF(tbp);
2523
2524                 if (tbp->b_oflags & BO_DONE) {
2525                         DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n",
2526                                 cl->bufcount, (long)tbp->b_flags));
2527                 }
2528
2529                 if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) {
2530                         /*
2531                          * A buffer from the page daemon.
2532                          * We use the same iodone as it does,
2533                          * so we must manually disassociate its
2534                          * buffers from the vp.
2535                          */
2536                         if ((ovp = tbp->b_vp) != NULL) {
2537                                 /* This is just silly */
2538                                 mutex_enter(ovp->v_interlock);
2539                                 brelvp(tbp);
2540                                 mutex_exit(ovp->v_interlock);
2541                                 tbp->b_vp = vp;
2542                                 tbp->b_objlock = vp->v_interlock;
2543                         }
2544                         /* Put it back the way it was */
2545                         tbp->b_flags |= B_ASYNC;
2546                         /* Master buffers have BC_AGE */
2547                         if (tbp->b_private == tbp)
2548                                 tbp->b_cflags |= BC_AGE;
2549                 }
2550                 mutex_exit(&bufcache_lock);
2551
2552                 biodone(tbp);
2553
2554                 /*
2555                  * If this is the last block for this vnode, but
2556                  * there are other blocks on its dirty list,
2557                  * set IN_MODIFIED/IN_CLEANING depending on what
2558                  * sort of block.  Only do this for our mount point,
2559                  * not for, e.g., inode blocks that are attached to
2560                  * the devvp.
2561                  * XXX KS - Shouldn't we set *both* if both types
2562                  * of blocks are present (traverse the dirty list?)
2563                  */
2564                 mutex_enter(&lfs_lock);
2565                 mutex_enter(vp->v_interlock);
2566                 if (vp != devvp && vp->v_numoutput == 0 &&
2567                     (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) {
2568                         ip = VTOI(vp);
2569                         DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n",
2570                                ip->i_number));
2571                         if (LFS_IS_MALLOC_BUF(fbp))
2572                                 LFS_SET_UINO(ip, IN_CLEANING);
2573                         else
2574                                 LFS_SET_UINO(ip, IN_MODIFIED);
2575                 }
2576                 cv_broadcast(&vp->v_cv);
2577                 mutex_exit(vp->v_interlock);
2578                 mutex_exit(&lfs_lock);
2579         }
2580
2581         /* Fix up the cluster buffer, and release it */
2582         if (cl->flags & LFS_CL_MALLOC)
2583                 lfs_free(fs, bp->b_data, LFS_NB_CLUSTER);
2584         putiobuf(bp);
2585
2586         /* Note i/o done */
2587         if (cl->flags & LFS_CL_SYNC) {
2588                 if (--cl->seg->seg_iocount == 0)
2589                         wakeup(&cl->seg->seg_iocount);
2590         }
2591         mutex_enter(&lfs_lock);
2592 #ifdef DIAGNOSTIC
2593         if (fs->lfs_iocount == 0)
2594                 panic("lfs_cluster_aiodone: zero iocount");
2595 #endif
2596         if (--fs->lfs_iocount <= 1)
2597                 wakeup(&fs->lfs_iocount);
2598         mutex_exit(&lfs_lock);
2599
2600         KERNEL_UNLOCK_LAST(curlwp);
2601
2602         pool_put(&fs->lfs_bpppool, cl->bpp);
2603         cl->bpp = NULL;
2604         pool_put(&fs->lfs_clpool, cl);
2605 }
2606
2607 static void
2608 lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *))
2609 {
2610         /* reset b_iodone for when this is a single-buf i/o. */
2611         bp->b_iodone = aiodone;
2612
2613         workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL);
2614 }
2615
2616 static void
2617 lfs_cluster_callback(struct buf *bp)
2618 {
2619
2620         lfs_generic_callback(bp, lfs_cluster_aiodone);
2621 }
2622
2623 void
2624 lfs_supercallback(struct buf *bp)
2625 {
2626
2627         lfs_generic_callback(bp, lfs_super_aiodone);
2628 }
2629
2630 /*
2631  * The only buffers that are going to hit these functions are the
2632  * segment write blocks, or the segment summaries, or the superblocks.
2633  *
2634  * All of the above are created by lfs_newbuf, and so do not need to be
2635  * released via brelse.
2636  */
2637 void
2638 lfs_callback(struct buf *bp)
2639 {
2640
2641         lfs_generic_callback(bp, lfs_free_aiodone);
2642 }
2643
/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we cast the array
 * of logical block numbers to an unsigned type in this routine so that
 * the negative block numbers (meta data blocks) sort AFTER the data blocks.
 */
2657
2658 void
2659 lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size)
2660 {
2661         static int __rsshell_increments[] = { 4, 1, 0 };
2662         int incr, *incrp, t1, t2;
2663         struct buf *bp_temp;
2664
2665 #ifdef DEBUG
2666         incr = 0;
2667         for (t1 = 0; t1 < nmemb; t1++) {
2668                 for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
2669                         if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) {
2670                                 /* dump before panic */
2671                                 printf("lfs_shellsort: nmemb=%d, size=%d\n",
2672                                     nmemb, size);
2673                                 incr = 0;
2674                                 for (t1 = 0; t1 < nmemb; t1++) {
2675                                         const struct buf *bp = bp_array[t1];
2676
2677                                         printf("bp[%d]: lbn=%" PRIu64 ", size=%"
2678                                             PRIu64 "\n", t1,
2679                                             (uint64_t)bp->b_bcount,
2680                                             (uint64_t)bp->b_lblkno);
2681                                         printf("lbns:");
2682                                         for (t2 = 0; t2 * size < bp->b_bcount;
2683                                             t2++) {
2684                                                 printf(" %" PRId32,
2685                                                     lb_array[incr++]);
2686                                         }
2687                                         printf("\n");
2688                                 }
2689                                 panic("lfs_shellsort: inconsistent input");
2690                         }
2691                 }
2692         }
2693 #endif
2694
2695         for (incrp = __rsshell_increments; (incr = *incrp++) != 0;)
2696                 for (t1 = incr; t1 < nmemb; ++t1)
2697                         for (t2 = t1 - incr; t2 >= 0;)
2698                                 if ((u_int32_t)bp_array[t2]->b_lblkno >
2699                                     (u_int32_t)bp_array[t2 + incr]->b_lblkno) {
2700                                         bp_temp = bp_array[t2];
2701                                         bp_array[t2] = bp_array[t2 + incr];
2702                                         bp_array[t2 + incr] = bp_temp;
2703                                         t2 -= incr;
2704                                 } else
2705                                         break;
2706
2707         /* Reform the list of logical blocks */
2708         incr = 0;
2709         for (t1 = 0; t1 < nmemb; t1++) {
2710                 for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
2711                         lb_array[incr++] = bp_array[t1]->b_lblkno + t2;
2712                 }
2713         }
2714 }
2715
2716 /*
2717  * Call vget with LK_NOWAIT.  If we are the one who holds VI_XLOCK,
2718  * however, we must press on.  Just fake success in that case.
2719  */
2720 int
2721 lfs_vref(struct vnode *vp)
2722 {
2723         int error;
2724         struct lfs *fs;
2725
2726         KASSERT(mutex_owned(vp->v_interlock));
2727
2728         fs = VTOI(vp)->i_lfs;
2729
2730         ASSERT_MAYBE_SEGLOCK(fs);
2731
2732         /*
2733          * If we return 1 here during a flush, we risk vinvalbuf() not
2734          * being able to flush all of the pages from this vnode, which
2735          * will cause it to panic.  So, return 0 if a flush is in progress.
2736          */
2737         error = vget(vp, LK_NOWAIT);
2738         if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
2739                 ++fs->lfs_flushvp_fakevref;
2740                 return 0;
2741         }
2742         return error;
2743 }
2744
2745 /*
2746  * This is vrele except that we do not want to VOP_INACTIVE this vnode. We
2747  * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end.
2748  */
2749 void
2750 lfs_vunref(struct vnode *vp)
2751 {
2752         struct lfs *fs;
2753
2754         fs = VTOI(vp)->i_lfs;
2755         ASSERT_MAYBE_SEGLOCK(fs);
2756
2757         /*
2758          * Analogous to lfs_vref, if the node is flushing, fake it.
2759          */
2760         if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) {
2761                 --fs->lfs_flushvp_fakevref;
2762                 return;
2763         }
2764
2765         /* does not call inactive */
2766         mutex_enter(vp->v_interlock);
2767         vrelel(vp, 0);
2768 }
2769
2770 /*
2771  * We use this when we have vnodes that were loaded in solely for cleaning.
2772  * There is no reason to believe that these vnodes will be referenced again
2773  * soon, since the cleaning process is unrelated to normal filesystem
2774  * activity.  Putting cleaned vnodes at the tail of the list has the effect
2775  * of flushing the vnode LRU.  So, put vnodes that were loaded only for
2776  * cleaning at the head of the list, instead.
2777  */
2778 void
2779 lfs_vunref_head(struct vnode *vp)
2780 {
2781
2782         ASSERT_SEGLOCK(VTOI(vp)->i_lfs);
2783
2784         /* does not call inactive, inserts non-held vnode at head of freelist */
2785         mutex_enter(vp->v_interlock);
2786         vrelel(vp, 0);
2787 }
2788
2789
2790 /*
2791  * Set up an FINFO entry for a new file.  The fip pointer is assumed to
2792  * point at uninitialized space.
2793  */
2794 void
2795 lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers)
2796 {
2797         struct segment *sp = fs->lfs_sp;
2798
2799         KASSERT(vers > 0);
2800
2801         if (sp->seg_bytes_left < fs->lfs_bsize ||
2802             sp->sum_bytes_left < sizeof(struct finfo))
2803                 (void) lfs_writeseg(fs, fs->lfs_sp);
2804
2805         sp->sum_bytes_left -= FINFOSIZE;
2806         ++((SEGSUM *)(sp->segsum))->ss_nfinfo;
2807         sp->fip->fi_nblocks = 0;
2808         sp->fip->fi_ino = ino;
2809         sp->fip->fi_version = vers;
2810 }
2811
2812 /*
2813  * Release the FINFO entry, either clearing out an unused entry or
2814  * advancing us to the next available entry.
2815  */
2816 void
2817 lfs_release_finfo(struct lfs *fs)
2818 {
2819         struct segment *sp = fs->lfs_sp;
2820
2821         if (sp->fip->fi_nblocks != 0) {
2822                 sp->fip = (FINFO*)((char *)sp->fip + FINFOSIZE +
2823                         sizeof(int32_t) * sp->fip->fi_nblocks);
2824                 sp->start_lbp = &sp->fip->fi_blocks[0];
2825         } else {
2826                 sp->sum_bytes_left += FINFOSIZE;
2827                 --((SEGSUM *)(sp->segsum))->ss_nfinfo;
2828         }
2829 }