sys/ufs/lfs/lfs_rfw.c

   1 /*      $NetBSD: lfs_rfw.c,v 1.32 2015/10/03 08:27:55 dholland Exp $    */
   2
   3 /*-
   4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Konrad E. Schroder <perseant@hhhh.org>.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.32 2015/10/03 08:27:55 dholland Exp $");
  34
  35 #if defined(_KERNEL_OPT)
  36 #include "opt_quota.h"
  37 #endif
  38
  39 #include <sys/param.h>
  40 #include <sys/systm.h>
  41 #include <sys/namei.h>
  42 #include <sys/proc.h>
  43 #include <sys/kernel.h>
  44 #include <sys/vnode.h>
  45 #include <sys/mount.h>
  46 #include <sys/kthread.h>
  47 #include <sys/buf.h>
  48 #include <sys/device.h>
  49 #include <sys/mbuf.h>
  50 #include <sys/file.h>
  51 #include <sys/disklabel.h>
  52 #include <sys/ioctl.h>
  53 #include <sys/errno.h>
  54 #include <sys/malloc.h>
  55 #include <sys/pool.h>
  56 #include <sys/socket.h>
  57 #include <sys/syslog.h>
  58 #include <uvm/uvm_extern.h>
  59 #include <sys/sysctl.h>
  60 #include <sys/conf.h>
  61 #include <sys/kauth.h>
  62
  63 #include <miscfs/specfs/specdev.h>
  64
  65 #include <ufs/lfs/ulfs_quotacommon.h>
  66 #include <ufs/lfs/ulfs_inode.h>
  67 #include <ufs/lfs/ulfsmount.h>
  68 #include <ufs/lfs/ulfs_extern.h>
  69
  70 #include <uvm/uvm.h>
  71 #include <uvm/uvm_stat.h>
  72 #include <uvm/uvm_pager.h>
  73 #include <uvm/uvm_pdaemon.h>
  74
  75 #include <ufs/lfs/lfs.h>
  76 #include <ufs/lfs/lfs_accessors.h>
  77 #include <ufs/lfs/lfs_kernel.h>
  78 #include <ufs/lfs/lfs_extern.h>
  79
  80 #include <miscfs/genfs/genfs.h>
  81 #include <miscfs/genfs/genfs_node.h>
  82
  83 /*
  84  * Roll-forward code.
  85  */
  86 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
  87     kauth_cred_t, int, int *, struct lwp *);
  88
  89 extern int lfs_do_rfw;
  90
  91 /*
  92  * Allocate a particular inode with a particular version number, freeing
  93  * any previous versions of this inode that may have gone before.
  94  * Used by the roll-forward code.
  95  *
  96  * XXX this function does not have appropriate locking to be used on a live fs;
  97  * XXX but something similar could probably be used for an "undelete" call.
  98  *
  99  * Called with the Ifile inode locked.
 100  */
 101 int
 102 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
 103               struct vnode **vpp)
 104 {
 105         struct vattr va;
 106         struct vnode *vp;
 107         struct inode *ip;
 108         int error;
 109
 110         ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
 111
 112         /*
 113          * First, just try a vget. If the version number is the one we want,
 114          * we don't have to do anything else.  If the version number is wrong,
 115          * take appropriate action.
 116          */
 117         error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp);
 118         if (error == 0) {
 119                 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp));
 120
 121                 *vpp = vp;
 122                 ip = VTOI(vp);
 123                 if (ip->i_gen == vers)
 124                         return 0;
 125                 else if (ip->i_gen < vers) {
 126                         lfs_truncate(vp, (off_t)0, 0, NOCRED);
 127                         ip->i_gen = vers;
 128                         lfs_dino_setgen(fs, ip->i_din, vers);
 129                         LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
 130                         return 0;
 131                 } else {
 132                         DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
 133                                ino, vers, lfs_dino_getgen(fs, ip->i_din)));
 134                         vput(vp);
 135                         *vpp = NULLVP;
 136                         return EEXIST;
 137                 }
 138         }
 139
 140         /* Not found, create as regular file. */
 141         vattr_null(&va);
 142         va.va_type = VREG;
 143         va.va_mode = 0;
 144         va.va_fileid = ino;
 145         va.va_gen = vers;
 146         error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, &vp);
 147         if (error)
 148                 return error;
 149         error = vn_lock(vp, LK_EXCLUSIVE);
 150         if (error) {
 151                 vrele(vp);
 152                 *vpp = NULLVP;
 153                 return error;
 154         }
 155         ip = VTOI(vp);
 156         ip->i_nlink = 1;
 157         lfs_dino_setnlink(fs, ip->i_din, 1);
 158         *vpp = vp;
 159         return 0;
 160 }
 161
 162 /*
 163  * Load the appropriate indirect block, and change the appropriate pointer.
 164  * Mark the block dirty.  Do segment and avail accounting.
 165  */
 166 static int
 167 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
 168             daddr_t ndaddr, size_t size, struct lwp *l)
 169 {
 170         int error;
 171         struct vnode *vp;
 172         struct inode *ip;
 173 #ifdef DEBUG
 174         daddr_t odaddr;
 175         struct indir a[ULFS_NIADDR];
 176         int num;
 177         int i;
 178 #endif /* DEBUG */
 179         struct buf *bp;
 180         SEGUSE *sup;
 181
 182         KASSERT(lbn >= 0);      /* no indirect blocks */
 183
 184         if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) {
 185                 DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc"
 186                       " returned %d\n", ino, error));
 187                 return error;
 188         }
 189
 190         if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), size,
 191                                 NOCRED, 0, &bp)) != 0) {
 192                 vput(vp);
 193                 return (error);
 194         }
 195         /* No need to write, the block is already on disk */
 196         if (bp->b_oflags & BO_DELWRI) {
 197                 LFS_UNLOCK_BUF(bp);
 198                 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
 199                 /* XXX should this wake up fs->lfs_availsleep? */
 200         }
 201         brelse(bp, BC_INVAL);
 202
 203         /*
 204          * Extend the file, if it is not large enough already.
 205          * XXX this is not exactly right, we don't know how much of the
 206          * XXX last block is actually used.  We hope that an inode will
 207          * XXX appear later to give the correct size.
 208          */
 209         ip = VTOI(vp);
 210         if (ip->i_size <= (lbn << lfs_sb_getbshift(fs))) {
 211                 u_int64_t newsize;
 212
 213                 if (lbn < ULFS_NDADDR) {
 214                         newsize = (lbn << lfs_sb_getbshift(fs)) +
 215                                 (size - lfs_sb_getfsize(fs)) + 1;
 216                 } else {
 217                         newsize = (lbn << lfs_sb_getbshift(fs)) + 1;
 218                 }
 219                 lfs_dino_setsize(fs, ip->i_din, newsize);
 220
 221                 if (ip->i_size < newsize) {
 222                         ip->i_size = newsize;
 223                         /*
 224                          * tell vm our new size for the case the inode won't
 225                          * appear later.
 226                          */
 227                         uvm_vnp_setsize(vp, newsize);
 228                 }
 229         }
 230
 231         lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
 232
 233         LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
 234         sup->su_nbytes += size;
 235         LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
 236
 237         /* differences here should be due to UNWRITTEN indirect blocks. */
 238         KASSERT((lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR &&
 239             ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) ||
 240             ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
 241
 242 #ifdef DEBUG
 243         /* Now look again to make sure it worked */
 244         ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
 245         for (i = num; i > 0; i--) {
 246                 if (!a[i].in_exists)
 247                         panic("update_meta: absent %d lv indirect block", i);
 248         }
 249         if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
 250                 DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %"
 251                       PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr));
 252 #endif /* DEBUG */
 253         vput(vp);
 254         return 0;
 255 }
 256
 257 /*
 258  * Copy some the fields of the dinode as needed by update_inoblk().
 259  */
 260 static void
 261 update_inoblk_copy_dinode(struct lfs *fs,
 262     union lfs_dinode *dstu, const union lfs_dinode *srcu)
 263 {
 264         if (fs->lfs_is64) {
 265                 struct lfs64_dinode *dst = &dstu->u_64;
 266                 const struct lfs64_dinode *src = &srcu->u_64;
 267                 unsigned i;
 268
 269                 /*
 270                  * Copy everything but the block pointers and di_blocks.
 271                  * XXX what about di_extb?
 272                  */
 273                 dst->di_mode = src->di_mode;
 274                 dst->di_nlink = src->di_nlink;
 275                 dst->di_uid = src->di_uid;
 276                 dst->di_gid = src->di_gid;
 277                 dst->di_blksize = src->di_blksize;
 278                 dst->di_size = src->di_size;
 279                 dst->di_atime = src->di_atime;
 280                 dst->di_mtime = src->di_mtime;
 281                 dst->di_ctime = src->di_ctime;
 282                 dst->di_birthtime = src->di_birthtime;
 283                 dst->di_mtimensec = src->di_mtimensec;
 284                 dst->di_atimensec = src->di_atimensec;
 285                 dst->di_ctimensec = src->di_ctimensec;
 286                 dst->di_birthnsec = src->di_birthnsec;
 287                 dst->di_gen = src->di_gen;
 288                 dst->di_kernflags = src->di_kernflags;
 289                 dst->di_flags = src->di_flags;
 290                 dst->di_extsize = src->di_extsize;
 291                 dst->di_modrev = src->di_modrev;
 292                 dst->di_inumber = src->di_inumber;
 293                 for (i = 0; i < __arraycount(src->di_spare); i++) {
 294                         dst->di_spare[i] = src->di_spare[i];
 295                 }
 296         } else {
 297                 struct lfs32_dinode *dst = &dstu->u_32;
 298                 const struct lfs32_dinode *src = &srcu->u_32;
 299
 300                 /* Get mode, link count, size, and times */
 301                 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
 302
 303                 /* Then the rest, except di_blocks */
 304                 dst->di_flags = src->di_flags;
 305                 dst->di_gen = src->di_gen;
 306                 dst->di_uid = src->di_uid;
 307                 dst->di_gid = src->di_gid;
 308                 dst->di_modrev = src->di_modrev;
 309         }
 310 }
 311
 312 static int
 313 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
 314               struct lwp *l)
 315 {
 316         struct vnode *devvp, *vp;
 317         struct inode *ip;
 318         union lfs_dinode *dip;
 319         struct buf *dbp, *ibp;
 320         int error;
 321         daddr_t daddr;
 322         IFILE *ifp;
 323         SEGUSE *sup;
 324         unsigned i, num;
 325
 326         devvp = VTOI(fs->lfs_ivnode)->i_devvp;
 327
 328         /*
 329          * Get the inode, update times and perms.
 330          * DO NOT update disk blocks, we do that separately.
 331          */
 332         error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
 333             0, &dbp);
 334         if (error) {
 335                 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
 336                 return error;
 337         }
 338         num = LFS_INOPB(fs);
 339         for (i = num; i-- > 0; ) {
 340                 dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
 341                 if (lfs_dino_getinumber(fs, dip) > LFS_IFILE_INUM) {
 342                         error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip),
 343                                               lfs_dino_getgen(fs, dip),
 344                                               l, &vp);
 345                         if (error) {
 346                                 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
 347                                       " returned %d\n", error));
 348                                 continue;
 349                         }
 350                         ip = VTOI(vp);
 351                         if (lfs_dino_getsize(fs, dip) != ip->i_size)
 352                                 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
 353                                              NOCRED);
 354                         update_inoblk_copy_dinode(fs, ip->i_din, dip);
 355
 356                         ip->i_flags = lfs_dino_getflags(fs, dip);
 357                         ip->i_gen = lfs_dino_getgen(fs, dip);
 358                         ip->i_uid = lfs_dino_getuid(fs, dip);
 359                         ip->i_gid = lfs_dino_getgid(fs, dip);
 360
 361                         ip->i_mode = lfs_dino_getmode(fs, dip);
 362                         ip->i_nlink = lfs_dino_getnlink(fs, dip);
 363                         ip->i_size = lfs_dino_getsize(fs, dip);
 364
 365                         LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
 366
 367                         /* Re-initialize to get type right */
 368                         ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
 369                                   &vp);
 370                         vput(vp);
 371
 372                         /* Record change in location */
 373                         LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
 374                         daddr = lfs_if_getdaddr(fs, ifp);
 375                         lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno));
 376                         error = LFS_BWRITE_LOG(ibp); /* Ifile */
 377                         /* And do segment accounting */
 378                         if (lfs_dtosn(fs, daddr) != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) {
 379                                 if (daddr > 0) {
 380                                         LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, daddr),
 381                                                      ibp);
 382                                         sup->su_nbytes -= DINOSIZE(fs);
 383                                         LFS_WRITESEGENTRY(sup, fs,
 384                                                           lfs_dtosn(fs, daddr),
 385                                                           ibp);
 386                                 }
 387                                 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)),
 388                                              ibp);
 389                                 sup->su_nbytes += DINOSIZE(fs);
 390                                 LFS_WRITESEGENTRY(sup, fs,
 391                                                   lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)),
 392                                                   ibp);
 393                         }
 394                 }
 395         }
 396         brelse(dbp, BC_AGE);
 397
 398         return 0;
 399 }
 400
 401 #define CHECK_CKSUM   0x0001  /* Check the checksum to make sure it's valid */
 402 #define CHECK_UPDATE  0x0002  /* Update Ifile for new data blocks / inodes */
 403
 404 static daddr_t
 405 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
 406              kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l)
 407 {
 408         struct vnode *devvp;
 409         struct buf *bp, *dbp;
 410         int error, nblocks = 0, ninos, i, j; /* XXX: gcc */
 411         SEGSUM *ssp;
 412         u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */
 413         daddr_t oldoffset;
 414         IINFO *iip;
 415         FINFO *fip;
 416         SEGUSE *sup;
 417         size_t size;
 418         uint32_t datasum, foundsum;
 419
 420         devvp = VTOI(fs->lfs_ivnode)->i_devvp;
 421         /*
 422          * If the segment has a superblock and we're at the top
 423          * of the segment, skip the superblock.
 424          */
 425         if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) {
 426                 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
 427                 if (sup->su_flags & SEGUSE_SUPERBLOCK)
 428                         offset += lfs_btofsb(fs, LFS_SBPAD);
 429                 brelse(bp, 0);
 430         }
 431
 432         /* Read in the segment summary */
 433         error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
 434             0, &bp);
 435         if (error)
 436                 return -1;
 437
 438         /* Check summary checksum */
 439         ssp = (SEGSUM *)bp->b_data;
 440         if (flags & CHECK_CKSUM) {
 441                 size_t sumstart;
 442
 443                 sumstart = lfs_ss_getsumstart(fs);
 444                 if (lfs_ss_getsumsum(fs, ssp) !=
 445                     cksum((char *)ssp + sumstart,
 446                           lfs_sb_getsumsize(fs) - sumstart)) {
 447                         DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset));
 448                         offset = -1;
 449                         goto err1;
 450                 }
 451                 if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
 452                     lfs_ss_getninos(fs, ssp) == 0) {
 453                         DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset));
 454                         offset = -1;
 455                         goto err1;
 456                 }
 457                 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
 458                         DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
 459                         offset = -1;
 460                         goto err1;
 461                 }
 462         }
 463         if (lfs_sb_getversion(fs) > 1) {
 464                 if (lfs_ss_getserial(fs, ssp) != nextserial) {
 465                         DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64
 466                               "\n", offset));
 467                         offset = -1;
 468                         goto err1;
 469                 }
 470                 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
 471                         DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
 472                               PRIx64 "\n", lfs_ss_getident(fs, ssp),
 473                               lfs_sb_getident(fs), offset));
 474                         offset = -1;
 475                         goto err1;
 476                 }
 477         }
 478         if (pseg_flags)
 479                 *pseg_flags = lfs_ss_getflags(fs, ssp);
 480         oldoffset = offset;
 481         offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));
 482
 483         ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
 484         iip = SEGSUM_IINFOSTART(fs, bp->b_data);
 485         if (flags & CHECK_CKSUM) {
 486                 /* Count blocks */
 487                 nblocks = 0;
 488                 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data);
 489                 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); ++i) {
 490                         nblocks += lfs_fi_getnblocks(fs, fip);
 491                         if (lfs_fi_getnblocks(fs, fip) <= 0)
 492                                 break;
 493                         fip = NEXT_FINFO(fs, fip);
 494                 }
 495                 nblocks += ninos;
 496                 /* Create the sum array */
 497                 datap = dp = malloc(nblocks * sizeof(u_long),
 498                                     M_SEGMENT, M_WAITOK);
 499         }
 500
 501         /* Handle individual blocks */
 502         fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data);
 503         for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
 504                 /* Inode block? */
 505                 if (ninos && lfs_ii_getblock(fs, iip) == offset) {
 506                         if (flags & CHECK_CKSUM) {
 507                                 /* Read in the head and add to the buffer */
 508                                 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getbsize(fs),
 509                                               0, &dbp);
 510                                 if (error) {
 511                                         offset = -1;
 512                                         goto err2;
 513                                 }
 514                                 /* XXX this can't be right, on-disk u_long? */
 515                                 (*dp++) = ((u_long *)(dbp->b_data))[0];
 516                                 brelse(dbp, BC_AGE);
 517                         }
 518                         if (flags & CHECK_UPDATE) {
 519                                 if ((error = update_inoblk(fs, offset, cred, l))
 520                                     != 0) {
 521                                         offset = -1;
 522                                         goto err2;
 523                                 }
 524                         }
 525                         offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
 526                         iip = NEXTLOWER_IINFO(fs, iip);
 527                         --ninos;
 528                         --i; /* compensate for ++i in loop header */
 529                         continue;
 530                 }
 531                 size = lfs_sb_getbsize(fs);
 532                 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
 533                         if (j == lfs_fi_getnblocks(fs, fip) - 1)
 534                                 size = lfs_fi_getlastlength(fs, fip);
 535                         if (flags & CHECK_CKSUM) {
 536                                 error = bread(devvp, LFS_FSBTODB(fs, offset), size,
 537                                     0, &dbp);
 538                                 if (error) {
 539                                         offset = -1;
 540                                         goto err2;
 541                                 }
 542                                 (*dp++) = ((u_long *)(dbp->b_data))[0];
 543                                 brelse(dbp, BC_AGE);
 544                         }
 545                         /* Account for and update any direct blocks */
 546                         if ((flags & CHECK_UPDATE) &&
 547                            lfs_fi_getino(fs, fip) > LFS_IFILE_INUM &&
 548                            lfs_fi_getblock(fs, fip, j) >= 0) {
 549                                 update_meta(fs, lfs_fi_getino(fs, fip),
 550                                             lfs_fi_getversion(fs, fip),
 551                                             lfs_fi_getblock(fs, fip, j),
 552                                             offset, size, l);
 553                         }
 554                         offset += lfs_btofsb(fs, size);
 555                 }
 556                 fip = NEXT_FINFO(fs, fip);
 557         }
 558         /* Checksum the array, compare */
 559         datasum = lfs_ss_getdatasum(fs, ssp);
 560         foundsum = cksum(datap, nblocks * sizeof(u_long));
 561         if ((flags & CHECK_CKSUM) && datasum != foundsum) {
 562                 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
 563                       " (wanted %x got %x)\n",
 564                       offset, datasum, foundsum));
 565                 offset = -1;
 566                 goto err2;
 567         }
 568
 569         /* If we're at the end of the segment, move to the next */
 570         if (lfs_dtosn(fs, offset + lfs_btofsb(fs, lfs_sb_getsumsize(fs) + lfs_sb_getbsize(fs))) !=
 571            lfs_dtosn(fs, offset)) {
 572                 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))) {
 573                         offset = -1;
 574                         goto err2;
 575                 }
 576                 offset = lfs_ss_getnext(fs, ssp);
 577                 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
 578                        " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
 579         }
 580
 581         if (flags & CHECK_UPDATE) {
 582                 lfs_sb_subavail(fs, offset - oldoffset);
 583                 /* Don't clog the buffer queue */
 584                 mutex_enter(&lfs_lock);
 585                 if (locked_queue_count > LFS_MAX_BUFS ||
 586                     locked_queue_bytes > LFS_MAX_BYTES) {
 587                         lfs_flush(fs, SEGM_CKP, 0);
 588                 }
 589                 mutex_exit(&lfs_lock);
 590         }
 591
 592     err2:
 593         if (flags & CHECK_CKSUM)
 594                 free(datap, M_SEGMENT);
 595     err1:
 596         brelse(bp, BC_AGE);
 597
 598         /* XXX should we update the serial number even for bad psegs? */
 599         if ((flags & CHECK_UPDATE) && offset > 0 && lfs_sb_getversion(fs) > 1)
 600                 lfs_sb_setserial(fs, nextserial);
 601         return offset;
 602 }
 603
 604 void
 605 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
 606 {
 607         int flags, dirty;
 608         daddr_t offset, oldoffset, lastgoodpseg;
 609         int sn, curseg, do_rollforward;
 610         struct proc *p;
 611         kauth_cred_t cred;
 612         SEGUSE *sup;
 613         struct buf *bp;
 614
 615         p = l ? l->l_proc : NULL;
 616         cred = p ? p->p_cred : NOCRED;
 617
 618         /*
 619          * Roll forward.
 620          *
 621          * We don't roll forward for v1 filesystems, because
 622          * of the danger that the clock was turned back between the last
 623          * checkpoint and crash.  This would roll forward garbage.
 624          *
 625          * v2 filesystems don't have this problem because they use a
 626          * monotonically increasing serial number instead of a timestamp.
 627          */
 628         do_rollforward = (!(lfs_sb_getpflags(fs) & LFS_PF_CLEAN) &&
 629                           lfs_do_rfw && lfs_sb_getversion(fs) > 1 && p != NULL);
 630         if (do_rollforward) {
 631                 u_int64_t nextserial;
 632                 /*
 633                  * Phase I: Find the address of the last good partial
 634                  * segment that was written after the checkpoint.  Mark
 635                  * the segments in question dirty, so they won't be
 636                  * reallocated.
 637                  */
 638                 lastgoodpseg = oldoffset = offset = lfs_sb_getoffset(fs);
 639                 flags = 0x0;
 640                 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
 641                       PRIx64 "\n", offset));
 642                 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
 643                 if (!(sup->su_flags & SEGUSE_DIRTY))
 644                         lfs_sb_subnclean(fs, 1);
 645                 sup->su_flags |= SEGUSE_DIRTY;
 646                 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
 647                 nextserial = lfs_sb_getserial(fs) + 1;
 648                 while ((offset = check_segsum(fs, offset, nextserial,
 649                     cred, CHECK_CKSUM, &flags, l)) > 0) {
 650                         nextserial++;
 651                         if (lfs_sntod(fs, oldoffset) != lfs_sntod(fs, offset)) {
 652                                 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset),
 653                                              bp);
 654                                 if (!(sup->su_flags & SEGUSE_DIRTY))
 655                                         lfs_sb_subnclean(fs, 1);
 656                                 sup->su_flags |= SEGUSE_DIRTY;
 657                                 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset),
 658                                              bp);
 659                         }
 660
 661                         DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%"
 662                               PRIx64 "\n", offset));
 663                         if (flags & SS_DIROP) {
 664                                 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
 665                                       PRIx64 "\n", oldoffset));
 666                                 if (!(flags & SS_CONT)) {
 667                                      DLOG((DLOG_RF, "lfs_mountfs: dirops end "
 668                                            "at 0x%" PRIx64 "\n", oldoffset));
 669                                 }
 670                         }
 671                         if (!(flags & SS_CONT))
 672                                 lastgoodpseg = offset;
 673                         oldoffset = offset;
 674                 }
 675                 if (flags & SS_CONT) {
 676                         DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
 677                               "dirops discarded\n"));
 678                 }
 679                 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
 680                       "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg));
 681                 oldoffset = lfs_sb_getoffset(fs);
 682                 if (lfs_sb_getoffset(fs) != lastgoodpseg) {
 683                         /* Don't overwrite what we're trying to preserve */
 684                         offset = lfs_sb_getoffset(fs);
 685                         lfs_sb_setoffset(fs, lastgoodpseg);
 686                         lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, lfs_sb_getoffset(fs))));
 687                         for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
 688                                 sn = (sn + 1) % lfs_sb_getnseg(fs);
 689                                 if (sn == curseg)
 690                                         panic("lfs_mountfs: no clean segments");
 691                                 LFS_SEGENTRY(sup, fs, sn, bp);
 692                                 dirty = (sup->su_flags & SEGUSE_DIRTY);
 693                                 brelse(bp, 0);
 694                                 if (!dirty)
 695                                         break;
 696                         }
 697                         lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
 698
 699                         /*
 700                          * Phase II: Roll forward from the first superblock.
 701                          */
 702                         while (offset != lastgoodpseg) {
 703                                 DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%"
 704                                       PRIx64 "\n", offset));
 705                                 offset = check_segsum(fs, offset,
 706                                     lfs_sb_getserial(fs) + 1, cred, CHECK_UPDATE,
 707                                     NULL, l);
 708                         }
 709
 710                         /*
 711                          * Finish: flush our changes to disk.
 712                          */
 713                         lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
 714                         DLOG((DLOG_RF, "lfs_mountfs: roll forward ",
 715                               "recovered %jd blocks\n",
 716                               (intmax_t)(lastgoodpseg - oldoffset)));
 717                 }
 718                 DLOG((DLOG_RF, "LFS roll forward complete\n"));
 719         }
 720 }