sys/ufs/ffs/ffs_snapshot.c

   1 /*      $NetBSD: ffs_snapshot.c,v 1.96 2009/10/13 12:38:14 hannken Exp $        */
   2
   3 /*
   4  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
   5  *
   6  * Further information about snapshots can be obtained from:
   7  *
   8  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
   9  *      1614 Oxford Street              mckusick@mckusick.com
  10  *      Berkeley, CA 94709-1608         +1-510-843-9542
  11  *      USA
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  *
  17  * 1. Redistributions of source code must retain the above copyright
  18  *    notice, this list of conditions and the following disclaimer.
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in the
  21  *    documentation and/or other materials provided with the distribution.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  24  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  25  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  26  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  27  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)ffs_snapshot.c      8.11 (McKusick) 7/23/00
  36  *
  37  *      from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
  38  */
  39
  40 #include <sys/cdefs.h>
  41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.96 2009/10/13 12:38:14 hannken Exp $");
  42
  43 #if defined(_KERNEL_OPT)
  44 #include "opt_ffs.h"
  45 #endif
  46
  47 #include <sys/param.h>
  48 #include <sys/kernel.h>
  49 #include <sys/systm.h>
  50 #include <sys/conf.h>
  51 #include <sys/buf.h>
  52 #include <sys/proc.h>
  53 #include <sys/namei.h>
  54 #include <sys/sched.h>
  55 #include <sys/stat.h>
  56 #include <sys/malloc.h>
  57 #include <sys/mount.h>
  58 #include <sys/resource.h>
  59 #include <sys/resourcevar.h>
  60 #include <sys/vnode.h>
  61 #include <sys/kauth.h>
  62 #include <sys/fstrans.h>
  63 #include <sys/wapbl.h>
  64
  65 #include <miscfs/specfs/specdev.h>
  66
  67 #include <ufs/ufs/quota.h>
  68 #include <ufs/ufs/ufsmount.h>
  69 #include <ufs/ufs/inode.h>
  70 #include <ufs/ufs/ufs_extern.h>
  71 #include <ufs/ufs/ufs_bswap.h>
  72 #include <ufs/ufs/ufs_wapbl.h>
  73
  74 #include <ufs/ffs/fs.h>
  75 #include <ufs/ffs/ffs_extern.h>
  76
  77 #include <uvm/uvm.h>
  78
     /*
      * Per-mount snapshot state, reached through ump->um_snapinfo.
      * Allocated by ffs_snapshot_init() and released by ffs_snapshot_fini().
      *
      * ffs_snapshot() holds si_lock while it changes si_snapshots,
      * si_snapblklist and si_gen, and holds si_snaplock from the point where
      * the new snapshot is recorded until it is done.
      */
  79 struct snap_info {
  80         kmutex_t si_lock;                       /* Lock this snapinfo */
  81         kmutex_t si_snaplock;                   /* Snapshot vnode common lock */
  82         TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
  83         daddr_t *si_snapblklist;                /* Snapshot block hints list */
  84         uint32_t si_gen;                        /* Incremented on change */
  85 };
  86
     /*
      * Forward declarations.  The helpers in the first group are only
      * declared when snapshot creation is compiled in, i.e. when
      * FFS_NO_SNAPSHOT is not defined.
      */
  87 #if !defined(FFS_NO_SNAPSHOT)
     /*
      * Accounting callback type taken by expunge() and indiracct().
      * fullacct(), snapacct() and mapacct() below all have this signature.
      */
  88 typedef int (*acctfunc_t)
  89     (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
  90
  91 static int snapshot_setup(struct mount *, struct vnode *);
  92 static int snapshot_copyfs(struct mount *, struct vnode *, void **);
  93 static int snapshot_expunge(struct mount *, struct vnode *,
  94     struct fs *, daddr_t *, daddr_t **);
  95 static int snapshot_expunge_snap(struct mount *, struct vnode *,
  96     struct fs *, daddr_t);
  97 static int snapshot_writefs(struct mount *, struct vnode *, void *);
  98 static int cgaccount(struct vnode *, int, int *);
  99 static int cgaccount1(int, struct vnode *, void *, int);
 100 static int expunge(struct vnode *, struct inode *, struct fs *,
 101     acctfunc_t, int);
 102 static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
 103     daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
 104 static int fullacct(struct vnode *, void *, int, int, struct fs *,
 105     daddr_t, int);
 106 static int snapacct(struct vnode *, void *, int, int, struct fs *,
 107     daddr_t, int);
 108 static int mapacct(struct vnode *, void *, int, int, struct fs *,
 109     daddr_t, int);
 110 #endif /* !defined(FFS_NO_SNAPSHOT) */
 111
     /*
      * Declared regardless of FFS_NO_SNAPSHOT.  ffs_copyonwrite() is the
      * handler that ffs_snapshot() registers with fscow_establish().
      */
 112 static int ffs_copyonwrite(void *, struct buf *, bool);
 113 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
 114 static int rwfsblk(struct vnode *, int, void *, daddr_t);
 115 static int syncsnap(struct vnode *);
 116 static int wrsnapblk(struct vnode *, void *, daddr_t);
 117
     /* Small inline accessors; their definitions are elsewhere in this file. */
 118 static inline bool is_active_snapshot(struct snap_info *, struct inode *);
 119 static inline daddr_t db_get(struct inode *, int);
 120 static inline void db_assign(struct inode *, int, daddr_t);
 121 static inline daddr_t ib_get(struct inode *, int);
 122 static inline void ib_assign(struct inode *, int, daddr_t);
 123 static inline daddr_t idb_get(struct inode *, void *, int);
 124 static inline void idb_assign(struct inode *, void *, int, daddr_t);
 125
     /*
      * DEBUG kernels only: when set non-zero, snapshot_expunge() prints every
      * busy vnode it examines with vprint().
      */
 126 #ifdef DEBUG
 127 static int snapdebug = 0;
 128 #endif
 129
     /*
      * Allocate the per-mount snapshot state and attach it to the mount.
      *
      * The list of snapshots starts out empty, both mutexes are initialized,
      * the generation counter is zero and there is no block hints list.
      * Returns 0 on success, ENOMEM if the allocation yields NULL.
      */
 130 int
 131 ffs_snapshot_init(struct ufsmount *ump)
 132 {
 133         struct snap_info *snapinfo;
 134
 135         snapinfo = kmem_alloc(sizeof(*snapinfo), KM_SLEEP);
 136         ump->um_snapinfo = snapinfo;
 137         if (snapinfo == NULL)
 138                 return ENOMEM;
 139
 140         TAILQ_INIT(&snapinfo->si_snapshots);
 141         mutex_init(&snapinfo->si_lock, MUTEX_DEFAULT, IPL_NONE);
 142         mutex_init(&snapinfo->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
 143         snapinfo->si_snapblklist = NULL;
 144         snapinfo->si_gen = 0;
 145         return 0;
 146 }
 147
     /*
      * Detach and release the per-mount snapshot state set up by
      * ffs_snapshot_init().  Asserts that no snapshot is still on the list
      * and that no block hints list is left behind.
      */
 148 void
 149 ffs_snapshot_fini(struct ufsmount *ump)
 150 {
 151         struct snap_info *snapinfo = ump->um_snapinfo;
 152
 153         /* Clear the mount's pointer before tearing the structure down. */
 154         ump->um_snapinfo = NULL;
 155
 156         KASSERT(TAILQ_EMPTY(&snapinfo->si_snapshots));
 157         mutex_destroy(&snapinfo->si_lock);
 158         mutex_destroy(&snapinfo->si_snaplock);
 159         KASSERT(snapinfo->si_snapblklist == NULL);
 160         kmem_free(snapinfo, sizeof(*snapinfo));
 161 }
 162
 163 /*
 164  * Create a snapshot file and initialize it for the filesystem.
 165  * Vnode is locked on entry and return.
      *
      * mp    - mount the snapshot is taken of.
      * vp    - vnode of the file that is turned into the snapshot.
      * ctime - if not NULL, receives the time of the snapshot (for a vnode
      *         that already is a snapshot: the mtime stored in its inode).
      *
      * Returns 0 on success or an errno value.  ENOSPC means all FSMAXSNAP
      * slots in the superblock are in use.  On success an extra reference
      * is taken on vp with vref().  On failure the superblock slot is
      * cleared and vp is truncated to length zero.  Kernels built with
      * FFS_NO_SNAPSHOT always get EOPNOTSUPP.
 166  */
 167 int
 168 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
 169 {
 170 #if defined(FFS_NO_SNAPSHOT)
 171         return EOPNOTSUPP;
 172 }
 173 #else /* defined(FFS_NO_SNAPSHOT) */
 174         bool suspended = false;
 175         bool snapshot_locked = false;
 176         int error, redo = 0, snaploc;
 177         void *sbbuf = NULL;
 178         daddr_t *snaplist = NULL, snaplistsize = 0;
 179         struct buf *bp, *nbp;
 180         struct fs *copy_fs, *fs = VFSTOUFS(mp)->um_fs;
 181         struct inode *ip = VTOI(vp);
 182         struct lwp *l = curlwp;
 183         struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
 184         struct timespec ts;
 185         struct timeval starttime;
 186 #ifdef DEBUG
 187         struct timeval endtime;
 188 #endif
 189         struct vnode *devvp = ip->i_devvp;
 190
 191         /*
 192          * If the vnode already is a snapshot, return.
 193          */
 194         if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
 195                 if (ctime) {
 196                         ctime->tv_sec = DIP(VTOI(vp), mtime);
 197                         ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
 198                 }
 199                 return 0;
 200         }
 201         /*
 202          * Check for free snapshot slot in the superblock.
 203          */
 204         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 205                 if (fs->fs_snapinum[snaploc] == 0)
 206                         break;
 207         if (snaploc == FSMAXSNAP)
 208                 return (ENOSPC);
 209         /*
 210          * Prepare the vnode to become a snapshot.
 211          */
 212         error = snapshot_setup(mp, vp);
 213         if (error)
 214                 goto out;
 215         /*
 216          * Change inode to snapshot type file.
 217          */
 218         ip->i_flags |= SF_SNAPSHOT;
 219         DIP_ASSIGN(ip, flags, ip->i_flags);
 220         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 221         /*
 222          * Copy all the cylinder group maps. Although the
 223          * filesystem is still active, we hope that only a few
 224          * cylinder groups will change between now and when we
 225          * suspend operations. Thus, we will be able to quickly
 226          * touch up the few cylinder groups that changed during
 227          * the suspension period.
 228          */
 229         error = cgaccount(vp, 1, NULL);
 230         if (error)
 231                 goto out;
 232         /*
 233          * Ensure that the snapshot is completely on disk.
 234          * Since we have marked it as a snapshot it is safe to
 235          * unlock it as no process will be allowed to write to it.
 236          */
 237         error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
 238         if (error)
 239                 goto out;
 240         VOP_UNLOCK(vp, 0);
 241         /*
 242          * All allocations are done, so we can now suspend the filesystem.
 243          */
 244         error = vfs_suspend(vp->v_mount, 0);
             /*
              * Re-lock vp before looking at the result so that it is locked
              * on every path that reaches "out".
              */
 245         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 246         if (error)
 247                 goto out;
 248         suspended = true;
 249         getmicrotime(&starttime);
 250         /*
 251          * First, copy all the cylinder group maps that have changed.
 252          */
 253         error = cgaccount(vp, 2, &redo);
 254         if (error)
 255                 goto out;
 256         /*
 257          * Create a copy of the superblock and its summary information.
 258          */
 259         error = snapshot_copyfs(mp, vp, &sbbuf);
             /*
              * copy_fs is derived from sbbuf before the error check.  It is
              * only dereferenced later under "sbbuf != NULL", and
              * snapshot_copyfs() resets sbbuf to NULL on its failure path.
              */
 260         copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
 261         if (error)
 262                 goto out;
 263         /*
 264          * Expunge unlinked files from our view.
 265          */
 266         error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
 267         if (error)
 268                 goto out;
 269         /*
 270          * Acquire the snapshot lock.
 271          */
 272         mutex_enter(&si->si_snaplock);
 273         snapshot_locked = true;
 274         /*
 275          * Record snapshot inode. Since this is the newest snapshot,
 276          * it must be placed at the end of the list.
 277          */
 278         fs->fs_snapinum[snaploc] = ip->i_number;
 279
 280         mutex_enter(&si->si_lock);
 281         if (is_active_snapshot(si, ip))
 282                 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
 283         TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
 284         if (TAILQ_FIRST(&si->si_snapshots) == ip) {
 285                 /*
 286                  * If this is the first snapshot on this filesystem, put the
 287                  * preliminary list in place and establish the cow handler.
 288                  */
 289                 si->si_snapblklist = snaplist;
 290                 fscow_establish(mp, ffs_copyonwrite, devvp);
 291         }
 292         si->si_gen++;
 293         mutex_exit(&si->si_lock);
 294
 295         vp->v_vflag |= VV_SYSTEM;
 296         /*
 297          * Set the mtime to the time the snapshot has been taken.
 298          */
 299         TIMEVAL_TO_TIMESPEC(&starttime, &ts);
 300         if (ctime)
 301                 *ctime = ts;
 302         DIP_ASSIGN(ip, mtime, ts.tv_sec);
 303         DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
 304         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 305         /*
 306          * Copy allocation information from all snapshots and then
 307          * expunge them from our view.
 308          */
 309         error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
 310         if (error)
 311                 goto out;
 312         /*
 313          * Write the superblock and its summary information to the snapshot.
 314          */
 315         error = snapshot_writefs(mp, vp, sbbuf);
 316         if (error)
 317                 goto out;
 318         /*
 319          * We're nearly done, ensure that the snapshot is completely on disk.
 320          */
 321         error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
 322         if (error)
 323                 goto out;
 324         /*
 325          * Invalidate and free all pages on the snapshot vnode.
 326          * We will read and write through the buffercache.
              *
              * NOTE(review): v_interlock is entered here and never exited in
              * this function; presumably VOP_PUTPAGES() releases it - confirm.
 327          */
 328         mutex_enter(&vp->v_interlock);
 329         error = VOP_PUTPAGES(vp, 0, 0,
 330                     PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
 331         if (error)
 332                 goto out;
 333         /*
 334          * Invalidate short ( < fs_bsize ) buffers.  We will always read
 335          * full size buffers later.
 336          */
 337         mutex_enter(&bufcache_lock);
 338         KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
 339         for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
                     /* Fetch the successor first; bp may be released below. */
 340                 nbp = LIST_NEXT(bp, b_vnbufs);
 341                 KASSERT((bp->b_cflags & BC_BUSY) == 0);
 342                 if (bp->b_bcount < fs->fs_bsize) {
 343                         bp->b_cflags |= BC_BUSY;
 344                         brelsel(bp, BC_INVAL | BC_VFLUSH);
 345                 }
 346         }
 347         mutex_exit(&bufcache_lock);
 348
             /*
              * Common exit for success and failure: free the temporary copies,
              * settle the block hints list, drop the locks taken above and
              * resume the filesystem if it was suspended.
              */
 349 out:
 350         if (sbbuf != NULL) {
 351                 free(copy_fs->fs_csp, M_UFSMNT);
 352                 free(sbbuf, M_UFSMNT);
 353         }
 354         if (fs->fs_active != NULL) {
 355                 free(fs->fs_active, M_DEVBUF);
 356                 fs->fs_active = NULL;
 357         }
 358
 359         mutex_enter(&si->si_lock);
 360         if (snaplist != NULL) {
                     /* Unhook the preliminary list before freeing it. */
 361                 if (si->si_snapblklist == snaplist)
 362                         si->si_snapblklist = NULL;
 363                 free(snaplist, M_UFSMNT);
 364         }
 365         if (error) {
                     /*
                      * Give the superblock slot back.
                      *
                      * NOTE(review): if the failure happened after the
                      * TAILQ_INSERT_TAIL above, nothing visible in this
                      * function takes ip off si_snapshots again; presumably
                      * the ffs_truncate() below takes care of it - confirm.
                      */
 366                 fs->fs_snapinum[snaploc] = 0;
 367         } else {
 368                 /*
 369                  * As this is the newest list, it is the most inclusive, so
 370                  * should replace the previous list.
 371                  */
 372                 si->si_snapblklist = ip->i_snapblklist;
 373         }
 374         si->si_gen++;
 375         mutex_exit(&si->si_lock);
 376
 377         if (snapshot_locked)
 378                 mutex_exit(&si->si_snaplock);
 379         if (suspended) {
 380                 vfs_resume(vp->v_mount);
 381 #ifdef DEBUG
 382                 getmicrotime(&endtime);
 383                 timersub(&endtime, &starttime, &endtime);
 384                 printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
 385                     mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
 386                     endtime.tv_usec / 1000, redo, fs->fs_ncg);
 387 #endif
 388         }
 389         if (error) {
                     /*
                      * Failure: throw away whatever was written to vp.  The
                      * truncate is skipped if the journal transaction cannot
                      * be started, and its own result is ignored.
                      *
                      * NOTE(review): this also runs when snapshot_setup()
                      * rejected vp with EXDEV, EBUSY or EACCES - confirm that
                      * truncating the file is intended in those cases.
                      */
 390                 if (!UFS_WAPBL_BEGIN(mp)) {
 391                         (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
 392                         UFS_WAPBL_END(mp);
 393                 }
 394         } else
                     /* Success: take an additional reference on the snapshot. */
 395                 vref(vp);
 396         return (error);
 397 }
 398
 399 /*
 400  * Prepare vnode to become a snapshot.
      *
      * Verifies that vp may be used, empties it, extends it to cover the
      * whole filesystem and preallocates the blocks that are filled in
      * later by the caller.
      *
      * Returns 0 on success, or:
      *   EXDEV   vp does not belong to mp,
      *   EBUSY   vp has more than one user or is open for writing,
      *   EACCES  caller is neither superuser nor owner of the file,
      *   or whatever ffs_truncate(), vn_rdwr(), UFS_WAPBL_BEGIN() or
      *   ffs_balloc() return.
 401  */
 402 static int
 403 snapshot_setup(struct mount *mp, struct vnode *vp)
 404 {
 405         int error, i, len, loc;
 406         daddr_t blkno, numblks;
 407         struct buf *ibp, *nbp;
 408         struct fs *fs = VFSTOUFS(mp)->um_fs;
 409         struct lwp *l = curlwp;
 410
 411         /*
 412          * Check mount, exclusive reference and owner.
 413          */
 414         if (vp->v_mount != mp)
 415                 return EXDEV;
 416         if (vp->v_usecount != 1 || vp->v_writecount != 0)
 417                 return EBUSY;
 418         if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
 419             NULL) != 0 &&
 420             VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
 421                 return EACCES;
 422
             /* Start from an empty file. */
 423         if (vp->v_size != 0) {
 424                 error = ffs_truncate(vp, 0, 0, NOCRED);
 425                 if (error)
 426                         return error;
 427         }
 428         /*
 429          * Write an empty list of preallocated blocks to the end of
 430          * the snapshot to set size to at least that of the filesystem.
              *
              * The list written here is the single word 1, converted with
              * ufs_rw64() when the filesystem needs byte swapping, placed at
              * the byte offset of logical block numblks.
 431          */
 432         numblks = howmany(fs->fs_size, fs->fs_frag);
 433         blkno = 1;
 434         blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
 435         error = vn_rdwr(UIO_WRITE, vp,
 436             (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
 437             UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
 438         if (error)
 439                 return error;
 440         /*
 441          * Preallocate critical data structures so that we can copy
 442          * them in without further allocation after we suspend all
 443          * operations on the filesystem. We would like to just release
 444          * the allocated buffers without writing them since they will
 445          * be filled in below once we are ready to go, but this upsets
 446          * the soft update code, so we go ahead and write the new buffers.
 447          *
 448          * Allocate all indirect blocks and mark all of them as not
 449          * needing to be copied.
 450          */
 451         error = UFS_WAPBL_BEGIN(mp);
 452         if (error)
 453                 return error;
 454         for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) {
 455                 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
 456                     fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
 457                 if (error)
 458                         goto out;
 459                 brelse(ibp, 0);
                     /*
                      * Close and reopen the journal transaction after every
                      * 16 allocations, presumably to bound the size of a
                      * single transaction.  If reopening fails no transaction
                      * is open, so return directly rather than through "out".
                      */
 460                 if ((++i % 16) == 0) {
 461                         UFS_WAPBL_END(mp);
 462                         error = UFS_WAPBL_BEGIN(mp);
 463                         if (error)
 464                                 return error;
 465                 }
 466         }
 467         /*
 468          * Allocate copies for the superblock and its summary information.
 469          */
 470         error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
 471             0, &nbp);
 472         if (error)
 473                 goto out;
 474         bawrite(nbp);
             /* One full block per block of the cylinder group summary area. */
 475         blkno = fragstoblks(fs, fs->fs_csaddr);
 476         len = howmany(fs->fs_cssize, fs->fs_bsize);
 477         for (loc = 0; loc < len; loc++) {
 478                 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
 479                     fs->fs_bsize, l->l_cred, 0, &nbp);
 480                 if (error)
 481                         goto out;
 482                 bawrite(nbp);
 483         }
 484
             /* Reached with the journal transaction open, on success or error. */
 485 out:
 486         UFS_WAPBL_END(mp);
 487         return error;
 488 }
 489
 490 /*
 491  * Create a copy of the superblock and its summary information.
 492  * It is up to the caller to free copyfs and copy_fs->fs_csp.
      *
      * On success *sbbuf points to a buffer of fs_bsize bytes.  The copy of
      * the superblock sits at offset blkoff(fs, fs->fs_sblockloc) inside that
      * buffer and its fs_csp member points to a separately allocated copy of
      * the summary information.  Both allocations use M_UFSMNT.
      *
      * Returns 0 on success.  If reading from the device fails the bread()
      * error is returned, both allocations are freed here and *sbbuf is set
      * to NULL.
 493  */
 494 static int
 495 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
 496 {
 497         int error, i, len, loc, size;
 498         void *space;
 499         int32_t *lp;
 500         struct buf *bp;
 501         struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
 502         struct lwp *l = curlwp;
 503         struct vnode *devvp = VTOI(vp)->i_devvp;
 504
 505         /*
 506          * Grab a copy of the superblock and its summary information.
 507          * We delay writing it until the suspension is released below.
 508          */
 509         *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
 510         loc = blkoff(fs, fs->fs_sblockloc);
             /* Zero the part of the block in front of the superblock. */
 511         if (loc > 0)
 512                 memset(*sbbuf, 0, loc);
 513         copyfs = (struct fs *)((char *)(*sbbuf) + loc);
 514         memcpy(copyfs, fs, fs->fs_sbsize);
             /*
              * Zero what follows the superblock, up to the smaller of
              * fs_bsize and SBLOCKSIZE.
              */
 515         size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
 516         if (fs->fs_sbsize < size)
 517                 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
 518                     size - fs->fs_sbsize);
             /*
              * Summary copy: fs_cssize rounded up to a full block, plus one
              * int32_t per cylinder group when cluster summaries are in use.
              */
 519         size = blkroundup(fs, fs->fs_cssize);
 520         if (fs->fs_contigsumsize > 0)
 521                 size += fs->fs_ncg * sizeof(int32_t);
 522         space = malloc(size, M_UFSMNT, M_WAITOK);
 523         copyfs->fs_csp = space;
 524         memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
 525         space = (char *)space + fs->fs_cssize;
             /*
              * len is the number of bytes between the last fragment of the
              * summary area and the next block boundary; zero if the summary
              * already ends on a block boundary.  Those bytes are read from
              * the device and appended to the copy.
              */
 526         loc = howmany(fs->fs_cssize, fs->fs_fsize);
 527         i = fs->fs_frag - loc % fs->fs_frag;
 528         len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 529         if (len > 0) {
 530                 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
 531                     len, l->l_cred, 0, &bp)) != 0) {
 532                         brelse(bp, 0);
 533                         free(copyfs->fs_csp, M_UFSMNT);
 534                         free(*sbbuf, M_UFSMNT);
 535                         *sbbuf = NULL;
 536                         return error;
 537                 }
 538                 memcpy(space, bp->b_data, (u_int)len);
 539                 space = (char *)space + len;
 540                 brelse(bp, BC_INVAL | BC_NOCACHE);
 541         }
             /*
              * The per cylinder group fs_maxcluster array lives behind the
              * summary data; every entry starts out as fs_contigsumsize.
              */
 542         if (fs->fs_contigsumsize > 0) {
 543                 copyfs->fs_maxcluster = lp = space;
 544                 for (i = 0; i < fs->fs_ncg; i++)
 545                         *lp++ = fs->fs_contigsumsize;
 546         }
             /* The copy is not marked as journaled even if the mount is. */
 547         if (mp->mnt_wapbl)
 548                 copyfs->fs_flags &= ~FS_DOWAPBL;
 549         return 0;
 550 }
 551
 552 /*
 553  * We must check for active files that have been unlinked (e.g., with a zero
 554  * link count). We have to expunge all trace of these files from the snapshot
 555  * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
 556  * Note that we skip unlinked snapshot files as they will be handled separately.
 557  * Calculate the snapshot list size and create a preliminary list.
      *
      * mp           - mount the snapshot is taken of.
      * vp           - vnode of the snapshot being created.
      * copy_fs      - private copy of the superblock used for the accounting.
      * snaplistsize - out: number of entries to reserve for the final list.
      * snaplist     - out: preliminary list allocated with M_UFSMNT; set to
      *                NULL on entry and again on every failure.  Entry 0 is
      *                not filled in here.
      *
      * Returns 0 on success or an errno value.
      *
      * mntvnode_lock protects the walk over mp->mnt_vnodelist.  It is dropped
      * while a single vnode is processed and must be held again whenever the
      * marker vnode is moved or removed with vmark()/vunmark().
 558  */
 559 static int
 560 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
 561     daddr_t *snaplistsize, daddr_t **snaplist)
 562 {
 563         bool has_wapbl = false;
 564         int cg, error, len, loc;
 565         daddr_t blkno, *blkp;
 566         struct fs *fs = VFSTOUFS(mp)->um_fs;
 567         struct inode *xp;
 568         struct lwp *l = curlwp;
 569         struct vattr vat;
 570         struct vnode *logvp = NULL, *mvp = NULL, *xvp;
 571
 572         *snaplist = NULL;
 573         /*
 574          * Get the log inode if any.
 575          */
 576         if ((fs->fs_flags & FS_DOWAPBL) &&
 577             fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
 578                 error = VFS_VGET(mp,
 579                     fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
 580                 if (error)
 581                         goto out;
 582         }
 583         /*
 584          * Allocate a marker vnode.
 585          */
 586         if ((mvp = vnalloc(mp)) == NULL) {
 587                 error = ENOMEM;
 588                 goto out;
 589         }
 590         /*
 591          * We also calculate the needed size for the snapshot list.
 592          */
 593         *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
 594             FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
 595         error = UFS_WAPBL_BEGIN(mp);
 596         if (error)
 597                 goto out;
 598         has_wapbl = true;
 599         mutex_enter(&mntvnode_lock);
 600         /*
 601          * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
 602          * and vclean() can be called indirectly
 603          */
 604         for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
 605                 vmark(mvp, xvp);
 606                 /*
 607                  * Make sure this vnode wasn't reclaimed in getnewvnode().
 608                  * Start over if it has (it won't be on the list anymore).
 609                  */
 610                 if (xvp->v_mount != mp || vismarker(xvp))
 611                         continue;
 612                 mutex_enter(&xvp->v_interlock);
 613                 if ((xvp->v_iflag & VI_XLOCK) ||
 614                     xvp->v_usecount == 0 || xvp->v_type == VNON ||
 615                     VTOI(xvp) == NULL ||
 616                     (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
 617                         mutex_exit(&xvp->v_interlock);
 618                         continue;
 619                 }
                     /*
                      * From here on the list lock is not held; every path
                      * back into the loop header re-enters it first.
                      */
 620                 mutex_exit(&mntvnode_lock);
 621                 /*
 622                  * XXXAD should increase vnode ref count to prevent it
 623                  * disappearing or being recycled.
 624                  */
 625                 mutex_exit(&xvp->v_interlock);
 626 #ifdef DEBUG
 627                 if (snapdebug)
 628                         vprint("ffs_snapshot: busy vnode", xvp);
 629 #endif
 630                 xp = VTOI(xvp);
                     /*
                      * The log vnode is always expunged.  Any other vnode is
                      * skipped if it still has links or if
                      * ffs_checkfreefile() returns non-zero for its inode.
                      */
 631                 if (xvp != logvp) {
 632                         if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
 633                             vat.va_nlink > 0) {
 634                                 mutex_enter(&mntvnode_lock);
 635                                 continue;
 636                         }
 637                         if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
 638                                 mutex_enter(&mntvnode_lock);
 639                                 continue;
 640                         }
 641                 }
 642                 /*
 643                  * If there is a fragment, clear it here.
                      * The direct block pointer is zeroed only for the duration
                      * of expunge() and restored right after it.
 644                  */
 645                 blkno = 0;
 646                 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 647                 if (loc < NDADDR) {
 648                         len = fragroundup(fs, blkoff(fs, xp->i_size));
 649                         if (len > 0 && len < fs->fs_bsize) {
 650                                 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
 651                                     len, xp->i_number);
 652                                 blkno = db_get(xp, loc);
 653                                 db_assign(xp, loc, 0);
 654                         }
 655                 }
 656                 *snaplistsize += 1;
 657                 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
 658                 if (blkno)
 659                         db_assign(xp, loc, blkno);
 660                 if (!error)
 661                         error = ffs_freefile_snap(copy_fs, vp, xp->i_number,
 662                             xp->i_mode);
 663                 if (error) {
                             /*
                              * The marker is still linked into the mount's
                              * vnode list and mntvnode_lock was dropped above.
                              * Take the lock for the removal, as is done for
                              * every other vmark()/vunmark() call in this loop.
                              */
                             mutex_enter(&mntvnode_lock);
 664                         (void)vunmark(mvp);
                             mutex_exit(&mntvnode_lock);
 665                         goto out;
 666                 }
 667                 mutex_enter(&mntvnode_lock);
 668         }
 669         mutex_exit(&mntvnode_lock);
 670         /*
 671          * Create a preliminary list of preallocated snapshot blocks.
              * Order: superblock, the cylinder group blocks that do not lie
              * behind the summary area, the summary blocks, then the
              * remaining cylinder group blocks.
 672          */
 673         *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
 674         blkp = &(*snaplist)[1];
 675         *blkp++ = lblkno(fs, fs->fs_sblockloc);
 676         blkno = fragstoblks(fs, fs->fs_csaddr);
 677         for (cg = 0; cg < fs->fs_ncg; cg++) {
 678                 if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
 679                         break;
 680                 *blkp++ = fragstoblks(fs, cgtod(fs, cg));
 681         }
 682         len = howmany(fs->fs_cssize, fs->fs_bsize);
 683         for (loc = 0; loc < len; loc++)
 684                 *blkp++ = blkno + loc;
 685         for (; cg < fs->fs_ncg; cg++)
 686                 *blkp++ = fragstoblks(fs, cgtod(fs, cg));
 687
             /* Release whatever was acquired above, on success and on failure. */
 688 out:
 689         if (has_wapbl)
 690                 UFS_WAPBL_END(mp);
 691         if (mvp != NULL)
 692                 vnfree(mvp);
 693         if (logvp != NULL)
 694                 vput(logvp);
 695         if (error && *snaplist != NULL) {
 696                 free(*snaplist, M_UFSMNT);
 697                 *snaplist = NULL;
 698         }
 699
 700         return error;
 701 }
 702
 703 /*
 704  * Copy allocation information from all the snapshots in this snapshot and
 705  * then expunge them from its view. Also, collect the list of allocated
 706  * blocks in i_snapblklist.
      *
      * mp           - mount the snapshot is taken of.
      * vp           - vnode of the snapshot being created.
      * copy_fs      - private copy of the superblock used for the accounting.
      * snaplistsize - number of entries to allocate for the block list.
      *
      * Returns 0 on success; ip->i_snapblklist then points to a list
      * allocated with M_UFSMNT whose entry 0 holds the number of entries in
      * use, and the same list has been written to the end of the snapshot
      * file.  On failure an errno value is returned; a list allocated here
      * is freed and ip->i_snapblklist is set to NULL.
 707  */
 708 static int
 709 snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
 710     struct fs *copy_fs, daddr_t snaplistsize)
 711 {
 712         int error, i;
 713         daddr_t numblks, *snaplist = NULL;
 714         struct fs *fs = VFSTOUFS(mp)->um_fs;
 715         struct inode *ip = VTOI(vp), *xp;
 716         struct lwp *l = curlwp;
 717         struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
 718
 719         error = UFS_WAPBL_BEGIN(mp);
 720         if (error)
 721                 return error;
             /*
              * Walk the snapshots that precede ip on the list.  Those with a
              * zero link count are additionally freed in copy_fs.
              *
              * NOTE(review): this expunge() call is handed fs, while the
              * other calls in this function use copy_fs - confirm intended.
              */
 722         TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
 723                 if (xp == ip)
 724                         break;
 725                 error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
 726                 if (error)
 727                         break;
 728                 if (xp->i_nlink != 0)
 729                         continue;
 730                 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
 731                 if (error)
 732                         break;
 733         }
 734         if (error)
 735                 goto out;
 736         /*
 737          * Allocate space for the full list of preallocated snapshot blocks.
              * Entry 0 is reserved for the size, so collecting starts at 1.
 738          */
 739         snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
 740         ip->i_snapblklist = &snaplist[1];
 741         /*
 742          * Expunge the blocks used by the snapshots from the set of
 743          * blocks marked as used in the snapshot bitmaps. Also, collect
 744          * the list of allocated blocks in i_snapblklist.
 745          */
 746         error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
 747         if (error)
 748                 goto out;
             /*
              * i_snapblklist now marks the end of the collected entries
              * (presumably advanced by mapacct() - confirm); its distance
              * from the start of the list is the final size.
              */
 749         if (snaplistsize < ip->i_snapblklist - snaplist)
 750                 panic("ffs_snapshot: list too small");
 751         snaplistsize = ip->i_snapblklist - snaplist;
 752         snaplist[0] = snaplistsize;
 753         ip->i_snapblklist = &snaplist[0];
 754         /*
 755          * Write out the list of allocated blocks to the end of the snapshot.
              * The entries are converted with ufs_rw64() for the write and
              * converted back afterwards, whether or not the write succeeded.
 756          */
 757         numblks = howmany(fs->fs_size, fs->fs_frag);
 758         for (i = 0; i < snaplistsize; i++)
 759                 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
 760         error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
 761             snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks),
 762             UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED | IO_UNIT,
 763             l->l_cred, NULL, NULL);
 764         for (i = 0; i < snaplistsize; i++)
 765                 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
 766 out:
 767         UFS_WAPBL_END(mp);
 768         if (error && snaplist != NULL) {
 769                 free(snaplist, M_UFSMNT);
 770                 ip->i_snapblklist = NULL;
 771         }
 772         return error;
 773 }
 774
 775 /*
 776  * Write the superblock and its summary information to the snapshot.
 777  * Make sure, the first NDADDR blocks get copied to the snapshot.
 778  */
 779 static int
 780 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
 781 {
 782         int error, len, loc;
 783         void *space;
 784         daddr_t blkno;
 785         struct buf *bp;
 786         struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
 787         struct inode *ip = VTOI(vp);
 788         struct lwp *l = curlwp;
 789
 790         copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
 791
 792         /*
 793          * Write the superblock and its summary information
 794          * to the snapshot.
 795          */
 796         blkno = fragstoblks(fs, fs->fs_csaddr);
 797         len = howmany(fs->fs_cssize, fs->fs_bsize);
 798         space = copyfs->fs_csp;
 799 #ifdef FFS_EI
 800         if (UFS_FSNEEDSWAP(fs)) {
 801                 ffs_sb_swap(copyfs, copyfs);
 802                 ffs_csum_swap(space, space, fs->fs_cssize);
 803         }
 804 #endif
 805         error = UFS_WAPBL_BEGIN(mp);
 806         if (error)
 807                 return error;
 808         for (loc = 0; loc < len; loc++) {
 809                 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
 810                     B_MODIFY, &bp);
 811                 if (error) {
 812                         brelse(bp, 0);
 813                         break;
 814                 }
 815                 memcpy(bp->b_data, space, fs->fs_bsize);
 816                 space = (char *)space + fs->fs_bsize;
 817                 bawrite(bp);
 818         }
 819         if (error)
 820                 goto out;
 821         error = bread(vp, lblkno(fs, fs->fs_sblockloc),
 822             fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
 823         if (error) {
 824                 brelse(bp, 0);
 825                 goto out;
 826         } else {
 827                 memcpy(bp->b_data, sbbuf, fs->fs_bsize);
 828                 bawrite(bp);
 829         }
 830         /*
 831          * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
 832          * and ffs_snapblkfree() will always work on indirect blocks.
 833          */
 834         for (loc = 0; loc < NDADDR; loc++) {
 835                 if (db_get(ip, loc) != 0)
 836                         continue;
 837                 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
 838                     fs->fs_bsize, l->l_cred, 0, &bp);
 839                 if (error)
 840                         break;
 841                 error = rwfsblk(vp, B_READ, bp->b_data, loc);
 842                 if (error) {
 843                         brelse(bp, 0);
 844                         break;
 845                 }
 846                 bawrite(bp);
 847         }
 848
 849 out:
 850         UFS_WAPBL_END(mp);
 851         return error;
 852 }
 853
 854 /*
 855  * Copy all cylinder group maps.
 856  */
 857 static int
 858 cgaccount(struct vnode *vp, int passno, int *redo)
 859 {
 860         int cg, error;
 861         struct buf *nbp;
 862         struct fs *fs = VTOI(vp)->i_fs;
 863
 864         error = UFS_WAPBL_BEGIN(vp->v_mount);
 865         if (error)
 866                 return error;
 867         if (redo != NULL)
 868                 *redo = 0;
 869         if (passno == 1)
 870                 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
 871                     M_DEVBUF, M_WAITOK | M_ZERO);
 872         for (cg = 0; cg < fs->fs_ncg; cg++) {
 873                 if (passno == 2 && ACTIVECG_ISSET(fs, cg))
 874                         continue;
 875                 if (redo != NULL)
 876                         *redo += 1;
 877                 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
 878                     fs->fs_bsize, curlwp->l_cred, 0, &nbp);
 879                 if (error)
 880                         break;
 881                 error = cgaccount1(cg, vp, nbp->b_data, passno);
 882                 bawrite(nbp);
 883                 if (error)
 884                         break;
 885         }
 886         UFS_WAPBL_END(vp->v_mount);
 887         return error;
 888 }
 889
 890 /*
 891  * Copy a cylinder group map. All the unallocated blocks are marked
 892  * BLK_NOCOPY so that the snapshot knows that it need not copy them
 893  * if they are later written. If passno is one, then this is a first
 894  * pass, so only setting needs to be done. If passno is 2, then this
 895  * is a revision to a previous pass which must be undone as the
 896  * replacement pass is done.
 897  */
 898 static int
 899 cgaccount1(int cg, struct vnode *vp, void *data, int passno)
 900 {
 901         struct buf *bp, *ibp;
 902         struct inode *ip;
 903         struct cg *cgp;
 904         struct fs *fs;
 905         struct lwp *l = curlwp;
 906         daddr_t base, numblks;
 907         int error, len, loc, ns, indiroff;
 908
 909         ip = VTOI(vp);
 910         fs = ip->i_fs;
 911         ns = UFS_FSNEEDSWAP(fs);
 912         error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 913                 (int)fs->fs_cgsize, l->l_cred, 0, &bp);
 914         if (error) {
 915                 brelse(bp, 0);
 916                 return (error);
 917         }
 918         cgp = (struct cg *)bp->b_data;
 919         if (!cg_chkmagic(cgp, ns)) {
 920                 brelse(bp, 0);
 921                 return (EIO);
 922         }
 923         ACTIVECG_SET(fs, cg);
 924
 925         memcpy(data, bp->b_data, fs->fs_cgsize);
 926         brelse(bp, 0);
 927         if (fs->fs_cgsize < fs->fs_bsize)
 928                 memset((char *)data + fs->fs_cgsize, 0,
 929                     fs->fs_bsize - fs->fs_cgsize);
 930         numblks = howmany(fs->fs_size, fs->fs_frag);
 931         len = howmany(fs->fs_fpg, fs->fs_frag);
 932         base = cg * fs->fs_fpg / fs->fs_frag;
 933         if (base + len >= numblks)
 934                 len = numblks - base - 1;
 935         loc = 0;
 936         if (base < NDADDR) {
 937                 for ( ; loc < NDADDR; loc++) {
 938                         if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
 939                                 db_assign(ip, loc, BLK_NOCOPY);
 940                         else if (db_get(ip, loc) == BLK_NOCOPY) {
 941                                 if (passno == 2)
 942                                         db_assign(ip, loc, 0);
 943                                 else if (passno == 1)
 944                                         panic("ffs_snapshot: lost direct block");
 945                         }
 946                 }
 947         }
 948         if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
 949             fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
 950                 return (error);
 951         indiroff = (base + loc - NDADDR) % NINDIR(fs);
 952         for ( ; loc < len; loc++, indiroff++) {
 953                 if (indiroff >= NINDIR(fs)) {
 954                         bawrite(ibp);
 955                         if ((error = ffs_balloc(vp,
 956                             lblktosize(fs, (off_t)(base + loc)),
 957                             fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
 958                                 return (error);
 959                         indiroff = 0;
 960                 }
 961                 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
 962                         idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
 963                 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
 964                         if (passno == 2)
 965                                 idb_assign(ip, ibp->b_data, indiroff, 0);
 966                         else if (passno == 1)
 967                                 panic("ffs_snapshot: lost indirect block");
 968                 }
 969         }
 970         bdwrite(ibp);
 971         return (0);
 972 }
 973
 974 /*
 975  * Before expunging a snapshot inode, note all the
 976  * blocks that it claims with BLK_SNAP so that fsck will
 977  * be able to account for those blocks properly and so
 978  * that this snapshot knows that it need not copy them
 979  * if the other snapshot holding them is freed.
 980  */
 981 static int
 982 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
 983     acctfunc_t acctfunc, int expungetype)
 984 {
 985         int i, error, ns;
 986         daddr_t lbn, rlbn;
 987         daddr_t len, blkno, numblks, blksperindir;
 988         struct ufs1_dinode *dip1;
 989         struct ufs2_dinode *dip2;
 990         struct lwp *l = curlwp;
 991         void *bap;
 992         struct buf *bp;
 993
 994         ns = UFS_FSNEEDSWAP(fs);
 995         /*
 996          * Prepare to expunge the inode. If its inode block has not
 997          * yet been copied, then allocate and fill the copy.
 998          */
 999         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1000         error = snapblkaddr(snapvp, lbn, &blkno);
1001         if (error)
1002                 return error;
1003         if (blkno != 0) {
1004                 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
1005                     B_MODIFY, &bp);
1006         } else {
1007                 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
1008                     fs->fs_bsize, l->l_cred, 0, &bp);
1009                 if (! error)
1010                         error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1011         }
1012         if (error)
1013                 return error;
1014         /*
1015          * Set a snapshot inode to be a zero length file, regular files
1016          * or unlinked snapshots to be completely unallocated.
1017          */
1018         if (fs->fs_magic == FS_UFS1_MAGIC) {
1019                 dip1 = (struct ufs1_dinode *)bp->b_data +
1020                     ino_to_fsbo(fs, cancelip->i_number);
1021                 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1022                         dip1->di_mode = 0;
1023                 dip1->di_size = 0;
1024                 dip1->di_blocks = 0;
1025                 dip1->di_flags =
1026                     ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns);
1027                 memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t));
1028         } else {
1029                 dip2 = (struct ufs2_dinode *)bp->b_data +
1030                     ino_to_fsbo(fs, cancelip->i_number);
1031                 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1032                         dip2->di_mode = 0;
1033                 dip2->di_size = 0;
1034                 dip2->di_blocks = 0;
1035                 dip2->di_flags =
1036                     ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns);
1037                 memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t));
1038         }
1039         bdwrite(bp);
1040         /*
1041          * Now go through and expunge all the blocks in the file
1042          * using the function requested.
1043          */
1044         numblks = howmany(cancelip->i_size, fs->fs_bsize);
1045         if (fs->fs_magic == FS_UFS1_MAGIC)
1046                 bap = &cancelip->i_ffs1_db[0];
1047         else
1048                 bap = &cancelip->i_ffs2_db[0];
1049         if ((error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype)))
1050                 return (error);
1051         if (fs->fs_magic == FS_UFS1_MAGIC)
1052                 bap = &cancelip->i_ffs1_ib[0];
1053         else
1054                 bap = &cancelip->i_ffs2_ib[0];
1055         if ((error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype)))
1056                 return (error);
1057         blksperindir = 1;
1058         lbn = -NDADDR;
1059         len = numblks - NDADDR;
1060         rlbn = NDADDR;
1061         for (i = 0; len > 0 && i < NIADDR; i++) {
1062                 error = indiracct(snapvp, ITOV(cancelip), i,
1063                     ib_get(cancelip, i), lbn, rlbn, len,
1064                     blksperindir, fs, acctfunc, expungetype);
1065                 if (error)
1066                         return (error);
1067                 blksperindir *= NINDIR(fs);
1068                 lbn -= blksperindir + 1;
1069                 len -= blksperindir;
1070                 rlbn += blksperindir;
1071         }
1072         return (0);
1073 }
1074
1075 /*
1076  * Descend an indirect block chain for vnode cancelvp accounting for all
1077  * its indirect blocks in snapvp.
1078  */
1079 static int
1080 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
1081     daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
1082     daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
1083 {
1084         int error, num, i;
1085         daddr_t subblksperindir;
1086         struct indir indirs[NIADDR + 2];
1087         daddr_t last;
1088         void *bap;
1089         struct buf *bp;
1090
1091         if (blkno == 0) {
1092                 if (expungetype == BLK_NOCOPY)
1093                         return (0);
1094                 panic("indiracct: missing indir");
1095         }
1096         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1097                 return (error);
1098         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1099                 panic("indiracct: botched params");
1100         /*
1101          * We have to expand bread here since it will deadlock looking
1102          * up the block number for any blocks that are not in the cache.
1103          */
1104         error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
1105             false, &bp);
1106         if (error)
1107                 return error;
1108         if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
1109             rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
1110                 brelse(bp, 0);
1111                 return (error);
1112         }
1113         /*
1114          * Account for the block pointers in this indirect block.
1115          */
1116         last = howmany(remblks, blksperindir);
1117         if (last > NINDIR(fs))
1118                 last = NINDIR(fs);
1119         bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
1120         memcpy((void *)bap, bp->b_data, fs->fs_bsize);
1121         brelse(bp, 0);
1122         error = (*acctfunc)(snapvp, bap, 0, last,
1123             fs, level == 0 ? rlbn : -1, expungetype);
1124         if (error || level == 0)
1125                 goto out;
1126         /*
1127          * Account for the block pointers in each of the indirect blocks
1128          * in the levels below us.
1129          */
1130         subblksperindir = blksperindir / NINDIR(fs);
1131         for (lbn++, level--, i = 0; i < last; i++) {
1132                 error = indiracct(snapvp, cancelvp, level,
1133                     idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
1134                     subblksperindir, fs, acctfunc, expungetype);
1135                 if (error)
1136                         goto out;
1137                 rlbn += blksperindir;
1138                 lbn -= blksperindir;
1139                 remblks -= blksperindir;
1140         }
1141 out:
1142         free(bap, M_DEVBUF);
1143         return (error);
1144 }
1145
1146 /*
1147  * Do both snap accounting and map accounting.
1148  */
1149 static int
1150 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1151     struct fs *fs, daddr_t lblkno,
1152     int exptype /* BLK_SNAP or BLK_NOCOPY */)
1153 {
1154         int error;
1155
1156         if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1157                 return (error);
1158         return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1159 }
1160
1161 /*
1162  * Identify a set of blocks allocated in a snapshot inode.
1163  */
1164 static int
1165 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1166     struct fs *fs, daddr_t lblkno,
1167     int expungetype /* BLK_SNAP or BLK_NOCOPY */)
1168 {
1169         struct inode *ip = VTOI(vp);
1170         struct lwp *l = curlwp;
1171         daddr_t blkno;
1172         daddr_t lbn;
1173         struct buf *ibp;
1174         int error;
1175
1176         for ( ; oldblkp < lastblkp; oldblkp++) {
1177                 blkno = idb_get(ip, bap, oldblkp);
1178                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1179                         continue;
1180                 lbn = fragstoblks(fs, blkno);
1181                 if (lbn < NDADDR) {
1182                         blkno = db_get(ip, lbn);
1183                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
1184                 } else {
1185                         error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
1186                             fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1187                         if (error)
1188                                 return (error);
1189                         blkno = idb_get(ip, ibp->b_data,
1190                             (lbn - NDADDR) % NINDIR(fs));
1191                 }
1192                 /*
1193                  * If we are expunging a snapshot vnode and we
1194                  * find a block marked BLK_NOCOPY, then it is
1195                  * one that has been allocated to this snapshot after
1196                  * we took our current snapshot and can be ignored.
1197                  */
1198                 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
1199                         if (lbn >= NDADDR)
1200                                 brelse(ibp, 0);
1201                 } else {
1202                         if (blkno != 0)
1203                                 panic("snapacct: bad block");
1204                         if (lbn < NDADDR)
1205                                 db_assign(ip, lbn, expungetype);
1206                         else {
1207                                 idb_assign(ip, ibp->b_data,
1208                                     (lbn - NDADDR) % NINDIR(fs), expungetype);
1209                                 bdwrite(ibp);
1210                         }
1211                 }
1212         }
1213         return (0);
1214 }
1215
1216 /*
1217  * Account for a set of blocks allocated in a snapshot inode.
1218  */
1219 static int
1220 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1221     struct fs *fs, daddr_t lblkno, int expungetype)
1222 {
1223         daddr_t blkno;
1224         struct inode *ip;
1225         ino_t inum;
1226         int acctit;
1227
1228         ip = VTOI(vp);
1229         inum = ip->i_number;
1230         if (lblkno == -1)
1231                 acctit = 0;
1232         else
1233                 acctit = 1;
1234         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1235                 blkno = idb_get(ip, bap, oldblkp);
1236                 if (blkno == 0 || blkno == BLK_NOCOPY)
1237                         continue;
1238                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1239                         *ip->i_snapblklist++ = lblkno;
1240                 if (blkno == BLK_SNAP)
1241                         blkno = blkstofrags(fs, lblkno);
1242                 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
1243         }
1244         return (0);
1245 }
1246 #endif /* defined(FFS_NO_SNAPSHOT) */
1247
1248 /*
1249  * Decrement extra reference on snapshot when last name is removed.
1250  * It will not be freed until the last open reference goes away.
1251  */
1252 void
1253 ffs_snapgone(struct inode *ip)
1254 {
1255         struct mount *mp = ip->i_devvp->v_specmountpoint;
1256         struct inode *xp;
1257         struct fs *fs;
1258         struct snap_info *si;
1259         int snaploc;
1260
1261         si = VFSTOUFS(mp)->um_snapinfo;
1262
1263         /*
1264          * Find snapshot in incore list.
1265          */
1266         mutex_enter(&si->si_lock);
1267         TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
1268                 if (xp == ip)
1269                         break;
1270         mutex_exit(&si->si_lock);
1271         if (xp != NULL)
1272                 vrele(ITOV(ip));
1273 #ifdef DEBUG
1274         else if (snapdebug)
1275                 printf("ffs_snapgone: lost snapshot vnode %llu\n",
1276                     (unsigned long long)ip->i_number);
1277 #endif
1278         /*
1279          * Delete snapshot inode from superblock. Keep list dense.
1280          */
1281         mutex_enter(&si->si_lock);
1282         fs = ip->i_fs;
1283         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1284                 if (fs->fs_snapinum[snaploc] == ip->i_number)
1285                         break;
1286         if (snaploc < FSMAXSNAP) {
1287                 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1288                         if (fs->fs_snapinum[snaploc] == 0)
1289                                 break;
1290                         fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1291                 }
1292                 fs->fs_snapinum[snaploc - 1] = 0;
1293         }
1294         si->si_gen++;
1295         mutex_exit(&si->si_lock);
1296 }
1297
1298 /*
1299  * Prepare a snapshot file for being removed.
1300  */
1301 void
1302 ffs_snapremove(struct vnode *vp)
1303 {
1304         struct inode *ip = VTOI(vp), *xp;
1305         struct vnode *devvp = ip->i_devvp;
1306         struct fs *fs = ip->i_fs;
1307         struct mount *mp = devvp->v_specmountpoint;
1308         struct buf *ibp;
1309         struct snap_info *si;
1310         struct lwp *l = curlwp;
1311         daddr_t numblks, blkno, dblk;
1312         int error, loc, last;
1313
1314         si = VFSTOUFS(mp)->um_snapinfo;
1315         /*
1316          * If active, delete from incore list (this snapshot may
1317          * already have been in the process of being deleted, so
1318          * would not have been active).
1319          *
1320          * Clear copy-on-write flag if last snapshot.
1321          */
1322         mutex_enter(&si->si_lock);
1323         if (is_active_snapshot(si, ip)) {
1324                 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
1325                 if (TAILQ_FIRST(&si->si_snapshots) != 0) {
1326                         /* Roll back the list of preallocated blocks. */
1327                         xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1328                         si->si_snapblklist = xp->i_snapblklist;
1329                         si->si_gen++;
1330                         mutex_exit(&si->si_lock);
1331                 } else {
1332                         si->si_snapblklist = 0;
1333                         si->si_gen++;
1334                         mutex_exit(&si->si_lock);
1335                         fscow_disestablish(mp, ffs_copyonwrite, devvp);
1336                 }
1337                 if (ip->i_snapblklist != NULL) {
1338                         free(ip->i_snapblklist, M_UFSMNT);
1339                         ip->i_snapblklist = NULL;
1340                 }
1341         } else
1342                 mutex_exit(&si->si_lock);
1343         /*
1344          * Clear all BLK_NOCOPY fields. Pass any block claims to other
1345          * snapshots that want them (see ffs_snapblkfree below).
1346          */
1347         for (blkno = 1; blkno < NDADDR; blkno++) {
1348                 dblk = db_get(ip, blkno);
1349                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1350                         db_assign(ip, blkno, 0);
1351                 else if ((dblk == blkstofrags(fs, blkno) &&
1352                      ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1353                      ip->i_number))) {
1354                         DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1355                         db_assign(ip, blkno, 0);
1356                 }
1357         }
1358         numblks = howmany(ip->i_size, fs->fs_bsize);
1359         for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
1360                 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
1361                     fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1362                 if (error)
1363                         continue;
1364                 if (fs->fs_size - blkno > NINDIR(fs))
1365                         last = NINDIR(fs);
1366                 else
1367                         last = fs->fs_size - blkno;
1368                 for (loc = 0; loc < last; loc++) {
1369                         dblk = idb_get(ip, ibp->b_data, loc);
1370                         if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1371                                 idb_assign(ip, ibp->b_data, loc, 0);
1372                         else if (dblk == blkstofrags(fs, blkno) &&
1373                             ffs_snapblkfree(fs, ip->i_devvp, dblk,
1374                             fs->fs_bsize, ip->i_number)) {
1375                                 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1376                                 idb_assign(ip, ibp->b_data, loc, 0);
1377                         }
1378                 }
1379                 bawrite(ibp);
1380         }
1381         /*
1382          * Clear snapshot flag and drop reference.
1383          */
1384         ip->i_flags &= ~SF_SNAPSHOT;
1385         DIP_ASSIGN(ip, flags, ip->i_flags);
1386         ip->i_flag |= IN_CHANGE | IN_UPDATE;
1387 }
1388
1389 /*
1390  * Notification that a block is being freed. Return zero if the free
1391  * should be allowed to proceed. Return non-zero if the snapshot file
1392  * wants to claim the block. The block will be claimed if it is an
1393  * uncopied part of one of the snapshots. It will be freed if it is
1394  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1395  * If a fragment is being freed, then all snapshots that care about
1396  * it must make a copy since a snapshot file can only claim full sized
1397  * blocks. Note that if more than one snapshot file maps the block,
1398  * we can pick one at random to claim it. Since none of the snapshots
1399  * can change, we are assured that they will all see the same unmodified
1400  * image. When deleting a snapshot file (see ffs_snapremove above), we
1401  * must push any of these claimed blocks to one of the other snapshots
1402  * that maps it. These claimed blocks are easily identified as they will
1403  * have a block number equal to their logical block number within the
1404  * snapshot. A copied block can never have this property because they
1405  * must always have been allocated from a BLK_NOCOPY location.
1406  */
1407 int
1408 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
1409     long size, ino_t inum)
1410 {
1411         struct mount *mp = devvp->v_specmountpoint;
1412         struct buf *ibp;
1413         struct inode *ip;
1414         struct vnode *vp = NULL;
1415         struct snap_info *si;
1416         void *saved_data = NULL;
1417         daddr_t lbn;
1418         daddr_t blkno;
1419         uint32_t gen;
1420         int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
1421
1422         si = VFSTOUFS(mp)->um_snapinfo;
1423         lbn = fragstoblks(fs, bno);
1424         mutex_enter(&si->si_lock);
1425 retry:
1426         gen = si->si_gen;
1427         TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1428                 vp = ITOV(ip);
1429                 if (snapshot_locked == 0) {
1430                         if (!mutex_tryenter(&si->si_snaplock)) {
1431                                 mutex_exit(&si->si_lock);
1432                                 mutex_enter(&si->si_snaplock);
1433                                 mutex_enter(&si->si_lock);
1434                         }
1435                         snapshot_locked = 1;
1436                         if (gen != si->si_gen)
1437                                 goto retry;
1438                 }
1439                 /*
1440                  * Lookup block being written.
1441                  */
1442                 if (lbn < NDADDR) {
1443                         blkno = db_get(ip, lbn);
1444                 } else {
1445                         mutex_exit(&si->si_lock);
1446                         error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
1447                             fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
1448                         if (error) {
1449                                 mutex_enter(&si->si_lock);
1450                                 break;
1451                         }
1452                         indiroff = (lbn - NDADDR) % NINDIR(fs);
1453                         blkno = idb_get(ip, ibp->b_data, indiroff);
1454                         mutex_enter(&si->si_lock);
1455                         if (gen != si->si_gen) {
1456                                 brelse(ibp, 0);
1457                                 goto retry;
1458                         }
1459                 }
1460                 /*
1461                  * Check to see if block needs to be copied.
1462                  */
1463                 if (blkno == 0) {
1464                         /*
1465                          * A block that we map is being freed. If it has not
1466                          * been claimed yet, we will claim or copy it (below).
1467                          */
1468                         claimedblk = 1;
1469                 } else if (blkno == BLK_SNAP) {
1470                         /*
1471                          * No previous snapshot claimed the block,
1472                          * so it will be freed and become a BLK_NOCOPY
1473                          * (don't care) for us.
1474                          */
1475                         if (claimedblk)
1476                                 panic("snapblkfree: inconsistent block type");
1477                         if (lbn < NDADDR) {
1478                                 db_assign(ip, lbn, BLK_NOCOPY);
1479                                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1480                         } else {
1481                                 idb_assign(ip, ibp->b_data, indiroff,
1482                                     BLK_NOCOPY);
1483                                 mutex_exit(&si->si_lock);
1484                                 if (ip->i_nlink > 0)
1485                                         bwrite(ibp);
1486                                 else
1487                                         bdwrite(ibp);
1488                                 mutex_enter(&si->si_lock);
1489                                 if (gen != si->si_gen)
1490                                         goto retry;
1491                         }
1492                         continue;
1493                 } else /* BLK_NOCOPY or default */ {
1494                         /*
1495                          * If the snapshot has already copied the block
1496                          * (default), or does not care about the block,
1497                          * it is not needed.
1498                          */
1499                         if (lbn >= NDADDR)
1500                                 brelse(ibp, 0);
1501                         continue;
1502                 }
1503                 /*
1504                  * If this is a full size block, we will just grab it
1505                  * and assign it to the snapshot inode. Otherwise we
1506                  * will proceed to copy it. See explanation for this
1507                  * routine as to why only a single snapshot needs to
1508                  * claim this block.
1509                  */
1510                 if (size == fs->fs_bsize) {
1511 #ifdef DEBUG
1512                         if (snapdebug)
1513                                 printf("%s %llu lbn %" PRId64
1514                                     "from inum %llu\n",
1515                                     "Grabonremove: snapino",
1516                                     (unsigned long long)ip->i_number,
1517                                     lbn, (unsigned long long)inum);
1518 #endif
1519                         mutex_exit(&si->si_lock);
1520                         if (lbn < NDADDR) {
1521                                 db_assign(ip, lbn, bno);
1522                         } else {
1523                                 idb_assign(ip, ibp->b_data, indiroff, bno);
1524                                 if (ip->i_nlink > 0)
1525                                         bwrite(ibp);
1526                                 else
1527                                         bdwrite(ibp);
1528                         }
1529                         DIP_ADD(ip, blocks, btodb(size));
1530                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
1531                         if (ip->i_nlink > 0 && mp->mnt_wapbl)
1532                                 error = syncsnap(vp);
1533                         else
1534                                 error = 0;
1535                         mutex_exit(&si->si_snaplock);
1536                         return (error == 0);
1537                 }
1538                 if (lbn >= NDADDR)
1539                         brelse(ibp, 0);
1540 #ifdef DEBUG
1541                 if (snapdebug)
1542                         printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1543                             "Copyonremove: snapino ",
1544                             (unsigned long long)ip->i_number,
1545                             lbn, "for inum", (unsigned long long)inum, size);
1546 #endif
1547                 /*
1548                  * If we have already read the old block contents, then
1549                  * simply copy them to the new block. Note that we need
1550                  * to synchronously write snapshots that have not been
1551                  * unlinked, and hence will be visible after a crash,
1552                  * to ensure their integrity.
1553                  */
1554                 mutex_exit(&si->si_lock);
1555                 if (saved_data == NULL) {
1556                         saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1557                         error = rwfsblk(vp, B_READ, saved_data, lbn);
1558                         if (error) {
1559                                 free(saved_data, M_UFSMNT);
1560                                 saved_data = NULL;
1561                                 mutex_enter(&si->si_lock);
1562                                 break;
1563                         }
1564                 }
1565                 error = wrsnapblk(vp, saved_data, lbn);
1566                 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1567                         error = syncsnap(vp);
1568                 mutex_enter(&si->si_lock);
1569                 if (error)
1570                         break;
1571                 if (gen != si->si_gen)
1572                         goto retry;
1573         }
1574         mutex_exit(&si->si_lock);
1575         if (saved_data)
1576                 free(saved_data, M_UFSMNT);
1577         /*
1578          * If we have been unable to allocate a block in which to do
1579          * the copy, then return non-zero so that the fragment will
1580          * not be freed. Although space will be lost, the snapshot
1581          * will stay consistent.
1582          */
1583         if (snapshot_locked)
1584                 mutex_exit(&si->si_snaplock);
1585         return (error);
1586 }
1587
1588 /*
1589  * Associate snapshot files when mounting.
1590  */
1591 void
1592 ffs_snapshot_mount(struct mount *mp)
1593 {
1594         struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1595         struct fs *fs = VFSTOUFS(mp)->um_fs;
1596         struct lwp *l = curlwp;
1597         struct vnode *vp;
1598         struct inode *ip, *xp;
1599         struct snap_info *si;
1600         daddr_t snaplistsize, *snapblklist;
1601         int i, error, ns, snaploc, loc;
1602
1603         /*
1604          * No persistent snapshots on apple ufs file systems.
1605          */
1606         if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1607                 return;
1608
1609         si = VFSTOUFS(mp)->um_snapinfo;
1610         ns = UFS_FSNEEDSWAP(fs);
1611         /*
1612          * XXX The following needs to be set before ffs_truncate or
1613          * VOP_READ can be called.
1614          */
1615         mp->mnt_stat.f_iosize = fs->fs_bsize;
1616         /*
1617          * Process each snapshot listed in the superblock.
1618          */
1619         vp = NULL;
1620         mutex_enter(&si->si_lock);
1621         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1622                 if (fs->fs_snapinum[snaploc] == 0)
1623                         break;
1624                 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1625                     &vp)) != 0) {
1626                         printf("ffs_snapshot_mount: vget failed %d\n", error);
1627                         continue;
1628                 }
1629                 ip = VTOI(vp);
1630                 if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1631                         printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1632                             fs->fs_snapinum[snaploc]);
1633                         vput(vp);
1634                         vp = NULL;
1635                         for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1636                                 if (fs->fs_snapinum[loc] == 0)
1637                                         break;
1638                                 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1639                         }
1640                         fs->fs_snapinum[loc - 1] = 0;
1641                         snaploc--;
1642                         continue;
1643                 }
1644
1645                 /*
1646                  * Read the block hints list. Use an empty list on
1647                  * read errors.
1648                  */
1649                 error = vn_rdwr(UIO_READ, vp,
1650                     (void *)&snaplistsize, sizeof(snaplistsize),
1651                     lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1652                     UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1653                     l->l_cred, NULL, NULL);
1654                 if (error) {
1655                         printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1656                         snaplistsize = 1;
1657                 } else
1658                         snaplistsize = ufs_rw64(snaplistsize, ns);
1659                 snapblklist = malloc(
1660                     snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1661                 if (error)
1662                         snapblklist[0] = 1;
1663                 else {
1664                         error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1665                             snaplistsize * sizeof(daddr_t),
1666                             lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1667                             UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1668                             l->l_cred, NULL, NULL);
1669                         for (i = 0; i < snaplistsize; i++)
1670                                 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1671                         if (error) {
1672                                 printf("ffs_snapshot_mount: read_2 failed %d\n",
1673                                     error);
1674                                 snapblklist[0] = 1;
1675                         }
1676                 }
1677                 ip->i_snapblklist = &snapblklist[0];
1678
1679                 /*
1680                  * Link it onto the active snapshot list.
1681                  */
1682                 if (is_active_snapshot(si, ip))
1683                         panic("ffs_snapshot_mount: %"PRIu64" already on list",
1684                             ip->i_number);
1685                 else
1686                         TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1687                 vp->v_vflag |= VV_SYSTEM;
1688                 VOP_UNLOCK(vp, 0);
1689         }
1690         /*
1691          * No usable snapshots found.
1692          */
1693         if (vp == NULL) {
1694                 mutex_exit(&si->si_lock);
1695                 return;
1696         }
1697         /*
1698          * Attach the block hints list. We always want to
1699          * use the list from the newest snapshot.
1700         */
1701         xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1702         si->si_snapblklist = xp->i_snapblklist;
1703         fscow_establish(mp, ffs_copyonwrite, devvp);
1704         si->si_gen++;
1705         mutex_exit(&si->si_lock);
1706 }
1707
1708 /*
1709  * Disassociate snapshot files when unmounting.
1710  */
1711 void
1712 ffs_snapshot_unmount(struct mount *mp)
1713 {
1714         struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1715         struct inode *xp;
1716         struct vnode *vp = NULL;
1717         struct snap_info *si;
1718
1719         si = VFSTOUFS(mp)->um_snapinfo;
1720         mutex_enter(&si->si_lock);
1721         while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1722                 vp = ITOV(xp);
1723                 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1724                 if (xp->i_snapblklist == si->si_snapblklist)
1725                         si->si_snapblklist = NULL;
1726                 free(xp->i_snapblklist, M_UFSMNT);
1727                 if (xp->i_nlink > 0) {
1728                         si->si_gen++;
1729                         mutex_exit(&si->si_lock);
1730                         vrele(vp);
1731                         mutex_enter(&si->si_lock);
1732                 }
1733         }
1734         si->si_gen++;
1735         mutex_exit(&si->si_lock);
1736         if (vp)
1737                 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1738 }
1739
1740 /*
1741  * Check for need to copy block that is about to be written,
1742  * copying the block if necessary.
      *
      * v is the device vnode of the mounted file system and bp the buffer
      * about to be written.  If data_valid is true and bp covers a full
      * file system block, bp->b_data is used as the source of the copy;
      * otherwise the block is read from disk with rwfsblk().
      *
      * Returns 0 if nothing had to be copied or all copies succeeded,
      * otherwise the error that stopped the scan of the snapshot list
      * (ENOMEM if a copy is needed while running as the pagedaemon).
1743  */
1744 static int
1745 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1746 {
1747         struct fs *fs;
1748         struct inode *ip;
1749         struct vnode *devvp = v, *vp = NULL;
1750         struct mount *mp = devvp->v_specmountpoint;
1751         struct snap_info *si;
1752         void *saved_data = NULL;
1753         daddr_t lbn, blkno, *snapblklist;
1754         uint32_t gen;
1755         int lower, upper, mid, snapshot_locked = 0, error = 0;
1756
1757         /*
1758          * Check for valid snapshots.
1759          */
1760         si = VFSTOUFS(mp)->um_snapinfo;
1761         mutex_enter(&si->si_lock);
1762         ip = TAILQ_FIRST(&si->si_snapshots);
1763         if (ip == NULL) {
1764                 mutex_exit(&si->si_lock);
1765                 return 0;
1766         }
1767         /*
1768          * First check to see if it is after the file system or
1769          * in the preallocated list.
1770          * By doing this check we avoid several potential deadlocks.
1771          */
1772         fs = ip->i_fs;
1773         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1774         if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
1775                 mutex_exit(&si->si_lock);
1776                 return 0;
1777         }
             /*
              * Entry 0 of the hints list holds its length.  Binary search
              * entries 1 .. length - 1 for lbn (the search assumes they
              * are sorted in ascending order).
              */
1778         snapblklist = si->si_snapblklist;
1779         upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1780         lower = 1;
1781         while (lower <= upper) {
1782                 mid = (lower + upper) / 2;
1783                 if (snapblklist[mid] == lbn)
1784                         break;
1785                 if (snapblklist[mid] < lbn)
1786                         lower = mid + 1;
1787                 else
1788                         upper = mid - 1;
1789         }
1790         if (lower <= upper) {   /* left the loop via break: lbn is listed */
1791                 mutex_exit(&si->si_lock);
1792                 return 0;
1793         }
1794         /*
1795          * Not in the precomputed list, so check the snapshots.
1796          */
1797          if (data_valid && bp->b_bcount == fs->fs_bsize)
1798                 saved_data = bp->b_data;        /* borrowed, not freed below */
1799 retry:
1800         gen = si->si_gen;       /* to detect list changes while si_lock is dropped */
1801         TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1802                 vp = ITOV(ip);
1803                 /*
1804                  * We ensure that everything of our own that needs to be
1805                  * copied will be done at the time that ffs_snapshot is
1806                  * called. Thus we can skip the check here which can
1807                  * deadlock in doing the lookup in ffs_balloc.
1808                  */
1809                 if (bp->b_vp == vp)
1810                         continue;
1811                 /*
1812                  * Check to see if block needs to be copied.
1813                  */
1814                 if (lbn < NDADDR) {
1815                         blkno = db_get(ip, lbn);
1816                 } else {
1817                         mutex_exit(&si->si_lock);       /* snapblkaddr() may call bread() */
1818                         if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1819                                 mutex_enter(&si->si_lock);
1820                                 break;
1821                         }
1822                         mutex_enter(&si->si_lock);
1823                         if (gen != si->si_gen)
1824                                 goto retry;
1825                 }
1826 #ifdef DIAGNOSTIC
1827                 if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1828                         panic("ffs_copyonwrite: bad copy block");
1829 #endif
1830                 if (blkno != 0) /* non-zero: nothing to copy for this snapshot */
1831                         continue;
1832
                     /*
                      * A copy is needed, but it is not attempted when running
                      * as the pagedaemon: fail with ENOMEM instead.
                      */
1833                 if (curlwp == uvm.pagedaemon_lwp) {
1834                         error = ENOMEM;
1835                         break;
1836                 }
1837
                     /*
                      * Take si_snaplock before copying.  If it cannot be had
                      * without blocking, release si_lock while waiting for it.
                      * Afterwards revalidate: restart if the snapshot list
                      * changed, else look up the block address once more.
                      */
1838                 if (snapshot_locked == 0) {
1839                         if (!mutex_tryenter(&si->si_snaplock)) {
1840                                 mutex_exit(&si->si_lock);
1841                                 mutex_enter(&si->si_snaplock);
1842                                 mutex_enter(&si->si_lock);
1843                         }
1844                         snapshot_locked = 1;
1845                         if (gen != si->si_gen)
1846                                 goto retry;
1847
1848                         /* Check again if block still needs to be copied */
1849                         if (lbn < NDADDR) {
1850                                 blkno = db_get(ip, lbn);
1851                         } else {
1852                                 mutex_exit(&si->si_lock);
1853                                 if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1854                                         mutex_enter(&si->si_lock);
1855                                         break;
1856                                 }
1857                                 mutex_enter(&si->si_lock);
1858                                 if (gen != si->si_gen)
1859                                         goto retry;
1860                         }
1861
1862                         if (blkno != 0)
1863                                 continue;
1864                 }
1865                 /*
1866                  * Allocate the block into which to do the copy. Since
1867                  * multiple processes may all try to copy the same block,
1868                  * we have to recheck our need to do a copy if we sleep
1869                  * waiting for the lock.
1870                  *
1871                  * Because all snapshots on a filesystem share a single
1872                  * lock, we ensure that we will never be in competition
1873                  * with another process to allocate a block.
1874                  */
1875 #ifdef DEBUG
1876                 if (snapdebug) {
1877                         printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
1878                             (unsigned long long)ip->i_number, lbn);
1879                         if (bp->b_vp == devvp)
1880                                 printf("fs metadata");
1881                         else
1882                                 printf("inum %llu", (unsigned long long)
1883                                     VTOI(bp->b_vp)->i_number);
1884                         printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
1885                 }
1886 #endif
1887                 /*
1888                  * If we have already read the old block contents, then
1889                  * simply copy them to the new block. Note that we need
1890                  * to synchronously write snapshots that have not been
1891                  * unlinked, and hence will be visible after a crash,
1892                  * to ensure their integrity.
1893                  */
1894                 mutex_exit(&si->si_lock);
1895                 if (saved_data == NULL) {
1896                         saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1897                         error = rwfsblk(vp, B_READ, saved_data, lbn);
1898                         if (error) {
1899                                 free(saved_data, M_UFSMNT);
1900                                 saved_data = NULL;
1901                                 mutex_enter(&si->si_lock);
1902                                 break;
1903                         }
1904                 }
1905                 error = wrsnapblk(vp, saved_data, lbn);
1906                 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1907                         error = syncsnap(vp);
1908                 mutex_enter(&si->si_lock);
1909                 if (error)
1910                         break;
1911                 if (gen != si->si_gen)
1912                         goto retry;
1913         }
1914         /*
1915          * Common exit, reached with si_lock held on every path:
1916          * drop the locks and free saved_data unless it was borrowed
1917          * from bp, in which case it is still bp's own data buffer.
1918          */
1919         mutex_exit(&si->si_lock);
1920         if (saved_data && saved_data != bp->b_data)
1921                 free(saved_data, M_UFSMNT);
1922         if (snapshot_locked)
1923                 mutex_exit(&si->si_snaplock);
1924         return error;
1925 }
1926
1927 /*
1928  * Read from a snapshot.
1929  */
1930 int
1931 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
1932 {
1933         struct inode *ip = VTOI(vp);
1934         struct fs *fs = ip->i_fs;
1935         struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
1936         struct buf *bp;
1937         daddr_t lbn, nextlbn;
1938         off_t fsbytes, bytesinfile;
1939         long size, xfersize, blkoffset;
1940         int error;
1941
1942         fstrans_start(vp->v_mount, FSTRANS_SHARED);
1943         mutex_enter(&si->si_snaplock);
1944
1945         if (ioflag & IO_ALTSEMANTICS)
1946                 fsbytes = ip->i_size;
1947         else
1948                 fsbytes = lfragtosize(fs, fs->fs_size);
1949         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1950                 bytesinfile = fsbytes - uio->uio_offset;
1951                 if (bytesinfile <= 0)
1952                         break;
1953                 lbn = lblkno(fs, uio->uio_offset);
1954                 nextlbn = lbn + 1;
1955                 size = fs->fs_bsize;
1956                 blkoffset = blkoff(fs, uio->uio_offset);
1957                 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
1958                     bytesinfile);
1959
1960                 if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
1961                         if (lblktosize(fs, lbn) + size > fsbytes)
1962                                 size = fragroundup(fs,
1963                                     fsbytes - lblktosize(fs, lbn));
1964                         error = bread(vp, lbn, size, NOCRED, 0, &bp);
1965                 } else {
1966                         int nextsize = fs->fs_bsize;
1967                         error = breadn(vp, lbn,
1968                             size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
1969                 }
1970                 if (error)
1971                         break;
1972
1973                 /*
1974                  * We should only get non-zero b_resid when an I/O error
1975                  * has occurred, which should cause us to break above.
1976                  * However, if the short read did not cause an error,
1977                  * then we want to ensure that we do not uiomove bad
1978                  * or uninitialized data.
1979                  */
1980                 size -= bp->b_resid;
1981                 if (size < blkoffset + xfersize) {
1982                         xfersize = size - blkoffset;
1983                         if (xfersize <= 0)
1984                                 break;
1985                 }
1986                 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
1987                 if (error)
1988                         break;
1989                 brelse(bp, BC_AGE);
1990         }
1991         if (bp != NULL)
1992                 brelse(bp, BC_AGE);
1993
1994         mutex_exit(&si->si_snaplock);
1995         fstrans_done(vp->v_mount);
1996         return error;
1997 }
1998
1999 /*
2000  * Lookup a snapshots data block address.
2001  * Simpler than UFS_BALLOC() as we know all metadata is already allocated
2002  * and safe even for the pagedaemon where we cannot bread().
2003  */
2004 static int
2005 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2006 {
2007         struct indir indirs[NIADDR + 2];
2008         struct inode *ip = VTOI(vp);
2009         struct fs *fs = ip->i_fs;
2010         struct buf *bp;
2011         int error, num;
2012
2013         KASSERT(lbn >= 0);
2014
2015         if (lbn < NDADDR) {
2016                 *res = db_get(ip, lbn);
2017                 return 0;
2018         }
2019         if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2020                 return error;
2021         if (curlwp == uvm.pagedaemon_lwp) {
2022                 mutex_enter(&bufcache_lock);
2023                 bp = incore(vp, indirs[num-1].in_lbn);
2024                 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2025                         *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2026                         error = 0;
2027                 } else
2028                         error = ENOMEM;
2029                 mutex_exit(&bufcache_lock);
2030                 return error;
2031         }
2032         error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
2033         if (error == 0)
2034                 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2035         brelse(bp, 0);
2036
2037         return error;
2038 }
2039
2040 /*
2041  * Read or write the specified block of the filesystem vp resides on
2042  * from or to the disk bypassing the buffer cache.
2043  */
2044 static int
2045 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2046 {
2047         int error;
2048         struct inode *ip = VTOI(vp);
2049         struct fs *fs = ip->i_fs;
2050         struct buf *nbp;
2051
2052         nbp = getiobuf(NULL, true);
2053         nbp->b_flags = flags;
2054         nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2055         nbp->b_error = 0;
2056         nbp->b_data = data;
2057         nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
2058         nbp->b_proc = NULL;
2059         nbp->b_dev = ip->i_devvp->v_rdev;
2060         SET(nbp->b_cflags, BC_BUSY);    /* mark buffer busy */
2061
2062         bdev_strategy(nbp);
2063
2064         error = biowait(nbp);
2065
2066         putiobuf(nbp);
2067
2068         return error;
2069 }
2070
2071 /*
2072  * Write all dirty buffers to disk and invalidate them.
2073  */
2074 static int
2075 syncsnap(struct vnode *vp)
2076 {
2077         int error;
2078         buf_t *bp;
2079         struct fs *fs = VTOI(vp)->i_fs;
2080
2081         mutex_enter(&bufcache_lock);
2082         while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2083                 KASSERT((bp->b_cflags & BC_BUSY) == 0);
2084                 KASSERT(bp->b_bcount == fs->fs_bsize);
2085                 bp->b_cflags |= BC_BUSY;
2086                 mutex_exit(&bufcache_lock);
2087                 error = rwfsblk(vp, B_WRITE, bp->b_data,
2088                     fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
2089                 brelse(bp, BC_INVAL | BC_VFLUSH);
2090                 if (error)
2091                         return error;
2092                 mutex_enter(&bufcache_lock);
2093         }
2094         mutex_exit(&bufcache_lock);
2095
2096         return 0;
2097 }
2098
2099 /*
2100  * Write the specified block to a snapshot.
2101  */
2102 static int
2103 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2104 {
2105         struct inode *ip = VTOI(vp);
2106         struct fs *fs = ip->i_fs;
2107         struct buf *bp;
2108         int error;
2109
2110         error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2111             FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2112         if (error)
2113                 return error;
2114         memcpy(bp->b_data, data, fs->fs_bsize);
2115         if (ip->i_nlink > 0)
2116                 error = bwrite(bp);
2117         else
2118                 bawrite(bp);
2119
2120         return error;
2121 }
2122
2123 /*
2124  * Check if this inode is present on the active snapshot list.
2125  * Must be called with snapinfo locked.
2126  */
2127 static inline bool
2128 is_active_snapshot(struct snap_info *si, struct inode *ip)
2129 {
2130         struct inode *xp;
2131
2132         KASSERT(mutex_owned(&si->si_lock));
2133
2134         TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2135                 if (xp == ip)
2136                         return true;
2137         return false;
2138 }
2139
2140 /*
2141  * Get/Put direct block from inode or buffer containing disk addresses. Take
2142  * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2143  * into a global include.
2144  */
2145 static inline daddr_t
2146 db_get(struct inode *ip, int loc)
2147 {
2148         if (ip->i_ump->um_fstype == UFS1)
2149                 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2150         else
2151                 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2152 }
2153
2154 static inline void
2155 db_assign(struct inode *ip, int loc, daddr_t val)
2156 {
2157         if (ip->i_ump->um_fstype == UFS1)
2158                 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2159         else
2160                 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2161 }
2162
2163 static inline daddr_t
2164 ib_get(struct inode *ip, int loc)
2165 {
2166         if (ip->i_ump->um_fstype == UFS1)
2167                 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2168         else
2169                 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2170 }
2171
2172 static inline void
2173 ib_assign(struct inode *ip, int loc, daddr_t val)
2174 {
2175         if (ip->i_ump->um_fstype == UFS1)
2176                 ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2177         else
2178                 ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2179 }
2180
2181 static inline daddr_t
2182 idb_get(struct inode *ip, void *bf, int loc)
2183 {
2184         if (ip->i_ump->um_fstype == UFS1)
2185                 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2186         else
2187                 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2188 }
2189
2190 static inline void
2191 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2192 {
2193         if (ip->i_ump->um_fstype == UFS1)
2194                 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2195         else
2196                 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2197 }