sys/ufs/lfs/lfs_vnops.c

   1 /*      $NetBSD: lfs_vnops.c,v 1.225 2009/11/17 22:49:24 eeh Exp $      */
   2
   3 /*-
   4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Konrad E. Schroder <perseant@hhhh.org>.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31 /*
  32  * Copyright (c) 1986, 1989, 1991, 1993, 1995
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 3. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95
  60  */
  61
  62 #include <sys/cdefs.h>
  63 __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.225 2009/11/17 22:49:24 eeh Exp $");
  64
  65 #ifdef _KERNEL_OPT
  66 #include "opt_compat_netbsd.h"
  67 #endif
  68
  69 #include <sys/param.h>
  70 #include <sys/systm.h>
  71 #include <sys/namei.h>
  72 #include <sys/resourcevar.h>
  73 #include <sys/kernel.h>
  74 #include <sys/file.h>
  75 #include <sys/stat.h>
  76 #include <sys/buf.h>
  77 #include <sys/proc.h>
  78 #include <sys/mount.h>
  79 #include <sys/vnode.h>
  80 #include <sys/pool.h>
  81 #include <sys/signalvar.h>
  82 #include <sys/kauth.h>
  83 #include <sys/syslog.h>
  84 #include <sys/fstrans.h>
  85
  86 #include <miscfs/fifofs/fifo.h>
  87 #include <miscfs/genfs/genfs.h>
  88 #include <miscfs/specfs/specdev.h>
  89
  90 #include <ufs/ufs/inode.h>
  91 #include <ufs/ufs/dir.h>
  92 #include <ufs/ufs/ufsmount.h>
  93 #include <ufs/ufs/ufs_extern.h>
  94
  95 #include <uvm/uvm.h>
  96 #include <uvm/uvm_pmap.h>
  97 #include <uvm/uvm_stat.h>
  98 #include <uvm/uvm_pager.h>
  99
 100 #include <ufs/lfs/lfs.h>
 101 #include <ufs/lfs/lfs_extern.h>
 102
 103 extern pid_t lfs_writer_daemon;
 104 int lfs_ignore_lazy_sync = 1;
 105
 106 /* Global vfs data structures for lfs. */
 107 int (**lfs_vnodeop_p)(void *);
 108 const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
 109         { &vop_default_desc, vn_default_error },
 110         { &vop_lookup_desc, ufs_lookup },               /* lookup */
 111         { &vop_create_desc, lfs_create },               /* create */
 112         { &vop_whiteout_desc, ufs_whiteout },           /* whiteout */
 113         { &vop_mknod_desc, lfs_mknod },                 /* mknod */
 114         { &vop_open_desc, ufs_open },                   /* open */
 115         { &vop_close_desc, lfs_close },                 /* close */
 116         { &vop_access_desc, ufs_access },               /* access */
 117         { &vop_getattr_desc, lfs_getattr },             /* getattr */
 118         { &vop_setattr_desc, lfs_setattr },             /* setattr */
 119         { &vop_read_desc, lfs_read },                   /* read */
 120         { &vop_write_desc, lfs_write },                 /* write */
 121         { &vop_ioctl_desc, ufs_ioctl },                 /* ioctl */
 122         { &vop_fcntl_desc, lfs_fcntl },                 /* fcntl */
 123         { &vop_poll_desc, ufs_poll },                   /* poll */
 124         { &vop_kqfilter_desc, genfs_kqfilter },         /* kqfilter */
 125         { &vop_revoke_desc, ufs_revoke },               /* revoke */
 126         { &vop_mmap_desc, lfs_mmap },                   /* mmap */
 127         { &vop_fsync_desc, lfs_fsync },                 /* fsync */
 128         { &vop_seek_desc, ufs_seek },                   /* seek */
 129         { &vop_remove_desc, lfs_remove },               /* remove */
 130         { &vop_link_desc, lfs_link },                   /* link */
 131         { &vop_rename_desc, lfs_rename },               /* rename */
 132         { &vop_mkdir_desc, lfs_mkdir },                 /* mkdir */
 133         { &vop_rmdir_desc, lfs_rmdir },                 /* rmdir */
 134         { &vop_symlink_desc, lfs_symlink },             /* symlink */
 135         { &vop_readdir_desc, ufs_readdir },             /* readdir */
 136         { &vop_readlink_desc, ufs_readlink },           /* readlink */
 137         { &vop_abortop_desc, ufs_abortop },             /* abortop */
 138         { &vop_inactive_desc, lfs_inactive },           /* inactive */
 139         { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
 140         { &vop_lock_desc, ufs_lock },                   /* lock */
 141         { &vop_unlock_desc, ufs_unlock },               /* unlock */
 142         { &vop_bmap_desc, ufs_bmap },                   /* bmap */
 143         { &vop_strategy_desc, lfs_strategy },           /* strategy */
 144         { &vop_print_desc, ufs_print },                 /* print */
 145         { &vop_islocked_desc, ufs_islocked },           /* islocked */
 146         { &vop_pathconf_desc, ufs_pathconf },           /* pathconf */
 147         { &vop_advlock_desc, ufs_advlock },             /* advlock */
 148         { &vop_bwrite_desc, lfs_bwrite },               /* bwrite */
 149         { &vop_getpages_desc, lfs_getpages },           /* getpages */
 150         { &vop_putpages_desc, lfs_putpages },           /* putpages */
 151         { NULL, NULL }
 152 };
 153 const struct vnodeopv_desc lfs_vnodeop_opv_desc =
 154         { &lfs_vnodeop_p, lfs_vnodeop_entries };
 155
 156 int (**lfs_specop_p)(void *);
 157 const struct vnodeopv_entry_desc lfs_specop_entries[] = {
 158         { &vop_default_desc, vn_default_error },
 159         { &vop_lookup_desc, spec_lookup },              /* lookup */
 160         { &vop_create_desc, spec_create },              /* create */
 161         { &vop_mknod_desc, spec_mknod },                /* mknod */
 162         { &vop_open_desc, spec_open },                  /* open */
 163         { &vop_close_desc, lfsspec_close },             /* close */
 164         { &vop_access_desc, ufs_access },               /* access */
 165         { &vop_getattr_desc, lfs_getattr },             /* getattr */
 166         { &vop_setattr_desc, lfs_setattr },             /* setattr */
 167         { &vop_read_desc, ufsspec_read },               /* read */
 168         { &vop_write_desc, ufsspec_write },             /* write */
 169         { &vop_ioctl_desc, spec_ioctl },                /* ioctl */
 170         { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
 171         { &vop_poll_desc, spec_poll },                  /* poll */
 172         { &vop_kqfilter_desc, spec_kqfilter },          /* kqfilter */
 173         { &vop_revoke_desc, spec_revoke },              /* revoke */
 174         { &vop_mmap_desc, spec_mmap },                  /* mmap */
 175         { &vop_fsync_desc, spec_fsync },                /* fsync */
 176         { &vop_seek_desc, spec_seek },                  /* seek */
 177         { &vop_remove_desc, spec_remove },              /* remove */
 178         { &vop_link_desc, spec_link },                  /* link */
 179         { &vop_rename_desc, spec_rename },              /* rename */
 180         { &vop_mkdir_desc, spec_mkdir },                /* mkdir */
 181         { &vop_rmdir_desc, spec_rmdir },                /* rmdir */
 182         { &vop_symlink_desc, spec_symlink },            /* symlink */
 183         { &vop_readdir_desc, spec_readdir },            /* readdir */
 184         { &vop_readlink_desc, spec_readlink },          /* readlink */
 185         { &vop_abortop_desc, spec_abortop },            /* abortop */
 186         { &vop_inactive_desc, lfs_inactive },           /* inactive */
 187         { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
 188         { &vop_lock_desc, ufs_lock },                   /* lock */
 189         { &vop_unlock_desc, ufs_unlock },               /* unlock */
 190         { &vop_bmap_desc, spec_bmap },                  /* bmap */
 191         { &vop_strategy_desc, spec_strategy },          /* strategy */
 192         { &vop_print_desc, ufs_print },                 /* print */
 193         { &vop_islocked_desc, ufs_islocked },           /* islocked */
 194         { &vop_pathconf_desc, spec_pathconf },          /* pathconf */
 195         { &vop_advlock_desc, spec_advlock },            /* advlock */
 196         { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
 197         { &vop_getpages_desc, spec_getpages },          /* getpages */
 198         { &vop_putpages_desc, spec_putpages },          /* putpages */
 199         { NULL, NULL }
 200 };
 201 const struct vnodeopv_desc lfs_specop_opv_desc =
 202         { &lfs_specop_p, lfs_specop_entries };
 203
 204 int (**lfs_fifoop_p)(void *);
 205 const struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
 206         { &vop_default_desc, vn_default_error },
 207         { &vop_lookup_desc, fifo_lookup },              /* lookup */
 208         { &vop_create_desc, fifo_create },              /* create */
 209         { &vop_mknod_desc, fifo_mknod },                /* mknod */
 210         { &vop_open_desc, fifo_open },                  /* open */
 211         { &vop_close_desc, lfsfifo_close },             /* close */
 212         { &vop_access_desc, ufs_access },               /* access */
 213         { &vop_getattr_desc, lfs_getattr },             /* getattr */
 214         { &vop_setattr_desc, lfs_setattr },             /* setattr */
 215         { &vop_read_desc, ufsfifo_read },               /* read */
 216         { &vop_write_desc, ufsfifo_write },             /* write */
 217         { &vop_ioctl_desc, fifo_ioctl },                /* ioctl */
 218         { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
 219         { &vop_poll_desc, fifo_poll },                  /* poll */
 220         { &vop_kqfilter_desc, fifo_kqfilter },          /* kqfilter */
 221         { &vop_revoke_desc, fifo_revoke },              /* revoke */
 222         { &vop_mmap_desc, fifo_mmap },                  /* mmap */
 223         { &vop_fsync_desc, fifo_fsync },                /* fsync */
 224         { &vop_seek_desc, fifo_seek },                  /* seek */
 225         { &vop_remove_desc, fifo_remove },              /* remove */
 226         { &vop_link_desc, fifo_link },                  /* link */
 227         { &vop_rename_desc, fifo_rename },              /* rename */
 228         { &vop_mkdir_desc, fifo_mkdir },                /* mkdir */
 229         { &vop_rmdir_desc, fifo_rmdir },                /* rmdir */
 230         { &vop_symlink_desc, fifo_symlink },            /* symlink */
 231         { &vop_readdir_desc, fifo_readdir },            /* readdir */
 232         { &vop_readlink_desc, fifo_readlink },          /* readlink */
 233         { &vop_abortop_desc, fifo_abortop },            /* abortop */
 234         { &vop_inactive_desc, lfs_inactive },           /* inactive */
 235         { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
 236         { &vop_lock_desc, ufs_lock },                   /* lock */
 237         { &vop_unlock_desc, ufs_unlock },               /* unlock */
 238         { &vop_bmap_desc, fifo_bmap },                  /* bmap */
 239         { &vop_strategy_desc, fifo_strategy },          /* strategy */
 240         { &vop_print_desc, ufs_print },                 /* print */
 241         { &vop_islocked_desc, ufs_islocked },           /* islocked */
 242         { &vop_pathconf_desc, fifo_pathconf },          /* pathconf */
 243         { &vop_advlock_desc, fifo_advlock },            /* advlock */
 244         { &vop_bwrite_desc, lfs_bwrite },               /* bwrite */
 245         { &vop_putpages_desc, fifo_putpages },          /* putpages */
 246         { NULL, NULL }
 247 };
 248 const struct vnodeopv_desc lfs_fifoop_opv_desc =
 249         { &lfs_fifoop_p, lfs_fifoop_entries };
 250
 251 static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int, struct vm_page **);
 252
 253 #define LFS_READWRITE
 254 #include <ufs/ufs/ufs_readwrite.c>
 255 #undef  LFS_READWRITE
 256
 257 /*
 258  * Synch an open file.
 259  */
 260 /* ARGSUSED */
 261 int
 262 lfs_fsync(void *v)
 263 {
 264         struct vop_fsync_args /* {
 265                 struct vnode *a_vp;
 266                 kauth_cred_t a_cred;
 267                 int a_flags;
 268                 off_t offlo;
 269                 off_t offhi;
 270         } */ *ap = v;
 271         struct vnode *vp = ap->a_vp;
 272         int error, wait;
 273         struct inode *ip = VTOI(vp);
 274         struct lfs *fs = ip->i_lfs;
 275
 276         /* If we're mounted read-only, don't try to sync. */
 277         if (fs->lfs_ronly)
 278                 return 0;
 279
 280         /*
 281          * Trickle sync simply adds this vnode to the pager list, as if
 282          * the pagedaemon had requested a pageout.
 283          */
 284         if (ap->a_flags & FSYNC_LAZY) {
 285                 if (lfs_ignore_lazy_sync == 0) {
 286                         mutex_enter(&lfs_lock);
 287                         if (!(ip->i_flags & IN_PAGING)) {
 288                                 ip->i_flags |= IN_PAGING;
 289                                 TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip,
 290                                                   i_lfs_pchain);
 291                         }
 292                         wakeup(&lfs_writer_daemon);
 293                         mutex_exit(&lfs_lock);
 294                 }
 295                 return 0;
 296         }
 297
 298         /*
 299          * If a vnode is bring cleaned, flush it out before we try to
 300          * reuse it.  This prevents the cleaner from writing files twice
 301          * in the same partial segment, causing an accounting underflow.
 302          */
 303         if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) {
 304                 lfs_vflush(vp);
 305         }
 306
 307         wait = (ap->a_flags & FSYNC_WAIT);
 308         do {
 309                 mutex_enter(&vp->v_interlock);
 310                 error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
 311                                      round_page(ap->a_offhi),
 312                                      PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
 313                 if (error == EAGAIN) {
 314                         mutex_enter(&lfs_lock);
 315                         mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync",
 316                                 hz / 100 + 1, &lfs_lock);
 317                         mutex_exit(&lfs_lock);
 318                 }
 319         } while (error == EAGAIN);
 320         if (error)
 321                 return error;
 322
 323         if ((ap->a_flags & FSYNC_DATAONLY) == 0)
 324                 error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
 325
 326         if (error == 0 && ap->a_flags & FSYNC_CACHE) {
 327                 int l = 0;
 328                 error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE,
 329                                   curlwp->l_cred);
 330         }
 331         if (wait && !VPISEMPTY(vp))
 332                 LFS_SET_UINO(ip, IN_MODIFIED);
 333
 334         return error;
 335 }
 336
 337 /*
 338  * Take IN_ADIROP off, then call ufs_inactive.
 339  */
 340 int
 341 lfs_inactive(void *v)
 342 {
 343         struct vop_inactive_args /* {
 344                 struct vnode *a_vp;
 345         } */ *ap = v;
 346
 347         lfs_unmark_vnode(ap->a_vp);
 348
 349         /*
 350          * The Ifile is only ever inactivated on unmount.
 351          * Streamline this process by not giving it more dirty blocks.
 352          */
 353         if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) {
 354                 mutex_enter(&lfs_lock);
 355                 LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD);
 356                 mutex_exit(&lfs_lock);
 357                 VOP_UNLOCK(ap->a_vp, 0);
 358                 return 0;
 359         }
 360
 361         return ufs_inactive(v);
 362 }
 363
 364 /*
 365  * These macros are used to bracket UFS directory ops, so that we can
 366  * identify all the pages touched during directory ops which need to
 367  * be ordered and flushed atomically, so that they may be recovered.
 368  *
 369  * Because we have to mark nodes VU_DIROP in order to prevent
 370  * the cache from reclaiming them while a dirop is in progress, we must
 371  * also manage the number of nodes so marked (otherwise we can run out).
 372  * We do this by setting lfs_dirvcount to the number of marked vnodes; it
 373  * is decremented during segment write, when VU_DIROP is taken off.
 374  */
 375 #define MARK_VNODE(vp)                  lfs_mark_vnode(vp)
 376 #define UNMARK_VNODE(vp)                lfs_unmark_vnode(vp)
 377 #define SET_DIROP_CREATE(dvp, vpp)      lfs_set_dirop_create((dvp), (vpp))
 378 #define SET_DIROP_REMOVE(dvp, vp)       lfs_set_dirop((dvp), (vp))
 379 static int lfs_set_dirop_create(struct vnode *, struct vnode **);
 380 static int lfs_set_dirop(struct vnode *, struct vnode *);
 381
 382 static int
 383 lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
 384 {
 385         struct lfs *fs;
 386         int error;
 387
 388         KASSERT(VOP_ISLOCKED(dvp));
 389         KASSERT(vp == NULL || VOP_ISLOCKED(vp));
 390
 391         fs = VTOI(dvp)->i_lfs;
 392
 393         ASSERT_NO_SEGLOCK(fs);
 394         /*
 395          * LFS_NRESERVE calculates direct and indirect blocks as well
 396          * as an inode block; an overestimate in most cases.
 397          */
 398         if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0)
 399                 return (error);
 400
 401     restart:
 402         mutex_enter(&lfs_lock);
 403         if (fs->lfs_dirops == 0) {
 404                 mutex_exit(&lfs_lock);
 405                 lfs_check(dvp, LFS_UNUSED_LBN, 0);
 406                 mutex_enter(&lfs_lock);
 407         }
 408         while (fs->lfs_writer) {
 409                 error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH,
 410                     "lfs_sdirop", 0, &lfs_lock);
 411                 if (error == EINTR) {
 412                         mutex_exit(&lfs_lock);
 413                         goto unreserve;
 414                 }
 415         }
 416         if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
 417                 wakeup(&lfs_writer_daemon);
 418                 mutex_exit(&lfs_lock);
 419                 preempt();
 420                 goto restart;
 421         }
 422
 423         if (lfs_dirvcount > LFS_MAX_DIROP) {
 424                 mutex_exit(&lfs_lock);
 425                 DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, "
 426                       "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount));
 427                 if ((error = mtsleep(&lfs_dirvcount,
 428                     PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0,
 429                     &lfs_lock)) != 0) {
 430                         goto unreserve;
 431                 }
 432                 goto restart;
 433         }
 434
 435         ++fs->lfs_dirops;
 436         fs->lfs_doifile = 1;
 437         mutex_exit(&lfs_lock);
 438
 439         /* Hold a reference so SET_ENDOP will be happy */
 440         vref(dvp);
 441         if (vp) {
 442                 vref(vp);
 443                 MARK_VNODE(vp);
 444         }
 445
 446         MARK_VNODE(dvp);
 447         return 0;
 448
 449   unreserve:
 450         lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs));
 451         return error;
 452 }
 453
 454 /*
 455  * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
 456  * in getnewvnode(), if we have a stacked filesystem mounted on top
 457  * of us.
 458  *
 459  * NB: this means we have to clear the new vnodes on error.  Fortunately
 460  * SET_ENDOP is there to do that for us.
 461  */
 462 static int
 463 lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp)
 464 {
 465         int error;
 466         struct lfs *fs;
 467
 468         fs = VFSTOUFS(dvp->v_mount)->um_lfs;
 469         ASSERT_NO_SEGLOCK(fs);
 470         if (fs->lfs_ronly)
 471                 return EROFS;
 472         if (vpp && (error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, vpp))) {
 473                 DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n",
 474                       dvp, error));
 475                 return error;
 476         }
 477         if ((error = lfs_set_dirop(dvp, NULL)) != 0) {
 478                 if (vpp) {
 479                         ungetnewvnode(*vpp);
 480                         *vpp = NULL;
 481                 }
 482                 return error;
 483         }
 484         return 0;
 485 }
 486
 487 #define SET_ENDOP_BASE(fs, dvp, str)                                    \
 488         do {                                                            \
 489                 mutex_enter(&lfs_lock);                         \
 490                 --(fs)->lfs_dirops;                                     \
 491                 if (!(fs)->lfs_dirops) {                                \
 492                         if ((fs)->lfs_nadirop) {                        \
 493                                 panic("SET_ENDOP: %s: no dirops but "   \
 494                                         " nadirop=%d", (str),           \
 495                                         (fs)->lfs_nadirop);             \
 496                         }                                               \
 497                         wakeup(&(fs)->lfs_writer);                      \
 498                         mutex_exit(&lfs_lock);                          \
 499                         lfs_check((dvp), LFS_UNUSED_LBN, 0);            \
 500                 } else                                                  \
 501                         mutex_exit(&lfs_lock);                          \
 502         } while(0)
 503 #define SET_ENDOP_CREATE(fs, dvp, nvpp, str)                            \
 504         do {                                                            \
 505                 UNMARK_VNODE(dvp);                                      \
 506                 if (nvpp && *nvpp)                                      \
 507                         UNMARK_VNODE(*nvpp);                            \
 508                 /* Check for error return to stem vnode leakage */      \
 509                 if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP))    \
 510                         ungetnewvnode(*(nvpp));                         \
 511                 SET_ENDOP_BASE((fs), (dvp), (str));                     \
 512                 lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs));      \
 513                 vrele(dvp);                                             \
 514         } while(0)
 515 #define SET_ENDOP_CREATE_AP(ap, str)                                    \
 516         SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp,         \
 517                          (ap)->a_vpp, (str))
 518 #define SET_ENDOP_REMOVE(fs, dvp, ovp, str)                             \
 519         do {                                                            \
 520                 UNMARK_VNODE(dvp);                                      \
 521                 if (ovp)                                                \
 522                         UNMARK_VNODE(ovp);                              \
 523                 SET_ENDOP_BASE((fs), (dvp), (str));                     \
 524                 lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs));     \
 525                 vrele(dvp);                                             \
 526                 if (ovp)                                                \
 527                         vrele(ovp);                                     \
 528         } while(0)
 529
 530 void
 531 lfs_mark_vnode(struct vnode *vp)
 532 {
 533         struct inode *ip = VTOI(vp);
 534         struct lfs *fs = ip->i_lfs;
 535
 536         mutex_enter(&lfs_lock);
 537         if (!(ip->i_flag & IN_ADIROP)) {
 538                 if (!(vp->v_uflag & VU_DIROP)) {
 539                         mutex_enter(&vp->v_interlock);
 540                         (void)lfs_vref(vp);
 541                         ++lfs_dirvcount;
 542                         ++fs->lfs_dirvcount;
 543                         TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
 544                         vp->v_uflag |= VU_DIROP;
 545                 }
 546                 ++fs->lfs_nadirop;
 547                 ip->i_flag |= IN_ADIROP;
 548         } else
 549                 KASSERT(vp->v_uflag & VU_DIROP);
 550         mutex_exit(&lfs_lock);
 551 }
 552
 553 void
 554 lfs_unmark_vnode(struct vnode *vp)
 555 {
 556         struct inode *ip = VTOI(vp);
 557
 558         if (ip && (ip->i_flag & IN_ADIROP)) {
 559                 KASSERT(vp->v_uflag & VU_DIROP);
 560                 mutex_enter(&lfs_lock);
 561                 --ip->i_lfs->lfs_nadirop;
 562                 mutex_exit(&lfs_lock);
 563                 ip->i_flag &= ~IN_ADIROP;
 564         }
 565 }
 566
 567 int
 568 lfs_symlink(void *v)
 569 {
 570         struct vop_symlink_args /* {
 571                 struct vnode *a_dvp;
 572                 struct vnode **a_vpp;
 573                 struct componentname *a_cnp;
 574                 struct vattr *a_vap;
 575                 char *a_target;
 576         } */ *ap = v;
 577         int error;
 578
 579         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 580                 vput(ap->a_dvp);
 581                 return error;
 582         }
 583         error = ufs_symlink(ap);
 584         SET_ENDOP_CREATE_AP(ap, "symlink");
 585         return (error);
 586 }
 587
 588 int
 589 lfs_mknod(void *v)
 590 {
 591         struct vop_mknod_args   /* {
 592                 struct vnode *a_dvp;
 593                 struct vnode **a_vpp;
 594                 struct componentname *a_cnp;
 595                 struct vattr *a_vap;
 596         } */ *ap = v;
 597         struct vattr *vap = ap->a_vap;
 598         struct vnode **vpp = ap->a_vpp;
 599         struct inode *ip;
 600         int error;
 601         struct mount    *mp;
 602         ino_t           ino;
 603
 604         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 605                 vput(ap->a_dvp);
 606                 return error;
 607         }
 608         error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 609                               ap->a_dvp, vpp, ap->a_cnp);
 610
 611         /* Either way we're done with the dirop at this point */
 612         SET_ENDOP_CREATE_AP(ap, "mknod");
 613
 614         if (error)
 615                 return (error);
 616
 617         ip = VTOI(*vpp);
 618         mp  = (*vpp)->v_mount;
 619         ino = ip->i_number;
 620         ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 621         if (vap->va_rdev != VNOVAL) {
 622                 /*
 623                  * Want to be able to use this to make badblock
 624                  * inodes, so don't truncate the dev number.
 625                  */
 626 #if 0
 627                 ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
 628                                            UFS_MPNEEDSWAP((*vpp)->v_mount));
 629 #else
 630                 ip->i_ffs1_rdev = vap->va_rdev;
 631 #endif
 632         }
 633
 634         /*
 635          * Call fsync to write the vnode so that we don't have to deal with
 636          * flushing it when it's marked VU_DIROP|VI_XLOCK.
 637          *
 638          * XXX KS - If we can't flush we also can't call vgone(), so must
 639          * return.  But, that leaves this vnode in limbo, also not good.
 640          * Can this ever happen (barring hardware failure)?
 641          */
 642         if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) {
 643                 panic("lfs_mknod: couldn't fsync (ino %llu)",
 644                       (unsigned long long)ino);
 645                 /* return (error); */
 646         }
 647         /*
 648          * Remove vnode so that it will be reloaded by VFS_VGET and
 649          * checked to see if it is an alias of an existing entry in
 650          * the inode cache.
 651          */
 652         /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */
 653
 654         VOP_UNLOCK(*vpp, 0);
 655         (*vpp)->v_type = VNON;
 656         vgone(*vpp);
 657         error = VFS_VGET(mp, ino, vpp);
 658
 659         if (error != 0) {
 660                 *vpp = NULL;
 661                 return (error);
 662         }
 663         return (0);
 664 }
 665
 666 int
 667 lfs_create(void *v)
 668 {
 669         struct vop_create_args  /* {
 670                 struct vnode *a_dvp;
 671                 struct vnode **a_vpp;
 672                 struct componentname *a_cnp;
 673                 struct vattr *a_vap;
 674         } */ *ap = v;
 675         int error;
 676
 677         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 678                 vput(ap->a_dvp);
 679                 return error;
 680         }
 681         error = ufs_create(ap);
 682         SET_ENDOP_CREATE_AP(ap, "create");
 683         return (error);
 684 }
 685
 686 int
 687 lfs_mkdir(void *v)
 688 {
 689         struct vop_mkdir_args   /* {
 690                 struct vnode *a_dvp;
 691                 struct vnode **a_vpp;
 692                 struct componentname *a_cnp;
 693                 struct vattr *a_vap;
 694         } */ *ap = v;
 695         int error;
 696
 697         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 698                 vput(ap->a_dvp);
 699                 return error;
 700         }
 701         error = ufs_mkdir(ap);
 702         SET_ENDOP_CREATE_AP(ap, "mkdir");
 703         return (error);
 704 }
 705
 706 int
 707 lfs_remove(void *v)
 708 {
 709         struct vop_remove_args  /* {
 710                 struct vnode *a_dvp;
 711                 struct vnode *a_vp;
 712                 struct componentname *a_cnp;
 713         } */ *ap = v;
 714         struct vnode *dvp, *vp;
 715         struct inode *ip;
 716         int error;
 717
 718         dvp = ap->a_dvp;
 719         vp = ap->a_vp;
 720         ip = VTOI(vp);
 721         if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) {
 722                 if (dvp == vp)
 723                         vrele(vp);
 724                 else
 725                         vput(vp);
 726                 vput(dvp);
 727                 return error;
 728         }
 729         error = ufs_remove(ap);
 730         if (ip->i_nlink == 0)
 731                 lfs_orphan(ip->i_lfs, ip->i_number);
 732         SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove");
 733         return (error);
 734 }
 735
 736 int
 737 lfs_rmdir(void *v)
 738 {
 739         struct vop_rmdir_args   /* {
 740                 struct vnodeop_desc *a_desc;
 741                 struct vnode *a_dvp;
 742                 struct vnode *a_vp;
 743                 struct componentname *a_cnp;
 744         } */ *ap = v;
 745         struct vnode *vp;
 746         struct inode *ip;
 747         int error;
 748
 749         vp = ap->a_vp;
 750         ip = VTOI(vp);
 751         if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) {
 752                 if (ap->a_dvp == vp)
 753                         vrele(ap->a_dvp);
 754                 else
 755                         vput(ap->a_dvp);
 756                 vput(vp);
 757                 return error;
 758         }
 759         error = ufs_rmdir(ap);
 760         if (ip->i_nlink == 0)
 761                 lfs_orphan(ip->i_lfs, ip->i_number);
 762         SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
 763         return (error);
 764 }
 765
 766 int
 767 lfs_link(void *v)
 768 {
 769         struct vop_link_args    /* {
 770                 struct vnode *a_dvp;
 771                 struct vnode *a_vp;
 772                 struct componentname *a_cnp;
 773         } */ *ap = v;
 774         int error;
 775         struct vnode **vpp = NULL;
 776
 777         if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) {
 778                 vput(ap->a_dvp);
 779                 return error;
 780         }
 781         error = ufs_link(ap);
 782         SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link");
 783         return (error);
 784 }
 785
     /*
      * VOP_RENAME entry point.
      *
      * Checks made here before ufs_rename() is involved:
      *   EXDEV  - the source vnode is on a different mount than the target
      *            directory or than an existing target vnode.
      *   EPERM  - an existing target is IMMUTABLE or APPEND, or the target
      *            directory is APPEND.
      *   EINVAL - source and target are the same vnode and it is a
      *            directory.
      * These failures, and a failure of SET_DIROP_REMOVE, leave through
      * "errout", which calls VOP_ABORTOP on both directories and releases
      * the four vnodes passed in.
      *
      * If source and target are the same non-directory vnode, the
      * destination is released, the source name is looked up again with
      * DELETE, and the result of VOP_REMOVE() (or the relookup error) is
      * returned.
      *
      * Otherwise ufs_rename() runs between SET_DIROP_REMOVE and
      * SET_ENDOP_REMOVE on the target directory/vnode, with the source
      * directory and source vnode marked via MARK_VNODE for the duration,
      * and its result is returned.
      */
 786 int
 787 lfs_rename(void *v)
 788 {
 789         struct vop_rename_args  /* {
 790                 struct vnode *a_fdvp;
 791                 struct vnode *a_fvp;
 792                 struct componentname *a_fcnp;
 793                 struct vnode *a_tdvp;
 794                 struct vnode *a_tvp;
 795                 struct componentname *a_tcnp;
 796         } */ *ap = v;
 797         struct vnode *tvp, *fvp, *tdvp, *fdvp;
 798         struct componentname *tcnp, *fcnp;
 799         int error;
 800         struct lfs *fs;
 801
 802         fs = VTOI(ap->a_fdvp)->i_lfs;
 803         tvp = ap->a_tvp;
 804         tdvp = ap->a_tdvp;
 805         tcnp = ap->a_tcnp;
 806         fvp = ap->a_fvp;
 807         fdvp = ap->a_fdvp;
 808         fcnp = ap->a_fcnp;
 809
 810         /*
 811          * Check for cross-device rename.
 812          * If it is, we don't want to set dirops, just error out.
 813          * (In particular note that MARK_VNODE(tdvp) will DTWT on
 814          * a cross-device rename.)
 815          *
 816          * Copied from ufs_rename.
 817          */
 818         if ((fvp->v_mount != tdvp->v_mount) ||
 819             (tvp && (fvp->v_mount != tvp->v_mount))) {
 820                 error = EXDEV;
 821                 goto errout;
 822         }
 823
 824         /*
 825          * Check to make sure we're not renaming a vnode onto itself
 826          * (deleting a hard link by renaming one name onto another);
 827          * if we are we can't recursively call VOP_REMOVE since that
 828          * would leave us with an unaccounted-for number of live dirops.
 829          *
 830          * Inline the relevant section of ufs_rename here, *before*
 831          * calling SET_DIROP_REMOVE.
 832          */
 833         if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
 834                     (VTOI(tdvp)->i_flags & APPEND))) {
 835                 error = EPERM;
 836                 goto errout;
 837         }
             /* Same vnode on both sides: handled here, outside any dirop. */
 838         if (fvp == tvp) {
 839                 if (fvp->v_type == VDIR) {
 840                         error = EINVAL;
 841                         goto errout;
 842                 }
 843
 844                 /* Release destination completely. */
 845                 VOP_ABORTOP(tdvp, tcnp);
 846                 vput(tdvp);
 847                 vput(tvp);
 848
 849                 /* Delete source. */
 850                 vrele(fvp);
 851                 fcnp->cn_flags &= ~(MODMASK | SAVESTART);
 852                 fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 853                 fcnp->cn_nameiop = DELETE;
 854                 vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
 855                 if ((error = relookup(fdvp, &fvp, fcnp))) {
 856                         vput(fdvp);
 857                         return (error);
 858                 }
 859                 return (VOP_REMOVE(fdvp, fvp, fcnp));
 860         }
 861
             /*
              * Ordinary rename: SET_DIROP_REMOVE/SET_ENDOP_REMOVE bracket
              * the call on the target side; the source directory and
              * source vnode are marked before and unmarked after
              * ufs_rename().
              */
 862         if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0)
 863                 goto errout;
 864         MARK_VNODE(fdvp);
 865         MARK_VNODE(fvp);
 866
 867         error = ufs_rename(ap);
 868         UNMARK_VNODE(fdvp);
 869         UNMARK_VNODE(fvp);
 870         SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename");
 871         return (error);
 872
             /*
              * Common failure exit: abort both lookups and release every
              * vnode that was passed in.  The target side is released with
              * vput() (vrele() for tdvp when it is the same vnode as tvp),
              * the source side with vrele().
              */
 873   errout:
 874         VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */
 875         if (tdvp == tvp)
 876                 vrele(tdvp);
 877         else
 878                 vput(tdvp);
 879         if (tvp)
 880                 vput(tvp);
 881         VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? */
 882         vrele(fdvp);
 883         vrele(fvp);
 884         return (error);
 885 }
 886
 887 /* XXX hack to avoid calling ITIMES in getattr */
 888 int
 889 lfs_getattr(void *v)
 890 {
 891         struct vop_getattr_args /* {
 892                 struct vnode *a_vp;
 893                 struct vattr *a_vap;
 894                 kauth_cred_t a_cred;
 895         } */ *ap = v;
 896         struct vnode *vp = ap->a_vp;
 897         struct inode *ip = VTOI(vp);
 898         struct vattr *vap = ap->a_vap;
 899         struct lfs *fs = ip->i_lfs;
 900         /*
 901          * Copy from inode table
 902          */
 903         vap->va_fsid = ip->i_dev;
 904         vap->va_fileid = ip->i_number;
 905         vap->va_mode = ip->i_mode & ~IFMT;
 906         vap->va_nlink = ip->i_nlink;
 907         vap->va_uid = ip->i_uid;
 908         vap->va_gid = ip->i_gid;
 909         vap->va_rdev = (dev_t)ip->i_ffs1_rdev;
 910         vap->va_size = vp->v_size;
 911         vap->va_atime.tv_sec = ip->i_ffs1_atime;
 912         vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
 913         vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
 914         vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
 915         vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
 916         vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
 917         vap->va_flags = ip->i_flags;
 918         vap->va_gen = ip->i_gen;
 919         /* this doesn't belong here */
 920         if (vp->v_type == VBLK)
 921                 vap->va_blocksize = BLKDEV_IOSIZE;
 922         else if (vp->v_type == VCHR)
 923                 vap->va_blocksize = MAXBSIZE;
 924         else
 925                 vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 926         vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
 927         vap->va_type = vp->v_type;
 928         vap->va_filerev = ip->i_modrev;
 929         return (0);
 930 }
 931
 932 /*
 933  * Check to make sure the inode blocks won't choke the buffer
 934  * cache, then call ufs_setattr as usual.
 935  */
 936 int
 937 lfs_setattr(void *v)
 938 {
 939         struct vop_setattr_args /* {
 940                 struct vnode *a_vp;
 941                 struct vattr *a_vap;
 942                 kauth_cred_t a_cred;
 943         } */ *ap = v;
 944         struct vnode *vp = ap->a_vp;
 945
 946         lfs_check(vp, LFS_UNUSED_LBN, 0);
 947         return ufs_setattr(v);
 948 }
 949
 950 /*
 951  * Release the block we hold on lfs_newseg wrapping.  Called on file close,
 952  * or explicitly from LFCNWRAPGO.  Called with the interlock held.
 953  */
 954 static int
 955 lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor)
 956 {
 957         if (fs->lfs_stoplwp != curlwp)
 958                 return EBUSY;
 959
 960         fs->lfs_stoplwp = NULL;
 961         cv_signal(&fs->lfs_stopcv);
 962
 963         KASSERT(fs->lfs_nowrap > 0);
 964         if (fs->lfs_nowrap <= 0) {
 965                 return 0;
 966         }
 967
 968         if (--fs->lfs_nowrap == 0) {
 969                 log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt);
 970                 wakeup(&fs->lfs_wrappass);
 971                 lfs_wakeup_cleaner(fs);
 972         }
 973         if (waitfor) {
 974                 mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment",
 975                     0, &lfs_lock);
 976         }
 977
 978         return 0;
 979 }
 980
 981 /*
 982  * Close called
 983  */
 984 /* ARGSUSED */
 985 int
 986 lfs_close(void *v)
 987 {
 988         struct vop_close_args /* {
 989                 struct vnode *a_vp;
 990                 int  a_fflag;
 991                 kauth_cred_t a_cred;
 992         } */ *ap = v;
 993         struct vnode *vp = ap->a_vp;
 994         struct inode *ip = VTOI(vp);
 995         struct lfs *fs = ip->i_lfs;
 996
 997         if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) &&
 998             fs->lfs_stoplwp == curlwp) {
 999                 mutex_enter(&lfs_lock);
1000                 log(LOG_NOTICE, "lfs_close: releasing log wrap control\n");
1001                 lfs_wrapgo(fs, ip, 0);
1002                 mutex_exit(&lfs_lock);
1003         }
1004
1005         if (vp == ip->i_lfs->lfs_ivnode &&
1006             vp->v_mount->mnt_iflag & IMNT_UNMOUNT)
1007                 return 0;
1008
1009         if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) {
1010                 LFS_ITIMES(ip, NULL, NULL, NULL);
1011         }
1012         return (0);
1013 }
1014
1015 /*
1016  * Close wrapper for special devices.
1017  *
1018  * Update the times on the inode then do device close.
1019  */
1020 int
1021 lfsspec_close(void *v)
1022 {
1023         struct vop_close_args /* {
1024                 struct vnode    *a_vp;
1025                 int             a_fflag;
1026                 kauth_cred_t    a_cred;
1027         } */ *ap = v;
1028         struct vnode    *vp;
1029         struct inode    *ip;
1030
1031         vp = ap->a_vp;
1032         ip = VTOI(vp);
1033         if (vp->v_usecount > 1) {
1034                 LFS_ITIMES(ip, NULL, NULL, NULL);
1035         }
1036         return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
1037 }
1038
1039 /*
1040  * Close wrapper for fifo's.
1041  *
1042  * Update the times on the inode then do device close.
1043  */
1044 int
1045 lfsfifo_close(void *v)
1046 {
1047         struct vop_close_args /* {
1048                 struct vnode    *a_vp;
1049                 int             a_fflag;
1050                 kauth_cred_     a_cred;
1051         } */ *ap = v;
1052         struct vnode    *vp;
1053         struct inode    *ip;
1054
1055         vp = ap->a_vp;
1056         ip = VTOI(vp);
1057         if (ap->a_vp->v_usecount > 1) {
1058                 LFS_ITIMES(ip, NULL, NULL, NULL);
1059         }
1060         return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
1061 }
1062
1063 /*
1064  * Reclaim an inode so that it can be used for other purposes.
      *
      * Steps, in order:
      *  - clear IN_ALLMOD on the inode (LFS_CLR_UINO, under lfs_lock);
      *  - call ufs_reclaim(); if it fails, return its error and do nothing
      *    else;
      *  - under lfs_lock, take the inode off the paging queue if it is still
      *    marked IN_PAGING (a warning is logged), and panic if the vnode is
      *    still marked VU_DIROP;
      *  - return the dinode, the LFS inode extension and the inode itself to
      *    their pools, call lfs_deregister_all() and genfs_node_destroy(),
      *    and clear vp->v_data.
      *
      * Returns 0 on success.
1065  */
1066
1067 int
1068 lfs_reclaim(void *v)
1069 {
1070         struct vop_reclaim_args /* {
1071                 struct vnode *a_vp;
1072         } */ *ap = v;
1073         struct vnode *vp = ap->a_vp;
1074         struct inode *ip = VTOI(vp);
1075         struct lfs *fs = ip->i_lfs;
1076         int error;
1077
1078         mutex_enter(&lfs_lock);
1079         LFS_CLR_UINO(ip, IN_ALLMOD);
1080         mutex_exit(&lfs_lock);
1081         if ((error = ufs_reclaim(vp)))
1082                 return (error);
1083
1084         /*
1085          * Take us off the paging and/or dirop queues if we were on them.
1086          * We shouldn't be on them.
1087          */
1088         mutex_enter(&lfs_lock);
             /*
              * NOTE(review): IN_PAGING is tested in ip->i_flags here, while
              * other IN_* bits in this file (IN_ALLMOD, IN_ADIROP) are tested
              * in ip->i_flag.  Confirm which field is intended before
              * touching this; the code that sets the bit must agree.
              */
1089         if (ip->i_flags & IN_PAGING) {
1090                 log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n",
1091                     fs->lfs_fsmnt);
1092                 ip->i_flags &= ~IN_PAGING;
1093                 TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
1094         }
1095         if (vp->v_uflag & VU_DIROP) {
1096                 panic("reclaimed vnode is VU_DIROP");
                     /* Presumably not reached: panic() is not expected to return. */
1097                 vp->v_uflag &= ~VU_DIROP;
1098                 TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
1099         }
1100         mutex_exit(&lfs_lock);
1101
             /* Free the per-inode allocations and detach the inode from vp. */
1102         pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
1103         lfs_deregister_all(vp);
1104         pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
1105         ip->inode_ext.lfs = NULL;
1106         genfs_node_destroy(vp);
1107         pool_put(&lfs_inode_pool, vp->v_data);
1108         vp->v_data = NULL;
1109         return (0);
1110 }
1111
1112 /*
1113  * Read a block from a storage device.
1114  * In order to avoid reading blocks that are in the process of being
1115  * written by the cleaner---and hence are not mutexed by the normal
1116  * buffer cache / page cache mechanisms---check for collisions before
1117  * reading.
1118  *
1119  * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
1120  * the active cleaner test.
1121  *
1122  * XXX This code assumes that lfs_markv makes synchronous checkpoints.
      *
      * Returns the VOP_BMAP error if the block cannot be mapped (the buffer
      * is completed with biodone() first), and 0 otherwise.  A buffer whose
      * block number is negative after mapping is completed immediately
      * without I/O; everything else is passed to VOP_STRATEGY() on the
      * inode's device vnode.  Panics if called on a VBLK or VCHR vnode.
1123  */
1124 int
1125 lfs_strategy(void *v)
1126 {
1127         struct vop_strategy_args /* {
1128                 struct vnode *a_vp;
1129                 struct buf *a_bp;
1130         } */ *ap = v;
1131         struct buf      *bp;
1132         struct lfs      *fs;
1133         struct vnode    *vp;
1134         struct inode    *ip;
1135         daddr_t         tbn;
1136         int             i, sn, error, slept;
1137
1138         bp = ap->a_bp;
1139         vp = ap->a_vp;
1140         ip = VTOI(vp);
1141         fs = ip->i_lfs;
1142
1143         /* lfs uses its strategy routine only for read */
1144         KASSERT(bp->b_flags & B_READ);
1145
1146         if (vp->v_type == VBLK || vp->v_type == VCHR)
1147                 panic("lfs_strategy: spec");
1148         KASSERT(bp->b_bcount != 0);
             /*
              * b_blkno still equal to the logical block number: ask VOP_BMAP
              * to fill in b_blkno (presumably this means the buffer has not
              * been translated yet).
              */
1149         if (bp->b_blkno == bp->b_lblkno) {
1150                 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
1151                                  NULL);
1152                 if (error) {
1153                         bp->b_error = error;
1154                         bp->b_resid = bp->b_bcount;
1155                         biodone(bp);
1156                         return (error);
1157                 }
1158                 if ((long)bp->b_blkno == -1) /* no valid data */
1159                         clrbuf(bp);
1160         }
1161         if ((long)bp->b_blkno < 0) { /* block is not on disk */
1162                 bp->b_resid = bp->b_bcount;
1163                 biodone(bp);
1164                 return (0);
1165         }
1166
             /*
              * Collision check.  "slept" starts at 1 so the interval list is
              * scanned at least once while the seglock is set; each pass
              * clears it, and it is set again only if we slept, which forces
              * another full scan.  lfs_lock is held at the loop test and
              * dropped while scanning.
              */
1167         slept = 1;
1168         mutex_enter(&lfs_lock);
1169         while (slept && fs->lfs_seglock) {
1170                 mutex_exit(&lfs_lock);
1171                 /*
1172                  * Look through list of intervals.
1173                  * There will only be intervals to look through
1174                  * if the cleaner holds the seglock.
1175                  * Since the cleaner is synchronous, we can trust
1176                  * the list of intervals to be current.
1177                  */
1178                 tbn = dbtofsb(fs, bp->b_blkno);
1179                 sn = dtosn(fs, tbn);
1180                 slept = 0;
1181                 for (i = 0; i < fs->lfs_cleanind; i++) {
                             /*
                              * Match: same segment as the interval and at or
                              * past the interval's start.
                              */
1182                         if (sn == dtosn(fs, fs->lfs_cleanint[i]) &&
1183                             tbn >= fs->lfs_cleanint[i]) {
1184                                 DLOG((DLOG_CLEAN,
1185                                       "lfs_strategy: ino %d lbn %" PRId64
1186                                       " ind %d sn %d fsb %" PRIx32
1187                                       " given sn %d fsb %" PRIx64 "\n",
1188                                       ip->i_number, bp->b_lblkno, i,
1189                                       dtosn(fs, fs->lfs_cleanint[i]),
1190                                       fs->lfs_cleanint[i], sn, tbn));
1191                                 DLOG((DLOG_CLEAN,
1192                                       "lfs_strategy: sleeping on ino %d lbn %"
1193                                       PRId64 "\n", ip->i_number, bp->b_lblkno));
1194                                 mutex_enter(&lfs_lock);
                                     /*
                                      * Both sleeps pass PNORELOCK and are
                                      * followed by "break" straight to the
                                      * mutex_enter() after the for loop, so
                                      * lfs_lock is presumably not held when
                                      * mtsleep() returns.
                                      */
1195                                 if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
1196                                         /* Cleaner can't wait for itself */
1197                                         mtsleep(&fs->lfs_iocount,
1198                                                 (PRIBIO + 1) | PNORELOCK,
1199                                                 "clean2", 0,
1200                                                 &lfs_lock);
1201                                         slept = 1;
1202                                         break;
1203                                 } else if (fs->lfs_seglock) {
1204                                         mtsleep(&fs->lfs_seglock,
1205                                                 (PRIBIO + 1) | PNORELOCK,
1206                                                 "clean1", 0,
1207                                                 &lfs_lock);
1208                                         slept = 1;
1209                                         break;
1210                                 }
                                     /* Nothing to wait for; keep scanning. */
1211                                 mutex_exit(&lfs_lock);
1212                         }
1213                 }
                     /* Re-take lfs_lock for the loop test. */
1214                 mutex_enter(&lfs_lock);
1215         }
1216         mutex_exit(&lfs_lock);
1217
             /* Hand the read to the device vnode backing this inode. */
1218         vp = ip->i_devvp;
1219         VOP_STRATEGY(vp, bp);
1220         return (0);
1221 }
1222
     /*
      * Write out every inode on the filesystem's dirop chain
      * (fs->lfs_dchainhd) in a segment written under lfs_seglock(SEGM_CKP).
      *
      * Returns without doing anything if the filesystem is read-only or the
      * chain is empty.  Asserts that fs->lfs_nadirop is zero on entry.
      * For each chained inode whose vnode is not VI_XLOCK: non-VREG vnodes
      * with modifications or pages are written with lfs_writefile(), and the
      * inode itself is always written with lfs_writeinode().
      */
1223 void
1224 lfs_flush_dirops(struct lfs *fs)
1225 {
1226         struct inode *ip, *nip;
1227         struct vnode *vp;
1228         extern int lfs_dostats;
1229         struct segment *sp;
1230         int waslocked;
1231
1232         ASSERT_MAYBE_SEGLOCK(fs);
1233         KASSERT(fs->lfs_nadirop == 0);
1234
1235         if (fs->lfs_ronly)
1236                 return;
1237
             /* Nothing queued: skip taking the segment lock altogether. */
1238         mutex_enter(&lfs_lock);
1239         if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
1240                 mutex_exit(&lfs_lock);
1241                 return;
1242         } else
1243                 mutex_exit(&lfs_lock);
1244
1245         if (lfs_dostats)
1246                 ++lfs_stats.flush_invoked;
1247
1248         /*
1249          * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
1250          * Technically this is a checkpoint (the on-disk state is valid)
1251          * even though we are leaving out all the file data.
1252          */
1253         lfs_imtime(fs);
1254         lfs_seglock(fs, SEGM_CKP);
1255         sp = fs->lfs_sp;
1256
1257         /*
1258          * lfs_writevnodes, optimized to get dirops out of the way.
1259          * Only write dirops, and don't flush files' pages, only
1260          * blocks from the directories.
1261          *
1262          * We don't need to vref these files because they are
1263          * dirops and so hold an extra reference until the
1264          * segunlock clears them of that status.
1265          *
1266          * We don't need to check for IN_ADIROP because we know that
1267          * no dirops are active.
1268          *
1269          */
             /*
              * lfs_lock is held at the top of every iteration (and at each
              * "continue") and dropped while an inode is being written; the
              * next pointer is fetched before the lock is dropped.
              */
1270         mutex_enter(&lfs_lock);
1271         for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
1272                 nip = TAILQ_NEXT(ip, i_lfs_dchain);
1273                 mutex_exit(&lfs_lock);
1274                 vp = ITOV(ip);
1275
1276                 KASSERT((ip->i_flag & IN_ADIROP) == 0);
1277
1278                 /*
1279                  * All writes to directories come from dirops; all
1280                  * writes to files' direct blocks go through the page
1281                  * cache, which we're not touching.  Reads to files
1282                  * and/or directories will not be affected by writing
1283                  * directory blocks inodes and file inodes.  So we don't
1284                  * really need to lock.  If we don't lock, though,
1285                  * make sure that we don't clear IN_MODIFIED
1286                  * unnecessarily.
1287                  */
1288                 if (vp->v_iflag & VI_XLOCK) {
1289                         mutex_enter(&lfs_lock);
1290                         continue;
1291                 }
                     /* Remember the lock state seen before writing; used below. */
1292                 waslocked = VOP_ISLOCKED(vp);
1293                 if (vp->v_type != VREG &&
1294                     ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
1295                         lfs_writefile(fs, sp, vp);
                             /*
                              * Blocks remain but no modification flag is
                              * set: flag the inode IN_MODIFIED again.
                              */
1296                         if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
1297                             !(ip->i_flag & IN_ALLMOD)) {
1298                                 mutex_enter(&lfs_lock);
1299                                 LFS_SET_UINO(ip, IN_MODIFIED);
1300                                 mutex_exit(&lfs_lock);
1301                         }
1302                 }
1303                 KDASSERT(ip->i_number != LFS_IFILE_INUM);
1304                 (void) lfs_writeinode(fs, sp, ip);
1305                 mutex_enter(&lfs_lock);
                     /*
                      * The vnode was exclusively locked by someone else while
                      * we wrote it without locking: keep it marked
                      * IN_MODIFIED (see the block comment above).
                      */
1306                 if (waslocked == LK_EXCLOTHER)
1307                         LFS_SET_UINO(ip, IN_MODIFIED);
1308         }
1309         mutex_exit(&lfs_lock);
1310         /* We've written all the dirops there are */
1311         ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
1312         lfs_finalize_fs_seguse(fs);
1313         (void) lfs_writeseg(fs, sp);
1314         lfs_segunlock(fs);
1315 }
1316
1317 /*
1318  * Flush all vnodes for which the pagedaemon has requested pageouts.
1319  * Skip over any files that are marked VU_DIROP (since lfs_flush_dirops()
1320  * has just run, this would be an error).  If we have to skip a vnode
1321  * for any reason, just skip it; if we have to wait for the cleaner,
1322  * abort.  The writer daemon will call us again later.
      *
      * Returns without doing anything if the filesystem is read-only or the
      * paging chain (fs->lfs_pchainhd) is empty.  Asserts that the caller
      * does not hold the segment lock.
1323  */
1324 void
1325 lfs_flush_pchain(struct lfs *fs)
1326 {
1327         struct inode *ip, *nip;
1328         struct vnode *vp;
1329         extern int lfs_dostats;
1330         struct segment *sp;
1331         int error;
1332
1333         ASSERT_NO_SEGLOCK(fs);
1334
1335         if (fs->lfs_ronly)
1336                 return;
1337
1338         mutex_enter(&lfs_lock);
1339         if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
1340                 mutex_exit(&lfs_lock);
1341                 return;
1342         } else
1343                 mutex_exit(&lfs_lock);
1344
1345         /* Get dirops out of the way */
1346         lfs_flush_dirops(fs);
1347
1348         if (lfs_dostats)
1349                 ++lfs_stats.flush_invoked;
1350
1351         /*
1352          * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
1353          */
1354         lfs_imtime(fs);
1355         lfs_seglock(fs, 0);
1356         sp = fs->lfs_sp;
1357
1358         /*
1359          * lfs_writevnodes, optimized to clear pageout requests.
1360          * Only write non-dirop files that are in the pageout queue.
1361          * We're very conservative about what we write; we want to be
1362          * fast and async.
1363          */
             /*
              * lfs_lock is held at the top of every iteration and at every
              * "continue"; it is dropped only around the actual write.
              */
1364         mutex_enter(&lfs_lock);
1365     top:
1366         for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) {
1367                 nip = TAILQ_NEXT(ip, i_lfs_pchain);
1368                 vp = ITOV(ip);
1369
                     /*
                      * Inode reached through the chain but no longer marked
                      * IN_PAGING: presumably it left the queue while lfs_lock
                      * was dropped, so the saved next pointer cannot be
                      * trusted.  Restart from the head.
                      */
1370                 if (!(ip->i_flags & IN_PAGING))
1371                         goto top;
1372
1373                 mutex_enter(&vp->v_interlock);
1374                 if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) {
1375                         mutex_exit(&vp->v_interlock);
1376                         continue;
1377                 }
1378                 if (vp->v_type != VREG) {
1379                         mutex_exit(&vp->v_interlock);
1380                         continue;
1381                 }
                     /*
                      * NOTE(review): v_interlock is not released here on
                      * either outcome; presumably lfs_vref() consumes it --
                      * confirm against lfs_vref().
                      */
1382                 if (lfs_vref(vp))
1383                         continue;
1384                 mutex_exit(&lfs_lock);
1385
                     /* Don't wait for a vnode lock; skip locked vnodes. */
1386                 if (VOP_ISLOCKED(vp)) {
1387                         lfs_vunref(vp);
1388                         mutex_enter(&lfs_lock);
1389                         continue;
1390                 }
1391
1392                 error = lfs_writefile(fs, sp, vp);
1393                 if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
1394                     !(ip->i_flag & IN_ALLMOD)) {
1395                         mutex_enter(&lfs_lock);
1396                         LFS_SET_UINO(ip, IN_MODIFIED);
1397                         mutex_exit(&lfs_lock);
1398                 }
1399                 KDASSERT(ip->i_number != LFS_IFILE_INUM);
1400                 (void) lfs_writeinode(fs, sp, ip);
1401
1402                 lfs_vunref(vp);
1403
                     /*
                      * lfs_writefile() returned EAGAIN: write what we have
                      * and stop walking the chain (the "abort" case from the
                      * header comment).
                      */
1404                 if (error == EAGAIN) {
1405                         lfs_writeseg(fs, sp);
1406                         mutex_enter(&lfs_lock);
1407                         break;
1408                 }
1409                 mutex_enter(&lfs_lock);
1410         }
1411         mutex_exit(&lfs_lock);
1412         (void) lfs_writeseg(fs, sp);
1413         lfs_segunlock(fs);
1414 }
1415
1416 /*
1417  * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
1418  */
1419 int
1420 lfs_fcntl(void *v)
1421 {
1422         struct vop_fcntl_args /* {
1423                 struct vnode *a_vp;
1424                 u_int a_command;
1425                 void * a_data;
1426                 int  a_fflag;
1427                 kauth_cred_t a_cred;
1428         } */ *ap = v;
1429         struct timeval tv;
1430         struct timeval *tvp;
1431         BLOCK_INFO *blkiov;
1432         CLEANERINFO *cip;
1433         SEGUSE *sup;
1434         int blkcnt, error, oclean;
1435         size_t fh_size;
1436         struct lfs_fcntl_markv blkvp;
1437         struct lwp *l;
1438         fsid_t *fsidp;
1439         struct lfs *fs;
1440         struct buf *bp;
1441         fhandle_t *fhp;
1442         daddr_t off;
1443
1444         /* Only respect LFS fcntls on fs root or Ifile */
1445         if (VTOI(ap->a_vp)->i_number != ROOTINO &&
1446             VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) {
1447                 return ufs_fcntl(v);
1448         }
1449
1450         /* Avoid locking a draining lock */
1451         if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) {
1452                 return ESHUTDOWN;
1453         }
1454
1455         /* LFS control and monitoring fcntls are available only to root */
1456         l = curlwp;
1457         if (((ap->a_command & 0xff00) >> 8) == 'L' &&
1458             (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
1459                                              NULL)) != 0)
1460                 return (error);
1461
1462         fs = VTOI(ap->a_vp)->i_lfs;
1463         fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx;
1464
1465         error = 0;
1466         switch ((int)ap->a_command) {
1467             case LFCNSEGWAITALL_COMPAT_50:
1468             case LFCNSEGWAITALL_COMPAT:
1469                 fsidp = NULL;
1470                 /* FALLSTHROUGH */
1471             case LFCNSEGWAIT_COMPAT_50:
1472             case LFCNSEGWAIT_COMPAT:
1473                 {
1474                         struct timeval50 *tvp50
1475                                 = (struct timeval50 *)ap->a_data;
1476                         timeval50_to_timeval(tvp50, &tv);
1477                         tvp = &tv;
1478                 }
1479                 goto segwait_common;
1480             case LFCNSEGWAITALL:
1481                 fsidp = NULL;
1482                 /* FALLSTHROUGH */
1483             case LFCNSEGWAIT:
1484                 tvp = (struct timeval *)ap->a_data;
1485 segwait_common:
1486                 mutex_enter(&lfs_lock);
1487                 ++fs->lfs_sleepers;
1488                 mutex_exit(&lfs_lock);
1489
1490                 error = lfs_segwait(fsidp, tvp);
1491
1492                 mutex_enter(&lfs_lock);
1493                 if (--fs->lfs_sleepers == 0)
1494                         wakeup(&fs->lfs_sleepers);
1495                 mutex_exit(&lfs_lock);
1496                 return error;
1497
1498             case LFCNBMAPV:
1499             case LFCNMARKV:
1500                 blkvp = *(struct lfs_fcntl_markv *)ap->a_data;
1501
1502                 blkcnt = blkvp.blkcnt;
1503                 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
1504                         return (EINVAL);
1505                 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
1506                 if ((error = copyin(blkvp.blkiov, blkiov,
1507                      blkcnt * sizeof(BLOCK_INFO))) != 0) {
1508                         lfs_free(fs, blkiov, LFS_NB_BLKIOV);
1509                         return error;
1510                 }
1511
1512                 mutex_enter(&lfs_lock);
1513                 ++fs->lfs_sleepers;
1514                 mutex_exit(&lfs_lock);
1515                 if (ap->a_command == LFCNBMAPV)
1516                         error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt);
1517                 else /* LFCNMARKV */
1518                         error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt);
1519                 if (error == 0)
1520                         error = copyout(blkiov, blkvp.blkiov,
1521                                         blkcnt * sizeof(BLOCK_INFO));
1522                 mutex_enter(&lfs_lock);
1523                 if (--fs->lfs_sleepers == 0)
1524                         wakeup(&fs->lfs_sleepers);
1525                 mutex_exit(&lfs_lock);
1526                 lfs_free(fs, blkiov, LFS_NB_BLKIOV);
1527                 return error;
1528
1529             case LFCNRECLAIM:
1530                 /*
1531                  * Flush dirops and write Ifile, allowing empty segments
1532                  * to be immediately reclaimed.
1533                  */
1534                 lfs_writer_enter(fs, "pndirop");
1535                 off = fs->lfs_offset;
1536                 lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP);
1537                 lfs_flush_dirops(fs);
1538                 LFS_CLEANERINFO(cip, fs, bp);
1539                 oclean = cip->clean;
1540                 LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
1541                 lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
1542                 fs->lfs_sp->seg_flags |= SEGM_PROT;
1543                 lfs_segunlock(fs);
1544                 lfs_writer_leave(fs);
1545
1546 #ifdef DEBUG
1547                 LFS_CLEANERINFO(cip, fs, bp);
1548                 DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64
1549                       " blocks, cleaned %" PRId32 " segments (activesb %d)\n",
1550                       fs->lfs_offset - off, cip->clean - oclean,
1551                       fs->lfs_activesb));
1552                 LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
1553 #endif
1554
1555                 return 0;
1556
1557             case LFCNIFILEFH_COMPAT:
1558                 /* Return the filehandle of the Ifile */
1559                 if ((error = kauth_authorize_system(l->l_cred,
1560                     KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0)
1561                         return (error);
1562                 fhp = (struct fhandle *)ap->a_data;
1563                 fhp->fh_fsid = *fsidp;
1564                 fh_size = 16;   /* former VFS_MAXFIDSIZ */
1565                 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
1566
1567             case LFCNIFILEFH_COMPAT2:
1568             case LFCNIFILEFH:
1569                 /* Return the filehandle of the Ifile */
1570                 fhp = (struct fhandle *)ap->a_data;
1571                 fhp->fh_fsid = *fsidp;
1572                 fh_size = sizeof(struct lfs_fhandle) -
1573                     offsetof(fhandle_t, fh_fid);
1574                 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
1575
1576             case LFCNREWIND:
1577                 /* Move lfs_offset to the lowest-numbered segment */
1578                 return lfs_rewind(fs, *(int *)ap->a_data);
1579
1580             case LFCNINVAL:
1581                 /* Mark a segment SEGUSE_INVAL */
1582                 LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp);
1583                 if (sup->su_nbytes > 0) {
1584                         brelse(bp, 0);
1585                         lfs_unset_inval_all(fs);
1586                         return EBUSY;
1587                 }
1588                 sup->su_flags |= SEGUSE_INVAL;
1589                 VOP_BWRITE(bp);
1590                 return 0;
1591
1592             case LFCNRESIZE:
1593                 /* Resize the filesystem */
1594                 return lfs_resize_fs(fs, *(int *)ap->a_data);
1595
1596             case LFCNWRAPSTOP:
1597             case LFCNWRAPSTOP_COMPAT:
1598                 /*
1599                  * Hold lfs_newseg at segment 0; if requested, sleep until
1600                  * the filesystem wraps around.  To support external agents
1601                  * (dump, fsck-based regression test) that need to look at
1602                  * a snapshot of the filesystem, without necessarily
1603                  * requiring that all fs activity stops.
1604                  */
1605                 if (fs->lfs_stoplwp == curlwp)
1606                         return EALREADY;
1607
1608                 mutex_enter(&lfs_lock);
1609                 while (fs->lfs_stoplwp != NULL)
1610                         cv_wait(&fs->lfs_stopcv, &lfs_lock);
1611                 fs->lfs_stoplwp = curlwp;
1612                 if (fs->lfs_nowrap == 0)
1613                         log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt);
1614                 ++fs->lfs_nowrap;
1615                 if (*(int *)ap->a_data == 1
1616                     || ap->a_command == LFCNWRAPSTOP_COMPAT) {
1617                         log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n");
1618                         error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
1619                                 "segwrap", 0, &lfs_lock);
1620                         log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n");
1621                         if (error) {
1622                                 lfs_wrapgo(fs, VTOI(ap->a_vp), 0);
1623                         }
1624                 }
1625                 mutex_exit(&lfs_lock);
1626                 return 0;
1627
1628             case LFCNWRAPGO:
1629             case LFCNWRAPGO_COMPAT:
1630                 /*
1631                  * Having done its work, the agent wakes up the writer.
1632                  * If the argument is 1, it sleeps until a new segment
1633                  * is selected.
1634                  */
1635                 mutex_enter(&lfs_lock);
1636                 error = lfs_wrapgo(fs, VTOI(ap->a_vp),
1637                                    ap->a_command == LFCNWRAPGO_COMPAT ? 1 :
1638                                     *((int *)ap->a_data));
1639                 mutex_exit(&lfs_lock);
1640                 return error;
1641
1642             case LFCNWRAPPASS:
1643                 if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT))
1644                         return EALREADY;
1645                 mutex_enter(&lfs_lock);
1646                 if (fs->lfs_stoplwp != curlwp) {
1647                         mutex_exit(&lfs_lock);
1648                         return EALREADY;
1649                 }
1650                 if (fs->lfs_nowrap == 0) {
1651                         mutex_exit(&lfs_lock);
1652                         return EBUSY;
1653                 }
1654                 fs->lfs_wrappass = 1;
1655                 wakeup(&fs->lfs_wrappass);
1656                 /* Wait for the log to wrap, if asked */
1657                 if (*(int *)ap->a_data) {
1658                         mutex_enter(&ap->a_vp->v_interlock);
1659                         lfs_vref(ap->a_vp);
1660                         VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
1661                         log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
1662                         error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
1663                                 "segwrap", 0, &lfs_lock);
1664                         log(LOG_NOTICE, "LFCNPASS done waiting\n");
1665                         VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT;
1666                         lfs_vunref(ap->a_vp);
1667                 }
1668                 mutex_exit(&lfs_lock);
1669                 return error;
1670
1671             case LFCNWRAPSTATUS:
1672                 mutex_enter(&lfs_lock);
1673                 *(int *)ap->a_data = fs->lfs_wrapstatus;
1674                 mutex_exit(&lfs_lock);
1675                 return 0;
1676
1677             default:
1678                 return ufs_fcntl(v);
1679         }
1680         return 0;
1681 }
1682
1683 int
1684 lfs_getpages(void *v)
1685 {
1686         struct vop_getpages_args /* {
1687                 struct vnode *a_vp;
1688                 voff_t a_offset;
1689                 struct vm_page **a_m;
1690                 int *a_count;
1691                 int a_centeridx;
1692                 vm_prot_t a_access_type;
1693                 int a_advice;
1694                 int a_flags;
1695         } */ *ap = v;
1696
1697         if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM &&
1698             (ap->a_access_type & VM_PROT_WRITE) != 0) {
1699                 return EPERM;
1700         }
1701         if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
1702                 mutex_enter(&lfs_lock);
1703                 LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
1704                 mutex_exit(&lfs_lock);
1705         }
1706
1707         /*
1708          * we're relying on the fact that genfs_getpages() always read in
1709          * entire filesystem blocks.
1710          */
1711         return genfs_getpages(v);
1712 }
1713
1714 /*
1715  * Wait for a page to become unbusy, possibly printing diagnostic messages
1716  * as well.
1717  *
1718  * Called with vp->v_interlock held; return with it held.
1719  */
1720 static void
1721 wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
1722 {
1723         if ((pg->flags & PG_BUSY) == 0)
1724                 return;         /* Nothing to wait for! */
1725
1726 #if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
1727         static struct vm_page *lastpg;
1728
1729         if (label != NULL && pg != lastpg) {
1730                 if (pg->owner_tag) {
1731                         printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
1732                                curproc->p_pid, curlwp->l_lid, label,
1733                                pg, pg->owner, pg->lowner, pg->owner_tag);
1734                 } else {
1735                         printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
1736                                curproc->p_pid, curlwp->l_lid, label, pg);
1737                 }
1738         }
1739         lastpg = pg;
1740 #endif
1741
1742         pg->flags |= PG_WANTED;
1743         UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0, "lfsput", 0);
1744         mutex_enter(&vp->v_interlock);
1745 }
1746
1747 /*
1748  * This routine is called by lfs_putpages() when it can't complete the
1749  * write because a page is busy.  This means that either (1) someone,
1750  * possibly the pagedaemon, is looking at this page, and will give it up
1751  * presently; or (2) we ourselves are holding the page busy in the
1752  * process of being written (either gathered or actually on its way to
1753  * disk).  We don't need to give up the segment lock, but we might need
1754  * to call lfs_writeseg() to expedite the page's journey to disk.
1755  *
1756  * Called with vp->v_interlock held; return with it held.
1757  */
1758 /* #define BUSYWAIT */
1759 static void
1760 write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
1761                int seglocked, const char *label)
1762 {
1763 #ifndef BUSYWAIT
1764         struct inode *ip = VTOI(vp);
1765         struct segment *sp = fs->lfs_sp;
1766         int count = 0;
1767
1768         if (pg == NULL)
1769                 return;
1770
1771         while (pg->flags & PG_BUSY &&
1772             pg->uobject == &vp->v_uobj) {
1773                 mutex_exit(&vp->v_interlock);
1774                 if (sp->cbpp - sp->bpp > 1) {
1775                         /* Write gathered pages */
1776                         lfs_updatemeta(sp);
1777                         lfs_release_finfo(fs);
1778                         (void) lfs_writeseg(fs, sp);
1779
1780                         /*
1781                          * Reinitialize FIP
1782                          */
1783                         KASSERT(sp->vp == vp);
1784                         lfs_acquire_finfo(fs, ip->i_number,
1785                                           ip->i_gen);
1786                 }
1787                 ++count;
1788                 mutex_enter(&vp->v_interlock);
1789                 wait_for_page(vp, pg, label);
1790         }
1791         if (label != NULL && count > 1)
1792                 printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
1793                        label, (count > 0 ? "looping, " : ""), count);
1794 #else
1795         preempt(1);
1796 #endif
1797 }
1798
1799 /*
1800  * Make sure that for all pages in every block in the given range,
1801  * either all are dirty or all are clean.  If any of the pages
1802  * we've seen so far are dirty, put the vnode on the paging chain,
1803  * and mark it IN_PAGING.
1804  *
1805  * If checkfirst != 0, don't check all the pages but return at the
1806  * first dirty page.
1807  */
1808 static int
1809 check_dirty(struct lfs *fs, struct vnode *vp,
1810             off_t startoffset, off_t endoffset, off_t blkeof,
1811             int flags, int checkfirst, struct vm_page **pgp)
1812 {
1813         int by_list;
1814         struct vm_page *curpg = NULL; /* XXX: gcc */
1815         struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg;
1816         off_t soff = 0; /* XXX: gcc */
1817         voff_t off;
1818         int i;
1819         int nonexistent;
1820         int any_dirty;  /* number of dirty pages */
1821         int dirty;      /* number of dirty pages in a block */
1822         int tdirty;
1823         int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
1824         int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
1825
1826         ASSERT_MAYBE_SEGLOCK(fs);
1827   top:
1828         by_list = (vp->v_uobj.uo_npages <=
1829                    ((endoffset - startoffset) >> PAGE_SHIFT) *
1830                    UVM_PAGE_TREE_PENALTY);
1831         any_dirty = 0;
1832
1833         if (by_list) {
1834                 curpg = TAILQ_FIRST(&vp->v_uobj.memq);
1835         } else {
1836                 soff = startoffset;
1837         }
1838         while (by_list || soff < MIN(blkeof, endoffset)) {
1839                 if (by_list) {
1840                         /*
1841                          * Find the first page in a block.  Skip
1842                          * blocks outside our area of interest or beyond
1843                          * the end of file.
1844                          */
1845                         if (pages_per_block > 1) {
1846                                 while (curpg &&
1847                                        ((curpg->offset & fs->lfs_bmask) ||
1848                                         curpg->offset >= vp->v_size ||
1849                                         curpg->offset >= endoffset))
1850                                         curpg = TAILQ_NEXT(curpg, listq.queue);
1851                         }
1852                         if (curpg == NULL)
1853                                 break;
1854                         soff = curpg->offset;
1855                 }
1856
1857                 /*
1858                  * Mark all pages in extended range busy; find out if any
1859                  * of them are dirty.
1860                  */
1861                 nonexistent = dirty = 0;
1862                 for (i = 0; i == 0 || i < pages_per_block; i++) {
1863                         if (by_list && pages_per_block <= 1) {
1864                                 pgs[i] = pg = curpg;
1865                         } else {
1866                                 off = soff + (i << PAGE_SHIFT);
1867                                 pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
1868                                 if (pg == NULL) {
1869                                         ++nonexistent;
1870                                         continue;
1871                                 }
1872                         }
1873                         KASSERT(pg != NULL);
1874
1875                         /*
1876                          * If we're holding the segment lock, we can deadlock
1877                          * against a process that has our page and is waiting
1878                          * for the cleaner, while the cleaner waits for the
1879                          * segment lock.  Just bail in that case.
1880                          */
1881                         if ((pg->flags & PG_BUSY) &&
1882                             (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
1883                                 if (i > 0)
1884                                         uvm_page_unbusy(pgs, i);
1885                                 DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
1886                                 if (pgp)
1887                                         *pgp = pg;
1888                                 return -1;
1889                         }
1890
1891                         while (pg->flags & PG_BUSY) {
1892                                 wait_for_page(vp, pg, NULL);
1893                                 if (i > 0)
1894                                         uvm_page_unbusy(pgs, i);
1895                                 goto top;
1896                         }
1897                         pg->flags |= PG_BUSY;
1898                         UVM_PAGE_OWN(pg, "lfs_putpages");
1899
1900                         pmap_page_protect(pg, VM_PROT_NONE);
1901                         tdirty = (pmap_clear_modify(pg) ||
1902                                   (pg->flags & PG_CLEAN) == 0);
1903                         dirty += tdirty;
1904                 }
1905                 if (pages_per_block > 0 && nonexistent >= pages_per_block) {
1906                         if (by_list) {
1907                                 curpg = TAILQ_NEXT(curpg, listq.queue);
1908                         } else {
1909                                 soff += fs->lfs_bsize;
1910                         }
1911                         continue;
1912                 }
1913
1914                 any_dirty += dirty;
1915                 KASSERT(nonexistent == 0);
1916
1917                 /*
1918                  * If any are dirty make all dirty; unbusy them,
1919                  * but if we were asked to clean, wire them so that
1920                  * the pagedaemon doesn't bother us about them while
1921                  * they're on their way to disk.
1922                  */
1923                 for (i = 0; i == 0 || i < pages_per_block; i++) {
1924                         pg = pgs[i];
1925                         KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
1926                         if (dirty) {
1927                                 pg->flags &= ~PG_CLEAN;
1928                                 if (flags & PGO_FREE) {
1929                                         /*
1930                                          * Wire the page so that
1931                                          * pdaemon doesn't see it again.
1932                                          */
1933                                         mutex_enter(&uvm_pageqlock);
1934                                         uvm_pagewire(pg);
1935                                         mutex_exit(&uvm_pageqlock);
1936
1937                                         /* Suspended write flag */
1938                                         pg->flags |= PG_DELWRI;
1939                                 }
1940                         }
1941                         if (pg->flags & PG_WANTED)
1942                                 wakeup(pg);
1943                         pg->flags &= ~(PG_WANTED|PG_BUSY);
1944                         UVM_PAGE_OWN(pg, NULL);
1945                 }
1946
1947                 if (checkfirst && any_dirty)
1948                         break;
1949
1950                 if (by_list) {
1951                         curpg = TAILQ_NEXT(curpg, listq.queue);
1952                 } else {
1953                         soff += MAX(PAGE_SIZE, fs->lfs_bsize);
1954                 }
1955         }
1956
1957         return any_dirty;
1958 }
1959
1960 /*
1961  * lfs_putpages functions like genfs_putpages except that
1962  *
1963  * (1) It needs to bounds-check the incoming requests to ensure that
1964  *     they are block-aligned; if they are not, expand the range and
1965  *     do the right thing in case, e.g., the requested range is clean
1966  *     but the expanded range is dirty.
1967  *
1968  * (2) It needs to explicitly send blocks to be written when it is done.
1969  *     If VOP_PUTPAGES is called without the seglock held, we simply take
1970  *     the seglock and let lfs_segunlock wait for us.
1971  *     XXX There might be a bad situation if we have to flush a vnode while
1972  *     XXX lfs_markv is in operation.  As of this writing we panic in this
1973  *     XXX case.
1974  *
1975  * Assumptions:
1976  *
1977  * (1) The caller does not hold any pages in this vnode busy.  If it does,
1978  *     there is a danger that when we expand the page range and busy the
1979  *     pages we will deadlock.
1980  *
1981  * (2) We are called with vp->v_interlock held; we must return with it
1982  *     released.
1983  *
1984  * (3) We don't absolutely have to free pages right away, provided that
1985  *     the request does not have PGO_SYNCIO.  When the pagedaemon gives
1986  *     us a request with PGO_FREE, we take the pages out of the paging
1987  *     queue and wake up the writer, which will handle freeing them for us.
1988  *
1989  *     We ensure that for any filesystem block, all pages for that
1990  *     block are either resident or not, even if those pages are higher
1991  *     than EOF; that means that we will be getting requests to free
1992  *     "unused" pages above EOF all the time, and should ignore them.
1993  *
1994  * (4) If we are called with PGO_LOCKED, the finfo array we are to write
1995  *     into has been set up for us by lfs_writefile.  If not, we will
1996  *     have to handle allocating and/or freeing an finfo entry.
1997  *
1998  * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
1999  */
2000
2001 /* How many times to loop before we should start to worry */
2002 #define TOOMANY 4
2003
2004 int
2005 lfs_putpages(void *v)
2006 {
2007         int error;
2008         struct vop_putpages_args /* {
2009                 struct vnode *a_vp;
2010                 voff_t a_offlo;
2011                 voff_t a_offhi;
2012                 int a_flags;
2013         } */ *ap = v;
2014         struct vnode *vp;
2015         struct inode *ip;
2016         struct lfs *fs;
2017         struct segment *sp;
2018         off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
2019         off_t off, max_endoffset;
2020         bool seglocked, sync, pagedaemon;
2021         struct vm_page *pg, *busypg;
2022         UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
2023 #ifdef DEBUG
2024         int debug_n_again, debug_n_dirtyclean;
2025 #endif
2026
2027         vp = ap->a_vp;
2028         ip = VTOI(vp);
2029         fs = ip->i_lfs;
2030         sync = (ap->a_flags & PGO_SYNCIO) != 0;
2031         pagedaemon = (curlwp == uvm.pagedaemon_lwp);
2032
2033         /* Putpages does nothing for metadata. */
2034         if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
2035                 mutex_exit(&vp->v_interlock);
2036                 return 0;
2037         }
2038
2039         /*
2040          * If there are no pages, don't do anything.
2041          */
2042         if (vp->v_uobj.uo_npages == 0) {
2043                 if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
2044                     (vp->v_iflag & VI_ONWORKLST) &&
2045                     LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
2046                         vp->v_iflag &= ~VI_WRMAPDIRTY;
2047                         vn_syncer_remove_from_worklist(vp);
2048                 }
2049                 mutex_exit(&vp->v_interlock);
2050
2051                 /* Remove us from paging queue, if we were on it */
2052                 mutex_enter(&lfs_lock);
2053                 if (ip->i_flags & IN_PAGING) {
2054                         ip->i_flags &= ~IN_PAGING;
2055                         TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2056                 }
2057                 mutex_exit(&lfs_lock);
2058                 return 0;
2059         }
2060
2061         blkeof = blkroundup(fs, ip->i_size);
2062
2063         /*
2064          * Ignore requests to free pages past EOF but in the same block
2065          * as EOF, unless the request is synchronous.  (If the request is
2066          * sync, it comes from lfs_truncate.)
2067          * XXXUBC Make these pages look "active" so the pagedaemon won't
2068          * XXXUBC bother us with them again.
2069          */
2070         if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
2071                 origoffset = ap->a_offlo;
2072                 for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
2073                         pg = uvm_pagelookup(&vp->v_uobj, off);
2074                         KASSERT(pg != NULL);
2075                         while (pg->flags & PG_BUSY) {
2076                                 pg->flags |= PG_WANTED;
2077                                 UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0,
2078                                                     "lfsput2", 0);
2079                                 mutex_enter(&vp->v_interlock);
2080                         }
2081                         mutex_enter(&uvm_pageqlock);
2082                         uvm_pageactivate(pg);
2083                         mutex_exit(&uvm_pageqlock);
2084                 }
2085                 ap->a_offlo = blkeof;
2086                 if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
2087                         mutex_exit(&vp->v_interlock);
2088                         return 0;
2089                 }
2090         }
2091
2092         /*
2093          * Extend page range to start and end at block boundaries.
2094          * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
2095          */
2096         origoffset = ap->a_offlo;
2097         origendoffset = ap->a_offhi;
2098         startoffset = origoffset & ~(fs->lfs_bmask);
2099         max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)
2100                                                << fs->lfs_bshift;
2101
2102         if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
2103                 endoffset = max_endoffset;
2104                 origendoffset = endoffset;
2105         } else {
2106                 origendoffset = round_page(ap->a_offhi);
2107                 endoffset = round_page(blkroundup(fs, origendoffset));
2108         }
2109
2110         KASSERT(startoffset > 0 || endoffset >= startoffset);
2111         if (startoffset == endoffset) {
2112                 /* Nothing to do, why were we called? */
2113                 mutex_exit(&vp->v_interlock);
2114                 DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
2115                       PRId64 "\n", startoffset));
2116                 return 0;
2117         }
2118
2119         ap->a_offlo = startoffset;
2120         ap->a_offhi = endoffset;
2121
2122         /*
2123          * If not cleaning, just send the pages through genfs_putpages
2124          * to be returned to the pool.
2125          */
2126         if (!(ap->a_flags & PGO_CLEANIT))
2127                 return genfs_putpages(v);
2128
2129         /* Set PGO_BUSYFAIL to avoid deadlocks */
2130         ap->a_flags |= PGO_BUSYFAIL;
2131
2132         /*
2133          * Likewise, if we are asked to clean but the pages are not
2134          * dirty, we can just free them using genfs_putpages.
2135          */
2136 #ifdef DEBUG
2137         debug_n_dirtyclean = 0;
2138 #endif
2139         do {
2140                 int r;
2141
2142                 /* Count the number of dirty pages */
2143                 r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
2144                                 ap->a_flags, 1, NULL);
2145                 if (r < 0) {
2146                         /* Pages are busy with another process */
2147                         mutex_exit(&vp->v_interlock);
2148                         return EDEADLK;
2149                 }
2150                 if (r > 0) /* Some pages are dirty */
2151                         break;
2152
2153                 /*
2154                  * Sometimes pages are dirtied between the time that
2155                  * we check and the time we try to clean them.
2156                  * Instruct lfs_gop_write to return EDEADLK in this case
2157                  * so we can write them properly.
2158                  */
2159                 ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
2160                 r = genfs_do_putpages(vp, startoffset, endoffset,
2161                                        ap->a_flags & ~PGO_SYNCIO, &busypg);
2162                 ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
2163                 if (r != EDEADLK)
2164                         return r;
2165
2166                 /* One of the pages was busy.  Start over. */
2167                 mutex_enter(&vp->v_interlock);
2168                 wait_for_page(vp, busypg, "dirtyclean");
2169 #ifdef DEBUG
2170                 ++debug_n_dirtyclean;
2171 #endif
2172         } while(1);
2173
2174 #ifdef DEBUG
2175         if (debug_n_dirtyclean > TOOMANY)
2176                 printf("lfs_putpages: dirtyclean: looping, n = %d\n",
2177                        debug_n_dirtyclean);
2178 #endif
2179
2180         /*
2181          * Dirty and asked to clean.
2182          *
2183          * Pagedaemon can't actually write LFS pages; wake up
2184          * the writer to take care of that.  The writer will
2185          * notice the pager inode queue and act on that.
2186          */
2187         if (pagedaemon) {
2188                 mutex_enter(&lfs_lock);
2189                 if (!(ip->i_flags & IN_PAGING)) {
2190                         ip->i_flags |= IN_PAGING;
2191                         TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2192                 }
2193                 wakeup(&lfs_writer_daemon);
2194                 mutex_exit(&lfs_lock);
2195                 mutex_exit(&vp->v_interlock);
2196                 preempt();
2197                 return EWOULDBLOCK;
2198         }
2199
2200         /*
2201          * If this is a file created in a recent dirop, we can't flush its
2202          * inode until the dirop is complete.  Drain dirops, then flush the
2203          * filesystem (taking care of any other pending dirops while we're
2204          * at it).
2205          */
2206         if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
2207             (vp->v_uflag & VU_DIROP)) {
2208                 int locked;
2209
2210                 DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
2211                 locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2212                 mutex_exit(&vp->v_interlock);
2213                 lfs_writer_enter(fs, "ppdirop");
2214                 if (locked)
2215                         VOP_UNLOCK(vp, 0); /* XXX why? */
2216
2217                 mutex_enter(&lfs_lock);
2218                 lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
2219                 mutex_exit(&lfs_lock);
2220
2221                 mutex_enter(&vp->v_interlock);
2222                 if (locked) {
2223                         VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2224                         mutex_enter(&vp->v_interlock);
2225                 }
2226                 lfs_writer_leave(fs);
2227
2228                 /* XXX the flush should have taken care of this one too! */
2229         }
2230
2231         /*
2232          * This is it.  We are going to write some pages.  From here on
2233          * down it's all just mechanics.
2234          *
2235          * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
2236          */
2237         ap->a_flags &= ~PGO_SYNCIO;
2238
2239         /*
2240          * If we've already got the seglock, flush the node and return.
2241          * The FIP has already been set up for us by lfs_writefile,
2242          * and FIP cleanup and lfs_updatemeta will also be done there,
2243          * unless genfs_putpages returns EDEADLK; then we must flush
2244          * what we have, and correct FIP and segment header accounting.
2245          */
2246   get_seglock:
2247         /*
2248          * If we are not called with the segment locked, lock it.
2249          * Account for a new FIP in the segment header, and set sp->vp.
2250          * (This should duplicate the setup at the top of lfs_writefile().)
2251          */
2252         seglocked = (ap->a_flags & PGO_LOCKED) != 0;
2253         if (!seglocked) {
2254                 mutex_exit(&vp->v_interlock);
2255                 error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
2256                 if (error != 0)
2257                         return error;
2258                 mutex_enter(&vp->v_interlock);
2259                 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
2260         }
2261         sp = fs->lfs_sp;
2262         KASSERT(sp->vp == NULL);
2263         sp->vp = vp;
2264
2265         /*
2266          * Ensure that the partial segment is marked SS_DIROP if this
2267          * vnode is a DIROP.
2268          */
2269         if (!seglocked && vp->v_uflag & VU_DIROP)
2270                 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
2271
2272         /*
2273          * Loop over genfs_putpages until all pages are gathered.
2274          * genfs_putpages() drops the interlock, so reacquire it if necessary.
2275          * Whenever we lose the interlock we have to rerun check_dirty, as
2276          * well, since more pages might have been dirtied in our absence.
2277          */
2278 #ifdef DEBUG
2279         debug_n_again = 0;
2280 #endif
2281         do {
2282                 busypg = NULL;
2283                 if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
2284                                 ap->a_flags, 0, &busypg) < 0) {
2285                         mutex_exit(&vp->v_interlock);
2286
2287                         mutex_enter(&vp->v_interlock);
2288                         write_and_wait(fs, vp, busypg, seglocked, NULL);
2289                         if (!seglocked) {
2290                                 mutex_exit(&vp->v_interlock);
2291                                 lfs_release_finfo(fs);
2292                                 lfs_segunlock(fs);
2293                                 mutex_enter(&vp->v_interlock);
2294                         }
2295                         sp->vp = NULL;
2296                         goto get_seglock;
2297                 }
2298
2299                 busypg = NULL;
2300                 error = genfs_do_putpages(vp, startoffset, endoffset,
2301                                            ap->a_flags, &busypg);
2302
2303                 if (error == EDEADLK || error == EAGAIN) {
2304                         DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
2305                               " %d ino %d off %x (seg %d)\n", error,
2306                               ip->i_number, fs->lfs_offset,
2307                               dtosn(fs, fs->lfs_offset)));
2308
2309                         mutex_enter(&vp->v_interlock);
2310                         write_and_wait(fs, vp, busypg, seglocked, "again");
2311                 }
2312 #ifdef DEBUG
2313                 ++debug_n_again;
2314 #endif
2315         } while (error == EDEADLK);
2316 #ifdef DEBUG
2317         if (debug_n_again > TOOMANY)
2318                 printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
2319 #endif
2320
2321         KASSERT(sp != NULL && sp->vp == vp);
2322         if (!seglocked) {
2323                 sp->vp = NULL;
2324
2325                 /* Write indirect blocks as well */
2326                 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
2327                 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
2328                 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);
2329
2330                 KASSERT(sp->vp == NULL);
2331                 sp->vp = vp;
2332         }
2333
2334         /*
2335          * Blocks are now gathered into a segment waiting to be written.
2336          * All that's left to do is update metadata, and write them.
2337          */
2338         lfs_updatemeta(sp);
2339         KASSERT(sp->vp == vp);
2340         sp->vp = NULL;
2341
2342         /*
2343          * If we were called from lfs_writefile, we don't need to clean up
2344          * the FIP or unlock the segment lock.  We're done.
2345          */
2346         if (seglocked)
2347                 return error;
2348
2349         /* Clean up FIP and send it to disk. */
2350         lfs_release_finfo(fs);
2351         lfs_writeseg(fs, fs->lfs_sp);
2352
2353         /*
2354          * Remove us from paging queue if we wrote all our pages.
2355          */
2356         if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
2357                 mutex_enter(&lfs_lock);
2358                 if (ip->i_flags & IN_PAGING) {
2359                         ip->i_flags &= ~IN_PAGING;
2360                         TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2361                 }
2362                 mutex_exit(&lfs_lock);
2363         }
2364
2365         /*
2366          * XXX - with the malloc/copy writeseg, the pages are freed by now
2367          * even if we don't wait (e.g. if we hold a nested lock).  This
2368          * will not be true if we stop using malloc/copy.
2369          */
2370         KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
2371         lfs_segunlock(fs);
2372
2373         /*
2374          * Wait for v_numoutput to drop to zero.  The seglock should
2375          * take care of this, but there is a slight possibility that
2376          * aiodoned might not have got around to our buffers yet.
2377          */
2378         if (sync) {
2379                 mutex_enter(&vp->v_interlock);
2380                 while (vp->v_numoutput > 0) {
2381                         DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
2382                               " num %d\n", ip->i_number, vp->v_numoutput));
2383                         cv_wait(&vp->v_cv, &vp->v_interlock);
2384                 }
2385                 mutex_exit(&vp->v_interlock);
2386         }
2387         return error;
2388 }
2389
2390 /*
2391  * Report through *eobp the offset up to which this file should be written
2392  * when a write ends at "size".  Fragment rounding is used only when
2393  * GOP_SIZE_MEM is clear, the block holding "size" is below NDADDR and the
2394  * block holding the current i_size is not beyond it; else block rounding.
2395  */
2396 void
2397 lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
2398 {
2399         struct inode *node = VTOI(vp);
2400         struct lfs *lfsp = node->i_lfs;
2401         const daddr_t cur_lbn = lblkno(lfsp, node->i_size);
2402         const daddr_t new_lbn = lblkno(lfsp, size);
2403
2404         if ((flags & GOP_SIZE_MEM) != 0 || new_lbn >= NDADDR ||
2405             cur_lbn > new_lbn) {
2406                 *eobp = blkroundup(lfsp, size);
2407                 return;
2408         }
2409         *eobp = fragroundup(lfsp, size);
2410 }
2411
2412 #ifdef DEBUG
2413 void lfs_dump_vop(void *);
2414
2415 /*
2416  * Debug-only helper: print the vnode (DDB kernels only) and dump its dinode.
2417  */
2418 void
2419 lfs_dump_vop(void *v)
2420 {
2421         /* v carries { a_vp, a_offlo, a_offhi, a_flags }; only a_vp is read. */
2422         struct vop_putpages_args *args = v;
2423         struct vnode *vp = args->a_vp;
2424
2425 #ifdef DDB
2426         vfs_vnode_print(vp, 0, printf);
2427 #endif
2428         lfs_dump_dinode(VTOI(vp)->i_din.ffs1_din);
2429 }
2430 #endif
2431
2432 int
2433 lfs_mmap(void *v)
2434 {
2435         /* v carries { a_desc, a_vp, a_prot, a_cred }; only a_vp is read. */
2436         struct vop_mmap_args *args = v;
2437         struct inode *ip = VTOI(args->a_vp);
2438
2439         /*
2440          * Inode LFS_IFILE_INUM may not be mapped; all others use ufs_mmap().
2441          */
2442         if (ip->i_number != LFS_IFILE_INUM)
2443                 return ufs_mmap(v);
2444         return EOPNOTSUPP;
2445 }