sys/ufs/lfs/lfs_vnops.c

   1 /*      $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $      */
   2
   3 /*-
   4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Konrad E. Schroder <perseant@hhhh.org>.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31 /*
  32  * Copyright (c) 1986, 1989, 1991, 1993, 1995
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 3. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95
  60  */
  61
  62 #include <sys/cdefs.h>
  63 __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
  64
  65 #ifdef _KERNEL_OPT
  66 #include "opt_compat_netbsd.h"
  67 #include "opt_uvm_page_trkown.h"
  68 #endif
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/namei.h>
  73 #include <sys/resourcevar.h>
  74 #include <sys/kernel.h>
  75 #include <sys/file.h>
  76 #include <sys/stat.h>
  77 #include <sys/buf.h>
  78 #include <sys/proc.h>
  79 #include <sys/mount.h>
  80 #include <sys/vnode.h>
  81 #include <sys/pool.h>
  82 #include <sys/signalvar.h>
  83 #include <sys/kauth.h>
  84 #include <sys/syslog.h>
  85 #include <sys/fstrans.h>
  86
  87 #include <miscfs/fifofs/fifo.h>
  88 #include <miscfs/genfs/genfs.h>
  89 #include <miscfs/specfs/specdev.h>
  90
  91 #include <ufs/ufs/inode.h>
  92 #include <ufs/ufs/dir.h>
  93 #include <ufs/ufs/ufsmount.h>
  94 #include <ufs/ufs/ufs_extern.h>
  95
  96 #include <uvm/uvm.h>
  97 #include <uvm/uvm_pmap.h>
  98 #include <uvm/uvm_stat.h>
  99 #include <uvm/uvm_pager.h>
 100
 101 #include <ufs/lfs/lfs.h>
 102 #include <ufs/lfs/lfs_extern.h>
 103
     /*
      * Defined elsewhere.  In this file only its address is used, as the
      * wakeup() channel that prods the LFS writer.
      */
 104 extern pid_t lfs_writer_daemon;
     /*
      * When nonzero, lfs_fsync() returns success for FSYNC_LAZY requests
      * without queueing the inode on the per-filesystem paging chain.
      */
 105 int lfs_ignore_lazy_sync = 1;
 106
 107 /* Global vfs data structures for lfs. */
     /*
      * Primary LFS vnode operation vector.  lfs_vnodeop_p is the vector that
      * lfs_set_dirop_create() hands to getnewvnode() for newly created vnodes.
      * The table pairs each operation descriptor with its handler: LFS-specific
      * handlers (lfs_*) where this filesystem needs its own behaviour, UFS or
      * genfs handlers otherwise.  The first entry names the default handler
      * (vn_default_error) and the list is terminated by { NULL, NULL }.
      */
 108 int (**lfs_vnodeop_p)(void *);
 109 const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
 110         { &vop_default_desc, vn_default_error },
 111         { &vop_lookup_desc, ufs_lookup },               /* lookup */
 112         { &vop_create_desc, lfs_create },               /* create */
 113         { &vop_whiteout_desc, ufs_whiteout },           /* whiteout */
 114         { &vop_mknod_desc, lfs_mknod },                 /* mknod */
 115         { &vop_open_desc, ufs_open },                   /* open */
 116         { &vop_close_desc, lfs_close },                 /* close */
 117         { &vop_access_desc, ufs_access },               /* access */
 118         { &vop_getattr_desc, lfs_getattr },             /* getattr */
 119         { &vop_setattr_desc, lfs_setattr },             /* setattr */
 120         { &vop_read_desc, lfs_read },                   /* read */
 121         { &vop_write_desc, lfs_write },                 /* write */
 122         { &vop_ioctl_desc, ufs_ioctl },                 /* ioctl */
 123         { &vop_fcntl_desc, lfs_fcntl },                 /* fcntl */
 124         { &vop_poll_desc, ufs_poll },                   /* poll */
 125         { &vop_kqfilter_desc, genfs_kqfilter },         /* kqfilter */
 126         { &vop_revoke_desc, ufs_revoke },               /* revoke */
 127         { &vop_mmap_desc, lfs_mmap },                   /* mmap */
 128         { &vop_fsync_desc, lfs_fsync },                 /* fsync */
 129         { &vop_seek_desc, ufs_seek },                   /* seek */
 130         { &vop_remove_desc, lfs_remove },               /* remove */
 131         { &vop_link_desc, lfs_link },                   /* link */
 132         { &vop_rename_desc, lfs_rename },               /* rename */
 133         { &vop_mkdir_desc, lfs_mkdir },                 /* mkdir */
 134         { &vop_rmdir_desc, lfs_rmdir },                 /* rmdir */
 135         { &vop_symlink_desc, lfs_symlink },             /* symlink */
 136         { &vop_readdir_desc, ufs_readdir },             /* readdir */
 137         { &vop_readlink_desc, ufs_readlink },           /* readlink */
 138         { &vop_abortop_desc, ufs_abortop },             /* abortop */
 139         { &vop_inactive_desc, lfs_inactive },           /* inactive */
 140         { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
 141         { &vop_lock_desc, ufs_lock },                   /* lock */
 142         { &vop_unlock_desc, ufs_unlock },               /* unlock */
 143         { &vop_bmap_desc, ufs_bmap },                   /* bmap */
 144         { &vop_strategy_desc, lfs_strategy },           /* strategy */
 145         { &vop_print_desc, ufs_print },                 /* print */
 146         { &vop_islocked_desc, ufs_islocked },           /* islocked */
 147         { &vop_pathconf_desc, ufs_pathconf },           /* pathconf */
 148         { &vop_advlock_desc, ufs_advlock },             /* advlock */
 149         { &vop_bwrite_desc, lfs_bwrite },               /* bwrite */
 150         { &vop_getpages_desc, lfs_getpages },           /* getpages */
 151         { &vop_putpages_desc, lfs_putpages },           /* putpages */
 152         { NULL, NULL }
 153 };
     /* Ties the vector pointer above to its table of entries. */
 154 const struct vnodeopv_desc lfs_vnodeop_opv_desc =
 155         { &lfs_vnodeop_p, lfs_vnodeop_entries };
 156
     /*
      * Vnode operation vector built mostly from spec_* handlers (presumably
      * used for device special files living on an LFS -- confirm against the
      * code that selects the vector).  LFS/UFS handlers are kept for close,
      * access, attributes, read/write, fcntl, inactive/reclaim, locking and
      * print/islocked.  Note that bwrite is vn_bwrite here, whereas the other
      * two tables in this file use lfs_bwrite.
      */
 157 int (**lfs_specop_p)(void *);
 158 const struct vnodeopv_entry_desc lfs_specop_entries[] = {
 159         { &vop_default_desc, vn_default_error },
 160         { &vop_lookup_desc, spec_lookup },              /* lookup */
 161         { &vop_create_desc, spec_create },              /* create */
 162         { &vop_mknod_desc, spec_mknod },                /* mknod */
 163         { &vop_open_desc, spec_open },                  /* open */
 164         { &vop_close_desc, lfsspec_close },             /* close */
 165         { &vop_access_desc, ufs_access },               /* access */
 166         { &vop_getattr_desc, lfs_getattr },             /* getattr */
 167         { &vop_setattr_desc, lfs_setattr },             /* setattr */
 168         { &vop_read_desc, ufsspec_read },               /* read */
 169         { &vop_write_desc, ufsspec_write },             /* write */
 170         { &vop_ioctl_desc, spec_ioctl },                /* ioctl */
 171         { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
 172         { &vop_poll_desc, spec_poll },                  /* poll */
 173         { &vop_kqfilter_desc, spec_kqfilter },          /* kqfilter */
 174         { &vop_revoke_desc, spec_revoke },              /* revoke */
 175         { &vop_mmap_desc, spec_mmap },                  /* mmap */
 176         { &vop_fsync_desc, spec_fsync },                /* fsync */
 177         { &vop_seek_desc, spec_seek },                  /* seek */
 178         { &vop_remove_desc, spec_remove },              /* remove */
 179         { &vop_link_desc, spec_link },                  /* link */
 180         { &vop_rename_desc, spec_rename },              /* rename */
 181         { &vop_mkdir_desc, spec_mkdir },                /* mkdir */
 182         { &vop_rmdir_desc, spec_rmdir },                /* rmdir */
 183         { &vop_symlink_desc, spec_symlink },            /* symlink */
 184         { &vop_readdir_desc, spec_readdir },            /* readdir */
 185         { &vop_readlink_desc, spec_readlink },          /* readlink */
 186         { &vop_abortop_desc, spec_abortop },            /* abortop */
 187         { &vop_inactive_desc, lfs_inactive },           /* inactive */
 188         { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
 189         { &vop_lock_desc, ufs_lock },                   /* lock */
 190         { &vop_unlock_desc, ufs_unlock },               /* unlock */
 191         { &vop_bmap_desc, spec_bmap },                  /* bmap */
 192         { &vop_strategy_desc, spec_strategy },          /* strategy */
 193         { &vop_print_desc, ufs_print },                 /* print */
 194         { &vop_islocked_desc, ufs_islocked },           /* islocked */
 195         { &vop_pathconf_desc, spec_pathconf },          /* pathconf */
 196         { &vop_advlock_desc, spec_advlock },            /* advlock */
 197         { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
 198         { &vop_getpages_desc, spec_getpages },          /* getpages */
 199         { &vop_putpages_desc, spec_putpages },          /* putpages */
 200         { NULL, NULL }
 201 };
     /* Ties the vector pointer above to its table of entries. */
 202 const struct vnodeopv_desc lfs_specop_opv_desc =
 203         { &lfs_specop_p, lfs_specop_entries };
 204
     /*
      * Vnode operation vector in which most operations are routed through
      * vn_fifo_bypass (presumably used for FIFOs living on an LFS -- confirm
      * against the code that selects the vector).  LFS/UFS handlers are kept
      * for close, access, attributes, read/write, fcntl, inactive/reclaim,
      * locking, print/islocked and bwrite.  Unlike the other two tables in
      * this file there is no getpages entry, so that operation is left to the
      * default entry.
      */
 205 int (**lfs_fifoop_p)(void *);
 206 const struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
 207         { &vop_default_desc, vn_default_error },
 208         { &vop_lookup_desc, vn_fifo_bypass },           /* lookup */
 209         { &vop_create_desc, vn_fifo_bypass },           /* create */
 210         { &vop_mknod_desc, vn_fifo_bypass },            /* mknod */
 211         { &vop_open_desc, vn_fifo_bypass },             /* open */
 212         { &vop_close_desc, lfsfifo_close },             /* close */
 213         { &vop_access_desc, ufs_access },               /* access */
 214         { &vop_getattr_desc, lfs_getattr },             /* getattr */
 215         { &vop_setattr_desc, lfs_setattr },             /* setattr */
 216         { &vop_read_desc, ufsfifo_read },               /* read */
 217         { &vop_write_desc, ufsfifo_write },             /* write */
 218         { &vop_ioctl_desc, vn_fifo_bypass },            /* ioctl */
 219         { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
 220         { &vop_poll_desc, vn_fifo_bypass },             /* poll */
 221         { &vop_kqfilter_desc, vn_fifo_bypass },         /* kqfilter */
 222         { &vop_revoke_desc, vn_fifo_bypass },           /* revoke */
 223         { &vop_mmap_desc, vn_fifo_bypass },             /* mmap */
 224         { &vop_fsync_desc, vn_fifo_bypass },            /* fsync */
 225         { &vop_seek_desc, vn_fifo_bypass },             /* seek */
 226         { &vop_remove_desc, vn_fifo_bypass },           /* remove */
 227         { &vop_link_desc, vn_fifo_bypass },             /* link */
 228         { &vop_rename_desc, vn_fifo_bypass },           /* rename */
 229         { &vop_mkdir_desc, vn_fifo_bypass },            /* mkdir */
 230         { &vop_rmdir_desc, vn_fifo_bypass },            /* rmdir */
 231         { &vop_symlink_desc, vn_fifo_bypass },          /* symlink */
 232         { &vop_readdir_desc, vn_fifo_bypass },          /* readdir */
 233         { &vop_readlink_desc, vn_fifo_bypass },         /* readlink */
 234         { &vop_abortop_desc, vn_fifo_bypass },          /* abortop */
 235         { &vop_inactive_desc, lfs_inactive },           /* inactive */
 236         { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
 237         { &vop_lock_desc, ufs_lock },                   /* lock */
 238         { &vop_unlock_desc, ufs_unlock },               /* unlock */
 239         { &vop_bmap_desc, vn_fifo_bypass },             /* bmap */
 240         { &vop_strategy_desc, vn_fifo_bypass },         /* strategy */
 241         { &vop_print_desc, ufs_print },                 /* print */
 242         { &vop_islocked_desc, ufs_islocked },           /* islocked */
 243         { &vop_pathconf_desc, vn_fifo_bypass },         /* pathconf */
 244         { &vop_advlock_desc, vn_fifo_bypass },          /* advlock */
 245         { &vop_bwrite_desc, lfs_bwrite },               /* bwrite */
 246         { &vop_putpages_desc, vn_fifo_bypass },         /* putpages */
 247         { NULL, NULL }
 248 };
     /* Ties the vector pointer above to its table of entries. */
 249 const struct vnodeopv_desc lfs_fifoop_opv_desc =
 250         { &lfs_fifoop_p, lfs_fifoop_entries };
 251
     /* Forward declaration; the definition is not in this part of the file. */
 252 static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int, struct vm_page **);
 253
     /*
      * Textually include the shared UFS read/write source with LFS_READWRITE
      * defined for the duration of the include.  Presumably this is what
      * provides the lfs_read/lfs_write handlers named in lfs_vnodeop_entries
      * -- confirm against ufs_readwrite.c.
      */
 254 #define LFS_READWRITE
 255 #include <ufs/ufs/ufs_readwrite.c>
 256 #undef  LFS_READWRITE
 257
 258 /*
 259  * Synch an open file.
 260  */
 261 /* ARGSUSED */
 262 int
 263 lfs_fsync(void *v)
 264 {
 265         struct vop_fsync_args /* {
 266                 struct vnode *a_vp;
 267                 kauth_cred_t a_cred;
 268                 int a_flags;
 269                 off_t offlo;
 270                 off_t offhi;
 271         } */ *ap = v;
 272         struct vnode *vp = ap->a_vp;
 273         int error, wait;
 274         struct inode *ip = VTOI(vp);
 275         struct lfs *fs = ip->i_lfs;
 276
 277         /* If we're mounted read-only, don't try to sync. */
 278         if (fs->lfs_ronly)
 279                 return 0;
 280
 281         /* If a removed vnode is being cleaned, no need to sync here. */
 282         if ((ap->a_flags & FSYNC_RECLAIM) != 0 && ip->i_mode == 0)
 283                 return 0;
 284
 285         /*
 286          * Trickle sync simply adds this vnode to the pager list, as if
 287          * the pagedaemon had requested a pageout.
 288          */
 289         if (ap->a_flags & FSYNC_LAZY) {
 290                 if (lfs_ignore_lazy_sync == 0) {
 291                         mutex_enter(&lfs_lock);
 292                         if (!(ip->i_flags & IN_PAGING)) {
 293                                 ip->i_flags |= IN_PAGING;
 294                                 TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip,
 295                                                   i_lfs_pchain);
 296                         }
 297                         wakeup(&lfs_writer_daemon);
 298                         mutex_exit(&lfs_lock);
 299                 }
 300                 return 0;
 301         }
 302
 303         /*
 304          * If a vnode is bring cleaned, flush it out before we try to
 305          * reuse it.  This prevents the cleaner from writing files twice
 306          * in the same partial segment, causing an accounting underflow.
 307          */
 308         if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) {
 309                 lfs_vflush(vp);
 310         }
 311
 312         wait = (ap->a_flags & FSYNC_WAIT);
 313         do {
 314                 mutex_enter(vp->v_interlock);
 315                 error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
 316                                      round_page(ap->a_offhi),
 317                                      PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
 318                 if (error == EAGAIN) {
 319                         mutex_enter(&lfs_lock);
 320                         mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync",
 321                                 hz / 100 + 1, &lfs_lock);
 322                         mutex_exit(&lfs_lock);
 323                 }
 324         } while (error == EAGAIN);
 325         if (error)
 326                 return error;
 327
 328         if ((ap->a_flags & FSYNC_DATAONLY) == 0)
 329                 error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
 330
 331         if (error == 0 && ap->a_flags & FSYNC_CACHE) {
 332                 int l = 0;
 333                 error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE,
 334                                   curlwp->l_cred);
 335         }
 336         if (wait && !VPISEMPTY(vp))
 337                 LFS_SET_UINO(ip, IN_MODIFIED);
 338
 339         return error;
 340 }
 341
 342 /*
 343  * Take IN_ADIROP off, then call ufs_inactive.
 344  */
 345 int
 346 lfs_inactive(void *v)
 347 {
 348         struct vop_inactive_args /* {
 349                 struct vnode *a_vp;
 350         } */ *ap = v;
 351
 352         lfs_unmark_vnode(ap->a_vp);
 353
 354         /*
 355          * The Ifile is only ever inactivated on unmount.
 356          * Streamline this process by not giving it more dirty blocks.
 357          */
 358         if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) {
 359                 mutex_enter(&lfs_lock);
 360                 LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD);
 361                 mutex_exit(&lfs_lock);
 362                 VOP_UNLOCK(ap->a_vp);
 363                 return 0;
 364         }
 365
 366         return ufs_inactive(v);
 367 }
 368
 369 /*
 370  * These macros are used to bracket UFS directory ops, so that we can
 371  * identify all the pages touched during directory ops which need to
 372  * be ordered and flushed atomically, so that they may be recovered.
 373  *
 374  * Because we have to mark nodes VU_DIROP in order to prevent
 375  * the cache from reclaiming them while a dirop is in progress, we must
 376  * also manage the number of nodes so marked (otherwise we can run out).
 377  * We do this by setting lfs_dirvcount to the number of marked vnodes; it
 378  * is decremented during segment write, when VU_DIROP is taken off.
 379  */
     /* Flag / unflag a vnode as taking part in an active directory op. */
 380 #define MARK_VNODE(vp)                  lfs_mark_vnode(vp)
 381 #define UNMARK_VNODE(vp)                lfs_unmark_vnode(vp)
     /*
      * Begin a directory op.  Both evaluate to an errno-style int (0 on
      * success).  The CREATE form may also allocate a new vnode into *vpp;
      * the REMOVE form takes the existing victim vnode (or NULL).
      */
 382 #define SET_DIROP_CREATE(dvp, vpp)      lfs_set_dirop_create((dvp), (vpp))
 383 #define SET_DIROP_REMOVE(dvp, vp)       lfs_set_dirop((dvp), (vp))
 384 static int lfs_set_dirop_create(struct vnode *, struct vnode **);
 385 static int lfs_set_dirop(struct vnode *, struct vnode *);
 386
 387 static int
 388 lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
 389 {
 390         struct lfs *fs;
 391         int error;
 392
 393         KASSERT(VOP_ISLOCKED(dvp));
 394         KASSERT(vp == NULL || VOP_ISLOCKED(vp));
 395
 396         fs = VTOI(dvp)->i_lfs;
 397
 398         ASSERT_NO_SEGLOCK(fs);
 399         /*
 400          * LFS_NRESERVE calculates direct and indirect blocks as well
 401          * as an inode block; an overestimate in most cases.
 402          */
 403         if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0)
 404                 return (error);
 405
 406     restart:
 407         mutex_enter(&lfs_lock);
 408         if (fs->lfs_dirops == 0) {
 409                 mutex_exit(&lfs_lock);
 410                 lfs_check(dvp, LFS_UNUSED_LBN, 0);
 411                 mutex_enter(&lfs_lock);
 412         }
 413         while (fs->lfs_writer) {
 414                 error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH,
 415                     "lfs_sdirop", 0, &lfs_lock);
 416                 if (error == EINTR) {
 417                         mutex_exit(&lfs_lock);
 418                         goto unreserve;
 419                 }
 420         }
 421         if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
 422                 wakeup(&lfs_writer_daemon);
 423                 mutex_exit(&lfs_lock);
 424                 preempt();
 425                 goto restart;
 426         }
 427
 428         if (lfs_dirvcount > LFS_MAX_DIROP) {
 429                 mutex_exit(&lfs_lock);
 430                 DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, "
 431                       "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount));
 432                 if ((error = mtsleep(&lfs_dirvcount,
 433                     PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0,
 434                     &lfs_lock)) != 0) {
 435                         goto unreserve;
 436                 }
 437                 goto restart;
 438         }
 439
 440         ++fs->lfs_dirops;
 441         fs->lfs_doifile = 1;
 442         mutex_exit(&lfs_lock);
 443
 444         /* Hold a reference so SET_ENDOP will be happy */
 445         vref(dvp);
 446         if (vp) {
 447                 vref(vp);
 448                 MARK_VNODE(vp);
 449         }
 450
 451         MARK_VNODE(dvp);
 452         return 0;
 453
 454   unreserve:
 455         lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs));
 456         return error;
 457 }
 458
 459 /*
 460  * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
 461  * in getnewvnode(), if we have a stacked filesystem mounted on top
 462  * of us.
 463  *
 464  * NB: this means we have to clear the new vnodes on error.  Fortunately
 465  * SET_ENDOP is there to do that for us.
 466  */
 467 static int
 468 lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp)
 469 {
 470         int error;
 471         struct lfs *fs;
 472
 473         fs = VFSTOUFS(dvp->v_mount)->um_lfs;
 474         ASSERT_NO_SEGLOCK(fs);
 475         if (fs->lfs_ronly)
 476                 return EROFS;
 477         if (vpp == NULL) {
 478                 return lfs_set_dirop(dvp, NULL);
 479         }
 480         error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, NULL, vpp);
 481         if (error) {
 482                 DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n",
 483                       dvp, error));
 484                 return error;
 485         }
 486         if ((error = lfs_set_dirop(dvp, NULL)) != 0) {
 487                 ungetnewvnode(*vpp);
 488                 *vpp = NULL;
 489                 return error;
 490         }
 491         return 0;
 492 }
 493
 494 #define SET_ENDOP_BASE(fs, dvp, str)                                    \
 495         do {                                                            \
 496                 mutex_enter(&lfs_lock);                         \
 497                 --(fs)->lfs_dirops;                                     \
 498                 if (!(fs)->lfs_dirops) {                                \
 499                         if ((fs)->lfs_nadirop) {                        \
 500                                 panic("SET_ENDOP: %s: no dirops but "   \
 501                                         " nadirop=%d", (str),           \
 502                                         (fs)->lfs_nadirop);             \
 503                         }                                               \
 504                         wakeup(&(fs)->lfs_writer);                      \
 505                         mutex_exit(&lfs_lock);                          \
 506                         lfs_check((dvp), LFS_UNUSED_LBN, 0);            \
 507                 } else                                                  \
 508                         mutex_exit(&lfs_lock);                          \
 509         } while(0)
 510 #define SET_ENDOP_CREATE(fs, dvp, nvpp, str)                            \
 511         do {                                                            \
 512                 UNMARK_VNODE(dvp);                                      \
 513                 if (nvpp && *nvpp)                                      \
 514                         UNMARK_VNODE(*nvpp);                            \
 515                 /* Check for error return to stem vnode leakage */      \
 516                 if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP))    \
 517                         ungetnewvnode(*(nvpp));                         \
 518                 SET_ENDOP_BASE((fs), (dvp), (str));                     \
 519                 lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs));      \
 520                 vrele(dvp);                                             \
 521         } while(0)
 522 #define SET_ENDOP_CREATE_AP(ap, str)                                    \
 523         SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp,         \
 524                          (ap)->a_vpp, (str))
 525 #define SET_ENDOP_REMOVE(fs, dvp, ovp, str)                             \
 526         do {                                                            \
 527                 UNMARK_VNODE(dvp);                                      \
 528                 if (ovp)                                                \
 529                         UNMARK_VNODE(ovp);                              \
 530                 SET_ENDOP_BASE((fs), (dvp), (str));                     \
 531                 lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs));     \
 532                 vrele(dvp);                                             \
 533                 if (ovp)                                                \
 534                         vrele(ovp);                                     \
 535         } while(0)
 536
 537 void
 538 lfs_mark_vnode(struct vnode *vp)
 539 {
 540         struct inode *ip = VTOI(vp);
 541         struct lfs *fs = ip->i_lfs;
 542
 543         mutex_enter(&lfs_lock);
 544         if (!(ip->i_flag & IN_ADIROP)) {
 545                 if (!(vp->v_uflag & VU_DIROP)) {
 546                         mutex_enter(vp->v_interlock);
 547                         (void)lfs_vref(vp);
 548                         ++lfs_dirvcount;
 549                         ++fs->lfs_dirvcount;
 550                         TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
 551                         vp->v_uflag |= VU_DIROP;
 552                 }
 553                 ++fs->lfs_nadirop;
 554                 ip->i_flag |= IN_ADIROP;
 555         } else
 556                 KASSERT(vp->v_uflag & VU_DIROP);
 557         mutex_exit(&lfs_lock);
 558 }
 559
 560 void
 561 lfs_unmark_vnode(struct vnode *vp)
 562 {
 563         struct inode *ip = VTOI(vp);
 564
 565         if (ip && (ip->i_flag & IN_ADIROP)) {
 566                 KASSERT(vp->v_uflag & VU_DIROP);
 567                 mutex_enter(&lfs_lock);
 568                 --ip->i_lfs->lfs_nadirop;
 569                 mutex_exit(&lfs_lock);
 570                 ip->i_flag &= ~IN_ADIROP;
 571         }
 572 }
 573
 574 int
 575 lfs_symlink(void *v)
 576 {
 577         struct vop_symlink_args /* {
 578                 struct vnode *a_dvp;
 579                 struct vnode **a_vpp;
 580                 struct componentname *a_cnp;
 581                 struct vattr *a_vap;
 582                 char *a_target;
 583         } */ *ap = v;
 584         int error;
 585
 586         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 587                 vput(ap->a_dvp);
 588                 return error;
 589         }
 590         error = ufs_symlink(ap);
 591         SET_ENDOP_CREATE_AP(ap, "symlink");
 592         return (error);
 593 }
 594
 595 int
 596 lfs_mknod(void *v)
 597 {
 598         struct vop_mknod_args   /* {
 599                 struct vnode *a_dvp;
 600                 struct vnode **a_vpp;
 601                 struct componentname *a_cnp;
 602                 struct vattr *a_vap;
 603         } */ *ap = v;
 604         struct vattr *vap = ap->a_vap;
 605         struct vnode **vpp = ap->a_vpp;
 606         struct inode *ip;
 607         int error;
 608         struct mount    *mp;
 609         ino_t           ino;
 610         struct ufs_lookup_results *ulr;
 611
 612         /* XXX should handle this material another way */
 613         ulr = &VTOI(ap->a_dvp)->i_crap;
 614         UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
 615
 616         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 617                 vput(ap->a_dvp);
 618                 return error;
 619         }
 620         error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 621                               ap->a_dvp, ulr, vpp, ap->a_cnp);
 622
 623         /* Either way we're done with the dirop at this point */
 624         SET_ENDOP_CREATE_AP(ap, "mknod");
 625
 626         if (error)
 627                 return (error);
 628
 629         ip = VTOI(*vpp);
 630         mp  = (*vpp)->v_mount;
 631         ino = ip->i_number;
 632         ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 633         if (vap->va_rdev != VNOVAL) {
 634                 /*
 635                  * Want to be able to use this to make badblock
 636                  * inodes, so don't truncate the dev number.
 637                  */
 638 #if 0
 639                 ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
 640                                            UFS_MPNEEDSWAP((*vpp)->v_mount));
 641 #else
 642                 ip->i_ffs1_rdev = vap->va_rdev;
 643 #endif
 644         }
 645
 646         /*
 647          * Call fsync to write the vnode so that we don't have to deal with
 648          * flushing it when it's marked VU_DIROP|VI_XLOCK.
 649          *
 650          * XXX KS - If we can't flush we also can't call vgone(), so must
 651          * return.  But, that leaves this vnode in limbo, also not good.
 652          * Can this ever happen (barring hardware failure)?
 653          */
 654         if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) {
 655                 panic("lfs_mknod: couldn't fsync (ino %llu)",
 656                       (unsigned long long)ino);
 657                 /* return (error); */
 658         }
 659         /*
 660          * Remove vnode so that it will be reloaded by VFS_VGET and
 661          * checked to see if it is an alias of an existing entry in
 662          * the inode cache.
 663          */
 664         /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */
 665
 666         VOP_UNLOCK(*vpp);
 667         (*vpp)->v_type = VNON;
 668         vgone(*vpp);
 669         error = VFS_VGET(mp, ino, vpp);
 670
 671         if (error != 0) {
 672                 *vpp = NULL;
 673                 return (error);
 674         }
 675         return (0);
 676 }
 677
 678 int
 679 lfs_create(void *v)
 680 {
 681         struct vop_create_args  /* {
 682                 struct vnode *a_dvp;
 683                 struct vnode **a_vpp;
 684                 struct componentname *a_cnp;
 685                 struct vattr *a_vap;
 686         } */ *ap = v;
 687         int error;
 688
 689         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 690                 vput(ap->a_dvp);
 691                 return error;
 692         }
 693         error = ufs_create(ap);
 694         SET_ENDOP_CREATE_AP(ap, "create");
 695         return (error);
 696 }
 697
 698 int
 699 lfs_mkdir(void *v)
 700 {
 701         struct vop_mkdir_args   /* {
 702                 struct vnode *a_dvp;
 703                 struct vnode **a_vpp;
 704                 struct componentname *a_cnp;
 705                 struct vattr *a_vap;
 706         } */ *ap = v;
 707         int error;
 708
 709         if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
 710                 vput(ap->a_dvp);
 711                 return error;
 712         }
 713         error = ufs_mkdir(ap);
 714         SET_ENDOP_CREATE_AP(ap, "mkdir");
 715         return (error);
 716 }
 717
 718 int
 719 lfs_remove(void *v)
 720 {
 721         struct vop_remove_args  /* {
 722                 struct vnode *a_dvp;
 723                 struct vnode *a_vp;
 724                 struct componentname *a_cnp;
 725         } */ *ap = v;
 726         struct vnode *dvp, *vp;
 727         struct inode *ip;
 728         int error;
 729
 730         dvp = ap->a_dvp;
 731         vp = ap->a_vp;
 732         ip = VTOI(vp);
 733         if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) {
 734                 if (dvp == vp)
 735                         vrele(vp);
 736                 else
 737                         vput(vp);
 738                 vput(dvp);
 739                 return error;
 740         }
 741         error = ufs_remove(ap);
 742         if (ip->i_nlink == 0)
 743                 lfs_orphan(ip->i_lfs, ip->i_number);
 744         SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove");
 745         return (error);
 746 }
 747
 748 int
 749 lfs_rmdir(void *v)
 750 {
 751         struct vop_rmdir_args   /* {
 752                 struct vnodeop_desc *a_desc;
 753                 struct vnode *a_dvp;
 754                 struct vnode *a_vp;
 755                 struct componentname *a_cnp;
 756         } */ *ap = v;
 757         struct vnode *vp;
 758         struct inode *ip;
 759         int error;
 760
 761         vp = ap->a_vp;
 762         ip = VTOI(vp);
 763         if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) {
 764                 if (ap->a_dvp == vp)
 765                         vrele(ap->a_dvp);
 766                 else
 767                         vput(ap->a_dvp);
 768                 vput(vp);
 769                 return error;
 770         }
 771         error = ufs_rmdir(ap);
 772         if (ip->i_nlink == 0)
 773                 lfs_orphan(ip->i_lfs, ip->i_number);
 774         SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
 775         return (error);
 776 }
 777
 778 int
 779 lfs_link(void *v)
 780 {
 781         struct vop_link_args    /* {
 782                 struct vnode *a_dvp;
 783                 struct vnode *a_vp;
 784                 struct componentname *a_cnp;
 785         } */ *ap = v;
 786         int error;
 787         struct vnode **vpp = NULL;
 788
 789         if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) {
 790                 vput(ap->a_dvp);
 791                 return error;
 792         }
 793         error = ufs_link(ap);
 794         SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link");
 795         return (error);
 796 }
 797
 798 int
 799 lfs_rename(void *v)
 800 {
 801         struct vop_rename_args  /* {
 802                 struct vnode *a_fdvp;
 803                 struct vnode *a_fvp;
 804                 struct componentname *a_fcnp;
 805                 struct vnode *a_tdvp;
 806                 struct vnode *a_tvp;
 807                 struct componentname *a_tcnp;
 808         } */ *ap = v;
 809         struct vnode *tvp, *fvp, *tdvp, *fdvp;
 810         struct componentname *tcnp, *fcnp;
 811         int error;
 812         struct lfs *fs;
 813
 814         fs = VTOI(ap->a_fdvp)->i_lfs;
 815         tvp = ap->a_tvp;
 816         tdvp = ap->a_tdvp;
 817         tcnp = ap->a_tcnp;
 818         fvp = ap->a_fvp;
 819         fdvp = ap->a_fdvp;
 820         fcnp = ap->a_fcnp;
 821
 822         /*
 823          * Check for cross-device rename.
 824          * If it is, we don't want to set dirops, just error out.
 825          * (In particular note that MARK_VNODE(tdvp) will DTWT on
 826          * a cross-device rename.)
 827          *
 828          * Copied from ufs_rename.
 829          */
 830         if ((fvp->v_mount != tdvp->v_mount) ||
 831             (tvp && (fvp->v_mount != tvp->v_mount))) {
 832                 error = EXDEV;
 833                 goto errout;
 834         }
 835
 836         /*
 837          * Check to make sure we're not renaming a vnode onto itself
 838          * (deleting a hard link by renaming one name onto another);
 839          * if we are we can't recursively call VOP_REMOVE since that
 840          * would leave us with an unaccounted-for number of live dirops.
 841          *
 842          * Inline the relevant section of ufs_rename here, *before*
 843          * calling SET_DIROP_REMOVE.
 844          */
 845         if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
 846                     (VTOI(tdvp)->i_flags & APPEND))) {
 847                 error = EPERM;
 848                 goto errout;
 849         }
 850         if (fvp == tvp) {
 851                 if (fvp->v_type == VDIR) {
 852                         error = EINVAL;
 853                         goto errout;
 854                 }
 855
 856                 /* Release destination completely. */
 857                 VOP_ABORTOP(tdvp, tcnp);
 858                 vput(tdvp);
 859                 vput(tvp);
 860
 861                 /* Delete source. */
 862                 vrele(fvp);
 863                 fcnp->cn_flags &= ~(MODMASK);
 864                 fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 865                 fcnp->cn_nameiop = DELETE;
 866                 vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
 867                 if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
 868                         vput(fdvp);
 869                         return (error);
 870                 }
 871                 return (VOP_REMOVE(fdvp, fvp, fcnp));
 872         }
 873
 874         if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0)
 875                 goto errout;
 876         MARK_VNODE(fdvp);
 877         MARK_VNODE(fvp);
 878
 879         error = ufs_rename(ap);
 880         UNMARK_VNODE(fdvp);
 881         UNMARK_VNODE(fvp);
 882         SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename");
 883         return (error);
 884
 885   errout:
 886         VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */
 887         if (tdvp == tvp)
 888                 vrele(tdvp);
 889         else
 890                 vput(tdvp);
 891         if (tvp)
 892                 vput(tvp);
 893         VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? */
 894         vrele(fdvp);
 895         vrele(fvp);
 896         return (error);
 897 }
 898
 899 /* XXX hack to avoid calling ITIMES in getattr */
 900 int
 901 lfs_getattr(void *v)
 902 {
 903         struct vop_getattr_args /* {
 904                 struct vnode *a_vp;
 905                 struct vattr *a_vap;
 906                 kauth_cred_t a_cred;
 907         } */ *ap = v;
 908         struct vnode *vp = ap->a_vp;
 909         struct inode *ip = VTOI(vp);
 910         struct vattr *vap = ap->a_vap;
 911         struct lfs *fs = ip->i_lfs;
 912         /*
 913          * Copy from inode table
 914          */
 915         vap->va_fsid = ip->i_dev;
 916         vap->va_fileid = ip->i_number;
 917         vap->va_mode = ip->i_mode & ~IFMT;
 918         vap->va_nlink = ip->i_nlink;
 919         vap->va_uid = ip->i_uid;
 920         vap->va_gid = ip->i_gid;
 921         vap->va_rdev = (dev_t)ip->i_ffs1_rdev;
 922         vap->va_size = vp->v_size;
 923         vap->va_atime.tv_sec = ip->i_ffs1_atime;
 924         vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
 925         vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
 926         vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
 927         vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
 928         vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
 929         vap->va_flags = ip->i_flags;
 930         vap->va_gen = ip->i_gen;
 931         /* this doesn't belong here */
 932         if (vp->v_type == VBLK)
 933                 vap->va_blocksize = BLKDEV_IOSIZE;
 934         else if (vp->v_type == VCHR)
 935                 vap->va_blocksize = MAXBSIZE;
 936         else
 937                 vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 938         vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
 939         vap->va_type = vp->v_type;
 940         vap->va_filerev = ip->i_modrev;
 941         return (0);
 942 }
 943
 944 /*
 945  * Check to make sure the inode blocks won't choke the buffer
 946  * cache, then call ufs_setattr as usual.
 947  */
 948 int
 949 lfs_setattr(void *v)
 950 {
 951         struct vop_setattr_args /* {
 952                 struct vnode *a_vp;
 953                 struct vattr *a_vap;
 954                 kauth_cred_t a_cred;
 955         } */ *ap = v;
 956         struct vnode *vp = ap->a_vp;
 957
 958         lfs_check(vp, LFS_UNUSED_LBN, 0);
 959         return ufs_setattr(v);
 960 }
 961
 962 /*
 963  * Release the block we hold on lfs_newseg wrapping.  Called on file close,
 964  * or explicitly from LFCNWRAPGO.  Called with the interlock held.
 965  */
 966 static int
 967 lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor)
 968 {
 969         if (fs->lfs_stoplwp != curlwp)
 970                 return EBUSY;
 971
 972         fs->lfs_stoplwp = NULL;
 973         cv_signal(&fs->lfs_stopcv);
 974
 975         KASSERT(fs->lfs_nowrap > 0);
 976         if (fs->lfs_nowrap <= 0) {
 977                 return 0;
 978         }
 979
 980         if (--fs->lfs_nowrap == 0) {
 981                 log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt);
 982                 wakeup(&fs->lfs_wrappass);
 983                 lfs_wakeup_cleaner(fs);
 984         }
 985         if (waitfor) {
 986                 mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment",
 987                     0, &lfs_lock);
 988         }
 989
 990         return 0;
 991 }
 992
 993 /*
 994  * Close called
 995  */
 996 /* ARGSUSED */
 997 int
 998 lfs_close(void *v)
 999 {
1000         struct vop_close_args /* {
1001                 struct vnode *a_vp;
1002                 int  a_fflag;
1003                 kauth_cred_t a_cred;
1004         } */ *ap = v;
1005         struct vnode *vp = ap->a_vp;
1006         struct inode *ip = VTOI(vp);
1007         struct lfs *fs = ip->i_lfs;
1008
1009         if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) &&
1010             fs->lfs_stoplwp == curlwp) {
1011                 mutex_enter(&lfs_lock);
1012                 log(LOG_NOTICE, "lfs_close: releasing log wrap control\n");
1013                 lfs_wrapgo(fs, ip, 0);
1014                 mutex_exit(&lfs_lock);
1015         }
1016
1017         if (vp == ip->i_lfs->lfs_ivnode &&
1018             vp->v_mount->mnt_iflag & IMNT_UNMOUNT)
1019                 return 0;
1020
1021         if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) {
1022                 LFS_ITIMES(ip, NULL, NULL, NULL);
1023         }
1024         return (0);
1025 }
1026
1027 /*
1028  * Close wrapper for special devices.
1029  *
1030  * Update the times on the inode then do device close.
1031  */
1032 int
1033 lfsspec_close(void *v)
1034 {
1035         struct vop_close_args /* {
1036                 struct vnode    *a_vp;
1037                 int             a_fflag;
1038                 kauth_cred_t    a_cred;
1039         } */ *ap = v;
1040         struct vnode    *vp;
1041         struct inode    *ip;
1042
1043         vp = ap->a_vp;
1044         ip = VTOI(vp);
1045         if (vp->v_usecount > 1) {
1046                 LFS_ITIMES(ip, NULL, NULL, NULL);
1047         }
1048         return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
1049 }
1050
1051 /*
1052  * Close wrapper for fifo's.
1053  *
1054  * Update the times on the inode then do device close.
1055  */
1056 int
1057 lfsfifo_close(void *v)
1058 {
1059         struct vop_close_args /* {
1060                 struct vnode    *a_vp;
1061                 int             a_fflag;
1062                 kauth_cred_     a_cred;
1063         } */ *ap = v;
1064         struct vnode    *vp;
1065         struct inode    *ip;
1066
1067         vp = ap->a_vp;
1068         ip = VTOI(vp);
1069         if (ap->a_vp->v_usecount > 1) {
1070                 LFS_ITIMES(ip, NULL, NULL, NULL);
1071         }
1072         return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
1073 }
1074
1075 /*
1076  * Reclaim an inode so that it can be used for other purposes.
1077  */
1078
1079 int
1080 lfs_reclaim(void *v)
1081 {
1082         struct vop_reclaim_args /* {
1083                 struct vnode *a_vp;
1084         } */ *ap = v;
1085         struct vnode *vp = ap->a_vp;
1086         struct inode *ip = VTOI(vp);
1087         struct lfs *fs = ip->i_lfs;
1088         int error;
1089
1090         /*
1091          * The inode must be freed and updated before being removed
1092          * from its hash chain.  Other threads trying to gain a hold
1093          * on the inode will be stalled because it is locked (VI_XLOCK).
1094          */
1095         if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
1096                 lfs_vfree(vp, ip->i_number, ip->i_omode);
1097
1098         mutex_enter(&lfs_lock);
1099         LFS_CLR_UINO(ip, IN_ALLMOD);
1100         mutex_exit(&lfs_lock);
1101         if ((error = ufs_reclaim(vp)))
1102                 return (error);
1103
1104         /*
1105          * Take us off the paging and/or dirop queues if we were on them.
1106          * We shouldn't be on them.
1107          */
1108         mutex_enter(&lfs_lock);
1109         if (ip->i_flags & IN_PAGING) {
1110                 log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n",
1111                     fs->lfs_fsmnt);
1112                 ip->i_flags &= ~IN_PAGING;
1113                 TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
1114         }
1115         if (vp->v_uflag & VU_DIROP) {
1116                 panic("reclaimed vnode is VU_DIROP");
1117                 vp->v_uflag &= ~VU_DIROP;
1118                 TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
1119         }
1120         mutex_exit(&lfs_lock);
1121
1122         pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
1123         lfs_deregister_all(vp);
1124         pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
1125         ip->inode_ext.lfs = NULL;
1126         genfs_node_destroy(vp);
1127         pool_put(&lfs_inode_pool, vp->v_data);
1128         vp->v_data = NULL;
1129         return (0);
1130 }
1131
1132 /*
1133  * Read a block from a storage device.
1134  * In order to avoid reading blocks that are in the process of being
1135  * written by the cleaner---and hence are not mutexed by the normal
1136  * buffer cache / page cache mechanisms---check for collisions before
1137  * reading.
1138  *
1139  * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
1140  * the active cleaner test.
1141  *
1142  * XXX This code assumes that lfs_markv makes synchronous checkpoints.
1143  */
1144 int
1145 lfs_strategy(void *v)
1146 {
1147         struct vop_strategy_args /* {
1148                 struct vnode *a_vp;
1149                 struct buf *a_bp;
1150         } */ *ap = v;
1151         struct buf      *bp;
1152         struct lfs      *fs;
1153         struct vnode    *vp;
1154         struct inode    *ip;
1155         daddr_t         tbn;
1156         int             i, sn, error, slept;
1157
1158         bp = ap->a_bp;
1159         vp = ap->a_vp;
1160         ip = VTOI(vp);
1161         fs = ip->i_lfs;
1162
1163         /* lfs uses its strategy routine only for read */
1164         KASSERT(bp->b_flags & B_READ);
1165
1166         if (vp->v_type == VBLK || vp->v_type == VCHR)
1167                 panic("lfs_strategy: spec");
1168         KASSERT(bp->b_bcount != 0);
1169         if (bp->b_blkno == bp->b_lblkno) {
1170                 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
1171                                  NULL);
1172                 if (error) {
1173                         bp->b_error = error;
1174                         bp->b_resid = bp->b_bcount;
1175                         biodone(bp);
1176                         return (error);
1177                 }
1178                 if ((long)bp->b_blkno == -1) /* no valid data */
1179                         clrbuf(bp);
1180         }
1181         if ((long)bp->b_blkno < 0) { /* block is not on disk */
1182                 bp->b_resid = bp->b_bcount;
1183                 biodone(bp);
1184                 return (0);
1185         }
1186
1187         slept = 1;
1188         mutex_enter(&lfs_lock);
1189         while (slept && fs->lfs_seglock) {
1190                 mutex_exit(&lfs_lock);
1191                 /*
1192                  * Look through list of intervals.
1193                  * There will only be intervals to look through
1194                  * if the cleaner holds the seglock.
1195                  * Since the cleaner is synchronous, we can trust
1196                  * the list of intervals to be current.
1197                  */
1198                 tbn = dbtofsb(fs, bp->b_blkno);
1199                 sn = dtosn(fs, tbn);
1200                 slept = 0;
1201                 for (i = 0; i < fs->lfs_cleanind; i++) {
1202                         if (sn == dtosn(fs, fs->lfs_cleanint[i]) &&
1203                             tbn >= fs->lfs_cleanint[i]) {
1204                                 DLOG((DLOG_CLEAN,
1205                                       "lfs_strategy: ino %d lbn %" PRId64
1206                                       " ind %d sn %d fsb %" PRIx32
1207                                       " given sn %d fsb %" PRIx64 "\n",
1208                                       ip->i_number, bp->b_lblkno, i,
1209                                       dtosn(fs, fs->lfs_cleanint[i]),
1210                                       fs->lfs_cleanint[i], sn, tbn));
1211                                 DLOG((DLOG_CLEAN,
1212                                       "lfs_strategy: sleeping on ino %d lbn %"
1213                                       PRId64 "\n", ip->i_number, bp->b_lblkno));
1214                                 mutex_enter(&lfs_lock);
1215                                 if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
1216                                         /* Cleaner can't wait for itself */
1217                                         mtsleep(&fs->lfs_iocount,
1218                                                 (PRIBIO + 1) | PNORELOCK,
1219                                                 "clean2", 0,
1220                                                 &lfs_lock);
1221                                         slept = 1;
1222                                         break;
1223                                 } else if (fs->lfs_seglock) {
1224                                         mtsleep(&fs->lfs_seglock,
1225                                                 (PRIBIO + 1) | PNORELOCK,
1226                                                 "clean1", 0,
1227                                                 &lfs_lock);
1228                                         slept = 1;
1229                                         break;
1230                                 }
1231                                 mutex_exit(&lfs_lock);
1232                         }
1233                 }
1234                 mutex_enter(&lfs_lock);
1235         }
1236         mutex_exit(&lfs_lock);
1237
1238         vp = ip->i_devvp;
1239         VOP_STRATEGY(vp, bp);
1240         return (0);
1241 }
1242
1243 void
1244 lfs_flush_dirops(struct lfs *fs)
1245 {
1246         struct inode *ip, *nip;
1247         struct vnode *vp;
1248         extern int lfs_dostats;
1249         struct segment *sp;
1250
1251         ASSERT_MAYBE_SEGLOCK(fs);
1252         KASSERT(fs->lfs_nadirop == 0);
1253
1254         if (fs->lfs_ronly)
1255                 return;
1256
1257         mutex_enter(&lfs_lock);
1258         if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
1259                 mutex_exit(&lfs_lock);
1260                 return;
1261         } else
1262                 mutex_exit(&lfs_lock);
1263
1264         if (lfs_dostats)
1265                 ++lfs_stats.flush_invoked;
1266
1267         /*
1268          * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
1269          * Technically this is a checkpoint (the on-disk state is valid)
1270          * even though we are leaving out all the file data.
1271          */
1272         lfs_imtime(fs);
1273         lfs_seglock(fs, SEGM_CKP);
1274         sp = fs->lfs_sp;
1275
1276         /*
1277          * lfs_writevnodes, optimized to get dirops out of the way.
1278          * Only write dirops, and don't flush files' pages, only
1279          * blocks from the directories.
1280          *
1281          * We don't need to vref these files because they are
1282          * dirops and so hold an extra reference until the
1283          * segunlock clears them of that status.
1284          *
1285          * We don't need to check for IN_ADIROP because we know that
1286          * no dirops are active.
1287          *
1288          */
1289         mutex_enter(&lfs_lock);
1290         for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
1291                 nip = TAILQ_NEXT(ip, i_lfs_dchain);
1292                 mutex_exit(&lfs_lock);
1293                 vp = ITOV(ip);
1294
1295                 KASSERT((ip->i_flag & IN_ADIROP) == 0);
1296
1297                 /*
1298                  * All writes to directories come from dirops; all
1299                  * writes to files' direct blocks go through the page
1300                  * cache, which we're not touching.  Reads to files
1301                  * and/or directories will not be affected by writing
1302                  * directory blocks inodes and file inodes.  So we don't
1303                  * really need to lock.  If we don't lock, though,
1304                  * make sure that we don't clear IN_MODIFIED
1305                  * unnecessarily.
1306                  */
1307                 if (vp->v_iflag & VI_XLOCK) {
1308                         mutex_enter(&lfs_lock);
1309                         continue;
1310                 }
1311                 /* XXX see below
1312                  * waslocked = VOP_ISLOCKED(vp);
1313                  */
1314                 if (vp->v_type != VREG &&
1315                     ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
1316                         lfs_writefile(fs, sp, vp);
1317                         if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
1318                             !(ip->i_flag & IN_ALLMOD)) {
1319                                 mutex_enter(&lfs_lock);
1320                                 LFS_SET_UINO(ip, IN_MODIFIED);
1321                                 mutex_exit(&lfs_lock);
1322                         }
1323                 }
1324                 KDASSERT(ip->i_number != LFS_IFILE_INUM);
1325                 (void) lfs_writeinode(fs, sp, ip);
1326                 mutex_enter(&lfs_lock);
1327                 /*
1328                  * XXX
1329                  * LK_EXCLOTHER is dead -- what is intended here?
1330                  * if (waslocked == LK_EXCLOTHER)
1331                  *      LFS_SET_UINO(ip, IN_MODIFIED);
1332                  */
1333         }
1334         mutex_exit(&lfs_lock);
1335         /* We've written all the dirops there are */
1336         ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
1337         lfs_finalize_fs_seguse(fs);
1338         (void) lfs_writeseg(fs, sp);
1339         lfs_segunlock(fs);
1340 }
1341
1342 /*
1343  * Flush all vnodes for which the pagedaemon has requested pageouts.
1344  * Skip over any files that are marked VU_DIROP (since lfs_flush_dirop()
1345  * has just run, this would be an error).  If we have to skip a vnode
1346  * for any reason, just skip it; if we have to wait for the cleaner,
1347  * abort.  The writer daemon will call us again later.
1348  */
1349 void
1350 lfs_flush_pchain(struct lfs *fs)
1351 {
1352         struct inode *ip, *nip;
1353         struct vnode *vp;
1354         extern int lfs_dostats;
1355         struct segment *sp;
1356         int error;
1357
1358         ASSERT_NO_SEGLOCK(fs);
1359
1360         if (fs->lfs_ronly)
1361                 return;
1362
1363         mutex_enter(&lfs_lock);
1364         if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
1365                 mutex_exit(&lfs_lock);
1366                 return;
1367         } else
1368                 mutex_exit(&lfs_lock);
1369
1370         /* Get dirops out of the way */
1371         lfs_flush_dirops(fs);
1372
1373         if (lfs_dostats)
1374                 ++lfs_stats.flush_invoked;
1375
1376         /*
1377          * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
1378          */
1379         lfs_imtime(fs);
1380         lfs_seglock(fs, 0);
1381         sp = fs->lfs_sp;
1382
1383         /*
1384          * lfs_writevnodes, optimized to clear pageout requests.
1385          * Only write non-dirop files that are in the pageout queue.
1386          * We're very conservative about what we write; we want to be
1387          * fast and async.
1388          */
1389         mutex_enter(&lfs_lock);
1390     top:
1391         for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) {
1392                 nip = TAILQ_NEXT(ip, i_lfs_pchain);
1393                 vp = ITOV(ip);
1394
1395                 if (!(ip->i_flags & IN_PAGING))
1396                         goto top;
1397
1398                 mutex_enter(vp->v_interlock);
1399                 if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) {
1400                         mutex_exit(vp->v_interlock);
1401                         continue;
1402                 }
1403                 if (vp->v_type != VREG) {
1404                         mutex_exit(vp->v_interlock);
1405                         continue;
1406                 }
1407                 if (lfs_vref(vp))
1408                         continue;
1409                 mutex_exit(&lfs_lock);
1410
1411                 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_RETRY) != 0) {
1412                         lfs_vunref(vp);
1413                         mutex_enter(&lfs_lock);
1414                         continue;
1415                 }
1416
1417                 error = lfs_writefile(fs, sp, vp);
1418                 if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
1419                     !(ip->i_flag & IN_ALLMOD)) {
1420                         mutex_enter(&lfs_lock);
1421                         LFS_SET_UINO(ip, IN_MODIFIED);
1422                         mutex_exit(&lfs_lock);
1423                 }
1424                 KDASSERT(ip->i_number != LFS_IFILE_INUM);
1425                 (void) lfs_writeinode(fs, sp, ip);
1426
1427                 VOP_UNLOCK(vp);
1428                 lfs_vunref(vp);
1429
1430                 if (error == EAGAIN) {
1431                         lfs_writeseg(fs, sp);
1432                         mutex_enter(&lfs_lock);
1433                         break;
1434                 }
1435                 mutex_enter(&lfs_lock);
1436         }
1437         mutex_exit(&lfs_lock);
1438         (void) lfs_writeseg(fs, sp);
1439         lfs_segunlock(fs);
1440 }
1441
1442 /*
1443  * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
1444  */
1445 int
1446 lfs_fcntl(void *v)
1447 {
1448         struct vop_fcntl_args /* {
1449                 struct vnode *a_vp;
1450                 u_int a_command;
1451                 void * a_data;
1452                 int  a_fflag;
1453                 kauth_cred_t a_cred;
1454         } */ *ap = v;
1455         struct timeval tv;
1456         struct timeval *tvp;
1457         BLOCK_INFO *blkiov;
1458         CLEANERINFO *cip;
1459         SEGUSE *sup;
1460         int blkcnt, error, oclean;
1461         size_t fh_size;
1462         struct lfs_fcntl_markv blkvp;
1463         struct lwp *l;
1464         fsid_t *fsidp;
1465         struct lfs *fs;
1466         struct buf *bp;
1467         fhandle_t *fhp;
1468         daddr_t off;
1469
1470         /* Only respect LFS fcntls on fs root or Ifile */
1471         if (VTOI(ap->a_vp)->i_number != ROOTINO &&
1472             VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) {
1473                 return ufs_fcntl(v);
1474         }
1475
1476         /* Avoid locking a draining lock */
1477         if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) {
1478                 return ESHUTDOWN;
1479         }
1480
1481         /* LFS control and monitoring fcntls are available only to root */
1482         l = curlwp;
1483         if (((ap->a_command & 0xff00) >> 8) == 'L' &&
1484             (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
1485                                              NULL)) != 0)
1486                 return (error);
1487
1488         fs = VTOI(ap->a_vp)->i_lfs;
1489         fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx;
1490
1491         error = 0;
1492         switch ((int)ap->a_command) {
1493             case LFCNSEGWAITALL_COMPAT_50:
1494             case LFCNSEGWAITALL_COMPAT:
1495                 fsidp = NULL;
1496                 /* FALLSTHROUGH */
1497             case LFCNSEGWAIT_COMPAT_50:
1498             case LFCNSEGWAIT_COMPAT:
1499                 {
1500                         struct timeval50 *tvp50
1501                                 = (struct timeval50 *)ap->a_data;
1502                         timeval50_to_timeval(tvp50, &tv);
1503                         tvp = &tv;
1504                 }
1505                 goto segwait_common;
1506             case LFCNSEGWAITALL:
1507                 fsidp = NULL;
1508                 /* FALLSTHROUGH */
1509             case LFCNSEGWAIT:
1510                 tvp = (struct timeval *)ap->a_data;
1511 segwait_common:
1512                 mutex_enter(&lfs_lock);
1513                 ++fs->lfs_sleepers;
1514                 mutex_exit(&lfs_lock);
1515
1516                 error = lfs_segwait(fsidp, tvp);
1517
1518                 mutex_enter(&lfs_lock);
1519                 if (--fs->lfs_sleepers == 0)
1520                         wakeup(&fs->lfs_sleepers);
1521                 mutex_exit(&lfs_lock);
1522                 return error;
1523
1524             case LFCNBMAPV:
1525             case LFCNMARKV:
1526                 blkvp = *(struct lfs_fcntl_markv *)ap->a_data;
1527
1528                 blkcnt = blkvp.blkcnt;
1529                 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
1530                         return (EINVAL);
1531                 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
1532                 if ((error = copyin(blkvp.blkiov, blkiov,
1533                      blkcnt * sizeof(BLOCK_INFO))) != 0) {
1534                         lfs_free(fs, blkiov, LFS_NB_BLKIOV);
1535                         return error;
1536                 }
1537
1538                 mutex_enter(&lfs_lock);
1539                 ++fs->lfs_sleepers;
1540                 mutex_exit(&lfs_lock);
1541                 if (ap->a_command == LFCNBMAPV)
1542                         error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt);
1543                 else /* LFCNMARKV */
1544                         error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt);
1545                 if (error == 0)
1546                         error = copyout(blkiov, blkvp.blkiov,
1547                                         blkcnt * sizeof(BLOCK_INFO));
1548                 mutex_enter(&lfs_lock);
1549                 if (--fs->lfs_sleepers == 0)
1550                         wakeup(&fs->lfs_sleepers);
1551                 mutex_exit(&lfs_lock);
1552                 lfs_free(fs, blkiov, LFS_NB_BLKIOV);
1553                 return error;
1554
1555             case LFCNRECLAIM:
1556                 /*
1557                  * Flush dirops and write Ifile, allowing empty segments
1558                  * to be immediately reclaimed.
1559                  */
1560                 lfs_writer_enter(fs, "pndirop");
1561                 off = fs->lfs_offset;
1562                 lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP);
1563                 lfs_flush_dirops(fs);
1564                 LFS_CLEANERINFO(cip, fs, bp);
1565                 oclean = cip->clean;
1566                 LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
1567                 lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
1568                 fs->lfs_sp->seg_flags |= SEGM_PROT;
1569                 lfs_segunlock(fs);
1570                 lfs_writer_leave(fs);
1571
1572 #ifdef DEBUG
1573                 LFS_CLEANERINFO(cip, fs, bp);
1574                 DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64
1575                       " blocks, cleaned %" PRId32 " segments (activesb %d)\n",
1576                       fs->lfs_offset - off, cip->clean - oclean,
1577                       fs->lfs_activesb));
1578                 LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
1579 #endif
1580
1581                 return 0;
1582
1583             case LFCNIFILEFH_COMPAT:
1584                 /* Return the filehandle of the Ifile */
1585                 if ((error = kauth_authorize_system(l->l_cred,
1586                     KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0)
1587                         return (error);
1588                 fhp = (struct fhandle *)ap->a_data;
1589                 fhp->fh_fsid = *fsidp;
1590                 fh_size = 16;   /* former VFS_MAXFIDSIZ */
1591                 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
1592
1593             case LFCNIFILEFH_COMPAT2:
1594             case LFCNIFILEFH:
1595                 /* Return the filehandle of the Ifile */
1596                 fhp = (struct fhandle *)ap->a_data;
1597                 fhp->fh_fsid = *fsidp;
1598                 fh_size = sizeof(struct lfs_fhandle) -
1599                     offsetof(fhandle_t, fh_fid);
1600                 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
1601
1602             case LFCNREWIND:
1603                 /* Move lfs_offset to the lowest-numbered segment */
1604                 return lfs_rewind(fs, *(int *)ap->a_data);
1605
1606             case LFCNINVAL:
1607                 /* Mark a segment SEGUSE_INVAL */
1608                 LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp);
1609                 if (sup->su_nbytes > 0) {
1610                         brelse(bp, 0);
1611                         lfs_unset_inval_all(fs);
1612                         return EBUSY;
1613                 }
1614                 sup->su_flags |= SEGUSE_INVAL;
1615                 VOP_BWRITE(bp->b_vp, bp);
1616                 return 0;
1617
1618             case LFCNRESIZE:
1619                 /* Resize the filesystem */
1620                 return lfs_resize_fs(fs, *(int *)ap->a_data);
1621
1622             case LFCNWRAPSTOP:
1623             case LFCNWRAPSTOP_COMPAT:
1624                 /*
1625                  * Hold lfs_newseg at segment 0; if requested, sleep until
1626                  * the filesystem wraps around.  To support external agents
1627                  * (dump, fsck-based regression test) that need to look at
1628                  * a snapshot of the filesystem, without necessarily
1629                  * requiring that all fs activity stops.
1630                  */
1631                 if (fs->lfs_stoplwp == curlwp)
1632                         return EALREADY;
1633
1634                 mutex_enter(&lfs_lock);
1635                 while (fs->lfs_stoplwp != NULL)
1636                         cv_wait(&fs->lfs_stopcv, &lfs_lock);
1637                 fs->lfs_stoplwp = curlwp;
1638                 if (fs->lfs_nowrap == 0)
1639                         log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt);
1640                 ++fs->lfs_nowrap;
1641                 if (*(int *)ap->a_data == 1
1642                     || ap->a_command == LFCNWRAPSTOP_COMPAT) {
1643                         log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n");
1644                         error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
1645                                 "segwrap", 0, &lfs_lock);
1646                         log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n");
1647                         if (error) {
1648                                 lfs_wrapgo(fs, VTOI(ap->a_vp), 0);
1649                         }
1650                 }
1651                 mutex_exit(&lfs_lock);
1652                 return 0;
1653
1654             case LFCNWRAPGO:
1655             case LFCNWRAPGO_COMPAT:
1656                 /*
1657                  * Having done its work, the agent wakes up the writer.
1658                  * If the argument is 1, it sleeps until a new segment
1659                  * is selected.
1660                  */
1661                 mutex_enter(&lfs_lock);
1662                 error = lfs_wrapgo(fs, VTOI(ap->a_vp),
1663                                    ap->a_command == LFCNWRAPGO_COMPAT ? 1 :
1664                                     *((int *)ap->a_data));
1665                 mutex_exit(&lfs_lock);
1666                 return error;
1667
1668             case LFCNWRAPPASS:
1669                 if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT))
1670                         return EALREADY;
1671                 mutex_enter(&lfs_lock);
1672                 if (fs->lfs_stoplwp != curlwp) {
1673                         mutex_exit(&lfs_lock);
1674                         return EALREADY;
1675                 }
1676                 if (fs->lfs_nowrap == 0) {
1677                         mutex_exit(&lfs_lock);
1678                         return EBUSY;
1679                 }
1680                 fs->lfs_wrappass = 1;
1681                 wakeup(&fs->lfs_wrappass);
1682                 /* Wait for the log to wrap, if asked */
1683                 if (*(int *)ap->a_data) {
1684                         mutex_enter(ap->a_vp->v_interlock);
1685                         lfs_vref(ap->a_vp);
1686                         VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
1687                         log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
1688                         error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
1689                                 "segwrap", 0, &lfs_lock);
1690                         log(LOG_NOTICE, "LFCNPASS done waiting\n");
1691                         VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT;
1692                         lfs_vunref(ap->a_vp);
1693                 }
1694                 mutex_exit(&lfs_lock);
1695                 return error;
1696
1697             case LFCNWRAPSTATUS:
1698                 mutex_enter(&lfs_lock);
1699                 *(int *)ap->a_data = fs->lfs_wrapstatus;
1700                 mutex_exit(&lfs_lock);
1701                 return 0;
1702
1703             default:
1704                 return ufs_fcntl(v);
1705         }
1706         return 0;
1707 }
1708
     /*
      * VOP_GETPAGES for LFS.
      *
      * A request that includes write access is refused with EPERM when the
      * vnode is inode LFS_IFILE_INUM (the Ifile); for any other vnode such a
      * request sets IN_MODIFIED on the inode, under lfs_lock, before any page
      * is fetched.  Everything else is delegated to genfs_getpages(), whose
      * return value is passed back unchanged.
      */
1709 int
1710 lfs_getpages(void *v)
1711 {
1712         struct vop_getpages_args /* {
1713                 struct vnode *a_vp;
1714                 voff_t a_offset;
1715                 struct vm_page **a_m;
1716                 int *a_count;
1717                 int a_centeridx;
1718                 vm_prot_t a_access_type;
1719                 int a_advice;
1720                 int a_flags;
1721         } */ *ap = v;
1722
             /* The Ifile may never be paged in for writing. */
1723         if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM &&
1724             (ap->a_access_type & VM_PROT_WRITE) != 0) {
1725                 return EPERM;
1726         }
             /*
              * Write access to any other file: flag the inode as modified
              * now, since the pages may be dirtied once they are handed out.
              */
1727         if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
1728                 mutex_enter(&lfs_lock);
1729                 LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
1730                 mutex_exit(&lfs_lock);
1731         }
1732
1733         /*
1734          * We're relying on the fact that genfs_getpages() always reads in
1735          * entire filesystem blocks.
1736          */
1737         return genfs_getpages(v);
1738 }
1739
1740 /*
1741  * Wait for a page to become unbusy, possibly printing diagnostic messages
1742  * as well.
1743  *
1744  * Called with vp->v_interlock held; return with it held.
      *
      * vp:    vnode whose v_interlock protects the page.
      * pg:    the page to wait for; must not be NULL (it is dereferenced
      *        unconditionally).
      * label: tag used in the diagnostic message; NULL suppresses the message.
      *
      * Returns at once if the page is not PG_BUSY.  Otherwise sleeps a single
      * time; PG_BUSY is not re-tested after the wakeup, so callers that need
      * the page idle must check again and call back in.
1745  */
1746 static void
1747 wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
1748 {
1749         if ((pg->flags & PG_BUSY) == 0)
1750                 return;         /* Nothing to wait for! */
1751
1752 #if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
             /*
              * Remember the last page waited on so that repeated waits on
              * the same page produce only one message.
              */
1753         static struct vm_page *lastpg;
1754
1755         if (label != NULL && pg != lastpg) {
1756                 if (pg->owner_tag) {
1757                         printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
1758                                curproc->p_pid, curlwp->l_lid, label,
1759                                pg, pg->owner, pg->lowner, pg->owner_tag);
1760                 } else {
1761                         printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
1762                                curproc->p_pid, curlwp->l_lid, label, pg);
1763                 }
1764         }
1765         lastpg = pg;
1766 #endif
1767
             /*
              * Ask whoever holds the page busy to wake us, then sleep on the
              * page.  The interlock is handed to UVM_UNLOCK_AND_WAIT and is
              * taken again below to honour the "return with it held" contract.
              */
1768         pg->flags |= PG_WANTED;
1769         UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0);
1770         mutex_enter(vp->v_interlock);
1771 }
1772
1773 /*
1774  * This routine is called by lfs_putpages() when it can't complete the
1775  * write because a page is busy.  This means that either (1) someone,
1776  * possibly the pagedaemon, is looking at this page, and will give it up
1777  * presently; or (2) we ourselves are holding the page busy in the
1778  * process of being written (either gathered or actually on its way to
1779  * disk).  We don't need to give up the segment lock, but we might need
1780  * to call lfs_writeseg() to expedite the page's journey to disk.
1781  *
1782  * Called with vp->v_interlock held; return with it held.
      *
      * fs:        filesystem whose current partial segment (fs->lfs_sp) may
      *            be written out.
      * vp:        vnode that owns the page.
      * pg:        the busy page; NULL makes this a no-op.
      * seglocked: not referenced by the body.
      * label:     tag for diagnostic messages; NULL suppresses them.
      *
      * If BUSYWAIT is defined, none of the above is done and the routine
      * merely calls preempt().
1783  */
1784 /* #define BUSYWAIT */
1785 static void
1786 write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
1787                int seglocked, const char *label)
1788 {
1789 #ifndef BUSYWAIT
1790         struct inode *ip = VTOI(vp);
1791         struct segment *sp = fs->lfs_sp;
1792         int count = 0;  /* number of passes through the loop below */
1793
1794         if (pg == NULL)
1795                 return;
1796
             /*
              * Keep going for as long as the page is busy and still belongs
              * to this vnode's object.
              */
1797         while (pg->flags & PG_BUSY &&
1798             pg->uobject == &vp->v_uobj) {
                     /* The interlock is not held across the segment write. */
1799                 mutex_exit(vp->v_interlock);
                     /*
                      * More than one buffer pointer is queued in the partial
                      * segment (presumably slot 0 is the summary -- confirm),
                      * so there is gathered data that can be pushed out.
                      */
1800                 if (sp->cbpp - sp->bpp > 1) {
1801                         /* Write gathered pages */
1802                         lfs_updatemeta(sp);
1803                         lfs_release_finfo(fs);
1804                         (void) lfs_writeseg(fs, sp);
1805
1806                         /*
1807                          * Reinitialize FIP
1808                          */
1809                         KASSERT(sp->vp == vp);
1810                         lfs_acquire_finfo(fs, ip->i_number,
1811                                           ip->i_gen);
1812                 }
1813                 ++count;
1814                 mutex_enter(vp->v_interlock);
                     /* Sleeps once if the page is still busy; loop re-tests. */
1815                 wait_for_page(vp, pg, label);
1816         }
             /*
              * Report only when we went around more than once.  Under that
              * guard the "count > 0" test is always true, so "looping, " is
              * always part of the message.
              */
1817         if (label != NULL && count > 1)
1818                 printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
1819                        label, (count > 0 ? "looping, " : ""), count);
1820 #else
             /*
              * preempt() takes no argument; this matches the call made in
              * lfs_putpages().
              */
1821         preempt();
1822 #endif
1823 }
1824
1825 /*
1826  * Make sure that for all pages in every block in the given range,
1827  * either all are dirty or all are clean.  If any of the pages
1828  * we've seen so far are dirty, put the vnode on the paging chain,
1829  * and mark it IN_PAGING.
1830  *
1831  * If checkfirst != 0, don't check all the pages but return at the
1832  * first dirty page.
      *
      * NOTE(review): nothing in this function touches IN_PAGING or the
      * paging chain (lfs_putpages() does that itself for the pagedaemon).
      * What happens here to a block containing a dirty page is that PG_CLEAN
      * is cleared on every page of the block and, when PGO_FREE is set in
      * flags, each page is wired and marked PG_DELWRI.
      *
      * fs, vp:     the filesystem and the vnode whose pages are examined.
      * startoffset,
      * endoffset:  byte offsets bounding the scan.
      * blkeof:     additional upper bound used when scanning by offset.
      * flags:      only PGO_FREE is examined here.
      * checkfirst: stop after the first block found to contain a dirty page.
      * pgp:        if not NULL, receives the busy page when -1 is returned.
      *
      * Returns -1 if a busy page is met while running as the pagedaemon or
      * with the segment lock held; otherwise the number of dirty pages
      * counted in the blocks examined.
      *
      * lfs_putpages() calls this with vp->v_interlock held; wait_for_page()
      * relies on that.
1833  */
1834 static int
1835 check_dirty(struct lfs *fs, struct vnode *vp,
1836             off_t startoffset, off_t endoffset, off_t blkeof,
1837             int flags, int checkfirst, struct vm_page **pgp)
1838 {
1839         int by_list;
1840         struct vm_page *curpg = NULL; /* XXX: gcc */
1841         struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg;
1842         off_t soff = 0; /* XXX: gcc */
1843         voff_t off;
1844         int i;
1845         int nonexistent; /* pages of the current block that are not resident */
1846         int any_dirty;  /* number of dirty pages */
1847         int dirty;      /* number of dirty pages in a block */
1848         int tdirty;     /* 1 if the page just examined is dirty, else 0 */
1849         int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT; /* 0 if bsize < PAGE_SIZE */
1850         int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
1851
1852         ASSERT_MAYBE_SEGLOCK(fs);
1853   top:
             /*
              * Walk the object's page list when the object holds few pages
              * relative to the size of the range (scaled by
              * UVM_PAGE_TREE_PENALTY); otherwise look pages up by offset.
              * A restart via "goto top" re-evaluates this choice and starts
              * the dirty count from zero again.
              */
1854         by_list = (vp->v_uobj.uo_npages <=
1855                    ((endoffset - startoffset) >> PAGE_SHIFT) *
1856                    UVM_PAGE_TREE_PENALTY);
1857         any_dirty = 0;
1858
1859         if (by_list) {
1860                 curpg = TAILQ_FIRST(&vp->v_uobj.memq);
1861         } else {
1862                 soff = startoffset;
1863         }
1864         while (by_list || soff < MIN(blkeof, endoffset)) {
1865                 if (by_list) {
1866                         /*
1867                          * Find the first page in a block.  Skip
1868                          * blocks outside our area of interest or beyond
1869                          * the end of file.
                              *
                              * With at most one page per block no filtering
                              * is done: each page on the list is taken in turn.
1870                          */
1871                         KASSERT(curpg == NULL
1872                             || (curpg->flags & PG_MARKER) == 0);
1873                         if (pages_per_block > 1) {
1874                                 while (curpg &&
1875                                     ((curpg->offset & fs->lfs_bmask) ||
1876                                     curpg->offset >= vp->v_size ||
1877                                     curpg->offset >= endoffset)) {
1878                                         curpg = TAILQ_NEXT(curpg, listq.queue);
1879                                         KASSERT(curpg == NULL ||
1880                                             (curpg->flags & PG_MARKER) == 0);
1881                                 }
1882                         }
1883                         if (curpg == NULL)
1884                                 break;
1885                         soff = curpg->offset;
1886                 }
1887
1888                 /*
1889                  * Mark all pages in extended range busy; find out if any
1890                  * of them are dirty.
                      *
                      * The "i == 0 ||" term makes the loop body run once even
                      * when pages_per_block is 0.
1891                  */
1892                 nonexistent = dirty = 0;
1893                 for (i = 0; i == 0 || i < pages_per_block; i++) {
1894                         if (by_list && pages_per_block <= 1) {
1895                                 pgs[i] = pg = curpg;
1896                         } else {
1897                                 off = soff + (i << PAGE_SHIFT);
1898                                 pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
1899                                 if (pg == NULL) {
1900                                         ++nonexistent;
1901                                         continue;
1902                                 }
1903                         }
1904                         KASSERT(pg != NULL);
1905
1906                         /*
1907                          * If we're holding the segment lock, we can deadlock
1908                          * against a process that has our page and is waiting
1909                          * for the cleaner, while the cleaner waits for the
1910                          * segment lock.  Just bail in that case.
                              *
                              * pgs[0..i-1] were marked busy by this loop, so
                              * they are released before returning.
1911                          */
1912                         if ((pg->flags & PG_BUSY) &&
1913                             (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
1914                                 if (i > 0)
1915                                         uvm_page_unbusy(pgs, i);
1916                                 DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
1917                                 if (pgp)
1918                                         *pgp = pg;
1919                                 return -1;
1920                         }
1921
                             /*
                              * Somebody else has the page busy and it is safe
                              * to sleep: wait once, give back what we busied,
                              * and rescan from the start.  The body always ends
                              * in "goto top", so this "while" runs at most once.
                              */
1922                         while (pg->flags & PG_BUSY) {
1923                                 wait_for_page(vp, pg, NULL);
1924                                 if (i > 0)
1925                                         uvm_page_unbusy(pgs, i);
1926                                 goto top;
1927                         }
1928                         pg->flags |= PG_BUSY;
1929                         UVM_PAGE_OWN(pg, "lfs_putpages");
1930
                             /*
                              * Revoke every mapping of the page, then count it
                              * as dirty if the pmap reported it modified or if
                              * PG_CLEAN is not set.
                              */
1931                         pmap_page_protect(pg, VM_PROT_NONE);
1932                         tdirty = (pmap_clear_modify(pg) ||
1933                                   (pg->flags & PG_CLEAN) == 0);
1934                         dirty += tdirty;
1935                 }
                     /*
                      * No page of this block is resident: skip the block.
                      *
                      * NOTE(review): when pages_per_block is 0 this test can
                      * never be true, so a failed uvm_pagelookup() in offset
                      * mode would reach the KASSERT below with
                      * nonexistent == 1 -- confirm that cannot happen.
                      */
1936                 if (pages_per_block > 0 && nonexistent >= pages_per_block) {
1937                         if (by_list) {
1938                                 curpg = TAILQ_NEXT(curpg, listq.queue);
1939                         } else {
1940                                 soff += fs->lfs_bsize;
1941                         }
1942                         continue;
1943                 }
1944
1945                 any_dirty += dirty;
                     /* A block is expected to be wholly resident or wholly absent. */
1946                 KASSERT(nonexistent == 0);
1947
1948                 /*
1949                  * If any are dirty make all dirty; unbusy them,
1950                  * but if we were asked to clean, wire them so that
1951                  * the pagedaemon doesn't bother us about them while
1952                  * they're on their way to disk.
1953                  */
1954                 for (i = 0; i == 0 || i < pages_per_block; i++) {
1955                         pg = pgs[i];
1956                         KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
1957                         if (dirty) {
1958                                 pg->flags &= ~PG_CLEAN;
1959                                 if (flags & PGO_FREE) {
1960                                         /*
1961                                          * Wire the page so that
1962                                          * pdaemon doesn't see it again.
1963                                          */
1964                                         mutex_enter(&uvm_pageqlock);
1965                                         uvm_pagewire(pg);
1966                                         mutex_exit(&uvm_pageqlock);
1967
1968                                         /* Suspended write flag */
1969                                         pg->flags |= PG_DELWRI;
1970                                 }
1971                         }
                             /* Release the page, waking anyone who asked for it. */
1972                         if (pg->flags & PG_WANTED)
1973                                 wakeup(pg);
1974                         pg->flags &= ~(PG_WANTED|PG_BUSY);
1975                         UVM_PAGE_OWN(pg, NULL);
1976                 }
1977
                     /* The caller only wants to know whether anything is dirty. */
1978                 if (checkfirst && any_dirty)
1979                         break;
1980
                     /* Advance to the next list entry or the next block. */
1981                 if (by_list) {
1982                         curpg = TAILQ_NEXT(curpg, listq.queue);
1983                 } else {
1984                         soff += MAX(PAGE_SIZE, fs->lfs_bsize);
1985                 }
1986         }
1987
1988         return any_dirty;
1989 }
1990
1991 /*
1992  * lfs_putpages functions like genfs_putpages except that
1993  *
1994  * (1) It needs to bounds-check the incoming requests to ensure that
1995  *     they are block-aligned; if they are not, expand the range and
1996  *     do the right thing in case, e.g., the requested range is clean
1997  *     but the expanded range is dirty.
1998  *
1999  * (2) It needs to explicitly send blocks to be written when it is done.
2000  *     If VOP_PUTPAGES is called without the seglock held, we simply take
2001  *     the seglock and let lfs_segunlock wait for us.
2002  *     XXX There might be a bad situation if we have to flush a vnode while
2003  *     XXX lfs_markv is in operation.  As of this writing we panic in this
2004  *     XXX case.
2005  *
2006  * Assumptions:
2007  *
2008  * (1) The caller does not hold any pages in this vnode busy.  If it does,
2009  *     there is a danger that when we expand the page range and busy the
2010  *     pages we will deadlock.
2011  *
2012  * (2) We are called with vp->v_interlock held; we must return with it
2013  *     released.
2014  *
2015  * (3) We don't absolutely have to free pages right away, provided that
2016  *     the request does not have PGO_SYNCIO.  When the pagedaemon gives
2017  *     us a request with PGO_FREE, we take the pages out of the paging
2018  *     queue and wake up the writer, which will handle freeing them for us.
2019  *
2020  *     We ensure that for any filesystem block, all pages for that
2021  *     block are either resident or not, even if those pages are higher
2022  *     than EOF; that means that we will be getting requests to free
2023  *     "unused" pages above EOF all the time, and should ignore them.
2024  *
2025  * (4) If we are called with PGO_LOCKED, the finfo array we are to write
2026  *     into has been set up for us by lfs_writefile.  If not, we will
2027  *     have to handle allocating and/or freeing an finfo entry.
2028  *
2029  * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
2030  */
2031
2032 /*
      * How many times to loop before we should start to worry.  Under DEBUG,
      * lfs_putpages() prints a "looping" message once its dirtyclean retry
      * count exceeds this.
      */
2033 #define TOOMANY 4
2034
2035 int
2036 lfs_putpages(void *v)
2037 {
2038         int error;
2039         struct vop_putpages_args /* {
2040                 struct vnode *a_vp;
2041                 voff_t a_offlo;
2042                 voff_t a_offhi;
2043                 int a_flags;
2044         } */ *ap = v;
2045         struct vnode *vp;
2046         struct inode *ip;
2047         struct lfs *fs;
2048         struct segment *sp;
2049         off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
2050         off_t off, max_endoffset;
2051         bool seglocked, sync, pagedaemon;
2052         struct vm_page *pg, *busypg;
2053         UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
2054 #ifdef DEBUG
2055         int debug_n_again, debug_n_dirtyclean;
2056 #endif
2057
2058         vp = ap->a_vp;
2059         ip = VTOI(vp);
2060         fs = ip->i_lfs;
2061         sync = (ap->a_flags & PGO_SYNCIO) != 0;
2062         pagedaemon = (curlwp == uvm.pagedaemon_lwp);
2063
2064         /* Putpages does nothing for metadata. */
2065         if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
2066                 mutex_exit(vp->v_interlock);
2067                 return 0;
2068         }
2069
2070         /*
2071          * If there are no pages, don't do anything.
2072          */
2073         if (vp->v_uobj.uo_npages == 0) {
2074                 if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
2075                     (vp->v_iflag & VI_ONWORKLST) &&
2076                     LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
2077                         vp->v_iflag &= ~VI_WRMAPDIRTY;
2078                         vn_syncer_remove_from_worklist(vp);
2079                 }
2080                 mutex_exit(vp->v_interlock);
2081
2082                 /* Remove us from paging queue, if we were on it */
2083                 mutex_enter(&lfs_lock);
2084                 if (ip->i_flags & IN_PAGING) {
2085                         ip->i_flags &= ~IN_PAGING;
2086                         TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2087                 }
2088                 mutex_exit(&lfs_lock);
2089                 return 0;
2090         }
2091
2092         blkeof = blkroundup(fs, ip->i_size);
2093
2094         /*
2095          * Ignore requests to free pages past EOF but in the same block
2096          * as EOF, unless the request is synchronous.  (If the request is
2097          * sync, it comes from lfs_truncate.)
2098          * XXXUBC Make these pages look "active" so the pagedaemon won't
2099          * XXXUBC bother us with them again.
2100          */
2101         if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
2102                 origoffset = ap->a_offlo;
2103                 for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
2104                         pg = uvm_pagelookup(&vp->v_uobj, off);
2105                         KASSERT(pg != NULL);
2106                         while (pg->flags & PG_BUSY) {
2107                                 pg->flags |= PG_WANTED;
2108                                 UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0,
2109                                                     "lfsput2", 0);
2110                                 mutex_enter(vp->v_interlock);
2111                         }
2112                         mutex_enter(&uvm_pageqlock);
2113                         uvm_pageactivate(pg);
2114                         mutex_exit(&uvm_pageqlock);
2115                 }
2116                 ap->a_offlo = blkeof;
2117                 if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
2118                         mutex_exit(vp->v_interlock);
2119                         return 0;
2120                 }
2121         }
2122
2123         /*
2124          * Extend page range to start and end at block boundaries.
2125          * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
2126          */
2127         origoffset = ap->a_offlo;
2128         origendoffset = ap->a_offhi;
2129         startoffset = origoffset & ~(fs->lfs_bmask);
2130         max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)
2131                                                << fs->lfs_bshift;
2132
2133         if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
2134                 endoffset = max_endoffset;
2135                 origendoffset = endoffset;
2136         } else {
2137                 origendoffset = round_page(ap->a_offhi);
2138                 endoffset = round_page(blkroundup(fs, origendoffset));
2139         }
2140
2141         KASSERT(startoffset > 0 || endoffset >= startoffset);
2142         if (startoffset == endoffset) {
2143                 /* Nothing to do, why were we called? */
2144                 mutex_exit(vp->v_interlock);
2145                 DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
2146                       PRId64 "\n", startoffset));
2147                 return 0;
2148         }
2149
2150         ap->a_offlo = startoffset;
2151         ap->a_offhi = endoffset;
2152
2153         /*
2154          * If not cleaning, just send the pages through genfs_putpages
2155          * to be returned to the pool.
2156          */
2157         if (!(ap->a_flags & PGO_CLEANIT))
2158                 return genfs_putpages(v);
2159
2160         /* Set PGO_BUSYFAIL to avoid deadlocks */
2161         ap->a_flags |= PGO_BUSYFAIL;
2162
2163         /*
2164          * Likewise, if we are asked to clean but the pages are not
2165          * dirty, we can just free them using genfs_putpages.
2166          */
2167 #ifdef DEBUG
2168         debug_n_dirtyclean = 0;
2169 #endif
2170         do {
2171                 int r;
2172
2173                 /* Count the number of dirty pages */
2174                 r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
2175                                 ap->a_flags, 1, NULL);
2176                 if (r < 0) {
2177                         /* Pages are busy with another process */
2178                         mutex_exit(vp->v_interlock);
2179                         return EDEADLK;
2180                 }
2181                 if (r > 0) /* Some pages are dirty */
2182                         break;
2183
2184                 /*
2185                  * Sometimes pages are dirtied between the time that
2186                  * we check and the time we try to clean them.
2187                  * Instruct lfs_gop_write to return EDEADLK in this case
2188                  * so we can write them properly.
2189                  */
2190                 ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
2191                 r = genfs_do_putpages(vp, startoffset, endoffset,
2192                                        ap->a_flags & ~PGO_SYNCIO, &busypg);
2193                 ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
2194                 if (r != EDEADLK)
2195                         return r;
2196
2197                 /* One of the pages was busy.  Start over. */
2198                 mutex_enter(vp->v_interlock);
2199                 wait_for_page(vp, busypg, "dirtyclean");
2200 #ifdef DEBUG
2201                 ++debug_n_dirtyclean;
2202 #endif
2203         } while(1);
2204
2205 #ifdef DEBUG
2206         if (debug_n_dirtyclean > TOOMANY)
2207                 printf("lfs_putpages: dirtyclean: looping, n = %d\n",
2208                        debug_n_dirtyclean);
2209 #endif
2210
2211         /*
2212          * Dirty and asked to clean.
2213          *
2214          * Pagedaemon can't actually write LFS pages; wake up
2215          * the writer to take care of that.  The writer will
2216          * notice the pager inode queue and act on that.
2217          *
2218          * XXX We must drop the vp->interlock before taking the lfs_lock or we
2219          * get a nasty deadlock with lfs_flush_pchain().
2220          */
2221         if (pagedaemon) {
2222                 mutex_exit(vp->v_interlock);
2223                 mutex_enter(&lfs_lock);
2224                 if (!(ip->i_flags & IN_PAGING)) {
2225                         ip->i_flags |= IN_PAGING;
2226                         TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2227                 }
2228                 wakeup(&lfs_writer_daemon);
2229                 mutex_exit(&lfs_lock);
2230                 preempt();
2231                 return EWOULDBLOCK;
2232         }
2233
2234         /*
2235          * If this is a file created in a recent dirop, we can't flush its
2236          * inode until the dirop is complete.  Drain dirops, then flush the
2237          * filesystem (taking care of any other pending dirops while we're
2238          * at it).
2239          */
2240         if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
2241             (vp->v_uflag & VU_DIROP)) {
2242                 int locked;
2243
2244                 DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
2245                 /* XXX VOP_ISLOCKED() may not be used for lock decisions. */
2246                 locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2247                 mutex_exit(vp->v_interlock);
2248                 lfs_writer_enter(fs, "ppdirop");
2249                 if (locked)
2250                         VOP_UNLOCK(vp); /* XXX why? */
2251
2252                 mutex_enter(&lfs_lock);
2253                 lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
2254                 mutex_exit(&lfs_lock);
2255
2256                 if (locked)
2257                         VOP_LOCK(vp, LK_EXCLUSIVE);
2258                 mutex_enter(vp->v_interlock);
2259                 lfs_writer_leave(fs);
2260
2261                 /* XXX the flush should have taken care of this one too! */
2262         }
2263
2264         /*
2265          * This is it.  We are going to write some pages.  From here on
2266          * down it's all just mechanics.
2267          *
2268          * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
2269          */
2270         ap->a_flags &= ~PGO_SYNCIO;
2271
2272         /*
2273          * If we've already got the seglock, flush the node and return.
2274          * The FIP has already been set up for us by lfs_writefile,
2275          * and FIP cleanup and lfs_updatemeta will also be done there,
2276          * unless genfs_putpages returns EDEADLK; then we must flush
2277          * what we have, and correct FIP and segment header accounting.
2278          */
2279   get_seglock:
2280         /*
2281          * If we are not called with the segment locked, lock it.
2282          * Account for a new FIP in the segment header, and set sp->vp.
2283          * (This should duplicate the setup at the top of lfs_writefile().)
2284          */
2285         seglocked = (ap->a_flags & PGO_LOCKED) != 0;
2286         if (!seglocked) {
2287                 mutex_exit(vp->v_interlock);
2288                 error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
2289                 if (error != 0)
2290                         return error;
2291                 mutex_enter(vp->v_interlock);
2292                 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
2293         }
2294         sp = fs->lfs_sp;
2295         KASSERT(sp->vp == NULL);
2296         sp->vp = vp;
2297
2298         /*
2299          * Ensure that the partial segment is marked SS_DIROP if this
2300          * vnode is a DIROP.
2301          */
2302         if (!seglocked && vp->v_uflag & VU_DIROP)
2303                 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
2304
2305         /*
2306          * Loop over genfs_putpages until all pages are gathered.
2307          * genfs_putpages() drops the interlock, so reacquire it if necessary.
2308          * Whenever we lose the interlock we have to rerun check_dirty, as
2309          * well, since more pages might have been dirtied in our absence.
2310          */
2311 #ifdef DEBUG
2312         debug_n_again = 0;
2313 #endif
2314         do {
2315                 busypg = NULL;
2316                 if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
2317                                 ap->a_flags, 0, &busypg) < 0) {
2318                         mutex_exit(vp->v_interlock);
2319
2320                         mutex_enter(vp->v_interlock);
2321                         write_and_wait(fs, vp, busypg, seglocked, NULL);
2322                         if (!seglocked) {
2323                                 mutex_exit(vp->v_interlock);
2324                                 lfs_release_finfo(fs);
2325                                 lfs_segunlock(fs);
2326                                 mutex_enter(vp->v_interlock);
2327                         }
2328                         sp->vp = NULL;
2329                         goto get_seglock;
2330                 }
2331
2332                 busypg = NULL;
2333                 error = genfs_do_putpages(vp, startoffset, endoffset,
2334                                            ap->a_flags, &busypg);
2335
2336                 if (error == EDEADLK || error == EAGAIN) {
2337                         DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
2338                               " %d ino %d off %x (seg %d)\n", error,
2339                               ip->i_number, fs->lfs_offset,
2340                               dtosn(fs, fs->lfs_offset)));
2341
2342                         mutex_enter(vp->v_interlock);
2343                         write_and_wait(fs, vp, busypg, seglocked, "again");
2344                 }
2345 #ifdef DEBUG
2346                 ++debug_n_again;
2347 #endif
2348         } while (error == EDEADLK);
2349 #ifdef DEBUG
2350         if (debug_n_again > TOOMANY)
2351                 printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
2352 #endif
2353
2354         KASSERT(sp != NULL && sp->vp == vp);
2355         if (!seglocked) {
2356                 sp->vp = NULL;
2357
2358                 /* Write indirect blocks as well */
2359                 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
2360                 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
2361                 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);
2362
2363                 KASSERT(sp->vp == NULL);
2364                 sp->vp = vp;
2365         }
2366
2367         /*
2368          * Blocks are now gathered into a segment waiting to be written.
2369          * All that's left to do is update metadata, and write them.
2370          */
2371         lfs_updatemeta(sp);
2372         KASSERT(sp->vp == vp);
2373         sp->vp = NULL;
2374
2375         /*
2376          * If we were called from lfs_writefile, we don't need to clean up
2377          * the FIP or unlock the segment lock.  We're done.
2378          */
2379         if (seglocked)
2380                 return error;
2381
2382         /* Clean up FIP and send it to disk. */
2383         lfs_release_finfo(fs);
2384         lfs_writeseg(fs, fs->lfs_sp);
2385
2386         /*
2387          * Remove us from paging queue if we wrote all our pages.
2388          */
2389         if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
2390                 mutex_enter(&lfs_lock);
2391                 if (ip->i_flags & IN_PAGING) {
2392                         ip->i_flags &= ~IN_PAGING;
2393                         TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2394                 }
2395                 mutex_exit(&lfs_lock);
2396         }
2397
2398         /*
2399          * XXX - with the malloc/copy writeseg, the pages are freed by now
2400          * even if we don't wait (e.g. if we hold a nested lock).  This
2401          * will not be true if we stop using malloc/copy.
2402          */
2403         KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
2404         lfs_segunlock(fs);
2405
2406         /*
2407          * Wait for v_numoutput to drop to zero.  The seglock should
2408          * take care of this, but there is a slight possibility that
2409          * aiodoned might not have got around to our buffers yet.
2410          */
2411         if (sync) {
2412                 mutex_enter(vp->v_interlock);
2413                 while (vp->v_numoutput > 0) {
2414                         DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
2415                               " num %d\n", ip->i_number, vp->v_numoutput));
2416                         cv_wait(&vp->v_cv, vp->v_interlock);
2417                 }
2418                 mutex_exit(vp->v_interlock);
2419         }
2420         return error;
2421 }
2422
2423 /*
2424  * Return the last logical file offset that should be written for this file
2425  * if we're doing a write that ends at "size".  If writing, we need to know
2426  * about sizes on disk, i.e. fragments if there are any; if reading, we need
2427  * to know about entire blocks.
2428  */
2429 void
2430 lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
2431 {
2432         struct inode *ip = VTOI(vp);
2433         struct lfs *fs = ip->i_lfs;
2434         daddr_t olbn, nlbn;
2435
2436         olbn = lblkno(fs, ip->i_size);
2437         nlbn = lblkno(fs, size);
2438         if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) {
2439                 *eobp = fragroundup(fs, size);
2440         } else {
2441                 *eobp = blkroundup(fs, size);
2442         }
2443 }
2444
#ifdef DEBUG
void lfs_dump_vop(void *);

/*
 * Debugging aid.  Interprets v as a struct vop_putpages_args, prints the
 * vnode it carries with vfs_vnode_print() when DDB is configured, and then
 * hands that vnode's i_din.ffs1_din to lfs_dump_dinode().
 */
void
lfs_dump_vop(void *v)
{
        struct vop_putpages_args /* {
                struct vnode *a_vp;
                voff_t a_offlo;
                voff_t a_offhi;
                int a_flags;
        } */ *args = v;
        struct vnode *vp = args->a_vp;

#ifdef DDB
        vfs_vnode_print(vp, 0, printf);
#endif
        lfs_dump_dinode(VTOI(vp)->i_din.ffs1_din);
}
#endif /* DEBUG */
2464
2465 int
2466 lfs_mmap(void *v)
2467 {
2468         struct vop_mmap_args /* {
2469                 const struct vnodeop_desc *a_desc;
2470                 struct vnode *a_vp;
2471                 vm_prot_t a_prot;
2472                 kauth_cred_t a_cred;
2473         } */ *ap = v;
2474
2475         if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM)
2476                 return EOPNOTSUPP;
2477         return ufs_mmap(v);
2478 }