sys/kern/vfs_subr.c

   1 /*      $NetBSD: vfs_subr.c,v 1.394 2010/01/08 11:35:10 pooka Exp $     */
   2
   3 /*-
   4  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
   9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30  * POSSIBILITY OF SUCH DAMAGE.
  31  */
  32
  33 /*
  34  * Copyright (c) 1989, 1993
  35  *      The Regents of the University of California.  All rights reserved.
  36  * (c) UNIX System Laboratories, Inc.
  37  * All or some portions of this file are derived from material licensed
  38  * to the University of California by American Telephone and Telegraph
  39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  40  * the permission of UNIX System Laboratories, Inc.
  41  *
  42  * Redistribution and use in source and binary forms, with or without
  43  * modification, are permitted provided that the following conditions
  44  * are met:
  45  * 1. Redistributions of source code must retain the above copyright
  46  *    notice, this list of conditions and the following disclaimer.
  47  * 2. Redistributions in binary form must reproduce the above copyright
  48  *    notice, this list of conditions and the following disclaimer in the
  49  *    documentation and/or other materials provided with the distribution.
  50  * 3. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)vfs_subr.c  8.13 (Berkeley) 4/18/94
  67  */
  68
  69 /*
  70  * Note on v_usecount and locking:
  71  *
   72  * At nearly all points where it is known that v_usecount could be zero,
   73  * the vnode interlock will be held.
  74  *
  75  * To change v_usecount away from zero, the interlock must be held.  To
  76  * change from a non-zero value to zero, again the interlock must be
  77  * held.
  78  *
  79  * There's a flag bit, VC_XLOCK, embedded in v_usecount.
  80  * To raise v_usecount, if the VC_XLOCK bit is set in it, the interlock
  81  * must be held.
  82  * To modify the VC_XLOCK bit, the interlock must be held.
  83  * We always keep the usecount (v_usecount & VC_MASK) non-zero while the
  84  * VC_XLOCK bit is set.
  85  *
  86  * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
  87  * value to a non-zero value can safely be done using atomic operations,
  88  * without the interlock held.
  89  * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
  90  * value can be done using atomic operations, without the interlock held.
  91  */
  92
  93 #include <sys/cdefs.h>
  94 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.394 2010/01/08 11:35:10 pooka Exp $");
  95
  96 #include "opt_ddb.h"
  97 #include "opt_compat_netbsd.h"
  98 #include "opt_compat_43.h"
  99
 100 #include <sys/param.h>
 101 #include <sys/systm.h>
 102 #include <sys/conf.h>
 103 #include <sys/proc.h>
 104 #include <sys/kernel.h>
 105 #include <sys/mount.h>
 106 #include <sys/fcntl.h>
 107 #include <sys/vnode.h>
 108 #include <sys/stat.h>
 109 #include <sys/namei.h>
 110 #include <sys/ucred.h>
 111 #include <sys/buf.h>
 112 #include <sys/errno.h>
 113 #include <sys/kmem.h>
 114 #include <sys/syscallargs.h>
 115 #include <sys/device.h>
 116 #include <sys/filedesc.h>
 117 #include <sys/kauth.h>
 118 #include <sys/atomic.h>
 119 #include <sys/kthread.h>
 120 #include <sys/wapbl.h>
 121
 122 #include <miscfs/genfs/genfs.h>
 123 #include <miscfs/specfs/specdev.h>
 124 #include <miscfs/syncfs/syncfs.h>
 125
 126 #include <uvm/uvm.h>
 127 #include <uvm/uvm_readahead.h>
 128 #include <uvm/uvm_ddb.h>
 129
 130 #include <sys/sysctl.h>
 131
 132 const enum vtype iftovt_tab[16] = {
 133         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 134         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 135 };
 136 const int       vttoif_tab[9] = {
 137         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 138         S_IFSOCK, S_IFIFO, S_IFMT,
 139 };
 140
 141 /*
 142  * Insq/Remq for the vnode usage lists.
 143  */
 144 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
 145 #define bufremvn(bp) {                                                  \
 146         LIST_REMOVE(bp, b_vnbufs);                                      \
 147         (bp)->b_vnbufs.le_next = NOLIST;                                \
 148 }
 149
 150 int doforce = 1;                /* 1 => permit forcible unmounting */
 151 int prtactive = 0;              /* 1 => print out reclaim of active vnodes */
 152
 153 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
 154 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
 155 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
 156
 157 struct mntlist mountlist =                      /* mounted filesystem list */
 158     CIRCLEQ_HEAD_INITIALIZER(mountlist);
 159
 160 u_int numvnodes;
 161 static specificdata_domain_t mount_specificdata_domain;
 162
 163 static int vrele_pending;
 164 static int vrele_gen;
 165 static kmutex_t vrele_lock;
 166 static kcondvar_t vrele_cv;
 167 static lwp_t *vrele_lwp;
 168
 169 static uint64_t mountgen = 0;
 170 static kmutex_t mountgen_lock;
 171
 172 kmutex_t mountlist_lock;
 173 kmutex_t mntid_lock;
 174 kmutex_t mntvnode_lock;
 175 kmutex_t vnode_free_list_lock;
 176 kmutex_t vfs_list_lock;
 177
 178 static pool_cache_t vnode_cache;
 179
 180 /*
 181  * These define the root filesystem and device.
 182  */
 183 struct vnode *rootvnode;
 184 struct device *root_device;                     /* root device */
 185
 186 /*
 187  * Local declarations.
 188  */
 189
 190 static void vrele_thread(void *);
 191 static void insmntque(vnode_t *, struct mount *);
 192 static int getdevvp(dev_t, vnode_t **, enum vtype);
 193 static vnode_t *getcleanvnode(void);
 194 void vpanic(vnode_t *, const char *);
 195 static void vfs_shutdown1(struct lwp *);
 196
 197 #ifdef DEBUG
 198 void printlockedvnodes(void);
 199 #endif
 200
 201 #ifdef DIAGNOSTIC
 202 void
 203 vpanic(vnode_t *vp, const char *msg)
 204 {
 205
 206         vprint(NULL, vp);
 207         panic("%s\n", msg);
 208 }
 209 #else
 210 #define vpanic(vp, msg) /* nothing */
 211 #endif
 212
 213 void
 214 vn_init1(void)
 215 {
 216
 217         vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
 218             NULL, IPL_NONE, NULL, NULL, NULL);
 219         KASSERT(vnode_cache != NULL);
 220
 221         /* Create deferred release thread. */
 222         mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
 223         cv_init(&vrele_cv, "vrele");
 224         if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
 225             NULL, &vrele_lwp, "vrele"))
 226                 panic("fork vrele");
 227 }
 228
 229 /*
 230  * Initialize the vnode management data structures.
 231  */
 232 void
 233 vntblinit(void)
 234 {
 235
 236         mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
 237         mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
 238         mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
 239         mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
 240         mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
 241         mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
 242
 243         mount_specificdata_domain = specificdata_domain_create();
 244
 245         /* Initialize the filesystem syncer. */
 246         vn_initialize_syncerd();
 247         vn_init1();
 248 }
 249
 250 int
 251 vfs_drainvnodes(long target, struct lwp *l)
 252 {
 253
 254         while (numvnodes > target) {
 255                 vnode_t *vp;
 256
 257                 mutex_enter(&vnode_free_list_lock);
 258                 vp = getcleanvnode();
 259                 if (vp == NULL)
 260                         return EBUSY; /* give up */
 261                 ungetnewvnode(vp);
 262         }
 263
 264         return 0;
 265 }
 266
 267 /*
 268  * Lookup a mount point by filesystem identifier.
 269  *
 270  * XXX Needs to add a reference to the mount point.
 271  */
 272 struct mount *
 273 vfs_getvfs(fsid_t *fsid)
 274 {
 275         struct mount *mp;
 276
 277         mutex_enter(&mountlist_lock);
 278         CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
 279                 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
 280                     mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
 281                         mutex_exit(&mountlist_lock);
 282                         return (mp);
 283                 }
 284         }
 285         mutex_exit(&mountlist_lock);
 286         return ((struct mount *)0);
 287 }
 288
 289 /*
 290  * Drop a reference to a mount structure, freeing if the last reference.
 291  */
 292 void
 293 vfs_destroy(struct mount *mp)
 294 {
 295
 296         if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
 297                 return;
 298         }
 299
 300         /*
 301          * Nothing else has visibility of the mount: we can now
 302          * free the data structures.
 303          */
 304         KASSERT(mp->mnt_refcnt == 0);
 305         specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
 306         rw_destroy(&mp->mnt_unmounting);
 307         mutex_destroy(&mp->mnt_updating);
 308         mutex_destroy(&mp->mnt_renamelock);
 309         if (mp->mnt_op != NULL) {
 310                 vfs_delref(mp->mnt_op);
 311         }
 312         kmem_free(mp, sizeof(*mp));
 313 }
 314
 315 /*
 316  * grab a vnode from freelist and clean it.
 317  */
 318 vnode_t *
 319 getcleanvnode(void)
 320 {
 321         vnode_t *vp;
 322         vnodelst_t *listhd;
 323
 324         KASSERT(mutex_owned(&vnode_free_list_lock));
 325
 326 retry:
 327         listhd = &vnode_free_list;
 328 try_nextlist:
 329         TAILQ_FOREACH(vp, listhd, v_freelist) {
 330                 /*
 331                  * It's safe to test v_usecount and v_iflag
 332                  * without holding the interlock here, since
 333                  * these vnodes should never appear on the
 334                  * lists.
 335                  */
 336                 if (vp->v_usecount != 0) {
 337                         vpanic(vp, "free vnode isn't");
 338                 }
 339                 if ((vp->v_iflag & VI_CLEAN) != 0) {
 340                         vpanic(vp, "clean vnode on freelist");
 341                 }
 342                 if (vp->v_freelisthd != listhd) {
 343                         printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
 344                         vpanic(vp, "list head mismatch");
 345                 }
 346                 if (!mutex_tryenter(&vp->v_interlock))
 347                         continue;
 348                 /*
 349                  * Our lwp might hold the underlying vnode
 350                  * locked, so don't try to reclaim a VI_LAYER
 351                  * node if it's locked.
 352                  */
 353                 if ((vp->v_iflag & VI_XLOCK) == 0 &&
 354                     ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
 355                         break;
 356                 }
 357                 mutex_exit(&vp->v_interlock);
 358         }
 359
 360         if (vp == NULL) {
 361                 if (listhd == &vnode_free_list) {
 362                         listhd = &vnode_hold_list;
 363                         goto try_nextlist;
 364                 }
 365                 mutex_exit(&vnode_free_list_lock);
 366                 return NULL;
 367         }
 368
 369         /* Remove it from the freelist. */
 370         TAILQ_REMOVE(listhd, vp, v_freelist);
 371         vp->v_freelisthd = NULL;
 372         mutex_exit(&vnode_free_list_lock);
 373
 374         if (vp->v_usecount != 0) {
 375                 /*
 376                  * was referenced again before we got the interlock
 377                  * Don't return to freelist - the holder of the last
 378                  * reference will destroy it.
 379                  */
 380                 mutex_exit(&vp->v_interlock);
 381                 mutex_enter(&vnode_free_list_lock);
 382                 goto retry;
 383         }
 384
 385         /*
 386          * The vnode is still associated with a file system, so we must
 387          * clean it out before reusing it.  We need to add a reference
 388          * before doing this.  If the vnode gains another reference while
 389          * being cleaned out then we lose - retry.
 390          */
 391         atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
 392         vclean(vp, DOCLOSE);
 393         KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
 394         atomic_add_int(&vp->v_usecount, -VC_XLOCK);
 395         if (vp->v_usecount == 1) {
 396                 /* We're about to dirty it. */
 397                 vp->v_iflag &= ~VI_CLEAN;
 398                 mutex_exit(&vp->v_interlock);
 399                 if (vp->v_type == VBLK || vp->v_type == VCHR) {
 400                         spec_node_destroy(vp);
 401                 }
 402                 vp->v_type = VNON;
 403         } else {
 404                 /*
 405                  * Don't return to freelist - the holder of the last
 406                  * reference will destroy it.
 407                  */
 408                 vrelel(vp, 0); /* releases vp->v_interlock */
 409                 mutex_enter(&vnode_free_list_lock);
 410                 goto retry;
 411         }
 412
 413         if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
 414             !TAILQ_EMPTY(&vp->v_uobj.memq)) {
 415                 vpanic(vp, "cleaned vnode isn't");
 416         }
 417         if (vp->v_numoutput != 0) {
 418                 vpanic(vp, "clean vnode has pending I/O's");
 419         }
 420         if ((vp->v_iflag & VI_ONWORKLST) != 0) {
 421                 vpanic(vp, "clean vnode on syncer list");
 422         }
 423
 424         return vp;
 425 }
 426
 427 /*
 428  * Mark a mount point as busy, and gain a new reference to it.  Used to
 429  * prevent the file system from being unmounted during critical sections.
 430  *
 431  * => The caller must hold a pre-existing reference to the mount.
 432  * => Will fail if the file system is being unmounted, or is unmounted.
 433  */
 434 int
 435 vfs_busy(struct mount *mp, struct mount **nextp)
 436 {
 437
 438         KASSERT(mp->mnt_refcnt > 0);
 439
 440         if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
 441                 if (nextp != NULL) {
 442                         KASSERT(mutex_owned(&mountlist_lock));
 443                         *nextp = CIRCLEQ_NEXT(mp, mnt_list);
 444                 }
 445                 return EBUSY;
 446         }
 447         if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
 448                 rw_exit(&mp->mnt_unmounting);
 449                 if (nextp != NULL) {
 450                         KASSERT(mutex_owned(&mountlist_lock));
 451                         *nextp = CIRCLEQ_NEXT(mp, mnt_list);
 452                 }
 453                 return ENOENT;
 454         }
 455         if (nextp != NULL) {
 456                 mutex_exit(&mountlist_lock);
 457         }
 458         atomic_inc_uint(&mp->mnt_refcnt);
 459         return 0;
 460 }
 461
 462 /*
 463  * Unbusy a busy filesystem.
 464  *
 465  * => If keepref is true, preserve reference added by vfs_busy().
 466  * => If nextp != NULL, acquire mountlist_lock.
 467  */
 468 void
 469 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
 470 {
 471
 472         KASSERT(mp->mnt_refcnt > 0);
 473
 474         if (nextp != NULL) {
 475                 mutex_enter(&mountlist_lock);
 476         }
 477         rw_exit(&mp->mnt_unmounting);
 478         if (!keepref) {
 479                 vfs_destroy(mp);
 480         }
 481         if (nextp != NULL) {
 482                 KASSERT(mutex_owned(&mountlist_lock));
 483                 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
 484         }
 485 }
 486
 487 struct mount *
 488 vfs_mountalloc(struct vfsops *vfsops, struct vnode *vp)
 489 {
 490         int error;
 491         struct mount *mp;
 492
 493         mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
 494         if (mp == NULL)
 495                 return NULL;
 496
 497         mp->mnt_op = vfsops;
 498         mp->mnt_refcnt = 1;
 499         TAILQ_INIT(&mp->mnt_vnodelist);
 500         rw_init(&mp->mnt_unmounting);
 501         mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
 502         mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
 503         error = vfs_busy(mp, NULL);
 504         KASSERT(error == 0);
 505         mp->mnt_vnodecovered = vp;
 506         mount_initspecific(mp);
 507
 508         mutex_enter(&mountgen_lock);
 509         mp->mnt_gen = mountgen++;
 510         mutex_exit(&mountgen_lock);
 511
 512         return mp;
 513 }
 514
 515 /*
 516  * Lookup a filesystem type, and if found allocate and initialize
 517  * a mount structure for it.
 518  *
 519  * Devname is usually updated by mount(8) after booting.
 520  */
 521 int
 522 vfs_rootmountalloc(const char *fstypename, const char *devname,
 523     struct mount **mpp)
 524 {
 525         struct vfsops *vfsp = NULL;
 526         struct mount *mp;
 527
 528         mutex_enter(&vfs_list_lock);
 529         LIST_FOREACH(vfsp, &vfs_list, vfs_list)
 530                 if (!strncmp(vfsp->vfs_name, fstypename,
 531                     sizeof(mp->mnt_stat.f_fstypename)))
 532                         break;
 533         if (vfsp == NULL) {
 534                 mutex_exit(&vfs_list_lock);
 535                 return (ENODEV);
 536         }
 537         vfsp->vfs_refcount++;
 538         mutex_exit(&vfs_list_lock);
 539
 540         if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
 541                 return ENOMEM;
 542         mp->mnt_flag = MNT_RDONLY;
 543         (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
 544             sizeof(mp->mnt_stat.f_fstypename));
 545         mp->mnt_stat.f_mntonname[0] = '/';
 546         mp->mnt_stat.f_mntonname[1] = '\0';
 547         mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
 548             '\0';
 549         (void)copystr(devname, mp->mnt_stat.f_mntfromname,
 550             sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
 551         *mpp = mp;
 552         return (0);
 553 }
 554
 555 /*
 556  * Routines having to do with the management of the vnode table.
 557  */
 558 extern int (**dead_vnodeop_p)(void *);
 559
 560 /*
 561  * Return the next vnode from the free list.
 562  */
 563 int
 564 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
 565             vnode_t **vpp)
 566 {
 567         struct uvm_object *uobj;
 568         static int toggle;
 569         vnode_t *vp;
 570         int error = 0, tryalloc;
 571
 572  try_again:
 573         if (mp != NULL) {
 574                 /*
 575                  * Mark filesystem busy while we're creating a
 576                  * vnode.  If unmount is in progress, this will
 577                  * fail.
 578                  */
 579                 error = vfs_busy(mp, NULL);
 580                 if (error)
 581                         return error;
 582         }
 583
 584         /*
 585          * We must choose whether to allocate a new vnode or recycle an
 586          * existing one. The criterion for allocating a new one is that
 587          * the total number of vnodes is less than the number desired or
 588          * there are no vnodes on either free list. Generally we only
 589          * want to recycle vnodes that have no buffers associated with
 590          * them, so we look first on the vnode_free_list. If it is empty,
 591          * we next consider vnodes with referencing buffers on the
 592          * vnode_hold_list. The toggle ensures that half the time we
 593          * will use a buffer from the vnode_hold_list, and half the time
 594          * we will allocate a new one unless the list has grown to twice
 595          * the desired size. We are reticent to recycle vnodes from the
 596          * vnode_hold_list because we will lose the identity of all its
 597          * referencing buffers.
 598          */
 599
 600         vp = NULL;
 601
 602         mutex_enter(&vnode_free_list_lock);
 603
 604         toggle ^= 1;
 605         if (numvnodes > 2 * desiredvnodes)
 606                 toggle = 0;
 607
 608         tryalloc = numvnodes < desiredvnodes ||
 609             (TAILQ_FIRST(&vnode_free_list) == NULL &&
 610              (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
 611
 612         if (tryalloc) {
 613                 numvnodes++;
 614                 mutex_exit(&vnode_free_list_lock);
 615                 if ((vp = vnalloc(NULL)) == NULL) {
 616                         mutex_enter(&vnode_free_list_lock);
 617                         numvnodes--;
 618                 } else
 619                         vp->v_usecount = 1;
 620         }
 621
 622         if (vp == NULL) {
 623                 vp = getcleanvnode();
 624                 if (vp == NULL) {
 625                         if (mp != NULL) {
 626                                 vfs_unbusy(mp, false, NULL);
 627                         }
 628                         if (tryalloc) {
 629                                 printf("WARNING: unable to allocate new "
 630                                     "vnode, retrying...\n");
 631                                 kpause("newvn", false, hz, NULL);
 632                                 goto try_again;
 633                         }
 634                         tablefull("vnode", "increase kern.maxvnodes or NVNODE");
 635                         *vpp = 0;
 636                         return (ENFILE);
 637                 }
 638                 vp->v_iflag = 0;
 639                 vp->v_vflag = 0;
 640                 vp->v_uflag = 0;
 641                 vp->v_socket = NULL;
 642         }
 643
 644         KASSERT(vp->v_usecount == 1);
 645         KASSERT(vp->v_freelisthd == NULL);
 646         KASSERT(LIST_EMPTY(&vp->v_nclist));
 647         KASSERT(LIST_EMPTY(&vp->v_dnclist));
 648
 649         vp->v_type = VNON;
 650         vp->v_vnlock = &vp->v_lock;
 651         vp->v_tag = tag;
 652         vp->v_op = vops;
 653         insmntque(vp, mp);
 654         *vpp = vp;
 655         vp->v_data = 0;
 656
 657         /*
 658          * initialize uvm_object within vnode.
 659          */
 660
 661         uobj = &vp->v_uobj;
 662         KASSERT(uobj->pgops == &uvm_vnodeops);
 663         KASSERT(uobj->uo_npages == 0);
 664         KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
 665         vp->v_size = vp->v_writesize = VSIZENOTSET;
 666
 667         if (mp != NULL) {
 668                 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
 669                         vp->v_vflag |= VV_MPSAFE;
 670                 vfs_unbusy(mp, true, NULL);
 671         }
 672
 673         return (0);
 674 }
 675
 676 /*
 677  * This is really just the reverse of getnewvnode(). Needed for
 678  * VFS_VGET functions who may need to push back a vnode in case
 679  * of a locking race.
 680  */
 681 void
 682 ungetnewvnode(vnode_t *vp)
 683 {
 684
 685         KASSERT(vp->v_usecount == 1);
 686         KASSERT(vp->v_data == NULL);
 687         KASSERT(vp->v_freelisthd == NULL);
 688
 689         mutex_enter(&vp->v_interlock);
 690         vp->v_iflag |= VI_CLEAN;
 691         vrelel(vp, 0);
 692 }
 693
 694 /*
 695  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 696  * marker vnode and we are prepared to wait for the allocation.
 697  */
 698 vnode_t *
 699 vnalloc(struct mount *mp)
 700 {
 701         vnode_t *vp;
 702
 703         vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
 704         if (vp == NULL) {
 705                 return NULL;
 706         }
 707
 708         memset(vp, 0, sizeof(*vp));
 709         UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
 710         cv_init(&vp->v_cv, "vnode");
 711         /*
 712          * done by memset() above.
 713          *      LIST_INIT(&vp->v_nclist);
 714          *      LIST_INIT(&vp->v_dnclist);
 715          */
 716
 717         if (mp != NULL) {
 718                 vp->v_mount = mp;
 719                 vp->v_type = VBAD;
 720                 vp->v_iflag = VI_MARKER;
 721         } else {
 722                 rw_init(&vp->v_lock.vl_lock);
 723         }
 724
 725         return vp;
 726 }
 727
 728 /*
 729  * Free an unused, unreferenced vnode.
 730  */
 731 void
 732 vnfree(vnode_t *vp)
 733 {
 734
 735         KASSERT(vp->v_usecount == 0);
 736
 737         if ((vp->v_iflag & VI_MARKER) == 0) {
 738                 rw_destroy(&vp->v_lock.vl_lock);
 739                 mutex_enter(&vnode_free_list_lock);
 740                 numvnodes--;
 741                 mutex_exit(&vnode_free_list_lock);
 742         }
 743
 744         UVM_OBJ_DESTROY(&vp->v_uobj);
 745         cv_destroy(&vp->v_cv);
 746         pool_cache_put(vnode_cache, vp);
 747 }
 748
 749 /*
 750  * Remove a vnode from its freelist.
 751  */
 752 static inline void
 753 vremfree(vnode_t *vp)
 754 {
 755
 756         KASSERT(mutex_owned(&vp->v_interlock));
 757         KASSERT(vp->v_usecount == 0);
 758
 759         /*
 760          * Note that the reference count must not change until
 761          * the vnode is removed.
 762          */
 763         mutex_enter(&vnode_free_list_lock);
 764         if (vp->v_holdcnt > 0) {
 765                 KASSERT(vp->v_freelisthd == &vnode_hold_list);
 766         } else {
 767                 KASSERT(vp->v_freelisthd == &vnode_free_list);
 768         }
 769         TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
 770         vp->v_freelisthd = NULL;
 771         mutex_exit(&vnode_free_list_lock);
 772 }
 773
 774 /*
 775  * Move a vnode from one mount queue to another.
 776  */
 777 static void
 778 insmntque(vnode_t *vp, struct mount *mp)
 779 {
 780         struct mount *omp;
 781
 782 #ifdef DIAGNOSTIC
 783         if ((mp != NULL) &&
 784             (mp->mnt_iflag & IMNT_UNMOUNT) &&
 785             vp->v_tag != VT_VFS) {
 786                 panic("insmntque into dying filesystem");
 787         }
 788 #endif
 789
 790         mutex_enter(&mntvnode_lock);
 791         /*
 792          * Delete from old mount point vnode list, if on one.
 793          */
 794         if ((omp = vp->v_mount) != NULL)
 795                 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
 796         /*
 797          * Insert into list of vnodes for the new mount point, if
 798          * available.  The caller must take a reference on the mount
 799          * structure and donate to the vnode.
 800          */
 801         if ((vp->v_mount = mp) != NULL)
 802                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
 803         mutex_exit(&mntvnode_lock);
 804
 805         if (omp != NULL) {
 806                 /* Release reference to old mount. */
 807                 vfs_destroy(omp);
 808         }
 809 }
 810
 811 /*
 812  * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 813  * recycled.
 814  */
 815 void
 816 vwait(vnode_t *vp, int flags)
 817 {
 818
 819         KASSERT(mutex_owned(&vp->v_interlock));
 820         KASSERT(vp->v_usecount != 0);
 821
 822         while ((vp->v_iflag & flags) != 0)
 823                 cv_wait(&vp->v_cv, &vp->v_interlock);
 824 }
 825
 826 /*
 827  * Insert a marker vnode into a mount's vnode list, after the
 828  * specified vnode.  mntvnode_lock must be held.
 829  */
 830 void
 831 vmark(vnode_t *mvp, vnode_t *vp)
 832 {
 833         struct mount *mp;
 834
 835         mp = mvp->v_mount;
 836
 837         KASSERT(mutex_owned(&mntvnode_lock));
 838         KASSERT((mvp->v_iflag & VI_MARKER) != 0);
 839         KASSERT(vp->v_mount == mp);
 840
 841         TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
 842 }
 843
 844 /*
 845  * Remove a marker vnode from a mount's vnode list, and return
 846  * a pointer to the next vnode in the list.  mntvnode_lock must
 847  * be held.
 848  */
 849 vnode_t *
 850 vunmark(vnode_t *mvp)
 851 {
 852         vnode_t *vp;
 853         struct mount *mp;
 854
 855         mp = mvp->v_mount;
 856
 857         KASSERT(mutex_owned(&mntvnode_lock));
 858         KASSERT((mvp->v_iflag & VI_MARKER) != 0);
 859
 860         vp = TAILQ_NEXT(mvp, v_mntvnodes);
 861         TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
 862
 863         KASSERT(vp == NULL || vp->v_mount == mp);
 864
 865         return vp;
 866 }
 867
 868 /*
 869  * Update outstanding I/O count and do wakeup if requested.
 870  */
 871 void
 872 vwakeup(struct buf *bp)
 873 {
 874         struct vnode *vp;
 875
 876         if ((vp = bp->b_vp) == NULL)
 877                 return;
 878
 879         KASSERT(bp->b_objlock == &vp->v_interlock);
 880         KASSERT(mutex_owned(bp->b_objlock));
 881
 882         if (--vp->v_numoutput < 0)
 883                 panic("vwakeup: neg numoutput, vp %p", vp);
 884         if (vp->v_numoutput == 0)
 885                 cv_broadcast(&vp->v_cv);
 886 }
 887
 888 /*
 889  * Flush out and invalidate all buffers associated with a vnode.
 890  * Called with the underlying vnode locked, which should prevent new dirty
 891  * buffers from being queued.
 892  */
 893 int
 894 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
 895           bool catch, int slptimeo)
 896 {
 897         struct buf *bp, *nbp;
 898         int error;
 899         int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
 900             (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
 901
 902         /* XXXUBC this doesn't look at flags or slp* */
 903         mutex_enter(&vp->v_interlock);
 904         error = VOP_PUTPAGES(vp, 0, 0, flushflags);
 905         if (error) {
 906                 return error;
 907         }
 908
 909         if (flags & V_SAVE) {
 910                 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
 911                 if (error)
 912                         return (error);
 913                 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
 914         }
 915
 916         mutex_enter(&bufcache_lock);
 917 restart:
 918         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 919                 nbp = LIST_NEXT(bp, b_vnbufs);
 920                 error = bbusy(bp, catch, slptimeo, NULL);
 921                 if (error != 0) {
 922                         if (error == EPASSTHROUGH)
 923                                 goto restart;
 924                         mutex_exit(&bufcache_lock);
 925                         return (error);
 926                 }
 927                 brelsel(bp, BC_INVAL | BC_VFLUSH);
 928         }
 929
 930         for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 931                 nbp = LIST_NEXT(bp, b_vnbufs);
 932                 error = bbusy(bp, catch, slptimeo, NULL);
 933                 if (error != 0) {
 934                         if (error == EPASSTHROUGH)
 935                                 goto restart;
 936                         mutex_exit(&bufcache_lock);
 937                         return (error);
 938                 }
 939                 /*
 940                  * XXX Since there are no node locks for NFS, I believe
 941                  * there is a slight chance that a delayed write will
 942                  * occur while sleeping just above, so check for it.
 943                  */
 944                 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
 945 #ifdef DEBUG
 946                         printf("buffer still DELWRI\n");
 947 #endif
 948                         bp->b_cflags |= BC_BUSY | BC_VFLUSH;
 949                         mutex_exit(&bufcache_lock);
 950                         VOP_BWRITE(bp);
 951                         mutex_enter(&bufcache_lock);
 952                         goto restart;
 953                 }
 954                 brelsel(bp, BC_INVAL | BC_VFLUSH);
 955         }
 956
 957 #ifdef DIAGNOSTIC
 958         if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
 959                 panic("vinvalbuf: flush failed, vp %p", vp);
 960 #endif
 961
 962         mutex_exit(&bufcache_lock);
 963
 964         return (0);
 965 }
 966
 967 /*
 968  * Destroy any in core blocks past the truncation length.
 969  * Called with the underlying vnode locked, which should prevent new dirty
 970  * buffers from being queued.
 971  */
 972 int
 973 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
 974 {
 975         struct buf *bp, *nbp;
 976         int error;
 977         voff_t off;
 978
 979         off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
 980         mutex_enter(&vp->v_interlock);
 981         error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
 982         if (error) {
 983                 return error;
 984         }
 985
 986         mutex_enter(&bufcache_lock);
 987 restart:
 988         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 989                 nbp = LIST_NEXT(bp, b_vnbufs);
 990                 if (bp->b_lblkno < lbn)
 991                         continue;
 992                 error = bbusy(bp, catch, slptimeo, NULL);
 993                 if (error != 0) {
 994                         if (error == EPASSTHROUGH)
 995                                 goto restart;
 996                         mutex_exit(&bufcache_lock);
 997                         return (error);
 998                 }
 999                 brelsel(bp, BC_INVAL | BC_VFLUSH);
1000         }
1001
1002         for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1003                 nbp = LIST_NEXT(bp, b_vnbufs);
1004                 if (bp->b_lblkno < lbn)
1005                         continue;
1006                 error = bbusy(bp, catch, slptimeo, NULL);
1007                 if (error != 0) {
1008                         if (error == EPASSTHROUGH)
1009                                 goto restart;
1010                         mutex_exit(&bufcache_lock);
1011                         return (error);
1012                 }
1013                 brelsel(bp, BC_INVAL | BC_VFLUSH);
1014         }
1015         mutex_exit(&bufcache_lock);
1016
1017         return (0);
1018 }
1019
1020 /*
1021  * Flush all dirty buffers from a vnode.
1022  * Called with the underlying vnode locked, which should prevent new dirty
1023  * buffers from being queued.
1024  */
1025 void
1026 vflushbuf(struct vnode *vp, int sync)
1027 {
1028         struct buf *bp, *nbp;
1029         int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
1030         bool dirty;
1031
1032         mutex_enter(&vp->v_interlock);
1033         (void) VOP_PUTPAGES(vp, 0, 0, flags);
1034
1035 loop:
1036         mutex_enter(&bufcache_lock);
1037         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1038                 nbp = LIST_NEXT(bp, b_vnbufs);
1039                 if ((bp->b_cflags & BC_BUSY))
1040                         continue;
1041                 if ((bp->b_oflags & BO_DELWRI) == 0)
1042                         panic("vflushbuf: not dirty, bp %p", bp);
1043                 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
1044                 mutex_exit(&bufcache_lock);
1045                 /*
1046                  * Wait for I/O associated with indirect blocks to complete,
1047                  * since there is no way to quickly wait for them below.
1048                  */
1049                 if (bp->b_vp == vp || sync == 0)
1050                         (void) bawrite(bp);
1051                 else
1052                         (void) bwrite(bp);
1053                 goto loop;
1054         }
1055         mutex_exit(&bufcache_lock);
1056
1057         if (sync == 0)
1058                 return;
1059
1060         mutex_enter(&vp->v_interlock);
1061         while (vp->v_numoutput != 0)
1062                 cv_wait(&vp->v_cv, &vp->v_interlock);
1063         dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
1064         mutex_exit(&vp->v_interlock);
1065
1066         if (dirty) {
1067                 vprint("vflushbuf: dirty", vp);
1068                 goto loop;
1069         }
1070 }
1071
1072 /*
1073  * Create a vnode for a block device.
1074  * Used for root filesystem and swap areas.
1075  * Also used for memory file system special devices.
1076  */
1077 int
1078 bdevvp(dev_t dev, vnode_t **vpp)
1079 {
1080
1081         return (getdevvp(dev, vpp, VBLK));
1082 }
1083
1084 /*
1085  * Create a vnode for a character device.
1086  * Used for kernfs and some console handling.
1087  */
1088 int
1089 cdevvp(dev_t dev, vnode_t **vpp)
1090 {
1091
1092         return (getdevvp(dev, vpp, VCHR));
1093 }
1094
1095 /*
1096  * Associate a buffer with a vnode.  There must already be a hold on
1097  * the vnode.
1098  */
1099 void
1100 bgetvp(struct vnode *vp, struct buf *bp)
1101 {
1102
1103         KASSERT(bp->b_vp == NULL);
1104         KASSERT(bp->b_objlock == &buffer_lock);
1105         KASSERT(mutex_owned(&vp->v_interlock));
1106         KASSERT(mutex_owned(&bufcache_lock));
1107         KASSERT((bp->b_cflags & BC_BUSY) != 0);
1108         KASSERT(!cv_has_waiters(&bp->b_done));
1109
1110         vholdl(vp);
1111         bp->b_vp = vp;
1112         if (vp->v_type == VBLK || vp->v_type == VCHR)
1113                 bp->b_dev = vp->v_rdev;
1114         else
1115                 bp->b_dev = NODEV;
1116
1117         /*
1118          * Insert onto list for new vnode.
1119          */
1120         bufinsvn(bp, &vp->v_cleanblkhd);
1121         bp->b_objlock = &vp->v_interlock;
1122 }
1123
1124 /*
1125  * Disassociate a buffer from a vnode.
1126  */
1127 void
1128 brelvp(struct buf *bp)
1129 {
1130         struct vnode *vp = bp->b_vp;
1131
1132         KASSERT(vp != NULL);
1133         KASSERT(bp->b_objlock == &vp->v_interlock);
1134         KASSERT(mutex_owned(&vp->v_interlock));
1135         KASSERT(mutex_owned(&bufcache_lock));
1136         KASSERT((bp->b_cflags & BC_BUSY) != 0);
1137         KASSERT(!cv_has_waiters(&bp->b_done));
1138
1139         /*
1140          * Delete from old vnode list, if on one.
1141          */
1142         if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1143                 bufremvn(bp);
1144
1145         if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) &&
1146             LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1147                 vp->v_iflag &= ~VI_WRMAPDIRTY;
1148                 vn_syncer_remove_from_worklist(vp);
1149         }
1150
1151         bp->b_objlock = &buffer_lock;
1152         bp->b_vp = NULL;
1153         holdrelel(vp);
1154 }
1155
1156 /*
1157  * Reassign a buffer from one vnode list to another.
1158  * The list reassignment must be within the same vnode.
1159  * Used to assign file specific control information
1160  * (indirect blocks) to the list to which they belong.
1161  */
1162 void
1163 reassignbuf(struct buf *bp, struct vnode *vp)
1164 {
1165         struct buflists *listheadp;
1166         int delayx;
1167
1168         KASSERT(mutex_owned(&bufcache_lock));
1169         KASSERT(bp->b_objlock == &vp->v_interlock);
1170         KASSERT(mutex_owned(&vp->v_interlock));
1171         KASSERT((bp->b_cflags & BC_BUSY) != 0);
1172
1173         /*
1174          * Delete from old vnode list, if on one.
1175          */
1176         if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1177                 bufremvn(bp);
1178
1179         /*
1180          * If dirty, put on list of dirty buffers;
1181          * otherwise insert onto list of clean buffers.
1182          */
1183         if ((bp->b_oflags & BO_DELWRI) == 0) {
1184                 listheadp = &vp->v_cleanblkhd;
1185                 if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
1186                     (vp->v_iflag & VI_ONWORKLST) &&
1187                     LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1188                         vp->v_iflag &= ~VI_WRMAPDIRTY;
1189                         vn_syncer_remove_from_worklist(vp);
1190                 }
1191         } else {
1192                 listheadp = &vp->v_dirtyblkhd;
1193                 if ((vp->v_iflag & VI_ONWORKLST) == 0) {
1194                         switch (vp->v_type) {
1195                         case VDIR:
1196                                 delayx = dirdelay;
1197                                 break;
1198                         case VBLK:
1199                                 if (vp->v_specmountpoint != NULL) {
1200                                         delayx = metadelay;
1201                                         break;
1202                                 }
1203                                 /* fall through */
1204                         default:
1205                                 delayx = filedelay;
1206                                 break;
1207                         }
1208                         if (!vp->v_mount ||
1209                             (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
1210                                 vn_syncer_add_to_worklist(vp, delayx);
1211                 }
1212         }
1213         bufinsvn(bp, listheadp);
1214 }
1215
1216 /*
1217  * Create a vnode for a device.
1218  * Used by bdevvp (block device) for root file system etc.,
1219  * and by cdevvp (character device) for console and kernfs.
1220  */
1221 static int
1222 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1223 {
1224         vnode_t *vp;
1225         vnode_t *nvp;
1226         int error;
1227
1228         if (dev == NODEV) {
1229                 *vpp = NULL;
1230                 return (0);
1231         }
1232         error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1233         if (error) {
1234                 *vpp = NULL;
1235                 return (error);
1236         }
1237         vp = nvp;
1238         vp->v_type = type;
1239         vp->v_vflag |= VV_MPSAFE;
1240         uvm_vnp_setsize(vp, 0);
1241         spec_node_init(vp, dev);
1242         *vpp = vp;
1243         return (0);
1244 }
1245
1246 /*
1247  * Try to gain a reference to a vnode, without acquiring its interlock.
1248  * The caller must hold a lock that will prevent the vnode from being
1249  * recycled or freed.
1250  */
1251 bool
1252 vtryget(vnode_t *vp)
1253 {
1254         u_int use, next;
1255
1256         /*
1257          * If the vnode is being freed, don't make life any harder
1258          * for vclean() by adding another reference without waiting.
1259          * This is not strictly necessary, but we'll do it anyway.
1260          */
1261         if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) {
1262                 return false;
1263         }
1264         for (use = vp->v_usecount;; use = next) {
1265                 if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
1266                         /* Need interlock held if first reference. */
1267                         return false;
1268                 }
1269                 next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
1270                 if (__predict_true(next == use)) {
1271                         return true;
1272                 }
1273         }
1274 }
1275
1276 /*
1277  * Grab a particular vnode from the free list, increment its
1278  * reference count and lock it. If the vnode lock bit is set the
1279  * vnode is being eliminated in vgone. In that case, we can not
1280  * grab the vnode, so the process is awakened when the transition is
1281  * completed, and an error returned to indicate that the vnode is no
1282  * longer usable (possibly having been changed to a new file system type).
1283  */
1284 int
1285 vget(vnode_t *vp, int flags)
1286 {
1287         int error;
1288
1289         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1290
1291         if ((flags & LK_INTERLOCK) == 0)
1292                 mutex_enter(&vp->v_interlock);
1293
1294         /*
1295          * Before adding a reference, we must remove the vnode
1296          * from its freelist.
1297          */
1298         if (vp->v_usecount == 0) {
1299                 vremfree(vp);
1300                 vp->v_usecount = 1;
1301         } else {
1302                 atomic_inc_uint(&vp->v_usecount);
1303         }
1304
1305         /*
1306          * If the vnode is in the process of being cleaned out for
1307          * another use, we wait for the cleaning to finish and then
1308          * return failure.  Cleaning is determined by checking if
1309          * the VI_XLOCK or VI_FREEING flags are set.
1310          */
1311         if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
1312                 if ((flags & LK_NOWAIT) != 0) {
1313                         vrelel(vp, 0);
1314                         return EBUSY;
1315                 }
1316                 vwait(vp, VI_XLOCK | VI_FREEING);
1317                 vrelel(vp, 0);
1318                 return ENOENT;
1319         }
1320
1321         if ((vp->v_iflag & VI_INACTNOW) != 0) {
1322                 /*
1323                  * if it's being desactived, wait for it to complete.
1324                  * Make sure to not return a clean vnode.
1325                  */
1326                  if ((flags & LK_NOWAIT) != 0) {
1327                         vrelel(vp, 0);
1328                         return EBUSY;
1329                 }
1330                 vwait(vp, VI_INACTNOW);
1331                 if ((vp->v_iflag & VI_CLEAN) != 0) {
1332                         vrelel(vp, 0);
1333                         return ENOENT;
1334                 }
1335         }
1336         if (flags & LK_TYPE_MASK) {
1337                 error = vn_lock(vp, flags | LK_INTERLOCK);
1338                 if (error != 0) {
1339                         vrele(vp);
1340                 }
1341                 return error;
1342         }
1343         mutex_exit(&vp->v_interlock);
1344         return 0;
1345 }
1346
1347 /*
1348  * vput(), just unlock and vrele()
1349  */
1350 void
1351 vput(vnode_t *vp)
1352 {
1353
1354         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1355
1356         VOP_UNLOCK(vp, 0);
1357         vrele(vp);
1358 }
1359
1360 /*
1361  * Try to drop reference on a vnode.  Abort if we are releasing the
1362  * last reference.  Note: this _must_ succeed if not the last reference.
1363  */
1364 static inline bool
1365 vtryrele(vnode_t *vp)
1366 {
1367         u_int use, next;
1368
1369         for (use = vp->v_usecount;; use = next) {
1370                 if (use == 1) {
1371                         return false;
1372                 }
1373                 KASSERT((use & VC_MASK) > 1);
1374                 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1375                 if (__predict_true(next == use)) {
1376                         return true;
1377                 }
1378         }
1379 }
1380
1381 /*
1382  * Vnode release.  If reference count drops to zero, call inactive
1383  * routine and either return to freelist or free to the pool.
1384  */
1385 void
1386 vrelel(vnode_t *vp, int flags)
1387 {
1388         bool recycle, defer;
1389         int error;
1390
1391         KASSERT(mutex_owned(&vp->v_interlock));
1392         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1393         KASSERT(vp->v_freelisthd == NULL);
1394
1395         if (__predict_false(vp->v_op == dead_vnodeop_p &&
1396             (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
1397                 vpanic(vp, "dead but not clean");
1398         }
1399
1400         /*
1401          * If not the last reference, just drop the reference count
1402          * and unlock.
1403          */
1404         if (vtryrele(vp)) {
1405                 vp->v_iflag |= VI_INACTREDO;
1406                 mutex_exit(&vp->v_interlock);
1407                 return;
1408         }
1409         if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
1410                 vpanic(vp, "vrelel: bad ref count");
1411         }
1412
1413         KASSERT((vp->v_iflag & VI_XLOCK) == 0);
1414
1415         /*
1416          * If not clean, deactivate the vnode, but preserve
1417          * our reference across the call to VOP_INACTIVE().
1418          */
1419  retry:
1420         if ((vp->v_iflag & VI_CLEAN) == 0) {
1421                 recycle = false;
1422                 vp->v_iflag |= VI_INACTNOW;
1423
1424                 /*
1425                  * XXX This ugly block can be largely eliminated if
1426                  * locking is pushed down into the file systems.
1427                  */
1428                 if (curlwp == uvm.pagedaemon_lwp) {
1429                         /* The pagedaemon can't wait around; defer. */
1430                         defer = true;
1431                 } else if (curlwp == vrele_lwp) {
1432                         /*
1433                          * We have to try harder. But we can't sleep
1434                          * with VI_INACTNOW as vget() may be waiting on it.
1435                          */
1436                         vp->v_iflag &= ~(VI_INACTREDO|VI_INACTNOW);
1437                         cv_broadcast(&vp->v_cv);
1438                         error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
1439                             LK_RETRY);
1440                         if (error != 0) {
1441                                 /* XXX */
1442                                 vpanic(vp, "vrele: unable to lock %p");
1443                         }
1444                         mutex_enter(&vp->v_interlock);
1445                         /*
1446                          * if we did get another reference while
1447                          * sleeping, don't try to inactivate it yet.
1448                          */
1449                         if (__predict_false(vtryrele(vp))) {
1450                                 VOP_UNLOCK(vp, 0);
1451                                 mutex_exit(&vp->v_interlock);
1452                                 return;
1453                         }
1454                         vp->v_iflag |= VI_INACTNOW;
1455                         mutex_exit(&vp->v_interlock);
1456                         defer = false;
1457                 } else if ((vp->v_iflag & VI_LAYER) != 0) {
1458                         /*
1459                          * Acquiring the stack's lock in vclean() even
1460                          * for an honest vput/vrele is dangerous because
1461                          * our caller may hold other vnode locks; defer.
1462                          */
1463                         defer = true;
1464                 } else {
1465                         /* If we can't acquire the lock, then defer. */
1466                         vp->v_iflag &= ~VI_INACTREDO;
1467                         error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
1468                             LK_NOWAIT);
1469                         if (error != 0) {
1470                                 defer = true;
1471                                 mutex_enter(&vp->v_interlock);
1472                         } else {
1473                                 defer = false;
1474                         }
1475                 }
1476
1477                 if (defer) {
1478                         /*
1479                          * Defer reclaim to the kthread; it's not safe to
1480                          * clean it here.  We donate it our last reference.
1481                          */
1482                         KASSERT(mutex_owned(&vp->v_interlock));
1483                         KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
1484                         vp->v_iflag &= ~VI_INACTNOW;
1485                         vp->v_iflag |= VI_INACTPEND;
1486                         mutex_enter(&vrele_lock);
1487                         TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
1488                         if (++vrele_pending > (desiredvnodes >> 8))
1489                                 cv_signal(&vrele_cv);
1490                         mutex_exit(&vrele_lock);
1491                         cv_broadcast(&vp->v_cv);
1492                         mutex_exit(&vp->v_interlock);
1493                         return;
1494                 }
1495
1496 #ifdef DIAGNOSTIC
1497                 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1498                     vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
1499                         vprint("vrelel: missing VOP_CLOSE()", vp);
1500                 }
1501 #endif
1502
1503                 /*
1504                  * The vnode can gain another reference while being
1505                  * deactivated.  If VOP_INACTIVE() indicates that
1506                  * the described file has been deleted, then recycle
1507                  * the vnode irrespective of additional references.
1508                  * Another thread may be waiting to re-use the on-disk
1509                  * inode.
1510                  *
1511                  * Note that VOP_INACTIVE() will drop the vnode lock.
1512                  */
1513                 VOP_INACTIVE(vp, &recycle);
1514                 mutex_enter(&vp->v_interlock);
1515                 vp->v_iflag &= ~VI_INACTNOW;
1516                 cv_broadcast(&vp->v_cv);
1517                 if (!recycle) {
1518                         if (vtryrele(vp)) {
1519                                 mutex_exit(&vp->v_interlock);
1520                                 return;
1521                         }
1522
1523                         /*
1524                          * If we grew another reference while
1525                          * VOP_INACTIVE() was underway, retry.
1526                          */
1527                         if ((vp->v_iflag & VI_INACTREDO) != 0) {
1528                                 goto retry;
1529                         }
1530                 }
1531
1532                 /* Take care of space accounting. */
1533                 if (vp->v_iflag & VI_EXECMAP) {
1534                         atomic_add_int(&uvmexp.execpages,
1535                             -vp->v_uobj.uo_npages);
1536                         atomic_add_int(&uvmexp.filepages,
1537                             vp->v_uobj.uo_npages);
1538                 }
1539                 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
1540                 vp->v_vflag &= ~VV_MAPPED;
1541
1542                 /*
1543                  * Recycle the vnode if the file is now unused (unlinked),
1544                  * otherwise just free it.
1545                  */
1546                 if (recycle) {
1547                         vclean(vp, DOCLOSE);
1548                 }
1549                 KASSERT(vp->v_usecount > 0);
1550         }
1551
1552         if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
1553                 /* Gained another reference while being reclaimed. */
1554                 mutex_exit(&vp->v_interlock);
1555                 return;
1556         }
1557
1558         if ((vp->v_iflag & VI_CLEAN) != 0) {
1559                 /*
1560                  * It's clean so destroy it.  It isn't referenced
1561                  * anywhere since it has been reclaimed.
1562                  */
1563                 KASSERT(vp->v_holdcnt == 0);
1564                 KASSERT(vp->v_writecount == 0);
1565                 mutex_exit(&vp->v_interlock);
1566                 insmntque(vp, NULL);
1567                 if (vp->v_type == VBLK || vp->v_type == VCHR) {
1568                         spec_node_destroy(vp);
1569                 }
1570                 vnfree(vp);
1571         } else {
1572                 /*
1573                  * Otherwise, put it back onto the freelist.  It
1574                  * can't be destroyed while still associated with
1575                  * a file system.
1576                  */
1577                 mutex_enter(&vnode_free_list_lock);
1578                 if (vp->v_holdcnt > 0) {
1579                         vp->v_freelisthd = &vnode_hold_list;
1580                 } else {
1581                         vp->v_freelisthd = &vnode_free_list;
1582                 }
1583                 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1584                 mutex_exit(&vnode_free_list_lock);
1585                 mutex_exit(&vp->v_interlock);
1586         }
1587 }
1588
1589 void
1590 vrele(vnode_t *vp)
1591 {
1592
1593         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1594
1595         if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
1596                 return;
1597         }
1598         mutex_enter(&vp->v_interlock);
1599         vrelel(vp, 0);
1600 }
1601
1602 static void
1603 vrele_thread(void *cookie)
1604 {
1605         vnode_t *vp;
1606
1607         for (;;) {
1608                 mutex_enter(&vrele_lock);
1609                 while (TAILQ_EMPTY(&vrele_list)) {
1610                         vrele_gen++;
1611                         cv_broadcast(&vrele_cv);
1612                         cv_timedwait(&vrele_cv, &vrele_lock, hz);
1613                 }
1614                 vp = TAILQ_FIRST(&vrele_list);
1615                 TAILQ_REMOVE(&vrele_list, vp, v_freelist);
1616                 vrele_pending--;
1617                 mutex_exit(&vrele_lock);
1618
1619                 /*
1620                  * If not the last reference, then ignore the vnode
1621                  * and look for more work.
1622                  */
1623                 mutex_enter(&vp->v_interlock);
1624                 KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
1625                 vp->v_iflag &= ~VI_INACTPEND;
1626                 vrelel(vp, 0);
1627         }
1628 }
1629
1630 /*
1631  * Page or buffer structure gets a reference.
1632  * Called with v_interlock held.
1633  */
1634 void
1635 vholdl(vnode_t *vp)
1636 {
1637
1638         KASSERT(mutex_owned(&vp->v_interlock));
1639         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1640
1641         if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
1642                 mutex_enter(&vnode_free_list_lock);
1643                 KASSERT(vp->v_freelisthd == &vnode_free_list);
1644                 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1645                 vp->v_freelisthd = &vnode_hold_list;
1646                 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1647                 mutex_exit(&vnode_free_list_lock);
1648         }
1649 }
1650
1651 /*
1652  * Page or buffer structure frees a reference.
1653  * Called with v_interlock held.
1654  */
1655 void
1656 holdrelel(vnode_t *vp)
1657 {
1658
1659         KASSERT(mutex_owned(&vp->v_interlock));
1660         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1661
1662         if (vp->v_holdcnt <= 0) {
1663                 vpanic(vp, "holdrelel: holdcnt vp %p");
1664         }
1665
1666         vp->v_holdcnt--;
1667         if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1668                 mutex_enter(&vnode_free_list_lock);
1669                 KASSERT(vp->v_freelisthd == &vnode_hold_list);
1670                 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1671                 vp->v_freelisthd = &vnode_free_list;
1672                 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1673                 mutex_exit(&vnode_free_list_lock);
1674         }
1675 }
1676
1677 /*
1678  * Vnode reference, where a reference is already held by some other
1679  * object (for example, a file structure).
1680  */
1681 void
1682 vref(vnode_t *vp)
1683 {
1684
1685         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1686         KASSERT(vp->v_usecount != 0);
1687
1688         atomic_inc_uint(&vp->v_usecount);
1689 }
1690
1691 /*
1692  * Remove any vnodes in the vnode table belonging to mount point mp.
1693  *
1694  * If FORCECLOSE is not specified, there should not be any active ones,
1695  * return error if any are found (nb: this is a user error, not a
1696  * system error). If FORCECLOSE is specified, detach any active vnodes
1697  * that are found.
1698  *
1699  * If WRITECLOSE is set, only flush out regular file vnodes open for
1700  * writing.
1701  *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
1703  */
#ifdef DEBUG
/*
 * Debug knob, compiled in only under DEBUG and registered in a
 * ctldebug entry under the name "busyprt".
 * NOTE(review): presumably consulted by vflush() when it meets a busy
 * vnode -- confirm against the rest of that function.
 */
int busyprt = 0;        /* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
1708
1709 static vnode_t *
1710 vflushnext(vnode_t *mvp, int *when)
1711 {
1712
1713         if (hardclock_ticks > *when) {
1714                 mutex_exit(&mntvnode_lock);
1715                 yield();
1716                 mutex_enter(&mntvnode_lock);
1717                 *when = hardclock_ticks + hz / 10;
1718         }
1719
1720         return vunmark(mvp);
1721 }
1722
/*
 * vflush: walk the vnode list of mount point mp and clean every eligible
 * vnode.
 *
 *   mp      mount whose vnode list is walked
 *   skipvp  a single vnode to leave alone (compared by pointer)
 *   flags   SKIPSYSTEM, WRITECLOSE and FORCECLOSE are tested below
 *
 * Returns ENOMEM when the marker vnode cannot be allocated, EBUSY when at
 * least one in-use vnode was left alone because FORCECLOSE was not given,
 * and 0 otherwise.
 */
1723 int
1724 vflush(struct mount *mp, vnode_t *skipvp, int flags)
1725 {
1726         vnode_t *vp, *mvp;
1727         int busy = 0, when = 0, gen;
1728
1729         /*
1730          * First, flush out any vnode references from vrele_list.
              * Keep waking vrele_cv and sleeping on it until nothing is
              * pending or vrele_gen has moved on from the value sampled
              * here.
1731          */
1732         mutex_enter(&vrele_lock);
1733         gen = vrele_gen;
1734         while (vrele_pending && gen == vrele_gen) {
1735                 cv_broadcast(&vrele_cv);
1736                 cv_wait(&vrele_cv, &vrele_lock);
1737         }
1738         mutex_exit(&vrele_lock);
1739
1740         /* Allocate a marker vnode. */
1741         if ((mvp = vnalloc(mp)) == NULL)
1742                 return (ENOMEM);
1743
1744         /*
1745          * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
1746          * and vclean() are called
              *
              * mntvnode_lock is held at the top of every iteration; each
              * branch that drops it re-takes it before "continue".
1747          */
1748         mutex_enter(&mntvnode_lock);
1749         for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
1750             vp = vflushnext(mvp, &when)) {
1751                 vmark(mvp, vp);
1752                 if (vp->v_mount != mp || vismarker(vp))
1753                         continue;
1754                 /*
1755                  * Skip over a selected vnode.
1756                  */
1757                 if (vp == skipvp)
1758                         continue;
1759                 mutex_enter(&vp->v_interlock);
1760                 /*
1761                  * Ignore clean but still referenced vnodes.
1762                  */
1763                 if ((vp->v_iflag & VI_CLEAN) != 0) {
1764                         mutex_exit(&vp->v_interlock);
1765                         continue;
1766                 }
1767                 /*
1768                  * Skip over vnodes marked VV_SYSTEM.
1769                  */
1770                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
1771                         mutex_exit(&vp->v_interlock);
1772                         continue;
1773                 }
1774                 /*
1775                  * If WRITECLOSE is set, only flush out regular file
1776                  * vnodes open for writing.
1777                  */
1778                 if ((flags & WRITECLOSE) &&
1779                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
1780                         mutex_exit(&vp->v_interlock);
1781                         continue;
1782                 }
1783                 /*
1784                  * With v_usecount == 0, all we need to do is clear
1785                  * out the vnode data structures and we are done.
                      * The vnode is given a single reference for the
                      * duration of vclean() and handed to vrelel()
                      * afterwards.  NOTE(review): the marker set by vmark()
                      * above presumably keeps our list position valid while
                      * mntvnode_lock is dropped -- confirm against vmark().
1786                  */
1787                 if (vp->v_usecount == 0) {
1788                         mutex_exit(&mntvnode_lock);
1789                         vremfree(vp);
1790                         vp->v_usecount = 1;
1791                         vclean(vp, DOCLOSE);
1792                         vrelel(vp, 0);
1793                         mutex_enter(&mntvnode_lock);
1794                         continue;
1795                 }
1796                 /*
1797                  * If FORCECLOSE is set, forcibly close the vnode.
1798                  * For block or character devices, revert to an
1799                  * anonymous device.  For all other files, just
1800                  * kill them.
                      * The vnode is in use here, so an extra reference is
                      * taken before it is cleaned.
1801                  */
1802                 if (flags & FORCECLOSE) {
1803                         mutex_exit(&mntvnode_lock);
1804                         atomic_inc_uint(&vp->v_usecount);
1805                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
1806                                 vclean(vp, DOCLOSE);
1807                                 vrelel(vp, 0);
1808                         } else {
                                      /* Device node: clean without DOCLOSE. */
1809                                 vclean(vp, 0);
1810                                 vp->v_op = spec_vnodeop_p; /* XXXSMP */
1811                                 mutex_exit(&vp->v_interlock);
1812                                 /*
1813                                  * The vnode isn't clean, but still resides
1814                                  * on the mount list.  Remove it. XXX This
1815                                  * is a bit dodgy.
1816                                  */
1817                                 insmntque(vp, NULL);
1818                                 vrele(vp);
1819                         }
1820                         mutex_enter(&mntvnode_lock);
1821                         continue;
1822                 }
                     /* In use and not forced: report it and count it as busy. */
1823 #ifdef DEBUG
1824                 if (busyprt)
1825                         vprint("vflush: busy vnode", vp);
1826 #endif
1827                 mutex_exit(&vp->v_interlock);
1828                 busy++;
1829         }
1830         mutex_exit(&mntvnode_lock);
1831         vnfree(mvp);
1832         if (busy)
1833                 return (EBUSY);
1834         return (0);
1835 }
1836
1837 /*
1838  * Disassociate the underlying file system from a vnode.
1839  *
1840  * Must be called with the interlock held, and will return with it held.
      *
      * The caller must hold a reference (v_usecount != 0 is asserted) and vp
      * must not be a marker vnode.  The function returns early, doing
      * nothing further, if VI_XLOCK is already set (after vwait()) or if the
      * vnode is already VI_CLEAN.
      *
      * flags: only DOCLOSE is examined.  With DOCLOSE the vnode's buffers are
      * invalidated, an active device vnode is passed to spec_node_revoke(),
      * and VI_CLEAN is set on completion.
      *
      * On completion the vnode uses dead_vnodeop_p, has tag VT_NON, v_vnlock
      * points at its own v_lock, VI_XLOCK/VI_FREEING and VV_LOCKSWORK are
      * cleared, NOTE_REVOKE is posted and waiters on v_cv are woken.
1841  */
1842 void
1843 vclean(vnode_t *vp, int flags)
1844 {
1845         lwp_t *l = curlwp;
1846         bool recycle, active;
1847         int error;
1848
1849         KASSERT(mutex_owned(&vp->v_interlock));
1850         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1851         KASSERT(vp->v_usecount != 0);
1852
1853         /* If cleaning is already in progress wait until done and return. */
1854         if (vp->v_iflag & VI_XLOCK) {
1855                 vwait(vp, VI_XLOCK);
1856                 return;
1857         }
1858
1859         /* If already clean, nothing to do. */
1860         if ((vp->v_iflag & VI_CLEAN) != 0) {
1861                 return;
1862         }
1863
1864         /*
1865          * Prevent the vnode from being recycled or brought into use
1866          * while we clean it out.
              * If the vnode was accounted as an executable mapping, move
              * its page count from uvmexp.execpages to uvmexp.filepages.
1867          */
1868         vp->v_iflag |= VI_XLOCK;
1869         if (vp->v_iflag & VI_EXECMAP) {
1870                 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1871                 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1872         }
1873         vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
             /* More than one reference means the vnode is in active use. */
1874         active = (vp->v_usecount > 1);
1875
1876         /* XXXAD should not lock vnode under layer */
             /*
              * NOTE(review): LK_INTERLOCK presumably makes VOP_LOCK release
              * the interlock; it is re-taken explicitly further down.
              */
1877         VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
1878
1879         /*
1880          * Clean out any cached data associated with the vnode.
1881          * If purging an active vnode, it must be closed and
1882          * deactivated before being reclaimed. Note that the
1883          * VOP_INACTIVE will unlock the vnode.
1884          */
1885         if (flags & DOCLOSE) {
                     /*
                      * First attempt with V_SAVE; if that fails, discard any
                      * WAPBL state and retry without V_SAVE.  The retry is
                      * asserted to succeed.
                      */
1886                 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1887                 if (error != 0) {
1888                         /* XXX, fix vn_start_write's grab of mp and use that. */
1889
1890                         if (wapbl_vphaswapbl(vp))
1891                                 WAPBL_DISCARD(wapbl_vptomp(vp));
1892                         error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1893                 }
1894                 KASSERT(error == 0);
1895                 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1896                 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1897                          spec_node_revoke(vp);
1898                 }
1899         }
             /* The value stored in "recycle" is not used afterwards. */
1900         if (active) {
1901                 VOP_INACTIVE(vp, &recycle);
1902         } else {
1903                 /*
1904                  * Any other processes trying to obtain this lock must first
1905                  * wait for VI_XLOCK to clear, then call the new lock operation.
1906                  */
1907                 VOP_UNLOCK(vp, 0);
1908         }
1909
1910         /* Disassociate the underlying file system from the vnode. */
1911         if (VOP_RECLAIM(vp)) {
1912                 vpanic(vp, "vclean: cannot reclaim");
1913         }
1914
             /* No pages may remain; drop the read-ahead context of regular files. */
1915         KASSERT(vp->v_uobj.uo_npages == 0);
1916         if (vp->v_type == VREG && vp->v_ractx != NULL) {
1917                 uvm_ra_freectx(vp->v_ractx);
1918                 vp->v_ractx = NULL;
1919         }
1920         cache_purge(vp);
1921
1922         /* Done with purge, notify sleepers of the grim news. */
1923         mutex_enter(&vp->v_interlock);
1924         vp->v_op = dead_vnodeop_p;
1925         vp->v_tag = VT_NON;
1926         vp->v_vnlock = &vp->v_lock;
1927         KNOTE(&vp->v_klist, NOTE_REVOKE);
1928         vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
1929         vp->v_vflag &= ~VV_LOCKSWORK;
             /* Only a DOCLOSE clean marks the vnode VI_CLEAN. */
1930         if ((flags & DOCLOSE) != 0) {
1931                 vp->v_iflag |= VI_CLEAN;
1932         }
1933         cv_broadcast(&vp->v_cv);
1934
1935         KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1936 }
1937
1938 /*
1939  * Recycle an unused vnode to the front of the free list.
1940  * Release the passed interlock if the vnode will be recycled.
1941  */
1942 int
1943 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1944 {
1945
1946         KASSERT((vp->v_iflag & VI_MARKER) == 0);
1947
1948         mutex_enter(&vp->v_interlock);
1949         if (vp->v_usecount != 0) {
1950                 mutex_exit(&vp->v_interlock);
1951                 return (0);
1952         }
1953         if (inter_lkp)
1954                 mutex_exit(inter_lkp);
1955         vremfree(vp);
1956         vp->v_usecount = 1;
1957         vclean(vp, DOCLOSE);
1958         vrelel(vp, 0);
1959         return (1);
1960 }
1961
1962 /*
1963  * Eliminate all activity associated with a vnode in preparation for
1964  * reuse.  Drops a reference from the vnode.
1965  */
1966 void
1967 vgone(vnode_t *vp)
1968 {
1969
1970         mutex_enter(&vp->v_interlock);
1971         vclean(vp, DOCLOSE);
1972         vrelel(vp, 0);
1973 }
1974
1975 /*
1976  * Lookup a vnode by device number.
1977  */
1978 int
1979 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
1980 {
1981         vnode_t *vp;
1982         int rc = 0;
1983
1984         mutex_enter(&device_lock);
1985         for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1986                 if (dev != vp->v_rdev || type != vp->v_type)
1987                         continue;
1988                 *vpp = vp;
1989                 rc = 1;
1990                 break;
1991         }
1992         mutex_exit(&device_lock);
1993         return (rc);
1994 }
1995
1996 /*
1997  * Revoke all the vnodes corresponding to the specified minor number
1998  * range (endpoints inclusive) of the specified major.
      *
      * Only vnodes of the given type that are not already VI_CLEAN are
      * revoked.  device_lock is dropped around each vget()/VOP_REVOKE(), so
      * after every revoke attempt the hash chain is rescanned from its head.
1999  */
2000 void
2001 vdevgone(int maj, int minl, int minh, enum vtype type)
2002 {
2003         vnode_t *vp, **vpp;
2004         dev_t dev;
2005         int mn;
2006
2007         vp = NULL;      /* XXX gcc */
2008
2009         mutex_enter(&device_lock);
2010         for (mn = minl; mn <= minh; mn++) {
2011                 dev = makedev(maj, mn);
2012                 vpp = &specfs_hash[SPECHASH(dev)];
2013                 for (vp = *vpp; vp != NULL;) {
2014                         mutex_enter(&vp->v_interlock);
                             /* Not a candidate: step to the next chain entry. */
2015                         if ((vp->v_iflag & VI_CLEAN) != 0 ||
2016                             dev != vp->v_rdev || type != vp->v_type) {
2017                                 mutex_exit(&vp->v_interlock);
2018                                 vp = vp->v_specnext;
2019                                 continue;
2020                         }
                             /*
                              * Candidate found.  The interlock is still held
                              * and is passed on to vget() via LK_INTERLOCK.
                              */
2021                         mutex_exit(&device_lock);
2022                         if (vget(vp, LK_INTERLOCK) == 0) {
2023                                 VOP_REVOKE(vp, REVOKEALL);
2024                                 vrele(vp);
2025                         }
                             /* The chain may have changed: restart at its head. */
2026                         mutex_enter(&device_lock);
2027                         vp = *vpp;
2028                 }
2029         }
2030         mutex_exit(&device_lock);
2031 }
2032
2033 /*
2034  * Eliminate all activity associated with the requested vnode
2035  * and with all vnodes aliased to the requested vnode.
      *
      * The caller must hold a reference on vp (asserted).  A vnode that is
      * already VI_CLEAN is left alone.  A vnode that is neither VBLK nor
      * VCHR is simply cleaned with DOCLOSE.  For a device vnode, every vnode
      * on the matching specfs hash chain with the same v_rdev and v_type
      * that is neither clean nor being cleaned is cleaned with DOCLOSE.
2036  */
2037 void
2038 vrevoke(vnode_t *vp)
2039 {
2040         vnode_t *vq, **vpp;
2041         enum vtype type;
2042         dev_t dev;
2043
2044         KASSERT(vp->v_usecount > 0);
2045
2046         mutex_enter(&vp->v_interlock);
2047         if ((vp->v_iflag & VI_CLEAN) != 0) {
2048                 mutex_exit(&vp->v_interlock);
2049                 return;
2050         } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
                     /* Take an extra reference for vclean(); vrelel() drops it. */
2051                 atomic_inc_uint(&vp->v_usecount);
2052                 vclean(vp, DOCLOSE);
2053                 vrelel(vp, 0);
2054                 return;
2055         } else {
                     /* Device node: remember its identity for the alias scan. */
2056                 dev = vp->v_rdev;
2057                 type = vp->v_type;
2058                 mutex_exit(&vp->v_interlock);
2059         }
2060
2061         vpp = &specfs_hash[SPECHASH(dev)];
2062         mutex_enter(&device_lock);
2063         for (vq = *vpp; vq != NULL;) {
2064                 /* If clean or being cleaned, then ignore it. */
2065                 mutex_enter(&vq->v_interlock);
2066                 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
2067                     vq->v_rdev != dev || vq->v_type != type) {
2068                         mutex_exit(&vq->v_interlock);
2069                         vq = vq->v_specnext;
2070                         continue;
2071                 }
                     /*
                      * Alias found; vq's interlock is still held for vclean().
                      * An unused vnode goes through vremfree() and gets a
                      * single reference, a used one gets an extra reference.
                      */
2072                 mutex_exit(&device_lock);
2073                 if (vq->v_usecount == 0) {
2074                         vremfree(vq);
2075                         vq->v_usecount = 1;
2076                 } else {
2077                         atomic_inc_uint(&vq->v_usecount);
2078                 }
2079                 vclean(vq, DOCLOSE);
2080                 vrelel(vq, 0);
                     /* device_lock was dropped: rescan the chain from its head. */
2081                 mutex_enter(&device_lock);
2082                 vq = *vpp;
2083         }
2084         mutex_exit(&device_lock);
2085 }
2086
2087 /*
2088  * sysctl helper routine to return list of supported fstypes
2089  */
2090 int
2091 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
2092 {
2093         char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
2094         char *where = oldp;
2095         struct vfsops *v;
2096         size_t needed, left, slen;
2097         int error, first;
2098
2099         if (newp != NULL)
2100                 return (EPERM);
2101         if (namelen != 0)
2102                 return (EINVAL);
2103
2104         first = 1;
2105         error = 0;
2106         needed = 0;
2107         left = *oldlenp;
2108
2109         sysctl_unlock();
2110         mutex_enter(&vfs_list_lock);
2111         LIST_FOREACH(v, &vfs_list, vfs_list) {
2112                 if (where == NULL)
2113                         needed += strlen(v->vfs_name) + 1;
2114                 else {
2115                         memset(bf, 0, sizeof(bf));
2116                         if (first) {
2117                                 strncpy(bf, v->vfs_name, sizeof(bf));
2118                                 first = 0;
2119                         } else {
2120                                 bf[0] = ' ';
2121                                 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
2122                         }
2123                         bf[sizeof(bf)-1] = '\0';
2124                         slen = strlen(bf);
2125                         if (left < slen + 1)
2126                                 break;
2127                         v->vfs_refcount++;
2128                         mutex_exit(&vfs_list_lock);
2129                         /* +1 to copy out the trailing NUL byte */
2130                         error = copyout(bf, where, slen + 1);
2131                         mutex_enter(&vfs_list_lock);
2132                         v->vfs_refcount--;
2133                         if (error)
2134                                 break;
2135                         where += slen;
2136                         needed += slen;
2137                         left -= slen;
2138                 }
2139         }
2140         mutex_exit(&vfs_list_lock);
2141         sysctl_relock();
2142         *oldlenp = needed;
2143         return (error);
2144 }
2145
2146
/*
 * NOTE(review): kinfo_vdebug and kinfo_vgetfailed are not referenced by
 * sysctl_kern_vnode() below; whether other code uses them cannot be told
 * from here.  KINFO_VNODESLOP is the number of extra entries added to the
 * size estimate.
 */
2147 int kinfo_vdebug = 1;
2148 int kinfo_vgetfailed;
2149 #define KINFO_VNODESLOP 10
2150 /*
2151  * Dump vnode list (via sysctl).
2152  * Copyout address of vnode followed by vnode.
      *
      * Read-only node (EPERM if newp is set) that takes no further name
      * components (EOPNOTSUPP otherwise).
      *
      * With oldp == NULL only a size estimate is returned in *oldlenp:
      * (numvnodes + KINFO_VNODESLOP) records of pointer plus vnode.
      * Otherwise every vnode of every mount that vfs_busy() accepts is
      * copied out as a (vnode_t *, vnode_t) pair.
      *
      * Returns 0 with *oldlenp set to the bytes written; ENOMEM, also with
      * *oldlenp set to the bytes written so far, when the next record does
      * not fit; or the copyout() error, in which case *oldlenp is not
      * updated.
2153  */
2154 /* ARGSUSED */
2155 int
2156 sysctl_kern_vnode(SYSCTLFN_ARGS)
2157 {
2158         char *where = oldp;
2159         size_t *sizep = oldlenp;
2160         struct mount *mp, *nmp;
2161         vnode_t *vp, *mvp, vbuf;
2162         char *bp = where;
2163         char *ewhere;
2164         int error;
2165
2166         if (namelen != 0)
2167                 return (EOPNOTSUPP);
2168         if (newp != NULL)
2169                 return (EPERM);
2170
2171 #define VPTRSZ  sizeof(vnode_t *)
2172 #define VNODESZ sizeof(vnode_t)
             /* Size query: report an estimate with some slop and stop. */
2173         if (where == NULL) {
2174                 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2175                 return (0);
2176         }
2177         ewhere = where + *sizep;
2178
             /*
              * NOTE(review): the early-return paths below do not release
              * mountlist_lock themselves, so a successful vfs_busy(mp, &nmp)
              * presumably drops it and vfs_unbusy(mp, false, &nmp) re-takes
              * it -- confirm against vfs_busy()/vfs_unbusy().
              */
2179         sysctl_unlock();
2180         mutex_enter(&mountlist_lock);
2181         for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
2182             mp = nmp) {
2183                 if (vfs_busy(mp, &nmp)) {
2184                         continue;
2185                 }
2186                 /* Allocate a marker vnode. */
2187                 mvp = vnalloc(mp);
2188                 /* Should never fail for mp != NULL */
2189                 KASSERT(mvp != NULL);
2190                 mutex_enter(&mntvnode_lock);
2191                 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp;
2192                     vp = vunmark(mvp)) {
2193                         vmark(mvp, vp);
2194                         /*
2195                          * Check that the vp is still associated with
2196                          * this filesystem.  RACE: could have been
2197                          * recycled onto the same filesystem.
2198                          */
2199                         if (vp->v_mount != mp || vismarker(vp))
2200                                 continue;
                             /* Out of user buffer: clean up and report ENOMEM. */
2201                         if (bp + VPTRSZ + VNODESZ > ewhere) {
2202                                 (void)vunmark(mvp);
2203                                 mutex_exit(&mntvnode_lock);
2204                                 vnfree(mvp);
2205                                 vfs_unbusy(mp, false, NULL);
2206                                 sysctl_relock();
2207                                 *sizep = bp - where;
2208                                 return (ENOMEM);
2209                         }
                             /*
                              * Snapshot the vnode while mntvnode_lock is
                              * held, then copy out with the lock dropped.
                              */
2210                         memcpy(&vbuf, vp, VNODESZ);
2211                         mutex_exit(&mntvnode_lock);
2212                         if ((error = copyout(&vp, bp, VPTRSZ)) ||
2213                             (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
2214                                 mutex_enter(&mntvnode_lock);
2215                                 (void)vunmark(mvp);
2216                                 mutex_exit(&mntvnode_lock);
2217                                 vnfree(mvp);
2218                                 vfs_unbusy(mp, false, NULL);
2219                                 sysctl_relock();
2220                                 return (error);
2221                         }
2222                         bp += VPTRSZ + VNODESZ;
2223                         mutex_enter(&mntvnode_lock);
2224                 }
2225                 mutex_exit(&mntvnode_lock);
2226                 vnfree(mvp);
2227                 vfs_unbusy(mp, false, &nmp);
2228         }
2229         mutex_exit(&mountlist_lock);
2230         sysctl_relock();
2231
2232         *sizep = bp - where;
2233         return (0);
2234 }
2235
2236 /*
2237  * Remove clean vnodes from a mountpoint's vnode list.
2238  */
2239 void
2240 vfs_scrubvnlist(struct mount *mp)
2241 {
2242         vnode_t *vp, *nvp;
2243
2244  retry:
2245         mutex_enter(&mntvnode_lock);
2246         for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
2247                 nvp = TAILQ_NEXT(vp, v_mntvnodes);
2248                 mutex_enter(&vp->v_interlock);
2249                 if ((vp->v_iflag & VI_CLEAN) != 0) {
2250                         TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
2251                         vp->v_mount = NULL;
2252                         mutex_exit(&mntvnode_lock);
2253                         mutex_exit(&vp->v_interlock);
2254                         vfs_destroy(mp);
2255                         goto retry;
2256                 }
2257                 mutex_exit(&vp->v_interlock);
2258         }
2259         mutex_exit(&mntvnode_lock);
2260 }
2261
2262 /*
2263  * Check to see if a filesystem is mounted on a block device.
2264  */
2265 int
2266 vfs_mountedon(vnode_t *vp)
2267 {
2268         vnode_t *vq;
2269         int error = 0;
2270
2271         if (vp->v_type != VBLK)
2272                 return ENOTBLK;
2273         if (vp->v_specmountpoint != NULL)
2274                 return (EBUSY);
2275         mutex_enter(&device_lock);
2276         for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
2277             vq = vq->v_specnext) {
2278                 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
2279                         continue;
2280                 if (vq->v_specmountpoint != NULL) {
2281                         error = EBUSY;
2282                         break;
2283                 }
2284         }
2285         mutex_exit(&device_lock);
2286         return (error);
2287 }
2288
/*
 * Unmount all file systems.
 *
 * Prints a progress banner and delegates to vfs_unmountall1() with both
 * "force" and "verbose" enabled.  That routine walks the mount list in
 * reverse order, under the assumption that doing so avoids having to
 * worry about dependencies between mounts.  Its result is returned
 * unchanged.
 */
bool
vfs_unmountall(struct lwp *l)
{
        const bool force = true;
        const bool verbose = true;

        printf("unmounting file systems...");
        return vfs_unmountall1(l, force, verbose);
}
2300
2301 static void
2302 vfs_unmount_print(struct mount *mp, const char *pfx)
2303 {
2304         printf("%sunmounted %s on %s type %s\n", pfx,
2305             mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
2306             mp->mnt_stat.f_fstypename);
2307 }
2308
2309 bool
2310 vfs_unmount_forceone(struct lwp *l)
2311 {
2312         struct mount *mp, *nmp = NULL;
2313         int error;
2314
2315         CIRCLEQ_FOREACH_REVERSE(mp, &mountlist, mnt_list) {
2316                 if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen)
2317                         nmp = mp;
2318         }
2319
2320         if (nmp == NULL)
2321                 return false;
2322
2323 #ifdef DEBUG
2324         printf("\nforcefully unmounting %s (%s)...",
2325             nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
2326 #endif
2327         atomic_inc_uint(&nmp->mnt_refcnt);
2328         if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
2329                 vfs_unmount_print(nmp, "forcefully ");
2330                 return true;
2331         } else
2332                 atomic_dec_uint(&nmp->mnt_refcnt);
2333
2334 #ifdef DEBUG
2335         printf("forceful unmount of %s failed with error %d\n",
2336             nmp->mnt_stat.f_mntonname, error);
2337 #endif
2338
2339         return false;
2340 }
2341
2342 bool
2343 vfs_unmountall1(struct lwp *l, bool force, bool verbose)
2344 {
2345         struct mount *mp, *nmp;
2346         bool any_error = false, progress = false;
2347         int error;
2348
2349         for (mp = CIRCLEQ_LAST(&mountlist);
2350              mp != (void *)&mountlist;
2351              mp = nmp) {
2352                 nmp = CIRCLEQ_PREV(mp, mnt_list);
2353 #ifdef DEBUG
2354                 printf("\nunmounting %p %s (%s)...",
2355                     (void *)mp, mp->mnt_stat.f_mntonname,
2356                     mp->mnt_stat.f_mntfromname);
2357 #endif
2358                 atomic_inc_uint(&mp->mnt_refcnt);
2359                 if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
2360                         vfs_unmount_print(mp, "");
2361                         progress = true;
2362                 } else {
2363                         atomic_dec_uint(&mp->mnt_refcnt);
2364                         if (verbose) {
2365                                 printf("unmount of %s failed with error %d\n",
2366                                     mp->mnt_stat.f_mntonname, error);
2367                         }
2368                         any_error = true;
2369                 }
2370         }
2371         if (verbose)
2372                 printf(" done\n");
2373         if (any_error && verbose)
2374                 printf("WARNING: some file systems would not unmount\n");
2375         return progress;
2376 }
2377
2378 /*
2379  * Sync and unmount file systems before shutting down.
2380  */
2381 void
2382 vfs_shutdown(void)
2383 {
2384         struct lwp *l;
2385
2386         /* XXX we're certainly not running in lwp0's context! */
2387         l = (curlwp == NULL) ? &lwp0 : curlwp;
2388
2389         vfs_shutdown1(l);
2390 }
2391
2392 void
2393 vfs_sync_all(struct lwp *l)
2394 {
2395         printf("syncing disks... ");
2396
2397         /* remove user processes from run queue */
2398         suspendsched();
2399         (void) spl0();
2400
2401         /* avoid coming back this way again if we panic. */
2402         doing_shutdown = 1;
2403
2404         sys_sync(l, NULL, NULL);
2405
2406         /* Wait for sync to finish. */
2407         if (buf_syncwait() != 0) {
2408 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
2409                 Debugger();
2410 #endif
2411                 printf("giving up\n");
2412                 return;
2413         } else
2414                 printf("done\n");
2415 }
2416
2417 static void
2418 vfs_shutdown1(struct lwp *l)
2419 {
2420
2421         vfs_sync_all(l);
2422
2423         /*
2424          * If we've panic'd, don't make the situation potentially
2425          * worse by unmounting the file systems.
2426          */
2427         if (panicstr != NULL)
2428                 return;
2429
2430         /* Release inodes held by texts before update. */
2431 #ifdef notdef
2432         vnshutdown();
2433 #endif
2434         /* Unmount file systems. */
2435         vfs_unmountall(l);
2436 }
2437
2438 /*
2439  * Print a list of supported file system types (used by vfs_mountroot)
2440  */
2441 static void
2442 vfs_print_fstypes(void)
2443 {
2444         struct vfsops *v;
2445         int cnt = 0;
2446
2447         mutex_enter(&vfs_list_lock);
2448         LIST_FOREACH(v, &vfs_list, vfs_list)
2449                 ++cnt;
2450         mutex_exit(&vfs_list_lock);
2451
2452         if (cnt == 0) {
2453                 printf("WARNING: No file system modules have been loaded.\n");
2454                 return;
2455         }
2456
2457         printf("Supported file systems:");
2458         mutex_enter(&vfs_list_lock);
2459         LIST_FOREACH(v, &vfs_list, vfs_list) {
2460                 printf(" %s", v->vfs_name);
2461         }
2462         mutex_exit(&vfs_list_lock);
2463         printf("\n");
2464 }
2465
2466 /*
2467  * Mount the root file system.  If the operator didn't specify a
2468  * file system to use, try all possible file systems until one
2469  * succeeds.
      *
      * Panics if root_device is unknown or if rootdev is inconsistent with
      * the device class.  For a disk the root device vnode is obtained and
      * opened read-only first.
      *
      * Returns 0 on success, after rootvnode and the working directories of
      * cwdi0 and initproc have been set up.  Otherwise returns the
      * VOP_OPEN() error, ENODEV for an unsuitable device class, EFTYPE when
      * the requested type is unknown or has no mountroot hook or when no
      * configured file system could mount the root, or the error returned
      * by the requested type's mountroot hook.
2470  */
2471 int
2472 vfs_mountroot(void)
2473 {
2474         struct vfsops *v;
2475         int error = ENODEV;
2476
2477         if (root_device == NULL)
2478                 panic("vfs_mountroot: root device unknown");
2479
             /* Sanity-check rootdev against the class of the root device. */
2480         switch (device_class(root_device)) {
2481         case DV_IFNET:
2482                 if (rootdev != NODEV)
2483                         panic("vfs_mountroot: rootdev set for DV_IFNET "
2484                             "(0x%llx -> %llu,%llu)",
2485                             (unsigned long long)rootdev,
2486                             (unsigned long long)major(rootdev),
2487                             (unsigned long long)minor(rootdev));
2488                 break;
2489
2490         case DV_DISK:
2491                 if (rootdev == NODEV)
2492                         panic("vfs_mountroot: rootdev not set for DV_DISK");
2493                 if (bdevvp(rootdev, &rootvp))
2494                         panic("vfs_mountroot: can't get vnode for rootdev");
2495                 error = VOP_OPEN(rootvp, FREAD, FSCRED);
2496                 if (error) {
2497                         printf("vfs_mountroot: can't open root device\n");
2498                         return (error);
2499                 }
2500                 break;
2501
2502         case DV_VIRTUAL:
2503                 break;
2504
2505         default:
2506                 printf("%s: inappropriate for root file system\n",
2507                     device_xname(root_device));
2508                 return (ENODEV);
2509         }
2510
2511         /*
2512          * If user specified a root fs type, use it.  Make sure the
2513          * specified type exists and has a mount_root()
              * NOTE(review): the vfs_refcount decrement presumably drops a
              * reference taken by vfs_getopsbyname() -- confirm.
2514          */
2515         if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
2516                 v = vfs_getopsbyname(rootfstype);
2517                 error = EFTYPE;
2518                 if (v != NULL) {
2519                         if (v->vfs_mountroot != NULL) {
2520                                 error = (v->vfs_mountroot)();
2521                         }
2522                         v->vfs_refcount--;
2523                 }
2524                 goto done;
2525         }
2526
2527         /*
2528          * Try each file system currently configured into the kernel.
              * A reference is held on the entry while vfs_list_lock is
              * dropped around its mountroot hook.  The loop stops at the
              * first hook that succeeds; v ends up NULL if none did.
2529          */
2530         mutex_enter(&vfs_list_lock);
2531         LIST_FOREACH(v, &vfs_list, vfs_list) {
2532                 if (v->vfs_mountroot == NULL)
2533                         continue;
2534 #ifdef DEBUG
2535                 aprint_normal("mountroot: trying %s...\n", v->vfs_name);
2536 #endif
2537                 v->vfs_refcount++;
2538                 mutex_exit(&vfs_list_lock);
2539                 error = (*v->vfs_mountroot)();
2540                 mutex_enter(&vfs_list_lock);
2541                 v->vfs_refcount--;
2542                 if (!error) {
2543                         aprint_normal("root file system type: %s\n",
2544                             v->vfs_name);
2545                         break;
2546                 }
2547         }
2548         mutex_exit(&vfs_list_lock);
2549
             /* Nothing could mount the root: list what is available. */
2550         if (v == NULL) {
2551                 vfs_print_fstypes();
2552                 printf("no file system for %s", device_xname(root_device));
2553                 if (device_class(root_device) == DV_DISK)
2554                         printf(" (dev 0x%llx)", (unsigned long long)rootdev);
2555                 printf("\n");
2556                 error = EFTYPE;
2557         }
2558
2559 done:
             /* On failure, undo the VOP_OPEN() done for a disk root device. */
2560         if (error && device_class(root_device) == DV_DISK) {
2561                 VOP_CLOSE(rootvp, FREAD, FSCRED);
2562                 vrele(rootvp);
2563         }
2564         if (error == 0) {
2565                 extern struct cwdinfo cwdi0;
2566
                     /* The first entry on the mount list is the root mount. */
2567                 CIRCLEQ_FIRST(&mountlist)->mnt_flag |= MNT_ROOTFS;
2568                 CIRCLEQ_FIRST(&mountlist)->mnt_op->vfs_refcount++;
2569
2570                 /*
2571                  * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
2572                  * reference it.
2573                  */
2574                 error = VFS_ROOT(CIRCLEQ_FIRST(&mountlist), &rootvnode);
2575                 if (error)
2576                         panic("cannot find root vnode, error=%d", error);
2577                 cwdi0.cwdi_cdir = rootvnode;
2578                 vref(cwdi0.cwdi_cdir);
2579                 VOP_UNLOCK(rootvnode, 0);
2580                 cwdi0.cwdi_rdir = NULL;
2581
2582                 /*
2583                  * Now that root is mounted, we can fixup initproc's CWD
2584                  * info.  All other processes are kthreads, which merely
2585                  * share proc0's CWD info.
2586                  */
2587                 initproc->p_cwdi->cwdi_cdir = rootvnode;
2588                 vref(initproc->p_cwdi->cwdi_cdir);
2589                 initproc->p_cwdi->cwdi_rdir = NULL;
2590         }
2591         return (error);
2592 }
2593
2594 /*
2595  * Get a new unique fsid
2596  */
2597 void
2598 vfs_getnewfsid(struct mount *mp)
2599 {
2600         static u_short xxxfs_mntid;
2601         fsid_t tfsid;
2602         int mtype;
2603
2604         mutex_enter(&mntid_lock);
2605         mtype = makefstype(mp->mnt_op->vfs_name);
2606         mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
2607         mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
2608         mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
2609         if (xxxfs_mntid == 0)
2610                 ++xxxfs_mntid;
2611         tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
2612         tfsid.__fsid_val[1] = mtype;
2613         if (!CIRCLEQ_EMPTY(&mountlist)) {
2614                 while (vfs_getvfs(&tfsid)) {
2615                         tfsid.__fsid_val[0]++;
2616                         xxxfs_mntid++;
2617                 }
2618         }
2619         mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
2620         mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
2621         mutex_exit(&mntid_lock);
2622 }
2623
/*
 * Make a 'unique' number from a mount type name.
 *
 * The characters of the NUL-terminated string `type' are folded into
 * a number: for each character the accumulator is shifted left by two
 * bits and the character is XORed in.  Different names may collide;
 * the empty string yields 0.
 *
 * The folding is done in unsigned arithmetic.  Left-shifting a signed
 * long into or past its sign bit is undefined behaviour, which a long
 * enough type name would trigger; an unsigned shift simply discards
 * the high bits and produces the same bit pattern.
 */
long
makefstype(const char *type)
{
        unsigned long rv;

        for (rv = 0; *type != '\0'; type++) {
                rv <<= 2;
                /* Go through long so a negative char sign-extends as before. */
                rv ^= (unsigned long)(long)*type;
        }
        return (long)rv;
}
2638
2639 /*
2640  * Set vnode attributes to VNOVAL
2641  */
2642 void
2643 vattr_null(struct vattr *vap)
2644 {
2645
2646         memset(vap, 0, sizeof(*vap));
2647
2648         vap->va_type = VNON;
2649
2650         /*
2651          * Assign individually so that it is safe even if size and
2652          * sign of each member are varied.
2653          */
2654         vap->va_mode = VNOVAL;
2655         vap->va_nlink = VNOVAL;
2656         vap->va_uid = VNOVAL;
2657         vap->va_gid = VNOVAL;
2658         vap->va_fsid = VNOVAL;
2659         vap->va_fileid = VNOVAL;
2660         vap->va_size = VNOVAL;
2661         vap->va_blocksize = VNOVAL;
2662         vap->va_atime.tv_sec =
2663             vap->va_mtime.tv_sec =
2664             vap->va_ctime.tv_sec =
2665             vap->va_birthtime.tv_sec = VNOVAL;
2666         vap->va_atime.tv_nsec =
2667             vap->va_mtime.tv_nsec =
2668             vap->va_ctime.tv_nsec =
2669             vap->va_birthtime.tv_nsec = VNOVAL;
2670         vap->va_gen = VNOVAL;
2671         vap->va_flags = VNOVAL;
2672         vap->va_rdev = VNOVAL;
2673         vap->va_bytes = VNOVAL;
2674 }
2675
/*
 * ARRAY_SIZE(arr)
 *      Number of elements in `arr'.  Only meaningful for a true array,
 *      not for a pointer.
 *
 * ARRAY_PRINT(idx, arr)
 *      arr[idx] when idx is a valid index into `arr', otherwise the
 *      string "UNKNOWN".  idx is converted to unsigned int for the
 *      range check, so negative values are rejected as well.  Note
 *      that `idx' appears twice in the expansion.
 *
 * Every macro argument is parenthesised so that an argument which is
 * itself an expression expands as intended.
 */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#define ARRAY_PRINT(idx, arr) \
    ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
2679
/*
 * Tables used when printing vnodes: vnode_tags and vnode_types hold
 * the names printed for v_tag and v_type (looked up with ARRAY_PRINT),
 * and vnode_flagbits is the snprintb() format string used to decode
 * the combined vnode flag words.
 */
const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;
2683
2684 /*
2685  * Print out a description of a vnode.
2686  */
2687 void
2688 vprint(const char *label, struct vnode *vp)
2689 {
2690         struct vnlock *vl;
2691         char bf[96];
2692         int flag;
2693
2694         vl = (vp->v_vnlock != NULL ? vp->v_vnlock : &vp->v_lock);
2695         flag = vp->v_iflag | vp->v_vflag | vp->v_uflag;
2696         snprintb(bf, sizeof(bf), vnode_flagbits, flag);
2697
2698         if (label != NULL)
2699                 printf("%s: ", label);
2700         printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), "
2701             "usecount %d, writecount %d, holdcount %d\n"
2702             "\tfreelisthd %p, mount %p, data %p lock %p recursecnt %d\n",
2703             vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
2704             ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
2705             vp->v_usecount, vp->v_writecount, vp->v_holdcnt,
2706             vp->v_freelisthd, vp->v_mount, vp->v_data, vl, vl->vl_recursecnt);
2707         if (vp->v_data != NULL) {
2708                 printf("\t");
2709                 VOP_PRINT(vp);
2710         }
2711 }
2712
#ifdef DEBUG
/*
 * Debugging aid: walk every mount on the mount list and vprint() each
 * vnode for which VOP_ISLOCKED() returns non-zero.
 *
 * Mounts that cannot be marked busy are skipped.
 * NOTE(review): vfs_busy() appears to release mountlist_lock on
 * success, since the lock is re-taken before vfs_unbusy() -- confirm
 * against vfs_busy().
 */
void
printlockedvnodes(void)
{
        struct mount *mp, *next;
        struct vnode *vp;

        printf("Locked vnodes\n");

        mutex_enter(&mountlist_lock);
        mp = CIRCLEQ_FIRST(&mountlist);
        while (mp != (void *)&mountlist) {
                if (vfs_busy(mp, &next) == 0) {
                        TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
                                if (VOP_ISLOCKED(vp))
                                        vprint(NULL, vp);
                        }
                        mutex_enter(&mountlist_lock);
                        vfs_unbusy(mp, false, &next);
                }
                /* `next' was filled in by vfs_busy() or vfs_unbusy(). */
                mp = next;
        }
        mutex_exit(&mountlist_lock);
}
#endif
2741
2742 /* Deprecated. Kept for KPI compatibility. */
2743 int
2744 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
2745     mode_t acc_mode, kauth_cred_t cred)
2746 {
2747
2748 #ifdef DIAGNOSTIC
2749         printf("vaccess: deprecated interface used.\n");
2750 #endif /* DIAGNOSTIC */
2751
2752         return genfs_can_access(type, file_mode, uid, gid, acc_mode, cred);
2753 }
2754
2755 /*
2756  * Given a file system name, look up the vfsops for that
2757  * file system, or return NULL if file system isn't present
2758  * in the kernel.
2759  */
2760 struct vfsops *
2761 vfs_getopsbyname(const char *name)
2762 {
2763         struct vfsops *v;
2764
2765         mutex_enter(&vfs_list_lock);
2766         LIST_FOREACH(v, &vfs_list, vfs_list) {
2767                 if (strcmp(v->vfs_name, name) == 0)
2768                         break;
2769         }
2770         if (v != NULL)
2771                 v->vfs_refcount++;
2772         mutex_exit(&vfs_list_lock);
2773
2774         return (v);
2775 }
2776
2777 void
2778 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
2779 {
2780         const struct statvfs *mbp;
2781
2782         if (sbp == (mbp = &mp->mnt_stat))
2783                 return;
2784
2785         (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
2786         sbp->f_fsid = mbp->f_fsid;
2787         sbp->f_owner = mbp->f_owner;
2788         sbp->f_flag = mbp->f_flag;
2789         sbp->f_syncwrites = mbp->f_syncwrites;
2790         sbp->f_asyncwrites = mbp->f_asyncwrites;
2791         sbp->f_syncreads = mbp->f_syncreads;
2792         sbp->f_asyncreads = mbp->f_asyncreads;
2793         (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
2794         (void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
2795             sizeof(sbp->f_fstypename));
2796         (void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
2797             sizeof(sbp->f_mntonname));
2798         (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
2799             sizeof(sbp->f_mntfromname));
2800         sbp->f_namemax = mbp->f_namemax;
2801 }
2802
/*
 * Fill in the name fields of a mount's statvfs (mp->mnt_stat).
 *
 * onp          mount point path, or NULL to leave f_mntonname alone.
 * ukon         UIO_SYSSPACE if `onp' is to be copied with copystr(),
 *              anything else to use copyinstr().
 * fromp        "mounted from" name, or NULL to leave f_mntfromname
 *              alone.
 * ukfrom       as `ukon', but for `fromp'.
 * vfsname      file system type name, copied into f_fstypename.
 * mp           the mount whose mnt_stat is updated.
 * l            lwp whose process' cwdinfo is consulted for a root
 *              directory.
 *
 * Returns 0 on success, or the error returned by getcwd_common() or
 * by the string copy routine.  On error the name buffer being filled
 * may have been partially written.
 */
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    const char *vfsname, struct mount *mp, struct lwp *l)
{
        int error;
        size_t size;
        struct statvfs *sfs = &mp->mnt_stat;
        int (*fun)(const void *, void *, size_t, size_t *);

        (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
            sizeof(mp->mnt_stat.f_fstypename));

        if (onp) {
                struct cwdinfo *cwdi = l->l_proc->p_cwdi;
                fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
                if (cwdi->cwdi_rdir != NULL) {
                        /*
                         * The process has a root directory set: obtain
                         * a path for it from getcwd_common() and put
                         * that in front of `onp'.
                         * NOTE(review): presumably this is the chroot
                         * path relative to rootvnode -- confirm.
                         */
                        size_t len;
                        char *bp;
                        char *path = PNBUF_GET();

                        /* bp starts at the NUL placed at the buffer's end. */
                        bp = path + MAXPATHLEN;
                        *--bp = '\0';
                        rw_enter(&cwdi->cwdi_lock, RW_READER);
                        error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
                            path, MAXPATHLEN / 2, 0, l);
                        rw_exit(&cwdi->cwdi_lock);
                        if (error) {
                                PNBUF_PUT(path);
                                return error;
                        }

                        /*
                         * Copy the prefix, truncated so that one byte
                         * always remains for a terminator.  strncpy()
                         * does not terminate here; the memset() below
                         * zero-fills the tail.
                         */
                        len = strlen(bp);
                        if (len > sizeof(sfs->f_mntonname) - 1)
                                len = sizeof(sfs->f_mntonname) - 1;
                        (void)strncpy(sfs->f_mntonname, bp, len);
                        PNBUF_PUT(path);

                        if (len < sizeof(sfs->f_mntonname) - 1) {
                                /* Room left: append `onp' after the prefix. */
                                error = (*fun)(onp, &sfs->f_mntonname[len],
                                    sizeof(sfs->f_mntonname) - len - 1, &size);
                                if (error)
                                        return error;
                                size += len;
                        } else {
                                /* The prefix alone filled the buffer. */
                                size = len;
                        }
                } else {
                        error = (*fun)(onp, &sfs->f_mntonname,
                            sizeof(sfs->f_mntonname) - 1, &size);
                        if (error)
                                return error;
                }
                /* Zero everything after the `size' bytes written above. */
                (void)memset(sfs->f_mntonname + size, 0,
                    sizeof(sfs->f_mntonname) - size);
        }

        if (fromp) {
                fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
                error = (*fun)(fromp, sfs->f_mntfromname,
                    sizeof(sfs->f_mntfromname) - 1, &size);
                if (error)
                        return error;
                /* Zero everything after the `size' bytes written above. */
                (void)memset(sfs->f_mntfromname + size, 0,
                    sizeof(sfs->f_mntfromname) - size);
        }
        return 0;
}
2870
/*
 * Fill in *ts by calling nanotime().
 */
void
vfs_timestamp(struct timespec *ts)
{

        nanotime(ts);
}
2877
/* Recorded root file system time, if known. */
time_t rootfstime;

/*
 * Record `t' as the root file system time.
 */
void
setrootfstime(time_t t)
{

        rootfstime = t;
}
2884
2885 /*
2886  * Sham lock manager for vnodes.  This is a temporary measure.
2887  */
2888 int
2889 vlockmgr(struct vnlock *vl, int flags)
2890 {
2891
2892         KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
2893
2894         switch (flags & LK_TYPE_MASK) {
2895         case LK_SHARED:
2896                 if (rw_tryenter(&vl->vl_lock, RW_READER)) {
2897                         return 0;
2898                 }
2899                 if ((flags & LK_NOWAIT) != 0) {
2900                         return EBUSY;
2901                 }
2902                 rw_enter(&vl->vl_lock, RW_READER);
2903                 return 0;
2904
2905         case LK_EXCLUSIVE:
2906                 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
2907                         return 0;
2908                 }
2909                 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
2910                     rw_write_held(&vl->vl_lock)) {
2911                         vl->vl_recursecnt++;
2912                         return 0;
2913                 }
2914                 if ((flags & LK_NOWAIT) != 0) {
2915                         return EBUSY;
2916                 }
2917                 rw_enter(&vl->vl_lock, RW_WRITER);
2918                 return 0;
2919
2920         case LK_RELEASE:
2921                 if (vl->vl_recursecnt != 0) {
2922                         KASSERT(rw_write_held(&vl->vl_lock));
2923                         vl->vl_recursecnt--;
2924                         return 0;
2925                 }
2926                 rw_exit(&vl->vl_lock);
2927                 return 0;
2928
2929         default:
2930                 panic("vlockmgr: flags %x", flags);
2931         }
2932 }
2933
2934 int
2935 vlockstatus(struct vnlock *vl)
2936 {
2937
2938         if (rw_write_held(&vl->vl_lock)) {
2939                 return LK_EXCLUSIVE;
2940         }
2941         if (rw_read_held(&vl->vl_lock)) {
2942                 return LK_SHARED;
2943         }
2944         return 0;
2945 }
2946
2947 /*
2948  * mount_specific_key_create --
2949  *      Create a key for subsystem mount-specific data.
2950  */
2951 int
2952 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
2953 {
2954
2955         return (specificdata_key_create(mount_specificdata_domain, keyp, dtor));
2956 }
2957
2958 /*
2959  * mount_specific_key_delete --
2960  *      Delete a key for subsystem mount-specific data.
2961  */
2962 void
2963 mount_specific_key_delete(specificdata_key_t key)
2964 {
2965
2966         specificdata_key_delete(mount_specificdata_domain, key);
2967 }
2968
2969 /*
2970  * mount_initspecific --
2971  *      Initialize a mount's specificdata container.
2972  */
2973 void
2974 mount_initspecific(struct mount *mp)
2975 {
2976         int error;
2977
2978         error = specificdata_init(mount_specificdata_domain,
2979                                   &mp->mnt_specdataref);
2980         KASSERT(error == 0);
2981 }
2982
2983 /*
2984  * mount_finispecific --
2985  *      Finalize a mount's specificdata container.
2986  */
2987 void
2988 mount_finispecific(struct mount *mp)
2989 {
2990
2991         specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
2992 }
2993
2994 /*
2995  * mount_getspecific --
2996  *      Return mount-specific data corresponding to the specified key.
2997  */
2998 void *
2999 mount_getspecific(struct mount *mp, specificdata_key_t key)
3000 {
3001
3002         return (specificdata_getspecific(mount_specificdata_domain,
3003                                          &mp->mnt_specdataref, key));
3004 }
3005
3006 /*
3007  * mount_setspecific --
3008  *      Set mount-specific data corresponding to the specified key.
3009  */
3010 void
3011 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
3012 {
3013
3014         specificdata_setspecific(mount_specificdata_domain,
3015                                  &mp->mnt_specdataref, key, data);
3016 }
3017
3018 int
3019 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
3020 {
3021         int error;
3022
3023         KERNEL_LOCK(1, NULL);
3024         error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
3025         KERNEL_UNLOCK_ONE(NULL);
3026
3027         return error;
3028 }
3029
3030 int
3031 VFS_START(struct mount *mp, int a)
3032 {
3033         int error;
3034
3035         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3036                 KERNEL_LOCK(1, NULL);
3037         }
3038         error = (*(mp->mnt_op->vfs_start))(mp, a);
3039         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3040                 KERNEL_UNLOCK_ONE(NULL);
3041         }
3042
3043         return error;
3044 }
3045
3046 int
3047 VFS_UNMOUNT(struct mount *mp, int a)
3048 {
3049         int error;
3050
3051         KERNEL_LOCK(1, NULL);
3052         error = (*(mp->mnt_op->vfs_unmount))(mp, a);
3053         KERNEL_UNLOCK_ONE(NULL);
3054
3055         return error;
3056 }
3057
3058 int
3059 VFS_ROOT(struct mount *mp, struct vnode **a)
3060 {
3061         int error;
3062
3063         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3064                 KERNEL_LOCK(1, NULL);
3065         }
3066         error = (*(mp->mnt_op->vfs_root))(mp, a);
3067         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3068                 KERNEL_UNLOCK_ONE(NULL);
3069         }
3070
3071         return error;
3072 }
3073
3074 int
3075 VFS_QUOTACTL(struct mount *mp, int a, uid_t b, void *c)
3076 {
3077         int error;
3078
3079         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3080                 KERNEL_LOCK(1, NULL);
3081         }
3082         error = (*(mp->mnt_op->vfs_quotactl))(mp, a, b, c);
3083         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3084                 KERNEL_UNLOCK_ONE(NULL);
3085         }
3086
3087         return error;
3088 }
3089
3090 int
3091 VFS_STATVFS(struct mount *mp, struct statvfs *a)
3092 {
3093         int error;
3094
3095         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3096                 KERNEL_LOCK(1, NULL);
3097         }
3098         error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
3099         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3100                 KERNEL_UNLOCK_ONE(NULL);
3101         }
3102
3103         return error;
3104 }
3105
3106 int
3107 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
3108 {
3109         int error;
3110
3111         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3112                 KERNEL_LOCK(1, NULL);
3113         }
3114         error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
3115         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3116                 KERNEL_UNLOCK_ONE(NULL);
3117         }
3118
3119         return error;
3120 }
3121
3122 int
3123 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b)
3124 {
3125         int error;
3126
3127         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3128                 KERNEL_LOCK(1, NULL);
3129         }
3130         error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b);
3131         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3132                 KERNEL_UNLOCK_ONE(NULL);
3133         }
3134
3135         return error;
3136 }
3137
3138 int
3139 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
3140 {
3141         int error;
3142
3143         if ((vp->v_vflag & VV_MPSAFE) == 0) {
3144                 KERNEL_LOCK(1, NULL);
3145         }
3146         error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
3147         if ((vp->v_vflag & VV_MPSAFE) == 0) {
3148                 KERNEL_UNLOCK_ONE(NULL);
3149         }
3150
3151         return error;
3152 }
3153
3154 int
3155 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
3156 {
3157         int error;
3158
3159         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3160                 KERNEL_LOCK(1, NULL);
3161         }
3162         error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
3163         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3164                 KERNEL_UNLOCK_ONE(NULL);
3165         }
3166
3167         return error;
3168 }
3169
3170 int
3171 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
3172 {
3173         int error;
3174
3175         KERNEL_LOCK(1, NULL);           /* XXXSMP check ffs */
3176         error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
3177         KERNEL_UNLOCK_ONE(NULL);        /* XXX */
3178
3179         return error;
3180 }
3181
3182 int
3183 VFS_SUSPENDCTL(struct mount *mp, int a)
3184 {
3185         int error;
3186
3187         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3188                 KERNEL_LOCK(1, NULL);
3189         }
3190         error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
3191         if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3192                 KERNEL_UNLOCK_ONE(NULL);
3193         }
3194
3195         return error;
3196 }
3197
3198 #if defined(DDB) || defined(DEBUGPRINT)
3199 static const char buf_flagbits[] = BUF_FLAGBITS;
3200
3201 void
3202 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
3203 {
3204         char bf[1024];
3205
3206         (*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
3207             PRIx64 " dev 0x%x\n",
3208             bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
3209
3210         snprintb(bf, sizeof(bf),
3211             buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
3212         (*pr)("  error %d flags 0x%s\n", bp->b_error, bf);
3213
3214         (*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
3215                   bp->b_bufsize, bp->b_bcount, bp->b_resid);
3216         (*pr)("  data %p saveaddr %p\n",
3217                   bp->b_data, bp->b_saveaddr);
3218         (*pr)("  iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
3219 }
3220
3221
3222 void
3223 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
3224 {
3225         char bf[256];
3226
3227         uvm_object_printit(&vp->v_uobj, full, pr);
3228         snprintb(bf, sizeof(bf),
3229             vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
3230         (*pr)("\nVNODE flags %s\n", bf);
3231         (*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n",
3232               vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize);
3233
3234         (*pr)("data %p writecount %ld holdcnt %ld\n",
3235               vp->v_data, vp->v_writecount, vp->v_holdcnt);
3236
3237         (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
3238               ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
3239               ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
3240               vp->v_mount, vp->v_mountedhere);
3241
3242         (*pr)("v_lock %p v_vnlock %p\n", &vp->v_lock, vp->v_vnlock);
3243
3244         if (full) {
3245                 struct buf *bp;
3246
3247                 (*pr)("clean bufs:\n");
3248                 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
3249                         (*pr)(" bp %p\n", bp);
3250                         vfs_buf_print(bp, full, pr);
3251                 }
3252
3253                 (*pr)("dirty bufs:\n");
3254                 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
3255                         (*pr)(" bp %p\n", bp);
3256                         vfs_buf_print(bp, full, pr);
3257                 }
3258         }
3259 }
3260
3261 void
3262 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
3263 {
3264         char sbuf[256];
3265
3266         (*pr)("vnodecovered = %p syncer = %p data = %p\n",
3267                         mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data);
3268
3269         (*pr)("fs_bshift %d dev_bshift = %d\n",
3270                         mp->mnt_fs_bshift,mp->mnt_dev_bshift);
3271
3272         snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
3273         (*pr)("flag = %s\n", sbuf);
3274
3275         snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
3276         (*pr)("iflag = %s\n", sbuf);
3277
3278         (*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt,
3279             &mp->mnt_unmounting, &mp->mnt_updating);
3280
3281         (*pr)("statvfs cache:\n");
3282         (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
3283         (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
3284         (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
3285
3286         (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
3287         (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
3288         (*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
3289         (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);
3290
3291         (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
3292         (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
3293         (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
3294         (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);
3295
3296         (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
3297                         mp->mnt_stat.f_fsidx.__fsid_val[0],
3298                         mp->mnt_stat.f_fsidx.__fsid_val[1]);
3299
3300         (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
3301         (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
3302
3303         snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
3304
3305         (*pr)("\tflag = %s\n",sbuf);
3306         (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
3307         (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
3308         (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
3309         (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
3310         (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
3311         (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
3312         (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
3313
3314         {
3315                 int cnt = 0;
3316                 struct vnode *vp;
3317                 (*pr)("locked vnodes =");
3318                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3319                         if (VOP_ISLOCKED(vp)) {
3320                                 if ((++cnt % 6) == 0) {
3321                                         (*pr)(" %p,\n\t", vp);
3322                                 } else {
3323                                         (*pr)(" %p,", vp);
3324                                 }
3325                         }
3326                 }
3327                 (*pr)("\n");
3328         }
3329
3330         if (full) {
3331                 int cnt = 0;
3332                 struct vnode *vp;
3333                 (*pr)("all vnodes =");
3334                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3335                         if (!TAILQ_NEXT(vp, v_mntvnodes)) {
3336                                 (*pr)(" %p", vp);
3337                         } else if ((++cnt % 6) == 0) {
3338                                 (*pr)(" %p,\n\t", vp);
3339                         } else {
3340                                 (*pr)(" %p,", vp);
3341                         }
3342                 }
3343                 (*pr)("\n", vp);
3344         }
3345 }
3346 #endif /* DDB || DEBUGPRINT */
3347
3348 /*
3349  * Check if a device pointed to by vp is mounted.
3350  *
3351  * Returns:
3352  *   EINVAL     if it's not a disk
3353  *   EBUSY      if it's a disk and mounted
3354  *   0          if it's a disk and not mounted
3355  */
3356 int
3357 rawdev_mounted(struct vnode *vp, struct vnode **bvpp)
3358 {
3359         struct vnode *bvp;
3360         dev_t dev;
3361         int d_type;
3362
3363         bvp = NULL;
3364         dev = vp->v_rdev;
3365         d_type = D_OTHER;
3366
3367         if (iskmemvp(vp))
3368                 return EINVAL;
3369
3370         switch (vp->v_type) {
3371         case VCHR: {
3372                 const struct cdevsw *cdev;
3373
3374                 cdev = cdevsw_lookup(dev);
3375                 if (cdev != NULL) {
3376                         dev_t blkdev;
3377
3378                         blkdev = devsw_chr2blk(dev);
3379                         if (blkdev != NODEV) {
3380                                 vfinddev(blkdev, VBLK, &bvp);
3381                                 if (bvp != NULL)
3382                                         d_type = (cdev->d_flag & D_TYPEMASK);
3383                         }
3384                 }
3385
3386                 break;
3387                 }
3388
3389         case VBLK: {
3390                 const struct bdevsw *bdev;
3391
3392                 bdev = bdevsw_lookup(dev);
3393                 if (bdev != NULL)
3394                         d_type = (bdev->d_flag & D_TYPEMASK);
3395
3396                 bvp = vp;
3397
3398                 break;
3399                 }
3400
3401         default:
3402                 break;
3403         }
3404
3405         if (d_type != D_DISK)
3406                 return EINVAL;
3407
3408         if (bvpp != NULL)
3409                 *bvpp = bvp;
3410
3411         /*
3412          * XXX: This is bogus. We should be failing the request
3413          * XXX: not only if this specific slice is mounted, but
3414          * XXX: if it's on a disk with any other mounted slice.
3415          */
3416         if (vfs_mountedon(bvp))
3417                 return EBUSY;
3418
3419         return 0;
3420 }