usr/src/uts/common/fs/vfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25  * Copyright 2016 Toomas Soome <tsoome@me.com>
  26  * Copyright (c) 2016 by Delphix. All rights reserved.
  27  * Copyright 2016 Nexenta Systems, Inc.
  28  * Copyright 2017 RackTop Systems.
  29  */
  30
  31 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  32 /*        All Rights Reserved   */
  33
  34 /*
  35  * University Copyright- Copyright (c) 1982, 1986, 1988
  36  * The Regents of the University of California
  37  * All Rights Reserved
  38  *
  39  * University Acknowledgment- Portions of this document are derived from
  40  * software developed by the University of California, Berkeley, and its
  41  * contributors.
  42  */
  43
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/errno.h>
  48 #include <sys/user.h>
  49 #include <sys/fstyp.h>
  50 #include <sys/kmem.h>
  51 #include <sys/systm.h>
  52 #include <sys/proc.h>
  53 #include <sys/mount.h>
  54 #include <sys/vfs.h>
  55 #include <sys/vfs_opreg.h>
  56 #include <sys/fem.h>
  57 #include <sys/mntent.h>
  58 #include <sys/stat.h>
  59 #include <sys/statvfs.h>
  60 #include <sys/statfs.h>
  61 #include <sys/cred.h>
  62 #include <sys/vnode.h>
  63 #include <sys/rwstlock.h>
  64 #include <sys/dnlc.h>
  65 #include <sys/file.h>
  66 #include <sys/time.h>
  67 #include <sys/atomic.h>
  68 #include <sys/cmn_err.h>
  69 #include <sys/buf.h>
  70 #include <sys/swap.h>
  71 #include <sys/debug.h>
  72 #include <sys/vnode.h>
  73 #include <sys/modctl.h>
  74 #include <sys/ddi.h>
  75 #include <sys/pathname.h>
  76 #include <sys/bootconf.h>
  77 #include <sys/dumphdr.h>
  78 #include <sys/dc_ki.h>
  79 #include <sys/poll.h>
  80 #include <sys/sunddi.h>
  81 #include <sys/sysmacros.h>
  82 #include <sys/zone.h>
  83 #include <sys/policy.h>
  84 #include <sys/ctfs.h>
  85 #include <sys/objfs.h>
  86 #include <sys/console.h>
  87 #include <sys/reboot.h>
  88 #include <sys/attr.h>
  89 #include <sys/zio.h>
  90 #include <sys/spa.h>
  91 #include <sys/lofi.h>
  92 #include <sys/bootprops.h>
  93
  94 #include <vm/page.h>
  95
  96 #include <fs/fs_subr.h>
/*
 * Private interfaces to create vopstats-related data structures.
 * They are only declared here (extern); in the visible part of this file
 * they are used by vfs_mountroot() when statistics collection is enabled
 * for the root file system.
 */
extern void		initialize_vopstats(vopstats_t *);
extern vopstats_t	*get_fstype_vopstats(struct vfs *, struct vfssw *);
extern vsk_anchor_t	*get_vskstat_anchor(struct vfs *);
 101
/*
 * Forward declarations of helpers that are private to this file.
 * NOTE(review): the "_nolock" suffix presumably marks variants that expect
 * the caller to already hold the relevant lock -- confirm at the
 * definitions, which are not all visible from here.
 */
static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
static void vfs_setmntopt_nolock(mntopts_t *, const char *,
    const char *, int, int);
static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
static void vfs_freemnttab(struct vfs *);
static void vfs_freeopt(mntopt_t *);
static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
static void vfs_createopttbl_extend(mntopts_t *, const char *,
    const mntopts_t *);
static char **vfs_copycancelopt_extend(char **const, int);
static void vfs_freecancelopt(char **);
static void getrootfs(char **, char **);
static int getmacpath(dev_info_t *, void *);
static void vfs_mnttabvp_setup(void);
 118
/*
 * Singly linked list node that pairs a device number with a vfs.
 * NOTE(review): "ipmnt"/"mip" presumably stands for "in-progress mount";
 * the code that builds and walks the list is not visible here -- confirm.
 */
struct ipmnt {
	struct ipmnt	*mip_next;	/* next node in the list */
	dev_t		mip_dev;	/* device number of this entry */
	struct vfs	*mip_vfsp;	/* vfs recorded for mip_dev */
};
 124
/*
 * First and last nodes of the ipmnt list; both start out NULL (empty list).
 * NOTE(review): vfs_miplist_mutex presumably serializes access to the
 * list -- confirm at the use sites, which are not visible here.
 */
static kmutex_t		vfs_miplist_mutex;
static struct ipmnt	*vfs_miplist = NULL;
static struct ipmnt	*vfs_miplist_end = NULL;

static kmem_cache_t *vfs_cache;	/* Pointer to VFS kmem cache */
 130
/*
 * VFS global data.
 */
vnode_t *rootdir;		/* pointer to root inode vnode. */
vnode_t *devicesdir;		/* pointer to inode of devices root */
vnode_t *devdir;		/* pointer to inode of dev root */

char *server_rootpath;		/* root path for diskless clients */
char *server_hostname;		/* hostname of diskless server */

/*
 * Statically allocated vfs structures.  "root" backs rootvfs; "devices"
 * and "dev" are initialized and mounted by vfs_mountdevices() and
 * vfs_mountdev1() respectively.
 */
static struct vfs root;
static struct vfs devices;
static struct vfs dev;
struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
				/* (allocated in vfs_mountroot()) */
int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
				/* must be power of 2!  */
timespec_t vfs_mnttab_ctime;	/* mnttab created time */
timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
char *vfs_dummyfstype = "\0";	/* empty string used as a placeholder name */
struct pollhead vfs_pollhd;	/* for mnttab pollers */
struct vnode *vfs_mntdummyvp;	/* to fake mnttab read/write for file events */
int	mntfstype;		/* will be set once mnt fs is mounted */
 154
 155 /*
 156  * Table for generic options recognized in the VFS layer and acted
 157  * on at this level before parsing file system specific options.
 158  * The nosuid option is stronger than any of the devices and setuid
 159  * options, so those are canceled when nosuid is seen.
 160  *
 161  * All options which are added here need to be added to the
 162  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 163  */
/*
 * VFS Mount options table
 *
 * Each *_cancel array below is a NULL-terminated list of option names.
 * The arrays are referenced from the "cancel options" column of the
 * mntopts[] table: X_cancel lists the options that are canceled when
 * option X is seen (e.g. "ro" cancels "rw", and "nosuid" cancels all of
 * the suid/devices/setuid variants).
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 180
/*
 * Generic mount options understood by the VFS layer.  Initializers are
 * positional; the first four follow the column header below.
 * NOTE(review): every row carries a fifth initializer, (void *)0, that the
 * column header does not name -- presumably a per-option private data
 * pointer; confirm against the definition of mntopt_t.
 */
static const mntopt_t mntopts[] = {
/*
 *	option name		cancel options		default arg	flags
 */
	/* "remount" cancels nothing and is flagged MO_NODISPLAY */
	{ MNTOPT_REMOUNT,	NULL,			NULL,
		MO_NODISPLAY, (void *)0 },
	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
		(void *)0 },
};
 212
/*
 * Exported descriptor for the generic option table: the number of entries
 * in mntopts[] followed by a pointer to its first entry.  The cast drops
 * the const qualifier of mntopts[] to match the pointer type in mntopts_t.
 */
const mntopts_t vfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	(mntopt_t *)&mntopts[0]
};
 217
 218 /*
 219  * File system operation dispatch functions.
 220  */
 221
 222 int
 223 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 224 {
 225         return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 226 }
 227
 228 int
 229 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 230 {
 231         return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
 232 }
 233
 234 int
 235 fsop_root(vfs_t *vfsp, vnode_t **vpp)
 236 {
 237         refstr_t *mntpt;
 238         int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
 239         /*
 240          * Make sure this root has a path.  With lofs, it is possible to have
 241          * a NULL mountpoint.
 242          */
 243         if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
 244                 mntpt = vfs_getmntpoint(vfsp);
 245                 vn_setpath_str(*vpp, refstr_value(mntpt),
 246                     strlen(refstr_value(mntpt)));
 247                 refstr_rele(mntpt);
 248         }
 249
 250         return (ret);
 251 }
 252
 253 int
 254 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 255 {
 256         return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
 257 }
 258
 259 int
 260 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 261 {
 262         return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
 263 }
 264
 265 int
 266 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 267 {
 268         /*
 269          * In order to handle system attribute fids in a manner
 270          * transparent to the underlying fs, we embed the fid for
 271          * the sysattr parent object in the sysattr fid and tack on
 272          * some extra bytes that only the sysattr layer knows about.
 273          *
 274          * This guarantees that sysattr fids are larger than other fids
 275          * for this vfs. If the vfs supports the sysattr view interface
 276          * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 277          * collision with XATTR_FIDSZ.
 278          */
 279         if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 280             fidp->fid_len == XATTR_FIDSZ)
 281                 return (xattr_dir_vget(vfsp, vpp, fidp));
 282
 283         return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
 284 }
 285
 286 int
 287 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 288 {
 289         return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
 290 }
 291
 292 void
 293 fsop_freefs(vfs_t *vfsp)
 294 {
 295         (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
 296 }
 297
 298 int
 299 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 300 {
 301         return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
 302 }
 303
 304 int
 305 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 306 {
 307         ASSERT((fstype >= 0) && (fstype < nfstype));
 308
 309         if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
 310                 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
 311         else
 312                 return (ENOTSUP);
 313 }
 314
 315 /*
 316  * File system initialization.  vfs_setfsops() must be called from a file
 317  * system's init routine.
 318  */
 319
 320 static int
 321 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
 322     int *unused_ops)
 323 {
 324         static const fs_operation_trans_def_t vfs_ops_table[] = {
 325                 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
 326                         fs_nosys, fs_nosys,
 327
 328                 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
 329                         fs_nosys, fs_nosys,
 330
 331                 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
 332                         fs_nosys, fs_nosys,
 333
 334                 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
 335                         fs_nosys, fs_nosys,
 336
 337                 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
 338                         (fs_generic_func_p) fs_sync,
 339                         (fs_generic_func_p) fs_sync,    /* No errors allowed */
 340
 341                 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
 342                         fs_nosys, fs_nosys,
 343
 344                 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
 345                         fs_nosys, fs_nosys,
 346
 347                 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
 348                         (fs_generic_func_p)fs_freevfs,
 349                         (fs_generic_func_p)fs_freevfs,  /* Shouldn't fail */
 350
 351                 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
 352                         (fs_generic_func_p)fs_nosys,
 353                         (fs_generic_func_p)fs_nosys,
 354
 355                 NULL, 0, NULL, NULL
 356         };
 357
 358         return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
 359 }
 360
 361 void
 362 zfs_boot_init(void)
 363 {
 364         if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 365                 spa_boot_init();
 366 }
 367
 368 int
 369 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
 370 {
 371         int error;
 372         int unused_ops;
 373
 374         /*
 375          * Verify that fstype refers to a valid fs.  Note that
 376          * 0 is valid since it's used to set "stray" ops.
 377          */
 378         if ((fstype < 0) || (fstype >= nfstype))
 379                 return (EINVAL);
 380
 381         if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 382                 return (EINVAL);
 383
 384         /* Set up the operations vector. */
 385
 386         error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
 387
 388         if (error != 0)
 389                 return (error);
 390
 391         vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 392
 393         if (actual != NULL)
 394                 *actual = &vfssw[fstype].vsw_vfsops;
 395
 396 #if DEBUG
 397         if (unused_ops != 0)
 398                 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
 399                     "but not used", vfssw[fstype].vsw_name, unused_ops);
 400 #endif
 401
 402         return (0);
 403 }
 404
 405 int
 406 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
 407 {
 408         int error;
 409         int unused_ops;
 410
 411         *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
 412
 413         error = fs_copyfsops(template, *actual, &unused_ops);
 414         if (error != 0) {
 415                 kmem_free(*actual, sizeof (vfsops_t));
 416                 *actual = NULL;
 417                 return (error);
 418         }
 419
 420         return (0);
 421 }
 422
 423 /*
 424  * Free a vfsops structure created as a result of vfs_makefsops().
 425  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
 426  * vfs_freevfsops_by_type().
 427  */
 428 void
 429 vfs_freevfsops(vfsops_t *vfsops)
 430 {
 431         kmem_free(vfsops, sizeof (vfsops_t));
 432 }
 433
 434 /*
 435  * Since the vfsops structure is part of the vfssw table and wasn't
 436  * really allocated, we're not really freeing anything.  We keep
 437  * the name for consistency with vfs_freevfsops().  We do, however,
 438  * need to take care of a little bookkeeping.
 439  * NOTE: For a vfsops structure created by vfs_setfsops(), use
 440  * vfs_freevfsops_by_type().
 441  */
 442 int
 443 vfs_freevfsops_by_type(int fstype)
 444 {
 445
 446         /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 447         if ((fstype <= 0) || (fstype >= nfstype))
 448                 return (EINVAL);
 449
 450         WLOCK_VFSSW();
 451         if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 452                 WUNLOCK_VFSSW();
 453                 return (EINVAL);
 454         }
 455
 456         vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 457         WUNLOCK_VFSSW();
 458
 459         return (0);
 460 }
 461
 462 /* Support routines used to reference vfs_op */
 463
 464 /* Set the operations vector for a vfs */
 465 void
 466 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
 467 {
 468         vfsops_t        *op;
 469
 470         ASSERT(vfsp != NULL);
 471         ASSERT(vfsops != NULL);
 472
 473         op = vfsp->vfs_op;
 474         membar_consumer();
 475         if (vfsp->vfs_femhead == NULL &&
 476             atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
 477                 return;
 478         }
 479         fsem_setvfsops(vfsp, vfsops);
 480 }
 481
 482 /* Retrieve the operations vector for a vfs */
 483 vfsops_t *
 484 vfs_getops(vfs_t *vfsp)
 485 {
 486         vfsops_t        *op;
 487
 488         ASSERT(vfsp != NULL);
 489
 490         op = vfsp->vfs_op;
 491         membar_consumer();
 492         if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
 493                 return (op);
 494         } else {
 495                 return (fsem_getvfsops(vfsp));
 496         }
 497 }
 498
 499 /*
 500  * Returns non-zero (1) if the vfsops matches that of the vfs.
 501  * Returns zero (0) if not.
 502  */
 503 int
 504 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
 505 {
 506         return (vfs_getops(vfsp) == vfsops);
 507 }
 508
 509 /*
 510  * Returns non-zero (1) if the file system has installed a non-default,
 511  * non-error vfs_sync routine.  Returns zero (0) otherwise.
 512  */
 513 int
 514 vfs_can_sync(vfs_t *vfsp)
 515 {
 516         /* vfs_sync() routine is not the default/error function */
 517         return (vfs_getops(vfsp)->vfs_sync != fs_sync);
 518 }
 519
 520 /*
 521  * Initialize a vfs structure.
 522  */
 523 void
 524 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
 525 {
 526         /* Other initialization has been moved to vfs_alloc() */
 527         vfsp->vfs_count = 0;
 528         vfsp->vfs_next = vfsp;
 529         vfsp->vfs_prev = vfsp;
 530         vfsp->vfs_zone_next = vfsp;
 531         vfsp->vfs_zone_prev = vfsp;
 532         vfsp->vfs_lofi_id = 0;
 533         sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 534         vfsimpl_setup(vfsp);
 535         vfsp->vfs_data = (data);
 536         vfs_setops((vfsp), (op));
 537 }
 538
 539 /*
 540  * Allocate and initialize the vfs implementation private data
 541  * structure, vfs_impl_t.
 542  */
 543 void
 544 vfsimpl_setup(vfs_t *vfsp)
 545 {
 546         int i;
 547
 548         if (vfsp->vfs_implp != NULL) {
 549                 return;
 550         }
 551
 552         vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 553         /* Note that these are #define'd in vfs.h */
 554         vfsp->vfs_vskap = NULL;
 555         vfsp->vfs_fstypevsp = NULL;
 556
 557         /* Set size of counted array, then zero the array */
 558         vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 559         for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 560                 vfsp->vfs_featureset[i] = 0;
 561         }
 562 }
 563
 564 /*
 565  * Release the vfs_impl_t structure, if it exists. Some unbundled
 566  * filesystems may not use the newer version of vfs and thus
 567  * would not contain this implementation private data structure.
 568  */
 569 void
 570 vfsimpl_teardown(vfs_t *vfsp)
 571 {
 572         vfs_impl_t      *vip = vfsp->vfs_implp;
 573
 574         if (vip == NULL)
 575                 return;
 576
 577         kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 578         vfsp->vfs_implp = NULL;
 579 }
 580
 581 /*
 582  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 583  * fstatvfs, and sysfs moved to common/syscall.
 584  */
 585
 586 /*
 587  * Update every mounted file system.  We call the vfs_sync operation of
 588  * each file system type, passing it a NULL vfsp to indicate that all
 589  * mounted file systems of that type should be updated.
 590  */
 591 void
 592 vfs_sync(int flag)
 593 {
 594         struct vfssw *vswp;
 595         RLOCK_VFSSW();
 596         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 597                 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 598                         vfs_refvfssw(vswp);
 599                         RUNLOCK_VFSSW();
 600                         (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
 601                             CRED());
 602                         vfs_unrefvfssw(vswp);
 603                         RLOCK_VFSSW();
 604                 }
 605         }
 606         RUNLOCK_VFSSW();
 607 }
 608
 609 void
 610 sync(void)
 611 {
 612         vfs_sync(0);
 613 }
 614
 615 /*
 616  * External routines.
 617  */
 618
/* Initialized in vfs_mountroot(). */
krwlock_t vfssw_lock;	/* lock accesses to vfssw */

/*
 * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 * but otherwise should be accessed only via vfs_list_lock() and
 * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 */
static krwlock_t vfslist;
 627
 628 /*
 629  * Mount devfs on /devices. This is done right after root is mounted
 630  * to provide device access support for the system
 631  */
 632 static void
 633 vfs_mountdevices(void)
 634 {
 635         struct vfssw *vsw;
 636         struct vnode *mvp;
 637         struct mounta mounta = {        /* fake mounta for devfs_mount() */
 638                 NULL,
 639                 NULL,
 640                 MS_SYSSPACE,
 641                 NULL,
 642                 NULL,
 643                 0,
 644                 NULL,
 645                 0
 646         };
 647
 648         /*
 649          * _init devfs module to fill in the vfssw
 650          */
 651         if (modload("fs", "devfs") == -1)
 652                 panic("Cannot _init devfs module");
 653
 654         /*
 655          * Hold vfs
 656          */
 657         RLOCK_VFSSW();
 658         vsw = vfs_getvfsswbyname("devfs");
 659         VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
 660         VFS_HOLD(&devices);
 661
 662         /*
 663          * Locate mount point
 664          */
 665         if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 666                 panic("Cannot find /devices");
 667
 668         /*
 669          * Perform the mount of /devices
 670          */
 671         if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
 672                 panic("Cannot mount /devices");
 673
 674         RUNLOCK_VFSSW();
 675
 676         /*
 677          * Set appropriate members and add to vfs list for mnttab display
 678          */
 679         vfs_setresource(&devices, "/devices", 0);
 680         vfs_setmntpoint(&devices, "/devices", 0);
 681
 682         /*
 683          * Hold the root of /devices so it won't go away
 684          */
 685         if (VFS_ROOT(&devices, &devicesdir))
 686                 panic("vfs_mountdevices: not devices root");
 687
 688         if (vfs_lock(&devices) != 0) {
 689                 VN_RELE(devicesdir);
 690                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
 691                 return;
 692         }
 693
 694         if (vn_vfswlock(mvp) != 0) {
 695                 vfs_unlock(&devices);
 696                 VN_RELE(devicesdir);
 697                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
 698                 return;
 699         }
 700
 701         vfs_add(mvp, &devices, 0);
 702         vn_vfsunlock(mvp);
 703         vfs_unlock(&devices);
 704         VN_RELE(devicesdir);
 705 }
 706
 707 /*
 708  * mount the first instance of /dev  to root and remain mounted
 709  */
 710 static void
 711 vfs_mountdev1(void)
 712 {
 713         struct vfssw *vsw;
 714         struct vnode *mvp;
 715         struct mounta mounta = {        /* fake mounta for sdev_mount() */
 716                 NULL,
 717                 NULL,
 718                 MS_SYSSPACE | MS_OVERLAY,
 719                 NULL,
 720                 NULL,
 721                 0,
 722                 NULL,
 723                 0
 724         };
 725
 726         /*
 727          * _init dev module to fill in the vfssw
 728          */
 729         if (modload("fs", "dev") == -1)
 730                 cmn_err(CE_PANIC, "Cannot _init dev module\n");
 731
 732         /*
 733          * Hold vfs
 734          */
 735         RLOCK_VFSSW();
 736         vsw = vfs_getvfsswbyname("dev");
 737         VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
 738         VFS_HOLD(&dev);
 739
 740         /*
 741          * Locate mount point
 742          */
 743         if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 744                 cmn_err(CE_PANIC, "Cannot find /dev\n");
 745
 746         /*
 747          * Perform the mount of /dev
 748          */
 749         if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
 750                 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
 751
 752         RUNLOCK_VFSSW();
 753
 754         /*
 755          * Set appropriate members and add to vfs list for mnttab display
 756          */
 757         vfs_setresource(&dev, "/dev", 0);
 758         vfs_setmntpoint(&dev, "/dev", 0);
 759
 760         /*
 761          * Hold the root of /dev so it won't go away
 762          */
 763         if (VFS_ROOT(&dev, &devdir))
 764                 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
 765
 766         if (vfs_lock(&dev) != 0) {
 767                 VN_RELE(devdir);
 768                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
 769                 return;
 770         }
 771
 772         if (vn_vfswlock(mvp) != 0) {
 773                 vfs_unlock(&dev);
 774                 VN_RELE(devdir);
 775                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
 776                 return;
 777         }
 778
 779         vfs_add(mvp, &dev, 0);
 780         vn_vfsunlock(mvp);
 781         vfs_unlock(&dev);
 782         VN_RELE(devdir);
 783 }
 784
 785 /*
 786  * Mount required filesystem. This is done right after root is mounted.
 787  */
 788 static void
 789 vfs_mountfs(char *module, char *spec, char *path)
 790 {
 791         struct vnode *mvp;
 792         struct mounta mounta;
 793         vfs_t *vfsp;
 794
 795         bzero(&mounta, sizeof (mounta));
 796         mounta.flags = MS_SYSSPACE | MS_DATA;
 797         mounta.fstype = module;
 798         mounta.spec = spec;
 799         mounta.dir = path;
 800         if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 801                 cmn_err(CE_WARN, "Cannot find %s", path);
 802                 return;
 803         }
 804         if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 805                 cmn_err(CE_WARN, "Cannot mount %s", path);
 806         else
 807                 VFS_RELE(vfsp);
 808         VN_RELE(mvp);
 809 }
 810
 811 /*
 812  * vfs_mountroot is called by main() to mount the root filesystem.
 813  */
 814 void
 815 vfs_mountroot(void)
 816 {
 817         struct vnode    *rvp = NULL;
 818         char            *path;
 819         size_t          plen;
 820         struct vfssw    *vswp;
 821         proc_t          *p;
 822
 823         rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 824         rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
 825
 826         /*
 827          * Alloc the vfs hash bucket array and locks
 828          */
 829         rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
 830
 831         /*
 832          * Call machine-dependent routine "rootconf" to choose a root
 833          * file system type.
 834          */
 835         if (rootconf())
 836                 panic("vfs_mountroot: cannot mount root");
 837         /*
 838          * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
 839          * to point to it.  These are used by lookuppn() so that it
 840          * knows where to start from ('/' or '.').
 841          */
 842         vfs_setmntpoint(rootvfs, "/", 0);
 843         if (VFS_ROOT(rootvfs, &rootdir))
 844                 panic("vfs_mountroot: no root vnode");
 845
 846         /*
 847          * At this point, the process tree consists of p0 and possibly some
 848          * direct children of p0.  (i.e. there are no grandchildren)
 849          *
 850          * Walk through them all, setting their current directory.
 851          */
 852         mutex_enter(&pidlock);
 853         for (p = practive; p != NULL; p = p->p_next) {
 854                 ASSERT(p == &p0 || p->p_parent == &p0);
 855
 856                 PTOU(p)->u_cdir = rootdir;
 857                 VN_HOLD(PTOU(p)->u_cdir);
 858                 PTOU(p)->u_rdir = NULL;
 859         }
 860         mutex_exit(&pidlock);
 861
 862         /*
 863          * Setup the global zone's rootvp, now that it exists.
 864          */
 865         global_zone->zone_rootvp = rootdir;
 866         VN_HOLD(global_zone->zone_rootvp);
 867
 868         /*
 869          * Notify the module code that it can begin using the
 870          * root filesystem instead of the boot program's services.
 871          */
 872         modrootloaded = 1;
 873
 874         /*
 875          * Special handling for a ZFS root file system.
 876          */
 877         zfs_boot_init();
 878
 879         /*
 880          * Set up mnttab information for root
 881          */
 882         vfs_setresource(rootvfs, rootfs.bo_name, 0);
 883
 884         /*
 885          * Notify cluster software that the root filesystem is available.
 886          */
 887         clboot_mountroot();
 888
 889         /* Now that we're all done with the root FS, set up its vopstats */
 890         if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
 891                 /* Set flag for statistics collection */
 892                 if (vswp->vsw_flag & VSW_STATS) {
 893                         initialize_vopstats(&rootvfs->vfs_vopstats);
 894                         rootvfs->vfs_flag |= VFS_STATS;
 895                         rootvfs->vfs_fstypevsp =
 896                             get_fstype_vopstats(rootvfs, vswp);
 897                         rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
 898                 }
 899                 vfs_unrefvfssw(vswp);
 900         }
 901
 902         /*
 903          * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
 904          * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
 905          */
 906         vfs_mountdevices();
 907         vfs_mountdev1();
 908
 909         vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
 910         vfs_mountfs("proc", "/proc", "/proc");
 911         vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
 912         vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
 913         vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
 914         vfs_mountfs("bootfs", "bootfs", "/system/boot");
 915
 916         if (getzoneid() == GLOBAL_ZONEID) {
 917                 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
 918         }
 919
 920         if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
 921                 /*
 922                  * Look up the root device via devfs so that a dv_node is
 923                  * created for it. The vnode is never VN_RELE()ed.
 924                  * We allocate more than MAXPATHLEN so that the
 925                  * buffer passed to i_ddi_prompath_to_devfspath() is
 926                  * exactly MAXPATHLEN (the function expects a buffer
 927                  * of that length).
 928                  */
 929                 plen = strlen("/devices");
 930                 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
 931                 (void) strcpy(path, "/devices");
 932
 933                 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
 934                     != DDI_SUCCESS ||
 935                     lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
 936
 937                         /* NUL terminate in case "path" has garbage */
 938                         path[plen + MAXPATHLEN - 1] = '\0';
 939 #ifdef  DEBUG
 940                         cmn_err(CE_WARN, "!Cannot lookup root device: %s",
 941                             path);
 942 #endif
 943                 }
 944                 kmem_free(path, plen + MAXPATHLEN);
 945         }
 946
 947         vfs_mnttabvp_setup();
 948 }
 949
 950 /*
 951  * Check to see if our "block device" is actually a file.  If so,
 952  * automatically add a lofi device, and keep track of this fact.
 953  */
 954 static int
 955 lofi_add(const char *fsname, struct vfs *vfsp,
 956     mntopts_t *mntopts, struct mounta *uap)
 957 {
 958         int fromspace = (uap->flags & MS_SYSSPACE) ?
 959             UIO_SYSSPACE : UIO_USERSPACE;
 960         struct lofi_ioctl *li = NULL;
 961         struct vnode *vp = NULL;
 962         struct pathname pn = { NULL };
 963         ldi_ident_t ldi_id;
 964         ldi_handle_t ldi_hdl;
 965         vfssw_t *vfssw;
 966         int id;
 967         int err = 0;
 968
 969         if ((vfssw = vfs_getvfssw(fsname)) == NULL)
 970                 return (0);
 971
 972         if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
 973                 vfs_unrefvfssw(vfssw);
 974                 return (0);
 975         }
 976
 977         vfs_unrefvfssw(vfssw);
 978         vfssw = NULL;
 979
 980         if (pn_get(uap->spec, fromspace, &pn) != 0)
 981                 return (0);
 982
 983         if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
 984                 goto out;
 985
 986         if (vp->v_type != VREG)
 987                 goto out;
 988
 989         /* OK, this is a lofi mount. */
 990
 991         if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
 992             vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
 993             vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
 994             vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
 995                 err = EINVAL;
 996                 goto out;
 997         }
 998
 999         ldi_id = ldi_ident_from_anon();
1000         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1001         (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1002
1003         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1004             &ldi_hdl, ldi_id);
1005
1006         if (err)
1007                 goto out2;
1008
1009         err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1010             FREAD | FWRITE | FKIOCTL, kcred, &id);
1011
1012         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1013
1014         if (!err)
1015                 vfsp->vfs_lofi_id = id;
1016
1017 out2:
1018         ldi_ident_release(ldi_id);
1019 out:
1020         if (li != NULL)
1021                 kmem_free(li, sizeof (*li));
1022         if (vp != NULL)
1023                 VN_RELE(vp);
1024         pn_free(&pn);
1025         return (err);
1026 }
1027
1028 static void
1029 lofi_remove(struct vfs *vfsp)
1030 {
1031         struct lofi_ioctl *li = NULL;
1032         ldi_ident_t ldi_id;
1033         ldi_handle_t ldi_hdl;
1034         int err;
1035
1036         if (vfsp->vfs_lofi_id == 0)
1037                 return;
1038
1039         ldi_id = ldi_ident_from_anon();
1040
1041         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1042         li->li_id = vfsp->vfs_lofi_id;
1043         li->li_cleanup = B_TRUE;
1044
1045         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1046             &ldi_hdl, ldi_id);
1047
1048         if (err)
1049                 goto out;
1050
1051         err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1052             FREAD | FWRITE | FKIOCTL, kcred, NULL);
1053
1054         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1055
1056         if (!err)
1057                 vfsp->vfs_lofi_id = 0;
1058
1059 out:
1060         ldi_ident_release(ldi_id);
1061         if (li != NULL)
1062                 kmem_free(li, sizeof (*li));
1063 }
1064
1065 /*
1066  * Common mount code.  Called from the system call entry point, from autofs,
1067  * nfsv4 trigger mounts, and from pxfs.
1068  *
1069  * Takes the effective file system type, mount arguments, the mount point
1070  * vnode, flags specifying whether the mount is a remount and whether it
1071  * should be entered into the vfs list, and credentials.  Fills in its vfspp
1072  * parameter with the mounted file system instance's vfs.
1073  *
1074  * Note that the effective file system type is specified as a string.  It may
1075  * be null, in which case it's determined from the mount arguments, and may
1076  * differ from the type specified in the mount arguments; this is a hook to
1077  * allow interposition when instantiating file system instances.
1078  *
1079  * The caller is responsible for releasing its own hold on the mount point
1080  * vp (this routine does its own hold when necessary).
1081  * Also note that for remounts, the mount point vp should be the vnode for
1082  * the root of the file system rather than the vnode that the file system
1083  * is mounted on top of.
1084  */
1085 int
1086 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1087     struct vfs **vfspp)
1088 {
1089         struct vfssw    *vswp;
1090         vfsops_t        *vfsops;
1091         struct vfs      *vfsp;
1092         struct vnode    *bvp;
1093         dev_t           bdev = 0;
1094         mntopts_t       mnt_mntopts;
1095         int             error = 0;
1096         int             copyout_error = 0;
1097         int             ovflags;
1098         char            *opts = uap->optptr;
1099         char            *inargs = opts;
1100         int             optlen = uap->optlen;
1101         int             remount;
1102         int             rdonly;
1103         int             nbmand = 0;
1104         int             delmip = 0;
1105         int             addmip = 0;
1106         int             splice = ((uap->flags & MS_NOSPLICE) == 0);
1107         int             fromspace = (uap->flags & MS_SYSSPACE) ?
1108             UIO_SYSSPACE : UIO_USERSPACE;
1109         char            *resource = NULL, *mountpt = NULL;
1110         refstr_t        *oldresource, *oldmntpt;
1111         struct pathname pn, rpn;
1112         vsk_anchor_t    *vskap;
1113         char fstname[FSTYPSZ];
1114         zone_t          *zone;
1115
1116         /*
1117          * The v_flag value for the mount point vp is permanently set
1118          * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1119          * for mount point locking.
1120          */
1121         mutex_enter(&vp->v_lock);
1122         vp->v_flag |= VVFSLOCK;
1123         mutex_exit(&vp->v_lock);
1124
1125         mnt_mntopts.mo_count = 0;
1126         /*
1127          * Find the ops vector to use to invoke the file system-specific mount
1128          * method.  If the fsname argument is non-NULL, use it directly.
1129          * Otherwise, dig the file system type information out of the mount
1130          * arguments.
1131          *
1132          * A side effect is to hold the vfssw entry.
1133          *
1134          * Mount arguments can be specified in several ways, which are
1135          * distinguished by flag bit settings.  The preferred way is to set
1136          * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1137          * type supplied as a character string and the last two arguments
1138          * being a pointer to a character buffer and the size of the buffer.
1139          * On entry, the buffer holds a null terminated list of options; on
1140          * return, the string is the list of options the file system
1141          * recognized. If MS_DATA is set arguments five and six point to a
1142          * block of binary data which the file system interprets.
1143          * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1144          * consistently with these conventions.  To handle them, we check to
1145          * see whether the pointer to the file system name has a numeric value
1146          * less than 256.  If so, we treat it as an index.
1147          */
1148         if (fsname != NULL) {
1149                 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1150                         return (EINVAL);
1151                 }
1152         } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1153                 size_t n;
1154                 uint_t fstype;
1155
1156                 fsname = fstname;
1157
1158                 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1159                         RLOCK_VFSSW();
1160                         if (fstype == 0 || fstype >= nfstype ||
1161                             !ALLOCATED_VFSSW(&vfssw[fstype])) {
1162                                 RUNLOCK_VFSSW();
1163                                 return (EINVAL);
1164                         }
1165                         (void) strcpy(fsname, vfssw[fstype].vsw_name);
1166                         RUNLOCK_VFSSW();
1167                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1168                                 return (EINVAL);
1169                 } else {
1170                         /*
1171                          * Handle either kernel or user address space.
1172                          */
1173                         if (uap->flags & MS_SYSSPACE) {
1174                                 error = copystr(uap->fstype, fsname,
1175                                     FSTYPSZ, &n);
1176                         } else {
1177                                 error = copyinstr(uap->fstype, fsname,
1178                                     FSTYPSZ, &n);
1179                         }
1180                         if (error) {
1181                                 if (error == ENAMETOOLONG)
1182                                         return (EINVAL);
1183                                 return (error);
1184                         }
1185                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1186                                 return (EINVAL);
1187                 }
1188         } else {
1189                 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1190                         return (EINVAL);
1191                 fsname = vswp->vsw_name;
1192         }
1193         if (!VFS_INSTALLED(vswp))
1194                 return (EINVAL);
1195
1196         if ((error = secpolicy_fs_allowed_mount(fsname)) != 0)  {
1197                 vfs_unrefvfssw(vswp);
1198                 return (error);
1199         }
1200
1201         vfsops = &vswp->vsw_vfsops;
1202
1203         vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1204         /*
1205          * Fetch mount options and parse them for generic vfs options
1206          */
1207         if (uap->flags & MS_OPTIONSTR) {
1208                 /*
1209                  * Limit the buffer size
1210                  */
1211                 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1212                         error = EINVAL;
1213                         goto errout;
1214                 }
1215                 if ((uap->flags & MS_SYSSPACE) == 0) {
1216                         inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1217                         inargs[0] = '\0';
1218                         if (optlen) {
1219                                 error = copyinstr(opts, inargs, (size_t)optlen,
1220                                     NULL);
1221                                 if (error) {
1222                                         goto errout;
1223                                 }
1224                         }
1225                 }
1226                 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1227         }
1228         /*
1229          * Flag bits override the options string.
1230          */
1231         if (uap->flags & MS_REMOUNT)
1232                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1233         if (uap->flags & MS_RDONLY)
1234                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1235         if (uap->flags & MS_NOSUID)
1236                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1237
1238         /*
1239          * Check if this is a remount; must be set in the option string and
1240          * the file system must support a remount option.
1241          */
1242         if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1243             MNTOPT_REMOUNT, NULL)) {
1244                 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1245                         error = ENOTSUP;
1246                         goto errout;
1247                 }
1248                 uap->flags |= MS_REMOUNT;
1249         }
1250
1251         /*
1252          * uap->flags and vfs_optionisset() should agree.
1253          */
1254         if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1255                 uap->flags |= MS_RDONLY;
1256         }
1257         if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1258                 uap->flags |= MS_NOSUID;
1259         }
1260         nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1261         ASSERT(splice || !remount);
1262         /*
1263          * If we are splicing the fs into the namespace,
1264          * perform mount point checks.
1265          *
1266          * We want to resolve the path for the mount point to eliminate
1267          * '.' and ".." and symlinks in mount points; we can't do the
1268          * same for the resource string, since it would turn
1269          * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1270          * this before grabbing vn_vfswlock(), because otherwise we
1271          * would deadlock with lookuppn().
1272          */
1273         if (splice) {
1274                 ASSERT(vp->v_count > 0);
1275
1276                 /*
1277                  * Pick up mount point and device from appropriate space.
1278                  */
1279                 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1280                         resource = kmem_alloc(pn.pn_pathlen + 1,
1281                             KM_SLEEP);
1282                         (void) strcpy(resource, pn.pn_path);
1283                         pn_free(&pn);
1284                 }
1285                 /*
1286                  * Do a lookupname prior to taking the
1287                  * writelock. Mark this as completed if
1288                  * successful for later cleanup and addition to
1289                  * the mount in progress table.
1290                  */
1291                 if ((uap->flags & MS_GLOBAL) == 0 &&
1292                     lookupname(uap->spec, fromspace,
1293                     FOLLOW, NULL, &bvp) == 0) {
1294                         addmip = 1;
1295                 }
1296
1297                 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1298                         pathname_t *pnp;
1299
1300                         if (*pn.pn_path != '/') {
1301                                 error = EINVAL;
1302                                 pn_free(&pn);
1303                                 goto errout;
1304                         }
1305                         pn_alloc(&rpn);
1306                         /*
1307                          * Kludge to prevent autofs from deadlocking with
1308                          * itself when it calls domount().
1309                          *
			 * If autofs is calling, it is because it is doing
			 * (autofs) mounts in the process of an NFS mount.  A
			 * lookuppn() here would cause us to block waiting for
			 * said NFS mount to complete, which it can't, since
			 * this is the thread that is supposed to be doing it.
1315                          */
1316                         if (fromspace == UIO_USERSPACE) {
1317                                 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1318                                     NULL)) == 0) {
1319                                         pnp = &rpn;
1320                                 } else {
1321                                         /*
1322                                          * The file disappeared or otherwise
1323                                          * became inaccessible since we opened
1324                                          * it; might as well fail the mount
1325                                          * since the mount point is no longer
1326                                          * accessible.
1327                                          */
1328                                         pn_free(&rpn);
1329                                         pn_free(&pn);
1330                                         goto errout;
1331                                 }
1332                         } else {
1333                                 pnp = &pn;
1334                         }
1335                         mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1336                         (void) strcpy(mountpt, pnp->pn_path);
1337
1338                         /*
1339                          * If the addition of the zone's rootpath
1340                          * would push us over a total path length
1341                          * of MAXPATHLEN, we fail the mount with
1342                          * ENAMETOOLONG, which is what we would have
1343                          * gotten if we were trying to perform the same
1344                          * mount in the global zone.
1345                          *
1346                          * strlen() doesn't count the trailing
1347                          * '\0', but zone_rootpathlen counts both a
1348                          * trailing '/' and the terminating '\0'.
1349                          */
1350                         if ((curproc->p_zone->zone_rootpathlen - 1 +
1351                             strlen(mountpt)) > MAXPATHLEN ||
1352                             (resource != NULL &&
1353                             (curproc->p_zone->zone_rootpathlen - 1 +
1354                             strlen(resource)) > MAXPATHLEN)) {
1355                                 error = ENAMETOOLONG;
1356                         }
1357
1358                         pn_free(&rpn);
1359                         pn_free(&pn);
1360                 }
1361
1362                 if (error)
1363                         goto errout;
1364
1365                 /*
1366                  * Prevent path name resolution from proceeding past
1367                  * the mount point.
1368                  */
1369                 if (vn_vfswlock(vp) != 0) {
1370                         error = EBUSY;
1371                         goto errout;
1372                 }
1373
1374                 /*
1375                  * Verify that it's legitimate to establish a mount on
1376                  * the prospective mount point.
1377                  */
1378                 if (vn_mountedvfs(vp) != NULL) {
1379                         /*
1380                          * The mount point lock was obtained after some
1381                          * other thread raced through and established a mount.
1382                          */
1383                         vn_vfsunlock(vp);
1384                         error = EBUSY;
1385                         goto errout;
1386                 }
1387                 if (vp->v_flag & VNOMOUNT) {
1388                         vn_vfsunlock(vp);
1389                         error = EINVAL;
1390                         goto errout;
1391                 }
1392         }
1393         if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1394                 uap->dataptr = NULL;
1395                 uap->datalen = 0;
1396         }
1397
1398         /*
1399          * If this is a remount, we don't want to create a new VFS.
1400          * Instead, we pass the existing one with a remount flag.
1401          */
1402         if (remount) {
1403                 /*
1404                  * Confirm that the mount point is the root vnode of the
1405                  * file system that is being remounted.
1406                  * This can happen if the user specifies a different
1407                  * mount point directory pathname in the (re)mount command.
1408                  *
1409                  * Code below can only be reached if splice is true, so it's
1410                  * safe to do vn_vfsunlock() here.
1411                  */
1412                 if ((vp->v_flag & VROOT) == 0) {
1413                         vn_vfsunlock(vp);
1414                         error = ENOENT;
1415                         goto errout;
1416                 }
1417                 /*
1418                  * Disallow making file systems read-only unless file system
1419                  * explicitly allows it in its vfssw.  Ignore other flags.
1420                  */
1421                 if (rdonly && vn_is_readonly(vp) == 0 &&
1422                     (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1423                         vn_vfsunlock(vp);
1424                         error = EINVAL;
1425                         goto errout;
1426                 }
1427                 /*
1428                  * Disallow changing the NBMAND disposition of the file
1429                  * system on remounts.
1430                  */
1431                 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1432                     (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1433                         vn_vfsunlock(vp);
1434                         error = EINVAL;
1435                         goto errout;
1436                 }
1437                 vfsp = vp->v_vfsp;
1438                 ovflags = vfsp->vfs_flag;
1439                 vfsp->vfs_flag |= VFS_REMOUNT;
1440                 vfsp->vfs_flag &= ~VFS_RDONLY;
1441         } else {
1442                 vfsp = vfs_alloc(KM_SLEEP);
1443                 VFS_INIT(vfsp, vfsops, NULL);
1444         }
1445
1446         VFS_HOLD(vfsp);
1447
1448         if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1449                 if (!remount) {
1450                         if (splice)
1451                                 vn_vfsunlock(vp);
1452                         vfs_free(vfsp);
1453                 } else {
1454                         vn_vfsunlock(vp);
1455                         VFS_RELE(vfsp);
1456                 }
1457                 goto errout;
1458         }
1459
1460         /*
1461          * PRIV_SYS_MOUNT doesn't mean you can become root.
1462          */
1463         if (vfsp->vfs_lofi_id != 0) {
1464                 uap->flags |= MS_NOSUID;
1465                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1466         }
1467
	/*
	 * The vfs_reflock is not used anymore; the code below explicitly
	 * holds it, preventing others from accessing it directly.
	 */
1472         if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1473             !(vfsp->vfs_flag & VFS_REMOUNT))
1474                 cmn_err(CE_WARN,
1475                     "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1476
1477         /*
1478          * Lock the vfs. If this is a remount we want to avoid spurious umount
1479          * failures that happen as a side-effect of fsflush() and other mount
1480          * and unmount operations that might be going on simultaneously and
1481          * may have locked the vfs currently. To not return EBUSY immediately
1482          * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1483          */
1484         if (!remount) {
1485                 if (error = vfs_lock(vfsp)) {
1486                         vfsp->vfs_flag = ovflags;
1487
1488                         lofi_remove(vfsp);
1489
1490                         if (splice)
1491                                 vn_vfsunlock(vp);
1492                         vfs_free(vfsp);
1493                         goto errout;
1494                 }
1495         } else {
1496                 vfs_lock_wait(vfsp);
1497         }
1498
1499         /*
1500          * Add device to mount in progress table, global mounts require special
1501          * handling. It is possible that we have already done the lookupname
1502          * on a spliced, non-global fs. If so, we don't want to do it again
1503          * since we cannot do a lookupname after taking the
1504          * wlock above. This case is for a non-spliced, non-global filesystem.
1505          */
1506         if (!addmip) {
1507                 if ((uap->flags & MS_GLOBAL) == 0 &&
1508                     lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1509                         addmip = 1;
1510                 }
1511         }
1512
1513         if (addmip) {
1514                 vnode_t *lvp = NULL;
1515
1516                 error = vfs_get_lofi(vfsp, &lvp);
1517                 if (error > 0) {
1518                         lofi_remove(vfsp);
1519
1520                         if (splice)
1521                                 vn_vfsunlock(vp);
1522                         vfs_unlock(vfsp);
1523
1524                         if (remount) {
1525                                 VFS_RELE(vfsp);
1526                         } else {
1527                                 vfs_free(vfsp);
1528                         }
1529
1530                         goto errout;
1531                 } else if (error == -1) {
1532                         bdev = bvp->v_rdev;
1533                         VN_RELE(bvp);
1534                 } else {
1535                         bdev = lvp->v_rdev;
1536                         VN_RELE(lvp);
1537                         VN_RELE(bvp);
1538                 }
1539
1540                 vfs_addmip(bdev, vfsp);
1541                 addmip = 0;
1542                 delmip = 1;
1543         }
1544         /*
1545          * Invalidate cached entry for the mount point.
1546          */
1547         if (splice)
1548                 dnlc_purge_vp(vp);
1549
1550         /*
1551          * If have an option string but the filesystem doesn't supply a
1552          * prototype options table, create a table with the global
1553          * options and sufficient room to accept all the options in the
1554          * string.  Then parse the passed in option string
1555          * accepting all the options in the string.  This gives us an
1556          * option table with all the proper cancel properties for the
1557          * global options.
1558          *
1559          * Filesystems that supply a prototype options table are handled
1560          * earlier in this function.
1561          */
1562         if (uap->flags & MS_OPTIONSTR) {
1563                 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1564                         mntopts_t tmp_mntopts;
1565
1566                         tmp_mntopts.mo_count = 0;
1567                         vfs_createopttbl_extend(&tmp_mntopts, inargs,
1568                             &mnt_mntopts);
1569                         vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1570                         vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1571                         vfs_freeopttbl(&tmp_mntopts);
1572                 }
1573         }
1574
1575         /*
1576          * Serialize with zone state transitions.
1577          * See vfs_list_add; zone mounted into is:
1578          *      zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1579          * not the zone doing the mount (curproc->p_zone), but if we're already
1580          * inside a NGZ, then we know what zone we are.
1581          */
1582         if (INGLOBALZONE(curproc)) {
1583                 zone = zone_find_by_path(mountpt);
1584                 ASSERT(zone != NULL);
1585         } else {
1586                 zone = curproc->p_zone;
1587                 /*
1588                  * zone_find_by_path does a hold, so do one here too so that
1589                  * we can do a zone_rele after mount_completed.
1590                  */
1591                 zone_hold(zone);
1592         }
1593         mount_in_progress(zone);
1594         /*
1595          * Instantiate (or reinstantiate) the file system.  If appropriate,
1596          * splice it into the file system name space.
1597          *
1598          * We want VFS_MOUNT() to be able to override the vfs_resource
1599          * string if necessary (ie, mntfs), and also for a remount to
1600          * change the same (necessary when remounting '/' during boot).
1601          * So we set up vfs_mntpt and vfs_resource to what we think they
1602          * should be, then hand off control to VFS_MOUNT() which can
1603          * override this.
1604          *
1605          * For safety's sake, when changing vfs_resource or vfs_mntpt of
1606          * a vfs which is on the vfs list (i.e. during a remount), we must
1607          * never set those fields to NULL. Several bits of code make
1608          * assumptions that the fields are always valid.
1609          */
1610         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1611         if (remount) {
1612                 if ((oldresource = vfsp->vfs_resource) != NULL)
1613                         refstr_hold(oldresource);
1614                 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1615                         refstr_hold(oldmntpt);
1616         }
1617         vfs_setresource(vfsp, resource, 0);
1618         vfs_setmntpoint(vfsp, mountpt, 0);
1619
1620         /*
1621          * going to mount on this vnode, so notify.
1622          */
1623         vnevent_mountedover(vp, NULL);
1624         error = VFS_MOUNT(vfsp, vp, uap, credp);
1625
1626         if (uap->flags & MS_RDONLY)
1627                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1628         if (uap->flags & MS_NOSUID)
1629                 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1630         if (uap->flags & MS_GLOBAL)
1631                 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1632
1633         if (error) {
1634                 lofi_remove(vfsp);
1635
1636                 if (remount) {
1637                         /* put back pre-remount options */
1638                         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1639                         vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1640                             VFSSP_VERBATIM);
1641                         if (oldmntpt)
1642                                 refstr_rele(oldmntpt);
1643                         vfs_setresource(vfsp, refstr_value(oldresource),
1644                             VFSSP_VERBATIM);
1645                         if (oldresource)
1646                                 refstr_rele(oldresource);
1647                         vfsp->vfs_flag = ovflags;
1648                         vfs_unlock(vfsp);
1649                         VFS_RELE(vfsp);
1650                 } else {
1651                         vfs_unlock(vfsp);
1652                         vfs_freemnttab(vfsp);
1653                         vfs_free(vfsp);
1654                 }
1655         } else {
1656                 /*
1657                  * Set the mount time to now
1658                  */
1659                 vfsp->vfs_mtime = ddi_get_time();
1660                 if (remount) {
1661                         vfsp->vfs_flag &= ~VFS_REMOUNT;
1662                         if (oldresource)
1663                                 refstr_rele(oldresource);
1664                         if (oldmntpt)
1665                                 refstr_rele(oldmntpt);
1666                 } else if (splice) {
1667                         /*
1668                          * Link vfsp into the name space at the mount
1669                          * point. Vfs_add() is responsible for
1670                          * holding the mount point which will be
1671                          * released when vfs_remove() is called.
1672                          */
1673                         vfs_add(vp, vfsp, uap->flags);
1674                 } else {
1675                         /*
1676                          * Hold the reference to file system which is
1677                          * not linked into the name space.
1678                          */
1679                         vfsp->vfs_zone = NULL;
1680                         VFS_HOLD(vfsp);
1681                         vfsp->vfs_vnodecovered = NULL;
1682                 }
1683                 /*
1684                  * Set flags for global options encountered
1685                  */
1686                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1687                         vfsp->vfs_flag |= VFS_RDONLY;
1688                 else
1689                         vfsp->vfs_flag &= ~VFS_RDONLY;
1690                 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1691                         vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1692                 } else {
1693                         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1694                                 vfsp->vfs_flag |= VFS_NODEVICES;
1695                         else
1696                                 vfsp->vfs_flag &= ~VFS_NODEVICES;
1697                         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1698                                 vfsp->vfs_flag |= VFS_NOSETUID;
1699                         else
1700                                 vfsp->vfs_flag &= ~VFS_NOSETUID;
1701                 }
1702                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1703                         vfsp->vfs_flag |= VFS_NBMAND;
1704                 else
1705                         vfsp->vfs_flag &= ~VFS_NBMAND;
1706
1707                 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1708                         vfsp->vfs_flag |= VFS_XATTR;
1709                 else
1710                         vfsp->vfs_flag &= ~VFS_XATTR;
1711
1712                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1713                         vfsp->vfs_flag |= VFS_NOEXEC;
1714                 else
1715                         vfsp->vfs_flag &= ~VFS_NOEXEC;
1716
1717                 /*
1718                  * Now construct the output option string of options
1719                  * we recognized.
1720                  */
1721                 if (uap->flags & MS_OPTIONSTR) {
1722                         vfs_list_read_lock();
1723                         copyout_error = vfs_buildoptionstr(
1724                             &vfsp->vfs_mntopts, inargs, optlen);
1725                         vfs_list_unlock();
1726                         if (copyout_error == 0 &&
1727                             (uap->flags & MS_SYSSPACE) == 0) {
1728                                 copyout_error = copyoutstr(inargs, opts,
1729                                     optlen, NULL);
1730                         }
1731                 }
1732
1733                 /*
1734                  * If this isn't a remount, set up the vopstats before
1735                  * anyone can touch this. We only allow spliced file
1736                  * systems (file systems which are in the namespace) to
1737                  * have the VFS_STATS flag set.
1738                  * NOTE: PxFS mounts the underlying file system with
1739                  * MS_NOSPLICE set and copies those vfs_flags to its private
1740                  * vfs structure. As a result, PxFS should never have
1741                  * the VFS_STATS flag or else we might access the vfs
1742                  * statistics-related fields prior to them being
1743                  * properly initialized.
1744                  */
1745                 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1746                         initialize_vopstats(&vfsp->vfs_vopstats);
1747                         /*
1748                          * We need to set vfs_vskap to NULL because there's
1749                          * a chance it won't be set below.  This is checked
1750                          * in teardown_vopstats() so we can't have garbage.
1751                          */
1752                         vfsp->vfs_vskap = NULL;
1753                         vfsp->vfs_flag |= VFS_STATS;
1754                         vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1755                 }
1756
1757                 if (vswp->vsw_flag & VSW_XID)
1758                         vfsp->vfs_flag |= VFS_XID;
1759
1760                 vfs_unlock(vfsp);
1761         }
1762         mount_completed(zone);
1763         zone_rele(zone);
1764         if (splice)
1765                 vn_vfsunlock(vp);
1766
1767         if ((error == 0) && (copyout_error == 0)) {
1768                 if (!remount) {
1769                         /*
1770                          * Don't call get_vskstat_anchor() while holding
1771                          * locks since it allocates memory and calls
1772                          * VFS_STATVFS().  For NFS, the latter can generate
1773                          * an over-the-wire call.
1774                          */
1775                         vskap = get_vskstat_anchor(vfsp);
1776                         /* Only take the lock if we have something to do */
1777                         if (vskap != NULL) {
1778                                 vfs_lock_wait(vfsp);
1779                                 if (vfsp->vfs_flag & VFS_STATS) {
1780                                         vfsp->vfs_vskap = vskap;
1781                                 }
1782                                 vfs_unlock(vfsp);
1783                         }
1784                 }
1785                 /* Return vfsp to caller. */
1786                 *vfspp = vfsp;
1787         }
1788 errout:
1789         vfs_freeopttbl(&mnt_mntopts);
1790         if (resource != NULL)
1791                 kmem_free(resource, strlen(resource) + 1);
1792         if (mountpt != NULL)
1793                 kmem_free(mountpt, strlen(mountpt) + 1);
1794         /*
1795          * It is possible we errored prior to adding to mount in progress
1796          * table. Must free vnode we acquired with successful lookupname.
1797          */
1798         if (addmip)
1799                 VN_RELE(bvp);
1800         if (delmip)
1801                 vfs_delmip(vfsp);
1802         ASSERT(vswp != NULL);
1803         vfs_unrefvfssw(vswp);
1804         if (inargs != opts)
1805                 kmem_free(inargs, MAX_MNTOPT_STR);
1806         if (copyout_error) {
1807                 lofi_remove(vfsp);
1808                 VFS_RELE(vfsp);
1809                 error = copyout_error;
1810         }
1811         return (error);
1812 }
1813
1814 static void
1815 vfs_setpath(
1816     struct vfs *vfsp,           /* vfs being updated */
1817     refstr_t **refp,            /* Ref-count string to contain the new path */
1818     const char *newpath,        /* Path to add to refp (above) */
1819     uint32_t flag)              /* flag */
1820 {
1821         size_t len;
1822         refstr_t *ref;
1823         zone_t *zone = curproc->p_zone;
1824         char *sp;
1825         int have_list_lock = 0;
1826
1827         ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1828
1829         /*
1830          * New path must be less than MAXPATHLEN because mntfs
1831          * will only display up to MAXPATHLEN bytes. This is currently
1832          * safe, because domount() uses pn_get(), and other callers
1833          * similarly cap the size to fewer than MAXPATHLEN bytes.
1834          */
1835
1836         ASSERT(strlen(newpath) < MAXPATHLEN);
1837
1838         /* mntfs requires consistency while vfs list lock is held */
1839
1840         if (VFS_ON_LIST(vfsp)) {
1841                 have_list_lock = 1;
1842                 vfs_list_lock();
1843         }
1844
1845         if (*refp != NULL)
1846                 refstr_rele(*refp);
1847
1848         /*
1849          * If we are in a non-global zone then we prefix the supplied path,
1850          * newpath, with the zone's root path, with two exceptions. The first
1851          * is where we have been explicitly directed to avoid doing so; this
1852          * will be the case following a failed remount, where the path supplied
1853          * will be a saved version which must now be restored. The second
1854          * exception is where newpath is not a pathname but a descriptive name,
1855          * e.g. "procfs".
1856          */
1857         if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1858                 ref = refstr_alloc(newpath);
1859                 goto out;
1860         }
1861
1862         /*
1863          * Truncate the trailing '/' in the zoneroot, and merge
1864          * in the zone's rootpath with the "newpath" (resource
1865          * or mountpoint) passed in.
1866          *
1867          * The size of the required buffer is thus the size of
1868          * the buffer required for the passed-in newpath
1869          * (strlen(newpath) + 1), plus the size of the buffer
1870          * required to hold zone_rootpath (zone_rootpathlen)
1871          * minus one for one of the now-superfluous NUL
1872          * terminations, minus one for the trailing '/'.
1873          *
1874          * That gives us:
1875          *
1876          * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1877          *
1878          * Which is what we have below.
1879          */
1880
1881         len = strlen(newpath) + zone->zone_rootpathlen - 1;
1882         sp = kmem_alloc(len, KM_SLEEP);
1883
1884         /*
1885          * Copy everything including the trailing slash, which
1886          * we then overwrite with the NUL character.
1887          */
1888
1889         (void) strcpy(sp, zone->zone_rootpath);
1890         sp[zone->zone_rootpathlen - 2] = '\0';
1891         (void) strcat(sp, newpath);
1892
1893         ref = refstr_alloc(sp);
1894         kmem_free(sp, len);
1895 out:
1896         *refp = ref;
1897
1898         if (have_list_lock) {
1899                 vfs_mnttab_modtimeupd();
1900                 vfs_list_unlock();
1901         }
1902 }
1903
1904 /*
1905  * Record a mounted resource name in a vfs structure.
1906  * If vfsp is already mounted, caller must hold the vfs lock.
1907  */
1908 void
1909 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1910 {
1911         if (resource == NULL || resource[0] == '\0')
1912                 resource = VFS_NORESOURCE;
1913         vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1914 }
1915
1916 /*
1917  * Record a mount point name in a vfs structure.
1918  * If vfsp is already mounted, caller must hold the vfs lock.
1919  */
1920 void
1921 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1922 {
1923         if (mntpt == NULL || mntpt[0] == '\0')
1924                 mntpt = VFS_NOMNTPT;
1925         vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1926 }
1927
1928 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1929
1930 refstr_t *
1931 vfs_getresource(const struct vfs *vfsp)
1932 {
1933         refstr_t *resource;
1934
1935         vfs_list_read_lock();
1936         resource = vfsp->vfs_resource;
1937         refstr_hold(resource);
1938         vfs_list_unlock();
1939
1940         return (resource);
1941 }
1942
1943 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1944
1945 refstr_t *
1946 vfs_getmntpoint(const struct vfs *vfsp)
1947 {
1948         refstr_t *mntpt;
1949
1950         vfs_list_read_lock();
1951         mntpt = vfsp->vfs_mntpt;
1952         refstr_hold(mntpt);
1953         vfs_list_unlock();
1954
1955         return (mntpt);
1956 }
1957
1958 /*
1959  * Create an empty options table with enough empty slots to hold all
1960  * The options in the options string passed as an argument.
1961  * Potentially prepend another options table.
1962  *
1963  * Note: caller is responsible for locking the vfs list, if needed,
1964  *       to protect mops.
1965  */
1966 static void
1967 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1968     const mntopts_t *mtmpl)
1969 {
1970         const char *s = opts;
1971         uint_t count;
1972
1973         if (opts == NULL || *opts == '\0') {
1974                 count = 0;
1975         } else {
1976                 count = 1;
1977
1978                 /*
1979                  * Count number of options in the string
1980                  */
1981                 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1982                         count++;
1983                         s++;
1984                 }
1985         }
1986         vfs_copyopttbl_extend(mtmpl, mops, count);
1987 }
1988
1989 /*
1990  * Create an empty options table with enough empty slots to hold all
1991  * The options in the options string passed as an argument.
1992  *
1993  * This function is *not* for general use by filesystems.
1994  *
1995  * Note: caller is responsible for locking the vfs list, if needed,
1996  *       to protect mops.
1997  */
1998 void
1999 vfs_createopttbl(mntopts_t *mops, const char *opts)
2000 {
2001         vfs_createopttbl_extend(mops, opts, NULL);
2002 }
2003
2004
2005 /*
2006  * Swap two mount options tables
2007  */
2008 static void
2009 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2010 {
2011         uint_t tmpcnt;
2012         mntopt_t *tmplist;
2013
2014         tmpcnt = optbl2->mo_count;
2015         tmplist = optbl2->mo_list;
2016         optbl2->mo_count = optbl1->mo_count;
2017         optbl2->mo_list = optbl1->mo_list;
2018         optbl1->mo_count = tmpcnt;
2019         optbl1->mo_list = tmplist;
2020 }
2021
2022 static void
2023 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2024 {
2025         vfs_list_lock();
2026         vfs_swapopttbl_nolock(optbl1, optbl2);
2027         vfs_mnttab_modtimeupd();
2028         vfs_list_unlock();
2029 }
2030
2031 static char **
2032 vfs_copycancelopt_extend(char **const moc, int extend)
2033 {
2034         int i = 0;
2035         int j;
2036         char **result;
2037
2038         if (moc != NULL) {
2039                 for (; moc[i] != NULL; i++)
2040                         /* count number of options to cancel */;
2041         }
2042
2043         if (i + extend == 0)
2044                 return (NULL);
2045
2046         result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2047
2048         for (j = 0; j < i; j++) {
2049                 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2050                 (void) strcpy(result[j], moc[j]);
2051         }
2052         for (; j <= i + extend; j++)
2053                 result[j] = NULL;
2054
2055         return (result);
2056 }
2057
2058 static void
2059 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2060 {
2061         char *sp, *dp;
2062
2063         d->mo_flags = s->mo_flags;
2064         d->mo_data = s->mo_data;
2065         sp = s->mo_name;
2066         if (sp != NULL) {
2067                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2068                 (void) strcpy(dp, sp);
2069                 d->mo_name = dp;
2070         } else {
2071                 d->mo_name = NULL; /* should never happen */
2072         }
2073
2074         d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2075
2076         sp = s->mo_arg;
2077         if (sp != NULL) {
2078                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2079                 (void) strcpy(dp, sp);
2080                 d->mo_arg = dp;
2081         } else {
2082                 d->mo_arg = NULL;
2083         }
2084 }
2085
2086 /*
2087  * Copy a mount options table, possibly allocating some spare
2088  * slots at the end.  It is permissible to copy_extend the NULL table.
2089  */
2090 static void
2091 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2092 {
2093         uint_t i, count;
2094         mntopt_t *motbl;
2095
2096         /*
2097          * Clear out any existing stuff in the options table being initialized
2098          */
2099         vfs_freeopttbl(dmo);
2100         count = (smo == NULL) ? 0 : smo->mo_count;
2101         if ((count + extra) == 0)       /* nothing to do */
2102                 return;
2103         dmo->mo_count = count + extra;
2104         motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2105         dmo->mo_list = motbl;
2106         for (i = 0; i < count; i++) {
2107                 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2108         }
2109         for (i = count; i < count + extra; i++) {
2110                 motbl[i].mo_flags = MO_EMPTY;
2111         }
2112 }
2113
2114 /*
2115  * Copy a mount options table.
2116  *
2117  * This function is *not* for general use by filesystems.
2118  *
2119  * Note: caller is responsible for locking the vfs list, if needed,
2120  *       to protect smo and dmo.
2121  */
2122 void
2123 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2124 {
2125         vfs_copyopttbl_extend(smo, dmo, 0);
2126 }
2127
2128 static char **
2129 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2130 {
2131         int c1 = 0;
2132         int c2 = 0;
2133         char **result;
2134         char **sp1, **sp2, **dp;
2135
2136         /*
2137          * First we count both lists of cancel options.
2138          * If either is NULL or has no elements, we return a copy of
2139          * the other.
2140          */
2141         if (mop1->mo_cancel != NULL) {
2142                 for (; mop1->mo_cancel[c1] != NULL; c1++)
2143                         /* count cancel options in mop1 */;
2144         }
2145
2146         if (c1 == 0)
2147                 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2148
2149         if (mop2->mo_cancel != NULL) {
2150                 for (; mop2->mo_cancel[c2] != NULL; c2++)
2151                         /* count cancel options in mop2 */;
2152         }
2153
2154         result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2155
2156         if (c2 == 0)
2157                 return (result);
2158
2159         /*
2160          * When we get here, we've got two sets of cancel options;
2161          * we need to merge the two sets.  We know that the result
2162          * array has "c1+c2+1" entries and in the end we might shrink
2163          * it.
2164          * Result now has a copy of the c1 entries from mop1; we'll
2165          * now lookup all the entries of mop2 in mop1 and copy it if
2166          * it is unique.
2167          * This operation is O(n^2) but it's only called once per
2168          * filesystem per duplicate option.  This is a situation
2169          * which doesn't arise with the filesystems in ON and
2170          * n is generally 1.
2171          */
2172
2173         dp = &result[c1];
2174         for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2175                 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2176                         if (strcmp(*sp1, *sp2) == 0)
2177                                 break;
2178                 }
2179                 if (*sp1 == NULL) {
2180                         /*
2181                          * Option *sp2 not found in mop1, so copy it.
2182                          * The calls to vfs_copycancelopt_extend()
2183                          * guarantee that there's enough room.
2184                          */
2185                         *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2186                         (void) strcpy(*dp++, *sp2);
2187                 }
2188         }
2189         if (dp != &result[c1+c2]) {
2190                 size_t bytes = (dp - result + 1) * sizeof (char *);
2191                 char **nres = kmem_alloc(bytes, KM_SLEEP);
2192
2193                 bcopy(result, nres, bytes);
2194                 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2195                 result = nres;
2196         }
2197         return (result);
2198 }
2199
2200 /*
2201  * Merge two mount option tables (outer and inner) into one.  This is very
2202  * similar to "merging" global variables and automatic variables in C.
2203  *
2204  * This isn't (and doesn't have to be) fast.
2205  *
2206  * This function is *not* for general use by filesystems.
2207  *
2208  * Note: caller is responsible for locking the vfs list, if needed,
2209  *       to protect omo, imo & dmo.
2210  */
2211 void
2212 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2213 {
2214         uint_t i, count;
2215         mntopt_t *mop, *motbl;
2216         uint_t freeidx;
2217
2218         /*
2219          * First determine how much space we need to allocate.
2220          */
2221         count = omo->mo_count;
2222         for (i = 0; i < imo->mo_count; i++) {
2223                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2224                         continue;
2225                 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2226                         count++;
2227         }
2228         ASSERT(count >= omo->mo_count &&
2229             count <= omo->mo_count + imo->mo_count);
2230         motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2231         for (i = 0; i < omo->mo_count; i++)
2232                 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2233         freeidx = omo->mo_count;
2234         for (i = 0; i < imo->mo_count; i++) {
2235                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2236                         continue;
2237                 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2238                         char **newcanp;
2239                         uint_t index = mop - omo->mo_list;
2240
2241                         newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2242
2243                         vfs_freeopt(&motbl[index]);
2244                         vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2245
2246                         vfs_freecancelopt(motbl[index].mo_cancel);
2247                         motbl[index].mo_cancel = newcanp;
2248                 } else {
2249                         /*
2250                          * If it's a new option, just copy it over to the first
2251                          * free location.
2252                          */
2253                         vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2254                 }
2255         }
2256         dmo->mo_count = count;
2257         dmo->mo_list = motbl;
2258 }
2259
2260 /*
2261  * Functions to set and clear mount options in a mount options table.
2262  */
2263
2264 /*
2265  * Clear a mount option, if it exists.
2266  *
2267  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2268  * the vfs list.
2269  */
2270 static void
2271 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2272 {
2273         struct mntopt *mop;
2274         uint_t i, count;
2275
2276         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2277
2278         count = mops->mo_count;
2279         for (i = 0; i < count; i++) {
2280                 mop = &mops->mo_list[i];
2281
2282                 if (mop->mo_flags & MO_EMPTY)
2283                         continue;
2284                 if (strcmp(opt, mop->mo_name))
2285                         continue;
2286                 mop->mo_flags &= ~MO_SET;
2287                 if (mop->mo_arg != NULL) {
2288                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2289                 }
2290                 mop->mo_arg = NULL;
2291                 if (update_mnttab)
2292                         vfs_mnttab_modtimeupd();
2293                 break;
2294         }
2295 }
2296
2297 void
2298 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2299 {
2300         int gotlock = 0;
2301
2302         if (VFS_ON_LIST(vfsp)) {
2303                 gotlock = 1;
2304                 vfs_list_lock();
2305         }
2306         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2307         if (gotlock)
2308                 vfs_list_unlock();
2309 }
2310
2311
2312 /*
2313  * Set a mount option on.  If it's not found in the table, it's silently
2314  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2315  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2316  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2317  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2318  * MO_EMPTY set is created as the option passed in.
2319  *
2320  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2321  * the vfs list.
2322  */
2323 static void
2324 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2325     const char *arg, int flags, int update_mnttab)
2326 {
2327         mntopt_t *mop;
2328         uint_t i, count;
2329         char *sp;
2330
2331         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2332
2333         if (flags & VFS_CREATEOPT) {
2334                 if (vfs_hasopt(mops, opt) != NULL) {
2335                         flags &= ~VFS_CREATEOPT;
2336                 }
2337         }
2338         count = mops->mo_count;
2339         for (i = 0; i < count; i++) {
2340                 mop = &mops->mo_list[i];
2341
2342                 if (mop->mo_flags & MO_EMPTY) {
2343                         if ((flags & VFS_CREATEOPT) == 0)
2344                                 continue;
2345                         sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2346                         (void) strcpy(sp, opt);
2347                         mop->mo_name = sp;
2348                         if (arg != NULL)
2349                                 mop->mo_flags = MO_HASVALUE;
2350                         else
2351                                 mop->mo_flags = 0;
2352                 } else if (strcmp(opt, mop->mo_name)) {
2353                         continue;
2354                 }
2355                 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2356                         break;
2357                 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2358                         sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2359                         (void) strcpy(sp, arg);
2360                 } else {
2361                         sp = NULL;
2362                 }
2363                 if (mop->mo_arg != NULL)
2364                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2365                 mop->mo_arg = sp;
2366                 if (flags & VFS_DISPLAY)
2367                         mop->mo_flags &= ~MO_NODISPLAY;
2368                 if (flags & VFS_NODISPLAY)
2369                         mop->mo_flags |= MO_NODISPLAY;
2370                 mop->mo_flags |= MO_SET;
2371                 if (mop->mo_cancel != NULL) {
2372                         char **cp;
2373
2374                         for (cp = mop->mo_cancel; *cp != NULL; cp++)
2375                                 vfs_clearmntopt_nolock(mops, *cp, 0);
2376                 }
2377                 if (update_mnttab)
2378                         vfs_mnttab_modtimeupd();
2379                 break;
2380         }
2381 }
2382
2383 void
2384 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2385 {
2386         int gotlock = 0;
2387
2388         if (VFS_ON_LIST(vfsp)) {
2389                 gotlock = 1;
2390                 vfs_list_lock();
2391         }
2392         vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2393         if (gotlock)
2394                 vfs_list_unlock();
2395 }
2396
2397
2398 /*
2399  * Add a "tag" option to a mounted file system's options list.
2400  *
2401  * Note: caller is responsible for locking the vfs list, if needed,
2402  *       to protect mops.
2403  */
2404 static mntopt_t *
2405 vfs_addtag(mntopts_t *mops, const char *tag)
2406 {
2407         uint_t count;
2408         mntopt_t *mop, *motbl;
2409
2410         count = mops->mo_count + 1;
2411         motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2412         if (mops->mo_count) {
2413                 size_t len = (count - 1) * sizeof (mntopt_t);
2414
2415                 bcopy(mops->mo_list, motbl, len);
2416                 kmem_free(mops->mo_list, len);
2417         }
2418         mops->mo_count = count;
2419         mops->mo_list = motbl;
2420         mop = &motbl[count - 1];
2421         mop->mo_flags = MO_TAG;
2422         mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2423         (void) strcpy(mop->mo_name, tag);
2424         return (mop);
2425 }
2426
2427 /*
2428  * Allow users to set arbitrary "tags" in a vfs's mount options.
2429  * Broader use within the kernel is discouraged.
2430  */
2431 int
2432 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2433     cred_t *cr)
2434 {
2435         vfs_t *vfsp;
2436         mntopts_t *mops;
2437         mntopt_t *mop;
2438         int found = 0;
2439         dev_t dev = makedevice(major, minor);
2440         int err = 0;
2441         char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2442
2443         /*
2444          * Find the desired mounted file system
2445          */
2446         vfs_list_lock();
2447         vfsp = rootvfs;
2448         do {
2449                 if (vfsp->vfs_dev == dev &&
2450                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2451                         found = 1;
2452                         break;
2453                 }
2454                 vfsp = vfsp->vfs_next;
2455         } while (vfsp != rootvfs);
2456
2457         if (!found) {
2458                 err = EINVAL;
2459                 goto out;
2460         }
2461         err = secpolicy_fs_config(cr, vfsp);
2462         if (err != 0)
2463                 goto out;
2464
2465         mops = &vfsp->vfs_mntopts;
2466         /*
2467          * Add tag if it doesn't already exist
2468          */
2469         if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2470                 int len;
2471
2472                 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2473                 len = strlen(buf);
2474                 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2475                         err = ENAMETOOLONG;
2476                         goto out;
2477                 }
2478                 mop = vfs_addtag(mops, tag);
2479         }
2480         if ((mop->mo_flags & MO_TAG) == 0) {
2481                 err = EINVAL;
2482                 goto out;
2483         }
2484         vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2485 out:
2486         vfs_list_unlock();
2487         kmem_free(buf, MAX_MNTOPT_STR);
2488         return (err);
2489 }
2490
2491 /*
2492  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2493  * Broader use within the kernel is discouraged.
2494  */
2495 int
2496 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2497     cred_t *cr)
2498 {
2499         vfs_t *vfsp;
2500         mntopt_t *mop;
2501         int found = 0;
2502         dev_t dev = makedevice(major, minor);
2503         int err = 0;
2504
2505         /*
2506          * Find the desired mounted file system
2507          */
2508         vfs_list_lock();
2509         vfsp = rootvfs;
2510         do {
2511                 if (vfsp->vfs_dev == dev &&
2512                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2513                         found = 1;
2514                         break;
2515                 }
2516                 vfsp = vfsp->vfs_next;
2517         } while (vfsp != rootvfs);
2518
2519         if (!found) {
2520                 err = EINVAL;
2521                 goto out;
2522         }
2523         err = secpolicy_fs_config(cr, vfsp);
2524         if (err != 0)
2525                 goto out;
2526
2527         if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2528                 err = EINVAL;
2529                 goto out;
2530         }
2531         if ((mop->mo_flags & MO_TAG) == 0) {
2532                 err = EINVAL;
2533                 goto out;
2534         }
2535         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2536 out:
2537         vfs_list_unlock();
2538         return (err);
2539 }
2540
2541 /*
2542  * Function to parse an option string and fill in a mount options table.
2543  * Unknown options are silently ignored.  The input option string is modified
2544  * by replacing separators with nulls.  If the create flag is set, options
2545  * not found in the table are just added on the fly.  The table must have
2546  * an option slot marked MO_EMPTY to add an option on the fly.
2547  *
2548  * This function is *not* for general use by filesystems.
2549  *
2550  * Note: caller is responsible for locking the vfs list, if needed,
2551  *       to protect mops..
2552  */
2553 void
2554 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2555 {
2556         char *s = osp, *p, *nextop, *valp, *cp, *ep;
2557         int setflg = VFS_NOFORCEOPT;
2558
2559         if (osp == NULL)
2560                 return;
2561         while (*s != '\0') {
2562                 p = strchr(s, ',');     /* find next option */
2563                 if (p == NULL) {
2564                         cp = NULL;
2565                         p = s + strlen(s);
2566                 } else {
2567                         cp = p;         /* save location of comma */
2568                         *p++ = '\0';    /* mark end and point to next option */
2569                 }
2570                 nextop = p;
2571                 p = strchr(s, '=');     /* look for value */
2572                 if (p == NULL) {
2573                         valp = NULL;    /* no value supplied */
2574                 } else {
2575                         ep = p;         /* save location of equals */
2576                         *p++ = '\0';    /* end option and point to value */
2577                         valp = p;
2578                 }
2579                 /*
2580                  * set option into options table
2581                  */
2582                 if (create)
2583                         setflg |= VFS_CREATEOPT;
2584                 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2585                 if (cp != NULL)
2586                         *cp = ',';      /* restore the comma */
2587                 if (valp != NULL)
2588                         *ep = '=';      /* restore the equals */
2589                 s = nextop;
2590         }
2591 }
2592
2593 /*
2594  * Function to inquire if an option exists in a mount options table.
2595  * Returns a pointer to the option if it exists, else NULL.
2596  *
2597  * This function is *not* for general use by filesystems.
2598  *
2599  * Note: caller is responsible for locking the vfs list, if needed,
2600  *       to protect mops.
2601  */
2602 struct mntopt *
2603 vfs_hasopt(const mntopts_t *mops, const char *opt)
2604 {
2605         struct mntopt *mop;
2606         uint_t i, count;
2607
2608         count = mops->mo_count;
2609         for (i = 0; i < count; i++) {
2610                 mop = &mops->mo_list[i];
2611
2612                 if (mop->mo_flags & MO_EMPTY)
2613                         continue;
2614                 if (strcmp(opt, mop->mo_name) == 0)
2615                         return (mop);
2616         }
2617         return (NULL);
2618 }
2619
2620 /*
2621  * Function to inquire if an option is set in a mount options table.
2622  * Returns non-zero if set and fills in the arg pointer with a pointer to
2623  * the argument string or NULL if there is no argument string.
2624  */
2625 static int
2626 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2627 {
2628         struct mntopt *mop;
2629         uint_t i, count;
2630
2631         count = mops->mo_count;
2632         for (i = 0; i < count; i++) {
2633                 mop = &mops->mo_list[i];
2634
2635                 if (mop->mo_flags & MO_EMPTY)
2636                         continue;
2637                 if (strcmp(opt, mop->mo_name))
2638                         continue;
2639                 if ((mop->mo_flags & MO_SET) == 0)
2640                         return (0);
2641                 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2642                         *argp = mop->mo_arg;
2643                 return (1);
2644         }
2645         return (0);
2646 }
2647
2648
2649 int
2650 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2651 {
2652         int ret;
2653
2654         vfs_list_read_lock();
2655         ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2656         vfs_list_unlock();
2657         return (ret);
2658 }
2659
2660
2661 /*
2662  * Construct a comma separated string of the options set in the given
2663  * mount table, return the string in the given buffer.  Return non-zero if
2664  * the buffer would overflow.
2665  *
2666  * This function is *not* for general use by filesystems.
2667  *
2668  * Note: caller is responsible for locking the vfs list, if needed,
2669  *       to protect mp.
2670  */
2671 int
2672 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2673 {
2674         char *cp;
2675         uint_t i;
2676
2677         buf[0] = '\0';
2678         cp = buf;
2679         for (i = 0; i < mp->mo_count; i++) {
2680                 struct mntopt *mop;
2681
2682                 mop = &mp->mo_list[i];
2683                 if (mop->mo_flags & MO_SET) {
2684                         int optlen, comma = 0;
2685
2686                         if (buf[0] != '\0')
2687                                 comma = 1;
2688                         optlen = strlen(mop->mo_name);
2689                         if (strlen(buf) + comma + optlen + 1 > len)
2690                                 goto err;
2691                         if (comma)
2692                                 *cp++ = ',';
2693                         (void) strcpy(cp, mop->mo_name);
2694                         cp += optlen;
2695                         /*
2696                          * Append option value if there is one
2697                          */
2698                         if (mop->mo_arg != NULL) {
2699                                 int arglen;
2700
2701                                 arglen = strlen(mop->mo_arg);
2702                                 if (strlen(buf) + arglen + 2 > len)
2703                                         goto err;
2704                                 *cp++ = '=';
2705                                 (void) strcpy(cp, mop->mo_arg);
2706                                 cp += arglen;
2707                         }
2708                 }
2709         }
2710         return (0);
2711 err:
2712         return (EOVERFLOW);
2713 }
2714
/*
 * Free a NULL-terminated vector of cancel-option strings: each string
 * is freed with its strlen()+1 size, then the vector itself, which
 * occupies one pointer per string plus one for the terminator.  A NULL
 * vector is a no-op.
 */
static void
vfs_freecancelopt(char **moc)
{
        int nstr;

        if (moc == NULL)
                return;

        for (nstr = 0; moc[nstr] != NULL; nstr++)
                kmem_free(moc[nstr], strlen(moc[nstr]) + 1);

        kmem_free(moc, (nstr + 1) * sizeof (char *));
}
2729
2730 static void
2731 vfs_freeopt(mntopt_t *mop)
2732 {
2733         if (mop->mo_name != NULL)
2734                 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2735
2736         vfs_freecancelopt(mop->mo_cancel);
2737
2738         if (mop->mo_arg != NULL)
2739                 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2740 }
2741
2742 /*
2743  * Free a mount options table
2744  *
2745  * This function is *not* for general use by filesystems.
2746  *
2747  * Note: caller is responsible for locking the vfs list, if needed,
2748  *       to protect mp.
2749  */
2750 void
2751 vfs_freeopttbl(mntopts_t *mp)
2752 {
2753         uint_t i, count;
2754
2755         count = mp->mo_count;
2756         for (i = 0; i < count; i++) {
2757                 vfs_freeopt(&mp->mo_list[i]);
2758         }
2759         if (count) {
2760                 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2761                 mp->mo_count = 0;
2762                 mp->mo_list = NULL;
2763         }
2764 }
2765
2766
2767 /* ARGSUSED */
2768 static int
2769 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2770     caller_context_t *ct)
2771 {
2772         return (0);
2773 }
2774
2775 /* ARGSUSED */
2776 static int
2777 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2778     caller_context_t *ct)
2779 {
2780         return (0);
2781 }
2782
2783 /*
2784  * The dummy vnode is currently used only by file events notification
2785  * module which is just interested in the timestamps.
2786  */
2787 /* ARGSUSED */
2788 static int
2789 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2790     caller_context_t *ct)
2791 {
2792         bzero(vap, sizeof (vattr_t));
2793         vap->va_type = VREG;
2794         vap->va_nlink = 1;
2795         vap->va_ctime = vfs_mnttab_ctime;
2796         /*
2797          * it is ok to just copy mtime as the time will be monotonically
2798          * increasing.
2799          */
2800         vap->va_mtime = vfs_mnttab_mtime;
2801         vap->va_atime = vap->va_mtime;
2802         return (0);
2803 }
2804
2805 static void
2806 vfs_mnttabvp_setup(void)
2807 {
2808         vnode_t *tvp;
2809         vnodeops_t *vfs_mntdummyvnops;
2810         const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2811                 VOPNAME_READ,           { .vop_read = vfs_mntdummyread },
2812                 VOPNAME_WRITE,          { .vop_write = vfs_mntdummywrite },
2813                 VOPNAME_GETATTR,        { .vop_getattr = vfs_mntdummygetattr },
2814                 VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
2815                 NULL,                   NULL
2816         };
2817
2818         if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2819             &vfs_mntdummyvnops) != 0) {
2820                 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2821                 /* Shouldn't happen, but not bad enough to panic */
2822                 return;
2823         }
2824
2825         /*
2826          * A global dummy vnode is allocated to represent mntfs files.
2827          * The mntfs file (/etc/mnttab) can be monitored for file events
2828          * and receive an event when mnttab changes. Dummy VOP calls
2829          * will be made on this vnode. The file events notification module
2830          * intercepts this vnode and delivers relevant events.
2831          */
2832         tvp = vn_alloc(KM_SLEEP);
2833         tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2834         vn_setops(tvp, vfs_mntdummyvnops);
2835         tvp->v_type = VREG;
2836         /*
2837          * The mnt dummy ops do not reference v_data.
2838          * No other module intercepting this vnode should either.
2839          * Just set it to point to itself.
2840          */
2841         tvp->v_data = (caddr_t)tvp;
2842         tvp->v_vfsp = rootvfs;
2843         vfs_mntdummyvp = tvp;
2844 }
2845
2846 /*
2847  * performs fake read/write ops
2848  */
2849 static void
2850 vfs_mnttab_rwop(int rw)
2851 {
2852         struct uio      uio;
2853         struct iovec    iov;
2854         char    buf[1];
2855
2856         if (vfs_mntdummyvp == NULL)
2857                 return;
2858
2859         bzero(&uio, sizeof (uio));
2860         bzero(&iov, sizeof (iov));
2861         iov.iov_base = buf;
2862         iov.iov_len = 0;
2863         uio.uio_iov = &iov;
2864         uio.uio_iovcnt = 1;
2865         uio.uio_loffset = 0;
2866         uio.uio_segflg = UIO_SYSSPACE;
2867         uio.uio_resid = 0;
2868         if (rw) {
2869                 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2870         } else {
2871                 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2872         }
2873 }
2874
/*
 * Generate a fake write operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_writeop(void)
{
        const int do_write = 1;

        vfs_mnttab_rwop(do_write);
}
2883
/*
 * Generate a fake read operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_readop(void)
{
        const int do_write = 0;

        vfs_mnttab_rwop(do_write);
}
2892
2893 /*
2894  * Free any mnttab information recorded in the vfs struct.
2895  * The vfs must not be on the vfs list.
2896  */
2897 static void
2898 vfs_freemnttab(struct vfs *vfsp)
2899 {
2900         ASSERT(!VFS_ON_LIST(vfsp));
2901
2902         /*
2903          * Free device and mount point information
2904          */
2905         if (vfsp->vfs_mntpt != NULL) {
2906                 refstr_rele(vfsp->vfs_mntpt);
2907                 vfsp->vfs_mntpt = NULL;
2908         }
2909         if (vfsp->vfs_resource != NULL) {
2910                 refstr_rele(vfsp->vfs_resource);
2911                 vfsp->vfs_resource = NULL;
2912         }
2913         /*
2914          * Now free mount options information
2915          */
2916         vfs_freeopttbl(&vfsp->vfs_mntopts);
2917 }
2918
2919 /*
2920  * Return the last mnttab modification time
2921  */
2922 void
2923 vfs_mnttab_modtime(timespec_t *ts)
2924 {
2925         ASSERT(RW_LOCK_HELD(&vfslist));
2926         *ts = vfs_mnttab_mtime;
2927 }
2928
2929 /*
2930  * See if mnttab is changed
2931  */
2932 void
2933 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2934 {
2935         int changed;
2936
2937         *phpp = (struct pollhead *)NULL;
2938
2939         /*
2940          * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2941          * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2942          * to not grab the vfs list lock because tv_sec is monotonically
2943          * increasing.
2944          */
2945
2946         changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2947             (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2948         if (!changed) {
2949                 *phpp = &vfs_pollhd;
2950         }
2951 }
2952
2953 /* Provide a unique and monotonically-increasing timestamp. */
2954 void
2955 vfs_mono_time(timespec_t *ts)
2956 {
2957         static volatile hrtime_t hrt;           /* The saved time. */
2958         hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
2959         timespec_t      newts;
2960
2961         /*
2962          * Try gethrestime() first, but be prepared to fabricate a sensible
2963          * answer at the first sign of any trouble.
2964          */
2965         gethrestime(&newts);
2966         newhrt = ts2hrt(&newts);
2967         for (;;) {
2968                 oldhrt = hrt;
2969                 if (newhrt <= hrt)
2970                         newhrt = hrt + 1;
2971                 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2972                         break;
2973         }
2974         hrt2ts(newhrt, ts);
2975 }
2976
2977 /*
2978  * Update the mnttab modification time and wake up any waiters for
2979  * mnttab changes
2980  */
2981 void
2982 vfs_mnttab_modtimeupd()
2983 {
2984         hrtime_t oldhrt, newhrt;
2985
2986         ASSERT(RW_WRITE_HELD(&vfslist));
2987         oldhrt = ts2hrt(&vfs_mnttab_mtime);
2988         gethrestime(&vfs_mnttab_mtime);
2989         newhrt = ts2hrt(&vfs_mnttab_mtime);
2990         if (oldhrt == (hrtime_t)0)
2991                 vfs_mnttab_ctime = vfs_mnttab_mtime;
2992         /*
2993          * Attempt to provide unique mtime (like uniqtime but not).
2994          */
2995         if (newhrt == oldhrt) {
2996                 newhrt++;
2997                 hrt2ts(newhrt, &vfs_mnttab_mtime);
2998         }
2999         pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
3000         vfs_mnttab_writeop();
3001 }
3002
3003 int
3004 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3005 {
3006         vnode_t *coveredvp;
3007         int error;
3008         extern void teardown_vopstats(vfs_t *);
3009
3010         /*
3011          * Get covered vnode. This will be NULL if the vfs is not linked
3012          * into the file system name space (i.e., domount() with MNT_NOSPICE).
3013          */
3014         coveredvp = vfsp->vfs_vnodecovered;
3015         ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3016
3017         /*
3018          * Purge all dnlc entries for this vfs.
3019          */
3020         (void) dnlc_purge_vfsp(vfsp, 0);
3021
3022         /* For forcible umount, skip VFS_SYNC() since it may hang */
3023         if ((flag & MS_FORCE) == 0)
3024                 (void) VFS_SYNC(vfsp, 0, cr);
3025
3026         /*
3027          * Lock the vfs to maintain fs status quo during unmount.  This
3028          * has to be done after the sync because ufs_update tries to acquire
3029          * the vfs_reflock.
3030          */
3031         vfs_lock_wait(vfsp);
3032
3033         if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3034                 vfs_unlock(vfsp);
3035                 if (coveredvp != NULL)
3036                         vn_vfsunlock(coveredvp);
3037         } else if (coveredvp != NULL) {
3038                 teardown_vopstats(vfsp);
3039                 /*
3040                  * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3041                  * when it frees vfsp so we do a VN_HOLD() so we can
3042                  * continue to use coveredvp afterwards.
3043                  */
3044                 VN_HOLD(coveredvp);
3045                 vfs_remove(vfsp);
3046                 vn_vfsunlock(coveredvp);
3047                 VN_RELE(coveredvp);
3048         } else {
3049                 teardown_vopstats(vfsp);
3050                 /*
3051                  * Release the reference to vfs that is not linked
3052                  * into the name space.
3053                  */
3054                 vfs_unlock(vfsp);
3055                 VFS_RELE(vfsp);
3056         }
3057         return (error);
3058 }
3059
3060
3061 /*
3062  * Vfs_unmountall() is called by uadmin() to unmount all
3063  * mounted file systems (except the root file system) during shutdown.
3064  * It follows the existing locking protocol when traversing the vfs list
3065  * to sync and unmount vfses. Even though there should be no
3066  * other thread running while the system is shutting down, it is prudent
3067  * to still follow the locking protocol.
3068  */
3069 void
3070 vfs_unmountall(void)
3071 {
3072         struct vfs *vfsp;
3073         struct vfs *prev_vfsp = NULL;
3074         int error;
3075
3076         /*
3077          * Toss all dnlc entries now so that the per-vfs sync
3078          * and unmount operations don't have to slog through
3079          * a bunch of uninteresting vnodes over and over again.
3080          */
3081         dnlc_purge();
3082
3083         vfs_list_lock();
3084         for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3085                 prev_vfsp = vfsp->vfs_prev;
3086
3087                 if (vfs_lock(vfsp) != 0)
3088                         continue;
3089                 error = vn_vfswlock(vfsp->vfs_vnodecovered);
3090                 vfs_unlock(vfsp);
3091                 if (error)
3092                         continue;
3093
3094                 vfs_list_unlock();
3095
3096                 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3097                 (void) dounmount(vfsp, 0, CRED());
3098
3099                 /*
3100                  * Since we dropped the vfslist lock above we must
3101                  * verify that next_vfsp still exists, else start over.
3102                  */
3103                 vfs_list_lock();
3104                 for (vfsp = rootvfs->vfs_prev;
3105                     vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3106                         if (vfsp == prev_vfsp)
3107                                 break;
3108                 if (vfsp == rootvfs && prev_vfsp != rootvfs)
3109                         prev_vfsp = rootvfs->vfs_prev;
3110         }
3111         vfs_list_unlock();
3112 }
3113
3114 /*
3115  * Called to add an entry to the end of the vfs mount in progress list
3116  */
3117 void
3118 vfs_addmip(dev_t dev, struct vfs *vfsp)
3119 {
3120         struct ipmnt *mipp;
3121
3122         mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3123         mipp->mip_next = NULL;
3124         mipp->mip_dev = dev;
3125         mipp->mip_vfsp = vfsp;
3126         mutex_enter(&vfs_miplist_mutex);
3127         if (vfs_miplist_end != NULL)
3128                 vfs_miplist_end->mip_next = mipp;
3129         else
3130                 vfs_miplist = mipp;
3131         vfs_miplist_end = mipp;
3132         mutex_exit(&vfs_miplist_mutex);
3133 }
3134
3135 /*
3136  * Called to remove an entry from the mount in progress list
3137  * Either because the mount completed or it failed.
3138  */
3139 void
3140 vfs_delmip(struct vfs *vfsp)
3141 {
3142         struct ipmnt *mipp, *mipprev;
3143
3144         mutex_enter(&vfs_miplist_mutex);
3145         mipprev = NULL;
3146         for (mipp = vfs_miplist;
3147             mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3148                 mipprev = mipp;
3149         }
3150         if (mipp == NULL)
3151                 return; /* shouldn't happen */
3152         if (mipp == vfs_miplist_end)
3153                 vfs_miplist_end = mipprev;
3154         if (mipprev == NULL)
3155                 vfs_miplist = mipp->mip_next;
3156         else
3157                 mipprev->mip_next = mipp->mip_next;
3158         mutex_exit(&vfs_miplist_mutex);
3159         kmem_free(mipp, sizeof (struct ipmnt));
3160 }
3161
3162 /*
3163  * vfs_add is called by a specific filesystem's mount routine to add
3164  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3165  * The vfs should already have been locked by the caller.
3166  *
3167  * coveredvp is NULL if this is the root.
3168  */
3169 void
3170 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3171 {
3172         int newflag;
3173
3174         ASSERT(vfs_lock_held(vfsp));
3175         VFS_HOLD(vfsp);
3176         newflag = vfsp->vfs_flag;
3177         if (mflag & MS_RDONLY)
3178                 newflag |= VFS_RDONLY;
3179         else
3180                 newflag &= ~VFS_RDONLY;
3181         if (mflag & MS_NOSUID)
3182                 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3183         else
3184                 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3185         if (mflag & MS_NOMNTTAB)
3186                 newflag |= VFS_NOMNTTAB;
3187         else
3188                 newflag &= ~VFS_NOMNTTAB;
3189
3190         if (coveredvp != NULL) {
3191                 ASSERT(vn_vfswlock_held(coveredvp));
3192                 coveredvp->v_vfsmountedhere = vfsp;
3193                 VN_HOLD(coveredvp);
3194         }
3195         vfsp->vfs_vnodecovered = coveredvp;
3196         vfsp->vfs_flag = newflag;
3197
3198         vfs_list_add(vfsp);
3199 }
3200
3201 /*
3202  * Remove a vfs from the vfs list, null out the pointer from the
3203  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3204  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3205  * reference to the vfs and to the covered vnode.
3206  *
3207  * Called from dounmount after it's confirmed with the file system
3208  * that the unmount is legal.
3209  */
3210 void
3211 vfs_remove(struct vfs *vfsp)
3212 {
3213         vnode_t *vp;
3214
3215         ASSERT(vfs_lock_held(vfsp));
3216
3217         /*
3218          * Can't unmount root.  Should never happen because fs will
3219          * be busy.
3220          */
3221         if (vfsp == rootvfs)
3222                 panic("vfs_remove: unmounting root");
3223
3224         vfs_list_remove(vfsp);
3225
3226         /*
3227          * Unhook from the file system name space.
3228          */
3229         vp = vfsp->vfs_vnodecovered;
3230         ASSERT(vn_vfswlock_held(vp));
3231         vp->v_vfsmountedhere = NULL;
3232         vfsp->vfs_vnodecovered = NULL;
3233         VN_RELE(vp);
3234
3235         /*
3236          * Release lock and wakeup anybody waiting.
3237          */
3238         vfs_unlock(vfsp);
3239         VFS_RELE(vfsp);
3240 }
3241
3242 /*
3243  * Lock a filesystem to prevent access to it while mounting,
3244  * unmounting and syncing.  Return EBUSY immediately if lock
3245  * can't be acquired.
3246  */
3247 int
3248 vfs_lock(vfs_t *vfsp)
3249 {
3250         vn_vfslocks_entry_t *vpvfsentry;
3251
3252         vpvfsentry = vn_vfslocks_getlock(vfsp);
3253         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3254                 return (0);
3255
3256         vn_vfslocks_rele(vpvfsentry);
3257         return (EBUSY);
3258 }
3259
3260 int
3261 vfs_rlock(vfs_t *vfsp)
3262 {
3263         vn_vfslocks_entry_t *vpvfsentry;
3264
3265         vpvfsentry = vn_vfslocks_getlock(vfsp);
3266
3267         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3268                 return (0);
3269
3270         vn_vfslocks_rele(vpvfsentry);
3271         return (EBUSY);
3272 }
3273
3274 void
3275 vfs_lock_wait(vfs_t *vfsp)
3276 {
3277         vn_vfslocks_entry_t *vpvfsentry;
3278
3279         vpvfsentry = vn_vfslocks_getlock(vfsp);
3280         rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3281 }
3282
3283 void
3284 vfs_rlock_wait(vfs_t *vfsp)
3285 {
3286         vn_vfslocks_entry_t *vpvfsentry;
3287
3288         vpvfsentry = vn_vfslocks_getlock(vfsp);
3289         rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3290 }
3291
3292 /*
3293  * Unlock a locked filesystem.
3294  */
3295 void
3296 vfs_unlock(vfs_t *vfsp)
3297 {
3298         vn_vfslocks_entry_t *vpvfsentry;
3299
3300         /*
3301          * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3302          * And these changes should remain for the patch changes as it is.
3303          */
3304         if (panicstr)
3305                 return;
3306
3307         /*
3308          * ve_refcount needs to be dropped twice here.
3309          * 1. To release refernce after a call to vfs_locks_getlock()
3310          * 2. To release the reference from the locking routines like
3311          *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3312          */
3313
3314         vpvfsentry = vn_vfslocks_getlock(vfsp);
3315         vn_vfslocks_rele(vpvfsentry);
3316
3317         rwst_exit(&vpvfsentry->ve_lock);
3318         vn_vfslocks_rele(vpvfsentry);
3319 }
3320
3321 /*
3322  * Utility routine that allows a filesystem to construct its
3323  * fsid in "the usual way" - by munging some underlying dev_t and
3324  * the filesystem type number into the 64-bit fsid.  Note that
3325  * this implicitly relies on dev_t persistence to make filesystem
3326  * id's persistent.
3327  *
3328  * There's nothing to prevent an individual fs from constructing its
3329  * fsid in a different way, and indeed they should.
3330  *
3331  * Since we want fsids to be 32-bit quantities (so that they can be
3332  * exported identically by either 32-bit or 64-bit APIs, as well as
3333  * the fact that fsid's are "known" to NFS), we compress the device
3334  * number given down to 32-bits, and panic if that isn't possible.
3335  */
3336 void
3337 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3338 {
3339         if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3340                 panic("device number too big for fsid!");
3341         fsi->val[1] = val;
3342 }
3343
3344 int
3345 vfs_lock_held(vfs_t *vfsp)
3346 {
3347         int held;
3348         vn_vfslocks_entry_t *vpvfsentry;
3349
3350         /*
3351          * vfs_lock_held will mimic sema_held behaviour
3352          * if panicstr is set. And these changes should remain
3353          * for the patch changes as it is.
3354          */
3355         if (panicstr)
3356                 return (1);
3357
3358         vpvfsentry = vn_vfslocks_getlock(vfsp);
3359         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3360
3361         vn_vfslocks_rele(vpvfsentry);
3362         return (held);
3363 }
3364
3365 struct _kthread *
3366 vfs_lock_owner(vfs_t *vfsp)
3367 {
3368         struct _kthread *owner;
3369         vn_vfslocks_entry_t *vpvfsentry;
3370
3371         /*
3372          * vfs_wlock_held will mimic sema_held behaviour
3373          * if panicstr is set. And these changes should remain
3374          * for the patch changes as it is.
3375          */
3376         if (panicstr)
3377                 return (NULL);
3378
3379         vpvfsentry = vn_vfslocks_getlock(vfsp);
3380         owner = rwst_owner(&vpvfsentry->ve_lock);
3381
3382         vn_vfslocks_rele(vpvfsentry);
3383         return (owner);
3384 }
3385
3386 /*
3387  * vfs list locking.
3388  *
3389  * Rather than manipulate the vfslist lock directly, we abstract into lock
3390  * and unlock routines to allow the locking implementation to be changed for
3391  * clustering.
3392  *
3393  * Whenever the vfs list is modified through its hash links, the overall list
3394  * lock must be obtained before locking the relevant hash bucket.  But to see
3395  * whether a given vfs is on the list, it suffices to obtain the lock for the
3396  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3397  */
3398
3399 void
3400 vfs_list_lock()
3401 {
3402         rw_enter(&vfslist, RW_WRITER);
3403 }
3404
3405 void
3406 vfs_list_read_lock()
3407 {
3408         rw_enter(&vfslist, RW_READER);
3409 }
3410
3411 void
3412 vfs_list_unlock()
3413 {
3414         rw_exit(&vfslist);
3415 }
3416
3417 /*
3418  * Low level worker routines for adding entries to and removing entries from
3419  * the vfs list.
3420  */
3421
3422 static void
3423 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3424 {
3425         int vhno;
3426         struct vfs **hp;
3427         dev_t dev;
3428
3429         ASSERT(RW_WRITE_HELD(&vfslist));
3430
3431         dev = expldev(vfsp->vfs_fsid.val[0]);
3432         vhno = VFSHASH(getmajor(dev), getminor(dev));
3433
3434         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3435
3436         /*
3437          * Link into the hash table, inserting it at the end, so that LOFS
3438          * with the same fsid as UFS (or other) file systems will not hide the
3439          * UFS.
3440          */
3441         if (insert_at_head) {
3442                 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3443                 rvfs_list[vhno].rvfs_head = vfsp;
3444         } else {
3445                 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3446                     hp = &(*hp)->vfs_hash)
3447                         continue;
3448                 /*
3449                  * hp now contains the address of the pointer to update
3450                  * to effect the insertion.
3451                  */
3452                 vfsp->vfs_hash = NULL;
3453                 *hp = vfsp;
3454         }
3455
3456         rvfs_list[vhno].rvfs_len++;
3457         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3458 }
3459
3460
3461 static void
3462 vfs_hash_remove(struct vfs *vfsp)
3463 {
3464         int vhno;
3465         struct vfs *tvfsp;
3466         dev_t dev;
3467
3468         ASSERT(RW_WRITE_HELD(&vfslist));
3469
3470         dev = expldev(vfsp->vfs_fsid.val[0]);
3471         vhno = VFSHASH(getmajor(dev), getminor(dev));
3472
3473         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3474
3475         /*
3476          * Remove from hash.
3477          */
3478         if (rvfs_list[vhno].rvfs_head == vfsp) {
3479                 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3480                 rvfs_list[vhno].rvfs_len--;
3481                 goto foundit;
3482         }
3483         for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3484             tvfsp = tvfsp->vfs_hash) {
3485                 if (tvfsp->vfs_hash == vfsp) {
3486                         tvfsp->vfs_hash = vfsp->vfs_hash;
3487                         rvfs_list[vhno].rvfs_len--;
3488                         goto foundit;
3489                 }
3490         }
3491         cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3492
3493 foundit:
3494
3495         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3496 }
3497
3498
3499 void
3500 vfs_list_add(struct vfs *vfsp)
3501 {
3502         zone_t *zone;
3503
3504         /*
3505          * Typically, the vfs_t will have been created on behalf of the file
3506          * system in vfs_init, where it will have been provided with a
3507          * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3508          * by an unbundled file system. We therefore check for such an example
3509          * before stamping the vfs_t with its creation time for the benefit of
3510          * mntfs.
3511          */
3512         if (vfsp->vfs_implp == NULL)
3513                 vfsimpl_setup(vfsp);
3514         vfs_mono_time(&vfsp->vfs_hrctime);
3515
3516         /*
3517          * The zone that owns the mount is the one that performed the mount.
3518          * Note that this isn't necessarily the same as the zone mounted into.
3519          * The corresponding zone_rele_ref() will be done when the vfs_t
3520          * is being free'd.
3521          */
3522         vfsp->vfs_zone = curproc->p_zone;
3523         zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3524         zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3525             ZONE_REF_VFS);
3526
3527         /*
3528          * Find the zone mounted into, and put this mount on its vfs list.
3529          */
3530         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3531         ASSERT(zone != NULL);
3532         /*
3533          * Special casing for the root vfs.  This structure is allocated
3534          * statically and hooked onto rootvfs at link time.  During the
3535          * vfs_mountroot call at system startup time, the root file system's
3536          * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3537          * as argument.  The code below must detect and handle this special
3538          * case.  The only apparent justification for this special casing is
3539          * to ensure that the root file system appears at the head of the
3540          * list.
3541          *
3542          * XXX: I'm assuming that it's ok to do normal list locking when
3543          *      adding the entry for the root file system (this used to be
3544          *      done with no locks held).
3545          */
3546         vfs_list_lock();
3547         /*
3548          * Link into the vfs list proper.
3549          */
3550         if (vfsp == &root) {
3551                 /*
3552                  * Assert: This vfs is already on the list as its first entry.
3553                  * Thus, there's nothing to do.
3554                  */
3555                 ASSERT(rootvfs == vfsp);
3556                 /*
3557                  * Add it to the head of the global zone's vfslist.
3558                  */
3559                 ASSERT(zone == global_zone);
3560                 ASSERT(zone->zone_vfslist == NULL);
3561                 zone->zone_vfslist = vfsp;
3562         } else {
3563                 /*
3564                  * Link to end of list using vfs_prev (as rootvfs is now a
3565                  * doubly linked circular list) so list is in mount order for
3566                  * mnttab use.
3567                  */
3568                 rootvfs->vfs_prev->vfs_next = vfsp;
3569                 vfsp->vfs_prev = rootvfs->vfs_prev;
3570                 rootvfs->vfs_prev = vfsp;
3571                 vfsp->vfs_next = rootvfs;
3572
3573                 /*
3574                  * Do it again for the zone-private list (which may be NULL).
3575                  */
3576                 if (zone->zone_vfslist == NULL) {
3577                         ASSERT(zone != global_zone);
3578                         zone->zone_vfslist = vfsp;
3579                 } else {
3580                         zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3581                         vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3582                         zone->zone_vfslist->vfs_zone_prev = vfsp;
3583                         vfsp->vfs_zone_next = zone->zone_vfslist;
3584                 }
3585         }
3586
3587         /*
3588          * Link into the hash table, inserting it at the end, so that LOFS
3589          * with the same fsid as UFS (or other) file systems will not hide
3590          * the UFS.
3591          */
3592         vfs_hash_add(vfsp, 0);
3593
3594         /*
3595          * update the mnttab modification time
3596          */
3597         vfs_mnttab_modtimeupd();
3598         vfs_list_unlock();
3599         zone_rele(zone);
3600 }
3601
3602 void
3603 vfs_list_remove(struct vfs *vfsp)
3604 {
3605         zone_t *zone;
3606
3607         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3608         ASSERT(zone != NULL);
3609         /*
3610          * Callers are responsible for preventing attempts to unmount the
3611          * root.
3612          */
3613         ASSERT(vfsp != rootvfs);
3614
3615         vfs_list_lock();
3616
3617         /*
3618          * Remove from hash.
3619          */
3620         vfs_hash_remove(vfsp);
3621
3622         /*
3623          * Remove from vfs list.
3624          */
3625         vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3626         vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3627         vfsp->vfs_next = vfsp->vfs_prev = NULL;
3628
3629         /*
3630          * Remove from zone-specific vfs list.
3631          */
3632         if (zone->zone_vfslist == vfsp)
3633                 zone->zone_vfslist = vfsp->vfs_zone_next;
3634
3635         if (vfsp->vfs_zone_next == vfsp) {
3636                 ASSERT(vfsp->vfs_zone_prev == vfsp);
3637                 ASSERT(zone->zone_vfslist == vfsp);
3638                 zone->zone_vfslist = NULL;
3639         }
3640
3641         vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3642         vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3643         vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3644
3645         /*
3646          * update the mnttab modification time
3647          */
3648         vfs_mnttab_modtimeupd();
3649         vfs_list_unlock();
3650         zone_rele(zone);
3651 }
3652
3653 struct vfs *
3654 getvfs(fsid_t *fsid)
3655 {
3656         struct vfs *vfsp;
3657         int val0 = fsid->val[0];
3658         int val1 = fsid->val[1];
3659         dev_t dev = expldev(val0);
3660         int vhno = VFSHASH(getmajor(dev), getminor(dev));
3661         kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3662
3663         mutex_enter(hmp);
3664         for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3665                 if (vfsp->vfs_fsid.val[0] == val0 &&
3666                     vfsp->vfs_fsid.val[1] == val1) {
3667                         VFS_HOLD(vfsp);
3668                         mutex_exit(hmp);
3669                         return (vfsp);
3670                 }
3671         }
3672         mutex_exit(hmp);
3673         return (NULL);
3674 }
3675
3676 /*
3677  * Search the vfs mount in progress list for a specified device/vfs entry.
3678  * Returns 0 if the first entry in the list that the device matches has the
3679  * given vfs pointer as well.  If the device matches but a different vfs
3680  * pointer is encountered in the list before the given vfs pointer then
3681  * a 1 is returned.
3682  */
3683
3684 int
3685 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3686 {
3687         int retval = 0;
3688         struct ipmnt *mipp;
3689
3690         mutex_enter(&vfs_miplist_mutex);
3691         for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3692                 if (mipp->mip_dev == dev) {
3693                         if (mipp->mip_vfsp != vfsp)
3694                                 retval = 1;
3695                         break;
3696                 }
3697         }
3698         mutex_exit(&vfs_miplist_mutex);
3699         return (retval);
3700 }
3701
3702 /*
3703  * Search the vfs list for a specified device.  Returns 1, if entry is found
3704  * or 0 if no suitable entry is found.
3705  */
3706
3707 int
3708 vfs_devismounted(dev_t dev)
3709 {
3710         struct vfs *vfsp;
3711         int found;
3712
3713         vfs_list_read_lock();
3714         vfsp = rootvfs;
3715         found = 0;
3716         do {
3717                 if (vfsp->vfs_dev == dev) {
3718                         found = 1;
3719                         break;
3720                 }
3721                 vfsp = vfsp->vfs_next;
3722         } while (vfsp != rootvfs);
3723
3724         vfs_list_unlock();
3725         return (found);
3726 }
3727
3728 /*
3729  * Search the vfs list for a specified device.  Returns a pointer to it
3730  * or NULL if no suitable entry is found. The caller of this routine
3731  * is responsible for releasing the returned vfs pointer.
3732  */
3733 struct vfs *
3734 vfs_dev2vfsp(dev_t dev)
3735 {
3736         struct vfs *vfsp;
3737         int found;
3738
3739         vfs_list_read_lock();
3740         vfsp = rootvfs;
3741         found = 0;
3742         do {
3743                 /*
3744                  * The following could be made more efficient by making
3745                  * the entire loop use vfs_zone_next if the call is from
3746                  * a zone.  The only callers, however, ustat(2) and
3747                  * umount2(2), don't seem to justify the added
3748                  * complexity at present.
3749                  */
3750                 if (vfsp->vfs_dev == dev &&
3751                     ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3752                     curproc->p_zone)) {
3753                         VFS_HOLD(vfsp);
3754                         found = 1;
3755                         break;
3756                 }
3757                 vfsp = vfsp->vfs_next;
3758         } while (vfsp != rootvfs);
3759         vfs_list_unlock();
3760         return (found ? vfsp: NULL);
3761 }
3762
3763 /*
3764  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3765  * or NULL if no suitable entry is found. The caller of this routine
3766  * is responsible for releasing the returned vfs pointer.
3767  *
3768  * Note that if multiple mntpoints match, the last one matching is
3769  * returned in an attempt to return the "top" mount when overlay
3770  * mounts are covering the same mount point.  This is accomplished by starting
3771  * at the end of the list and working our way backwards, stopping at the first
3772  * matching mount.
3773  */
3774 struct vfs *
3775 vfs_mntpoint2vfsp(const char *mp)
3776 {
3777         struct vfs *vfsp;
3778         struct vfs *retvfsp = NULL;
3779         zone_t *zone = curproc->p_zone;
3780         struct vfs *list;
3781
3782         vfs_list_read_lock();
3783         if (getzoneid() == GLOBAL_ZONEID) {
3784                 /*
3785                  * The global zone may see filesystems in any zone.
3786                  */
3787                 vfsp = rootvfs->vfs_prev;
3788                 do {
3789                         if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3790                                 retvfsp = vfsp;
3791                                 break;
3792                         }
3793                         vfsp = vfsp->vfs_prev;
3794                 } while (vfsp != rootvfs->vfs_prev);
3795         } else if ((list = zone->zone_vfslist) != NULL) {
3796                 const char *mntpt;
3797
3798                 vfsp = list->vfs_zone_prev;
3799                 do {
3800                         mntpt = refstr_value(vfsp->vfs_mntpt);
3801                         mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3802                         if (strcmp(mntpt, mp) == 0) {
3803                                 retvfsp = vfsp;
3804                                 break;
3805                         }
3806                         vfsp = vfsp->vfs_zone_prev;
3807                 } while (vfsp != list->vfs_zone_prev);
3808         }
3809         if (retvfsp)
3810                 VFS_HOLD(retvfsp);
3811         vfs_list_unlock();
3812         return (retvfsp);
3813 }
3814
3815 /*
3816  * Search the vfs list for a specified vfsops.
3817  * if vfs entry is found then return 1, else 0.
3818  */
3819 int
3820 vfs_opsinuse(vfsops_t *ops)
3821 {
3822         struct vfs *vfsp;
3823         int found;
3824
3825         vfs_list_read_lock();
3826         vfsp = rootvfs;
3827         found = 0;
3828         do {
3829                 if (vfs_getops(vfsp) == ops) {
3830                         found = 1;
3831                         break;
3832                 }
3833                 vfsp = vfsp->vfs_next;
3834         } while (vfsp != rootvfs);
3835         vfs_list_unlock();
3836         return (found);
3837 }
3838
3839 /*
3840  * Allocate an entry in vfssw for a file system type
3841  */
3842 struct vfssw *
3843 allocate_vfssw(const char *type)
3844 {
3845         struct vfssw *vswp;
3846
3847         if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3848                 /*
3849                  * The vfssw table uses the empty string to identify an
3850                  * available entry; we cannot add any type which has
3851                  * a leading NUL. The string length is limited to
3852                  * the size of the st_fstype array in struct stat.
3853                  */
3854                 return (NULL);
3855         }
3856
3857         ASSERT(VFSSW_WRITE_LOCKED());
3858         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3859                 if (!ALLOCATED_VFSSW(vswp)) {
3860                         vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3861                         (void) strcpy(vswp->vsw_name, type);
3862                         ASSERT(vswp->vsw_count == 0);
3863                         vswp->vsw_count = 1;
3864                         mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3865                         return (vswp);
3866                 }
3867         return (NULL);
3868 }
3869
/*
 * Impose an additional layer of translation between vfstype names and
 * module names in the filesystem: "proc" maps to "procfs", "fd" maps to
 * "fdfs", and any name beginning with "nfs" maps to "nfs".  Every other
 * name is returned unchanged (the very pointer that was passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	/* Prefix match only: e.g. "nfs3" also selects the "nfs" module. */
	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3887
3888 /*
3889  * Find a vfssw entry given a file system type name.
3890  * Try to autoload the filesystem if it's not found.
3891  * If it's installed, return the vfssw locked to prevent unloading.
3892  */
3893 struct vfssw *
3894 vfs_getvfssw(const char *type)
3895 {
3896         struct vfssw *vswp;
3897         const char *modname;
3898
3899         RLOCK_VFSSW();
3900         vswp = vfs_getvfsswbyname(type);
3901         modname = vfs_to_modname(type);
3902
3903         if (rootdir == NULL) {
3904                 /*
3905                  * If we haven't yet loaded the root file system, then our
3906                  * _init won't be called until later. Allocate vfssw entry,
3907                  * because mod_installfs won't be called.
3908                  */
3909                 if (vswp == NULL) {
3910                         RUNLOCK_VFSSW();
3911                         WLOCK_VFSSW();
3912                         if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3913                                 if ((vswp = allocate_vfssw(type)) == NULL) {
3914                                         WUNLOCK_VFSSW();
3915                                         return (NULL);
3916                                 }
3917                         }
3918                         WUNLOCK_VFSSW();
3919                         RLOCK_VFSSW();
3920                 }
3921                 if (!VFS_INSTALLED(vswp)) {
3922                         RUNLOCK_VFSSW();
3923                         (void) modloadonly("fs", modname);
3924                 } else
3925                         RUNLOCK_VFSSW();
3926                 return (vswp);
3927         }
3928
3929         /*
3930          * Try to load the filesystem.  Before calling modload(), we drop
3931          * our lock on the VFS switch table, and pick it up after the
3932          * module is loaded.  However, there is a potential race:  the
3933          * module could be unloaded after the call to modload() completes
3934          * but before we pick up the lock and drive on.  Therefore,
3935          * we keep reloading the module until we've loaded the module
3936          * _and_ we have the lock on the VFS switch table.
3937          */
3938         while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3939                 RUNLOCK_VFSSW();
3940                 if (modload("fs", modname) == -1)
3941                         return (NULL);
3942                 RLOCK_VFSSW();
3943                 if (vswp == NULL)
3944                         if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3945                                 break;
3946         }
3947         RUNLOCK_VFSSW();
3948
3949         return (vswp);
3950 }
3951
3952 /*
3953  * Find a vfssw entry given a file system type name.
3954  */
3955 struct vfssw *
3956 vfs_getvfsswbyname(const char *type)
3957 {
3958         struct vfssw *vswp;
3959
3960         ASSERT(VFSSW_LOCKED());
3961         if (type == NULL || *type == '\0')
3962                 return (NULL);
3963
3964         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3965                 if (strcmp(type, vswp->vsw_name) == 0) {
3966                         vfs_refvfssw(vswp);
3967                         return (vswp);
3968                 }
3969         }
3970
3971         return (NULL);
3972 }
3973
3974 /*
3975  * Find a vfssw entry given a set of vfsops.
3976  */
3977 struct vfssw *
3978 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3979 {
3980         struct vfssw *vswp;
3981
3982         RLOCK_VFSSW();
3983         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3984                 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3985                         vfs_refvfssw(vswp);
3986                         RUNLOCK_VFSSW();
3987                         return (vswp);
3988                 }
3989         }
3990         RUNLOCK_VFSSW();
3991
3992         return (NULL);
3993 }
3994
3995 /*
3996  * Reference a vfssw entry.
3997  */
3998 void
3999 vfs_refvfssw(struct vfssw *vswp)
4000 {
4001
4002         mutex_enter(&vswp->vsw_lock);
4003         vswp->vsw_count++;
4004         mutex_exit(&vswp->vsw_lock);
4005 }
4006
4007 /*
4008  * Unreference a vfssw entry.
4009  */
4010 void
4011 vfs_unrefvfssw(struct vfssw *vswp)
4012 {
4013
4014         mutex_enter(&vswp->vsw_lock);
4015         vswp->vsw_count--;
4016         mutex_exit(&vswp->vsw_lock);
4017 }
4018
4019 static int sync_retries = 20;   /* number of retries when not making progress */
4020 static int sync_triesleft;      /* portion of sync_retries remaining */
4021
4022 static pgcnt_t old_pgcnt, new_pgcnt;
4023 static int new_bufcnt, old_bufcnt;
4024
4025 /*
4026  * Sync all of the mounted filesystems, and then wait for the actual i/o to
4027  * complete.  We wait by counting the number of dirty pages and buffers,
4028  * pushing them out using bio_busy() and page_busy(), and then counting again.
4029  * This routine is used during the uadmin A_SHUTDOWN code.  It should only
4030  * be used after some higher-level mechanism has quiesced the system so that
4031  * new writes are not being initiated while we are waiting for completion.
4032  *
4033  * To ensure finite running time, our algorithm uses sync_triesleft (a progress
4034  * counter used by the vfs_syncall() loop below). It is declared above so
4035  * it can be found easily in the debugger.
4036  *
4037  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4038  * sync_retries consecutive calls to bio_busy() and page_busy() without
4039  * decreasing either the number of dirty buffers or dirty pages below the
4040  * lowest count we have seen so far, we give up and return from vfs_syncall().
4041  *
4042  * Each loop iteration ends with a call to delay() one second to allow time for
4043  * i/o completion and to permit the user time to read our progress messages.
4044  */
4045 void
4046 vfs_syncall(void)
4047 {
4048         if (rootdir == NULL && !modrootloaded)
4049                 return; /* no filesystems have been loaded yet */
4050
4051         printf("syncing file systems...");
4052         sync();
4053
4054         sync_triesleft = sync_retries;
4055
4056         old_bufcnt = new_bufcnt = INT_MAX;
4057         old_pgcnt = new_pgcnt = ULONG_MAX;
4058
4059         while (sync_triesleft > 0) {
4060                 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4061                 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4062
4063                 new_bufcnt = bio_busy(B_TRUE);
4064                 new_pgcnt = page_busy(B_TRUE);
4065
4066                 if (new_bufcnt == 0 && new_pgcnt == 0)
4067                         break;
4068
4069                 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4070                         sync_triesleft = sync_retries;
4071                 else
4072                         sync_triesleft--;
4073
4074                 if (new_bufcnt)
4075                         printf(" [%d]", new_bufcnt);
4076                 if (new_pgcnt)
4077                         printf(" %lu", new_pgcnt);
4078
4079                 delay(hz);
4080         }
4081
4082         if (new_bufcnt != 0 || new_pgcnt != 0)
4083                 printf(" done (not all i/o completed)\n");
4084         else
4085                 printf(" done\n");
4086
4087         delay(hz);
4088 }
4089
4090 /*
4091  * Map VFS flags to statvfs flags.  These shouldn't really be separate
4092  * flags at all.
4093  */
4094 uint_t
4095 vf_to_stf(uint_t vf)
4096 {
4097         uint_t stf = 0;
4098
4099         if (vf & VFS_RDONLY)
4100                 stf |= ST_RDONLY;
4101         if (vf & VFS_NOSETUID)
4102                 stf |= ST_NOSUID;
4103         if (vf & VFS_NOTRUNC)
4104                 stf |= ST_NOTRUNC;
4105
4106         return (stf);
4107 }
4108
4109 /*
4110  * Entries for (illegal) fstype 0.
4111  */
4112 /* ARGSUSED */
4113 int
4114 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4115 {
4116         cmn_err(CE_PANIC, "stray vfs operation");
4117         return (0);
4118 }
4119
4120 /*
4121  * Entries for (illegal) fstype 0.
4122  */
4123 int
4124 vfsstray(void)
4125 {
4126         cmn_err(CE_PANIC, "stray vfs operation");
4127         return (0);
4128 }
4129
/*
 * Operation that unconditionally fails with EIO.  It supports forced UFS
 * unmount and its interaction with LOFS, but could be used by any
 * filesystem.  See bug 1203132.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
4140
/*
 * Sync flavour of vfs_EIO(): ignores its arguments and fails with EIO.
 * The op for sync has to be defined separately because the compiler gets
 * confused, and spits out a warning, if ANSI and normal style prototypes
 * are mixed when a "short" argument is present.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
4152
4153 vfs_t EIO_vfs;
4154 vfsops_t *EIO_vfsops;
4155
4156 /*
4157  * Called from startup() to initialize all loaded vfs's
4158  */
4159 void
4160 vfsinit(void)
4161 {
4162         struct vfssw *vswp;
4163         int error;
4164         extern int vopstats_enabled;
4165         extern void vopstats_startup();
4166
4167         static const fs_operation_def_t EIO_vfsops_template[] = {
4168                 VFSNAME_MOUNT,          { .error = vfs_EIO },
4169                 VFSNAME_UNMOUNT,        { .error = vfs_EIO },
4170                 VFSNAME_ROOT,           { .error = vfs_EIO },
4171                 VFSNAME_STATVFS,        { .error = vfs_EIO },
4172                 VFSNAME_SYNC,           { .vfs_sync = vfs_EIO_sync },
4173                 VFSNAME_VGET,           { .error = vfs_EIO },
4174                 VFSNAME_MOUNTROOT,      { .error = vfs_EIO },
4175                 VFSNAME_FREEVFS,        { .error = vfs_EIO },
4176                 VFSNAME_VNSTATE,        { .error = vfs_EIO },
4177                 NULL, NULL
4178         };
4179
4180         static const fs_operation_def_t stray_vfsops_template[] = {
4181                 VFSNAME_MOUNT,          { .error = vfsstray },
4182                 VFSNAME_UNMOUNT,        { .error = vfsstray },
4183                 VFSNAME_ROOT,           { .error = vfsstray },
4184                 VFSNAME_STATVFS,        { .error = vfsstray },
4185                 VFSNAME_SYNC,           { .vfs_sync = vfsstray_sync },
4186                 VFSNAME_VGET,           { .error = vfsstray },
4187                 VFSNAME_MOUNTROOT,      { .error = vfsstray },
4188                 VFSNAME_FREEVFS,        { .error = vfsstray },
4189                 VFSNAME_VNSTATE,        { .error = vfsstray },
4190                 NULL, NULL
4191         };
4192
4193         /* Create vfs cache */
4194         vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4195             sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4196
4197         /* Initialize the vnode cache (file systems may use it during init). */
4198         vn_create_cache();
4199
4200         /* Setup event monitor framework */
4201         fem_init();
4202
4203         /* Initialize the dummy stray file system type. */
4204         error = vfs_setfsops(0, stray_vfsops_template, NULL);
4205
4206         /* Initialize the dummy EIO file system. */
4207         error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4208         if (error != 0) {
4209                 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4210                 /* Shouldn't happen, but not bad enough to panic */
4211         }
4212
4213         VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4214
4215         /*
4216          * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4217          * on this vfs can immediately notice it's invalid.
4218          */
4219         EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4220
4221         /*
4222          * Call the init routines of non-loadable filesystems only.
4223          * Filesystems which are loaded as separate modules will be
4224          * initialized by the module loading code instead.
4225          */
4226
4227         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4228                 RLOCK_VFSSW();
4229                 if (vswp->vsw_init != NULL)
4230                         (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4231                 RUNLOCK_VFSSW();
4232         }
4233
4234         vopstats_startup();
4235
4236         if (vopstats_enabled) {
4237                 /* EIO_vfs can collect stats, but we don't retrieve them */
4238                 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4239                 EIO_vfs.vfs_fstypevsp = NULL;
4240                 EIO_vfs.vfs_vskap = NULL;
4241                 EIO_vfs.vfs_flag |= VFS_STATS;
4242         }
4243
4244         xattr_init();
4245
4246         reparse_point_init();
4247 }
4248
4249 vfs_t *
4250 vfs_alloc(int kmflag)
4251 {
4252         vfs_t *vfsp;
4253
4254         vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4255
4256         /*
4257          * Do the simplest initialization here.
4258          * Everything else gets done in vfs_init()
4259          */
4260         bzero(vfsp, sizeof (vfs_t));
4261         return (vfsp);
4262 }
4263
4264 void
4265 vfs_free(vfs_t *vfsp)
4266 {
4267         /*
4268          * One would be tempted to assert that "vfsp->vfs_count == 0".
4269          * The problem is that this gets called out of domount() with
4270          * a partially initialized vfs and a vfs_count of 1.  This is
4271          * also called from vfs_rele() with a vfs_count of 0.  We can't
4272          * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4273          * returned.  This is because VFS_MOUNT() fully initializes the
4274          * vfs structure and its associated data.  VFS_RELE() will call
4275          * VFS_FREEVFS() which may panic the system if the data structures
4276          * aren't fully initialized from a successful VFS_MOUNT()).
4277          */
4278
4279         /* If FEM was in use, make sure everything gets cleaned up */
4280         if (vfsp->vfs_femhead) {
4281                 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4282                 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4283                 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4284                 vfsp->vfs_femhead = NULL;
4285         }
4286
4287         if (vfsp->vfs_implp)
4288                 vfsimpl_teardown(vfsp);
4289         sema_destroy(&vfsp->vfs_reflock);
4290         kmem_cache_free(vfs_cache, vfsp);
4291 }
4292
4293 /*
4294  * Increments the vfs reference count by one atomically.
4295  */
4296 void
4297 vfs_hold(vfs_t *vfsp)
4298 {
4299         atomic_inc_32(&vfsp->vfs_count);
4300         ASSERT(vfsp->vfs_count != 0);
4301 }
4302
4303 /*
4304  * Decrements the vfs reference count by one atomically. When
4305  * vfs reference count becomes zero, it calls the file system
4306  * specific vfs_freevfs() to free up the resources.
4307  */
4308 void
4309 vfs_rele(vfs_t *vfsp)
4310 {
4311         ASSERT(vfsp->vfs_count != 0);
4312         if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4313                 VFS_FREEVFS(vfsp);
4314                 lofi_remove(vfsp);
4315                 if (vfsp->vfs_zone)
4316                         zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4317                             ZONE_REF_VFS);
4318                 vfs_freemnttab(vfsp);
4319                 vfs_free(vfsp);
4320         }
4321 }
4322
4323 /*
4324  * Generic operations vector support.
4325  *
4326  * This is used to build operations vectors for both the vfs and vnode.
4327  * It's normally called only when a file system is loaded.
4328  *
4329  * There are many possible algorithms for this, including the following:
4330  *
4331  *   (1) scan the list of known operations; for each, see if the file system
4332  *       includes an entry for it, and fill it in as appropriate.
4333  *
4334  *   (2) set up defaults for all known operations.  scan the list of ops
4335  *       supplied by the file system; for each which is both supplied and
4336  *       known, fill it in.
4337  *
4338  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4339  *       in entries as we go.
4340  *
4341  * we choose (1) for simplicity, and because performance isn't critical here.
4342  * note that (2) could be sped up using a precomputed hash table on known ops.
4343  * (3) could be faster than either, but only if the lists were very large or
4344  * supplied in sorted order.
4345  *
4346  */
4347
4348 int
4349 fs_build_vector(void *vector, int *unused_ops,
4350     const fs_operation_trans_def_t *translation,
4351     const fs_operation_def_t *operations)
4352 {
4353         int i, num_trans, num_ops, used;
4354
4355         /*
4356          * Count the number of translations and the number of supplied
4357          * operations.
4358          */
4359
4360         {
4361                 const fs_operation_trans_def_t *p;
4362
4363                 for (num_trans = 0, p = translation;
4364                     p->name != NULL;
4365                     num_trans++, p++)
4366                         ;
4367         }
4368
4369         {
4370                 const fs_operation_def_t *p;
4371
4372                 for (num_ops = 0, p = operations;
4373                     p->name != NULL;
4374                     num_ops++, p++)
4375                         ;
4376         }
4377
4378         /* Walk through each operation known to our caller.  There will be */
4379         /* one entry in the supplied "translation table" for each. */
4380
4381         used = 0;
4382
4383         for (i = 0; i < num_trans; i++) {
4384                 int j, found;
4385                 char *curname;
4386                 fs_generic_func_p result;
4387                 fs_generic_func_p *location;
4388
4389                 curname = translation[i].name;
4390
4391                 /* Look for a matching operation in the list supplied by the */
4392                 /* file system. */
4393
4394                 found = 0;
4395
4396                 for (j = 0; j < num_ops; j++) {
4397                         if (strcmp(operations[j].name, curname) == 0) {
4398                                 used++;
4399                                 found = 1;
4400                                 break;
4401                         }
4402                 }
4403
4404                 /*
4405                  * If the file system is using a "placeholder" for default
4406                  * or error functions, grab the appropriate function out of
4407                  * the translation table.  If the file system didn't supply
4408                  * this operation at all, use the default function.
4409                  */
4410
4411                 if (found) {
4412                         result = operations[j].func.fs_generic;
4413                         if (result == fs_default) {
4414                                 result = translation[i].defaultFunc;
4415                         } else if (result == fs_error) {
4416                                 result = translation[i].errorFunc;
4417                         } else if (result == NULL) {
4418                                 /* Null values are PROHIBITED */
4419                                 return (EINVAL);
4420                         }
4421                 } else {
4422                         result = translation[i].defaultFunc;
4423                 }
4424
4425                 /* Now store the function into the operations vector. */
4426
4427                 location = (fs_generic_func_p *)
4428                     (((char *)vector) + translation[i].offset);
4429
4430                 *location = result;
4431         }
4432
4433         *unused_ops = num_ops - used;
4434
4435         return (0);
4436 }
4437
4438 /* Placeholder functions, should never be called. */
4439
4440 int
4441 fs_error(void)
4442 {
4443         cmn_err(CE_PANIC, "fs_error called");
4444         return (0);
4445 }
4446
4447 int
4448 fs_default(void)
4449 {
4450         cmn_err(CE_PANIC, "fs_default called");
4451         return (0);
4452 }
4453
4454 #ifdef __sparc
4455
4456 /*
4457  * Part of the implementation of booting off a mirrored root
4458  * involves a change of dev_t for the root device.  To
4459  * accomplish this, first remove the existing hash table
4460  * entry for the root device, convert to the new dev_t,
4461  * then re-insert in the hash table at the head of the list.
4462  */
4463 void
4464 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4465 {
4466         vfs_list_lock();
4467
4468         vfs_hash_remove(vfsp);
4469
4470         vfsp->vfs_dev = ndev;
4471         vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4472
4473         vfs_hash_add(vfsp, 1);
4474
4475         vfs_list_unlock();
4476 }
4477
4478 #else /* x86 NEWBOOT */
4479
4480 #if defined(__x86)
4481 extern int hvmboot_rootconf();
4482 #endif /* __x86 */
4483
4484 extern ib_boot_prop_t *iscsiboot_prop;
4485
4486 int
4487 rootconf()
4488 {
4489         int error;
4490         struct vfssw *vsw;
4491         extern void pm_init();
4492         char *fstyp, *fsmod;
4493         int ret = -1;
4494
4495         getrootfs(&fstyp, &fsmod);
4496
4497 #if defined(__x86)
4498         /*
4499          * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4500          * which lives in /platform/i86hvm, and hence is only available when
4501          * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4502          * is not available then the modstub for this function will return 0.
4503          * If the hvm_bootstrap misc module is available it will be loaded
4504          * and hvmboot_rootconf() will be invoked.
4505          */
4506         if (error = hvmboot_rootconf())
4507                 return (error);
4508 #endif /* __x86 */
4509
4510         if (error = clboot_rootconf())
4511                 return (error);
4512
4513         if (modload("fs", fsmod) == -1)
4514                 panic("Cannot _init %s module", fsmod);
4515
4516         RLOCK_VFSSW();
4517         vsw = vfs_getvfsswbyname(fstyp);
4518         RUNLOCK_VFSSW();
4519         if (vsw == NULL) {
4520                 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4521                 return (ENXIO);
4522         }
4523         VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4524         VFS_HOLD(rootvfs);
4525
4526         /* always mount readonly first */
4527         rootvfs->vfs_flag |= VFS_RDONLY;
4528
4529         pm_init();
4530
4531         if (netboot && iscsiboot_prop) {
4532                 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4533                     " shouldn't happen in the same time");
4534                 return (EINVAL);
4535         }
4536
4537         if (netboot || iscsiboot_prop) {
4538                 ret = strplumb();
4539                 if (ret != 0) {
4540                         cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4541                         return (EFAULT);
4542                 }
4543         }
4544
4545         if ((ret == 0) && iscsiboot_prop) {
4546                 ret = modload("drv", "iscsi");
4547                 /* -1 indicates fail */
4548                 if (ret == -1) {
4549                         cmn_err(CE_WARN, "Failed to load iscsi module");
4550                         iscsi_boot_prop_free();
4551                         return (EINVAL);
4552                 } else {
4553                         if (!i_ddi_attach_pseudo_node("iscsi")) {
4554                                 cmn_err(CE_WARN,
4555                                     "Failed to attach iscsi driver");
4556                                 iscsi_boot_prop_free();
4557                                 return (ENODEV);
4558                         }
4559                 }
4560         }
4561
4562         error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4563         vfs_unrefvfssw(vsw);
4564         rootdev = rootvfs->vfs_dev;
4565
4566         if (error)
4567                 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4568                     rootfs.bo_name, fstyp);
4569         else
4570                 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4571                     rootfs.bo_name, fstyp);
4572         return (error);
4573 }
4574
4575 /*
4576  * XXX this is called by nfs only and should probably be removed
4577  * If booted with ASKNAME, prompt on the console for a filesystem
4578  * name and return it.
4579  */
4580 void
4581 getfsname(char *askfor, char *name, size_t namelen)
4582 {
4583         if (boothowto & RB_ASKNAME) {
4584                 printf("%s name: ", askfor);
4585                 console_gets(name, namelen);
4586         }
4587 }
4588
4589 /*
4590  * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4591  * property.
4592  *
4593  * Filesystem types starting with the prefix "nfs" are diskless clients;
4594  * init the root filename name (rootfs.bo_name), too.
4595  *
4596  * If we are booting via NFS we currently have these options:
4597  *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4598  *      nfs2 -  force NFS V2
4599  *      nfs3 -  force NFS V3
4600  *      nfs4 -  force NFS V4
4601  * Because we need to maintain backward compatibility with the naming
4602  * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4603  * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4604  * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4605  * This is only for root filesystems, all other uses will expect
4606  * that "nfs" == NFS V2.
4607  */
4608 static void
4609 getrootfs(char **fstypp, char **fsmodp)
4610 {
4611         char *propstr = NULL;
4612
4613         /*
4614          * Check fstype property; for diskless it should be one of "nfs",
4615          * "nfs2", "nfs3" or "nfs4".
4616          */
4617         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4618             DDI_PROP_DONTPASS, "fstype", &propstr)
4619             == DDI_SUCCESS) {
4620                 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4621                 ddi_prop_free(propstr);
4622
4623         /*
4624          * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4625          * assume the type of this root filesystem is 'zfs'.
4626          */
4627         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4628             DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4629             == DDI_SUCCESS) {
4630                 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4631                 ddi_prop_free(propstr);
4632         }
4633
4634         if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4635                 *fstypp = *fsmodp = rootfs.bo_fstype;
4636                 return;
4637         }
4638
4639         ++netboot;
4640
4641         if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4642                 (void) strcpy(rootfs.bo_fstype, "nfs");
4643         else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4644                 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4645
4646         /*
4647          * check if path to network interface is specified in bootpath
4648          * or by a hypervisor domain configuration file.
4649          * XXPV - enable strlumb_get_netdev_path()
4650          */
4651         if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4652             "xpv-nfsroot")) {
4653                 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4654         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4655             DDI_PROP_DONTPASS, "bootpath", &propstr)
4656             == DDI_SUCCESS) {
4657                 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4658                 ddi_prop_free(propstr);
4659         } else {
4660                 rootfs.bo_name[0] = '\0';
4661         }
4662         *fstypp = rootfs.bo_fstype;
4663         *fsmodp = "nfs";
4664 }
4665 #endif
4666
4667 /*
4668  * VFS feature routines
4669  */
4670
/*
 * A feature value packs two things into 64 bits: the upper 32 bits select
 * a word of the feature set array, the lower 32 bits are the bit(s) within
 * that word.
 */
#define VFTINDEX(f)     (((f) >> 32) & 0xFFFFFFFF)      /* array index */
#define VFTBITS(f)      ((f) & 0xFFFFFFFFLL)            /* bits in word */
4673
4674 /* Register a feature in the vfs */
4675 void
4676 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4677 {
4678         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4679         if (vfsp->vfs_implp == NULL)
4680                 return;
4681
4682         vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4683 }
4684
4685 void
4686 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4687 {
4688         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4689         if (vfsp->vfs_implp == NULL)
4690                 return;
4691         vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4692 }
4693
4694 /*
4695  * Query a vfs for a feature.
4696  * Returns 1 if feature is present, 0 if not
4697  */
4698 int
4699 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4700 {
4701         int     ret = 0;
4702
4703         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4704         if (vfsp->vfs_implp == NULL)
4705                 return (ret);
4706
4707         if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4708                 ret = 1;
4709
4710         return (ret);
4711 }
4712
4713 /*
4714  * Propagate feature set from one vfs to another
4715  */
4716 void
4717 vfs_propagate_features(vfs_t *from, vfs_t *to)
4718 {
4719         int i;
4720
4721         if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4722                 return;
4723
4724         for (i = 1; i <= to->vfs_featureset[0]; i++) {
4725                 to->vfs_featureset[i] = from->vfs_featureset[i];
4726         }
4727 }
4728
4729 #define LOFINODE_PATH "/dev/lofi/%d"
4730
4731 /*
4732  * Return the vnode for the lofi node if there's a lofi mount in place.
4733  * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4734  * failure.
4735  */
4736 int
4737 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4738 {
4739         char *path = NULL;
4740         int strsize;
4741         int err;
4742
4743         if (vfsp->vfs_lofi_id == 0) {
4744                 *vpp = NULL;
4745                 return (-1);
4746         }
4747
4748         strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4749         path = kmem_alloc(strsize + 1, KM_SLEEP);
4750         (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4751
4752         /*
4753          * We may be inside a zone, so we need to use the /dev path, but
4754          * it's created asynchronously, so we wait here.
4755          */
4756         for (;;) {
4757                 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4758
4759                 if (err != ENOENT)
4760                         break;
4761
4762                 if ((err = delay_sig(hz / 8)) == EINTR)
4763                         break;
4764         }
4765
4766         if (err)
4767                 *vpp = NULL;
4768
4769         kmem_free(path, strsize + 1);
4770         return (err);
4771 }