kernel/fs/nfs/nfs4_stub_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*
  28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
  29  * triggered from a "stub" rnode via a special set of vnodeops.
  30  */
  31
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/time.h>
  37 #include <sys/vnode.h>
  38 #include <sys/vfs.h>
  39 #include <sys/file.h>
  40 #include <sys/filio.h>
  41 #include <sys/uio.h>
  42 #include <sys/buf.h>
  43 #include <sys/mman.h>
  44 #include <sys/pathname.h>
  45 #include <sys/dirent.h>
  46 #include <sys/debug.h>
  47 #include <sys/vmsystm.h>
  48 #include <sys/fcntl.h>
  49 #include <sys/flock.h>
  50 #include <sys/swap.h>
  51 #include <sys/errno.h>
  52 #include <sys/strsubr.h>
  53 #include <sys/sysmacros.h>
  54 #include <sys/kmem.h>
  55 #include <sys/mount.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/pathconf.h>
  58 #include <sys/utsname.h>
  59 #include <sys/dnlc.h>
  60 #include <sys/acl.h>
  61 #include <sys/systeminfo.h>
  62 #include <sys/policy.h>
  63 #include <sys/sdt.h>
  64 #include <sys/list.h>
  65 #include <sys/stat.h>
  66 #include <sys/mntent.h>
  67 #include <sys/priv.h>
  68
  69 #include <rpc/types.h>
  70 #include <rpc/auth.h>
  71 #include <rpc/clnt.h>
  72
  73 #include <nfs/nfs.h>
  74 #include <nfs/nfs_clnt.h>
  75 #include <nfs/nfs_acl.h>
  76 #include <nfs/lm.h>
  77 #include <nfs/nfs4.h>
  78 #include <nfs/nfs4_kprot.h>
  79 #include <nfs/rnode4.h>
  80 #include <nfs/nfs4_clnt.h>
  81 #include <nfs/nfsid_map.h>
  82 #include <nfs/nfs4_idmap_impl.h>
  83
  84 #include <vm/hat.h>
  85 #include <vm/as.h>
  86 #include <vm/page.h>
  87 #include <vm/pvn.h>
  88 #include <vm/seg.h>
  89 #include <vm/seg_map.h>
  90 #include <vm/seg_kpm.h>
  91 #include <vm/seg_vn.h>
  92
  93 #include <sys/fs_subr.h>
  94
  95 #include <sys/ddi.h>
  96 #include <sys/int_fmtio.h>
  97
  98 #include <sys/sunddi.h>
  99
 100 #include <sys/priv_names.h>
 101
/* Zone keys that are defined outside this file. */
extern zone_key_t       nfs4clnt_zone_key;
extern zone_key_t       nfsidmap_zone_key;
 104
/*
 * Tunable for the automatic unmounter thread.
 *
 * NOTE(review): the consumer of this value is not visible in this part
 * of the file; presumably it is how often that thread runs -- confirm.
 */
static int nfs4_trigger_thread_timer = 20;      /* in seconds */
 109
/*
 * Default timeout value for ephemeral mounts.
 *
 * NOTE(review): presumably this seeds ntg_mount_to; neither that nor
 * the unit is visible in this part of the file -- confirm.
 */
static uint_t nfs4_trigger_mount_to = 240;
 114
/*
 * Ephemeral mount state that nfs4_trigger_mount() fetches for the
 * current zone with zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
        /* Held while a new tree is linked onto ntg_forest. */
        kmutex_t                ntg_forest_lock;
        /* NOTE(review): presumably the mount timeout in effect -- confirm. */
        uint_t                  ntg_mount_to;
        /* NOTE(review): presumably set once the harvester runs -- confirm. */
        int                     ntg_thread_started;
        /* Head of the list of ephemeral trees, chained through net_next. */
        nfs4_ephemeral_tree_t   *ntg_forest;
} nfs4_trigger_globals_t;
 121
/* NOTE(review): users of this lock are not visible in this part of the file. */
kmutex_t        nfs4_ephemeral_thread_lock;

/* Zone key under which each zone's nfs4_trigger_globals_t is looked up. */
zone_key_t      nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

/* Forward declaration; the definition is not in this part of the file. */
static void     nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
 127
/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 *
 * esi_hostname is the string that nfs4_trigger_domount_args_create()
 * copies into its host list.
 *
 * NOTE(review): the remaining per-field notes below are inferred from
 * the field names and types only -- confirm against the code that
 * fills the structure in.
 */
typedef struct ephemeral_servinfo {
        char                    *esi_hostname;  /* server host name */
        char                    *esi_netname;   /* presumably RPC netname */
        char                    *esi_path;      /* presumably server path */
        int                     esi_path_len;   /* presumably size of esi_path */
        int                     esi_mount_flags; /* presumably mount flags */
        struct netbuf           *esi_addr;      /* presumably server address */
        struct netbuf           *esi_syncaddr;  /* presumably time-sync address */
        struct knetconfig       *esi_knconf;    /* presumably transport config */
} ephemeral_servinfo_t;
 146
/*
 * Collect together the mount-type specific and generic data args.
 * One of these is built by nfs4_trigger_domount_args_create(), handed to
 * nfs4_trigger_domount(), and released with
 * nfs4_trigger_domount_args_destroy().
 */
typedef struct domount_args {
        ephemeral_servinfo_t    *dma_esi;       /* mount-type specific data */
        char                    *dma_hostlist; /* comma-sep. for RO failover */
        struct nfs_args         *dma_nargs;     /* generic NFS mount arguments */
} domount_args_t;
 155
 156
 157 /*
 158  * The vnode ops functions for a trigger stub vnode
 159  */
 160 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
 161 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
 162     caller_context_t *);
 163 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
 164     caller_context_t *);
 165 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
 166     caller_context_t *);
 167 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
 168     caller_context_t *);
 169 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
 170     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
 171     int *, pathname_t *);
 172 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
 173     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
 174     vsecattr_t *);
 175 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 176     int);
 177 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
 178     caller_context_t *, int);
 179 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
 180     cred_t *, caller_context_t *, int);
 181 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
 182     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
 183 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 184     caller_context_t *, int);
 185 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
 186     cred_t *, caller_context_t *, int);
 187 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
 188
 189 /*
 190  * Regular NFSv4 vnodeops that we need to reference directly
 191  */
 192 extern int      nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
 193                     caller_context_t *);
 194 extern void     nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
 195 extern int      nfs4_rwlock(vnode_t *, int, caller_context_t *);
 196 extern void     nfs4_rwunlock(vnode_t *, int, caller_context_t *);
 197 extern int      nfs4_lookup(vnode_t *, char *, vnode_t **,
 198                     struct pathname *, int, vnode_t *, cred_t *,
 199                     caller_context_t *, int *, pathname_t *);
 200 extern int      nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 201                     caller_context_t *);
 202 extern int      nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 203                     caller_context_t *);
 204 extern int      nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
 205 extern int      nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
 206
 207 static int      nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
 208 static int      nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
 209     cred_t *, vnode_t **);
 210 static int      nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
 211     domount_args_t **dmap);
 212 static void     nfs4_trigger_domount_args_destroy(domount_args_t *dma,
 213     vnode_t *vp);
 214 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
 215     cred_t *);
 216 static void     nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
 217 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
 218     servinfo4_t *);
 219 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
 220     cred_t *);
 221 static struct nfs_args  *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
 222     ephemeral_servinfo_t *);
 223 static void     nfs4_trigger_nargs_destroy(struct nfs_args *);
 224 static char     *nfs4_trigger_create_mntopts(vfs_t *);
 225 static void     nfs4_trigger_destroy_mntopts(char *);
 226 static int      nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
 227 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
 228 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
 229     struct netbuf *, int);
 230
 231 extern int      umount2_engine(vfs_t *, int, cred_t *, int);
 232
 233 /*
 234  * These are the vnodeops that we must define for stub vnodes.
 235  *
 236  *
 237  * Many of the VOPs defined for NFSv4 do not need to be defined here,
 238  * for various reasons. This will result in the VFS default function being
 239  * used:
 240  *
 241  * - These VOPs require a previous fop_open to have occurred. That will have
 242  *   lost the reference to the stub vnode, meaning these should not be called:
 243  *       close, read, write, ioctl, readdir, seek.
 244  *
 245  * - These VOPs are meaningless for vnodes without data pages. Since the
 246  *   stub vnode is of type VDIR, these should not be called:
 247  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
 248  *
 249  * - These VOPs are otherwise not applicable, and should not be called:
 250  *       dump, setsecattr.
 251  *
 252  *
 253  * These VOPs we do not want to define, but nor do we want the VFS default
 254  * action. Instead, we specify the an error function.
 255  *
 256  * -   frlock, dispose, shrlock.
 257  *
 258  *
 259  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
 260  * NOTE: if any of these ops involve an OTW call with the stub FH, then
 261  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
 262  * to protect the security data in the servinfo4_t for the "parent"
 263  * filesystem that contains the stub.
 264  *
 265  * - These VOPs should not trigger a mount, so that "ls -l" does not:
 266  *       pathconf, getsecattr.
 267  *
 268  * - These VOPs would not make sense to trigger:
 269  *       inactive, rwlock, rwunlock, fid, realvp.
 270  */
 271 const struct vnodeops nfs4_trigger_vnodeops = {
 272         .vnop_name = "nfs4_trigger",
 273         .vop_open = nfs4_trigger_open,
 274         .vop_getattr = nfs4_trigger_getattr,
 275         .vop_setattr = nfs4_trigger_setattr,
 276         .vop_access = nfs4_trigger_access,
 277         .vop_lookup = nfs4_trigger_lookup,
 278         .vop_create = nfs4_trigger_create,
 279         .vop_remove = nfs4_trigger_remove,
 280         .vop_link = nfs4_trigger_link,
 281         .vop_rename = nfs4_trigger_rename,
 282         .vop_mkdir = nfs4_trigger_mkdir,
 283         .vop_rmdir = nfs4_trigger_rmdir,
 284         .vop_symlink = nfs4_trigger_symlink,
 285         .vop_readlink = nfs4_trigger_readlink,
 286         .vop_inactive = nfs4_inactive,
 287         .vop_fid = nfs4_fid,
 288         .vop_rwlock = nfs4_rwlock,
 289         .vop_rwunlock = nfs4_rwunlock,
 290         .vop_realvp = nfs4_realvp,
 291         .vop_getsecattr = nfs4_getsecattr,
 292         .vop_pathconf = nfs4_pathconf,
 293         .vop_frlock = fs_nosys,
 294         .vop_dispose = fs_nodispose,
 295         .vop_shrlock = fs_nosys,
 296         .vop_vnevent = fs_vnevent_support,
 297 };
 298
 299 static void
 300 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
 301 {
 302         ASSERT(mutex_owned(&net->net_cnt_lock));
 303         net->net_refcnt++;
 304         ASSERT(net->net_refcnt != 0);
 305 }
 306
 307 static void
 308 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
 309 {
 310         mutex_enter(&net->net_cnt_lock);
 311         nfs4_ephemeral_tree_incr(net);
 312         mutex_exit(&net->net_cnt_lock);
 313 }
 314
 315 /*
 316  * We need a safe way to decrement the refcnt whilst the
 317  * lock is being held.
 318  */
 319 static void
 320 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
 321 {
 322         ASSERT(mutex_owned(&net->net_cnt_lock));
 323         ASSERT(net->net_refcnt != 0);
 324         net->net_refcnt--;
 325 }
 326
 327 static void
 328 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
 329 {
 330         mutex_enter(&net->net_cnt_lock);
 331         nfs4_ephemeral_tree_decr(net);
 332         mutex_exit(&net->net_cnt_lock);
 333 }
 334
 335 /*
 336  * Trigger ops for stub vnodes; for mirror mounts, etc.
 337  *
 338  * The general idea is that a "triggering" op will first call
 339  * nfs4_trigger_mount(), which will find out whether a mount has already
 340  * been triggered.
 341  *
 342  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 343  * of the covering vfs.
 344  *
 345  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 346  * and again set newvp, as above.
 347  *
 348  * The triggering op may then re-issue the VOP by calling it on newvp.
 349  *
 350  * Note that some ops may perform custom action, and may or may not need
 351  * to trigger a mount.
 352  *
 353  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 354  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 355  * and that would just recurse. Instead, we call the v4 op directly,
 356  * by name.  This is OK, since we know that the vnode is for NFSv4,
 357  * otherwise it couldn't be a stub.
 358  *
 359  */
 360
 361 static int
 362 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 363 {
 364         int error;
 365         vnode_t *newvp;
 366
 367         error = nfs4_trigger_mount(*vpp, cr, &newvp);
 368         if (error)
 369                 return (error);
 370
 371         /* Release the stub vnode, as we're losing the reference to it */
 372         VN_RELE(*vpp);
 373
 374         /* Give the caller the root vnode of the newly-mounted fs */
 375         *vpp = newvp;
 376
 377         /* return with VN_HELD(newvp) */
 378         return (fop_open(vpp, flag, cr, ct));
 379 }
 380
 381 void
 382 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
 383 {
 384         uint_t mask;
 385         timespec_t now;
 386
 387         /*
 388          * Set some attributes here for referrals.
 389          */
 390         mask = vap->va_mask;
 391         bzero(vap, sizeof (struct vattr));
 392         vap->va_mask    = mask;
 393         vap->va_uid     = 0;
 394         vap->va_gid     = 0;
 395         vap->va_nlink   = 1;
 396         vap->va_size    = 1;
 397         gethrestime(&now);
 398         vap->va_atime   = now;
 399         vap->va_mtime   = now;
 400         vap->va_ctime   = now;
 401         vap->va_type    = VDIR;
 402         vap->va_mode    = 0555;
 403         vap->va_fsid    = vp->v_vfsp->vfs_dev;
 404         vap->va_rdev    = 0;
 405         vap->va_blksize = MAXBSIZE;
 406         vap->va_nblocks = 1;
 407         vap->va_seq     = 0;
 408 }
 409
 410 /*
 411  * For the majority of cases, nfs4_trigger_getattr() will not trigger
 412  * a mount. However, if ATTR_TRIGGER is set, we are being informed
 413  * that we need to force the mount before we attempt to determine
 414  * the attributes. The intent is an atomic operation for security
 415  * testing.
 416  *
 417  * If we're not triggering a mount, we can still inquire about the
 418  * actual attributes from the server in the mirror mount case,
 419  * and will return manufactured attributes for a referral (see
 420  * the 'create' branch of find_referral_stubvp()).
 421  */
 422 static int
 423 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
 424     caller_context_t *ct)
 425 {
 426         int error;
 427
 428         if (flags & ATTR_TRIGGER) {
 429                 vnode_t *newvp;
 430
 431                 error = nfs4_trigger_mount(vp, cr, &newvp);
 432                 if (error)
 433                         return (error);
 434
 435                 error = fop_getattr(newvp, vap, flags, cr, ct);
 436                 VN_RELE(newvp);
 437
 438         } else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
 439
 440                 error = nfs4_getattr(vp, vap, flags, cr, ct);
 441
 442         } else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
 443
 444                 nfs4_fake_attrs(vp, vap);
 445                 error = 0;
 446         }
 447
 448         return (error);
 449 }
 450
 451 static int
 452 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
 453     caller_context_t *ct)
 454 {
 455         int error;
 456         vnode_t *newvp;
 457
 458         error = nfs4_trigger_mount(vp, cr, &newvp);
 459         if (error)
 460                 return (error);
 461
 462         error = fop_setattr(newvp, vap, flags, cr, ct);
 463         VN_RELE(newvp);
 464
 465         return (error);
 466 }
 467
 468 static int
 469 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
 470     caller_context_t *ct)
 471 {
 472         int error;
 473         vnode_t *newvp;
 474
 475         error = nfs4_trigger_mount(vp, cr, &newvp);
 476         if (error)
 477                 return (error);
 478
 479         error = fop_access(newvp, mode, flags, cr, ct);
 480         VN_RELE(newvp);
 481
 482         return (error);
 483 }
 484
 485 static int
 486 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
 487     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
 488     caller_context_t *ct, int *deflags, pathname_t *rpnp)
 489 {
 490         int error;
 491         vnode_t *newdvp;
 492         rnode4_t *drp = VTOR4(dvp);
 493
 494         ASSERT(RP_ISSTUB(drp));
 495
 496         /*
 497          * It's not legal to lookup ".." for an fs root, so we mustn't pass
 498          * that up. Instead, pass onto the regular op, regardless of whether
 499          * we've triggered a mount.
 500          */
 501         if (strcmp(nm, "..") == 0)
 502                 if (RP_ISSTUB_MIRRORMOUNT(drp)) {
 503                         return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
 504                             ct, deflags, rpnp));
 505                 } else if (RP_ISSTUB_REFERRAL(drp)) {
 506                         /* Return the parent vnode */
 507                         return (vtodv(dvp, vpp, cr, TRUE));
 508                 }
 509
 510         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 511         if (error)
 512                 return (error);
 513
 514         error = fop_lookup(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
 515             deflags, rpnp);
 516         VN_RELE(newdvp);
 517
 518         return (error);
 519 }
 520
 521 static int
 522 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
 523     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
 524     int flags, caller_context_t *ct, vsecattr_t *vsecp)
 525 {
 526         int error;
 527         vnode_t *newdvp;
 528
 529         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 530         if (error)
 531                 return (error);
 532
 533         error = fop_create(newdvp, nm, va, exclusive, mode, vpp, cr,
 534             flags, ct, vsecp);
 535         VN_RELE(newdvp);
 536
 537         return (error);
 538 }
 539
 540 static int
 541 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
 542     int flags)
 543 {
 544         int error;
 545         vnode_t *newdvp;
 546
 547         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 548         if (error)
 549                 return (error);
 550
 551         error = fop_remove(newdvp, nm, cr, ct, flags);
 552         VN_RELE(newdvp);
 553
 554         return (error);
 555 }
 556
 557 static int
 558 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
 559     caller_context_t *ct, int flags)
 560 {
 561         int error;
 562         vnode_t *newtdvp;
 563
 564         error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
 565         if (error)
 566                 return (error);
 567
 568         /*
 569          * We don't check whether svp is a stub. Let the NFSv4 code
 570          * detect that error, and return accordingly.
 571          */
 572         error = fop_link(newtdvp, svp, tnm, cr, ct, flags);
 573         VN_RELE(newtdvp);
 574
 575         return (error);
 576 }
 577
 578 static int
 579 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
 580     cred_t *cr, caller_context_t *ct, int flags)
 581 {
 582         int error;
 583         vnode_t *newsdvp;
 584         rnode4_t *tdrp = VTOR4(tdvp);
 585
 586         /*
 587          * We know that sdvp is a stub, otherwise we would not be here.
 588          *
 589          * If tdvp is also be a stub, there are two possibilities: it
 590          * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
 591          * or it is a different stub [!VN_CMP(sdvp, tdvp)].
 592          *
 593          * In the former case, just trigger sdvp, and treat tdvp as
 594          * though it were not a stub.
 595          *
 596          * In the latter case, it might be a different stub for the
 597          * same server fs as sdvp, or for a different server fs.
 598          * Regardless, from the client perspective this would still
 599          * be a cross-filesystem rename, and should not be allowed,
 600          * so return EXDEV, without triggering either mount.
 601          */
 602         if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
 603                 return (EXDEV);
 604
 605         error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
 606         if (error)
 607                 return (error);
 608
 609         error = fop_rename(newsdvp, snm, tdvp, tnm, cr, ct, flags);
 610
 611         VN_RELE(newsdvp);
 612
 613         return (error);
 614 }
 615
 616 /* ARGSUSED */
 617 static int
 618 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
 619     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
 620 {
 621         int error;
 622         vnode_t *newdvp;
 623
 624         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 625         if (error)
 626                 return (error);
 627
 628         error = fop_mkdir(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
 629         VN_RELE(newdvp);
 630
 631         return (error);
 632 }
 633
 634 static int
 635 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
 636     caller_context_t *ct, int flags)
 637 {
 638         int error;
 639         vnode_t *newdvp;
 640
 641         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 642         if (error)
 643                 return (error);
 644
 645         error = fop_rmdir(newdvp, nm, cdir, cr, ct, flags);
 646         VN_RELE(newdvp);
 647
 648         return (error);
 649 }
 650
 651 static int
 652 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
 653     cred_t *cr, caller_context_t *ct, int flags)
 654 {
 655         int error;
 656         vnode_t *newdvp;
 657
 658         error = nfs4_trigger_mount(dvp, cr, &newdvp);
 659         if (error)
 660                 return (error);
 661
 662         error = fop_symlink(newdvp, lnm, tva, tnm, cr, ct, flags);
 663         VN_RELE(newdvp);
 664
 665         return (error);
 666 }
 667
 668 static int
 669 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
 670     caller_context_t *ct)
 671 {
 672         int error;
 673         vnode_t *newvp;
 674
 675         error = nfs4_trigger_mount(vp, cr, &newvp);
 676         if (error)
 677                 return (error);
 678
 679         error = fop_readlink(newvp, uiop, cr, ct);
 680         VN_RELE(newvp);
 681
 682         return (error);
 683 }
 684
 685 /* end of trigger vnode ops */
 686
 687 /*
 688  * See if the mount has already been done by another caller.
 689  */
 690 static int
 691 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
 692     bool_t *was_mounted, vfs_t **vfsp)
 693 {
 694         int             error;
 695         mntinfo4_t      *mi = VTOMI4(vp);
 696
 697         *was_mounted = FALSE;
 698
 699         error = vn_vfsrlock_wait(vp);
 700         if (error)
 701                 return (error);
 702
 703         *vfsp = vn_mountedvfs(vp);
 704         if (*vfsp != NULL) {
 705                 /* the mount has already occurred */
 706                 error = VFS_ROOT(*vfsp, newvpp);
 707                 if (!error) {
 708                         /* need to update the reference time  */
 709                         mutex_enter(&mi->mi_lock);
 710                         if (mi->mi_ephemeral)
 711                                 mi->mi_ephemeral->ne_ref_time =
 712                                     gethrestime_sec();
 713                         mutex_exit(&mi->mi_lock);
 714
 715                         *was_mounted = TRUE;
 716                 }
 717         }
 718
 719         vn_vfsunlock(vp);
 720         return (0);
 721 }
 722
 723 /*
 724  * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
 725  *
 726  * The mount may have already occurred, via another thread. If not,
 727  * assemble the location information - which may require fetching - and
 728  * perform the mount.
 729  *
 730  * Sets newvp to be the root of the fs that is now covering vp. Note
 731  * that we return with VN_HELD(*newvp).
 732  *
 733  * The caller is responsible for passing the VOP onto the covering fs.
 734  */
 735 static int
 736 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
 737 {
 738         int                      error;
 739         vfs_t                   *vfsp;
 740         rnode4_t                *rp = VTOR4(vp);
 741         mntinfo4_t              *mi = VTOMI4(vp);
 742         domount_args_t          *dma;
 743
 744         nfs4_ephemeral_tree_t   *net;
 745
 746         bool_t                  must_unlock = FALSE;
 747         bool_t                  is_building = FALSE;
 748         bool_t                  was_mounted = FALSE;
 749
 750         cred_t                  *mcred = NULL;
 751
 752         nfs4_trigger_globals_t  *ntg;
 753
 754         zone_t                  *zone = curproc->p_zone;
 755
 756         ASSERT(RP_ISSTUB(rp));
 757
 758         *newvpp = NULL;
 759
 760         /*
 761          * Has the mount already occurred?
 762          */
 763         error = nfs4_trigger_mounted_already(vp, newvpp,
 764             &was_mounted, &vfsp);
 765         if (error || was_mounted)
 766                 goto done;
 767
 768         ntg = zone_getspecific(nfs4_ephemeral_key, zone);
 769         ASSERT(ntg != NULL);
 770
 771         mutex_enter(&mi->mi_lock);
 772
 773         /*
 774          * We need to lock down the ephemeral tree.
 775          */
 776         if (mi->mi_ephemeral_tree == NULL) {
 777                 net = kmem_zalloc(sizeof (*net), KM_SLEEP);
 778                 mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 779                 mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
 780                 net->net_refcnt = 1;
 781                 net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
 782                 is_building = TRUE;
 783
 784                 /*
 785                  * We need to add it to the zone specific list for
 786                  * automatic unmounting and harvesting of deadwood.
 787                  */
 788                 mutex_enter(&ntg->ntg_forest_lock);
 789                 if (ntg->ntg_forest != NULL)
 790                         net->net_next = ntg->ntg_forest;
 791                 ntg->ntg_forest = net;
 792                 mutex_exit(&ntg->ntg_forest_lock);
 793
 794                 /*
 795                  * No lock order confusion with mi_lock because no
 796                  * other node could have grabbed net_tree_lock.
 797                  */
 798                 mutex_enter(&net->net_tree_lock);
 799                 mi->mi_ephemeral_tree = net;
 800                 net->net_mount = mi;
 801                 mutex_exit(&mi->mi_lock);
 802
 803                 MI4_HOLD(mi);
 804                 VFS_HOLD(mi->mi_vfsp);
 805         } else {
 806                 net = mi->mi_ephemeral_tree;
 807                 nfs4_ephemeral_tree_hold(net);
 808
 809                 mutex_exit(&mi->mi_lock);
 810
 811                 mutex_enter(&net->net_tree_lock);
 812
 813                 /*
 814                  * We can only procede if the tree is neither locked
 815                  * nor being torn down.
 816                  */
 817                 mutex_enter(&net->net_cnt_lock);
 818                 if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
 819                         nfs4_ephemeral_tree_decr(net);
 820                         mutex_exit(&net->net_cnt_lock);
 821                         mutex_exit(&net->net_tree_lock);
 822
 823                         return (EIO);
 824                 }
 825                 mutex_exit(&net->net_cnt_lock);
 826         }
 827
 828         mutex_enter(&net->net_cnt_lock);
 829         net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
 830         mutex_exit(&net->net_cnt_lock);
 831
 832         must_unlock = TRUE;
 833
 834         error = nfs4_trigger_domount_args_create(vp, cr, &dma);
 835         if (error)
 836                 goto done;
 837
 838         /*
 839          * Note that since we define mirror mounts to work
 840          * for any user, we simply extend the privileges of
 841          * the user's credentials to allow the mount to
 842          * proceed.
 843          */
 844         mcred = crdup(cr);
 845         if (mcred == NULL) {
 846                 error = EINVAL;
 847                 nfs4_trigger_domount_args_destroy(dma, vp);
 848                 goto done;
 849         }
 850
 851         crset_zone_privall(mcred);
 852
 853         error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
 854         nfs4_trigger_domount_args_destroy(dma, vp);
 855
 856         DTRACE_PROBE2(nfs4clnt__func__referral__mount,
 857             vnode_t *, vp, int, error);
 858
 859         crfree(mcred);
 860
 861 done:
 862
 863         if (must_unlock) {
 864                 mutex_enter(&net->net_cnt_lock);
 865                 net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
 866
 867                 /*
 868                  * REFCNT: If we are the root of the tree, then we need
 869                  * to keep a reference because we malloced the tree and
 870                  * this is where we tied it to our mntinfo.
 871                  *
 872                  * If we are not the root of the tree, then our tie to
 873                  * the mntinfo occured elsewhere and we need to
 874                  * decrement the reference to the tree.
 875                  */
 876                 if (is_building)
 877                         net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
 878                 else
 879                         nfs4_ephemeral_tree_decr(net);
 880                 mutex_exit(&net->net_cnt_lock);
 881
 882                 mutex_exit(&net->net_tree_lock);
 883         }
 884
 885         if (!error && (newvpp == NULL || *newvpp == NULL))
 886                 error = ENOSYS;
 887
 888         return (error);
 889 }
 890
 891 /*
 892  * Collect together both the generic & mount-type specific args.
 893  */
 894 static int
 895 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
 896 {
 897         int nointr;
 898         char *hostlist;
 899         servinfo4_t *svp;
 900         struct nfs_args *nargs, *nargs_head;
 901         enum clnt_stat status;
 902         ephemeral_servinfo_t *esi, *esi_first;
 903         domount_args_t *dma;
 904         mntinfo4_t *mi = VTOMI4(vp);
 905
 906         nointr = !(mi->mi_flags & MI4_INT);
 907         hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 908
 909         svp = mi->mi_curr_serv;
 910         /* check if the current server is responding */
 911         status = nfs4_trigger_ping_server(svp, nointr);
 912         if (status == RPC_SUCCESS) {
 913                 esi_first = nfs4_trigger_esi_create(vp, svp, cr);
 914                 if (esi_first == NULL) {
 915                         kmem_free(hostlist, MAXPATHLEN);
 916                         return (EINVAL);
 917                 }
 918
 919                 (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
 920
 921                 nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
 922         } else {
 923                 /* current server did not respond */
 924                 esi_first = NULL;
 925                 nargs_head = NULL;
 926         }
 927         nargs = nargs_head;
 928
 929         /*
 930          * NFS RO failover.
 931          *
 932          * If we have multiple servinfo4 structures, linked via sv_next,
 933          * we must create one nfs_args for each, linking the nfs_args via
 934          * nfs_ext_u.nfs_extB.next.
 935          *
 936          * We need to build a corresponding esi for each, too, but that is
 937          * used solely for building nfs_args, and may be immediately
 938          * discarded, as domount() requires the info from just one esi,
 939          * but all the nfs_args.
 940          *
 941          * Currently, the NFS mount code will hang if not all servers
 942          * requested are available. To avoid that, we need to ping each
 943          * server, here, and remove it from the list if it is not
 944          * responding. This has the side-effect of that server then
 945          * being permanently unavailable for this failover mount, even if
 946          * it recovers. That's unfortunate, but the best we can do until
 947          * the mount code path is fixed.
 948          */
 949
 950         /*
 951          * If the current server was down, loop indefinitely until we find
 952          * at least one responsive server.
 953          */
 954         do {
 955                 /* no locking needed for sv_next; it is only set at fs mount */
 956                 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
 957                         struct nfs_args *next;
 958
 959                         /*
 960                          * nargs_head: the head of the nfs_args list
 961                          * nargs: the current tail of the list
 962                          * next: the newly-created element to be added
 963                          */
 964
 965                         /*
 966                          * We've already tried the current server, above;
 967                          * if it was responding, we have already included it
 968                          * and it may now be ignored.
 969                          *
 970                          * Otherwise, try it again, since it may now have
 971                          * recovered.
 972                          */
 973                         if (svp == mi->mi_curr_serv && esi_first != NULL)
 974                                 continue;
 975
 976                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
 977                         if (svp->sv_flags & SV4_NOTINUSE) {
 978                                 nfs_rw_exit(&svp->sv_lock);
 979                                 continue;
 980                         }
 981                         nfs_rw_exit(&svp->sv_lock);
 982
 983                         /* check if the server is responding */
 984                         status = nfs4_trigger_ping_server(svp, nointr);
 985                         if (status == RPC_INTR) {
 986                                 kmem_free(hostlist, MAXPATHLEN);
 987                                 nfs4_trigger_esi_destroy(esi_first, vp);
 988                                 nargs = nargs_head;
 989                                 while (nargs != NULL) {
 990                                         next = nargs->nfs_ext_u.nfs_extB.next;
 991                                         nfs4_trigger_nargs_destroy(nargs);
 992                                         nargs = next;
 993                                 }
 994                                 return (EINTR);
 995                         } else if (status != RPC_SUCCESS) {
 996                                 /* if the server did not respond, ignore it */
 997                                 continue;
 998                         }
 999
1000                         esi = nfs4_trigger_esi_create(vp, svp, cr);
1001                         if (esi == NULL)
1002                                 continue;
1003
1004                         /*
1005                          * If the original current server (mi_curr_serv)
1006                          * was down when when we first tried it,
1007                          * (i.e. esi_first == NULL),
1008                          * we select this new server (svp) to be the server
1009                          * that we will actually contact (esi_first).
1010                          *
1011                          * Note that it's possible that mi_curr_serv == svp,
1012                          * if that mi_curr_serv was down but has now recovered.
1013                          */
1014                         next = nfs4_trigger_nargs_create(mi, svp, esi);
1015                         if (esi_first == NULL) {
1016                                 ASSERT(nargs == NULL);
1017                                 ASSERT(nargs_head == NULL);
1018                                 nargs_head = next;
1019                                 esi_first = esi;
1020                                 (void) strlcpy(hostlist,
1021                                     esi_first->esi_hostname, MAXPATHLEN);
1022                         } else {
1023                                 ASSERT(nargs_head != NULL);
1024                                 nargs->nfs_ext_u.nfs_extB.next = next;
1025                                 (void) strlcat(hostlist, ",", MAXPATHLEN);
1026                                 (void) strlcat(hostlist, esi->esi_hostname,
1027                                     MAXPATHLEN);
1028                                 /* esi was only needed for hostname & nargs */
1029                                 nfs4_trigger_esi_destroy(esi, vp);
1030                         }
1031
1032                         nargs = next;
1033                 }
1034
1035                 /* if we've had no response at all, wait a second */
1036                 if (esi_first == NULL)
1037                         ddi_sleep(1);
1038
1039         } while (esi_first == NULL);
1040         ASSERT(nargs_head != NULL);
1041
1042         dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1043         dma->dma_esi = esi_first;
1044         dma->dma_hostlist = hostlist;
1045         dma->dma_nargs = nargs_head;
1046         *dmap = dma;
1047
1048         return (0);
1049 }
1050
1051 static void
1052 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1053 {
1054         if (dma != NULL) {
1055                 if (dma->dma_esi != NULL && vp != NULL)
1056                         nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1057
1058                 if (dma->dma_hostlist != NULL)
1059                         kmem_free(dma->dma_hostlist, MAXPATHLEN);
1060
1061                 if (dma->dma_nargs != NULL) {
1062                         struct nfs_args *nargs = dma->dma_nargs;
1063
1064                         do {
1065                                 struct nfs_args *next =
1066                                     nargs->nfs_ext_u.nfs_extB.next;
1067
1068                                 nfs4_trigger_nargs_destroy(nargs);
1069                                 nargs = next;
1070                         } while (nargs != NULL);
1071                 }
1072
1073                 kmem_free(dma, sizeof (domount_args_t));
1074         }
1075 }
1076
1077 /*
1078  * The ephemeral_servinfo_t struct contains basic information we will need to
1079  * perform the mount. Whilst the structure is generic across different
1080  * types of ephemeral mount, the way we gather its contents differs.
1081  */
1082 static ephemeral_servinfo_t *
1083 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1084 {
1085         ephemeral_servinfo_t *esi;
1086         rnode4_t *rp = VTOR4(vp);
1087
1088         ASSERT(RP_ISSTUB(rp));
1089
1090         /* Call the ephemeral type-specific routine */
1091         if (RP_ISSTUB_MIRRORMOUNT(rp))
1092                 esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1093         else if (RP_ISSTUB_REFERRAL(rp))
1094                 esi = nfs4_trigger_esi_create_referral(vp, cr);
1095         else
1096                 esi = NULL;
1097         return (esi);
1098 }
1099
1100 static void
1101 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1102 {
1103         rnode4_t *rp = VTOR4(vp);
1104
1105         ASSERT(RP_ISSTUB(rp));
1106
1107         /* Currently, no need for an ephemeral type-specific routine */
1108
1109         /*
1110          * The contents of ephemeral_servinfo_t goes into nfs_args,
1111          * and will be handled by nfs4_trigger_nargs_destroy().
1112          * We need only free the structure itself.
1113          */
1114         if (esi != NULL)
1115                 kmem_free(esi, sizeof (ephemeral_servinfo_t));
1116 }
1117
1118 /*
1119  * Some of this may turn out to be common with other ephemeral types,
1120  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1121  * common function called.
1122  */
1123
1124 /*
1125  * Mirror mounts case - should have all data available
1126  */
1127 static ephemeral_servinfo_t *
1128 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1129 {
1130         char                    *stubpath;
1131         struct knetconfig       *sikncp, *svkncp;
1132         struct netbuf           *bufp;
1133         ephemeral_servinfo_t    *esi;
1134
1135         esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1136
1137         /* initially set to be our type of ephemeral mount; may be added to */
1138         esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1139
1140         /*
1141          * We're copying info from the stub rnode's servinfo4, but
1142          * we must create new copies, not pointers, since this information
1143          * is to be associated with the new mount, which will be
1144          * unmounted (and its structures freed) separately
1145          */
1146
1147         /*
1148          * Sizes passed to kmem_[z]alloc here must match those freed
1149          * in nfs4_free_args()
1150          */
1151
1152         /*
1153          * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1154          * is difficult to avoid: as we need to read svp to calculate the
1155          * sizes to be allocated.
1156          */
1157         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1158
1159         esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1160         (void) strcat(esi->esi_hostname, svp->sv_hostname);
1161
1162         esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1163         bufp = esi->esi_addr;
1164         bufp->len = svp->sv_addr.len;
1165         bufp->maxlen = svp->sv_addr.maxlen;
1166         bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1167         bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1168
1169         esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1170         sikncp = esi->esi_knconf;
1171         svkncp = svp->sv_knconf;
1172         sikncp->knc_semantics = svkncp->knc_semantics;
1173         sikncp->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1174         (void) strcat((char *)sikncp->knc_protofmly,
1175             (char *)svkncp->knc_protofmly);
1176         sikncp->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1177         (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1178         sikncp->knc_rdev = svkncp->knc_rdev;
1179
1180         /*
1181          * Used when AUTH_DH is negotiated.
1182          *
1183          * This is ephemeral mount-type specific, since it contains the
1184          * server's time-sync syncaddr.
1185          */
1186         if (svp->sv_dhsec) {
1187                 struct netbuf *bufp;
1188                 sec_data_t *sdata;
1189                 dh_k4_clntdata_t *data;
1190
1191                 sdata = svp->sv_dhsec;
1192                 data = (dh_k4_clntdata_t *)sdata->data;
1193                 ASSERT(sdata->rpcflavor == AUTH_DH);
1194
1195                 bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1196                 bufp->len = data->syncaddr.len;
1197                 bufp->maxlen = data->syncaddr.maxlen;
1198                 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1199                 bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1200                 esi->esi_syncaddr = bufp;
1201
1202                 if (data->netname != NULL) {
1203                         int nmlen = data->netnamelen;
1204
1205                         /*
1206                          * We need to copy from a dh_k4_clntdata_t
1207                          * netname/netnamelen pair to a NUL-terminated
1208                          * netname string suitable for putting in nfs_args,
1209                          * where the latter has no netnamelen field.
1210                          */
1211                         esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1212                         bcopy(data->netname, esi->esi_netname, nmlen);
1213                 }
1214         } else {
1215                 esi->esi_syncaddr = NULL;
1216                 esi->esi_netname = NULL;
1217         }
1218
1219         stubpath = fn_path(VTOSV(vp)->sv_name);
1220         /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1221         ASSERT(*stubpath == '.');
1222         stubpath += 1;
1223
1224         /* for nfs_args->fh */
1225         esi->esi_path_len = strlen(stubpath) + 1;
1226         if (strcmp(svp->sv_path, "/") != 0)
1227                 esi->esi_path_len += strlen(svp->sv_path);
1228         esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1229         if (strcmp(svp->sv_path, "/") != 0)
1230                 (void) strcat(esi->esi_path, svp->sv_path);
1231         (void) strcat(esi->esi_path, stubpath);
1232
1233         stubpath -= 1;
1234         /* stubpath allocated by fn_path() */
1235         kmem_free(stubpath, strlen(stubpath) + 1);
1236
1237         nfs_rw_exit(&svp->sv_lock);
1238
1239         return (esi);
1240 }
1241
1242 /*
1243  * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1244  * get network information required to do the mount call.
1245  */
1246 int
1247 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1248 {
1249         door_arg_t      door_args;
1250         door_handle_t   dh;
1251         XDR             xdr;
1252         refd_door_args_t *xdr_argsp;
1253         refd_door_res_t  *orig_resp;
1254         k_sigset_t      smask;
1255         int             xdr_len = 0;
1256         int             res_len = 16; /* length of an ip adress */
1257         int             orig_reslen = res_len;
1258         int             error = 0;
1259         struct nfsidmap_globals *nig;
1260
1261         if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1262                 return (ECONNREFUSED);
1263
1264         nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1265         ASSERT(nig != NULL);
1266
1267         mutex_enter(&nig->nfsidmap_daemon_lock);
1268         dh = nig->nfsidmap_daemon_dh;
1269         if (dh == NULL) {
1270                 mutex_exit(&nig->nfsidmap_daemon_lock);
1271                 cmn_err(CE_NOTE,
1272                     "nfs4_callmapid: nfsmapid daemon not " \
1273                     "running unable to resolve host name\n");
1274                 return (EINVAL);
1275         }
1276         door_ki_hold(dh);
1277         mutex_exit(&nig->nfsidmap_daemon_lock);
1278
1279         xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1280
1281         xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1282         xdr_argsp->xdr_len = xdr_len;
1283         xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1284
1285         xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1286             xdr_len, XDR_ENCODE);
1287
1288         if (!xdr_utf8string(&xdr, server)) {
1289                 kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1290                 door_ki_rele(dh);
1291                 return (1);
1292         }
1293
1294         if (orig_reslen)
1295                 orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1296
1297         door_args.data_ptr = (char *)xdr_argsp;
1298         door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1299         door_args.desc_ptr = NULL;
1300         door_args.desc_num = 0;
1301         door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1302         door_args.rsize = res_len;
1303
1304         sigintr(&smask, 1);
1305         error = door_ki_upcall(dh, &door_args);
1306         sigunintr(&smask);
1307
1308         door_ki_rele(dh);
1309
1310         kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1311         if (error) {
1312                 kmem_free(orig_resp, orig_reslen);
1313                 /*
1314                  * There is no door to connect to. The referral daemon
1315                  * must not be running yet.
1316                  */
1317                 cmn_err(CE_WARN,
1318                     "nfsmapid not running cannot resolve host name");
1319                 goto out;
1320         }
1321
1322         /*
1323          * If the results buffer passed back are not the same as
1324          * what was sent free the old buffer and use the new one.
1325          */
1326         if (orig_resp && orig_reslen) {
1327                 refd_door_res_t *door_resp;
1328
1329                 door_resp = (refd_door_res_t *)door_args.rbuf;
1330                 if ((void *)door_args.rbuf != orig_resp)
1331                         kmem_free(orig_resp, orig_reslen);
1332                 if (door_resp->res_status == 0) {
1333                         xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1334                             door_resp->xdr_len, XDR_DECODE);
1335                         bzero(resp, sizeof (struct nfs_fsl_info));
1336                         if (!xdr_nfs_fsl_info(&xdr, resp)) {
1337                                 DTRACE_PROBE2(
1338                                     nfs4clnt__debug__referral__upcall__xdrfail,
1339                                     struct nfs_fsl_info *, resp,
1340                                     char *, "nfs4_callmapid");
1341                                 error = EINVAL;
1342                         }
1343                 } else {
1344                         DTRACE_PROBE2(
1345                             nfs4clnt__debug__referral__upcall__badstatus,
1346                             int, door_resp->res_status,
1347                             char *, "nfs4_callmapid");
1348                         error = door_resp->res_status;
1349                 }
1350                 kmem_free(door_args.rbuf, door_args.rsize);
1351         }
1352 out:
1353         DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1354             char *, server, int, error);
1355         return (error);
1356 }
1357
1358 /*
1359  * Fetches the fs_locations attribute. Typically called
1360  * from a Replication/Migration/Referrals/Mirror-mount context
1361  *
1362  * Fills in the attributes in garp. The caller is assumed
1363  * to have allocated memory for garp.
1364  *
1365  * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1366  *       it's already done by caller. Otherwise lock these mutexes
1367  *       before doing the rfs4call().
1368  *
1369  * Returns
1370  *      1        for success
1371  *      0        for failure
1372  */
1373 int
1374 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1375     cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1376 {
1377         COMPOUND4args_clnt args;
1378         COMPOUND4res_clnt res;
1379         nfs_argop4 *argop;
1380         int argoplist_size = 3 * sizeof (nfs_argop4);
1381         nfs4_server_t *sp = NULL;
1382         int doqueue = 1;
1383         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1384         int retval = 1;
1385         struct nfs4_clnt *nfscl;
1386
1387         if (lock == TRUE)
1388                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1389         else
1390                 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1391                     nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1392
1393         sp = find_nfs4_server(mi);
1394         if (lock == TRUE)
1395                 nfs_rw_exit(&mi->mi_recovlock);
1396
1397         if (sp != NULL)
1398                 mutex_exit(&sp->s_lock);
1399
1400         if (lock == TRUE) {
1401                 if (sp != NULL)
1402                         (void) nfs_rw_enter_sig(&sp->s_recovlock,
1403                             RW_WRITER, 0);
1404                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1405         } else {
1406                 if (sp != NULL) {
1407                         ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1408                             nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1409                 }
1410         }
1411
1412         /*
1413          * Do we want to do the setup for recovery here?
1414          *
1415          * We know that the server responded to a null ping a very
1416          * short time ago, and we know that we intend to do a
1417          * single stateless operation - we want to fetch attributes,
1418          * so we know we can't encounter errors about state.  If
1419          * something goes wrong with the GETATTR, like not being
1420          * able to get a response from the server or getting any
1421          * kind of FH error, we should fail the mount.
1422          *
1423          * We may want to re-visited this at a later time.
1424          */
1425         argop = kmem_alloc(argoplist_size, KM_SLEEP);
1426
1427         args.ctag = TAG_GETATTR_FSLOCATION;
1428         /* PUTFH LOOKUP GETATTR */
1429         args.array_len = 3;
1430         args.array = argop;
1431
1432         /* 0. putfh file */
1433         argop[0].argop = OP_CPUTFH;
1434         argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1435
1436         /* 1. lookup name, can't be dotdot */
1437         argop[1].argop = OP_CLOOKUP;
1438         argop[1].nfs_argop4_u.opclookup.cname = nm;
1439
1440         /* 2. file attrs */
1441         argop[2].argop = OP_GETATTR;
1442         argop[2].nfs_argop4_u.opgetattr.attr_request =
1443             FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1444             FATTR4_MOUNTED_ON_FILEID_MASK;
1445         argop[2].nfs_argop4_u.opgetattr.mi = mi;
1446
1447         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1448
1449         if (lock == TRUE) {
1450                 nfs_rw_exit(&mi->mi_recovlock);
1451                 if (sp != NULL)
1452                         nfs_rw_exit(&sp->s_recovlock);
1453         }
1454
1455         nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1456         nfscl->nfscl_stat.referrals.value.ui64++;
1457         DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1458             nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1459
1460         if (e.error != 0) {
1461                 if (sp != NULL)
1462                         nfs4_server_rele(sp);
1463                 kmem_free(argop, argoplist_size);
1464                 return (0);
1465         }
1466
1467         /*
1468          * Check for all possible error conditions.
1469          * For valid replies without an ops array or for illegal
1470          * replies, return a failure.
1471          */
1472         if (res.status != NFS4_OK || res.array_len < 3 ||
1473             res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1474                 retval = 0;
1475                 goto exit;
1476         }
1477
1478         /*
1479          * There isn't much value in putting the attributes
1480          * in the attr cache since fs_locations4 aren't
1481          * encountered very frequently, so just make them
1482          * available to the caller.
1483          */
1484         *garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1485
1486         DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1487             nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1488
1489         /* No fs_locations? -- return a failure */
1490         if (garp->n4g_ext_res == NULL ||
1491             garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1492                 retval = 0;
1493                 goto exit;
1494         }
1495
1496         if (!garp->n4g_fsid_valid)
1497                 retval = 0;
1498
1499 exit:
1500         if (retval == 0) {
1501                 /* the call was ok but failed validating the call results */
1502                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1503         } else {
1504                 ASSERT(callres != NULL);
1505                 *callres = res;
1506         }
1507
1508         if (sp != NULL)
1509                 nfs4_server_rele(sp);
1510         kmem_free(argop, argoplist_size);
1511         return (retval);
1512 }
1513
1514 /* tunable to disable referral mounts */
1515 int nfs4_no_referrals = 0;
1516
1517 /*
1518  * Returns NULL if the vnode cannot be created or found.
1519  */
1520 vnode_t *
1521 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1522 {
1523         nfs_fh4 *stub_fh, *dfh;
1524         nfs4_sharedfh_t *sfhp;
1525         char *newfhval;
1526         vnode_t *vp = NULL;
1527         fattr4_mounted_on_fileid mnt_on_fileid;
1528         nfs4_ga_res_t garp;
1529         mntinfo4_t *mi;
1530         COMPOUND4res_clnt callres;
1531         hrtime_t t;
1532
1533         if (nfs4_no_referrals)
1534                 return (NULL);
1535
1536         /*
1537          * Get the mounted_on_fileid, unique on that server::fsid
1538          */
1539         mi = VTOMI4(dvp);
1540         if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1541             &garp, &callres, FALSE) == 0)
1542                 return (NULL);
1543         mnt_on_fileid = garp.n4g_mon_fid;
1544         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1545
1546         /*
1547          * Build a fake filehandle from the dir FH and the mounted_on_fileid
1548          */
1549         dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1550         stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1551         stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1552             sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1553         newfhval = stub_fh->nfs_fh4_val;
1554
1555         /* copy directory's file handle */
1556         bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1557         stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1558         newfhval = newfhval + dfh->nfs_fh4_len;
1559
1560         /* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1561         bcopy((char *)&mnt_on_fileid, newfhval,
1562             sizeof (fattr4_mounted_on_fileid));
1563         stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1564
1565         sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1566         kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1567             sizeof (fattr4_mounted_on_fileid));
1568         kmem_free(stub_fh, sizeof (nfs_fh4));
1569         if (sfhp == NULL)
1570                 return (NULL);
1571
1572         t = gethrtime();
1573         garp.n4g_va.va_type = VDIR;
1574         vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1575             cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1576
1577         if (vp != NULL)
1578                 vp->v_type = VDIR;
1579
1580         sfh4_rele(&sfhp);
1581         return (vp);
1582 }
1583
1584 int
1585 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1586 {
1587         vnode_t *nvp;
1588         rnode4_t *rp;
1589
1590         if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1591                 return (EINVAL);
1592
1593         rp = VTOR4(nvp);
1594         mutex_enter(&rp->r_statelock);
1595         r4_stub_referral(rp);
1596         mutex_exit(&rp->r_statelock);
1597         dnlc_enter(dvp, nm, nvp);
1598
1599         if (*vpp != NULL)
1600                 VN_RELE(*vpp);  /* no longer need this vnode */
1601
1602         *vpp = nvp;
1603
1604         return (0);
1605 }
1606
1607 /*
1608  * Fetch the location information and resolve the new server.
1609  * Caller needs to free up the XDR data which is returned.
1610  * Input: mount info, shared filehandle, nodename
1611  * Return: Index to the result or Error(-1)
1612  * Output: FsLocations Info, Resolved Server Info.
1613  */
1614 int
1615 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1616     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1617     struct nfs_fsl_info *fsloc)
1618 {
1619         fs_location4 *fsp;
1620         struct nfs_fsl_info nfsfsloc;
1621         int ret, i, error;
1622         nfs4_ga_res_t garp;
1623         COMPOUND4res_clnt callres;
1624         struct knetconfig *knc;
1625
1626         ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1627         if (ret == 0)
1628                 return (-1);
1629
1630         /*
1631          * As a lame attempt to figuring out if we're
1632          * handling a migration event or a referral,
1633          * look for rnodes with this fsid in the rnode
1634          * cache.
1635          *
1636          * If we can find one or more such rnodes, it
1637          * means we're handling a migration event and
1638          * we want to bail out in that case.
1639          */
1640         if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1641                 DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1642                     mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1643                     char *, "nfs4_process_referral");
1644                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1645                 return (-1);
1646         }
1647
1648         /*
1649          * Find the first responsive server to mount.  When we find
1650          * one, fsp will point to it.
1651          */
1652         for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1653
1654                 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1655                 if (fsp->server_len == 0 || fsp->server_val == NULL)
1656                         continue;
1657
1658                 error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1659                 if (error != 0)
1660                         continue;
1661
1662                 error = nfs4_ping_server_common(nfsfsloc.knconf,
1663                     nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1664                 if (error == RPC_SUCCESS)
1665                         break;
1666
1667                 DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1668                     sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1669                     char *, "nfs4_process_referral");
1670
1671                 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1672         }
1673         knc = nfsfsloc.knconf;
1674         if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1675             (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1676                 DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1677                     nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1678                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1679                 return (-1);
1680         }
1681
1682         /* Send the results back */
1683         *fsloc = nfsfsloc;
1684         *grp = garp;
1685         *res = callres;
1686         return (i);
1687 }
1688
1689 /*
1690  * Referrals case - need to fetch referral data and then upcall to
1691  * user-level to get complete mount data.
1692  */
1693 static ephemeral_servinfo_t *
1694 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1695 {
1696         struct knetconfig       *sikncp, *svkncp;
1697         struct netbuf           *bufp;
1698         ephemeral_servinfo_t    *esi;
1699         vnode_t                 *dvp;
1700         rnode4_t                *drp;
1701         fs_location4            *fsp;
1702         struct nfs_fsl_info     nfsfsloc;
1703         nfs4_ga_res_t           garp;
1704         char                    *p;
1705         char                    fn[MAXNAMELEN];
1706         int                     i, index = -1;
1707         mntinfo4_t              *mi;
1708         COMPOUND4res_clnt       callres;
1709
1710         /*
1711          * If we're passed in a stub vnode that
1712          * isn't a "referral" stub, bail out
1713          * and return a failure
1714          */
1715         if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1716                 return (NULL);
1717
1718         if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1719                 return (NULL);
1720
1721         drp = VTOR4(dvp);
1722         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1723                 VN_RELE(dvp);
1724                 return (NULL);
1725         }
1726
1727         if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1728                 nfs_rw_exit(&drp->r_rwlock);
1729                 VN_RELE(dvp);
1730                 return (NULL);
1731         }
1732
1733         mi = VTOMI4(dvp);
1734         index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1735             &garp, &callres, &nfsfsloc);
1736         nfs_rw_exit(&drp->r_rwlock);
1737         VN_RELE(dvp);
1738         if (index < 0)
1739                 return (NULL);
1740
1741         fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1742         esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1743
1744         /* initially set to be our type of ephemeral mount; may be added to */
1745         esi->esi_mount_flags = NFSMNT_REFERRAL;
1746
1747         esi->esi_hostname =
1748             kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1749         bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1750             fsp->server_val->utf8string_len);
1751         esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1752
1753         bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1754         bufp->len = nfsfsloc.addr->len;
1755         bufp->maxlen = nfsfsloc.addr->maxlen;
1756         bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1757         bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1758         esi->esi_addr = bufp;
1759
1760         esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1761         sikncp = esi->esi_knconf;
1762
1763         DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1764             struct nfs_fsl_info *, &nfsfsloc,
1765             char *, "nfs4_trigger_esi_create_referral");
1766
1767         svkncp = nfsfsloc.knconf;
1768         sikncp->knc_semantics = svkncp->knc_semantics;
1769         sikncp->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1770         (void) strlcat((char *)sikncp->knc_protofmly,
1771             (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1772         sikncp->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1773         (void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1774             KNC_STRSIZE);
1775         sikncp->knc_rdev = svkncp->knc_rdev;
1776
1777         DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1778             struct knetconfig *, sikncp,
1779             char *, "nfs4_trigger_esi_create_referral");
1780
1781         esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1782         bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1783         esi->esi_syncaddr = NULL;
1784
1785         esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1786         esi->esi_path_len = MAXPATHLEN;
1787         *p++ = '/';
1788         for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1789                 component4 *comp;
1790
1791                 comp = &fsp->rootpath.pathname4_val[i];
1792                 /* If no space, null the string and bail */
1793                 if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1794                         goto err;
1795                 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1796                 p += comp->utf8string_len;
1797                 *p++ = '/';
1798         }
1799         if (fsp->rootpath.pathname4_len != 0)
1800                 *(p - 1) = '\0';
1801         else
1802                 *p = '\0';
1803         p = esi->esi_path;
1804         esi->esi_path = strdup(p);
1805         esi->esi_path_len = strlen(p) + 1;
1806         kmem_free(p, MAXPATHLEN);
1807
1808         /* Allocated in nfs4_process_referral() */
1809         xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1810         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1811
1812         return (esi);
1813 err:
1814         kmem_free(esi->esi_path, esi->esi_path_len);
1815         kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1816         kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1817         kmem_free(esi->esi_addr, sizeof (struct netbuf));
1818         kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1819         kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1820         kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1821         kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1822         kmem_free(esi, sizeof (ephemeral_servinfo_t));
1823         xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1824         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1825         return (NULL);
1826 }
1827
1828 /*
1829  * Assemble the args, and call the generic VFS mount function to
1830  * finally perform the ephemeral mount.
1831  */
1832 static int
1833 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1834     cred_t *cr, vnode_t **newvpp)
1835 {
1836         struct mounta   *uap;
1837         char            *mntpt, *orig_path, *path;
1838         const char      *orig_mntpt;
1839         int             retval;
1840         int             mntpt_len;
1841         int             spec_len;
1842         zone_t          *zone = curproc->p_zone;
1843         bool_t          has_leading_slash;
1844         int             i;
1845
1846         vfs_t                   *stubvfsp = stubvp->v_vfsp;
1847         ephemeral_servinfo_t    *esi = dma->dma_esi;
1848         struct nfs_args         *nargs = dma->dma_nargs;
1849
1850         /* first, construct the mount point for the ephemeral mount */
1851         orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1852         orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1853
1854         if (*orig_path == '.')
1855                 orig_path++;
1856
1857         /*
1858          * Get rid of zone's root path
1859          */
1860         if (zone != global_zone) {
1861                 /*
1862                  * -1 for trailing '/' and -1 for EOS.
1863                  */
1864                 if (strncmp(zone->zone_rootpath, orig_mntpt,
1865                     zone->zone_rootpathlen - 1) == 0) {
1866                         orig_mntpt += (zone->zone_rootpathlen - 2);
1867                 }
1868         }
1869
1870         mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1871         mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1872         (void) strcat(mntpt, orig_mntpt);
1873         (void) strcat(mntpt, orig_path);
1874
1875         kmem_free(path, strlen(path) + 1);
1876         path = esi->esi_path;
1877         if (*path == '.')
1878                 path++;
1879         if (path[0] == '/' && path[1] == '/')
1880                 path++;
1881         has_leading_slash = (*path == '/');
1882
1883         spec_len = strlen(dma->dma_hostlist);
1884         spec_len += strlen(path);
1885
1886         /* We are going to have to add this in */
1887         if (!has_leading_slash)
1888                 spec_len++;
1889
1890         /* We need to get the ':' for dma_hostlist:esi_path */
1891         spec_len++;
1892
1893         uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1894         uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1895         (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1896             has_leading_slash ? "" : "/", path);
1897
1898         uap->dir = mntpt;
1899
1900         uap->flags = MS_SYSSPACE | MS_DATA;
1901         /* fstype-independent mount options not covered elsewhere */
1902         /* copy parent's mount(1M) "-m" flag */
1903         if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1904                 uap->flags |= MS_NOMNTTAB;
1905
1906         uap->fstype = MNTTYPE_NFS4;
1907         uap->dataptr = (char *)nargs;
1908         /* not needed for MS_SYSSPACE */
1909         uap->datalen = 0;
1910
1911         /* use optptr to pass in extra mount options */
1912         uap->flags |= MS_OPTIONSTR;
1913         uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1914         if (uap->optptr == NULL) {
1915                 retval = EINVAL;
1916                 goto done;
1917         }
1918
1919         /* domount() expects us to count the trailing NUL */
1920         uap->optlen = strlen(uap->optptr) + 1;
1921
1922         /*
1923          * If we get EBUSY, we try again once to see if we can perform
1924          * the mount. We do this because of a spurious race condition.
1925          */
1926         for (i = 0; i < 2; i++) {
1927                 int     error;
1928                 bool_t  was_mounted;
1929
1930                 retval = domount(NULL, uap, stubvp, cr, vfsp);
1931                 if (retval == 0) {
1932                         retval = VFS_ROOT(*vfsp, newvpp);
1933                         VFS_RELE(*vfsp);
1934                         break;
1935                 } else if (retval != EBUSY) {
1936                         break;
1937                 }
1938
1939                 /*
1940                  * We might find it mounted by the other racer...
1941                  */
1942                 error = nfs4_trigger_mounted_already(stubvp,
1943                     newvpp, &was_mounted, vfsp);
1944                 if (error) {
1945                         goto done;
1946                 } else if (was_mounted) {
1947                         retval = 0;
1948                         break;
1949                 }
1950         }
1951
1952 done:
1953         if (uap->optptr)
1954                 nfs4_trigger_destroy_mntopts(uap->optptr);
1955
1956         kmem_free(uap->spec, spec_len + 1);
1957         kmem_free(uap, sizeof (struct mounta));
1958         kmem_free(mntpt, mntpt_len + 1);
1959
1960         return (retval);
1961 }
1962
1963 /*
1964  * Build an nfs_args structure for passing to domount().
1965  *
1966  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1967  * generic data - common to all ephemeral mount types - is read directly
1968  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1969  */
1970 static struct nfs_args *
1971 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1972     ephemeral_servinfo_t *esi)
1973 {
1974         sec_data_t *secdata;
1975         struct nfs_args *nargs;
1976
1977         /* setup the nfs args */
1978         nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1979
1980         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1981
1982         nargs->addr = esi->esi_addr;
1983
1984         /* for AUTH_DH by negotiation */
1985         if (esi->esi_syncaddr || esi->esi_netname) {
1986                 nargs->flags |= NFSMNT_SECURE;
1987                 nargs->syncaddr = esi->esi_syncaddr;
1988                 nargs->netname = esi->esi_netname;
1989         }
1990
1991         nargs->flags |= NFSMNT_KNCONF;
1992         nargs->knconf = esi->esi_knconf;
1993         nargs->flags |= NFSMNT_HOSTNAME;
1994         nargs->hostname = esi->esi_hostname;
1995         nargs->fh = esi->esi_path;
1996
1997         /* general mount settings, all copied from parent mount */
1998         mutex_enter(&mi->mi_lock);
1999
2000         if (!(mi->mi_flags & MI4_HARD))
2001                 nargs->flags |= NFSMNT_SOFT;
2002
2003         nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2004             NFSMNT_RETRANS;
2005         nargs->wsize = mi->mi_stsize;
2006         nargs->rsize = mi->mi_tsize;
2007         nargs->timeo = mi->mi_timeo;
2008         nargs->retrans = mi->mi_retrans;
2009
2010         if (mi->mi_flags & MI4_INT)
2011                 nargs->flags |= NFSMNT_INT;
2012         if (mi->mi_flags & MI4_NOAC)
2013                 nargs->flags |= NFSMNT_NOAC;
2014
2015         nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2016             NFSMNT_ACDIRMAX;
2017         nargs->acregmin = HR2SEC(mi->mi_acregmin);
2018         nargs->acregmax = HR2SEC(mi->mi_acregmax);
2019         nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2020         nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2021
2022         /* add any specific flags for this type of ephemeral mount */
2023         nargs->flags |= esi->esi_mount_flags;
2024
2025         if (mi->mi_flags & MI4_NOCTO)
2026                 nargs->flags |= NFSMNT_NOCTO;
2027         if (mi->mi_flags & MI4_GRPID)
2028                 nargs->flags |= NFSMNT_GRPID;
2029         if (mi->mi_flags & MI4_LLOCK)
2030                 nargs->flags |= NFSMNT_LLOCK;
2031         if (mi->mi_flags & MI4_NOPRINT)
2032                 nargs->flags |= NFSMNT_NOPRINT;
2033         if (mi->mi_flags & MI4_DIRECTIO)
2034                 nargs->flags |= NFSMNT_DIRECTIO;
2035         if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2036                 nargs->flags |= NFSMNT_PUBLIC;
2037
2038         /* Do some referral-specific option tweaking */
2039         if (nargs->flags & NFSMNT_REFERRAL) {
2040                 nargs->flags &= ~NFSMNT_DORDMA;
2041                 nargs->flags |= NFSMNT_TRYRDMA;
2042         }
2043
2044         mutex_exit(&mi->mi_lock);
2045
2046         /*
2047          * Security data & negotiation policy.
2048          *
2049          * For mirror mounts, we need to preserve the parent mount's
2050          * preference for security negotiation, translating SV4_TRYSECDEFAULT
2051          * to NFSMNT_SECDEFAULT if present.
2052          *
2053          * For referrals, we always want security negotiation and will
2054          * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2055          * The reason is that we can't negotiate down from a parent's
2056          * Kerberos flavor to AUTH_SYS.
2057          *
2058          * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2059          * security flavour was requested, with data in sv_secdata, and that
2060          * no negotiation should occur. If this specified flavour fails, that's
2061          * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2062          *
2063          * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2064          * default flavour, in sv_secdata, but then negotiate a new flavour.
2065          * Possible flavours are recorded in an array in sv_secinfo, with
2066          * currently in-use flavour pointed to by sv_currsec.
2067          *
2068          * If sv_currsec is set, i.e. if negotiation has already occurred,
2069          * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2070          * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2071          */
2072         if (nargs->flags & NFSMNT_REFERRAL) {
2073                 /* enable negotiation for referral mount */
2074                 nargs->flags |= NFSMNT_SECDEFAULT;
2075                 secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2076                 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2077                 secdata->data = NULL;
2078         } else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2079                 /* enable negotiation for mirror mount */
2080                 nargs->flags |= NFSMNT_SECDEFAULT;
2081
2082                 /*
2083                  * As a starting point for negotiation, copy parent
2084                  * mount's negotiated flavour (sv_currsec) if available,
2085                  * or its passed-in flavour (sv_secdata) if not.
2086                  */
2087                 if (svp->sv_currsec != NULL)
2088                         secdata = copy_sec_data(svp->sv_currsec);
2089                 else if (svp->sv_secdata != NULL)
2090                         secdata = copy_sec_data(svp->sv_secdata);
2091                 else
2092                         secdata = NULL;
2093         } else {
2094                 /* do not enable negotiation; copy parent's passed-in flavour */
2095                 if (svp->sv_secdata != NULL)
2096                         secdata = copy_sec_data(svp->sv_secdata);
2097                 else
2098                         secdata = NULL;
2099         }
2100
2101         nfs_rw_exit(&svp->sv_lock);
2102
2103         nargs->flags |= NFSMNT_NEWARGS;
2104         nargs->nfs_args_ext = NFS_ARGS_EXTB;
2105         nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2106
2107         /* for NFS RO failover; caller will set if necessary */
2108         nargs->nfs_ext_u.nfs_extB.next = NULL;
2109
2110         return (nargs);
2111 }
2112
2113 static void
2114 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2115 {
2116         /*
2117          * Either the mount failed, in which case the data is not needed, or
2118          * nfs4_mount() has either taken copies of what it needs or,
2119          * where it has merely copied the ptr, it has set *our* ptr to NULL,
2120          * whereby nfs4_free_args() will ignore it.
2121          */
2122         nfs4_free_args(nargs);
2123         kmem_free(nargs, sizeof (struct nfs_args));
2124 }
2125
2126 /*
2127  * When we finally get into the mounting, we need to add this
2128  * node to the ephemeral tree.
2129  *
2130  * This is called from nfs4_mount().
2131  */
2132 int
2133 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2134 {
2135         mntinfo4_t              *mi_parent;
2136         nfs4_ephemeral_t        *eph;
2137         nfs4_ephemeral_tree_t   *net;
2138
2139         nfs4_ephemeral_t        *prior;
2140         nfs4_ephemeral_t        *child;
2141
2142         nfs4_ephemeral_t        *peer;
2143
2144         nfs4_trigger_globals_t  *ntg;
2145         zone_t                  *zone = curproc->p_zone;
2146
2147         int                     rc = 0;
2148
2149         mi_parent = VTOMI4(mvp);
2150
2151         /*
2152          * Get this before grabbing anything else!
2153          */
2154         ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2155         if (!ntg->ntg_thread_started) {
2156                 nfs4_ephemeral_start_harvester(ntg);
2157         }
2158
2159         mutex_enter(&mi_parent->mi_lock);
2160         mutex_enter(&mi->mi_lock);
2161
2162         net = mi->mi_ephemeral_tree =
2163             mi_parent->mi_ephemeral_tree;
2164
2165         /*
2166          * If the mi_ephemeral_tree is NULL, then it
2167          * means that either the harvester or a manual
2168          * umount has cleared the tree out right before
2169          * we got here.
2170          *
2171          * There is nothing we can do here, so return
2172          * to the caller and let them decide whether they
2173          * try again.
2174          */
2175         if (net == NULL) {
2176                 mutex_exit(&mi->mi_lock);
2177                 mutex_exit(&mi_parent->mi_lock);
2178
2179                 return (EBUSY);
2180         }
2181
2182         /*
2183          * We've just tied the mntinfo to the tree, so
2184          * now we bump the refcnt and hold it there until
2185          * this mntinfo is removed from the tree.
2186          */
2187         nfs4_ephemeral_tree_hold(net);
2188
2189         /*
2190          * We need to tack together the ephemeral mount
2191          * with this new mntinfo.
2192          */
2193         eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2194         eph->ne_mount = mi;
2195         MI4_HOLD(mi);
2196         VFS_HOLD(mi->mi_vfsp);
2197         eph->ne_ref_time = gethrestime_sec();
2198
2199         /*
2200          * We need to tell the ephemeral mount when
2201          * to time out.
2202          */
2203         eph->ne_mount_to = ntg->ntg_mount_to;
2204
2205         mi->mi_ephemeral = eph;
2206
2207         /*
2208          * If the enclosing mntinfo4 is also ephemeral,
2209          * then we need to point to its enclosing parent.
2210          * Else the enclosing mntinfo4 is the enclosing parent.
2211          *
2212          * We also need to weave this ephemeral node
2213          * into the tree.
2214          */
2215         if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2216                 /*
2217                  * We need to decide if we are
2218                  * the root node of this branch
2219                  * or if we are a sibling of this
2220                  * branch.
2221                  */
2222                 prior = mi_parent->mi_ephemeral;
2223                 if (prior == NULL) {
2224                         /*
2225                          * Race condition, clean up, and
2226                          * let caller handle mntinfo.
2227                          */
2228                         mi->mi_flags &= ~MI4_EPHEMERAL;
2229                         mi->mi_ephemeral = NULL;
2230                         kmem_free(eph, sizeof (*eph));
2231                         VFS_RELE(mi->mi_vfsp);
2232                         MI4_RELE(mi);
2233                         nfs4_ephemeral_tree_rele(net);
2234                         rc = EBUSY;
2235                 } else {
2236                         if (prior->ne_child == NULL) {
2237                                 prior->ne_child = eph;
2238                         } else {
2239                                 child = prior->ne_child;
2240
2241                                 prior->ne_child = eph;
2242                                 eph->ne_peer = child;
2243
2244                                 child->ne_prior = eph;
2245                         }
2246
2247                         eph->ne_prior = prior;
2248                 }
2249         } else {
2250                 /*
2251                  * The parent mntinfo4 is the non-ephemeral
2252                  * root of the ephemeral tree. We
2253                  * need to decide if we are the root
2254                  * node of that tree or if we are a
2255                  * sibling of the root node.
2256                  *
2257                  * We are the root if there is no
2258                  * other node.
2259                  */
2260                 if (net->net_root == NULL) {
2261                         net->net_root = eph;
2262                 } else {
2263                         eph->ne_peer = peer = net->net_root;
2264                         ASSERT(peer != NULL);
2265                         net->net_root = eph;
2266
2267                         peer->ne_prior = eph;
2268                 }
2269
2270                 eph->ne_prior = NULL;
2271         }
2272
2273         mutex_exit(&mi->mi_lock);
2274         mutex_exit(&mi_parent->mi_lock);
2275
2276         return (rc);
2277 }
2278
2279 /*
2280  * Commit the changes to the ephemeral tree for removing this node.
2281  */
2282 static void
2283 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2284 {
2285         nfs4_ephemeral_t        *e = eph;
2286         nfs4_ephemeral_t        *peer;
2287         nfs4_ephemeral_t        *prior;
2288
2289         peer = eph->ne_peer;
2290         prior = e->ne_prior;
2291
2292         /*
2293          * If this branch root was not the
2294          * tree root, then we need to fix back pointers.
2295          */
2296         if (prior) {
2297                 if (prior->ne_child == e) {
2298                         prior->ne_child = peer;
2299                 } else {
2300                         prior->ne_peer = peer;
2301                 }
2302
2303                 if (peer)
2304                         peer->ne_prior = prior;
2305         } else if (peer) {
2306                 peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2307                 peer->ne_prior = NULL;
2308         } else {
2309                 e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2310         }
2311 }
2312
2313 /*
2314  * We want to avoid recursion at all costs. So we need to
2315  * unroll the tree. We do this by a depth first traversal to
2316  * leaf nodes. We blast away the leaf and work our way back
2317  * up and down the tree.
2318  */
2319 static int
2320 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2321     int isTreeRoot, int flag, cred_t *cr)
2322 {
2323         nfs4_ephemeral_t        *e = eph;
2324         nfs4_ephemeral_t        *prior;
2325         mntinfo4_t              *mi;
2326         vfs_t                   *vfsp;
2327         int                     error;
2328
2329         /*
2330          * We use the loop while unrolling the ephemeral tree.
2331          */
2332         for (;;) {
2333                 /*
2334                  * First we walk down the child.
2335                  */
2336                 if (e->ne_child) {
2337                         prior = e;
2338                         e = e->ne_child;
2339                         continue;
2340                 }
2341
2342                 /*
2343                  * If we are the root of the branch we are removing,
2344                  * we end it here. But if the branch is the root of
2345                  * the tree, we have to forge on. We do not consider
2346                  * the peer list for the root because while it may
2347                  * be okay to remove, it is both extra work and a
2348                  * potential for a false-positive error to stall the
2349                  * unmount attempt.
2350                  */
2351                 if (e == eph && isTreeRoot == FALSE)
2352                         return (0);
2353
2354                 /*
2355                  * Next we walk down the peer list.
2356                  */
2357                 if (e->ne_peer) {
2358                         prior = e;
2359                         e = e->ne_peer;
2360                         continue;
2361                 }
2362
2363                 /*
2364                  * We can only remove the node passed in by the
2365                  * caller if it is the root of the ephemeral tree.
2366                  * Otherwise, the caller will remove it.
2367                  */
2368                 if (e == eph && isTreeRoot == FALSE)
2369                         return (0);
2370
2371                 /*
2372                  * Okay, we have a leaf node, time
2373                  * to prune it!
2374                  *
2375                  * Note that prior can only be NULL if
2376                  * and only if it is the root of the
2377                  * ephemeral tree.
2378                  */
2379                 prior = e->ne_prior;
2380
2381                 mi = e->ne_mount;
2382                 mutex_enter(&mi->mi_lock);
2383                 vfsp = mi->mi_vfsp;
2384                 ASSERT(vfsp != NULL);
2385
2386                 /*
2387                  * Cleared by umount2_engine.
2388                  */
2389                 VFS_HOLD(vfsp);
2390
2391                 /*
2392                  * Inform nfs4_unmount to not recursively
2393                  * descend into this node's children when it
2394                  * gets processed.
2395                  */
2396                 mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2397                 mutex_exit(&mi->mi_lock);
2398
2399                 error = umount2_engine(vfsp, flag, cr, FALSE);
2400                 if (error) {
2401                         /*
2402                          * We need to reenable nfs4_unmount's ability
2403                          * to recursively descend on this node.
2404                          */
2405                         mutex_enter(&mi->mi_lock);
2406                         mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2407                         mutex_exit(&mi->mi_lock);
2408
2409                         return (error);
2410                 }
2411
2412                 /*
2413                  * If we are the current node, we do not want to
2414                  * touch anything else. At this point, the only
2415                  * way the current node can have survived to here
2416                  * is if it is the root of the ephemeral tree and
2417                  * we are unmounting the enclosing mntinfo4.
2418                  */
2419                 if (e == eph) {
2420                         ASSERT(prior == NULL);
2421                         return (0);
2422                 }
2423
2424                 /*
2425                  * Stitch up the prior node. Note that since
2426                  * we have handled the root of the tree, prior
2427                  * must be non-NULL.
2428                  */
2429                 ASSERT(prior != NULL);
2430                 if (prior->ne_child == e) {
2431                         prior->ne_child = NULL;
2432                 } else {
2433                         ASSERT(prior->ne_peer == e);
2434
2435                         prior->ne_peer = NULL;
2436                 }
2437
2438                 e = prior;
2439         }
2440
2441         /* NOTREACHED */
2442 }
2443
2444 /*
2445  * Common code to safely release net_cnt_lock and net_tree_lock
2446  */
2447 void
2448 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2449     nfs4_ephemeral_tree_t **pnet)
2450 {
2451         nfs4_ephemeral_tree_t   *net = *pnet;
2452
2453         if (*pmust_unlock) {
2454                 mutex_enter(&net->net_cnt_lock);
2455                 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2456                 mutex_exit(&net->net_cnt_lock);
2457
2458                 mutex_exit(&net->net_tree_lock);
2459
2460                 *pmust_unlock = FALSE;
2461         }
2462 }
2463
2464 /*
2465  * While we may have removed any child or sibling nodes of this
2466  * ephemeral node, we can not nuke it until we know that there
2467  * were no actived vnodes on it. This will do that final
2468  * work once we know it is not busy.
2469  */
2470 void
2471 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2472     nfs4_ephemeral_tree_t **pnet)
2473 {
2474         /*
2475          * Now we need to get rid of the ephemeral data if it exists.
2476          */
2477         mutex_enter(&mi->mi_lock);
2478         if (mi->mi_ephemeral) {
2479                 /*
2480                  * If we are the root node of an ephemeral branch
2481                  * which is being removed, then we need to fixup
2482                  * pointers into and out of the node.
2483                  */
2484                 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2485                         nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2486
2487                 nfs4_ephemeral_tree_rele(*pnet);
2488                 ASSERT(mi->mi_ephemeral != NULL);
2489
2490                 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2491                 mi->mi_ephemeral = NULL;
2492                 VFS_RELE(mi->mi_vfsp);
2493                 MI4_RELE(mi);
2494         }
2495         mutex_exit(&mi->mi_lock);
2496
2497         nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2498 }
2499
2500 /*
2501  * Unmount an ephemeral node.
2502  *
2503  * Note that if this code fails, then it must unlock.
2504  *
2505  * If it succeeds, then the caller must be prepared to do so.
2506  */
2507 int
2508 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2509     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2510 {
2511         int                     error = 0;
2512         nfs4_ephemeral_t        *eph;
2513         nfs4_ephemeral_tree_t   *net;
2514         int                     is_derooting = FALSE;
2515         int                     is_recursed = FALSE;
2516         int                     was_locked = FALSE;
2517
2518         /*
2519          * Make sure to set the default state for cleaning
2520          * up the tree in the caller (and on the way out).
2521          */
2522         *pmust_unlock = FALSE;
2523
2524         /*
2525          * The active vnodes on this file system may be ephemeral
2526          * children. We need to check for and try to unmount them
2527          * here. If any can not be unmounted, we are going
2528          * to return EBUSY.
2529          */
2530         mutex_enter(&mi->mi_lock);
2531
2532         /*
2533          * If an ephemeral tree, we need to check to see if
2534          * the lock is already held. If it is, then we need
2535          * to see if we are being called as a result of
2536          * the recursive removal of some node of the tree or
2537          * if we are another attempt to remove the tree.
2538          *
2539          * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2540          * node. mi_ephemeral being non-NULL also does this.
2541          *
2542          * mi_ephemeral_tree being non-NULL is sufficient
2543          * to also indicate either it is an ephemeral node
2544          * or the enclosing mntinfo4.
2545          *
2546          * Do we need MI4_EPHEMERAL? Yes, it is useful for
2547          * when we delete the ephemeral node and need to
2548          * differentiate from an ephemeral node and the
2549          * enclosing root node.
2550          */
2551         *pnet = net = mi->mi_ephemeral_tree;
2552         if (net == NULL) {
2553                 mutex_exit(&mi->mi_lock);
2554                 return (0);
2555         }
2556
2557         eph = mi->mi_ephemeral;
2558         is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2559         is_derooting = (eph == NULL);
2560
2561         mutex_enter(&net->net_cnt_lock);
2562
2563         /*
2564          * If this is not recursion, then we need to
2565          * check to see if a harvester thread has
2566          * already grabbed the lock.
2567          *
2568          * After we exit this branch, we may not
2569          * blindly return, we need to jump to
2570          * is_busy!
2571          */
2572         if (!is_recursed) {
2573                 if (net->net_status &
2574                     NFS4_EPHEMERAL_TREE_LOCKED) {
2575                         /*
2576                          * If the tree is locked, we need
2577                          * to decide whether we are the
2578                          * harvester or some explicit call
2579                          * for a umount. The only way that
2580                          * we are the harvester is if
2581                          * MS_SYSSPACE is set.
2582                          *
2583                          * We only let the harvester through
2584                          * at this point.
2585                          *
2586                          * We return EBUSY so that the
2587                          * caller knows something is
2588                          * going on. Note that by that
2589                          * time, the umount in the other
2590                          * thread may have already occured.
2591                          */
2592                         if (!(flag & MS_SYSSPACE)) {
2593                                 mutex_exit(&net->net_cnt_lock);
2594                                 mutex_exit(&mi->mi_lock);
2595
2596                                 return (EBUSY);
2597                         }
2598
2599                         was_locked = TRUE;
2600                 }
2601         }
2602
2603         mutex_exit(&net->net_cnt_lock);
2604         mutex_exit(&mi->mi_lock);
2605
2606         /*
2607          * If we are not the harvester, we need to check
2608          * to see if we need to grab the tree lock.
2609          */
2610         if (was_locked == FALSE) {
2611                 /*
2612                  * If we grab the lock, it means that no other
2613                  * operation is working on the tree. If we don't
2614                  * grab it, we need to decide if this is because
2615                  * we are a recursive call or a new operation.
2616                  */
2617                 if (mutex_tryenter(&net->net_tree_lock)) {
2618                         *pmust_unlock = TRUE;
2619                 } else {
2620                         /*
2621                          * If we are a recursive call, we can
2622                          * proceed without the lock.
2623                          * Otherwise we have to wait until
2624                          * the lock becomes free.
2625                          */
2626                         if (!is_recursed) {
2627                                 mutex_enter(&net->net_cnt_lock);
2628                                 if (net->net_status &
2629                                     (NFS4_EPHEMERAL_TREE_DEROOTING
2630                                     | NFS4_EPHEMERAL_TREE_INVALID)) {
2631                                         mutex_exit(&net->net_cnt_lock);
2632                                         goto is_busy;
2633                                 }
2634                                 mutex_exit(&net->net_cnt_lock);
2635
2636                                 /*
2637                                  * We can't hold any other locks whilst
2638                                  * we wait on this to free up.
2639                                  */
2640                                 mutex_enter(&net->net_tree_lock);
2641
2642                                 /*
2643                                  * Note that while mi->mi_ephemeral
2644                                  * may change and thus we have to
2645                                  * update eph, it is the case that
2646                                  * we have tied down net and
2647                                  * do not care if mi->mi_ephemeral_tree
2648                                  * has changed.
2649                                  */
2650                                 mutex_enter(&mi->mi_lock);
2651                                 eph = mi->mi_ephemeral;
2652                                 mutex_exit(&mi->mi_lock);
2653
2654                                 /*
2655                                  * Okay, we need to see if either the
2656                                  * tree got nuked or the current node
2657                                  * got nuked. Both of which will cause
2658                                  * an error.
2659                                  *
2660                                  * Note that a subsequent retry of the
2661                                  * umount shall work.
2662                                  */
2663                                 mutex_enter(&net->net_cnt_lock);
2664                                 if (net->net_status &
2665                                     NFS4_EPHEMERAL_TREE_INVALID ||
2666                                     (!is_derooting && eph == NULL)) {
2667                                         mutex_exit(&net->net_cnt_lock);
2668                                         mutex_exit(&net->net_tree_lock);
2669                                         goto is_busy;
2670                                 }
2671                                 mutex_exit(&net->net_cnt_lock);
2672                                 *pmust_unlock = TRUE;
2673                         }
2674                 }
2675         }
2676
2677         /*
2678          * Only once we have grabbed the lock can we mark what we
2679          * are planning on doing to the ephemeral tree.
2680          */
2681         if (*pmust_unlock) {
2682                 mutex_enter(&net->net_cnt_lock);
2683                 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2684
2685                 /*
2686                  * Check to see if we are nuking the root.
2687                  */
2688                 if (is_derooting)
2689                         net->net_status |=
2690                             NFS4_EPHEMERAL_TREE_DEROOTING;
2691                 mutex_exit(&net->net_cnt_lock);
2692         }
2693
2694         if (!is_derooting) {
2695                 /*
2696                  * Only work on children if the caller has not already
2697                  * done so.
2698                  */
2699                 if (!is_recursed) {
2700                         ASSERT(eph != NULL);
2701
2702                         error = nfs4_ephemeral_unmount_engine(eph,
2703                             FALSE, flag, cr);
2704                         if (error)
2705                                 goto is_busy;
2706                 }
2707         } else {
2708                 eph = net->net_root;
2709
2710                 /*
2711                  * Only work if there is something there.
2712                  */
2713                 if (eph) {
2714                         error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2715                             flag, cr);
2716                         if (error) {
2717                                 mutex_enter(&net->net_cnt_lock);
2718                                 net->net_status &=
2719                                     ~NFS4_EPHEMERAL_TREE_DEROOTING;
2720                                 mutex_exit(&net->net_cnt_lock);
2721                                 goto is_busy;
2722                         }
2723
2724                         /*
2725                          * Nothing else which goes wrong will
2726                          * invalidate the blowing away of the
2727                          * ephmeral tree.
2728                          */
2729                         net->net_root = NULL;
2730                 }
2731
2732                 /*
2733                  * We have derooted and we have caused the tree to be
2734                  * invalidated.
2735                  */
2736                 mutex_enter(&net->net_cnt_lock);
2737                 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2738                 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2739                 DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2740                     uint_t, net->net_refcnt);
2741
2742                 /*
2743                  * We will not finalize this node, so safe to
2744                  * release it.
2745                  */
2746                 nfs4_ephemeral_tree_decr(net);
2747                 mutex_exit(&net->net_cnt_lock);
2748
2749                 if (was_locked == FALSE)
2750                         mutex_exit(&net->net_tree_lock);
2751
2752                 /*
2753                  * We have just blown away any notation of this
2754                  * tree being locked or having a refcnt.
2755                  * We can't let the caller try to clean things up.
2756                  */
2757                 *pmust_unlock = FALSE;
2758
2759                 /*
2760                  * At this point, the tree should no longer be
2761                  * associated with the mntinfo4. We need to pull
2762                  * it off there and let the harvester take
2763                  * care of it once the refcnt drops.
2764                  */
2765                 mutex_enter(&mi->mi_lock);
2766                 mi->mi_ephemeral_tree = NULL;
2767                 mutex_exit(&mi->mi_lock);
2768         }
2769
2770         return (0);
2771
2772 is_busy:
2773
2774         nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2775
2776         return (error);
2777 }
2778
2779 /*
2780  * Do the umount and record any error in the parent.
2781  */
2782 static void
2783 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2784     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2785 {
2786         int     error;
2787
2788         /*
2789          * Only act on if the fs is still mounted.
2790          */
2791         if (vfsp == NULL)
2792                 return;
2793
2794         error = umount2_engine(vfsp, flag, kcred, FALSE);
2795         if (error) {
2796                 if (prior) {
2797                         if (prior->ne_child == e)
2798                                 prior->ne_state |=
2799                                     NFS4_EPHEMERAL_CHILD_ERROR;
2800                         else
2801                                 prior->ne_state |=
2802                                     NFS4_EPHEMERAL_PEER_ERROR;
2803                 }
2804         }
2805 }
2806
2807 /*
2808  * For each tree in the forest (where the forest is in
2809  * effect all of the ephemeral trees for this zone),
2810  * scan to see if a node can be unmounted. Note that
2811  * unlike nfs4_ephemeral_unmount_engine(), we do
2812  * not process the current node before children or
2813  * siblings. I.e., if a node can be unmounted, we
2814  * do not recursively check to see if the nodes
2815  * hanging off of it can also be unmounted.
2816  *
2817  * Instead, we delve down deep to try and remove the
2818  * children first. Then, because we share code with
2819  * nfs4_ephemeral_unmount_engine(), we will try
2820  * them again. This could be a performance issue in
2821  * the future.
2822  *
2823  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2824  * we do not halt on an error. We will not remove the
2825  * current node, but we will keep on trying to remove
2826  * the others.
2827  *
2828  * force indicates that we want the unmount to occur
2829  * even if there is something blocking it.
2830  *
2831  * time_check indicates that we want to see if the
2832  * mount has expired past mount_to or not. Typically
2833  * we want to do this and only on a shutdown of the
2834  * zone would we want to ignore the check.
2835  */
2836 static void
2837 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2838     bool_t force, bool_t time_check)
2839 {
2840         nfs4_ephemeral_tree_t   *net;
2841         nfs4_ephemeral_tree_t   *prev = NULL;
2842         nfs4_ephemeral_tree_t   *next;
2843         nfs4_ephemeral_t        *e;
2844         nfs4_ephemeral_t        *prior;
2845         time_t                  now = gethrestime_sec();
2846
2847         nfs4_ephemeral_tree_t   *harvest = NULL;
2848
2849         int                     flag;
2850
2851         mntinfo4_t              *mi;
2852         vfs_t                   *vfsp;
2853
2854         if (force)
2855                 flag = MS_FORCE | MS_SYSSPACE;
2856         else
2857                 flag = MS_SYSSPACE;
2858
2859         mutex_enter(&ntg->ntg_forest_lock);
2860         for (net = ntg->ntg_forest; net != NULL; net = next) {
2861                 next = net->net_next;
2862
2863                 nfs4_ephemeral_tree_hold(net);
2864
2865                 mutex_enter(&net->net_tree_lock);
2866
2867                 /*
2868                  * Let the unmount code know that the
2869                  * tree is already locked!
2870                  */
2871                 mutex_enter(&net->net_cnt_lock);
2872                 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2873                 mutex_exit(&net->net_cnt_lock);
2874
2875                 /*
2876                  * If the intent is force all ephemeral nodes to
2877                  * be unmounted in this zone, we can short circuit a
2878                  * lot of tree traversal and simply zap the root node.
2879                  */
2880                 if (force) {
2881                         if (net->net_root) {
2882                                 mi = net->net_root->ne_mount;
2883
2884                                 vfsp = mi->mi_vfsp;
2885                                 ASSERT(vfsp != NULL);
2886
2887                                 /*
2888                                  * Cleared by umount2_engine.
2889                                  */
2890                                 VFS_HOLD(vfsp);
2891
2892                                 (void) umount2_engine(vfsp, flag,
2893                                     kcred, FALSE);
2894
2895                                 goto check_done;
2896                         }
2897                 }
2898
2899                 e = net->net_root;
2900                 if (e)
2901                         e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2902
2903                 while (e) {
2904                         if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2905                                 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2906                                 if (e->ne_child) {
2907                                         e = e->ne_child;
2908                                         e->ne_state =
2909                                             NFS4_EPHEMERAL_VISIT_CHILD;
2910                                 }
2911
2912                                 continue;
2913                         } else if (e->ne_state ==
2914                             NFS4_EPHEMERAL_VISIT_SIBLING) {
2915                                 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2916                                 if (e->ne_peer) {
2917                                         e = e->ne_peer;
2918                                         e->ne_state =
2919                                             NFS4_EPHEMERAL_VISIT_CHILD;
2920                                 }
2921
2922                                 continue;
2923                         } else if (e->ne_state ==
2924                             NFS4_EPHEMERAL_CHILD_ERROR) {
2925                                 prior = e->ne_prior;
2926
2927                                 /*
2928                                  * If a child reported an error, do
2929                                  * not bother trying to unmount.
2930                                  *
2931                                  * If your prior node is a parent,
2932                                  * pass the error up such that they
2933                                  * also do not try to unmount.
2934                                  *
2935                                  * However, if your prior is a sibling,
2936                                  * let them try to unmount if they can.
2937                                  */
2938                                 if (prior) {
2939                                         if (prior->ne_child == e)
2940                                                 prior->ne_state |=
2941                                                     NFS4_EPHEMERAL_CHILD_ERROR;
2942                                         else
2943                                                 prior->ne_state |=
2944                                                     NFS4_EPHEMERAL_PEER_ERROR;
2945                                 }
2946
2947                                 /*
2948                                  * Clear the error and if needed, process peers.
2949                                  *
2950                                  * Once we mask out the error, we know whether
2951                                  * or we have to process another node.
2952                                  */
2953                                 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2954                                 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2955                                         e = prior;
2956
2957                                 continue;
2958                         } else if (e->ne_state ==
2959                             NFS4_EPHEMERAL_PEER_ERROR) {
2960                                 prior = e->ne_prior;
2961
2962                                 if (prior) {
2963                                         if (prior->ne_child == e)
2964                                                 prior->ne_state =
2965                                                     NFS4_EPHEMERAL_CHILD_ERROR;
2966                                         else
2967                                                 prior->ne_state =
2968                                                     NFS4_EPHEMERAL_PEER_ERROR;
2969                                 }
2970
2971                                 /*
2972                                  * Clear the error from this node and do the
2973                                  * correct processing.
2974                                  */
2975                                 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2976                                 continue;
2977                         }
2978
2979                         prior = e->ne_prior;
2980                         e->ne_state = NFS4_EPHEMERAL_OK;
2981
2982                         /*
2983                          * It must be the case that we need to process
2984                          * this node.
2985                          */
2986                         if (!time_check ||
2987                             now - e->ne_ref_time > e->ne_mount_to) {
2988                                 mi = e->ne_mount;
2989                                 vfsp = mi->mi_vfsp;
2990
2991                                 /*
2992                                  * Cleared by umount2_engine.
2993                                  */
2994                                 if (vfsp != NULL)
2995                                         VFS_HOLD(vfsp);
2996
2997                                 /*
2998                                  * Note that we effectively work down to the
2999                                  * leaf nodes first, try to unmount them,
3000                                  * then work our way back up into the leaf
3001                                  * nodes.
3002                                  *
3003                                  * Also note that we deal with a lot of
3004                                  * complexity by sharing the work with
3005                                  * the manual unmount code.
3006                                  */
3007                                 nfs4_ephemeral_record_umount(vfsp, flag,
3008                                     e, prior);
3009                         }
3010
3011                         e = prior;
3012                 }
3013
3014 check_done:
3015
3016                 /*
3017                  * At this point we are done processing this tree.
3018                  *
3019                  * If the tree is invalid and we were the only reference
3020                  * to it, then we push it on the local linked list
3021                  * to remove it at the end. We avoid that action now
3022                  * to keep the tree processing going along at a fair clip.
3023                  *
3024                  * Else, even if we were the only reference, we
3025                  * allow it to be reused as needed.
3026                  */
3027                 mutex_enter(&net->net_cnt_lock);
3028                 nfs4_ephemeral_tree_decr(net);
3029                 if (net->net_refcnt == 0 &&
3030                     net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3031                         net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3032                         mutex_exit(&net->net_cnt_lock);
3033                         mutex_exit(&net->net_tree_lock);
3034
3035                         if (prev)
3036                                 prev->net_next = net->net_next;
3037                         else
3038                                 ntg->ntg_forest = net->net_next;
3039
3040                         net->net_next = harvest;
3041                         harvest = net;
3042
3043                         VFS_RELE(net->net_mount->mi_vfsp);
3044                         MI4_RELE(net->net_mount);
3045
3046                         continue;
3047                 }
3048
3049                 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3050                 mutex_exit(&net->net_cnt_lock);
3051                 mutex_exit(&net->net_tree_lock);
3052
3053                 prev = net;
3054         }
3055         mutex_exit(&ntg->ntg_forest_lock);
3056
3057         for (net = harvest; net != NULL; net = next) {
3058                 next = net->net_next;
3059
3060                 mutex_destroy(&net->net_tree_lock);
3061                 mutex_destroy(&net->net_cnt_lock);
3062                 kmem_free(net, sizeof (*net));
3063         }
3064 }
3065
3066 /*
3067  * This is the thread which decides when the harvesting
3068  * can proceed and when to kill it off for this zone.
3069  */
3070 static void
3071 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3072 {
3073         clock_t         timeleft;
3074         zone_t          *zone = curproc->p_zone;
3075
3076         for (;;) {
3077                 timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3078                     nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3079
3080                 /*
3081                  * zone is exiting...
3082                  */
3083                 if (timeleft != -1) {
3084                         ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3085                         zthread_exit();
3086                         /* NOTREACHED */
3087                 }
3088
3089                 /*
3090                  * Only bother scanning if there is potential
3091                  * work to be done.
3092                  */
3093                 if (ntg->ntg_forest == NULL)
3094                         continue;
3095
3096                 /*
3097                  * Now scan the list and get rid of everything which
3098                  * is old.
3099                  */
3100                 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3101         }
3102
3103         /* NOTREACHED */
3104 }
3105
3106 /*
3107  * The zone specific glue needed to start the unmount harvester.
3108  *
3109  * Note that we want to avoid holding the mutex as long as possible,
3110  * hence the multiple checks.
3111  *
3112  * The caller should avoid us getting down here in the first
3113  * place.
3114  */
3115 static void
3116 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3117 {
3118         /*
3119          * It got started before we got here...
3120          */
3121         if (ntg->ntg_thread_started)
3122                 return;
3123
3124         mutex_enter(&nfs4_ephemeral_thread_lock);
3125
3126         if (ntg->ntg_thread_started) {
3127                 mutex_exit(&nfs4_ephemeral_thread_lock);
3128                 return;
3129         }
3130
3131         /*
3132          * Start the unmounter harvester thread for this zone.
3133          */
3134         (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3135             ntg, 0, minclsyspri);
3136
3137         ntg->ntg_thread_started = TRUE;
3138         mutex_exit(&nfs4_ephemeral_thread_lock);
3139 }
3140
3141 /*ARGSUSED*/
3142 static void *
3143 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3144 {
3145         nfs4_trigger_globals_t  *ntg;
3146
3147         ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3148         ntg->ntg_thread_started = FALSE;
3149
3150         /*
3151          * This is the default....
3152          */
3153         ntg->ntg_mount_to = nfs4_trigger_thread_timer;
3154
3155         mutex_init(&ntg->ntg_forest_lock, NULL,
3156             MUTEX_DEFAULT, NULL);
3157
3158         return (ntg);
3159 }
3160
3161 /*
3162  * Try a nice gentle walk down the forest and convince
3163  * all of the trees to gracefully give it up.
3164  */
3165 /*ARGSUSED*/
3166 static void
3167 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3168 {
3169         nfs4_trigger_globals_t  *ntg = arg;
3170
3171         if (!ntg)
3172                 return;
3173
3174         nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3175 }
3176
3177 /*
3178  * Race along the forest and rip all of the trees out by
3179  * their rootballs!
3180  */
3181 /*ARGSUSED*/
3182 static void
3183 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3184 {
3185         nfs4_trigger_globals_t  *ntg = arg;
3186
3187         if (!ntg)
3188                 return;
3189
3190         nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3191
3192         mutex_destroy(&ntg->ntg_forest_lock);
3193         kmem_free(ntg, sizeof (*ntg));
3194 }
3195
/*
 * This is the zone independent cleanup needed for
 * ephemeral mount processing. It undoes nfs4_ephemeral_init():
 * the zone key is deleted and the harvester-start lock destroyed.
 */
void
nfs4_ephemeral_fini(void)
{
        /* The result of deleting the key is deliberately ignored. */
        (void) zone_key_delete(nfs4_ephemeral_key);
        mutex_destroy(&nfs4_ephemeral_thread_lock);
}
3206
/*
 * This is the zone independent initialization needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_init(void)
{
        /* Serializes harvester start-up; see nfs4_ephemeral_start_harvester(). */
        mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
            NULL);

        /* Register the per-zone create, shutdown and destroy callbacks. */
        zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
            nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}
3220
3221 /*
3222  * nfssys() calls this function to set the per-zone
3223  * value of mount_to to drive when an ephemeral mount is
3224  * timed out. Each mount will grab a copy of this value
3225  * when mounted.
3226  */
3227 void
3228 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3229 {
3230         nfs4_trigger_globals_t  *ntg;
3231         zone_t                  *zone = curproc->p_zone;
3232
3233         ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3234
3235         ntg->ntg_mount_to = mount_to;
3236 }
3237
3238 /*
3239  * Walk the list of v4 mount options; if they are currently set in vfsp,
3240  * append them to a new comma-separated mount option string, and return it.
3241  *
3242  * Caller should free by calling nfs4_trigger_destroy_mntopts().
3243  */
3244 static char *
3245 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3246 {
3247         uint_t i;
3248         char *mntopts;
3249         struct vfssw *vswp;
3250         mntopts_t *optproto;
3251
3252         mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3253
3254         /* get the list of applicable mount options for v4; locks *vswp */
3255         vswp = vfs_getvfssw(MNTTYPE_NFS4);
3256         optproto = &vswp->vsw_optproto;
3257
3258         for (i = 0; i < optproto->mo_count; i++) {
3259                 struct mntopt *mop = &optproto->mo_list[i];
3260
3261                 if (mop->mo_flags & MO_EMPTY)
3262                         continue;
3263
3264                 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3265                         kmem_free(mntopts, MAX_MNTOPT_STR);
3266                         vfs_unrefvfssw(vswp);
3267                         return (NULL);
3268                 }
3269         }
3270
3271         vfs_unrefvfssw(vswp);
3272
3273         /*
3274          * MNTOPT_XATTR is not in the v4 mount opt proto list,
3275          * and it may only be passed via MS_OPTIONSTR, so we
3276          * must handle it here.
3277          *
3278          * Ideally, it would be in the list, but NFS does not specify its
3279          * own opt proto list, it uses instead the default one. Since
3280          * not all filesystems support extended attrs, it would not be
3281          * appropriate to add it there.
3282          */
3283         if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3284             nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3285                 kmem_free(mntopts, MAX_MNTOPT_STR);
3286                 return (NULL);
3287         }
3288
3289         return (mntopts);
3290 }
3291
3292 static void
3293 nfs4_trigger_destroy_mntopts(char *mntopts)
3294 {
3295         if (mntopts)
3296                 kmem_free(mntopts, MAX_MNTOPT_STR);
3297 }
3298
3299 /*
3300  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3301  */
3302 static int
3303 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3304 {
3305         if (mntopts == NULL || optname == NULL || vfsp == NULL)
3306                 return (EINVAL);
3307
3308         if (vfs_optionisset(vfsp, optname, NULL)) {
3309                 size_t mntoptslen = strlen(mntopts);
3310                 size_t optnamelen = strlen(optname);
3311
3312                 /* +1 for ',', +1 for NUL */
3313                 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3314                         return (EOVERFLOW);
3315
3316                 /* first or subsequent mount option? */
3317                 if (*mntopts != '\0')
3318                         (void) strcat(mntopts, ",");
3319
3320                 (void) strcat(mntopts, optname);
3321         }
3322
3323         return (0);
3324 }
3325
3326 static enum clnt_stat
3327 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3328 {
3329         int retries;
3330         uint_t max_msgsize;
3331         enum clnt_stat status;
3332         CLIENT *cl;
3333         struct timeval timeout;
3334
3335         /* as per recov_newserver() */
3336         max_msgsize = 0;
3337         retries = 1;
3338         timeout.tv_sec = 2;
3339         timeout.tv_usec = 0;
3340
3341         if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3342             max_msgsize, retries, CRED(), &cl) != 0)
3343                 return (RPC_FAILED);
3344
3345         if (nointr)
3346                 cl->cl_nosignal = TRUE;
3347         status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3348             timeout);
3349         if (nointr)
3350                 cl->cl_nosignal = FALSE;
3351
3352         AUTH_DESTROY(cl->cl_auth);
3353         CLNT_DESTROY(cl);
3354
3355         return (status);
3356 }
3357
3358 static enum clnt_stat
3359 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3360 {
3361         return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3362 }