Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / kernel / fs / nfs / nfs4_stub_vnops.c
blobd252a6f5fb5d58909c143bfc35c24ec018986ae0
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
28 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29 * triggered from a "stub" rnode via a special set of vnodeops.
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/file.h>
40 #include <sys/filio.h>
41 #include <sys/uio.h>
42 #include <sys/buf.h>
43 #include <sys/mman.h>
44 #include <sys/pathname.h>
45 #include <sys/dirent.h>
46 #include <sys/debug.h>
47 #include <sys/vmsystm.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/swap.h>
51 #include <sys/errno.h>
52 #include <sys/strsubr.h>
53 #include <sys/sysmacros.h>
54 #include <sys/kmem.h>
55 #include <sys/mount.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathconf.h>
58 #include <sys/utsname.h>
59 #include <sys/dnlc.h>
60 #include <sys/acl.h>
61 #include <sys/systeminfo.h>
62 #include <sys/policy.h>
63 #include <sys/sdt.h>
64 #include <sys/list.h>
65 #include <sys/stat.h>
66 #include <sys/mntent.h>
67 #include <sys/priv.h>
69 #include <rpc/types.h>
70 #include <rpc/auth.h>
71 #include <rpc/clnt.h>
73 #include <nfs/nfs.h>
74 #include <nfs/nfs_clnt.h>
75 #include <nfs/nfs_acl.h>
76 #include <nfs/lm.h>
77 #include <nfs/nfs4.h>
78 #include <nfs/nfs4_kprot.h>
79 #include <nfs/rnode4.h>
80 #include <nfs/nfs4_clnt.h>
81 #include <nfs/nfsid_map.h>
82 #include <nfs/nfs4_idmap_impl.h>
84 #include <vm/hat.h>
85 #include <vm/as.h>
86 #include <vm/page.h>
87 #include <vm/pvn.h>
88 #include <vm/seg.h>
89 #include <vm/seg_map.h>
90 #include <vm/seg_kpm.h>
91 #include <vm/seg_vn.h>
93 #include <sys/fs_subr.h>
95 #include <sys/ddi.h>
96 #include <sys/int_fmtio.h>
98 #include <sys/sunddi.h>
100 #include <sys/priv_names.h>
102 extern zone_key_t nfs4clnt_zone_key;
103 extern zone_key_t nfsidmap_zone_key;
106 * The automatic unmounter thread stuff!
108 static int nfs4_trigger_thread_timer = 20; /* in seconds */
111 * Just a default....
113 static uint_t nfs4_trigger_mount_to = 240;
115 typedef struct nfs4_trigger_globals {
116 kmutex_t ntg_forest_lock;
117 uint_t ntg_mount_to;
118 int ntg_thread_started;
119 nfs4_ephemeral_tree_t *ntg_forest;
120 } nfs4_trigger_globals_t;
122 kmutex_t nfs4_ephemeral_thread_lock;
124 zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
126 static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
/*
 * Data describing the server side of an ephemeral mount.  Depending on
 * the type of ephemeral mount, the contents are either duplicated from a
 * servinfo4_t or hand-crafted.
 *
 * The structure is meant to carry only the ephemeral mount-type specific
 * data, so that it can be handed to nfs4_trigger_nargs_create().
 */
typedef struct ephemeral_servinfo {
	char			*esi_hostname;	/* server host name */
	char			*esi_netname;	/* netname for AUTH_DH */
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags; /* NFSMNT_* flags */
	struct netbuf		*esi_addr;	/* server address */
	struct netbuf		*esi_syncaddr;	/* AUTH_DH time-sync address */
	struct knetconfig	*esi_knconf;	/* transport configuration */
} ephemeral_servinfo_t;
148 * Collect together the mount-type specific and generic data args.
150 typedef struct domount_args {
151 ephemeral_servinfo_t *dma_esi;
152 char *dma_hostlist; /* comma-sep. for RO failover */
153 struct nfs_args *dma_nargs;
154 } domount_args_t;
158 * The vnode ops functions for a trigger stub vnode
160 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
161 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
162 caller_context_t *);
163 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
164 caller_context_t *);
165 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
166 caller_context_t *);
167 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
168 caller_context_t *);
169 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
170 struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
171 int *, pathname_t *);
172 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
173 enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
174 vsecattr_t *);
175 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
176 int);
177 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
178 caller_context_t *, int);
179 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
180 cred_t *, caller_context_t *, int);
181 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
182 vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
183 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
184 caller_context_t *, int);
185 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
186 cred_t *, caller_context_t *, int);
187 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
190 * Regular NFSv4 vnodeops that we need to reference directly
192 extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
193 caller_context_t *);
194 extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
195 extern int nfs4_rwlock(vnode_t *, int, caller_context_t *);
196 extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
197 extern int nfs4_lookup(vnode_t *, char *, vnode_t **,
198 struct pathname *, int, vnode_t *, cred_t *,
199 caller_context_t *, int *, pathname_t *);
200 extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
201 caller_context_t *);
202 extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
203 caller_context_t *);
204 extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
205 extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207 static int nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
208 static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
209 cred_t *, vnode_t **);
210 static int nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
211 domount_args_t **dmap);
212 static void nfs4_trigger_domount_args_destroy(domount_args_t *dma,
213 vnode_t *vp);
214 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
215 cred_t *);
216 static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
217 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
218 servinfo4_t *);
219 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
220 cred_t *);
221 static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
222 ephemeral_servinfo_t *);
223 static void nfs4_trigger_nargs_destroy(struct nfs_args *);
224 static char *nfs4_trigger_create_mntopts(vfs_t *);
225 static void nfs4_trigger_destroy_mntopts(char *);
226 static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
227 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
228 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
229 struct netbuf *, int);
231 extern int umount2_engine(vfs_t *, int, cred_t *, int);
234 * These are the vnodeops that we must define for stub vnodes.
237 * Many of the VOPs defined for NFSv4 do not need to be defined here,
238 * for various reasons. This will result in the VFS default function being
239 * used:
241 * - These VOPs require a previous fop_open to have occurred. That will have
242 * lost the reference to the stub vnode, meaning these should not be called:
243 * close, read, write, ioctl, readdir, seek.
245 * - These VOPs are meaningless for vnodes without data pages. Since the
246 * stub vnode is of type VDIR, these should not be called:
247 * space, getpage, putpage, map, addmap, delmap, pageio, fsync.
249 * - These VOPs are otherwise not applicable, and should not be called:
250 * dump, setsecattr.
253 * These VOPs we do not want to define, but nor do we want the VFS default
254 * action. Instead, we specify an error function.
256 * - frlock, dispose, shrlock.
259 * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
260 * NOTE: if any of these ops involve an OTW call with the stub FH, then
261 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
262 * to protect the security data in the servinfo4_t for the "parent"
263 * filesystem that contains the stub.
265 * - These VOPs should not trigger a mount, so that "ls -l" does not:
266 * pathconf, getsecattr.
268 * - These VOPs would not make sense to trigger:
269 * inactive, rwlock, rwunlock, fid, realvp.
271 const struct vnodeops nfs4_trigger_vnodeops = {
272 .vnop_name = "nfs4_trigger",
273 .vop_open = nfs4_trigger_open,
274 .vop_getattr = nfs4_trigger_getattr,
275 .vop_setattr = nfs4_trigger_setattr,
276 .vop_access = nfs4_trigger_access,
277 .vop_lookup = nfs4_trigger_lookup,
278 .vop_create = nfs4_trigger_create,
279 .vop_remove = nfs4_trigger_remove,
280 .vop_link = nfs4_trigger_link,
281 .vop_rename = nfs4_trigger_rename,
282 .vop_mkdir = nfs4_trigger_mkdir,
283 .vop_rmdir = nfs4_trigger_rmdir,
284 .vop_symlink = nfs4_trigger_symlink,
285 .vop_readlink = nfs4_trigger_readlink,
286 .vop_inactive = nfs4_inactive,
287 .vop_fid = nfs4_fid,
288 .vop_rwlock = nfs4_rwlock,
289 .vop_rwunlock = nfs4_rwunlock,
290 .vop_realvp = nfs4_realvp,
291 .vop_getsecattr = nfs4_getsecattr,
292 .vop_pathconf = nfs4_pathconf,
293 .vop_frlock = fs_nosys,
294 .vop_dispose = fs_nodispose,
295 .vop_shrlock = fs_nosys,
296 .vop_vnevent = fs_vnevent_support,
299 static void
300 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
302 ASSERT(mutex_owned(&net->net_cnt_lock));
303 net->net_refcnt++;
304 ASSERT(net->net_refcnt != 0);
307 static void
308 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
310 mutex_enter(&net->net_cnt_lock);
311 nfs4_ephemeral_tree_incr(net);
312 mutex_exit(&net->net_cnt_lock);
316 * We need a safe way to decrement the refcnt whilst the
317 * lock is being held.
319 static void
320 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
322 ASSERT(mutex_owned(&net->net_cnt_lock));
323 ASSERT(net->net_refcnt != 0);
324 net->net_refcnt--;
327 static void
328 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
330 mutex_enter(&net->net_cnt_lock);
331 nfs4_ephemeral_tree_decr(net);
332 mutex_exit(&net->net_cnt_lock);
336 * Trigger ops for stub vnodes; for mirror mounts, etc.
338 * The general idea is that a "triggering" op will first call
339 * nfs4_trigger_mount(), which will find out whether a mount has already
340 * been triggered.
342 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
343 * of the covering vfs.
345 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
346 * and again set newvp, as above.
348 * The triggering op may then re-issue the VOP by calling it on newvp.
350 * Note that some ops may perform custom action, and may or may not need
351 * to trigger a mount.
353 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
354 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
355 * and that would just recurse. Instead, we call the v4 op directly,
356 * by name. This is OK, since we know that the vnode is for NFSv4,
357 * otherwise it couldn't be a stub.
361 static int
362 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
364 int error;
365 vnode_t *newvp;
367 error = nfs4_trigger_mount(*vpp, cr, &newvp);
368 if (error)
369 return (error);
371 /* Release the stub vnode, as we're losing the reference to it */
372 VN_RELE(*vpp);
374 /* Give the caller the root vnode of the newly-mounted fs */
375 *vpp = newvp;
377 /* return with VN_HELD(newvp) */
378 return (fop_open(vpp, flag, cr, ct));
381 void
382 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
384 uint_t mask;
385 timespec_t now;
388 * Set some attributes here for referrals.
390 mask = vap->va_mask;
391 bzero(vap, sizeof (struct vattr));
392 vap->va_mask = mask;
393 vap->va_uid = 0;
394 vap->va_gid = 0;
395 vap->va_nlink = 1;
396 vap->va_size = 1;
397 gethrestime(&now);
398 vap->va_atime = now;
399 vap->va_mtime = now;
400 vap->va_ctime = now;
401 vap->va_type = VDIR;
402 vap->va_mode = 0555;
403 vap->va_fsid = vp->v_vfsp->vfs_dev;
404 vap->va_rdev = 0;
405 vap->va_blksize = MAXBSIZE;
406 vap->va_nblocks = 1;
407 vap->va_seq = 0;
411 * For the majority of cases, nfs4_trigger_getattr() will not trigger
412 * a mount. However, if ATTR_TRIGGER is set, we are being informed
413 * that we need to force the mount before we attempt to determine
414 * the attributes. The intent is an atomic operation for security
415 * testing.
417 * If we're not triggering a mount, we can still inquire about the
418 * actual attributes from the server in the mirror mount case,
419 * and will return manufactured attributes for a referral (see
420 * the 'create' branch of find_referral_stubvp()).
422 static int
423 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
424 caller_context_t *ct)
426 int error;
428 if (flags & ATTR_TRIGGER) {
429 vnode_t *newvp;
431 error = nfs4_trigger_mount(vp, cr, &newvp);
432 if (error)
433 return (error);
435 error = fop_getattr(newvp, vap, flags, cr, ct);
436 VN_RELE(newvp);
438 } else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
440 error = nfs4_getattr(vp, vap, flags, cr, ct);
442 } else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
444 nfs4_fake_attrs(vp, vap);
445 error = 0;
448 return (error);
451 static int
452 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
453 caller_context_t *ct)
455 int error;
456 vnode_t *newvp;
458 error = nfs4_trigger_mount(vp, cr, &newvp);
459 if (error)
460 return (error);
462 error = fop_setattr(newvp, vap, flags, cr, ct);
463 VN_RELE(newvp);
465 return (error);
468 static int
469 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
470 caller_context_t *ct)
472 int error;
473 vnode_t *newvp;
475 error = nfs4_trigger_mount(vp, cr, &newvp);
476 if (error)
477 return (error);
479 error = fop_access(newvp, mode, flags, cr, ct);
480 VN_RELE(newvp);
482 return (error);
485 static int
486 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
487 struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
488 caller_context_t *ct, int *deflags, pathname_t *rpnp)
490 int error;
491 vnode_t *newdvp;
492 rnode4_t *drp = VTOR4(dvp);
494 ASSERT(RP_ISSTUB(drp));
497 * It's not legal to lookup ".." for an fs root, so we mustn't pass
498 * that up. Instead, pass onto the regular op, regardless of whether
499 * we've triggered a mount.
501 if (strcmp(nm, "..") == 0)
502 if (RP_ISSTUB_MIRRORMOUNT(drp)) {
503 return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
504 ct, deflags, rpnp));
505 } else if (RP_ISSTUB_REFERRAL(drp)) {
506 /* Return the parent vnode */
507 return (vtodv(dvp, vpp, cr, TRUE));
510 error = nfs4_trigger_mount(dvp, cr, &newdvp);
511 if (error)
512 return (error);
514 error = fop_lookup(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
515 deflags, rpnp);
516 VN_RELE(newdvp);
518 return (error);
521 static int
522 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
523 enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
524 int flags, caller_context_t *ct, vsecattr_t *vsecp)
526 int error;
527 vnode_t *newdvp;
529 error = nfs4_trigger_mount(dvp, cr, &newdvp);
530 if (error)
531 return (error);
533 error = fop_create(newdvp, nm, va, exclusive, mode, vpp, cr,
534 flags, ct, vsecp);
535 VN_RELE(newdvp);
537 return (error);
540 static int
541 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
542 int flags)
544 int error;
545 vnode_t *newdvp;
547 error = nfs4_trigger_mount(dvp, cr, &newdvp);
548 if (error)
549 return (error);
551 error = fop_remove(newdvp, nm, cr, ct, flags);
552 VN_RELE(newdvp);
554 return (error);
557 static int
558 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
559 caller_context_t *ct, int flags)
561 int error;
562 vnode_t *newtdvp;
564 error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
565 if (error)
566 return (error);
569 * We don't check whether svp is a stub. Let the NFSv4 code
570 * detect that error, and return accordingly.
572 error = fop_link(newtdvp, svp, tnm, cr, ct, flags);
573 VN_RELE(newtdvp);
575 return (error);
578 static int
579 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
580 cred_t *cr, caller_context_t *ct, int flags)
582 int error;
583 vnode_t *newsdvp;
584 rnode4_t *tdrp = VTOR4(tdvp);
587 * We know that sdvp is a stub, otherwise we would not be here.
589 * If tdvp is also be a stub, there are two possibilities: it
590 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
591 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
593 * In the former case, just trigger sdvp, and treat tdvp as
594 * though it were not a stub.
596 * In the latter case, it might be a different stub for the
597 * same server fs as sdvp, or for a different server fs.
598 * Regardless, from the client perspective this would still
599 * be a cross-filesystem rename, and should not be allowed,
600 * so return EXDEV, without triggering either mount.
602 if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
603 return (EXDEV);
605 error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
606 if (error)
607 return (error);
609 error = fop_rename(newsdvp, snm, tdvp, tnm, cr, ct, flags);
611 VN_RELE(newsdvp);
613 return (error);
616 /* ARGSUSED */
617 static int
618 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
619 cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
621 int error;
622 vnode_t *newdvp;
624 error = nfs4_trigger_mount(dvp, cr, &newdvp);
625 if (error)
626 return (error);
628 error = fop_mkdir(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
629 VN_RELE(newdvp);
631 return (error);
634 static int
635 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
636 caller_context_t *ct, int flags)
638 int error;
639 vnode_t *newdvp;
641 error = nfs4_trigger_mount(dvp, cr, &newdvp);
642 if (error)
643 return (error);
645 error = fop_rmdir(newdvp, nm, cdir, cr, ct, flags);
646 VN_RELE(newdvp);
648 return (error);
651 static int
652 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
653 cred_t *cr, caller_context_t *ct, int flags)
655 int error;
656 vnode_t *newdvp;
658 error = nfs4_trigger_mount(dvp, cr, &newdvp);
659 if (error)
660 return (error);
662 error = fop_symlink(newdvp, lnm, tva, tnm, cr, ct, flags);
663 VN_RELE(newdvp);
665 return (error);
668 static int
669 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
670 caller_context_t *ct)
672 int error;
673 vnode_t *newvp;
675 error = nfs4_trigger_mount(vp, cr, &newvp);
676 if (error)
677 return (error);
679 error = fop_readlink(newvp, uiop, cr, ct);
680 VN_RELE(newvp);
682 return (error);
685 /* end of trigger vnode ops */
688 * See if the mount has already been done by another caller.
690 static int
691 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
692 bool_t *was_mounted, vfs_t **vfsp)
694 int error;
695 mntinfo4_t *mi = VTOMI4(vp);
697 *was_mounted = FALSE;
699 error = vn_vfsrlock_wait(vp);
700 if (error)
701 return (error);
703 *vfsp = vn_mountedvfs(vp);
704 if (*vfsp != NULL) {
705 /* the mount has already occurred */
706 error = VFS_ROOT(*vfsp, newvpp);
707 if (!error) {
708 /* need to update the reference time */
709 mutex_enter(&mi->mi_lock);
710 if (mi->mi_ephemeral)
711 mi->mi_ephemeral->ne_ref_time =
712 gethrestime_sec();
713 mutex_exit(&mi->mi_lock);
715 *was_mounted = TRUE;
719 vn_vfsunlock(vp);
720 return (0);
724 * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
726 * The mount may have already occurred, via another thread. If not,
727 * assemble the location information - which may require fetching - and
728 * perform the mount.
730 * Sets newvp to be the root of the fs that is now covering vp. Note
731 * that we return with VN_HELD(*newvp).
733 * The caller is responsible for passing the VOP onto the covering fs.
735 static int
736 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
738 int error;
739 vfs_t *vfsp;
740 rnode4_t *rp = VTOR4(vp);
741 mntinfo4_t *mi = VTOMI4(vp);
742 domount_args_t *dma;
744 nfs4_ephemeral_tree_t *net;
746 bool_t must_unlock = FALSE;
747 bool_t is_building = FALSE;
748 bool_t was_mounted = FALSE;
750 cred_t *mcred = NULL;
752 nfs4_trigger_globals_t *ntg;
754 zone_t *zone = curproc->p_zone;
756 ASSERT(RP_ISSTUB(rp));
758 *newvpp = NULL;
761 * Has the mount already occurred?
763 error = nfs4_trigger_mounted_already(vp, newvpp,
764 &was_mounted, &vfsp);
765 if (error || was_mounted)
766 goto done;
768 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
769 ASSERT(ntg != NULL);
771 mutex_enter(&mi->mi_lock);
774 * We need to lock down the ephemeral tree.
776 if (mi->mi_ephemeral_tree == NULL) {
777 net = kmem_zalloc(sizeof (*net), KM_SLEEP);
778 mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
779 mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
780 net->net_refcnt = 1;
781 net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
782 is_building = TRUE;
785 * We need to add it to the zone specific list for
786 * automatic unmounting and harvesting of deadwood.
788 mutex_enter(&ntg->ntg_forest_lock);
789 if (ntg->ntg_forest != NULL)
790 net->net_next = ntg->ntg_forest;
791 ntg->ntg_forest = net;
792 mutex_exit(&ntg->ntg_forest_lock);
795 * No lock order confusion with mi_lock because no
796 * other node could have grabbed net_tree_lock.
798 mutex_enter(&net->net_tree_lock);
799 mi->mi_ephemeral_tree = net;
800 net->net_mount = mi;
801 mutex_exit(&mi->mi_lock);
803 MI4_HOLD(mi);
804 VFS_HOLD(mi->mi_vfsp);
805 } else {
806 net = mi->mi_ephemeral_tree;
807 nfs4_ephemeral_tree_hold(net);
809 mutex_exit(&mi->mi_lock);
811 mutex_enter(&net->net_tree_lock);
814 * We can only procede if the tree is neither locked
815 * nor being torn down.
817 mutex_enter(&net->net_cnt_lock);
818 if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
819 nfs4_ephemeral_tree_decr(net);
820 mutex_exit(&net->net_cnt_lock);
821 mutex_exit(&net->net_tree_lock);
823 return (EIO);
825 mutex_exit(&net->net_cnt_lock);
828 mutex_enter(&net->net_cnt_lock);
829 net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
830 mutex_exit(&net->net_cnt_lock);
832 must_unlock = TRUE;
834 error = nfs4_trigger_domount_args_create(vp, cr, &dma);
835 if (error)
836 goto done;
839 * Note that since we define mirror mounts to work
840 * for any user, we simply extend the privileges of
841 * the user's credentials to allow the mount to
842 * proceed.
844 mcred = crdup(cr);
845 if (mcred == NULL) {
846 error = EINVAL;
847 nfs4_trigger_domount_args_destroy(dma, vp);
848 goto done;
851 crset_zone_privall(mcred);
853 error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
854 nfs4_trigger_domount_args_destroy(dma, vp);
856 DTRACE_PROBE2(nfs4clnt__func__referral__mount,
857 vnode_t *, vp, int, error);
859 crfree(mcred);
861 done:
863 if (must_unlock) {
864 mutex_enter(&net->net_cnt_lock);
865 net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
868 * REFCNT: If we are the root of the tree, then we need
869 * to keep a reference because we malloced the tree and
870 * this is where we tied it to our mntinfo.
872 * If we are not the root of the tree, then our tie to
873 * the mntinfo occured elsewhere and we need to
874 * decrement the reference to the tree.
876 if (is_building)
877 net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
878 else
879 nfs4_ephemeral_tree_decr(net);
880 mutex_exit(&net->net_cnt_lock);
882 mutex_exit(&net->net_tree_lock);
885 if (!error && (newvpp == NULL || *newvpp == NULL))
886 error = ENOSYS;
888 return (error);
892 * Collect together both the generic & mount-type specific args.
894 static int
895 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
897 int nointr;
898 char *hostlist;
899 servinfo4_t *svp;
900 struct nfs_args *nargs, *nargs_head;
901 enum clnt_stat status;
902 ephemeral_servinfo_t *esi, *esi_first;
903 domount_args_t *dma;
904 mntinfo4_t *mi = VTOMI4(vp);
906 nointr = !(mi->mi_flags & MI4_INT);
907 hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
909 svp = mi->mi_curr_serv;
910 /* check if the current server is responding */
911 status = nfs4_trigger_ping_server(svp, nointr);
912 if (status == RPC_SUCCESS) {
913 esi_first = nfs4_trigger_esi_create(vp, svp, cr);
914 if (esi_first == NULL) {
915 kmem_free(hostlist, MAXPATHLEN);
916 return (EINVAL);
919 (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
921 nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
922 } else {
923 /* current server did not respond */
924 esi_first = NULL;
925 nargs_head = NULL;
927 nargs = nargs_head;
930 * NFS RO failover.
932 * If we have multiple servinfo4 structures, linked via sv_next,
933 * we must create one nfs_args for each, linking the nfs_args via
934 * nfs_ext_u.nfs_extB.next.
936 * We need to build a corresponding esi for each, too, but that is
937 * used solely for building nfs_args, and may be immediately
938 * discarded, as domount() requires the info from just one esi,
939 * but all the nfs_args.
941 * Currently, the NFS mount code will hang if not all servers
942 * requested are available. To avoid that, we need to ping each
943 * server, here, and remove it from the list if it is not
944 * responding. This has the side-effect of that server then
945 * being permanently unavailable for this failover mount, even if
946 * it recovers. That's unfortunate, but the best we can do until
947 * the mount code path is fixed.
951 * If the current server was down, loop indefinitely until we find
952 * at least one responsive server.
954 do {
955 /* no locking needed for sv_next; it is only set at fs mount */
956 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
957 struct nfs_args *next;
960 * nargs_head: the head of the nfs_args list
961 * nargs: the current tail of the list
962 * next: the newly-created element to be added
966 * We've already tried the current server, above;
967 * if it was responding, we have already included it
968 * and it may now be ignored.
970 * Otherwise, try it again, since it may now have
971 * recovered.
973 if (svp == mi->mi_curr_serv && esi_first != NULL)
974 continue;
976 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
977 if (svp->sv_flags & SV4_NOTINUSE) {
978 nfs_rw_exit(&svp->sv_lock);
979 continue;
981 nfs_rw_exit(&svp->sv_lock);
983 /* check if the server is responding */
984 status = nfs4_trigger_ping_server(svp, nointr);
985 if (status == RPC_INTR) {
986 kmem_free(hostlist, MAXPATHLEN);
987 nfs4_trigger_esi_destroy(esi_first, vp);
988 nargs = nargs_head;
989 while (nargs != NULL) {
990 next = nargs->nfs_ext_u.nfs_extB.next;
991 nfs4_trigger_nargs_destroy(nargs);
992 nargs = next;
994 return (EINTR);
995 } else if (status != RPC_SUCCESS) {
996 /* if the server did not respond, ignore it */
997 continue;
1000 esi = nfs4_trigger_esi_create(vp, svp, cr);
1001 if (esi == NULL)
1002 continue;
1005 * If the original current server (mi_curr_serv)
1006 * was down when when we first tried it,
1007 * (i.e. esi_first == NULL),
1008 * we select this new server (svp) to be the server
1009 * that we will actually contact (esi_first).
1011 * Note that it's possible that mi_curr_serv == svp,
1012 * if that mi_curr_serv was down but has now recovered.
1014 next = nfs4_trigger_nargs_create(mi, svp, esi);
1015 if (esi_first == NULL) {
1016 ASSERT(nargs == NULL);
1017 ASSERT(nargs_head == NULL);
1018 nargs_head = next;
1019 esi_first = esi;
1020 (void) strlcpy(hostlist,
1021 esi_first->esi_hostname, MAXPATHLEN);
1022 } else {
1023 ASSERT(nargs_head != NULL);
1024 nargs->nfs_ext_u.nfs_extB.next = next;
1025 (void) strlcat(hostlist, ",", MAXPATHLEN);
1026 (void) strlcat(hostlist, esi->esi_hostname,
1027 MAXPATHLEN);
1028 /* esi was only needed for hostname & nargs */
1029 nfs4_trigger_esi_destroy(esi, vp);
1032 nargs = next;
1035 /* if we've had no response at all, wait a second */
1036 if (esi_first == NULL)
1037 ddi_sleep(1);
1039 } while (esi_first == NULL);
1040 ASSERT(nargs_head != NULL);
1042 dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1043 dma->dma_esi = esi_first;
1044 dma->dma_hostlist = hostlist;
1045 dma->dma_nargs = nargs_head;
1046 *dmap = dma;
1048 return (0);
1051 static void
1052 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1054 if (dma != NULL) {
1055 if (dma->dma_esi != NULL && vp != NULL)
1056 nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1058 if (dma->dma_hostlist != NULL)
1059 kmem_free(dma->dma_hostlist, MAXPATHLEN);
1061 if (dma->dma_nargs != NULL) {
1062 struct nfs_args *nargs = dma->dma_nargs;
1064 do {
1065 struct nfs_args *next =
1066 nargs->nfs_ext_u.nfs_extB.next;
1068 nfs4_trigger_nargs_destroy(nargs);
1069 nargs = next;
1070 } while (nargs != NULL);
1073 kmem_free(dma, sizeof (domount_args_t));
1078 * The ephemeral_servinfo_t struct contains basic information we will need to
1079 * perform the mount. Whilst the structure is generic across different
1080 * types of ephemeral mount, the way we gather its contents differs.
1082 static ephemeral_servinfo_t *
1083 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1085 ephemeral_servinfo_t *esi;
1086 rnode4_t *rp = VTOR4(vp);
1088 ASSERT(RP_ISSTUB(rp));
1090 /* Call the ephemeral type-specific routine */
1091 if (RP_ISSTUB_MIRRORMOUNT(rp))
1092 esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1093 else if (RP_ISSTUB_REFERRAL(rp))
1094 esi = nfs4_trigger_esi_create_referral(vp, cr);
1095 else
1096 esi = NULL;
1097 return (esi);
1100 static void
1101 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1103 rnode4_t *rp = VTOR4(vp);
1105 ASSERT(RP_ISSTUB(rp));
1107 /* Currently, no need for an ephemeral type-specific routine */
1110 * The contents of ephemeral_servinfo_t goes into nfs_args,
1111 * and will be handled by nfs4_trigger_nargs_destroy().
1112 * We need only free the structure itself.
1114 if (esi != NULL)
1115 kmem_free(esi, sizeof (ephemeral_servinfo_t));
1119 * Some of this may turn out to be common with other ephemeral types,
1120 * in which case it should be moved to nfs4_trigger_esi_create(), or a
1121 * common function called.
1125 * Mirror mounts case - should have all data available
1127 static ephemeral_servinfo_t *
1128 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1130 char *stubpath;
1131 struct knetconfig *sikncp, *svkncp;
1132 struct netbuf *bufp;
1133 ephemeral_servinfo_t *esi;
1135 esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1137 /* initially set to be our type of ephemeral mount; may be added to */
1138 esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1141 * We're copying info from the stub rnode's servinfo4, but
1142 * we must create new copies, not pointers, since this information
1143 * is to be associated with the new mount, which will be
1144 * unmounted (and its structures freed) separately
1148 * Sizes passed to kmem_[z]alloc here must match those freed
1149 * in nfs4_free_args()
1153 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1154 * is difficult to avoid: as we need to read svp to calculate the
1155 * sizes to be allocated.
1157 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1159 esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1160 (void) strcat(esi->esi_hostname, svp->sv_hostname);
1162 esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1163 bufp = esi->esi_addr;
1164 bufp->len = svp->sv_addr.len;
1165 bufp->maxlen = svp->sv_addr.maxlen;
1166 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1167 bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1169 esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1170 sikncp = esi->esi_knconf;
1171 svkncp = svp->sv_knconf;
1172 sikncp->knc_semantics = svkncp->knc_semantics;
1173 sikncp->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1174 (void) strcat((char *)sikncp->knc_protofmly,
1175 (char *)svkncp->knc_protofmly);
1176 sikncp->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1177 (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1178 sikncp->knc_rdev = svkncp->knc_rdev;
1181 * Used when AUTH_DH is negotiated.
1183 * This is ephemeral mount-type specific, since it contains the
1184 * server's time-sync syncaddr.
1186 if (svp->sv_dhsec) {
1187 struct netbuf *bufp;
1188 sec_data_t *sdata;
1189 dh_k4_clntdata_t *data;
1191 sdata = svp->sv_dhsec;
1192 data = (dh_k4_clntdata_t *)sdata->data;
1193 ASSERT(sdata->rpcflavor == AUTH_DH);
1195 bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1196 bufp->len = data->syncaddr.len;
1197 bufp->maxlen = data->syncaddr.maxlen;
1198 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1199 bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1200 esi->esi_syncaddr = bufp;
1202 if (data->netname != NULL) {
1203 int nmlen = data->netnamelen;
1206 * We need to copy from a dh_k4_clntdata_t
1207 * netname/netnamelen pair to a NUL-terminated
1208 * netname string suitable for putting in nfs_args,
1209 * where the latter has no netnamelen field.
1211 esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1212 bcopy(data->netname, esi->esi_netname, nmlen);
1214 } else {
1215 esi->esi_syncaddr = NULL;
1216 esi->esi_netname = NULL;
1219 stubpath = fn_path(VTOSV(vp)->sv_name);
1220 /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1221 ASSERT(*stubpath == '.');
1222 stubpath += 1;
1224 /* for nfs_args->fh */
1225 esi->esi_path_len = strlen(stubpath) + 1;
1226 if (strcmp(svp->sv_path, "/") != 0)
1227 esi->esi_path_len += strlen(svp->sv_path);
1228 esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1229 if (strcmp(svp->sv_path, "/") != 0)
1230 (void) strcat(esi->esi_path, svp->sv_path);
1231 (void) strcat(esi->esi_path, stubpath);
1233 stubpath -= 1;
1234 /* stubpath allocated by fn_path() */
1235 kmem_free(stubpath, strlen(stubpath) + 1);
1237 nfs_rw_exit(&svp->sv_lock);
1239 return (esi);
1243 * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1244 * get network information required to do the mount call.
1247 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1249 door_arg_t door_args;
1250 door_handle_t dh;
1251 XDR xdr;
1252 refd_door_args_t *xdr_argsp;
1253 refd_door_res_t *orig_resp;
1254 k_sigset_t smask;
1255 int xdr_len = 0;
1256 int res_len = 16; /* length of an ip adress */
1257 int orig_reslen = res_len;
1258 int error = 0;
1259 struct nfsidmap_globals *nig;
1261 if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1262 return (ECONNREFUSED);
1264 nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1265 ASSERT(nig != NULL);
1267 mutex_enter(&nig->nfsidmap_daemon_lock);
1268 dh = nig->nfsidmap_daemon_dh;
1269 if (dh == NULL) {
1270 mutex_exit(&nig->nfsidmap_daemon_lock);
1271 cmn_err(CE_NOTE,
1272 "nfs4_callmapid: nfsmapid daemon not " \
1273 "running unable to resolve host name\n");
1274 return (EINVAL);
1276 door_ki_hold(dh);
1277 mutex_exit(&nig->nfsidmap_daemon_lock);
1279 xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1281 xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1282 xdr_argsp->xdr_len = xdr_len;
1283 xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1285 xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1286 xdr_len, XDR_ENCODE);
1288 if (!xdr_utf8string(&xdr, server)) {
1289 kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1290 door_ki_rele(dh);
1291 return (1);
1294 if (orig_reslen)
1295 orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1297 door_args.data_ptr = (char *)xdr_argsp;
1298 door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1299 door_args.desc_ptr = NULL;
1300 door_args.desc_num = 0;
1301 door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1302 door_args.rsize = res_len;
1304 sigintr(&smask, 1);
1305 error = door_ki_upcall(dh, &door_args);
1306 sigunintr(&smask);
1308 door_ki_rele(dh);
1310 kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1311 if (error) {
1312 kmem_free(orig_resp, orig_reslen);
1314 * There is no door to connect to. The referral daemon
1315 * must not be running yet.
1317 cmn_err(CE_WARN,
1318 "nfsmapid not running cannot resolve host name");
1319 goto out;
1323 * If the results buffer passed back are not the same as
1324 * what was sent free the old buffer and use the new one.
1326 if (orig_resp && orig_reslen) {
1327 refd_door_res_t *door_resp;
1329 door_resp = (refd_door_res_t *)door_args.rbuf;
1330 if ((void *)door_args.rbuf != orig_resp)
1331 kmem_free(orig_resp, orig_reslen);
1332 if (door_resp->res_status == 0) {
1333 xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1334 door_resp->xdr_len, XDR_DECODE);
1335 bzero(resp, sizeof (struct nfs_fsl_info));
1336 if (!xdr_nfs_fsl_info(&xdr, resp)) {
1337 DTRACE_PROBE2(
1338 nfs4clnt__debug__referral__upcall__xdrfail,
1339 struct nfs_fsl_info *, resp,
1340 char *, "nfs4_callmapid");
1341 error = EINVAL;
1343 } else {
1344 DTRACE_PROBE2(
1345 nfs4clnt__debug__referral__upcall__badstatus,
1346 int, door_resp->res_status,
1347 char *, "nfs4_callmapid");
1348 error = door_resp->res_status;
1350 kmem_free(door_args.rbuf, door_args.rsize);
1352 out:
1353 DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1354 char *, server, int, error);
1355 return (error);
1359 * Fetches the fs_locations attribute. Typically called
1360 * from a Replication/Migration/Referrals/Mirror-mount context
1362 * Fills in the attributes in garp. The caller is assumed
1363 * to have allocated memory for garp.
1365 * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1366 * it's already done by caller. Otherwise lock these mutexes
1367 * before doing the rfs4call().
1369 * Returns
1370 * 1 for success
1371 * 0 for failure
1374 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1375 cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1377 COMPOUND4args_clnt args;
1378 COMPOUND4res_clnt res;
1379 nfs_argop4 *argop;
1380 int argoplist_size = 3 * sizeof (nfs_argop4);
1381 nfs4_server_t *sp = NULL;
1382 int doqueue = 1;
1383 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1384 int retval = 1;
1385 struct nfs4_clnt *nfscl;
1387 if (lock == TRUE)
1388 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1389 else
1390 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1391 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1393 sp = find_nfs4_server(mi);
1394 if (lock == TRUE)
1395 nfs_rw_exit(&mi->mi_recovlock);
1397 if (sp != NULL)
1398 mutex_exit(&sp->s_lock);
1400 if (lock == TRUE) {
1401 if (sp != NULL)
1402 (void) nfs_rw_enter_sig(&sp->s_recovlock,
1403 RW_WRITER, 0);
1404 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1405 } else {
1406 if (sp != NULL) {
1407 ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1408 nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1413 * Do we want to do the setup for recovery here?
1415 * We know that the server responded to a null ping a very
1416 * short time ago, and we know that we intend to do a
1417 * single stateless operation - we want to fetch attributes,
1418 * so we know we can't encounter errors about state. If
1419 * something goes wrong with the GETATTR, like not being
1420 * able to get a response from the server or getting any
1421 * kind of FH error, we should fail the mount.
1423 * We may want to re-visited this at a later time.
1425 argop = kmem_alloc(argoplist_size, KM_SLEEP);
1427 args.ctag = TAG_GETATTR_FSLOCATION;
1428 /* PUTFH LOOKUP GETATTR */
1429 args.array_len = 3;
1430 args.array = argop;
1432 /* 0. putfh file */
1433 argop[0].argop = OP_CPUTFH;
1434 argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1436 /* 1. lookup name, can't be dotdot */
1437 argop[1].argop = OP_CLOOKUP;
1438 argop[1].nfs_argop4_u.opclookup.cname = nm;
1440 /* 2. file attrs */
1441 argop[2].argop = OP_GETATTR;
1442 argop[2].nfs_argop4_u.opgetattr.attr_request =
1443 FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1444 FATTR4_MOUNTED_ON_FILEID_MASK;
1445 argop[2].nfs_argop4_u.opgetattr.mi = mi;
1447 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1449 if (lock == TRUE) {
1450 nfs_rw_exit(&mi->mi_recovlock);
1451 if (sp != NULL)
1452 nfs_rw_exit(&sp->s_recovlock);
1455 nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1456 nfscl->nfscl_stat.referrals.value.ui64++;
1457 DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1458 nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1460 if (e.error != 0) {
1461 if (sp != NULL)
1462 nfs4_server_rele(sp);
1463 kmem_free(argop, argoplist_size);
1464 return (0);
1468 * Check for all possible error conditions.
1469 * For valid replies without an ops array or for illegal
1470 * replies, return a failure.
1472 if (res.status != NFS4_OK || res.array_len < 3 ||
1473 res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1474 retval = 0;
1475 goto exit;
1479 * There isn't much value in putting the attributes
1480 * in the attr cache since fs_locations4 aren't
1481 * encountered very frequently, so just make them
1482 * available to the caller.
1484 *garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1486 DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1487 nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1489 /* No fs_locations? -- return a failure */
1490 if (garp->n4g_ext_res == NULL ||
1491 garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1492 retval = 0;
1493 goto exit;
1496 if (!garp->n4g_fsid_valid)
1497 retval = 0;
1499 exit:
1500 if (retval == 0) {
1501 /* the call was ok but failed validating the call results */
1502 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1503 } else {
1504 ASSERT(callres != NULL);
1505 *callres = res;
1508 if (sp != NULL)
1509 nfs4_server_rele(sp);
1510 kmem_free(argop, argoplist_size);
1511 return (retval);
/*
 * Tunable: when non-zero, referral mounts are disabled
 * (find_referral_stubvp() then returns NULL straight away).
 */
int nfs4_no_referrals = 0;
1518 * Returns NULL if the vnode cannot be created or found.
1520 vnode_t *
1521 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1523 nfs_fh4 *stub_fh, *dfh;
1524 nfs4_sharedfh_t *sfhp;
1525 char *newfhval;
1526 vnode_t *vp = NULL;
1527 fattr4_mounted_on_fileid mnt_on_fileid;
1528 nfs4_ga_res_t garp;
1529 mntinfo4_t *mi;
1530 COMPOUND4res_clnt callres;
1531 hrtime_t t;
1533 if (nfs4_no_referrals)
1534 return (NULL);
1537 * Get the mounted_on_fileid, unique on that server::fsid
1539 mi = VTOMI4(dvp);
1540 if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1541 &garp, &callres, FALSE) == 0)
1542 return (NULL);
1543 mnt_on_fileid = garp.n4g_mon_fid;
1544 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1547 * Build a fake filehandle from the dir FH and the mounted_on_fileid
1549 dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1550 stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1551 stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1552 sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1553 newfhval = stub_fh->nfs_fh4_val;
1555 /* copy directory's file handle */
1556 bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1557 stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1558 newfhval = newfhval + dfh->nfs_fh4_len;
1560 /* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1561 bcopy((char *)&mnt_on_fileid, newfhval,
1562 sizeof (fattr4_mounted_on_fileid));
1563 stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1565 sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1566 kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1567 sizeof (fattr4_mounted_on_fileid));
1568 kmem_free(stub_fh, sizeof (nfs_fh4));
1569 if (sfhp == NULL)
1570 return (NULL);
1572 t = gethrtime();
1573 garp.n4g_va.va_type = VDIR;
1574 vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1575 cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1577 if (vp != NULL)
1578 vp->v_type = VDIR;
1580 sfh4_rele(&sfhp);
1581 return (vp);
1585 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1587 vnode_t *nvp;
1588 rnode4_t *rp;
1590 if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1591 return (EINVAL);
1593 rp = VTOR4(nvp);
1594 mutex_enter(&rp->r_statelock);
1595 r4_stub_referral(rp);
1596 mutex_exit(&rp->r_statelock);
1597 dnlc_enter(dvp, nm, nvp);
1599 if (*vpp != NULL)
1600 VN_RELE(*vpp); /* no longer need this vnode */
1602 *vpp = nvp;
1604 return (0);
1608 * Fetch the location information and resolve the new server.
1609 * Caller needs to free up the XDR data which is returned.
1610 * Input: mount info, shared filehandle, nodename
1611 * Return: Index to the result or Error(-1)
1612 * Output: FsLocations Info, Resolved Server Info.
1615 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1616 char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1617 struct nfs_fsl_info *fsloc)
1619 fs_location4 *fsp;
1620 struct nfs_fsl_info nfsfsloc;
1621 int ret, i, error;
1622 nfs4_ga_res_t garp;
1623 COMPOUND4res_clnt callres;
1624 struct knetconfig *knc;
1626 ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1627 if (ret == 0)
1628 return (-1);
1631 * As a lame attempt to figuring out if we're
1632 * handling a migration event or a referral,
1633 * look for rnodes with this fsid in the rnode
1634 * cache.
1636 * If we can find one or more such rnodes, it
1637 * means we're handling a migration event and
1638 * we want to bail out in that case.
1640 if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1641 DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1642 mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1643 char *, "nfs4_process_referral");
1644 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1645 return (-1);
1649 * Find the first responsive server to mount. When we find
1650 * one, fsp will point to it.
1652 for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1654 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1655 if (fsp->server_len == 0 || fsp->server_val == NULL)
1656 continue;
1658 error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1659 if (error != 0)
1660 continue;
1662 error = nfs4_ping_server_common(nfsfsloc.knconf,
1663 nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1664 if (error == RPC_SUCCESS)
1665 break;
1667 DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1668 sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1669 char *, "nfs4_process_referral");
1671 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1673 knc = nfsfsloc.knconf;
1674 if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1675 (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1676 DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1677 nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1678 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1679 return (-1);
1682 /* Send the results back */
1683 *fsloc = nfsfsloc;
1684 *grp = garp;
1685 *res = callres;
1686 return (i);
1690 * Referrals case - need to fetch referral data and then upcall to
1691 * user-level to get complete mount data.
1693 static ephemeral_servinfo_t *
1694 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1696 struct knetconfig *sikncp, *svkncp;
1697 struct netbuf *bufp;
1698 ephemeral_servinfo_t *esi;
1699 vnode_t *dvp;
1700 rnode4_t *drp;
1701 fs_location4 *fsp;
1702 struct nfs_fsl_info nfsfsloc;
1703 nfs4_ga_res_t garp;
1704 char *p;
1705 char fn[MAXNAMELEN];
1706 int i, index = -1;
1707 mntinfo4_t *mi;
1708 COMPOUND4res_clnt callres;
1711 * If we're passed in a stub vnode that
1712 * isn't a "referral" stub, bail out
1713 * and return a failure
1715 if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1716 return (NULL);
1718 if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1719 return (NULL);
1721 drp = VTOR4(dvp);
1722 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1723 VN_RELE(dvp);
1724 return (NULL);
1727 if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1728 nfs_rw_exit(&drp->r_rwlock);
1729 VN_RELE(dvp);
1730 return (NULL);
1733 mi = VTOMI4(dvp);
1734 index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1735 &garp, &callres, &nfsfsloc);
1736 nfs_rw_exit(&drp->r_rwlock);
1737 VN_RELE(dvp);
1738 if (index < 0)
1739 return (NULL);
1741 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1742 esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1744 /* initially set to be our type of ephemeral mount; may be added to */
1745 esi->esi_mount_flags = NFSMNT_REFERRAL;
1747 esi->esi_hostname =
1748 kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1749 bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1750 fsp->server_val->utf8string_len);
1751 esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1753 bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1754 bufp->len = nfsfsloc.addr->len;
1755 bufp->maxlen = nfsfsloc.addr->maxlen;
1756 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1757 bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1758 esi->esi_addr = bufp;
1760 esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1761 sikncp = esi->esi_knconf;
1763 DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1764 struct nfs_fsl_info *, &nfsfsloc,
1765 char *, "nfs4_trigger_esi_create_referral");
1767 svkncp = nfsfsloc.knconf;
1768 sikncp->knc_semantics = svkncp->knc_semantics;
1769 sikncp->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1770 (void) strlcat((char *)sikncp->knc_protofmly,
1771 (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1772 sikncp->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1773 (void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1774 KNC_STRSIZE);
1775 sikncp->knc_rdev = svkncp->knc_rdev;
1777 DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1778 struct knetconfig *, sikncp,
1779 char *, "nfs4_trigger_esi_create_referral");
1781 esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1782 bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1783 esi->esi_syncaddr = NULL;
1785 esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1786 esi->esi_path_len = MAXPATHLEN;
1787 *p++ = '/';
1788 for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1789 component4 *comp;
1791 comp = &fsp->rootpath.pathname4_val[i];
1792 /* If no space, null the string and bail */
1793 if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1794 goto err;
1795 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1796 p += comp->utf8string_len;
1797 *p++ = '/';
1799 if (fsp->rootpath.pathname4_len != 0)
1800 *(p - 1) = '\0';
1801 else
1802 *p = '\0';
1803 p = esi->esi_path;
1804 esi->esi_path = strdup(p);
1805 esi->esi_path_len = strlen(p) + 1;
1806 kmem_free(p, MAXPATHLEN);
1808 /* Allocated in nfs4_process_referral() */
1809 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1810 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1812 return (esi);
1813 err:
1814 kmem_free(esi->esi_path, esi->esi_path_len);
1815 kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1816 kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1817 kmem_free(esi->esi_addr, sizeof (struct netbuf));
1818 kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1819 kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1820 kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1821 kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1822 kmem_free(esi, sizeof (ephemeral_servinfo_t));
1823 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1824 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1825 return (NULL);
1829 * Assemble the args, and call the generic VFS mount function to
1830 * finally perform the ephemeral mount.
1832 static int
1833 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1834 cred_t *cr, vnode_t **newvpp)
1836 struct mounta *uap;
1837 char *mntpt, *orig_path, *path;
1838 const char *orig_mntpt;
1839 int retval;
1840 int mntpt_len;
1841 int spec_len;
1842 zone_t *zone = curproc->p_zone;
1843 bool_t has_leading_slash;
1844 int i;
1846 vfs_t *stubvfsp = stubvp->v_vfsp;
1847 ephemeral_servinfo_t *esi = dma->dma_esi;
1848 struct nfs_args *nargs = dma->dma_nargs;
1850 /* first, construct the mount point for the ephemeral mount */
1851 orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1852 orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1854 if (*orig_path == '.')
1855 orig_path++;
1858 * Get rid of zone's root path
1860 if (zone != global_zone) {
1862 * -1 for trailing '/' and -1 for EOS.
1864 if (strncmp(zone->zone_rootpath, orig_mntpt,
1865 zone->zone_rootpathlen - 1) == 0) {
1866 orig_mntpt += (zone->zone_rootpathlen - 2);
1870 mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1871 mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1872 (void) strcat(mntpt, orig_mntpt);
1873 (void) strcat(mntpt, orig_path);
1875 kmem_free(path, strlen(path) + 1);
1876 path = esi->esi_path;
1877 if (*path == '.')
1878 path++;
1879 if (path[0] == '/' && path[1] == '/')
1880 path++;
1881 has_leading_slash = (*path == '/');
1883 spec_len = strlen(dma->dma_hostlist);
1884 spec_len += strlen(path);
1886 /* We are going to have to add this in */
1887 if (!has_leading_slash)
1888 spec_len++;
1890 /* We need to get the ':' for dma_hostlist:esi_path */
1891 spec_len++;
1893 uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1894 uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1895 (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1896 has_leading_slash ? "" : "/", path);
1898 uap->dir = mntpt;
1900 uap->flags = MS_SYSSPACE | MS_DATA;
1901 /* fstype-independent mount options not covered elsewhere */
1902 /* copy parent's mount(8) "-m" flag */
1903 if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1904 uap->flags |= MS_NOMNTTAB;
1906 uap->fstype = MNTTYPE_NFS4;
1907 uap->dataptr = (char *)nargs;
1908 /* not needed for MS_SYSSPACE */
1909 uap->datalen = 0;
1911 /* use optptr to pass in extra mount options */
1912 uap->flags |= MS_OPTIONSTR;
1913 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1914 if (uap->optptr == NULL) {
1915 retval = EINVAL;
1916 goto done;
1919 /* domount() expects us to count the trailing NUL */
1920 uap->optlen = strlen(uap->optptr) + 1;
1923 * If we get EBUSY, we try again once to see if we can perform
1924 * the mount. We do this because of a spurious race condition.
1926 for (i = 0; i < 2; i++) {
1927 int error;
1928 bool_t was_mounted;
1930 retval = domount(NULL, uap, stubvp, cr, vfsp);
1931 if (retval == 0) {
1932 retval = VFS_ROOT(*vfsp, newvpp);
1933 VFS_RELE(*vfsp);
1934 break;
1935 } else if (retval != EBUSY) {
1936 break;
1940 * We might find it mounted by the other racer...
1942 error = nfs4_trigger_mounted_already(stubvp,
1943 newvpp, &was_mounted, vfsp);
1944 if (error) {
1945 goto done;
1946 } else if (was_mounted) {
1947 retval = 0;
1948 break;
1952 done:
1953 if (uap->optptr)
1954 nfs4_trigger_destroy_mntopts(uap->optptr);
1956 kmem_free(uap->spec, spec_len + 1);
1957 kmem_free(uap, sizeof (struct mounta));
1958 kmem_free(mntpt, mntpt_len + 1);
1960 return (retval);
1964 * Build an nfs_args structure for passing to domount().
1966 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1967 * generic data - common to all ephemeral mount types - is read directly
1968 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1970 static struct nfs_args *
1971 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1972 ephemeral_servinfo_t *esi)
1974 sec_data_t *secdata;
1975 struct nfs_args *nargs;
1977 /* setup the nfs args */
1978 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1980 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1982 nargs->addr = esi->esi_addr;
1984 /* for AUTH_DH by negotiation */
1985 if (esi->esi_syncaddr || esi->esi_netname) {
1986 nargs->flags |= NFSMNT_SECURE;
1987 nargs->syncaddr = esi->esi_syncaddr;
1988 nargs->netname = esi->esi_netname;
1991 nargs->flags |= NFSMNT_KNCONF;
1992 nargs->knconf = esi->esi_knconf;
1993 nargs->flags |= NFSMNT_HOSTNAME;
1994 nargs->hostname = esi->esi_hostname;
1995 nargs->fh = esi->esi_path;
1997 /* general mount settings, all copied from parent mount */
1998 mutex_enter(&mi->mi_lock);
2000 if (!(mi->mi_flags & MI4_HARD))
2001 nargs->flags |= NFSMNT_SOFT;
2003 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2004 NFSMNT_RETRANS;
2005 nargs->wsize = mi->mi_stsize;
2006 nargs->rsize = mi->mi_tsize;
2007 nargs->timeo = mi->mi_timeo;
2008 nargs->retrans = mi->mi_retrans;
2010 if (mi->mi_flags & MI4_INT)
2011 nargs->flags |= NFSMNT_INT;
2012 if (mi->mi_flags & MI4_NOAC)
2013 nargs->flags |= NFSMNT_NOAC;
2015 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2016 NFSMNT_ACDIRMAX;
2017 nargs->acregmin = HR2SEC(mi->mi_acregmin);
2018 nargs->acregmax = HR2SEC(mi->mi_acregmax);
2019 nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2020 nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2022 /* add any specific flags for this type of ephemeral mount */
2023 nargs->flags |= esi->esi_mount_flags;
2025 if (mi->mi_flags & MI4_NOCTO)
2026 nargs->flags |= NFSMNT_NOCTO;
2027 if (mi->mi_flags & MI4_GRPID)
2028 nargs->flags |= NFSMNT_GRPID;
2029 if (mi->mi_flags & MI4_LLOCK)
2030 nargs->flags |= NFSMNT_LLOCK;
2031 if (mi->mi_flags & MI4_NOPRINT)
2032 nargs->flags |= NFSMNT_NOPRINT;
2033 if (mi->mi_flags & MI4_DIRECTIO)
2034 nargs->flags |= NFSMNT_DIRECTIO;
2035 if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2036 nargs->flags |= NFSMNT_PUBLIC;
2038 /* Do some referral-specific option tweaking */
2039 if (nargs->flags & NFSMNT_REFERRAL) {
2040 nargs->flags &= ~NFSMNT_DORDMA;
2041 nargs->flags |= NFSMNT_TRYRDMA;
2044 mutex_exit(&mi->mi_lock);
2047 * Security data & negotiation policy.
2049 * For mirror mounts, we need to preserve the parent mount's
2050 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2051 * to NFSMNT_SECDEFAULT if present.
2053 * For referrals, we always want security negotiation and will
2054 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2055 * The reason is that we can't negotiate down from a parent's
2056 * Kerberos flavor to AUTH_SYS.
2058 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2059 * security flavour was requested, with data in sv_secdata, and that
2060 * no negotiation should occur. If this specified flavour fails, that's
2061 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2063 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2064 * default flavour, in sv_secdata, but then negotiate a new flavour.
2065 * Possible flavours are recorded in an array in sv_secinfo, with
2066 * currently in-use flavour pointed to by sv_currsec.
2068 * If sv_currsec is set, i.e. if negotiation has already occurred,
2069 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2070 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2072 if (nargs->flags & NFSMNT_REFERRAL) {
2073 /* enable negotiation for referral mount */
2074 nargs->flags |= NFSMNT_SECDEFAULT;
2075 secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2076 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2077 secdata->data = NULL;
2078 } else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2079 /* enable negotiation for mirror mount */
2080 nargs->flags |= NFSMNT_SECDEFAULT;
2083 * As a starting point for negotiation, copy parent
2084 * mount's negotiated flavour (sv_currsec) if available,
2085 * or its passed-in flavour (sv_secdata) if not.
2087 if (svp->sv_currsec != NULL)
2088 secdata = copy_sec_data(svp->sv_currsec);
2089 else if (svp->sv_secdata != NULL)
2090 secdata = copy_sec_data(svp->sv_secdata);
2091 else
2092 secdata = NULL;
2093 } else {
2094 /* do not enable negotiation; copy parent's passed-in flavour */
2095 if (svp->sv_secdata != NULL)
2096 secdata = copy_sec_data(svp->sv_secdata);
2097 else
2098 secdata = NULL;
2101 nfs_rw_exit(&svp->sv_lock);
2103 nargs->flags |= NFSMNT_NEWARGS;
2104 nargs->nfs_args_ext = NFS_ARGS_EXTB;
2105 nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2107 /* for NFS RO failover; caller will set if necessary */
2108 nargs->nfs_ext_u.nfs_extB.next = NULL;
2110 return (nargs);
2113 static void
2114 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2117 * Either the mount failed, in which case the data is not needed, or
2118 * nfs4_mount() has either taken copies of what it needs or,
2119 * where it has merely copied the ptr, it has set *our* ptr to NULL,
2120 * whereby nfs4_free_args() will ignore it.
2122 nfs4_free_args(nargs);
2123 kmem_free(nargs, sizeof (struct nfs_args));
2127 * When we finally get into the mounting, we need to add this
2128 * node to the ephemeral tree.
2130 * This is called from nfs4_mount().
2133 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2135 mntinfo4_t *mi_parent;
2136 nfs4_ephemeral_t *eph;
2137 nfs4_ephemeral_tree_t *net;
2139 nfs4_ephemeral_t *prior;
2140 nfs4_ephemeral_t *child;
2142 nfs4_ephemeral_t *peer;
2144 nfs4_trigger_globals_t *ntg;
2145 zone_t *zone = curproc->p_zone;
2147 int rc = 0;
2149 mi_parent = VTOMI4(mvp);
2152 * Get this before grabbing anything else!
2154 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2155 if (!ntg->ntg_thread_started) {
2156 nfs4_ephemeral_start_harvester(ntg);
2159 mutex_enter(&mi_parent->mi_lock);
2160 mutex_enter(&mi->mi_lock);
2162 net = mi->mi_ephemeral_tree =
2163 mi_parent->mi_ephemeral_tree;
2166 * If the mi_ephemeral_tree is NULL, then it
2167 * means that either the harvester or a manual
2168 * umount has cleared the tree out right before
2169 * we got here.
2171 * There is nothing we can do here, so return
2172 * to the caller and let them decide whether they
2173 * try again.
2175 if (net == NULL) {
2176 mutex_exit(&mi->mi_lock);
2177 mutex_exit(&mi_parent->mi_lock);
2179 return (EBUSY);
2183 * We've just tied the mntinfo to the tree, so
2184 * now we bump the refcnt and hold it there until
2185 * this mntinfo is removed from the tree.
2187 nfs4_ephemeral_tree_hold(net);
2190 * We need to tack together the ephemeral mount
2191 * with this new mntinfo.
2193 eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2194 eph->ne_mount = mi;
2195 MI4_HOLD(mi);
2196 VFS_HOLD(mi->mi_vfsp);
2197 eph->ne_ref_time = gethrestime_sec();
2200 * We need to tell the ephemeral mount when
2201 * to time out.
2203 eph->ne_mount_to = ntg->ntg_mount_to;
2205 mi->mi_ephemeral = eph;
2208 * If the enclosing mntinfo4 is also ephemeral,
2209 * then we need to point to its enclosing parent.
2210 * Else the enclosing mntinfo4 is the enclosing parent.
2212 * We also need to weave this ephemeral node
2213 * into the tree.
2215 if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2217 * We need to decide if we are
2218 * the root node of this branch
2219 * or if we are a sibling of this
2220 * branch.
2222 prior = mi_parent->mi_ephemeral;
2223 if (prior == NULL) {
2225 * Race condition, clean up, and
2226 * let caller handle mntinfo.
2228 mi->mi_flags &= ~MI4_EPHEMERAL;
2229 mi->mi_ephemeral = NULL;
2230 kmem_free(eph, sizeof (*eph));
2231 VFS_RELE(mi->mi_vfsp);
2232 MI4_RELE(mi);
2233 nfs4_ephemeral_tree_rele(net);
2234 rc = EBUSY;
2235 } else {
2236 if (prior->ne_child == NULL) {
2237 prior->ne_child = eph;
2238 } else {
2239 child = prior->ne_child;
2241 prior->ne_child = eph;
2242 eph->ne_peer = child;
2244 child->ne_prior = eph;
2247 eph->ne_prior = prior;
2249 } else {
2251 * The parent mntinfo4 is the non-ephemeral
2252 * root of the ephemeral tree. We
2253 * need to decide if we are the root
2254 * node of that tree or if we are a
2255 * sibling of the root node.
2257 * We are the root if there is no
2258 * other node.
2260 if (net->net_root == NULL) {
2261 net->net_root = eph;
2262 } else {
2263 eph->ne_peer = peer = net->net_root;
2264 ASSERT(peer != NULL);
2265 net->net_root = eph;
2267 peer->ne_prior = eph;
2270 eph->ne_prior = NULL;
2273 mutex_exit(&mi->mi_lock);
2274 mutex_exit(&mi_parent->mi_lock);
2276 return (rc);
2280 * Commit the changes to the ephemeral tree for removing this node.
2282 static void
2283 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2285 nfs4_ephemeral_t *e = eph;
2286 nfs4_ephemeral_t *peer;
2287 nfs4_ephemeral_t *prior;
2289 peer = eph->ne_peer;
2290 prior = e->ne_prior;
2293 * If this branch root was not the
2294 * tree root, then we need to fix back pointers.
2296 if (prior) {
2297 if (prior->ne_child == e) {
2298 prior->ne_child = peer;
2299 } else {
2300 prior->ne_peer = peer;
2303 if (peer)
2304 peer->ne_prior = prior;
2305 } else if (peer) {
2306 peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2307 peer->ne_prior = NULL;
2308 } else {
2309 e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2314 * We want to avoid recursion at all costs. So we need to
2315 * unroll the tree. We do this by a depth first traversal to
2316 * leaf nodes. We blast away the leaf and work our way back
2317 * up and down the tree.
2319 static int
2320 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2321 int isTreeRoot, int flag, cred_t *cr)
2323 nfs4_ephemeral_t *e = eph;
2324 nfs4_ephemeral_t *prior;
2325 mntinfo4_t *mi;
2326 vfs_t *vfsp;
2327 int error;
2330 * We use the loop while unrolling the ephemeral tree.
2332 for (;;) {
2334 * First we walk down the child.
2336 if (e->ne_child) {
2337 prior = e;
2338 e = e->ne_child;
2339 continue;
2343 * If we are the root of the branch we are removing,
2344 * we end it here. But if the branch is the root of
2345 * the tree, we have to forge on. We do not consider
2346 * the peer list for the root because while it may
2347 * be okay to remove, it is both extra work and a
2348 * potential for a false-positive error to stall the
2349 * unmount attempt.
2351 if (e == eph && isTreeRoot == FALSE)
2352 return (0);
2355 * Next we walk down the peer list.
2357 if (e->ne_peer) {
2358 prior = e;
2359 e = e->ne_peer;
2360 continue;
2364 * We can only remove the node passed in by the
2365 * caller if it is the root of the ephemeral tree.
2366 * Otherwise, the caller will remove it.
2368 if (e == eph && isTreeRoot == FALSE)
2369 return (0);
2372 * Okay, we have a leaf node, time
2373 * to prune it!
2375 * Note that prior can only be NULL if
2376 * and only if it is the root of the
2377 * ephemeral tree.
2379 prior = e->ne_prior;
2381 mi = e->ne_mount;
2382 mutex_enter(&mi->mi_lock);
2383 vfsp = mi->mi_vfsp;
2384 ASSERT(vfsp != NULL);
2387 * Cleared by umount2_engine.
2389 VFS_HOLD(vfsp);
2392 * Inform nfs4_unmount to not recursively
2393 * descend into this node's children when it
2394 * gets processed.
2396 mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2397 mutex_exit(&mi->mi_lock);
2399 error = umount2_engine(vfsp, flag, cr, FALSE);
2400 if (error) {
2402 * We need to reenable nfs4_unmount's ability
2403 * to recursively descend on this node.
2405 mutex_enter(&mi->mi_lock);
2406 mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2407 mutex_exit(&mi->mi_lock);
2409 return (error);
2413 * If we are the current node, we do not want to
2414 * touch anything else. At this point, the only
2415 * way the current node can have survived to here
2416 * is if it is the root of the ephemeral tree and
2417 * we are unmounting the enclosing mntinfo4.
2419 if (e == eph) {
2420 ASSERT(prior == NULL);
2421 return (0);
2425 * Stitch up the prior node. Note that since
2426 * we have handled the root of the tree, prior
2427 * must be non-NULL.
2429 ASSERT(prior != NULL);
2430 if (prior->ne_child == e) {
2431 prior->ne_child = NULL;
2432 } else {
2433 ASSERT(prior->ne_peer == e);
2435 prior->ne_peer = NULL;
2438 e = prior;
2441 /* NOTREACHED */
2445 * Common code to safely release net_cnt_lock and net_tree_lock
2447 void
2448 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2449 nfs4_ephemeral_tree_t **pnet)
2451 nfs4_ephemeral_tree_t *net = *pnet;
2453 if (*pmust_unlock) {
2454 mutex_enter(&net->net_cnt_lock);
2455 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2456 mutex_exit(&net->net_cnt_lock);
2458 mutex_exit(&net->net_tree_lock);
2460 *pmust_unlock = FALSE;
2465 * While we may have removed any child or sibling nodes of this
2466 * ephemeral node, we can not nuke it until we know that there
2467 * were no actived vnodes on it. This will do that final
2468 * work once we know it is not busy.
2470 void
2471 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2472 nfs4_ephemeral_tree_t **pnet)
2475 * Now we need to get rid of the ephemeral data if it exists.
2477 mutex_enter(&mi->mi_lock);
2478 if (mi->mi_ephemeral) {
2480 * If we are the root node of an ephemeral branch
2481 * which is being removed, then we need to fixup
2482 * pointers into and out of the node.
2484 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2485 nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2487 nfs4_ephemeral_tree_rele(*pnet);
2488 ASSERT(mi->mi_ephemeral != NULL);
2490 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2491 mi->mi_ephemeral = NULL;
2492 VFS_RELE(mi->mi_vfsp);
2493 MI4_RELE(mi);
2495 mutex_exit(&mi->mi_lock);
2497 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2501 * Unmount an ephemeral node.
2503 * Note that if this code fails, then it must unlock.
2505 * If it succeeds, then the caller must be prepared to do so.
2508 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2509 bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2511 int error = 0;
2512 nfs4_ephemeral_t *eph;
2513 nfs4_ephemeral_tree_t *net;
2514 int is_derooting = FALSE;
2515 int is_recursed = FALSE;
2516 int was_locked = FALSE;
2519 * Make sure to set the default state for cleaning
2520 * up the tree in the caller (and on the way out).
2522 *pmust_unlock = FALSE;
2525 * The active vnodes on this file system may be ephemeral
2526 * children. We need to check for and try to unmount them
2527 * here. If any can not be unmounted, we are going
2528 * to return EBUSY.
2530 mutex_enter(&mi->mi_lock);
2533 * If an ephemeral tree, we need to check to see if
2534 * the lock is already held. If it is, then we need
2535 * to see if we are being called as a result of
2536 * the recursive removal of some node of the tree or
2537 * if we are another attempt to remove the tree.
2539 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2540 * node. mi_ephemeral being non-NULL also does this.
2542 * mi_ephemeral_tree being non-NULL is sufficient
2543 * to also indicate either it is an ephemeral node
2544 * or the enclosing mntinfo4.
2546 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2547 * when we delete the ephemeral node and need to
2548 * differentiate from an ephemeral node and the
2549 * enclosing root node.
2551 *pnet = net = mi->mi_ephemeral_tree;
2552 if (net == NULL) {
2553 mutex_exit(&mi->mi_lock);
2554 return (0);
2557 eph = mi->mi_ephemeral;
2558 is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2559 is_derooting = (eph == NULL);
2561 mutex_enter(&net->net_cnt_lock);
2564 * If this is not recursion, then we need to
2565 * check to see if a harvester thread has
2566 * already grabbed the lock.
2568 * After we exit this branch, we may not
2569 * blindly return, we need to jump to
2570 * is_busy!
2572 if (!is_recursed) {
2573 if (net->net_status &
2574 NFS4_EPHEMERAL_TREE_LOCKED) {
2576 * If the tree is locked, we need
2577 * to decide whether we are the
2578 * harvester or some explicit call
2579 * for a umount. The only way that
2580 * we are the harvester is if
2581 * MS_SYSSPACE is set.
2583 * We only let the harvester through
2584 * at this point.
2586 * We return EBUSY so that the
2587 * caller knows something is
2588 * going on. Note that by that
2589 * time, the umount in the other
2590 * thread may have already occured.
2592 if (!(flag & MS_SYSSPACE)) {
2593 mutex_exit(&net->net_cnt_lock);
2594 mutex_exit(&mi->mi_lock);
2596 return (EBUSY);
2599 was_locked = TRUE;
2603 mutex_exit(&net->net_cnt_lock);
2604 mutex_exit(&mi->mi_lock);
2607 * If we are not the harvester, we need to check
2608 * to see if we need to grab the tree lock.
2610 if (was_locked == FALSE) {
2612 * If we grab the lock, it means that no other
2613 * operation is working on the tree. If we don't
2614 * grab it, we need to decide if this is because
2615 * we are a recursive call or a new operation.
2617 if (mutex_tryenter(&net->net_tree_lock)) {
2618 *pmust_unlock = TRUE;
2619 } else {
2621 * If we are a recursive call, we can
2622 * proceed without the lock.
2623 * Otherwise we have to wait until
2624 * the lock becomes free.
2626 if (!is_recursed) {
2627 mutex_enter(&net->net_cnt_lock);
2628 if (net->net_status &
2629 (NFS4_EPHEMERAL_TREE_DEROOTING
2630 | NFS4_EPHEMERAL_TREE_INVALID)) {
2631 mutex_exit(&net->net_cnt_lock);
2632 goto is_busy;
2634 mutex_exit(&net->net_cnt_lock);
2637 * We can't hold any other locks whilst
2638 * we wait on this to free up.
2640 mutex_enter(&net->net_tree_lock);
2643 * Note that while mi->mi_ephemeral
2644 * may change and thus we have to
2645 * update eph, it is the case that
2646 * we have tied down net and
2647 * do not care if mi->mi_ephemeral_tree
2648 * has changed.
2650 mutex_enter(&mi->mi_lock);
2651 eph = mi->mi_ephemeral;
2652 mutex_exit(&mi->mi_lock);
2655 * Okay, we need to see if either the
2656 * tree got nuked or the current node
2657 * got nuked. Both of which will cause
2658 * an error.
2660 * Note that a subsequent retry of the
2661 * umount shall work.
2663 mutex_enter(&net->net_cnt_lock);
2664 if (net->net_status &
2665 NFS4_EPHEMERAL_TREE_INVALID ||
2666 (!is_derooting && eph == NULL)) {
2667 mutex_exit(&net->net_cnt_lock);
2668 mutex_exit(&net->net_tree_lock);
2669 goto is_busy;
2671 mutex_exit(&net->net_cnt_lock);
2672 *pmust_unlock = TRUE;
2678 * Only once we have grabbed the lock can we mark what we
2679 * are planning on doing to the ephemeral tree.
2681 if (*pmust_unlock) {
2682 mutex_enter(&net->net_cnt_lock);
2683 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2686 * Check to see if we are nuking the root.
2688 if (is_derooting)
2689 net->net_status |=
2690 NFS4_EPHEMERAL_TREE_DEROOTING;
2691 mutex_exit(&net->net_cnt_lock);
2694 if (!is_derooting) {
2696 * Only work on children if the caller has not already
2697 * done so.
2699 if (!is_recursed) {
2700 ASSERT(eph != NULL);
2702 error = nfs4_ephemeral_unmount_engine(eph,
2703 FALSE, flag, cr);
2704 if (error)
2705 goto is_busy;
2707 } else {
2708 eph = net->net_root;
2711 * Only work if there is something there.
2713 if (eph) {
2714 error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2715 flag, cr);
2716 if (error) {
2717 mutex_enter(&net->net_cnt_lock);
2718 net->net_status &=
2719 ~NFS4_EPHEMERAL_TREE_DEROOTING;
2720 mutex_exit(&net->net_cnt_lock);
2721 goto is_busy;
2725 * Nothing else which goes wrong will
2726 * invalidate the blowing away of the
2727 * ephmeral tree.
2729 net->net_root = NULL;
2733 * We have derooted and we have caused the tree to be
2734 * invalidated.
2736 mutex_enter(&net->net_cnt_lock);
2737 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2738 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2739 DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2740 uint_t, net->net_refcnt);
2743 * We will not finalize this node, so safe to
2744 * release it.
2746 nfs4_ephemeral_tree_decr(net);
2747 mutex_exit(&net->net_cnt_lock);
2749 if (was_locked == FALSE)
2750 mutex_exit(&net->net_tree_lock);
2753 * We have just blown away any notation of this
2754 * tree being locked or having a refcnt.
2755 * We can't let the caller try to clean things up.
2757 *pmust_unlock = FALSE;
2760 * At this point, the tree should no longer be
2761 * associated with the mntinfo4. We need to pull
2762 * it off there and let the harvester take
2763 * care of it once the refcnt drops.
2765 mutex_enter(&mi->mi_lock);
2766 mi->mi_ephemeral_tree = NULL;
2767 mutex_exit(&mi->mi_lock);
2770 return (0);
2772 is_busy:
2774 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2776 return (error);
2780 * Do the umount and record any error in the parent.
2782 static void
2783 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2784 nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2786 int error;
2789 * Only act on if the fs is still mounted.
2791 if (vfsp == NULL)
2792 return;
2794 error = umount2_engine(vfsp, flag, kcred, FALSE);
2795 if (error) {
2796 if (prior) {
2797 if (prior->ne_child == e)
2798 prior->ne_state |=
2799 NFS4_EPHEMERAL_CHILD_ERROR;
2800 else
2801 prior->ne_state |=
2802 NFS4_EPHEMERAL_PEER_ERROR;
2808 * For each tree in the forest (where the forest is in
2809 * effect all of the ephemeral trees for this zone),
2810 * scan to see if a node can be unmounted. Note that
2811 * unlike nfs4_ephemeral_unmount_engine(), we do
2812 * not process the current node before children or
2813 * siblings. I.e., if a node can be unmounted, we
2814 * do not recursively check to see if the nodes
2815 * hanging off of it can also be unmounted.
2817 * Instead, we delve down deep to try and remove the
2818 * children first. Then, because we share code with
2819 * nfs4_ephemeral_unmount_engine(), we will try
2820 * them again. This could be a performance issue in
2821 * the future.
2823 * Also note that unlike nfs4_ephemeral_unmount_engine(),
2824 * we do not halt on an error. We will not remove the
2825 * current node, but we will keep on trying to remove
2826 * the others.
2828 * force indicates that we want the unmount to occur
2829 * even if there is something blocking it.
2831 * time_check indicates that we want to see if the
2832 * mount has expired past mount_to or not. Typically
2833 * we want to do this and only on a shutdown of the
2834 * zone would we want to ignore the check.
2836 static void
2837 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2838 bool_t force, bool_t time_check)
2840 nfs4_ephemeral_tree_t *net;
2841 nfs4_ephemeral_tree_t *prev = NULL;
2842 nfs4_ephemeral_tree_t *next;
2843 nfs4_ephemeral_t *e;
2844 nfs4_ephemeral_t *prior;
2845 time_t now = gethrestime_sec();
2847 nfs4_ephemeral_tree_t *harvest = NULL;
2849 int flag;
2851 mntinfo4_t *mi;
2852 vfs_t *vfsp;
2854 if (force)
2855 flag = MS_FORCE | MS_SYSSPACE;
2856 else
2857 flag = MS_SYSSPACE;
2859 mutex_enter(&ntg->ntg_forest_lock);
2860 for (net = ntg->ntg_forest; net != NULL; net = next) {
2861 next = net->net_next;
2863 nfs4_ephemeral_tree_hold(net);
2865 mutex_enter(&net->net_tree_lock);
2868 * Let the unmount code know that the
2869 * tree is already locked!
2871 mutex_enter(&net->net_cnt_lock);
2872 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2873 mutex_exit(&net->net_cnt_lock);
2876 * If the intent is force all ephemeral nodes to
2877 * be unmounted in this zone, we can short circuit a
2878 * lot of tree traversal and simply zap the root node.
2880 if (force) {
2881 if (net->net_root) {
2882 mi = net->net_root->ne_mount;
2884 vfsp = mi->mi_vfsp;
2885 ASSERT(vfsp != NULL);
2888 * Cleared by umount2_engine.
2890 VFS_HOLD(vfsp);
2892 (void) umount2_engine(vfsp, flag,
2893 kcred, FALSE);
2895 goto check_done;
2899 e = net->net_root;
2900 if (e)
2901 e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2903 while (e) {
2904 if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2905 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2906 if (e->ne_child) {
2907 e = e->ne_child;
2908 e->ne_state =
2909 NFS4_EPHEMERAL_VISIT_CHILD;
2912 continue;
2913 } else if (e->ne_state ==
2914 NFS4_EPHEMERAL_VISIT_SIBLING) {
2915 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2916 if (e->ne_peer) {
2917 e = e->ne_peer;
2918 e->ne_state =
2919 NFS4_EPHEMERAL_VISIT_CHILD;
2922 continue;
2923 } else if (e->ne_state ==
2924 NFS4_EPHEMERAL_CHILD_ERROR) {
2925 prior = e->ne_prior;
2928 * If a child reported an error, do
2929 * not bother trying to unmount.
2931 * If your prior node is a parent,
2932 * pass the error up such that they
2933 * also do not try to unmount.
2935 * However, if your prior is a sibling,
2936 * let them try to unmount if they can.
2938 if (prior) {
2939 if (prior->ne_child == e)
2940 prior->ne_state |=
2941 NFS4_EPHEMERAL_CHILD_ERROR;
2942 else
2943 prior->ne_state |=
2944 NFS4_EPHEMERAL_PEER_ERROR;
2948 * Clear the error and if needed, process peers.
2950 * Once we mask out the error, we know whether
2951 * or we have to process another node.
2953 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2954 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2955 e = prior;
2957 continue;
2958 } else if (e->ne_state ==
2959 NFS4_EPHEMERAL_PEER_ERROR) {
2960 prior = e->ne_prior;
2962 if (prior) {
2963 if (prior->ne_child == e)
2964 prior->ne_state =
2965 NFS4_EPHEMERAL_CHILD_ERROR;
2966 else
2967 prior->ne_state =
2968 NFS4_EPHEMERAL_PEER_ERROR;
2972 * Clear the error from this node and do the
2973 * correct processing.
2975 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2976 continue;
2979 prior = e->ne_prior;
2980 e->ne_state = NFS4_EPHEMERAL_OK;
2983 * It must be the case that we need to process
2984 * this node.
2986 if (!time_check ||
2987 now - e->ne_ref_time > e->ne_mount_to) {
2988 mi = e->ne_mount;
2989 vfsp = mi->mi_vfsp;
2992 * Cleared by umount2_engine.
2994 if (vfsp != NULL)
2995 VFS_HOLD(vfsp);
2998 * Note that we effectively work down to the
2999 * leaf nodes first, try to unmount them,
3000 * then work our way back up into the leaf
3001 * nodes.
3003 * Also note that we deal with a lot of
3004 * complexity by sharing the work with
3005 * the manual unmount code.
3007 nfs4_ephemeral_record_umount(vfsp, flag,
3008 e, prior);
3011 e = prior;
3014 check_done:
3017 * At this point we are done processing this tree.
3019 * If the tree is invalid and we were the only reference
3020 * to it, then we push it on the local linked list
3021 * to remove it at the end. We avoid that action now
3022 * to keep the tree processing going along at a fair clip.
3024 * Else, even if we were the only reference, we
3025 * allow it to be reused as needed.
3027 mutex_enter(&net->net_cnt_lock);
3028 nfs4_ephemeral_tree_decr(net);
3029 if (net->net_refcnt == 0 &&
3030 net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3031 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3032 mutex_exit(&net->net_cnt_lock);
3033 mutex_exit(&net->net_tree_lock);
3035 if (prev)
3036 prev->net_next = net->net_next;
3037 else
3038 ntg->ntg_forest = net->net_next;
3040 net->net_next = harvest;
3041 harvest = net;
3043 VFS_RELE(net->net_mount->mi_vfsp);
3044 MI4_RELE(net->net_mount);
3046 continue;
3049 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3050 mutex_exit(&net->net_cnt_lock);
3051 mutex_exit(&net->net_tree_lock);
3053 prev = net;
3055 mutex_exit(&ntg->ntg_forest_lock);
3057 for (net = harvest; net != NULL; net = next) {
3058 next = net->net_next;
3060 mutex_destroy(&net->net_tree_lock);
3061 mutex_destroy(&net->net_cnt_lock);
3062 kmem_free(net, sizeof (*net));
3067 * This is the thread which decides when the harvesting
3068 * can proceed and when to kill it off for this zone.
3070 static void
3071 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3073 clock_t timeleft;
3074 zone_t *zone = curproc->p_zone;
3076 for (;;) {
3077 timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3078 nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3081 * zone is exiting...
3083 if (timeleft != -1) {
3084 ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3085 zthread_exit();
3086 /* NOTREACHED */
3090 * Only bother scanning if there is potential
3091 * work to be done.
3093 if (ntg->ntg_forest == NULL)
3094 continue;
3097 * Now scan the list and get rid of everything which
3098 * is old.
3100 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3103 /* NOTREACHED */
3107 * The zone specific glue needed to start the unmount harvester.
3109 * Note that we want to avoid holding the mutex as long as possible,
3110 * hence the multiple checks.
3112 * The caller should avoid us getting down here in the first
3113 * place.
3115 static void
3116 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3119 * It got started before we got here...
3121 if (ntg->ntg_thread_started)
3122 return;
3124 mutex_enter(&nfs4_ephemeral_thread_lock);
3126 if (ntg->ntg_thread_started) {
3127 mutex_exit(&nfs4_ephemeral_thread_lock);
3128 return;
3132 * Start the unmounter harvester thread for this zone.
3134 (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3135 ntg, 0, minclsyspri);
3137 ntg->ntg_thread_started = TRUE;
3138 mutex_exit(&nfs4_ephemeral_thread_lock);
3141 /*ARGSUSED*/
3142 static void *
3143 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3145 nfs4_trigger_globals_t *ntg;
3147 ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3148 ntg->ntg_thread_started = FALSE;
3151 * This is the default....
3153 ntg->ntg_mount_to = nfs4_trigger_mount_to;
3155 mutex_init(&ntg->ntg_forest_lock, NULL,
3156 MUTEX_DEFAULT, NULL);
3158 return (ntg);
3162 * Try a nice gentle walk down the forest and convince
3163 * all of the trees to gracefully give it up.
3165 /*ARGSUSED*/
3166 static void
3167 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3169 nfs4_trigger_globals_t *ntg = arg;
3171 if (!ntg)
3172 return;
3174 nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3178 * Race along the forest and rip all of the trees out by
3179 * their rootballs!
3181 /*ARGSUSED*/
3182 static void
3183 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3185 nfs4_trigger_globals_t *ntg = arg;
3187 if (!ntg)
3188 return;
3190 nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3192 mutex_destroy(&ntg->ntg_forest_lock);
3193 kmem_free(ntg, sizeof (*ntg));
3197 * This is the zone independent cleanup needed for
3198 * emphemeral mount processing.
3200 void
3201 nfs4_ephemeral_fini(void)
3203 (void) zone_key_delete(nfs4_ephemeral_key);
3204 mutex_destroy(&nfs4_ephemeral_thread_lock);
3208 * This is the zone independent initialization needed for
3209 * emphemeral mount processing.
3211 void
3212 nfs4_ephemeral_init(void)
3214 mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3215 NULL);
3217 zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3218 nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3222 * nfssys() calls this function to set the per-zone
3223 * value of mount_to to drive when an ephemeral mount is
3224 * timed out. Each mount will grab a copy of this value
3225 * when mounted.
3227 void
3228 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3230 nfs4_trigger_globals_t *ntg;
3231 zone_t *zone = curproc->p_zone;
3233 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3235 ntg->ntg_mount_to = mount_to;
3239 * Walk the list of v4 mount options; if they are currently set in vfsp,
3240 * append them to a new comma-separated mount option string, and return it.
3242 * Caller should free by calling nfs4_trigger_destroy_mntopts().
3244 static char *
3245 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3247 uint_t i;
3248 char *mntopts;
3249 struct vfssw *vswp;
3250 mntopts_t *optproto;
3252 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3254 /* get the list of applicable mount options for v4; locks *vswp */
3255 vswp = vfs_getvfssw(MNTTYPE_NFS4);
3256 optproto = &vswp->vsw_optproto;
3258 for (i = 0; i < optproto->mo_count; i++) {
3259 struct mntopt *mop = &optproto->mo_list[i];
3261 if (mop->mo_flags & MO_EMPTY)
3262 continue;
3264 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3265 kmem_free(mntopts, MAX_MNTOPT_STR);
3266 vfs_unrefvfssw(vswp);
3267 return (NULL);
3271 vfs_unrefvfssw(vswp);
3274 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3275 * and it may only be passed via MS_OPTIONSTR, so we
3276 * must handle it here.
3278 * Ideally, it would be in the list, but NFS does not specify its
3279 * own opt proto list, it uses instead the default one. Since
3280 * not all filesystems support extended attrs, it would not be
3281 * appropriate to add it there.
3283 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3284 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3285 kmem_free(mntopts, MAX_MNTOPT_STR);
3286 return (NULL);
3289 return (mntopts);
3292 static void
3293 nfs4_trigger_destroy_mntopts(char *mntopts)
3295 if (mntopts)
3296 kmem_free(mntopts, MAX_MNTOPT_STR);
3300 * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3302 static int
3303 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3305 if (mntopts == NULL || optname == NULL || vfsp == NULL)
3306 return (EINVAL);
3308 if (vfs_optionisset(vfsp, optname, NULL)) {
3309 size_t mntoptslen = strlen(mntopts);
3310 size_t optnamelen = strlen(optname);
3312 /* +1 for ',', +1 for NUL */
3313 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3314 return (EOVERFLOW);
3316 /* first or subsequent mount option? */
3317 if (*mntopts != '\0')
3318 (void) strcat(mntopts, ",");
3320 (void) strcat(mntopts, optname);
3323 return (0);
3326 static enum clnt_stat
3327 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3329 int retries;
3330 uint_t max_msgsize;
3331 enum clnt_stat status;
3332 CLIENT *cl;
3333 struct timeval timeout;
3335 /* as per recov_newserver() */
3336 max_msgsize = 0;
3337 retries = 1;
3338 timeout.tv_sec = 2;
3339 timeout.tv_usec = 0;
3341 if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3342 max_msgsize, retries, CRED(), &cl) != 0)
3343 return (RPC_FAILED);
3345 if (nointr)
3346 cl->cl_nosignal = TRUE;
3347 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3348 timeout);
3349 if (nointr)
3350 cl->cl_nosignal = FALSE;
3352 AUTH_DESTROY(cl->cl_auth);
3353 CLNT_DESTROY(cl);
3355 return (status);
3358 static enum clnt_stat
3359 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3361 return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));